1 /* @source prophecy application
2 **
3 ** Creates profiles and simple freuency matrices
4 ** @author Copyright (C) Alan Bleasby (ableasby@hgmp.mrc.ac.uk)
5 ** @@
6 **
7 ** This program is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU General Public License
9 ** as published by the Free Software Foundation; either version 2
10 ** of the License, or (at your option) any later version.
11 **
12 ** This program is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ** GNU General Public License for more details.
16 **
17 ** You should have received a copy of the GNU General Public License
18 ** along with this program; if not, write to the Free Software
19 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 ******************************************************************************/
21
22 #include "emboss.h"
23 #include <string.h>
24 #include <ctype.h>
25 #include <limits.h>
26
27 #define AZ 27
28 #define GRIBSKOV_LENGTH 27
29 #define HENIKOFF_LENGTH 27
30
31
32
33
34 static void prophecy_simple_matrix(const AjPSeqset seqset, AjPFile outf,
35 const AjPStr name,
36 ajint thresh);
37 static void prophecy_gribskov_profile(const AjPSeqset seqset,
38 AjPFile outf, const AjPStr name,
39 ajint thresh,
40 float gapopen, float gapextend,
41 AjPStr *cons);
42 static void prophecy_henikoff_profile(const AjPSeqset seqset,
43 const AjPMatrixf imtx,
44 ajint thresh, const AjPSeqCvt cvt,
45 AjPFile outf, const AjPStr name,
46 float gapopen, float gapextend,
47 AjPStr *cons);
48
49
50
51
52 /* @prog prophecy *************************************************************
53 **
54 ** Creates matrices/profiles from multiple alignments
55 **
56 ******************************************************************************/
57
main(int argc,char ** argv)58 int main(int argc, char **argv)
59 {
60
61 AjPSeqset seqset = NULL;
62 AjPFile outf = NULL;
63 AjPStr name = NULL;
64 AjPStr cons = NULL;
65
66 ajint thresh;
67 AjPStr type;
68
69 AjPMatrixf imtx = NULL;
70 AjPSeqCvt cvt = NULL;
71 float gapopen;
72 float gapextend;
73
74 embInit("prophecy", argc, argv);
75
76
77 seqset = ajAcdGetSeqset("sequence");
78 name = ajAcdGetString("name");
79 thresh = ajAcdGetInt("threshold");
80 imtx = ajAcdGetMatrixf("datafile");
81 type = ajAcdGetListSingle("type");
82 outf = ajAcdGetOutfile("outfile");
83
84 gapopen = ajAcdGetFloat("open");
85 gapextend = ajAcdGetFloat("extension");
86
87 gapopen = ajRoundFloat(gapopen, 8);
88 gapextend = ajRoundFloat(gapextend, 8);
89
90 cons = ajStrNewC("");
91
92
93 if(ajStrGetCharFirst(type) == 'F')
94 prophecy_simple_matrix(seqset,outf,name,thresh);
95
96 if(ajStrGetCharFirst(type) == 'G')
97 prophecy_gribskov_profile(seqset,outf,name,thresh,
98 gapopen,gapextend,&cons);
99
100 if(ajStrGetCharFirst(type) == 'H')
101 prophecy_henikoff_profile(seqset,imtx,thresh,cvt,outf,name,
102 gapopen,gapextend,&cons);
103
104 ajFileClose(&outf);
105
106 ajSeqsetDel(&seqset);
107 ajStrDel(&name);
108 ajStrDel(&cons);
109 ajMatrixfDel(&imtx);
110 ajStrDel(&type);
111
112 embExit();
113
114 return 0;
115 }
116
117
118
119
120 /* @funcstatic prophecy_simple_matrix *****************************************
121 **
122 ** Undocumented.
123 **
124 ** @param [r] seqset [const AjPSeqset] Undocumented
125 ** @param [u] outf [AjPFile] Undocumented
126 ** @param [r] name [const AjPStr] Undocumented
127 ** @param [r] thresh [ajint] Undocumented
128 ** @@
129 ******************************************************************************/
130
131
prophecy_simple_matrix(const AjPSeqset seqset,AjPFile outf,const AjPStr name,ajint thresh)132 static void prophecy_simple_matrix(const AjPSeqset seqset, AjPFile outf,
133 const AjPStr name,
134 ajint thresh)
135 {
136 const char *p;
137 ajuint nseqs;
138 ajuint mlen;
139 ajint len;
140 ajuint i;
141 ajint j;
142 ajint x;
143 ajint px;
144
145 ajint maxscore;
146 ajint score;
147 ajint *matrix[AZ+2];
148 AjPStr cons = NULL;
149 size_t stlen;
150
151 nseqs = ajSeqsetGetSize(seqset);
152 if(nseqs<2)
153 ajFatal("Insufficient sequences (%d) to create a matrix",nseqs);
154
155 mlen = ajSeqsetGetLen(seqset);
156
157 /* Check sequences are the same length. Warn if not */
158 for(i=0;i<nseqs;++i)
159 {
160 p = ajSeqsetGetseqSeqC(seqset,i);
161 if(strlen(p)!=mlen)
162 ajWarn("Sequence lengths are not equal!");
163 }
164
165 for(i=0;i<AZ+2;++i)
166 AJCNEW0(matrix[i], mlen);
167
168 /* Load matrix */
169 for(i=0;i<nseqs;++i)
170 {
171 p = ajSeqsetGetseqSeqC(seqset,i);
172 stlen = strlen(p);
173 len = (ajint) stlen;
174
175 for(j=0;j<len;++j)
176 {
177 x = toupper((ajint)*p++);
178 ++matrix[ajBasecodeToInt(x)][j];
179 }
180 }
181
182 /* Get consensus sequence */
183 cons = ajStrNew();
184 for(i=0;i<mlen;++i)
185 {
186 px=x=-INT_MAX;
187
188 for(j=0;j<AZ-1;++j)
189 if(matrix[j][i]>x)
190 {
191 x=matrix[j][i];
192 px=j;
193 }
194 ajStrAppendK(&cons,(char)(px+'A'));
195 }
196
197 /* Find maximum score for matrix */
198 maxscore = 0;
199 for(i=0;i<mlen;++i)
200 {
201 for(j=score=0;j<AZ;++j)
202 score = AJMAX(score,matrix[j][i]);
203 maxscore += score;
204 }
205
206 ajFmtPrintF(outf,"# Pure Frequency Matrix\n");
207 ajFmtPrintF(outf,"# Columns are amino acid counts A->Z\n");
208 ajFmtPrintF(outf,"# Rows are alignment positions 1->n\n");
209 ajFmtPrintF(outf,"Simple\n");
210 ajFmtPrintF(outf,"Name\t\t%s\n",ajStrGetPtr(name));
211 ajFmtPrintF(outf,"Length\t\t%d\n",mlen);
212 ajFmtPrintF(outf,"Maximum score\t%d\n",maxscore);
213 ajFmtPrintF(outf,"Thresh\t\t%d\n",thresh);
214 ajFmtPrintF(outf,"Consensus\t%s\n",ajStrGetPtr(cons));
215
216
217 for(i=0;i<mlen;++i)
218 {
219 for(j=0;j<AZ;++j)
220 ajFmtPrintF(outf,"%-2d ",matrix[j][i]);
221 ajFmtPrintF(outf,"\n");
222 }
223
224 for(i=0;i<AZ+2;++i)
225 AJFREE(matrix[i]);
226
227 ajStrDel(&cons);
228
229 return;
230 }
231
232
233
234
235 /* @funcstatic prophecy_gribskov_profile **************************************
236 **
237 ** Undocumented.
238 **
239 ** @param [r] seqset [const AjPSeqset] Undocumented
240 ** @param [u] outf [AjPFile] Undocumented
241 ** @param [r] name [const AjPStr] Undocumented
242 ** @param [r] thresh [ajint] Undocumented
243 ** @param [r] gapopen [float] Undocumented
244 ** @param [r] gapextend [float] Undocumented
245 ** @param [w] cons [AjPStr*] Undocumented
246 ** @@
247 ******************************************************************************/
248
prophecy_gribskov_profile(const AjPSeqset seqset,AjPFile outf,const AjPStr name,ajint thresh,float gapopen,float gapextend,AjPStr * cons)249 static void prophecy_gribskov_profile(const AjPSeqset seqset,
250 AjPFile outf, const AjPStr name,
251 ajint thresh,
252 float gapopen, float gapextend,
253 AjPStr *cons)
254 {
255 AjPMatrixf imtx = 0;
256 AjPSeqCvt cvt = 0;
257 AjPStr mname = NULL;
258 float **sub = NULL;
259
260 float **mat;
261 ajuint nseqs;
262 ajuint mlen;
263 ajuint i;
264 ajuint j;
265 static const char *valid="ACDEFGHIKLMNPQRSTVWY";
266 const char *p;
267 const char *q;
268 float score;
269 float sum;
270 ajint gsum;
271 float mmax;
272 float pmax;
273 float psum;
274 ajint start;
275 ajint end;
276 ajint pos;
277 float x;
278 ajint px;
279
280 float **weights;
281 ajint *gaps;
282
283
284 mname = ajStrNewC("Epprofile");
285 imtx = ajMatrixfNewFile(mname);
286 ajStrDel(&mname);
287
288 nseqs = ajSeqsetGetSize(seqset);
289 mlen = ajSeqsetGetLen(seqset);
290
291 sub = ajMatrixfGetMatrix(imtx);
292 cvt = ajMatrixfGetCvt(imtx);
293
294
295
296 /*
297 ** Set gaps to be maximum length of gap that can occur
298 ** including that position
299 */
300 AJCNEW(gaps, mlen);
301 for(i=0;i<mlen;++i)
302 {
303 gsum = 0;
304 for(j=0;j<nseqs;++j)
305 {
306 p=ajSeqsetGetseqSeqC(seqset,j);
307 if(i>=strlen(p))
308 continue;
309
310 if(ajBasecodeToInt(p[i])!=27) /* if not a gap */
311 continue;
312 pos = i;
313
314 while(pos>=0 && ajBasecodeToInt(p[pos])==27)
315 --pos;
316 start = ++pos;
317 pos = i;
318
319 while(pos<(ajint)mlen && ajBasecodeToInt(p[pos])==27)
320 ++pos;
321 end = pos-1;
322 gsum = AJMAX(gsum,(end-start)+1);
323 ajDebug("Gribskov gaps pos:%d seq:%d %d..%d (%d)\n",
324 j, i, start, end, gsum);
325 }
326 gaps[i] = gsum;
327 ajDebug("Gribskov gaps[%d] %d\n",
328 i, gsum);
329 }
330
331
332 /* get maximum score in scoring matrix */
333 mmax = 0.0;
334 p = valid;
335 while(*p)
336 {
337 q = valid;
338 while(*q)
339 {
340 mmax=(mmax>sub[ajSeqcvtGetCodeK(cvt,*p)][ajSeqcvtGetCodeK(cvt,*q)]) ? mmax :
341 sub[ajSeqcvtGetCodeK(cvt,*p)][ajSeqcvtGetCodeK(cvt,*q)];
342 ++q;
343 }
344 ++p;
345 }
346
347
348 /* Create the weight matrix and zero it */
349 AJCNEW(weights, mlen);
350 for(i=0;i<mlen;++i)
351 AJCNEW0(weights[i], GRIBSKOV_LENGTH+1);
352
353 /*
354 ** count the number of times each residue occurs at each
355 ** position in the alignment
356 */
357 for(i=0;i<mlen;++i)
358 for(j=0;j<nseqs;++j)
359 {
360 p = ajSeqsetGetseqSeqC(seqset,j);
361 if(i>=strlen(p))
362 continue;
363 weights[i][ajBasecodeToInt(p[i])] += ajSeqsetGetseqWeight(seqset,j);
364 }
365
366
367 px = -INT_MAX;
368 for(i=0;i<mlen;++i)
369 {
370 x = (float)-INT_MAX;
371
372 for(j=0;j<AZ-1;++j)
373 if(weights[i][j]>x)
374 {
375 x=weights[i][j];
376 px=j;
377 }
378 ajStrAppendK(cons,(char)(px+'A'));
379 }
380
381
382 /* Now normalise the weights */
383 for(i=0;i<mlen;++i)
384 for(j=0;j<GRIBSKOV_LENGTH;++j)
385 weights[i][j] /= (float)nseqs;
386
387
388 /* Create the profile matrix n*GRIBSKOV_LENGTH and zero it */
389 AJCNEW(mat, mlen);
390 for(i=0;i<mlen;++i)
391 AJCNEW0(mat[i],GRIBSKOV_LENGTH);
392
393 /* Fill the profile with aa scores */
394 for(i=0;i<mlen;++i)
395 for(p=valid;*p;++p)
396 {
397 sum = 0.0;
398 q = valid;
399 while(*q)
400 {
401 score = weights[i][ajBasecodeToInt(*q)];
402 score *= (float)(sub[ajSeqcvtGetCodeK(cvt,*p)][ajSeqcvtGetCodeK(cvt,*q)]);
403 sum += score;
404 ++q;
405 }
406 mat[i][ajBasecodeToInt(*p)] = sum;
407 }
408
409 /* Calculate gap penalties */
410 for(i=0;i<mlen;++i)
411 mat[i][GRIBSKOV_LENGTH-1]= (mmax / (gapopen+gapextend+(float)gaps[i]));
412
413
414 /* Get maximum matrix score */
415 psum = 0.0;
416 for(i=0;i<mlen;++i)
417 {
418 pmax = (float)-INT_MAX;
419 for(j=0;j<AZ;++j)
420 pmax=(pmax>mat[i][j]) ? pmax : mat[i][j];
421 psum += pmax;
422
423 ajDebug("matrix score [%d] %.3f psum: %.3f\n",
424 i, pmax, psum);
425 }
426
427 /* Print matrix */
428
429 ajFmtPrintF(outf,"# Gribskov Protein Profile\n");
430 ajFmtPrintF(outf,"# Columns are amino acids A->Z\n");
431 ajFmtPrintF(outf,"# Last column is indel penalty\n");
432 ajFmtPrintF(outf,"# Rows are alignment positions 1->n\n");
433 ajFmtPrintF(outf,"Gribskov\n");
434 ajFmtPrintF(outf,"Name\t\t%s\n",ajStrGetPtr(name));
435 ajFmtPrintF(outf,"Matrix\t\tpprofile\n");
436 ajFmtPrintF(outf,"Length\t\t%d\n",mlen);
437 ajFmtPrintF(outf,"Max_score\t%.2f\n",psum);
438 ajFmtPrintF(outf,"Threshold\t%d\n",thresh);
439 ajFmtPrintF(outf,"Gap_open\t%.2f\n",gapopen);
440 ajFmtPrintF(outf,"Gap_extend\t%.2f\n",gapextend);
441 ajFmtPrintF(outf,"Consensus\t%s\n",ajStrGetPtr(*cons));
442
443 for(i=0;i<mlen;++i)
444 {
445 for(j=0;j<GRIBSKOV_LENGTH;++j)
446 ajFmtPrintF(outf,"%.2f ",mat[i][j]);
447 ajFmtPrintF(outf,"%.2f\n",mat[i][GRIBSKOV_LENGTH-1]);
448 }
449
450
451 for(i=0;i<mlen;++i)
452 {
453 AJFREE(mat[i]);
454 AJFREE(weights[i]);
455 }
456 AJFREE(mat);
457 AJFREE(weights);
458
459 AJFREE(gaps);
460
461 ajMatrixfDel(&imtx);
462
463 return;
464 }
465
466
467
468
469 /* @funcstatic prophecy_henikoff_profile **************************************
470 **
471 ** Undocumented.
472 **
473 ** @param [r] seqset [const AjPSeqset] Undocumented
474 ** @param [r] imtx [const AjPMatrixf] Undocumented
475 ** @param [r] thresh [ajint] Undocumented
476 ** @param [r] cvt [const AjPSeqCvt] Undocumented
477 ** @param [u] outf [AjPFile] Undocumented
478 ** @param [r] name [const AjPStr] Undocumented
479 ** @param [r] gapopen [float] Undocumented
480 ** @param [r] gapextend [float] Undocumented
481 ** @param [w] cons [AjPStr*] Undocumented
482 ** @@
483 ******************************************************************************/
484
485
prophecy_henikoff_profile(const AjPSeqset seqset,const AjPMatrixf imtx,ajint thresh,const AjPSeqCvt cvt,AjPFile outf,const AjPStr name,float gapopen,float gapextend,AjPStr * cons)486 static void prophecy_henikoff_profile(const AjPSeqset seqset,
487 const AjPMatrixf imtx,
488 ajint thresh, const AjPSeqCvt cvt,
489 AjPFile outf, const AjPStr name,
490 float gapopen, float gapextend,
491 AjPStr *cons)
492 {
493 float **sub = NULL;
494 float **mat;
495 ajuint nseqs;
496 ajuint mlen;
497 ajuint i;
498 ajuint j;
499 static const char *valid="ACDEFGHIKLMNPQRSTVWY";
500 const char *p;
501 const char *q;
502 float score;
503 float sum;
504 float psum;
505 float pmax;
506 ajint gsum;
507 ajint mmax;
508 ajint start;
509 ajint end;
510 ajint pos;
511
512 float **weights;
513 ajint *gaps;
514 ajint *pcnt;
515
516 float x;
517 ajint px;
518
519
520 nseqs = ajSeqsetGetSize(seqset);
521 mlen = ajSeqsetGetLen(seqset);
522
523 sub = ajMatrixfGetMatrix(imtx);
524 cvt = ajMatrixfGetCvt(imtx);
525
526
527 /*
528 ** Set gaps to be maximum length of gap that can occur
529 ** including that position
530 */
531 AJCNEW(gaps, mlen);
532 for(i=0;i<mlen;++i)
533 {
534 gsum = 0;
535 for(j=0;j<nseqs;++j)
536 {
537 p=ajSeqsetGetseqSeqC(seqset,j);
538 if(i>=strlen(p))
539 continue;
540
541 if(ajBasecodeToInt(p[i])!=27)
542 continue; /* if not a gap */
543
544 pos = i;
545 while(pos>=0 && ajBasecodeToInt(p[pos])==27)
546 --pos;
547 start = ++pos;
548
549 pos = i;
550 while(pos<(ajint)mlen && ajBasecodeToInt(p[pos])==27)
551 ++pos;
552 end = pos-1;
553 gsum = AJMAX(gsum,(end-start)+1);
554 }
555 gaps[i] = gsum;
556 }
557
558 /* get maximum score in scoring matrix */
559 mmax = 0;
560 p = valid;
561 while(*p)
562 {
563 q=valid;
564 while(*q)
565 {
566 mmax = (mmax>sub[ajSeqcvtGetCodeK(cvt,*p)][ajSeqcvtGetCodeK(cvt,*q)]) ? mmax :
567 (ajint)sub[ajSeqcvtGetCodeK(cvt,*p)][ajSeqcvtGetCodeK(cvt,*q)];
568 ++q;
569 }
570 ++p;
571 }
572
573
574 /* Create the weight matrix and zero it */
575 AJCNEW(weights, mlen);
576 for(i=0;i<mlen;++i)
577 AJCNEW0(weights[i],HENIKOFF_LENGTH+1);
578
579 /*
580 ** count the number of times each residue occurs at each
581 ** position in the alignment
582 */
583 for(i=0;i<mlen;++i)
584 for(j=0;j<nseqs;++j)
585 {
586 p = ajSeqsetGetseqSeqC(seqset,j);
587 if(i>=strlen(p))
588 continue;
589 weights[i][ajBasecodeToInt(p[i])] +=
590 ajSeqsetGetseqWeight(seqset,j);
591 }
592
593 px = -INT_MAX;
594 for(i=0;i<mlen;++i)
595 {
596 x = (float)-INT_MAX;
597 for(j=0;j<AZ-1;++j)
598 if(weights[i][j]>x)
599 {
600 x = weights[i][j];
601 px=j;
602 }
603 ajStrAppendK(cons,(char)(px+'A'));
604 }
605
606
607
608 /* Count the number of different residues at each position */
609
610 AJCNEW0(pcnt, mlen);
611
612 for(i=0;i<mlen;++i)
613 for(j=0;j<HENIKOFF_LENGTH-1;++j)
614 if(weights[i][j])
615 ++pcnt[i];
616
617 /* weights = 1/(num diff res * count of residues at that position */
618 for(i=0;i<mlen;++i)
619 for(j=0;j<HENIKOFF_LENGTH-1;++j)
620 if(weights[i][j])
621 weights[i][j] = (float)1.0/(weights[i][j]*(float)pcnt[i]);
622
623
624 /* Create the profile matrix n*HENIKOFF_LENGTH */
625 AJCNEW(mat, mlen);
626 for(i=0;i<mlen;++i)
627 AJCNEW0(mat[i],HENIKOFF_LENGTH);
628
629 /* Fill the profile with aa scores */
630 for(i=0;i<mlen;++i)
631 for(p=valid;*p;++p)
632 {
633 sum = 0.0;
634 q = valid;
635 while(*q)
636 {
637 score = weights[i][ajBasecodeToInt(*q)];
638 score *= sub[ajSeqcvtGetCodeK(cvt,*p)]
639 [ajSeqcvtGetCodeK(cvt,*q)];
640 sum += score;
641 ++q;
642 }
643 mat[i][ajBasecodeToInt(*p)] = sum;
644 }
645
646 /* Calculate gap penalties */
647 for(i=0;i<mlen;++i)
648 mat[i][HENIKOFF_LENGTH-1]=(mmax / (gapopen+gapextend+
649 (float)gaps[i]));
650
651
652 /* Get maximum matrix score */
653 psum = 0.0;
654 for(i=0;i<mlen;++i)
655 {
656 pmax = (float)-INT_MAX;
657 for(j=0;j<HENIKOFF_LENGTH-1;++j)
658 pmax=(pmax>mat[i][j]) ? pmax : mat[i][j];
659 psum += pmax;
660 }
661
662 /* Print matrix */
663
664 ajFmtPrintF(outf,"# Henikoff Protein Profile\n");
665 ajFmtPrintF(outf,"# Columns are amino acids A->Z\n");
666 ajFmtPrintF(outf,"# Last column is indel penalty\n");
667 ajFmtPrintF(outf,"# Rows are alignment positions 1->n\n");
668 ajFmtPrintF(outf,"Henikoff\n");
669 ajFmtPrintF(outf,"Name\t\t%s\n",ajStrGetPtr(name));
670 ajFmtPrintF(outf,"Matrix\t\t%s\n",ajStrGetPtr(ajMatrixfGetName(imtx)));
671 ajFmtPrintF(outf,"Length\t\t%d\n",mlen);
672 ajFmtPrintF(outf,"Max_score\t%.2f\n",psum);
673 ajFmtPrintF(outf,"Threshold\t%d\n",thresh);
674 ajFmtPrintF(outf,"Gap_open\t%.2f\n",gapopen);
675 ajFmtPrintF(outf,"Gap_extend\t%.2f\n",gapextend);
676 ajFmtPrintF(outf,"Consensus\t%s\n",ajStrGetPtr(*cons));
677
678 for(i=0;i<mlen;++i)
679 {
680 for(j=0;j<HENIKOFF_LENGTH;++j)
681 ajFmtPrintF(outf,"%.2f ",mat[i][j]);
682 ajFmtPrintF(outf,"%.2f\n",mat[i][j-1]);
683 }
684
685
686 for(i=0;i<mlen;++i)
687 {
688 AJFREE(mat[i]);
689 AJFREE(weights[i]);
690 }
691 AJFREE(mat);
692 AJFREE(weights);
693 AJFREE(gaps);
694 AJFREE(pcnt);
695
696 return;
697 }
698