1 /* fastmodelmaker.c
2  *
3  * Construct a covariance model from an alignment, using an approximate
4  * but fast algorithm. Complexity is N^2 in memory, N^3 in time. Get
5  * back a covariance model, and information contents in bits for
6  * both primary and secondary structure.
7  *
8  *   - use Stormo/Gutell MIXY algorithm to calculate pairwise covariances
9  *     for all aligned column pairs i,j. This produces the matrix **mxy;
10  *     it is indexed 0..alen-1 by 0..alen-1, as a flipped diagonal matrix
11  *     mxy[j][i], j > i. The diagonal (i == j) is unused (and unallocated
12  *     for).
13  *
14  *   - use a quick and dirty Zuker-like algorithm to reduce this N^2
15  *     matrix to the optimal model of non-overlapping chords. This
16  *     produces the matrix **zmat. zmat is a flipped diagonal matrix,
17  *     zmat[j][i], j >= i. The diagonal (i==j) is initialized to zero.
18  *     All other cells take on positive values, summing covariances.
19  *
20  *   - traceback thru the zmat matrix and construct a dynamic binary
21  *     tree *ztr, excluding inserted columns. *ztr
22  *     is constructed much the same as a normal traceback tree is constructed.
23  *     emitl and emitr store the 0..alen-1 indices of the column this node is
24  *     responsible for, even for BEGIN and BIFURC nodes. type is
25  *     a node type (MATP_NODE), not a state type.
26  *     Note that these are exceptions to the indexing scheme used
27  *     by traceback structures everywhere else!
28  *
29  *   - Then, for each *individual* sequence, construct a fake traceback based
30  *     on *ztr. Use these tracebacks to collect statistics from the alignment
31  *     and initialize probabilities in a new model.
32  */
33 
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <math.h>
38 
39 #include "version.h"
40 #include "squid.h"
41 #include "structs.h"
42 #include "funcs.h"
43 
44 #ifdef MEMDEBUG
45 #include "dbmalloc.h"
46 #endif
47 
48 
49 static void mixy(char **aseq, int nseq, int alen, int ***ret_mixy);
50 static void free_mixy(int **mxy, int acol);
51 static void zfill(int **mxy, int acol, int ***ret_zmat, double *ret_sum);
52 static void free_zmat(int **zmat, int acol);
53 static void ztrace(int **zmat, double *gapfq, double gapthresh, int **mxy,
54 		   int acol, struct trace_s **ret_ztr);
55 
56 #ifdef DEBUG
57 static void dump_mixy(int **mxy, int alen);
58 static void dump_zmat(int **zmat, int alen);
59 static void dump_ztr(struct trace_s *ztr);
60 #endif /* DEBUG */
61 
62 /* Function: Fastmodelmaker()
63  *
64  * Purpose:  Given a sequence alignment, construct a reasonable
65  *           starting model. Returns the model, as well as numbers
66  *           for the primary and secondary structure information
67  *           content of the alignment.
68  *
69  * Args:     aseqs       - sequence alignment.
70  *           ainfo       - info about the alignment
71  *           nseq        - number of sequences
72  *           prior       - prior distributions for CM construction
73  *           gapthresh   - over this fraction of gaps, assign to INS
74  *           ret_secinfo - RETURN: sec structure info content (bits) (maybe NULL)
75  *           ret_cm      - RETURN: new model                         (maybe NULL)
76  *           ret_mtr     - RETURN: master traceback for aseqs        (maybe NULL)
77  *
78  * Return:   1 on success, 0 on failure.
79  */
80 int
Fastmodelmaker(char ** aseqs,AINFO * ainfo,int nseq,struct prior_s * prior,double gapthresh,double * ret_secinfo,struct cm_s ** ret_cm,struct trace_s ** ret_mtr)81 Fastmodelmaker(char **aseqs, AINFO *ainfo, int nseq, struct prior_s *prior,
82 	       double gapthresh, double *ret_secinfo,
83 	       struct cm_s **ret_cm, struct trace_s **ret_mtr)
84 {
85   struct cm_s *cm;
86   int        **mxy;
87   int        **zmat;
88   int          M;
89   double       secinfo;
90   double      *gapfq;           /* frequencies of gap occurrence per column */
91   struct trace_s  *ztr;
92   struct trace_s  *tr;          /* individual traceback      */
93   struct trmem_s  *pool;	/* memory pool for traceback */
94   int          idx;
95   int          gaps;
96   int          apos;
97 
98   /* Precalculate gap frequencies seen in each column of alignment
99    */
100   if ((gapfq = (double *) malloc (sizeof(double) * ainfo->alen)) == NULL)
101     Die("malloc failed, line %d file %s", __LINE__, __FILE__);
102   for (apos = 0; apos < ainfo->alen; apos++)
103     {
104       gaps = 0;
105       for (idx = 0; idx < nseq; idx++)
106 	if (isgap(aseqs[idx][apos])) gaps++;
107       gapfq[apos] = (double) gaps / (double) nseq;
108     }
109 
110 				/* build Mxy matrix */
111   mixy(aseqs, nseq, ainfo->alen, &mxy);
112 				/* dynamic programming fill of zmat, using mxy */
113   zfill(mxy, ainfo->alen, &zmat, &secinfo);
114 				/* traceback of zmat to make ztr tree */
115   ztrace(zmat, gapfq, gapthresh, mxy, ainfo->alen, &ztr);
116 /*  PrintTrace(stdout, ztr); */
117 
118   NumberMasterTrace(ztr, &M);
119 				/* allocate for a model */
120   if ((cm = AllocCM(M)) == NULL) Die("AllocCM() failed");
121   TopofyNewCM(cm, ztr);
122 
123   /* For each sequence: convert consensus tree ztr to individual fake traceback.
124    * Count traceback into new model.
125    */
126   for (idx = 0; idx < nseq; idx++)
127     {
128       Transmogrify(ztr, aseqs[idx], &tr, &pool);
129 
130       if (! TraceCount(cm, aseqs[idx],
131 		       (ainfo->sqinfo[idx].flags & SQINFO_WGT) ?
132 		       ainfo->sqinfo[idx].weight : 1.0,
133 		       tr))
134 	Die("TraceCount() failed");
135 
136       FreeTrace(tr, pool);
137     }
138 
139   if (! VerifyCM(cm))
140     Die("Bad cm after trace counts.");
141 
142 				/* convert CM to probabilities */
143   ProbifyCM(cm, prior);
144 				/* garbage collect & return */
145   free_mixy(mxy, ainfo->alen);
146   free_zmat(zmat, ainfo->alen);
147   free(gapfq);
148 
149   if (ret_mtr != NULL)     *ret_mtr     = ztr; else FreeTrace(ztr, NULL);
150   if (ret_cm != NULL)      *ret_cm      = cm;  else FreeCM(cm);
151   if (ret_secinfo != NULL) *ret_secinfo = secinfo / (double) nseq;
152   return 1;
153 }
154 
155 
156 
157 
158 /* Function: mixy()
159  *
160  * Purpose:  given a set of N aligned sequences aseq, calculate
161  *           pairwise covariances (mutual information). ret_mixy
162  *           is allocated, filled, and returned, as a diagonal 2D
163  *           (NxN) matrix of values. It must be freed by
164  *           the caller. It is a lower diagonal matrix mxy[j][i],
165  *           j > i, 0..alen-1 by 0..j-1.
166  *
167  *           The values in mxy are integers. They are the average
168  *           secondary structure information content (i.e. weighted for
169  *           the number of pairs actually occurring in columns i,j)
170  *           in bits, to two decimal places (i.e. info*100).
171  *
172  * Returns:  mxy, which must be free'd by caller with free_mixy().
173  */
174 static void
mixy(char ** aseq,int nseq,int alen,int *** ret_mxy)175 mixy(char    **aseq,            /* array of aligned sequences, flushed right  */
176      int       nseq,		/* number of aligned sequences */
177      int       alen,		/* length of each sequence (all the same)  */
178      int    ***ret_mxy)         /* RETURN: mxy array           */
179 {
180   int    **mxy;                 /* RETURN: diagonal covariance matrix  */
181   float   fx[ALPHASIZE];        /* singlet frequency vector            */
182   float   fy[ALPHASIZE];	/* another singlet frequency vector    */
183   float   fxy[ALPHASIZE][ALPHASIZE]; /* pairwise frequency 2D array    */
184   int     idx;			/* counter for sequences               */
185   int     i, j;			/* counters for columns x,y            */
186   int     symi, symj;		/* counters for symbols                */
187   int     pairs;		/* counter for pairs in which there are no gaps */
188   long    test;
189 
190   /* Allocate for mxy
191    */
192   if ((mxy = (int **) malloc (alen * sizeof(int *))) == NULL)
193     Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
194   for (j = 1; j < alen; j++)
195     if ((mxy[j] = (int *) malloc (j * sizeof(int))) == NULL)
196       Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
197 
198   test = (long) mxy[20];
199 
200   /* calculate mxy
201    */
202   for (j = 1; j < alen; j++)
203     for (i = 0; i < j; i++)
204       {
205 				/* zero counter array */
206 	for (symj = 0; symj < ALPHASIZE; symj++)
207 	  {
208 	    fx[symj] = fy[symj] = 0.0;
209 	    for (symi = 0; symi < ALPHASIZE; symi++)
210 	      fxy[symj][symi] = 0.0;
211 	  }
212 				/* count symbols in a column */
213 	pairs = 0;
214 	for (idx = 0; idx < nseq; idx++)
215 	  {
216 	    if (isgap(aseq[idx][i]) || isgap(aseq[idx][j]))
217 	      continue;
218 
219 	    symi = SymbolIndex(aseq[idx][i]);
220 	    symj = SymbolIndex(aseq[idx][j]);
221 
222 	    fx[symi] += 1.0;
223 	    fy[symj] += 1.0;
224 	    fxy[symi][symj] += 1.0;
225 	    pairs++;
226 	  }
227 
228 				/* convert to frequencies */
229 	if (pairs > 0)
230 	  for (symi = 0; symi < ALPHASIZE; symi++)
231 	    {
232 	      fx[symi] /= (float) pairs;
233 	      fy[symi] /= (float) pairs;
234 	      for (symj = 0; symj < ALPHASIZE; symj++)
235 		fxy[symi][symj] /= (float) pairs;
236 	    }
237 
238 	/* calculate mxy. 144.269504 is a conversion of ln's into
239          * bits * 100: i.e. 100 * (1/log(2))
240 	 */
241 	mxy[j][i] = 0;
242 	for (symi = 0; symi < ALPHASIZE; symi++)
243 	  for (symj = 0; symj < ALPHASIZE; symj++)
244 	    {
245 	      if (fxy[symi][symj] > 0.0)
246 		mxy[j][i] += (int) (144.269504 * fxy[symi][symj] *
247 				    log((fxy[symi][symj] / (fx[symi] * fy[symj]))));
248 	    }
249 
250 	/* Sat Jul 17 22:17:17 1993:  We weight by pairs to get an expected score
251 	 * over all the sequences. Fixes a problem that columns with few symbols
252          * could dominate the calculation just because of noise.
253 	 */
254 	mxy[j][i] =  (mxy[j][i] * pairs) / nseq;
255       }
256 
257   /* dump debugging info
258    */
259 
260 #ifdef DEBUG
261   dump_mixy(mxy, alen);
262 #endif
263   *ret_mxy = mxy;
264 }
265 
266 
267 
268 /* Function: free_mixy()
269  *
270  * Purpose:  free the space allocated for a flipped diagonal
271  *           covariance matrix. To do this we also need to
272  *           know alen, the number of columns in the starting
273  *           sequence alignment.
274  *
275  * Returns:  (void)
276  */
277 static void
free_mixy(int ** mxy,int alen)278 free_mixy(int    **mxy,
279 	  int      alen)
280 {
281   int j;
282 
283   for (j = 1; j < alen; j++)
284     free(mxy[j]);
285   free(mxy);
286 }
287 
288 
289 
290 
291 /* Function: zfill()
292  *
293  * Purpose:  Calculate the optimal structure for a covariance matrix
294  *           produced by mixy(). Uses a way-simplified form of the
295  *           Zuker/Nussinov dynamic programming RNA folding algorithm
296  *           to find the structure which a) the emitted pairs sum
297  *           to a maximum number of bits of covariance and b)
298  *           has no overlapping chords (no pseudoknots). The dynamic
299  *           programming matrix is allocated, filled, and returned.
300  *
301  * Returns:  ret_zmat is returned thru a passed pointer; it must be
302  *           free'd by the caller using free_zmat().
303  */
304 static void
zfill(int ** mxy,int acol,int *** ret_zmat,double * ret_sum)305 zfill(int     **mxy,             /* diagonal covariance matrix from mixy()    */
306       int       acol,		 /* size of mxy; number of aligned columns    */
307       int    ***ret_zmat,        /* RETURN: filled dynamic programming matrix */
308       double   *ret_sum)	 /* RETURN: total sum of Mxy for tree (bits)  */
309 {
310   int    **zmat;
311   int      i, j;
312   int      diff;
313   int      mid;
314 
315   /* Allocations.
316    * zmat is a flipped diagonal matrix, inclusive of the diagonal;
317    *  positions on both axes are numbered 0..acol-1.
318    */
319   if ((zmat = (int **) malloc (acol * sizeof (int *))) == NULL)
320     Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
321   for (j = 0; j < acol; j++)
322     if ((zmat[j] = (int *) malloc ((j+1) * sizeof(int))) == NULL)
323       Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
324 
325   /* Initialization.
326    * We initialize the diagonal to 0.
327    */
328   for (j = 0; j < acol; j++)
329     zmat[j][j] = 0;
330 
331   /* Dynamic programming stage.
332    * Our recursion is:
333    *    Sij = max { Si+1,j  (emit left, no covariance)
334    *                Si,j-1  (emit right, no covariance)
335    *                Si+1,j-1 + mxy[j][i].
336    *                max over mid: Si,mid + Smid+1,j (bifurcation)
337    *                }
338    */
339   for (diff = 1; diff < acol; diff++)
340     for (i = 0; (j = i+diff) < acol; i++)
341       {
342 	if (j-1 >= i+1)
343 	  zmat[j][i] = zmat[j-1][i+1] + mxy[j][i];
344 	else
345 	  zmat[j][i] = mxy[j][i];
346 	if (zmat[j][i+1] > zmat[j][i])
347 	  zmat[j][i] = zmat[j][i+1];
348 	if (zmat[j-1][i] > zmat[j][i])
349 	  zmat[j][i] = zmat[j-1][i];
350 	for (mid = i+1; mid < j-1; mid++)
351 	  if (zmat[mid][i] + zmat[j][mid+1] > zmat[j][i])
352 	    zmat[j][i] = zmat[mid][i] + zmat[j][mid+1];
353       }
354 
355   /* Return
356    */
357 #ifdef DEBUG
358   dump_zmat(zmat, acol);
359 #endif
360 
361   *ret_sum  = (double) zmat[acol-1][0] / 100.0;
362   *ret_zmat = zmat;
363 }
364 
365 
366 static void
free_zmat(int ** zmat,int acol)367 free_zmat(int    **zmat,
368 	  int      acol)
369 {
370   int j;
371 
372   for (j = 0; j < acol; j++)
373     free(zmat[j]);
374   free(zmat);
375 }
376 
377 
378 
379 /* Function: ztrace()
380  *
381  * Purpose:  Traceback through the dynamic programming matrix constructed
382  *           by zfill(). Constructs a dynamic binary tree (ztr) of ztrace_s
383  *           structures, which keep track of both topology and the
384  *           order in which various aligned columns are emitted.
385  *
386  *           ztr ends up being the "shell" or template upon which the
387  *           final model is built. ztr captures the branching structure
388  *           of the model tree.
389  *
390  *           Inserts are dealt with at this point. Columns with a gap frequency
391  *           exceeding gapthresh are excluded from ztr. The final tree ztr contains
392  *           MATR, MATL, MATP nodes only (with BEGIN and BIFURC of course).
393  *
394  * Data:     ztr: a traceback tree.
395  *                emitl = index of column emitted left (0..acol-1)
396  *                        or -1
397  *                emitr = index of column emitted right (0..acol-1)
398  *                        or -1
399  *                nodeidx = index of node in new model
400  *                type   = type of node (MATP_NODE, etc.)
401  *
402  * Return:   ret_ztr is allocated here and must be free'd by the caller.
403  */
404 static void
ztrace(int ** zmat,double * gapfq,double gapthresh,int ** mxy,int acol,struct trace_s ** ret_ztr)405 ztrace(int    **zmat,           /* dynamic programming matrix from zfill()  */
406        double  *gapfq,          /* frequencies of gaps in columns 0..acol-1 */
407        double   gapthresh,      /* above this, column is INS-generated      */
408        int    **mxy,            /* the covariance matrix from mixy()        */
409        int      acol,		/* number of aligned columns (size of zmat) */
410        struct trace_s **ret_ztr)/* RETURN: binary tree of best structure    */
411 {
412   struct trace_s *ztr;          /* binary tree of best structure       */
413   struct trace_s *curr_ztr;     /* ptr to node of ztr we're working on */
414   struct tracestack_s *dolist;  /* pushdown stack of active ztr nodes  */
415   int i,j;			/* coords in zmat (0..acol-1)          */
416   int mid;                      /* midpoint of a bifurcation           */
417 
418   /* Initialize.
419    * Start at i = 0, j = acol-1 and work towards diagonal.
420    */
421   InitTrace(&ztr, NULL);        /* start a trace tree */
422   dolist  = InitTracestack();	/* start a stack for traversing the trace tree */
423 				/* start with root aligned to 0..acol-1 */
424   curr_ztr = AttachTrace(ztr, NULL, 0, acol-1, -1, ROOT_NODE);
425   curr_ztr = AttachTrace(curr_ztr, NULL, 0, acol-1, -1, -1);
426   PushTracestack(dolist, curr_ztr);
427 
428   while ((curr_ztr = PopTracestack(dolist)) != NULL)
429     {
430 				/* where we are now in the traceback. */
431       i = curr_ztr->emitl;
432       j = curr_ztr->emitr;
433 
434 				/* dummy END state on trace tree leaves. */
435       if (i > j) curr_ztr->type = uEND_ST; /* never executes. */
436 
437 				/* watch out for diagonal, where j-1,i+1 is nonsense */
438       else if (i == j)		/* default to push-left; i is explained */
439 	{
440 	  curr_ztr->type  = MATL_NODE;
441 	  curr_ztr->nxtl->emitl = i+1;
442 	  curr_ztr->nxtl->emitr = j;
443 	}
444 
445       else if (j-1 == i && zmat[j][i] == mxy[j][i])
446 	{
447 	  curr_ztr->type = MATP_NODE;
448 	  curr_ztr->nxtl->emitl = i+1;
449 	  curr_ztr->nxtl->emitr = j-1;
450 	}
451 
452       else if (zmat[j][i] == zmat[j-1][i+1] + mxy[j][i])
453 	{
454 	  curr_ztr->type = MATP_NODE;
455 	  if (j-1 >= i+1)
456 	    PushTracestack(dolist, AttachTrace(curr_ztr, NULL, i+1, j-1, -1,-1));
457 	  else
458 	    {
459 	      curr_ztr->nxtl->emitl = i+1;
460 	      curr_ztr->nxtl->emitr = j-1;
461 	    }
462 	}
463       else if (zmat[j][i] == zmat[j][i+1])
464 	{
465 	  curr_ztr->type  = MATL_NODE;
466 	  if ( j >= i+1)
467 	    PushTracestack(dolist, AttachTrace(curr_ztr, NULL, i+1, j, -1,-1));
468 	  else
469 	    {
470 	      curr_ztr->nxtl->emitl = i+1;
471 	      curr_ztr->nxtl->emitr = j;
472 	    }
473 	}
474 
475       else if (zmat[j][i] == zmat[j-1][i])
476 	{
477 	  curr_ztr->type = MATR_NODE;
478 	  if ( j-1 >= i)
479 	    PushTracestack(dolist, AttachTrace(curr_ztr, NULL, i, j-1, -1,-1));
480 	  else
481 	    {
482 	      curr_ztr->nxtl->emitl = i;
483 	      curr_ztr->nxtl->emitr = j-1;
484 	    }
485 	}
486 
487       else
488 	{
489 	  for (mid = i+1; mid < j-1; mid++)
490 	    if (zmat[j][i] == zmat[mid][i] + zmat[j][mid+1])
491 	      {
492 		struct trace_s *branch;
493 
494 				/* the current node is a bifurc node */
495 		curr_ztr->type  = BIFURC_NODE;
496 
497 				/* it will connect to BEGIN-SEGMENT
498 				 * nodes on either side, which are followed
499 				 * by normal segments again */
500 				/* right branch */
501 		branch = AttachTrace(curr_ztr, NULL, mid+1, j, -1, BEGINR_NODE);
502 		if (mid+1 <= j)
503 		  PushTracestack(dolist, AttachTrace(branch, NULL, mid+1, j, -1,-1));
504 		else
505 		  { branch->nxtl->emitl = mid+1; branch->nxtl->emitr = j; }
506 				/* left branch */
507 		branch = AttachTrace(curr_ztr, NULL, i, mid, -1, BEGINL_NODE);
508 		if (i <= mid)
509 		  PushTracestack(dolist, AttachTrace(branch, NULL, i, mid, -1,-1));
510 		else
511 		  { branch->nxtl->emitl = i; branch->nxtl->emitr = mid; }
512 		break;
513 	      }
514 	}
515 
516 				/* clean up current node: deal with insertions */
517       if (curr_ztr->type == MATP_NODE)
518 	{
519 	  if (gapfq[i] > gapthresh && gapfq[j] > gapthresh)
520 	    DeleteTracenode(curr_ztr, NULL);
521 
522 	  else if (gapfq[i] > gapthresh)
523 	    curr_ztr->type = MATR_NODE;
524 
525 	  else if (gapfq[j] > gapthresh)
526 	    curr_ztr->type = MATL_NODE;
527 	}
528       else if ( (curr_ztr->type == MATR_NODE && gapfq[j] > gapthresh) ||
529 	        (curr_ztr->type == MATL_NODE && gapfq[i] > gapthresh))
530 	DeleteTracenode(curr_ztr, NULL);
531     }
532 
533   FreeTracestack(dolist);
534 
535   *ret_ztr = ztr;
536 }
537 
538 
539 
540 
541 #ifdef DEBUG
542 static void
dump_mixy(int ** mxy,int alen)543 dump_mixy(int    **mxy,
544 	  int      alen)
545 {
546   int i, j;
547 
548   for (j = 1; j < alen; j++)
549     {
550       for (i = 0; i < j; i++)
551 	printf("%6d", mxy[j][i]);
552       puts("\n");
553     }
554 }
555 
556 static void
dump_zmat(int ** zmat,int alen)557 dump_zmat(int    **zmat,
558 	  int      alen)
559 {
560   int i, j;
561 
562   for (j = 0; j < alen; j++)
563     {
564       for (i = 0; i <= j; i++)
565 	printf("%6d", zmat[j][i]);
566       puts("\n");
567     }
568 }
569 
570 static void
dump_ztr(struct trace_s * ztr)571 dump_ztr(struct trace_s *ztr)
572 {
573   struct tracestack_s *dolist;
574   struct trace_s      *curr;
575 
576   dolist = InitTracestack();
577   PushTracestack(dolist, ztr->nxtl);
578 
579   while ((curr = PopTracestack(dolist)) != NULL)
580     {
581       printf("## ZTR STATE %#x\n", curr);
582       printf("emitl    : %d\n", curr->emitl);
583       printf("emitr    : %d\n", curr->emitr);
584       printf("nodeidx  : %d\n", curr->nodeidx);
585       printf("type     : %d\n", curr->type);
586       if (curr->nxtl != NULL)
587 	printf("nxtl     : %#x\n", curr->nxtl);
588       else
589 	printf("nxtl     : NULL\n");
590       if (curr->nxtr != NULL)
591 	printf("nxtr     : %#x\n", curr->nxtr);
592       else
593 	printf("nxtr     : NULL\n");
594 
595       if (curr->nxtr != NULL)
596 	PushTracestack(dolist, curr->nxtr);
597       if (curr->nxtl != NULL)
598 	PushTracestack(dolist, curr->nxtl);
599     }
600   FreeTracestack(dolist);
601 }
602 #endif
603 
604 
605