1 /* fastmodelmaker.c
2 *
3 * Construct a covariance model from an alignment, using an approximate
4 * but fast algorithm. Complexity is N^2 in memory, N^3 in time. Get
5 * back a covariance model, and information contents in bits for
6 * both primary and secondary structure.
7 *
8 * - use Stormo/Gutell MIXY algorithm to calculate pairwise covariances
9 * for all aligned column pairs i,j. This produces the matrix **mxy;
10 * it is indexed 0..alen-1 by 0..alen-1, as a flipped diagonal matrix
11 * mxy[j][i], j > i. The diagonal (i == j) is unused (and unallocated
12 * for).
13 *
14 * - use a quick and dirty Zuker-like algorithm to reduce this N^2
15 * matrix to the optimal model of non-overlapping chords. This
16 * produces the matrix **zmat. zmat is a flipped diagonal matrix,
17 * zmat[j][i], j >= i. The diagonal (i==j) is initialized to zero.
18 * All other cells take on positive values, summing covariances.
19 *
20 * - traceback thru the zmat matrix and construct a dynamic binary
21 * tree *ztr, excluding inserted columns. *ztr
22 * is constructed much the same as a normal traceback tree is constructed.
23 * emitl and emitr store the 0..alen-1 indices of the column this node is
24 * responsible for, even for BEGIN and BIFURC nodes. type is
25 * a node type (MATP_NODE), not a state type.
26 * Note that these are exceptions to the indexing scheme used
27 * by traceback structures everywhere else!
28 *
29 * - Then, for each *individual* sequence, construct a fake traceback based
30 * on *ztr. Use these tracebacks to collect statistics from the alignment
31 * and initialize probabilities in a new model.
32 */
33
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <math.h>
38
39 #include "version.h"
40 #include "squid.h"
41 #include "structs.h"
42 #include "funcs.h"
43
44 #ifdef MEMDEBUG
45 #include "dbmalloc.h"
46 #endif
47
48
49 static void mixy(char **aseq, int nseq, int alen, int ***ret_mixy);
50 static void free_mixy(int **mxy, int acol);
51 static void zfill(int **mxy, int acol, int ***ret_zmat, double *ret_sum);
52 static void free_zmat(int **zmat, int acol);
53 static void ztrace(int **zmat, double *gapfq, double gapthresh, int **mxy,
54 int acol, struct trace_s **ret_ztr);
55
56 #ifdef DEBUG
57 static void dump_mixy(int **mxy, int alen);
58 static void dump_zmat(int **zmat, int alen);
59 static void dump_ztr(struct trace_s *ztr);
60 #endif /* DEBUG */
61
62 /* Function: Fastmodelmaker()
63 *
64 * Purpose: Given a sequence alignment, construct a reasonable
65 * starting model. Returns the model, as well as numbers
66 * for the primary and secondary structure information
67 * content of the alignment.
68 *
69 * Args: aseqs - sequence alignment.
70 * ainfo - info about the alignment
71 * nseq - number of sequences
72 * prior - prior distributions for CM construction
73 * gapthresh - over this fraction of gaps, assign to INS
74 * ret_secinfo - RETURN: sec structure info content (bits) (maybe NULL)
75 * ret_cm - RETURN: new model (maybe NULL)
76 * ret_mtr - RETURN: master traceback for aseqs (maybe NULL)
77 *
78 * Return: 1 on success, 0 on failure.
79 */
80 int
Fastmodelmaker(char ** aseqs,AINFO * ainfo,int nseq,struct prior_s * prior,double gapthresh,double * ret_secinfo,struct cm_s ** ret_cm,struct trace_s ** ret_mtr)81 Fastmodelmaker(char **aseqs, AINFO *ainfo, int nseq, struct prior_s *prior,
82 double gapthresh, double *ret_secinfo,
83 struct cm_s **ret_cm, struct trace_s **ret_mtr)
84 {
85 struct cm_s *cm;
86 int **mxy;
87 int **zmat;
88 int M;
89 double secinfo;
90 double *gapfq; /* frequencies of gap occurrence per column */
91 struct trace_s *ztr;
92 struct trace_s *tr; /* individual traceback */
93 struct trmem_s *pool; /* memory pool for traceback */
94 int idx;
95 int gaps;
96 int apos;
97
98 /* Precalculate gap frequencies seen in each column of alignment
99 */
100 if ((gapfq = (double *) malloc (sizeof(double) * ainfo->alen)) == NULL)
101 Die("malloc failed, line %d file %s", __LINE__, __FILE__);
102 for (apos = 0; apos < ainfo->alen; apos++)
103 {
104 gaps = 0;
105 for (idx = 0; idx < nseq; idx++)
106 if (isgap(aseqs[idx][apos])) gaps++;
107 gapfq[apos] = (double) gaps / (double) nseq;
108 }
109
110 /* build Mxy matrix */
111 mixy(aseqs, nseq, ainfo->alen, &mxy);
112 /* dynamic programming fill of zmat, using mxy */
113 zfill(mxy, ainfo->alen, &zmat, &secinfo);
114 /* traceback of zmat to make ztr tree */
115 ztrace(zmat, gapfq, gapthresh, mxy, ainfo->alen, &ztr);
116 /* PrintTrace(stdout, ztr); */
117
118 NumberMasterTrace(ztr, &M);
119 /* allocate for a model */
120 if ((cm = AllocCM(M)) == NULL) Die("AllocCM() failed");
121 TopofyNewCM(cm, ztr);
122
123 /* For each sequence: convert consensus tree ztr to individual fake traceback.
124 * Count traceback into new model.
125 */
126 for (idx = 0; idx < nseq; idx++)
127 {
128 Transmogrify(ztr, aseqs[idx], &tr, &pool);
129
130 if (! TraceCount(cm, aseqs[idx],
131 (ainfo->sqinfo[idx].flags & SQINFO_WGT) ?
132 ainfo->sqinfo[idx].weight : 1.0,
133 tr))
134 Die("TraceCount() failed");
135
136 FreeTrace(tr, pool);
137 }
138
139 if (! VerifyCM(cm))
140 Die("Bad cm after trace counts.");
141
142 /* convert CM to probabilities */
143 ProbifyCM(cm, prior);
144 /* garbage collect & return */
145 free_mixy(mxy, ainfo->alen);
146 free_zmat(zmat, ainfo->alen);
147 free(gapfq);
148
149 if (ret_mtr != NULL) *ret_mtr = ztr; else FreeTrace(ztr, NULL);
150 if (ret_cm != NULL) *ret_cm = cm; else FreeCM(cm);
151 if (ret_secinfo != NULL) *ret_secinfo = secinfo / (double) nseq;
152 return 1;
153 }
154
155
156
157
158 /* Function: mixy()
159 *
160 * Purpose: given a set of N aligned sequences aseq, calculate
161 * pairwise covariances (mutual information). ret_mixy
162 * is allocated, filled, and returned, as a diagonal 2D
163 * (NxN) matrix of values. It must be freed by
164 * the caller. It is a lower diagonal matrix mxy[j][i],
165 * j > i, 0..alen-1 by 0..j-1.
166 *
167 * The values in mxy are integers. They are the average
168 * secondary structure information content (i.e. weighted for
169 * the number of pairs actually occurring in columns i,j)
170 * in bits, to two decimal places (i.e. info*100).
171 *
172 * Returns: mxy, which must be free'd by caller with free_mixy().
173 */
174 static void
mixy(char ** aseq,int nseq,int alen,int *** ret_mxy)175 mixy(char **aseq, /* array of aligned sequences, flushed right */
176 int nseq, /* number of aligned sequences */
177 int alen, /* length of each sequence (all the same) */
178 int ***ret_mxy) /* RETURN: mxy array */
179 {
180 int **mxy; /* RETURN: diagonal covariance matrix */
181 float fx[ALPHASIZE]; /* singlet frequency vector */
182 float fy[ALPHASIZE]; /* another singlet frequency vector */
183 float fxy[ALPHASIZE][ALPHASIZE]; /* pairwise frequency 2D array */
184 int idx; /* counter for sequences */
185 int i, j; /* counters for columns x,y */
186 int symi, symj; /* counters for symbols */
187 int pairs; /* counter for pairs in which there are no gaps */
188 long test;
189
190 /* Allocate for mxy
191 */
192 if ((mxy = (int **) malloc (alen * sizeof(int *))) == NULL)
193 Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
194 for (j = 1; j < alen; j++)
195 if ((mxy[j] = (int *) malloc (j * sizeof(int))) == NULL)
196 Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
197
198 test = (long) mxy[20];
199
200 /* calculate mxy
201 */
202 for (j = 1; j < alen; j++)
203 for (i = 0; i < j; i++)
204 {
205 /* zero counter array */
206 for (symj = 0; symj < ALPHASIZE; symj++)
207 {
208 fx[symj] = fy[symj] = 0.0;
209 for (symi = 0; symi < ALPHASIZE; symi++)
210 fxy[symj][symi] = 0.0;
211 }
212 /* count symbols in a column */
213 pairs = 0;
214 for (idx = 0; idx < nseq; idx++)
215 {
216 if (isgap(aseq[idx][i]) || isgap(aseq[idx][j]))
217 continue;
218
219 symi = SymbolIndex(aseq[idx][i]);
220 symj = SymbolIndex(aseq[idx][j]);
221
222 fx[symi] += 1.0;
223 fy[symj] += 1.0;
224 fxy[symi][symj] += 1.0;
225 pairs++;
226 }
227
228 /* convert to frequencies */
229 if (pairs > 0)
230 for (symi = 0; symi < ALPHASIZE; symi++)
231 {
232 fx[symi] /= (float) pairs;
233 fy[symi] /= (float) pairs;
234 for (symj = 0; symj < ALPHASIZE; symj++)
235 fxy[symi][symj] /= (float) pairs;
236 }
237
238 /* calculate mxy. 144.269504 is a conversion of ln's into
239 * bits * 100: i.e. 100 * (1/log(2))
240 */
241 mxy[j][i] = 0;
242 for (symi = 0; symi < ALPHASIZE; symi++)
243 for (symj = 0; symj < ALPHASIZE; symj++)
244 {
245 if (fxy[symi][symj] > 0.0)
246 mxy[j][i] += (int) (144.269504 * fxy[symi][symj] *
247 log((fxy[symi][symj] / (fx[symi] * fy[symj]))));
248 }
249
250 /* Sat Jul 17 22:17:17 1993: We weight by pairs to get an expected score
251 * over all the sequences. Fixes a problem that columns with few symbols
252 * could dominate the calculation just because of noise.
253 */
254 mxy[j][i] = (mxy[j][i] * pairs) / nseq;
255 }
256
257 /* dump debugging info
258 */
259
260 #ifdef DEBUG
261 dump_mixy(mxy, alen);
262 #endif
263 *ret_mxy = mxy;
264 }
265
266
267
268 /* Function: free_mixy()
269 *
270 * Purpose: free the space allocated for a flipped diagonal
271 * covariance matrix. To do this we also need to
272 * know alen, the number of columns in the starting
273 * sequence alignment.
274 *
275 * Returns: (void)
276 */
277 static void
free_mixy(int ** mxy,int alen)278 free_mixy(int **mxy,
279 int alen)
280 {
281 int j;
282
283 for (j = 1; j < alen; j++)
284 free(mxy[j]);
285 free(mxy);
286 }
287
288
289
290
291 /* Function: zfill()
292 *
293 * Purpose: Calculate the optimal structure for a covariance matrix
294 * produced by mixy(). Uses a way-simplified form of the
295 * Zuker/Nussinov dynamic programming RNA folding algorithm
296 * to find the structure which a) the emitted pairs sum
297 * to a maximum number of bits of covariance and b)
298 * has no overlapping chords (no pseudoknots). The dynamic
299 * programming matrix is allocated, filled, and returned.
300 *
301 * Returns: ret_zmat is returned thru a passed pointer; it must be
302 * free'd by the caller using free_zmat().
303 */
304 static void
zfill(int ** mxy,int acol,int *** ret_zmat,double * ret_sum)305 zfill(int **mxy, /* diagonal covariance matrix from mixy() */
306 int acol, /* size of mxy; number of aligned columns */
307 int ***ret_zmat, /* RETURN: filled dynamic programming matrix */
308 double *ret_sum) /* RETURN: total sum of Mxy for tree (bits) */
309 {
310 int **zmat;
311 int i, j;
312 int diff;
313 int mid;
314
315 /* Allocations.
316 * zmat is a flipped diagonal matrix, inclusive of the diagonal;
317 * positions on both axes are numbered 0..acol-1.
318 */
319 if ((zmat = (int **) malloc (acol * sizeof (int *))) == NULL)
320 Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
321 for (j = 0; j < acol; j++)
322 if ((zmat[j] = (int *) malloc ((j+1) * sizeof(int))) == NULL)
323 Die("Memory allocation failure at %s line %d", __FILE__, __LINE__);
324
325 /* Initialization.
326 * We initialize the diagonal to 0.
327 */
328 for (j = 0; j < acol; j++)
329 zmat[j][j] = 0;
330
331 /* Dynamic programming stage.
332 * Our recursion is:
333 * Sij = max { Si+1,j (emit left, no covariance)
334 * Si,j-1 (emit right, no covariance)
335 * Si+1,j-1 + mxy[j][i].
336 * max over mid: Si,mid + Smid+1,j (bifurcation)
337 * }
338 */
339 for (diff = 1; diff < acol; diff++)
340 for (i = 0; (j = i+diff) < acol; i++)
341 {
342 if (j-1 >= i+1)
343 zmat[j][i] = zmat[j-1][i+1] + mxy[j][i];
344 else
345 zmat[j][i] = mxy[j][i];
346 if (zmat[j][i+1] > zmat[j][i])
347 zmat[j][i] = zmat[j][i+1];
348 if (zmat[j-1][i] > zmat[j][i])
349 zmat[j][i] = zmat[j-1][i];
350 for (mid = i+1; mid < j-1; mid++)
351 if (zmat[mid][i] + zmat[j][mid+1] > zmat[j][i])
352 zmat[j][i] = zmat[mid][i] + zmat[j][mid+1];
353 }
354
355 /* Return
356 */
357 #ifdef DEBUG
358 dump_zmat(zmat, acol);
359 #endif
360
361 *ret_sum = (double) zmat[acol-1][0] / 100.0;
362 *ret_zmat = zmat;
363 }
364
365
366 static void
free_zmat(int ** zmat,int acol)367 free_zmat(int **zmat,
368 int acol)
369 {
370 int j;
371
372 for (j = 0; j < acol; j++)
373 free(zmat[j]);
374 free(zmat);
375 }
376
377
378
379 /* Function: ztrace()
380 *
381 * Purpose: Traceback through the dynamic programming matrix constructed
382 * by zfill(). Constructs a dynamic binary tree (ztr) of ztrace_s
383 * structures, which keep track of both topology and the
384 * order in which various aligned columns are emitted.
385 *
386 * ztr ends up being the "shell" or template upon which the
387 * final model is built. ztr captures the branching structure
388 * of the model tree.
389 *
390 * Inserts are dealt with at this point. Columns with a gap frequency
391 * exceeding gapthresh are excluded from ztr. The final tree ztr contains
392 * MATR, MATL, MATP nodes only (with BEGIN and BIFURC of course).
393 *
394 * Data: ztr: a traceback tree.
395 * emitl = index of column emitted left (0..acol-1)
396 * or -1
397 * emitr = index of column emitted right (0..acol-1)
398 * or -1
399 * nodeidx = index of node in new model
400 * type = type of node (MATP_NODE, etc.)
401 *
402 * Return: ret_ztr is allocated here and must be free'd by the caller.
403 */
404 static void
ztrace(int ** zmat,double * gapfq,double gapthresh,int ** mxy,int acol,struct trace_s ** ret_ztr)405 ztrace(int **zmat, /* dynamic programming matrix from zfill() */
406 double *gapfq, /* frequencies of gaps in columns 0..acol-1 */
407 double gapthresh, /* above this, column is INS-generated */
408 int **mxy, /* the covariance matrix from mixy() */
409 int acol, /* number of aligned columns (size of zmat) */
410 struct trace_s **ret_ztr)/* RETURN: binary tree of best structure */
411 {
412 struct trace_s *ztr; /* binary tree of best structure */
413 struct trace_s *curr_ztr; /* ptr to node of ztr we're working on */
414 struct tracestack_s *dolist; /* pushdown stack of active ztr nodes */
415 int i,j; /* coords in zmat (0..acol-1) */
416 int mid; /* midpoint of a bifurcation */
417
418 /* Initialize.
419 * Start at i = 0, j = acol-1 and work towards diagonal.
420 */
421 InitTrace(&ztr, NULL); /* start a trace tree */
422 dolist = InitTracestack(); /* start a stack for traversing the trace tree */
423 /* start with root aligned to 0..acol-1 */
424 curr_ztr = AttachTrace(ztr, NULL, 0, acol-1, -1, ROOT_NODE);
425 curr_ztr = AttachTrace(curr_ztr, NULL, 0, acol-1, -1, -1);
426 PushTracestack(dolist, curr_ztr);
427
428 while ((curr_ztr = PopTracestack(dolist)) != NULL)
429 {
430 /* where we are now in the traceback. */
431 i = curr_ztr->emitl;
432 j = curr_ztr->emitr;
433
434 /* dummy END state on trace tree leaves. */
435 if (i > j) curr_ztr->type = uEND_ST; /* never executes. */
436
437 /* watch out for diagonal, where j-1,i+1 is nonsense */
438 else if (i == j) /* default to push-left; i is explained */
439 {
440 curr_ztr->type = MATL_NODE;
441 curr_ztr->nxtl->emitl = i+1;
442 curr_ztr->nxtl->emitr = j;
443 }
444
445 else if (j-1 == i && zmat[j][i] == mxy[j][i])
446 {
447 curr_ztr->type = MATP_NODE;
448 curr_ztr->nxtl->emitl = i+1;
449 curr_ztr->nxtl->emitr = j-1;
450 }
451
452 else if (zmat[j][i] == zmat[j-1][i+1] + mxy[j][i])
453 {
454 curr_ztr->type = MATP_NODE;
455 if (j-1 >= i+1)
456 PushTracestack(dolist, AttachTrace(curr_ztr, NULL, i+1, j-1, -1,-1));
457 else
458 {
459 curr_ztr->nxtl->emitl = i+1;
460 curr_ztr->nxtl->emitr = j-1;
461 }
462 }
463 else if (zmat[j][i] == zmat[j][i+1])
464 {
465 curr_ztr->type = MATL_NODE;
466 if ( j >= i+1)
467 PushTracestack(dolist, AttachTrace(curr_ztr, NULL, i+1, j, -1,-1));
468 else
469 {
470 curr_ztr->nxtl->emitl = i+1;
471 curr_ztr->nxtl->emitr = j;
472 }
473 }
474
475 else if (zmat[j][i] == zmat[j-1][i])
476 {
477 curr_ztr->type = MATR_NODE;
478 if ( j-1 >= i)
479 PushTracestack(dolist, AttachTrace(curr_ztr, NULL, i, j-1, -1,-1));
480 else
481 {
482 curr_ztr->nxtl->emitl = i;
483 curr_ztr->nxtl->emitr = j-1;
484 }
485 }
486
487 else
488 {
489 for (mid = i+1; mid < j-1; mid++)
490 if (zmat[j][i] == zmat[mid][i] + zmat[j][mid+1])
491 {
492 struct trace_s *branch;
493
494 /* the current node is a bifurc node */
495 curr_ztr->type = BIFURC_NODE;
496
497 /* it will connect to BEGIN-SEGMENT
498 * nodes on either side, which are followed
499 * by normal segments again */
500 /* right branch */
501 branch = AttachTrace(curr_ztr, NULL, mid+1, j, -1, BEGINR_NODE);
502 if (mid+1 <= j)
503 PushTracestack(dolist, AttachTrace(branch, NULL, mid+1, j, -1,-1));
504 else
505 { branch->nxtl->emitl = mid+1; branch->nxtl->emitr = j; }
506 /* left branch */
507 branch = AttachTrace(curr_ztr, NULL, i, mid, -1, BEGINL_NODE);
508 if (i <= mid)
509 PushTracestack(dolist, AttachTrace(branch, NULL, i, mid, -1,-1));
510 else
511 { branch->nxtl->emitl = i; branch->nxtl->emitr = mid; }
512 break;
513 }
514 }
515
516 /* clean up current node: deal with insertions */
517 if (curr_ztr->type == MATP_NODE)
518 {
519 if (gapfq[i] > gapthresh && gapfq[j] > gapthresh)
520 DeleteTracenode(curr_ztr, NULL);
521
522 else if (gapfq[i] > gapthresh)
523 curr_ztr->type = MATR_NODE;
524
525 else if (gapfq[j] > gapthresh)
526 curr_ztr->type = MATL_NODE;
527 }
528 else if ( (curr_ztr->type == MATR_NODE && gapfq[j] > gapthresh) ||
529 (curr_ztr->type == MATL_NODE && gapfq[i] > gapthresh))
530 DeleteTracenode(curr_ztr, NULL);
531 }
532
533 FreeTracestack(dolist);
534
535 *ret_ztr = ztr;
536 }
537
538
539
540
541 #ifdef DEBUG
542 static void
dump_mixy(int ** mxy,int alen)543 dump_mixy(int **mxy,
544 int alen)
545 {
546 int i, j;
547
548 for (j = 1; j < alen; j++)
549 {
550 for (i = 0; i < j; i++)
551 printf("%6d", mxy[j][i]);
552 puts("\n");
553 }
554 }
555
556 static void
dump_zmat(int ** zmat,int alen)557 dump_zmat(int **zmat,
558 int alen)
559 {
560 int i, j;
561
562 for (j = 0; j < alen; j++)
563 {
564 for (i = 0; i <= j; i++)
565 printf("%6d", zmat[j][i]);
566 puts("\n");
567 }
568 }
569
570 static void
dump_ztr(struct trace_s * ztr)571 dump_ztr(struct trace_s *ztr)
572 {
573 struct tracestack_s *dolist;
574 struct trace_s *curr;
575
576 dolist = InitTracestack();
577 PushTracestack(dolist, ztr->nxtl);
578
579 while ((curr = PopTracestack(dolist)) != NULL)
580 {
581 printf("## ZTR STATE %#x\n", curr);
582 printf("emitl : %d\n", curr->emitl);
583 printf("emitr : %d\n", curr->emitr);
584 printf("nodeidx : %d\n", curr->nodeidx);
585 printf("type : %d\n", curr->type);
586 if (curr->nxtl != NULL)
587 printf("nxtl : %#x\n", curr->nxtl);
588 else
589 printf("nxtl : NULL\n");
590 if (curr->nxtr != NULL)
591 printf("nxtr : %#x\n", curr->nxtr);
592 else
593 printf("nxtr : NULL\n");
594
595 if (curr->nxtr != NULL)
596 PushTracestack(dolist, curr->nxtr);
597 if (curr->nxtl != NULL)
598 PushTracestack(dolist, curr->nxtl);
599 }
600 FreeTracestack(dolist);
601 }
602 #endif
603
604
605