1 /* modelmaking.c
2  * Tue Oct  4 15:33:21 1994
3  *
4  * Bring together common elements of the model construction process.
5  * Also, provides EasyModelmaker() for making a model given a structure.
6  *
7  * All model makers have in common that they construct a "master" traceback
8  * for the alignment, specifying which columns are match vs. insert and
9  * how the model tree branches. This traceback is assigned a numbering
10  * system by NumberMasterTrace(), which returns the number of nodes;
11  * the caller then allocates a new CM. This new model is numbered (assigned
12  * a branching structure) by TopofyNewCM(). Then individual tracebacks
13  * are constructed from individual aligned sequences by Transmogrify().
14  * The individual tracebacks are counted into a new model with TraceCount()
15  * and the counts converted to probabilities with ProbifyCM().
16  *
17  * The master tree is a (slightly misused) trace_s structure with the following
18  * properties:
19  *     insert columns are not represented at all. Transmogrify() must deal.
20  *
21  *     emitl, emitr == 0..alen-1 coords of assigned columns. Set and valid for all
22  *                     nodes, even non-emitters. END values will be
23  *                     on the off-diagonal, emitl = emitr+1. (If this is
24  *                     not true, Transmogrify() breaks.) The trace
25  *                     construction function is responsible for this.
26  *
27  *     nodeidx == this is numbered by preorder traversal by NumberMasterTrace(),
28  *                which also numbers a model. ENDs are not explicitly
29  *                represented in a CM, so they get nodeidx = -1.
30  *
31  *     type == a number 0..6 for *node* type. (Usually this is a unique
32  *             state type identifier.) ENDs are -1. The trace construction
33  *             function is reponsible for this.
34  */
35 
36 #include <stdio.h>
37 #include <stdlib.h>
38 
39 #include "version.h"
40 #include "squid.h"
41 #include "structs.h"
42 #include "funcs.h"
43 
44 #ifdef MEMDEBUG
45 #include "dbmalloc.h"
46 #endif
47 
48 /* Function: NumberMasterTrace()
49  *
50  * Purpose:  Given a master trace for an alignment, number the trace tree
51  *           (in the nodeidx field) in preorder traversal. END nodes
52  *           will not be represented explicitly in the final CM. They
53  *           get numbered -1.
54  */
55 void
NumberMasterTrace(struct trace_s * mtr,int * ret_nodes)56 NumberMasterTrace(struct trace_s *mtr, int *ret_nodes)
57 {
58   struct trace_s      *curr;
59   struct tracestack_s *dolist;
60   int nodes = 0;
61 
62   dolist = InitTracestack();
63   PushTracestack(dolist, mtr->nxtl); /* push root onto stack */
64 
65   while ((curr = PopTracestack(dolist)) != NULL)
66     {
67       if (curr->nxtl == NULL)   /* END node */
68 	curr->nodeidx = -1;
69       else		            /* other nodes */
70 	curr->nodeidx = nodes++;
71 
72       if (curr->nxtr != NULL)  PushTracestack(dolist, curr->nxtr);
73       if (curr->nxtl != NULL)  PushTracestack(dolist, curr->nxtl);
74     }
75 
76   FreeTracestack(dolist);
77   *ret_nodes = nodes;
78 }
79 
80 
81 /* Function: TopofyNewCM()
82  *
83  * Purpose:  Given the mtr master traceback tree, which defines the
84  *           topology of the model, write the nxt and nxt2 connections
85  *           into the model. For the most part, these are already
86  *           contained in mtr thanks to NumberMasterTrace(); the
87  *           only tricky bit is converting END states from multiple
88  *           real states (in mtr) to -1 nxt flags in the cm.
89  *
90  * Return:   1 on success, 0 on failure.
91  */
92 void
TopofyNewCM(struct cm_s * cm,struct trace_s * mtr)93 TopofyNewCM(struct cm_s *cm, struct trace_s *mtr)
94 {
95   struct tracestack_s *dolist;
96   struct trace_s      *curr;
97 
98   dolist = InitTracestack();
99   PushTracestack(dolist, mtr->nxtl); /* push ROOT onto stack */
100 
101   while ((curr = PopTracestack(dolist)) != NULL)
102     {
103       if (curr->nxtl == NULL) continue; /* ignore ENDs */
104       if (curr->nxtr != NULL)		/* deal with BIFURC states */
105 	{
106 	  cm->nd[curr->nodeidx].nxt2 = curr->nxtr->nodeidx;
107 	  PushTracestack(dolist, curr->nxtr);
108 	}
109       else
110 	cm->nd[curr->nodeidx].nxt2 = -1;
111 
112 				/* watch out for curr pointing to END states. */
113       cm->nd[curr->nodeidx].type = curr->type;
114       cm->nd[curr->nodeidx].nxt = (curr->nxtl->nxtl == NULL) ? -1 : curr->nxtl->nodeidx;
115       PushTracestack(dolist, curr->nxtl);
116     }
117   FreeTracestack(dolist);
118 }
119 
120 
121 /* Function: Transmogrify()
122  *
123  * Purpose:  Given a master consensus traceback, create an individual
124  *           "fake" traceback. The fake traceback contains inserts
125  *           and converts the type field of mtr (which contains NODE
126  *           type indices) into _ST type indices, including proper
127  *           classification of nodes into DEL_ST or the various match
128  *           states depending on what aseq[idx] looks like.
129  *
130  * Args:     mtr      - master consensus traceback tree
131  *           aseq     - 0..alen-1 aligned sequence.
132  *           ret_tr   - RETURN: individual traceback
133  *           ret_pool - RETURN: memory pool for traceback
134  *
135  * Return:   (void). *ret_tr must be free'd by the caller
136  */
137 void
Transmogrify(struct trace_s * mtr,char * aseq,struct trace_s ** ret_tr,struct trmem_s ** ret_pool)138 Transmogrify(struct trace_s *mtr, char *aseq, struct trace_s **ret_tr, struct trmem_s **ret_pool)
139 {
140   struct trace_s *tr;
141   struct trmem_s *pool;
142   struct tracestack_s *mtr_stack;
143   struct tracestack_s *tr_stack;
144   struct trace_s      *curr_mtr;
145   struct trace_s      *curr_tr;
146   int                 i2,j2;
147 
148   mtr_stack = InitTracestack();
149   tr_stack  = InitTracestack();
150   InitTrace(&tr, &pool);
151 
152   /* Push ROOT onto both stacks
153    */
154   PushTracestack(mtr_stack, mtr->nxtl);
155   PushTracestack(tr_stack, AttachTrace(tr, pool, -1, -1, 0, uBEGIN_ST));
156 
157   while ((curr_mtr = PopTracestack(mtr_stack)) != NULL)
158     {
159       curr_tr = PopTracestack(tr_stack);
160 
161       switch (curr_mtr->type) {
162       case uEND_ST:
163 	DeleteTracenode(curr_tr, pool);
164 	break;
165 
166       case MATP_NODE:
167 	if (isgap(aseq[curr_mtr->emitl]))
168 	  {
169 	    if (isgap(aseq[curr_mtr->emitr])) curr_tr->type = uDEL_ST;
170 	    else                              curr_tr->type = uMATR_ST;
171 	  }
172 	else
173 	  {
174 	    if (isgap(aseq[curr_mtr->emitr])) curr_tr->type = uMATL_ST;
175 	    else                              curr_tr->type = uMATP_ST;
176 	  }
177 
178 	/* May have to deal with INSL and INSR; INSL precedes INSR
179 	 */
180 	for (i2 = curr_mtr->emitl+1; i2 < curr_mtr->nxtl->emitl; i2++)
181 	  if (!isgap(aseq[i2]))
182 	    curr_tr = AttachTrace(curr_tr, pool, i2, curr_mtr->emitr, curr_mtr->nodeidx, uINSL_ST);
183 
184 	/* May have to deal with INSR
185 	 */
186 	for (j2 = curr_mtr->emitr-1; j2 > curr_mtr->nxtl->emitr; j2--)
187 	  if (! isgap(aseq[j2]))
188 	    curr_tr = AttachTrace(curr_tr, pool, curr_mtr->nxtl->emitl, j2, curr_mtr->nodeidx, uINSR_ST);
189 	break;
190 
191       case MATL_NODE:
192 	if (isgap(aseq[curr_mtr->emitl])) curr_tr->type = uDEL_ST;
193 	else                              curr_tr->type = uMATL_ST;
194 
195 	/* May have to deal with INSL
196 	 */
197 	for (i2 = curr_mtr->emitl+1; i2 < curr_mtr->nxtl->emitl; i2++)
198 	  if (!isgap(aseq[i2]))
199 	    curr_tr = AttachTrace(curr_tr, pool, i2, curr_mtr->emitr, curr_mtr->nodeidx, uINSL_ST);
200 	break;
201 
202       case MATR_NODE:
203 	if (isgap(aseq[curr_mtr->emitr])) curr_tr->type = uDEL_ST;
204 	else                              curr_tr->type = uMATR_ST;
205 
206 	/* May have to deal with INSR */
207 	for (j2 = curr_mtr->emitr-1; j2 > curr_mtr->nxtl->emitr; j2--)
208 	  if (! isgap(aseq[j2]))
209 	    curr_tr = AttachTrace(curr_tr, pool, curr_mtr->nxtl->emitl, j2, curr_mtr->nodeidx, uINSR_ST);
210 	break;
211 
212       case BIFURC_NODE: curr_tr->type  = uBIFURC_ST;  break;
213       case BEGINL_NODE: curr_tr->type  = uBEGIN_ST;   break;
214 
215       case BEGINR_NODE:
216 	curr_tr->type  = uBEGIN_ST;
217 
218 	/* May have to deal with INSL.
219 	 * Inserts from BEGINR are *inclusive* of i
220 	 */
221 	for (i2 = curr_mtr->emitl; i2 < curr_mtr->nxtl->emitl; i2++)
222 	  if (!isgap(aseq[i2]))
223 	    curr_tr = AttachTrace(curr_tr, pool, i2, curr_mtr->emitr, curr_mtr->nodeidx, uINSL_ST);
224 
225 	break;
226 
227       case ROOT_NODE:
228 	curr_tr->type  = uBEGIN_ST;
229 
230 	/* May have to deal with INSL and INSR; note INSL precedes INSR
231 	 * inserts from root are inclusive of i
232 	 */
233 	for (i2 = curr_mtr->emitl; i2 < curr_mtr->nxtl->emitl; i2++)
234 	  if (!isgap(aseq[i2]))
235 	    curr_tr = AttachTrace(curr_tr, pool, i2, curr_mtr->emitr, curr_mtr->nodeidx, uINSL_ST);
236 
237 	/* May have to deal with INSR
238 	 */
239 	for (j2 = curr_mtr->emitr; j2 > curr_mtr->nxtl->emitr; j2--)
240 	  if (! isgap(aseq[j2]))
241 	    curr_tr = AttachTrace(curr_tr, pool, curr_mtr->nxtl->emitl, j2, curr_mtr->nodeidx, uINSR_ST);
242 	break;
243 
244       default: Die("Invalid node type %d", curr_mtr->type);
245       }
246 
247       /* Push the children onto stacks, if they're not END nodes
248        */
249       if (curr_mtr->nxtr != NULL)
250 	{
251 	  PushTracestack(mtr_stack, curr_mtr->nxtr);
252 	  PushTracestack(tr_stack, AttachTrace(curr_tr, pool, curr_mtr->nxtr->emitl, curr_mtr->nxtr->emitr,
253 					       curr_mtr->nxtr->nodeidx, curr_mtr->nxtr->type));
254 	}
255       if (curr_mtr->nxtl != NULL)
256 	{
257 	  PushTracestack(mtr_stack, curr_mtr->nxtl);
258 	  PushTracestack(tr_stack, AttachTrace(curr_tr, pool, curr_mtr->nxtl->emitl, curr_mtr->nxtl->emitr,
259 					       curr_mtr->nxtl->nodeidx, curr_mtr->nxtl->type));
260 	}
261     }
262   FreeTracestack(mtr_stack);
263   FreeTracestack(tr_stack);
264   *ret_pool = pool;
265   *ret_tr   = tr;
266 }
267 
268 
269 
270 /* Function: EasyModelmaker()
271  *
272  * Purpose:  The customer always knows best.
273  *
274  *           Construct a model given a stated structure. The structure
275  *           is provided via a "cs" (consensus sequence) line, as would
276  *           occur in an annotated SELEX file. Only > and < characters
277  *           in this line are interpreted (as base pairs).
278  *
279  *           Match vs. insert can be determined one of two ways. By default,
280  *           the assignment is made by "gapthresh"; for columns with
281  *           fractional occurence of gaps greater than this, the column
282  *           is assigned to insert. If "use_rf" is TRUE, the rf (reference)
283  *           line is interpreted as the assignment -- columns with non-space
284  *           characters in the rf line are assigned to MATCH.
285  *
286  *           Both rf and cs are provided in the ainfo structure.
287  *
288  * Args:     aseq    - aligned sequences. [0..nseq-1] by [0..alen-1]
289  *           ainfo   - info about the alignment, including alen, cs,
290  *                     and rf
291  *           nseq    - number of seqs in aseq
292  *           prior   - prior distributions for CM construction
293  *           gapthresh - over this fraction of gaps, assign column as INS
294  *           use_rf  - if TRUE, use rf field of ainfo for MAT/INS assignment
295  *           ret_cm  - RETURN: new model                      (maybe NULL)
296  *           ret_mtr - RETURN: master traceback for alignment (maybe NULL)
297  *
298  * Return:   void
299  *           cm is allocated here. FreeCM(*ret_cm).
300  *           tr is allocated here. FreeTrace() on each one, then free(*ret_tr).
301  */
302 void
EasyModelmaker(char ** aseq,AINFO * ainfo,int nseq,struct prior_s * prior,double gapthresh,int use_rf,struct cm_s ** ret_cm,struct trace_s ** ret_mtr)303 EasyModelmaker(char **aseq, AINFO *ainfo, int nseq, struct prior_s *prior,
304 	       double gapthresh, int use_rf, struct cm_s **ret_cm, struct trace_s **ret_mtr)
305 {
306   struct cm_s         *cm;	/* new covariance model                */
307   struct trace_s      *mtr;	/* master traceback tree for alignment */
308   struct trace_s      *tr;      /* individual sequence traceback tree  */
309   struct trmem_s      *pool;	/* memory pool for traceback tree      */
310   struct tracestack_s *dolist;
311   struct trace_s      *cur;
312   int   *matassign;
313   int    nodes;
314   int    idx, apos;
315   int   *ct;
316   int    i,j, nxti, nxtj;
317 
318   if (! (ainfo->flags & AINFO_CS)) Die("No cs (consensus structure) line available for that alignment.");
319 
320   /* Determine match/insert assignments
321    * matassign is 0..alen-1. Values are 1 if MAT, 0 if INS.
322    */
323   matassign = (int *) MallocOrDie(sizeof(int) * ainfo->alen);
324   if (use_rf)
325     {
326       if (! (ainfo->flags & AINFO_RF)) Die("No rf (reference coord) line available for that alignment.");
327       for (apos = 0; apos < ainfo->alen; apos++)
328 	matassign[apos] = (ainfo->rf[apos] == ' ') ? 0 : 1;
329     }
330   else
331     {
332       int gaps;
333       for (apos = 0; apos < ainfo->alen; apos++)
334 	{
335 	  for (gaps = 0, idx = 0; idx < nseq; idx++)
336 	    if (isgap(aseq[idx][apos])) gaps++;
337 	  matassign[apos] = ((double) gaps / (double) nseq > gapthresh) ? 0 : 1;
338 	}
339     }
340 
341   /* Determine a "ct" array, base-pairing partners for each position
342    */
343   if (! KHS2ct(ainfo->cs, ainfo->alen, FALSE, &ct))  Die("Consensus structure string is inconsistent");
344 
345   /* Make sure the consensus structure "ct" is consistent with the match assignments.
346    * Wipe out all structure under INS; including the base-paired
347    * partner of INS-assigned positions
348    */
349   for (apos = 0; apos < ainfo->alen; apos++)
350     if (! matassign[apos])
351       {
352 	if (ct[apos] != -1)  ct[ct[apos]] = -1;
353 	ct[apos] = -1;
354       }
355 
356   /* Construct a master traceback tree.
357    * This code is borrowed from yarn's KHS2Trace().
358    * mtr's emitl, emitr, and type are properly set by this section.
359    */
360   InitTrace(&mtr, NULL);
361   dolist = InitTracestack();
362   cur = AttachTrace(mtr, NULL, 0, ainfo->alen-1, -1, ROOT_NODE);    /* attach the root       */
363   PushTracestack(dolist, cur);
364 
365   while ((cur = PopTracestack(dolist)) != NULL)
366     {
367       i = cur->emitl;
368       j = cur->emitr;
369 
370       /* This node accounts for i..j, but we don't know how yet.
371        * Six possibilities:
372        *    i > j; this is an END state; do nothing.
373        *    this is already assigned as a BEGIN; push i,j
374        *    i is unpaired; this is a MATL state; push i+1, j
375        *    j is unpaired; this is a MATR state; push i,j-1
376        *    i,j pair to each other; this is a MATP state; push i+1,j-1
377        *    i,j pair but not to each other; this is a BIFURC state;
378        *        pick mid ip <= mid < jp; push BEGIN i,mid and working i,mid,
379        *        and push BEGIN mid+1,j and working mid+1,j
380        */
381       if (i > j) cur->type = uEND_ST;
382 
383       else if (cur->type == ROOT_NODE)
384 	{ /* try to push i,j; but deal with INSL and INSR */
385 	  for (nxti = i; nxti <= j; nxti++)    if (matassign[nxti]) break;
386 	  for (nxtj = j; nxtj >= nxti; nxtj--) if (matassign[nxtj]) break;
387 	  if (nxti <= nxtj) PushTracestack(dolist, AttachTrace(cur, NULL, nxti, nxtj, -1, uEND_ST));
388 	  else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = nxtj; } /* deal with END_ST */
389 	}
390 
391       else if (cur->type == BEGINL_NODE) /* no inserts */
392 	{
393 	  if (i <= j)
394 	    PushTracestack(dolist, AttachTrace(cur, NULL, i, j, -1, uEND_ST));
395 	  else
396 	    { cur->nxtl->emitl = nxti; cur->nxtl->emitr = j; }
397 	}
398 
399       else if (cur->type == BEGINR_NODE) /* INSL */
400 	{
401 	  for (nxti = i; nxti <= j; nxti++)    if (matassign[nxti]) break;
402 	  if (nxti <= j) PushTracestack(dolist, AttachTrace(cur, NULL, nxti, j, -1, uEND_ST));
403 	  else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = j; } /* deal with END_ST */
404 	}
405 
406       else if (ct[i] == -1) 	/* i unpaired. This is a MATL node; allow INSL */
407 	{
408 	  cur->type = MATL_NODE;
409 	  for (nxti = i+1; nxti <= j; nxti++)  if (matassign[nxti]) break;
410 	  if (nxti <= j) PushTracestack(dolist, AttachTrace(cur, NULL, nxti, j, -1, uEND_ST));
411 	  else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = j; } /* deal with END_ST */
412 	}
413 
414       else if (ct[j] == -1) 	/* j unpaired. MATR node. Deal with INSR */
415 	{
416 	  cur->type = MATR_NODE;
417 	  for (nxtj = j-1; nxtj >= i; nxtj--) if (matassign[nxtj]) break;
418 	  if (i <= nxtj) PushTracestack(dolist, AttachTrace(cur, NULL, i, nxtj, -1, uEND_ST));
419 	  else { cur->nxtl->emitl = i; cur->nxtl->emitr = nxtj; } /* deal with END_ST */
420 	}
421 
422       else if (ct[i] == j) 	/* i,j paired to each other. MATP. deal with INSL, INSR */
423 	{
424 	  cur->type = MATP_NODE;
425 	  for (nxti = i+1; nxti <= j; nxti++)    if (matassign[nxti]) break;
426 	  for (nxtj = j-1; nxtj >= nxti; nxtj--) if (matassign[nxtj]) break;
427 	  if (nxti <= nxtj) PushTracestack(dolist, AttachTrace(cur, NULL, nxti, nxtj, -1, uEND_ST));
428 	  else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = nxtj; } /* deal with END_ST */
429 	}
430 
431       else /* i,j paired but not to each other. BIFURC. no INS. */
432 	{  /* by convention, right side of bifurc deals with insert in middle */
433 	  cur->type = BIFURC_NODE;
434 	  PushTracestack(dolist, AttachTrace(cur, NULL, ct[i]+1, j, -1, BEGINR_NODE));
435 	  PushTracestack(dolist, AttachTrace(cur, NULL, i, ct[i], -1, BEGINL_NODE));
436 	}
437     }	/* while something's on dolist stack */
438   FreeTracestack(dolist);
439   free(ct);
440 
441   /* Now, do the drill for constructing a model using this master trace.
442    */
443   NumberMasterTrace(mtr, &nodes);
444   if ((cm = AllocCM(nodes)) == NULL)
445     Die("failed to allocate for new model of %d nodes\n", nodes);
446   TopofyNewCM(cm, mtr);
447 
448   for (idx = 0; idx < nseq; idx++)
449     {
450       Transmogrify(mtr, aseq[idx], &tr, &pool);
451       if (! TraceCount(cm, aseq[idx],
452 		       (ainfo->sqinfo[idx].flags & SQINFO_WGT) ? (double) ainfo->sqinfo[idx].weight : 1.0,
453 		       tr))
454 	Die("TraceCount() failed");
455       FreeTrace(tr, pool);
456     }
457   ProbifyCM(cm, prior);
458 
459   free(matassign);
460   if (ret_cm  != NULL) *ret_cm  = cm;  else FreeCM(cm);
461   if (ret_mtr != NULL) *ret_mtr = mtr; else FreeTrace(mtr, NULL);
462 }
463