1 /* modelmaking.c
2 * Tue Oct 4 15:33:21 1994
3 *
4 * Bring together common elements of the model construction process.
5 * Also, provides EasyModelmaker() for making a model given a structure.
6 *
7 * All model makers have in common that they construct a "master" traceback
8 * for the alignment, specifying which columns are match vs. insert and
9 * how the model tree branches. This traceback is assigned a numbering
10 * system by NumberMasterTrace(), which returns the number of nodes;
11 * the caller then allocates a new CM. This new model is numbered (assigned
12 * a branching structure) by TopofyNewCM(). Then individual tracebacks
13 * are constructed from individual aligned sequences by Transmogrify().
14 * The individual tracebacks are counted into a new model with TraceCount()
15 * and the counts converted to probabilities with ProbifyCM().
16 *
17 * The master tree is a (slightly misused) trace_s structure with the following
18 * properties:
19 * insert columns are not represented at all. Transmogrify() must deal.
20 *
21 * emitl, emitr == 0..alen-1 coords of assigned columns. Set and valid for all
22 * nodes, even non-emitters. END values will be
23 * on the off-diagonal, emitl = emitr+1. (If this is
24 * not true, Transmogrify() breaks.) The trace
25 * construction function is responsible for this.
26 *
27 * nodeidx == this is numbered by preorder traversal by NumberMasterTrace(),
28 * which also numbers a model. ENDs are not explicitly
29 * represented in a CM, so they get nodeidx = -1.
30 *
31 * type == a number 0..6 for *node* type. (Usually this is a unique
32 * state type identifier.) ENDs are -1. The trace construction
33 * function is reponsible for this.
34 */
35
36 #include <stdio.h>
37 #include <stdlib.h>
38
39 #include "version.h"
40 #include "squid.h"
41 #include "structs.h"
42 #include "funcs.h"
43
44 #ifdef MEMDEBUG
45 #include "dbmalloc.h"
46 #endif
47
48 /* Function: NumberMasterTrace()
49 *
50 * Purpose: Given a master trace for an alignment, number the trace tree
51 * (in the nodeidx field) in preorder traversal. END nodes
52 * will not be represented explicitly in the final CM. They
53 * get numbered -1.
54 */
55 void
NumberMasterTrace(struct trace_s * mtr,int * ret_nodes)56 NumberMasterTrace(struct trace_s *mtr, int *ret_nodes)
57 {
58 struct trace_s *curr;
59 struct tracestack_s *dolist;
60 int nodes = 0;
61
62 dolist = InitTracestack();
63 PushTracestack(dolist, mtr->nxtl); /* push root onto stack */
64
65 while ((curr = PopTracestack(dolist)) != NULL)
66 {
67 if (curr->nxtl == NULL) /* END node */
68 curr->nodeidx = -1;
69 else /* other nodes */
70 curr->nodeidx = nodes++;
71
72 if (curr->nxtr != NULL) PushTracestack(dolist, curr->nxtr);
73 if (curr->nxtl != NULL) PushTracestack(dolist, curr->nxtl);
74 }
75
76 FreeTracestack(dolist);
77 *ret_nodes = nodes;
78 }
79
80
81 /* Function: TopofyNewCM()
82 *
83 * Purpose: Given the mtr master traceback tree, which defines the
84 * topology of the model, write the nxt and nxt2 connections
85 * into the model. For the most part, these are already
86 * contained in mtr thanks to NumberMasterTrace(); the
87 * only tricky bit is converting END states from multiple
88 * real states (in mtr) to -1 nxt flags in the cm.
89 *
90 * Return: 1 on success, 0 on failure.
91 */
92 void
TopofyNewCM(struct cm_s * cm,struct trace_s * mtr)93 TopofyNewCM(struct cm_s *cm, struct trace_s *mtr)
94 {
95 struct tracestack_s *dolist;
96 struct trace_s *curr;
97
98 dolist = InitTracestack();
99 PushTracestack(dolist, mtr->nxtl); /* push ROOT onto stack */
100
101 while ((curr = PopTracestack(dolist)) != NULL)
102 {
103 if (curr->nxtl == NULL) continue; /* ignore ENDs */
104 if (curr->nxtr != NULL) /* deal with BIFURC states */
105 {
106 cm->nd[curr->nodeidx].nxt2 = curr->nxtr->nodeidx;
107 PushTracestack(dolist, curr->nxtr);
108 }
109 else
110 cm->nd[curr->nodeidx].nxt2 = -1;
111
112 /* watch out for curr pointing to END states. */
113 cm->nd[curr->nodeidx].type = curr->type;
114 cm->nd[curr->nodeidx].nxt = (curr->nxtl->nxtl == NULL) ? -1 : curr->nxtl->nodeidx;
115 PushTracestack(dolist, curr->nxtl);
116 }
117 FreeTracestack(dolist);
118 }
119
120
121 /* Function: Transmogrify()
122 *
123 * Purpose: Given a master consensus traceback, create an individual
124 * "fake" traceback. The fake traceback contains inserts
125 * and converts the type field of mtr (which contains NODE
126 * type indices) into _ST type indices, including proper
127 * classification of nodes into DEL_ST or the various match
128 * states depending on what aseq[idx] looks like.
129 *
130 * Args: mtr - master consensus traceback tree
131 * aseq - 0..alen-1 aligned sequence.
132 * ret_tr - RETURN: individual traceback
133 * ret_pool - RETURN: memory pool for traceback
134 *
135 * Return: (void). *ret_tr must be free'd by the caller
136 */
137 void
Transmogrify(struct trace_s * mtr,char * aseq,struct trace_s ** ret_tr,struct trmem_s ** ret_pool)138 Transmogrify(struct trace_s *mtr, char *aseq, struct trace_s **ret_tr, struct trmem_s **ret_pool)
139 {
140 struct trace_s *tr;
141 struct trmem_s *pool;
142 struct tracestack_s *mtr_stack;
143 struct tracestack_s *tr_stack;
144 struct trace_s *curr_mtr;
145 struct trace_s *curr_tr;
146 int i2,j2;
147
148 mtr_stack = InitTracestack();
149 tr_stack = InitTracestack();
150 InitTrace(&tr, &pool);
151
152 /* Push ROOT onto both stacks
153 */
154 PushTracestack(mtr_stack, mtr->nxtl);
155 PushTracestack(tr_stack, AttachTrace(tr, pool, -1, -1, 0, uBEGIN_ST));
156
157 while ((curr_mtr = PopTracestack(mtr_stack)) != NULL)
158 {
159 curr_tr = PopTracestack(tr_stack);
160
161 switch (curr_mtr->type) {
162 case uEND_ST:
163 DeleteTracenode(curr_tr, pool);
164 break;
165
166 case MATP_NODE:
167 if (isgap(aseq[curr_mtr->emitl]))
168 {
169 if (isgap(aseq[curr_mtr->emitr])) curr_tr->type = uDEL_ST;
170 else curr_tr->type = uMATR_ST;
171 }
172 else
173 {
174 if (isgap(aseq[curr_mtr->emitr])) curr_tr->type = uMATL_ST;
175 else curr_tr->type = uMATP_ST;
176 }
177
178 /* May have to deal with INSL and INSR; INSL precedes INSR
179 */
180 for (i2 = curr_mtr->emitl+1; i2 < curr_mtr->nxtl->emitl; i2++)
181 if (!isgap(aseq[i2]))
182 curr_tr = AttachTrace(curr_tr, pool, i2, curr_mtr->emitr, curr_mtr->nodeidx, uINSL_ST);
183
184 /* May have to deal with INSR
185 */
186 for (j2 = curr_mtr->emitr-1; j2 > curr_mtr->nxtl->emitr; j2--)
187 if (! isgap(aseq[j2]))
188 curr_tr = AttachTrace(curr_tr, pool, curr_mtr->nxtl->emitl, j2, curr_mtr->nodeidx, uINSR_ST);
189 break;
190
191 case MATL_NODE:
192 if (isgap(aseq[curr_mtr->emitl])) curr_tr->type = uDEL_ST;
193 else curr_tr->type = uMATL_ST;
194
195 /* May have to deal with INSL
196 */
197 for (i2 = curr_mtr->emitl+1; i2 < curr_mtr->nxtl->emitl; i2++)
198 if (!isgap(aseq[i2]))
199 curr_tr = AttachTrace(curr_tr, pool, i2, curr_mtr->emitr, curr_mtr->nodeidx, uINSL_ST);
200 break;
201
202 case MATR_NODE:
203 if (isgap(aseq[curr_mtr->emitr])) curr_tr->type = uDEL_ST;
204 else curr_tr->type = uMATR_ST;
205
206 /* May have to deal with INSR */
207 for (j2 = curr_mtr->emitr-1; j2 > curr_mtr->nxtl->emitr; j2--)
208 if (! isgap(aseq[j2]))
209 curr_tr = AttachTrace(curr_tr, pool, curr_mtr->nxtl->emitl, j2, curr_mtr->nodeidx, uINSR_ST);
210 break;
211
212 case BIFURC_NODE: curr_tr->type = uBIFURC_ST; break;
213 case BEGINL_NODE: curr_tr->type = uBEGIN_ST; break;
214
215 case BEGINR_NODE:
216 curr_tr->type = uBEGIN_ST;
217
218 /* May have to deal with INSL.
219 * Inserts from BEGINR are *inclusive* of i
220 */
221 for (i2 = curr_mtr->emitl; i2 < curr_mtr->nxtl->emitl; i2++)
222 if (!isgap(aseq[i2]))
223 curr_tr = AttachTrace(curr_tr, pool, i2, curr_mtr->emitr, curr_mtr->nodeidx, uINSL_ST);
224
225 break;
226
227 case ROOT_NODE:
228 curr_tr->type = uBEGIN_ST;
229
230 /* May have to deal with INSL and INSR; note INSL precedes INSR
231 * inserts from root are inclusive of i
232 */
233 for (i2 = curr_mtr->emitl; i2 < curr_mtr->nxtl->emitl; i2++)
234 if (!isgap(aseq[i2]))
235 curr_tr = AttachTrace(curr_tr, pool, i2, curr_mtr->emitr, curr_mtr->nodeidx, uINSL_ST);
236
237 /* May have to deal with INSR
238 */
239 for (j2 = curr_mtr->emitr; j2 > curr_mtr->nxtl->emitr; j2--)
240 if (! isgap(aseq[j2]))
241 curr_tr = AttachTrace(curr_tr, pool, curr_mtr->nxtl->emitl, j2, curr_mtr->nodeidx, uINSR_ST);
242 break;
243
244 default: Die("Invalid node type %d", curr_mtr->type);
245 }
246
247 /* Push the children onto stacks, if they're not END nodes
248 */
249 if (curr_mtr->nxtr != NULL)
250 {
251 PushTracestack(mtr_stack, curr_mtr->nxtr);
252 PushTracestack(tr_stack, AttachTrace(curr_tr, pool, curr_mtr->nxtr->emitl, curr_mtr->nxtr->emitr,
253 curr_mtr->nxtr->nodeidx, curr_mtr->nxtr->type));
254 }
255 if (curr_mtr->nxtl != NULL)
256 {
257 PushTracestack(mtr_stack, curr_mtr->nxtl);
258 PushTracestack(tr_stack, AttachTrace(curr_tr, pool, curr_mtr->nxtl->emitl, curr_mtr->nxtl->emitr,
259 curr_mtr->nxtl->nodeidx, curr_mtr->nxtl->type));
260 }
261 }
262 FreeTracestack(mtr_stack);
263 FreeTracestack(tr_stack);
264 *ret_pool = pool;
265 *ret_tr = tr;
266 }
267
268
269
270 /* Function: EasyModelmaker()
271 *
272 * Purpose: The customer always knows best.
273 *
274 * Construct a model given a stated structure. The structure
275 * is provided via a "cs" (consensus sequence) line, as would
276 * occur in an annotated SELEX file. Only > and < characters
277 * in this line are interpreted (as base pairs).
278 *
279 * Match vs. insert can be determined one of two ways. By default,
280 * the assignment is made by "gapthresh"; for columns with
281 * fractional occurence of gaps greater than this, the column
282 * is assigned to insert. If "use_rf" is TRUE, the rf (reference)
283 * line is interpreted as the assignment -- columns with non-space
284 * characters in the rf line are assigned to MATCH.
285 *
286 * Both rf and cs are provided in the ainfo structure.
287 *
288 * Args: aseq - aligned sequences. [0..nseq-1] by [0..alen-1]
289 * ainfo - info about the alignment, including alen, cs,
290 * and rf
291 * nseq - number of seqs in aseq
292 * prior - prior distributions for CM construction
293 * gapthresh - over this fraction of gaps, assign column as INS
294 * use_rf - if TRUE, use rf field of ainfo for MAT/INS assignment
295 * ret_cm - RETURN: new model (maybe NULL)
296 * ret_mtr - RETURN: master traceback for alignment (maybe NULL)
297 *
298 * Return: void
299 * cm is allocated here. FreeCM(*ret_cm).
300 * tr is allocated here. FreeTrace() on each one, then free(*ret_tr).
301 */
302 void
EasyModelmaker(char ** aseq,AINFO * ainfo,int nseq,struct prior_s * prior,double gapthresh,int use_rf,struct cm_s ** ret_cm,struct trace_s ** ret_mtr)303 EasyModelmaker(char **aseq, AINFO *ainfo, int nseq, struct prior_s *prior,
304 double gapthresh, int use_rf, struct cm_s **ret_cm, struct trace_s **ret_mtr)
305 {
306 struct cm_s *cm; /* new covariance model */
307 struct trace_s *mtr; /* master traceback tree for alignment */
308 struct trace_s *tr; /* individual sequence traceback tree */
309 struct trmem_s *pool; /* memory pool for traceback tree */
310 struct tracestack_s *dolist;
311 struct trace_s *cur;
312 int *matassign;
313 int nodes;
314 int idx, apos;
315 int *ct;
316 int i,j, nxti, nxtj;
317
318 if (! (ainfo->flags & AINFO_CS)) Die("No cs (consensus structure) line available for that alignment.");
319
320 /* Determine match/insert assignments
321 * matassign is 0..alen-1. Values are 1 if MAT, 0 if INS.
322 */
323 matassign = (int *) MallocOrDie(sizeof(int) * ainfo->alen);
324 if (use_rf)
325 {
326 if (! (ainfo->flags & AINFO_RF)) Die("No rf (reference coord) line available for that alignment.");
327 for (apos = 0; apos < ainfo->alen; apos++)
328 matassign[apos] = (ainfo->rf[apos] == ' ') ? 0 : 1;
329 }
330 else
331 {
332 int gaps;
333 for (apos = 0; apos < ainfo->alen; apos++)
334 {
335 for (gaps = 0, idx = 0; idx < nseq; idx++)
336 if (isgap(aseq[idx][apos])) gaps++;
337 matassign[apos] = ((double) gaps / (double) nseq > gapthresh) ? 0 : 1;
338 }
339 }
340
341 /* Determine a "ct" array, base-pairing partners for each position
342 */
343 if (! KHS2ct(ainfo->cs, ainfo->alen, FALSE, &ct)) Die("Consensus structure string is inconsistent");
344
345 /* Make sure the consensus structure "ct" is consistent with the match assignments.
346 * Wipe out all structure under INS; including the base-paired
347 * partner of INS-assigned positions
348 */
349 for (apos = 0; apos < ainfo->alen; apos++)
350 if (! matassign[apos])
351 {
352 if (ct[apos] != -1) ct[ct[apos]] = -1;
353 ct[apos] = -1;
354 }
355
356 /* Construct a master traceback tree.
357 * This code is borrowed from yarn's KHS2Trace().
358 * mtr's emitl, emitr, and type are properly set by this section.
359 */
360 InitTrace(&mtr, NULL);
361 dolist = InitTracestack();
362 cur = AttachTrace(mtr, NULL, 0, ainfo->alen-1, -1, ROOT_NODE); /* attach the root */
363 PushTracestack(dolist, cur);
364
365 while ((cur = PopTracestack(dolist)) != NULL)
366 {
367 i = cur->emitl;
368 j = cur->emitr;
369
370 /* This node accounts for i..j, but we don't know how yet.
371 * Six possibilities:
372 * i > j; this is an END state; do nothing.
373 * this is already assigned as a BEGIN; push i,j
374 * i is unpaired; this is a MATL state; push i+1, j
375 * j is unpaired; this is a MATR state; push i,j-1
376 * i,j pair to each other; this is a MATP state; push i+1,j-1
377 * i,j pair but not to each other; this is a BIFURC state;
378 * pick mid ip <= mid < jp; push BEGIN i,mid and working i,mid,
379 * and push BEGIN mid+1,j and working mid+1,j
380 */
381 if (i > j) cur->type = uEND_ST;
382
383 else if (cur->type == ROOT_NODE)
384 { /* try to push i,j; but deal with INSL and INSR */
385 for (nxti = i; nxti <= j; nxti++) if (matassign[nxti]) break;
386 for (nxtj = j; nxtj >= nxti; nxtj--) if (matassign[nxtj]) break;
387 if (nxti <= nxtj) PushTracestack(dolist, AttachTrace(cur, NULL, nxti, nxtj, -1, uEND_ST));
388 else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = nxtj; } /* deal with END_ST */
389 }
390
391 else if (cur->type == BEGINL_NODE) /* no inserts */
392 {
393 if (i <= j)
394 PushTracestack(dolist, AttachTrace(cur, NULL, i, j, -1, uEND_ST));
395 else
396 { cur->nxtl->emitl = nxti; cur->nxtl->emitr = j; }
397 }
398
399 else if (cur->type == BEGINR_NODE) /* INSL */
400 {
401 for (nxti = i; nxti <= j; nxti++) if (matassign[nxti]) break;
402 if (nxti <= j) PushTracestack(dolist, AttachTrace(cur, NULL, nxti, j, -1, uEND_ST));
403 else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = j; } /* deal with END_ST */
404 }
405
406 else if (ct[i] == -1) /* i unpaired. This is a MATL node; allow INSL */
407 {
408 cur->type = MATL_NODE;
409 for (nxti = i+1; nxti <= j; nxti++) if (matassign[nxti]) break;
410 if (nxti <= j) PushTracestack(dolist, AttachTrace(cur, NULL, nxti, j, -1, uEND_ST));
411 else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = j; } /* deal with END_ST */
412 }
413
414 else if (ct[j] == -1) /* j unpaired. MATR node. Deal with INSR */
415 {
416 cur->type = MATR_NODE;
417 for (nxtj = j-1; nxtj >= i; nxtj--) if (matassign[nxtj]) break;
418 if (i <= nxtj) PushTracestack(dolist, AttachTrace(cur, NULL, i, nxtj, -1, uEND_ST));
419 else { cur->nxtl->emitl = i; cur->nxtl->emitr = nxtj; } /* deal with END_ST */
420 }
421
422 else if (ct[i] == j) /* i,j paired to each other. MATP. deal with INSL, INSR */
423 {
424 cur->type = MATP_NODE;
425 for (nxti = i+1; nxti <= j; nxti++) if (matassign[nxti]) break;
426 for (nxtj = j-1; nxtj >= nxti; nxtj--) if (matassign[nxtj]) break;
427 if (nxti <= nxtj) PushTracestack(dolist, AttachTrace(cur, NULL, nxti, nxtj, -1, uEND_ST));
428 else { cur->nxtl->emitl = nxti; cur->nxtl->emitr = nxtj; } /* deal with END_ST */
429 }
430
431 else /* i,j paired but not to each other. BIFURC. no INS. */
432 { /* by convention, right side of bifurc deals with insert in middle */
433 cur->type = BIFURC_NODE;
434 PushTracestack(dolist, AttachTrace(cur, NULL, ct[i]+1, j, -1, BEGINR_NODE));
435 PushTracestack(dolist, AttachTrace(cur, NULL, i, ct[i], -1, BEGINL_NODE));
436 }
437 } /* while something's on dolist stack */
438 FreeTracestack(dolist);
439 free(ct);
440
441 /* Now, do the drill for constructing a model using this master trace.
442 */
443 NumberMasterTrace(mtr, &nodes);
444 if ((cm = AllocCM(nodes)) == NULL)
445 Die("failed to allocate for new model of %d nodes\n", nodes);
446 TopofyNewCM(cm, mtr);
447
448 for (idx = 0; idx < nseq; idx++)
449 {
450 Transmogrify(mtr, aseq[idx], &tr, &pool);
451 if (! TraceCount(cm, aseq[idx],
452 (ainfo->sqinfo[idx].flags & SQINFO_WGT) ? (double) ainfo->sqinfo[idx].weight : 1.0,
453 tr))
454 Die("TraceCount() failed");
455 FreeTrace(tr, pool);
456 }
457 ProbifyCM(cm, prior);
458
459 free(matassign);
460 if (ret_cm != NULL) *ret_cm = cm; else FreeCM(cm);
461 if (ret_mtr != NULL) *ret_mtr = mtr; else FreeTrace(mtr, NULL);
462 }
463