1 /************************************************************
2 * HMMER - Biological sequence analysis with profile-HMMs
3 * Copyright (C) 1992-1998 Washington University School of Medicine
4 *
5 * This source code is distributed under the terms of the
6 * GNU General Public License. See the files COPYING and
7 * GNULICENSE for details.
8 *
9 ************************************************************/
10
11 /* core_algorithms.c
12 * SRE, Mon Nov 11 15:58:52 1996
13 * RCS $Id: core_algorithms.c,v 1.1.1.1 2001/06/18 13:59:48 birney Exp $
14 *
15 * Simple and robust "research" implementations of Forward, Backward,
16 * and Viterbi for Plan7.
17 */
18
19 #include "structs.h"
20 #include "config.h"
21 #include "funcs.h"
22 #include "squid.h"
23
24 #include <assert.h>
25
26 #ifdef MEMDEBUG
27 #include "dbmalloc.h"
28 #endif
29
30 static float get_wee_midpt(struct plan7_s *hmm, char *dsq, int L,
31 int k1, enum p7stype t1, int s1,
32 int k3, enum p7stype t3, int s3,
33 int *ret_k2, enum p7stype *ret_t2, int *ret_s2);
34
35
36 /* Function: AllocPlan7Matrix()
37 *
38 * Purpose: Allocate a dynamic programming matrix for standard Forward,
39 * Backward, or Viterbi, with scores kept as scaled log-odds
40 * integers. Keeps 2D arrays compact in RAM in an attempt
41 * to maximize cache hits. Sets up individual ptrs to the
42 * four matrix components as a convenience.
43 *
44 * Args: rows - number of rows to allocate; typically L+1
45 * M - size of model
46 * xmx, mmx, imx, dmx
47 * - RETURN: ptrs to four mx components as a convenience
48 *
49 * Return: mx
50 * mx is allocated here. Caller frees with FreeDPMatrix(mx).
51 */
52
53 struct dpmatrix_s *
AllocPlan7Matrix(int rows,int M,int *** xmx,int *** mmx,int *** imx,int *** dmx)54 AllocPlan7Matrix(int rows, int M, int ***xmx, int ***mmx, int ***imx, int ***dmx)
55 {
56 struct dpmatrix_s *mx;
57 int i;
58
59 mx = (struct dpmatrix_s *) MallocOrDie (sizeof(struct dpmatrix_s));
60 mx->xmx = (int **) MallocOrDie (sizeof(int *) * rows);
61 mx->mmx = (int **) MallocOrDie (sizeof(int *) * rows);
62 mx->imx = (int **) MallocOrDie (sizeof(int *) * rows);
63 mx->dmx = (int **) MallocOrDie (sizeof(int *) * rows);
64 mx->xmx[0] = (int *) MallocOrDie (sizeof(int) * (rows*5));
65 mx->mmx[0] = (int *) MallocOrDie (sizeof(int) * (rows*(M+2)));
66 mx->imx[0] = (int *) MallocOrDie (sizeof(int) * (rows*(M+2)));
67 mx->dmx[0] = (int *) MallocOrDie (sizeof(int) * (rows*(M+2)));
68 for (i = 1; i < rows; i++)
69 {
70 mx->xmx[i] = mx->xmx[0] + (i*5);
71 mx->mmx[i] = mx->mmx[0] + (i*(M+2));
72 mx->imx[i] = mx->imx[0] + (i*(M+2));
73 mx->dmx[i] = mx->dmx[0] + (i*(M+2));
74 }
75
76 if (xmx != NULL) *xmx = mx->xmx;
77 if (mmx != NULL) *mmx = mx->mmx;
78 if (imx != NULL) *imx = mx->imx;
79 if (dmx != NULL) *dmx = mx->dmx;
80 return mx;
81 }
82
83 /* Function: FreePlan7Matrix()
84 *
85 * Purpose: Free a dynamic programming matrix allocated by AllocPlan7Matrix().
86 *
87 * Return: (void)
88 */
89 void
FreePlan7Matrix(struct dpmatrix_s * mx)90 FreePlan7Matrix(struct dpmatrix_s *mx)
91 {
92 free (mx->xmx[0]);
93 free (mx->mmx[0]);
94 free (mx->imx[0]);
95 free (mx->dmx[0]);
96 free (mx->xmx);
97 free (mx->mmx);
98 free (mx->imx);
99 free (mx->dmx);
100 free (mx);
101 }
102
103 /* Function: P7ViterbiSize()
104 * Date: SRE, Fri Mar 6 15:13:20 1998 [St. Louis]
105 *
106 * Purpose: Returns the ballpark predicted memory requirement for a
107 * P7Viterbi() alignment, in MB.
108 *
109 * Args: L - length of sequence
110 * M - length of HMM
111 *
112 * Returns: # of MB
113 */
114 int
P7ViterbiSize(int L,int M)115 P7ViterbiSize(int L, int M)
116 {
117 return ((sizeof(struct dpmatrix_s) + /* matrix structure */
118 3 * (L+1) * (M+2) * sizeof(int) + /* main matrix is O(NM) */
119 4 * (L+1) * sizeof(int *) + /* ptrs into rows of matrix */
120 5 * (L+1) * sizeof(int)) /* 5 special states */
121 / 1000000);
122 }
123
124 /* Function: P7SmallViterbiSize()
125 * Date: SRE, Fri Mar 6 15:20:04 1998 [St. Louis]
126 *
127 * Purpose: Returns the ballpark predicted memory requirement for
128 * a P7SmallViterbi() alignment, in MB.
129 *
130 * P7SmallViterbi() is a wrapper, calling both P7ParsingViterbi()
131 * and P7WeeViterbi(). P7ParsingViterbi() typically dominates
132 * the memory requirement, so the value returned
133 * is the P7ParsingViterbi() number.
134 *
135 * Args: L - length of sequence
136 * M - length of HMM
137 *
138 * Returns: # of MB
139 */
140 int
P7SmallViterbiSize(int L,int M)141 P7SmallViterbiSize(int L, int M)
142 {
143 return ((2 * sizeof(struct dpmatrix_s) +
144 12 * (M+2) * sizeof(int) + /* 2 matrices w/ 2 rows */
145 16 * sizeof(int *) + /* ptrs into rows of matrix */
146 20 * sizeof(int) + /* 5 special states */
147 2 * (L+1) * sizeof(int)) /* traceback indices */
148 / 1000000);
149 }
150
151
152 /* Function: P7WeeViterbiSize()
153 * Date: SRE, Fri Mar 6 15:40:42 1998 [St. Louis]
154 *
155 * Purpose: Returns the ballpark predicted memory requirement for
156 * a P7WeeViterbi() alignment, in MB.
157 *
158 * Args: L - length of sequence
159 * M - length of HMM
160 *
161 * Returns: # of MB
162 */
163 int
P7WeeViterbiSize(int L,int M)164 P7WeeViterbiSize(int L, int M)
165 {
166 return ((2 * sizeof(struct dpmatrix_s) +
167 12 * (M+2) * sizeof(int) + /* 2 matrices w/ 2 rows */
168 16 * sizeof(int *) + /* ptrs into rows of matrix */
169 20 * sizeof(int) + /* 5 special states */
170 2 * (L+2) * sizeof(int) + /* stacks for starts/ends (overkill) */
171 (L+2) * sizeof(int) + /* k assignments to seq positions */
172 (L+2) * sizeof(enum p7stype)) /* state assignments to seq positions */
173 / 1000000);
174 }
175
176
177 /* Function: P7Forward()
178 *
179 * Purpose: The Forward dynamic programming algorithm.
180 * The scaling issue is dealt with by working in log space
181 * and calling ILogsum(); this is a slow but robust approach.
182 *
183 * Args: dsq - sequence in digitized form
184 * L - length of dsq
185 * hmm - the model
186 * ret_mx - RETURN: dp matrix; pass NULL if it's not wanted
187 *
188 * Return: log P(S|M)/P(S|R), as a bit score.
189 */
190 float
P7Forward(char * dsq,int L,struct plan7_s * hmm,struct dpmatrix_s ** ret_mx)191 P7Forward(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx)
192 {
193 struct dpmatrix_s *mx;
194 int **xmx;
195 int **mmx;
196 int **imx;
197 int **dmx;
198 int i,k;
199 int sc;
200
201 /* Allocate a DP matrix with 0..L rows, 0..M-1 columns.
202 */
203 mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx);
204
205 /* Initialization of the zero row.
206 * Note that xmx[i][stN] = 0 by definition for all i,
207 * and xmx[i][stT] = xmx[i][stC], so neither stN nor stT need
208 * to be calculated in DP matrices.
209 */
210 xmx[0][XMN] = 0; /* S->N, p=1 */
211 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
212 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
213 for (k = 0; k <= hmm->M; k++)
214 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
215
216 /* Recursion. Done as a pull.
217 * Note some slightly wasteful boundary conditions:
218 * tsc[0] = -INFTY for all eight transitions (no node 0)
219 * D_M and I_M are wastefully calculated (they don't exist)
220 */
221 for (i = 1; i <= L; i++)
222 {
223 mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;
224 for (k = 1; k < hmm->M; k++)
225 {
226 mmx[i][k] = ILogsum(ILogsum(mmx[i-1][k-1] + hmm->tsc[k-1][TMM],
227 imx[i-1][k-1] + hmm->tsc[k-1][TIM]),
228 ILogsum(xmx[i-1][XMB] + hmm->bsc[k],
229 dmx[i-1][k-1] + hmm->tsc[k-1][TDM]));
230 mmx[i][k] += hmm->msc[(int) dsq[i]][k];
231
232 dmx[i][k] = ILogsum(mmx[i][k-1] + hmm->tsc[k-1][TMD],
233 dmx[i][k-1] + hmm->tsc[k-1][TDD]);
234 imx[i][k] = ILogsum(mmx[i-1][k] + hmm->tsc[k][TMI],
235 imx[i-1][k] + hmm->tsc[k][TII]);
236 imx[i][k] += hmm->isc[(int) dsq[i]][k];
237 }
238 mmx[i][hmm->M] = ILogsum(ILogsum(mmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TMM],
239 imx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TIM]),
240 ILogsum(xmx[i-1][XMB] + hmm->bsc[hmm->M-1],
241 dmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TDM]));
242 mmx[i][hmm->M] += hmm->msc[(int) dsq[i]][hmm->M];
243
244 /* Now the special states.
245 * remember, C and J emissions are zero score by definition
246 */
247 xmx[i][XMN] = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP];
248
249 xmx[i][XME] = -INFTY;
250 for (k = 1; k <= hmm->M; k++)
251 xmx[i][XME] = ILogsum(xmx[i][XME], mmx[i][k] + hmm->esc[k]);
252
253 xmx[i][XMJ] = ILogsum(xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP],
254 xmx[i][XME] + hmm->xsc[XTE][LOOP]);
255
256 xmx[i][XMB] = ILogsum(xmx[i][XMN] + hmm->xsc[XTN][MOVE],
257 xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]);
258
259 xmx[i][XMC] = ILogsum(xmx[i-1][XMC] + hmm->xsc[XTC][LOOP],
260 xmx[i][XME] + hmm->xsc[XTE][MOVE]);
261 }
262
263 sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];
264
265 if (ret_mx != NULL) *ret_mx = mx;
266 else FreePlan7Matrix(mx);
267
268 return Scorify(sc); /* the total Forward score. */
269 }
270
271
272 /* Function: P7Viterbi()
273 *
274 * Purpose: The Viterbi dynamic programming algorithm.
275 * Identical to Forward() except that max's
276 * replace sum's.
277 *
278 * Args: dsq - sequence in digitized form
279 * L - length of dsq
280 * hmm - the model
281 * ret_tr - RETURN: traceback; pass NULL if it's not wanted
282 *
283 * Return: log P(S|M)/P(S|R), as a bit score
284 */
285 float
P7Viterbi(char * dsq,int L,struct plan7_s * hmm,struct p7trace_s ** ret_tr)286 P7Viterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr)
287 {
288 struct dpmatrix_s *mx;
289 struct p7trace_s *tr;
290 int **xmx;
291 int **mmx;
292 int **imx;
293 int **dmx;
294 int i,k;
295 int sc;
296
297 /* Allocate a DP matrix with 0..L rows, 0..M-1 columns.
298 */
299 mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx);
300
301 /* Initialization of the zero row.
302 */
303 xmx[0][XMN] = 0; /* S->N, p=1 */
304 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
305 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
306 for (k = 0; k <= hmm->M; k++)
307 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
308
309 /* Recursion. Done as a pull.
310 * Note some slightly wasteful boundary conditions:
311 * tsc[0] = -INFTY for all eight transitions (no node 0)
312 * D_M and I_M are wastefully calculated (they don't exist)
313 */
314 for (i = 1; i <= L; i++) {
315 mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;
316
317 for (k = 1; k <= hmm->M; k++) {
318 /* match state */
319 mmx[i][k] = -INFTY;
320 if ((sc = mmx[i-1][k-1] + hmm->tsc[k-1][TMM]) > mmx[i][k])
321 mmx[i][k] = sc;
322 if ((sc = imx[i-1][k-1] + hmm->tsc[k-1][TIM]) > mmx[i][k])
323 mmx[i][k] = sc;
324 if ((sc = xmx[i-1][XMB] + hmm->bsc[k]) > mmx[i][k])
325 mmx[i][k] = sc;
326 if ((sc = dmx[i-1][k-1] + hmm->tsc[k-1][TDM]) > mmx[i][k])
327 mmx[i][k] = sc;
328 if (hmm->msc[(int) dsq[i]][k] != -INFTY) mmx[i][k] += hmm->msc[(int) dsq[i]][k];
329 else mmx[i][k] = -INFTY;
330
331 /* delete state */
332 dmx[i][k] = -INFTY;
333 if ((sc = mmx[i][k-1] + hmm->tsc[k-1][TMD]) > dmx[i][k])
334 dmx[i][k] = sc;
335 if ((sc = dmx[i][k-1] + hmm->tsc[k-1][TDD]) > dmx[i][k])
336 dmx[i][k] = sc;
337
338 /* insert state */
339 if (k < hmm->M) {
340 imx[i][k] = -INFTY;
341 if ((sc = mmx[i-1][k] + hmm->tsc[k][TMI]) > imx[i][k])
342 imx[i][k] = sc;
343 if ((sc = imx[i-1][k] + hmm->tsc[k][TII]) > imx[i][k])
344 imx[i][k] = sc;
345 if (hmm->isc[(int)dsq[i]][k] != -INFTY) imx[i][k] += hmm->isc[(int) dsq[i]][k];
346 else imx[i][k] = -INFTY;
347 }
348 }
349
350 /* Now the special states. Order is important here.
351 * remember, C and J emissions are zero score by definition,
352 */
353 /* N state */
354 xmx[i][XMN] = -INFTY;
355 if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
356 xmx[i][XMN] = sc;
357
358 /* E state */
359 xmx[i][XME] = -INFTY;
360 for (k = 1; k <= hmm->M; k++)
361 if ((sc = mmx[i][k] + hmm->esc[k]) > xmx[i][XME])
362 xmx[i][XME] = sc;
363 /* J state */
364 xmx[i][XMJ] = -INFTY;
365 if ((sc = xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
366 xmx[i][XMJ] = sc;
367 if ((sc = xmx[i][XME] + hmm->xsc[XTE][LOOP]) > xmx[i][XMJ])
368 xmx[i][XMJ] = sc;
369
370 /* B state */
371 xmx[i][XMB] = -INFTY;
372 if ((sc = xmx[i][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
373 xmx[i][XMB] = sc;
374 if ((sc = xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[i][XMB])
375 xmx[i][XMB] = sc;
376
377 /* C state */
378 xmx[i][XMC] = -INFTY;
379 if ((sc = xmx[i-1][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
380 xmx[i][XMC] = sc;
381 if ((sc = xmx[i][XME] + hmm->xsc[XTE][MOVE]) > xmx[i][XMC])
382 xmx[i][XMC] = sc;
383 }
384 /* T state (not stored) */
385 sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];
386
387 if (ret_tr != NULL) {
388 P7ViterbiTrace(hmm, dsq, L, mx, &tr);
389 *ret_tr = tr;
390 }
391
392 FreePlan7Matrix(mx);
393 return Scorify(sc); /* the total Viterbi score. */
394 }
395
396
397 /* Function: P7ViterbiTrace()
398 * Date: SRE, Sat Aug 23 10:30:11 1997 (St. Louis Lambert Field)
399 *
400 * Purpose: Traceback of a Viterbi matrix: i.e. retrieval
401 * of optimum alignment.
402 *
403 * Args: hmm - hmm, log odds form, used to make mx
404 * dsq - sequence aligned to (digital form) 1..N
405 * N - length of seq
406 * mx - the matrix to trace back in, N x hmm->M
407 * ret_tr - RETURN: traceback.
408 *
409 * Return: (void)
410 * ret_tr is allocated here. Free using P7FreeTrace().
411 */
412 void
P7ViterbiTrace(struct plan7_s * hmm,char * dsq,int N,struct dpmatrix_s * mx,struct p7trace_s ** ret_tr)413 P7ViterbiTrace(struct plan7_s *hmm, char *dsq, int N,
414 struct dpmatrix_s *mx, struct p7trace_s **ret_tr)
415 {
416 struct p7trace_s *tr;
417 int curralloc; /* current allocated length of trace */
418 int tpos; /* position in trace */
419 int i; /* position in seq (1..N) */
420 int k; /* position in model (1..M) */
421 int **xmx, **mmx, **imx, **dmx;
422 int sc; /* temp var for pre-emission score */
423
424 /* Overallocate for the trace.
425 * S-N-B- ... - E-C-T : 6 states + N is minimum trace;
426 * add N more as buffer.
427 */
428 curralloc = N * 2 + 6;
429 P7AllocTrace(curralloc, &tr);
430
431 xmx = mx->xmx;
432 mmx = mx->mmx;
433 imx = mx->imx;
434 dmx = mx->dmx;
435
436 /* Initialization of trace
437 * We do it back to front; ReverseTrace() is called later.
438 */
439 tr->statetype[0] = STT;
440 tr->nodeidx[0] = 0;
441 tr->pos[0] = 0;
442 tr->statetype[1] = STC;
443 tr->nodeidx[1] = 0;
444 tr->pos[1] = 0;
445 tpos = 2;
446 i = N; /* current i (seq pos) we're trying to assign */
447
448 /* Traceback
449 */
450 while (tr->statetype[tpos-1] != STS) {
451 switch (tr->statetype[tpos-1]) {
452 case STM: /* M connects from i-1,k-1, or B */
453 sc = mmx[i+1][k+1] - hmm->msc[(int) dsq[i+1]][k+1];
454 if (sc == xmx[i][XMB] + hmm->bsc[k+1])
455 {
456 /* Check for wing unfolding */
457 if (Prob2Score(hmm->begin[k+1], hmm->p1) + 1 * INTSCALE <= hmm->bsc[k+1])
458 while (k > 0)
459 {
460 tr->statetype[tpos] = STD;
461 tr->nodeidx[tpos] = k--;
462 tr->pos[tpos] = 0;
463 tpos++;
464 if (tpos == curralloc)
465 { /* grow trace if necessary */
466 curralloc += N;
467 P7ReallocTrace(tr, curralloc);
468 }
469 }
470
471 tr->statetype[tpos] = STB;
472 tr->nodeidx[tpos] = 0;
473 tr->pos[tpos] = 0;
474 }
475 else if (sc == mmx[i][k] + hmm->tsc[k][TMM])
476 {
477 tr->statetype[tpos] = STM;
478 tr->nodeidx[tpos] = k--;
479 tr->pos[tpos] = i--;
480 }
481 else if (sc == imx[i][k] + hmm->tsc[k][TIM])
482 {
483 tr->statetype[tpos] = STI;
484 tr->nodeidx[tpos] = k;
485 tr->pos[tpos] = i--;
486 }
487 else if (sc == dmx[i][k] + hmm->tsc[k][TDM])
488 {
489 tr->statetype[tpos] = STD;
490 tr->nodeidx[tpos] = k--;
491 tr->pos[tpos] = 0;
492 }
493 else Die("traceback failed");
494 break;
495
496 case STD: /* D connects from M,D */
497 if (dmx[i][k+1] == mmx[i][k] + hmm->tsc[k][TMD])
498 {
499 tr->statetype[tpos] = STM;
500 tr->nodeidx[tpos] = k--;
501 tr->pos[tpos] = i--;
502 }
503 else if (dmx[i][k+1] == dmx[i][k] + hmm->tsc[k][TDD])
504 {
505 tr->statetype[tpos] = STD;
506 tr->nodeidx[tpos] = k--;
507 tr->pos[tpos] = 0;
508 }
509 else Die("traceback failed");
510 break;
511
512 case STI: /* I connects from M,I */
513 sc = imx[i+1][k] - hmm->isc[(int) dsq[i+1]][k];
514 if (sc == mmx[i][k] + hmm->tsc[k][TMI])
515 {
516 tr->statetype[tpos] = STM;
517 tr->nodeidx[tpos] = k--;
518 tr->pos[tpos] = i--;
519 }
520 else if (sc == imx[i][k] + hmm->tsc[k][TII])
521 {
522 tr->statetype[tpos] = STI;
523 tr->nodeidx[tpos] = k;
524 tr->pos[tpos] = i--;
525 }
526 else Die("traceback failed");
527 break;
528
529 case STN: /* N connects from S, N */
530 if (i == 0 && xmx[i][XMN] == 0)
531 {
532 tr->statetype[tpos] = STS;
533 tr->nodeidx[tpos] = 0;
534 tr->pos[tpos] = 0;
535 }
536 else if (i > 0 && xmx[i+1][XMN] == xmx[i][XMN] + hmm->xsc[XTN][LOOP])
537 {
538 tr->statetype[tpos] = STN;
539 tr->nodeidx[tpos] = 0;
540 tr->pos[tpos] = 0; /* note convention adherence: */
541 tr->pos[tpos-1] = i--; /* first N doesn't emit */
542 }
543 else Die("traceback failed");
544 break;
545
546 case STB: /* B connects from N, J */
547 if (xmx[i][XMB] == xmx[i][XMN] + hmm->xsc[XTN][MOVE])
548 {
549 tr->statetype[tpos] = STN;
550 tr->nodeidx[tpos] = 0;
551 tr->pos[tpos] = 0;
552 }
553 else if (xmx[i][XMB] == xmx[i][XMJ] + hmm->xsc[XTJ][MOVE])
554 {
555 tr->statetype[tpos] = STJ;
556 tr->nodeidx[tpos] = 0;
557 tr->pos[tpos] = 0;
558 }
559 else Die("traceback failed");
560 break;
561
562 case STE: /* E connects from any M state. k set here */
563 for (k = hmm->M; k >= 1; k--)
564 if (xmx[i][XME] == mmx[i][k] + hmm->esc[k])
565 {
566 /* check for wing unfolding */
567 if (Prob2Score(hmm->end[k], 1.) + 1*INTSCALE <= hmm->esc[k])
568 {
569 int dk; /* need a tmp k while moving thru delete wing */
570 for (dk = hmm->M; dk > k; dk--)
571 {
572 tr->statetype[tpos] = STD;
573 tr->nodeidx[tpos] = dk;
574 tr->pos[tpos] = 0;
575 tpos++;
576 if (tpos == curralloc)
577 { /* grow trace if necessary */
578 curralloc += N;
579 P7ReallocTrace(tr, curralloc);
580 }
581 }
582 }
583
584 tr->statetype[tpos] = STM;
585 tr->nodeidx[tpos] = k--;
586 tr->pos[tpos] = i--;
587 break;
588 }
589 if (k < 0) Die("traceback failed");
590 break;
591
592 case STC: /* C comes from C, E */
593 if (xmx[i][XMC] == xmx[i-1][XMC] + hmm->xsc[XTC][LOOP])
594 {
595 tr->statetype[tpos] = STC;
596 tr->nodeidx[tpos] = 0;
597 tr->pos[tpos] = 0; /* note convention adherence: */
598 tr->pos[tpos-1] = i--; /* first C doesn't emit */
599 }
600 else if (xmx[i][XMC] == xmx[i][XME] + hmm->xsc[XTE][MOVE])
601 {
602 tr->statetype[tpos] = STE;
603 tr->nodeidx[tpos] = 0;
604 tr->pos[tpos] = 0; /* E is a nonemitter */
605 }
606 else Die("Traceback failed.");
607 break;
608
609 case STJ: /* J connects from E, J */
610 if (xmx[i][XMJ] == xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP])
611 {
612 tr->statetype[tpos] = STJ;
613 tr->nodeidx[tpos] = 0;
614 tr->pos[tpos] = 0; /* note convention adherence: */
615 tr->pos[tpos-1] = i--; /* first J doesn't emit */
616 }
617 else if (xmx[i][XMJ] == xmx[i][XME] + hmm->xsc[XTE][LOOP])
618 {
619 tr->statetype[tpos] = STE;
620 tr->nodeidx[tpos] = 0;
621 tr->pos[tpos] = 0; /* E is a nonemitter */
622 }
623 else Die("Traceback failed.");
624 break;
625
626 default:
627 Die("traceback failed");
628
629 } /* end switch over statetype[tpos-1] */
630
631 tpos++;
632 if (tpos == curralloc)
633 { /* grow trace if necessary */
634 curralloc += N;
635 P7ReallocTrace(tr, curralloc);
636 }
637
638 } /* end traceback, at S state; tpos == tlen now */
639 tr->tlen = tpos;
640 P7ReverseTrace(tr);
641 *ret_tr = tr;
642 }
643
644
645 /* Function: P7SmallViterbi()
646 * Date: SRE, Fri Mar 6 15:29:41 1998 [St. Louis]
647 *
648 * Purpose: Wrapper function, for linear memory alignment
649 * with same arguments as P7Viterbi().
650 *
651 * Calls P7ParsingViterbi to break the sequence
652 * into fragments. Then, based on size of fragments,
653 * calls either P7Viterbi() or P7WeeViterbi() to
654 * get traces for them. Finally, assembles all these
655 * traces together to produce an overall optimal
656 * trace for the sequence.
657 *
658 * If the trace isn't needed for some reason,
659 * all we do is call P7ParsingViterbi.
660 *
661 * Args: dsq - sequence in digitized form
662 * L - length of dsq
663 * hmm - the model
664 * ret_tr - RETURN: traceback; pass NULL if it's not wanted
665 *
666 * Returns: Score of optimal alignment in bits.
667 */
668 float
P7SmallViterbi(char * dsq,int L,struct plan7_s * hmm,struct p7trace_s ** ret_tr)669 P7SmallViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr)
670 {
671 struct p7trace_s *ctr; /* collapsed trace of optimal parse */
672 struct p7trace_s *tr; /* full trace of optimal alignment */
673 struct p7trace_s **tarr; /* trace array */
674 int ndom; /* number of subsequences */
675 int i; /* counter over domains */
676 int pos; /* position in sequence */
677 int tpos; /* position in trace */
678 int tlen; /* length of full trace */
679 int sqlen; /* length of a subsequence */
680 int totlen; /* length of L matched by model (as opposed to N/C/J) */
681 float sc; /* score of optimal alignment */
682 int t2; /* position in a subtrace */
683
684 /* Step 1. Call P7ParsingViterbi to calculate an optimal parse
685 * of the sequence into single-hit subsequences; this parse
686 * is returned in a "collapsed" trace
687 */
688 sc = P7ParsingViterbi(dsq, L, hmm, &ctr);
689
690 /* If we don't want full trace, we're done */
691 if (ret_tr == NULL)
692 {
693 P7FreeTrace(ctr);
694 return sc;
695 }
696
697 /* Step 2. Call either P7Viterbi or P7WeeViterbi on each subsequence
698 * to recover a full traceback of each, collecting them in
699 * an array.
700 */
701 ndom = ctr->tlen/2 - 1;
702 tarr = MallocOrDie(sizeof(struct p7trace_s *) * ndom);
703 tlen = totlen = 0;
704 for (i = 0; i < ndom; i++)
705 {
706 sqlen = ctr->pos[i*2+2] - ctr->pos[i*2+1]; /* length of subseq */
707
708 if (P7ViterbiSize(sqlen, hmm->M) > RAMLIMIT)
709 P7WeeViterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i]));
710 else
711 P7Viterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i]));
712
713 tlen += tarr[i]->tlen - 4; /* not counting S->N,...,C->T */
714 totlen += sqlen;
715 }
716
717 /* Step 3. Compose the subtraces into one big final trace.
718 * This is wasteful because we're going to TraceDecompose()
719 * it again in both hmmsearch and hmmpfam to look at
720 * individual domains; but we do it anyway so the P7SmallViterbi
721 * interface looks exactly like the P7Viterbi interface. Maybe
722 * long traces shouldn't include all the N/J/C states anyway,
723 * since they're unambiguously implied.
724 */
725
726 /* Calculate total trace len and alloc;
727 * nonemitting SNCT + nonemitting J's + emitting NJC
728 */
729 tlen += 4 + (ndom-1) + (L-totlen);
730 P7AllocTrace(tlen, &tr);
731 tr->tlen = tlen;
732
733 /* Add N-terminal trace framework
734 */
735 tr->statetype[0] = STS;
736 tr->nodeidx[0] = 0;
737 tr->pos[0] = 0;
738 tr->statetype[1] = STN;
739 tr->nodeidx[1] = 0;
740 tr->pos[1] = 0;
741 tpos = 2;
742 /* add implied N's */
743 for (pos = 1; pos <= ctr->pos[1]; pos++)
744 {
745 tr->statetype[tpos] = STN;
746 tr->nodeidx[tpos] = 0;
747 tr->pos[tpos] = pos;
748 tpos++;
749 }
750
751 /* Add each subseq trace in, with its appropriate
752 * sequence offset derived from the collapsed trace
753 */
754 for (i = 0; i < ndom; i++)
755 { /* skip SN, CT framework at ends */
756 for (t2 = 2; t2 < tarr[i]->tlen-2; t2++)
757 {
758 tr->statetype[tpos] = tarr[i]->statetype[t2];
759 tr->nodeidx[tpos] = tarr[i]->nodeidx[t2];
760 if (tarr[i]->pos[t2] > 0)
761 tr->pos[tpos] = tarr[i]->pos[t2] + ctr->pos[i*2+1];
762 else
763 tr->pos[tpos] = 0;
764 tpos++;
765 }
766 /* add nonemitting J or C */
767 tr->statetype[tpos] = (i == ndom-1) ? STC : STJ;
768 tr->nodeidx[tpos] = 0;
769 tr->pos[tpos] = 0;
770 tpos++;
771 /* add implied emitting J's */
772 if (i != ndom-1)
773 for (pos = ctr->pos[i*2+2]+1; pos <= ctr->pos[(i+1)*2+1]; pos++)
774 {
775 tr->statetype[tpos] = STJ;
776 tr->nodeidx[tpos] = 0;
777 tr->pos[tpos] = pos;
778 tpos++;
779 }
780 }
781
782 /* add implied C's */
783 for (pos = ctr->pos[ndom*2]+1; pos <= L; pos++)
784 {
785 tr->statetype[tpos] = STC;
786 tr->nodeidx[tpos] = 0;
787 tr->pos[tpos] = pos;
788 tpos++;
789 }
790 /* add terminal T */
791 tr->statetype[tpos] = STT;
792 tr->nodeidx[tpos] = 0;
793 tr->pos[tpos] = 0;
794 tpos++;
795
796 for (i = 0; i < ndom; i++) P7FreeTrace(tarr[i]);
797 free(tarr);
798 P7FreeTrace(ctr);
799
800 *ret_tr = tr;
801 return sc;
802 }
803
804
805
806
807 /* Function: P7ParsingViterbi()
808 * Date: SRE, Wed Mar 4 14:07:31 1998 [St. Louis]
809 *
810 * Purpose: The "hmmfs" linear-memory algorithm for finding
811 * the optimal alignment of a very long sequence to
812 * a looping, multihit (e.g. Plan7) model, parsing it into
813 * a series of nonoverlapping subsequences that match
814 * the model once. Other algorithms (e.g. P7Viterbi()
815 * or P7WeeViterbi()) are applied subsequently to
816 * these subsequences to recover complete alignments.
817 *
818 * The hmmfs algorithm appears briefly in [Durbin98],
819 * but is otherwise unpublished.
820 *
821 * The traceback structure returned is special: a
822 * "collapsed" trace S->B->E->...->B->E->T, where
823 * stateidx is unused and pos is used to indicate the
824 * position of B and E in the sequence. The matched
825 * subsequence is B_pos+1...E_pos. The number of
826 * matches in the trace is (tlen/2)-1.
827 *
828 * Args: dsq - sequence in digitized form
829 * L - length of dsq
830 * hmm - the model (log odds scores ready)
831 * ret_tr - RETURN: a collapsed traceback.
832 *
833 * Returns: Score of the optimal Viterbi alignment, in bits.
834 */
835 float
P7ParsingViterbi(char * dsq,int L,struct plan7_s * hmm,struct p7trace_s ** ret_tr)836 P7ParsingViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr)
837 {
838 struct dpmatrix_s *mx; /* two rows of score matrix */
839 struct dpmatrix_s *tmx; /* two rows of misused score matrix: traceback ptrs */
840 struct p7trace_s *tr; /* RETURN: collapsed traceback */
841 int **xmx, **mmx, **dmx, **imx; /* convenience ptrs to score matrix */
842 int **xtr, **mtr, **dtr, **itr; /* convenience ptrs to traceback pointers */
843 int *btr, *etr; /* O(L) trace ptrs for B, E state pts in seq */
844 int sc; /* integer score of optimal alignment */
845 int i,k,tpos; /* index for seq, model, trace position */
846 int cur, prv; /* indices for rolling dp matrix */
847 int curralloc; /* size of allocation for tr */
848
849
850 /* Alloc a DP matrix and traceback pointers, two rows each, O(M).
851 * Alloc two O(L) arrays to trace back through the sequence thru B and E.
852 */
853 mx = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
854 tmx = AllocPlan7Matrix(2, hmm->M, &xtr, &mtr, &itr, &dtr);
855 btr = MallocOrDie(sizeof(int) * (L+1));
856 etr = MallocOrDie(sizeof(int) * (L+1));
857
858 /* Initialization of the zero row.
859 */
860 xmx[0][XMN] = 0; /* S->N, p=1 */
861 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
862 btr[0] = 0;
863 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
864 etr[0] = -1;
865 for (k = 0; k <= hmm->M; k++)
866 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
867
868 /* Recursion. Done as a pull. Rolling index trick. Trace ptr propagation trick.
869 * Note some slightly wasteful boundary conditions:
870 * tsc[0] = -INFTY for all eight transitions (no node 0)
871 * D_M and I_M are wastefully calculated (they don't exist)
872 *
873 * Notes on traceback pointer propagation.
874 * - In the path B->E, we propagate the i that B was aligned to in the optimal
875 * alignment, via mtr, dtr, and itr.
876 * - When we reach an E, we record the i of the B it started from in etr.
877 * - In a looping path E->J...->B or terminal path E->C...->T, we propagate
878 * the i that E was aligned to in the optimal alignment via xtr[][XMC]
879 * and xtr[][XMJ].
880 * - When we enter B, we record the i of the best previous E, or 0 if there
881 * isn't one, in btr.
882 */
883 for (i = 1; i <= L; i++) {
884 cur = i % 2;
885 prv = !cur;
886
887 mmx[cur][0] = imx[cur][0] = dmx[cur][0] = -INFTY;
888
889 for (k = 1; k <= hmm->M; k++) {
890 /* match state */
891 mmx[cur][k] = -INFTY;
892 if ((sc = mmx[prv][k-1] + hmm->tsc[k-1][TMM]) > -INFTY)
893 { mmx[cur][k] = sc; mtr[cur][k] = mtr[prv][k-1]; }
894 if ((sc = imx[prv][k-1] + hmm->tsc[k-1][TIM]) > mmx[cur][k])
895 { mmx[cur][k] = sc; mtr[cur][k] = itr[prv][k-1]; }
896 if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k])
897 { mmx[cur][k] = sc; mtr[cur][k] = i-1; }
898 if ((sc = dmx[prv][k-1] + hmm->tsc[k-1][TDM]) > mmx[cur][k])
899 { mmx[cur][k] = sc; mtr[cur][k] = dtr[prv][k-1]; }
900 if (hmm->msc[(int) dsq[i]][k] != -INFTY)
901 mmx[cur][k] += hmm->msc[(int) dsq[i]][k];
902 else
903 mmx[cur][k] = -INFTY;
904
905 /* delete state */
906 dmx[cur][k] = -INFTY;
907 if ((sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > -INFTY)
908 { dmx[cur][k] = sc; dtr[cur][k] = mtr[cur][k-1]; }
909 if ((sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k])
910 { dmx[cur][k] = sc; dtr[cur][k] = dtr[cur][k-1]; }
911
912 /* insert state */
913 if (k < hmm->M) {
914 imx[cur][k] = -INFTY;
915 if ((sc = mmx[prv][k] + hmm->tsc[k][TMI]) > -INFTY)
916 { imx[cur][k] = sc; itr[cur][k] = mtr[prv][k]; }
917 if ((sc = imx[prv][k] + hmm->tsc[k][TII]) > imx[cur][k])
918 { imx[cur][k] = sc; itr[cur][k] = itr[prv][k]; }
919 if (hmm->isc[(int) dsq[i]][k] != -INFTY)
920 imx[cur][k] += hmm->isc[(int) dsq[i]][k];
921 else
922 imx[cur][k] = -INFTY;
923 }
924 }
925
926 /* Now the special states. Order is important here.
927 * remember, C and J emissions are zero score by definition,
928 */
929 /* N state */
930 xmx[cur][XMN] = -INFTY;
931 if ((sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
932 xmx[cur][XMN] = sc;
933 /* E state */
934 xmx[cur][XME] = -INFTY;
935 for (k = 1; k <= hmm->M; k++)
936 if ((sc = mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME])
937 { xmx[cur][XME] = sc; etr[i] = mtr[cur][k]; }
938 /* J state */
939 xmx[cur][XMJ] = -INFTY;
940 if ((sc = xmx[prv][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
941 { xmx[cur][XMJ] = sc; xtr[cur][XMJ] = xtr[prv][XMJ]; }
942 if ((sc = xmx[cur][XME] + hmm->xsc[XTE][LOOP]) > xmx[cur][XMJ])
943 { xmx[cur][XMJ] = sc; xtr[cur][XMJ] = i; }
944 /* B state */
945 xmx[cur][XMB] = -INFTY;
946 if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
947 { xmx[cur][XMB] = sc; btr[i] = 0; }
948 if ((sc = xmx[cur][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[cur][XMB])
949 { xmx[cur][XMB] = sc; btr[i] = xtr[cur][XMJ]; }
950 /* C state */
951 xmx[cur][XMC] = -INFTY;
952 if ((sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
953 { xmx[cur][XMC] = sc; xtr[cur][XMC] = xtr[prv][XMC]; }
954 if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC])
955 { xmx[cur][XMC] = sc; xtr[cur][XMC] = i; }
956 }
957 /* T state (not stored) */
958 sc = xmx[cur][XMC] + hmm->xsc[XTC][MOVE];
959
960 /*****************************************************************
961 * Collapsed traceback stage.
962 * xtr[L%2][XMC] contains the position j of the previous E
963 * etr[j] contains the position i of the previous B
964 * btr[i] contains the position j of the previous E, or 0
965 * continue until btr[i] = 0.
966 *****************************************************************/
967
968 curralloc = 2; /* minimum: no hits */
969 P7AllocTrace(curralloc, &tr);
970
971 /* Init of collapsed trace. Back to front; we ReverseTrace() later.
972 */
973 tpos = 0;
974 tr->statetype[tpos] = STT;
975 tr->pos[tpos] = 0;
976 i = xtr[L%2][XMC];
977 while (i > 0)
978 {
979 curralloc += 2;
980 P7ReallocTrace(tr, curralloc);
981
982 tpos++;
983 tr->statetype[tpos] = STE;
984 tr->pos[tpos] = i;
985 i = etr[i];
986
987 tpos++;
988 tr->statetype[tpos] = STB;
989 tr->pos[tpos] = i;
990 i = btr[i];
991 }
992
993 tpos++;
994 tr->statetype[tpos] = STS;
995 tr->pos[tpos] = 0;
996 tr->tlen = tpos + 1;
997 P7ReverseTrace(tr);
998
999 FreePlan7Matrix(mx);
1000 FreePlan7Matrix(tmx);
1001 free(btr);
1002 free(etr);
1003
1004 *ret_tr = tr;
1005 return Scorify(sc);
1006 }
1007
1008 /* Function: P7WeeViterbi()
1009 * Date: SRE, Wed Mar 4 08:24:04 1998 [St. Louis]
1010 *
1011 * Purpose: Hirschberg/Myers/Miller linear memory alignment.
1012 * See [Hirschberg75,MyM-88a] for the idea of the algorithm.
1013 * Adapted to HMM implementation.
1014 *
1015 * Requires that you /know/ that there's only
1016 * one hit to the model in the sequence: either
1017 * because you're forcing single-hit, or you've
1018 * previously called P7ParsingViterbi to parse
1019 * the sequence into single-hit segments. The reason
1020 * for this is that a cyclic model (a la Plan7)
1021 * defeats the nice divide and conquer trick.
1022 * (I think some trickery with propagated trace pointers
1023 * could get around this but haven't explored it.)
1024 * This is implemented by ignoring transitions
1025 * to/from J state.
1026 *
1027 * Args: dsq - sequence in digitized form
1028 * L - length of dsq
1029 * hmm - the model
1030 * ret_tr - RETURN: traceback.
1031 *
1032 * Returns: Score of the optimal Viterbi alignment.
1033 */
1034 float
P7WeeViterbi(char * dsq,int L,struct plan7_s * hmm,struct p7trace_s ** ret_tr)1035 P7WeeViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr)
1036 {
1037 struct p7trace_s *tr; /* RETURN: traceback */
1038 int *kassign; /* 0..L+1, alignment of seq positions to model nodes */
1039 enum p7stype *tassign; /* 0..L+1, alignment of seq positions to state types */
1040 int *endlist; /* stack of end points on sequence to work on */
1041 int *startlist; /* stack of start points on sequence to work on */
1042 int lpos; /* position in endlist, startlist */
1043 int k1, k2, k3; /* start, mid, end in model */
1044 enum p7stype t1, t2, t3; /* start, mid, end in state type */
1045 int s1, s2, s3; /* start, mid, end in sequence */
1046 float sc; /* score of segment optimal alignment */
1047 float ret_sc; /* optimal score over complete seq */
1048 int tlen; /* length needed for trace */
1049 int i, k, tpos; /* index in sequence, model, trace */
1050
1051
1052 /* Initialize.
1053 */
1054 kassign = MallocOrDie (sizeof(int) * (L+1));
1055 tassign = MallocOrDie (sizeof(enum p7stype) * (L+1));
1056 endlist = MallocOrDie (sizeof(int) * (L+1));
1057 startlist = MallocOrDie (sizeof(int) * (L+1));
1058
1059 lpos = 0;
1060 startlist[lpos] = 1;
1061 endlist[lpos] = L;
1062 kassign[1] = 1;
1063 kassign[L] = hmm->M;
1064 tassign[1] = STS;
1065 tassign[L] = STT;
1066
1067 /* Recursive divide-and-conquer alignment.
1068 */
1069 while (lpos >= 0)
1070 {
1071 /* Pop a segment off the stack */
1072 s1 = startlist[lpos];
1073 k1 = kassign[s1];
1074 t1 = tassign[s1];
1075 s3 = endlist[lpos];
1076 k3 = kassign[s3];
1077 t3 = tassign[s3];
1078 lpos--;
1079 /* find optimal midpoint of segment */
1080 sc = get_wee_midpt(hmm, dsq, L, k1, t1, s1, k3, t3, s3, &k2, &t2, &s2);
1081 kassign[s2] = k2;
1082 tassign[s2] = t2;
1083 /* score is valid on first pass */
1084 if (t1 == STS && t3 == STT) ret_sc = sc;
1085
1086 /* push N-terminal segment on stack */
1087 if (t2 != STN && (s2 - s1 > 1 || (s2 - s1 == 1 && t1 == STS)))
1088 {
1089 lpos++;
1090 startlist[lpos] = s1;
1091 endlist[lpos] = s2;
1092 }
1093 /* push C-terminal segment on stack */
1094 if (t2 != STC && (s3 - s2 > 1 || (s3 - s2 == 1 && t3 == STT)))
1095 {
1096 lpos++;
1097 startlist[lpos] = s2;
1098 endlist[lpos] = s3;
1099 }
1100
1101 if (t2 == STN)
1102 { /* if we see STN midpoint, we know the whole N-term is STN */
1103 for (; s2 >= s1; s2--) {
1104 kassign[s2] = 1;
1105 tassign[s2] = STN;
1106 }
1107 }
1108 if (t2 == STC)
1109 { /* if we see STC midpoint, we know whole C-term is STC */
1110 for (; s2 <= s3; s2++) {
1111 kassign[s2] = hmm->M;
1112 tassign[s2] = STC;
1113 }
1114 }
1115 }
1116
1117 /*****************************************************************
1118 * Construct a traceback structure from kassign/tassign by interpolating
1119 * necessary states.
1120 * Trace allocation is as follows. We clearly need L emitting states.
1121 * We also need nonemitting states as follows:
1122 * STS,STN,STB,STE,STC,STT = 6
1123 * STD: count k2-k1-1 in kassign M->M's
1124 * Also, count N->M's and M->C's (potential wing unfoldings),
1125 *****************************************************************/
1126
1127 tlen = L + 6;
1128 for (i = 1; i < L; i++)
1129 {
1130 if (tassign[i] == STM && tassign[i+1] == STM)
1131 tlen += kassign[i+1] - kassign[i] - 1;
1132 if (tassign[i] == STN && tassign[i+1] == STM)
1133 tlen += kassign[i+1] - 1;
1134 if (tassign[i] == STM && tassign[i+1] == STC)
1135 tlen += hmm->M - kassign[i];
1136 }
1137 P7AllocTrace(tlen, &tr);
1138
1139 tr->statetype[0] = STS;
1140 tr->nodeidx[0] = 0;
1141 tr->pos[0] = 0;
1142 tr->statetype[1] = STN;
1143 tr->nodeidx[1] = 0;
1144 tr->pos[1] = 0;
1145 tpos = 2;
1146
1147 for (i = 1; i <= L; i++)
1148 {
1149 switch(tassign[i]) {
1150 case STM:
1151 /* check for first match state */
1152 if (tr->statetype[tpos-1] == STN) {
1153 tr->statetype[tpos] = STB;
1154 tr->nodeidx[tpos] = 0;
1155 tr->pos[tpos] = 0;
1156 tpos++;
1157 /* check for wing unfolding */
1158 if (Prob2Score(hmm->begin[kassign[i]], hmm->p1) + INTSCALE <= hmm->bsc[kassign[i]])
1159 for (k = 1; k < kassign[i]; k++) {
1160 tr->statetype[tpos] = STD;
1161 tr->nodeidx[tpos] = k;
1162 tr->pos[tpos] = 0;
1163 tpos++;
1164 }
1165 }
1166 /* do the match state itself */
1167 tr->statetype[tpos] = STM;
1168 tr->nodeidx[tpos] = kassign[i];
1169 tr->pos[tpos] = i;
1170 tpos++;
1171 /* do any deletes necessary 'til next match */
1172 if (i < L && tassign[i+1] == STM && kassign[i+1] - kassign[i] > 1)
1173 for (k = kassign[i] + 1; k < kassign[i+1]; k++)
1174 {
1175 tr->statetype[tpos] = STD;
1176 tr->nodeidx[tpos] = k;
1177 tr->pos[tpos] = 0;
1178 tpos++;
1179 }
1180 /* check for last match state */
1181 if (i == L || tassign[i+1] == STC) {
1182 /* check for wing unfolding */
1183 if (Prob2Score(hmm->end[kassign[i-1]], 1.) + INTSCALE <= hmm->esc[kassign[i-1]])
1184 for (k = kassign[i]+1; k <= hmm->M; k++)
1185 {
1186 tr->statetype[tpos] = STD;
1187 tr->nodeidx[tpos] = k;
1188 tr->pos[tpos] = 0;
1189 tpos++;
1190 }
1191 /* add on the end state */
1192 tr->statetype[tpos] = STE;
1193 tr->nodeidx[tpos] = 0;
1194 tr->pos[tpos] = 0;
1195 tpos++;
1196 /* and a nonemitting C state */
1197 tr->statetype[tpos] = STC;
1198 tr->nodeidx[tpos] = 0;
1199 tr->pos[tpos] = 0;
1200 tpos++;
1201 }
1202 break;
1203
1204 case STI:
1205 tr->statetype[tpos] = STI;
1206 tr->nodeidx[tpos] = kassign[i];
1207 tr->pos[tpos] = i;
1208 tpos++;
1209 break;
1210
1211 case STN:
1212 tr->statetype[tpos] = STN;
1213 tr->nodeidx[tpos] = 0;
1214 tr->pos[tpos] = i;
1215 tpos++;
1216 break;
1217
1218 case STC:
1219 tr->statetype[tpos] = STC;
1220 tr->nodeidx[tpos] = 0;
1221 tr->pos[tpos] = i;
1222 tpos++;
1223 break;
1224
1225 default: Die("Bogus state %s", Statetype(tassign[i]));
1226 }
1227 }
1228 /* terminate the trace */
1229 tr->statetype[tpos] = STT;
1230 tr->nodeidx[tpos] = 0;
1231 tr->pos[tpos] = 0;
1232 tr->tlen = tpos+1;
1233
1234 *ret_tr = tr;
1235
1236 free(kassign);
1237 free(tassign);
1238 free(startlist);
1239 free(endlist);
1240 return ret_sc;
1241 }
1242
1243
1244 /* Function: Plan7ESTViterbi()
1245 *
1246 * Purpose: Frameshift-tolerant alignment of protein model to cDNA EST.
1247 *
1248 *
1249 */
1250 float
Plan7ESTViterbi(char * dsq,int L,struct plan7_s * hmm,struct dpmatrix_s ** ret_mx)1251 Plan7ESTViterbi(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx)
1252 {
1253 struct dpmatrix_s *mx;
1254 int **xmx;
1255 int **mmx;
1256 int **imx;
1257 int **dmx;
1258 int i,k;
1259 int sc;
1260 int codon;
1261
1262 /* Allocate a DP matrix with 0..L rows, 0..M+1 columns.
1263 */
1264 mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx);
1265
1266 /* Initialization of the zero row (DNA sequence of length 0)
1267 * Note that xmx[i][stN] = 0 by definition for all i,
1268 * and xmx[i][stT] = xmx[i][stC], so neither stN nor stT need
1269 * to be calculated in DP matrices.
1270 */
1271 xmx[0][XMN] = 0; /* S->N, p=1 */
1272 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
1273 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
1274 for (k = 0; k <= hmm->M; k++)
1275 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
1276
1277 /* Initialization of the first row (DNA sequence of length 1)
1278 * only N can make this nucleotide.
1279 */
1280 xmx[1][XMN] = xmx[0][XMN] + hmm->xsc[XTN][LOOP];
1281 xmx[1][XMB] = xmx[1][XMN] + hmm->xsc[XTN][MOVE];
1282 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need 2 nt to get here */
1283 for (k = 0; k <= hmm->M; k++)
1284 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need 2 nt to get into model */
1285
1286 /* Recursion. Done as a pull.
1287 * Note some slightly wasteful boundary conditions:
1288 * tsc[0] = -INFTY for all eight transitions (no node 0)
1289 * D_M and I_M are wastefully calculated (they don't exist)
1290 */
1291 for (i = 2; i <= L; i++) {
1292 mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;
1293
1294 /* crude calculation of lookup value for codon */
1295 if (i > 2) {
1296 if (dsq[i-2] < 4 && dsq[i-1] < 4 && dsq[i] < 4)
1297 codon = dsq[i-2] * 16 + dsq[i-1] * 4 + dsq[i];
1298 else
1299 codon = 64; /* ambiguous codon; punt */
1300 }
1301
1302 for (k = 1; k <= hmm->M; k++) {
1303 /* match state */
1304 if (i > 2) {
1305 mmx[i][k] = mmx[i-3][k-1] + hmm->tsc[k-1][TMM];
1306 if ((sc = imx[i-3][k-1] + hmm->tsc[k-1][TIM]) > mmx[i][k])
1307 mmx[i][k] = sc;
1308 if ((sc = xmx[i-3][XMB] + hmm->bsc[k]) > mmx[i][k])
1309 mmx[i][k] = sc;
1310 if ((sc = dmx[i-3][k-1] + hmm->tsc[k-1][TDM]) > mmx[i][k])
1311 mmx[i][k] = sc;
1312 mmx[i][k] += hmm->dnam[codon][k];
1313 }
1314 /* -1 frameshifts into match state */
1315 if ((sc = mmx[i-2][k-1] + hmm->tsc[k-1][TMM] + hmm->dna2) > mmx[i][k])
1316 mmx[i][k] = sc;
1317 if ((sc = imx[i-2][k-1] + hmm->tsc[k-1][TIM] + hmm->dna2) > mmx[i][k])
1318 mmx[i][k] = sc;
1319 if ((sc = xmx[i-2][XMB] + hmm->bsc[k] + hmm->dna2) > mmx[i][k])
1320 mmx[i][k] = sc;
1321 if ((sc = dmx[i-2][k-1] + hmm->tsc[k-1][TDM] + hmm->dna2) > mmx[i][k])
1322 mmx[i][k] = sc;
1323
1324 /* +1 frameshifts into match state */
1325 if (i > 3) {
1326 if ((sc = mmx[i-4][k-1] + hmm->tsc[k-1][TMM] + hmm->dna4) > mmx[i][k])
1327 mmx[i][k] = sc;
1328 if ((sc = imx[i-4][k-1] + hmm->tsc[k-1][TIM] + hmm->dna4) > mmx[i][k])
1329 mmx[i][k] = sc;
1330 if ((sc = xmx[i-4][XMB] + hmm->bsc[k] + hmm->dna4) > mmx[i][k])
1331 mmx[i][k] = sc;
1332 if ((sc = dmx[i-4][k-1] + hmm->tsc[k-1][TDM] + hmm->dna4) > mmx[i][k])
1333 mmx[i][k] = sc;
1334 }
1335 /* delete state */
1336 dmx[i][k] = mmx[i][k-1] + hmm->tsc[k-1][TMD];
1337 if ((sc = dmx[i][k-1] + hmm->tsc[k-1][TDD]) > dmx[i][k])
1338 dmx[i][k] = sc;
1339
1340 /* insert state */
1341 if (i > 2) {
1342 imx[i][k] = mmx[i-3][k] + hmm->tsc[k][TMI];
1343 if ((sc = imx[i-3][k] + hmm->tsc[k][TII]) > imx[i][k])
1344 imx[i][k] = sc;
1345 imx[i][k] += hmm->dnai[codon][k];
1346 }
1347
1348 /* -1 frameshifts into insert state */
1349 if ((sc = mmx[i-2][k] + hmm->tsc[k][TMI] + hmm->dna2) > imx[i][k])
1350 imx[i][k] = sc;
1351 if ((sc = imx[i-2][k] + hmm->tsc[k][TII] + hmm->dna2) > imx[i][k])
1352 imx[i][k] = sc;
1353
1354 /* +1 frameshifts into insert state */
1355 if (i > 4) {
1356 if ((sc = mmx[i-4][k] + hmm->tsc[k][TMI] + hmm->dna4) > imx[i][k])
1357 imx[i][k] = sc;
1358 if ((sc = imx[i-4][k] + hmm->tsc[k][TII] + hmm->dna4) > imx[i][k])
1359 imx[i][k] = sc;
1360 }
1361 }
1362 /* Now the special states. Order is important here.
1363 * remember, C and J emissions are zero score by definition,
1364 */
1365 /* N state: +1 nucleotide */
1366 xmx[i][XMN] = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP];
1367 /* E state: collect from M's, and last D */
1368 xmx[i][XME] = dmx[i][hmm->M]; /* transition prob from last D = 1.0 */
1369 for (k = 1; k <= hmm->M; k++)
1370 if ((sc = mmx[i][k] + hmm->esc[k]) > xmx[i][XME])
1371 xmx[i][XME] = sc;
1372 /* J state: +1 nucleotide */
1373 xmx[i][XMJ] = xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP];
1374 if ((sc = xmx[i][XME] + hmm->xsc[XTE][LOOP]) > xmx[i][XMJ])
1375 xmx[i][XMJ] = sc;
1376 /* B state: collect from N,J */
1377 xmx[i][XMB] = xmx[i][XMN] + hmm->xsc[XTN][MOVE];
1378 if ((sc = xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[i][XMB])
1379 xmx[i][XMB] = sc;
1380 /* C state: +1 nucleotide */
1381 xmx[i][XMC] = xmx[i-1][XMC] + hmm->xsc[XTC][LOOP];
1382 if ((sc = xmx[i][XME] + hmm->xsc[XTE][MOVE]) > xmx[i][XMC])
1383 xmx[i][XMC] = sc;
1384 }
1385
1386 sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];
1387
1388 if (ret_mx != NULL) *ret_mx = mx;
1389 else FreePlan7Matrix(mx);
1390
1391 return Scorify(sc); /* the total Viterbi score. */
1392 }
1393
1394
1395
1396 /* Function: get_wee_midpt()
1397 * Date: SRE, Wed Mar 4 08:27:11 1998 [St. Louis]
1398 *
1399 * Purpose: The heart of the divide and conquer algorithm
1400 * for P7WeeViterbi(). This function is called
1401 * recursively to find successive optimal midpoints
1402 * in the alignment matrix. See P7WeeViterbi() for
1403 * further comments on the assumptions of this algorithm.
1404 *
1405 * Args: hmm - the model, set up for integer scores
1406 * dsq - the sequence, digitized
1407 * L - length of the sequence
1408 * k1 - model node to start with, 1..M
1409 * t1 - state type to start with, STM | STI | STN | STC; STS to start
1410 * s1 - sequence position to start with, 1..L; 1 to start
1411 * k3 - model node to end with, 1..M
1412 * t3 - state type to end with, STM | STI | STN | STC; STT to start
1413 * s3 - sequence position to end with, 1..L; L to start
1414 * ret_k2 - RETURN: optimal midpoint, node position in model
1415 * ret_t2 - RETURN: optimal midpoint, state type
1416 * ret_s2 - RETURN: optimal midpoint, sequence position
1417 *
1418 * Returns: score of optimal alignment, in bits.
1419 */
1420 static float
get_wee_midpt(struct plan7_s * hmm,char * dsq,int L,int k1,enum p7stype t1,int s1,int k3,enum p7stype t3,int s3,int * ret_k2,enum p7stype * ret_t2,int * ret_s2)1421 get_wee_midpt(struct plan7_s *hmm, char *dsq, int L,
1422 int k1, enum p7stype t1, int s1,
1423 int k3, enum p7stype t3, int s3,
1424 int *ret_k2, enum p7stype *ret_t2, int *ret_s2)
1425 {
1426 struct dpmatrix_s *fwd;
1427 struct dpmatrix_s *bck;
1428 int **xmx; /* convenience ptr into special states */
1429 int **mmx; /* convenience ptr into match states */
1430 int **imx; /* convenience ptr into insert states */
1431 int **dmx; /* convenience ptr into delete states */
1432 int k2;
1433 enum p7stype t2;
1434 int s2;
1435 int cur, prv, nxt; /* current, previous, next row index (0 or 1)*/
1436 int i,k; /* indices for seq, model */
1437 int sc; /* integer score */
1438 int max; /* maximum integer score */
1439 int start; /* s1 to start at (need, for STS special case) */
1440
1441
1442 /* Choose our midpoint.
1443 * Special cases: s1, s3 adjacent and t1 == STS: s2 = s1
1444 * s1, s3 adjacent and t3 == STT: s2 = s3
1445 * (where we must replace STS, STT eventually)
1446 */
1447 s2 = s1 + (s3-s1) / 2;
1448 if (s3-s1 == 1 && t1 == STS) s2 = s1;
1449 if (s3-s1 == 1 && t3 == STT) s2 = s3;
1450
1451 /* STS is a special case. STS aligns to row zero by convention,
1452 * but we'll be passed s1=1, t1=STS. We have to init on row
1453 * zero then start DP on row 1.
1454 */
1455 start = (t1 == STS) ? 0 : s1;
1456
1457 /* Allocate our forward two rows.
1458 * Initialize row zero.
1459 */
1460 fwd = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
1461 cur = start%2;
1462 xmx[cur][XMN] = xmx[cur][XMB] = -INFTY;
1463 xmx[cur][XME] = xmx[cur][XMC] = -INFTY;
1464 for (k = k1; k <= k3; k++)
1465 mmx[cur][k] = imx[cur][k] = dmx[cur][k] = -INFTY;
1466
1467 /* Where to put our zero for our start point...
1468 * (only possible to start on an emitting state; J disallowed)
1469 */
1470 switch (t1) {
1471 case STM: mmx[cur][k1] = 0; break;
1472 case STI: imx[cur][k1] = 0; break;
1473 case STN: xmx[cur][XMN] = 0; break;
1474 case STC: xmx[cur][XMC] = 0; break;
1475 case STS: xmx[cur][XMN] = 0; break;
1476 default: Die("you can't init get_wee_midpt with a %s\n", Statetype(t1));
1477 }
1478
1479 /* Still initializing.
1480 * Deal with pulling horizontal matrix moves in initial row.
1481 * These are any transitions to nonemitters:
1482 * STM-> E, D
1483 * STI-> none
1484 * STN-> B
1485 * STC-> (T, but we never observe this in the forward pass of a d&c)
1486 * STE-> C
1487 * STS-> (N, already implied by setting xmx[cur][XMN] = 0)
1488 * STB-> M
1489 */
1490 if (t1 == STM)
1491 {
1492 for (k = k1+1; k <= k3; k++)
1493 { /* transits into STD */
1494 dmx[cur][k] = -INFTY;
1495 if ((sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > -INFTY)
1496 dmx[cur][k] = sc;
1497 if ((sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k])
1498 dmx[cur][k] = sc;
1499 }
1500 /* transit into STE */
1501 xmx[cur][XME] = -INFTY;
1502 if ((sc = mmx[cur][k1] + hmm->esc[k1]) > -INFTY)
1503 xmx[cur][XME] = sc;
1504 }
1505 /* transit into STB from STN */
1506 xmx[cur][XMB] = -INFTY;
1507 if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
1508 xmx[cur][XMB] = sc;
1509 /* transit into STC from STE */
1510 xmx[cur][XMC] = -INFTY;
1511 if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > -INFTY)
1512 xmx[cur][XMC] = sc;
1513
1514 /* Done initializing.
1515 * Start recursive DP; sweep forward to chosen s2 midpoint. Done as a pull.
1516 */
1517 for (i = start+1; i <= s2; i++) {
1518 cur = i % 2;
1519 prv = !cur;
1520
1521 mmx[cur][k1] = imx[cur][k1] = dmx[cur][k1] = -INFTY;
1522
1523 /* Insert state in column k1, and B->M transition in k1.
1524 */
1525 if (k1 < hmm->M) {
1526 imx[cur][k1] = -INFTY;
1527 if ((sc = mmx[prv][k1] + hmm->tsc[k1][TMI]) > -INFTY)
1528 imx[cur][k1] = sc;
1529 if ((sc = imx[prv][k1] + hmm->tsc[k1][TII]) > imx[cur][k1])
1530 imx[cur][k1] = sc;
1531 if (hmm->isc[(int) dsq[i]][k1] != -INFTY)
1532 imx[cur][k1] += hmm->isc[(int) dsq[i]][k1];
1533 else
1534 imx[cur][k1] = -INFTY;
1535 }
1536 if ((sc = xmx[prv][XMB] + hmm->bsc[k1]) > -INFTY)
1537 mmx[cur][k1] = sc;
1538 if (hmm->msc[(int) dsq[i]][k1] != -INFTY)
1539 mmx[cur][k1] += hmm->msc[(int) dsq[i]][k1];
1540 else
1541 mmx[cur][k1] = -INFTY;
1542
1543 /* Main chunk of recursion across model positions
1544 */
1545 for (k = k1+1; k <= k3; k++) {
1546 /* match state */
1547 mmx[cur][k] = -INFTY;
1548 if ((sc = mmx[prv][k-1] + hmm->tsc[k-1][TMM]) > -INFTY)
1549 mmx[cur][k] = sc;
1550 if ((sc = imx[prv][k-1] + hmm->tsc[k-1][TIM]) > mmx[cur][k])
1551 mmx[cur][k] = sc;
1552 if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k])
1553 mmx[cur][k] = sc;
1554 if ((sc = dmx[prv][k-1] + hmm->tsc[k-1][TDM]) > mmx[cur][k])
1555 mmx[cur][k] = sc;
1556 if (hmm->msc[(int) dsq[i]][k] != -INFTY)
1557 mmx[cur][k] += hmm->msc[(int) dsq[i]][k];
1558 else
1559 mmx[cur][k] = -INFTY;
1560
1561 /* delete state */
1562 dmx[cur][k] = -INFTY;
1563 if (k < hmm->M) {
1564 if ((sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > -INFTY)
1565 dmx[cur][k] = sc;
1566 if ((sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k])
1567 dmx[cur][k] = sc;
1568 }
1569
1570 /* insert state */
1571 imx[cur][k] = -INFTY;
1572 if (k < hmm->M) {
1573 if ((sc = mmx[prv][k] + hmm->tsc[k][TMI]) > -INFTY)
1574 imx[cur][k] = sc;
1575 if ((sc = imx[prv][k] + hmm->tsc[k][TII]) > imx[cur][k])
1576 imx[cur][k] = sc;
1577 if (hmm->isc[(int) dsq[i]][k] != -INFTY)
1578 imx[cur][k] += hmm->isc[(int) dsq[i]][k];
1579 else
1580 imx[cur][k] = -INFTY;
1581 }
1582 }
1583 /* N state */
1584 xmx[cur][XMN] = -INFTY;
1585 if ((sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
1586 xmx[cur][XMN] = sc;
1587 /* E state */
1588 xmx[cur][XME] = -INFTY;
1589 for (k = k1; k <= k3 && k <= hmm->M; k++)
1590 if ((sc = mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME])
1591 xmx[cur][XME] = sc;
1592 /* B state */
1593 xmx[cur][XMB] = -INFTY;
1594 if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
1595 xmx[cur][XMB] = sc;
1596 /* C state */
1597 xmx[cur][XMC] = -INFTY;
1598 if ((sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
1599 xmx[cur][XMC] = sc;
1600 if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC])
1601 xmx[cur][XMC] = sc;
1602 }
1603
1604 /* Row s2%2 in fwd matrix now contains valid scores from s1 (start) to s2,
1605 * with J transitions disallowed (no cycles through model).
1606 */
1607
1608 /*****************************************************************
1609 * Backwards pass.
1610 *****************************************************************/
1611
1612 /* Allocate our backwards two rows. Init last row.
1613 */
1614 bck = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
1615 nxt = s3%2;
1616 xmx[nxt][XMN] = xmx[nxt][XMB] = -INFTY;
1617 xmx[nxt][XME] = xmx[nxt][XMC] = -INFTY;
1618 for (k = k1; k <= k3 + 1; k++)
1619 mmx[nxt][k] = imx[nxt][k] = dmx[nxt][k] = -INFTY;
1620 cur = !nxt;
1621 mmx[cur][k3+1] = imx[cur][k3+1] = dmx[cur][k3+1] = -INFTY;
1622
1623 /* Where to put the zero for our end point on last row.
1624 */
1625 switch (t3) {
1626 case STM: mmx[nxt][k3] = 0; break;
1627 case STI: imx[nxt][k3] = 0; break;
1628 case STN: xmx[nxt][XMN] = 0; break;
1629 case STC: xmx[nxt][XMC] = 0; break; /* must be an emitting C */
1630 case STT: xmx[nxt][XMC] = hmm->xsc[XTC][MOVE]; break; /* C->T implied */
1631 default: Die("you can't init get_wee_midpt with a %s\n", Statetype(t3));
1632 }
1633
1634 /* Still initializing.
1635 * In the case t3==STT, there are a few horizontal moves possible
1636 * on row s3, because STT isn't an emitter. All other states are
1637 * emitters, so their connections have to be to the previous row s3-1.
1638 */
1639 if (t3 == STT)
1640 { /* E->C */
1641 xmx[nxt][XME] = xmx[nxt][XMC] + hmm->xsc[XTE][MOVE];
1642 /* M->E */
1643 for (k = k3; k >= k1; k--) {
1644 mmx[nxt][k] = xmx[nxt][XME] + hmm->esc[k];
1645 if (s3 != s2)
1646 mmx[nxt][k] += hmm->msc[(int)dsq[s3]][k];
1647 }
1648 }
1649
1650 /* Start recursive DP; sweep backwards to chosen s2 midpoint.
1651 * Done as a pull. M, I scores at current row do /not/ include
1652 * emission scores. Be careful of integer underflow.
1653 */
1654 for (i = s3-1; i >= s2; i--) {
1655 /* note i < L, so i+1 is always a legal index */
1656 cur = i%2;
1657 nxt = !cur;
1658 /* C pulls from C (T is special cased) */
1659 xmx[cur][XMC] = -INFTY;
1660 if ((sc = xmx[nxt][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
1661 xmx[cur][XMC] = sc;
1662 /* B pulls from M's */
1663 xmx[cur][XMB] = -INFTY;
1664 for (k = k1; k <= k3; k++)
1665 if ((sc = mmx[nxt][k] + hmm->bsc[k]) > xmx[cur][XMB])
1666 xmx[cur][XMB] = sc;
1667 /* E pulls from C (J disallowed) */
1668 xmx[cur][XME] = -INFTY;
1669 if ((sc = xmx[cur][XMC] + hmm->xsc[XTE][MOVE]) > -INFTY)
1670 xmx[cur][XME] = sc;
1671 /* N pulls from B, N */
1672 xmx[cur][XMN] = -INFTY;
1673 if ((sc = xmx[cur][XMB] + hmm->xsc[XTN][MOVE]) > -INFTY)
1674 xmx[cur][XMN] = sc;
1675 if ((sc = xmx[nxt][XMN] + hmm->xsc[XTN][LOOP]) > xmx[cur][XMN])
1676 xmx[cur][XMN] = sc;
1677
1678 /* Main recursion across model
1679 */
1680 for (k = k3; k >= k1; k--) {
1681 /* special case k == M */
1682 if (k == hmm->M) {
1683 mmx[cur][k] = xmx[cur][XME]; /* p=1 transition to E by definition */
1684 dmx[cur][k] = -INFTY; /* doesn't exist */
1685 imx[cur][k] = -INFTY; /* doesn't exist */
1686 if (i != s2)
1687 mmx[cur][k] += hmm->msc[(int)dsq[i]][k];
1688 continue;
1689 } /* below this k < M, so k+1 is a legal index */
1690
1691 /* pull into match state */
1692 mmx[cur][k] = -INFTY;
1693 if ((sc = xmx[cur][XME] + hmm->esc[k]) > -INFTY)
1694 mmx[cur][k] = sc;
1695 if ((sc = mmx[nxt][k+1] + hmm->tsc[k][TMM]) > mmx[cur][k])
1696 mmx[cur][k] = sc;
1697 if ((sc = imx[nxt][k] + hmm->tsc[k][TMI]) > mmx[cur][k])
1698 mmx[cur][k] = sc;
1699 if ((sc = dmx[cur][k+1] + hmm->tsc[k][TMD]) > mmx[cur][k])
1700 mmx[cur][k] = sc;
1701 if (i != s2)
1702 mmx[cur][k] += hmm->msc[(int)dsq[i]][k];
1703
1704 /* pull into delete state */
1705 dmx[cur][k] = -INFTY;
1706 if ((sc = mmx[nxt][k+1] + hmm->tsc[k][TDM]) > -INFTY)
1707 dmx[cur][k] = sc;
1708 if ((sc = dmx[cur][k+1] + hmm->tsc[k][TDD]) > dmx[cur][k])
1709 dmx[cur][k] = sc;
1710 /* pull into insert state */
1711 imx[cur][k] = -INFTY;
1712 if ((sc = mmx[nxt][k+1] + hmm->tsc[k][TIM]) > -INFTY)
1713 imx[cur][k] = sc;
1714 if ((sc = imx[nxt][k] + hmm->tsc[k][TII]) > imx[cur][k])
1715 imx[cur][k] = sc;
1716 if (i != s2)
1717 imx[cur][k] += hmm->isc[(int)dsq[i]][k];
1718
1719 }
1720 }
1721
1722 /*****************************************************************
1723 * DP complete; we have both forward and backward passes. Now we
1724 * look across the s2 row and find the optimal emitting state.
1725 *****************************************************************/
1726
1727 cur = s2%2;
1728 max = -INFTY;
1729 for (k = k1; k <= k3; k++)
1730 {
1731 if ((sc = fwd->mmx[cur][k] + bck->mmx[cur][k]) > max)
1732 { k2 = k; t2 = STM; max = sc; }
1733 if ((sc = fwd->imx[cur][k] + bck->imx[cur][k]) > max)
1734 { k2 = k; t2 = STI; max = sc; }
1735 }
1736 if ((sc = fwd->xmx[cur][XMN] + bck->xmx[cur][XMN]) > max)
1737 { k2 = 1; t2 = STN; max = sc; }
1738 if ((sc = fwd->xmx[cur][XMC] + bck->xmx[cur][XMC]) > max)
1739 { k2 = hmm->M; t2 = STC; max = sc; }
1740
1741 /*****************************************************************
1742 * Garbage collection, return.
1743 *****************************************************************/
1744
1745 FreePlan7Matrix(fwd);
1746 FreePlan7Matrix(bck);
1747 *ret_k2 = k2;
1748 *ret_t2 = t2;
1749 *ret_s2 = s2;
1750 return Scorify(max);
1751 }
1752