1 #include <U2Core/disable-warnings.h>
2 U2_DISABLE_WARNINGS
3
4 /************************************************************
5 * HMMER - Biological sequence analysis with profile HMMs
6 * Copyright (C) 1992-2003 Washington University School of Medicine
7 * All Rights Reserved
8 *
9 * This source code is distributed under the terms of the
10 * GNU General Public License. See the files COPYING and LICENSE
11 * for details.
12 ************************************************************/
13
14 /* core_algorithms.c
15 * SRE, Mon Nov 11 15:58:52 1996
16 * CVS $Id: core_algorithms.c,v 1.35 2003/10/03 18:26:37 eddy Exp $
17 *
18 * Simple and robust "research" implementations of Forward, Backward,
19 * and Viterbi for Plan7. For optimized replacements for some of these functions,
20 * see fast_algorithms.c.
21 */
22
23 #include "funcs.h"
24
25
26 static float get_wee_midpt(struct plan7_s *hmm, unsigned char *dsq, int L,
27 int k1, char t1, int s1,
28 int k3, char t3, int s3,
29 int *ret_k2, char *ret_t2, int *ret_s2);
30
31
32 /* Function: CreatePlan7Matrix()
33 *
34 * Purpose: Create a dynamic programming matrix for standard Forward,
35 * Backward, or Viterbi, with scores kept as scaled log-odds
36 * integers. Keeps 2D arrays compact in RAM in an attempt
37 * to maximize cache hits.
38 *
39 * The mx structure can be dynamically grown, if a new
40 * HMM or seq exceeds the currently allocated size. Dynamic
41 * growing is more efficient than an alloc/free of a whole
42 * matrix for every new target. The ResizePlan7Matrix()
43 * call does this reallocation, if needed. Here, in the
44 * creation step, we set up some pads - to inform the resizing
45 * call how much to overallocate when it realloc's.
46 *
47 * Args: N - N+1 rows are allocated, for sequence.
48 * M - size of model in nodes
49 * padN - over-realloc in seq/row dimension, or 0
50 * padM - over-realloc in HMM/column dimension, or 0
51 *
52 * Return: mx
53 * mx is allocated here. Caller frees with FreePlan7Matrix(mx).
54 */
55 #ifndef ALTIVEC
56 struct dpmatrix_s *
CreatePlan7Matrix(int N,int M,int padN,int padM)57 CreatePlan7Matrix(int N, int M, int padN, int padM)
58 {
59 struct dpmatrix_s *mx;
60 int i;
61
62 mx = (struct dpmatrix_s *) MallocOrDie (sizeof(struct dpmatrix_s));
63 mx->xmx = (int **) MallocOrDie (sizeof(int *) * (N+1));
64 mx->mmx = (int **) MallocOrDie (sizeof(int *) * (N+1));
65 mx->imx = (int **) MallocOrDie (sizeof(int *) * (N+1));
66 mx->dmx = (int **) MallocOrDie (sizeof(int *) * (N+1));
67 mx->xmx_mem = (void *) MallocOrDie (sizeof(int) * ((N+1)*5));
68 mx->mmx_mem = (void *) MallocOrDie (sizeof(int) * ((N+1)*(M+2)));
69 mx->imx_mem = (void *) MallocOrDie (sizeof(int) * ((N+1)*(M+2)));
70 mx->dmx_mem = (void *) MallocOrDie (sizeof(int) * ((N+1)*(M+2)));
71
72 /* The indirect assignment below looks wasteful; it's actually
73 * used for aligning data on 16-byte boundaries as a cache
74 * optimization in the fast altivec implementation
75 */
76 mx->xmx[0] = (int *) mx->xmx_mem;
77 mx->mmx[0] = (int *) mx->mmx_mem;
78 mx->imx[0] = (int *) mx->imx_mem;
79 mx->dmx[0] = (int *) mx->dmx_mem;
80 for (i = 1; i <= N; i++)
81 {
82 mx->xmx[i] = mx->xmx[0] + (i*5);
83 mx->mmx[i] = mx->mmx[0] + (i*(M+2));
84 mx->imx[i] = mx->imx[0] + (i*(M+2));
85 mx->dmx[i] = mx->dmx[0] + (i*(M+2));
86 }
87
88 mx->maxN = N;
89 mx->maxM = M;
90 mx->padN = padN;
91 mx->padM = padM;
92
93 return mx;
94 }
95
96 #endif //ALTIVEC
97
98 /* Function: ResizePlan7Matrix()
99 *
100 * Purpose: Reallocate a dynamic programming matrix, if necessary,
101 * for a problem of NxM: sequence length N, model size M.
102 * (N=1 for small memory score-only variants; we allocate
103 * N+1 rows in the DP matrix.)
104 *
105 * We know (because of the way hmmsearch and hmmpfam are coded)
106 * that only one of the two dimensions is going to change
107 * in size after the first call to ResizePlan7Matrix();
108 * that is, for hmmsearch, we have one HMM of fixed size M
109 * and our target sequences may grow in N; for hmmpfam,
110 * we have one sequence of fixed size N and our target models
111 * may grow in M. What we have to watch out for is P7SmallViterbi()
112 * working on a divide and conquer problem and passing us N < maxN,
113 * M > maxM; we should definitely *not* reallocate a smaller N.
114 * Since we know that only one dimension is going to grow,
115 * we aren't scared of reallocating to maxN,maxM. (If both
116 * M and N could grow, we would be more worried.)
117 *
118 * Returns individual ptrs to the four matrix components
119 * as a convenience.
120 *
121 * Args: mx - an already allocated model to grow.
122 * N - seq length to allocate for; N+1 rows
123 * M - size of model
124 * xmx, mmx, imx, dmx
125 * - RETURN: ptrs to four mx components as a convenience
126 *
127 * Return: (void)
128 * mx is (re)allocated here.
129 */
130
131 #ifndef ALTIVEC
132
133 void
ResizePlan7Matrix(struct dpmatrix_s * mx,int N,int M,int *** xmx,int *** mmx,int *** imx,int *** dmx)134 ResizePlan7Matrix(struct dpmatrix_s *mx, int N, int M,
135 int ***xmx, int ***mmx, int ***imx, int ***dmx)
136 {
137 int i;
138
139 if (N <= mx->maxN && M <= mx->maxM) goto DONE;
140
141 if (N > mx->maxN) {
142 N += mx->padN;
143 mx->maxN = N;
144 mx->xmx = (int **) ReallocOrDie (mx->xmx, sizeof(int *) * (mx->maxN+1));
145 mx->mmx = (int **) ReallocOrDie (mx->mmx, sizeof(int *) * (mx->maxN+1));
146 mx->imx = (int **) ReallocOrDie (mx->imx, sizeof(int *) * (mx->maxN+1));
147 mx->dmx = (int **) ReallocOrDie (mx->dmx, sizeof(int *) * (mx->maxN+1));
148 }
149
150 if (M > mx->maxM) {
151 M += mx->padM;
152 mx->maxM = M;
153 }
154
155 mx->xmx_mem = (void *) ReallocOrDie (mx->xmx_mem, sizeof(int) * ((mx->maxN+1)*5));
156 mx->mmx_mem = (void *) ReallocOrDie (mx->mmx_mem, sizeof(int) * ((mx->maxN+1)*(mx->maxM+2)));
157 mx->imx_mem = (void *) ReallocOrDie (mx->imx_mem, sizeof(int) * ((mx->maxN+1)*(mx->maxM+2)));
158 mx->dmx_mem = (void *) ReallocOrDie (mx->dmx_mem, sizeof(int) * ((mx->maxN+1)*(mx->maxM+2)));
159
160 mx->xmx[0] = (int *) mx->xmx_mem;
161 mx->mmx[0] = (int *) mx->mmx_mem;
162 mx->imx[0] = (int *) mx->imx_mem;
163 mx->dmx[0] = (int *) mx->dmx_mem;
164
165 for (i = 1; i <= mx->maxN; i++)
166 {
167 mx->xmx[i] = mx->xmx[0] + (i*5);
168 mx->mmx[i] = mx->mmx[0] + (i*(mx->maxM+2));
169 mx->imx[i] = mx->imx[0] + (i*(mx->maxM+2));
170 mx->dmx[i] = mx->dmx[0] + (i*(mx->maxM+2));
171 }
172
173 DONE:
174 if (xmx != NULL) *xmx = mx->xmx;
175 if (mmx != NULL) *mmx = mx->mmx;
176 if (imx != NULL) *imx = mx->imx;
177 if (dmx != NULL) *dmx = mx->dmx;
178 }
179 #endif //ALTIVEC
180 /* Function: AllocPlan7Matrix()
181 * Date: SRE, Tue Nov 19 07:14:47 2002 [St. Louis]
182 *
183 * Purpose: Used to be the main allocator for dp matrices; we used to
184 * allocate, calculate, free. But this spent a lot of time
185 * in malloc(). Replaced with Create..() and Resize..() to
186 * allow matrix reuse in P7Viterbi(), the main alignment
187 * engine. But matrices are alloc'ed by other alignment engines
188 * too, ones that are less frequently called and less
189 * important to optimization of cpu performance. Instead of
190 * tracking changes through them, for now, provide
191 * an Alloc...() call with the same API that's just a wrapper.
192 *
193 * Args: rows - generally L+1, or 2; # of DP rows in seq dimension to alloc
194 * M - size of model, in nodes
195 * xmx, mmx, imx, dmx
196 * - RETURN: ptrs to four mx components as a convenience
197 *
198 * Returns: mx
199 * Caller free's w/ FreePlan7Matrix()
200 */
201 struct dpmatrix_s *
AllocPlan7Matrix(int rows,int M,int *** xmx,int *** mmx,int *** imx,int *** dmx)202 AllocPlan7Matrix(int rows, int M, int ***xmx, int ***mmx, int ***imx, int ***dmx)
203 {
204 struct dpmatrix_s *mx;
205 mx = CreatePlan7Matrix(rows-1, M, 0, 0);
206 if (xmx != NULL) *xmx = mx->xmx;
207 if (mmx != NULL) *mmx = mx->mmx;
208 if (imx != NULL) *imx = mx->imx;
209 if (dmx != NULL) *dmx = mx->dmx;
210 return mx;
211 }
212
213
214 /* Function: FreePlan7Matrix()
215 *
216 * Purpose: Free a dynamic programming matrix allocated by CreatePlan7Matrix().
217 *
218 * Return: (void)
219 */
220 void
FreePlan7Matrix(struct dpmatrix_s * mx)221 FreePlan7Matrix(struct dpmatrix_s *mx)
222 {
223 free (mx->xmx_mem);
224 free (mx->mmx_mem);
225 free (mx->imx_mem);
226 free (mx->dmx_mem);
227 free (mx->xmx);
228 free (mx->mmx);
229 free (mx->imx);
230 free (mx->dmx);
231 free (mx);
232 }
233
234 /* Function: AllocShadowMatrix()
235 *
236 * Purpose: Allocate a dynamic programming traceback pointer matrix for
237 * a Viterbi algorithm.
238 *
239 * Args: rows - number of rows to allocate; typically L+1
240 * M - size of model
241 * xtb, mtb, itb, dtb
242 * - RETURN: ptrs to four mx components as a convenience
243 *
244 * Return: mx
245 * mx is allocated here. Caller frees with FreeDPMatrix(mx).
246 */
247
248 struct dpshadow_s *
AllocShadowMatrix(int rows,int M,char *** xtb,char *** mtb,char *** itb,char *** dtb)249 AllocShadowMatrix(int rows, int M, char ***xtb, char ***mtb, char ***itb, char ***dtb)
250 {
251 struct dpshadow_s *tb;
252 int i;
253
254 tb = (struct dpshadow_s *) MallocOrDie (sizeof(struct dpshadow_s));
255 tb->xtb = (char **) MallocOrDie (sizeof(char *) * rows);
256 tb->mtb = (char **) MallocOrDie (sizeof(char *) * rows);
257 tb->itb = (char **) MallocOrDie (sizeof(char *) * rows);
258 tb->dtb = (char **) MallocOrDie (sizeof(char *) * rows);
259 tb->esrc = (int *) MallocOrDie (sizeof(int) * rows);
260 tb->xtb[0] = (char *) MallocOrDie (sizeof(char) * (rows*5));
261 tb->mtb[0] = (char *) MallocOrDie (sizeof(char) * (rows*(M+2)));
262 tb->itb[0] = (char *) MallocOrDie (sizeof(char) * (rows*(M+2)));
263 tb->dtb[0] = (char *) MallocOrDie (sizeof(char) * (rows*(M+2)));
264 for (i = 1; i < rows; i++)
265 {
266 tb->xtb[i] = tb->xtb[0] + (i*5);
267 tb->mtb[i] = tb->mtb[0] + (i*(M+2));
268 tb->itb[i] = tb->itb[0] + (i*(M+2));
269 tb->dtb[i] = tb->dtb[0] + (i*(M+2));
270 }
271
272 if (xtb != NULL) *xtb = tb->xtb;
273 if (mtb != NULL) *mtb = tb->mtb;
274 if (itb != NULL) *itb = tb->itb;
275 if (dtb != NULL) *dtb = tb->dtb;
276 return tb;
277 }
278
279 /* Function: FreeShadowMatrix()
280 *
281 * Purpose: Free a dynamic programming matrix allocated by AllocShadowMatrix().
282 *
283 * Return: (void)
284 */
285 void
FreeShadowMatrix(struct dpshadow_s * tb)286 FreeShadowMatrix(struct dpshadow_s *tb)
287 {
288 free (tb->xtb[0]);
289 free (tb->mtb[0]);
290 free (tb->itb[0]);
291 free (tb->dtb[0]);
292 free (tb->esrc);
293 free (tb->xtb);
294 free (tb->mtb);
295 free (tb->itb);
296 free (tb->dtb);
297 free (tb);
298 }
299
300
301
302 /* Function: P7ViterbiSpaceOK()
303 * Incept: SRE, Wed Oct 1 12:53:13 2003 [St. Louis]
304 *
305 * Purpose: Returns TRUE if the existing matrix allocation
306 * is already sufficient to hold the requested MxN, or if
307 * the matrix can be expanded in M and/or N without
308 * exceeding RAMLIMIT megabytes.
309 *
310 * This gets called anytime we're about to do P7Viterbi().
311 * If it returns FALSE, we switch into the appropriate
312 * small-memory alternative: P7SmallViterbi() or
313 * P7WeeViterbi().
314 *
315 * Checking the DP problem size against P7ViterbiSize()
316 * is not enough, because the DP matrix may be already
317 * allocated in MxN. For example, if we're already
318 * allocated to maxM,maxN of 1447x979, and we try to
319 * do a problem of MxN=12x125000, P7ViterbiSize() may
320 * think that's fine - but when we resize, we can only
321 * grow, so we'll expand to 1447x125000, which is
322 * likely over the RAMLIMIT. [bug#h26; xref SLT7 p.122]
323 *
324 * Args:
325 *
326 * Returns: TRUE if we can run P7Viterbi(); FALSE if we need
327 * to use a small memory variant.
328 *
329 * Xref: STL7 p.122.
330 */
331 int
P7ViterbiSpaceOK(int L,int M,struct dpmatrix_s * mx)332 P7ViterbiSpaceOK(int L, int M, struct dpmatrix_s *mx)
333 {
334 int newM;
335 int newN;
336
337 if (M > mx->maxM) newM = M + mx->padM; else newM = mx->maxM;
338 if (L > mx->maxN) newN = L + mx->padN; else newN = mx->maxN;
339
340 if (P7ViterbiSize(newN, newM) <= RAMLIMIT)
341 return TRUE;
342 else
343 return FALSE;
344 }
345
346
347 /* Function: P7ViterbiSize()
348 * Date: SRE, Fri Mar 6 15:13:20 1998 [St. Louis]
349 *
350 * Purpose: Returns the ballpark predicted memory requirement for a
351 * P7Viterbi() alignment, in MB.
352 *
353 * Currently L must fit in an int (< 2 GB), but we have
354 * to deal with LM > 2 GB - e.g. watch out for overflow, do
355 * the whole calculation in floating point. Bug here detected
356 * in 2.1.1 by David Harper, Sanger Centre.
357 *
358 * Args: L - length of sequence
359 * M - length of HMM
360 *
361 * Returns: # of MB
362 */
363 int
P7ViterbiSize(int L,int M)364 P7ViterbiSize(int L, int M)
365 {
366 float Mbytes;
367
368 /* We're excessively precise here, but it doesn't cost
369 * us anything to be pedantic. The four terms are:
370 * 1. the matrix structure itself;
371 * 2. the O(NM) main matrix (this dominates!)
372 * 3. ptrs into the rows of the matrix
373 * 4. storage for 5 special states. (xmx)
374 */
375 Mbytes = (float) sizeof(struct dpmatrix_s);
376 Mbytes += 3. * (float) (L+1) * (float) (M+2) * (float) sizeof(int);
377 Mbytes += 4. * (float) (L+1) * (float) sizeof(int *);
378 Mbytes += 5. * (float) (L+1) * (float) sizeof(int);
379 Mbytes /= 1048576.;
380 return (int) Mbytes;
381 }
382
383 /* Function: P7SmallViterbiSize()
384 * Date: SRE, Fri Mar 6 15:20:04 1998 [St. Louis]
385 *
386 * Purpose: Returns the ballpark predicted memory requirement for
387 * a P7SmallViterbi() alignment, in MB.
388 *
389 * P7SmallViterbi() is a wrapper, calling both P7ParsingViterbi()
390 * and P7WeeViterbi(). P7ParsingViterbi() typically dominates
391 * the memory requirement, so the value returned
392 * is the P7ParsingViterbi() number.
393 *
394 * We don't (yet) worry about overflow issues like we did with
395 * P7ViterbiSize(). We'll have many other 32-bit int issues in the
396 * code if we overflow here.
397 *
398 * Args: L - length of sequence
399 * M - length of HMM
400 *
401 * Returns: # of MB
402 */
403 int
P7SmallViterbiSize(int L,int M)404 P7SmallViterbiSize(int L, int M)
405 {
406 return ((2 * sizeof(struct dpmatrix_s) +
407 12 * (M+2) * sizeof(int) + /* 2 matrices w/ 2 rows */
408 16 * sizeof(int *) + /* ptrs into rows of matrix */
409 20 * sizeof(int) + /* 5 special states */
410 2 * (L+1) * sizeof(int)) /* traceback indices */
411 / 1000000);
412 }
413
414
415 /* Function: P7WeeViterbiSize()
416 * Date: SRE, Fri Mar 6 15:40:42 1998 [St. Louis]
417 *
418 * Purpose: Returns the ballpark predicted memory requirement for
419 * a P7WeeViterbi() alignment, in MB.
420 *
421 * Args: L - length of sequence
422 * M - length of HMM
423 *
424 * Returns: # of MB
425 */
426 int
P7WeeViterbiSize(int L,int M)427 P7WeeViterbiSize(int L, int M)
428 {
429 return ((2 * sizeof(struct dpmatrix_s) +
430 12 * (M+2) * sizeof(int) + /* 2 matrices w/ 2 rows */
431 16 * sizeof(int *) + /* ptrs into rows of matrix */
432 20 * sizeof(int) + /* 5 special states */
433 2 * (L+2) * sizeof(int) + /* stacks for starts/ends (overkill) */
434 (L+2) * sizeof(int) + /* k assignments to seq positions */
435 (L+2) * sizeof(char)) /* state assignments to seq pos */
436 / 1000000);
437 }
438
439
440 /* Function: P7Forward()
441 *
442 * Purpose: The Forward dynamic programming algorithm.
443 * The scaling issue is dealt with by working in log space
444 * and calling ILogsum(); this is a slow but robust approach.
445 *
446 * Args: dsq - sequence in digitized form
447 * L - length of dsq
448 * hmm - the model
449 * ret_mx - RETURN: dp matrix; pass NULL if it's not wanted
450 *
451 * Return: log P(S|M)/P(S|R), as a bit score.
452 */
453 float
P7Forward(unsigned char * dsq,int L,struct plan7_s * hmm,struct dpmatrix_s ** ret_mx)454 P7Forward(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx)
455 {
456 struct dpmatrix_s *mx;
457 int **xmx;
458 int **mmx;
459 int **imx;
460 int **dmx;
461 int i,k;
462 int sc;
463
464 /* Allocate a DP matrix with 0..L rows, 0..M-1 columns.
465 */
466 mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx);
467
468 /* Initialization of the zero row.
469 * Note that xmx[i][stN] = 0 by definition for all i,
470 * and xmx[i][stT] = xmx[i][stC], so neither stN nor stT need
471 * to be calculated in DP matrices.
472 */
473 xmx[0][XMN] = 0; /* S->N, p=1 */
474 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
475 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
476 for (k = 0; k <= hmm->M; k++)
477 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
478
479 /* Recursion. Done as a pull.
480 * Note some slightly wasteful boundary conditions:
481 * tsc[0] = -INFTY for all eight transitions (no node 0)
482 */
483 for (i = 1; i <= L; i++)
484 {
485 mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;
486 for (k = 1; k < hmm->M; k++)
487 {
488 mmx[i][k] = ILogsum(ILogsum(mmx[i-1][k-1] + hmm->tsc[TMM][k-1],
489 imx[i-1][k-1] + hmm->tsc[TIM][k-1]),
490 ILogsum(xmx[i-1][XMB] + hmm->bsc[k],
491 dmx[i-1][k-1] + hmm->tsc[TDM][k-1]));
492 mmx[i][k] += hmm->msc[dsq[i]][k];
493
494 dmx[i][k] = ILogsum(mmx[i][k-1] + hmm->tsc[TMD][k-1],
495 dmx[i][k-1] + hmm->tsc[TDD][k-1]);
496 imx[i][k] = ILogsum(mmx[i-1][k] + hmm->tsc[TMI][k],
497 imx[i-1][k] + hmm->tsc[TII][k]);
498 imx[i][k] += hmm->isc[dsq[i]][k];
499 }
500 mmx[i][hmm->M] = ILogsum(ILogsum(mmx[i-1][hmm->M-1] + hmm->tsc[TMM][hmm->M-1],
501 imx[i-1][hmm->M-1] + hmm->tsc[TIM][hmm->M-1]),
502 ILogsum(xmx[i-1][XMB] + hmm->bsc[hmm->M],
503 dmx[i-1][hmm->M-1] + hmm->tsc[TDM][hmm->M-1]));
504 mmx[i][hmm->M] += hmm->msc[dsq[i]][hmm->M];
505
506 /* Now the special states.
507 * remember, C and J emissions are zero score by definition
508 */
509 xmx[i][XMN] = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP];
510
511 xmx[i][XME] = -INFTY;
512 for (k = 1; k <= hmm->M; k++)
513 xmx[i][XME] = ILogsum(xmx[i][XME], mmx[i][k] + hmm->esc[k]);
514
515 xmx[i][XMJ] = ILogsum(xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP],
516 xmx[i][XME] + hmm->xsc[XTE][LOOP]);
517
518 xmx[i][XMB] = ILogsum(xmx[i][XMN] + hmm->xsc[XTN][MOVE],
519 xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]);
520
521 xmx[i][XMC] = ILogsum(xmx[i-1][XMC] + hmm->xsc[XTC][LOOP],
522 xmx[i][XME] + hmm->xsc[XTE][MOVE]);
523 }
524
525 sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];
526
527 if (ret_mx != NULL) *ret_mx = mx;
528 else FreePlan7Matrix(mx);
529
530 return Scorify(sc); /* the total Forward score. */
531 }
532
533
534 #ifdef SLOW
535 /* Function: P7Viterbi()
536 *
537 * Purpose: The Viterbi dynamic programming algorithm.
538 * Identical to Forward() except that max's
539 * replace sum's.
540 *
541 * This is the slower, more understandable version
542 * of P7Viterbi(). The default version in fast_algorithms.c
543 * is portably optimized and more difficult to understand;
544 * the ALTIVEC version in fast_algorithms.c is vectorized
545 * with Altivec-specific code, and is pretty opaque.
546 *
547 * This function is not enabled by default; it is only
548 * activated by -DSLOW at compile time.
549 *
550 * Args: dsq - sequence in digitized form
551 * L - length of dsq
552 * hmm - the model
553 * mx - reused DP matrix
554 * ret_tr - RETURN: traceback; pass NULL if it's not wanted
555 *
556 * Return: log P(S|M)/P(S|R), as a bit score
557 */
558 float
P7Viterbi(unsigned char * dsq,int L,struct plan7_s * hmm,struct dpmatrix_s * mx,struct p7trace_s ** ret_tr)559 P7Viterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr)
560 {
561 struct p7trace_s *tr;
562 int **xmx;
563 int **mmx;
564 int **imx;
565 int **dmx;
566 int i,k;
567 int sc;
568
569 /* Allocate a DP matrix with 0..L rows, 0..M-1 columns.
570 */
571 ResizePlan7Matrix(mx, L, hmm->M, &xmx, &mmx, &imx, &dmx);
572
573 /* Initialization of the zero row.
574 */
575 xmx[0][XMN] = 0; /* S->N, p=1 */
576 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
577 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
578 for (k = 0; k <= hmm->M; k++)
579 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
580
581 /* Recursion. Done as a pull.
582 * Note some slightly wasteful boundary conditions:
583 * tsc[0] = -INFTY for all eight transitions (no node 0)
584 * D_M and I_M are wastefully calculated (they don't exist)
585 */
586 for (i = 1; i <= L; i++) {
587 mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;
588
589 for (k = 1; k <= hmm->M; k++) {
590 /* match state */
591 mmx[i][k] = -INFTY;
592 if ((sc = mmx[i-1][k-1] + hmm->tsc[TMM][k-1]) > mmx[i][k])
593 mmx[i][k] = sc;
594 if ((sc = imx[i-1][k-1] + hmm->tsc[TIM][k-1]) > mmx[i][k])
595 mmx[i][k] = sc;
596 if ((sc = xmx[i-1][XMB] + hmm->bsc[k]) > mmx[i][k])
597 mmx[i][k] = sc;
598 if ((sc = dmx[i-1][k-1] + hmm->tsc[TDM][k-1]) > mmx[i][k])
599 mmx[i][k] = sc;
600 if (hmm->msc[dsq[i]][k] != -INFTY) mmx[i][k] += hmm->msc[dsq[i]][k];
601 else mmx[i][k] = -INFTY;
602
603 /* delete state */
604 dmx[i][k] = -INFTY;
605 if ((sc = mmx[i][k-1] + hmm->tsc[TMD][k-1]) > dmx[i][k])
606 dmx[i][k] = sc;
607 if ((sc = dmx[i][k-1] + hmm->tsc[TDD][k-1]) > dmx[i][k])
608 dmx[i][k] = sc;
609
610 /* insert state */
611 if (k < hmm->M) {
612 imx[i][k] = -INFTY;
613 if ((sc = mmx[i-1][k] + hmm->tsc[TMI][k]) > imx[i][k])
614 imx[i][k] = sc;
615 if ((sc = imx[i-1][k] + hmm->tsc[TII][k]) > imx[i][k])
616 imx[i][k] = sc;
617 if (hmm->isc[dsq[i]][k] != -INFTY) imx[i][k] += hmm->isc[dsq[i]][k];
618 else imx[i][k] = -INFTY;
619 }
620 }
621
622 /* Now the special states. Order is important here.
623 * remember, C and J emissions are zero score by definition,
624 */
625 /* N state */
626 xmx[i][XMN] = -INFTY;
627 if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
628 xmx[i][XMN] = sc;
629
630 /* E state */
631 xmx[i][XME] = -INFTY;
632 for (k = 1; k <= hmm->M; k++)
633 if ((sc = mmx[i][k] + hmm->esc[k]) > xmx[i][XME])
634 xmx[i][XME] = sc;
635 /* J state */
636 xmx[i][XMJ] = -INFTY;
637 if ((sc = xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
638 xmx[i][XMJ] = sc;
639 if ((sc = xmx[i][XME] + hmm->xsc[XTE][LOOP]) > xmx[i][XMJ])
640 xmx[i][XMJ] = sc;
641
642 /* B state */
643 xmx[i][XMB] = -INFTY;
644 if ((sc = xmx[i][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
645 xmx[i][XMB] = sc;
646 if ((sc = xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[i][XMB])
647 xmx[i][XMB] = sc;
648
649 /* C state */
650 xmx[i][XMC] = -INFTY;
651 if ((sc = xmx[i-1][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
652 xmx[i][XMC] = sc;
653 if ((sc = xmx[i][XME] + hmm->xsc[XTE][MOVE]) > xmx[i][XMC])
654 xmx[i][XMC] = sc;
655 }
656 /* T state (not stored) */
657 sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];
658
659 if (ret_tr != NULL) {
660 P7ViterbiTrace(hmm, dsq, L, mx, &tr);
661 *ret_tr = tr;
662 }
663
664 return Scorify(sc); /* the total Viterbi score. */
665 }
666 #endif /*SLOW*/
667
668 /* Function: P7ViterbiTrace()
669 * Date: SRE, Sat Aug 23 10:30:11 1997 (St. Louis Lambert Field)
670 *
671 * Purpose: Traceback of a Viterbi matrix: i.e. retrieval
672 * of optimum alignment.
673 *
674 * Args: hmm - hmm, log odds form, used to make mx
675 * dsq - sequence aligned to (digital form) 1..N
676 * N - length of seq
677 * mx - the matrix to trace back in, N x hmm->M
678 * ret_tr - RETURN: traceback.
679 *
680 * Return: (void)
681 * ret_tr is allocated here. Free using P7FreeTrace().
682 */
683 void
P7ViterbiTrace(struct plan7_s * hmm,unsigned char * dsq,int N,struct dpmatrix_s * mx,struct p7trace_s ** ret_tr)684 P7ViterbiTrace(struct plan7_s *hmm, unsigned char *dsq, int N,
685 struct dpmatrix_s *mx, struct p7trace_s **ret_tr)
686 {
687 struct p7trace_s *tr;
688 int curralloc; /* current allocated length of trace */
689 int tpos; /* position in trace */
690 int i; /* position in seq (1..N) */
691 int k; /* position in model (1..M) */
692 int **xmx, **mmx, **imx, **dmx;
693 int sc; /* temp var for pre-emission score */
694
695 /* Overallocate for the trace.
696 * S-N-B- ... - E-C-T : 6 states + N is minimum trace;
697 * add N more as buffer.
698 */
699 curralloc = N * 2 + 6;
700 P7AllocTrace(curralloc, &tr);
701
702 xmx = mx->xmx;
703 mmx = mx->mmx;
704 imx = mx->imx;
705 dmx = mx->dmx;
706
707 /* Initialization of trace
708 * We do it back to front; ReverseTrace() is called later.
709 */
710 tr->statetype[0] = STT;
711 tr->nodeidx[0] = 0;
712 tr->pos[0] = 0;
713 tr->statetype[1] = STC;
714 tr->nodeidx[1] = 0;
715 tr->pos[1] = 0;
716 tpos = 2;
717 i = N; /* current i (seq pos) we're trying to assign */
718
719 /* Traceback
720 */
721 while (tr->statetype[tpos-1] != STS) {
722 switch (tr->statetype[tpos-1]) {
723 case STM: /* M connects from i-1,k-1, or B */
724 sc = mmx[i+1][k+1] - hmm->msc[dsq[i+1]][k+1];
725 if (sc <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }
726 else if (sc == xmx[i][XMB] + hmm->bsc[k+1])
727 {
728 /* Check for wing unfolding */
729 if (Prob2Score(hmm->begin[k+1], hmm->p1) + 1 * INTSCALE <= hmm->bsc[k+1])
730 while (k > 0)
731 {
732 tr->statetype[tpos] = STD;
733 tr->nodeidx[tpos] = k--;
734 tr->pos[tpos] = 0;
735 tpos++;
736 if (tpos == curralloc)
737 { /* grow trace if necessary */
738 curralloc += N;
739 P7ReallocTrace(tr, curralloc);
740 }
741 }
742
743 tr->statetype[tpos] = STB;
744 tr->nodeidx[tpos] = 0;
745 tr->pos[tpos] = 0;
746 }
747 else if (sc == mmx[i][k] + hmm->tsc[TMM][k])
748 {
749 tr->statetype[tpos] = STM;
750 tr->nodeidx[tpos] = k--;
751 tr->pos[tpos] = i--;
752 }
753 else if (sc == imx[i][k] + hmm->tsc[TIM][k])
754 {
755 tr->statetype[tpos] = STI;
756 tr->nodeidx[tpos] = k;
757 tr->pos[tpos] = i--;
758 }
759 else if (sc == dmx[i][k] + hmm->tsc[TDM][k])
760 {
761 tr->statetype[tpos] = STD;
762 tr->nodeidx[tpos] = k--;
763 tr->pos[tpos] = 0;
764 }
765 else
766 Die("traceback failed");
767 break;
768
769 case STD: /* D connects from M,D */
770 if (dmx[i][k+1] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }
771 else if (dmx[i][k+1] == mmx[i][k] + hmm->tsc[TMD][k])
772 {
773 tr->statetype[tpos] = STM;
774 tr->nodeidx[tpos] = k--;
775 tr->pos[tpos] = i--;
776 }
777 else if (dmx[i][k+1] == dmx[i][k] + hmm->tsc[TDD][k])
778 {
779 tr->statetype[tpos] = STD;
780 tr->nodeidx[tpos] = k--;
781 tr->pos[tpos] = 0;
782 }
783 else Die("traceback failed");
784 break;
785
786 case STI: /* I connects from M,I */
787 sc = imx[i+1][k] - hmm->isc[dsq[i+1]][k];
788 if (sc <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }
789 else if (sc == mmx[i][k] + hmm->tsc[TMI][k])
790 {
791 tr->statetype[tpos] = STM;
792 tr->nodeidx[tpos] = k--;
793 tr->pos[tpos] = i--;
794 }
795 else if (sc == imx[i][k] + hmm->tsc[TII][k])
796 {
797 tr->statetype[tpos] = STI;
798 tr->nodeidx[tpos] = k;
799 tr->pos[tpos] = i--;
800 }
801 else Die("traceback failed");
802 break;
803
804 case STN: /* N connects from S, N */
805 if (i == 0 && xmx[i][XMN] == 0)
806 {
807 tr->statetype[tpos] = STS;
808 tr->nodeidx[tpos] = 0;
809 tr->pos[tpos] = 0;
810 }
811 else if (i > 0 && xmx[i+1][XMN] == xmx[i][XMN] + hmm->xsc[XTN][LOOP])
812 {
813 tr->statetype[tpos] = STN;
814 tr->nodeidx[tpos] = 0;
815 tr->pos[tpos] = 0; /* note convention adherence: */
816 tr->pos[tpos-1] = i--; /* first N doesn't emit */
817 }
818 else Die("traceback failed");
819 break;
820
821 case STB: /* B connects from N, J */
822 if (xmx[i][XMB] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }
823 else if (xmx[i][XMB] == xmx[i][XMN] + hmm->xsc[XTN][MOVE])
824 {
825 tr->statetype[tpos] = STN;
826 tr->nodeidx[tpos] = 0;
827 tr->pos[tpos] = 0;
828 }
829 else if (xmx[i][XMB] == xmx[i][XMJ] + hmm->xsc[XTJ][MOVE])
830 {
831 tr->statetype[tpos] = STJ;
832 tr->nodeidx[tpos] = 0;
833 tr->pos[tpos] = 0;
834 }
835
836 else Die("traceback failed");
837 break;
838
839 case STE: /* E connects from any M state. k set here */
840 if (xmx[i][XME] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }
841 for (k = hmm->M; k >= 1; k--)
842 if (xmx[i][XME] == mmx[i][k] + hmm->esc[k])
843 {
844 /* check for wing unfolding */
845 if (Prob2Score(hmm->end[k], 1.) + 1*INTSCALE <= hmm->esc[k])
846 {
847 int dk; /* need a tmp k while moving thru delete wing */
848 for (dk = hmm->M; dk > k; dk--)
849 {
850 tr->statetype[tpos] = STD;
851 tr->nodeidx[tpos] = dk;
852 tr->pos[tpos] = 0;
853 tpos++;
854 if (tpos == curralloc)
855 { /* grow trace if necessary */
856 curralloc += N;
857 P7ReallocTrace(tr, curralloc);
858 }
859 }
860 }
861
862 tr->statetype[tpos] = STM;
863 tr->nodeidx[tpos] = k--;
864 tr->pos[tpos] = i--;
865 break;
866 }
867 if (k < 0) Die("traceback failed");
868 break;
869
870 case STC: /* C comes from C, E */
871 if (xmx[i][XMC] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }
872 else if (xmx[i][XMC] == xmx[i-1][XMC] + hmm->xsc[XTC][LOOP])
873 {
874 tr->statetype[tpos] = STC;
875 tr->nodeidx[tpos] = 0;
876 tr->pos[tpos] = 0; /* note convention adherence: */
877 tr->pos[tpos-1] = i--; /* first C doesn't emit */
878 }
879 else if (xmx[i][XMC] == xmx[i][XME] + hmm->xsc[XTE][MOVE])
880 {
881 tr->statetype[tpos] = STE;
882 tr->nodeidx[tpos] = 0;
883 tr->pos[tpos] = 0; /* E is a nonemitter */
884 }
885
886 else Die("Traceback failed.");
887 break;
888
889 case STJ: /* J connects from E, J */
890 if (xmx[i][XMJ] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }
891 else if (xmx[i][XMJ] == xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP])
892 {
893 tr->statetype[tpos] = STJ;
894 tr->nodeidx[tpos] = 0;
895 tr->pos[tpos] = 0; /* note convention adherence: */
896 tr->pos[tpos-1] = i--; /* first J doesn't emit */
897 }
898 else if (xmx[i][XMJ] == xmx[i][XME] + hmm->xsc[XTE][LOOP])
899 {
900 tr->statetype[tpos] = STE;
901 tr->nodeidx[tpos] = 0;
902 tr->pos[tpos] = 0; /* E is a nonemitter */
903 }
904
905 else Die("Traceback failed.");
906 break;
907
908 default:
909 Die("traceback failed");
910
911 } /* end switch over statetype[tpos-1] */
912
913 tpos++;
914 if (tpos == curralloc)
915 { /* grow trace if necessary */
916 curralloc += N;
917 P7ReallocTrace(tr, curralloc);
918 }
919
920 } /* end traceback, at S state; tpos == tlen now */
921 tr->tlen = tpos;
922 P7ReverseTrace(tr);
923 *ret_tr = tr;
924 }
925
926
927 /* Function: P7SmallViterbi()
928 * Date: SRE, Fri Mar 6 15:29:41 1998 [St. Louis]
929 *
930 * Purpose: Wrapper function, for linear memory alignment
931 * with same arguments as P7Viterbi().
932 *
933 * Calls P7ParsingViterbi to break the sequence
934 * into fragments. Then, based on size of fragments,
935 * calls either P7Viterbi() or P7WeeViterbi() to
936 * get traces for them. Finally, assembles all these
937 * traces together to produce an overall optimal
938 * trace for the sequence.
939 *
940 * If the trace isn't needed for some reason,
941 * all we do is call P7ParsingViterbi.
942 *
943 * Args: dsq - sequence in digitized form
944 * L - length of dsq
945 * hmm - the model
946 * mx - DP matrix, growable
947 * ret_tr - RETURN: traceback; pass NULL if it's not wanted
948 *
949 * Returns: Score of optimal alignment in bits.
950 */
951 float
P7SmallViterbi(unsigned char * dsq,int L,struct plan7_s * hmm,struct dpmatrix_s * mx,struct p7trace_s ** ret_tr,int & progress)952 P7SmallViterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr, int& progress)
953 {
954 struct p7trace_s *ctr; /* collapsed trace of optimal parse */
955 struct p7trace_s *tr; /* full trace of optimal alignment */
956 struct p7trace_s **tarr; /* trace array */
957 int ndom; /* number of subsequences */
958 int i; /* counter over domains */
959 int pos; /* position in sequence */
960 int tpos; /* position in trace */
961 int tlen; /* length of full trace */
962 int sqlen; /* length of a subsequence */
963 int totlen; /* length of L matched by model (as opposed to N/C/J) */
964 float sc; /* score of optimal alignment */
965 int t2; /* position in a subtrace */
966
967 /* Step 1. Call P7ParsingViterbi to calculate an optimal parse
968 * of the sequence into single-hit subsequences; this parse
969 * is returned in a "collapsed" trace
970 */
971 sc = P7ParsingViterbi(dsq, L, hmm, &ctr, progress);
972
973 /* If we don't want full trace, we're done;
974 * also, if parsing viterbi returned a NULL trace we're done. */
975 if (ctr == NULL || ret_tr == NULL)
976 {
977 P7FreeTrace(ctr);
978 return sc;
979 }
980
981 /* Step 2. Call either P7Viterbi or P7WeeViterbi on each subsequence
982 * to recover a full traceback of each, collecting them in
983 * an array.
984 */
985 ndom = ctr->tlen/2 - 1;
986 tarr = (struct p7trace_s **)MallocOrDie(sizeof(struct p7trace_s *) * ndom);
987 tlen = totlen = 0;
988 for (i = 0; i < ndom; i++)
989 {
990 sqlen = ctr->pos[i*2+2] - ctr->pos[i*2+1]; /* length of subseq */
991
992 if (P7ViterbiSpaceOK(sqlen, hmm->M, mx))
993 {
994 SQD_DPRINTF1((" -- using P7Viterbi on an %dx%d subproblem\n",
995 hmm->M, sqlen));
996 P7Viterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, mx, &(tarr[i]));
997 }
998 else
999 {
1000 SQD_DPRINTF1((" -- using P7WeeViterbi on an %dx%d subproblem\n",
1001 hmm->M, sqlen));
1002 P7WeeViterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i]));
1003 }
1004
1005 tlen += tarr[i]->tlen - 4; /* not counting S->N,...,C->T */
1006 totlen += sqlen;
1007 }
1008
1009 /* Step 3. Compose the subtraces into one big final trace.
1010 * This is wasteful because we're going to TraceDecompose()
1011 * it again in both hmmsearch and hmmpfam to look at
1012 * individual domains; but we do it anyway so the P7SmallViterbi
1013 * interface looks exactly like the P7Viterbi interface. Maybe
1014 * long traces shouldn't include all the N/J/C states anyway,
1015 * since they're unambiguously implied.
1016 */
1017
1018 /* Calculate total trace len and alloc;
1019 * nonemitting SNCT + nonemitting J's + emitting NJC
1020 */
1021 tlen += 4 + (ndom-1) + (L-totlen);
1022 P7AllocTrace(tlen, &tr);
1023 tr->tlen = tlen;
1024
1025 /* Add N-terminal trace framework
1026 */
1027 tr->statetype[0] = STS;
1028 tr->nodeidx[0] = 0;
1029 tr->pos[0] = 0;
1030 tr->statetype[1] = STN;
1031 tr->nodeidx[1] = 0;
1032 tr->pos[1] = 0;
1033 tpos = 2;
1034 /* add implied N's */
1035 for (pos = 1; pos <= ctr->pos[1]; pos++)
1036 {
1037 tr->statetype[tpos] = STN;
1038 tr->nodeidx[tpos] = 0;
1039 tr->pos[tpos] = pos;
1040 tpos++;
1041 }
1042
1043 /* Add each subseq trace in, with its appropriate
1044 * sequence offset derived from the collapsed trace
1045 */
1046 for (i = 0; i < ndom; i++)
1047 { /* skip SN, CT framework at ends */
1048 for (t2 = 2; t2 < tarr[i]->tlen-2; t2++)
1049 {
1050 tr->statetype[tpos] = tarr[i]->statetype[t2];
1051 tr->nodeidx[tpos] = tarr[i]->nodeidx[t2];
1052 if (tarr[i]->pos[t2] > 0)
1053 tr->pos[tpos] = tarr[i]->pos[t2] + ctr->pos[i*2+1];
1054 else
1055 tr->pos[tpos] = 0;
1056 tpos++;
1057 }
1058 /* add nonemitting J or C */
1059 tr->statetype[tpos] = (i == ndom-1) ? STC : STJ;
1060 tr->nodeidx[tpos] = 0;
1061 tr->pos[tpos] = 0;
1062 tpos++;
1063 /* add implied emitting J's */
1064 if (i != ndom-1)
1065 for (pos = ctr->pos[i*2+2]+1; pos <= ctr->pos[(i+1)*2+1]; pos++)
1066 {
1067 tr->statetype[tpos] = STJ;
1068 tr->nodeidx[tpos] = 0;
1069 tr->pos[tpos] = pos;
1070 tpos++;
1071 }
1072 }
1073
1074 /* add implied C's */
1075 for (pos = ctr->pos[ndom*2]+1; pos <= L; pos++)
1076 {
1077 tr->statetype[tpos] = STC;
1078 tr->nodeidx[tpos] = 0;
1079 tr->pos[tpos] = pos;
1080 tpos++;
1081 }
1082 /* add terminal T */
1083 tr->statetype[tpos] = STT;
1084 tr->nodeidx[tpos] = 0;
1085 tr->pos[tpos] = 0;
1086 tpos++;
1087
1088 for (i = 0; i < ndom; i++) P7FreeTrace(tarr[i]);
1089 free(tarr);
1090 P7FreeTrace(ctr);
1091
1092 *ret_tr = tr;
1093 return sc;
1094 }
1095
1096
1097
1098
1099 /* Function: P7ParsingViterbi()
1100 * Date: SRE, Wed Mar 4 14:07:31 1998 [St. Louis]
1101 *
1102 * Purpose: The "hmmfs" linear-memory algorithm for finding
1103 * the optimal alignment of a very long sequence to
1104 * a looping, multihit (e.g. Plan7) model, parsing it into
1105 * a series of nonoverlapping subsequences that match
1106 * the model once. Other algorithms (e.g. P7Viterbi()
1107 * or P7WeeViterbi()) are applied subsequently to
1108 * these subsequences to recover complete alignments.
1109 *
1110 * The hmmfs algorithm appears briefly in [Durbin98],
1111 * but is otherwise unpublished.
1112 *
1113 * The traceback structure returned is special: a
1114 * "collapsed" trace S->B->E->...->B->E->T, where
1115 * stateidx is unused and pos is used to indicate the
1116 * position of B and E in the sequence. The matched
1117 * subsequence is B_pos+1...E_pos. The number of
1118 * matches in the trace is (tlen/2)-1.
1119 *
1120 * Args: dsq - sequence in digitized form
1121 * L - length of dsq
1122 * hmm - the model (log odds scores ready)
1123 * ret_tr - RETURN: a collapsed traceback.
1124 *
1125 * Returns: Score of the optimal Viterbi alignment, in bits.
1126 */
1127 float
P7ParsingViterbi(unsigned char * dsq,int L,struct plan7_s * hmm,struct p7trace_s ** ret_tr,int & progress)1128 P7ParsingViterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr, int& progress)
1129 {
1130 struct dpmatrix_s *mx; /* two rows of score matrix */
1131 struct dpmatrix_s *tmx; /* two rows of misused score matrix: traceback ptrs */
1132 struct p7trace_s *tr; /* RETURN: collapsed traceback */
1133 int **xmx, **mmx, **dmx, **imx; /* convenience ptrs to score matrix */
1134 int **xtr, **mtr, **dtr, **itr; /* convenience ptrs to traceback pointers */
1135 int *btr, *etr; /* O(L) trace ptrs for B, E state pts in seq */
1136 int sc; /* integer score of optimal alignment */
1137 int i,k,tpos; /* index for seq, model, trace position */
1138 int cur, prv; /* indices for rolling dp matrix */
1139 int curralloc; /* size of allocation for tr */
1140
1141
1142 /* Alloc a DP matrix and traceback pointers, two rows each, O(M).
1143 * Alloc two O(L) arrays to trace back through the sequence thru B and E.
1144 */
1145 mx = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
1146 tmx = AllocPlan7Matrix(2, hmm->M, &xtr, &mtr, &itr, &dtr);
1147 btr = (int*)MallocOrDie(sizeof(int) * (L+1));
1148 etr = (int*)MallocOrDie(sizeof(int) * (L+1));
1149
1150 /* Initialization of the zero row.
1151 */
1152 xmx[0][XMN] = 0; /* S->N, p=1 */
1153 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
1154 btr[0] = 0;
1155 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
1156 etr[0] = -1;
1157 for (k = 0; k <= hmm->M; k++)
1158 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
1159
1160 /* Recursion. Done as a pull. Rolling index trick. Trace ptr propagation trick.
1161 * Note some slightly wasteful boundary conditions:
1162 * tsc[0] = -INFTY for all eight transitions (no node 0)
1163 * D_M and I_M are wastefully calculated (they don't exist)
1164 *
1165 * Notes on traceback pointer propagation.
1166 * - In the path B->E, we propagate the i that B was aligned to in the optimal
1167 * alignment, via mtr, dtr, and itr.
1168 * - When we reach an E, we record the i of the B it started from in etr.
1169 * - In a looping path E->J...->B or terminal path E->C...->T, we propagate
1170 * the i that E was aligned to in the optimal alignment via xtr[][XMC]
1171 * and xtr[][XMJ].
1172 * - When we enter B, we record the i of the best previous E, or 0 if there
1173 * isn't one, in btr.
1174 */
1175 for (i = 1; i <= L; i++) {
1176 cur = i % 2;
1177 prv = !cur;
1178
1179 mmx[cur][0] = imx[cur][0] = dmx[cur][0] = -INFTY;
1180
1181 for (k = 1; k <= hmm->M; k++) {
1182 /* match state */
1183 mmx[cur][k] = -INFTY;
1184 if ((sc = mmx[prv][k-1] + hmm->tsc[TMM][k-1]) > -INFTY)
1185 { mmx[cur][k] = sc; mtr[cur][k] = mtr[prv][k-1]; }
1186 if ((sc = imx[prv][k-1] + hmm->tsc[TIM][k-1]) > mmx[cur][k])
1187 { mmx[cur][k] = sc; mtr[cur][k] = itr[prv][k-1]; }
1188 if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k])
1189 { mmx[cur][k] = sc; mtr[cur][k] = i-1; }
1190 if ((sc = dmx[prv][k-1] + hmm->tsc[TDM][k-1]) > mmx[cur][k])
1191 { mmx[cur][k] = sc; mtr[cur][k] = dtr[prv][k-1]; }
1192 if (hmm->msc[dsq[i]][k] != -INFTY)
1193 mmx[cur][k] += hmm->msc[dsq[i]][k];
1194 else
1195 mmx[cur][k] = -INFTY;
1196
1197 /* delete state */
1198 dmx[cur][k] = -INFTY;
1199 if ((sc = mmx[cur][k-1] + hmm->tsc[TMD][k-1]) > -INFTY)
1200 { dmx[cur][k] = sc; dtr[cur][k] = mtr[cur][k-1]; }
1201 if ((sc = dmx[cur][k-1] + hmm->tsc[TDD][k-1]) > dmx[cur][k])
1202 { dmx[cur][k] = sc; dtr[cur][k] = dtr[cur][k-1]; }
1203
1204 /* insert state */
1205 if (k < hmm->M) {
1206 imx[cur][k] = -INFTY;
1207 if ((sc = mmx[prv][k] + hmm->tsc[TMI][k]) > -INFTY)
1208 { imx[cur][k] = sc; itr[cur][k] = mtr[prv][k]; }
1209 if ((sc = imx[prv][k] + hmm->tsc[TII][k]) > imx[cur][k])
1210 { imx[cur][k] = sc; itr[cur][k] = itr[prv][k]; }
1211 if (hmm->isc[dsq[i]][k] != -INFTY)
1212 imx[cur][k] += hmm->isc[dsq[i]][k];
1213 else
1214 imx[cur][k] = -INFTY;
1215 }
1216 }
1217
1218 /* Now the special states. Order is important here.
1219 * remember, C and J emissions are zero score by definition,
1220 */
1221 /* N state */
1222 xmx[cur][XMN] = -INFTY;
1223 if ((sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
1224 xmx[cur][XMN] = sc;
1225 /* E state */
1226 xmx[cur][XME] = -INFTY;
1227 for (k = 1; k <= hmm->M; k++)
1228 if ((sc = mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME])
1229 { xmx[cur][XME] = sc; etr[i] = mtr[cur][k]; }
1230 /* J state */
1231 xmx[cur][XMJ] = -INFTY;
1232 if ((sc = xmx[prv][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
1233 { xmx[cur][XMJ] = sc; xtr[cur][XMJ] = xtr[prv][XMJ]; }
1234 if ((sc = xmx[cur][XME] + hmm->xsc[XTE][LOOP]) > xmx[cur][XMJ])
1235 { xmx[cur][XMJ] = sc; xtr[cur][XMJ] = i; }
1236 /* B state */
1237 xmx[cur][XMB] = -INFTY;
1238 if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
1239 { xmx[cur][XMB] = sc; btr[i] = 0; }
1240 if ((sc = xmx[cur][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[cur][XMB])
1241 { xmx[cur][XMB] = sc; btr[i] = xtr[cur][XMJ]; }
1242 /* C state */
1243 xmx[cur][XMC] = -INFTY;
1244 if ((sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
1245 { xmx[cur][XMC] = sc; xtr[cur][XMC] = xtr[prv][XMC]; }
1246 if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC])
1247 { xmx[cur][XMC] = sc; xtr[cur][XMC] = i; }
1248
1249 progress = int(100*float(i)/L);
1250 }
1251 /* T state (not stored) */
1252 sc = xmx[cur][XMC] + hmm->xsc[XTC][MOVE];
1253
1254 /*****************************************************************
1255 * Collapsed traceback stage.
1256 * xtr[L%2][XMC] contains the position j of the previous E
1257 * etr[j] contains the position i of the previous B
1258 * btr[i] contains the position j of the previous E, or 0
1259 * continue until btr[i] = 0.
1260 *****************************************************************/
1261
1262 curralloc = 2; /* minimum: no hits */
1263 P7AllocTrace(curralloc, &tr);
1264
1265 /* Init of collapsed trace. Back to front; we ReverseTrace() later.
1266 */
1267 tpos = 0;
1268 tr->statetype[tpos] = STT;
1269 tr->pos[tpos] = 0;
1270 i = xtr[L%2][XMC];
1271 while (i > 0)
1272 {
1273 curralloc += 2;
1274 P7ReallocTrace(tr, curralloc);
1275
1276 tpos++;
1277 tr->statetype[tpos] = STE;
1278 tr->pos[tpos] = i;
1279 i = etr[i];
1280
1281 tpos++;
1282 tr->statetype[tpos] = STB;
1283 tr->pos[tpos] = i;
1284 i = btr[i];
1285 }
1286
1287 tpos++;
1288 tr->statetype[tpos] = STS;
1289 tr->pos[tpos] = 0;
1290 tr->tlen = tpos + 1;
1291 P7ReverseTrace(tr);
1292
1293 FreePlan7Matrix(mx);
1294 FreePlan7Matrix(tmx);
1295 free(btr);
1296 free(etr);
1297
1298 *ret_tr = tr;
1299 return Scorify(sc);
1300 }
1301
1302 /* Function: P7WeeViterbi()
1303 * Date: SRE, Wed Mar 4 08:24:04 1998 [St. Louis]
1304 *
1305 * Purpose: Hirschberg/Myers/Miller linear memory alignment.
1306 * See [Hirschberg75,MyM-88a] for the idea of the algorithm.
1307 * Adapted to HMM implementation.
1308 *
1309 * Requires that you /know/ that there's only
1310 * one hit to the model in the sequence: either
1311 * because you're forcing single-hit, or you've
1312 * previously called P7ParsingViterbi to parse
1313 * the sequence into single-hit segments. The reason
1314 * for this is that a cyclic model (a la Plan7)
1315 * defeats the nice divide and conquer trick.
1316 * (I think some trickery with propagated trace pointers
1317 * could get around this but haven't explored it.)
1318 * This is implemented by ignoring transitions
1319 * to/from J state.
1320 *
1321 * Args: dsq - sequence in digitized form
1322 * L - length of dsq
1323 * hmm - the model
1324 * ret_tr - RETURN: traceback.
1325 *
1326 * Returns: Score of the optimal Viterbi alignment.
1327 */
1328 float
P7WeeViterbi(unsigned char * dsq,int L,struct plan7_s * hmm,struct p7trace_s ** ret_tr)1329 P7WeeViterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr)
1330 {
1331 struct p7trace_s *tr; /* RETURN: traceback */
1332 int *kassign; /* 0..L+1, alignment of seq positions to model nodes */
1333 char *tassign; /* 0..L+1, alignment of seq positions to state types */
1334 int *endlist; /* stack of end points on sequence to work on */
1335 int *startlist; /* stack of start points on sequence to work on */
1336 int lpos; /* position in endlist, startlist */
1337 int k1, k2, k3; /* start, mid, end in model */
1338 char t1, t2, t3; /* start, mid, end in state type */
1339 int s1, s2, s3; /* start, mid, end in sequence */
1340 float sc; /* score of segment optimal alignment */
1341 float ret_sc; /* optimal score over complete seq */
1342 int tlen; /* length needed for trace */
1343 int i, k, tpos; /* index in sequence, model, trace */
1344
1345
1346 /* Initialize.
1347 */
1348 kassign = (int*) MallocOrDie (sizeof(int) * (L+1));
1349 tassign = (char*)MallocOrDie (sizeof(char)* (L+1));
1350 endlist = (int*) MallocOrDie (sizeof(int) * (L+1));
1351 startlist = (int*) MallocOrDie (sizeof(int) * (L+1));
1352
1353 lpos = 0;
1354 startlist[lpos] = 1;
1355 endlist[lpos] = L;
1356 kassign[1] = 1;
1357 kassign[L] = hmm->M;
1358 tassign[1] = STS; /* temporary boundary condition! will become N or M */
1359 tassign[L] = STT; /* temporary boundary condition! will become M or C */
1360
1361 /* Recursive divide-and-conquer alignment.
1362 */
1363 while (lpos >= 0)
1364 {
1365 /* Pop a segment off the stack */
1366 s1 = startlist[lpos];
1367 k1 = kassign[s1];
1368 t1 = tassign[s1];
1369 s3 = endlist[lpos];
1370 k3 = kassign[s3];
1371 t3 = tassign[s3];
1372 lpos--;
1373 /* find optimal midpoint of segment */
1374 sc = get_wee_midpt(hmm, dsq, L, k1, t1, s1, k3, t3, s3, &k2, &t2, &s2);
1375 kassign[s2] = k2;
1376 tassign[s2] = t2;
1377 /* score is valid on first pass */
1378 if (t1 == STS && t3 == STT) ret_sc = sc;
1379
1380 /* push N-terminal segment on stack */
1381 if (t2 != STN && (s2 - s1 > 1 || (s2 - s1 == 1 && t1 == STS)))
1382 {
1383 lpos++;
1384 startlist[lpos] = s1;
1385 endlist[lpos] = s2;
1386 }
1387 /* push C-terminal segment on stack */
1388 if (t2 != STC && (s3 - s2 > 1 || (s3 - s2 == 1 && t3 == STT)))
1389 {
1390 lpos++;
1391 startlist[lpos] = s2;
1392 endlist[lpos] = s3;
1393 }
1394
1395 if (t2 == STN)
1396 { /* if we see STN midpoint, we know the whole N-term is STN */
1397 for (; s2 >= s1; s2--) {
1398 kassign[s2] = 1;
1399 tassign[s2] = STN;
1400 }
1401 }
1402 if (t2 == STC)
1403 { /* if we see STC midpoint, we know whole C-term is STC */
1404 for (; s2 <= s3; s2++) {
1405 kassign[s2] = hmm->M;
1406 tassign[s2] = STC;
1407 }
1408 }
1409 }
1410
1411 /*****************************************************************
1412 * Construct a traceback structure from kassign/tassign by interpolating
1413 * necessary states.
1414 * Trace allocation is as follows. We clearly need L emitting states.
1415 * We also need nonemitting states as follows:
1416 * STS,STN,STB,STE,STC,STT = 6
1417 * STD: count k2-k1-1 in kassign M->M's
1418 * Also, count N->M's and M->C's (potential wing unfoldings)...
1419 * ...and be careful to check wing unfoldings when there aren't
1420 * any emitting N or C flanks! (bugfix, 2.1.1b)
1421 *****************************************************************/
1422
1423 tlen = L + 6;
1424 for (i = 1; i < L; i++) {
1425 if (tassign[i] == STM && tassign[i+1] == STM)
1426 tlen += kassign[i+1] - kassign[i] - 1;
1427 if (tassign[i] == STN && tassign[i+1] == STM)
1428 tlen += kassign[i+1] - 1;
1429 if (tassign[i] == STM && tassign[i+1] == STC)
1430 tlen += hmm->M - kassign[i];
1431
1432 }
1433 if (tassign[1] == STM) tlen += kassign[1] - 1;
1434 if (tassign[L] == STM) tlen += hmm->M - kassign[L];
1435
1436 P7AllocTrace(tlen, &tr);
1437
1438 tr->statetype[0] = STS;
1439 tr->nodeidx[0] = 0;
1440 tr->pos[0] = 0;
1441 tr->statetype[1] = STN;
1442 tr->nodeidx[1] = 0;
1443 tr->pos[1] = 0;
1444 tpos = 2;
1445
1446 for (i = 1; i <= L; i++)
1447 {
1448 switch(tassign[i]) {
1449 case STM:
1450 /* check for first match state */
1451 if (tr->statetype[tpos-1] == STN) {
1452 tr->statetype[tpos] = STB;
1453 tr->nodeidx[tpos] = 0;
1454 tr->pos[tpos] = 0;
1455 tpos++;
1456 /* check for wing unfolding */
1457 if (Prob2Score(hmm->begin[kassign[i]], hmm->p1) + INTSCALE <= hmm->bsc[kassign[i]])
1458 for (k = 1; k < kassign[i]; k++) {
1459 tr->statetype[tpos] = STD;
1460 tr->nodeidx[tpos] = k;
1461 tr->pos[tpos] = 0;
1462 tpos++;
1463 }
1464 }
1465 /* do the match state itself */
1466 tr->statetype[tpos] = STM;
1467 tr->nodeidx[tpos] = kassign[i];
1468 tr->pos[tpos] = i;
1469 tpos++;
1470 /* do any deletes necessary 'til next match */
1471 if (i < L && tassign[i+1] == STM && kassign[i+1] - kassign[i] > 1)
1472 for (k = kassign[i] + 1; k < kassign[i+1]; k++)
1473 {
1474 tr->statetype[tpos] = STD;
1475 tr->nodeidx[tpos] = k;
1476 tr->pos[tpos] = 0;
1477 tpos++;
1478 }
1479 /* check for last match state */
1480 if (i == L || tassign[i+1] == STC) {
1481 /* check for wing unfolding */
1482 if (Prob2Score(hmm->end[kassign[i-1]], 1.) + INTSCALE <= hmm->esc[kassign[i-1]])
1483 for (k = kassign[i]+1; k <= hmm->M; k++)
1484 {
1485 tr->statetype[tpos] = STD;
1486 tr->nodeidx[tpos] = k;
1487 tr->pos[tpos] = 0;
1488 tpos++;
1489 }
1490 /* add on the end state */
1491 tr->statetype[tpos] = STE;
1492 tr->nodeidx[tpos] = 0;
1493 tr->pos[tpos] = 0;
1494 tpos++;
1495 /* and a nonemitting C state */
1496 tr->statetype[tpos] = STC;
1497 tr->nodeidx[tpos] = 0;
1498 tr->pos[tpos] = 0;
1499 tpos++;
1500 }
1501 break;
1502
1503 case STI:
1504 tr->statetype[tpos] = STI;
1505 tr->nodeidx[tpos] = kassign[i];
1506 tr->pos[tpos] = i;
1507 tpos++;
1508 break;
1509
1510 case STN:
1511 tr->statetype[tpos] = STN;
1512 tr->nodeidx[tpos] = 0;
1513 tr->pos[tpos] = i;
1514 tpos++;
1515 break;
1516
1517 case STC:
1518 tr->statetype[tpos] = STC;
1519 tr->nodeidx[tpos] = 0;
1520 tr->pos[tpos] = i;
1521 tpos++;
1522 break;
1523
1524 default: Die("Bogus state %s", Statetype(tassign[i]));
1525 }
1526 }
1527 /* terminate the trace */
1528 tr->statetype[tpos] = STT;
1529 tr->nodeidx[tpos] = 0;
1530 tr->pos[tpos] = 0;
1531 tr->tlen = tpos+1;
1532
1533 *ret_tr = tr;
1534
1535 free(kassign);
1536 free(tassign);
1537 free(startlist);
1538 free(endlist);
1539 return ret_sc;
1540 }
1541
1542
1543 /* Function: Plan7ESTViterbi()
1544 *
1545 * Purpose: Frameshift-tolerant alignment of protein model to cDNA EST.
1546 *
1547 *
1548 */
1549 float
Plan7ESTViterbi(unsigned char * dsq,int L,struct plan7_s * hmm,struct dpmatrix_s ** ret_mx)1550 Plan7ESTViterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx)
1551 {
1552 struct dpmatrix_s *mx;
1553 int **xmx;
1554 int **mmx;
1555 int **imx;
1556 int **dmx;
1557 int i,k;
1558 int sc;
1559 int codon;
1560
1561 /* Allocate a DP matrix with 0..L rows, 0..M+1 columns.
1562 */
1563 mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx);
1564
1565 /* Initialization of the zero row (DNA sequence of length 0)
1566 * Note that xmx[i][stN] = 0 by definition for all i,
1567 * and xmx[i][stT] = xmx[i][stC], so neither stN nor stT need
1568 * to be calculated in DP matrices.
1569 */
1570 xmx[0][XMN] = 0; /* S->N, p=1 */
1571 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
1572 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
1573 for (k = 0; k <= hmm->M; k++)
1574 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
1575
1576 /* Initialization of the first row (DNA sequence of length 1);
1577 * only N state can make this nucleotide.
1578 */
1579 xmx[1][XMN] = xmx[0][XMN] + hmm->xsc[XTN][LOOP];
1580 xmx[1][XMB] = xmx[1][XMN] + hmm->xsc[XTN][MOVE];
1581 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need 2 nt to get here */
1582 for (k = 0; k <= hmm->M; k++)
1583 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need 2 nt to get into model */
1584
1585 /* Recursion. Done as a pull.
1586 * Note some slightly wasteful boundary conditions:
1587 * tsc[0] = -INFTY for all eight transitions (no node 0)
1588 * D_M and I_M are wastefully calculated (they don't exist)
1589 */
1590 for (i = 2; i <= L; i++) {
1591 mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;
1592
1593 /* crude calculation of lookup value for codon */
1594 if (i > 2) {
1595 if (dsq[i-2] < 4 && dsq[i-1] < 4 && dsq[i] < 4)
1596 codon = dsq[i-2] * 16 + dsq[i-1] * 4 + dsq[i];
1597 else
1598 codon = 64; /* ambiguous codon; punt */
1599 }
1600
1601 for (k = 1; k <= hmm->M; k++) {
1602 /* match state */
1603 if (i > 2) {
1604 mmx[i][k] = mmx[i-3][k-1] + hmm->tsc[TMM][k-1];
1605 if ((sc = imx[i-3][k-1] + hmm->tsc[TIM][k-1]) > mmx[i][k])
1606 mmx[i][k] = sc;
1607 if ((sc = xmx[i-3][XMB] + hmm->bsc[k]) > mmx[i][k])
1608 mmx[i][k] = sc;
1609 if ((sc = dmx[i-3][k-1] + hmm->tsc[TDM][k-1]) > mmx[i][k])
1610 mmx[i][k] = sc;
1611 mmx[i][k] += hmm->dnam[codon][k];
1612 }
1613 /* -1 frameshifts into match state */
1614 if ((sc = mmx[i-2][k-1] + hmm->tsc[TMM][k-1] + hmm->dna2) > mmx[i][k])
1615 mmx[i][k] = sc;
1616 if ((sc = imx[i-2][k-1] + hmm->tsc[TIM][k-1] + hmm->dna2) > mmx[i][k])
1617 mmx[i][k] = sc;
1618 if ((sc = xmx[i-2][XMB] + hmm->bsc[k] + hmm->dna2) > mmx[i][k])
1619 mmx[i][k] = sc;
1620 if ((sc = dmx[i-2][k-1] + hmm->tsc[TDM][k-1] + hmm->dna2) > mmx[i][k])
1621 mmx[i][k] = sc;
1622
1623 /* +1 frameshifts into match state */
1624 if (i > 3) {
1625 if ((sc = mmx[i-4][k-1] + hmm->tsc[TMM][k-1] + hmm->dna4) > mmx[i][k])
1626 mmx[i][k] = sc;
1627 if ((sc = imx[i-4][k-1] + hmm->tsc[TIM][k-1] + hmm->dna4) > mmx[i][k])
1628 mmx[i][k] = sc;
1629 if ((sc = xmx[i-4][XMB] + hmm->bsc[k] + hmm->dna4) > mmx[i][k])
1630 mmx[i][k] = sc;
1631 if ((sc = dmx[i-4][k-1] + hmm->tsc[TDM][k-1] + hmm->dna4) > mmx[i][k])
1632 mmx[i][k] = sc;
1633 }
1634 /* delete state */
1635 dmx[i][k] = mmx[i][k-1] + hmm->tsc[TMD][k-1];
1636 if ((sc = dmx[i][k-1] + hmm->tsc[TDD][k-1]) > dmx[i][k])
1637 dmx[i][k] = sc;
1638
1639 /* insert state */
1640 if (i > 2) {
1641 imx[i][k] = mmx[i-3][k] + hmm->tsc[TMI][k];
1642 if ((sc = imx[i-3][k] + hmm->tsc[TII][k]) > imx[i][k])
1643 imx[i][k] = sc;
1644 imx[i][k] += hmm->dnai[codon][k];
1645 }
1646
1647 /* -1 frameshifts into insert state */
1648 if ((sc = mmx[i-2][k] + hmm->tsc[TMI][k] + hmm->dna2) > imx[i][k])
1649 imx[i][k] = sc;
1650 if ((sc = imx[i-2][k] + hmm->tsc[TII][k] + hmm->dna2) > imx[i][k])
1651 imx[i][k] = sc;
1652
1653 /* +1 frameshifts into insert state */
1654 if (i > 4) {
1655 if ((sc = mmx[i-4][k] + hmm->tsc[TMI][k] + hmm->dna4) > imx[i][k])
1656 imx[i][k] = sc;
1657 if ((sc = imx[i-4][k] + hmm->tsc[TII][k] + hmm->dna4) > imx[i][k])
1658 imx[i][k] = sc;
1659 }
1660 }
1661 /* Now the special states. Order is important here.
1662 * remember, C and J emissions are zero score by definition,
1663 */
1664 /* N state: +1 nucleotide */
1665 xmx[i][XMN] = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP];
1666 /* E state: collect from M's, and last D */
1667 xmx[i][XME] = dmx[i][hmm->M]; /* transition prob from last D = 1.0 */
1668 for (k = 1; k <= hmm->M; k++)
1669 if ((sc = mmx[i][k] + hmm->esc[k]) > xmx[i][XME])
1670 xmx[i][XME] = sc;
1671 /* J state: +1 nucleotide */
1672 xmx[i][XMJ] = xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP];
1673 if ((sc = xmx[i][XME] + hmm->xsc[XTE][LOOP]) > xmx[i][XMJ])
1674 xmx[i][XMJ] = sc;
1675 /* B state: collect from N,J */
1676 xmx[i][XMB] = xmx[i][XMN] + hmm->xsc[XTN][MOVE];
1677 if ((sc = xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[i][XMB])
1678 xmx[i][XMB] = sc;
1679 /* C state: +1 nucleotide */
1680 xmx[i][XMC] = xmx[i-1][XMC] + hmm->xsc[XTC][LOOP];
1681 if ((sc = xmx[i][XME] + hmm->xsc[XTE][MOVE]) > xmx[i][XMC])
1682 xmx[i][XMC] = sc;
1683 }
1684
1685 sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];
1686
1687 if (ret_mx != NULL) *ret_mx = mx;
1688 else FreePlan7Matrix(mx);
1689
1690 return Scorify(sc); /* the total Viterbi score. */
1691 }
1692
1693
1694
1695 /* Function: get_wee_midpt()
1696 * Date: SRE, Wed Mar 4 08:27:11 1998 [St. Louis]
1697 *
1698 * Purpose: The heart of the divide and conquer algorithm
1699 * for P7WeeViterbi(). This function is called
1700 * recursively to find successive optimal midpoints
1701 * in the alignment matrix. See P7WeeViterbi() for
1702 * further comments on the assumptions of this algorithm.
1703 *
1704 * Args: hmm - the model, set up for integer scores
1705 * dsq - the sequence, digitized
1706 * L - length of the sequence
1707 * k1 - model node to start with, 1..M
1708 * t1 - state type to start with, STM | STI | STN | STC; STS to start
1709 * s1 - sequence position to start with, 1..L; 1 to start
1710 * k3 - model node to end with, 1..M
1711 * t3 - state type to end with, STM | STI | STN | STC; STT to start
1712 * s3 - sequence position to end with, 1..L; L to start
1713 * ret_k2 - RETURN: optimal midpoint, node position in model
1714 * ret_t2 - RETURN: optimal midpoint, state type
1715 * ret_s2 - RETURN: optimal midpoint, sequence position
1716 *
1717 * Returns: score of optimal alignment, in bits.
1718 */
1719 static float
get_wee_midpt(struct plan7_s * hmm,unsigned char * dsq,int L,int k1,char t1,int s1,int k3,char t3,int s3,int * ret_k2,char * ret_t2,int * ret_s2)1720 get_wee_midpt(struct plan7_s *hmm, unsigned char *dsq, int L,
1721 int k1, char t1, int s1,
1722 int k3, char t3, int s3,
1723 int *ret_k2, char *ret_t2, int *ret_s2)
1724 {
1725 struct dpmatrix_s *fwd;
1726 struct dpmatrix_s *bck;
1727 int **xmx; /* convenience ptr into special states */
1728 int **mmx; /* convenience ptr into match states */
1729 int **imx; /* convenience ptr into insert states */
1730 int **dmx; /* convenience ptr into delete states */
1731 int k2;
1732 char t2;
1733 int s2;
1734 int cur, prv, nxt; /* current, previous, next row index (0 or 1)*/
1735 int i,k; /* indices for seq, model */
1736 int sc; /* integer score */
1737 int max; /* maximum integer score */
1738 int start; /* s1 to start at (need, for STS special case) */
1739
1740
1741 /* Choose our midpoint.
1742 * Special cases: s1, s3 adjacent and t1 == STS: s2 = s1
1743 * s1, s3 adjacent and t3 == STT: s2 = s3
1744 * (where we must replace STS, STT eventually)
1745 */
1746 s2 = s1 + (s3-s1) / 2;
1747 if (s3-s1 == 1 && t1 == STS) s2 = s1;
1748 if (s3-s1 == 1 && t3 == STT) s2 = s3;
1749
1750 /* STS is a special case. STS aligns to row zero by convention,
1751 * but we'll be passed s1=1, t1=STS. We have to init on row
1752 * zero then start DP on row 1.
1753 */
1754 start = (t1 == STS) ? 0 : s1;
1755
1756 /* Allocate our forward two rows.
1757 * Initialize row zero.
1758 */
1759 fwd = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
1760 cur = start%2;
1761 xmx[cur][XMN] = xmx[cur][XMB] = -INFTY;
1762 xmx[cur][XME] = xmx[cur][XMC] = -INFTY;
1763 for (k = k1; k <= k3; k++)
1764 mmx[cur][k] = imx[cur][k] = dmx[cur][k] = -INFTY;
1765
1766 /* Where to put our zero for our start point...
1767 * (only possible to start on an emitting state; J disallowed)
1768 */
1769 switch (t1) {
1770 case STM: mmx[cur][k1] = 0; break;
1771 case STI: imx[cur][k1] = 0; break;
1772 case STN: xmx[cur][XMN] = 0; break;
1773 case STC: xmx[cur][XMC] = 0; break;
1774 case STS: xmx[cur][XMN] = 0; break;
1775 default: Die("you can't init get_wee_midpt with a %s\n", Statetype(t1));
1776 }
1777
1778 /* Still initializing.
1779 * Deal with pulling horizontal matrix moves in initial row.
1780 * These are any transitions to nonemitters:
1781 * STM-> E, D
1782 * STI-> none
1783 * STN-> B
1784 * STC-> (T, but we never observe this in the forward pass of a d&c)
1785 * STE-> C
1786 * STS-> (N, already implied by setting xmx[cur][XMN] = 0)
1787 * STB-> M
1788 */
1789 if (t1 == STM)
1790 {
1791 for (k = k1+1; k <= k3; k++)
1792 { /* transits into STD */
1793 dmx[cur][k] = -INFTY;
1794 if ((sc = mmx[cur][k-1] + hmm->tsc[TMD][k-1]) > -INFTY)
1795 dmx[cur][k] = sc;
1796 if ((sc = dmx[cur][k-1] + hmm->tsc[TDD][k-1]) > dmx[cur][k])
1797 dmx[cur][k] = sc;
1798 }
1799 /* transit into STE */
1800 xmx[cur][XME] = -INFTY;
1801 if ((sc = mmx[cur][k1] + hmm->esc[k1]) > -INFTY)
1802 xmx[cur][XME] = sc;
1803 }
1804 /* transit into STB from STN */
1805 xmx[cur][XMB] = -INFTY;
1806 if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
1807 xmx[cur][XMB] = sc;
1808 /* transit into STC from STE */
1809 xmx[cur][XMC] = -INFTY;
1810 if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > -INFTY)
1811 xmx[cur][XMC] = sc;
1812
1813 /* Done initializing.
1814 * Start recursive DP; sweep forward to chosen s2 midpoint. Done as a pull.
1815 */
1816 for (i = start+1; i <= s2; i++) {
1817 cur = i % 2;
1818 prv = !cur;
1819
1820 mmx[cur][k1] = imx[cur][k1] = dmx[cur][k1] = -INFTY;
1821
1822 /* Insert state in column k1, and B->M transition in k1.
1823 */
1824 if (k1 < hmm->M) {
1825 imx[cur][k1] = -INFTY;
1826 if ((sc = mmx[prv][k1] + hmm->tsc[TMI][k1]) > -INFTY)
1827 imx[cur][k1] = sc;
1828 if ((sc = imx[prv][k1] + hmm->tsc[TII][k1]) > imx[cur][k1])
1829 imx[cur][k1] = sc;
1830 if (hmm->isc[dsq[i]][k1] != -INFTY)
1831 imx[cur][k1] += hmm->isc[dsq[i]][k1];
1832 else
1833 imx[cur][k1] = -INFTY;
1834 }
1835 if ((sc = xmx[prv][XMB] + hmm->bsc[k1]) > -INFTY)
1836 mmx[cur][k1] = sc;
1837 if (hmm->msc[dsq[i]][k1] != -INFTY)
1838 mmx[cur][k1] += hmm->msc[dsq[i]][k1];
1839 else
1840 mmx[cur][k1] = -INFTY;
1841
1842 /* Main chunk of recursion across model positions
1843 */
1844 for (k = k1+1; k <= k3; k++) {
1845 /* match state */
1846 mmx[cur][k] = -INFTY;
1847 if ((sc = mmx[prv][k-1] + hmm->tsc[TMM][k-1]) > -INFTY)
1848 mmx[cur][k] = sc;
1849 if ((sc = imx[prv][k-1] + hmm->tsc[TIM][k-1]) > mmx[cur][k])
1850 mmx[cur][k] = sc;
1851 if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k])
1852 mmx[cur][k] = sc;
1853 if ((sc = dmx[prv][k-1] + hmm->tsc[TDM][k-1]) > mmx[cur][k])
1854 mmx[cur][k] = sc;
1855 if (hmm->msc[dsq[i]][k] != -INFTY)
1856 mmx[cur][k] += hmm->msc[dsq[i]][k];
1857 else
1858 mmx[cur][k] = -INFTY;
1859
1860 /* delete state */
1861 dmx[cur][k] = -INFTY;
1862 if (k < hmm->M) {
1863 if ((sc = mmx[cur][k-1] + hmm->tsc[TMD][k-1]) > -INFTY)
1864 dmx[cur][k] = sc;
1865 if ((sc = dmx[cur][k-1] + hmm->tsc[TDD][k-1]) > dmx[cur][k])
1866 dmx[cur][k] = sc;
1867 }
1868
1869 /* insert state */
1870 imx[cur][k] = -INFTY;
1871 if (k < hmm->M) {
1872 if ((sc = mmx[prv][k] + hmm->tsc[TMI][k]) > -INFTY)
1873 imx[cur][k] = sc;
1874 if ((sc = imx[prv][k] + hmm->tsc[TII][k]) > imx[cur][k])
1875 imx[cur][k] = sc;
1876 if (hmm->isc[dsq[i]][k] != -INFTY)
1877 imx[cur][k] += hmm->isc[dsq[i]][k];
1878 else
1879 imx[cur][k] = -INFTY;
1880 }
1881 }
1882 /* N state */
1883 xmx[cur][XMN] = -INFTY;
1884 if ((sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
1885 xmx[cur][XMN] = sc;
1886 /* E state */
1887 xmx[cur][XME] = -INFTY;
1888 for (k = k1; k <= k3 && k <= hmm->M; k++)
1889 if ((sc = mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME])
1890 xmx[cur][XME] = sc;
1891 /* B state */
1892 xmx[cur][XMB] = -INFTY;
1893 if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
1894 xmx[cur][XMB] = sc;
1895 /* C state */
1896 xmx[cur][XMC] = -INFTY;
1897 if ((sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
1898 xmx[cur][XMC] = sc;
1899 if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC])
1900 xmx[cur][XMC] = sc;
1901 }
1902
1903 /* Row s2%2 in fwd matrix now contains valid scores from s1 (start) to s2,
1904 * with J transitions disallowed (no cycles through model).
1905 */
1906
1907 /*****************************************************************
1908 * Backwards pass.
1909 *****************************************************************/
1910
1911 /* Allocate our backwards two rows. Init last row.
1912 */
1913 bck = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
1914 nxt = s3%2;
1915 xmx[nxt][XMN] = xmx[nxt][XMB] = -INFTY;
1916 xmx[nxt][XME] = xmx[nxt][XMC] = -INFTY;
1917 for (k = k1; k <= k3 + 1; k++)
1918 mmx[nxt][k] = imx[nxt][k] = dmx[nxt][k] = -INFTY;
1919 cur = !nxt;
1920 mmx[cur][k3+1] = imx[cur][k3+1] = dmx[cur][k3+1] = -INFTY;
1921
1922 /* Where to put the zero for our end point on last row.
1923 */
1924 switch (t3) {
1925 case STM: mmx[nxt][k3] = 0; break;
1926 case STI: imx[nxt][k3] = 0; break;
1927 case STN: xmx[nxt][XMN] = 0; break;
1928 case STC: xmx[nxt][XMC] = 0; break; /* must be an emitting C */
1929 case STT: xmx[nxt][XMC] = hmm->xsc[XTC][MOVE]; break; /* C->T implied */
1930 default: Die("you can't init get_wee_midpt with a %s\n", Statetype(t3));
1931 }
1932
1933 /* Still initializing.
1934 * In the case t3==STT, there are a few horizontal moves possible
1935 * on row s3, because STT isn't an emitter. All other states are
1936 * emitters, so their connections have to be to the previous row s3-1.
1937 */
1938 if (t3 == STT)
1939 { /* E->C */
1940 xmx[nxt][XME] = xmx[nxt][XMC] + hmm->xsc[XTE][MOVE];
1941 /* M->E */
1942 for (k = k3; k >= k1; k--) {
1943 mmx[nxt][k] = xmx[nxt][XME] + hmm->esc[k];
1944 if (s3 != s2)
1945 mmx[nxt][k] += hmm->msc[dsq[s3]][k];
1946 }
1947 }
1948
1949 /* Start recursive DP; sweep backwards to chosen s2 midpoint.
1950 * Done as a pull. M, I scores at current row do /not/ include
1951 * emission scores. Be careful of integer underflow.
1952 */
1953 for (i = s3-1; i >= s2; i--) {
1954 /* note i < L, so i+1 is always a legal index */
1955 cur = i%2;
1956 nxt = !cur;
1957 /* C pulls from C (T is special cased) */
1958 xmx[cur][XMC] = -INFTY;
1959 if ((sc = xmx[nxt][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
1960 xmx[cur][XMC] = sc;
1961 /* B pulls from M's */
1962 xmx[cur][XMB] = -INFTY;
1963 for (k = k1; k <= k3; k++)
1964 if ((sc = mmx[nxt][k] + hmm->bsc[k]) > xmx[cur][XMB])
1965 xmx[cur][XMB] = sc;
1966 /* E pulls from C (J disallowed) */
1967 xmx[cur][XME] = -INFTY;
1968 if ((sc = xmx[cur][XMC] + hmm->xsc[XTE][MOVE]) > -INFTY)
1969 xmx[cur][XME] = sc;
1970 /* N pulls from B, N */
1971 xmx[cur][XMN] = -INFTY;
1972 if ((sc = xmx[cur][XMB] + hmm->xsc[XTN][MOVE]) > -INFTY)
1973 xmx[cur][XMN] = sc;
1974 if ((sc = xmx[nxt][XMN] + hmm->xsc[XTN][LOOP]) > xmx[cur][XMN])
1975 xmx[cur][XMN] = sc;
1976
1977 /* Main recursion across model
1978 */
1979 for (k = k3; k >= k1; k--) {
1980 /* special case k == M */
1981 if (k == hmm->M) {
1982 mmx[cur][k] = xmx[cur][XME]; /* p=1 transition to E by definition */
1983 dmx[cur][k] = -INFTY; /* doesn't exist */
1984 imx[cur][k] = -INFTY; /* doesn't exist */
1985 if (i != s2)
1986 mmx[cur][k] += hmm->msc[dsq[i]][k];
1987 continue;
1988 } /* below this k < M, so k+1 is a legal index */
1989
1990 /* pull into match state */
1991 mmx[cur][k] = -INFTY;
1992 if ((sc = xmx[cur][XME] + hmm->esc[k]) > -INFTY)
1993 mmx[cur][k] = sc;
1994 if ((sc = mmx[nxt][k+1] + hmm->tsc[TMM][k]) > mmx[cur][k])
1995 mmx[cur][k] = sc;
1996 if ((sc = imx[nxt][k] + hmm->tsc[TMI][k]) > mmx[cur][k])
1997 mmx[cur][k] = sc;
1998 if ((sc = dmx[cur][k+1] + hmm->tsc[TMD][k]) > mmx[cur][k])
1999 mmx[cur][k] = sc;
2000 if (i != s2)
2001 mmx[cur][k] += hmm->msc[dsq[i]][k];
2002
2003 /* pull into delete state */
2004 dmx[cur][k] = -INFTY;
2005 if ((sc = mmx[nxt][k+1] + hmm->tsc[TDM][k]) > -INFTY)
2006 dmx[cur][k] = sc;
2007 if ((sc = dmx[cur][k+1] + hmm->tsc[TDD][k]) > dmx[cur][k])
2008 dmx[cur][k] = sc;
2009 /* pull into insert state */
2010 imx[cur][k] = -INFTY;
2011 if ((sc = mmx[nxt][k+1] + hmm->tsc[TIM][k]) > -INFTY)
2012 imx[cur][k] = sc;
2013 if ((sc = imx[nxt][k] + hmm->tsc[TII][k]) > imx[cur][k])
2014 imx[cur][k] = sc;
2015 if (i != s2)
2016 imx[cur][k] += hmm->isc[dsq[i]][k];
2017
2018 }
2019 }
2020
2021 /*****************************************************************
2022 * DP complete; we have both forward and backward passes. Now we
2023 * look across the s2 row and find the optimal emitting state.
2024 *****************************************************************/
2025
2026 cur = s2%2;
2027 max = -INFTY;
2028 for (k = k1; k <= k3; k++)
2029 {
2030 if ((sc = fwd->mmx[cur][k] + bck->mmx[cur][k]) > max)
2031 { k2 = k; t2 = STM; max = sc; }
2032 if ((sc = fwd->imx[cur][k] + bck->imx[cur][k]) > max)
2033 { k2 = k; t2 = STI; max = sc; }
2034 }
2035 if ((sc = fwd->xmx[cur][XMN] + bck->xmx[cur][XMN]) > max)
2036 { k2 = 1; t2 = STN; max = sc; }
2037 if ((sc = fwd->xmx[cur][XMC] + bck->xmx[cur][XMC]) > max)
2038 { k2 = hmm->M; t2 = STC; max = sc; }
2039
2040 /*****************************************************************
2041 * Garbage collection, return.
2042 *****************************************************************/
2043
2044 FreePlan7Matrix(fwd);
2045 FreePlan7Matrix(bck);
2046 *ret_k2 = k2;
2047 *ret_t2 = t2;
2048 *ret_s2 = s2;
2049 return Scorify(max);
2050 }
2051
2052
2053 /* Function: P7ViterbiAlignAlignment()
2054 * Date: SRE, Sat Jul 4 13:39:00 1998 [St. Louis]
2055 *
2056 * Purpose: Align a multiple alignment to an HMM without
2057 * changing the multiple alignment itself.
2058 * Adapted from P7Viterbi().
2059 *
2060 * Heuristic; not a guaranteed optimal alignment.
2061 * Guaranteeing an optimal alignment appears difficult.
2062 * [cryptic note to myself:] In paths connecting to I* metastates,
2063 * recursion breaks down; if there is a gap in the
2064 * previous column for a given seq, we can't determine what state the
2065 * I* metastate corresponds to for this sequence, unless we
2066 * look back in the DP matrix. The lookback would either involve
2067 * recursing back to the previous M* metastate (giving a
2068 * O(MN^2) algorithm instead of O(MN)) or expanding the I*
2069 * metastate into 3^nseq separate I* metastates to keep track
2070 * of which of three states each seq is in. Since the second
2071 * option blows up exponentially w/ nseq, it is not attractive.
2072 * If the first option were used, the correct algorithm would be related to
2073 * modelmakers.c:Maxmodelmaker(), but somewhat more difficult.
2074 *
2075 * The heuristic approach here is to calculate a "consensus"
2076 * sequence from the alignment, and align the consensus to the HMM.
2077 * Some hackery is employed, weighting transitions and emissions
2078 * to make things work (re: con and mocc arrays).
2079 *
2080 * Args: aseq - aligned sequences
2081 * ainfo - info for aseqs (includes alen, nseq, wgt)
2082 * hmm - model to align to
2083 *
2084 * Returns: Traceback. Caller must free with P7FreeTrace().
2085 * pos[] contains alignment columns, indexed 1..alen.
2086 * statetype[] contains metastates M*, etc. as STM, etc.
2087 */
2088 struct p7trace_s *
P7ViterbiAlignAlignment(MSA * msa,struct plan7_s * hmm)2089 P7ViterbiAlignAlignment(MSA *msa, struct plan7_s *hmm)
2090 {
2091 struct dpmatrix_s *mx; /* Viterbi calculation lattice (two rows) */
2092 struct dpshadow_s *tb; /* shadow matrix of traceback pointers */
2093 struct p7trace_s *tr; /* RETURN: traceback */
2094 int **xmx, **mmx, **imx, **dmx;
2095 char **xtb, **mtb, **itb, **dtb;
2096 float **con; /* [1..alen][0..Alphabet_size-1], consensus counts */
2097 float *mocc; /* fractional occupancy of a column; used to weight transitions */
2098 int i; /* counter for columns */
2099 int k; /* counter for model positions */
2100 int idx; /* counter for seqs */
2101 int sym; /* counter for alphabet symbols */
2102 int sc; /* temp variable for holding score */
2103 float denom; /* total weight of seqs; used to "normalize" counts */
2104 int cur, prv;
2105
2106 //get HMMERTaskLocalData
2107 HMMERTaskLocalData *tld = getHMMERTaskLocalData();
2108 alphabet_s *al = &tld->al;
2109
2110 /* The "consensus" is a counts matrix, [1..alen][0..Alphabet_size-1].
2111 * Gaps are not counted explicitly, but columns with lots of gaps get
2112 * less total weight because they have fewer counts.
2113 */
2114 /* allocation */
2115 con = (float**)MallocOrDie(sizeof(float *) * (msa->alen+1));
2116 mocc = (float*)MallocOrDie(sizeof(float) * (msa->alen+1));
2117 for (i = 1; i <= msa->alen; i++) {
2118 con[i] = (float*)MallocOrDie(sizeof(float) * al->Alphabet_size);
2119 FSet(con[i], al->Alphabet_size, 0.0);
2120 }
2121 mocc[0] = -9999.;
2122 /* initialization */
2123 /* note: aseq is off by one, 0..alen-1 */
2124 /* "normalized" to have a max total count of 1 per col */
2125 denom = FSum(msa->wgt, msa->nseq);
2126 for (i = 1; i <= msa->alen; i++)
2127 {
2128 for (idx = 0; idx < msa->nseq; idx++)
2129 if (! isgap(msa->aseq[idx][i-1]))
2130 P7CountSymbol(con[i], SYMIDX(al, msa->aseq[idx][i-1]), msa->wgt[idx]);
2131 FScale(con[i], al->Alphabet_size, 1./denom);
2132 mocc[i] = FSum(con[i], al->Alphabet_size);
2133 }
2134
2135 /* Allocate a DP matrix with 2 rows, 0..M columns,
2136 * and a shadow matrix with 0,1..alen rows, 0..M columns.
2137 */
2138 mx = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
2139 tb = AllocShadowMatrix(msa->alen+1, hmm->M, &xtb, &mtb, &itb, &dtb);
2140
2141 /* Initialization of the zero row.
2142 */
2143 xmx[0][XMN] = 0; /* S->N, p=1 */
2144 xtb[0][XMN] = STS;
2145 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
2146 xtb[0][XMB] = STN;
2147 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
2148 tb->esrc[0] = 0;
2149 xtb[0][XMC] = xtb[0][XMJ] = STBOGUS;
2150 for (k = 0; k <= hmm->M; k++) {
2151 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
2152 mtb[0][k] = itb[0][k] = dtb[0][k] = STBOGUS;
2153 }
2154
2155 /* Recursion. Done as a pull.
2156 * Note some slightly wasteful boundary conditions:
2157 * tsc[0] = -INFTY for all eight transitions (no node 0)
2158 * D_M and I_M are wastefully calculated (they don't exist)
2159 */
2160 for (i = 1; i <= msa->alen; i++) {
2161 cur = i % 2;
2162 prv = ! cur;
2163
2164 mmx[cur][0] = imx[cur][0] = dmx[cur][0] = -INFTY;
2165 mtb[i][0] = itb[i][0] = dtb[i][0] = STBOGUS;
2166
2167 for (k = 1; k <= hmm->M; k++) {
2168 /* match state */
2169 mmx[cur][k] = -INFTY;
2170 mtb[i][k] = STBOGUS;
2171 if (mmx[prv][k-1] > -INFTY && hmm->tsc[TMM][k-1] > -INFTY &&
2172 (sc = mmx[prv][k-1] + hmm->tsc[TMM][k-1]) > mmx[cur][k])
2173 { mmx[cur][k] = sc; mtb[i][k] = STM; }
2174 if (imx[prv][k-1] > -INFTY && hmm->tsc[TIM][k-1] > -INFTY &&
2175 (sc = imx[prv][k-1] + hmm->tsc[TIM][k-1] * mocc[i-1]) > mmx[cur][k])
2176 { mmx[cur][k] = sc; mtb[i][k] = STI; }
2177 if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k])
2178 { mmx[cur][k] = sc; mtb[i][k] = STB; }
2179 if (dmx[prv][k-1] > -INFTY && hmm->tsc[TDM][k-1] > -INFTY &&
2180 (sc = dmx[prv][k-1] + hmm->tsc[TDM][k-1]) > mmx[cur][k])
2181 { mmx[cur][k] = sc; mtb[i][k] = STD; }
2182 /* average over "consensus" sequence */
2183 for (sym = 0; sym < al->Alphabet_size; sym++)
2184 {
2185 if (con[i][sym] > 0 && hmm->msc[sym][k] == -INFTY) { mmx[cur][k] = -INFTY; break; }
2186 mmx[cur][k] += hmm->msc[sym][k] * con[i][sym];
2187 }
2188
2189 /* delete state */
2190 dmx[cur][k] = -INFTY;
2191 dtb[i][k] = STBOGUS;
2192 if (mmx[cur][k-1] > -INFTY && hmm->tsc[TMD][k-1] > -INFTY &&
2193 (sc = mmx[cur][k-1] + hmm->tsc[TMD][k-1]) > dmx[cur][k])
2194 { dmx[cur][k] = sc; dtb[i][k] = STM; }
2195 if (dmx[cur][k-1] > -INFTY && hmm->tsc[TDD][k-1] > -INFTY &&
2196 (sc = dmx[cur][k-1] + hmm->tsc[TDD][k-1]) > dmx[cur][k])
2197 { dmx[cur][k] = sc; dtb[i][k] = STD; }
2198
2199 /* insert state */
2200 if (k < hmm->M) {
2201 imx[cur][k] = -INFTY;
2202 itb[i][k] = STBOGUS;
2203 if (mmx[prv][k] > -INFTY && hmm->tsc[TMI][k] > -INFTY &&
2204 (sc = mmx[prv][k] + hmm->tsc[TMI][k] * mocc[i]) > imx[cur][k])
2205 { imx[cur][k] = sc; itb[i][k] = STM; }
2206 if (imx[prv][k] > -INFTY && hmm->tsc[TII][k] > -INFTY &&
2207 (sc = imx[prv][k] + hmm->tsc[TII][k] * mocc[i-1] * mocc[i]) > imx[cur][k])
2208 { imx[cur][k] = sc; itb[i][k] = STI; }
2209 /* average over "consensus" sequence */
2210 for (sym = 0; sym < al->Alphabet_size; sym++)
2211 {
2212 if (con[i][sym] > 0 && hmm->isc[sym][k] == -INFTY) { imx[cur][k] = -INFTY; break; }
2213 imx[cur][k] += hmm->isc[sym][k] * con[i][sym];
2214 }
2215 }
2216 }
2217
2218 /* Now the special states. Order is important here.
2219 * remember, N, C, and J emissions are zero score by definition.
2220 */
2221 /* N state */
2222 xmx[cur][XMN] = -INFTY;
2223 xtb[i][XMN] = STBOGUS;
2224 if (xmx[prv][XMN] > -INFTY && hmm->xsc[XTN][LOOP] > -INFTY &&
2225 (sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP] * mocc[i]) > -INFTY)
2226 { xmx[cur][XMN] = sc; xtb[i][XMN] = STN; }
2227 /* E state */
2228 xmx[cur][XME] = -INFTY;
2229 xtb[i][XME] = STBOGUS;
2230 for (k = 1; k <= hmm->M; k++)
2231 if (mmx[cur][k] > -INFTY && hmm->esc[k] > -INFTY &&
2232 (sc = mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME])
2233 { xmx[cur][XME] = sc; tb->esrc[i] = k; }
2234
2235 /* we don't check J state */
2236 /* B state; don't connect from J */
2237 xmx[cur][XMB] = -INFTY;
2238 xtb[i][XMB] = STBOGUS;
2239 if (xmx[cur][XMN] > -INFTY && hmm->xsc[XTN][MOVE] > -INFTY &&
2240 (sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > xmx[cur][XMB])
2241 { xmx[cur][XMB] = sc; xtb[i][XMB] = STN; }
2242
2243 /* C state */
2244 xmx[cur][XMC] = -INFTY;
2245 xtb[i][XMC] = STBOGUS;
2246 if (xmx[prv][XMC] > -INFTY && hmm->xsc[XTC][LOOP] > -INFTY &&
2247 (sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP] * mocc[i]) > -INFTY)
2248 { xmx[cur][XMC] = sc; xtb[i][XMC] = STC; }
2249 if (xmx[cur][XME] > -INFTY && hmm->xsc[XTE][MOVE] > -INFTY &&
2250 (sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC])
2251 { xmx[cur][XMC] = sc; xtb[i][XMC] = STE; }
2252 }
2253 /* T state (not stored in mx) */
2254 sc = xmx[msa->alen%2][XMC] + hmm->xsc[XTC][MOVE];
2255
2256 /* do the traceback */
2257 tr = ShadowTrace(tb, hmm, msa->alen);
2258 /* cleanup and return */
2259 FreePlan7Matrix(mx);
2260 FreeShadowMatrix(tb);
2261 for (i = 1; i <= msa->alen; i++)
2262 free(con[i]);
2263 free(con);
2264 free(mocc);
2265
2266 return tr;
2267 }
2268
2269
2270
2271 /* Function: ShadowTrace()
2272 * Date: SRE, Sun Jul 5 11:38:24 1998 [St. Louis]
2273 *
2274 * Purpose: Given a shadow matrix, trace it back, and return
2275 * the trace.
2276 *
2277 * Args: tb - shadow matrix of traceback pointers
2278 * hmm - the model (needed for figuring out wing unfolding)
2279 * L - sequence length
2280 *
2281 * Returns: traceback. Caller must free w/ P7FreeTrace().
2282 */
2283 struct p7trace_s *
ShadowTrace(struct dpshadow_s * tb,struct plan7_s * hmm,int L)2284 ShadowTrace(struct dpshadow_s *tb, struct plan7_s *hmm, int L)
2285 {
2286 struct p7trace_s *tr;
2287 int curralloc; /* current allocated length of trace */
2288 int tpos; /* position in trace */
2289 int i; /* position in seq (1..N) */
2290 int k; /* position in model (1..M) */
2291 char nxtstate; /* next state to assign in traceback */
2292
2293 /* Overallocate for the trace.
2294 * S-N-B- ... - E-C-T : 6 states + L is minimum trace;
2295 * add L more as buffer.
2296 */
2297 curralloc = L * 2 + 6;
2298 P7AllocTrace(curralloc, &tr);
2299
2300 /* Initialization of trace
2301 * We do it back to front; ReverseTrace() is called later.
2302 */
2303 tr->statetype[0] = STT;
2304 tr->nodeidx[0] = 0;
2305 tr->pos[0] = 0;
2306 tpos = 1;
2307 i = L; /* current i (seq pos) we're trying to assign */
2308 k = 0; /* current k (model pos) we're trying to assign */
2309 nxtstate = STC; /* assign the C state first, for C->T */
2310
2311 /* Traceback
2312 */
2313 while (nxtstate != STS) {
2314 switch (nxtstate) {
2315 case STM:
2316 tr->statetype[tpos] = STM;
2317 nxtstate = tb->mtb[i][k];
2318 tr->nodeidx[tpos] = k--;
2319 tr->pos[tpos] = i--;
2320 tpos++;
2321 break;
2322
2323 case STI:
2324 tr->statetype[tpos] = STI;
2325 nxtstate = tb->itb[i][k];
2326 tr->nodeidx[tpos] = k;
2327 tr->pos[tpos] = i--;
2328 tpos++;
2329 break;
2330
2331 case STD:
2332 tr->statetype[tpos] = STD;
2333 nxtstate = tb->dtb[i][k];
2334 tr->nodeidx[tpos] = k--;
2335 tr->pos[tpos] = 0;
2336 tpos++;
2337 break;
2338
2339 case STN:
2340 tr->statetype[tpos] = STN;
2341 nxtstate = tb->xtb[i][XMN];
2342 tr->nodeidx[tpos] = 0;
2343 tr->pos[tpos] = (nxtstate == STN) ? i-- : 0; /* N->N; 2nd one emits. */
2344 tpos++;
2345 break;
2346
2347 case STB:
2348 /* Check for wing unfolding */
2349 if (Prob2Score(hmm->begin[k+1], hmm->p1) + 1 * INTSCALE <= hmm->bsc[k+1])
2350 while (k > 0)
2351 {
2352 tr->statetype[tpos] = STD;
2353 tr->nodeidx[tpos] = k--;
2354 tr->pos[tpos] = 0;
2355 tpos++;
2356 if (tpos == curralloc)
2357 { /* grow trace if necessary */
2358 curralloc += L;
2359 P7ReallocTrace(tr, curralloc);
2360 }
2361 }
2362
2363 tr->statetype[tpos] = STB;
2364 nxtstate = tb->xtb[i][XMB];
2365 tr->nodeidx[tpos] = 0;
2366 tr->pos[tpos] = 0;
2367 tpos++;
2368 break;
2369
2370 case STJ:
2371 tr->statetype[tpos] = STJ;
2372 nxtstate = tb->xtb[i][XMJ];
2373 tr->nodeidx[tpos] = 0;
2374 tr->pos[tpos] = (nxtstate == STJ) ? i-- : 0; /* J->J; 2nd one emits. */
2375 tpos++;
2376 break;
2377
2378 case STE:
2379 tr->statetype[tpos] = STE;
2380 tr->nodeidx[tpos] = 0;
2381 tr->pos[tpos] = 0;
2382 k = tb->esrc[i];
2383 nxtstate = STM;
2384 tpos++;
2385 /* check for wing unfolding */
2386 if (Prob2Score(hmm->end[k], 1.) + 1*INTSCALE <= hmm->esc[k])
2387 {
2388 int dk; /* need a tmp k while moving thru delete wing */
2389 for (dk = hmm->M; dk > k; dk--)
2390 {
2391 tr->statetype[tpos] = STD;
2392 tr->nodeidx[tpos] = dk;
2393 tr->pos[tpos] = 0;
2394 tpos++;
2395 if (tpos == curralloc)
2396 { /* grow trace if necessary */
2397 curralloc += L;
2398 P7ReallocTrace(tr, curralloc);
2399 }
2400 }
2401 }
2402 break;
2403
2404 case STC:
2405 tr->statetype[tpos] = STC;
2406 nxtstate = tb->xtb[i][XMC];
2407 tr->nodeidx[tpos] = 0;
2408 tr->pos[tpos] = (nxtstate == STC) ? i-- : 0; /* C->C; 2nd one emits. */
2409 tpos++;
2410 break;
2411
2412 default:
2413 Die("HMMER: Bad state (%s) in ShadowTrace()\n", Statetype(nxtstate));
2414
2415 } /* end switch over nxtstate */
2416
2417 if (tpos == curralloc)
2418 { /* grow trace if necessary */
2419 curralloc += L;
2420 P7ReallocTrace(tr, curralloc);
2421 }
2422
2423 } /* end traceback, just before assigning S state */
2424
2425 tr->statetype[tpos] = STS;
2426 tr->nodeidx[tpos] = 0;
2427 tr->pos[tpos] = 0;
2428 tr->tlen = tpos + 1;
2429
2430 P7ReverseTrace(tr);
2431 return tr;
2432 }
2433
2434
2435
2436 /* Function: PostprocessSignificantHit()
2437 * Date: SRE, Wed Dec 20 12:11:01 2000 [StL]
2438 *
2439 * Purpose: Add a significant hit to per-seq and per-domain hit
2440 * lists, after postprocessing the scores appropriately,
2441 * and making sure per-domain scores add up to the per-seq
2442 * score.
2443 *
2444 * [doesn't really belong in core_algorithms.c, because
2445 * it's more of a hack than an algorithm, but on the other
2446 * hand it's now part of the core of how HMMER scores
2447 * things. Maybe there should be a core_hacks.c.]
2448 *
2449 * Given: active hit lists for per-seq and per-domain
2450 * scores (e.g. hmmpfam and hmmsearch, collating their
2451 * results), and a new hit that's significant enough
2452 * that it may need to be reported in final output.
2453 *
2454 * Breaks the traceback into individual domain traces;
2455 * scores each one of them, then applies null2 correction
2456 * for biased composition. Recalculates the per-seq score
2457 * as the sum of the per-domain scores. Stores the hits
2458 * in the lists, for eventual sorting and output by the
2459 * caller.
2460 *
2461 * Notes: In principle we've got the score, and a pvalue, and a traceback
2462 * by doing the Viterbi algorithm, right? What else is left
2463 * to do? Well, in practice, life is more complicated, because
2464 * of the trace-dependent null2 score correction.
2465 *
2466 * After a null2 score correction is carried out on
2467 * each domain (the default) the number of detected domains
2468 * with scores > 0 may have decreased. We want the
2469 * global (per-seq) hit list to have the recalculated number of
2470 * domains, not necessarily what Viterbi gave us.
2471 *
2472 * Also, since we want the global score to be the sum of
2473 * the individual domains, but the null2 correction is
2474 * applied to each domain individually, we have to calculate
2475 * an adjusted global score. (To do otherwise invites
2476 * subtle inconsistencies; xref bug 2.)
2477 *
2478 * We don't have final evalues, so we may put a few
2479 * more hits into the hit lists than we end up reporting.
2480 * The main output routine is responsible for final
2481 * enforcement of the thresholds.
2482 *
2483 * This routine is NOT THREADSAFE. When multithreaded,
2484 * with using shared ghit/dhit output buffers, calls to
2485 * PostprocessSignificantHit() need to be protected.
2486 *
2487 * Args: ghit - an active list of per-seq (global) hits
2488 * dhit - an active list of per-domain hits
2489 * tr - the significant HMM/seq traceback we'll report on
2490 * hmm - ptr to the HMM
2491 * dsq - digitized sequence (1..L)
2492 * L - length of dsq
2493 * seqname - name of sequence (same as targname, in hmmsearch)
2494 * seqacc - seq's accession (or NULL)
2495 * seqdesc - seq's description (or NULL)
2496 * do_forward - TRUE if we've already calculated final per-seq score
2497 * sc_override - per-seq score to use if do_forward is TRUE
2498 * do_null2 - TRUE to apply the null2 scoring correction
2499 * thresh - contains the threshold/cutoff information.
2500 * hmmpfam_mode - TRUE if called by hmmpfam, else assumes hmmsearch;
2501 * affects how the lists' sort keys are set.
2502 *
2503 * Returns: the recalculated per-seq score (or sc_override),
2504 * as appropriate, for subsequent storage in the histogram
2505 */
2506 float
2507 //PostprocessSignificantHit(struct alphabet_s * al, struct tophit_s *ghit,
PostprocessSignificantHit(struct tophit_s * ghit,struct tophit_s * dhit,struct p7trace_s * tr,struct plan7_s * hmm,unsigned char * dsq,int L,char * seqname,char * seqacc,char * seqdesc,int do_forward,float sc_override,int do_null2,struct threshold_s * thresh,int hmmpfam_mode)2508 PostprocessSignificantHit(struct tophit_s *ghit,
2509 struct tophit_s *dhit,
2510 struct p7trace_s *tr,
2511 struct plan7_s *hmm,
2512 unsigned char *dsq,
2513 int L,
2514 char *seqname,
2515 char *seqacc,
2516 char *seqdesc,
2517 int do_forward,
2518 float sc_override,
2519 int do_null2,
2520 struct threshold_s *thresh,
2521 int hmmpfam_mode)
2522 {
2523 struct p7trace_s **tarr; /* array of per-domain traces */
2524 struct fancyali_s *ali; /* alignment of a domain */
2525 int ntr; /* number of domain traces from Viterbi */
2526 int tidx; /* index for traces (0..ntr-1) */
2527 int ndom; /* # of domains accepted in sequence */
2528 int didx; /* index for domains (1..ndom) */
2529 int k1, k2; /* start, stop coord in model */
2530 int i1, i2; /* start, stop in sequence */
2531 float whole_sc; /* whole sequence score = \sum domain scores */
2532 float *score; /* array of raw scores for each domain */
2533 int *usedomain; /* TRUE if this domain is accepted */
2534 double whole_pval;
2535 double pvalue;
2536 double sortkey;
2537
2538 /* Special case: rarely, the alignment was totally impossible
2539 * and tr is NULL.
2540 */
2541 if (tr == NULL) return sc_override;
2542
2543 /* Break the trace into one or more individual domains.
2544 */
2545 TraceDecompose(tr, &tarr, &ntr);
2546 if (ntr == 0) Die("TraceDecompose() screwup"); /* "can't happen" (!) */
2547
2548 /* Rescore each domain, apply null2 correction if asked.
2549 * Mark positive-scoring ones (we'll definitely report those),
2550 * and include their score in the whole sequence score.
2551 */
2552 score = (float*)MallocOrDie(sizeof(float) * ntr);
2553 usedomain = (int*)MallocOrDie(sizeof(int) * ntr);
2554 ndom = 0;
2555 whole_sc = 0.;
2556 for (tidx = 0; tidx < ntr; tidx++)
2557 {
2558 score[tidx] = P7TraceScore(hmm, dsq, tarr[tidx]);
2559 if (do_null2) score[tidx] -= TraceScoreCorrection(hmm, tarr[tidx], dsq);
2560 if (score[tidx] > 0.0) {
2561 usedomain[tidx] = TRUE;
2562 ndom++;
2563 whole_sc += score[tidx];
2564 } else
2565 usedomain[tidx] = FALSE;
2566 }
2567
2568 /* Make sure at least one positive scoring domain is in
2569 * the trace. If not, invoke "weak single domain" rules:
2570 * we will always report at least one domain per sequence, even
2571 * if it has a negative score. (HMMER's Plan7 architecture can report
2572 * one negative scoring domain but not more.)
2573 */
2574 if (ndom == 0) {
2575 tidx = FArgMax(score, ntr);
2576 usedomain[tidx] = TRUE;
2577 whole_sc = score[tidx];
2578 ndom = 1;
2579 }
2580
2581 /* Implement --do_forward: override the trace-dependent sum-of-domain
2582 * whole score, use the P7Forward() score that the called passed
2583 * us instead. This is a hack; null2 is trace-dependent and
2584 * thus undefined for P7Forward() scoring; see commentary in hmmpfam.c.
2585 */
2586 if (do_forward) whole_sc = sc_override;
2587
2588 /* Go through and put all the accepted domains into the hit list.
2589 */
2590 whole_pval = PValue(hmm, whole_sc);
2591 for (tidx = 0, didx = 1; tidx < ntr; tidx++) {
2592 if (! usedomain[tidx]) continue;
2593
2594 TraceSimpleBounds(tarr[tidx], &i1, &i2, &k1, &k2);
2595 pvalue = PValue(hmm, score[tidx]);
2596
2597 if (pvalue <= thresh->domE && score[tidx] >= thresh->domT) {
2598 ali = CreateFancyAli(tarr[tidx], hmm, dsq, seqname);
2599
2600 if (hmmpfam_mode)
2601 sortkey = -1.*(double)i1; /* hmmpfam: sort on position in seq */
2602 else
2603 sortkey = score[tidx]; /* hmmsearch: sort on E (monotonic w/ sc) */
2604
2605 RegisterHit(dhit, sortkey,
2606 pvalue, score[tidx],
2607 whole_pval, whole_sc,
2608 hmmpfam_mode ? hmm->name : seqname,
2609 hmmpfam_mode ? hmm->acc : seqacc,
2610 hmmpfam_mode ? hmm->desc : seqdesc,
2611 i1,i2, L,
2612 k1,k2, hmm->M,
2613 didx,ndom,ali);
2614 }
2615 didx++;
2616 }
2617
2618 /* Now register the global hit, with the domain-derived score.
2619 */
2620
2621 /* sorting:
2622 * hmmpfam has to worry that score and E-value are not monotonic
2623 * when multiple HMMs (with different EVD parameters) are potential
2624 * targets. Therefore in hmmpfam_mode we apply a weird hack
2625 * to sort primarily on E-value, but on score
2626 * for really good hits with E=0.0... works because we can
2627 * assume 100000. > -log(DBL_MIN).
2628 * hmmsearch simply sorts on score (which for a single HMM, we
2629 * know is monotonic with E-value).
2630 */
2631 if (hmmpfam_mode)
2632 sortkey = (whole_pval > 0.0) ? -1.*log(whole_pval) : 100000. + whole_sc;
2633 else
2634 sortkey = whole_sc;
2635
2636 /* Note: we've recalculated whole_sc and it may have decreased
2637 * after the null2 correction was applied. For Pfam GA, TC,
2638 * or NC cutoffs, we have to be sure that everything on the
2639 * hitlist is correct (the hmmpfam output routine assumes it,
2640 * otherwise it would have to reload each HMM to get its
2641 * cutoffs). In all other cases, though, we don't care if
2642 * the hit list has a bit too many things on it, because the
2643 * output routine in hmmsearch or hmmpfam will check against
2644 * the cutoffs. Hence we only need to check against globT
2645 * (it may be set by GA, TC, or NC) but not globE.
2646 * - SRE, CSHL genome mtg May 2001
2647 */
2648 if (whole_sc >= thresh->globT) {
2649 RegisterHit(ghit, sortkey,
2650 whole_pval, whole_sc,
2651 0., 0., /* no mother seq */
2652 hmmpfam_mode ? hmm->name : seqname,
2653 hmmpfam_mode ? hmm->acc : seqacc,
2654 hmmpfam_mode ? hmm->desc : seqdesc,
2655 0,0,0, /* seq positions */
2656 0,0,0, /* HMM positions */
2657 0, ndom, /* # domains info */
2658 NULL); /* alignment info */
2659 }
2660
2661 /* Clean up and return.
2662 */
2663 for (tidx = 0; tidx < ntr; tidx++)
2664 P7FreeTrace(tarr[tidx]);
2665 free(tarr);
2666 free(score);
2667 free(usedomain);
2668 return whole_sc;
2669 }
2670