1 /* cm_p7_band.c
2 *
3 * Functions for p7 HMM banding.
4 * BEWARE: only partially implemented.
5 */
6
7 #include "esl_config.h"
8 #include "p7_config.h"
9 #include "config.h"
10
11 #include <stdio.h>
12 #include <math.h>
13 #include <assert.h>
14
15 #include "easel.h"
16 #include "esl_sse.h"
17 #include "esl_vectorops.h"
18
19 #include "hmmer.h"
20
21 #include "infernal.h"
22
23 #define p7_IMPOSSIBLE -987654321
24
25 /* Function: p7_gmx_Match2DMatrix()
26 * Synopsis: Copy the dp match cells of a generic matrix
27 * to a ESL_DMATRIX, for visualization with esl_dmx_Visualize()
28 *
29 * Incept: SRE, Fri Jul 13 09:56:04 2007 [Janelia]
30 *
31 * Purpose: Dump matrix <gx> to stream <fp> for diagnostics.
32 */
33 int
p7_gmx_Match2DMatrix(P7_GMX * gx,int do_diff,ESL_DMATRIX ** ret_D,double * ret_min,double * ret_max)34 p7_gmx_Match2DMatrix(P7_GMX *gx, int do_diff, ESL_DMATRIX **ret_D, double *ret_min, double *ret_max)
35 {
36 int i, k;
37 ESL_DMATRIX *D;
38 double min = eslINFINITY;
39 double max = -eslINFINITY;
40 float sc;
41
42 D = esl_dmatrix_Create(gx->M+1, gx->L);
43 /* fill k == 0 row, the X matrix E state (logically, the begin state) scores */
44 for (i = 1; i <= gx->L; i++) {
45 D->mx[0][(i-1)] = gx->xmx[i * p7G_NXCELLS + 0];
46 }
47
48 for (i = 1; i <= gx->L; i++) {
49 for (k = 1; k <= gx->M; k++) {
50 sc = gx->dp[i][k * p7G_NSCELLS + p7G_M];
51 if(do_diff) {
52 if((i < 2) || (k < 2)) D->mx[k][(i-1)] = 0.;
53 else D->mx[k][(i-1)] = sc - ESL_MAX(D->mx[k][(i-2)], D->mx[0][(i-2)]);
54 }
55 else {
56 D->mx[k][(i-1)] = sc;
57 }
58 min = ESL_MIN(min, D->mx[k][(i-1)]);
59 max = ESL_MAX(max, D->mx[k][(i-1)]);
60 }
61 }
62
63 *ret_D = D;
64 *ret_min = min;
65 *ret_max = max;
66 return eslOK;
67 }
68
69 /****************************************************************
70 * Stolen from hmmer/h3/heatmap.c SVN revision 2171
71 * as dmx_Visualize. Then modified so that the full
72 * matrix is printed (not half split diagonally).
73 */
74 /* my_dmx_Visualize()
75 * Incept: SRE, Wed Jan 24 11:58:21 2007 [Janelia]
76 *
77 * Purpose:
78 *
79 * Color scheme roughly follows Tufte, Envisioning
80 * Information, p.91, where he shows a beautiful
81 * bathymetric chart. The CMYK values conjoin two
82 * recommendations from ColorBrewer (Cindy Brewer
83 * and Mark Harrower)
84 * [http://www.personal.psu.edu/cab38/ColorBrewer/ColorBrewer.html],
85 * specifically the 9-class sequential2 Blues and
86 * 9-class sequential YlOrBr.
87 *
88 * Might eventually become part of Easel, once mature?
89 *
90 * Note: Binning rules basically follow same convention as
91 * esl_histogram. nb = xmax-xmin/w, so w = xmax-xmin/nb;
92 * picking bin is (int) ceil((x - xmin)/w) - 1. (xref
93 * esl_histogram_Score2Bin()). This makes bin b contain
94 * values bw+min < x <= (b+1)w+min. (Which means that
95 * min itself falls in bin -1, whoops - but we catch
96 * all bin<0 and bin>=nshades and put them in the extremes.
97 *
98 * Args:
99 *
100 * Returns:
101 *
102 * Throws: (no abnormal error conditions)
103 *
104 * Xref:
105 */
106 int
my_dmx_Visualize(FILE * fp,ESL_DMATRIX * D,double min,double max,double min2fill)107 my_dmx_Visualize(FILE *fp, ESL_DMATRIX *D, double min, double max, double min2fill)
108 {
109 int nshades = 18;
110 double cyan[] = { 1.00, 1.00, 0.90, 0.75, 0.57, 0.38, 0.24, 0.13, 0.03,
111 0.00, 0.00, 0.00, 0.00, 0.00, 0.07, 0.20, 0.40, 0.60};
112 double magenta[] = { 0.55, 0.45, 0.34, 0.22, 0.14, 0.08, 0.06, 0.03, 0.01,
113 0.00, 0.03, 0.11, 0.23, 0.40, 0.55, 0.67, 0.75, 0.80};
114 double yellow[] = { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,
115 0.10, 0.25, 0.40, 0.65, 0.80, 0.90, 1.00, 1.00, 1.00};
116 double black[] = { 0.30, 0.07, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,
117 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00};
118 double w;
119 int i,j;
120 int bin;
121 int boxsize; /* box size in points */
122 int xcoord, ycoord; /* postscript coords in points */
123 int leftmargin, rightmargin;
124 int bottommargin, topmargin;
125 float fboxsize; /* box size in fractional points */
126
127 /* Set some defaults that might become arguments later.
128 */
129 leftmargin = rightmargin = 20;
130 bottommargin = topmargin = 20;
131
132 /* Determine some working parameters
133 */
134 w = (max-min) / (double) nshades; /* w = bin size for assigning values->colors*/
135 boxsize = ESL_MAX(1, (ESL_MIN((792 - bottommargin) / D->n,
136 (612 - leftmargin) / D->m)));
137 fboxsize= ESL_MIN( (792. - ((float) bottommargin + topmargin)) / (float) D->n,
138 (612. - ((float) leftmargin + rightmargin)) / (float) D->m);
139
140
141 fprintf(fp, "%.4f %.4f scale\n", (fboxsize/(float) boxsize), (fboxsize/(float) boxsize));
142 /* printf("n: %d\nm: %d\n", D->n, D->m); */
143 for (i = 0; i < D->n; i++) {
144 /* printf("\n"); */
145 /* for (j = i; j < D->n; j++) */
146 for (j = 0; j < D->m; j++)
147 {
148 /* printf("i: %4d j: %4d %5.1f\n", i, j, D->mx[i][j]); */
149 xcoord = j * boxsize + leftmargin;
150 ycoord = (D->m-(i+1)) * boxsize + bottommargin; /* difference w/heatmap.c: (D->m-i+1) */
151
152 if (D->mx[i][j] < min2fill) continue;
153 /* if ((i > 0) && (j > 0) && (D->mx[i][j] <= D->mx[i-1][j-1]) && (D->mx[i][j] < min2fill)) continue;*/
154 else if (D->mx[i][j] == -eslINFINITY) bin = 0;
155 else if (D->mx[i][j] == eslINFINITY) bin = nshades-1;
156 else {
157 bin = (int) ceil((D->mx[i][j] - min) / w) - 1;
158 if (bin < 0) bin = 0;
159 if (bin >= nshades) bin = nshades-1;
160 }
161
162 printf("%4d %4d %10.3f\n", i, j, D->mx[i][j]);
163 fprintf(fp, "newpath\n");
164 fprintf(fp, " %d %d moveto\n", xcoord, ycoord);
165 fprintf(fp, " 0 %d rlineto\n", boxsize);
166 fprintf(fp, " %d 0 rlineto\n", boxsize);
167 fprintf(fp, " 0 -%d rlineto\n", boxsize);
168 fprintf(fp, " closepath\n");
169 fprintf(fp, " %.2f %.2f %.2f %.2f setcmykcolor\n",
170 cyan[bin], magenta[bin], yellow[bin], black[bin]);
171 fprintf(fp, " fill\n");
172 }
173 }
174 fprintf(fp, "showpage\n");
175 return eslOK;
176 }
177
178
179 /* Function: my_p7_GTraceMSV()
180 * Incept: EPN, Mon Aug 11 10:01:23 2008
181 *
182 * Purpose: Traceback of a MSV matrix: retrieval of optimum alignment.
183 *
184 * Based on p7_GTrace().
185 *
186 * This function is currently implemented as a
187 * reconstruction traceback, rather than using a shadow
188 * matrix. Because H3 uses floating point scores, and we
189 * can't compare floats for equality, we have to compare
190 * floats for near-equality and therefore, formally, we can
191 * only guarantee a near-optimal traceback. However, even in
192 * the unlikely event that a suboptimal is returned, the
193 * score difference from true optimal will be negligible.
194 *
195 * Args: dsq - digital sequence aligned to, 1..L
196 * L - length of <dsq>
197 * gm - profile
198 * mx - MSV matrix to trace, L x M
199 * tr - storage for the recovered traceback.
200 *
201 * Return: <eslOK> on success.
202 * <eslFAIL> if even the optimal path has zero probability;
203 * in this case, the trace is set blank (<tr->N = 0>).
204 * <eslEINCOMPAT> if the optimal trace is discontiguous wrt
205 * the sequence, node k > kp emits residue i < ip, while kp
206 * emits ip. In this case, trace is invalid - caller must
207 * know this.
208 *
209 * Note: Care is taken to evaluate the prev+tsc+emission
210 * calculations in exactly the same order that Viterbi did
211 * them, lest you get numerical problems with
212 * a+b+c = d; d-c != a+b because d,c are nearly equal.
213 * (This bug appeared in dev: xref J1/121.)
214 */
215
216 /* for HMMER3 P7 HMMs */
217 #define P7MMX(i,k) (dp[(i)][(k) * p7G_NSCELLS + p7G_M])
218 #define P7XMX(i,s) (xmx[(i) * p7G_NXCELLS + (s)])
219 #define P7MSC(k) (rsc[(k) * p7P_NR + p7P_MSC])
220
221 /* for CP9 HMMs */
222 #define CP9TSC(s,k) (tsc[(k) * cp9O_NTRANS + (s)])
223
224 int
my_p7_GTraceMSV(const ESL_DSQ * dsq,int L,const P7_PROFILE * gm,const P7_GMX * gx,P7_TRACE * tr,int ** ret_i2k,int ** ret_k2i,float ** ret_isc,int ** ret_iconflict)225 my_p7_GTraceMSV(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr, int **ret_i2k, int **ret_k2i, float **ret_isc, int **ret_iconflict)
226 {
227 int status = eslOK;
228 int i; /* position in seq (1..L) */
229 int k; /* position in model (1..M) */
230 int M = gm->M;
231 float **dp = gx->dp;
232 float *xmx = gx->xmx;
233 float tol = 1e-5;
234 float esc = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY;
235 /* new vars */
236 float tloop = logf((float) L / (float) (L+3));
237 float tmove = logf( 3.0f / (float) (L+3));
238 float tbmk = logf( 2.0f / ((float) gm->M * (float) (gm->M+1)));
239 float tec = logf(0.5f);
240
241 int *k2i; /* [0.1..k..gm->M] = i, residue i emitted from node k's match state in MSV trace */
242 int *i2k; /* [0.1..i..L] = k, residue i emitted from node k's match state in MSV trace */
243 int *iconflict; /* [0.1..i..L] = TRUE | FALSE. TRUE to eventually remove the kmer with pin at i */
244 float *isc; /*[0.1..i..L] = sc, match emission score for residue i is sc */
245
246 ESL_ALLOC(k2i, sizeof(int) * (gm->M+1));
247 ESL_ALLOC(i2k, sizeof(int) * (L+1));
248 ESL_ALLOC(iconflict, sizeof(int) * (L+1));
249 ESL_ALLOC(isc, sizeof(float) * (L+1));
250 esl_vec_ISet(k2i, (gm->M+1), -1);
251 esl_vec_ISet(i2k, (L+1), -1);
252 esl_vec_ISet(iconflict, (L+1), FALSE);
253 esl_vec_FSet(isc, (L+1), -eslINFINITY);
254
255 if ((status = p7_trace_Reuse(tr)) != eslOK) goto ERROR;
256
257 /* Initialization.
258 * (back to front. ReverseTrace() called later.)
259 */
260 if ((status = p7_trace_Append(tr, p7T_T, 0, 0)) != eslOK) goto ERROR;
261 if ((status = p7_trace_Append(tr, p7T_C, 0, 0)) != eslOK) goto ERROR;
262 i = L; /* next position to explain in seq */
263
264 /* Traceback
265 */
266 while (tr->st[tr->N-1] != p7T_S) {
267 float const *rsc = gm->rsc[dsq[i]];
268
269 switch (tr->st[tr->N-1]) {
270 case p7T_C: /* C(i) comes from E(i) */
271 if (P7XMX(i,p7G_C) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible C reached at i=%d", i);
272
273 if (esl_FCompare(P7XMX(i, p7G_C), P7XMX(i-1, p7G_C) + tloop, tol) == eslOK) {
274 tr->i[tr->N-1] = i--; /* first C doesn't emit: subsequent ones do */
275 status = p7_trace_Append(tr, p7T_C, 0, 0);
276 } else if (esl_FCompare(P7XMX(i, p7G_C), P7XMX(i, p7G_E) + tec, tol) == eslOK)
277 status = p7_trace_Append(tr, p7T_E, 0, 0);
278 else ESL_XEXCEPTION(eslFAIL, "C at i=%d couldn't be traced", i);
279 break;
280
281 case p7T_E: /* E connects from any M state. k set here */
282 if (P7XMX(i, p7G_E) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible E reached at i=%d", i);
283
284 if (esl_FCompare(P7XMX(i, p7G_E), P7MMX(i,M), tol) == eslOK) { k = M; status = p7_trace_Append(tr, p7T_M, M, i); }
285 else {
286 for (k = M-1; k >= 1; k--)
287 if (esl_FCompare(P7XMX(i, p7G_E), P7MMX(i,k) + esc, tol) == eslOK)
288 { status = p7_trace_Append(tr, p7T_M, k, i); break; }
289 if (k < 0) ESL_XEXCEPTION(eslFAIL, "E at i=%d couldn't be traced", i);
290 }
291 break;
292
293 case p7T_M: /* M connects from i-1,k-1, or B */
294 if (P7MMX(i,k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible M reached at k=%d,i=%d", k,i);
295 if (esl_FCompare(P7MMX(i,k), P7XMX(i-1,p7G_B) + tbmk + P7MSC(k), tol) == eslOK) status = p7_trace_Append(tr, p7T_B, 0, 0);
296 else if (esl_FCompare(P7MMX(i,k), P7MMX(i-1,k-1) + P7MSC(k), tol) == eslOK) {
297 status = p7_trace_Append(tr, p7T_M, k-1, i-1);
298 /*if(k2i[(k-1)] != -1) { status = eslEINCOMPAT; printf("! discontiguous trace k2i[k-1=%d] != -1 (%d) i-1 = %d\n", k-1, k2i[(k-1)], i-1); goto ERROR;} */
299 /*if(i2k[(i-1)] != -1) { status = eslEINCOMPAT; printf("! discontiguous trace i2k[i-1=%d] != -1 (%d) k-1 = %d\n", i-1, i2k[(i-1)], k-1); goto ERROR;} */
300 if(i2k[(i-1)] != -1 || k2i[(k-1)] != -1) {
301 iconflict[(k2i[(k-1)])] = TRUE; /* eventually remove pin kmer that included k we *thought* pinned i-1 */
302 iconflict[(i-1)] = TRUE; /* eventually remove pin kmer that includes k we currently think pins i-1 */
303 }
304 k2i[(k-1)] = i-1;
305 i2k[(i-1)] = k-1;
306 isc[(i-1)] = P7MSC(k);
307 }
308 else ESL_XEXCEPTION(eslFAIL, "M at k=%d,i=%d couldn't be traced", k,i);
309
310 if (status != eslOK) goto ERROR;
311 k--;
312 i--;
313 break;
314
315 case p7T_N: /* N connects from S, N */
316 if (P7XMX(i, p7G_N) <= p7_IMPOSSIBLE) ESL_XEXCEPTION(eslFAIL, "impossible N reached at i=%d", i);
317
318 if (i == 0) status = p7_trace_Append(tr, p7T_S, 0, 0);
319 else if (esl_FCompare(P7XMX(i,p7G_N), P7XMX(i-1, p7G_N) + tloop, tol) == eslOK)
320 {
321 tr->i[tr->N-1] = i--;
322 status = p7_trace_Append(tr, p7T_N, 0, 0);
323 }
324 else ESL_XEXCEPTION(eslFAIL, "N at i=%d couldn't be traced", i);
325 break;
326
327 case p7T_B: /* B connects from N, J */
328 if (P7XMX(i,p7G_B) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible B reached at i=%d", i);
329
330 if (esl_FCompare(P7XMX(i,p7G_B), P7XMX(i, p7G_N) + tmove, tol) == eslOK)
331 status = p7_trace_Append(tr, p7T_N, 0, 0);
332 else if (esl_FCompare(P7XMX(i,p7G_B), P7XMX(i, p7G_J) + tmove, tol) == eslOK)
333 status = p7_trace_Append(tr, p7T_J, 0, 0);
334 else ESL_XEXCEPTION(eslFAIL, "B at i=%d couldn't be traced", i);
335 break;
336
337 case p7T_J: /* J connects from E(i) or J(i-1) */
338 if (P7XMX(i,p7G_J) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible J reached at i=%d", i);
339
340 if (esl_FCompare(P7XMX(i,p7G_J), P7XMX(i-1,p7G_J) + tloop, tol) == eslOK) {
341 tr->i[tr->N-1] = i--;
342 status = p7_trace_Append(tr, p7T_J, 0, 0);
343 } else if (esl_FCompare(P7XMX(i,p7G_J), P7XMX(i,p7G_E) + tec, tol) == eslOK)
344 status = p7_trace_Append(tr, p7T_E, 0, 0);
345 else ESL_XEXCEPTION(eslFAIL, "J at i=%d couldn't be traced", i);
346 break;
347
348 default: ESL_XEXCEPTION(eslFAIL, "bogus state in traceback");
349 } /* end switch over statetype[tpos-1] */
350
351 if (status != eslOK) goto ERROR;
352 } /* end traceback, at S state */
353
354 if ((status = p7_trace_Reverse(tr)) != eslOK) goto ERROR;
355 if (ret_i2k != NULL) { *ret_i2k = i2k; } else free(i2k);
356 if (ret_k2i != NULL) { *ret_k2i = k2i; } else free(k2i);
357 if (ret_isc != NULL) { *ret_isc = isc; } else free(isc);
358 if (ret_iconflict != NULL) { *ret_iconflict = iconflict; } else free(iconflict);
359 return eslOK;
360
361 ERROR:
362 if (ret_i2k != NULL) { *ret_i2k = i2k; } else free(i2k);
363 if (ret_k2i != NULL) { *ret_k2i = k2i; } else free(k2i);
364 if (ret_isc != NULL) { *ret_isc = isc; } else free(isc);
365 return status;
366 }
367
368
369
370 /* Function: Parsetree2i_to_k()
371 * Date: EPN, Mon Aug 11 13:41:38 2008
372 *
373 * Purpose: Given a parsetree, fill a vector i2k[0.1..i..L] = k, saying that
374 * residue i is emitted into consensus column k. If k <= 0, this implies
375 * residue i was inserted after consensus column (-1 * k).
376 *
377 * Returns: eslOK on success
378 * eslEINCOMPAT on contract violation.
379 */
380 int
Parsetree2i_to_k(CM_t * cm,CMEmitMap_t * emap,int L,char * errbuf,Parsetree_t * tr,int ** ret_i2k)381 Parsetree2i_to_k(CM_t *cm, CMEmitMap_t *emap, int L, char *errbuf, Parsetree_t *tr, int **ret_i2k)
382 {
383 int status; /* Easel status code */
384 int tidx; /* counter through positions in the parsetree */
385 int v; /* state index in CM */
386 int nd;
387 int *i2k;
388
389 ESL_ALLOC(i2k, sizeof(int) * (L+1));
390 esl_vec_ISet(i2k, (L+1), -1 * (cm->clen + 1));
391
392 /* contract check */
393 if(emap == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "Parsetree2i_to_k(): emap == NULL.");
394
395 /* trivial preorder traverse, since we're already numbered that way */
396 for (tidx = 0; tidx < tr->n; tidx++) {
397 v = tr->state[tidx]; /* index of parent state in CM */
398 nd = cm->ndidx[v];
399 if (v == cm->M) continue; /* special case: v is EL, local alignment end */
400 switch (cm->sttype[v]) {
401 case MP_st:
402 i2k[tr->emitl[tidx]] = emap->lpos[nd];
403 i2k[tr->emitr[tidx]] = emap->rpos[nd];
404 break;
405
406 case ML_st:
407 i2k[tr->emitl[tidx]] = emap->lpos[nd];
408 break;
409
410 case MR_st:
411 i2k[tr->emitr[tidx]] = emap->rpos[nd];
412 break;
413
414 case IL_st:
415 i2k[tr->emitl[tidx]] = (-1 * emap->lpos[nd]);
416 break;
417
418 case IR_st:
419 i2k[tr->emitr[tidx]] = (-1 * (emap->rpos[nd] - 1)); /* IR's emit to before consensus column rpos */
420 break;
421
422 }
423 }
424
425 *ret_i2k = i2k;
426 return eslOK;
427
428 ERROR:
429 ESL_FAIL(status, errbuf, "Parsetree2i_to_k(), memory allocation error.");
430 return status; /* NEVERREACHED */
431 }
432
433 /* Function: prune_i2k()
434 * Incept: EPN, Mon Aug 11 15:02:35 2008
435 *
436 * Purpose: Prune an i2k array of pins. Optionally prune by the following
437 * criteria (in this order):
438 * o match score of the pin
439 * o length (n) of n-mer each pin exists in
440 * o proximity to end of n-mer
441 *
442 * Args: i2k - the input pin array, modified (pruned) in place
443 * iconflict - [0.1..i..L] TRUE if i:i2k[i] pin is involved in a conflict, remove the whole nmer pin
444 * isc - score (match emission) for each pin
445 * L - length of current sequence
446 * phi - phi array, phi[k][v] is expected number of times (probability)
447 * state v (0 = match, 1 insert, 2 = delete) in
448 * node k is *entered*. Node 0 is special, state 0 = B state, state 1 = N_state, state 2 = NULL
449 * Calculated *without* taking insert->insert transitions into account.
450 * min_sc - minimum score to allow as a pin, 0. to allow any score
451 * min_len - min n-mer size, 1 to allow any size
452 * min_end - min distance from end to allow, prune away any others, 0 to not prune based on end proximity
453 * min_mprob - min match phi probability to allow in a pin
454 * min_mcprob - min cumulative match phi probability to allow in a nmer pin
455 * max_iprob - max insert phi probability to allow in a pin
456 * max_ilprob - max insert phi probability to allow in a state to the left of a pin
457 *
458 * Return: <eslOK> on success.
459 *
460 */
461 int
prune_i2k(int * i2k,int * iconflict,float * isc,int L,double ** phi,float min_sc,int min_len,int min_end,float min_mprob,float min_mcprob,float max_iprob,float max_ilprob)462 prune_i2k(int *i2k, int *iconflict, float *isc, int L, double **phi, float min_sc, int min_len, int min_end, float min_mprob, float min_mcprob, float max_iprob, float max_ilprob)
463 {
464 int i, j;
465
466 int do_end;
467 int do_sc;
468 int do_len;
469 float tol = 1e-5;
470 float mcprob = 1.; /* cumulative match probability */
471 int n = 0;
472 int ip;
473
474 do_sc = (esl_FCompare(0., min_sc, tol) == eslOK) ? FALSE : TRUE;
475 do_end = (min_end == 0) ? FALSE : TRUE;
476 do_len = (min_len == 1) ? FALSE : TRUE;
477
478 /* pass 1, prune on conflict from trace, this must be first pass */
479 for(i = 1; i <= L; i++) {
480 if(iconflict[i] == TRUE) {
481 /* remove entire nmer's pins */
482 ip = i;
483 while((ip > 0) && (i2k[ip] != -1)) i2k[ip--] = -1;
484 ip = i;
485 while((ip <= L) && (i2k[ip] != -1)) i2k[ip++] = -1;
486 }
487 }
488
489 /* pass 2, prune on scores */
490 if(do_sc) {
491 for(i = 1; i <= L; i++) if((i2k[i] != -1) && (isc[i] < min_sc)) i2k[i] = -1;
492 }
493
494 /* pass 3, prune on phi values */
495 for(i = 1; i <= L; i++) {
496 if(i2k[i] != -1) { /* position i is currently pinned */
497 if ((phi[i2k[i]][HMMMATCH] < min_mprob) || /* match prob too low */
498 (phi[i2k[i]][HMMINSERT] > max_iprob) || /* insert to the right prob too high */
499 (phi[(i2k[i]-1)][HMMINSERT] > max_ilprob)) { /* insert to the left prob too high */
500 /* remove pin */
501 /*printf("removing pin %4d %.4f\n", i, phi[i2k[i]][HMMMATCH]);*/
502 i2k[i] = -1;
503 }
504 }
505 }
506
507 /* pass 4, prune on nmer length, match score, match phi probability, and distance from end */
508 if(do_len || do_end) { /* determine the size n of the n-mer each residue i is in */
509 for(i = 1; i <= L; i++) {
510 if((i2k[i] == -1) || /* position i is not pinned */
511 ((i2k[(i-1)] != -1) && !(i2k[(i-1)] == (i2k[i]-1)))) { /* position i is pinned to k, position i-1 is pinned to kp, but k and kp are not consecutive */
512 /* we just ended an n-mer (n >= 0) */
513 if(n > 0 && n < min_len) {
514 for(j = (i - n); j < i; j++) i2k[j] = -1; /* remove the n-mer */
515 mcprob = 1.;
516 }
517 else if (do_end && n >= min_len) { /* n >= min_len, remove those within do_end of end */
518 for(j = (i - n); j < (i - (n+1)) + min_end; j++) i2k[j] = -1; /* remove the part of the n-mer within nend residues of the beginning edge */
519 for(j = (i - min_end); j < i; j++) i2k[j] = -1; /* remove the part of the n-mer within nend residues of the end edge */
520 }
521 if(i2k[i] == -1) { /* position i is not pinned */
522 mcprob = 1.;
523 n = 0;
524 }
525 else { /* position i is pinned to k, position i-1 is pinned to kp, but k and kp are not consecutive */
526 mcprob = phi[(i2k[i])][HMMMATCH];
527 n = 1;
528 if(mcprob < min_mcprob) { i2k[i] = -1; n = 0; mcprob = 1.; }
529 }
530 }
531 else {
532 n++;
533 mcprob *= phi[(i2k[i])][HMMMATCH];
534 if(mcprob < min_mcprob) { /* remove the n-mer up to this point */
535 for(j = (i - n); j < i; j++) i2k[j] = -1; /* remove the n-mer */
536 mcprob = 1.;
537 n = 0;
538 }
539 }
540 }
541 /* deal with possibility that last n residues were an n-mer, with n < min_len */
542 if(n > 0 && n < min_len) { for(j = (i - n); j < i; j++) i2k[j] = -1; /* remove the n-mer */}
543 }
544
545 return eslOK;
546 }
547
548 /* Function: p7_pins2bands()
549 * Incept: EPN, Mon Aug 11 15:41:44 2008
550 *
551 * Purpose: Given an i2k pins array, determine the imin and imax bands.
552 *
553 * Args: i2k - the input pin array, modified (pruned) in place
554 * errbuf - for error messages
555 * L - length of current sequence
556 * M - number of nodes in the HMM
557 * pad - pad on each side of pin, if pad = 3, we allow +/- 3 residues from pin
558 * ret_kmin - [0.i..L] = k, min node k for residue i
559 * ret_kmax - [0.i..L] = k, max node k for residue i
560 * ret_ncells - number of cells within bands, to return
561 #if 0
562 * ret_imin - [0.k..M] = i, min residue i for node k
563 * ret_imax - [0.k..M] = i, max residue i for node k
564 #endif
565 *
566 * Return: <eslOK> on success.
567 *
568 */
569 int
p7_pins2bands(int * i2k,char * errbuf,int L,int M,int pad,int ** ret_kmin,int ** ret_kmax,int * ret_ncells)570 p7_pins2bands(int *i2k, char *errbuf, int L, int M, int pad, int **ret_kmin, int **ret_kmax, int *ret_ncells)
571 {
572 int status;
573
574 int i;
575 int kn = 0;
576 int kx = M;
577 int *kmin, *kmax;
578
579 ESL_ALLOC(kmin, sizeof(int) * (L+1));
580 ESL_ALLOC(kmax, sizeof(int) * (L+1));
581
582 /* traverse residues left to right to get kmins */
583 for(i = 0; i <= L; i++) {
584 if(i2k[i] != -1) {
585 if(kn >= i2k[i] && kn > 1) {
586 ESL_FAIL(eslFAIL, errbuf, "p7_pins2bands() error i: %d, i2k[i]: %d but current kn: %d\n", i, i2k[i], kn);
587 }
588 kn = ESL_MAX(1, i2k[i] - pad);
589 }
590 kmin[i] = kn;
591 }
592
593 /* traverse nodes right to left to get imaxs */
594 for(i = L; i >= 0; i--) {
595 if(i2k[i] != -1) {
596 if(kx <= i2k[i] && kx < L) ESL_FAIL(eslFAIL, errbuf, "p7_pins2bands() error: i: %d, i2k[i]: %d but current kx: %d\n", i, i2k[i], kx);
597 kx = ESL_MIN(M, i2k[i] + pad);
598 }
599 kmax[i] = kx;
600 }
601
602 /* M_0 == B state, which must start the parse with i == 0 */
603 kmin[0] = 0;
604
605 /* get number of cells if wanted */
606 int ncells;
607 if(ret_ncells != NULL) {
608 ncells = 0;
609 for(i = 1; i <= L; i++) ncells += kmax[i] - kmin[i] + 1;
610 *ret_ncells = ncells;
611 }
612
613 if(ret_kmin != NULL) { *ret_kmin = kmin; } else free(kmin);
614 if(ret_kmax != NULL) { *ret_kmax = kmax; } else free(kmax);
615 return eslOK;
616
617 ERROR:
618 ESL_FAIL(status, errbuf, "p7_pins2bands() memory error.");
619 return status; /* NEVERREACHED */
620
621 #if 0
622 /* if we want to get imin/imax instead of kmin/kmax */
623 int in = 1;
624 int ix = L;
625 int *imin;
626 int *imax;
627 int *k2i;
628
629 ESL_ALLOC(imin, sizeof(int) * (M+1));
630 ESL_ALLOC(imax, sizeof(int) * (M+1));
631 ESL_ALLOC(k2i, sizeof(int) * (M+1));
632
633 imin[0] = imax[0] = -1;
634 esl_vec_ISet(k2i, (M+1), -1);
635
636 for(i = 1; i <= L; i++) if(i2k[i] != -1) k2i[i2k[i]] = i;
637
638 /* traverse nodes left to right to get imins */
639 for(k = 1; k <= M; k++) {
640 if(k2i[k] != -1) {
641 if(k2i[k] != 1 && in >= k2i[k]) ESL_FAIL(eslFAIL, errbuf, "p7_pins2bands() error k: %d, k2i[k]: %d but current in: %d\n", k, k2i[k], in);
642 in = ESL_MAX(1, k2i[k] - pad);
643 }
644 imin[k] = in;
645 }
646
647 /* traverse nodes right to left to get imaxs */
648 for(k = M; k >= 1; k--) {
649 if(k2i[k] != L && k2i[k] != -1) {
650 if(ix <= k2i[k]) ESL_FAIL(eslFAIL, errbuf, "p7_pins2bands() error: k: %d, k2i[k]: %d but current ix: %d\n", k, k2i[k], ix);
651 ix = ESL_MIN(L, k2i[k] + pad);
652 }
653 imax[k] = ix;
654 }
655
656 free(k2i);
657
658 /* get number of cells if wanted */
659 int ncells;
660 if(ret_ncells != NULL) {
661 ncells = 0;
662 for(k = 1; k <= M; k++) ncells += imax[k] - imin[k] + 1;
663 *ret_ncells = ncells;
664 }
665 if(ret_imin != NULL) { *ret_imin = imin; } else free(imin);
666 if(ret_imax != NULL) { *ret_imax = imax; } else free(imax);
667 #endif
668 }
669
670 /* Function: DumpP7Bands()
671 * Incept: EPN, Thu Aug 14 08:45:54 2008
672 *
673 * Purpose: Given i2k and kmin, kmax arrays, print them.
674 *
675 * Args: i2k - [0.k..M] = i, node k is pinned to residue i
676 * kmin - [0.i..L] = k, min node k for residue i
677 * kmax - [0.i..L] = k, max node k for residue i
678 * L - length of current sequence
679 *
680 * Return: <eslOK> on success.
681 *
682 */
683 int
DumpP7Bands(FILE * fp,int * i2k,int * kmin,int * kmax,int L)684 DumpP7Bands(FILE *fp, int *i2k, int *kmin, int *kmax, int L)
685 {
686 int i;
687
688 fprintf(fp, "# %4s %4s %4s ... %4s\n", "i", "i2k", "kmin", "kmax");
689 fprintf(fp, "# %4s %4s %13s\n", "----", "----", "-------------");
690 for(i = 0; i <= L; i++) {
691 fprintf(fp, " %4d %4d %4d ... %4d\n", i, i2k[i], kmin[i], kmax[i]);
692 }
693 return eslOK;
694
695 }
696
697 /* Function: cp9_ForwardP7B()
698 *
699 * Purpose: Runs the banded Forward dynamic programming algorithm on an
700 * input sequence (1..L). Complements cp9_BackwardP7B().
701 * The 'P7B' suffix indicates plan 7 HMM derived bands
702 * in the kmin and kmax arrays are applied. This function
703 * was derived from cp9_Forward(), differences from that
704 * function were introduced solely to impose bands on the
705 * matrix.
706 *
707 * Because of the bands, some options that exist to cp9_Forward()
708 * (like be_efficient and do_scan and reporting hits in scan mode)
709 * are not available here. This function is meant to be used
710 * solely for first stage of a Forward, Backward, Posterior type
711 * calculation.
712 *
713 * Also due to bands, only L is passed as seq length, instead
714 * of i0 and j0 (seq start/stop). This simplifies the application
715 * of bands kmin[i]/kmax[i] refers to residue i.
716 *
717 * This function requires that local EL states are turned off, if
718 * they're not it returns. A separate function should exist to align
719 * with EL states on, I don't think it's worth the extra computations to
720 * make 1 function that handles both.
721 *
722 * See additional notes in cp9_Forward() "Purpose" section.
723 *
724 * Args:
725 * cp9 - the CP9 HMM
726 * errbuf - char buffer for error messages
727 * mx - the matrix, expanded to correct size (if nec), and filled here
728 * dsq - sequence in digitized form
729 * L - start of target subsequence (1 for beginning of dsq)
730 * kmin - [0.1..i..N] minimum k for residue i
731 * kmax - [0.1..i..N] maximum k for residue i
732 * ret_sc - RETURN: log P(S|M,bands)/P(S|R), as a bit score
733 *
734 * Returns: eslOK on success;
735 * eslEINCOMPAT on contract violation;
736 */
737 #define INBAND(i,k) ((k >= kmin[i]) && (k <= kmax[i]))
738
739 int
cp9_ForwardP7B(CP9_t * cp9,char * errbuf,CP9_MX * mx,ESL_DSQ * dsq,int L,int * kmin,int * kmax,float * ret_sc)740 cp9_ForwardP7B(CP9_t *cp9, char *errbuf, CP9_MX *mx, ESL_DSQ *dsq, int L, int *kmin, int *kmax, float *ret_sc)
741 {
742 int status;
743 int i; /* j-W: position in the subsequence */
744 int k; /* CP9 HMM node position */
745 int **mmx; /* DP matrix for match state scores [0..1][0..cp9->M] */
746 int **imx; /* DP matrix for insert state scores [0..1][0..cp9->M] */
747 int **dmx; /* DP matrix for delete state scores [0..1][0..cp9->M] */
748 int **elmx; /* DP matrix for EL state scores [0..1][0..cp9->M] */
749 int *erow; /* end score for each position [0..1] */
750 int M; /* cp9->M, query length, number of consensus nodes of model */
751 int kp, kn, kx, kpcur, kpprv;
752
753 /* Contract checks */
754 if(cp9 == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B, cp9 is NULL.\n");
755 if(dsq == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B, dsq is NULL.");
756 if(mx == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B, mx is NULL.\n");
757 if(mx->M != cp9->M) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B, mx->M != cp9->M.\n");
758 if(kmin == NULL || kmax == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B, kmin and/or kmax == NULL.\n");
759 if(cp9->flags & CPLAN9_EL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B, cp9 EL flag up.\n");
760
761 M = cp9->M;
762
763 int const *tsc = cp9->otsc; /* ptr to efficiently ordered transition scores */
764
765 /* gamma allocation and initialization.
766 * This is a little SHMM that finds an optimal scoring parse
767 * of multiple nonoverlapping hits. */
768
769 /* Rearrange DP matrix for this seq */
770 if((status = GrowCP9Matrix(mx, errbuf, L, M, kmin, kmax, &mmx, &imx, &dmx, &elmx, &erow)) != eslOK) return status;
771 ESL_DPRINTF2(("#DEBUG: cp9_ForwardP7B(): CP9 matrix size: %.8f Mb rows: %d.\n", mx->size_Mb, mx->rows));
772
773 /* Initialization of the zero row. */
774 mmx[0][0] = 0; /* M_0 is state B, and everything starts in B */
775 imx[0][0] = -INFTY; /* I_0 is state N, can't get here without emitting*/
776 dmx[0][0] = -INFTY; /* D_0 doesn't exist. */
777 elmx[0][0]= -INFTY; /* can't go from B to EL state */
778 erow[0] = -INFTY;
779
780 /* Because there's a D state for every node 1..M,
781 dmx[0][k] is possible for all k 1..M (if it's within kmin[0]..kmax[0]) */
782 int sc;
783 i = 0;
784 kn = ESL_MAX(1, kmin[0]);
785 kp = kn - kmin[0];
786 for (k = kn; k <= kmax[0]; k++, kp++) {
787 assert(kp >= 0);
788 mmx[0][kp] = imx[0][kp] = elmx[0][kp] = -INFTY; /* need seq to get here */
789 sc = -INFTY;
790 if(kp > 0) { /* if kp == 0, kp-1 is outside the bands */
791 sc = ILogsum(ILogsum(mmx[0][kp-1] + CP9TSC(cp9O_MD,k-1),
792 imx[0][kp-1] + CP9TSC(cp9O_ID,k-1)),
793 dmx[0][kp-1] + CP9TSC(cp9O_DD,k-1));
794 }
795 dmx[0][kp] = sc;
796 }
797 /* We can do a full parse through all delete states. */
798 erow[0] = -INFTY;
799 if(INBAND(0, M)) { erow[0] = dmx[0][M] + CP9TSC(cp9O_DM,M); }
800
801 /*****************************************************************
802 * The main loop: scan the sequence from position 1 to L.
803 *****************************************************************/
804 /* Recursion. */
805
806 for (i = 1; i <= L; i++)
807 {
808 int const *isc = cp9->isc[dsq[i]];
809 int const *msc = cp9->msc[dsq[i]];
810 int endsc = -INFTY;
811 int sc;
812
813 if(kmin[i] == 0) {
814 mmx[i][0] = -INFTY;
815 dmx[i][0] = -INFTY; /*D_0 is non-existent*/
816 elmx[i][0] = -INFTY; /*no EL state for node 0 */
817 sc = ILogsum(ILogsum(mmx[i-1][0] + CP9TSC(cp9O_MI,0),
818 imx[i-1][0] + CP9TSC(cp9O_II,0)),
819 dmx[i-1][0] + CP9TSC(cp9O_DI,0));
820 imx[i][0] = sc + isc[0];
821 kn = 1; /* kmin[i] + 1 */
822 }
823 else {
824 kn = kmin[i];
825 }
826
827 /*match state*/
828 kn = ESL_MAX(kn, (kmin[i-1]+1)); /* start at first cell from which we can look back to a valid cell at *mx[i-1][k-1] */
829 kx = ESL_MIN(kmax[i], kmax[i-1]+1);
830
831 /* NOT SURE ABOUT THIS AND HOW IT COUPLES WITH BLOCK ABOVE kmin[i] == 0 */
832 for (kpcur = 0; kpcur < (kn - kmin[i]); kpcur++) mmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
833 for (kpcur = kx-kmin[i]+1; kpcur <= kmax[i]-kmin[i]; kpcur++) mmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
834
835 kpcur = kn - kmin[i]; /* unnec, loop above ends with this */
836 kpprv = kn - kmin[i-1];
837 for (k = kn; k <= kx; k++, kpcur++, kpprv++) {
838 /*printf("M i: %d kpprv: %d\n", i, kpprv);*/
839 assert((kpprv-1) >= 0);
840
841 sc = ILogsum(ILogsum(mmx[i-1][kpprv-1] + CP9TSC(cp9O_MM,k-1),
842 imx[i-1][kpprv-1] + CP9TSC(cp9O_IM,k-1)),
843 dmx[i-1][kpprv-1] + CP9TSC(cp9O_DM,k-1));
844
845 /* FIX ME: inefficient! check B->M_K transition */
846 if(INBAND(i-1, 0)) { /* if i-1 is in k == 0's band */
847 assert(kmin[(i-1)] == 0);
848 if(mmx[i-1][0] != -INFTY)
849 sc = ILogsum(sc, mmx[i-1][0] + CP9TSC(cp9O_BM,k));
850 }
851
852 if(sc != -INFTY) {
853 mmx[i][kpcur] = sc + msc[k];
854 /* E state update */
855 endsc = ILogsum(endsc, mmx[i][kpcur] + CP9TSC(cp9O_ME,k));
856 }
857 else {
858 mmx[i][kpcur] = -INFTY;
859 /* don't update E state */
860 }
861 ///printf("mmx[i:%4d][k:%4d(%4d)]: %d\n", i, k, kpcur, mmx[i][kpcur]);
862 }
863
864 /* insert state*/
865 kn = ESL_MAX(kmin[i], kmin[i-1]);
866 kx = ESL_MIN(kmax[i], kmax[i-1]);
867
868 for (kpcur = 0; kpcur < (kn - kmin[i]); kpcur++) imx[i][kpcur] = -INFTY; /* impossible to reach these guys */
869 for (kpcur = kx-kmin[i]+1; kpcur <= kmax[i]-kmin[i]; kpcur++) imx[i][kpcur] = -INFTY; /* impossible to reach these guys */
870
871 kpcur = kn - kmin[i]; /* unnec, loop above ends with this */
872 kpprv = kn - kmin[i-1];
873 for (k = kn; k <= kx; k++, kpcur++, kpprv++) {
874 /*insert state*/
875 assert(kpprv >= 0);
876 /* HERE, EVENTUALLY IF kmin/kmax differ b/t Match and Inserts:
877 * only look at match states from k that have i-1 within band */
878 /* all insert states from k should have i-1 within band */
879 /*printf("I i: %d kpprv: %d\n", i, kpprv);*/
880 sc = ILogsum(ILogsum(mmx[i-1][kpprv] + CP9TSC(cp9O_MI,k),
881 imx[i-1][kpprv] + CP9TSC(cp9O_II,k)),
882 dmx[i-1][kpprv] + CP9TSC(cp9O_DI,k));
883 if(sc != -INFTY) imx[i][kpcur] = sc + isc[k];
884 else imx[i][kpcur] = -INFTY;
885 ///printf("imx[i:%4d][k:%4d(%4d)]: %d\n", i, k, kpcur, imx[i][kpcur]);
886 }
887
888 /*delete state*/
889 kn = kmin[i]+1;
890
891 for (kpcur = 0; kpcur < (kn - kmin[i]); kpcur++) dmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
892 kpcur = kn - kmin[i]; /* unnec, loop above ends with this */
893 for (k = kn; k <= kmax[i]; k++, kpcur++) { /* should I be adding one for delete off-by-one?? */
894 sc = ILogsum(ILogsum(mmx[i][kpcur-1] + CP9TSC(cp9O_MD,k-1),
895 imx[i][kpcur-1] + CP9TSC(cp9O_ID,k-1)),
896 dmx[i][kpcur-1] + CP9TSC(cp9O_DD,k-1));
897 dmx[i][kpcur] = sc;
898 ///printf("dmx[i:%4d][k:%4d(%4d)]: %d\n", i, k, kpcur, dmx[i][kpcur]);
899 }
900 /*printf("mmx [jp:%d][%d]: %d\n", jp, k, mmx[j][k]);
901 printf("imx [jp:%d][%d]: %d\n", jp, k, imx[j][k]);
902 printf("dmx [jp:%d][%d]: %d\n", jp, k, dmx[j][k]);
903 printf("elmx[jp:%d][%d]: %d\n", jp, k, elmx[j][k]);*/
904
905 if(INBAND(i, M)) {
906 endsc = ILogsum(ILogsum(endsc, dmx[i][M-kmin[i]] + CP9TSC(cp9O_DM,M)), /* transition from D_M -> end */
907 imx[i][M-kmin[i]] + CP9TSC(cp9O_IM,M)); /* transition from I_M -> end */
908 }
909 /* transition penalty to EL incurred when EL was entered */
910 /*printf("endsc: %d\n", endsc);*/
911
912 erow[i] = endsc;
913 } /* end loop over end positions i */
914
915 *ret_sc = Scorify(erow[L]);
916 ESL_DPRINTF1(("#DEBUG: cp9_ForwardP7B() return score: %10.4f\n", Scorify(erow[L])));
917
918 return eslOK;
919 }
920
921
922
923 /* Function: cp9_ForwardP7B_OLD_WITH_EL()
924 *
925 * Purpose: Runs the banded Forward dynamic programming algorithm on an
926 * input sequence (1..L). Complements cp9_BackwardP7B().
927 * The 'P7B' suffix indicates plan 7 HMM derived bands
928 * in the kmin and kmax arrays are applied. This function
929 * was derived from cp9_Forward(), differences from that
930 * function were introduced solely to impose bands on the
931 * matrix.
932 *
933 * Because of the bands, some options that exist to cp9_Forward()
934 * (like be_efficient and do_scan and reporting hits in scan mode)
935 * are not available here. This function is meant to be used
936 * solely for first stage of a ForwardP7B, BackwardP7B, Posterior type
937 * calculation.
938 *
939 * Also due to bands, only L is passed as seq length, instead
940 * of i0 and j0 (seq start/stop). This simplifies the application
941 * of bands kmin[i]/kmax[i] refers to residue i.
942 *
943 * See additional notes in cp9_Forward() "Purpose" section.
944 *
945 * NOTE: EPN, Thu Aug 14 09:54:36 2008
946 * This is the original version written to potentially handle EL
947 * states, but I think omitting them when they're off is significantly
948 * more efficient, so I wrote a competing function called
949 * cp9_ForwardP7B(). Note the included EL dp steps HAVE NOT BEEN
950 * TESTED YET! I just left them here as a starting point if I
951 * ever want to implement them.
952 *
953 * Args:
954 * cp9 - the CP9 HMM
955 * errbuf - char buffer for error messages
956 * mx - the matrix, expanded to correct size (if nec), and filled here
957 * dsq - sequence in digitized form
958 * L - start of target subsequence (1 for beginning of dsq)
959 * kmin - [0.1..i..N] minimum k for residue i
960 * kmax - [0.1..i..N] maximum k for residue i
961 * ret_sc - RETURN: log P(S|M,bands)/P(S|R), as a bit score
962 *
963 * Returns: eslOK on success;
964 * eslEINCOMPAT on contract violation;
965 */
966
967 int
cp9_ForwardP7B_OLD_WITH_EL(CP9_t * cp9,char * errbuf,CP9_MX * mx,ESL_DSQ * dsq,int L,int * kmin,int * kmax,float * ret_sc)968 cp9_ForwardP7B_OLD_WITH_EL(CP9_t *cp9, char *errbuf, CP9_MX *mx, ESL_DSQ *dsq, int L, int *kmin, int *kmax, float *ret_sc)
969 {
970 int status;
971 int i; /* j-W: position in the subsequence */
972 int k; /* CP9 HMM node position */
973 int **mmx; /* DP matrix for match state scores [0..1][0..cp9->M] */
974 int **imx; /* DP matrix for insert state scores [0..1][0..cp9->M] */
975 int **dmx; /* DP matrix for delete state scores [0..1][0..cp9->M] */
976 int **elmx; /* DP matrix for EL state scores [0..1][0..cp9->M] */
977 int *erow; /* end score for each position [0..1] */
978 int c; /* counter for EL states */
979 int M; /* cp9->M, query length, number of consensus nodes of model */
980 int kn, kx, kpcur, kpprv, kpprv_el;
981
982 /* Contract checks */
983 if(cp9 == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B_OLD_WITH_EL, cm->cp9 is NULL.\n");
984 if(dsq == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B_OLD_WITH_EL, dsq is NULL.");
985 if(mx == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B_OLD_WITH_EL, mx is NULL.\n");
986 if(mx->M != cp9->M) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B_OLD_WITH_EL, mx->M != cp9->M.\n");
987 if(kmin == NULL || kmax == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B_OLD_WITH_EL, kmin and/or kmax == NULL.\n");
988
989 M = cp9->M;
990
991 int const *tsc = cp9->otsc; /* ptr to efficiently ordered transition scores */
992
993 /* gamma allocation and initialization.
994 * This is a little SHMM that finds an optimal scoring parse
995 * of multiple nonoverlapping hits. */
996
997 /* Grow DP matrix if nec, to either 2 rows or L+1 rows (depending on be_efficient),
998 * stays M+1 columns */
999 if((status = GrowCP9Matrix(mx, errbuf, L, M, kmin, kmax, &mmx, &imx, &dmx, &elmx, &erow)) != eslOK) return status;
1000 ESL_DPRINTF2(("#DEBUG: cp9_ForwardP7B_OLD_WITH_EL(): CP9 matrix size: %.8f Mb rows: %d.\n", mx->size_Mb, mx->rows));
1001
1002 /* Initialization of the zero row. */
1003 mmx[0][0] = 0; /* M_0 is state B, and everything starts in B */
1004 imx[0][0] = -INFTY; /* I_0 is state N, can't get here without emitting*/
1005 dmx[0][0] = -INFTY; /* D_0 doesn't exist. */
1006 elmx[0][0]= -INFTY; /* can't go from B to EL state */
1007 erow[0] = -INFTY;
1008
1009 /* Because there's a D state for every node 1..M,
1010 dmx[0][k] is possible for all k 1..M */
1011 int sc;
1012 i = 0;
1013 kn = ESL_MAX(1, kmin[0]+1);
1014 kpcur = kn - kmin[0];
1015 for (k = kn; k <= kmax[0]; k++, kpcur++) {
1016 assert(kpcur >= 1);
1017 mmx[0][kpcur] = imx[0][kpcur] = elmx[0][kpcur] = -INFTY; /* need seq to get here */
1018 sc = ILogsum(ILogsum(mmx[0][kpcur-1] + CP9TSC(cp9O_MD,k-1),
1019 imx[0][kpcur-1] + CP9TSC(cp9O_ID,k-1)),
1020 dmx[0][kpcur-1] + CP9TSC(cp9O_DD,k-1));
1021 dmx[0][kpcur] = sc;
1022 }
1023 /* We can do a full parse through all delete states. */
1024 erow[0] = -INFTY;
1025 if(INBAND(0, M)) { erow[0] = dmx[0][M] + CP9TSC(cp9O_DM,M); }
1026
1027 /*****************************************************************
1028 * The main loop: scan the sequence from position 1 to L.
1029 *****************************************************************/
1030 /* Recursion. */
1031
1032 for (i = 1; i <= L; i++)
1033 {
1034 int const *isc = cp9->isc[dsq[i]];
1035 int const *msc = cp9->msc[dsq[i]];
1036 int endsc = -INFTY;
1037 int el_selfsc = cp9->el_selfsc;
1038 int sc;
1039
1040 if(kmin[i] == 0) {
1041 mmx[i][0] = -INFTY;
1042 dmx[i][0] = -INFTY; /*D_0 is non-existent*/
1043 elmx[i][0] = -INFTY; /*no EL state for node 0 */
1044 sc = ILogsum(ILogsum(mmx[i-1][0] + CP9TSC(cp9O_MI,0),
1045 imx[i-1][0] + CP9TSC(cp9O_II,0)),
1046 dmx[i-1][0] + CP9TSC(cp9O_DI,0));
1047 imx[i][0] = sc + isc[0];
1048 kn = 1; /* kmin[i] + 1 */
1049 }
1050 else {
1051 kn = kmin[i];
1052 }
1053
1054 /*match state*/
1055 kn = ESL_MAX(kn, (kmin[i-1]+1)); /* start at first cell from which we can look back to a valid cell at *mx[i-1][k-1] */
1056 kx = ESL_MIN(kmax[i], kmax[i-1]+1);
1057
1058 /* NOT SURE ABOUT THIS AND HOW IT COUPLES WITH BLOCK ABOVE kmin[i] == 0 */
1059 for (kpcur = 0; kpcur < (kn - kmin[i]); kpcur++) mmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1060 for (kpcur = kx-kmin[i]+1; kpcur <= kmax[i]-kmin[i]; kpcur++) mmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1061
1062 for (kpcur = 0; kpcur < (kn - kmin[i]); kpcur++) elmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1063 for (kpcur = kx-kmin[i]+1; kpcur <= kmax[i]-kmin[i]; kpcur++) elmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1064
1065 kpcur = kn - kmin[i]; /* unnec, loop above ends with this */
1066 kpprv = kn - kmin[i-1];
1067 for (k = kn; k <= kx; k++, kpcur++, kpprv++) {
1068 /*printf("M i: %d kpprv: %d\n", i, kpprv);*/
1069 assert((kpprv-1) >= 0);
1070
1071 sc = ILogsum(ILogsum(mmx[i-1][kpprv-1] + CP9TSC(cp9O_MM,k-1),
1072 imx[i-1][kpprv-1] + CP9TSC(cp9O_IM,k-1)),
1073 dmx[i-1][kpprv-1] + CP9TSC(cp9O_DM,k-1));
1074
1075 /* FIX ME: inefficient! check B->M_K transition */
1076 if(INBAND(i-1, 0)) { /* if i-1 is in k == 0's band */
1077 assert(kmin[(i-1)] == 0);
1078 if(mmx[i-1][0] != -INFTY)
1079 sc = ILogsum(sc, mmx[i-1][0] + CP9TSC(cp9O_BM,k));
1080 }
1081
1082 /* check possibility we came from an EL, if they're valid */
1083 for(c = 0; c < cp9->el_from_ct[k]; c++) { /* el_from_ct[k] is >= 0 */
1084 if(INBAND(i-1, cp9->el_from_idx[k][c])) {
1085 kpprv_el = cp9->el_from_idx[k][c] - kmin[(i-1)];
1086 sc = ILogsum(sc, elmx[i-1][kpprv_el]);
1087 }
1088 } /* transition penalty to EL incurred when EL was entered */
1089 if(sc != -INFTY) {
1090 mmx[i][kpcur] = sc + msc[k];
1091 /* E state update */
1092 endsc = ILogsum(endsc, mmx[i][kpcur] + CP9TSC(cp9O_ME,k));
1093 }
1094 else {
1095 mmx[i][kpcur] = -INFTY;
1096 /* don't update E state */
1097 }
1098 /*printf("k: %4d mmx[i:%4d][kpcur:%4d]: %d\n", k, i, kpcur, mmx[i][kpcur]);*/
1099
1100 /* el state */
1101 sc = -INFTY;
1102 if((cp9->flags & CPLAN9_EL) && cp9->has_el[k]) /* not all HMM nodes have an EL state (for ex:
1103 HMM nodes that map to right half of a MATP_MP) */
1104 {
1105 if(mmx[i][kpcur] != -INFTY) {
1106 kpprv_el = k - kmin[(i-1)];
1107 sc = ILogsum(mmx[i][kpcur] + CP9TSC(cp9O_MEL,k), /* transitioned from cur node's match state */
1108 elmx[i-1][kpprv_el] + el_selfsc); /* transitioned from cur node's EL state emitted ip on transition */
1109 }
1110 }
1111 elmx[i][kpcur] = sc;
1112 }
1113
1114 /* insert state*/
1115 kn = ESL_MAX(kmin[i], kmin[i-1]);
1116 kx = ESL_MIN(kmax[i], kmax[i-1]);
1117
1118 for (kpcur = 0; kpcur < (kn - kmin[i]); kpcur++) imx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1119 for (kpcur = kx-kmin[i]+1; kpcur <= kmax[i]-kmin[i]; kpcur++) imx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1120
1121 kpcur = kn - kmin[i]; /* unnec, loop above ends with this */
1122 kpprv = kn - kmin[i-1];
1123 for (k = kn; k <= kx; k++, kpcur++, kpprv++) {
1124 /*insert state*/
1125 assert(kpprv >= 0);
1126 /* HERE, EVENTUALLY IF kmin/kmax differ b/t Match and Inserts:
1127 * only look at match states from k that have i-1 within band */
1128 /* all insert states from k should have i-1 within band */
1129 /*printf("I i: %d kpprv: %d\n", i, kpprv);*/
1130 sc = ILogsum(ILogsum(mmx[i-1][kpprv] + CP9TSC(cp9O_MI,k),
1131 imx[i-1][kpprv] + CP9TSC(cp9O_II,k)),
1132 dmx[i-1][kpprv] + CP9TSC(cp9O_DI,k));
1133 if(sc != -INFTY) imx[i][kpcur] = sc + isc[k];
1134 else imx[i][kpcur] = -INFTY;
1135 /*printf("k: %4d imx[i:%4d][kpcur:%4d]: %d\n", k, i, kpcur, imx[i][kpcur]);*/
1136 }
1137
1138 /*delete state*/
1139 kn = kmin[i]+1;
1140
1141 for (kpcur = 0; kpcur < (kn - kmin[i]); kpcur++) dmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1142 kpcur = kn - kmin[i]; /* unnec, loop above ends with this */
1143 for (k = kn; k <= kmax[i]; k++, kpcur++) { /* should I be adding one for delete off-by-one?? */
1144 sc = ILogsum(ILogsum(mmx[i][kpcur-1] + CP9TSC(cp9O_MD,k-1),
1145 imx[i][kpcur-1] + CP9TSC(cp9O_ID,k-1)),
1146 dmx[i][kpcur-1] + CP9TSC(cp9O_DD,k-1));
1147 dmx[i][kpcur] = sc;
1148 /*printf("k: %4d dmx[i:%4d][kpcur:%4d]: %d\n", k, i, kpcur, dmx[i][kpcur]);*/
1149 }
1150 /*printf("mmx [jp:%d][%d]: %d\n", jp, k, mmx[j][k]);
1151 printf("imx [jp:%d][%d]: %d\n", jp, k, imx[j][k]);
1152 printf("dmx [jp:%d][%d]: %d\n", jp, k, dmx[j][k]);
1153 printf("elmx[jp:%d][%d]: %d\n", jp, k, elmx[j][k]);*/
1154
1155 if(INBAND(i, M)) {
1156 endsc = ILogsum(ILogsum(endsc, dmx[i][M-kmin[i]] + CP9TSC(cp9O_DM,M)), /* transition from D_M -> end */
1157 imx[i][M-kmin[i]] + CP9TSC(cp9O_IM,M)); /* transition from I_M -> end */
1158 for(c = 0; c < cp9->el_from_ct[M+1]; c++) { /* el_from_ct[k] is >= 0 */
1159 if(INBAND(i, cp9->el_from_idx[M+1][c])) {
1160 kpprv_el = cp9->el_from_idx[M+1][c] - kmin[i];
1161 endsc = ILogsum(endsc, elmx[i][kpprv_el]);
1162 }
1163 }
1164 }
1165 /* transition penalty to EL incurred when EL was entered */
1166 /*printf("endsc: %d\n", endsc);*/
1167
1168 erow[i] = endsc;
1169 } /* end loop over end positions i */
1170
1171 *ret_sc = Scorify(erow[L]);
1172 ESL_DPRINTF1(("#DEBUG: cp9_ForwardP7B_OLD_WITH_EL() return score: %10.4f\n", Scorify(erow[L])));
1173
1174 return eslOK;
1175 }
1176
1177 /* Function: cp9_BackwardP7B()
1178 *
1179 * Purpose: Runs the banded Backward dynamic programming algorithm on an
1180 * input sequence (1..L). Complements cp9_ForwardP7B().
1181 * The 'P7B' suffix indicates plan 7 HMM derived bands
1182 * in the kmin and kmax arrays are applied. This function
1183 * was derived from cp9_Backward(), differences from that
1184 * function were introduced solely to impose bands on the
1185 * matrix.
1186 *
1187 * Because of the bands, some options that exist to cp9_Forward()
1188 * (like be_efficient and do_scan and reporting hits in scan mode)
1189 * are not available here. This function is meant to be used
1190 * solely for second stage of a Forward, Backward, Posterior type
1191 * calculation.
1192 *
1193 * Also due to bands, only L is passed as seq length, instead
1194 * of i0 and j0 (seq start/stop). This simplifies the application
1195 * of bands kmin[i]/kmax[i] refers to residue i.
1196 *
1197 * This function requires that local EL states are turned off, if
1198 * they're not it returns. A separate function should exist to align
1199 * with EL states on, I don't think it's worth the extra computations to
1200 * make 1 function that handles both.
1201 *
1202 * See additional notes in cp9_Backward() "Purpose" section.
1203 *
1204 * Args:
1205 * cp9 - the CP9 HMM
1206 * errbuf - char buffer for error messages
1207 * mx - the matrix, expanded to correct size (if nec), and filled here
1208 * dsq - sequence in digitized form
1209 * L - start of target subsequence (1 for beginning of dsq)
1210 * kmin - [0.1..i..N] minimum k for residue i
1211 * kmax - [0.1..i..N] maximum k for residue i
1212 * ret_sc - RETURN: log P(S|M,bands)/P(S|R), as a bit score, this is B->M[0][0]
1213 *
1214 * Returns: eslOK on success;
1215 * eslEINCOMPAT on contract violation;
1216 */
1217 int
cp9_BackwardP7B(CP9_t * cp9,char * errbuf,CP9_MX * mx,ESL_DSQ * dsq,int L,int * kmin,int * kmax,float * ret_sc)1218 cp9_BackwardP7B(CP9_t *cp9, char *errbuf, CP9_MX *mx, ESL_DSQ *dsq, int L, int *kmin, int *kmax, float *ret_sc)
1219 {
1220 int status;
1221 int i; /* j-W: position in the subsequence */
1222 int k; /* CP9 HMM node position */
1223 int **mmx; /* DP matrix for match state scores [0..1][0..cp9->M] */
1224 int **imx; /* DP matrix for insert state scores [0..1][0..cp9->M] */
1225 int **dmx; /* DP matrix for delete state scores [0..1][0..cp9->M] */
1226 int **elmx; /* DP matrix for EL state scores [0..1][0..cp9->M] */
1227 int *erow; /* end score for each position [0..1] */
1228 int c; /* counter for EL states */
1229 int M; /* cp9->M */
1230 int kpcur, kpprv, kpcur_el;
1231 int kprv, kprvn, kprvx;
1232 int kn, kx;
1233 float fsc;
1234
1235 /* Contract checks */
1236 if(cp9 == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_BackwardP7B, cp9 is NULL.\n");
1237 if(dsq == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_BackwardP7B, dsq is NULL.");
1238 if(mx == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_BackwardP7B, mx is NULL.\n");
1239 if(mx->M != cp9->M) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_BackwardP7B, mx->M != cm->clen.\n");
1240 if(cp9->flags & CPLAN9_EL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_BackwardP7B, cp9 EL flag up!\n");
1241
1242 M = cp9->M;
1243
1244 int const *tsc = cp9->otsc; /* ptr to efficiently ordered transition scores */
1245
1246 /* Rearrange DP matrix for this seq */
1247 if((status = GrowCP9Matrix(mx, errbuf, L, M, kmin, kmax, &mmx, &imx, &dmx, &elmx, &erow)) != eslOK) return status;
1248 ESL_DPRINTF2(("#DEBUG: cp9_BackwardP7B(): CP9 matrix size: %.8f Mb rows: %d.\n", mx->size_Mb, mx->rows));
1249
1250 /* Initialization of the L row. */
1251 i = L;
1252
1253 /*******************************************************************
1254 * 0 Handle EL, looking at EL_k->E for all valid k.
1255 * we're going backwards so we have to work out of order, we could get
1256 * around this by storing the nodes each EL goes TO in an el_to_ct[] vec. */
1257 /* init to -INFTY */
1258 kpcur = 0;
1259 for (k = kmin[i]; k <= kmax[i]; k++, kpcur++) elmx[i][kpcur] = -INFTY;
1260 if(cp9->flags & CPLAN9_EL)
1261 {
1262 for(c = 0; c < cp9->el_from_ct[cp9->M+1]; c++) /* el_from_ct[cp9->M+1] holds # ELs that can go to END */
1263 if(INBAND(i, cp9->el_from_idx[M+1][c])) {
1264 kpcur_el = cp9->el_from_idx[M+1][c] - kmin[i];
1265 elmx[i][kpcur_el] = 0.;
1266 }
1267 }
1268 /*******************************************************************/
1269
1270 /* elmx[cur][cp9->M] is either 0 (if EL_M exists (it would nec be in el_from_idx[cp9->M+1] array if it does, so
1271 * it would be filled with 0 in above loop), or -INFTY if it doesn't exist. We don't add possibility of EL_M -> EL_M
1272 * self loop b/c it's impossible to do that without emitting, and we've already seen our last res emitted.
1273 * either way we don't have to modify it */
1274
1275 if(INBAND(i, M)) { /* if i is in k == M's band */
1276 assert(M == kmax[i]);
1277 kpcur = M-kmin[i];
1278 mmx[i][kpcur] = 0. +
1279 ILogsum(elmx[i][kpcur] + CP9TSC(cp9O_MEL, M),/* M_M<-EL_M<-E, with 0 self loops in EL_M */
1280 CP9TSC(cp9O_ME,M)); /* M_M<-E ... everything ends in E (the 0; 2^0=1.0) */
1281 mmx[i][kpcur] += cp9->msc[dsq[i]][M]; /* ... + emitted match symbol */
1282 imx[i][kpcur] = 0. + CP9TSC(cp9O_IM,M); /* I_M<-E ... everything ends in E (the 0; 2^0=1.0) */
1283 imx[i][kpcur] += cp9->isc[dsq[i]][M]; /* ... + emitted insert symbol */
1284 dmx[i][kpcur] = CP9TSC(cp9O_DM,M); /* D_M<-E */
1285 kx = M-1;
1286 kpcur--;
1287 }
1288 else { kx = kmax[i]; kpcur = kmax[i]-kmin[i]; }
1289 /*******************************************************************
1290 * No need to look at EL_k->M_M b/c elmx[i] with i == L means last emitted residue was L+1
1291 * and this is impossible if we've come from M_M (only would be valid if we were coming from
1292 * E which is handled above with the EL_k->E code).
1293 *******************************************************************/
1294
1295 for (k = kx; k >= kmin[i]; k--, kpcur--)
1296 {
1297 mmx[i][kpcur] = 0 + CP9TSC(cp9O_ME,k); /*M_k<- E */
1298
1299 if(INBAND(i, k+1)) {
1300 mmx[i][kpcur] = ILogsum(mmx[i][kpcur], dmx[i][kpcur+1] + CP9TSC(cp9O_MD,k));
1301 }
1302 if(cp9->flags & CPLAN9_EL)
1303 mmx[i][kpcur] = ILogsum(mmx[i][kpcur], elmx[i][kpcur] + CP9TSC(cp9O_MEL,k));
1304
1305 mmx[i][kpcur] += cp9->msc[dsq[i]][k];
1306
1307 /*******************************************************************
1308 * No need to look at EL_k->M_M b/c elmx[i] with i == L means last emitted residue was L+1
1309 * and this is impossible if we've come from M_M (only would be valid if we were coming from
1310 * E which is handled above with the EL_k->E code).
1311 *******************************************************************/
1312
1313 if(INBAND(i, k+1)) {
1314 imx[i][kpcur] = dmx[i][kpcur+1] + CP9TSC(cp9O_ID,k);
1315 imx[i][kpcur] += cp9->isc[dsq[i]][k];
1316
1317 dmx[i][kpcur] = dmx[i][kpcur+1] + CP9TSC(cp9O_DD,k);
1318 }
1319 else {
1320 imx[i][kpcur] = -INFTY;
1321 dmx[i][kpcur] = -INFTY;
1322 }
1323 /* elmx[i][k] was set above, out of order */
1324
1325 ////printf("mmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, mmx[i][kpcur]);
1326 ////printf("imx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, imx[i][kpcur]);
1327 ////printf("dmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, dmx[i][kpcur]);
1328 }
1329
1330 if(INBAND(i, 0)) {
1331 /* remember M_0 is special, the B state, a non-emitter */
1332 mmx[i][0] = dmx[i][1] + CP9TSC(cp9O_MD,0); /* M_0(B)->D_1, no seq emitted, all deletes */
1333 /* above line is diff from CPBackwardOLD() which has mmx[i][0] = -INFTY; */
1334 imx[i][0] = dmx[i][1] + CP9TSC(cp9O_ID,0);
1335 imx[i][0] += cp9->isc[dsq[i]][0];
1336
1337 dmx[i][0] = -INFTY; /*D_0 doesn't exist*/
1338 elmx[i][0] = -INFTY; /*EL_0 doesn't exist*/
1339 }
1340
1341 /*****************************************************************
1342 * The main loop: scan the sequence from position j0-1 to i0.
1343 *****************************************************************/
1344 /* Reision */
1345 for (i = L-1; i >= 1; i--)
1346 {
1347 /* init EL mx to -INFTY */
1348 kpcur = 0;
1349 for (k = kmin[i]; k <= kmax[i]; k++, kpcur++) elmx[i][kpcur] = -INFTY;
1350
1351 /* Deal with node k == M first */
1352 /* elmx[i][k] could have come from self (EL_k), we
1353 * can't have come from END b/c we haven't emitted the last res of the seq yet.
1354 */
1355 if(INBAND(i, M)) {
1356 kpcur = M-kmin[i];
1357 if((cp9->flags & CPLAN9_EL) && (cp9->has_el[M]))
1358 elmx[i][kpcur] = elmx[i][kpcur] + cp9->el_selfsc;
1359
1360 if(INBAND(i+1, M)) {
1361 kpprv = M-kmin[i+1];
1362 mmx[i][kpcur] = imx[i+1][kpprv] + CP9TSC(cp9O_MI,M);
1363 mmx[i][kpcur] += cp9->msc[dsq[i]][M];
1364
1365 imx[i][kpcur] = imx[i+1][kpprv] + CP9TSC(cp9O_II,M);
1366 imx[i][kpcur] += cp9->isc[dsq[i]][M];
1367
1368 dmx[i][M] = imx[i+1][kpprv] + CP9TSC(cp9O_DI,M);
1369 }
1370 else {
1371 mmx[i][kpcur] = imx[i][kpcur] = dmx[i][kpcur] = -INFTY;
1372 }
1373
1374 if((cp9->flags & CPLAN9_EL) && (cp9->has_el[M]))
1375 mmx[i][kpcur] = ILogsum(mmx[i][kpcur], elmx[i][kpcur] + CP9TSC(cp9O_MEL,M));
1376
1377 /*******************************************************************
1378 * 1b Handle EL, looking at EL_k->M_M for all valid k.
1379 * EL_k->M_M transition, which has no transition penalty */
1380 if(INBAND(i+1, M)) {
1381 if(cp9->flags & CPLAN9_EL)
1382 {
1383 for(c = 0; c < cp9->el_from_ct[M]; c++) /* el_from_ct[M] holds # ELs that can go to M_M */
1384 if(INBAND(i, cp9->el_from_idx[M][c])) {
1385 kpcur_el = cp9->el_from_idx[M][c] - kmin[i];
1386 elmx[i][kpcur_el] = ILogsum(elmx[i][kpcur_el], mmx[i+1][kpprv]);
1387 }
1388 }
1389 }
1390 }
1391
1392 /*********************************************************/
1393 /* MATCH: *_k <- M_k+1 transitions FROM a match state*/
1394 kn = ESL_MAX(kmin[i], kmin[i+1]-1); /* start at first cell from which we can look ahead to a valid cell at mmx[i-1][k-1] */
1395 kn = ESL_MAX(kn, 1); /* kn can't go all the way down to 0, that's a special case, handled outside the main loop */
1396 kx = ESL_MIN(kmax[i], kmax[i+1]-1);
1397
1398 for (kpcur = 0; kpcur < (kn - kmin[i]); kpcur++) mmx[i][kpcur] = imx[i][kpcur] = dmx[i][kpcur] = elmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1399 for (kpcur = kx-kmin[i]+1; kpcur <= kmax[i]-kmin[i]; kpcur++) mmx[i][kpcur] = imx[i][kpcur] = dmx[i][kpcur] = elmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1400
1401 kpcur = kx - kmin[i]; /* unnec, loop above ends with this */
1402 kpprv = kx - kmin[i+1];
1403 for (k = kx; k >= kn; k--, kpcur--, kpprv--)
1404 {
1405 /* Handle EL, looking at EL_k->M_k for all valid k and EL_k->EL_k
1406 * we're going backwards so we have to work out of order
1407 * we could get around this by storing the nodes each EL goes TO
1408 * in an el_to_ct[] vector. */
1409 if(cp9->flags & CPLAN9_EL) {
1410 for(c = 0; c < cp9->el_from_ct[k]; c++) { /* el_from_ct[k] holds # ELs that can go to M_k */
1411 if(INBAND(i, cp9->el_from_idx[k][c])) {
1412 kpcur_el = cp9->el_from_idx[k][c] - kmin[i];
1413 elmx[i][kpcur_el] = ILogsum(elmx[i][kpcur_el], mmx[i+1][kpprv]);
1414 /* EL<-M, penalty incurred when we enter EL (i.e. leave going backwards) */
1415 }
1416 }
1417 }
1418
1419 /* Finish off elmx[i][k] with possibility of coming from self (EL_k),
1420 * elmx[i][k] will have been filled by block above for ks > current k,
1421 * no M_k -> EL_k' with k' > k */
1422 if(INBAND(i+1, k)) {
1423 if((cp9->flags & CPLAN9_EL) && (cp9->has_el[k]))
1424 elmx[i][k] = ILogsum(elmx[i][kpcur], elmx[i+1][kpprv] + cp9->el_selfsc);
1425 }
1426 mmx[i][kpcur] = mmx[i+1][kpprv+1] + CP9TSC(cp9O_MM,k);
1427 imx[i][kpcur] = mmx[i+1][kpprv+1] + CP9TSC(cp9O_IM,k);
1428 dmx[i][kpcur] = mmx[i+1][kpprv+1] + CP9TSC(cp9O_DM,k);
1429 }
1430
1431 /*********************************************************/
1432 /* INSERTIONS: *_k <- I_k+1 transitions FROM a insert state*/
1433 kn = ESL_MAX(kmin[i], kmin[i+1]); /* start at first cell from which we can look ahead to a valid cell at imx[i-1][k] */
1434 kn = ESL_MAX(kn, 1); /* kn can't go all the way down to 0, that's a special case, handled outside the main loop */
1435 kx = ESL_MIN(kmax[i], kmax[i+1]);
1436
1437 kpcur = kx - kmin[i]; /* unnec, loop above ends with this */
1438 kpprv = kx - kmin[i+1];
1439 for (k = kx; k >= kn; k--, kpcur--, kpprv--)
1440 {
1441 mmx[i][kpcur] = ILogsum(mmx[i][kpcur], imx[i+1][kpprv] + CP9TSC(cp9O_MI,k));
1442 imx[i][kpcur] = ILogsum(imx[i][kpcur], imx[i+1][kpprv] + CP9TSC(cp9O_II,k));
1443 dmx[i][kpcur] = ILogsum(dmx[i][kpcur], imx[i+1][kpprv] + CP9TSC(cp9O_DI,k));
1444 }
1445
1446 /*********************************************************/
1447 /* DELETIONS: *_k <- D_k+1 transitions FROM a delete state*/
1448 kn = ESL_MAX(kmin[i], kmin[i]-1); /* start at first cell from which we can look ahead to a valid cell at imx[i-1][k] */
1449 kn = ESL_MAX(kn, 1); /* kn can't go all the way down to 0, that's a special case, handled outside the main loop */
1450 kx = ESL_MIN(kmax[i], kmax[i]-1);
1451
1452 kpcur = kx - kmin[i]; /* unnec, loop above ends with this */
1453 for (k = kx; k >= kn; k--, kpcur--)
1454 {
1455 mmx[i][kpcur] = ILogsum(mmx[i][kpcur], dmx[i][kpcur+1] + CP9TSC(cp9O_MD,k));
1456 imx[i][kpcur] = ILogsum(imx[i][kpcur], dmx[i][kpcur+1] + CP9TSC(cp9O_ID,k));
1457 dmx[i][kpcur] = ILogsum(dmx[i][kpcur], dmx[i][kpcur+1] + CP9TSC(cp9O_DD,k));
1458
1459 /* now add in the emission score */
1460 mmx[i][kpcur] += cp9->msc[dsq[i]][k];
1461 imx[i][kpcur] += cp9->isc[dsq[i]][k];
1462 }
1463 /* there's one valid cell that we didn't consider in above loop to add emission score
1464 * b/c D_k <- D_k+1 (so k+1 is out of bounds for Delete) */
1465 for(k = kx+1; k <= kmax[i]; k++) {
1466 kpcur = k - kmin[i]; /* unnec, loop above ends with this */
1467 mmx[i][kpcur] += cp9->msc[dsq[i]][k];
1468 imx[i][kpcur] += cp9->isc[dsq[i]][k];
1469 }
1470 /*********************************************************/
1471 kpcur = kmax[i] - kmin[i];
1472 for(k = kmax[i]; k >= kmin[i] && k > 0; k--, kpcur--) {
1473 ////printf("mmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, mmx[i][kpcur]);
1474 ////printf("imx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, imx[i][kpcur]);
1475 ////printf("dmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, dmx[i][kpcur]);
1476 }
1477 /* special case when k == 0 */
1478 kpcur = 0;
1479 kpprv = 0 - kmin[i+1];
1480 if(INBAND(i, 0)) {
1481 assert(kmin[i] == 0);
1482 dmx[i][kpcur] = -INFTY; /* D_0 does not exist */
1483 elmx[i][kpcur] = -INFTY; /* EL_0 does not exist */
1484
1485 /* INSERT k=0 */
1486 imx[i][kpcur] = -INFTY;
1487 /* imx[i][0] is filled same as imx[i][1..k] in the loop above */
1488 if(INBAND(i+1, 1)) {
1489 if(mmx[i+1][kpprv+1] != -INFTY)
1490 imx[i][kpcur] = ILogsum(imx[i][kpcur], mmx[i+1][kpprv+1] + CP9TSC(cp9O_IM,0));
1491 }
1492 if(INBAND(i+1, 0)) {
1493 if(imx[i+1][kpprv] != -INFTY)
1494 imx[i][kpcur] = ILogsum(imx[i][kpcur], imx[i+1][kpprv] + CP9TSC(cp9O_II,0));
1495 }
1496 if(INBAND(i, 1)) {
1497 if(dmx[i][kpcur+1] != -INFTY)
1498 imx[i][kpcur] = ILogsum(imx[i][kpcur], dmx[i][kpcur+1] + CP9TSC(cp9O_ID,0));
1499 }
1500
1501 /*M_0 is the B state, it doesn't emit, and can be reached from any match via a begin transition */
1502 kprvn = ESL_MAX(1, kmin[i+1]);
1503 kprvx = kmax[i+1];
1504 /* careful, don't change kpcur - this is a special case the M_0 state, we loop over all M_k children, only kpprv changes */
1505 kpprv = kprvx - kmin[i+1];
1506 mmx[i][kpcur] = -INFTY;
1507 for(kprv = kprvx; kprv >= kprvn; kprv--, kpprv--) {
1508 if(mmx[i+1][kpprv] != -INFTY)
1509 mmx[i][kpcur] = ILogsum(mmx[i][kpcur], (mmx[i+1][kpprv] + CP9TSC(cp9O_BM,kprv)));
1510 }
1511 k = 0;
1512 if(INBAND(i+1, 0)) {
1513 kpprv = k - kmin[i+1];
1514 if(imx[i+1][kpprv] != -INFTY) {
1515 mmx[i][kpcur] = ILogsum(mmx[i][kpcur], (imx[i+1][kpprv] + CP9TSC(cp9O_MI,0)));
1516 }
1517 }
1518 if(INBAND(i, 1)) {
1519 if(dmx[i][kpcur+1] != -INFTY) {
1520 mmx[i][kpcur] = ILogsum(mmx[i][kpcur], (dmx[i][kpcur+1] + CP9TSC(cp9O_MD,0))); /* B->D_1 */
1521 }
1522 }
1523 }
1524 ////printf("mmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, 0, kpcur, mmx[i][kpcur]);
1525 ////printf("imx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, 0, kpcur, imx[i][kpcur]);
1526 ////printf("dmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, 0, kpcur, dmx[i][kpcur]);
1527 }
1528
1529 /*******************************************************************/
1530 /* Special case: i == 0 */
1531
1532 /* initialize all match, inserts, deletes and ELs to -INFTY
1533 * deletes and M_0 cells MAY get changed later, inserts and ELs always stay -INFTY
1534 * b/c we need at least 1 residue to get to those cells */
1535
1536 i = 0;
1537 kpcur = k - kmin[0];
1538 for (kpcur = 0; kpcur <= kmax[0] - kmin[0]; kpcur++) mmx[i][kpcur] = imx[i][kpcur] = dmx[i][kpcur] = elmx[i][kpcur] = -INFTY;
1539
1540 /* D_M(i == 0) <- I_M(i = 1) */
1541 if(INBAND(i, M)) {
1542 kpcur = M - kmin[i];
1543 kpprv = M - kmin[i+1];
1544 if(INBAND(i+1, M)) {
1545 dmx[i][kpcur] = imx[i+1][kpprv] + CP9TSC(cp9O_DI,M);
1546 }
1547 }
1548
1549 /* update D_k(i == 0) cells */
1550 /* D_k(i == 0) <- M_k+1(i == 1) */
1551 kn = ESL_MAX(kmin[i], kmin[i+1]-1); /* start at first cell from which we can look back to a valid cell at *mx[i-1][k-1] */
1552 kn = ESL_MAX(kn, 1); /* kn can't go all the way down to 0, that's a special case, handled outside the main loop */
1553 kx = ESL_MIN(kmax[i], kmax[i+1]-1);
1554 kpcur = kx - kmin[i];
1555 kpprv = kx - kmin[i+1];
1556 for (k = kx; k >= kn; k--, kpcur--, kpprv--)
1557 dmx[i][kpcur] = mmx[i+1][kpprv+1] + CP9TSC(cp9O_DM,k);
1558
1559 /* D_k(i == 0) <- I_k(i == 1) */
1560 kn = ESL_MAX(kmin[i], kmin[i+1]); /* start at first cell from which we can look ahead to a valid cell at imx[i-1][k] */
1561 kn = ESL_MAX(kn, 1); /* kn can't go all the way down to 0, that's a special case, handled outside the main loop */
1562 kx = ESL_MIN(kmax[i], kmax[i+1]);
1563 kpcur = kx - kmin[i]; /* unnec, loop above ends with this */
1564 kpprv = kx - kmin[i+1];
1565 for (k = kx; k >= kn; k--, kpcur--, kpprv--)
1566 dmx[i][kpcur] = ILogsum(dmx[i][kpcur], imx[i+1][kpprv] + CP9TSC(cp9O_DI,k));
1567
1568 /* D_k(i == 0) <- D_k+1(i == 0) */
1569 kn = ESL_MAX(kmin[i], kmin[i]-1); /* start at first cell from which we can look ahead to a valid cell at imx[i-1][k] */
1570 kn = ESL_MAX(kn, 1); /* kn can't go all the way down to 0, that's a special case, handled outside the main loop */
1571 kx = ESL_MIN(kmax[i], kmax[i]-1);
1572 kpcur = kx - kmin[i]; /* unnec, loop above ends with this */
1573 for (k = kx; k >= kn; k--, kpcur--)
1574 dmx[i][kpcur] = ILogsum(dmx[i][kpcur], dmx[i][kpcur+1] + CP9TSC(cp9O_DD,k));
1575
1576 ////printf("mmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, mmx[i][kpcur]);
1577 ////printf("imx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, imx[i][kpcur]);
1578 ////printf("dmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, dmx[i][kpcur]);
1579
1580 /* Case when k == 0 (i is still 0) */
1581 k = 0;
1582 if(INBAND(i, 0)) {
1583 assert(kmin[i] == 0);
1584 imx[i][0] = -INFTY; /* need seq to get here */
1585 dmx[i][0] = -INFTY; /* D_0 does not exist */
1586 elmx[i][0] = -INFTY; /* EL_0 does not exist */
1587 mmx[i][0] = -INFTY;
1588
1589 /*M_0 is the B state, it doesn't emit, and can be reached from any match via a begin transition */
1590 kprvn = ESL_MAX(1, kmin[i+1]);
1591 kprvx = kmax[i+1];
1592
1593 kpcur = 0; /* special case, M_0 = B state */
1594 /* careful, this is a different assignment to kpcur b/c it's the M_0 state, we loop over all M_k children, only kpprv changes */
1595 kpprv = kprvx - kmin[i+1];
1596 mmx[i][kpcur] = -INFTY;
1597 for(kprv = kprvx; kprv >= kprvn; kprv--, kpprv--) {
1598 if(mmx[i+1][kpprv] != -INFTY)
1599 mmx[i][kpcur] = ILogsum(mmx[i][kpcur], (mmx[i+1][kpprv] + CP9TSC(cp9O_BM,kprv)));
1600 }
1601 k = 0;
1602 if(INBAND(i+1, 0)) {
1603 kpprv = k - kmin[i+1];
1604 if(imx[i+1][kpprv] != -INFTY) {
1605 mmx[i][kpcur] = ILogsum(mmx[i][kpcur], (imx[i+1][kpprv] + CP9TSC(cp9O_MI,0)));
1606 }
1607 }
1608 if(INBAND(i, 1)) {
1609 if(dmx[i][kpcur+1] != -INFTY) {
1610 mmx[i][kpcur] = ILogsum(mmx[i][kpcur], (dmx[i][kpcur+1] + CP9TSC(cp9O_MD,0))); /* B->D_1 */
1611 }
1612 }
1613 }
1614 /* No EL contribution here, can't go B->EL_* */
1615
1616 fsc = Scorify(mmx[i][0]);
1617 /**********************************************************************************/
1618 /* End of Backward recursion */
1619
1620 ESL_DPRINTF1(("#DEBUG: cp9_BackwardP7B() return score: %10.4f\n", fsc));
1621 return eslOK;
1622 }
1623
1624
1625 /* Function: cp9_CheckFBP7B()
1626 *
1627 * Purpose: Debugging function to make sure the P7 banded
1628 * DP functions cp9_ForwardP7B() and cp9_BackwardP7B()
1629 * are working by checking:
1630 * For all positions i, and states k within kmin[i]..kmax[i]:
1631 * sum_k f[i][k] * b[i][k] = P(x|hmm)
1632 *
1633 * Args: fmx - p7 banded forward dp matrix, already filled
1634 * bmx - p7 banded backward dp matrix, already filled
1635 * hmm - the model
1636 * sc - P(x|hmm, p7 bands) the probability of the entire
1637 * seq given the model
1638 * i0 - start of target subsequence (often 1, beginning of dsq)
1639 * j0 - end of target subsequence (often L, end of dsq)
1640 * dsq - the digitized sequence
1641 *
1642 * Note about sequence position indexing: although this function
1643 * works on a subsequence from i0 to j0, fmx and bmx have offset indices,
1644 * from 1 to L, with L = j0-i0+1.
1645 *
1646 * Return: eslOK on success;
1647 * eslFAIL if any residue fails check
1648 */
1649 int
cp9_CheckFBP7B(CP9_MX * fmx,CP9_MX * bmx,CP9_t * hmm,char * errbuf,float sc,int i0,int j0,ESL_DSQ * dsq,int * kmin,int * kmax)1650 cp9_CheckFBP7B(CP9_MX *fmx, CP9_MX *bmx, CP9_t *hmm, char *errbuf, float sc, int i0, int j0, ESL_DSQ *dsq, int *kmin, int *kmax)
1651 {
1652 if(fmx == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_CheckFBP7B(), fmx is NULL.\n");
1653 if(bmx == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_CheckFBP7B(), bmx is NULL.\n");
1654 if(dsq == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_CheckFBP7B(), dsq is NULL.");
1655
1656 int k, i;
1657 float max_diff; /* maximum allowed difference between sc and
1658 * sum_k f[i][k] * b[i][k] for any i */
1659 float diff;
1660 int fb_sum;
1661 float fb_sc;
1662 int L; /* subsequence length */
1663 int kp; /* k', relative k within band; k-kmin[i] */
1664 int to_add;
1665
1666 L = j0-i0+1; /* the length of the subsequence */
1667 max_diff = 0.1; /* tolerance, must be within .1 bits of original score */
1668
1669 /* In all possible paths through the model, each residue of the sequence must have
1670 * been emitted by exactly 1 insert, match or EL state. */
1671 for (i = 1; i <= L; i++) {
1672 fb_sum = -INFTY;
1673 kp = 0;
1674 for (k = kmin[i]; k <= kmax[i]; k++, kp++) {
1675 if (fmx->mmx[i][kp] == -INFTY) to_add = -INFTY;
1676 else if(bmx->mmx[i][kp] == -INFTY) to_add = -INFTY;
1677 else {
1678 to_add = fmx->mmx[i][kp] + bmx->mmx[i][kp];
1679 if(k > 0) to_add -= hmm->msc[dsq[i]][k];
1680 }
1681 /* hmm->msc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx
1682 * unless, we're talking about M_0, the B state, it doesn't emit */
1683 fb_sum = ILogsum(fb_sum, to_add);
1684
1685 /*printf("fmx->mmx[i:%4d][k:%4d(%4d)]: %d\n", i, k, kp, fmx->mmx[i][kp]);
1686 printf("bmx->mmx[i:%4d][k:%4d(%4d)]: %d sum: %d\n", i, k, kp, (bmx->mmx[i][kp]-hmm->msc[dsq[i]][k]), fb_sum);*/
1687
1688 if (fmx->imx[i][kp] == -INFTY) to_add = -INFTY;
1689 else if(bmx->imx[i][kp] == -INFTY) to_add = -INFTY;
1690 else {
1691 to_add = fmx->imx[i][kp] + bmx->imx[i][kp];
1692 to_add -= hmm->isc[dsq[i]][k];
1693 }
1694 /*hmm->isc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
1695 fb_sum = ILogsum(fb_sum, to_add);
1696
1697 /*printf("fmx->imx[i:%4d][k:%4d(%4d)]: %d\n", i, k, kp, fmx->imx[i][kp]);
1698 printf("bmx->imx[i:%4d][k:%4d(%4d)]: %d sum: %d\n", i, k, kp, (bmx->imx[i][kp]-hmm->isc[dsq[i]][k]), fb_sum);*/
1699
1700 if (fmx->elmx[i][kp] == -INFTY) to_add = -INFTY;
1701 else if(bmx->elmx[i][kp] == -INFTY) to_add = -INFTY;
1702 else {
1703 to_add = fmx->elmx[i][kp] + bmx->elmx[i][kp];
1704 /* EL emissions are by definition zero scoring */
1705 }
1706 fb_sum = ILogsum(fb_sum, to_add);
1707
1708 /*printf("fmx->elmx[i:%4d][k:%4d(%4d)]: %d\n", i, k, kp, fmx->elmx[i][kp]);
1709 printf("bmx->elmx[i:%4d][k:%4d(%4d)]: %d sum: %d\n", i, k, kp, bmx->elmx[i][kp], fb_sum);*/
1710 }
1711 fb_sc = Scorify(fb_sum);
1712 diff = fabs(fb_sc - sc);
1713 /*printf("FB CHECK: i: %4d %10.4f %10.4f (%10.4f)\n", i, fb_sc, sc, diff);*/
1714 if((fabs(diff) > max_diff))
1715 ESL_FAIL(eslFAIL, errbuf, "cp9_CheckFB(), residue at posn i:%d violates sum_k f[i][k]*b[i][k]=P(x|hmm), sum_k = %.4f bits (should be %.4f)\n", i, fb_sc, sc);
1716 }
1717 ESL_DPRINTF1(("#DEBUG: cp9_CheckFB() passed, Forward/Backward matrices pass check.\n"));
1718 /*printf("cp9_CheckFB() passed, Forward/Backward matrices pass check.\n");*/
1719 return eslOK;
1720 }
1721
1722
1723
1724 /* Function: cp9_Seq2BandsP7B
1725 * Date : EPN, Fri Aug 15 13:43:21 2008
1726 *
1727 * Purpose: Given a CM with precalc'ed CP9 HMM, CP9Map, and HMMER3 plan 7
1728 * HMM bands for the CP9 HMM DP matrices, a sequence and
1729 * a CP9Bands_t structure, calculate the CP9 HMM bands and store them
1730 * in the CP9Bands_t structure.
1731 *
1732 * Note: this function was never updated to handle truncated
1733 * alignment.
1734 *
1735 * Args: cm - the covariance model
1736 * errbuf - char buffer for reporting errors
1737 * fmx - CP9 dp matrix for Forward()
1738 * bmx - CP9 dp matrix for Backward()
1739 * pmx - CP9 dp matrix to fill with posteriors, can == bmx
1740 * dsq - sequence in digitized form
1741 * L - length of sequence we're aligning (1..L)
1742 * cp9b - PRE-ALLOCATED, the HMM bands for this sequence, filled here.
1743 * kmin - P7 dervied band to enforce: [0.i..L] = k, min node k for residue i
1744 * kmax - P7 derived band to enforce: [0.i..L] = k, min node k for residue i
1745 * debug_level - verbosity level for debugging printf()s
1746 * Return: eslOK on success;
1747 *
1748 */
1749 int
cp9_Seq2BandsP7B(CM_t * cm,char * errbuf,CP9_MX * fmx,CP9_MX * bmx,CP9_MX * pmx,ESL_DSQ * dsq,int L,CP9Bands_t * cp9b,int * kmin,int * kmax,int debug_level)1750 cp9_Seq2BandsP7B(CM_t *cm, char *errbuf, CP9_MX *fmx, CP9_MX *bmx, CP9_MX *pmx, ESL_DSQ *dsq, int L, CP9Bands_t *cp9b, int *kmin, int *kmax, int debug_level)
1751 {
1752 int status;
1753 int use_sums; /* TRUE to fill and use posterior sums during HMM band calc, yields wider bands */
1754 float sc;
1755 int do_old_hmm2ij;
1756 CP9_t *cp9 = NULL; /* ptr to cp9 HMM (this could be Lcp9, Rcp9, Tcp9 if we update this function to possibly handle truncated alignment) */
1757
1758 /* Contract checks */
1759 if(cm->cp9map == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2BandsP7B, but cm->cp9map is NULL.\n");
1760 if(dsq == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2BandsP7B, dsq is NULL.");
1761 if(!((cm->align_opts & CM_ALIGN_HBANDED) || (cm->search_opts & CM_SEARCH_HBANDED))) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2BandsP7B, CM_ALIGN_HBANDED and CM_SEARCH_HBANDED flags both down, exactly 1 must be up.\n");
1762 if((cm->search_opts & CM_SEARCH_HMMALNBANDS) && (!(cm->search_opts & CM_SEARCH_HBANDED))) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2BandsP7B, CM_SEARCH_HMMALNBANDS flag raised, but not CM_SEARCH_HBANDED flag, this doesn't make sense\n");
1763 if(cm->tau > 0.5) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2BandsP7B, cm->tau (%f) > 0.5, we can't deal.", cm->tau);
1764
1765 use_sums = ((cm->align_opts & CM_ALIGN_SUMS) || (cm->search_opts & CM_SEARCH_SUMS)) ? TRUE : FALSE;
1766 do_old_hmm2ij = ((cm->align_opts & CM_ALIGN_HMM2IJOLD) || (cm->search_opts & CM_SEARCH_HMM2IJOLD)) ? TRUE : FALSE;
1767
1768 /* Determine which cp9 HMM to use: If the CM has local begins on, use cm->cp9loc, else use cm->cp9glb */
1769 cp9 = cm->cp9;
1770 if(cp9 == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2Posteriors, relevant cp9 is NULL.\n");
1771
1772 /* Step 1: Get HMM Forward/Backward DP matrices.
1773 * Step 2: F/B -> HMM bands.
1774 * Step 3: HMM bands -> CM bands.
1775 */
1776
1777 /* Step 1: Get HMM Forward/Backward DP matrices. */
1778 if((status = cp9_ForwardP7B (cp9, errbuf, fmx, dsq, L, kmin, kmax, &sc)) != eslOK) return status;
1779 if((status = cp9_BackwardP7B(cp9, errbuf, bmx, dsq, L, kmin, kmax, &sc)) != eslOK) return status;
1780
1781 if(cm->align_opts & CM_ALIGN_CHECKFB) {
1782 if((status = cp9_CheckFBP7B(fmx, bmx, cp9, errbuf, sc, 1, L, dsq, kmin, kmax)) != eslOK) return status;
1783 printf("Forward/Backward matrices checked.\n");
1784 }
1785
1786 /* Step 2: F/B -> HMM bands. */
1787 if(use_sums){
1788 printf("USE SUMS!\n");
1789 exit(1);
1790 }
1791 else {
1792 if((status = cp9_FB2HMMBandsP7B(cp9, errbuf, dsq, fmx, bmx, pmx, cp9b, L, cp9b->hmm_M,
1793 (1.-cm->tau), do_old_hmm2ij, kmin, kmax, debug_level)) != eslOK) return status;
1794 cp9b->tau = cm->tau;
1795 }
1796 if(debug_level > 0) cp9_DebugPrintHMMBands(stdout, L, cp9b, cm->tau, 1);
1797
1798 /* Step 3: HMM bands -> CM bands. */
1799 if(do_old_hmm2ij) {
1800 if((status = cp9_HMM2ijBands_OLD(cm, errbuf, cm->cp9b, cm->cp9map, 1, L, FALSE, debug_level)) != eslOK) return status;
1801 }
1802 else {
1803 if((status = cp9_HMM2ijBands(cm, errbuf, cp9, cm->cp9b, cm->cp9map, 1, L, FALSE, FALSE, debug_level)) != eslOK) return status;
1804 /* For debugging, uncomment this block:
1805 if((status = cp9_HMM2ijBands(cm, errbuf, cm->cp9b, cm->cp9map, i0, j0, doing_search, FALSE, debug_level)) != eslOK) {
1806 ESL_SQ *tmp;
1807 tmp = esl_sq_CreateDigitalFrom(cm->abc, "irrelevant", dsq+i0-1, (j0-i0+1), NULL, NULL, NULL);
1808 esl_sq_Textize(tmp);
1809 printf("HEY! cm: %s\n", cm->name);
1810 printf(">irrelevant\n%s\n", tmp->seq);
1811 esl_sq_Destroy(tmp);
1812 return status;
1813 }
1814 */
1815 }
1816
1817 /* Use the CM bands on i and j to get bands on d, specific to j. */
1818 /* cp9_GrowHDBands() must be called before ij2d_bands() so hdmin, hdmax are adjusted for new seq */
1819 if((status = cp9_GrowHDBands(cp9b, errbuf)) != eslOK) return status;
1820 ij2d_bands(cm, L, cp9b->imin, cp9b->imax, cp9b->jmin, cp9b->jmax, cp9b->hdmin, cp9b->hdmax, FALSE, debug_level);
1821
1822 #if eslDEBUGLEVEL >= 1
1823 if((status = cp9_ValidateBands(cm, errbuf, cp9b, 1, L, FALSE)) != eslOK) return status;
1824 ESL_DPRINTF1(("#DEBUG: bands validated.\n"));
1825 #endif
1826 if(debug_level > 0) debug_print_ij_bands(cm);
1827
1828 if(debug_level > 0) PrintDPCellsSaved_jd(cm, cp9b->jmin, cp9b->jmax, cp9b->hdmin, cp9b->hdmax, L);
1829
1830 return eslOK;
1831 }
1832
1833
1834 /* Function: cp9_Seq2PosteriorsP7B
1835 * Date : EPN, Tue Aug 19 13:12:12 2008
1836 *
1837 * Purpose: Given a CM with precalc'ed CP9 HMM and CP9Map, and HMMER3 plan 7
1838 * HMM bands for the CP9 HMM DP matrices, and a sequence,
1839 * run HMM Forward and Backward algorithms, and return a CP9 posterior
1840 * matrix.
1841 *
1842 * Note: this function was never updated to handle
1843 * truncated alignment (b/c it's currently not hooked up
1844 * to any of the Infernal applications).
1845 *
1846 * Args: cm - the covariance model
1847 * errbuf - char buffer for error messages
1848 * fmx - CP9 dp matrix for Forward()
1849 * bmx - CP9 dp matrix for Backward()
1850 * pmx - CP9 dp matrix to fill with posteriors, can == bmx
1851 * dsq - sequence in digitized form
1852 * L - length of the sequence we're aligning (1..L)
1853 * kmin - P7 dervied band to enforce: [0.i..L] = k, min node k for residue i
1854 * kmax - P7 derived band to enforce: [0.i..L] = k, min node k for residue i
1855 * debug_level - verbosity level for debugging printf()s
1856 *
1857 * Return: eslOK on success
1858 */
1859 int
cp9_Seq2PosteriorsP7B(CM_t * cm,char * errbuf,CP9_MX * fmx,CP9_MX * bmx,CP9_MX * pmx,ESL_DSQ * dsq,int L,int * kmin,int * kmax,int debug_level)1860 cp9_Seq2PosteriorsP7B(CM_t *cm, char *errbuf, CP9_MX *fmx, CP9_MX *bmx, CP9_MX *pmx, ESL_DSQ *dsq, int L, int *kmin, int *kmax, int debug_level)
1861 {
1862 int status;
1863 float sc;
1864 CP9_t *cp9 = NULL; /* ptr to cp9 HMM (this could be Lcp9, Rcp9, Tcp9 if we update this function to possibly handle truncated alignment) */
1865
1866 /* Contract checks */
1867 if(dsq == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "in cp9_Seq2Posteriors(), dsq is NULL.");
1868 if(cm->cp9map == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "in cp9_Seq2Posteriors, but cm->cp9map is NULL.\n");
1869 if((cm->align_opts & CM_ALIGN_HBANDED) && (cm->search_opts & CM_SEARCH_HBANDED))
1870 ESL_FAIL(eslEINCOMPAT, errbuf, "in cp9_Seq2Posteriors, CM_ALIGN_HBANDED and CM_SEARCH_HBANDED flags both up, exactly 1 must be up.\n");
1871 if((cm->search_opts & CM_SEARCH_HMMALNBANDS) && (! (cm->search_opts & CM_SEARCH_HBANDED)))
1872 ESL_FAIL(eslEINCOMPAT, errbuf, "in cp9_Seq2Posteriors, CM_SEARCH_HMMALNBANDS flag raised, but not CM_SEARCH_HBANDED flag, this doesn't make sense\n");
1873
1874 /* determine which cp9 HMM to use, if CM is has local begins use cp9 (its local too) else use cp9glb (its global) */
1875 cp9 = cm->cp9;
1876 if(cp9 == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2Posteriors, relevant cp9 is NULL.\n");
1877
1878 /* Step 1: Get HMM posteriors.*/
1879 if((status = cp9_ForwardP7B (cp9, errbuf, fmx, dsq, L, kmin, kmax, &sc)) != eslOK) return status;
1880 if(debug_level > 0) printf("CP9P7B Forward score : %.4f\n", sc);
1881
1882 if((status = cp9_BackwardP7B(cp9, errbuf, bmx, dsq, L, kmin, kmax, &sc)) != eslOK) return status;
1883 if(debug_level > 0) printf("CP9 Backward score : %.4f\n", sc);
1884
1885 if(cm->align_opts & CM_ALIGN_CHECKFB) {
1886 if((status = cp9_CheckFBP7B(fmx, bmx, cp9, errbuf, sc, 1, L, dsq, kmin, kmax)) != eslOK) return status;
1887 printf("Forward/Backward matrices checked.\n");
1888 }
1889
1890 /* Get posteriors */
1891 if((status = cp9_PosteriorP7B(dsq, errbuf, L, cp9, fmx, bmx, pmx, kmin, kmax)) != eslOK) return status;
1892
1893 return eslOK;
1894 }
1895
1896
1897 /* Function: cp9_PosteriorP7B()
1898 * based on Ian Holmes' hmmer/src/postprob.c::P7EmitterPosterior()
1899 *
1900 * Purpose: Combines HMMER3 p7 banded Forward and Backward matrices into a
1901 * posterior probability matrix. For emitters (match and inserts) the
1902 * entries in row i of this matrix are the logs of the posterior
1903 * probabilities of each state emitting symbol i of the sequence.
1904 * For non-emitters the entries in row i of this matrix are the
1905 * logs of the posterior probabilities of each state being 'visited'
1906 * when the last emitted residue in the parse was symbol i of the
1907 * sequence.
1908 *
1909 * Args: dsq - sequence in digitized form
1910 * errbuf - for error messages
1911 * L - length of target subsequence 1..L
1912 * hmm - the model
1913 * forward - pre-calculated forward matrix
1914 * backward - pre-calculated backward matrix
1915 * pm - pre-allocated dynamic programming matrix for posteriors
1916 * kmin - P7 derived band to enforce: [0.i..L] = k, min node k for residue i
1917 * kmax - P7 derived band to enforce: [0.i..L] = k, min node k for residue i
1918 *
1919 * Return: eslOK on success;
1920 */
1921 int
cp9_PosteriorP7B(ESL_DSQ * dsq,char * errbuf,int L,CP9_t * hmm,CP9_MX * fmx,CP9_MX * bmx,CP9_MX * pmx,int * kmin,int * kmax)1922 cp9_PosteriorP7B(ESL_DSQ *dsq, char *errbuf, int L, CP9_t *hmm, CP9_MX *fmx, CP9_MX *bmx, CP9_MX *pmx, int *kmin, int *kmax)
1923 {
1924 int i;
1925 int k;
1926 int sc;
1927 int M = hmm->M;
1928 int kp, kn, kx;
1929 /*float temp_sc;*/
1930
1931 if(bmx != pmx) GrowCP9Matrix(pmx, errbuf, L, M, kmin, kmax, NULL, NULL, NULL, NULL, NULL);
1932
1933 /* parses must start/stop at (i = 1)/(j = L) */
1934 sc = bmx->mmx[0][0];
1935
1936 /* note boundary conditions, i = 1 */
1937 assert(kmin[0] == 0);
1938 pmx->mmx[0][0] = fmx->mmx[0][0] + bmx->mmx[0][0] - sc; /* fmx->mmx[0][0] is 0, bmx->mmx[1][0] is overall score */
1939 pmx->imx[0][0] = -INFTY; /*need seq to get here*/
1940 pmx->dmx[0][0] = -INFTY; /*D_0 does not exist*/
1941 i = 0;
1942 kn = ESL_MAX(kmin[i], 1);
1943 kx = kmax[i];
1944 kp = kn - kmin[i];
1945 for (k = kn; k <= kx; k++, kp++) {
1946 pmx->mmx[0][kp] = -INFTY; /*need seq to get here*/
1947 pmx->imx[0][kp] = -INFTY; /*need seq to get here*/
1948 pmx->dmx[0][kp] = fmx->dmx[0][kp] + bmx->dmx[0][kp] - sc;
1949 }
1950
1951 for (i = 1; i <= L; i++) {
1952 k = 0;
1953 if(INBAND(i,0)) {
1954 kp = k - kmin[i];
1955 assert(kp == 0);
1956 pmx->mmx[i][kp] = ESL_MAX(fmx->mmx[i][kp] + bmx->mmx[i][kp] - sc, -INFTY); /* M_0 doesn't emit */
1957 pmx->imx[i][kp] = ESL_MAX(fmx->imx[i][kp] + bmx->imx[i][kp] - hmm->isc[dsq[i]][0] - sc, -INFTY);
1958 /*hmm->isc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
1959 pmx->dmx[i][kp] = -INFTY; /* D_0 doesn't exist */
1960 }
1961
1962 kn = ESL_MAX(kmin[i], 1);
1963 kx = kmax[i];
1964 kp = kn - kmin[i];
1965 for(k = kn; k <= kx; k++, kp++)
1966 {
1967 pmx->mmx[i][kp] = ESL_MAX(fmx->mmx[i][kp] + bmx->mmx[i][kp] - hmm->msc[dsq[i]][k] - sc, -INFTY);
1968 /*hmm->msc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
1969 pmx->imx[i][kp] = ESL_MAX(fmx->imx[i][kp] + bmx->imx[i][kp] - hmm->isc[dsq[i]][k] - sc, -INFTY);
1970 /*hmm->isc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
1971 pmx->dmx[i][kp] = ESL_MAX(fmx->dmx[i][kp] + bmx->dmx[i][kp] - sc, -INFTY);
1972 }
1973 }
1974
1975 /*
1976 float temp_sc;
1977 for(i = 0; i <= L; i++)
1978 {
1979 kp = 0;
1980 for(k = kmin[i]; k <= kmax[i]; k++, kp++)
1981 {
1982 temp_sc = Score2Prob(mx->mmx[i][kp], 1.);
1983 if(temp_sc > .0001)
1984 printf("mx->mmx[%3d][%3d]: %9d | %8f\n", i, k, mx->mmx[i][kp], temp_sc);
1985 temp_sc = Score2Prob(mx->imx[i][kp], 1.);
1986 if(temp_sc > .0001)
1987 printf("mx->imx[%3d][%3d]: %9d | %8f\n", i, k, mx->imx[i][kp], temp_sc);
1988 temp_sc = Score2Prob(mx->dmx[i][kp], 1.);
1989 if(temp_sc > .0001)
1990 printf("mx->dmx[%3d][%3d]: %9d | %8f\n", i, k, mx->dmx[i][kp], temp_sc);
1991 }
1992 }*/
1993 return eslOK;
1994 }
1995
1996 /* Function: cp9_FB2HMMBandsP7B()
1997 * Date: EPN, Fri Aug 15 14:00:59 2008
1998 *
1999 * Purpose: Determine the band on all HMM states given HMMER3 Plan 7 Banded
2000 * Forward and Backward matrices. Do this by calculating and summing
2001 * log posterior probabilities that each state emitted/was visited at each posn,
2002 * starting at the band ends, and creeping in, until the half the
2003 * maximum allowable probability excluded is reached on each side.
2004 *
2005 * Args:
2006 *
2007 * CP9_t hmm the HMM
2008 * errbuf char buffer for error messages
2009 * CP9_MX fmx: forward DP matrix, already calc'ed
2010 * CP9_MX bmx: backward DP matrix, already calc'ed
2011 * CP9_MX pmx: DP matrix for posteriors, filled here, can == bmx
2012 * dsq the digitized sequence
2013 * CP9Bands_t cp9b CP9 bands data structure
2014 * int L length of target subsequence (1..L)
2015 * int M number of nodes in HMM (num columns of pmx matrix)
2016 * double p_thresh the probability mass we're requiring is within each band
2017 * int do_old_hmm2ij TRUE if we'll use old cp9_HMM2ijBands_OLD() function downstream
2018 * int kmin P7 dervied band to enforce: [0.i..L] = k, min node k for residue i
2019 * int kmax P7 derived band to enforce: [0.i..L] = k, min node k for residue i
2020 * int debug_level [0..3] tells the function what level of debugging print
2021 * statements to print.
2022 *
2023 * Returns: eslOK on success;
2024 */
2025 int
cp9_FB2HMMBandsP7B(CP9_t * hmm,char * errbuf,ESL_DSQ * dsq,CP9_MX * fmx,CP9_MX * bmx,CP9_MX * pmx,CP9Bands_t * cp9b,int L,int M,double p_thresh,int do_old_hmm2ij,int * kmin,int * kmax,int debug_level)2026 cp9_FB2HMMBandsP7B(CP9_t *hmm, char *errbuf, ESL_DSQ *dsq, CP9_MX *fmx, CP9_MX *bmx, CP9_MX *pmx, CP9Bands_t *cp9b,
2027 int L, int M, double p_thresh, int do_old_hmm2ij, int *kmin, int *kmax, int debug_level)
2028 {
2029 int status;
2030 int k; /* counter over nodes of the model */
2031 int thresh = Prob2Score(((1. - p_thresh)/2.), 1.); /* allowable prob mass excluded on each side */
2032
2033 /* *_m = match, *_i = insert, *_d = delete */
2034 int *kthresh_m, *kthresh_i, *kthresh_d; /* [0..k..hmm->M], individual thresholds for each state */
2035 int *nset_m, *nset_i, *nset_d; /* [0..k..hmm->M], has minimum been set for this state? */
2036 int *xset_m, *xset_i, *xset_d; /* [0..k..hmm->M], has maximum been set for this state? */
2037 int *mass_m, *mass_i, *mass_d; /* [0..k..hmm->M], summed log prob of pmx->mx[i][k] from 0..k or k..L */
2038 int i; /* actual position */
2039 int sc; /* summed score of all parses (derived from backward matrix)
2040 * if(cm->search_opts & CM_SEARCH_HMMALNBANDS) Forward and Backward
2041 * were run in 'scan mode' where each residue can be begin/end of a parse,
2042 * so we have to sum up parses that end at each posn,
2043 * if ! (cm->search_opts & CM_SEARCH_HMMALNBANDS) we know we have
2044 * to start at residue 1 and end at residue L, so sc is simply bmx->mmx[0][0]
2045 */
2046 int hmm_is_localized; /* TRUE if HMM has local begins, ends or ELs on */
2047 int kp, kn, kx;
2048
2049 hmm_is_localized = ((hmm->flags & CPLAN9_LOCAL_BEGIN) || (hmm->flags & CPLAN9_LOCAL_END) || (hmm->flags & CPLAN9_EL)) ? TRUE : FALSE;
2050
2051 if(bmx != pmx) GrowCP9Matrix(pmx, errbuf, L, M, kmin, kmax, NULL, NULL, NULL, NULL, NULL);
2052
2053 /* allocations and initializations */
2054 ESL_ALLOC(nset_m, sizeof(int) * (M+1));
2055 ESL_ALLOC(nset_i, sizeof(int) * (M+1));
2056 ESL_ALLOC(nset_d, sizeof(int) * (M+1));
2057 ESL_ALLOC(xset_m, sizeof(int) * (M+1));
2058 ESL_ALLOC(xset_i, sizeof(int) * (M+1));
2059 ESL_ALLOC(xset_d, sizeof(int) * (M+1));
2060 ESL_ALLOC(mass_m, sizeof(int) * (M+1));
2061 ESL_ALLOC(mass_i, sizeof(int) * (M+1));
2062 ESL_ALLOC(mass_d, sizeof(int) * (M+1));
2063 ESL_ALLOC(kthresh_m, sizeof(int) * (M+1));
2064 ESL_ALLOC(kthresh_i, sizeof(int) * (M+1));
2065 ESL_ALLOC(kthresh_d, sizeof(int) * (M+1));
2066
2067 esl_vec_ISet(mass_m, M+1, -INFTY);
2068 esl_vec_ISet(mass_i, M+1, -INFTY);
2069 esl_vec_ISet(mass_d, M+1, -INFTY);
2070 esl_vec_ISet(nset_m, M+1, FALSE);
2071 esl_vec_ISet(nset_i, M+1, FALSE);
2072 esl_vec_ISet(nset_d, M+1, FALSE);
2073 esl_vec_ISet(xset_m, M+1, FALSE);
2074 esl_vec_ISet(xset_i, M+1, FALSE);
2075 esl_vec_ISet(xset_d, M+1, FALSE);
2076
2077 sc = bmx->mmx[0][0]; /* Forward/Backward run in 'align mode' parses must start at 1, end at L */
2078 /* sc is summed log prob of all possible parses of seq 1..L */
2079
2080 /* note boundary conditions, i = 1 */
2081 assert(kmin[0] == 0);
2082 pmx->mmx[0][0] = fmx->mmx[0][0] + bmx->mmx[0][0] - sc; /* fmx->mmx[0][0] is 0, bmx->mmx[1][0] is overall score */
2083 pmx->imx[0][0] = -INFTY; /*need seq to get here*/
2084 pmx->dmx[0][0] = -INFTY; /*D_0 does not exist*/
2085 if((mass_m[0] = pmx->mmx[0][0]) > thresh) {
2086 cp9b->pn_min_m[0] = 0;
2087 nset_m[0] = TRUE;
2088 }
2089 mass_i[0] = -INFTY; /* b/c pmx->imx[0][0] is -INFTY, set above */
2090 mass_d[0] = -INFTY; /* b/c pmx->dmx[0][0] is -INFTY, set above */
2091
2092 i = 0;
2093 kn = ESL_MAX(kmin[i], 1);
2094 kx = kmax[i];
2095 kp = kn - kmin[i];
2096 for (k = kn; k <= kx; k++, kp++) {
2097 pmx->mmx[0][kp] = -INFTY; /*need seq to get here*/
2098 pmx->imx[0][kp] = -INFTY; /*need seq to get here*/
2099 pmx->dmx[0][kp] = fmx->dmx[0][kp] + bmx->dmx[0][kp] - sc;
2100 /* mass_m[k] doesn't change b/c pmx->mmx[0][kp] is -INFTY */
2101 /* mass_i[k] doesn't change b/c pmx->imx[0][kp] is -INFTY */
2102 if((mass_d[k] = pmx->dmx[0][kp]) > thresh) {
2103 cp9b->pn_min_d[k] = 0;
2104 nset_d[k] = TRUE;
2105 }
2106 }
2107
2108 /* Find minimum position in band for each state (M,I,D) of each node (0..M) */
2109 for (i = 1; i <= L; i++) {
2110 k = 0;
2111 if(INBAND(i,0)) {
2112 kp = k - kmin[i];
2113 assert(kp == 0);
2114 pmx->mmx[i][kp] = ESL_MAX(fmx->mmx[i][kp] + bmx->mmx[i][kp] - sc, -INFTY); /* M_0 doesn't emit */
2115 if(! nset_m[0]) {
2116 if((mass_m[0] = ILogsum(mass_m[0], pmx->mmx[i][kp])) > thresh) {
2117 cp9b->pn_min_m[0] = i;
2118 nset_m[0] = TRUE;
2119 }
2120 }
2121
2122 pmx->imx[i][kp] = ESL_MAX(fmx->imx[i][kp] + bmx->imx[i][kp] - hmm->isc[dsq[i]][0] - sc, -INFTY);
2123 /*hmm->isc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
2124 if(! nset_i[0]) {
2125 if((mass_i[0] = ILogsum(mass_i[0], pmx->imx[i][kp])) > thresh) {
2126 cp9b->pn_min_i[0] = i;
2127 nset_i[0] = TRUE;
2128 }
2129 }
2130 pmx->dmx[i][kp] = -INFTY; /* D_0 doesn't exist */
2131 }
2132
2133 kn = ESL_MAX(kmin[i], 1);
2134 kx = kmax[i];
2135 kp = kn - kmin[i];
2136 for(k = kn; k <= kx; k++, kp++)
2137 {
2138 pmx->mmx[i][kp] = ESL_MAX(fmx->mmx[i][kp] + bmx->mmx[i][kp] - hmm->msc[dsq[i]][k] - sc, -INFTY);
2139 /*hmm->msc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
2140 pmx->imx[i][kp] = ESL_MAX(fmx->imx[i][kp] + bmx->imx[i][kp] - hmm->isc[dsq[i]][k] - sc, -INFTY);
2141 /*hmm->isc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
2142 pmx->dmx[i][kp] = ESL_MAX(fmx->dmx[i][kp] + bmx->dmx[i][kp] - sc, -INFTY);
2143
2144 if(! nset_m[k]) {
2145 if((mass_m[k] = ILogsum(mass_m[k], pmx->mmx[i][kp])) > thresh) {
2146 cp9b->pn_min_m[k] = i;
2147 nset_m[k] = TRUE;
2148 }
2149 }
2150 if(! nset_i[k]) {
2151 if((mass_i[k] = ILogsum(mass_i[k], pmx->imx[i][kp])) > thresh) {
2152 cp9b->pn_min_i[k] = i;
2153 nset_i[k] = TRUE;
2154 }
2155 }
2156 if(! nset_d[k]) {
2157 if((mass_d[k] = ILogsum(mass_d[k], pmx->dmx[i][kp])) > thresh) {
2158 cp9b->pn_min_d[k] = i;
2159 nset_d[k] = TRUE;
2160 }
2161 }
2162 }
2163 }
2164
2165 esl_vec_ISet(mass_m, M+1, -INFTY);
2166 esl_vec_ISet(mass_i, M+1, -INFTY);
2167 esl_vec_ISet(mass_d, M+1, -INFTY);
2168 /* Find maximum position in band for each state (M,I,D) of each node (0..M)
2169 * by moving from L down to 1 */
2170 for (i = L; i >= 1; i--) /* i is the relative position in the seq */
2171 {
2172 kp = 0;
2173 for(k = kmin[i]; k <= kmax[i]; k++, kp++)
2174 {
2175 if(! xset_m[k]) {
2176 if((mass_m[k] = ILogsum(mass_m[k], pmx->mmx[i][kp])) > thresh) {
2177 cp9b->pn_max_m[k] = i;
2178 xset_m[k] = TRUE;
2179 }
2180 }
2181 if(! xset_i[k]) {
2182 if((mass_i[k] = ILogsum(mass_i[k], pmx->imx[i][kp])) > thresh) {
2183 cp9b->pn_max_i[k] = i;
2184 xset_i[k] = TRUE;
2185 }
2186 }
2187 if(! xset_d[k]) {
2188 if((mass_d[k] = ILogsum(mass_d[k], pmx->dmx[i][kp])) > thresh) {
2189 cp9b->pn_max_d[k] = i;
2190 xset_d[k] = TRUE;
2191 }
2192 }
2193 }
2194 }
2195 /* note boundary conditions, i = 0 */
2196 if(INBAND(0,0)) {
2197 assert(kmin[0] == 0);
2198 if(! xset_m[0]) {
2199 if((mass_m[0] = ILogsum(mass_m[0], pmx->mmx[0][0])) > thresh) {
2200 cp9b->pn_max_m[0] = 0;
2201 xset_m[0] = TRUE;
2202 }
2203 }
2204 }
2205 /* mass_i[0] is unchaged because b/c pmx->imx[0][0] is -INFTY, set above */
2206 /* mass_d[0] is unchaged because b/c pmx->dmx[0][0] is -INFTY, set above */
2207 kn = ESL_MAX(kmin[i], 1);
2208 kx = kmax[i];
2209 kp = kn - kmin[i];
2210 for(k = kn; k <= kx; k++, kp++) {
2211 /* mass_m[k] doesn't change b/c pmx->mmx[0][k] is -INFTY */
2212 /* mass_i[k] doesn't change b/c pmx->mmx[0][k] is -INFTY */
2213 if(!xset_d[k]) {
2214 if((mass_d[k] = ILogsum(mass_d[k], pmx->dmx[0][kp])) > thresh) {
2215 cp9b->pn_max_d[k] = 0;
2216 xset_d[k] = TRUE;
2217 }
2218 }
2219 }
2220
2221 /* new technique as of EPN, Sun Jan 27 08:48:34 2008 */
2222 /* Some states may not have had their min/max set. This occurs if the entire
2223 * state is outside the band (i.e. the summed probability the state is entered for ANY i
2224 * is less than our threshold. Current strategy in this situation is to set the
2225 * pn_min_* and pn_max_* values as special flags, (-2) so the function that
2226 * uses them to derive i and j bands knows this is the case and handles it
2227 * accordingly.
2228 */
2229 int mset;
2230 int dset;
2231 for(k = 0; k <= M; k++) {
2232 mset = dset = TRUE;
2233 /* theoretically either nset_*[k] and xset_*[k] should be either both TRUE or both
2234 * FALSE, but I'm slightly worried about rare precision issues, so we check if one
2235 * or the other is unset, and if so, we set both to argmax position */
2236 if(((! nset_m[k])) || (! xset_m[k]) || (cp9b->pn_max_m[k] < cp9b->pn_min_m[k])) {
2237 cp9b->pn_min_m[k] = cp9b->pn_max_m[k] = -1;
2238 mset = FALSE;
2239 }
2240 if(((! nset_i[k])) || (! xset_i[k]) || (cp9b->pn_max_i[k] < cp9b->pn_min_i[k])) {
2241 cp9b->pn_min_i[k] = cp9b->pn_max_i[k] = -1;
2242 }
2243 if(((! nset_d[k])) || (! xset_d[k]) || (cp9b->pn_max_d[k] < cp9b->pn_min_d[k])) {
2244 cp9b->pn_min_d[k] = cp9b->pn_max_d[k] = -1;
2245 dset = FALSE;
2246 }
2247 if((!hmm_is_localized) && (mset == FALSE && dset == FALSE)) ESL_FAIL(eslEINCONCEIVABLE, errbuf, "node: %d match nor delete HMM state bands were set in non-localized, non-scanning HMM, lower tau (should be << 0.5).\n", k);
2248 }
2249
2250 cp9b->pn_min_d[0] = -1; /* D_0 doesn't exist */
2251 cp9b->pn_max_d[0] = -1; /* D_0 doesn't exist */
2252
2253 if(debug_level > 0) cp9_DebugPrintHMMBands(stdout, L, cp9b, (1.-p_thresh), 1);
2254
2255 free(mass_m);
2256 free(mass_i);
2257 free(mass_d);
2258 free(nset_m);
2259 free(nset_i);
2260 free(nset_d);
2261 free(xset_m);
2262 free(xset_i);
2263 free(xset_d);
2264 free(kthresh_m);
2265 free(kthresh_i);
2266 free(kthresh_d);
2267
2268 return eslOK;
2269
2270 ERROR:
2271 ESL_FAIL(status, errbuf, "Memory allocation error.\n");
2272 }
2273
2274
2275
2276 /* Function: p7_Seq2Bands
2277 * Date : EPN, Fri Aug 15 14:29:01 2008
2278 *
2279 * Purpose: Given a CM with a valid Plan 7 HMM - run the MSV algorithm
2280 * to determine bands to be used on the CP9 HMM parse.
2281 *
2282 * Args: cm - the covariance model
2283 * errbuf - char buffer for reporting errors
2284 * P7_PROFILE - generic profile
2285 * P7_GMX - generic P7 dp matrix
2286 * P7_BG - P7 null model
2287 * P7_TR - P7 trace
2288 * dsq - sequence in digitized form
2289 * L - length of sequence we're aligning (1..L)
2290 * phi - phi array, phi[k][v] is expected number of times (probability)
2291 * state v (0 = match, 1 insert, 2 = delete) in
2292 * node k is *entered*. Node 0 is special, state 0 = B state, state 1 = N_state, state 2 = NULL
2293 * Calculated *without* taking insert->insert transitions into account.
2294 * sc7 - minimum score to allow as a pin, 0. to allow any score
2295 * len7 - min n-mer size, 1 to allow any size
2296 * end7 - min distance from end to allow, prune away any others, 0 to not prune based on end proximity
2297 * mprob7 - min match phi probability to allow in a pin
2298 * mcprob7 - min cumulative match phi probability to allow in a nmer pin
2299 * iprob7 - max insert phi probability to allow in a pin
2300 * ilprob7 - max insert phi probability to allow in a state to the left of a pin
2301 * ret_i2k - [0.i..L] = k, residue i emitted from node k's match state in MSV trace
2302 * ret_kmin - [0.i..L] = k, min node k for residue i
2303 * ret_kmax - [0.i..L] = k, max node k for residue i
2304 * ret_ncells - number of cells within bands, to return
2305 *
2306 * Return: eslOK on success;
2307 *
2308 */
2309 int
p7_Seq2Bands(CM_t * cm,char * errbuf,P7_PROFILE * gm,P7_GMX * gx,P7_BG * bg,P7_TRACE * p7_tr,ESL_DSQ * dsq,int L,double ** phi,float sc7,int len7,int end7,float mprob7,float mcprob7,float iprob7,float ilprob7,int pad7,int ** ret_i2k,int ** ret_kmin,int ** ret_kmax,int * ret_ncells)2310 p7_Seq2Bands(CM_t *cm, char *errbuf, P7_PROFILE *gm, P7_GMX *gx, P7_BG *bg, P7_TRACE *p7_tr, ESL_DSQ *dsq, int L,
2311 double **phi, float sc7, int len7, int end7, float mprob7, float mcprob7, float iprob7, float ilprob7, int pad7,
2312 int **ret_i2k, int **ret_kmin, int **ret_kmax, int *ret_ncells)
2313 {
2314 int status;
2315 float usc, nullsc;
2316 int *k2i, *i2k;
2317 float *isc;
2318 int *iconflict;
2319 int *kmin, *kmax;
2320 int ncells;
2321
2322 /* setup for all modes */
2323 p7_bg_SetLength(bg, L);
2324 p7_bg_NullOne(bg, dsq, L, &nullsc);
2325
2326 /* generic mode setup */
2327 p7_gmx_GrowTo(gx, cm->mlp7->M, L);
2328 p7_ReconfigLength(gm, L);
2329 gx->M = cm->mlp7->M;
2330 gx->L = L;
2331
2332 /* Step 1: MSV algorithm M->M transitions only
2333 * Step 2: traceback MSV to get pins
2334 * Step 3: prune pins
2335 * Step 4: pins -> bands
2336 */
2337
2338 /* Step 1: MSV algorithm */
2339
2340 /* optimized MSV */
2341 /*
2342 p7_oprofile_ReconfigLength(cm->mlp7_om, L);
2343 esl_stopwatch_Start(watch);
2344 my_p7_MSVFilter(dsq, L, cm->mlp7_om, ox, gx, &usc);
2345 got esl_stopwatch_Stop(watch);
2346 FormatTimeString(time_buf, watch->user, TRUE);
2347 fprintf(stdout, "OMSV %8.2f %11s\n", ((usc -nullsc) / eslCONST_LOG2), time_buf);
2348 */
2349
2350 p7_GMSV(dsq, L, gm, gx, 2.0, &usc);
2351
2352 /* Step 2: traceback MSV */
2353 status = my_p7_GTraceMSV(dsq, L, gm, gx, p7_tr, &i2k, &k2i, &isc, &iconflict);
2354
2355 /* Step 3: prune pins */
2356 if(status == eslOK) { /* trace is valid */
2357 prune_i2k(i2k, iconflict, isc, L, phi, sc7, len7, end7, mprob7, mcprob7, iprob7, ilprob7);
2358 }
2359 else if (status == eslEINCOMPAT) { /* trace was discontiguous, abort! remove all pins */
2360 esl_vec_ISet(k2i, (cm->mlp7->M+1), -1);
2361 esl_vec_ISet(i2k, (L+1), -1);
2362 return status;
2363 }
2364
2365 /* Step 4: pins -> bands */
2366 if((status = p7_pins2bands(i2k, errbuf, L, cm->clen, pad7, &kmin, &kmax, &ncells)) != eslOK) return status;
2367 /*DumpP7Bands(stdout, i2k, kmin, kmax, L); */
2368
2369 /* print gmx in heatmap format */
2370 /*
2371 ESL_DMATRIX *D;
2372 double min;
2373 double max;
2374 FILE *hfp;
2375 p7_gmx_Match2DMatrix(gx, TRUE, &D, &min, &max);
2376 hfp = fopen("cur.ps", "w");
2377 my_dmx_Visualize(hfp, D, 0.01, max, 0.01);
2378 fclose(hfp);
2379 esl_dmatrix_Destroy(D);
2380 */
2381
2382 *ret_i2k = i2k;
2383 *ret_kmin = kmin;
2384 *ret_kmax = kmax;
2385 *ret_ncells = ncells;
2386
2387 free(iconflict);
2388 free(isc);
2389 free(k2i);
2390
2391 return eslOK;
2392 }
2393
2394
2395 /**************************************************************
2396 * Function: CP9NodeForPosnP7B()
2397 * Incept: EPN, Tue Aug 19 14:30:51 2008
2398 *
2399 * Purpose: Given a P7 banded CP9 posterior matrix,
2400 * determine the node of the CP9 HMM that is most likely to
2401 * have emitted (from either its Match or Insert state)
2402 * a given posn in the target sequence.
2403 *
2404 * Args: hmm - the CM plan 9 HMM
2405 * errbuf - for error messages
2406 * x - posn of target subsequence we're interested in
2407 * L - last position of target sequence
2408 * post - the posterior matrix for the hmm
2409 * kn - min node k for residue x
2410 * kx - max node k for residue x
2411 * ret_node - RETURN: index of node with highest probability of emitting x
2412 * ret_type - RETURN: type of state in ret_node with highest probability
2413 * print_flag- TRUE to print out info on most likely node
2414 *
2415 *
2416 * Returns: eslOK on success;
2417 * eslEINVAL on contract violation.
2418 * eslEINCOMPAT if kmin[x] >=
2419 */
2420 int
CP9NodeForPosnP7B(CP9_t * hmm,char * errbuf,int x,CP9_MX * post,int kn,int kx,int * ret_node,int * ret_type,int print_flag)2421 CP9NodeForPosnP7B(CP9_t *hmm, char *errbuf, int x, CP9_MX *post,
2422 int kn, int kx, int *ret_node, int *ret_type, int print_flag)
2423 {
2424 /* post->mmx[i][kp]: posterior probability that posn i was emitted from node k's
2425 match state, k = kp + kmin[i] */
2426 int max_k; /* node index with highest posterior probability of emitting posn x */
2427 int max_type; /* type of state in max_k node with max probability '0' for match,
2428 '1' for insert */
2429 int max_sc; /* score (log probability) from post matrix for max_k node max_type state type */
2430 int k; /* counter over nodes */
2431 int kp; /* k': k offset in position x's band */
2432
2433 if(kn > kx) ESL_FAIL(eslEINVAL, errbuf, "ERROR in CP9NodeForPosn(), kn (%d) > kx (%d)\n", kn, kx);
2434
2435 kp = 0;
2436 k = kn;
2437 if(post->mmx[x][0] > post->imx[x][0]) {
2438 max_sc = post->mmx[x][0];
2439 max_type = 0; /* match */
2440 }
2441 else {
2442 max_sc = post->imx[x][0];
2443 max_type = 1; /* insert */
2444 }
2445 max_k = k;
2446
2447 /* move left to right through HMM nodes */
2448 for(k = kn+1, kp = 1; k <= kx; k++, kp++) {
2449 if(post->mmx[x][kp] > max_sc) {
2450 max_k = k;
2451 max_sc = post->mmx[x][kp];
2452 max_type = 0; /* match */
2453 }
2454 if(post->imx[x][kp] > max_sc) {
2455 max_k = k;
2456 max_sc = post->imx[x][kp];
2457 max_type = 1; /* insert */
2458 }
2459 }
2460
2461 if(print_flag) {
2462 if(max_type == 0) printf("MATCH | mx->mmx[%3d][%3d]: %9d | %8f\n", x, max_k, post->mmx[x][max_k-kn], Score2Prob(post->mmx[x][max_k-kn], 1.));
2463 else printf("INSERT | mx->imx[%3d][%3d]: %9d | %8f\n", x, max_k, post->imx[x][max_k-kn], Score2Prob(post->imx[x][max_k-kn], 1.));
2464 }
2465 *ret_node = max_k;
2466 *ret_type = max_type;
2467 return eslOK;
2468 }
2469
2470 /* Function: P7BandsAdjustForSubCM()
2471 * Incept: EPN, Tue Aug 19 14:47:55 2008
2472 *
2473 * Purpose: Correct k bands kmin, kmax built from an original CM for it's sub CM model
2474 * that models spos..epos.
2475 *
2476 * Args: kmin - [0.i..L] = k, min node k for residue i
2477 * kmax - [0.i..L] = k, max node k for residue i
2478 * L - length of current sequence
2479 * spos - min k valid in sub CM
2480 * epos - max k valid in sub CM
2481 *
2482 * Return: <eslOK> on success.
2483 *
2484 */
2485 int
P7BandsAdjustForSubCM(int * kmin,int * kmax,int L,int spos,int epos)2486 P7BandsAdjustForSubCM(int *kmin, int *kmax, int L, int spos, int epos)
2487 {
2488 int i;
2489 int M = epos - spos + 1;
2490 for(i = 0; i <= L; i++) {
2491 kmin[i] = ESL_MAX(kmin[i] - (spos-1), 0);
2492 kmin[i] = ESL_MIN(kmin[i], M);
2493
2494 kmax[i] = ESL_MAX(kmax[i] - (spos-1), 0);
2495 kmax[i] = ESL_MIN(kmax[i], M);
2496 }
2497 kmin[0] = 0; /* hard-coded, M_0 is begin state, it must emit full sequence */
2498
2499 return eslOK;
2500 }
2501