1 /* cm_p7_band.c
2  *
3  * Functions for p7 HMM banding.
4  * BEWARE: only partially implemented.
5  */
6 
7 #include "esl_config.h"
8 #include "p7_config.h"
9 #include "config.h"
10 
11 #include <stdio.h>
12 #include <math.h>
13 #include <assert.h>
14 
15 #include "easel.h"
16 #include "esl_sse.h"
17 #include "esl_vectorops.h"
18 
19 #include "hmmer.h"
20 
21 #include "infernal.h"
22 
23 #define  p7_IMPOSSIBLE -987654321
24 
25 /* Function:  p7_gmx_Match2DMatrix()
26  * Synopsis:  Copy the dp match cells of a generic matrix
27  *            to a ESL_DMATRIX, for visualization with esl_dmx_Visualize()
28  *
29  * Incept:    SRE, Fri Jul 13 09:56:04 2007 [Janelia]
30  *
31  * Purpose:   Dump matrix <gx> to stream <fp> for diagnostics.
32  */
33 int
p7_gmx_Match2DMatrix(P7_GMX * gx,int do_diff,ESL_DMATRIX ** ret_D,double * ret_min,double * ret_max)34 p7_gmx_Match2DMatrix(P7_GMX *gx, int do_diff, ESL_DMATRIX **ret_D, double *ret_min, double *ret_max)
35 {
36   int i, k;
37   ESL_DMATRIX *D;
38   double min =  eslINFINITY;
39   double max = -eslINFINITY;
40   float sc;
41 
42   D = esl_dmatrix_Create(gx->M+1, gx->L);
43   /* fill k == 0 row, the X matrix E state (logically, the begin state) scores */
44   for (i = 1; i <= gx->L; i++) {
45     D->mx[0][(i-1)] = gx->xmx[i * p7G_NXCELLS + 0];
46   }
47 
48   for (i = 1; i <= gx->L; i++) {
49     for (k = 1; k <= gx->M; k++) {
50       sc = gx->dp[i][k * p7G_NSCELLS + p7G_M];
51       if(do_diff) {
52 	if((i < 2) || (k < 2)) D->mx[k][(i-1)] = 0.;
53 	else                   D->mx[k][(i-1)] = sc - ESL_MAX(D->mx[k][(i-2)], D->mx[0][(i-2)]);
54       }
55       else {
56 	D->mx[k][(i-1)] = sc;
57       }
58       min = ESL_MIN(min, D->mx[k][(i-1)]);
59       max = ESL_MAX(max, D->mx[k][(i-1)]);
60     }
61   }
62 
63   *ret_D = D;
64   *ret_min = min;
65   *ret_max = max;
66   return eslOK;
67 }
68 
69 /****************************************************************
70  * Stolen from hmmer/h3/heatmap.c SVN revision 2171
71  * as dmx_Visualize. Then modified so that the full
72  * matrix is printed (not half split diagonally).
73  */
74 /* my_dmx_Visualize()
75  * Incept:    SRE, Wed Jan 24 11:58:21 2007 [Janelia]
76  *
77  * Purpose:
78  *
79  *            Color scheme roughly follows Tufte, Envisioning
80  *            Information, p.91, where he shows a beautiful
81  *            bathymetric chart. The CMYK values conjoin two
82  *            recommendations from ColorBrewer (Cindy Brewer
83  *            and Mark Harrower)
84  *            [http://www.personal.psu.edu/cab38/ColorBrewer/ColorBrewer.html],
85  *            specifically the 9-class sequential2 Blues and
86  *            9-class sequential YlOrBr.
87  *
88  *            Might eventually become part of Easel, once mature?
89  *
90  * Note:      Binning rules basically follow same convention as
91  *            esl_histogram. nb = xmax-xmin/w, so w = xmax-xmin/nb;
92  *            picking bin is (int) ceil((x - xmin)/w) - 1. (xref
93  *            esl_histogram_Score2Bin()). This makes bin b contain
94  *            values bw+min < x <= (b+1)w+min. (Which means that
95  *            min itself falls in bin -1, whoops - but we catch
96  *            all bin<0 and bin>=nshades and put them in the extremes.
97  *
98  * Args:
99  *
100  * Returns:
101  *
102  * Throws:    (no abnormal error conditions)
103  *
104  * Xref:
105  */
106 int
my_dmx_Visualize(FILE * fp,ESL_DMATRIX * D,double min,double max,double min2fill)107 my_dmx_Visualize(FILE *fp, ESL_DMATRIX *D, double min, double max, double min2fill)
108 {
109    int    nshades   = 18;
110    double cyan[]    = { 1.00, 1.00, 0.90, 0.75, 0.57, 0.38, 0.24, 0.13, 0.03,
111 			0.00, 0.00, 0.00, 0.00, 0.00, 0.07, 0.20, 0.40, 0.60};
112    double magenta[] = { 0.55, 0.45, 0.34, 0.22, 0.14, 0.08, 0.06, 0.03, 0.01,
113 			0.00, 0.03, 0.11, 0.23, 0.40, 0.55, 0.67, 0.75, 0.80};
114    double yellow[]  = { 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,
115 			0.10, 0.25, 0.40, 0.65, 0.80, 0.90, 1.00, 1.00, 1.00};
116    double black[]   = { 0.30, 0.07, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,
117 			0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00};
118    double w;
119    int    i,j;
120    int    bin;
121    int    boxsize;		/* box size in points */
122    int    xcoord, ycoord;	/* postscript coords in points */
123    int    leftmargin, rightmargin;
124    int    bottommargin, topmargin;
125    float  fboxsize;		/* box size in fractional points */
126 
127    /* Set some defaults that might become arguments later.
128     */
129    leftmargin   = rightmargin = 20;
130    bottommargin = topmargin   = 20;
131 
132    /* Determine some working parameters
133     */
134    w = (max-min) / (double) nshades; /* w = bin size for assigning values->colors*/
135    boxsize = ESL_MAX(1, (ESL_MIN((792 - bottommargin) / D->n,
136 				 (612 - leftmargin)   / D->m)));
137    fboxsize= ESL_MIN( (792. - ((float) bottommargin + topmargin))   / (float) D->n,
138 		      (612. - ((float) leftmargin   + rightmargin)) / (float) D->m);
139 
140 
141    fprintf(fp, "%.4f %.4f scale\n", (fboxsize/(float) boxsize), (fboxsize/(float) boxsize));
142    /* printf("n: %d\nm: %d\n", D->n, D->m); */
143    for (i = 0; i < D->n; i++) {
144      /* printf("\n"); */
145      /* for (j = i; j < D->n; j++) */
146      for (j = 0; j < D->m; j++)
147        {
148 	 /* printf("i: %4d j: %4d %5.1f\n", i, j, D->mx[i][j]); */
149 	 xcoord = j * boxsize + leftmargin;
150 	 ycoord = (D->m-(i+1)) * boxsize + bottommargin; /* difference w/heatmap.c: (D->m-i+1) */
151 
152 	 if      (D->mx[i][j]  <  min2fill)    continue;
153 	 /* if      ((i > 0) && (j > 0) && (D->mx[i][j] <=  D->mx[i-1][j-1]) && (D->mx[i][j] < min2fill))    continue;*/
154 	 else if (D->mx[i][j] == -eslINFINITY) bin = 0;
155 	 else if (D->mx[i][j] ==  eslINFINITY) bin = nshades-1;
156 	 else {
157 	   bin    = (int) ceil((D->mx[i][j] - min) / w) - 1;
158 	   if (bin < 0)        bin = 0;
159 	   if (bin >= nshades) bin = nshades-1;
160 	 }
161 
162 	 printf("%4d %4d %10.3f\n", i, j, D->mx[i][j]);
163 	 fprintf(fp, "newpath\n");
164 	 fprintf(fp, "  %d %d moveto\n", xcoord, ycoord);
165 	 fprintf(fp, "  0  %d rlineto\n", boxsize);
166 	 fprintf(fp, "  %d 0  rlineto\n", boxsize);
167 	 fprintf(fp, "  0 -%d rlineto\n", boxsize);
168 	 fprintf(fp, "  closepath\n");
169 	 fprintf(fp, " %.2f %.2f %.2f %.2f setcmykcolor\n",
170 		 cyan[bin], magenta[bin], yellow[bin], black[bin]);
171 	 fprintf(fp, "  fill\n");
172        }
173    }
174   fprintf(fp, "showpage\n");
175   return eslOK;
176 }
177 
178 
179 /* Function: my_p7_GTraceMSV()
180  * Incept:   EPN, Mon Aug 11 10:01:23 2008
181  *
182  * Purpose:  Traceback of a MSV matrix: retrieval of optimum alignment.
183  *
184  *           Based on p7_GTrace().
185  *
186  *           This function is currently implemented as a
187  *           reconstruction traceback, rather than using a shadow
188  *           matrix. Because H3 uses floating point scores, and we
189  *           can't compare floats for equality, we have to compare
190  *           floats for near-equality and therefore, formally, we can
191  *           only guarantee a near-optimal traceback. However, even in
192  *           the unlikely event that a suboptimal is returned, the
193  *           score difference from true optimal will be negligible.
194  *
195  * Args:     dsq    - digital sequence aligned to, 1..L
196  *           L      - length of <dsq>
197  *           gm     - profile
198  *           mx     - MSV matrix to trace, L x M
199  *           tr     - storage for the recovered traceback.
200  *
201  * Return:   <eslOK> on success.
202  *           <eslFAIL> if even the optimal path has zero probability;
203  *           in this case, the trace is set blank (<tr->N = 0>).
204  *           <eslEINCOMPAT> if the optimal trace is discontiguous wrt
205  *           the sequence, node k > kp emits residue i < ip, while kp
206  *           emits ip. In this case, trace is invalid - caller must
207  *           know this.
208  *
209  * Note:     Care is taken to evaluate the prev+tsc+emission
210  *           calculations in exactly the same order that Viterbi did
211  *           them, lest you get numerical problems with
212  *           a+b+c = d; d-c != a+b because d,c are nearly equal.
213  *           (This bug appeared in dev: xref J1/121.)
214  */
215 
216 /* for HMMER3 P7 HMMs */
217 #define P7MMX(i,k) (dp[(i)][(k) * p7G_NSCELLS + p7G_M])
218 #define P7XMX(i,s) (xmx[(i) * p7G_NXCELLS + (s)])
219 #define P7MSC(k)   (rsc[(k) * p7P_NR     + p7P_MSC])
220 
221 /* for CP9 HMMs */
222 #define CP9TSC(s,k) (tsc[(k) * cp9O_NTRANS + (s)])
223 
224 int
my_p7_GTraceMSV(const ESL_DSQ * dsq,int L,const P7_PROFILE * gm,const P7_GMX * gx,P7_TRACE * tr,int ** ret_i2k,int ** ret_k2i,float ** ret_isc,int ** ret_iconflict)225 my_p7_GTraceMSV(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr, int **ret_i2k, int **ret_k2i, float **ret_isc, int **ret_iconflict)
226 {
227   int     status = eslOK;
228   int     i;			/* position in seq (1..L) */
229   int     k;			/* position in model (1..M) */
230   int     M   = gm->M;
231   float **dp  = gx->dp;
232   float  *xmx = gx->xmx;
233   float   tol = 1e-5;
234   float   esc = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY;
235   /* new vars */
236   float        tloop = logf((float) L / (float) (L+3));
237   float        tmove = logf(     3.0f / (float) (L+3));
238   float        tbmk  = logf(     2.0f / ((float) gm->M * (float) (gm->M+1)));
239   float        tec   = logf(0.5f);
240 
241   int *k2i; /* [0.1..k..gm->M] = i, residue i emitted from node k's match state in MSV trace */
242   int *i2k; /* [0.1..i..L]     = k, residue i emitted from node k's match state in MSV trace */
243   int *iconflict; /* [0.1..i..L] = TRUE | FALSE. TRUE to eventually remove the kmer with pin at i */
244   float *isc; /*[0.1..i..L]    = sc, match emission score for residue i is sc */
245 
246   ESL_ALLOC(k2i, sizeof(int)   * (gm->M+1));
247   ESL_ALLOC(i2k, sizeof(int)   * (L+1));
248   ESL_ALLOC(iconflict, sizeof(int)   * (L+1));
249   ESL_ALLOC(isc, sizeof(float) * (L+1));
250   esl_vec_ISet(k2i, (gm->M+1), -1);
251   esl_vec_ISet(i2k, (L+1),     -1);
252   esl_vec_ISet(iconflict, (L+1), FALSE);
253   esl_vec_FSet(isc, (L+1),     -eslINFINITY);
254 
255   if ((status = p7_trace_Reuse(tr)) != eslOK) goto ERROR;
256 
257   /* Initialization.
258    * (back to front. ReverseTrace() called later.)
259    */
260   if ((status = p7_trace_Append(tr, p7T_T, 0, 0)) != eslOK) goto ERROR;
261   if ((status = p7_trace_Append(tr, p7T_C, 0, 0)) != eslOK) goto ERROR;
262   i    = L;			/* next position to explain in seq */
263 
264   /* Traceback
265    */
266   while (tr->st[tr->N-1] != p7T_S) {
267     float const *rsc = gm->rsc[dsq[i]];
268 
269     switch (tr->st[tr->N-1]) {
270     case p7T_C:		/* C(i) comes from E(i) */
271       if   (P7XMX(i,p7G_C) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible C reached at i=%d", i);
272 
273       if (esl_FCompare(P7XMX(i, p7G_C), P7XMX(i-1, p7G_C) + tloop, tol) == eslOK) {
274 	tr->i[tr->N-1]    = i--;  /* first C doesn't emit: subsequent ones do */
275 	status = p7_trace_Append(tr, p7T_C, 0, 0);
276       } else if (esl_FCompare(P7XMX(i, p7G_C), P7XMX(i, p7G_E) + tec, tol) == eslOK)
277 	status = p7_trace_Append(tr, p7T_E, 0, 0);
278       else ESL_XEXCEPTION(eslFAIL, "C at i=%d couldn't be traced", i);
279       break;
280 
281     case p7T_E:		/* E connects from any M state. k set here */
282       if (P7XMX(i, p7G_E) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible E reached at i=%d", i);
283 
284       if (esl_FCompare(P7XMX(i, p7G_E), P7MMX(i,M), tol) == eslOK) { k = M; status = p7_trace_Append(tr, p7T_M, M, i); }
285       else {
286 	for (k = M-1; k >= 1; k--)
287 	  if (esl_FCompare(P7XMX(i, p7G_E), P7MMX(i,k) + esc, tol) == eslOK)
288 	    { status = p7_trace_Append(tr, p7T_M, k, i); break; }
289 	if (k < 0) ESL_XEXCEPTION(eslFAIL, "E at i=%d couldn't be traced", i);
290       }
291       break;
292 
293     case p7T_M:			/* M connects from i-1,k-1, or B */
294       if (P7MMX(i,k) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible M reached at k=%d,i=%d", k,i);
295       if      (esl_FCompare(P7MMX(i,k), P7XMX(i-1,p7G_B) + tbmk  + P7MSC(k), tol) == eslOK) status = p7_trace_Append(tr, p7T_B, 0,   0);
296       else if (esl_FCompare(P7MMX(i,k), P7MMX(i-1,k-1)           + P7MSC(k), tol) == eslOK) {
297 	status = p7_trace_Append(tr, p7T_M, k-1, i-1);
298 	/*if(k2i[(k-1)] != -1) { status = eslEINCOMPAT; printf("! discontiguous trace k2i[k-1=%d] != -1 (%d) i-1 = %d\n", k-1, k2i[(k-1)], i-1); goto ERROR;} */
299 	/*if(i2k[(i-1)] != -1) { status = eslEINCOMPAT; printf("! discontiguous trace i2k[i-1=%d] != -1 (%d) k-1 = %d\n", i-1, i2k[(i-1)], k-1); goto ERROR;} */
300 	if(i2k[(i-1)] != -1 || k2i[(k-1)] != -1) {
301 	  iconflict[(k2i[(k-1)])] = TRUE; /* eventually remove pin kmer that included k we *thought* pinned i-1 */
302 	  iconflict[(i-1)]        = TRUE; /* eventually remove pin kmer that includes k we currently think pins i-1 */
303 	}
304 	k2i[(k-1)] = i-1;
305 	i2k[(i-1)] = k-1;
306 	isc[(i-1)] = P7MSC(k);
307       }
308       else ESL_XEXCEPTION(eslFAIL, "M at k=%d,i=%d couldn't be traced", k,i);
309 
310       if (status != eslOK) goto ERROR;
311       k--;
312       i--;
313       break;
314 
315     case p7T_N:			/* N connects from S, N */
316       if (P7XMX(i, p7G_N) <= p7_IMPOSSIBLE) ESL_XEXCEPTION(eslFAIL, "impossible N reached at i=%d", i);
317 
318       if (i == 0) status = p7_trace_Append(tr, p7T_S, 0, 0);
319       else if (esl_FCompare(P7XMX(i,p7G_N), P7XMX(i-1, p7G_N) + tloop, tol) == eslOK)
320 	{
321 	  tr->i[tr->N-1] = i--;
322 	  status = p7_trace_Append(tr, p7T_N, 0, 0);
323 	}
324       else ESL_XEXCEPTION(eslFAIL, "N at i=%d couldn't be traced", i);
325       break;
326 
327     case p7T_B:			/* B connects from N, J */
328       if (P7XMX(i,p7G_B) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible B reached at i=%d", i);
329 
330       if (esl_FCompare(P7XMX(i,p7G_B), P7XMX(i, p7G_N) + tmove, tol)  == eslOK)
331 	status = p7_trace_Append(tr, p7T_N, 0, 0);
332       else if (esl_FCompare(P7XMX(i,p7G_B),  P7XMX(i, p7G_J) + tmove, tol) == eslOK)
333 	status = p7_trace_Append(tr, p7T_J, 0, 0);
334       else  ESL_XEXCEPTION(eslFAIL, "B at i=%d couldn't be traced", i);
335       break;
336 
337     case p7T_J:			/* J connects from E(i) or J(i-1) */
338       if (P7XMX(i,p7G_J) == -eslINFINITY) ESL_XEXCEPTION(eslFAIL, "impossible J reached at i=%d", i);
339 
340       if (esl_FCompare(P7XMX(i,p7G_J), P7XMX(i-1,p7G_J) + tloop, tol) == eslOK) {
341 	tr->i[tr->N-1] = i--;
342 	status = p7_trace_Append(tr, p7T_J, 0, 0);
343       } else if (esl_FCompare(P7XMX(i,p7G_J), P7XMX(i,p7G_E) + tec, tol) == eslOK)
344 	status = p7_trace_Append(tr, p7T_E, 0, 0);
345       else  ESL_XEXCEPTION(eslFAIL, "J at i=%d couldn't be traced", i);
346       break;
347 
348     default: ESL_XEXCEPTION(eslFAIL, "bogus state in traceback");
349     } /* end switch over statetype[tpos-1] */
350 
351     if (status != eslOK) goto ERROR;
352   } /* end traceback, at S state */
353 
354   if ((status = p7_trace_Reverse(tr)) != eslOK) goto ERROR;
355   if (ret_i2k != NULL) { *ret_i2k = i2k; } else free(i2k);
356   if (ret_k2i != NULL) { *ret_k2i = k2i; } else free(k2i);
357   if (ret_isc != NULL) { *ret_isc = isc; } else free(isc);
358   if (ret_iconflict != NULL) { *ret_iconflict = iconflict; } else free(iconflict);
359   return eslOK;
360 
361  ERROR:
362   if (ret_i2k != NULL) { *ret_i2k = i2k; } else free(i2k);
363   if (ret_k2i != NULL) { *ret_k2i = k2i; } else free(k2i);
364   if (ret_isc != NULL) { *ret_isc = isc; } else free(isc);
365   return status;
366 }
367 
368 
369 
370 /* Function: Parsetree2i_to_k()
371  * Date:     EPN, Mon Aug 11 13:41:38 2008
372  *
373  * Purpose:  Given a parsetree, fill a vector i2k[0.1..i..L] = k, saying that
374  *           residue i is emitted into consensus column k. If k <= 0, this implies
375  *           residue i was inserted after consensus column (-1 * k).
376  *
377  * Returns:  eslOK on success
378  *           eslEINCOMPAT on contract violation.
379  */
380 int
Parsetree2i_to_k(CM_t * cm,CMEmitMap_t * emap,int L,char * errbuf,Parsetree_t * tr,int ** ret_i2k)381 Parsetree2i_to_k(CM_t *cm, CMEmitMap_t *emap, int L, char *errbuf, Parsetree_t *tr, int **ret_i2k)
382 {
383   int status;                   /* Easel status code */
384   int tidx;			/* counter through positions in the parsetree        */
385   int v;			/* state index in CM */
386   int nd;
387   int *i2k;
388 
389   ESL_ALLOC(i2k, sizeof(int) * (L+1));
390   esl_vec_ISet(i2k, (L+1), -1 * (cm->clen + 1));
391 
392   /* contract check */
393   if(emap   == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "Parsetree2i_to_k(): emap == NULL.");
394 
395 		/* trivial preorder traverse, since we're already numbered that way */
396   for (tidx = 0; tidx < tr->n; tidx++) {
397     v = tr->state[tidx];        	/* index of parent state in CM */
398     nd = cm->ndidx[v];
399     if (v == cm->M) continue;      	/* special case: v is EL, local alignment end */
400     switch (cm->sttype[v]) {
401     case MP_st:
402       i2k[tr->emitl[tidx]] = emap->lpos[nd];
403       i2k[tr->emitr[tidx]] = emap->rpos[nd];
404       break;
405 
406     case ML_st:
407       i2k[tr->emitl[tidx]] = emap->lpos[nd];
408       break;
409 
410     case MR_st:
411       i2k[tr->emitr[tidx]] = emap->rpos[nd];
412       break;
413 
414     case IL_st:
415       i2k[tr->emitl[tidx]] = (-1 * emap->lpos[nd]);
416       break;
417 
418     case IR_st:
419       i2k[tr->emitr[tidx]] = (-1 * (emap->rpos[nd] - 1)); /* IR's emit to before consensus column rpos */
420       break;
421 
422     }
423   }
424 
425   *ret_i2k = i2k;
426   return eslOK;
427 
428  ERROR:
429   ESL_FAIL(status, errbuf, "Parsetree2i_to_k(), memory allocation error.");
430   return status; /* NEVERREACHED */
431 }
432 
433 /* Function: prune_i2k()
434  * Incept:   EPN, Mon Aug 11 15:02:35 2008
435  *
436  * Purpose:  Prune an i2k array of pins. Optionally prune by the following
437  *           criteria (in this order):
438  *           o match score of the pin
439  *           o length (n) of n-mer each pin exists in
440  *           o proximity to end of n-mer
441  *
442  * Args:     i2k        - the input pin array, modified (pruned) in place
443  *           iconflict  - [0.1..i..L] TRUE if i:i2k[i] pin is involved in a conflict, remove the whole nmer pin
444  *           isc        - score (match emission) for each pin
445  *           L          - length of current sequence
446  *           phi        - phi array, phi[k][v] is expected number of times (probability)
447  *                        state v (0 = match, 1 insert, 2 = delete) in
448  *                        node k is *entered*. Node 0 is special, state 0 = B state, state 1 = N_state, state 2 = NULL
449  *                        Calculated *without* taking insert->insert transitions into account.
450  *           min_sc     - minimum score to allow as a pin, 0. to allow any score
451  *           min_len    - min n-mer size, 1 to allow any size
452  *           min_end    - min distance from end to allow, prune away any others, 0 to not prune based on end proximity
453  *           min_mprob  - min match phi probability to allow in a pin
454  *           min_mcprob - min cumulative match phi probability to allow in a nmer pin
455  *           max_iprob  - max insert phi probability to allow in a pin
456  *           max_ilprob - max insert phi probability to allow in a state to the left of a pin
457  *
458  * Return:   <eslOK> on success.
459  *
460  */
461 int
prune_i2k(int * i2k,int * iconflict,float * isc,int L,double ** phi,float min_sc,int min_len,int min_end,float min_mprob,float min_mcprob,float max_iprob,float max_ilprob)462 prune_i2k(int *i2k, int *iconflict, float *isc, int L, double **phi, float min_sc, int min_len, int min_end, float min_mprob, float min_mcprob, float max_iprob, float max_ilprob)
463 {
464   int     i, j;
465 
466   int do_end;
467   int do_sc;
468   int do_len;
469   float   tol = 1e-5;
470   float mcprob = 1.; /* cumulative match probability */
471   int n = 0;
472   int ip;
473 
474   do_sc   = (esl_FCompare(0., min_sc, tol) == eslOK) ? FALSE : TRUE;
475   do_end  = (min_end == 0)  ? FALSE : TRUE;
476   do_len  = (min_len == 1) ? FALSE : TRUE;
477 
478   /* pass 1, prune on conflict from trace, this must be first pass */
479   for(i = 1; i <= L; i++) {
480     if(iconflict[i] == TRUE) {
481       /* remove entire nmer's pins */
482       ip = i;
483       while((ip > 0)  && (i2k[ip] != -1)) i2k[ip--] = -1;
484       ip = i;
485       while((ip <= L) && (i2k[ip] != -1)) i2k[ip++] = -1;
486     }
487   }
488 
489   /* pass 2, prune on scores */
490   if(do_sc) {
491     for(i = 1; i <= L; i++) if((i2k[i] != -1) && (isc[i] < min_sc)) i2k[i] = -1;
492   }
493 
494   /* pass 3, prune on phi values */
495   for(i = 1; i <= L; i++) {
496     if(i2k[i] != -1) { /* position i is currently pinned */
497        if ((phi[i2k[i]][HMMMATCH]      < min_mprob)   || /* match  prob too low */
498 	   (phi[i2k[i]][HMMINSERT]     > max_iprob)   || /* insert to the right prob too high */
499 	   (phi[(i2k[i]-1)][HMMINSERT] > max_ilprob)) {  /* insert to the left  prob too high */
500 	 /* remove pin */
501 	 /*printf("removing pin %4d %.4f\n", i, phi[i2k[i]][HMMMATCH]);*/
502 	 i2k[i] = -1;
503        }
504     }
505   }
506 
507   /* pass 4, prune on nmer length, match score, match phi probability, and distance from end */
508   if(do_len || do_end) { /* determine the size n of the n-mer each residue i is in */
509     for(i = 1; i <= L; i++) {
510       if((i2k[i] == -1) || /* position i is not pinned */
511 	 ((i2k[(i-1)] != -1) && !(i2k[(i-1)] == (i2k[i]-1)))) { /* position i is pinned to k, position i-1 is pinned to kp, but k and kp are not consecutive */
512 	/* we just ended an n-mer (n >= 0) */
513 	if(n > 0 && n < min_len) {
514 	  for(j = (i - n); j < i; j++) i2k[j] = -1; /* remove the n-mer */
515 	  mcprob = 1.;
516 	}
517 	else if (do_end && n >= min_len) { /* n >= min_len, remove those within do_end of end */
518 	  for(j = (i - n);        j < (i - (n+1)) + min_end; j++) i2k[j] = -1; /* remove the part of the n-mer within nend residues of the beginning edge */
519 	  for(j = (i - min_end);  j < i;                     j++) i2k[j] = -1; /* remove the part of the n-mer within nend residues of the end edge */
520 	}
521 	if(i2k[i] == -1) { /* position i is not pinned */
522 	  mcprob = 1.;
523 	  n = 0;
524 	}
525 	else { /* position i is pinned to k, position i-1 is pinned to kp, but k and kp are not consecutive */
526 	  mcprob = phi[(i2k[i])][HMMMATCH];
527 	  n = 1;
528 	  if(mcprob < min_mcprob) { i2k[i] = -1; n = 0; mcprob = 1.; }
529 	}
530       }
531       else {
532 	n++;
533 	mcprob *= phi[(i2k[i])][HMMMATCH];
534 	if(mcprob < min_mcprob) { /* remove the n-mer up to this point */
535 	  for(j = (i - n); j < i; j++) i2k[j] = -1; /* remove the n-mer */
536 	  mcprob = 1.;
537 	  n = 0;
538 	}
539       }
540     }
541     /* deal with possibility that last n residues were an n-mer, with n < min_len */
542     if(n > 0 && n < min_len) { for(j = (i - n); j < i; j++) i2k[j] = -1; /* remove the n-mer */}
543   }
544 
545   return eslOK;
546 }
547 
548 /* Function: p7_pins2bands()
549  * Incept:   EPN, Mon Aug 11 15:41:44 2008
550  *
551  * Purpose:  Given an i2k pins array, determine the imin and imax bands.
552  *
553  * Args:     i2k      - the input pin array, modified (pruned) in place
554  *           errbuf   - for error messages
555  *           L        - length of current sequence
556  *           M        - number of nodes in the HMM
557  *           pad      - pad on each side of pin, if pad = 3, we allow +/- 3 residues from pin
558  *           ret_kmin - [0.i..L] = k, min node k for residue i
559  *           ret_kmax - [0.i..L] = k, max node k for residue i
560  *           ret_ncells - number of cells within bands, to return
561 #if 0
562  *           ret_imin - [0.k..M] = i, min residue i for node k
563  *           ret_imax - [0.k..M] = i, max residue i for node k
564 #endif
565  *
566  * Return:   <eslOK> on success.
567  *
568  */
569 int
p7_pins2bands(int * i2k,char * errbuf,int L,int M,int pad,int ** ret_kmin,int ** ret_kmax,int * ret_ncells)570 p7_pins2bands(int *i2k, char *errbuf, int L, int M, int pad, int **ret_kmin, int **ret_kmax, int *ret_ncells)
571 {
572   int     status;
573 
574   int i;
575   int kn = 0;
576   int kx = M;
577   int *kmin, *kmax;
578 
579   ESL_ALLOC(kmin, sizeof(int) * (L+1));
580   ESL_ALLOC(kmax, sizeof(int) * (L+1));
581 
582   /* traverse residues left to right to get kmins */
583   for(i = 0; i <= L; i++) {
584     if(i2k[i] != -1) {
585       if(kn >= i2k[i] && kn > 1) {
586 	ESL_FAIL(eslFAIL, errbuf, "p7_pins2bands() error i: %d, i2k[i]: %d but current kn: %d\n", i, i2k[i], kn);
587       }
588       kn = ESL_MAX(1, i2k[i] - pad);
589     }
590     kmin[i] = kn;
591   }
592 
593   /* traverse nodes right to left to get imaxs */
594   for(i = L; i >= 0; i--) {
595     if(i2k[i] != -1) {
596       if(kx <= i2k[i] && kx < L) ESL_FAIL(eslFAIL, errbuf, "p7_pins2bands() error: i: %d, i2k[i]: %d but current kx: %d\n", i, i2k[i], kx);
597       kx = ESL_MIN(M, i2k[i] + pad);
598     }
599     kmax[i] = kx;
600   }
601 
602   /* M_0 == B state, which must start the parse with i == 0 */
603   kmin[0] = 0;
604 
605   /* get number of cells if wanted */
606   int ncells;
607   if(ret_ncells != NULL) {
608     ncells = 0;
609     for(i = 1; i <= L; i++) ncells += kmax[i] - kmin[i] + 1;
610     *ret_ncells = ncells;
611   }
612 
613   if(ret_kmin != NULL) { *ret_kmin = kmin; } else free(kmin);
614   if(ret_kmax != NULL) { *ret_kmax = kmax; } else free(kmax);
615   return eslOK;
616 
617  ERROR:
618   ESL_FAIL(status, errbuf, "p7_pins2bands() memory error.");
619   return status; /* NEVERREACHED */
620 
621 #if 0
622   /* if we want to get imin/imax instead of kmin/kmax */
623   int in = 1;
624   int ix = L;
625   int *imin;
626   int *imax;
627   int *k2i;
628 
629   ESL_ALLOC(imin, sizeof(int) * (M+1));
630   ESL_ALLOC(imax, sizeof(int) * (M+1));
631   ESL_ALLOC(k2i,  sizeof(int) * (M+1));
632 
633   imin[0] = imax[0] = -1;
634   esl_vec_ISet(k2i, (M+1), -1);
635 
636   for(i = 1; i <= L; i++) if(i2k[i] != -1) k2i[i2k[i]] = i;
637 
638   /* traverse nodes left to right to get imins */
639   for(k = 1; k <= M; k++) {
640     if(k2i[k] != -1) {
641       if(k2i[k] != 1 && in >= k2i[k]) ESL_FAIL(eslFAIL, errbuf, "p7_pins2bands() error k: %d, k2i[k]: %d but current in: %d\n", k, k2i[k], in);
642       in = ESL_MAX(1, k2i[k] - pad);
643     }
644     imin[k] = in;
645   }
646 
647   /* traverse nodes right to left to get imaxs */
648   for(k = M; k >= 1; k--) {
649     if(k2i[k] != L && k2i[k] != -1) {
650       if(ix <= k2i[k]) ESL_FAIL(eslFAIL, errbuf, "p7_pins2bands() error: k: %d, k2i[k]: %d but current ix: %d\n", k, k2i[k], ix);
651       ix = ESL_MIN(L, k2i[k] + pad);
652     }
653     imax[k] = ix;
654   }
655 
656   free(k2i);
657 
658   /* get number of cells if wanted */
659   int ncells;
660   if(ret_ncells != NULL) {
661     ncells = 0;
662     for(k = 1; k <= M; k++) ncells += imax[k] - imin[k] + 1;
663     *ret_ncells = ncells;
664   }
665   if(ret_imin != NULL) { *ret_imin = imin; } else free(imin);
666   if(ret_imax != NULL) { *ret_imax = imax; } else free(imax);
667 #endif
668 }
669 
670 /* Function: DumpP7Bands()
671  * Incept:   EPN, Thu Aug 14 08:45:54 2008
672  *
673  * Purpose:  Given i2k and kmin, kmax arrays, print them.
674  *
675  * Args:     i2k      - [0.k..M] = i, node k is pinned to residue i
676  *           kmin     - [0.i..L] = k, min node k for residue i
677  *           kmax     - [0.i..L] = k, max node k for residue i
678  *           L        - length of current sequence
679  *
680  * Return:   <eslOK> on success.
681  *
682  */
683 int
DumpP7Bands(FILE * fp,int * i2k,int * kmin,int * kmax,int L)684 DumpP7Bands(FILE *fp, int *i2k, int *kmin, int *kmax, int L)
685 {
686   int i;
687 
688   fprintf(fp, "# %4s  %4s  %4s ... %4s\n", "i", "i2k", "kmin", "kmax");
689   fprintf(fp, "# %4s  %4s  %13s\n", "----", "----", "-------------");
690   for(i = 0; i <= L; i++) {
691     fprintf(fp, "  %4d  %4d  %4d ... %4d\n", i, i2k[i], kmin[i], kmax[i]);
692   }
693   return eslOK;
694 
695 }
696 
697 /* Function: cp9_ForwardP7B()
698  *
699  * Purpose:  Runs the banded Forward dynamic programming algorithm on an
700  *           input sequence (1..L). Complements cp9_BackwardP7B().
701  *           The 'P7B' suffix indicates plan 7 HMM derived bands
702  *           in the kmin and kmax arrays are applied.  This function
703  *           was derived from cp9_Forward(), differences from that
704  *           function were introduced solely to impose bands on the
705  *           matrix.
706  *
707  *           Because of the bands, some options that exist to cp9_Forward()
708  *           (like be_efficient and do_scan and reporting hits in scan mode)
709  *           are not available here. This function is meant to be used
710  *           solely for first stage of a Forward, Backward, Posterior type
711  *           calculation.
712  *
713  *           Also due to bands, only L is passed as seq length, instead
714  *           of i0 and j0 (seq start/stop).  This simplifies the application
715  *           of bands kmin[i]/kmax[i] refers to residue i.
716  *
717  *           This function requires that local EL states are turned off, if
718  *           they're not it returns. A separate function should exist to align
719  *           with EL states on, I don't think it's worth the extra computations to
720  *           make 1 function that handles both.
721  *
722  *           See additional notes in cp9_Forward() "Purpose" section.
723  *
724  * Args:
725  *           cp9       - the CP9 HMM
726  *           errbuf    - char buffer for error messages
727  *           mx        - the matrix, expanded to correct size (if nec), and filled here
728  *           dsq       - sequence in digitized form
729  *           L         - start of target subsequence (1 for beginning of dsq)
730  *           kmin      - [0.1..i..N] minimum k for residue i
731  *           kmax      - [0.1..i..N] maximum k for residue i
732  *           ret_sc    - RETURN: log P(S|M,bands)/P(S|R), as a bit score
733  *
734  * Returns:  eslOK on success;
735  *           eslEINCOMPAT on contract violation;
736  */
737 #define INBAND(i,k) ((k >= kmin[i]) && (k <= kmax[i]))
738 
739 int
cp9_ForwardP7B(CP9_t * cp9,char * errbuf,CP9_MX * mx,ESL_DSQ * dsq,int L,int * kmin,int * kmax,float * ret_sc)740 cp9_ForwardP7B(CP9_t *cp9, char *errbuf, CP9_MX *mx, ESL_DSQ *dsq, int L, int *kmin, int *kmax, float *ret_sc)
741 {
742   int          status;
743   int          i;           /* j-W: position in the subsequence                             */
744   int          k;           /* CP9 HMM node position                                        */
745   int        **mmx;         /* DP matrix for match  state scores [0..1][0..cp9->M]          */
746   int        **imx;         /* DP matrix for insert state scores [0..1][0..cp9->M]          */
747   int        **dmx;         /* DP matrix for delete state scores [0..1][0..cp9->M]          */
748   int        **elmx;        /* DP matrix for EL state scores [0..1][0..cp9->M]              */
749   int         *erow;        /* end score for each position [0..1]                           */
750   int          M;           /* cp9->M, query length, number of consensus nodes of model */
751   int          kp, kn, kx, kpcur, kpprv;
752 
753   /* Contract checks */
754   if(cp9 == NULL)                      ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B, cp9 is NULL.\n");
755   if(dsq == NULL)                      ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B, dsq is NULL.");
756   if(mx == NULL)                       ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B, mx is NULL.\n");
757   if(mx->M != cp9->M)                 ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B, mx->M != cp9->M.\n");
758   if(kmin == NULL || kmax == NULL)     ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B, kmin and/or kmax == NULL.\n");
759   if(cp9->flags & CPLAN9_EL)           ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B, cp9 EL flag up.\n");
760 
761   M = cp9->M;
762 
763   int const *tsc = cp9->otsc; /* ptr to efficiently ordered transition scores           */
764 
765   /* gamma allocation and initialization.
766    * This is a little SHMM that finds an optimal scoring parse
767    * of multiple nonoverlapping hits. */
768 
769   /* Rearrange DP matrix for this seq */
770   if((status = GrowCP9Matrix(mx, errbuf, L, M, kmin, kmax, &mmx, &imx, &dmx, &elmx, &erow)) != eslOK) return status;
771   ESL_DPRINTF2(("#DEBUG: cp9_ForwardP7B(): CP9 matrix size: %.8f Mb rows: %d.\n", mx->size_Mb, mx->rows));
772 
773   /* Initialization of the zero row. */
774   mmx[0][0] = 0;      /* M_0 is state B, and everything starts in B */
775   imx[0][0] = -INFTY; /* I_0 is state N, can't get here without emitting*/
776   dmx[0][0] = -INFTY; /* D_0 doesn't exist. */
777   elmx[0][0]= -INFTY; /* can't go from B to EL state */
778   erow[0]   = -INFTY;
779 
780   /* Because there's a D state for every node 1..M,
781      dmx[0][k] is possible for all k 1..M (if it's within kmin[0]..kmax[0]) */
782   int sc;
783   i = 0;
784   kn = ESL_MAX(1, kmin[0]);
785   kp = kn - kmin[0];
786   for (k = kn; k <= kmax[0]; k++, kp++) {
787     assert(kp >= 0);
788     mmx[0][kp] = imx[0][kp] = elmx[0][kp] = -INFTY;      /* need seq to get here */
789     sc = -INFTY;
790     if(kp > 0) { /* if kp == 0, kp-1 is outside the bands */
791       sc = ILogsum(ILogsum(mmx[0][kp-1] + CP9TSC(cp9O_MD,k-1),
792 			   imx[0][kp-1] + CP9TSC(cp9O_ID,k-1)),
793 		   dmx[0][kp-1] + CP9TSC(cp9O_DD,k-1));
794     }
795     dmx[0][kp] = sc;
796   }
797   /* We can do a full parse through all delete states. */
798   erow[0] = -INFTY;
799   if(INBAND(0, M)) { erow[0] = dmx[0][M] + CP9TSC(cp9O_DM,M); }
800 
801   /*****************************************************************
802    * The main loop: scan the sequence from position 1 to L.
803    *****************************************************************/
804   /* Recursion. */
805 
806   for (i = 1; i <= L; i++)
807     {
808       int const *isc = cp9->isc[dsq[i]];
809       int const *msc = cp9->msc[dsq[i]];
810       int endsc     = -INFTY;
811       int sc;
812 
813       if(kmin[i] == 0) {
814 	mmx[i][0]  = -INFTY;
815 	dmx[i][0]  = -INFTY;  /*D_0 is non-existent*/
816 	elmx[i][0] = -INFTY;  /*no EL state for node 0 */
817 	sc = ILogsum(ILogsum(mmx[i-1][0] + CP9TSC(cp9O_MI,0),
818 			     imx[i-1][0] + CP9TSC(cp9O_II,0)),
819 		     dmx[i-1][0] + CP9TSC(cp9O_DI,0));
820 	imx[i][0] = sc + isc[0];
821 	kn = 1; /* kmin[i] + 1 */
822       }
823       else {
824 	kn = kmin[i];
825       }
826 
827       /*match state*/
828       kn = ESL_MAX(kn, (kmin[i-1]+1)); /* start at first cell from which we can look back to a valid cell at *mx[i-1][k-1] */
829       kx = ESL_MIN(kmax[i], kmax[i-1]+1);
830 
831       /* NOT SURE ABOUT THIS AND HOW IT COUPLES WITH BLOCK ABOVE kmin[i] == 0 */
832       for (kpcur = 0;            kpcur < (kn - kmin[i]);   kpcur++) mmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
833       for (kpcur = kx-kmin[i]+1; kpcur <= kmax[i]-kmin[i]; kpcur++) mmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
834 
835       kpcur = kn - kmin[i]; /* unnec, loop above ends with this */
836       kpprv = kn - kmin[i-1];
837       for (k = kn; k <= kx; k++, kpcur++, kpprv++) {
838 	/*printf("M i: %d kpprv: %d\n", i, kpprv);*/
839 	assert((kpprv-1) >= 0);
840 
841 	sc = ILogsum(ILogsum(mmx[i-1][kpprv-1] + CP9TSC(cp9O_MM,k-1),
842 			     imx[i-1][kpprv-1] + CP9TSC(cp9O_IM,k-1)),
843 		     dmx[i-1][kpprv-1] + CP9TSC(cp9O_DM,k-1));
844 
845 	/* FIX ME: inefficient! check B->M_K transition */
846 	if(INBAND(i-1, 0)) { /* if i-1 is in k == 0's band */
847 	  assert(kmin[(i-1)] == 0);
848 	  if(mmx[i-1][0] != -INFTY)
849 	   sc = ILogsum(sc, mmx[i-1][0] + CP9TSC(cp9O_BM,k));
850 	}
851 
852 	if(sc != -INFTY) {
853 	  mmx[i][kpcur] = sc + msc[k];
854 	  /* E state update */
855 	  endsc = ILogsum(endsc, mmx[i][kpcur] + CP9TSC(cp9O_ME,k));
856 	}
857 	else {
858 	  mmx[i][kpcur] = -INFTY;
859 	  /* don't update E state */
860 	}
861 	///printf("mmx[i:%4d][k:%4d(%4d)]: %d\n", i, k, kpcur, mmx[i][kpcur]);
862       }
863 
864       /* insert state*/
865       kn = ESL_MAX(kmin[i], kmin[i-1]);
866       kx = ESL_MIN(kmax[i], kmax[i-1]);
867 
868       for (kpcur = 0;            kpcur < (kn - kmin[i]);   kpcur++) imx[i][kpcur] = -INFTY; /* impossible to reach these guys */
869       for (kpcur = kx-kmin[i]+1; kpcur <= kmax[i]-kmin[i]; kpcur++) imx[i][kpcur] = -INFTY; /* impossible to reach these guys */
870 
871       kpcur = kn - kmin[i]; /* unnec, loop above ends with this */
872       kpprv = kn - kmin[i-1];
873       for (k = kn; k <= kx; k++, kpcur++, kpprv++) {
874 	/*insert state*/
875 	assert(kpprv >= 0);
876 	/* HERE, EVENTUALLY IF kmin/kmax differ b/t Match and Inserts:
877 	 * only look at match states from k that have i-1 within band */
878 	/* all insert states from k should have i-1 within band */
879 	/*printf("I i: %d kpprv: %d\n", i, kpprv);*/
880 	sc = ILogsum(ILogsum(mmx[i-1][kpprv] + CP9TSC(cp9O_MI,k),
881 			     imx[i-1][kpprv] + CP9TSC(cp9O_II,k)),
882 		     dmx[i-1][kpprv] + CP9TSC(cp9O_DI,k));
883 	if(sc != -INFTY) imx[i][kpcur] = sc + isc[k];
884 	else             imx[i][kpcur] = -INFTY;
885 	///printf("imx[i:%4d][k:%4d(%4d)]: %d\n", i, k, kpcur, imx[i][kpcur]);
886       }
887 
888       /*delete state*/
889       kn = kmin[i]+1;
890 
891       for (kpcur = 0; kpcur < (kn - kmin[i]); kpcur++) dmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
892       kpcur = kn - kmin[i]; /* unnec, loop above ends with this */
893       for (k = kn; k <= kmax[i]; k++, kpcur++) { /* should I be adding one for delete off-by-one?? */
894 	sc = ILogsum(ILogsum(mmx[i][kpcur-1] + CP9TSC(cp9O_MD,k-1),
895 			     imx[i][kpcur-1] + CP9TSC(cp9O_ID,k-1)),
896 		     dmx[i][kpcur-1] + CP9TSC(cp9O_DD,k-1));
897 	dmx[i][kpcur] = sc;
898 	///printf("dmx[i:%4d][k:%4d(%4d)]: %d\n", i, k, kpcur, dmx[i][kpcur]);
899       }
900 	  /*printf("mmx [jp:%d][%d]: %d\n", jp, k, mmx[j][k]);
901 	    printf("imx [jp:%d][%d]: %d\n", jp, k, imx[j][k]);
902 	    printf("dmx [jp:%d][%d]: %d\n", jp, k, dmx[j][k]);
903 	    printf("elmx[jp:%d][%d]: %d\n", jp, k, elmx[j][k]);*/
904 
905       if(INBAND(i, M)) {
906 	endsc = ILogsum(ILogsum(endsc, dmx[i][M-kmin[i]] + CP9TSC(cp9O_DM,M)), /* transition from D_M -> end */
907 			imx[i][M-kmin[i]] + CP9TSC(cp9O_IM,M)); /* transition from I_M -> end */
908       }
909       /* transition penalty to EL incurred when EL was entered */
910       /*printf("endsc: %d\n", endsc);*/
911 
912       erow[i] = endsc;
913     } /* end loop over end positions i */
914 
915   *ret_sc = Scorify(erow[L]);
916   ESL_DPRINTF1(("#DEBUG: cp9_ForwardP7B() return score: %10.4f\n", Scorify(erow[L])));
917 
918   return eslOK;
919 }
920 
921 
922 
923 /* Function: cp9_ForwardP7B_OLD_WITH_EL()
924  *
925  * Purpose:  Runs the banded Forward dynamic programming algorithm on an
926  *           input sequence (1..L). Complements cp9_BackwardP7B().
927  *           The 'P7B' suffix indicates plan 7 HMM derived bands
928  *           in the kmin and kmax arrays are applied.  This function
929  *           was derived from cp9_Forward(), differences from that
930  *           function were introduced solely to impose bands on the
931  *           matrix.
932  *
933  *           Because of the bands, some options that exist to cp9_Forward()
934  *           (like be_efficient and do_scan and reporting hits in scan mode)
935  *           are not available here. This function is meant to be used
936  *           solely for first stage of a ForwardP7B, BackwardP7B, Posterior type
937  *           calculation.
938  *
939  *           Also due to bands, only L is passed as seq length, instead
940  *           of i0 and j0 (seq start/stop).  This simplifies the application
941  *           of bands kmin[i]/kmax[i] refers to residue i.
942  *
943  *           See additional notes in cp9_Forward() "Purpose" section.
944  *
945  *           NOTE: EPN, Thu Aug 14 09:54:36 2008
946  *           This is the original version written to potentially handle EL
947  *           states, but I think omitting them when they're off is significantly
948  *           more efficient, so I wrote a competing function called
949  *           cp9_ForwardP7B(). Note the included EL dp steps HAVE NOT BEEN
950  *           TESTED YET! I just left them here as a starting point if I
951  *           ever want to implement them.
952  *
953  * Args:
954  *           cp9       - the CP9 HMM
955  *           errbuf    - char buffer for error messages
956  *           mx        - the matrix, expanded to correct size (if nec), and filled here
957  *           dsq       - sequence in digitized form
958  *           L         - start of target subsequence (1 for beginning of dsq)
959  *           kmin      - [0.1..i..N] minimum k for residue i
960  *           kmax      - [0.1..i..N] maximum k for residue i
961  *           ret_sc    - RETURN: log P(S|M,bands)/P(S|R), as a bit score
962  *
963  * Returns:  eslOK on success;
964  *           eslEINCOMPAT on contract violation;
965  */
966 
967 int
cp9_ForwardP7B_OLD_WITH_EL(CP9_t * cp9,char * errbuf,CP9_MX * mx,ESL_DSQ * dsq,int L,int * kmin,int * kmax,float * ret_sc)968 cp9_ForwardP7B_OLD_WITH_EL(CP9_t *cp9, char *errbuf, CP9_MX *mx, ESL_DSQ *dsq, int L, int *kmin, int *kmax, float *ret_sc)
969 {
970   int          status;
971   int          i;           /* j-W: position in the subsequence                             */
972   int          k;           /* CP9 HMM node position                                        */
973   int        **mmx;         /* DP matrix for match  state scores [0..1][0..cp9->M]      */
974   int        **imx;         /* DP matrix for insert state scores [0..1][0..cp9->M]      */
975   int        **dmx;         /* DP matrix for delete state scores [0..1][0..cp9->M]      */
976   int        **elmx;        /* DP matrix for EL state scores [0..1][0..cp9->M]          */
977   int         *erow;        /* end score for each position [0..1]                           */
978   int          c;           /* counter for EL states                                        */
979   int          M;           /* cp9->M, query length, number of consensus nodes of model */
980   int          kn, kx, kpcur, kpprv, kpprv_el;
981 
982   /* Contract checks */
983   if(cp9 == NULL)                      ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B_OLD_WITH_EL, cm->cp9 is NULL.\n");
984   if(dsq == NULL)                      ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B_OLD_WITH_EL, dsq is NULL.");
985   if(mx == NULL)                       ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B_OLD_WITH_EL, mx is NULL.\n");
986   if(mx->M != cp9->M)                  ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B_OLD_WITH_EL, mx->M != cp9->M.\n");
987   if(kmin == NULL || kmax == NULL)     ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_ForwardP7B_OLD_WITH_EL, kmin and/or kmax == NULL.\n");
988 
989   M = cp9->M;
990 
991   int const *tsc = cp9->otsc; /* ptr to efficiently ordered transition scores           */
992 
993   /* gamma allocation and initialization.
994    * This is a little SHMM that finds an optimal scoring parse
995    * of multiple nonoverlapping hits. */
996 
997   /* Grow DP matrix if nec, to either 2 rows or L+1 rows (depending on be_efficient),
998    * stays M+1 columns */
999   if((status = GrowCP9Matrix(mx, errbuf, L, M, kmin, kmax, &mmx, &imx, &dmx, &elmx, &erow)) != eslOK) return status;
1000   ESL_DPRINTF2(("#DEBUG: cp9_ForwardP7B_OLD_WITH_EL(): CP9 matrix size: %.8f Mb rows: %d.\n", mx->size_Mb, mx->rows));
1001 
1002   /* Initialization of the zero row. */
1003   mmx[0][0] = 0;      /* M_0 is state B, and everything starts in B */
1004   imx[0][0] = -INFTY; /* I_0 is state N, can't get here without emitting*/
1005   dmx[0][0] = -INFTY; /* D_0 doesn't exist. */
1006   elmx[0][0]= -INFTY; /* can't go from B to EL state */
1007   erow[0]   = -INFTY;
1008 
1009   /* Because there's a D state for every node 1..M,
1010      dmx[0][k] is possible for all k 1..M */
1011   int sc;
1012   i = 0;
1013   kn    = ESL_MAX(1, kmin[0]+1);
1014   kpcur = kn - kmin[0];
1015   for (k = kn; k <= kmax[0]; k++, kpcur++) {
1016     assert(kpcur >= 1);
1017     mmx[0][kpcur] = imx[0][kpcur] = elmx[0][kpcur] = -INFTY;      /* need seq to get here */
1018     sc = ILogsum(ILogsum(mmx[0][kpcur-1] + CP9TSC(cp9O_MD,k-1),
1019 			 imx[0][kpcur-1] + CP9TSC(cp9O_ID,k-1)),
1020 		 dmx[0][kpcur-1] + CP9TSC(cp9O_DD,k-1));
1021     dmx[0][kpcur] = sc;
1022   }
1023   /* We can do a full parse through all delete states. */
1024   erow[0] = -INFTY;
1025   if(INBAND(0, M)) { erow[0] = dmx[0][M] + CP9TSC(cp9O_DM,M); }
1026 
1027   /*****************************************************************
1028    * The main loop: scan the sequence from position 1 to L.
1029    *****************************************************************/
1030   /* Recursion. */
1031 
1032   for (i = 1; i <= L; i++)
1033     {
1034       int const *isc = cp9->isc[dsq[i]];
1035       int const *msc = cp9->msc[dsq[i]];
1036       int endsc     = -INFTY;
1037       int el_selfsc = cp9->el_selfsc;
1038       int sc;
1039 
1040       if(kmin[i] == 0) {
1041 	mmx[i][0]  = -INFTY;
1042 	dmx[i][0]  = -INFTY;  /*D_0 is non-existent*/
1043 	elmx[i][0] = -INFTY;  /*no EL state for node 0 */
1044 	sc = ILogsum(ILogsum(mmx[i-1][0] + CP9TSC(cp9O_MI,0),
1045 			     imx[i-1][0] + CP9TSC(cp9O_II,0)),
1046 		     dmx[i-1][0] + CP9TSC(cp9O_DI,0));
1047 	imx[i][0] = sc + isc[0];
1048 	kn = 1; /* kmin[i] + 1 */
1049       }
1050       else {
1051 	kn = kmin[i];
1052       }
1053 
1054       /*match state*/
1055       kn = ESL_MAX(kn, (kmin[i-1]+1)); /* start at first cell from which we can look back to a valid cell at *mx[i-1][k-1] */
1056       kx = ESL_MIN(kmax[i], kmax[i-1]+1);
1057 
1058       /* NOT SURE ABOUT THIS AND HOW IT COUPLES WITH BLOCK ABOVE kmin[i] == 0 */
1059       for (kpcur = 0;            kpcur < (kn - kmin[i]);   kpcur++) mmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1060       for (kpcur = kx-kmin[i]+1; kpcur <= kmax[i]-kmin[i]; kpcur++) mmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1061 
1062       for (kpcur = 0;            kpcur < (kn - kmin[i]);   kpcur++) elmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1063       for (kpcur = kx-kmin[i]+1; kpcur <= kmax[i]-kmin[i]; kpcur++) elmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1064 
1065       kpcur = kn - kmin[i]; /* unnec, loop above ends with this */
1066       kpprv = kn - kmin[i-1];
1067       for (k = kn; k <= kx; k++, kpcur++, kpprv++) {
1068 	/*printf("M i: %d kpprv: %d\n", i, kpprv);*/
1069 	assert((kpprv-1) >= 0);
1070 
1071 	sc = ILogsum(ILogsum(mmx[i-1][kpprv-1] + CP9TSC(cp9O_MM,k-1),
1072 			     imx[i-1][kpprv-1] + CP9TSC(cp9O_IM,k-1)),
1073 		     dmx[i-1][kpprv-1] + CP9TSC(cp9O_DM,k-1));
1074 
1075 	/* FIX ME: inefficient! check B->M_K transition */
1076 	if(INBAND(i-1, 0)) { /* if i-1 is in k == 0's band */
1077 	  assert(kmin[(i-1)] == 0);
1078 	  if(mmx[i-1][0] != -INFTY)
1079 	   sc = ILogsum(sc, mmx[i-1][0] + CP9TSC(cp9O_BM,k));
1080 	}
1081 
1082 	/* check possibility we came from an EL, if they're valid */
1083 	for(c = 0; c < cp9->el_from_ct[k]; c++) { /* el_from_ct[k] is >= 0 */
1084 	  if(INBAND(i-1, cp9->el_from_idx[k][c])) {
1085 	    kpprv_el = cp9->el_from_idx[k][c] - kmin[(i-1)];
1086 	    sc = ILogsum(sc, elmx[i-1][kpprv_el]);
1087 	  }
1088 	} /* transition penalty to EL incurred when EL was entered */
1089 	if(sc != -INFTY) {
1090 	  mmx[i][kpcur] = sc + msc[k];
1091 	  /* E state update */
1092 	  endsc = ILogsum(endsc, mmx[i][kpcur] + CP9TSC(cp9O_ME,k));
1093 	}
1094 	else {
1095 	  mmx[i][kpcur] = -INFTY;
1096 	  /* don't update E state */
1097 	}
1098 	/*printf("k: %4d mmx[i:%4d][kpcur:%4d]: %d\n", k, i, kpcur, mmx[i][kpcur]);*/
1099 
1100 	/* el state */
1101 	sc = -INFTY;
1102 	if((cp9->flags & CPLAN9_EL) && cp9->has_el[k]) /* not all HMM nodes have an EL state (for ex:
1103 								  HMM nodes that map to right half of a MATP_MP) */
1104 	  {
1105 	    if(mmx[i][kpcur] != -INFTY) {
1106 	      kpprv_el = k - kmin[(i-1)];
1107 	      sc = ILogsum(mmx[i][kpcur]  + CP9TSC(cp9O_MEL,k), /* transitioned from cur node's match state */
1108 			   elmx[i-1][kpprv_el] + el_selfsc);      /* transitioned from cur node's EL state emitted ip on transition */
1109 	    }
1110 	  }
1111 	elmx[i][kpcur] = sc;
1112       }
1113 
1114       /* insert state*/
1115       kn = ESL_MAX(kmin[i], kmin[i-1]);
1116       kx = ESL_MIN(kmax[i], kmax[i-1]);
1117 
1118       for (kpcur = 0;            kpcur < (kn - kmin[i]);   kpcur++) imx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1119       for (kpcur = kx-kmin[i]+1; kpcur <= kmax[i]-kmin[i]; kpcur++) imx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1120 
1121       kpcur = kn - kmin[i]; /* unnec, loop above ends with this */
1122       kpprv = kn - kmin[i-1];
1123       for (k = kn; k <= kx; k++, kpcur++, kpprv++) {
1124 	/*insert state*/
1125 	assert(kpprv >= 0);
1126 	/* HERE, EVENTUALLY IF kmin/kmax differ b/t Match and Inserts:
1127 	 * only look at match states from k that have i-1 within band */
1128 	/* all insert states from k should have i-1 within band */
1129 	/*printf("I i: %d kpprv: %d\n", i, kpprv);*/
1130 	sc = ILogsum(ILogsum(mmx[i-1][kpprv] + CP9TSC(cp9O_MI,k),
1131 			     imx[i-1][kpprv] + CP9TSC(cp9O_II,k)),
1132 		     dmx[i-1][kpprv] + CP9TSC(cp9O_DI,k));
1133 	if(sc != -INFTY) imx[i][kpcur] = sc + isc[k];
1134 	else             imx[i][kpcur] = -INFTY;
1135 	/*printf("k: %4d imx[i:%4d][kpcur:%4d]: %d\n", k, i, kpcur, imx[i][kpcur]);*/
1136       }
1137 
1138       /*delete state*/
1139       kn = kmin[i]+1;
1140 
1141       for (kpcur = 0; kpcur < (kn - kmin[i]); kpcur++) dmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1142       kpcur = kn - kmin[i]; /* unnec, loop above ends with this */
1143       for (k = kn; k <= kmax[i]; k++, kpcur++) { /* should I be adding one for delete off-by-one?? */
1144 	sc = ILogsum(ILogsum(mmx[i][kpcur-1] + CP9TSC(cp9O_MD,k-1),
1145 			     imx[i][kpcur-1] + CP9TSC(cp9O_ID,k-1)),
1146 		     dmx[i][kpcur-1] + CP9TSC(cp9O_DD,k-1));
1147 	dmx[i][kpcur] = sc;
1148 	/*printf("k: %4d dmx[i:%4d][kpcur:%4d]: %d\n", k, i, kpcur, dmx[i][kpcur]);*/
1149       }
1150 	  /*printf("mmx [jp:%d][%d]: %d\n", jp, k, mmx[j][k]);
1151 	    printf("imx [jp:%d][%d]: %d\n", jp, k, imx[j][k]);
1152 	    printf("dmx [jp:%d][%d]: %d\n", jp, k, dmx[j][k]);
1153 	    printf("elmx[jp:%d][%d]: %d\n", jp, k, elmx[j][k]);*/
1154 
1155       if(INBAND(i, M)) {
1156 	endsc = ILogsum(ILogsum(endsc, dmx[i][M-kmin[i]] + CP9TSC(cp9O_DM,M)), /* transition from D_M -> end */
1157 			imx[i][M-kmin[i]] + CP9TSC(cp9O_IM,M)); /* transition from I_M -> end */
1158 	for(c = 0; c < cp9->el_from_ct[M+1]; c++) { /* el_from_ct[k] is >= 0 */
1159 	  if(INBAND(i, cp9->el_from_idx[M+1][c])) {
1160 	    kpprv_el = cp9->el_from_idx[M+1][c] - kmin[i];
1161 	    endsc = ILogsum(endsc, elmx[i][kpprv_el]);
1162 	  }
1163 	}
1164       }
1165 	/* transition penalty to EL incurred when EL was entered */
1166       /*printf("endsc: %d\n", endsc);*/
1167 
1168       erow[i] = endsc;
1169     } /* end loop over end positions i */
1170 
1171   *ret_sc = Scorify(erow[L]);
1172   ESL_DPRINTF1(("#DEBUG: cp9_ForwardP7B_OLD_WITH_EL() return score: %10.4f\n", Scorify(erow[L])));
1173 
1174   return eslOK;
1175 }
1176 
1177 /* Function: cp9_BackwardP7B()
1178  *
1179  * Purpose:  Runs the banded Backward dynamic programming algorithm on an
1180  *           input sequence (1..L). Complements cp9_ForwardP7B().
1181  *           The 'P7B' suffix indicates plan 7 HMM derived bands
1182  *           in the kmin and kmax arrays are applied.  This function
1183  *           was derived from cp9_Backward(), differences from that
1184  *           function were introduced solely to impose bands on the
1185  *           matrix.
1186  *
1187  *           Because of the bands, some options that exist to cp9_Forward()
1188  *           (like be_efficient and do_scan and reporting hits in scan mode)
1189  *           are not available here. This function is meant to be used
1190  *           solely for second stage of a Forward, Backward, Posterior type
1191  *           calculation.
1192  *
1193  *           Also due to bands, only L is passed as seq length, instead
1194  *           of i0 and j0 (seq start/stop).  This simplifies the application
1195  *           of bands kmin[i]/kmax[i] refers to residue i.
1196  *
1197  *           This function requires that local EL states are turned off, if
1198  *           they're not it returns. A separate function should exist to align
1199  *           with EL states on, I don't think it's worth the extra computations to
1200  *           make 1 function that handles both.
1201  *
1202  *           See additional notes in cp9_Backward() "Purpose" section.
1203  *
1204  * Args:
1205  *           cp9       - the CP9 HMM
1206  *           errbuf    - char buffer for error messages
1207  *           mx        - the matrix, expanded to correct size (if nec), and filled here
1208  *           dsq       - sequence in digitized form
1209  *           L         - start of target subsequence (1 for beginning of dsq)
1210  *           kmin      - [0.1..i..N] minimum k for residue i
1211  *           kmax      - [0.1..i..N] maximum k for residue i
1212  *           ret_sc    - RETURN: log P(S|M,bands)/P(S|R), as a bit score, this is B->M[0][0]
1213  *
1214  * Returns:  eslOK on success;
1215  *           eslEINCOMPAT on contract violation;
1216  */
1217 int
cp9_BackwardP7B(CP9_t * cp9,char * errbuf,CP9_MX * mx,ESL_DSQ * dsq,int L,int * kmin,int * kmax,float * ret_sc)1218 cp9_BackwardP7B(CP9_t *cp9, char *errbuf, CP9_MX *mx, ESL_DSQ *dsq, int L, int *kmin, int *kmax, float *ret_sc)
1219 {
1220   int          status;
1221   int          i;           /*     j-W: position in the subsequence                         */
1222   int          k;           /* CP9 HMM node position                                        */
1223   int        **mmx;         /* DP matrix for match  state scores [0..1][0..cp9->M]      */
1224   int        **imx;         /* DP matrix for insert state scores [0..1][0..cp9->M]      */
1225   int        **dmx;         /* DP matrix for delete state scores [0..1][0..cp9->M]      */
1226   int        **elmx;        /* DP matrix for EL state scores [0..1][0..cp9->M]          */
1227   int         *erow;        /* end score for each position [0..1]                           */
1228   int          c;           /* counter for EL states */
1229   int          M;           /* cp9->M */
1230   int          kpcur, kpprv, kpcur_el;
1231   int          kprv, kprvn, kprvx;
1232   int          kn, kx;
1233   float        fsc;
1234 
1235   /* Contract checks */
1236   if(cp9 == NULL)                      ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_BackwardP7B, cp9 is NULL.\n");
1237   if(dsq == NULL)                      ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_BackwardP7B, dsq is NULL.");
1238   if(mx == NULL)                       ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_BackwardP7B, mx is NULL.\n");
1239   if(mx->M != cp9->M)                  ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_BackwardP7B, mx->M != cm->clen.\n");
1240   if(cp9->flags & CPLAN9_EL)       ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_BackwardP7B, cp9 EL flag up!\n");
1241 
1242   M = cp9->M;
1243 
1244   int const *tsc = cp9->otsc; /* ptr to efficiently ordered transition scores           */
1245 
1246   /* Rearrange DP matrix for this seq */
1247   if((status = GrowCP9Matrix(mx, errbuf, L, M, kmin, kmax, &mmx, &imx, &dmx, &elmx, &erow)) != eslOK) return status;
1248   ESL_DPRINTF2(("#DEBUG: cp9_BackwardP7B(): CP9 matrix size: %.8f Mb rows: %d.\n", mx->size_Mb, mx->rows));
1249 
1250   /* Initialization of the L row. */
1251   i = L;
1252 
1253   /*******************************************************************
1254    * 0 Handle EL, looking at EL_k->E for all valid k.
1255    * we're going backwards so we have to work out of order, we could get
1256    * around this by storing the nodes each EL goes TO in an el_to_ct[] vec. */
1257   /* init to -INFTY */
1258   kpcur = 0;
1259   for (k = kmin[i]; k <= kmax[i]; k++, kpcur++) elmx[i][kpcur] = -INFTY;
1260   if(cp9->flags & CPLAN9_EL)
1261     {
1262       for(c = 0; c < cp9->el_from_ct[cp9->M+1]; c++) /* el_from_ct[cp9->M+1] holds # ELs that can go to END */
1263 	if(INBAND(i, cp9->el_from_idx[M+1][c])) {
1264 	  kpcur_el = cp9->el_from_idx[M+1][c] - kmin[i];
1265 	  elmx[i][kpcur_el] = 0.;
1266 	}
1267     }
1268   /*******************************************************************/
1269 
1270   /* elmx[cur][cp9->M] is either 0 (if EL_M exists (it would nec be in el_from_idx[cp9->M+1] array if it does, so
1271    * it would be filled with 0 in above loop), or -INFTY if it doesn't exist. We don't add possibility of EL_M -> EL_M
1272    * self loop b/c it's impossible to do that without emitting, and we've already seen our last res emitted.
1273    * either way we don't have to modify it */
1274 
1275   if(INBAND(i, M)) { /* if i is in k == M's band */
1276     assert(M == kmax[i]);
1277     kpcur = M-kmin[i];
1278     mmx[i][kpcur]  = 0. +
1279       ILogsum(elmx[i][kpcur] + CP9TSC(cp9O_MEL, M),/* M_M<-EL_M<-E, with 0 self loops in EL_M */
1280 	      CP9TSC(cp9O_ME,M));                      /* M_M<-E ... everything ends in E (the 0; 2^0=1.0) */
1281     mmx[i][kpcur] += cp9->msc[dsq[i]][M];  /* ... + emitted match symbol */
1282     imx[i][kpcur]  = 0. + CP9TSC(cp9O_IM,M);     /* I_M<-E ... everything ends in E (the 0; 2^0=1.0) */
1283     imx[i][kpcur] += cp9->isc[dsq[i]][M];  /* ... + emitted insert symbol */
1284     dmx[i][kpcur]  = CP9TSC(cp9O_DM,M);          /* D_M<-E */
1285     kx = M-1;
1286     kpcur--;
1287   }
1288   else { kx = kmax[i]; kpcur = kmax[i]-kmin[i]; }
1289   /*******************************************************************
1290    * No need to look at EL_k->M_M b/c elmx[i] with i == L means last emitted residue was L+1
1291    * and this is impossible if we've come from M_M (only would be valid if we were coming from
1292    * E which is handled above with the EL_k->E code).
1293    *******************************************************************/
1294 
1295   for (k = kx; k >= kmin[i]; k--, kpcur--)
1296     {
1297       mmx[i][kpcur]  = 0 + CP9TSC(cp9O_ME,k);  /*M_k<- E */
1298 
1299       if(INBAND(i, k+1)) {
1300 	mmx[i][kpcur]  = ILogsum(mmx[i][kpcur], dmx[i][kpcur+1] + CP9TSC(cp9O_MD,k));
1301       }
1302       if(cp9->flags & CPLAN9_EL)
1303 	mmx[i][kpcur]  = ILogsum(mmx[i][kpcur], elmx[i][kpcur] + CP9TSC(cp9O_MEL,k));
1304 
1305       mmx[i][kpcur] += cp9->msc[dsq[i]][k];
1306 
1307       /*******************************************************************
1308        * No need to look at EL_k->M_M b/c elmx[i] with i == L means last emitted residue was L+1
1309        * and this is impossible if we've come from M_M (only would be valid if we were coming from
1310        * E which is handled above with the EL_k->E code).
1311        *******************************************************************/
1312 
1313       if(INBAND(i, k+1)) {
1314 	imx[i][kpcur]  = dmx[i][kpcur+1] + CP9TSC(cp9O_ID,k);
1315 	imx[i][kpcur] += cp9->isc[dsq[i]][k];
1316 
1317 	dmx[i][kpcur]  = dmx[i][kpcur+1] + CP9TSC(cp9O_DD,k);
1318       }
1319       else {
1320 	imx[i][kpcur] = -INFTY;
1321 	dmx[i][kpcur] = -INFTY;
1322       }
1323       /* elmx[i][k] was set above, out of order */
1324 
1325       ////printf("mmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, mmx[i][kpcur]);
1326       ////printf("imx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, imx[i][kpcur]);
1327       ////printf("dmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, dmx[i][kpcur]);
1328     }
1329 
1330   if(INBAND(i, 0)) {
1331     /* remember M_0 is special, the B state, a non-emitter */
1332     mmx[i][0]  = dmx[i][1] + CP9TSC(cp9O_MD,0); /* M_0(B)->D_1, no seq emitted, all deletes */
1333     /* above line is diff from CPBackwardOLD() which has mmx[i][0] = -INFTY; */
1334     imx[i][0]  = dmx[i][1] + CP9TSC(cp9O_ID,0);
1335     imx[i][0] += cp9->isc[dsq[i]][0];
1336 
1337     dmx[i][0]   = -INFTY; /*D_0 doesn't exist*/
1338     elmx[i][0]  = -INFTY; /*EL_0 doesn't exist*/
1339   }
1340 
1341   /*****************************************************************
1342    * The main loop: scan the sequence from position j0-1 to i0.
1343    *****************************************************************/
1344   /* Reision */
1345   for (i = L-1; i >= 1; i--)
1346     {
1347       /* init EL mx to -INFTY */
1348       kpcur = 0;
1349       for (k = kmin[i]; k <= kmax[i]; k++, kpcur++) elmx[i][kpcur] = -INFTY;
1350 
1351       /* Deal with node k == M first */
1352       /* elmx[i][k] could have come from self (EL_k), we
1353        * can't have come from END b/c we haven't emitted the last res of the seq yet.
1354        */
1355       if(INBAND(i, M)) {
1356 	kpcur = M-kmin[i];
1357 	if((cp9->flags & CPLAN9_EL) && (cp9->has_el[M]))
1358 	  elmx[i][kpcur] = elmx[i][kpcur] + cp9->el_selfsc;
1359 
1360 	if(INBAND(i+1, M)) {
1361 	  kpprv = M-kmin[i+1];
1362 	  mmx[i][kpcur]  = imx[i+1][kpprv] + CP9TSC(cp9O_MI,M);
1363 	  mmx[i][kpcur] += cp9->msc[dsq[i]][M];
1364 
1365 	  imx[i][kpcur]  = imx[i+1][kpprv] + CP9TSC(cp9O_II,M);
1366 	  imx[i][kpcur] += cp9->isc[dsq[i]][M];
1367 
1368 	  dmx[i][M]  = imx[i+1][kpprv] + CP9TSC(cp9O_DI,M);
1369 	}
1370 	else {
1371 	  mmx[i][kpcur] = imx[i][kpcur] = dmx[i][kpcur] = -INFTY;
1372 	}
1373 
1374 	if((cp9->flags & CPLAN9_EL) && (cp9->has_el[M]))
1375 	  mmx[i][kpcur] = ILogsum(mmx[i][kpcur], elmx[i][kpcur] + CP9TSC(cp9O_MEL,M));
1376 
1377 	/*******************************************************************
1378 	 * 1b Handle EL, looking at EL_k->M_M for all valid k.
1379 	 * EL_k->M_M transition, which has no transition penalty */
1380 	if(INBAND(i+1, M)) {
1381 	  if(cp9->flags & CPLAN9_EL)
1382 	    {
1383 	      for(c = 0; c < cp9->el_from_ct[M]; c++) /* el_from_ct[M] holds # ELs that can go to M_M */
1384 		if(INBAND(i, cp9->el_from_idx[M][c])) {
1385 		  kpcur_el = cp9->el_from_idx[M][c] - kmin[i];
1386 		  elmx[i][kpcur_el] = ILogsum(elmx[i][kpcur_el], mmx[i+1][kpprv]);
1387 		}
1388 	    }
1389 	}
1390       }
1391 
1392       /*********************************************************/
1393       /* MATCH: *_k <- M_k+1 transitions FROM a match state*/
1394       kn = ESL_MAX(kmin[i], kmin[i+1]-1); /* start at first cell from which we can look ahead to a valid cell at mmx[i-1][k-1] */
1395       kn = ESL_MAX(kn, 1); /* kn can't go all the way down to 0, that's a special case, handled outside the main loop */
1396       kx = ESL_MIN(kmax[i], kmax[i+1]-1);
1397 
1398       for (kpcur = 0;            kpcur < (kn - kmin[i]);   kpcur++) mmx[i][kpcur] = imx[i][kpcur] = dmx[i][kpcur] = elmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1399       for (kpcur = kx-kmin[i]+1; kpcur <= kmax[i]-kmin[i]; kpcur++) mmx[i][kpcur] = imx[i][kpcur] = dmx[i][kpcur] = elmx[i][kpcur] = -INFTY; /* impossible to reach these guys */
1400 
1401       kpcur = kx - kmin[i]; /* unnec, loop above ends with this */
1402       kpprv = kx - kmin[i+1];
1403       for (k = kx; k >= kn; k--, kpcur--, kpprv--)
1404 	{
1405 	  /* Handle EL, looking at EL_k->M_k for all valid k and EL_k->EL_k
1406 	   * we're going backwards so we have to work out of order
1407 	   * we could get around this by storing the nodes each EL goes TO
1408 	   * in an el_to_ct[] vector. */
1409 	  if(cp9->flags & CPLAN9_EL) {
1410 	    for(c = 0; c < cp9->el_from_ct[k]; c++) { /* el_from_ct[k] holds # ELs that can go to M_k */
1411 	      if(INBAND(i, cp9->el_from_idx[k][c])) {
1412 		kpcur_el = cp9->el_from_idx[k][c] - kmin[i];
1413 		elmx[i][kpcur_el] = ILogsum(elmx[i][kpcur_el], mmx[i+1][kpprv]);
1414 		/* EL<-M, penalty incurred when we enter EL (i.e. leave going backwards) */
1415 	      }
1416 	    }
1417 	  }
1418 
1419 	  /* Finish off elmx[i][k] with possibility of coming from self (EL_k),
1420 	   * elmx[i][k] will have been filled by block above for ks > current k,
1421 	   * no M_k -> EL_k' with k' > k */
1422 	  if(INBAND(i+1, k)) {
1423 	    if((cp9->flags & CPLAN9_EL) && (cp9->has_el[k]))
1424 	      elmx[i][k] = ILogsum(elmx[i][kpcur], elmx[i+1][kpprv] + cp9->el_selfsc);
1425 	  }
1426 	  mmx[i][kpcur] = mmx[i+1][kpprv+1] + CP9TSC(cp9O_MM,k);
1427 	  imx[i][kpcur] = mmx[i+1][kpprv+1] + CP9TSC(cp9O_IM,k);
1428 	  dmx[i][kpcur] = mmx[i+1][kpprv+1] + CP9TSC(cp9O_DM,k);
1429 	}
1430 
1431       /*********************************************************/
1432       /* INSERTIONS: *_k <- I_k+1 transitions FROM a insert state*/
1433       kn = ESL_MAX(kmin[i], kmin[i+1]); /* start at first cell from which we can look ahead to a valid cell at imx[i-1][k] */
1434       kn = ESL_MAX(kn, 1); /* kn can't go all the way down to 0, that's a special case, handled outside the main loop */
1435       kx = ESL_MIN(kmax[i], kmax[i+1]);
1436 
1437       kpcur = kx - kmin[i]; /* unnec, loop above ends with this */
1438       kpprv = kx - kmin[i+1];
1439       for (k = kx; k >= kn; k--, kpcur--, kpprv--)
1440 	{
1441 	  mmx[i][kpcur] = ILogsum(mmx[i][kpcur], imx[i+1][kpprv] + CP9TSC(cp9O_MI,k));
1442 	  imx[i][kpcur] = ILogsum(imx[i][kpcur], imx[i+1][kpprv] + CP9TSC(cp9O_II,k));
1443 	  dmx[i][kpcur] = ILogsum(dmx[i][kpcur], imx[i+1][kpprv] + CP9TSC(cp9O_DI,k));
1444 	}
1445 
1446       /*********************************************************/
1447       /* DELETIONS: *_k <- D_k+1 transitions FROM a delete state*/
1448       kn = ESL_MAX(kmin[i], kmin[i]-1); /* start at first cell from which we can look ahead to a valid cell at imx[i-1][k] */
1449       kn = ESL_MAX(kn, 1); /* kn can't go all the way down to 0, that's a special case, handled outside the main loop */
1450       kx = ESL_MIN(kmax[i], kmax[i]-1);
1451 
1452       kpcur = kx - kmin[i]; /* unnec, loop above ends with this */
1453       for (k = kx; k >= kn; k--, kpcur--)
1454 	{
1455 	  mmx[i][kpcur] = ILogsum(mmx[i][kpcur], dmx[i][kpcur+1] + CP9TSC(cp9O_MD,k));
1456 	  imx[i][kpcur] = ILogsum(imx[i][kpcur], dmx[i][kpcur+1] + CP9TSC(cp9O_ID,k));
1457 	  dmx[i][kpcur] = ILogsum(dmx[i][kpcur], dmx[i][kpcur+1] + CP9TSC(cp9O_DD,k));
1458 
1459 	  /* now add in the emission score */
1460 	  mmx[i][kpcur] += cp9->msc[dsq[i]][k];
1461 	  imx[i][kpcur] += cp9->isc[dsq[i]][k];
1462 	}
1463       /* there's one valid cell that we didn't consider in above loop to add emission score
1464        * b/c D_k <- D_k+1 (so k+1 is out of bounds for Delete) */
1465       for(k = kx+1; k <= kmax[i]; k++) {
1466 	kpcur = k - kmin[i]; /* unnec, loop above ends with this */
1467 	mmx[i][kpcur] += cp9->msc[dsq[i]][k];
1468 	imx[i][kpcur] += cp9->isc[dsq[i]][k];
1469       }
1470       /*********************************************************/
1471       kpcur = kmax[i] - kmin[i];
1472       for(k = kmax[i]; k >= kmin[i] && k > 0; k--, kpcur--) {
1473 	////printf("mmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, mmx[i][kpcur]);
1474 	////printf("imx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, imx[i][kpcur]);
1475 	////printf("dmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, dmx[i][kpcur]);
1476       }
1477       /* special case when k == 0 */
1478       kpcur = 0;
1479       kpprv = 0 - kmin[i+1];
1480       if(INBAND(i, 0)) {
1481 	assert(kmin[i] == 0);
1482 	dmx[i][kpcur]  = -INFTY; /* D_0 does not exist */
1483 	elmx[i][kpcur] = -INFTY; /* EL_0 does not exist */
1484 
1485 	/* INSERT k=0 */
1486 	imx[i][kpcur] = -INFTY;
1487 	/* imx[i][0] is filled same as imx[i][1..k] in the loop above */
1488 	if(INBAND(i+1, 1)) {
1489 	  if(mmx[i+1][kpprv+1] != -INFTY)
1490 	    imx[i][kpcur] = ILogsum(imx[i][kpcur], mmx[i+1][kpprv+1] + CP9TSC(cp9O_IM,0));
1491 	}
1492 	if(INBAND(i+1, 0)) {
1493 	  if(imx[i+1][kpprv] != -INFTY)
1494 	    imx[i][kpcur] = ILogsum(imx[i][kpcur], imx[i+1][kpprv] + CP9TSC(cp9O_II,0));
1495 	}
1496 	if(INBAND(i, 1)) {
1497 	  if(dmx[i][kpcur+1] != -INFTY)
1498 	    imx[i][kpcur] = ILogsum(imx[i][kpcur], dmx[i][kpcur+1] + CP9TSC(cp9O_ID,0));
1499 	}
1500 
1501 	/*M_0 is the B state, it doesn't emit, and can be reached from any match via a begin transition */
1502 	kprvn = ESL_MAX(1, kmin[i+1]);
1503 	kprvx = kmax[i+1];
1504 	/* careful, don't change kpcur - this is a special case the M_0 state, we loop over all M_k children, only kpprv changes */
1505 	kpprv = kprvx - kmin[i+1];
1506 	mmx[i][kpcur] = -INFTY;
1507 	for(kprv = kprvx; kprv >= kprvn; kprv--, kpprv--) {
1508 	  if(mmx[i+1][kpprv] != -INFTY)
1509 	    mmx[i][kpcur] = ILogsum(mmx[i][kpcur], (mmx[i+1][kpprv] + CP9TSC(cp9O_BM,kprv)));
1510 	}
1511 	k = 0;
1512 	if(INBAND(i+1, 0)) {
1513 	  kpprv = k - kmin[i+1];
1514 	  if(imx[i+1][kpprv] != -INFTY) {
1515 	    mmx[i][kpcur] = ILogsum(mmx[i][kpcur], (imx[i+1][kpprv] + CP9TSC(cp9O_MI,0)));
1516 	  }
1517 	}
1518 	if(INBAND(i, 1)) {
1519 	  if(dmx[i][kpcur+1] != -INFTY) {
1520 	    mmx[i][kpcur] = ILogsum(mmx[i][kpcur], (dmx[i][kpcur+1] + CP9TSC(cp9O_MD,0)));     /* B->D_1 */
1521 	  }
1522 	}
1523       }
1524       ////printf("mmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, 0, kpcur, mmx[i][kpcur]);
1525       ////printf("imx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, 0, kpcur, imx[i][kpcur]);
1526       ////printf("dmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, 0, kpcur, dmx[i][kpcur]);
1527     }
1528 
1529   /*******************************************************************/
1530   /* Special case: i == 0 */
1531 
1532   /* initialize all match, inserts, deletes and ELs to -INFTY
1533    * deletes and M_0 cells MAY get changed later, inserts and ELs always stay -INFTY
1534    * b/c we need at least 1 residue to get to those cells */
1535 
1536   i = 0;
1537   kpcur = k - kmin[0];
1538   for (kpcur = 0; kpcur <= kmax[0] - kmin[0]; kpcur++) mmx[i][kpcur] = imx[i][kpcur] = dmx[i][kpcur] = elmx[i][kpcur] = -INFTY;
1539 
1540   /* D_M(i == 0) <- I_M(i = 1) */
1541   if(INBAND(i, M)) {
1542     kpcur = M - kmin[i];
1543     kpprv = M - kmin[i+1];
1544     if(INBAND(i+1, M)) {
1545       dmx[i][kpcur]  = imx[i+1][kpprv] + CP9TSC(cp9O_DI,M);
1546     }
1547   }
1548 
1549   /* update D_k(i == 0) cells */
1550   /* D_k(i == 0) <- M_k+1(i == 1) */
1551   kn = ESL_MAX(kmin[i], kmin[i+1]-1); /* start at first cell from which we can look back to a valid cell at *mx[i-1][k-1] */
1552   kn = ESL_MAX(kn, 1); /* kn can't go all the way down to 0, that's a special case, handled outside the main loop */
1553   kx = ESL_MIN(kmax[i], kmax[i+1]-1);
1554   kpcur = kx - kmin[i];
1555   kpprv = kx - kmin[i+1];
1556   for (k = kx; k >= kn; k--, kpcur--, kpprv--)
1557     dmx[i][kpcur]  = mmx[i+1][kpprv+1] + CP9TSC(cp9O_DM,k);
1558 
1559   /* D_k(i == 0) <- I_k(i == 1) */
1560   kn = ESL_MAX(kmin[i], kmin[i+1]); /* start at first cell from which we can look ahead to a valid cell at imx[i-1][k] */
1561   kn = ESL_MAX(kn, 1); /* kn can't go all the way down to 0, that's a special case, handled outside the main loop */
1562   kx = ESL_MIN(kmax[i], kmax[i+1]);
1563   kpcur = kx - kmin[i]; /* unnec, loop above ends with this */
1564   kpprv = kx - kmin[i+1];
1565   for (k = kx; k >= kn; k--, kpcur--, kpprv--)
1566     dmx[i][kpcur] = ILogsum(dmx[i][kpcur], imx[i+1][kpprv] + CP9TSC(cp9O_DI,k));
1567 
1568   /* D_k(i == 0) <- D_k+1(i == 0) */
1569   kn = ESL_MAX(kmin[i], kmin[i]-1); /* start at first cell from which we can look ahead to a valid cell at imx[i-1][k] */
1570   kn = ESL_MAX(kn, 1); /* kn can't go all the way down to 0, that's a special case, handled outside the main loop */
1571   kx = ESL_MIN(kmax[i], kmax[i]-1);
1572   kpcur = kx - kmin[i]; /* unnec, loop above ends with this */
1573   for (k = kx; k >= kn; k--, kpcur--)
1574     dmx[i][kpcur] = ILogsum(dmx[i][kpcur], dmx[i][kpcur+1] + CP9TSC(cp9O_DD,k));
1575 
1576     ////printf("mmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, mmx[i][kpcur]);
1577     ////printf("imx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, imx[i][kpcur]);
1578     ////printf("dmx[i:%4d][k:%4d(kp:%4d)] %10d\n", i, k, kpcur, dmx[i][kpcur]);
1579 
1580   /* Case when k == 0 (i is still 0) */
1581   k = 0;
1582   if(INBAND(i, 0)) {
1583     assert(kmin[i]  == 0);
1584     imx[i][0] = -INFTY; /* need seq to get here */
1585     dmx[i][0]   = -INFTY; /* D_0 does not exist */
1586     elmx[i][0]  = -INFTY; /* EL_0 does not exist */
1587     mmx[i][0] = -INFTY;
1588 
1589     /*M_0 is the B state, it doesn't emit, and can be reached from any match via a begin transition */
1590     kprvn = ESL_MAX(1, kmin[i+1]);
1591     kprvx = kmax[i+1];
1592 
1593     kpcur = 0; /* special case, M_0 = B state */
1594     /* careful, this is a different assignment to kpcur b/c it's the M_0 state, we loop over all M_k children, only kpprv changes */
1595     kpprv = kprvx - kmin[i+1];
1596     mmx[i][kpcur] = -INFTY;
1597     for(kprv = kprvx; kprv >= kprvn; kprv--, kpprv--) {
1598       if(mmx[i+1][kpprv] != -INFTY)
1599 	mmx[i][kpcur] = ILogsum(mmx[i][kpcur], (mmx[i+1][kpprv] + CP9TSC(cp9O_BM,kprv)));
1600     }
1601     k = 0;
1602     if(INBAND(i+1, 0)) {
1603       kpprv = k - kmin[i+1];
1604       if(imx[i+1][kpprv] != -INFTY) {
1605 	mmx[i][kpcur] = ILogsum(mmx[i][kpcur], (imx[i+1][kpprv] + CP9TSC(cp9O_MI,0)));
1606       }
1607     }
1608     if(INBAND(i, 1)) {
1609       if(dmx[i][kpcur+1] != -INFTY) {
1610 	mmx[i][kpcur] = ILogsum(mmx[i][kpcur], (dmx[i][kpcur+1] + CP9TSC(cp9O_MD,0)));     /* B->D_1 */
1611       }
1612     }
1613   }
1614   /* No EL contribution here, can't go B->EL_* */
1615 
1616   fsc = Scorify(mmx[i][0]);
1617   /**********************************************************************************/
1618   /* End of Backward recursion */
1619 
1620   ESL_DPRINTF1(("#DEBUG: cp9_BackwardP7B() return score: %10.4f\n", fsc));
1621   return eslOK;
1622 }
1623 
1624 
1625 /* Function: cp9_CheckFBP7B()
1626  *
1627  * Purpose:  Debugging function to make sure the P7 banded
1628  *           DP functions cp9_ForwardP7B() and cp9_BackwardP7B()
1629  *            are working by checking:
1630  *           For all positions i, and states k within kmin[i]..kmax[i]:
1631  *             sum_k f[i][k] * b[i][k] = P(x|hmm)
1632  *
1633  * Args:     fmx    - p7 banded forward dp matrix, already filled
1634  *           bmx    - p7 banded backward dp matrix, already filled
1635  *           hmm    - the model
1636  *           sc     - P(x|hmm, p7 bands) the probability of the entire
1637  *                    seq given the model
1638  *           i0     - start of target subsequence (often 1, beginning of dsq)
1639  *           j0     - end of target subsequence (often L, end of dsq)
1640  *           dsq    - the digitized sequence
1641  *
1642  * Note about sequence position indexing: although this function
1643  * works on a subsequence from i0 to j0, fmx and bmx have offset indices,
1644  * from 1 to L, with L = j0-i0+1.
1645  *
1646  * Return:   eslOK on success;
1647  *           eslFAIL if any residue fails check
1648  */
1649 int
cp9_CheckFBP7B(CP9_MX * fmx,CP9_MX * bmx,CP9_t * hmm,char * errbuf,float sc,int i0,int j0,ESL_DSQ * dsq,int * kmin,int * kmax)1650 cp9_CheckFBP7B(CP9_MX *fmx, CP9_MX *bmx, CP9_t *hmm, char *errbuf, float sc, int i0, int j0, ESL_DSQ *dsq, int *kmin, int *kmax)
1651 {
1652   if(fmx == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_CheckFBP7B(), fmx is NULL.\n");
1653   if(bmx == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_CheckFBP7B(), bmx is NULL.\n");
1654   if(dsq == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_CheckFBP7B(), dsq is NULL.");
1655 
1656   int k, i;
1657   float max_diff;  /* maximum allowed difference between sc and
1658 		    * sum_k f[i][k] * b[i][k] for any i */
1659   float diff;
1660   int fb_sum;
1661   float fb_sc;
1662   int   L;		/* subsequence length */
1663   int   kp;             /* k', relative k within band; k-kmin[i] */
1664   int to_add;
1665 
1666   L  = j0-i0+1;		/* the length of the subsequence */
1667   max_diff = 0.1;       /* tolerance, must be within .1 bits of original score */
1668 
1669   /* In all possible paths through the model, each residue of the sequence must have
1670    * been emitted by exactly 1 insert, match or EL state. */
1671   for (i = 1; i <= L; i++) {
1672     fb_sum = -INFTY;
1673     kp = 0;
1674     for (k = kmin[i]; k <= kmax[i]; k++, kp++) {
1675       if     (fmx->mmx[i][kp] == -INFTY) to_add = -INFTY;
1676       else if(bmx->mmx[i][kp] == -INFTY) to_add = -INFTY;
1677       else {
1678 	to_add = fmx->mmx[i][kp] + bmx->mmx[i][kp];
1679 	if(k > 0) to_add -= hmm->msc[dsq[i]][k];
1680       }
1681       /* hmm->msc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx
1682        * unless, we're talking about M_0, the B state, it doesn't emit */
1683       fb_sum = ILogsum(fb_sum, to_add);
1684 
1685       /*printf("fmx->mmx[i:%4d][k:%4d(%4d)]: %d\n", i, k, kp, fmx->mmx[i][kp]);
1686 	printf("bmx->mmx[i:%4d][k:%4d(%4d)]: %d sum: %d\n", i, k, kp, (bmx->mmx[i][kp]-hmm->msc[dsq[i]][k]), fb_sum);*/
1687 
1688       if     (fmx->imx[i][kp] == -INFTY) to_add = -INFTY;
1689       else if(bmx->imx[i][kp] == -INFTY) to_add = -INFTY;
1690       else  {
1691 	to_add  = fmx->imx[i][kp] + bmx->imx[i][kp];
1692 	to_add -= hmm->isc[dsq[i]][k];
1693       }
1694       /*hmm->isc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
1695       fb_sum = ILogsum(fb_sum, to_add);
1696 
1697       /*printf("fmx->imx[i:%4d][k:%4d(%4d)]: %d\n", i, k, kp, fmx->imx[i][kp]);
1698 	printf("bmx->imx[i:%4d][k:%4d(%4d)]: %d sum: %d\n", i, k, kp, (bmx->imx[i][kp]-hmm->isc[dsq[i]][k]), fb_sum);*/
1699 
1700       if     (fmx->elmx[i][kp] == -INFTY) to_add = -INFTY;
1701       else if(bmx->elmx[i][kp] == -INFTY) to_add = -INFTY;
1702       else  {
1703 	to_add  = fmx->elmx[i][kp] + bmx->elmx[i][kp];
1704 	/* EL emissions are by definition zero scoring */
1705       }
1706       fb_sum = ILogsum(fb_sum, to_add);
1707 
1708       /*printf("fmx->elmx[i:%4d][k:%4d(%4d)]: %d\n", i, k, kp, fmx->elmx[i][kp]);
1709 	printf("bmx->elmx[i:%4d][k:%4d(%4d)]: %d sum: %d\n", i, k, kp, bmx->elmx[i][kp], fb_sum);*/
1710     }
1711     fb_sc  = Scorify(fb_sum);
1712     diff = fabs(fb_sc - sc);
1713     /*printf("FB CHECK: i: %4d %10.4f %10.4f (%10.4f)\n", i, fb_sc, sc, diff);*/
1714     if((fabs(diff) > max_diff))
1715       ESL_FAIL(eslFAIL, errbuf, "cp9_CheckFB(), residue at posn i:%d violates sum_k f[i][k]*b[i][k]=P(x|hmm), sum_k = %.4f bits (should be %.4f)\n", i, fb_sc, sc);
1716   }
1717   ESL_DPRINTF1(("#DEBUG: cp9_CheckFB() passed, Forward/Backward matrices pass check.\n"));
1718   /*printf("cp9_CheckFB() passed, Forward/Backward matrices pass check.\n");*/
1719   return eslOK;
1720 }
1721 
1722 
1723 
1724 /* Function: cp9_Seq2BandsP7B
1725  * Date    : EPN, Fri Aug 15 13:43:21 2008
1726  *
1727  * Purpose:  Given a CM with precalc'ed CP9 HMM, CP9Map, and HMMER3 plan 7
1728  *           HMM bands for the CP9 HMM DP matrices, a sequence and
1729  *           a CP9Bands_t structure, calculate the CP9 HMM bands and store them
1730  *           in the CP9Bands_t structure.
1731  *
1732  *           Note: this function was never updated to handle truncated
1733  *           alignment.
1734  *
1735  * Args:     cm          - the covariance model
1736  *           errbuf      - char buffer for reporting errors
1737  *           fmx         - CP9 dp matrix for Forward()
1738  *           bmx         - CP9 dp matrix for Backward()
1739  *           pmx         - CP9 dp matrix to fill with posteriors, can == bmx
1740  *           dsq         - sequence in digitized form
1741  *           L           - length of sequence we're aligning (1..L)
1742  *           cp9b        - PRE-ALLOCATED, the HMM bands for this sequence, filled here.
1743  *           kmin        - P7 dervied band to enforce: [0.i..L] = k, min node k for residue i
1744  *           kmax        - P7 derived band to enforce: [0.i..L] = k, min node k for residue i
1745  *           debug_level - verbosity level for debugging printf()s
1746  * Return:  eslOK on success;
1747  *
1748  */
1749 int
cp9_Seq2BandsP7B(CM_t * cm,char * errbuf,CP9_MX * fmx,CP9_MX * bmx,CP9_MX * pmx,ESL_DSQ * dsq,int L,CP9Bands_t * cp9b,int * kmin,int * kmax,int debug_level)1750 cp9_Seq2BandsP7B(CM_t *cm, char *errbuf, CP9_MX *fmx, CP9_MX *bmx, CP9_MX *pmx, ESL_DSQ *dsq, int L, CP9Bands_t *cp9b, int *kmin, int *kmax, int debug_level)
1751 {
1752   int   status;
1753   int   use_sums;     /* TRUE to fill and use posterior sums during HMM band calc, yields wider bands  */
1754   float sc;
1755   int do_old_hmm2ij;
1756   CP9_t *cp9 = NULL;  /* ptr to cp9 HMM (this could be Lcp9, Rcp9, Tcp9 if we update this function to possibly handle truncated alignment) */
1757 
1758   /* Contract checks */
1759   if(cm->cp9map == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2BandsP7B, but cm->cp9map is NULL.\n");
1760   if(dsq == NULL)        ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2BandsP7B, dsq is NULL.");
1761   if(!((cm->align_opts & CM_ALIGN_HBANDED) || (cm->search_opts & CM_SEARCH_HBANDED)))        ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2BandsP7B, CM_ALIGN_HBANDED and CM_SEARCH_HBANDED flags both down, exactly 1 must be up.\n");
1762   if((cm->search_opts & CM_SEARCH_HMMALNBANDS) && (!(cm->search_opts & CM_SEARCH_HBANDED))) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2BandsP7B, CM_SEARCH_HMMALNBANDS flag raised, but not CM_SEARCH_HBANDED flag, this doesn't make sense\n");
1763   if(cm->tau > 0.5)      ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2BandsP7B, cm->tau (%f) > 0.5, we can't deal.", cm->tau);
1764 
1765   use_sums = ((cm->align_opts & CM_ALIGN_SUMS) || (cm->search_opts & CM_SEARCH_SUMS)) ? TRUE : FALSE;
1766   do_old_hmm2ij = ((cm->align_opts & CM_ALIGN_HMM2IJOLD) || (cm->search_opts & CM_SEARCH_HMM2IJOLD)) ? TRUE : FALSE;
1767 
1768   /* Determine which cp9 HMM to use: If the CM has local begins on, use cm->cp9loc, else use cm->cp9glb */
1769   cp9 = cm->cp9;
1770   if(cp9 == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2Posteriors, relevant cp9 is NULL.\n");
1771 
1772   /* Step 1: Get HMM Forward/Backward DP matrices.
1773    * Step 2: F/B       -> HMM bands.
1774    * Step 3: HMM bands -> CM bands.
1775    */
1776 
1777   /* Step 1: Get HMM Forward/Backward DP matrices. */
1778   if((status = cp9_ForwardP7B (cp9, errbuf, fmx, dsq, L, kmin, kmax, &sc)) != eslOK) return status;
1779   if((status = cp9_BackwardP7B(cp9, errbuf, bmx, dsq, L, kmin, kmax, &sc)) != eslOK) return status;
1780 
1781   if(cm->align_opts & CM_ALIGN_CHECKFB) {
1782     if((status = cp9_CheckFBP7B(fmx, bmx, cp9, errbuf, sc, 1, L, dsq, kmin, kmax)) != eslOK) return status;
1783     printf("Forward/Backward matrices checked.\n");
1784   }
1785 
1786   /* Step 2: F/B -> HMM bands. */
1787   if(use_sums){
1788     printf("USE SUMS!\n");
1789     exit(1);
1790   }
1791   else {
1792     if((status = cp9_FB2HMMBandsP7B(cp9, errbuf, dsq, fmx, bmx, pmx, cp9b, L, cp9b->hmm_M,
1793 				    (1.-cm->tau), do_old_hmm2ij, kmin, kmax, debug_level)) != eslOK) return status;
1794     cp9b->tau = cm->tau;
1795   }
1796   if(debug_level > 0) cp9_DebugPrintHMMBands(stdout, L, cp9b, cm->tau, 1);
1797 
1798   /* Step 3: HMM bands  ->  CM bands. */
1799   if(do_old_hmm2ij) {
1800     if((status = cp9_HMM2ijBands_OLD(cm, errbuf, cm->cp9b, cm->cp9map, 1, L, FALSE, debug_level)) != eslOK) return status;
1801   }
1802   else {
1803     if((status = cp9_HMM2ijBands(cm, errbuf, cp9, cm->cp9b, cm->cp9map, 1, L, FALSE, FALSE, debug_level)) != eslOK) return status;
1804     /* For debugging, uncomment this block:
1805        if((status = cp9_HMM2ijBands(cm, errbuf, cm->cp9b, cm->cp9map, i0, j0, doing_search, FALSE, debug_level)) != eslOK) {
1806        ESL_SQ *tmp;
1807        tmp = esl_sq_CreateDigitalFrom(cm->abc, "irrelevant", dsq+i0-1, (j0-i0+1), NULL, NULL, NULL);
1808        esl_sq_Textize(tmp);
1809        printf("HEY! cm: %s\n", cm->name);
1810        printf(">irrelevant\n%s\n", tmp->seq);
1811        esl_sq_Destroy(tmp);
1812        return status;
1813     }
1814     */
1815   }
1816 
1817   /* Use the CM bands on i and j to get bands on d, specific to j. */
1818   /* cp9_GrowHDBands() must be called before ij2d_bands() so hdmin, hdmax are adjusted for new seq */
1819   if((status = cp9_GrowHDBands(cp9b, errbuf)) != eslOK) return status;
1820   ij2d_bands(cm, L, cp9b->imin, cp9b->imax, cp9b->jmin, cp9b->jmax, cp9b->hdmin, cp9b->hdmax, FALSE, debug_level);
1821 
1822 #if eslDEBUGLEVEL >= 1
1823   if((status = cp9_ValidateBands(cm, errbuf, cp9b, 1, L, FALSE)) != eslOK) return status;
1824   ESL_DPRINTF1(("#DEBUG: bands validated.\n"));
1825 #endif
1826   if(debug_level > 0) debug_print_ij_bands(cm);
1827 
1828   if(debug_level > 0) PrintDPCellsSaved_jd(cm, cp9b->jmin, cp9b->jmax, cp9b->hdmin, cp9b->hdmax, L);
1829 
1830   return eslOK;
1831 }
1832 
1833 
1834 /* Function: cp9_Seq2PosteriorsP7B
1835  * Date    : EPN, Tue Aug 19 13:12:12 2008
1836  *
1837  * Purpose:  Given a CM with precalc'ed CP9 HMM and CP9Map, and HMMER3 plan 7
1838  *           HMM bands for the CP9 HMM DP matrices, and a sequence,
1839  *           run HMM Forward and Backward algorithms, and return a CP9 posterior
1840  *           matrix.
1841  *
1842  *           Note: this function was never updated to handle
1843  *           truncated alignment (b/c it's currently not hooked up
1844  *           to any of the Infernal applications).
1845  *
1846  * Args:     cm           - the covariance model
1847  *           errbuf       - char buffer for error messages
1848  *           fmx          - CP9 dp matrix for Forward()
1849  *           bmx          - CP9 dp matrix for Backward()
1850  *           pmx          - CP9 dp matrix to fill with posteriors, can == bmx
1851  *           dsq          - sequence in digitized form
1852  *           L            - length of the sequence we're aligning (1..L)
1853  *           kmin         - P7 dervied band to enforce: [0.i..L] = k, min node k for residue i
1854  *           kmax         - P7 derived band to enforce: [0.i..L] = k, min node k for residue i
1855  *           debug_level  - verbosity level for debugging printf()s
1856  *
1857  * Return:  eslOK on success
1858  */
1859 int
cp9_Seq2PosteriorsP7B(CM_t * cm,char * errbuf,CP9_MX * fmx,CP9_MX * bmx,CP9_MX * pmx,ESL_DSQ * dsq,int L,int * kmin,int * kmax,int debug_level)1860 cp9_Seq2PosteriorsP7B(CM_t *cm, char *errbuf, CP9_MX *fmx, CP9_MX *bmx, CP9_MX *pmx, ESL_DSQ *dsq, int L, int *kmin, int *kmax, int debug_level)
1861 {
1862   int status;
1863   float sc;
1864   CP9_t *cp9 = NULL;  /* ptr to cp9 HMM (this could be Lcp9, Rcp9, Tcp9 if we update this function to possibly handle truncated alignment) */
1865 
1866   /* Contract checks */
1867   if(dsq == NULL)        ESL_FAIL(eslEINCOMPAT, errbuf, "in cp9_Seq2Posteriors(), dsq is NULL.");
1868   if(cm->cp9map == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "in cp9_Seq2Posteriors, but cm->cp9map is NULL.\n");
1869   if((cm->align_opts & CM_ALIGN_HBANDED) && (cm->search_opts & CM_SEARCH_HBANDED))
1870     ESL_FAIL(eslEINCOMPAT, errbuf, "in cp9_Seq2Posteriors, CM_ALIGN_HBANDED and CM_SEARCH_HBANDED flags both up, exactly 1 must be up.\n");
1871   if((cm->search_opts & CM_SEARCH_HMMALNBANDS) && (! (cm->search_opts & CM_SEARCH_HBANDED)))
1872     ESL_FAIL(eslEINCOMPAT, errbuf, "in cp9_Seq2Posteriors, CM_SEARCH_HMMALNBANDS flag raised, but not CM_SEARCH_HBANDED flag, this doesn't make sense\n");
1873 
1874   /* determine which cp9 HMM to use, if CM is has local begins use cp9 (its local too) else use cp9glb (its global) */
1875   cp9 = cm->cp9;
1876   if(cp9 == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_Seq2Posteriors, relevant cp9 is NULL.\n");
1877 
1878   /* Step 1: Get HMM posteriors.*/
1879   if((status = cp9_ForwardP7B (cp9, errbuf, fmx, dsq, L, kmin, kmax, &sc)) != eslOK) return status;
1880   if(debug_level > 0) printf("CP9P7B Forward  score : %.4f\n", sc);
1881 
1882   if((status = cp9_BackwardP7B(cp9, errbuf, bmx, dsq, L, kmin, kmax, &sc)) != eslOK) return status;
1883   if(debug_level > 0) printf("CP9 Backward  score : %.4f\n", sc);
1884 
1885   if(cm->align_opts & CM_ALIGN_CHECKFB) {
1886     if((status = cp9_CheckFBP7B(fmx, bmx, cp9, errbuf, sc, 1, L, dsq, kmin, kmax)) != eslOK) return status;
1887     printf("Forward/Backward matrices checked.\n");
1888   }
1889 
1890   /* Get posteriors */
1891   if((status = cp9_PosteriorP7B(dsq, errbuf, L, cp9, fmx, bmx, pmx, kmin, kmax)) != eslOK) return status;
1892 
1893   return eslOK;
1894 }
1895 
1896 
1897 /* Function: cp9_PosteriorP7B()
1898  * based on Ian Holmes' hmmer/src/postprob.c::P7EmitterPosterior()
1899  *
1900  * Purpose:  Combines HMMER3 p7 banded Forward and Backward matrices into a
1901  *           posterior probability matrix. For emitters (match and inserts) the
1902  *           entries in row i of this matrix are the logs of the posterior
1903  *           probabilities of each state emitting symbol i of the sequence.
1904  *           For non-emitters the entries in row i of this matrix are the
1905  *           logs of the posterior probabilities of each state being 'visited'
1906  *           when the last emitted residue in the parse was symbol i of the
1907  *           sequence.
1908  *
1909  * Args:     dsq      - sequence in digitized form
1910  *           errbuf   - for error messages
1911  *           L        - length of target subsequence 1..L
1912  *           hmm      - the model
1913  *           forward  - pre-calculated forward matrix
1914  *           backward - pre-calculated backward matrix
1915  *           pm       - pre-allocated dynamic programming matrix for posteriors
1916  *           kmin     - P7 derived band to enforce: [0.i..L] = k, min node k for residue i
1917  *           kmax     - P7 derived band to enforce: [0.i..L] = k, min node k for residue i
1918  *
1919  * Return:   eslOK on success;
1920  */
1921 int
cp9_PosteriorP7B(ESL_DSQ * dsq,char * errbuf,int L,CP9_t * hmm,CP9_MX * fmx,CP9_MX * bmx,CP9_MX * pmx,int * kmin,int * kmax)1922 cp9_PosteriorP7B(ESL_DSQ *dsq, char *errbuf, int L, CP9_t *hmm, CP9_MX *fmx, CP9_MX *bmx, CP9_MX *pmx, int *kmin, int *kmax)
1923 {
1924   int i;
1925   int k;
1926   int sc;
1927   int M = hmm->M;
1928   int kp, kn, kx;
1929   /*float temp_sc;*/
1930 
1931   if(bmx != pmx) GrowCP9Matrix(pmx, errbuf, L, M, kmin, kmax, NULL, NULL, NULL, NULL, NULL);
1932 
1933   /* parses must start/stop at (i = 1)/(j = L) */
1934   sc = bmx->mmx[0][0];
1935 
1936   /* note boundary conditions, i = 1 */
1937   assert(kmin[0] == 0);
1938   pmx->mmx[0][0] = fmx->mmx[0][0] + bmx->mmx[0][0] - sc; /* fmx->mmx[0][0] is 0, bmx->mmx[1][0] is overall score */
1939   pmx->imx[0][0] = -INFTY; /*need seq to get here*/
1940   pmx->dmx[0][0] = -INFTY; /*D_0 does not exist*/
1941   i  = 0;
1942   kn = ESL_MAX(kmin[i], 1);
1943   kx = kmax[i];
1944   kp = kn - kmin[i];
1945   for (k = kn; k <= kx; k++, kp++) {
1946     pmx->mmx[0][kp] = -INFTY; /*need seq to get here*/
1947     pmx->imx[0][kp] = -INFTY; /*need seq to get here*/
1948     pmx->dmx[0][kp] = fmx->dmx[0][kp] + bmx->dmx[0][kp] - sc;
1949   }
1950 
1951   for (i = 1; i <= L; i++) {
1952       k = 0;
1953       if(INBAND(i,0)) {
1954 	kp = k - kmin[i];
1955 	assert(kp == 0);
1956 	pmx->mmx[i][kp] = ESL_MAX(fmx->mmx[i][kp] + bmx->mmx[i][kp] - sc, -INFTY); /* M_0 doesn't emit */
1957 	pmx->imx[i][kp] = ESL_MAX(fmx->imx[i][kp] + bmx->imx[i][kp] - hmm->isc[dsq[i]][0] - sc, -INFTY);
1958 	/*hmm->isc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
1959 	pmx->dmx[i][kp] = -INFTY; /* D_0 doesn't exist */
1960       }
1961 
1962       kn = ESL_MAX(kmin[i], 1);
1963       kx = kmax[i];
1964       kp = kn - kmin[i];
1965       for(k = kn; k <= kx; k++, kp++)
1966 	{
1967 	  pmx->mmx[i][kp] = ESL_MAX(fmx->mmx[i][kp] + bmx->mmx[i][kp] - hmm->msc[dsq[i]][k] - sc, -INFTY);
1968 	  /*hmm->msc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
1969 	  pmx->imx[i][kp] = ESL_MAX(fmx->imx[i][kp] + bmx->imx[i][kp] - hmm->isc[dsq[i]][k] - sc, -INFTY);
1970 	  /*hmm->isc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
1971 	  pmx->dmx[i][kp] = ESL_MAX(fmx->dmx[i][kp] + bmx->dmx[i][kp] - sc, -INFTY);
1972 	}
1973     }
1974 
1975   /*
1976     float temp_sc;
1977     for(i = 0; i <= L; i++)
1978     {
1979     kp = 0;
1980     for(k = kmin[i]; k <= kmax[i]; k++, kp++)
1981     {
1982     temp_sc = Score2Prob(mx->mmx[i][kp], 1.);
1983     if(temp_sc > .0001)
1984     printf("mx->mmx[%3d][%3d]: %9d | %8f\n", i, k, mx->mmx[i][kp], temp_sc);
1985     temp_sc = Score2Prob(mx->imx[i][kp], 1.);
1986     if(temp_sc > .0001)
1987     printf("mx->imx[%3d][%3d]: %9d | %8f\n", i, k, mx->imx[i][kp], temp_sc);
1988     temp_sc = Score2Prob(mx->dmx[i][kp], 1.);
1989     if(temp_sc > .0001)
1990     printf("mx->dmx[%3d][%3d]: %9d | %8f\n", i, k, mx->dmx[i][kp], temp_sc);
1991     }
1992     }*/
1993   return eslOK;
1994 }
1995 
1996 /* Function: cp9_FB2HMMBandsP7B()
1997  * Date:     EPN, Fri Aug 15 14:00:59 2008
1998  *
1999  * Purpose: Determine the band on all HMM states given HMMER3 Plan 7 Banded
2000  *          Forward and Backward matrices. Do this by calculating and summing
2001  *          log posterior probabilities that each state emitted/was visited at each posn,
2002  *          starting at the band ends, and creeping in, until the half the
2003  *          maximum allowable probability excluded is reached on each side.
2004  *
2005  * Args:
2006  *
2007  * CP9_t hmm        the HMM
2008  * errbuf           char buffer for error messages
2009  * CP9_MX fmx:      forward DP matrix, already calc'ed
2010  * CP9_MX bmx:      backward DP matrix, already calc'ed
2011  * CP9_MX pmx:      DP matrix for posteriors, filled here, can == bmx
2012  * dsq              the digitized sequence
2013  * CP9Bands_t cp9b  CP9 bands data structure
2014  * int   L          length of target subsequence (1..L)
2015  * int   M          number of nodes in HMM (num columns of pmx matrix)
2016  * double p_thresh  the probability mass we're requiring is within each band
2017  * int do_old_hmm2ij TRUE if we'll use old cp9_HMM2ijBands_OLD() function downstream
2018  * int kmin         P7 dervied band to enforce: [0.i..L] = k, min node k for residue i
2019  * int kmax         P7 derived band to enforce: [0.i..L] = k, min node k for residue i
2020  * int debug_level  [0..3] tells the function what level of debugging print
2021  *                  statements to print.
2022  *
2023  * Returns: eslOK on success;
2024  */
2025 int
cp9_FB2HMMBandsP7B(CP9_t * hmm,char * errbuf,ESL_DSQ * dsq,CP9_MX * fmx,CP9_MX * bmx,CP9_MX * pmx,CP9Bands_t * cp9b,int L,int M,double p_thresh,int do_old_hmm2ij,int * kmin,int * kmax,int debug_level)2026 cp9_FB2HMMBandsP7B(CP9_t *hmm, char *errbuf, ESL_DSQ *dsq, CP9_MX *fmx, CP9_MX *bmx, CP9_MX *pmx, CP9Bands_t *cp9b,
2027 		   int L, int M, double p_thresh, int do_old_hmm2ij, int *kmin, int *kmax, int debug_level)
2028 {
2029   int status;
2030   int k;                                  /* counter over nodes of the model */
2031   int thresh = Prob2Score(((1. - p_thresh)/2.), 1.); /* allowable prob mass excluded on each side */
2032 
2033   /* *_m = match, *_i = insert, *_d = delete */
2034   int *kthresh_m, *kthresh_i, *kthresh_d; /* [0..k..hmm->M], individual thresholds for each state */
2035   int *nset_m, *nset_i, *nset_d;          /* [0..k..hmm->M], has minimum been set for this state? */
2036   int *xset_m, *xset_i, *xset_d;          /* [0..k..hmm->M], has maximum been set for this state? */
2037   int *mass_m, *mass_i, *mass_d;          /* [0..k..hmm->M], summed log prob of pmx->mx[i][k] from 0..k or k..L */
2038   int i;                                  /* actual position */
2039   int sc;                                 /* summed score of all parses (derived from backward matrix)
2040 					   * if(cm->search_opts & CM_SEARCH_HMMALNBANDS) Forward and Backward
2041 					   * were run in 'scan mode' where each residue can be begin/end of a parse,
2042 					   * so we have to sum up parses that end at each posn,
2043 					   * if ! (cm->search_opts & CM_SEARCH_HMMALNBANDS) we know we have
2044 					   * to start at residue 1 and end at residue L, so sc is simply bmx->mmx[0][0]
2045 					   */
2046   int hmm_is_localized;                   /* TRUE if HMM has local begins, ends or ELs on */
2047   int kp, kn, kx;
2048 
2049   hmm_is_localized = ((hmm->flags & CPLAN9_LOCAL_BEGIN) || (hmm->flags & CPLAN9_LOCAL_END) || (hmm->flags & CPLAN9_EL)) ? TRUE : FALSE;
2050 
2051   if(bmx != pmx) GrowCP9Matrix(pmx, errbuf, L, M, kmin, kmax, NULL, NULL, NULL, NULL, NULL);
2052 
2053   /* allocations and initializations */
2054   ESL_ALLOC(nset_m, sizeof(int) * (M+1));
2055   ESL_ALLOC(nset_i, sizeof(int) * (M+1));
2056   ESL_ALLOC(nset_d, sizeof(int) * (M+1));
2057   ESL_ALLOC(xset_m, sizeof(int) * (M+1));
2058   ESL_ALLOC(xset_i, sizeof(int) * (M+1));
2059   ESL_ALLOC(xset_d, sizeof(int) * (M+1));
2060   ESL_ALLOC(mass_m, sizeof(int) * (M+1));
2061   ESL_ALLOC(mass_i, sizeof(int) * (M+1));
2062   ESL_ALLOC(mass_d, sizeof(int) * (M+1));
2063   ESL_ALLOC(kthresh_m, sizeof(int) * (M+1));
2064   ESL_ALLOC(kthresh_i, sizeof(int) * (M+1));
2065   ESL_ALLOC(kthresh_d, sizeof(int) * (M+1));
2066 
2067   esl_vec_ISet(mass_m, M+1, -INFTY);
2068   esl_vec_ISet(mass_i, M+1, -INFTY);
2069   esl_vec_ISet(mass_d, M+1, -INFTY);
2070   esl_vec_ISet(nset_m, M+1, FALSE);
2071   esl_vec_ISet(nset_i, M+1, FALSE);
2072   esl_vec_ISet(nset_d, M+1, FALSE);
2073   esl_vec_ISet(xset_m, M+1, FALSE);
2074   esl_vec_ISet(xset_i, M+1, FALSE);
2075   esl_vec_ISet(xset_d, M+1, FALSE);
2076 
2077   sc = bmx->mmx[0][0]; /* Forward/Backward run in 'align mode' parses must start at 1, end at L */
2078   /* sc is summed log prob of all possible parses of seq 1..L */
2079 
2080   /* note boundary conditions, i = 1 */
2081   assert(kmin[0] == 0);
2082   pmx->mmx[0][0] = fmx->mmx[0][0] + bmx->mmx[0][0] - sc; /* fmx->mmx[0][0] is 0, bmx->mmx[1][0] is overall score */
2083   pmx->imx[0][0] = -INFTY; /*need seq to get here*/
2084   pmx->dmx[0][0] = -INFTY; /*D_0 does not exist*/
2085   if((mass_m[0] = pmx->mmx[0][0]) > thresh) {
2086     cp9b->pn_min_m[0] = 0;
2087     nset_m[0] = TRUE;
2088   }
2089   mass_i[0] = -INFTY; /* b/c pmx->imx[0][0] is -INFTY, set above */
2090   mass_d[0] = -INFTY; /* b/c pmx->dmx[0][0] is -INFTY, set above */
2091 
2092   i  = 0;
2093   kn = ESL_MAX(kmin[i], 1);
2094   kx = kmax[i];
2095   kp = kn - kmin[i];
2096   for (k = kn; k <= kx; k++, kp++) {
2097     pmx->mmx[0][kp] = -INFTY; /*need seq to get here*/
2098     pmx->imx[0][kp] = -INFTY; /*need seq to get here*/
2099     pmx->dmx[0][kp] = fmx->dmx[0][kp] + bmx->dmx[0][kp] - sc;
2100     /* mass_m[k] doesn't change b/c pmx->mmx[0][kp] is -INFTY */
2101     /* mass_i[k] doesn't change b/c pmx->imx[0][kp] is -INFTY */
2102     if((mass_d[k] = pmx->dmx[0][kp]) > thresh) {
2103       cp9b->pn_min_d[k] = 0;
2104       nset_d[k] = TRUE;
2105     }
2106   }
2107 
2108   /* Find minimum position in band for each state (M,I,D) of each node (0..M) */
2109   for (i = 1; i <= L; i++) {
2110       k = 0;
2111       if(INBAND(i,0)) {
2112 	kp = k - kmin[i];
2113 	assert(kp == 0);
2114 	pmx->mmx[i][kp] = ESL_MAX(fmx->mmx[i][kp] + bmx->mmx[i][kp] - sc, -INFTY); /* M_0 doesn't emit */
2115 	if(! nset_m[0]) {
2116 	  if((mass_m[0] = ILogsum(mass_m[0], pmx->mmx[i][kp])) > thresh) {
2117 	    cp9b->pn_min_m[0] = i;
2118 	    nset_m[0] = TRUE;
2119 	  }
2120 	}
2121 
2122 	pmx->imx[i][kp] = ESL_MAX(fmx->imx[i][kp] + bmx->imx[i][kp] - hmm->isc[dsq[i]][0] - sc, -INFTY);
2123 	/*hmm->isc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
2124 	if(! nset_i[0]) {
2125 	  if((mass_i[0] = ILogsum(mass_i[0], pmx->imx[i][kp])) > thresh) {
2126 	    cp9b->pn_min_i[0] = i;
2127 	    nset_i[0] = TRUE;
2128 	  }
2129 	}
2130 	pmx->dmx[i][kp] = -INFTY; /* D_0 doesn't exist */
2131       }
2132 
2133       kn = ESL_MAX(kmin[i], 1);
2134       kx = kmax[i];
2135       kp = kn - kmin[i];
2136       for(k = kn; k <= kx; k++, kp++)
2137 	{
2138 	  pmx->mmx[i][kp] = ESL_MAX(fmx->mmx[i][kp] + bmx->mmx[i][kp] - hmm->msc[dsq[i]][k] - sc, -INFTY);
2139 	  /*hmm->msc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
2140 	  pmx->imx[i][kp] = ESL_MAX(fmx->imx[i][kp] + bmx->imx[i][kp] - hmm->isc[dsq[i]][k] - sc, -INFTY);
2141 	  /*hmm->isc[dsq[i]][k] will have been counted in both fmx->mmx and bmx->mmx*/
2142 	  pmx->dmx[i][kp] = ESL_MAX(fmx->dmx[i][kp] + bmx->dmx[i][kp] - sc, -INFTY);
2143 
2144 	  if(! nset_m[k]) {
2145 	    if((mass_m[k] = ILogsum(mass_m[k], pmx->mmx[i][kp])) > thresh) {
2146 	      cp9b->pn_min_m[k] = i;
2147 	      nset_m[k] = TRUE;
2148 	    }
2149 	  }
2150 	  if(! nset_i[k]) {
2151 	    if((mass_i[k] = ILogsum(mass_i[k], pmx->imx[i][kp])) > thresh) {
2152 	      cp9b->pn_min_i[k] = i;
2153 	      nset_i[k] = TRUE;
2154 	    }
2155 	  }
2156 	  if(! nset_d[k]) {
2157 	    if((mass_d[k] = ILogsum(mass_d[k], pmx->dmx[i][kp])) > thresh) {
2158 	      cp9b->pn_min_d[k] = i;
2159 	      nset_d[k] = TRUE;
2160 	    }
2161 	  }
2162 	}
2163     }
2164 
2165   esl_vec_ISet(mass_m, M+1, -INFTY);
2166   esl_vec_ISet(mass_i, M+1, -INFTY);
2167   esl_vec_ISet(mass_d, M+1, -INFTY);
2168   /* Find maximum position in band for each state (M,I,D) of each node (0..M)
2169    * by moving from L down to 1 */
2170   for (i = L; i >= 1; i--) /* i is the relative position in the seq */
2171     {
2172       kp = 0;
2173       for(k = kmin[i]; k <= kmax[i]; k++, kp++)
2174 	{
2175 	  if(! xset_m[k]) {
2176 	    if((mass_m[k] = ILogsum(mass_m[k], pmx->mmx[i][kp])) > thresh) {
2177 	      cp9b->pn_max_m[k] = i;
2178 	      xset_m[k] = TRUE;
2179 	    }
2180 	  }
2181 	  if(! xset_i[k]) {
2182 	    if((mass_i[k] = ILogsum(mass_i[k], pmx->imx[i][kp])) > thresh) {
2183 	      cp9b->pn_max_i[k] = i;
2184 	      xset_i[k] = TRUE;
2185 	    }
2186 	  }
2187 	  if(! xset_d[k]) {
2188 	    if((mass_d[k] = ILogsum(mass_d[k], pmx->dmx[i][kp])) > thresh) {
2189 	      cp9b->pn_max_d[k] = i;
2190 	      xset_d[k] = TRUE;
2191 	    }
2192 	  }
2193 	}
2194     }
2195   /* note boundary conditions, i = 0 */
2196   if(INBAND(0,0)) {
2197     assert(kmin[0] == 0);
2198     if(! xset_m[0]) {
2199       if((mass_m[0] = ILogsum(mass_m[0], pmx->mmx[0][0])) > thresh) {
2200 	cp9b->pn_max_m[0] = 0;
2201 	xset_m[0] = TRUE;
2202       }
2203     }
2204   }
2205   /* mass_i[0] is unchaged because b/c pmx->imx[0][0] is -INFTY, set above */
2206   /* mass_d[0] is unchaged because b/c pmx->dmx[0][0] is -INFTY, set above */
2207   kn = ESL_MAX(kmin[i], 1);
2208   kx = kmax[i];
2209   kp = kn - kmin[i];
2210   for(k = kn; k <= kx; k++, kp++) {
2211     /* mass_m[k] doesn't change b/c pmx->mmx[0][k] is -INFTY */
2212     /* mass_i[k] doesn't change b/c pmx->mmx[0][k] is -INFTY */
2213     if(!xset_d[k]) {
2214       if((mass_d[k] = ILogsum(mass_d[k], pmx->dmx[0][kp])) > thresh) {
2215 	cp9b->pn_max_d[k] = 0;
2216 	xset_d[k] = TRUE;
2217       }
2218     }
2219   }
2220 
2221   /* new technique as of EPN, Sun Jan 27 08:48:34 2008 */
2222   /* Some states may not have had their min/max set. This occurs if the entire
2223    * state is outside the band (i.e. the summed probability the state is entered for ANY i
2224    * is less than our threshold. Current strategy in this situation is to set the
2225    * pn_min_* and pn_max_* values as special flags, (-2) so the function that
2226    * uses them to derive i and j bands knows this is the case and handles it
2227    * accordingly.
2228    */
2229   int mset;
2230   int dset;
2231   for(k = 0; k <= M; k++) {
2232     mset = dset = TRUE;
2233     /* theoretically either nset_*[k] and xset_*[k] should be either both TRUE or both
2234      * FALSE, but I'm slightly worried about rare precision issues, so we check if one
2235      * or the other is unset, and if so, we set both to argmax position */
2236     if(((! nset_m[k])) || (! xset_m[k]) || (cp9b->pn_max_m[k] < cp9b->pn_min_m[k])) {
2237       cp9b->pn_min_m[k] = cp9b->pn_max_m[k] = -1;
2238       mset = FALSE;
2239     }
2240     if(((! nset_i[k])) || (! xset_i[k]) || (cp9b->pn_max_i[k] < cp9b->pn_min_i[k])) {
2241       cp9b->pn_min_i[k] = cp9b->pn_max_i[k] = -1;
2242     }
2243     if(((! nset_d[k])) || (! xset_d[k]) || (cp9b->pn_max_d[k] < cp9b->pn_min_d[k])) {
2244       cp9b->pn_min_d[k] = cp9b->pn_max_d[k] = -1;
2245       dset = FALSE;
2246     }
2247     if((!hmm_is_localized) && (mset == FALSE && dset == FALSE)) ESL_FAIL(eslEINCONCEIVABLE, errbuf, "node: %d match nor delete HMM state bands were set in non-localized, non-scanning HMM, lower tau (should be << 0.5).\n", k);
2248   }
2249 
2250   cp9b->pn_min_d[0] = -1; /* D_0 doesn't exist */
2251   cp9b->pn_max_d[0] = -1; /* D_0 doesn't exist */
2252 
2253   if(debug_level > 0) cp9_DebugPrintHMMBands(stdout, L, cp9b, (1.-p_thresh), 1);
2254 
2255   free(mass_m);
2256   free(mass_i);
2257   free(mass_d);
2258   free(nset_m);
2259   free(nset_i);
2260   free(nset_d);
2261   free(xset_m);
2262   free(xset_i);
2263   free(xset_d);
2264   free(kthresh_m);
2265   free(kthresh_i);
2266   free(kthresh_d);
2267 
2268   return eslOK;
2269 
2270  ERROR:
2271   ESL_FAIL(status, errbuf, "Memory allocation error.\n");
2272 }
2273 
2274 
2275 
2276 /* Function: p7_Seq2Bands
2277  * Date    : EPN, Fri Aug 15 14:29:01 2008
2278  *
2279  * Purpose:  Given a CM with a valid Plan 7 HMM - run the MSV algorithm
2280  *           to determine bands to be used on the CP9 HMM parse.
2281  *
2282  * Args:     cm          - the covariance model
2283  *           errbuf      - char buffer for reporting errors
2284  *           P7_PROFILE  - generic profile
2285  *           P7_GMX      - generic P7 dp matrix
2286  *           P7_BG       - P7 null model
2287  *           P7_TR       - P7 trace
2288  *           dsq         - sequence in digitized form
2289  *           L           - length of sequence we're aligning (1..L)
2290  *           phi         - phi array, phi[k][v] is expected number of times (probability)
2291  *                         state v (0 = match, 1 insert, 2 = delete) in
2292  *                         node k is *entered*. Node 0 is special, state 0 = B state, state 1 = N_state, state 2 = NULL
2293  *                         Calculated *without* taking insert->insert transitions into account.
2294  *           sc7         - minimum score to allow as a pin, 0. to allow any score
2295  *           len7        - min n-mer size, 1 to allow any size
2296  *           end7        - min distance from end to allow, prune away any others, 0 to not prune based on end proximity
2297  *           mprob7      - min match phi probability to allow in a pin
2298  *           mcprob7     - min cumulative match phi probability to allow in a nmer pin
2299  *           iprob7      - max insert phi probability to allow in a pin
2300  *           ilprob7     - max insert phi probability to allow in a state to the left of a pin
2301  *           ret_i2k     - [0.i..L] = k, residue i emitted from node k's match state in MSV trace
2302  *           ret_kmin    - [0.i..L] = k, min node k for residue i
2303  *           ret_kmax    - [0.i..L] = k, max node k for residue i
2304  *           ret_ncells  - number of cells within bands, to return
2305  *
2306  * Return:  eslOK on success;
2307  *
2308  */
2309 int
p7_Seq2Bands(CM_t * cm,char * errbuf,P7_PROFILE * gm,P7_GMX * gx,P7_BG * bg,P7_TRACE * p7_tr,ESL_DSQ * dsq,int L,double ** phi,float sc7,int len7,int end7,float mprob7,float mcprob7,float iprob7,float ilprob7,int pad7,int ** ret_i2k,int ** ret_kmin,int ** ret_kmax,int * ret_ncells)2310 p7_Seq2Bands(CM_t *cm, char *errbuf, P7_PROFILE *gm, P7_GMX *gx, P7_BG *bg, P7_TRACE *p7_tr, ESL_DSQ *dsq, int L,
2311 	     double **phi, float sc7, int len7, int end7, float mprob7, float mcprob7, float iprob7, float ilprob7, int pad7,
2312 	     int **ret_i2k, int **ret_kmin, int **ret_kmax, int *ret_ncells)
2313 {
2314   int   status;
2315   float usc, nullsc;
2316   int *k2i, *i2k;
2317   float *isc;
2318   int *iconflict;
2319   int *kmin, *kmax;
2320   int ncells;
2321 
2322   /* setup for all modes */
2323   p7_bg_SetLength(bg, L);
2324   p7_bg_NullOne(bg, dsq, L, &nullsc);
2325 
2326   /* generic mode setup */
2327   p7_gmx_GrowTo(gx, cm->mlp7->M, L);
2328   p7_ReconfigLength(gm, L);
2329   gx->M = cm->mlp7->M;
2330   gx->L = L;
2331 
2332   /* Step 1: MSV algorithm M->M transitions only
2333    * Step 2: traceback MSV to get pins
2334    * Step 3: prune pins
2335    * Step 4: pins -> bands
2336    */
2337 
2338   /* Step 1: MSV algorithm */
2339 
2340   /* optimized MSV */
2341   /*
2342     p7_oprofile_ReconfigLength(cm->mlp7_om, L);
2343     esl_stopwatch_Start(watch);
2344     my_p7_MSVFilter(dsq, L, cm->mlp7_om, ox, gx, &usc);
2345     got    esl_stopwatch_Stop(watch);
2346     FormatTimeString(time_buf, watch->user, TRUE);
2347     fprintf(stdout, "OMSV  %8.2f  %11s\n", ((usc -nullsc) / eslCONST_LOG2), time_buf);
2348   */
2349 
2350   p7_GMSV(dsq, L, gm, gx, 2.0, &usc);
2351 
2352   /* Step 2: traceback MSV */
2353   status = my_p7_GTraceMSV(dsq, L, gm, gx, p7_tr, &i2k, &k2i, &isc, &iconflict);
2354 
2355   /* Step 3: prune pins */
2356   if(status == eslOK) { /* trace is valid */
2357     prune_i2k(i2k, iconflict, isc, L, phi, sc7, len7, end7, mprob7, mcprob7, iprob7, ilprob7);
2358   }
2359   else if (status == eslEINCOMPAT) { /* trace was discontiguous, abort! remove all pins */
2360     esl_vec_ISet(k2i, (cm->mlp7->M+1), -1);
2361     esl_vec_ISet(i2k, (L+1), -1);
2362     return status;
2363   }
2364 
2365   /* Step 4: pins -> bands */
2366   if((status = p7_pins2bands(i2k, errbuf, L, cm->clen, pad7, &kmin, &kmax, &ncells)) != eslOK) return status;
2367   /*DumpP7Bands(stdout, i2k, kmin, kmax, L); */
2368 
2369   /* print gmx in heatmap format */
2370   /*
2371     ESL_DMATRIX *D;
2372     double min;
2373     double max;
2374     FILE *hfp;
2375     p7_gmx_Match2DMatrix(gx, TRUE, &D, &min, &max);
2376     hfp = fopen("cur.ps", "w");
2377     my_dmx_Visualize(hfp, D, 0.01, max, 0.01);
2378     fclose(hfp);
2379     esl_dmatrix_Destroy(D);
2380     */
2381 
2382   *ret_i2k  = i2k;
2383   *ret_kmin = kmin;
2384   *ret_kmax = kmax;
2385   *ret_ncells = ncells;
2386 
2387   free(iconflict);
2388   free(isc);
2389   free(k2i);
2390 
2391   return eslOK;
2392 }
2393 
2394 
2395 /**************************************************************
2396  * Function: CP9NodeForPosnP7B()
2397  * Incept:   EPN, Tue Aug 19 14:30:51 2008
2398  *
2399  * Purpose:  Given a P7 banded CP9 posterior matrix,
2400  *           determine the node of the CP9 HMM that is most likely to
2401  *           have emitted (from either its Match or Insert state)
2402  *           a given posn in the target sequence.
2403  *
2404  * Args:     hmm       - the CM plan 9 HMM
2405  *           errbuf    - for error messages
2406  *           x         - posn of target subsequence we're interested in
2407  *           L         - last position of target sequence
2408  *           post      - the posterior matrix for the hmm
2409  *           kn        - min node k for residue x
2410  *           kx        - max node k for residue x
2411  *           ret_node  - RETURN: index of node with highest probability of emitting x
2412  *           ret_type  - RETURN: type of state in ret_node with highest probability
2413  *           print_flag- TRUE to print out info on most likely node
2414  *
2415  *
2416  * Returns:  eslOK on success;
2417  *           eslEINVAL on contract violation.
2418  *           eslEINCOMPAT if kmin[x] >=
2419  */
2420 int
CP9NodeForPosnP7B(CP9_t * hmm,char * errbuf,int x,CP9_MX * post,int kn,int kx,int * ret_node,int * ret_type,int print_flag)2421 CP9NodeForPosnP7B(CP9_t *hmm, char *errbuf, int x, CP9_MX *post,
2422 		  int kn, int kx, int *ret_node, int *ret_type, int print_flag)
2423 {
2424   /* post->mmx[i][kp]: posterior probability that posn i was emitted from node k's
2425      match state, k = kp + kmin[i] */
2426   int  max_k;    /* node index with highest posterior probability of emitting posn x */
2427   int  max_type; /* type of state in max_k node with max probability '0' for match,
2428 		    '1' for insert */
2429   int  max_sc;   /* score (log probability) from post matrix for max_k node max_type state type */
2430   int  k;        /* counter over nodes */
2431   int  kp;       /* k': k offset in position x's band */
2432 
2433   if(kn > kx) ESL_FAIL(eslEINVAL, errbuf, "ERROR in CP9NodeForPosn(), kn (%d) > kx (%d)\n", kn, kx);
2434 
2435   kp = 0;
2436   k  = kn;
2437   if(post->mmx[x][0] > post->imx[x][0]) {
2438     max_sc     = post->mmx[x][0];
2439     max_type   = 0; /* match */
2440   }
2441   else {
2442     max_sc     = post->imx[x][0];
2443     max_type   = 1; /* insert */
2444   }
2445   max_k = k;
2446 
2447   /* move left to right through HMM nodes */
2448   for(k = kn+1, kp = 1; k <= kx; k++, kp++) {
2449     if(post->mmx[x][kp] > max_sc) {
2450       max_k  = k;
2451       max_sc = post->mmx[x][kp];
2452       max_type = 0; /* match */
2453     }
2454     if(post->imx[x][kp] > max_sc) {
2455       max_k  = k;
2456       max_sc = post->imx[x][kp];
2457       max_type = 1; /* insert */
2458     }
2459   }
2460 
2461   if(print_flag) {
2462     if(max_type == 0) printf("MATCH  | mx->mmx[%3d][%3d]: %9d | %8f\n", x, max_k, post->mmx[x][max_k-kn],  Score2Prob(post->mmx[x][max_k-kn], 1.));
2463     else      	      printf("INSERT | mx->imx[%3d][%3d]: %9d | %8f\n", x, max_k, post->imx[x][max_k-kn], Score2Prob(post->imx[x][max_k-kn], 1.));
2464   }
2465   *ret_node = max_k;
2466   *ret_type = max_type;
2467   return eslOK;
2468 }
2469 
2470 /* Function: P7BandsAdjustForSubCM()
2471  * Incept:   EPN, Tue Aug 19 14:47:55 2008
2472  *
2473  * Purpose:  Correct k bands kmin, kmax built from an original CM for it's sub CM model
2474  *           that models spos..epos.
2475  *
2476  * Args:     kmin     - [0.i..L] = k, min node k for residue i
2477  *           kmax     - [0.i..L] = k, max node k for residue i
2478  *           L        - length of current sequence
2479  *           spos     - min k valid in sub CM
2480  *           epos     - max k valid in sub CM
2481  *
2482  * Return:   <eslOK> on success.
2483  *
2484  */
2485 int
P7BandsAdjustForSubCM(int * kmin,int * kmax,int L,int spos,int epos)2486 P7BandsAdjustForSubCM(int *kmin, int *kmax, int L, int spos, int epos)
2487 {
2488   int i;
2489   int M = epos - spos + 1;
2490   for(i = 0; i <= L; i++) {
2491     kmin[i] = ESL_MAX(kmin[i] - (spos-1), 0);
2492     kmin[i] = ESL_MIN(kmin[i], M);
2493 
2494     kmax[i] = ESL_MAX(kmax[i] - (spos-1), 0);
2495     kmax[i] = ESL_MIN(kmax[i], M);
2496   }
2497   kmin[0] = 0; /* hard-coded, M_0 is begin state, it must emit full sequence */
2498 
2499   return eslOK;
2500 }
2501