1 /* CM_TR_OPTS and CM_TR_PENALTIES: data structures with information
2  * relevant to truncated DP functions.
3  *
4  * Contents:
5  *    1. CM_TR_PENALTIES data structure functions,
6  *       info on truncated alignment score penalties.
7  *
8  * EPN, Sat Jan 21 14:02:26 2012
9  */
10 #include "esl_config.h"
11 #include "p7_config.h"
12 #include "config.h"
13 
14 #include <stdlib.h>
15 #include <string.h>
16 #include <limits.h>
17 
18 #include "easel.h"
19 #include "esl_vectorops.h"
20 
21 #include "hmmer.h"
22 
23 #include "infernal.h"
24 
25 /*****************************************************************
26  *   1. CM_TR_PENALTIES data structure functions,
27  *       info on truncated alignment score penalties.
28  *****************************************************************/
29 
30 /* Function: cm_tr_penalties_Create()
31  * Date:     EPN, Sat Jan 21 12:03:52 2012
32  *
33  * Purpose:  Allocate and initialize a CM_TR_PENALTIES object.
34  *           A CM and its emit map are required to determine
35  *           truncation penalty scores. This is annoyingly
36  *           complex, see verbose notes within code below.
37  *
38  *           Some of the code in this function, specifically
39  *           that which calculates the probability of a fragment
40  *           aligning at a given node, is checkable, but only
41  *           if we disallow truncated begins into insert states.
42  *           However, we want to allow truncated begins in reality.
43  *           I've left in a flag for ignoring inserts (<ignore_inserts>)
44  *           I used in testing this function. Set it to TRUE to
45  *           perform the test.
46  *
47  * Returns:  Newly allocated CM_TR_PENALTIES object. NULL if out
48  *           of memory.
49  */
50 CM_TR_PENALTIES *
cm_tr_penalties_Create(CM_t * cm,int ignore_inserts,char * errbuf)51 cm_tr_penalties_Create(CM_t *cm, int ignore_inserts, char *errbuf)
52 {
53   int status;
54   int v, nd, m, i1, i2;
55   int lpos, rpos;
56   int i;
57 
58   /* variables used for determining ratio of inserts to match at each consensus position */
59   float   *mexpocc  = NULL;  /* [0..c..clen] probability match  state is used to emit at cons posn c */
60   float   *iexpocc  = NULL;  /* [0..c..clen] probability insert state is used to emit after cons posn c */
61   double  *psi      = NULL;  /* [0..v..M-1]  expected occupancy of state v */
62   float    m_psi, i1_psi, i2_psi; /* temp psi values */
63   float    summed_psi;
64   CM_TR_PENALTIES *trp = NULL;
65 
66   /* variables used for calculating global truncation penalties */
67   float g_5and3; /* fragment probability if 5' and 3' truncation are allowed */
68   float g_5or3;  /* fragment probability if 5' or  3' truncation are allowed */
69 
70   /* variables used for calculating local truncation penalties */
71   float *begin = NULL;           /* local begin probabilities 0..v..M-1 */
72   int   subtree_clen;            /* consensus length of subtree under this node */
73   float prv53, prv5, prv3;       /* previous node's fragment probability, 5'&3', 5' only, 3'only */
74   float cur53, cur5, cur3;       /* current node's fragment probability, 5'&3', 5' only, 3'only */
75   int   nfrag53, nfrag5, nfrag3; /* number of fragments, 5'&3', 5' only, 3'only */
76 
77   if(cm == NULL || cm->emap == NULL) goto ERROR;
78 
79   ESL_ALLOC(trp, sizeof(CM_TR_PENALTIES));
80 
81   trp->M = cm->M;
82   trp->ignored_inserts = ignore_inserts;
83 
84   /* Define truncation penalties for each state v. This will be
85    * the score for doing a truncated begin into state v.
86    *
87    * Important note: For this discussion we assume that sequences can
88    * only be truncated at consensus positions, which means we don't
89    * have to worry about truncated begins into inserts. This is an
90    * approximation (also made by Diana and Sean in the 2009 trCYK
91    * paper) that greatly simplifies the explanation of the calculation
92    * of the truncation penalties.  The examples in my ELN3 notebook
93    * also use this simplification. However, I need to be able to do
94    * truncated begins into insert states in some cases (some pass/mode
95    * combinations see ELN bottom of p.47). I explain first the
96    * rationale for calculating truncation penalties ignoring inserts
97    * and then I describe how I adapt those penalties to allow
98    * for inserts.
99    *
100    * This is a lengthy comment. I've divided it into 3 sections:
101    * Section 1. Global mode truncation penalties, ignoring inserts.
102    * Section 2. Local mode truncation penalties, ignoring inserts.
103    * Section 3. Adapting truncation penalties to allow for inserts.
104    *
105    **************************************************************
106    * Section 1. Global mode truncation penalties, ignoring inserts.
107    *
108    * We want the truncation penalty to be the log of the probability
109    * that the particular fragment we're aligning was generated from
110    * the following generative process. The generative process differs
111    * between global and local mode.
112    *
113    * In global mode:
114    * o Sample global parsetree which spans consensus positions 1..clen.
115    * o Randomly choose g and h in range 1..clen, where h >= g and
116    *   truncate sequence from g..h. The first residue will either be
117    *   an insert before position g, or a match at position g of the
118    *   model. The final residue will either be an insert after position
119    *   h or a match at position h of the model.
120    *
121    * All g,h fragments are equiprobable, so the probability of any
122    * particular fragment is 2 / (clen * (clen+1)). So log_2 of this
123    * value is the truncation penalty for all truncated alignments in
124    * global mode where both 5' and 3' truncation are allowed.
125    *
126    * We store this penalty, per-state in the
127    * g_ptyAA[TRPENALTY_5P_AND_3P][0..v..M-1].  The penalty is
128    * identical for all emitting states. The penalty value for
129    * non-emitters is IMPOSSIBLE because truncated begins are
130    * not allowed into non-emitters.
131    *
132    * If only 5' OR 3' truncation is allowed, we only truncate at g or
133    * h, which menas there's 1/clen possible fragments and log_2
134    * (1/clen) is our global truncation penalty.
135    *
136    * However, if 5' truncation is allowed we can only do a truncated
137    * begin into states that with a consensus subtree that spans
138    * position clen (since we don't allow a truncation at the 3' end).
139    * Thus any state whose subtree that doesn't span clen gets
140    * an IMPOSSIBLE value for its truncation score in:
141    * g_ptyAA[TRPENALTY_5P_ONLY][0..v..M-1].
142    *
143    * Likewise, if 3' truncation is allowed we can only do a truncated
144    * begin into states that with a consensus subtree that spans
145    * position 1 (since we don't allow a truncation at the 5' end).
146    *
147    * There's an example of computing all three types of penalties for
148    * a simple CM in ELN 3 p43.
149    *
150    ************************************************************
151    * Section 2. Local mode truncation penalties, ignoring inserts.
152    *
153    * Generative process that generates fragments in local mode:
154    * o Sample local begin state b with consensus subtree from i..j from
155    *   local begin state distribution.
156    * o Randomly choose g and h in range i..j, where h >= g and
157    *   truncate sequence from g..h. The first residue will either be
158    *   an insert before position g, or a match at position g of the
159    *   model. The final residue will either be an insert after position
160    *   h or a match at position h of the model.
161    *
162    * Unlike in global mode, in local mode all fragments are not
163    * equiprobable since the local begin state distribution can be
164    * anything, and each b allows different sets of fragments to be
165    * generated (because they can only span from i to j).
166    *
167    * The truncation penalty should be the log of the probability of
168    * aligning the current fragment to the model. So we need to know
169    * the probability of generating each possible fragment.
170    * We could calculate probability of any fragment g,h with the
171    * following inefficient algorithm:
172    *
173    * For each start fragment point g,
174    *   For each start fragment point h,
175    *     For each state v,
176    *       If lpos[v] <= g && rpos[v] >= h, then
177    *       prob[g][h] += begin[v] * 2. / (st_clen[v] * (st_clen[v]+1));
178    *
179    * Where lpos[v]/rpos[v] are the left/right consensus positions in
180    * consensus subtree rooted at state v. And st_clen[v] is rpos[v] -
181    * lpos[v] + 1, the consensus length of that subtree.
182    *
183    * This gives us prob[g][h], the probability of generating fragment
184    * g,h. But we want to apply the penalty to a state, not to a
185    * fragment, to avoid needing to know the fragment boundaries g,h
186    * during the DP recursion when applying the penalty.
187    *
188    * To facilitate this, we need to find state t, the state with
189    * smallest subtree that contains g,h. State t is relevant because
190    * it is the state which will root the alignment of the fragment g,h
191    * by using a truncated begin transition into t. This gives a new
192    * algorithm:
193    *
194    * For each start fragment point g,
195    *   For each start fragment point h,
196    *     Identify state t, the max valued state for which
197    *       lpos[v] <= g && rpos[v] >= h, then {
198    *         prob[t] += prob[g][h]
199    *         fcount[t]++;
200    *       }
201    *
202    * prob[t] will be the probability of observing an alignment that
203    * uses a truncated begin into t to align any fragment. Then we take
204    * average over all fragments: prob[t] / fcount[t] (since we'll only
205    * be aligning one of those fragments) and use the log of that
206    * probability as the penalty for observing a truncated alignment
207    * rooted at state t. Conveniently, it turns out that all fragments
208    * that share t are equiprobable (have equal prob[g][h] values), so
209    * the average probability is the actual probability for each
210    * fragment, and thus the correct penalty to apply.
211    *
212    * Fortunately, we can compute the correct penalty much more
213    * efficiently than the two algorithms shown above. The
214    * efficient way is implemented below. A test that the penalties
215    * are correctly computed is in cm_tr_penalties_Validate().
216    *
217    * This discussion assumes we're truncating 5' and 3', but if we're
218    * only truncating 5' or 3' The situation is a little different.
219    *
220    * There's an example of computing all three types of penalties for
221    * a simple CM in ELN3 p44-45.
222    *
223    ************************************************************
224    * Section 3. Adapting truncation penalties to allow for inserts.
225    *
226    * We need to be able to do truncated begins into insert states
227    * because we enforce that the first/final residue of a sequence be
228    * included in 5'/3' truncated alignments and we want to be able
229    * to properly align those residues if they're probably emitted
230    * by insert states.
231    *
232    * The methods/logic explained in sections 1 and 2 above I believe
233    * is correct IF we ignore inserts (assume truncated begins into
234    * them are impossible). But we need to allow inserts, so I modify
235    * the truncation penalties as described above to allow for inserts
236    * as follows. We can calculate the appropriate truncated begin
237    * penalty for all MATP_MP, MATL_ML, MATR_MR, BIF_B states as with
238    * the methods described above by ignoring inserts. This gives us a
239    * probability p of using that state as the root of the truncated
240    * alignment, i.e. the truncated begin state. (The log_2 of this
241    * probability is the penalty.) We then partition p amongst the
242    * MATP_MP, MATL_ML, MATR_MR, BIF_B states and any parent insert
243    * states, i.e. any insert state that can transition into the
244    * match/bif state. For each match/bif state there's 0, 1 or 2
245    * parent inserts. We then partition p based on the relative
246    * expected occupancy of these inserts versus the match/bif state.
247    *
248    * This is certainly 'incorrect' in that it doesn't reflect the
249    * true probability of a fragment being aligned to each of the
250    * states, but it should be a close approximation. I think doing
251    * it correctly is basically impossible in the context of a single
252    * state-specific penalty (i.e. the penalty would have to be per-fragment
253    * which would be hard to deal with in the DP functions).
254    */
255 
256   /* allocate and initialize the penalty arrays */
257   ESL_ALLOC(trp->g_ptyAA,  sizeof(float *) * NTRPENALTY);
258   ESL_ALLOC(trp->l_ptyAA,  sizeof(float *) * NTRPENALTY);
259   ESL_ALLOC(trp->ig_ptyAA, sizeof(int *)   * NTRPENALTY);
260   ESL_ALLOC(trp->il_ptyAA, sizeof(int *)   * NTRPENALTY);
261 
262   for(i = 0; i < NTRPENALTY; i++) {
263     trp->g_ptyAA[i]  = NULL;
264     trp->l_ptyAA[i]  = NULL;
265     trp->il_ptyAA[i] = NULL;
266     trp->ig_ptyAA[i] = NULL;
267     ESL_ALLOC(trp->g_ptyAA[i],  sizeof(float) * cm->M);
268     ESL_ALLOC(trp->l_ptyAA[i],  sizeof(float) * cm->M);
269     ESL_ALLOC(trp->ig_ptyAA[i], sizeof(int)   * cm->M);
270     ESL_ALLOC(trp->il_ptyAA[i], sizeof(int)   * cm->M);
271     esl_vec_FSet(trp->g_ptyAA[i],   cm->M, IMPOSSIBLE);
272     esl_vec_FSet(trp->l_ptyAA[i],   cm->M, IMPOSSIBLE);
273     esl_vec_ISet(trp->ig_ptyAA[i],  cm->M, -INFTY);
274     esl_vec_ISet(trp->il_ptyAA[i],  cm->M, -INFTY);
275   }
276 
277   /* DumpEmitMap(stdout, cm->emap, cm); */
278 
279   /* Calculate local begin probabilities and expected occupancy */
280   ESL_ALLOC(begin, sizeof(float) * cm->M);
281   cm_CalculateLocalBeginProbs(cm, cm->pbegin, cm->t, begin);
282   if((status = cm_ExpectedPositionOccupancy(cm, &mexpocc, &iexpocc, &psi, NULL, NULL, NULL)) != eslOK) goto ERROR;
283 
284   /* Fill global and local truncation penalties in a single loop. We
285    * step through all nodes and set the truncation penalties for the
286    * MATP_MP, MATL_ML, MATR_MR, and BIF_B states and any parent
287    * inserts (i1, i2) of those states.
288    */
289   g_5and3 = 2. / (cm->clen * (cm->clen+1)); /* for global mode: probability of all fragments if we're truncating 5' and 3' */
290   g_5or3  = 1. / cm->clen;                  /* for global mode: probability of all fragments if we're only truncating 5' or  3' */
291 
292   prv5 = prv3 = prv53 = 0.; /* initialize 'previous' probability values used for calc'ing local truncation penalties */
293   for(nd = 0; nd < cm->nodes; nd++) {
294     lpos = (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) ? cm->emap->lpos[nd] : cm->emap->lpos[nd] + 1;
295     rpos = (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) ? cm->emap->rpos[nd] : cm->emap->rpos[nd] - 1;
296 
297     /* now set penalties for match and insert states m, i1 and maybe i2 (if we're a MATP_MP or BIF_B) */
298     if(cm->ndtype[nd] == END_nd) {
299       prv5 = prv3 = prv53 = 0.;
300     }
301     else if(cm->ndtype[nd] == BEGL_nd || cm->ndtype[nd] == BEGR_nd) {
302       prv5  = (cm->ndtype[nd] == BEGL_nd) ? 0. : trp->l_ptyAA[TRPENALTY_5P_ONLY][cm->plast[cm->nodemap[nd]]];  /* parent BIF_B's probability */;
303       prv3  = (cm->ndtype[nd] == BEGR_nd) ? 0. : trp->l_ptyAA[TRPENALTY_3P_ONLY][cm->plast[cm->nodemap[nd]]];  /* parent BIF_B's probability */;
304       prv53 = trp->l_ptyAA[TRPENALTY_5P_AND_3P][cm->plast[cm->nodemap[nd]]];  /* parent BIF_B's probability */
305     }
306     else if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BIF_nd) {
307       /* determine match states and insert states that pertain to this node */
308       m = cm->nodemap[nd]; /* MATP_MP, MATL_ML, MATR_MR, or BIF_B */
309       InsertsGivenNodeIndex(cm, nd-1, &i1, &i2);
310 
311       m_psi = psi[m];
312       if(cm->ndtype[nd] == MATP_MP) { m_psi += (psi[m+1] + psi[m+2]); } /* include MATP_ML and MATP_MR psi */
313       i1_psi = (i1 == -1) ? 0. : psi[i1];
314       i2_psi = (i2 == -1) ? 0. : psi[i2];
315       summed_psi = m_psi + i1_psi + i2_psi;
316       if(ignore_inserts) {
317 	i1_psi = i2_psi = 0.;
318 	summed_psi = m_psi;
319       }
320 
321       /* Global penalties */
322       /* sanity check, we should only set truncation penalty once per state */
323       if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][m]))  goto ERROR;
324       if((i1 != -1) && NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][i1])) goto ERROR;
325       if((i2 != -1) && NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][i2])) goto ERROR;
326       /* divide up the probability g_5and3 amongst relevant states m, i1, i2, weighted by psi */
327       trp->g_ptyAA[TRPENALTY_5P_AND_3P][m]  = (m_psi  / summed_psi) * g_5and3;
328       if(i1 != -1) trp->g_ptyAA[TRPENALTY_5P_AND_3P][i1] = (i1_psi / summed_psi) * g_5and3;
329       if(i2 != -1) trp->g_ptyAA[TRPENALTY_5P_AND_3P][i2] = (i2_psi / summed_psi) * g_5and3;
330 
331       /* same thing, for 5P only and 3P only */
332       if(rpos == cm->clen) { /* else it will remain IMPOSSIBLE */
333 	trp->g_ptyAA[TRPENALTY_5P_ONLY][m]  = (m_psi / summed_psi) * g_5or3;
334 	if(i1 != -1) trp->g_ptyAA[TRPENALTY_5P_ONLY][i1] = (i1_psi / summed_psi) * g_5or3;
335 	if(i2 != -1) trp->g_ptyAA[TRPENALTY_5P_ONLY][i2] = (i2_psi / summed_psi) * g_5or3;
336       }
337       if(lpos == 1) { /* else it will remain IMPOSSIBLE */
338 	trp->g_ptyAA[TRPENALTY_3P_ONLY][m]  = (m_psi  / summed_psi) * g_5or3;
339 	if(i1 != -1) trp->g_ptyAA[TRPENALTY_3P_ONLY][i1] = (i1_psi / summed_psi) * g_5or3;
340 	if(i2 != -1) trp->g_ptyAA[TRPENALTY_3P_ONLY][i2] = (i2_psi / summed_psi) * g_5or3;
341       }
342 
343       /* Local penalties */
344       subtree_clen = rpos - lpos + 1;
345       nfrag5  = subtree_clen;
346       nfrag3  = subtree_clen;
347       nfrag53 = (subtree_clen * (subtree_clen+1)) / 2;
348 
349       /* determine probability of observing a fragment aligned at
350        * state m (here, m is what I call t above and in notes) and
351        * partition that probability between m and i1 and/or i2 by
352        * relative occupancy of match versus inserts
353        */
354       cur5  = begin[m] / (float) nfrag5  + prv5;
355       cur3  = begin[m] / (float) nfrag3  + prv3;
356       cur53 = begin[m] / (float) nfrag53 + prv53;
357 
358       /* sanity check, we should only set truncation penalty once per state */
359       if(NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][m]))  goto ERROR;
360       if((i1 != -1) && NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][i1])) goto ERROR;
361       if((i2 != -1) && NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][i2])) goto ERROR;
362 
363       trp->l_ptyAA[TRPENALTY_5P_AND_3P][m]  = (m_psi  / summed_psi) * cur53;
364       if(i1 != -1) trp->l_ptyAA[TRPENALTY_5P_AND_3P][i1] = (i1_psi / summed_psi) * cur53;
365       if(i2 != -1) trp->l_ptyAA[TRPENALTY_5P_AND_3P][i2] = (i2_psi / summed_psi) * cur53;
366 
367       trp->l_ptyAA[TRPENALTY_5P_ONLY][m]  = (m_psi  / summed_psi) * cur5;
368       if(i1 != -1) trp->l_ptyAA[TRPENALTY_5P_ONLY][i1] = (i1_psi / summed_psi) * cur5;
369       if(i2 != -1) trp->l_ptyAA[TRPENALTY_5P_ONLY][i2] = (i2_psi / summed_psi) * cur5;
370 
371       trp->l_ptyAA[TRPENALTY_3P_ONLY][m]  = (m_psi  / summed_psi) * cur3;
372       if(i1 != -1) trp->l_ptyAA[TRPENALTY_3P_ONLY][i1] = (i1_psi / summed_psi) * cur3;
373       if(i2 != -1) trp->l_ptyAA[TRPENALTY_3P_ONLY][i2] = (i2_psi / summed_psi) * cur3;
374 
375       prv5  = (cm->ndtype[nd] == MATL_nd) ? cur5 : 0.;
376       prv3  = (cm->ndtype[nd] == MATR_nd) ? cur3 : 0.;
377       prv53 = cur53;
378     }
379   }
380 
381   /* all penalties are currently probabilities, convert them to log
382    * probs and set integer penalties (careful, we have to check if
383    * IMPOSSIBLE first)
384    */
385   for(v = 0; v < cm->M; v++)
386     {
387       if((cm->stid[v] == MATP_MP || cm->stid[v] == MATL_ML || cm->stid[v] == MATR_MR || cm->stid[v] == BIF_B) ||
388 	 ((cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) && (! StateIsDetached(cm, v))))
389 	{
390 	  /* Check for rare special case: if we're a MATP_IL and next
391 	   * two states are MATP_IR and END_E, then we won't have set
392 	   * a trunction penalty. This state will keep an impossible
393 	   * truncated begin score, if we did a truncated begin into
394 	   * it we'd just emit from the MATP_IL and then go to the
395 	   * END_E anyway (the MATP_IR will be detached.
396 	   */
397 	  if(cm->stid[v] == MATP_IL && cm->ndtype[cm->ndidx[v]+1] == END_nd) continue;
398 
399 	  /* glocal 5P AND 3P: all of these should have been set to a non-IMPOSSIBLE value */
400 	  if(! NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v])) goto ERROR;
401 	  trp->ig_ptyAA[TRPENALTY_5P_AND_3P][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v], 1.0);
402 	  trp->g_ptyAA[TRPENALTY_5P_AND_3P][v]  = sreLOG2(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v]);
403 
404 	  /* glocal 5P only: some may be IMPOSSIBLE */
405 	  if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_ONLY][v])) {
406 	    trp->ig_ptyAA[TRPENALTY_5P_ONLY][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_5P_ONLY][v], 1.0);
407 	    trp->g_ptyAA[TRPENALTY_5P_ONLY][v]  = sreLOG2(trp->g_ptyAA[TRPENALTY_5P_ONLY][v]);
408 	  }
409 	  /* glocal 5P only: some may be IMPOSSIBLE */
410 	  if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_3P_ONLY][v])) {
411 	    trp->ig_ptyAA[TRPENALTY_3P_ONLY][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_3P_ONLY][v], 1.0);
412 	    trp->g_ptyAA[TRPENALTY_3P_ONLY][v]  = sreLOG2(trp->g_ptyAA[TRPENALTY_3P_ONLY][v]);
413 	  }
414 
415 	  /* local penalties all of these should have been set to a non-IMPOSSIBLE value */
416 	  if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_5P_AND_3P][v])) goto ERROR;
417 	  if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_5P_ONLY][v]))   goto ERROR;
418 	  if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_3P_ONLY][v]))   goto ERROR;
419 
420 	  trp->il_ptyAA[TRPENALTY_5P_AND_3P][v] = Prob2Score(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v], 1.0);
421 	  trp->il_ptyAA[TRPENALTY_5P_ONLY][v]   = Prob2Score(trp->l_ptyAA[TRPENALTY_5P_ONLY][v], 1.0);
422 	  trp->il_ptyAA[TRPENALTY_3P_ONLY][v]   = Prob2Score(trp->l_ptyAA[TRPENALTY_3P_ONLY][v], 1.0);
423 	  trp->l_ptyAA[TRPENALTY_5P_AND_3P][v]  = sreLOG2(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v]);
424 	  trp->l_ptyAA[TRPENALTY_5P_ONLY][v]    = sreLOG2(trp->l_ptyAA[TRPENALTY_5P_ONLY][v]);
425 	  trp->l_ptyAA[TRPENALTY_3P_ONLY][v]    = sreLOG2(trp->l_ptyAA[TRPENALTY_3P_ONLY][v]);
426 	}
427     }
428 
429   if(ignore_inserts) {
430     if((status = cm_tr_penalties_Validate(trp, cm, 0.0001, errbuf)) != eslOK) { printf("%s", errbuf);  goto ERROR; }
431   }
432 
433   /* cm_tr_penalties_Dump(stdout, cm, trp); */
434 
435   if(mexpocc != NULL) free(mexpocc);
436   if(iexpocc != NULL) free(iexpocc);
437   if(psi     != NULL) free(psi);
438   if(begin   != NULL) free(begin);
439 
440   return trp;
441 
442  ERROR:
443   if(mexpocc != NULL) free(mexpocc);
444   if(iexpocc != NULL) free(iexpocc);
445   if(psi     != NULL) free(psi);
446   if(begin   != NULL) free(begin);
447   if(trp     != NULL) cm_tr_penalties_Destroy(trp);
448 
449   return NULL;
450 }
451 
452 /* Function: cm_tr_penalties_IdxForPass()
453  * Date:     EPN, Wed Feb 15 15:07:54 2012
454  *
455  * Purpose:  Return the appropriate truncation
456  *           penalty index given a pipeline pass
457  *           index.
458  *
459  * Returns:  truncation penalty index, either
460  *           TRPENALTY_5P_AND_3P, TRPENALTY_5P_ONLY, or
461  *           TRPENALTY_3P_ONLY.
462  */
463 int
cm_tr_penalties_IdxForPass(int pass_idx)464 cm_tr_penalties_IdxForPass(int pass_idx)
465 {
466   switch(pass_idx) {
467   case PLI_PASS_5P_ONLY_FORCE:   return TRPENALTY_5P_ONLY; break;
468   case PLI_PASS_3P_ONLY_FORCE:   return TRPENALTY_3P_ONLY; break;
469   case PLI_PASS_5P_AND_3P_FORCE: return TRPENALTY_5P_AND_3P; break;
470   case PLI_PASS_5P_AND_3P_ANY:   return TRPENALTY_5P_AND_3P; break;
471   default: return -1; break;
472   }
473 }
474 
475 /* Function: cm_tr_penalties_Validate()
476  * Date:     EPN, Fri Jan 27 14:57:04 2012
477  *
478  * Purpose:  Validate a CM_TR_PENALTIES object by checking that
479  *           all possible fragments in local mode sum to 1.0
480  *           for the three scenarios: 5' and 3' truncation,
481  *           5' truncation only and 3' truncation only.
482  *
483  *           This is an expensive test and was written only to test
484  *           the code that determines fragment probability (really
485  *           only for local mode) in cm_tr_penalties_Create().  It can
486  *           only be run if the <ignore_inserts> flag was set to TRUE
487  *           when cm_tr_penalties_Create() was called.  However, in
488  *           real life that inserts should not be ignored, so this
489  *           test should never actually be run except during testing
490  *           (it also is helpful for understanding the logic behind
491  *           the derivation of the truncated begin
492  *           penalties/probabilities).
493  *
494  * Returns:  eslOK if all checks pass within tolerance level.
495  *           eslFAIL if any check fails, errbuf is filled.
496  */
497 int
cm_tr_penalties_Validate(CM_TR_PENALTIES * trp,CM_t * cm,double tol,char * errbuf)498 cm_tr_penalties_Validate(CM_TR_PENALTIES *trp, CM_t *cm, double tol, char *errbuf)
499 {
500   if(! trp->ignored_inserts) ESL_FAIL(eslFAIL, errbuf, "cm_tr_penalties_Validate(), trp->ignored_inserts flag is not TRUE");
501 
502   /* This is an expensive test of the trp->l_ptyAA values, the truncation
503    * penalties for local mode alignment. We test each of the three arrays
504    * in trp->ptyAA, one each for the following three scenarios:
505    *
506    * 1. trp->l_ptyAA[TRPENALTY_5P_AND_3P][0..v..M-1]: penalty for state v
507    *    when 5' and 3' truncation are allowed.
508    * 2. trp->l_ptyAA[TRPENALTY_5P_ONLY][0..v..M-1]: penalty for state v when
509    *    only 5' truncation is allowed.
510    * 3. trp->l_ptyAA[TRPENALTY_3P_ONLY][0..v..M-1]: penalty for state v when
511    *    only 3' truncation is allowed.
512    *
513    * The test is to enumerate all possible g,h fragments in the
514    * consensus yield 1..clen, for those that can possibly be generated
515    * in the scenario (^), determine the state t with the smallest
516    * subtree yield that contains g..h. This is the state at which an
517    * alignment of a g..h fragment would be rooted. We then add the
518    * probability of a truncated parsetree rooted at v (that is,
519    * exp_2(trp->l_ptyAA[][t])) to a growing sum. After all fragments
520    * are considered the sum should be 1.0.  If it is then our
521    * penalties are valid, if not they're invalid and we computed them
522    * incorrectly.
523    *
524    * (^): When 5' and 3' truncation are both allowed, all fragments can be
525    * generated, but not all fragments (for most models) can be generated if
526    * only 5' or 3' truncation is allowed.
527    *
528    */
529 
530   double sump = 0.;  /* the sum, should be 1.0 after all fragments are considered */
531   int    lpos, rpos; /* left and right consensus positions of a parsetree */
532   int    g, h;       /* fragment start/stop */
533   int    keep_going; /* break the loop when this is set to FALSE */
534   int    nd, v;
535   /* test 1: trp->l_ptyAA[TRPENALTY_5P_AND_3P]: */
536   for(g = 1; g <= cm->clen; g++) {
537     for(h = g; h <= cm->clen; h++) {
538       /* determine which node a truncated parsetree from [a..b] would align to,
539        * this will be lowest node in the model whose subtree spans a..b
540        */
541       nd = cm->nodes-1;
542       keep_going = TRUE;
543       while(keep_going) {
544 	if(nd == 0) ESL_FAIL(eslFAIL, errbuf, "cm_tr_penalties_Validate: 5' and 3' test, unable to find node that spans %d..%d\n", g, h);
545 	lpos = cm->emap->lpos[nd];
546 	rpos = cm->emap->rpos[nd];
547 	if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATL_nd) lpos++; /* lpos was one less than what we want */
548 	if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATR_nd) rpos--; /* rpos was one more than what we want */
549 	if((cm->ndtype[nd] == BIF_nd || cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd) &&
550 	   (lpos <= g && rpos >= h)) {
551 	  keep_going = FALSE;
552 	}
553 	else { nd--; }
554       }
555       v = cm->nodemap[nd];
556       sump += sreEXP2(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v]);
557       /* printf("LRBOTH g: %3d h: %3d nd: %3d adding %10.5f  (%10.5f)\n", g, h, nd, trp->l_ptyAA[TRPENALTY_5P_AND_3P][v], sump); */
558     }
559   }
560   printf("L and R sump:  %.5f\n", sump);
561   if(esl_DCompare(1.0, sump, tol) != eslOK) ESL_FAIL(eslFAIL, errbuf, "cm_tr_penalties_Validate(), 5' and 3' truncation test failed (%g != 1.0)", sump);
562 
563   /* test 2: trp->l_ptyAA[TRPENALTY_5P_ONLY]: */
564   sump = 0.;
565   for(g = 1; g <= cm->clen; g++) {
566     for(h = g; h <= cm->clen; h++) {
567       /* determine which node a truncated parsetree from [g..h] would align to,
568        * this will be lowest node in the model whose subtree spans g..h.
569        * Since we're only truncating on the left, an alignment from
570        * g..h may be impossible, only those fragments for which a node exists with
571        * lpos <= g and rpos==h will be possible.
572        */
573       nd = cm->nodes-1;
574       keep_going = TRUE;
575       while(keep_going && nd > 0) {
576 	lpos = cm->emap->lpos[nd];
577 	rpos = cm->emap->rpos[nd];
578 	if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATL_nd) lpos++; /* lpos was one less than what we want */
579 	if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATR_nd) rpos--; /* rpos was one more than what we want */
580 	if((cm->ndtype[nd] == BIF_nd || cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd) &&
581 	   (lpos <= g && rpos == h)) {
582 	  keep_going = FALSE;
583 	}
584 	else { nd--; }
585       }
586       if(keep_going == FALSE) {
587 	v = cm->nodemap[nd];
588 	sump += sreEXP2(trp->l_ptyAA[TRPENALTY_5P_ONLY][v]);
589       }
590     }
591   }
592   printf("L only  sump:  %.5f\n", sump);
593   if(esl_DCompare(1.0, sump, tol) != eslOK) ESL_FAIL(eslFAIL, errbuf, "cm_tr_penalties_Validate(), 5' only truncation test failed (%g != 1.0)", sump);
594 
595   /* test 3: trp->l_ptyAA[TRPENALTY_3P_ONLY]: */
596   sump = 0.;
597   for(g = 1; g <= cm->clen; g++) {
598     for(h = g; h <= cm->clen; h++) {
599       /* determine which node a truncated parsetree from [g..h] would align to,
600        * this will be lowest node in the model whose subtree spans g..h
601        * since we're only truncating on the right, an alignment from
602        * g..h may be impossible, only those for which a node exists with
603        * lpos==g and rpos >= h will be possible.
604        */
605       nd = cm->nodes-1;
606       keep_going = TRUE;
607       while(keep_going && nd > 0) {
608 	lpos = cm->emap->lpos[nd];
609 	rpos = cm->emap->rpos[nd];
610 	if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATL_nd) lpos++; /* lpos was one less than what we want */
611 	if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATR_nd) rpos--; /* rpos was one more than what we want */
612 	if((cm->ndtype[nd] == BIF_nd || cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd) &&
613 	   (lpos == g && rpos >= h)) {
614 	  keep_going = FALSE;
615 	}
616 	else { nd--; }
617       }
618       if(keep_going == FALSE) {
619 	v = cm->nodemap[nd];
620 	sump += sreEXP2(trp->l_ptyAA[TRPENALTY_3P_ONLY][v]);
621       }
622     }
623   }
624   printf("R only  sump:  %.5f\n", sump);
625   if(esl_DCompare(1.0, sump, tol) != eslOK) ESL_FAIL(eslFAIL, errbuf, "cm_tr_penalties_Validate(), 3' only truncation test failed (%g != 1.0)", sump);
626 
627   return eslOK;
628 }
629 
630 /* Function: cm_tr_penalties_Sizeof()
631  * Date:     EPN, Sat Jan 21 15:37:58 2012
632  *
633  * Purpose:  Calculate and return the size of a CM_TR_PENALTIES
634  *           object in Mb.
635  */
636 
637 float
cm_tr_penalties_Sizeof(CM_TR_PENALTIES * trp)638 cm_tr_penalties_Sizeof(CM_TR_PENALTIES *trp)
639 {
640   float bytes = 0.;
641 
642   if(trp == NULL) return 0.;
643 
644   bytes = sizeof(CM_TR_PENALTIES);
645   bytes += sizeof(float *) * NTRPENALTY; /* g_ptyAA, 1st dim */
646   bytes += sizeof(int *)   * NTRPENALTY; /* ig_ptyAA, 1st dim */
647   bytes += sizeof(float *) * NTRPENALTY; /* l_ptyAA, 1st dim */
648   bytes += sizeof(int *)   * NTRPENALTY; /* il_ptyAA, 1st dim */
649   bytes += sizeof(float) * NTRPENALTY * trp->M; /* g_ptyAA, 2nd dim */
650   bytes += sizeof(int)   * NTRPENALTY * trp->M; /* ig_ptyAA, 2nd dim */
651   bytes += sizeof(float) * NTRPENALTY * trp->M; /* l_ptyAA, 2nd dim */
652   bytes += sizeof(int)   * NTRPENALTY * trp->M; /* il_ptyAA, 2nd dim */
653 
654   return bytes / 1000000.;
655 }
656 
657 /* Function:  cm_tr_penalties_Dump()
658  *
659  * Purpose:   Print contents of the <CM_TR_PENALTIES> <trp> to
660  *            stream <fp> for inspection.
661  *
662  * Returns:   void
663  */
664 void
cm_tr_penalties_Dump(FILE * fp,const CM_t * cm,const CM_TR_PENALTIES * trp)665 cm_tr_penalties_Dump(FILE *fp, const CM_t *cm, const CM_TR_PENALTIES *trp)
666 {
667   int v, nd, subtree_clen;
668 
669   fprintf(fp, "CM_TR_PENALTIES dump\n");
670   fprintf(fp, "--------------------\n");
671   fprintf(fp, "M               = %d\n", trp->M);
672   fprintf(fp, "ignored_inserts = %s\n", trp->ignored_inserts ? "TRUE" : "FALSE");
673   fprintf(fp, "clen            = %d\n", cm->clen);
674 
675   fprintf(fp, "\nglobal/glocal penalties:\n");
676   fprintf(fp, "%5s  %5s  %7s  %7s  %10s  %10s  %10s  %10s  %10s  %10s\n", "stidx", "ndidx", "stid", "st_clen", "f5P_AND_3P", "i5P_AND_3P", "f5P_ONLY", "i5P_ONLY", "f3P_ONLY", "i3P_ONLY");
677   fprintf(fp, "%5s  %5s  %7s  %7s  %10s  %10s  %10s  %10s  %10s  %10s\n", "-----", "-----", "-------", "-------", "----------", "----------", "----------", "----------", "----------", "-----------");
678   fprintf(fp, "\n");
679   for(v = 0; v < cm->M; v++) {
680     nd = cm->ndidx[v];
681     subtree_clen = cm->emap->rpos[nd]-cm->emap->lpos[nd]+1;
682     if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATL_nd) subtree_clen--; /* lpos was one less than what we want */
683     if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATR_nd) subtree_clen--; /* rpos was one more than what we want */
684     fprintf(fp, "%5d  %5d  %-7s  %7d", v, cm->ndidx[v], CMStateid(cm->stid[v]), subtree_clen);
685     if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v])) {
686       fprintf(fp, "  %10.3f  %10d", trp->g_ptyAA[TRPENALTY_5P_AND_3P][v], trp->ig_ptyAA[TRPENALTY_5P_AND_3P][v]);
687     }
688     else {
689       fprintf(fp, "  %10s  %10s", "IMPOSSIBLE", "-INFTY");
690     }
691     if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_ONLY][v])) {
692       fprintf(fp, "  %10.3f  %10d", trp->g_ptyAA[TRPENALTY_5P_ONLY][v], trp->ig_ptyAA[TRPENALTY_5P_ONLY][v]);
693     }
694     else {
695       fprintf(fp, "  %10s  %10s", "IMPOSSIBLE", "-INFTY");
696     }
697     if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_3P_ONLY][v])) {
698       fprintf(fp, "  %10.3f  %10d", trp->g_ptyAA[TRPENALTY_3P_ONLY][v], trp->ig_ptyAA[TRPENALTY_3P_ONLY][v]);
699     }
700     else {
701       fprintf(fp, "  %10s  %10s", "IMPOSSIBLE", "-INFTY");
702     }
703     fprintf(fp, "\n");
704   }
705 
706   fprintf(fp, "\nlocal penalties:\n");
707   fprintf(fp, "%5s  %5s  %7s  %7s  %10s  %10s  %10s  %10s  %10s  %10s\n", "stidx", "ndidx", "stid", "st_clen", "f5P_AND_3P", "i5P_AND_3P", "f5P_ONLY", "i5P_ONLY", "f3P_ONLY", "i3P_ONLY");
708   fprintf(fp, "%5s  %5s  %7s  %7s  %10s  %10s  %10s  %10s  %10s  %10s\n", "-----", "-----", "-------", "-------", "----------", "----------", "----------", "----------", "----------", "-----------");
709   fprintf(fp, "\n");
710   for(v = 0; v < cm->M; v++) {
711     nd = cm->ndidx[v];
712     subtree_clen = cm->emap->rpos[nd]-cm->emap->lpos[nd]+1;
713     if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATL_nd) subtree_clen--; /* lpos was one less than what we want */
714     if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATR_nd) subtree_clen--; /* rpos was one more than what we want */
715     fprintf(fp, "%5d  %5d  %-7s  %7d", v, cm->ndidx[v], CMStateid(cm->stid[v]), subtree_clen);
716     if(NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v])) {
717       fprintf(fp, "  %10.3f  %10d", trp->l_ptyAA[TRPENALTY_5P_AND_3P][v], trp->il_ptyAA[TRPENALTY_5P_AND_3P][v]);
718     }
719     else {
720       fprintf(fp, "  %10s  %10s", "IMPOSSIBLE", "-INFTY");
721     }
722     if(NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_ONLY][v])) {
723       fprintf(fp, "  %10.3f  %10d", trp->l_ptyAA[TRPENALTY_5P_ONLY][v], trp->il_ptyAA[TRPENALTY_5P_ONLY][v]);
724     }
725     else {
726       fprintf(fp, "  %10s  %10s", "IMPOSSIBLE", "-INFTY");
727     }
728     if(NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_3P_ONLY][v])) {
729       fprintf(fp, "  %10.3f  %10d", trp->l_ptyAA[TRPENALTY_3P_ONLY][v], trp->il_ptyAA[TRPENALTY_3P_ONLY][v]);
730     }
731     else {
732       fprintf(fp, "  %10s  %10s", "IMPOSSIBLE", "-INFTY");
733     }
734     fprintf(fp, "\n");
735   }
736 
737   return;
738 }
739 
740 /* Function: cm_tr_penalties_Destroy()
741  * Date:     EPN, Sat Jan 21 10:30:53 2012
742  *
743  * Purpose:  Destroy a CM_TR_PENALTIES object.
744  *
745  * Returns:  void
746  */
747 void
cm_tr_penalties_Destroy(CM_TR_PENALTIES * trp)748 cm_tr_penalties_Destroy(CM_TR_PENALTIES *trp)
749 {
750   int i;
751 
752   if(trp == NULL) return;
753 
754   if(trp->g_ptyAA != NULL) {
755     for(i = 0; i < NTRPENALTY; i++) {
756       if(trp->g_ptyAA[i] != NULL) free(trp->g_ptyAA[i]);
757     }
758     free(trp->g_ptyAA);
759   }
760   if(trp->ig_ptyAA != NULL) {
761     for(i = 0; i < NTRPENALTY; i++) {
762       if(trp->ig_ptyAA[i] != NULL) free(trp->ig_ptyAA[i]);
763     }
764     free(trp->ig_ptyAA);
765   }
766 
767   if(trp->l_ptyAA != NULL) {
768     for(i = 0; i < NTRPENALTY; i++) {
769       if(trp->l_ptyAA[i] != NULL) free(trp->l_ptyAA[i]);
770     }
771     free(trp->l_ptyAA);
772   }
773   if(trp->il_ptyAA != NULL) {
774     for(i = 0; i < NTRPENALTY; i++) {
775       if(trp->il_ptyAA[i] != NULL) free(trp->il_ptyAA[i]);
776     }
777     free(trp->il_ptyAA);
778   }
779 
780   free(trp);
781   trp = NULL;
782 
783   return;
784 }
785