1 /* CM_TR_OPTS and CM_TR_PENALTIES: data structures with information
2 * relevant to truncated DP functions.
3 *
4 * Contents:
5 * 1. CM_TR_PENALTIES data structure functions,
6 * info on truncated alignment score penalties.
7 *
8 * EPN, Sat Jan 21 14:02:26 2012
9 */
10 #include "esl_config.h"
11 #include "p7_config.h"
12 #include "config.h"
13
14 #include <stdlib.h>
15 #include <string.h>
16 #include <limits.h>
17
18 #include "easel.h"
19 #include "esl_vectorops.h"
20
21 #include "hmmer.h"
22
23 #include "infernal.h"
24
25 /*****************************************************************
26 * 1. CM_TR_PENALTIES data structure functions,
27 * info on truncated alignment score penalties.
28 *****************************************************************/
29
30 /* Function: cm_tr_penalties_Create()
31 * Date: EPN, Sat Jan 21 12:03:52 2012
32 *
33 * Purpose: Allocate and initialize a CM_TR_PENALTIES object.
34 * A CM and its emit map are required to determine
35 * truncation penalty scores. This is annoyingly
36 * complex, see verbose notes within code below.
37 *
38 * Some of the code in this function, specifically
39 * that which calculates the probability of a fragment
40 * aligning at a given node, is checkable, but only
41 * if we disallow truncated begins into insert states.
42 * However, we want to allow truncated begins in reality.
43 * I've left in a flag for ignoring inserts (<ignore_inserts>)
44 * I used in testing this function. Set it to TRUE to
45 * perform the test.
46 *
47 * Returns: Newly allocated CM_TR_PENALTIES object. NULL if out
48 * of memory.
49 */
50 CM_TR_PENALTIES *
cm_tr_penalties_Create(CM_t * cm,int ignore_inserts,char * errbuf)51 cm_tr_penalties_Create(CM_t *cm, int ignore_inserts, char *errbuf)
52 {
53 int status;
54 int v, nd, m, i1, i2;
55 int lpos, rpos;
56 int i;
57
58 /* variables used for determining ratio of inserts to match at each consensus position */
59 float *mexpocc = NULL; /* [0..c..clen] probability match state is used to emit at cons posn c */
60 float *iexpocc = NULL; /* [0..c..clen] probability insert state is used to emit after cons posn c */
61 double *psi = NULL; /* [0..v..M-1] expected occupancy of state v */
62 float m_psi, i1_psi, i2_psi; /* temp psi values */
63 float summed_psi;
64 CM_TR_PENALTIES *trp = NULL;
65
66 /* variables used for calculating global truncation penalties */
67 float g_5and3; /* fragment probability if 5' and 3' truncation are allowed */
68 float g_5or3; /* fragment probability if 5' or 3' truncation are allowed */
69
70 /* variables used for calculating local truncation penalties */
71 float *begin = NULL; /* local begin probabilities 0..v..M-1 */
72 int subtree_clen; /* consensus length of subtree under this node */
73 float prv53, prv5, prv3; /* previous node's fragment probability, 5'&3', 5' only, 3'only */
74 float cur53, cur5, cur3; /* current node's fragment probability, 5'&3', 5' only, 3'only */
75 int nfrag53, nfrag5, nfrag3; /* number of fragments, 5'&3', 5' only, 3'only */
76
77 if(cm == NULL || cm->emap == NULL) goto ERROR;
78
79 ESL_ALLOC(trp, sizeof(CM_TR_PENALTIES));
80
81 trp->M = cm->M;
82 trp->ignored_inserts = ignore_inserts;
83
84 /* Define truncation penalties for each state v. This will be
85 * the score for doing a truncated begin into state v.
86 *
87 * Important note: For this discussion we assume that sequences can
88 * only be truncated at consensus positions, which means we don't
89 * have to worry about truncated begins into inserts. This is an
90 * approximation (also made by Diana and Sean in the 2009 trCYK
91 * paper) that greatly simplifies the explanation of the calculation
92 * of the truncation penalties. The examples in my ELN3 notebook
93 * also use this simplification. However, I need to be able to do
94 * truncated begins into insert states in some cases (some pass/mode
95 * combinations see ELN bottom of p.47). I explain first the
96 * rationale for calculating truncation penalties ignoring inserts
97 * and then I describe how I adapt those penalties to allow
98 * for inserts.
99 *
100 * This is a lengthy comment. I've divided it into 3 sections:
101 * Section 1. Global mode truncation penalties, ignoring inserts.
102 * Section 2. Local mode truncation penalties, ignoring inserts.
103 * Section 3. Adapting truncation penalties to allow for inserts.
104 *
105 **************************************************************
106 * Section 1. Global mode truncation penalties, ignoring inserts.
107 *
108 * We want the truncation penalty to be the log of the probability
109 * that the particular fragment we're aligning was generated from
110 * the following generative process. The generative process differs
111 * between global and local mode.
112 *
113 * In global mode:
114 * o Sample global parsetree which spans consensus positions 1..clen.
115 * o Randomly choose g and h in range 1..clen, where h >= g and
116 * truncate sequence from g..h. The first residue will either be
117 * an insert before position g, or a match at position g of the
118 * model. The final residue will either be an insert after position
119 * h or a match at position h of the model.
120 *
121 * All g,h fragments are equiprobable, so the probability of any
122 * particular fragment is 2 / (clen * (clen+1)). So log_2 of this
123 * value is the truncation penalty for all truncated alignments in
124 * global mode where both 5' and 3' truncation are allowed.
125 *
126 * We store this penalty, per-state in the
127 * g_ptyAA[TRPENALTY_5P_AND_3P][0..v..M-1]. The penalty is
128 * identical for all emitting states. The penalty value for
129 * non-emitters is IMPOSSIBLE because truncated begins are
130 * not allowed into non-emitters.
131 *
132 * If only 5' OR 3' truncation is allowed, we only truncate at g or
133 * h, which menas there's 1/clen possible fragments and log_2
134 * (1/clen) is our global truncation penalty.
135 *
136 * However, if 5' truncation is allowed we can only do a truncated
137 * begin into states that with a consensus subtree that spans
138 * position clen (since we don't allow a truncation at the 3' end).
139 * Thus any state whose subtree that doesn't span clen gets
140 * an IMPOSSIBLE value for its truncation score in:
141 * g_ptyAA[TRPENALTY_5P_ONLY][0..v..M-1].
142 *
143 * Likewise, if 3' truncation is allowed we can only do a truncated
144 * begin into states that with a consensus subtree that spans
145 * position 1 (since we don't allow a truncation at the 5' end).
146 *
147 * There's an example of computing all three types of penalties for
148 * a simple CM in ELN 3 p43.
149 *
150 ************************************************************
151 * Section 2. Local mode truncation penalties, ignoring inserts.
152 *
153 * Generative process that generates fragments in local mode:
154 * o Sample local begin state b with consensus subtree from i..j from
155 * local begin state distribution.
156 * o Randomly choose g and h in range i..j, where h >= g and
157 * truncate sequence from g..h. The first residue will either be
158 * an insert before position g, or a match at position g of the
159 * model. The final residue will either be an insert after position
160 * h or a match at position h of the model.
161 *
162 * Unlike in global mode, in local mode all fragments are not
163 * equiprobable since the local begin state distribution can be
164 * anything, and each b allows different sets of fragments to be
165 * generated (because they can only span from i to j).
166 *
167 * The truncation penalty should be the log of the probability of
168 * aligning the current fragment to the model. So we need to know
169 * the probability of generating each possible fragment.
170 * We could calculate probability of any fragment g,h with the
171 * following inefficient algorithm:
172 *
173 * For each start fragment point g,
174 * For each start fragment point h,
175 * For each state v,
176 * If lpos[v] <= g && rpos[v] >= h, then
177 * prob[g][h] += begin[v] * 2. / (st_clen[v] * (st_clen[v]+1));
178 *
179 * Where lpos[v]/rpos[v] are the left/right consensus positions in
180 * consensus subtree rooted at state v. And st_clen[v] is rpos[v] -
181 * lpos[v] + 1, the consensus length of that subtree.
182 *
183 * This gives us prob[g][h], the probability of generating fragment
184 * g,h. But we want to apply the penalty to a state, not to a
185 * fragment, to avoid needing to know the fragment boundaries g,h
186 * during the DP recursion when applying the penalty.
187 *
188 * To facilitate this, we need to find state t, the state with
189 * smallest subtree that contains g,h. State t is relevant because
190 * it is the state which will root the alignment of the fragment g,h
191 * by using a truncated begin transition into t. This gives a new
192 * algorithm:
193 *
194 * For each start fragment point g,
195 * For each start fragment point h,
196 * Identify state t, the max valued state for which
197 * lpos[v] <= g && rpos[v] >= h, then {
198 * prob[t] += prob[g][h]
199 * fcount[t]++;
200 * }
201 *
202 * prob[t] will be the probability of observing an alignment that
203 * uses a truncated begin into t to align any fragment. Then we take
204 * average over all fragments: prob[t] / fcount[t] (since we'll only
205 * be aligning one of those fragments) and use the log of that
206 * probability as the penalty for observing a truncated alignment
207 * rooted at state t. Conveniently, it turns out that all fragments
208 * that share t are equiprobable (have equal prob[g][h] values), so
209 * the average probability is the actual probability for each
210 * fragment, and thus the correct penalty to apply.
211 *
212 * Fortunately, we can compute the correct penalty much more
213 * efficiently than the two algorithms shown above. The
214 * efficient way is implemented below. A test that the penalties
215 * are correctly computed is in cm_tr_penalties_Validate().
216 *
217 * This discussion assumes we're truncating 5' and 3', but if we're
218 * only truncating 5' or 3' The situation is a little different.
219 *
220 * There's an example of computing all three types of penalties for
221 * a simple CM in ELN3 p44-45.
222 *
223 ************************************************************
224 * Section 3. Adapting truncation penalties to allow for inserts.
225 *
226 * We need to be able to do truncated begins into insert states
227 * because we enforce that the first/final residue of a sequence be
228 * included in 5'/3' truncated alignments and we want to be able
229 * to properly align those residues if they're probably emitted
230 * by insert states.
231 *
232 * The methods/logic explained in sections 1 and 2 above I believe
233 * is correct IF we ignore inserts (assume truncated begins into
234 * them are impossible). But we need to allow inserts, so I modify
235 * the truncation penalties as described above to allow for inserts
236 * as follows. We can calculate the appropriate truncated begin
237 * penalty for all MATP_MP, MATL_ML, MATR_MR, BIF_B states as with
238 * the methods described above by ignoring inserts. This gives us a
239 * probability p of using that state as the root of the truncated
240 * alignment, i.e. the truncated begin state. (The log_2 of this
241 * probability is the penalty.) We then partition p amongst the
242 * MATP_MP, MATL_ML, MATR_MR, BIF_B states and any parent insert
243 * states, i.e. any insert state that can transition into the
244 * match/bif state. For each match/bif state there's 0, 1 or 2
245 * parent inserts. We then partition p based on the relative
246 * expected occupancy of these inserts versus the match/bif state.
247 *
248 * This is certainly 'incorrect' in that it doesn't reflect the
249 * true probability of a fragment being aligned to each of the
250 * states, but it should be a close approximation. I think doing
251 * it correctly is basically impossible in the context of a single
252 * state-specific penalty (i.e. the penalty would have to be per-fragment
253 * which would be hard to deal with in the DP functions).
254 */
255
256 /* allocate and initialize the penalty arrays */
257 ESL_ALLOC(trp->g_ptyAA, sizeof(float *) * NTRPENALTY);
258 ESL_ALLOC(trp->l_ptyAA, sizeof(float *) * NTRPENALTY);
259 ESL_ALLOC(trp->ig_ptyAA, sizeof(int *) * NTRPENALTY);
260 ESL_ALLOC(trp->il_ptyAA, sizeof(int *) * NTRPENALTY);
261
262 for(i = 0; i < NTRPENALTY; i++) {
263 trp->g_ptyAA[i] = NULL;
264 trp->l_ptyAA[i] = NULL;
265 trp->il_ptyAA[i] = NULL;
266 trp->ig_ptyAA[i] = NULL;
267 ESL_ALLOC(trp->g_ptyAA[i], sizeof(float) * cm->M);
268 ESL_ALLOC(trp->l_ptyAA[i], sizeof(float) * cm->M);
269 ESL_ALLOC(trp->ig_ptyAA[i], sizeof(int) * cm->M);
270 ESL_ALLOC(trp->il_ptyAA[i], sizeof(int) * cm->M);
271 esl_vec_FSet(trp->g_ptyAA[i], cm->M, IMPOSSIBLE);
272 esl_vec_FSet(trp->l_ptyAA[i], cm->M, IMPOSSIBLE);
273 esl_vec_ISet(trp->ig_ptyAA[i], cm->M, -INFTY);
274 esl_vec_ISet(trp->il_ptyAA[i], cm->M, -INFTY);
275 }
276
277 /* DumpEmitMap(stdout, cm->emap, cm); */
278
279 /* Calculate local begin probabilities and expected occupancy */
280 ESL_ALLOC(begin, sizeof(float) * cm->M);
281 cm_CalculateLocalBeginProbs(cm, cm->pbegin, cm->t, begin);
282 if((status = cm_ExpectedPositionOccupancy(cm, &mexpocc, &iexpocc, &psi, NULL, NULL, NULL)) != eslOK) goto ERROR;
283
284 /* Fill global and local truncation penalties in a single loop. We
285 * step through all nodes and set the truncation penalties for the
286 * MATP_MP, MATL_ML, MATR_MR, and BIF_B states and any parent
287 * inserts (i1, i2) of those states.
288 */
289 g_5and3 = 2. / (cm->clen * (cm->clen+1)); /* for global mode: probability of all fragments if we're truncating 5' and 3' */
290 g_5or3 = 1. / cm->clen; /* for global mode: probability of all fragments if we're only truncating 5' or 3' */
291
292 prv5 = prv3 = prv53 = 0.; /* initialize 'previous' probability values used for calc'ing local truncation penalties */
293 for(nd = 0; nd < cm->nodes; nd++) {
294 lpos = (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) ? cm->emap->lpos[nd] : cm->emap->lpos[nd] + 1;
295 rpos = (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) ? cm->emap->rpos[nd] : cm->emap->rpos[nd] - 1;
296
297 /* now set penalties for match and insert states m, i1 and maybe i2 (if we're a MATP_MP or BIF_B) */
298 if(cm->ndtype[nd] == END_nd) {
299 prv5 = prv3 = prv53 = 0.;
300 }
301 else if(cm->ndtype[nd] == BEGL_nd || cm->ndtype[nd] == BEGR_nd) {
302 prv5 = (cm->ndtype[nd] == BEGL_nd) ? 0. : trp->l_ptyAA[TRPENALTY_5P_ONLY][cm->plast[cm->nodemap[nd]]]; /* parent BIF_B's probability */;
303 prv3 = (cm->ndtype[nd] == BEGR_nd) ? 0. : trp->l_ptyAA[TRPENALTY_3P_ONLY][cm->plast[cm->nodemap[nd]]]; /* parent BIF_B's probability */;
304 prv53 = trp->l_ptyAA[TRPENALTY_5P_AND_3P][cm->plast[cm->nodemap[nd]]]; /* parent BIF_B's probability */
305 }
306 else if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BIF_nd) {
307 /* determine match states and insert states that pertain to this node */
308 m = cm->nodemap[nd]; /* MATP_MP, MATL_ML, MATR_MR, or BIF_B */
309 InsertsGivenNodeIndex(cm, nd-1, &i1, &i2);
310
311 m_psi = psi[m];
312 if(cm->ndtype[nd] == MATP_MP) { m_psi += (psi[m+1] + psi[m+2]); } /* include MATP_ML and MATP_MR psi */
313 i1_psi = (i1 == -1) ? 0. : psi[i1];
314 i2_psi = (i2 == -1) ? 0. : psi[i2];
315 summed_psi = m_psi + i1_psi + i2_psi;
316 if(ignore_inserts) {
317 i1_psi = i2_psi = 0.;
318 summed_psi = m_psi;
319 }
320
321 /* Global penalties */
322 /* sanity check, we should only set truncation penalty once per state */
323 if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][m])) goto ERROR;
324 if((i1 != -1) && NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][i1])) goto ERROR;
325 if((i2 != -1) && NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][i2])) goto ERROR;
326 /* divide up the probability g_5and3 amongst relevant states m, i1, i2, weighted by psi */
327 trp->g_ptyAA[TRPENALTY_5P_AND_3P][m] = (m_psi / summed_psi) * g_5and3;
328 if(i1 != -1) trp->g_ptyAA[TRPENALTY_5P_AND_3P][i1] = (i1_psi / summed_psi) * g_5and3;
329 if(i2 != -1) trp->g_ptyAA[TRPENALTY_5P_AND_3P][i2] = (i2_psi / summed_psi) * g_5and3;
330
331 /* same thing, for 5P only and 3P only */
332 if(rpos == cm->clen) { /* else it will remain IMPOSSIBLE */
333 trp->g_ptyAA[TRPENALTY_5P_ONLY][m] = (m_psi / summed_psi) * g_5or3;
334 if(i1 != -1) trp->g_ptyAA[TRPENALTY_5P_ONLY][i1] = (i1_psi / summed_psi) * g_5or3;
335 if(i2 != -1) trp->g_ptyAA[TRPENALTY_5P_ONLY][i2] = (i2_psi / summed_psi) * g_5or3;
336 }
337 if(lpos == 1) { /* else it will remain IMPOSSIBLE */
338 trp->g_ptyAA[TRPENALTY_3P_ONLY][m] = (m_psi / summed_psi) * g_5or3;
339 if(i1 != -1) trp->g_ptyAA[TRPENALTY_3P_ONLY][i1] = (i1_psi / summed_psi) * g_5or3;
340 if(i2 != -1) trp->g_ptyAA[TRPENALTY_3P_ONLY][i2] = (i2_psi / summed_psi) * g_5or3;
341 }
342
343 /* Local penalties */
344 subtree_clen = rpos - lpos + 1;
345 nfrag5 = subtree_clen;
346 nfrag3 = subtree_clen;
347 nfrag53 = (subtree_clen * (subtree_clen+1)) / 2;
348
349 /* determine probability of observing a fragment aligned at
350 * state m (here, m is what I call t above and in notes) and
351 * partition that probability between m and i1 and/or i2 by
352 * relative occupancy of match versus inserts
353 */
354 cur5 = begin[m] / (float) nfrag5 + prv5;
355 cur3 = begin[m] / (float) nfrag3 + prv3;
356 cur53 = begin[m] / (float) nfrag53 + prv53;
357
358 /* sanity check, we should only set truncation penalty once per state */
359 if(NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][m])) goto ERROR;
360 if((i1 != -1) && NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][i1])) goto ERROR;
361 if((i2 != -1) && NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][i2])) goto ERROR;
362
363 trp->l_ptyAA[TRPENALTY_5P_AND_3P][m] = (m_psi / summed_psi) * cur53;
364 if(i1 != -1) trp->l_ptyAA[TRPENALTY_5P_AND_3P][i1] = (i1_psi / summed_psi) * cur53;
365 if(i2 != -1) trp->l_ptyAA[TRPENALTY_5P_AND_3P][i2] = (i2_psi / summed_psi) * cur53;
366
367 trp->l_ptyAA[TRPENALTY_5P_ONLY][m] = (m_psi / summed_psi) * cur5;
368 if(i1 != -1) trp->l_ptyAA[TRPENALTY_5P_ONLY][i1] = (i1_psi / summed_psi) * cur5;
369 if(i2 != -1) trp->l_ptyAA[TRPENALTY_5P_ONLY][i2] = (i2_psi / summed_psi) * cur5;
370
371 trp->l_ptyAA[TRPENALTY_3P_ONLY][m] = (m_psi / summed_psi) * cur3;
372 if(i1 != -1) trp->l_ptyAA[TRPENALTY_3P_ONLY][i1] = (i1_psi / summed_psi) * cur3;
373 if(i2 != -1) trp->l_ptyAA[TRPENALTY_3P_ONLY][i2] = (i2_psi / summed_psi) * cur3;
374
375 prv5 = (cm->ndtype[nd] == MATL_nd) ? cur5 : 0.;
376 prv3 = (cm->ndtype[nd] == MATR_nd) ? cur3 : 0.;
377 prv53 = cur53;
378 }
379 }
380
381 /* all penalties are currently probabilities, convert them to log
382 * probs and set integer penalties (careful, we have to check if
383 * IMPOSSIBLE first)
384 */
385 for(v = 0; v < cm->M; v++)
386 {
387 if((cm->stid[v] == MATP_MP || cm->stid[v] == MATL_ML || cm->stid[v] == MATR_MR || cm->stid[v] == BIF_B) ||
388 ((cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) && (! StateIsDetached(cm, v))))
389 {
390 /* Check for rare special case: if we're a MATP_IL and next
391 * two states are MATP_IR and END_E, then we won't have set
392 * a trunction penalty. This state will keep an impossible
393 * truncated begin score, if we did a truncated begin into
394 * it we'd just emit from the MATP_IL and then go to the
395 * END_E anyway (the MATP_IR will be detached.
396 */
397 if(cm->stid[v] == MATP_IL && cm->ndtype[cm->ndidx[v]+1] == END_nd) continue;
398
399 /* glocal 5P AND 3P: all of these should have been set to a non-IMPOSSIBLE value */
400 if(! NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v])) goto ERROR;
401 trp->ig_ptyAA[TRPENALTY_5P_AND_3P][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v], 1.0);
402 trp->g_ptyAA[TRPENALTY_5P_AND_3P][v] = sreLOG2(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v]);
403
404 /* glocal 5P only: some may be IMPOSSIBLE */
405 if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_ONLY][v])) {
406 trp->ig_ptyAA[TRPENALTY_5P_ONLY][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_5P_ONLY][v], 1.0);
407 trp->g_ptyAA[TRPENALTY_5P_ONLY][v] = sreLOG2(trp->g_ptyAA[TRPENALTY_5P_ONLY][v]);
408 }
409 /* glocal 5P only: some may be IMPOSSIBLE */
410 if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_3P_ONLY][v])) {
411 trp->ig_ptyAA[TRPENALTY_3P_ONLY][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_3P_ONLY][v], 1.0);
412 trp->g_ptyAA[TRPENALTY_3P_ONLY][v] = sreLOG2(trp->g_ptyAA[TRPENALTY_3P_ONLY][v]);
413 }
414
415 /* local penalties all of these should have been set to a non-IMPOSSIBLE value */
416 if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_5P_AND_3P][v])) goto ERROR;
417 if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_5P_ONLY][v])) goto ERROR;
418 if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_3P_ONLY][v])) goto ERROR;
419
420 trp->il_ptyAA[TRPENALTY_5P_AND_3P][v] = Prob2Score(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v], 1.0);
421 trp->il_ptyAA[TRPENALTY_5P_ONLY][v] = Prob2Score(trp->l_ptyAA[TRPENALTY_5P_ONLY][v], 1.0);
422 trp->il_ptyAA[TRPENALTY_3P_ONLY][v] = Prob2Score(trp->l_ptyAA[TRPENALTY_3P_ONLY][v], 1.0);
423 trp->l_ptyAA[TRPENALTY_5P_AND_3P][v] = sreLOG2(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v]);
424 trp->l_ptyAA[TRPENALTY_5P_ONLY][v] = sreLOG2(trp->l_ptyAA[TRPENALTY_5P_ONLY][v]);
425 trp->l_ptyAA[TRPENALTY_3P_ONLY][v] = sreLOG2(trp->l_ptyAA[TRPENALTY_3P_ONLY][v]);
426 }
427 }
428
429 if(ignore_inserts) {
430 if((status = cm_tr_penalties_Validate(trp, cm, 0.0001, errbuf)) != eslOK) { printf("%s", errbuf); goto ERROR; }
431 }
432
433 /* cm_tr_penalties_Dump(stdout, cm, trp); */
434
435 if(mexpocc != NULL) free(mexpocc);
436 if(iexpocc != NULL) free(iexpocc);
437 if(psi != NULL) free(psi);
438 if(begin != NULL) free(begin);
439
440 return trp;
441
442 ERROR:
443 if(mexpocc != NULL) free(mexpocc);
444 if(iexpocc != NULL) free(iexpocc);
445 if(psi != NULL) free(psi);
446 if(begin != NULL) free(begin);
447 if(trp != NULL) cm_tr_penalties_Destroy(trp);
448
449 return NULL;
450 }
451
452 /* Function: cm_tr_penalties_IdxForPass()
453 * Date: EPN, Wed Feb 15 15:07:54 2012
454 *
455 * Purpose: Return the appropriate truncation
456 * penalty index given a pipeline pass
457 * index.
458 *
459 * Returns: truncation penalty index, either
460 * TRPENALTY_5P_AND_3P, TRPENALTY_5P_ONLY, or
461 * TRPENALTY_3P_ONLY.
462 */
463 int
cm_tr_penalties_IdxForPass(int pass_idx)464 cm_tr_penalties_IdxForPass(int pass_idx)
465 {
466 switch(pass_idx) {
467 case PLI_PASS_5P_ONLY_FORCE: return TRPENALTY_5P_ONLY; break;
468 case PLI_PASS_3P_ONLY_FORCE: return TRPENALTY_3P_ONLY; break;
469 case PLI_PASS_5P_AND_3P_FORCE: return TRPENALTY_5P_AND_3P; break;
470 case PLI_PASS_5P_AND_3P_ANY: return TRPENALTY_5P_AND_3P; break;
471 default: return -1; break;
472 }
473 }
474
475 /* Function: cm_tr_penalties_Validate()
476 * Date: EPN, Fri Jan 27 14:57:04 2012
477 *
478 * Purpose: Validate a CM_TR_PENALTIES object by checking that
479 * all possible fragments in local mode sum to 1.0
480 * for the three scenarios: 5' and 3' truncation,
481 * 5' truncation only and 3' truncation only.
482 *
483 * This is an expensive test and was written only to test
484 * the code that determines fragment probability (really
485 * only for local mode) in cm_tr_penalties_Create(). It can
486 * only be run if the <ignore_inserts> flag was set to TRUE
487 * when cm_tr_penalties_Create() was called. However, in
488 * real life that inserts should not be ignored, so this
489 * test should never actually be run except during testing
490 * (it also is helpful for understanding the logic behind
491 * the derivation of the truncated begin
492 * penalties/probabilities).
493 *
494 * Returns: eslOK if all checks pass within tolerance level.
495 * eslFAIL if any check fails, errbuf is filled.
496 */
497 int
cm_tr_penalties_Validate(CM_TR_PENALTIES * trp,CM_t * cm,double tol,char * errbuf)498 cm_tr_penalties_Validate(CM_TR_PENALTIES *trp, CM_t *cm, double tol, char *errbuf)
499 {
500 if(! trp->ignored_inserts) ESL_FAIL(eslFAIL, errbuf, "cm_tr_penalties_Validate(), trp->ignored_inserts flag is not TRUE");
501
502 /* This is an expensive test of the trp->l_ptyAA values, the truncation
503 * penalties for local mode alignment. We test each of the three arrays
504 * in trp->ptyAA, one each for the following three scenarios:
505 *
506 * 1. trp->l_ptyAA[TRPENALTY_5P_AND_3P][0..v..M-1]: penalty for state v
507 * when 5' and 3' truncation are allowed.
508 * 2. trp->l_ptyAA[TRPENALTY_5P_ONLY][0..v..M-1]: penalty for state v when
509 * only 5' truncation is allowed.
510 * 3. trp->l_ptyAA[TRPENALTY_3P_ONLY][0..v..M-1]: penalty for state v when
511 * only 3' truncation is allowed.
512 *
513 * The test is to enumerate all possible g,h fragments in the
514 * consensus yield 1..clen, for those that can possibly be generated
515 * in the scenario (^), determine the state t with the smallest
516 * subtree yield that contains g..h. This is the state at which an
517 * alignment of a g..h fragment would be rooted. We then add the
518 * probability of a truncated parsetree rooted at v (that is,
519 * exp_2(trp->l_ptyAA[][t])) to a growing sum. After all fragments
520 * are considered the sum should be 1.0. If it is then our
521 * penalties are valid, if not they're invalid and we computed them
522 * incorrectly.
523 *
524 * (^): When 5' and 3' truncation are both allowed, all fragments can be
525 * generated, but not all fragments (for most models) can be generated if
526 * only 5' or 3' truncation is allowed.
527 *
528 */
529
530 double sump = 0.; /* the sum, should be 1.0 after all fragments are considered */
531 int lpos, rpos; /* left and right consensus positions of a parsetree */
532 int g, h; /* fragment start/stop */
533 int keep_going; /* break the loop when this is set to FALSE */
534 int nd, v;
535 /* test 1: trp->l_ptyAA[TRPENALTY_5P_AND_3P]: */
536 for(g = 1; g <= cm->clen; g++) {
537 for(h = g; h <= cm->clen; h++) {
538 /* determine which node a truncated parsetree from [a..b] would align to,
539 * this will be lowest node in the model whose subtree spans a..b
540 */
541 nd = cm->nodes-1;
542 keep_going = TRUE;
543 while(keep_going) {
544 if(nd == 0) ESL_FAIL(eslFAIL, errbuf, "cm_tr_penalties_Validate: 5' and 3' test, unable to find node that spans %d..%d\n", g, h);
545 lpos = cm->emap->lpos[nd];
546 rpos = cm->emap->rpos[nd];
547 if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATL_nd) lpos++; /* lpos was one less than what we want */
548 if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATR_nd) rpos--; /* rpos was one more than what we want */
549 if((cm->ndtype[nd] == BIF_nd || cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd) &&
550 (lpos <= g && rpos >= h)) {
551 keep_going = FALSE;
552 }
553 else { nd--; }
554 }
555 v = cm->nodemap[nd];
556 sump += sreEXP2(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v]);
557 /* printf("LRBOTH g: %3d h: %3d nd: %3d adding %10.5f (%10.5f)\n", g, h, nd, trp->l_ptyAA[TRPENALTY_5P_AND_3P][v], sump); */
558 }
559 }
560 printf("L and R sump: %.5f\n", sump);
561 if(esl_DCompare(1.0, sump, tol) != eslOK) ESL_FAIL(eslFAIL, errbuf, "cm_tr_penalties_Validate(), 5' and 3' truncation test failed (%g != 1.0)", sump);
562
563 /* test 2: trp->l_ptyAA[TRPENALTY_5P_ONLY]: */
564 sump = 0.;
565 for(g = 1; g <= cm->clen; g++) {
566 for(h = g; h <= cm->clen; h++) {
567 /* determine which node a truncated parsetree from [g..h] would align to,
568 * this will be lowest node in the model whose subtree spans g..h.
569 * Since we're only truncating on the left, an alignment from
570 * g..h may be impossible, only those fragments for which a node exists with
571 * lpos <= g and rpos==h will be possible.
572 */
573 nd = cm->nodes-1;
574 keep_going = TRUE;
575 while(keep_going && nd > 0) {
576 lpos = cm->emap->lpos[nd];
577 rpos = cm->emap->rpos[nd];
578 if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATL_nd) lpos++; /* lpos was one less than what we want */
579 if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATR_nd) rpos--; /* rpos was one more than what we want */
580 if((cm->ndtype[nd] == BIF_nd || cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd) &&
581 (lpos <= g && rpos == h)) {
582 keep_going = FALSE;
583 }
584 else { nd--; }
585 }
586 if(keep_going == FALSE) {
587 v = cm->nodemap[nd];
588 sump += sreEXP2(trp->l_ptyAA[TRPENALTY_5P_ONLY][v]);
589 }
590 }
591 }
592 printf("L only sump: %.5f\n", sump);
593 if(esl_DCompare(1.0, sump, tol) != eslOK) ESL_FAIL(eslFAIL, errbuf, "cm_tr_penalties_Validate(), 5' only truncation test failed (%g != 1.0)", sump);
594
595 /* test 3: trp->l_ptyAA[TRPENALTY_3P_ONLY]: */
596 sump = 0.;
597 for(g = 1; g <= cm->clen; g++) {
598 for(h = g; h <= cm->clen; h++) {
599 /* determine which node a truncated parsetree from [g..h] would align to,
600 * this will be lowest node in the model whose subtree spans g..h
601 * since we're only truncating on the right, an alignment from
602 * g..h may be impossible, only those for which a node exists with
603 * lpos==g and rpos >= h will be possible.
604 */
605 nd = cm->nodes-1;
606 keep_going = TRUE;
607 while(keep_going && nd > 0) {
608 lpos = cm->emap->lpos[nd];
609 rpos = cm->emap->rpos[nd];
610 if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATL_nd) lpos++; /* lpos was one less than what we want */
611 if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATR_nd) rpos--; /* rpos was one more than what we want */
612 if((cm->ndtype[nd] == BIF_nd || cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd) &&
613 (lpos == g && rpos >= h)) {
614 keep_going = FALSE;
615 }
616 else { nd--; }
617 }
618 if(keep_going == FALSE) {
619 v = cm->nodemap[nd];
620 sump += sreEXP2(trp->l_ptyAA[TRPENALTY_3P_ONLY][v]);
621 }
622 }
623 }
624 printf("R only sump: %.5f\n", sump);
625 if(esl_DCompare(1.0, sump, tol) != eslOK) ESL_FAIL(eslFAIL, errbuf, "cm_tr_penalties_Validate(), 3' only truncation test failed (%g != 1.0)", sump);
626
627 return eslOK;
628 }
629
630 /* Function: cm_tr_penalties_Sizeof()
631 * Date: EPN, Sat Jan 21 15:37:58 2012
632 *
633 * Purpose: Calculate and return the size of a CM_TR_PENALTIES
634 * object in Mb.
635 */
636
637 float
cm_tr_penalties_Sizeof(CM_TR_PENALTIES * trp)638 cm_tr_penalties_Sizeof(CM_TR_PENALTIES *trp)
639 {
640 float bytes = 0.;
641
642 if(trp == NULL) return 0.;
643
644 bytes = sizeof(CM_TR_PENALTIES);
645 bytes += sizeof(float *) * NTRPENALTY; /* g_ptyAA, 1st dim */
646 bytes += sizeof(int *) * NTRPENALTY; /* ig_ptyAA, 1st dim */
647 bytes += sizeof(float *) * NTRPENALTY; /* l_ptyAA, 1st dim */
648 bytes += sizeof(int *) * NTRPENALTY; /* il_ptyAA, 1st dim */
649 bytes += sizeof(float) * NTRPENALTY * trp->M; /* g_ptyAA, 2nd dim */
650 bytes += sizeof(int) * NTRPENALTY * trp->M; /* ig_ptyAA, 2nd dim */
651 bytes += sizeof(float) * NTRPENALTY * trp->M; /* l_ptyAA, 2nd dim */
652 bytes += sizeof(int) * NTRPENALTY * trp->M; /* il_ptyAA, 2nd dim */
653
654 return bytes / 1000000.;
655 }
656
657 /* Function: cm_tr_penalties_Dump()
658 *
659 * Purpose: Print contents of the <CM_TR_PENALTIES> <trp> to
660 * stream <fp> for inspection.
661 *
662 * Returns: void
663 */
664 void
cm_tr_penalties_Dump(FILE * fp,const CM_t * cm,const CM_TR_PENALTIES * trp)665 cm_tr_penalties_Dump(FILE *fp, const CM_t *cm, const CM_TR_PENALTIES *trp)
666 {
667 int v, nd, subtree_clen;
668
669 fprintf(fp, "CM_TR_PENALTIES dump\n");
670 fprintf(fp, "--------------------\n");
671 fprintf(fp, "M = %d\n", trp->M);
672 fprintf(fp, "ignored_inserts = %s\n", trp->ignored_inserts ? "TRUE" : "FALSE");
673 fprintf(fp, "clen = %d\n", cm->clen);
674
675 fprintf(fp, "\nglobal/glocal penalties:\n");
676 fprintf(fp, "%5s %5s %7s %7s %10s %10s %10s %10s %10s %10s\n", "stidx", "ndidx", "stid", "st_clen", "f5P_AND_3P", "i5P_AND_3P", "f5P_ONLY", "i5P_ONLY", "f3P_ONLY", "i3P_ONLY");
677 fprintf(fp, "%5s %5s %7s %7s %10s %10s %10s %10s %10s %10s\n", "-----", "-----", "-------", "-------", "----------", "----------", "----------", "----------", "----------", "-----------");
678 fprintf(fp, "\n");
679 for(v = 0; v < cm->M; v++) {
680 nd = cm->ndidx[v];
681 subtree_clen = cm->emap->rpos[nd]-cm->emap->lpos[nd]+1;
682 if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATL_nd) subtree_clen--; /* lpos was one less than what we want */
683 if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATR_nd) subtree_clen--; /* rpos was one more than what we want */
684 fprintf(fp, "%5d %5d %-7s %7d", v, cm->ndidx[v], CMStateid(cm->stid[v]), subtree_clen);
685 if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v])) {
686 fprintf(fp, " %10.3f %10d", trp->g_ptyAA[TRPENALTY_5P_AND_3P][v], trp->ig_ptyAA[TRPENALTY_5P_AND_3P][v]);
687 }
688 else {
689 fprintf(fp, " %10s %10s", "IMPOSSIBLE", "-INFTY");
690 }
691 if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_ONLY][v])) {
692 fprintf(fp, " %10.3f %10d", trp->g_ptyAA[TRPENALTY_5P_ONLY][v], trp->ig_ptyAA[TRPENALTY_5P_ONLY][v]);
693 }
694 else {
695 fprintf(fp, " %10s %10s", "IMPOSSIBLE", "-INFTY");
696 }
697 if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_3P_ONLY][v])) {
698 fprintf(fp, " %10.3f %10d", trp->g_ptyAA[TRPENALTY_3P_ONLY][v], trp->ig_ptyAA[TRPENALTY_3P_ONLY][v]);
699 }
700 else {
701 fprintf(fp, " %10s %10s", "IMPOSSIBLE", "-INFTY");
702 }
703 fprintf(fp, "\n");
704 }
705
706 fprintf(fp, "\nlocal penalties:\n");
707 fprintf(fp, "%5s %5s %7s %7s %10s %10s %10s %10s %10s %10s\n", "stidx", "ndidx", "stid", "st_clen", "f5P_AND_3P", "i5P_AND_3P", "f5P_ONLY", "i5P_ONLY", "f3P_ONLY", "i3P_ONLY");
708 fprintf(fp, "%5s %5s %7s %7s %10s %10s %10s %10s %10s %10s\n", "-----", "-----", "-------", "-------", "----------", "----------", "----------", "----------", "----------", "-----------");
709 fprintf(fp, "\n");
710 for(v = 0; v < cm->M; v++) {
711 nd = cm->ndidx[v];
712 subtree_clen = cm->emap->rpos[nd]-cm->emap->lpos[nd]+1;
713 if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATL_nd) subtree_clen--; /* lpos was one less than what we want */
714 if(cm->ndtype[nd] != MATP_nd && cm->ndtype[nd] != MATR_nd) subtree_clen--; /* rpos was one more than what we want */
715 fprintf(fp, "%5d %5d %-7s %7d", v, cm->ndidx[v], CMStateid(cm->stid[v]), subtree_clen);
716 if(NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v])) {
717 fprintf(fp, " %10.3f %10d", trp->l_ptyAA[TRPENALTY_5P_AND_3P][v], trp->il_ptyAA[TRPENALTY_5P_AND_3P][v]);
718 }
719 else {
720 fprintf(fp, " %10s %10s", "IMPOSSIBLE", "-INFTY");
721 }
722 if(NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_ONLY][v])) {
723 fprintf(fp, " %10.3f %10d", trp->l_ptyAA[TRPENALTY_5P_ONLY][v], trp->il_ptyAA[TRPENALTY_5P_ONLY][v]);
724 }
725 else {
726 fprintf(fp, " %10s %10s", "IMPOSSIBLE", "-INFTY");
727 }
728 if(NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_3P_ONLY][v])) {
729 fprintf(fp, " %10.3f %10d", trp->l_ptyAA[TRPENALTY_3P_ONLY][v], trp->il_ptyAA[TRPENALTY_3P_ONLY][v]);
730 }
731 else {
732 fprintf(fp, " %10s %10s", "IMPOSSIBLE", "-INFTY");
733 }
734 fprintf(fp, "\n");
735 }
736
737 return;
738 }
739
740 /* Function: cm_tr_penalties_Destroy()
741 * Date: EPN, Sat Jan 21 10:30:53 2012
742 *
743 * Purpose: Destroy a CM_TR_PENALTIES object.
744 *
745 * Returns: void
746 */
747 void
cm_tr_penalties_Destroy(CM_TR_PENALTIES * trp)748 cm_tr_penalties_Destroy(CM_TR_PENALTIES *trp)
749 {
750 int i;
751
752 if(trp == NULL) return;
753
754 if(trp->g_ptyAA != NULL) {
755 for(i = 0; i < NTRPENALTY; i++) {
756 if(trp->g_ptyAA[i] != NULL) free(trp->g_ptyAA[i]);
757 }
758 free(trp->g_ptyAA);
759 }
760 if(trp->ig_ptyAA != NULL) {
761 for(i = 0; i < NTRPENALTY; i++) {
762 if(trp->ig_ptyAA[i] != NULL) free(trp->ig_ptyAA[i]);
763 }
764 free(trp->ig_ptyAA);
765 }
766
767 if(trp->l_ptyAA != NULL) {
768 for(i = 0; i < NTRPENALTY; i++) {
769 if(trp->l_ptyAA[i] != NULL) free(trp->l_ptyAA[i]);
770 }
771 free(trp->l_ptyAA);
772 }
773 if(trp->il_ptyAA != NULL) {
774 for(i = 0; i < NTRPENALTY; i++) {
775 if(trp->il_ptyAA[i] != NULL) free(trp->il_ptyAA[i]);
776 }
777 free(trp->il_ptyAA);
778 }
779
780 free(trp);
781 trp = NULL;
782
783 return;
784 }
785