1 /*****************************************************************
2 * cm_submodel.c (formerly sub_cm.c)
3 * EPN 07.25.06 (Benasque)
4 *
5 * Building submodels (sub CMs) from a template CM, that represent
6 * a contiguous subset of the consensus columns that were modelled
7 * by the template CM.
8 *
9 * These functions are still under development. No guarantees.
10 *
11 * NOTE: 'sub CM' here does not correspond to Zasha Weinberg's use
12 * of the term for filtering with subtrees of the model. To
13 * be unambiguous, I should have not used 'sub CM', it's bad
14 * form on my part. However 'sub CM' is so engrained in the
15 * codebase at this point, I'm wary to change it, so it stays.
16 *
17 */
18
19 #include "esl_config.h"
20 #include "p7_config.h"
21 #include "config.h"
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <math.h>
27
28 #include "easel.h"
29 #include "esl_alphabet.h"
30 #include "esl_random.h"
31 #include "esl_stack.h"
32 #include "esl_vectorops.h"
33 #include "esl_wuss.h"
34
35 #include "hmmer.h"
36
37 #include "infernal.h"
38
39 static void map_orig2sub_cm(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int print_flag);
40 static int map_orig2sub_cm_helper(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int orig_v, int sub_v);
41 static int cm2sub_cm_check_id_next_node(CM_t *orig_cm, CM_t *sub_cm, int orig_nd, int sub_nd,
42 CMSubMap_t *submap, CP9Map_t *orig_cp9map, CP9Map_t *sub_cp9map,
43 int print_flag);
44 static void cm2sub_cm_emit_probs(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, int v_s, int v_o1, int v_o2,
45 CMSubMap_t *submap);
46 static void cm2sub_cm_trans_probs(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap, int v_s,
47 CMSubMap_t *submap);
48 static void cm2sub_cm_trans_probs_S(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap, int v_start,
49 CMSubMap_t *submap);
50 static void cm2sub_cm_trans_probs_B_E(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap, int v_end,
51 CMSubMap_t *submap, int print_flag);
52 static void cm2sub_cm_add_single_trans(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int orig_v, int orig_y,
53 int sub_v, int yoffset, double *orig_psi, char ***tmap);
54 static float cm2sub_cm_sum_subpaths(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int start, int end,
55 int init_sub_start, char ***tmap, double *orig_psi);
56 static int cm_trans_check(CM_t *cm, int a, int b);
57 static void cm2sub_cm_subtract_root_subpaths(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap,
58 CMSubMap_t *submap, int print_flag);
59 static void cm2sub_cm_find_impossible_misc_cases(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, CMSubInfo_t *subinfo,
60 CP9Map_t *orig_cp9map, CP9Map_t *sub_cp9map, int print_flag);
61 static void cm2sub_cm_find_impossible_matr_cases(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, CMSubInfo_t *subinfo,
62 CP9Map_t *orig_cp9map, CP9Map_t *sub_cp9map, int print_flag);
63
64
65 /**************************************************************************
66 * EPN 10.25.06
67 * Function: AllocSubMap()
68 *
69 * Purpose: Determine maps between a template CM and a sub CM, which probably
70 * has less structure (less MATP nodes) than the template. Do this
71 * a bit indirectly, first get maps from each CM to a CP9 HMM,
72 * then use these maps to map the CMs to each other.
73 * See infernal.h for more info on the submap data structure.
74 *
75 * Args
76 * CM_t *sub_cm - the sub CM
77 * CM_t *orig_cm - the original template CM
78 * int sstruct - the first (leftmost) consensus posn we're keeping structure for
79 * int estruct - the last (rightmost) consensus posn we're keeping structure for
80 */
81
82 CMSubMap_t *
AllocSubMap(CM_t * sub_cm,CM_t * orig_cm,int sstruct,int estruct)83 AllocSubMap(CM_t *sub_cm, CM_t *orig_cm, int sstruct, int estruct)
84 {
85 CMSubMap_t *submap;
86 int v;
87 int status;
88
89 ESL_ALLOC(submap, sizeof(struct submap_s));
90
91 submap->sub_M = sub_cm->M;
92 submap->orig_M = orig_cm->M;
93 /* Determine clen of the orig_cm */
94 submap->orig_clen = 0;
95 for(v = 0; v <= orig_cm->M; v++)
96 {
97 if(orig_cm->stid[v] == MATP_MP)
98 submap->orig_clen += 2;
99 else if(orig_cm->stid[v] == MATL_ML || orig_cm->stid[v] == MATR_MR)
100 submap->orig_clen++;
101 }
102 submap->sstruct = sstruct;
103 submap->estruct = estruct;
104 submap->sub_clen = (submap->estruct-submap->sstruct+1);
105 submap->spos = submap->sstruct;
106 submap->epos = submap->estruct;
107
108 /* Allocate and initialize arrays */
109 ESL_ALLOC(submap->s2o_id, sizeof(int) * (sub_cm->M+1));
110 ESL_ALLOC(submap->s2o_smap, sizeof(int *) * (sub_cm->M+1));
111 for(v = 0; v <= sub_cm->M; v++)
112 {
113 submap->s2o_id[v] = FALSE;
114 ESL_ALLOC(submap->s2o_smap[v], sizeof(int) * 2);
115 submap->s2o_smap[v][0] = -1;
116 submap->s2o_smap[v][1] = -1;
117 }
118
119 ESL_ALLOC(submap->o2s_smap, sizeof(int *) * (orig_cm->M+1));
120 for(v = 0; v <= orig_cm->M; v++)
121 {
122 ESL_ALLOC(submap->o2s_smap[v], sizeof(int) * 2);
123 submap->o2s_smap[v][0] = -1;
124 submap->o2s_smap[v][1] = -1;
125 }
126 return submap;
127
128 ERROR:
129 cm_Fail("Memory allocation error.");
130 return NULL; /* never reached */
131 }
132
133 /* Function: FreeSubMap()
134 * Returns: (void)
135 */
136
137 void
FreeSubMap(CMSubMap_t * submap)138 FreeSubMap(CMSubMap_t *submap)
139 {
140 int v;
141 for(v = 0; v <= submap->sub_M; v++)
142 free(submap->s2o_smap[v]);
143 free(submap->s2o_smap);
144 for(v = 0; v <= submap->orig_M; v++)
145 free(submap->o2s_smap[v]);
146 free(submap->o2s_smap);
147 free(submap->s2o_id);
148 free(submap);
149 }
150
151 /**************************************************************************
152 * EPN 10.26.06
153 * Function: AllocSubInfo()
154 *
155 * Purpose: Store information about a sub CM that is useful for testing
156 * if it constructed correctly.
157 *
158 * Args:
159 * int clen; - consensus length of the sub_cm
160 * Returns: CMSubInfo_t
161 */
162
163 CMSubInfo_t *
AllocSubInfo(int clen)164 AllocSubInfo(int clen)
165 {
166 int status;
167 CMSubInfo_t *subinfo;
168 int i;
169 int ncases;
170
171 ESL_ALLOC(subinfo, sizeof(struct subinfo_s));
172 /* Allocate and initialize arrays */
173 ESL_ALLOC(subinfo->imp_cc, sizeof(int) * (clen + 2));
174 for(i = 0; i <= clen+1; i++)
175 subinfo->imp_cc[i] = FALSE;
176
177 /* 6 possible cases for predicting we get HMM distros wrong */
178 ncases = 6;
179 ESL_ALLOC(subinfo->apredict_ct, sizeof(int) * (ncases+1));
180 ESL_ALLOC(subinfo->spredict_ct, sizeof(int) * (ncases+1));
181 ESL_ALLOC(subinfo->awrong_ct, sizeof(int) * (ncases+1));
182 ESL_ALLOC(subinfo->swrong_ct, sizeof(int) * (ncases+1));
183 for(i = 0; i <= ncases; i++)
184 {
185 subinfo->apredict_ct[i] = 0;
186 subinfo->spredict_ct[i] = 0;
187 subinfo->awrong_ct[i] = 0;
188 subinfo->swrong_ct[i] = 0;
189 }
190 return subinfo;
191
192 ERROR:
193 cm_Fail("Memory allocation error.");
194 return NULL; /* never reached */
195 }
196
197 /* Function: FreeSubInfo()
198 * Returns: void
199 */
200
201 void
FreeSubInfo(CMSubInfo_t * subinfo)202 FreeSubInfo(CMSubInfo_t *subinfo)
203 {
204 free(subinfo->imp_cc);
205 free(subinfo->apredict_ct);
206 free(subinfo->spredict_ct);
207 free(subinfo->awrong_ct);
208 free(subinfo->swrong_ct);
209 free(subinfo);
210 }
211
212 /**************************************************************************
213 * EPN 09.22.06
214 * Function: map_orig2sub_cm()
215 *
216 * Purpose: Determine maps between a template CM and a sub CM, which probably
217 * has less structure (less MATP nodes) than the template. Do this
218 * a bit indirectly, first get maps from each CM to a CP9 HMM,
219 * then use these maps to map the CMs to each other.
220 * Args:
221 * CM_t *orig_cm - the original template CM
222 * CM_t *sub_cm - the sub CM
223 * CMSubMap_t *submap - the sub CM map
224 * int print_flag - TRUE to print out useful debugging info
225 * Returns: (void)
226 */
227 void
map_orig2sub_cm(CM_t * orig_cm,CM_t * sub_cm,CMSubMap_t * submap,int print_flag)228 map_orig2sub_cm(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int print_flag)
229 {
230 int status;
231 int k_s; /* HMM node counter */
232 int v_o;
233 int v_s;
234 int v; /* state index in CM */
235 int n_o;
236 int n_s;
237 char **nodetypes;
238 char **sttypes;
239 int sub_k;
240 int orig_k;
241 int sub_nd;
242 int orig_nd;
243 int x, y;
244 CP9Map_t *orig_cp9map;
245 CP9Map_t *sub_cp9map;
246
247 ESL_ALLOC(sttypes, sizeof(char *) * 10);
248 sttypes[0] = "D";
249 sttypes[1] = "MP";
250 sttypes[2] = "ML";
251 sttypes[3] = "MR";
252 sttypes[4] = "IL";
253 sttypes[5] = "IR";
254 sttypes[6] = "S";
255 sttypes[7] = "E";
256 sttypes[8] = "B";
257 sttypes[9] = "EL";
258
259 ESL_ALLOC(nodetypes, sizeof(char *) * 8);
260 nodetypes[0] = "BIF";
261 nodetypes[1] = "MATP";
262 nodetypes[2] = "MATL";
263 nodetypes[3] = "MATR";
264 nodetypes[4] = "BEGL";
265 nodetypes[5] = "BEGR";
266 nodetypes[6] = "ROOT";
267 nodetypes[7] = "END";
268
269 /* sanity check */
270 if(sub_cm->M > orig_cm->M)
271 cm_Fail("ERROR: sub_cm has more states than orig_cm in map_orig2sub_cm()\n");
272
273 /* We want maps from the orig_cm to a CP9 HMM and from the
274 * sub_cm to a CP9 HMM, but we don't need the actual HMMs, just
275 * the maps. */
276 /* Allocate and initialize the cp9maps */
277 orig_cp9map = AllocCP9Map(orig_cm);
278 sub_cp9map = AllocCP9Map(sub_cm);
279 /* Map the CM states to CP9 states and nodes, and vice versa */
280 CP9_map_cm2hmm(orig_cm, orig_cp9map, print_flag);
281 CP9_map_cm2hmm(sub_cm, sub_cp9map, print_flag);
282
283 /* Step through the consensus columns, filling in maps */
284 /* ROOT is special: */
285 v_s = 0; /* ROOT_S */
286 v_o = 0; /* ROOT_S */
287 map_orig2sub_cm_helper(orig_cm, sub_cm, submap, v_o, v_s);
288
289 /* ROOT_IL inserts before orig cc submap->spos */
290 k_s = 1; /* insert */
291 for(x = 0; x <= 1; x++)
292 for(y = 0; y <= 1; y++)
293 map_orig2sub_cm_helper(orig_cm, sub_cm, submap,
294 orig_cp9map->hns2cs[submap->spos-1][k_s][x],
295 sub_cp9map->hns2cs[0][k_s][y]);
296
297 /* ROOT_IR inserts after orig cc submap->epos */
298 k_s = 1; /* insert */
299 for(x = 0; x <= 1; x++)
300 for(y = 0; y <= 1; y++)
301 map_orig2sub_cm_helper(orig_cm, sub_cm, submap,
302 orig_cp9map->hns2cs[submap->epos][k_s][x],
303 sub_cp9map->hns2cs[submap->epos-submap->spos+1][k_s][y]);
304
305 for(sub_k = 1; sub_k <= sub_cp9map->hmm_M; sub_k++)
306 {
307 orig_k = sub_k + submap->spos - 1;
308 sub_nd = sub_cp9map->pos2nd[sub_k];
309 orig_nd = orig_cp9map->pos2nd[orig_k];
310
311 /* Check for an easy case to save time in the future: when
312 * the orig_cm and sub_cm nodes that map to this column are
313 * of the same type, and also the orig_cm and sub_cm nodes
314 * that map to the next column are also of the same type.
315 * We check for this in cm2sub_cm_check_id_next_node() if TRUE,
316 * then submap->s2o_id[sub_v] is set to TRUE for all states
317 * in this node. Later this will save time by just copying
318 * transition and emission parameters from the orig_cm for
319 * these states.
320 */
321 cm2sub_cm_check_id_next_node(orig_cm, sub_cm, orig_nd, sub_nd, submap, orig_cp9map,
322 sub_cp9map, print_flag);
323 for(k_s = 0; k_s < 3; k_s++) /* k_s = 0 (match), k_s = 1 (insert),
324 k_s = 2 (delete) */
325 for(x = 0; x <= 1; x++)
326 for(y = 0; y <= 1; y++)
327 map_orig2sub_cm_helper(orig_cm, sub_cm, submap,
328 orig_cp9map->hns2cs[orig_k][k_s][x],
329 sub_cp9map->hns2cs[sub_k][k_s][y]);
330 }
331 /* NOTE: We ignore mapping B states, E states and S states (except ROOT_S). We'll handle these guys
332 * specially when we fill in transition probabilities. The reason we don't map them is that
333 * I think its impossible to do it robustly. I saw cases with SSU where there were BIF nodes in the
334 * sub_cm for which I couldn't figure out which BIF nodes (and correspondingly BEGL and BEGR nodes)
335 * in the orig_cm they should map to. Regardless, even if there is a pretty, nice way I'm abandoning
336 * an attempt to find it for a crude way - we will handle the transitions involving these guys
337 * specially, without a need for a mapping between CMs.
338 */
339
340 /* If we're in debugging mode and print_flag == TRUE, we print and check the map */
341 if(print_flag)
342 {
343 /* First print submap->s2o_id */
344 for(v = 0; v <= sub_cm->M; v++)
345 if(submap->s2o_id[v] == TRUE)
346 printf("submap->s2o_id[%d] TRUE\n", v);
347 else
348 printf("submap->s2o_id[%d] FALSE\n", v);
349
350 printf("\n\n\nMAP\n\n\n");
351 for(v_s = 0; v_s < sub_cm->M; v_s++)
352 {
353 n_s = sub_cm->ndidx[v_s];
354 v_o = submap->s2o_smap[v_s][0];
355 if(sub_cm->sttype[v_s] == E_st)
356 printf("sub v:%4d END\n", v_s);
357 if(sub_cm->sttype[v_s] == S_st)
358 printf("sub v:%4d START\n", v_s);
359 if(sub_cm->sttype[v_s] == B_st)
360 printf("sub v:%4d BIF_B\n", v_s);
361 if((sub_cm->sttype[v_s] != B_st &&
362 sub_cm->sttype[v_s] != S_st) &&
363 sub_cm->sttype[v_s] != E_st)
364 {
365 if(v_o == -1 && sub_cm->sttype[(v_s+1)] == E_st) /* v_s is a dead insert */
366 continue;
367 if(v_o == -1 && sub_cm->sttype[v_s] != E_st)
368 cm_Fail("ERROR sub_cm state: %d type: %s node type: %s doesn't map to any state in orig_cm\n", v_s, sttypes[(int) sub_cm->sttype[v_s]], nodetypes[(int) sub_cm->ndtype[n_s]]);
369
370 n_o = orig_cm->ndidx[v_o];
371 if(print_flag) printf("sub v:%4d(%4d) %6s%6s | orig v:%4d(%4d) %6s%6s\n", v_s, n_s, nodetypes[(int) sub_cm->ndtype[n_s]], sttypes[(int) sub_cm->sttype[v_s]], v_o, n_o, nodetypes[(int) orig_cm->ndtype[n_o]], sttypes[(int) orig_cm->sttype[v_o]]);
372 /* check to make sure submap->o2s_smap is consistent */
373 if(submap->o2s_smap[v_o][0] != v_s && submap->o2s_smap[v_o][1] != v_s)
374 cm_Fail("ERROR inconsistency; neither o2s_smap[%d][0] and [1] is %d\n", v_o, v_s);
375
376 v_o = submap->s2o_smap[v_s][1];
377 if(v_o != -1)
378 {
379 n_o = orig_cm->ndidx[v_o];
380 if(print_flag) printf(" | orig v:%4d(%4d) %6s%6s\n", v_o, n_o, nodetypes[(int) orig_cm->ndtype[n_o]], sttypes[(int) orig_cm->sttype[v_o]]);
381 /* check to make sure o2s_smap is consistent */
382 if(submap->o2s_smap[v_o][0] != v_s && submap->o2s_smap[v_o][1] != v_s)
383 {
384 cm_Fail("ERROR inconsistency; neither o2s_smap[%d][0] and [1] is %d\n", v_o, v_s);
385 }
386 }
387 }
388 }
389 }
390
391 /* Clean up and return */
392 FreeCP9Map(orig_cp9map);
393 FreeCP9Map(sub_cp9map);
394 free(sttypes);
395 free(nodetypes);
396 return;
397
398 ERROR:
399 cm_Fail("Memory allocation error.");
400 }
401
402 /**************************************************************************
403 * EPN 08.30.06
404 * map_orig2sub_cm_helper()
405 *
406 * helper function for map_orig2sub_cm(),
407 *
408 * Purpose: Fill in specific part of the map, given orig_v (orig_cm state),
409 * sub_v (sub_cm state)
410 * Args:
411 * CM_t *orig_cm
412 * CM_t *sub_cm
413 * CMSubMap_t *submap
414 * int orig_v;
415 * int sub_v;
416 * Returns: 1 if a new mapping was made, 0 if not
417 */
418 static int
map_orig2sub_cm_helper(CM_t * orig_cm,CM_t * sub_cm,CMSubMap_t * submap,int orig_v,int sub_v)419 map_orig2sub_cm_helper(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int orig_v, int sub_v)
420 {
421 int sub_nd;
422 int orig_nd;
423
424 /*printf("\nin helper: orig_v: %d sub_v: %d\n", orig_v, sub_v);*/
425
426 if(orig_v == -1 || sub_v == -1)
427 return 0;
428
429 /* check to see if we already have this mapping */
430 if(submap->o2s_smap[orig_v][0] == sub_v || submap->o2s_smap[orig_v][1] == sub_v)
431 return 0;
432
433 orig_nd = orig_cm->ndidx[orig_v];
434 sub_nd = sub_cm->ndidx[sub_v];
435
436 if(sub_cm->sttype[sub_v] == IL_st || sub_cm->sttype[sub_v] == IR_st)
437 {
438 /* Make sure that neither orig_v nor sub_v is a detached insert state,
439 * if either is, we return b/c it's irrelevant, and we don't store that info in the maps */
440 if(orig_cm->sttype[(orig_v+1)] == E_st || sub_cm->sttype[(sub_v+1)] == E_st)
441 return 0;
442 }
443 /* 09.14.06 I think that some of the code that checks for cases where we wnat to avoid mapping is unnecessary!,
444 * but I'm not sure what...*/
445
446 /* Check for a case we want to avoid mapping. We exploit the fact that we know a MATP_nd in a sub_cm MUST be mapped
447 * to a corresponding MATP_nd in the original template CM: */
448 if(sub_cm->ndtype[sub_nd] == MATP_nd)
449 if(orig_cm->ndtype[orig_nd] == MATP_nd && sub_cm->sttype[sub_v] != orig_cm->sttype[orig_v])
450 return 0;
451
452 /* Fill in submap->o2s_smap */
453 if(submap->o2s_smap[orig_v][0] == -1)
454 {
455 if (submap->o2s_smap[orig_v][1] != -1)
456 cm_Fail("ERROR in map_orig2sub_cm_helper, submap->o2s_smap[%d][0] is -1 but submap->o2s_smap[%d][1] is not, this shouldn't happen.\n", orig_v, orig_v);
457 else
458 submap->o2s_smap[orig_v][0] = sub_v;
459 }
460 else if (submap->o2s_smap[orig_v][1] != -1)
461 {
462 if(submap->o2s_smap[orig_v][0] == sub_v || submap->o2s_smap[orig_v][1] == sub_v)
463 /* abort!, we already have this mapping */
464 return 0;
465 else
466 cm_Fail("ERROR in map_orig2sub_cm_helper, submap->o2s_smap[%d][0] is not -1 and submap->o2s_smap[%d][1] is not -1, this shouldn't happen.\n", orig_v, orig_v);
467 }
468 else /* submap->o2s_smap[orig_v][0] != -1 && submap->o2s_smap[orig_v][1] == -1 */
469 {
470 if(submap->o2s_smap[orig_v][0] == sub_v || submap->o2s_smap[orig_v][1] == sub_v)
471 /* abort!, we already have this mapping */
472 return 0;
473 submap->o2s_smap[orig_v][1] = sub_v;
474 }
475
476 /* now fill in submap->s2o_smap */
477 if(submap->s2o_smap[sub_v][0] == -1)
478 if (submap->s2o_smap[sub_v][1] != -1)
479 cm_Fail("ERROR in map_sub2orig_cm_helper, submap->s2o_smap[%d][0] is -1 but submap->s2o_smap[%d][1] is not, this shouldn't happen.\n", sub_v, sub_v);
480 else
481 submap->s2o_smap[sub_v][0] = orig_v;
482 else if (submap->s2o_smap[sub_v][1] != -1)
483 cm_Fail("ERROR in map_sub2orig_cm_helper, submap->s2o_smap[%d][0] is not -1 and submap->s2o_smap[%d][1] is not -1, this shouldn't happen.\n", sub_v, sub_v);
484 else /* submap->s2o_smap[sub_v][0] != -1 && submap->s2o_smap[sub_v][1] == -1 */
485 submap->s2o_smap[sub_v][1] = orig_v;
486 return 1;
487 }
488
489
490 /****************************************************************************
491 * Function: build_sub_cm()
492 * EPN 08.28.06
493 *
494 * Purpose: Given a template CM (orig_cm) built from an MSA of consensus columns
495 * 1..M, build a new, "sub CM" (sub_cm), that only models well-nested base pairs
496 * that occur (have both left and right halves) between consensus columns
497 * [sstruct..estruct]. The sub_cm will only model the consensus
498 * columns between [sstruct..estruct].
499 *
500 * We attempt to construct the sub CM such that:
501 *
502 * 1. All 'mapped' states in the orig_cm and sub_cm have identical
503 * 'psi' values, i.e. the expected number of times entered in a single parse
504 * is identical.
505 *
506 * 2. Alignments sampled from it should 'follow the same probability distributions'
507 * as alignments sampled from the template CM and then truncated,
508 * removing columns outside [sstruct..estruct] Specifically, 'follow
509 * the same probability distributions' means if we built a ML HMM (Weinberg
510 * style) from the resulting infinite alignments, we would have identical
511 * HMMs for the sub-CM alignments and for the truncated template CMs. It turns
512 * out this is impossible for certain situations
513 * that cause differences in the topology of the orig_cm and sub_cm. These
514 * situations, so-called 'impossible cases', cause the distributions out of a
515 * single ML HMM node to be different between the orig_cm, but are relatively
516 * rare (a rough, rough estimate is 1% of ML HMM nodes are affected). We can
517 * predict when all of these situations will occur, but sometimes when we
518 * predict a ML HMM node will be off in this way, it is in fact identical
519 * between the sub_cm and orig_cm for reasons I can't quite grasp.
520 *
521 * This function builds a sub_cm and then potentially performs up to 3 checks
522 * to see if 1 and 2 above are satisfied (it only checks if 2 holds for
523 * non-impossible cases, and that we can accurately predict the impossible
524 * cases). The 3 potential checks are:
525 *
526 * A. Calculate the expected number of times that each state in the orig_cm
527 * and sub_cm is entered (psi[v] = exp # times state v is entered), and
528 * make sure these values are within 'threshold' of each other for all
529 * states that map between the orig_cm and sub_cm. This is performed
530 * by the 'check_orig_psi_vs_sub_psi' function, which is called within
531 * this function if either the 'do_acheck' and/or 'do_scheck' parameters
532 * passed in are TRUE.
533 *
534 * B. Analytically (not by sampling) building 2 ML HMMs, one from the orig_cm
535 * and one from the sub_cm, predicting the impossible cases and testing t
536 * o make sure all the corresponding parameters for non-impossible cases
537 * are within 'threshold' of each other. This is done within the
538 * check_sub_cm() function which is called from this function if the
539 * 'do_acheck' parameter passed in is set to TRUE.
540 *
541 * C. Build a CP9 ML HMM analytically from the sub_cm. Sample a deep MSA
542 * from the orig_cm and potentially truncate outside [sstruct..estruct]. Use
543 * counts from the MSA to build a ML HMM for the orig_cm. Then perform
544 * chi-squared tests to see if the counts in the orig_cm's HMM could
545 * have been generated by the distributions in the sub_cm's HMM. This
546 * check is performed if the 'do_scheck' parameter passed in is set
547 * to TRUE.
548 *
549 * Args: orig_cm - the original model, which we're going to (potentially)
550 * remove some structure from
551 * errbuf - for error messages
552 * ret_sub_cm - the new sub_cm built from orig_cm with some structure removed.
553 * sstruct - the first position (consensus column) we want to model
554 * estruct - the last position we will model
555 * print_flag - TRUE to print debugging statements
556 *
557 * Returns: eslOK on success
558 * eslEINCONCEIVABLE if something inconceivable happens
559 * eslEINCOMPAT on contract violation
560 * eslEMEM on memory error
561 */
562 int
build_sub_cm(CM_t * orig_cm,char * errbuf,CM_t ** ret_cm,int sstruct,int estruct,CMSubMap_t ** ret_submap,int print_flag)563 build_sub_cm(CM_t *orig_cm, char *errbuf, CM_t **ret_cm, int sstruct, int estruct, CMSubMap_t **ret_submap, int print_flag)
564 {
565 int status;
566 CM_t *sub_cm; /* new covariance model, a submodel of the template */
567 CMConsensus_t *con; /* growing consensus info for orig_cm */
568 Parsetree_t *mtr; /* master structure tree from the alignment */
569 char *sub_cstr; /* consensus substructure display string */
570 int *sub_ct; /* 0..con->clen-1 base pair partners array */
571 int cpos; /* position counter within orig_cm */
572 int sub_cpos; /* position counter within sub_cm */
573 char ***tmap; /* hard-coded transition map, for convenience */
574 double *orig_psi; /* expected num times each state visited in orig_cm */
575 int v_s; /* state counter for sub_cm */
576 int n_s; /* node counter for sub_cm */
577 FILE *ofp; /* an open output file */
578 int spos; /* first consensus (match) column of the orig_cm to
579 * model with the sub_cm */
580 int epos; /* last consensus (match) column of the orig_cm to
581 * model with the sub_cm */
582 CMSubMap_t *submap;
583
584 /* check to make sure that we can actually build a sub CM of this model */
585 if((orig_cm->flags & CMH_LOCAL_BEGIN) || (orig_cm->flags & CMH_LOCAL_END)) ESL_FAIL(eslEINCOMPAT, errbuf, "build_sub_cm() trying to build a sub CM of a CM already in local mode, not yet supported.\n");
586 if(orig_cm->flags & CM_IS_SUB) ESL_FAIL(eslEINCOMPAT, errbuf, "build_sub_cm(), trying to build a sub CM of a CM that is itself a sub CM.");
587
588 /* Much of the code for building and checking sub CMs relies on the fact that every insert
589 * state in the sub CM maps exactly 1 insert state in the original CM. This is fine if we
590 * have removed ambiguities by detaching all original CM insert states that are 1 state
591 * before an END_E state. This was probably done when the CM was built, but we redo it here
592 * in case it was not.
593 */
594 cm_find_and_detach_dual_inserts(orig_cm,
595 FALSE, /* DON'T check that these states have 0 counts (they may not due to priors) */
596 TRUE); /* DO detach END_E-1 insert states, making them unreachable */
597
598 /* Get the consensus sequence and consensus structure information from the original CM */
599 if((con = CreateCMConsensus(orig_cm, orig_cm->abc)) == NULL) ESL_FAIL(eslFAIL, errbuf, "build_sub_cm(), unable to create cm consensus data");
600 if(print_flag)
601 {
602 printf("con->cseq : %s\n", con->cseq);
603 printf("con->cstr : %s\n", con->cstr);
604 printf("clen : %d\n", con->clen);
605 }
606
607 spos = sstruct;
608 epos = estruct;
609
610 /* Fill a new ct array for the sub_cm. The sub_cm will only model the consensus columns
611 * between spos and epos, and only the structure between spos
612 * and epos. First copy the template (original) CMs ct array but only for the
613 * appropriate consensus columns that lie in between both structure and model boundarIes
614 * Next, eliminate any structure that lies outside the structure boundaries.
615 */
616
617 ESL_ALLOC(sub_ct, sizeof(int) * (epos - spos + 1));
618 /* First just copy ct array for model boundaries from con->ct */
619 for (cpos = (spos-1); cpos < epos; cpos++)
620 {
621 sub_cpos = cpos - (spos-1);
622 if(con->ct[cpos] != -1 &&
623 (con->ct[cpos] < (spos-1) ||
624 con->ct[cpos] >= epos))
625 sub_ct[sub_cpos] = -1;
626 else
627 sub_ct[sub_cpos] = con->ct[cpos];
628 }
629 /* Second remove structure outside structural boundaries */
630 for (cpos = (spos-1); cpos < epos; cpos++)
631 {
632 sub_cpos = cpos - (spos-1);
633 if ((cpos+1) < sstruct || (cpos+1) > estruct) /* cpos goes 1..clen, but ct is indexed
634 * 0..clen-1.*/
635 {
636 /* CreateCMConsensus() uses -1 in ct[] to indicate single
637 * stranded (different convention than WUSS2ct()). */
638 if (sub_ct[sub_cpos] != -1)
639 sub_ct[sub_ct[sub_cpos]] = -1;
640 sub_ct[sub_cpos] = -1;
641 }
642 }
643
644 /* Construct the new structure ss_cons based on the template CM ct array.
645 * We could do this similar to how display.c::CreateCMConsensus()
646 * does it to get the fully formatted WUSS ([{<>}]) string but
647 * lazily we just do <> bps here.
648 */
649 ESL_ALLOC(sub_cstr, sizeof(char) * (epos - spos + 2));
650 for (cpos = (spos-1); cpos < epos; cpos++)
651 {
652 sub_cpos = cpos - (spos-1);
653 if(sub_ct[sub_cpos] == -1) sub_cstr[sub_cpos] = '.';
654 else if (sub_ct[sub_cpos] > cpos) sub_cstr[sub_cpos] = '<';
655 else if (sub_ct[sub_cpos] < cpos) sub_cstr[sub_cpos] = '>';
656 else cm_Fail("ERROR: weird error in build_sub_cm()\n");
657 }
658 sub_cstr[(epos-spos+1)] = '\0';
659
660 /* Build the new sub_cm given the new consensus structure. But don't
661 * parameterize it yet.
662 */
663 if((status = ConsensusModelmaker(orig_cm->abc, errbuf, sub_cstr, (epos-spos+1), TRUE, &sub_cm, &mtr)) != eslOK) return status;
664 /* TRUE in ConsensusModelmaker() call says 'yes, we're building a sub CM, allow invalid CMs in cm_from_guide() (see comments in that function for details)*/
665
666 /* Set creation time */
667 if ((status = cm_SetCtime(sub_cm)) != eslOK) ESL_FAIL(status, errbuf, "Failed to record timestamp");
668
669 /* Rebalance the CM for optimization of D&C */
670 CM_t *new;
671 if((status = CMRebalance(sub_cm, errbuf, &new)) != eslOK) return status;
672 FreeCM(sub_cm);
673 sub_cm = new;
674
675 submap = AllocSubMap(sub_cm, orig_cm, sstruct, estruct);
676 if(print_flag)
677 {
678 printf("\n\norig struct: %s\n", con->cstr);
679 printf("\n\nnew struct : %s\n", sub_cstr);
680 }
681
682 /* Map states from orig_cm to sub_cm and vice versa. */
683 map_orig2sub_cm(orig_cm, sub_cm, submap, print_flag);
684
685 /* Fill orig_psi, which we need to determine the sub_cm parameters. */
686 orig_psi = cm_ExpectedStateOccupancy(orig_cm);
687 /* we need a transition map too */
688 tmap = cm_CreateTransitionMap();
689
690 CMZero(sub_cm);
691 CMSetNullModel(sub_cm, orig_cm->null);
692 sub_cm->el_selfsc = orig_cm->el_selfsc;
693 sub_cm->beta_W = orig_cm->beta_W;
694 sub_cm->tau = orig_cm->tau;
695
696 /* copy the options from the template CM, but turn off the CM_ALIGN_SUB and
697 * CM_CONFIG_SUB options and turn on the CM_IS_SUB flag */
698 sub_cm->config_opts = orig_cm->config_opts;
699 sub_cm->align_opts = orig_cm->align_opts;
700 sub_cm->search_opts = orig_cm->search_opts;
701 sub_cm->flags = 0;
702 if(sub_cm->config_opts & CM_CONFIG_SUB) sub_cm->config_opts &= ~CM_CONFIG_SUB;
703 if(sub_cm->align_opts & CM_ALIGN_SUB) sub_cm->align_opts &= ~CM_ALIGN_SUB;
704 sub_cm->flags |= CM_IS_SUB;
705
706 /* Fill in emission probabilities */
707 for(v_s = 0; v_s < sub_cm->M; v_s++)
708 {
709 if(sub_cm->sttype[(v_s+1)] == E_st) /* detached insert */
710 esl_vec_FNorm(sub_cm->e[v_s], orig_cm->abc->K); /* equiprobable, but irrelevant, this state will never be reached */
711 else if(sub_cm->sttype[v_s] != S_st &&
712 sub_cm->sttype[v_s] != D_st &&
713 sub_cm->sttype[v_s] != B_st &&
714 sub_cm->sttype[v_s] != E_st)
715 cm2sub_cm_emit_probs(orig_cm, sub_cm, orig_psi, v_s, submap->s2o_smap[v_s][0], submap->s2o_smap[v_s][1], submap);
716 }
717 /* Fill in transition virtual counts.
718 * First handle non-B,S,E states, we'll deal with B,S,Es later.
719 * The reason we have to wait is that we can't (I don't think at least)
720 * unambiguously map the sub_cm B, S, or E states to orig_cm states.
721 */
722 for(v_s = 0; v_s < sub_cm->M; v_s++)
723 {
724 if(sub_cm->sttype[(v_s+1)] == E_st) /* detached insert */
725 esl_vec_FNorm(sub_cm->t[v_s], sub_cm->cnum[v_s]); /* equiprobable, but irrelevant, this state will never be reached */
726 else if(v_s == 0 ||
727 (sub_cm->sttype[v_s] != S_st &&
728 sub_cm->sttype[v_s] != B_st &&
729 sub_cm->sttype[v_s] != E_st))
730 cm2sub_cm_trans_probs(orig_cm, sub_cm, orig_psi, tmap, v_s, submap);
731 }
732
733
734 /* Address problem 090806 (in the 00LOG of ~/notebook/6_0725_inf_sub_cm/), by
735 * retraversing the structure and subtracting out subpaths that have been counted twice
736 * for a special situation involving the two inserts of ROOT and MATP states
737 */
738 for(n_s = 0; n_s < sub_cm->nodes; n_s++)
739 {
740 if(sub_cm->ndtype[n_s] == MATP_nd && (sub_cm->sttype[(sub_cm->nodemap[n_s] + 5)+1] != E_st))
741 {
742
743 if((submap->s2o_smap[sub_cm->nodemap[n_s] + 4][1] != -1) ||
744 (submap->s2o_smap[sub_cm->nodemap[n_s] + 5][1] != -1))
745 ESL_FAIL(eslEINCONCEIVABLE, errbuf, "build_sub_cm(), MATP_IL or MATP_IR node: %d map to 2 cm states\n", n_s);
746 if(submap->s2o_smap[sub_cm->nodemap[n_s] + 4][0] != (submap->s2o_smap[sub_cm->nodemap[n_s] + 5][0] - 1))
747 ESL_FAIL(eslEINCONCEIVABLE, errbuf, "build_sub_cm(), MATP_IL or MATP_IR node: %d don't map to adjacent orig_cm states\n", n_s);
748 }
749 if(sub_cm->ndtype[n_s] == ROOT_nd && sub_cm->ndtype[n_s+1] != BIF_nd) /* ROOT->BIFs are handled special
750 * (see next loop) */
751 cm2sub_cm_subtract_root_subpaths(orig_cm, sub_cm, orig_psi, tmap, submap, print_flag);
752 }
753
754 /* Go back through and fill in the transitions into E and B states and out of S states */
755 for(v_s = 0; v_s < sub_cm->M; v_s++)
756 {
757 if(sub_cm->sttype[v_s] == S_st)
758 cm2sub_cm_trans_probs_S(orig_cm, sub_cm, orig_psi, tmap, v_s, submap);
759
760 if(sub_cm->sttype[v_s] == E_st || sub_cm->sttype[v_s] == B_st)
761 cm2sub_cm_trans_probs_B_E(orig_cm, sub_cm, orig_psi, tmap, v_s, submap, print_flag);
762 /* convention is to leave transitions out of BIF_B as 0.0, all the code knows they're obligate */
763 }
764
765 /* Remove sub_cm ambiguities by finding and detaching sub CM insert states
766 * that are 1 state before END_E states by setting transitions into
767 * such states as 0.0.
768 */
769 cm_find_and_detach_dual_inserts(sub_cm,
770 FALSE, /* DON'T check that these states have 0 counts (they won't due to priors) */
771 TRUE); /* DO detach END_E-1 insert states */
772
773 /*debug_sub_cm_check_all_trans(orig_cm, sub_cm, submap);*/
774
775 /* reset sub_cm->qdbinfo->setby, we don't want to have to calculate QDBs for the sub CM unless we have to */
776 if(sub_cm->qdbinfo->setby == CM_QDBINFO_SETBY_INIT) sub_cm->qdbinfo->setby = CM_QDBINFO_SETBY_SUBINIT;
777 if(sub_cm->W == 0) {
778 sub_cm->W = orig_cm->W;
779 sub_cm->W_setby = CM_W_SETBY_SUBCOPY;
780 }
781
782 /* Finally renormalize the CM */
783 CMRenormalize(sub_cm);
784 /* DO NOT LOGODDSIFY YET, we'll do this when we call ConfigCM() for this CM,
785 * the logsoddsification step takes a significant amount of time.
786 * CMLogoddsify(sub_cm);
787 */
788
789 if(print_flag)
790 {
791 ofp = fopen("sub.cm", "w");
792 if(print_flag) printf("%-40s ... ", "Saving model to file"); fflush(stdout);
793 if(print_flag) cm_file_WriteASCII(ofp, -1, sub_cm);
794 if(print_flag) printf("done.\n");
795 }
796
797 if(print_flag)
798 {
799 printf("\nDEBUG PRINT OF ORIG_CM PARAMETERS:\n");
800 debug_print_cm_params(stdout, orig_cm);
801 printf("\nDEBUG PRINT OF SUB_CM PARAMETERS:\n");
802 debug_print_cm_params(stdout, sub_cm);
803 }
804
805 /* Cleanup and exit. */
806 cm_FreeTransitionMap(tmap);
807 free(sub_cstr);
808 free(sub_ct);
809 FreeCMConsensus(con);
810 FreeParsetree(mtr);
811 free(orig_psi);
812
813 *ret_cm = sub_cm;
814 *ret_submap = submap;
815
816 return eslOK;
817
818 ERROR:
819 ESL_FAIL(eslEMEM, errbuf, "build_sub_cm() memory allocation error.");
820 return FALSE; /* never reached */
821 }
822
823 /**************************************************************
824 * Function: CP9NodeForPosn()
825 * EPN 07.25.06 Benasque, Spain
826 *
827 * Purpose: Determine the node of the CP9 HMM that is most likely to
828 * have emitted (from either its Match or Insert state)
829 * a given posn in the target sequence.
830 *
831 * Args: hmm - the CM plan 9 HMM
832 * i0 - first posn of target subseq with info in posterior matrix
833 * j0 - last posn of target subseq with info in posterior matrix
834 * x - posn of target subsequence we're interested in
835 * L - last position of target sequence
836 * post - the posterior matrix for the hmm
837 * ret_node - RETURN: index of node with highest probability of emitting x
838 * ret_type - RETURN: type of state in ret_node with highest probability
839 * pmass - probability mass to require on left of start or right of end
840 * is_start - TRUE if we're doing left of start,
841 * print_flag- TRUE to print out info on most likely node
842 *
843 * Returns: eslOK on success;
844 * eslEINVAL on contract violation.
845 */
846 void
CP9NodeForPosn(CP9_t * hmm,int i0,int j0,int x,CP9_MX * post,int * ret_node,int * ret_type,float pmass,int is_start,int print_flag)847 CP9NodeForPosn(CP9_t *hmm, int i0, int j0, int x, CP9_MX *post,
848 int *ret_node, int *ret_type, float pmass, int is_start,
849 int print_flag)
850 {
851 /* post->mmx[i][k]: posterior probability that posn i was emitted from node k's
852 match state */
853 int max_k; /* node index with highest posterior probability of emitting posn x */
854 int max_type; /* type of state in max_k node with max probability '0' for match,
855 '1' for insert */
856 int max_sc; /* score (log probability) from post matrix for max_k node max_type state type */
857 int k; /* counter over nodes */
858
859 if(!is_start) pmass = 1. - pmass; /* we move left to right */
860
861 /*printf("in CP9NodeForPosn is_start: %d pmass: %f\n", is_start, pmass);*/
862 if(x > j0 || x < i0)
863 /*ESL_XFAIL(eslEINVAL, "ERROR in CP9NodeForPosn(), asking for position x: %d outside subseq bounds i0: %d j0: %d\n", x, i0, j0);*/
864 cm_Fail("ERROR in CP9NodeForPosn(), asking for position x: %d outside subseq bounds i0: %d j0: %d\n", x, i0, j0);
865
866 if(post->mmx[x][0] > post->imx[x][0])
867 {
868 max_sc = post->mmx[x][0];
869 max_type = 0; /* match */
870 }
871 else
872 {
873 max_sc = post->imx[x][0];
874 max_type = 1; /* insert */
875 }
876 max_k = 0;
877 /* move left to right through HMM nodes */
878 for(k = 1; k <= hmm->M; k++)
879 {
880 if(post->mmx[x][k] > max_sc)
881 {
882 max_k = k;
883 max_sc = post->mmx[x][k];
884 max_type = 0; /* match */
885 }
886 if(post->imx[x][k] > max_sc)
887 {
888 max_k = k;
889 max_sc = post->imx[x][k];
890 max_type = 1; /* insert */
891 }
892 }
893
894 if(print_flag)
895 {
896 if(max_type == 0)
897 printf("MATCH | mx->mmx[%3d][%3d]: %9d | %8f\n", x, max_k, post->mmx[x][max_k], Score2Prob(post->mmx[x][max_k], 1.));
898 else
899 printf("INSERT | mx->imx[%3d][%3d]: %9d | %8f\n", x, max_k, post->imx[x][max_k], Score2Prob(post->imx[x][max_k], 1.));
900 }
901 *ret_node = max_k;
902 *ret_type = max_type;
903 return;
904 }
905
906
907 /**********************************************************
908 * Function: StripWUSSGivenCC()
909 * EPN 09.07.05
910 *
911 * Purpose: Strips a secondary structure string in WUSS notation
912 * of base pair information for specific match (consensus) columns.
913 * namely those before the first match column given by first_match,
914 * and after the last match column, given by last_match
915 * The msa->ss_cons secondary structure string is modified.
916 *
917 * Characters <([{ are converted to : (left base of base pairs)
918 * Characters >)]} are converted to : (right base of base pairs)
919 * Characters _-, are converted to : (unpaired bases)
920 * Characters .:~ are untouched
921 * Pseudoknot characters are converted to : as well.
922 *
923 * Args: msa - the multiple sequence alignment
924 * gapthresh - the gap threshold for calling a match column
925 * first_match - first match column to keep structure for
926 * last_match - last match column to keep structure for
927 * Returns: (void)
928 */
929 void
StripWUSSGivenCC(ESL_MSA * msa,float gapthresh,int first_match,int last_match)930 StripWUSSGivenCC(ESL_MSA *msa, float gapthresh, int first_match, int last_match)
931 {
932 int status;
933 int *matassign; /* 0..alen-1 array; 0=insert col, 1=match col */
934 int gaps;
935 int apos;
936 int idx;
937 int cc;
938 int *ct; /* 0..alen-1 base pair partners array */
939
940 /* Contract check */
941 if(msa->flags & eslMSA_DIGITAL)
942 cm_Fail("ERROR in StripWUSSGivenCC, MSA is digitized.\n");
943
944 /* 1. Determine match/insert assignments
945 * matassign is 1..alen. Values are 1 if a match column, 0 if insert column.
946 */
947 ESL_ALLOC(matassign, sizeof(int) * (msa->alen+1));
948 matassign[0] = 0; /* no 0th column in MSA */
949 for (apos = 0; apos < msa->alen; apos++)
950 {
951 for (gaps = 0, idx = 0; idx < msa->nseq; idx++)
952 if (esl_abc_CIsGap(msa->abc, msa->aseq[idx][apos])) gaps++;
953 matassign[apos+1] = ((double) gaps / (double) msa->nseq > gapthresh) ? 0 : 1;
954 }
955
956 /* 2. Determine a "ct" array, base-pairing partners for each position.
957 * Disallow/ignore pseudoknots. (That's what the FALSE flag does.)
958 * ct[] values give the index of a base pairing partner, or 0 for unpaired positions.
959 * Even though msa->ss_cons is in the 0..alen-1 coord system of msa, ct[]
960 * comes back in the 1..alen coord system of dsq.
961 */
962 ESL_ALLOC(ct, (msa->alen+1) * sizeof(int));
963 if (esl_wuss2ct(msa->ss_cons, msa->alen, ct) != eslOK)
964 cm_Fail("Consensus structure string is inconsistent");
965
966 /* 3. Make sure the consensus structure "ct" is consistent with the match assignments.
967 * Wipe out all structure in insert columns; including the base-paired
968 * partner of insert-assigned columns.
969 * Also, remove structure outside of the consensus columns that
970 * map to the HMM nodes first_match and last_match.
971 */
972 cc = 0;
973 for (apos = 1; apos <= msa->alen; apos++)
974 {
975 if (! matassign[apos])
976 {
977 if (ct[apos] != 0) ct[ct[apos]] = 0;
978 ct[apos] = 0;
979 }
980 else /* matassign[apos] == 1 */
981 {
982 cc++;
983 if(cc < first_match || cc > last_match)
984 {
985 if (ct[apos] != 0) ct[ct[apos]] = 0;
986 ct[apos] = 0;
987 }
988 }
989 }
990
991 /* Next construct the new msa->ss_cons based on the ct array.
992 * We should do this similar to display.c::CreateCMConsensus()
993 * does it to get the fully formatted WUSS ([{<>}]) string but
994 * lazily we just do <> bps here.
995 */
996 for (apos = 1; apos <= msa->alen; apos++)
997 {
998 if (ct[apos] == 0 ) msa->ss_cons[apos-1] = '.';
999 else if (ct[apos] > apos) msa->ss_cons[apos-1] = '<';
1000 else if (ct[apos] < apos) msa->ss_cons[apos-1] = '>';
1001 else cm_Fail("ERROR: weird error in StripWUSSGivenCC\n");
1002 }
1003
1004 return;
1005
1006 ERROR:
1007 cm_Fail("Memory allocation error.");
1008 }
1009
1010
1011 /**************************************************************************
1012 * EPN 08.31.06
1013 * cm2sub_cm_emit_probs()
1014 *
1015 * Purpose: For a specific sub CM state v_s, determine the
1016 * emission probabilities using the emission probs
1017 * of state(s) (up to 2) in the original CM that
1018 * map to it. We weight the contribution of each
1019 * state to the emission probability by it's psi
1020 * value in orig_cm (psi[v] is expected number of
1021 * times state v is entered in a parse).
1022 *
1023 * Args:
1024 * CM_t *orig_cm - the original, template CM
1025 * CM_t *sub_cm - the sub CM
1026 * double *orig_psi - orig_psi[v] is the expected number of times state v is entered
1027 * in a CM parse
1028 * int v_s - the sub_cm state we're filling emissions for
1029 * int v_o1 - orig_cm state v_s maps to
1030 * int v_o2 - orig_cm state v_s maps to (-1 if v_s maps to only 1 state)
1031 * Returns: void
1032 */
1033 static void
cm2sub_cm_emit_probs(CM_t * orig_cm,CM_t * sub_cm,double * orig_psi,int v_s,int v_o1,int v_o2,CMSubMap_t * submap)1034 cm2sub_cm_emit_probs(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, int v_s, int v_o1, int v_o2,
1035 CMSubMap_t *submap)
1036 {
1037 int is_left;
1038 int i, j;
1039
1040 /*printf("\nin cm2sub_cm_emit_probs v_s: %d, v_o1: %d, v_o2: %d\n", v_s, v_o1, v_o2);*/
1041
1042 if(v_o1 == -1)
1043 {
1044 cm_Fail("ERROR in cm2sub_cm_emit_probs, sub_cm state %d maps to 0 states in orig_cm (spos: %d epos: %d)\n", v_s, submap->spos, submap->epos);
1045 }
1046
1047 if(sub_cm->sttype[v_s] == MP_st)
1048 {
1049 for(i = 0; i < (orig_cm->abc->K*orig_cm->abc->K); i++)
1050 sub_cm->e[v_s][i] = orig_cm->e[v_o1][i];
1051 return;
1052 }
1053
1054 if(submap->s2o_id[v_s] == TRUE)
1055 {
1056 /* must be a singlet emitter */
1057 for(i = 0; i < orig_cm->abc->K; i++)
1058 sub_cm->e[v_s][i] = orig_cm->e[v_o1][i];
1059 /* No FNorm's necessary (assuming the orig_cm is normalized), since we're
1060 * building a new CM for each sequence in --sub mode, we skip it for speed.
1061 */
1062 return;
1063 }
1064
1065 /* If we get here, v_s is a singlet emitter. */
1066
1067 /* There are two cases when two states can map to v_s.
1068 * Case 1: one of them is an MP_st,
1069 * Case 2: one is an IL_st and one is an IR_st (ambiguity in CM architecture)
1070 * These are the only cases where we need to weight emission probs by orig_psi values,
1071 * and subsequently only cases we need to call FNorm() for */
1072 if(orig_cm->sttype[v_o1] == MP_st)
1073 {
1074 if(orig_cm->sttype[v_o2] == ML_st)
1075 is_left = TRUE;
1076 else if(orig_cm->sttype[v_o2] == MR_st)
1077 is_left = FALSE;
1078 else
1079 cm_Fail("ERROR v_s: %d maps to a MP_st and another non-ML and non-MR state\n");
1080
1081 for(i = 0; i < orig_cm->abc->K; i++)
1082 if(is_left)
1083 for(j = (i*orig_cm->abc->K); j < ((i+1)*orig_cm->abc->K); j++)
1084 sub_cm->e[v_s][i] += orig_psi[v_o1] * orig_cm->e[v_o1][j];
1085 else
1086 for(j = i; j < (orig_cm->abc->K*orig_cm->abc->K); j+=orig_cm->abc->K)
1087 sub_cm->e[v_s][i] += orig_psi[v_o1] * orig_cm->e[v_o1][j];
1088 if(orig_cm->sttype[v_o2] == MP_st)
1089 cm_Fail("ERROR sub_cm state: %d maps to two MATP_MP states\n", v_s);
1090
1091 /*v_o2 must be ML or MR, which can all be handled identically */
1092 for(i = 0; i < orig_cm->abc->K; i++)
1093 sub_cm->e[v_s][i] += orig_psi[v_o2] * orig_cm->e[v_o2][i];
1094 esl_vec_FNorm(sub_cm->e[v_s], orig_cm->abc->K);
1095 return;
1096 }
1097 else if(v_o2 != -1)
1098 cm_Fail("ERROR sub_cm state: %d maps to two states (%d and %d), but neither is a MATP_MP\n", v_s, v_o1, v_o2);
1099
1100 /* If we get here, v_s maps to a single singlet emitter in orig_cm, v_o1 */
1101 for(i = 0; i < orig_cm->abc->K; i++)
1102 sub_cm->e[v_s][i] = orig_cm->e[v_o1][i];
1103
1104 return;
1105 }
1106
1107 /**************************************************************************
1108 * EPN 08.31.06
1109 * cm2sub_cm_trans_probs()
1110 *
1111 * Purpose: For a specific sub CM state v_s, determine the
1112 * transition probabilities going out of v_s,
1113 * using the psi values for the original template
1114 * CM (orig_cm) (psi[v] is expected number of
1115 * times state v is entered in a parse).
1116 * We fill the sub_cm->t[v_s] with 'virtual
1117 * counts', then we'll normalize to probabilities
1118 * later (outside this function).
1119 *
1120 * This is based on cp9_modelmaker.c::cm2hmm_trans_probs_cp9()
1121 * which was based on formulas/ideas in Zasha Weinberg's
1122 * thesis (p.123).
1123 *
1124 * Args:
1125 * CM_t *orig_cm - the original, template CM
1126 * CM_t *sub_cm - the sub CM
1127 * double *orig_psi - orig_psi[v] is the expected number of times state v is entered
1128 * in a CM parse
1129 * char ***tmap - the hard-coded transition map
1130 * int v_s - the sub_cm state we're filling transitions for
1131 * submap - the map from the sub CM to the template CM
1132 * Returns: void
1133 */
1134 static void
cm2sub_cm_trans_probs(CM_t * orig_cm,CM_t * sub_cm,double * orig_psi,char *** tmap,int v_s,CMSubMap_t * submap)1135 cm2sub_cm_trans_probs(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap, int v_s, CMSubMap_t *submap)
1136 {
1137 int v_o;
1138 int yoffset;
1139 int y_s;
1140
1141 /*printf("in cm2sub_cm_trans_probs: v_s: %d\n", v_s);*/
1142
1143 if(submap->s2o_id[v_s] == TRUE) /* v_s is identical to submap->s2o_smap[v_s][0] */
1144 {
1145 v_o = submap->s2o_smap[v_s][0];
1146 for(yoffset = 0; yoffset < sub_cm->cnum[v_s]; yoffset++)
1147 {
1148 /* we can just copy the transitions */
1149 sub_cm->t[v_s][yoffset] = orig_psi[v_o] * orig_cm->t[v_o][yoffset];
1150 }
1151 return;
1152 }
1153
1154 /* start with the first orig_cm state that maps to v_s */
1155 v_o = submap->s2o_smap[v_s][0];
1156 /*printf("\tv_o: %d\n", v_o);*/
1157 if(v_o == -1)
1158 {
1159 if(sub_cm->sttype[v_s] != S_st &&
1160 sub_cm->sttype[v_s] != E_st &&
1161 sub_cm->sttype[v_s] != B_st)
1162 /* special cases, S_st, E_st, B_st */
1163 cm_Fail("ERROR, sub_cm state v_s: %d maps to no state in sub_cm, but it's not a B, E or S state\n", v_s);
1164 }
1165 else
1166 {
1167 if(sub_cm->sttype[v_s] == S_st ||
1168 sub_cm->sttype[v_s] == E_st ||
1169 sub_cm->sttype[v_s] == B_st)
1170 if(v_s != 0)
1171 cm_Fail("ERROR, sub_cm state v_s: %d is S, E or B but maps to a orig_cm state: v_o:%d\n", v_o);
1172
1173 for(yoffset = 0; yoffset < sub_cm->cnum[v_s]; yoffset++)
1174 {
1175 y_s = sub_cm->cfirst[v_s] + yoffset;
1176 if(sub_cm->sttype[(y_s+1)] != E_st) /* if y_s+1 is an E, y_s is a detached insert state, we want
1177 * it to be impossible to reach this guy, leave counts as 0.0 */
1178 {
1179 cm2sub_cm_add_single_trans(orig_cm, sub_cm, submap, v_o, submap->s2o_smap[y_s][0], v_s, yoffset, orig_psi, tmap);
1180 cm2sub_cm_add_single_trans(orig_cm, sub_cm, submap, v_o, submap->s2o_smap[y_s][1], v_s, yoffset, orig_psi, tmap);
1181 }
1182 }
1183 }
1184
1185 /* move on to the second orig_cm state that maps to v_s */
1186 v_o = submap->s2o_smap[v_s][1];
1187 if(v_o != -1)
1188 {
1189 for(yoffset = 0; yoffset < sub_cm->cnum[v_s]; yoffset++)
1190 {
1191 y_s = sub_cm->cfirst[v_s] + yoffset;
1192 if(sub_cm->sttype[(y_s+1)] != E_st) /* if y_s+1 is an E, y_s is a detached insert state, we want
1193 * it to be impossible to reach this guy, leave counts as 0.0 */
1194 {
1195 cm2sub_cm_add_single_trans(orig_cm, sub_cm, submap, v_o, submap->s2o_smap[y_s][0], v_s, yoffset, orig_psi, tmap);
1196 cm2sub_cm_add_single_trans(orig_cm, sub_cm, submap, v_o, submap->s2o_smap[y_s][1], v_s, yoffset, orig_psi, tmap);
1197 }
1198 }
1199 }
1200 return;
1201 }
1202
1203 /**************************************************************************
1204 * EPN 09.15.06
1205 * cm2sub_cm_trans_probs_S()
1206 *
1207 * Purpose: For a specific sub CM S state v_start, fill in virtual counts
1208 * for transitions out of v_s. We do this in its own seperate function
1209 * because we can't robustly map S states in a sub_cm to S states
1210 * in an orig CM (if its possible - I can't figure out how to do it).
1211 *
1212 * Args:
1213 * CM_t *orig_cm - the original, template CM
1214 * CM_t *sub_cm - the sub CM
1215 * double *orig_psi - orig_psi[v] is the expected number of times state v is entered
1216 * in a CM parse
1217 * char ***tmap - the hard-coded transition map
1218 * int v_start - the sub_cm start state we're filling virtual counts of transitions into
1219 * submap - the map from the sub CM to the template CM
1220 * Returns: void
1221 */
1222 static void
cm2sub_cm_trans_probs_S(CM_t * orig_cm,CM_t * sub_cm,double * orig_psi,char *** tmap,int v_start,CMSubMap_t * submap)1223 cm2sub_cm_trans_probs_S(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap, int v_start, CMSubMap_t *submap)
1224 {
1225 int yoffset;
1226 int y_s;
1227
1228 int sub_nd;
1229
1230 int v_s_insert;
1231 int v_o_insert;
1232
1233 float sum;
1234 float il_psi;
1235 float temp_psi;
1236 float temp_psi_sum;
1237 float diff;
1238
1239 int orig_il1, orig_il2, orig_ir1, orig_ir2;
1240
1241 /*printf("in cm2sub_cm_trans_probs_S: v_start: %d\n", v_start);*/
1242
1243 sub_nd = sub_cm->ndidx[v_start];
1244
1245 if(sub_cm->ndtype[sub_nd] == BEGL_nd)
1246 {
1247 /* This is the easy case, we have transitions into each of the states in the split-set
1248 * of the next node. The only way to reach each of these states is from the BEGL
1249 * so we just weight each by the psi values for the matching orig_cm states.
1250 */
1251 if(sub_cm->ndtype[sub_nd + 1] == BIF_nd)
1252 sub_cm->t[v_start][0] = 1.0; /* BEGL_S -> BIF_B */
1253 else
1254 for(yoffset = 0; yoffset < sub_cm->cnum[v_start]; yoffset++)
1255 {
1256 y_s = sub_cm->cfirst[v_start] + yoffset;
1257 /*printf("updating sub_cm->t[%d][%d]\n", v_start, yoffset);*/
1258 sub_cm->t[v_start][yoffset] = orig_psi[submap->s2o_smap[y_s][0]];
1259 if(submap->s2o_smap[y_s][1] != -1)
1260 sub_cm->t[v_start][yoffset] += orig_psi[submap->s2o_smap[y_s][1]];
1261 }
1262 }
1263
1264 else if(sub_cm->ndtype[sub_nd] == BEGR_nd)
1265 {
1266 /* More complicated than the BEGL case b/c we need to handle the
1267 * BEGR_S -> BEGR_IL transition as well as BEGR_S -> next node
1268 * split set transitions.
1269 * We know the BEGR_IL -> BEGR_IL self transition though because
1270 * we were able to map BEGR_IL to 1 or 2 orig_cm states.
1271 */
1272
1273 v_s_insert = v_start + 1;
1274 if(sub_cm->ndtype[sub_nd + 1] == BIF_nd)
1275 {
1276 /*printf("!!!SPECIAL CASE BEGR -> BIF! v_ct\n");*/
1277
1278 v_o_insert = submap->s2o_smap[v_s_insert][0];
1279 if(submap->s2o_smap[v_s_insert][1] == -1)
1280 {
1281 diff = sub_cm->t[v_s_insert][0] - (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]);
1282 if(diff >= 0. && diff > 0.0001)
1283 cm_Fail("ERROR, code for calc'ing BEGR_IL -> BIF_B is wrong, sub_cm->t[v_s_insert:%d][0] should be %f (based on your understanding) but its really %f\n", v_s_insert, (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]), (sub_cm->t[v_s_insert][0]));
1284 if(diff <= 0. && diff < -0.0001)
1285 cm_Fail("ERROR, code for calc'ing BEGR_IL -> BIF_B is wrong, sub_cm->t[v_s_insert:%d][0] should be %f (based on your understanding) but its really %f\n", v_s_insert, (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]), (sub_cm->t[v_s_insert][0]));
1286 }
1287
1288 /* BEGR_IL -> BEGR_IL will be the sum over possibly two orig_cm states v_o_insert that
1289 * map to sub_cm state BEGR_IL of:
1290 * orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0],
1291 * so BEGR_IL -> BIF_B is calc'ed as follows */
1292 sub_cm->t[v_s_insert][1] = orig_psi[v_o_insert] * (1. - orig_cm->t[v_o_insert][0]);
1293 if(submap->s2o_smap[v_s_insert][1] != -1)
1294 {
1295 /* we have to factor in the other state as well. */
1296 cm_Fail("ERROR, BEGR_IL: %d maps to 2 orig_cm states, was hoping this was impossible - need to implement.\n", v_s_insert);
1297 v_o_insert = submap->s2o_smap[v_s_insert][1];
1298 sub_cm->t[v_s_insert][1] += orig_psi[v_o_insert] * (1. - orig_cm->t[v_o_insert][0]);
1299 }
1300 /* Now we normalize the transitions out of BEGR_IL, so we can calculate BEGR_S -> BEGR_IL */
1301 esl_vec_FNorm(sub_cm->t[v_s_insert], sub_cm->cnum[v_s_insert]);
1302 il_psi = orig_psi[submap->s2o_smap[v_s_insert][0]];
1303 if(submap->s2o_smap[v_s_insert][1] != -1)
1304 {
1305 cm_Fail("ERROR, BEGR_IL: %d maps to 2 orig_cm states, was hoping this was impossible - need to implement.\n", v_s_insert);
1306 il_psi+= orig_psi[submap->s2o_smap[v_s_insert][1]];
1307 }
1308 sub_cm->t[v_start][0] = (1. - sub_cm->t[v_s_insert][0]) * il_psi; /* set BEGR_S -> BEGR_IL */
1309 sub_cm->t[v_start][1] = 1. - sub_cm->t[v_start][0]; /* set BEGR_S -> BIF_B */
1310 }
1311
1312 else
1313 { /* next node is not a BIF node */
1314 /* First, normalize the probabilities out of the BEGR_IL, so
1315 * we can calculate what BEGR_S -> BEGR_IL should be (we need
1316 * to know BEGR_IL -> BEGR_IL probability and orig_psi of
1317 * the states that map to BEGR_IL to do this).
1318 */
1319 sum = 0.;
1320 esl_vec_FNorm(sub_cm->t[v_s_insert], sub_cm->cnum[v_s_insert]);
1321 il_psi = orig_psi[submap->s2o_smap[v_s_insert][0]];
1322 if(submap->s2o_smap[v_s_insert][1] != -1)
1323 {
1324 cm_Fail("ERROR, BEGR_IL: %d maps to 2 orig_cm states, was hoping this was impossible - need to implement.\n", v_s_insert);
1325 il_psi += orig_psi[submap->s2o_smap[v_s_insert][1]];
1326 }
1327 sub_cm->t[v_start][0] = (1. - sub_cm->t[v_s_insert][0]) * il_psi; /* set BEGR_S -> BEGR_IL */
1328 sum = sub_cm->t[v_start][0];
1329
1330 for(yoffset = 1; yoffset < sub_cm->cnum[v_start]; yoffset++) /* note we start at yoffset = 1
1331 * BEGR_S -> first state of next
1332 * node's split set. */
1333 {
1334 y_s = sub_cm->cfirst[v_start] + yoffset;
1335 temp_psi = orig_psi[submap->s2o_smap[y_s][0]];
1336 if(submap->s2o_smap[y_s][1] != -1)
1337 temp_psi += orig_psi[submap->s2o_smap[y_s][1]];
1338
1339 sub_cm->t[v_start][yoffset] = temp_psi - il_psi * sub_cm->t[v_s_insert][yoffset];
1340 sum += sub_cm->t[v_start][yoffset];
1341 }
1342 /*printf("BEGR->NON BIF SUM: %f\n", sum);*/
1343 if(sum < 1.0 && ((1.0 - sum) > 0.001))
1344 cm_Fail("ERROR calculating transitions out of BEGR_S incorrectly\n");
1345 if(sum > 1.0 && ((sum - 1.0) > 0.001))
1346 cm_Fail("ERROR calculating transitions out of BEGR_S incorrectly\n");
1347 }
1348 }
1349 else if(sub_cm->ndtype[sub_nd] == ROOT_nd)
1350 {
1351 /*printf("in cm2sub_cm_trans_probs_S(), ROOT_nd\n");*/
1352 /* the only case we have to worry about is if the next node is BIF node,
1353 * otherwise the transitions out of ROOT_S have already been set.
1354 */
1355 if(sub_cm->ndtype[sub_nd + 1] == BIF_nd)
1356 {
1357 /*printf("!!!SPECIAL CASE ROOT -> BIF!\n");*/
1358 /* Before we do anything we have to check to see if we need to subtract
1359 * any subpaths from ROOT_S -> ROOT_IR that have been double counted:
1360 */
1361 /* if orig_ir < orig_il, we've counted paths from
1362 * ir -> il correctly for ROOT_IL -> ROOT_IR and
1363 * incorrectly for ROOT_S -> ROOT_IR
1364 */
1365
1366 orig_il1 = submap->s2o_smap[1][0];
1367 orig_il2 = submap->s2o_smap[1][1];
1368
1369 orig_ir1 = submap->s2o_smap[2][0];
1370 orig_ir2 = submap->s2o_smap[2][1];
1371
1372 if(orig_ir1 < orig_il1)
1373 {
1374 sub_cm->t[0][1] -= orig_psi[orig_ir1] *
1375 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir1, orig_il1, 0, tmap, orig_psi);
1376 }
1377 if(orig_ir2 != -1 && orig_ir2 < orig_il1)
1378 {
1379 sub_cm->t[0][1] -= orig_psi[orig_ir2] *
1380 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir2, orig_il1, 0, tmap, orig_psi);
1381 }
1382 if(orig_il2 != -1 && orig_ir1 < orig_il2)
1383 {
1384 sub_cm->t[0][1] -= orig_psi[orig_ir1] *
1385 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir1, orig_il2, 0, tmap, orig_psi);
1386 }
1387 if((orig_ir2 != -1 && orig_il2 != -1) && (orig_ir2 < orig_il2))
1388 {
1389 sub_cm->t[0][1] -= orig_psi[orig_ir2] *
1390 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir2, orig_il2, 0, tmap, orig_psi);
1391 }
1392
1393 /* Next, set transition from ROOT_S -> BIF_B, we know that
1394 * ROOT_S -> ROOT_IL and ROOT_S -> ROOT_IR were set using
1395 * orig_cm subpaths originating at ROOT_S, so we know that
1396 * the virtual counts out of ROOT_S should NOT be scaled, in
1397 * other words, they can be treated as probabilities and should
1398 * sum to 1.0.
1399 */
1400 sub_cm->t[0][2] = 1.0 - (sub_cm->t[0][0] + sub_cm->t[0][1]); /* set ROOT_S->BIF_B */
1401
1402 v_s_insert = v_start + 1; /* ROOT_IL */
1403 v_o_insert = submap->s2o_smap[v_s_insert][0];
1404 if(submap->s2o_smap[v_s_insert][1] == -1)
1405 {
1406 diff = sub_cm->t[v_s_insert][0] - (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]);
1407 if(diff >= 0. && diff >= 0.0001)
1408 cm_Fail("ERROR, code for calc'ing ROOT_IL -> BIF_B is wrong, sub_cm->t[v_s_insert:%d][0] should be %f (based on your understanding) but its really %f\n", v_s_insert, (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]), (sub_cm->t[v_s_insert][0]));
1409 if(diff <= 0. && diff <= -0.0001)
1410 cm_Fail("ERROR, code for calc'ing ROOT_IL -> BIF_B is wrong, sub_cm->t[v_s_insert:%d][0] should be %f (based on your understanding) but its really %f\n", v_s_insert, (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]), (sub_cm->t[v_s_insert][0]));
1411 }
1412 /* We want to figure out what the virtual counts for the traansition ROOT_IL -> BIF_B should
1413 * be, but this depends on how we filled the virtual counts for the other two
1414 * transitions (-> ROOT_IL and -> ROOT_IR) out of ROOT_IL, so we SHOULD revisit
1415 * their calculation. Here, I'm attempting a trick that SHOULD work: figuring out
1416 * what the self_insert probability IL->IL should be based on the self insert probabilities
1417 * of the up to 2 orig_cm states that sub_cm ROOT_IL maps to, then using that scaling
1418 * factor (the actual probability / the virtual counts currently in sub_cm->t[ROOT_IL][0])
1419 * to scale both ROOT_IL->ROOT_IL and ROOT_IL->ROOT_IR, then we can just set ROOT_IL->BIF
1420 * as 1.0 - (ROOT_IL->ROOT_IL + ROOT_IL->ROOT_IR).
1421 */
1422
1423 temp_psi_sum = orig_psi[v_o_insert];
1424
1425 if(submap->s2o_smap[v_s_insert][1] != -1)
1426 {
1427 v_o_insert = submap->s2o_smap[v_s_insert][1];
1428 temp_psi_sum += orig_psi[v_o_insert];
1429 }
1430 sub_cm->t[v_s_insert][0] /= temp_psi_sum; /* ROOT_IL -> ROOT_IL */
1431 sub_cm->t[v_s_insert][1] /= temp_psi_sum; /* ROOT_IL -> ROOT_IR */
1432 sub_cm->t[v_s_insert][2] = 1. - (sub_cm->t[v_s_insert][0] + sub_cm->t[v_s_insert][1]);
1433 /* ROOT_IL -> BIF_B */
1434
1435 /* move on to calc'ing ROOT_IR -> BIF */
1436 v_s_insert = v_start + 2; /* ROOT_IR */
1437 v_o_insert = submap->s2o_smap[v_s_insert][0];
1438 if(submap->s2o_smap[v_s_insert][1] == -1)
1439 {
1440 diff = sub_cm->t[v_s_insert][0] - (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]);
1441 if(diff >= 0. && diff > 0.0001)
1442 cm_Fail("ERROR, code for calc'ing ROOT_IR -> BIF_B is wrong, sub_cm->t[v_s_insert:%d][0] should be %f (based on your understanding) but its really %f\n", v_s_insert, (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]), (sub_cm->t[v_s_insert][0]));
1443 if(diff <= 0. && diff < -0.0001)
1444 cm_Fail("ERROR, code for calc'ing ROOT_IR -> BIF_B is wrong, sub_cm->t[v_s_insert:%d][0] should be %f (based on your understanding) but its really %f\n", v_s_insert, (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]), (sub_cm->t[v_s_insert][0]));
1445 }
1446 /* We can calculate what the ROOT_IR -> ROOT_IR probability should be, and then
1447 * just take 1.0 minus that probability to set ROOT_IR -> BIF.
1448 */
1449 temp_psi_sum = orig_psi[v_o_insert];
1450
1451 if(submap->s2o_smap[v_s_insert][1] != -1)
1452 {
1453 v_o_insert = submap->s2o_smap[v_s_insert][1];
1454 temp_psi_sum += orig_psi[v_o_insert];
1455 }
1456 sub_cm->t[v_s_insert][0] /= temp_psi_sum; /* ROOT_IR -> ROOT_IR */
1457 sub_cm->t[v_s_insert][1] = 1. - (sub_cm->t[v_s_insert][0]);
1458 /* ROOT_IR -> BIF_B */
1459 }
1460 else
1461 {
1462 /* ROOT -> non-BIF node, we have already handled this, so we return. */
1463 }
1464 }
1465 /*printf("leaving cm2sub_cm_trans_probs_S\n\n");*/
1466 return;
1467 }
1468
1469 /**************************************************************************
1470 * EPN 09.21.06
1471 * cm2sub_cm_trans_probs_B_E()
1472 *
1473 * Purpose: For a specific sub CM B or E state v_be fill in virtual counts
1474 * for transitions into v_be. We do this in its own seperate function,
1475 * because we can't robustly map E states in a sub_cm to E states
1476 * in an orig CM (if its possible - I can't figure out how to do it).
1477 *
1478 * Args:
1479 * CM_t *orig_cm - the original, template CM
1480 * CM_t *sub_cm - the sub CM
1481 * double *orig_psi - orig_psi[v] is the expected number of times state v is entered
1482 * in a CM parse
1483 * char ***tmap - the hard-coded transition map
1484 * int v_be - the sub_cm END state we're filling virtual counts of transitions into
1485 * submap - the map from the sub CM to the template CM
1486 * print_flag - TRUE to print useful debugging info
1487 * Returns: void
1488 */
1489 static void
cm2sub_cm_trans_probs_B_E(CM_t * orig_cm,CM_t * sub_cm,double * orig_psi,char *** tmap,int v_be,CMSubMap_t * submap,int print_flag)1490 cm2sub_cm_trans_probs_B_E(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap, int v_be, CMSubMap_t *submap,
1491 int print_flag)
1492 {
1493 int orig_v;
1494 int sub_v;
1495
1496 int sub_nd;
1497 int psub_nd;
1498 int sub_i;
1499 int bif_end_yoffset;
1500 int orig_v1, orig_v2, orig_i;
1501 float contribution;
1502 int orig_il, orig_ir, sub_il, sub_ir;
1503 int into_end_flag;
1504
1505 if(print_flag) printf("in cm2sub_cm_trans_probs_B_E: sub_v: %d\n", v_be);
1506
1507 sub_nd = sub_cm->ndidx[v_be];
1508 psub_nd = sub_nd - 1;
1509
1510 into_end_flag = FALSE;
1511 if(sub_cm->sttype[v_be] == E_st)
1512 into_end_flag = TRUE;
1513 /* We're moving from into an END, so the END_E - 1 state is a detached insert,
1514 * and we have to handle this in a special way */
1515
1516 switch (sub_cm->ndtype[psub_nd]) {
1517 case MATP_nd:
1518 if(print_flag) printf("prev node type MATP\n");
1519 /* psub_nd is a MATP: each state in psub_nd transits to
1520 * exactly 3 states, the MATP_IL, MATP_IR and the BIF or END state v_be, for which
1521 * we are trying to fill virtual transition counts into.
1522 *
1523 * The case for MATP_nd's is actually simpler than for other nodes because we
1524 * can exploit the fact that if a MATP node exists in the sub_cm it necessarily
1525 * must map to a MATP node in the original CM (we're not adding or changing any
1526 * base pairs, just deleting some possibly), so every sub MATP state must correspond
1527 * to exactly 1 original MATP state.
1528 */
1529
1530 bif_end_yoffset = 2; /* cm->t[][2] goes to BIF_B or END_E */
1531 sub_il = sub_cm->nodemap[psub_nd] + 4;
1532 sub_ir = sub_cm->nodemap[psub_nd] + 5;
1533
1534 orig_il = submap->s2o_smap[sub_il][0];
1535 orig_ir = submap->s2o_smap[sub_ir][0];
1536 if(into_end_flag && orig_ir != -1)
1537 cm_Fail("ERROR in cm2sub_cm_trans_probs_B_E(), into_end_flag is TRUE but MATP_IR maps to a orig_cm state.\n");
1538 if(orig_ir == -1 && !into_end_flag)
1539 cm_Fail("ERROR in cm2sub_cm_trans_probs_B_E(), into_end_flag is FALSE but MATP_IR doesn't map to a orig_cm state.\n");
1540
1541 for(sub_v = sub_cm->nodemap[psub_nd]; sub_v < sub_il; sub_v++)
1542 {
1543 orig_v = submap->s2o_smap[sub_v][0]; /* submap->s2o_smap[sub_v][1] will nec. be -1 in a MATP */
1544 sub_cm->t[sub_v][bif_end_yoffset] = orig_psi[orig_v] -
1545 (sub_cm->t[sub_v][0] + sub_cm->t[sub_v][1]); /* if into_end_flag is TRUE, sub_cm->t[sub_v][1] is 0. */
1546 }
1547 /* now do MATP_IL and MATP_IR */
1548 sub_cm->t[sub_il][bif_end_yoffset] = orig_psi[orig_il] -
1549 (sub_cm->t[sub_il][0] + sub_cm->t[sub_il][1]);
1550
1551 bif_end_yoffset = 1;
1552 if(into_end_flag)
1553 sub_cm->t[sub_ir][bif_end_yoffset] = 1.0;
1554 else
1555 sub_cm->t[sub_ir][bif_end_yoffset] = orig_psi[orig_ir] - sub_cm->t[sub_ir][0];
1556 break;
1557
1558 case MATL_nd:
1559 case MATR_nd:
1560 if(print_flag) printf("prev node type MATL or MATR\n");
1561 /* psub_nd is a MATL or MATR and we know each state in psub_nd transits to
1562 * exactly 2 states, the insert state of psub_nd and the BIF or END state v_be, which
1563 * we are trying to fill in transitions to.
1564 */
1565 bif_end_yoffset = 1; /* cm->t[][1] goes to BIF_B or END_E */
1566 sub_i = sub_cm->nodemap[psub_nd] + 2; /* sub_i is MATL_IL or MATR_IR */
1567 orig_i = submap->s2o_smap[sub_i][0];
1568 if(into_end_flag && orig_i != -1)
1569 cm_Fail("ERROR in cm2sub_cm_trans_probs_B_E(), into_end_flag is TRUE but MAT*_I* maps to a orig_cm state.\n");
1570
1571 for(sub_v = sub_cm->nodemap[psub_nd]; sub_v < sub_i; sub_v++)
1572 {
1573 orig_v1 = submap->s2o_smap[sub_v][0];
1574 orig_v2 = submap->s2o_smap[sub_v][1];
1575 if(into_end_flag)
1576 sub_cm->t[sub_v][bif_end_yoffset] = 1.0;
1577 else
1578 {
1579 if(orig_v1 < orig_i)
1580 {
1581 contribution = cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap,
1582 orig_v1, orig_i, sub_v, tmap, orig_psi);
1583 sub_cm->t[sub_v][bif_end_yoffset] = orig_psi[orig_v1] * (1. - contribution);
1584 if(print_flag) printf("curr v1 < i1 sub_cm->t[sub_v:%d][1] now: %f (added: psi:%f * 1-cont: %f (%f))\n", sub_v, sub_cm->t[sub_v][1], orig_psi[orig_v1], (1.-contribution), (orig_psi[orig_v1] * (1. - contribution)));
1585 }
1586 else
1587 {
1588 contribution = orig_psi[orig_i] * cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap,
1589 orig_i, orig_v1, sub_v, tmap, orig_psi);
1590 sub_cm->t[sub_v][bif_end_yoffset] = orig_psi[orig_v1] * (1. - (contribution / orig_psi[orig_v1]));
1591 if(print_flag) printf("curr i1 < v1 sub_cm->t[sub_v:%d][1] now: %f (added: psi:%f * 1-cont: %f (%f))\n", sub_v, sub_cm->t[sub_v][1], orig_psi[orig_v1], (1.-contribution), (orig_psi[orig_v1] * (1. - contribution)));
1592 }
1593 if(orig_v2 != -1)
1594 {
1595 if(orig_v2 < orig_i)
1596 {
1597 contribution = cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap,
1598 orig_v2, orig_i, sub_v, tmap, orig_psi);
1599 sub_cm->t[sub_v][bif_end_yoffset] += orig_psi[orig_v2] * (1. - contribution);
1600 if(print_flag) printf("curr v2 < i1 sub_cm->t[sub_v:%d][1] now: %f (added: psi:%f * 1-cont: %f (%f))\n", sub_v, sub_cm->t[sub_v][1], orig_psi[orig_v2], (1.-contribution), (orig_psi[orig_v2] * (1. - contribution)));
1601 }
1602 else
1603 {
1604 contribution = orig_psi[orig_i] * cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap,
1605 orig_i, orig_v2, sub_v, tmap, orig_psi);
1606 sub_cm->t[sub_v][bif_end_yoffset] += orig_psi[orig_v2] * (1. - (contribution / orig_psi[orig_v2]));
1607 if(print_flag) printf("curr i1 < v2 sub_cm->t[sub_v:%d][1] now: %f (added: psi:%f * 1-cont: %f (%f))\n", sub_v, sub_cm->t[sub_v][1], orig_psi[orig_v2], (1.-contribution), (orig_psi[orig_v2] * (1. - contribution)));
1608 }
1609 }
1610 }
1611 }
1612 /* now set the sub_i -> B or E transition prob */
1613 if(into_end_flag) /* The transition probability out of the MAT{L,R}_I{L,R} is irrelevant,
1614 * because the state is detached. */
1615 sub_cm->t[sub_i][1] = 1.;
1616 else
1617 sub_cm->t[sub_i][1] = orig_psi[orig_i] *
1618 (1. - cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap,
1619 orig_i, orig_i, sub_i,tmap, orig_psi));
1620 break;
1621
1622 case ROOT_nd:
1623 case BEGL_nd:
1624 case BEGR_nd:
1625 { /* we handle these cases in cm2sub_cm_trans_probs_S() */ }
1626 break;
1627
1628 default:
1629 cm_Fail("ERROR bogus node type transiting to END or BIF\n");
1630 break;
1631 }
1632
1633 if(print_flag) printf("Returning from cm2sub_cm_trans_probs_B_E\n");
1634 return;
1635 }
1636
1637 /**************************************************************************
1638 * EPN 08.31.06
1639 * Function: cm2sub_cm_add_single_trans()
1640 *
1641 * Purpose: Add a virtual counts contribution to a single CM transition.
1642 *
1643 * See related functions for explanation of parameters.
1644 * Returns: (void)
1645 */
1646 static void
cm2sub_cm_add_single_trans(CM_t * orig_cm,CM_t * sub_cm,CMSubMap_t * submap,int orig_v,int orig_y,int sub_v,int yoffset,double * orig_psi,char *** tmap)1647 cm2sub_cm_add_single_trans(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int orig_v, int orig_y,
1648 int sub_v, int yoffset, double *orig_psi, char ***tmap)
1649 {
1650 int start;
1651 /* check if we've got real CM state ids */
1652 if(orig_v == -1 || orig_y == -1)
1653 return;
1654 start = orig_v;
1655 if(orig_y < start)
1656 start = orig_y;
1657 sub_cm->t[sub_v][yoffset] += orig_psi[start] *
1658 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_v, orig_y, sub_v, tmap, orig_psi);
1659 return;
1660 }
1661
1662 /**************************************************************************
1663 * EPN 08.31.06
1664 * Function: cm2sub_cm_sum_subpaths()
1665 *
1666 * Purpose: Calculate probability of getting from one state (start) to
1667 * another (end) in a CM, taking special considerations involving
1668 * insert states.
1669 *
1670 * This function is similar to CP9_cm2wrhmm::cm_sum_subpaths_cp9()
1671 * but was written for mapping transitions in a template CM (orig_cm)
1672 * to those in a sub CM built from that template. The sub_cm conversion
1673 * process is more complex and this function requires more functionality
1674 * than the *_cp9() version.
1675 *
1676 * When getting a transition probability for the sub_cm state 'v' in node
1677 * 'n', we ignore the contribution of subpaths that correspond to other
1678 * transitions out of states in 'n'.
1679 * For example, we don't want to include the probability of an
1680 * IL (nd 'n') -> MATL_ML (nd 'n'+1) sub parse when calculating
1681 * the transition probability for MATL_ML (nd 'n') -> MATL_ML (nd 'n'+1)
1682 * because the IL (nd 'n') -> MATL_ML (nd 'n'+1) exists and should
1683 * contain that probability mass. This is an easy example to skip,
1684 * there are several other instances where it's more complex to
1685 * ignore subpaths involving inserts like this.
1686 *
1687 * Importantly, this function does not correctly calculate the transition
1688 * virtual counts for only the states in the ROOT_nd. This is because
1689 * the ROOT_nd has 2 insert states, which makes it much more complex to
1690 * properly ignore subparses involving both these inserts. The
1691 * cm2sub_cm_subtract_root_subpaths() function corrects the counts for
1692 * the ROOT states. MATP_nd's also have 2 insert states but when constructing
1693 * sub_cm's the only MATP_nd's that exist have an exact mapping MATP_nd in
1694 * the orig_cm, which makes them easier to handle.
1695 *
1696 * In some cases, this function calls itself to determine the probabilities
1697 * of subparses that it should ignore. It is for these cases that it's
1698 * necessary to have the init_sub_start parameter passed in, which is
1699 * the sub_cm state the initial call (non-recursive call) of this function
1700 * was calculating transitions out of.
1701 *
1702 * Args:
1703 * CM_t *orig_cm - the original, template CM
1704 * CM_t *sub_cm - the sub CM
1705 * int orig_v - orig_cm state that maps to sub_v (1 of potentially 2)
1706 * int orig_y - orig_cm state that maps to sub_y (1 of potentially 2)
1707 * int sub_v - sub_cm state; we're calc'ing transitions out of sub_v
1708 * int sub_y - sub_cm state; we're calc'ing transitions into sub_y
1709 * int init_sub_start - the sub_cm state the initial (non-recursive call) had as sub_v.
1710 * double *orig_psi - for orig_cm: orig_psi[v] is the expected number of times state v
1711 * is entered in a CM parse
1712 *
1713 * Returns: FLOAT, the summed probability of all subpaths through the CM
1714 * starting at "start" and ending at "end" that don't pass through
1715 * orig_cm inserts that map to sub_cm insert states in the same node as init_sub_start.
1716 */
1717 static float
cm2sub_cm_sum_subpaths(CM_t * orig_cm,CM_t * sub_cm,CMSubMap_t * submap,int orig_v,int orig_y,int init_sub_start,char *** tmap,double * orig_psi)1718 cm2sub_cm_sum_subpaths(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int orig_v, int orig_y,
1719 int init_sub_start, char ***tmap, double *orig_psi)
1720 {
1721 int status;
1722 int v,x,y; /* state indices in the orig_cm */
1723 int start, end; /* min(orig_v, orig_y) and max(orig_v, orig_y) respectively */
1724 float to_return; /* the probability mass we're returning */
1725 char tmap_val; /* a value in the hard-coded tmap */
1726 int is_insert; /* TRUE if v is insert, FALSE if not */
1727 float insert_to_start; /* prob mass going into start that we must ignore (see code)*/
1728 float end_to_insert; /* prob mass going out of end that we must ignore (see code)*/
1729 int skip_flag; /* TRUE to skip state v's contribution (see code) */
1730 int init_sub_nd; /* sub_cm node containing init_sub_start */
1731 int sub_insert1; /* a sub_cm insert state in sub_cm node init_sub_nd, or -1 */
1732 int sub_insert2; /* a sub_cm insert state in sub_cm node init_sub_nd, or -1 */
1733 int orig_insert1; /* the orig_cm state that map to sub_insert1, or -1 */
1734 int orig_insert2; /* the orig_cm state that map to sub_insert2, or -1 */
1735 float self_loop_factor;/* for dealing with self-insert loops */
1736 int sub_start1; /* a sub_cm state that maps to orig_cm's 'start' */
1737 int sub_start2; /* a sub_cm state that maps to orig_cm's 'start' or -1 */
1738 int sub_end1; /* a sub_cm state that maps to orig_cm's 'end' */
1739 int sub_end2; /* a sub_cm state that maps to orig_cm's 'end' or -1 */
1740 double *sub_psi; /* sub_psi[v] is the expected number of times state v is *
1741 * entered given we started at state "start", and we *
1742 * didn't go through orig_insert1 or orig_insert2 */
1743
1744 if(orig_v == -1 || orig_y == -1)
1745 return 0.;
1746
1747 start = orig_v;
1748 end = orig_y;
1749 if(start > end)
1750 { start = orig_y; end = orig_v; }
1751 if(start == end)
1752 return orig_cm->t[start][0]; /* return self-insert probability */
1753
1754 /*printf("\nin cm2sub_cm_sum_subpaths2: start: %d | end: %d\n", start, end);*/
1755
1756 ESL_ALLOC(sub_psi, sizeof(double) * (end - start + 1));
1757 sub_psi[0] = 1.; /* Initialize sub_psi[0]. We have to start in "start" */
1758
1759 /* First we store some useful information for later in the function,
1760 * we do this and use descriptive variable names to make the code
1761 * later easier to follow. */
1762
1763 /* Determine the 1 or 2 sub_cm states that map to
1764 * the orig_cm state start and end, and store these
1765 * in sub_start{1,2} and sub_end{1,2} respectively. */
1766 sub_start1 = submap->o2s_smap[start][0];
1767 sub_start2 = submap->o2s_smap[start][1];
1768 sub_end1 = submap->o2s_smap[end ][0];
1769 sub_end2 = submap->o2s_smap[end ][1];
1770
1771 /* Determine the 0, 1, or 2 sub_cm insert states in the same
1772 * node as init_sub_start, store these in sub_insert1 and
1773 * sub_insert2.
1774 */
1775 init_sub_nd = sub_cm->ndidx[init_sub_start];
1776 orig_insert1 = orig_insert2 = sub_insert1 = sub_insert2 = -1;
1777 if(sub_cm->ndtype[init_sub_nd] == MATP_nd)
1778 {
1779 sub_insert1 = sub_cm->nodemap[init_sub_nd] + 4; /* MATP_IL */
1780 sub_insert2 = sub_cm->nodemap[init_sub_nd] + 5; /* MATP_IR */
1781 }
1782 else if(sub_cm->ndtype[init_sub_nd] == ROOT_nd)
1783 {
1784 sub_insert1 = 1; /* ROOT_IL */
1785 sub_insert2 = 2; /* ROOT_IR */
1786 }
1787 else if(sub_cm->ndtype[init_sub_nd] == MATL_nd ||
1788 sub_cm->ndtype[init_sub_nd] == MATR_nd ||
1789 sub_cm->ndtype[init_sub_nd] == BEGR_nd)
1790 sub_insert1 = sub_cm->cfirst[init_sub_start]; /* MAT{L,R}_I{L,R} or BEGR_IL */
1791
1792 /* Set orig_insert{1,2} as the orig_cm states that map to sub_insert{1,2}. */
1793 if(sub_insert1 != -1) orig_insert1 = submap->s2o_smap[sub_insert1][0];
1794 if(sub_insert2 != -1) orig_insert2 = submap->s2o_smap[sub_insert2][0];
1795
1796 /* Step through states between start and end, keeping track of prob mass. */
1797 for (v = (start+1); v <= end; v++)
1798 {
1799 sub_psi[v-start] = 0.; /* initialize */
1800 is_insert = FALSE;
1801 if(orig_cm->sttype[v] == IL_st || orig_cm->sttype[v] == IR_st)
1802 is_insert = TRUE;
1803 if(orig_cm->sttype[v] == S_st)
1804 {
1805 /* Previous state is either a BIF_B or a END_E, and there's no transitions
1806 * FROM previous state to this state, so we handle this in a special way.*/
1807 sub_psi[v-start] = sub_psi[(v-1)-start];
1808 }
1809 /* Determine if we should skip the contribution of state v because
1810 * it will be correctly counted in a subsequent call of this function
1811 * for a different sub_cm transition. We want to do this if:
1812 * (1) v is not end
1813 * (2) v is an orig_cm insert state (orig_insert{1,2}) that maps
1814 * to a sub_cm insert state (sub_insert{1,2}) in the same node
1815 * as init_sub_start.
1816 * (3) sub_insert{1,2} is equal to or downstream of init_sub_start
1817 * (4) a sub_cm transition exists between the sub_insert{1,2} and
1818 * a sub_cm state that maps to orig_cm start (sub_start{1,2})
1819 * or end (sub_end{1,2}).
1820 */
1821 skip_flag = FALSE;
1822 if (v != end && v == orig_insert1 && sub_insert1 >= init_sub_start)
1823 {
1824 if(cm_trans_check(sub_cm, sub_insert1, sub_end1 ) ||
1825 cm_trans_check(sub_cm, sub_insert1, sub_end2 ) ||
1826 cm_trans_check(sub_cm, sub_insert1, sub_start1) ||
1827 cm_trans_check(sub_cm, sub_insert1, sub_start2))
1828 skip_flag = TRUE;
1829 }
1830 else if (v != end && v == orig_insert2 && sub_insert2 >= init_sub_start)
1831 {
1832 if(cm_trans_check(sub_cm, sub_insert2, sub_end1 ) ||
1833 cm_trans_check(sub_cm, sub_insert2, sub_end2 ) ||
1834 cm_trans_check(sub_cm, sub_insert2, sub_start1) ||
1835 cm_trans_check(sub_cm, sub_insert2, sub_start2))
1836 skip_flag = TRUE;
1837 }
1838 if(!skip_flag)
1839 {
1840 for (y = orig_cm->pnum[v]-1; y >= is_insert; y--)
1841 {
1842 x = orig_cm->plast[v] - y;
1843 /* x is a parent of v, we're adding contribution of a transition from x to v. */
1844 tmap_val = tmap[(int) orig_cm->stid[x]][(int) orig_cm->ndtype[orig_cm->ndidx[v]+is_insert]][(int) orig_cm->stid[v]];
1845 /* assert(tmap_val != -1); */
1846 if((x - start) < 0) sub_psi[v-start] += 0.;
1847 else sub_psi[v-start] += sub_psi[x-start] * orig_cm->t[x][(int) tmap_val];
1848 }
1849 if(v != end && is_insert) /* if v is end, we don't include the self loop contribution */
1850 sub_psi[v-start] += sub_psi[v-start] *
1851 (orig_cm->t[v][0] / (1-orig_cm->t[v][0])); /* else we include the self loop contribution */
1852 }
1853 }
1854 to_return = sub_psi[end-start];
1855
1856 /* If start and end are both not inserts, we need to ignore some of the
1857 * probability mass that comes INTO 'start' and goes OUT OF 'end'.
1858 * Specifically we need to ignore the probability mass that
1859 * is accounted for by transitions either TO OR FROM the sub_cm insert
1860 * states in the same sub_cm node as init_sub_start (sub_insert{1,2}).
1861 * This code block is related to the block with the (for(v = start+1..) block
1862 * above involving the 'skip_flag' which skips states accounted for by
1863 * the sub_cm insert states sub_insert{1,2} between start..end, this
1864 * block handles the case where the orig_cm insert states that map
1865 * to sub_insert{1,2}, namely orig_insert{1,2}, fall outside start..end.
1866 */
1867 insert_to_start = 0.;
1868 end_to_insert = 0.;
1869
1870 if((orig_cm->sttype[start] != IL_st && orig_cm->sttype[start] != IR_st) &&
1871 (orig_cm->sttype[end] != IL_st && orig_cm->sttype[end] != IR_st))
1872 {
1873 if(orig_insert1 != -1 && orig_insert1 < start)
1874 {
1875 if(orig_cm->sttype[start] == IL_st || orig_cm->sttype[start] == IR_st)
1876 self_loop_factor = (1. + (orig_cm->t[start][0] / (1. - orig_cm->t[start][0])));
1877 else
1878 self_loop_factor = 1.0;
1879 insert_to_start += self_loop_factor * orig_psi[orig_insert1] *
1880 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_insert1, start, init_sub_start, tmap, orig_psi);
1881 }
1882 if(orig_insert1 != -1 && orig_insert1 > end)
1883 end_to_insert += cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, end, orig_insert1, init_sub_start, tmap, orig_psi);
1884
1885 if(orig_insert2 != -1 && orig_insert2 < start)
1886 {
1887 self_loop_factor = 1.0;
1888 if(orig_cm->sttype[start] == IL_st ||
1889 orig_cm->sttype[start] == IR_st)
1890 self_loop_factor = (1. + (orig_cm->t[start][0] / (1. - orig_cm->t[start][0])));
1891 insert_to_start += self_loop_factor * orig_psi[orig_insert2] * cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_insert2, start, init_sub_start, tmap, orig_psi);
1892 }
1893 if(orig_insert2 != -1 && orig_insert2 > end)
1894 end_to_insert += cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, end, orig_insert2, init_sub_start, tmap, orig_psi);
1895
1896 /*printf("\t\tinsert_to_start: %f sub_psi[0]: %f\n", insert_to_start, sub_psi[0]);
1897 printf("\t\tend_to_insert: %f sub_psi[end-start]: %f\n", end_to_insert, sub_psi[(end-start)]);*/
1898 }
1899 to_return *= (1. - (insert_to_start / orig_psi[start]));
1900 to_return *= (1. - end_to_insert);
1901 /*printf("***returning from cm2sub_cm_sum_subpaths (s: %d | e: %d): %f\n", start, end, to_return);*/
1902
1903 free(sub_psi);
1904 return (float) to_return;
1905
1906 ERROR:
1907 cm_Fail("Memory allocation error.");
1908 return 0.; /* never reached */
1909 }
1910
1911 /**************************************************************************
1912 * EPN 09.01.06
1913 * debug_print_cm_params()
1914 *
1915 * Purpose: Print out emission and transition probabilities and scores
1916 * for a CM.
1917 *
1918 * Args:
1919 * fp stdout often
1920 * CM_t *cm
1921 * Returns: (void)
1922 */
1923 void
debug_print_cm_params(FILE * fp,CM_t * cm)1924 debug_print_cm_params(FILE *fp, CM_t *cm)
1925 {
1926 int status;
1927 int v, i;
1928 int yoffset;
1929
1930 char **nodetypes;
1931 char **sttypes;
1932
1933 ESL_ALLOC(nodetypes, sizeof(char *) * 8);
1934 nodetypes[0] = "BIF";
1935 nodetypes[1] = "MATP";
1936 nodetypes[2] = "MATL";
1937 nodetypes[3] = "MATR";
1938 nodetypes[4] = "BEGL";
1939 nodetypes[5] = "BEGR";
1940 nodetypes[6] = "ROOT";
1941 nodetypes[7] = "END";
1942
1943 ESL_ALLOC(sttypes, sizeof(char *) * 10);
1944 sttypes[0] = "D";
1945 sttypes[1] = "MP";
1946 sttypes[2] = "ML";
1947 sttypes[3] = "MR";
1948 sttypes[4] = "IL";
1949 sttypes[5] = "IR";
1950 sttypes[6] = "S";
1951 sttypes[7] = "E";
1952 sttypes[8] = "B";
1953 sttypes[9] = "EL";
1954
1955 fprintf(fp, "cm->nodes: %d\n", cm->nodes);
1956 fprintf(fp, "cm->M: %d\n", cm->M);
1957 for(v = 0; v < cm->M; v++)
1958 {
1959 fprintf(fp, "v:%4d:%4d %4s %2s\n", v, cm->ndidx[v], nodetypes[(int) cm->ndtype[cm->ndidx[v]]], sttypes[(int) cm->sttype[v]]);
1960 if(cm->nodemap[cm->ndidx[v]] == v)
1961 fprintf(fp, "beg: %0.6f (%.6f %10d)| end %0.6f (%.6f %10d)\n",
1962 cm->begin[v], cm->beginsc[v], cm->ibeginsc[v],
1963 cm->end[v], cm->endsc[v], cm->iendsc[v]);
1964 if(cm->sttype[v] == MP_st)
1965 {
1966 fprintf(fp, "\tE: ");
1967 for(i = 0; i < cm->abc->K*cm->abc->K; i++)
1968 fprintf(fp, "%0.6f (%.6f %6d) ", cm->e[v][i], cm->esc[v][i], cm->iesc[v][i]);
1969 fprintf(fp, "\n");
1970 }
1971 else if(cm->sttype[v] == ML_st ||
1972 cm->sttype[v] == MR_st ||
1973 cm->sttype[v] == IL_st ||
1974 cm->sttype[v] == IR_st)
1975 {
1976 fprintf(fp, "\tE: ");
1977 for(i = 0; i < cm->abc->K; i++)
1978 fprintf(fp, "%0.6f (%0.6f %10d) ", cm->e[v][i], cm->esc[v][i], cm->iesc[v][i]);
1979 fprintf(fp, "\n");
1980 }
1981 if(cm->sttype[v] != B_st && cm->sttype[v] != E_st)
1982 {
1983 fprintf(fp, "\tT: ");
1984 for(yoffset = 0; yoffset < cm->cnum[v]; yoffset++)
1985 fprintf(fp, "%0.6f (%0.6f %10d) ", cm->t[v][yoffset], cm->tsc[v][yoffset], cm->itsc[v][yoffset]);
1986 fprintf(fp, "\n");
1987 }
1988 else if(cm->sttype[v] == B_st)
1989 {
1990 fprintf(fp, "\tL: %d | R: %d\n", cm->cfirst[v], cm->cnum[v]);
1991 }
1992 else if(cm->sttype[v] == E_st)
1993 fprintf(fp, "\n\n");
1994 }
1995 fprintf(fp, "\n\n");
1996 free(nodetypes);
1997 free(sttypes);
1998 return;
1999
2000 ERROR:
2001 cm_Fail("Memory allocation error.");
2002 }
2003
2004 /**************************************************************************
2005 * EPN 09.01.06
2006 * Function: check_sub_cm_by_sampling()
2007 *
2008 * Purpose: Given a CM and a sub CM that is supposed to mirror
2009 * the CM as closely as possible between two given consensus
2010 * columns (spos and epos), check that the sub_cm was correctly
2011 * constructed.
2012 *
2013 * The current approach is to build a CM Plan 9 HMM from the
2014 * sub CM, then sample from the CM and see if the samples
2015 * were likely drawn from the CM Plan 9 distributions.
2016 * This is done inside CP9_cm2wrhmm::CP9_check_by_sampling().
2017 *
2018 * Args:
2019 * orig_cm - the original, template CM
2020 * sub_cm - the sub CM built from the orig_cm
2021 * errbuf - for error messages
2022 * r - source of randomness
2023 * submap - map from orig_cm to sub_cm and vice versa
2024 * subinfo - sub cm information
2025 * chi_thresh - rejection threshold for chi-squared tests
2026 * nsamples - number of samples to use to build the ML HMM
2027 * print_flag - TRUE to print useful info for debugging
2028 *
2029 * Returns: eslOK if CM and sub CM are "close enough" (see code)
2030 * eslFAIL otherwise, errbuf is filled
2031 */
2032 int
check_sub_cm_by_sampling(CM_t * orig_cm,CM_t * sub_cm,char * errbuf,ESL_RANDOMNESS * r,CMSubMap_t * submap,CMSubInfo_t * subinfo,float chi_thresh,int nsamples,int print_flag)2033 check_sub_cm_by_sampling(CM_t *orig_cm, CM_t *sub_cm, char *errbuf, ESL_RANDOMNESS *r, CMSubMap_t *submap, CMSubInfo_t *subinfo,
2034 float chi_thresh, int nsamples, int print_flag)
2035 {
2036 int status;
2037 CP9_t *orig_hmm; /* constructed CP9 HMM from the original cm */
2038 CP9_t *sub_hmm; /* constructed CP9 HMM from the sub_cm */
2039 CP9Map_t *orig_cp9map; /* maps the orig_cm to the orig_hmm and vice versa */
2040 CP9Map_t *sub_cp9map; /* maps the sub_cm to the sub_hmm and vice versa */
2041
2042 /* Build two CP9 HMMs, one for the orig_cm and one for the sub_cm */
2043 if((status = build_cp9_hmm(orig_cm, errbuf, FALSE, 0.0001, print_flag, &orig_hmm, &orig_cp9map)) != eslOK) return status;
2044 if((status = build_cp9_hmm(sub_cm, errbuf, FALSE, 0.0001, print_flag, &sub_hmm, &sub_cp9map)) != eslOK) return status;
2045 CP9Logoddsify(orig_hmm);
2046 CP9Logoddsify(sub_hmm);
2047
2048 /* Look for 'impossible' cases where we know the sub_cm
2049 * construction procedure fails, in that the distribution of transitions out of CP9 nodes
2050 * built from the sub_cm will be the same distros out of corresponding CP9 nodes built from
2051 * the full CM. */
2052 cm2sub_cm_find_impossible_misc_cases(orig_cm, sub_cm, submap, subinfo, orig_cp9map, sub_cp9map, print_flag);
2053 cm2sub_cm_find_impossible_matr_cases(orig_cm, sub_cm, submap, subinfo, orig_cp9map, sub_cp9map, print_flag);
2054
2055 if((status = CP9_check_by_sampling(orig_cm, sub_hmm, errbuf, r, subinfo, submap->spos, submap->epos, chi_thresh, nsamples, print_flag)) != eslOK) return status;
2056 if(print_flag) printf("CM Plan 9 built from sub_cm passed sampling check; sub_cm was built correctly.\n");
2057
2058 FreeCPlan9(orig_hmm);
2059 FreeCPlan9(sub_hmm);
2060 FreeCP9Map(orig_cp9map);
2061 FreeCP9Map(sub_cp9map);
2062 return eslOK;
2063 }
2064
2065
2066 /**************************************************************************
2067 * EPN 09.07.06
2068 * check_orig_psi_vs_sub_psi()
2069 *
2070 * Purpose: Check that the psi values for an original, template CM and
2071 * a sub CM are withing a given leeway threshold, given maps
2072 * from the states of the original to the sub and vice versa.
2073 *
2074 * Args:
2075 * orig_cm - the original, template CM
2076 * sub_cm - the sub CM
2077 * errbuf - for error messages
2078 * submap - the map data structure for the sub CM
2079 * threshold - the threshold that mapping (potentially summed) psi
2080 * values are allowed to be different by, without throwing an error.
2081 * print_flag - TRUE to print out the values, FALSE not to
2082 *
2083 * Returns: eslOK if CM and sub CM are "close enough" (see code)
2084 * eslFAIL otherwise, errbuf is filled
2085 */
2086 int
check_orig_psi_vs_sub_psi(CM_t * orig_cm,CM_t * sub_cm,char * errbuf,CMSubMap_t * submap,double threshold,int print_flag)2087 check_orig_psi_vs_sub_psi(CM_t *orig_cm, CM_t *sub_cm, char *errbuf,CMSubMap_t *submap, double threshold,
2088 int print_flag)
2089 {
2090 int v_s; /* sub_cm state index*/
2091 int v_o; /* orig_cm state index*/
2092 double temp_psi;
2093 int violation;
2094 int v_ct; /* Number of violations */
2095 int detached_insert;
2096 int is_insert;
2097 int root_v_ct;
2098 float diff;
2099 double *orig_psi; /* expected num times each state visited in orig_cm */
2100 double *sub_psi; /* expected num times each state visited in sub_cm */
2101
2102 /* Fill orig_psi and sub_psi parameters. */
2103 if((orig_psi = cm_ExpectedStateOccupancy(orig_cm)) == NULL) ESL_FAIL(eslFAIL, errbuf, "check_orig_psi_vs_sub_cm(), unable to calculate expected state occupancy for orig_cm()");
2104 if((sub_psi = cm_ExpectedStateOccupancy(sub_cm)) == NULL) ESL_FAIL(eslFAIL, errbuf, "check_orig_psi_vs_sub_cm(), unable to calculate expected state occupancy for sub_cm()");
2105
2106 if(print_flag)
2107 {
2108 printf("Printing psi in check_orig_psi_vs_sub_psi():\n");
2109 for(v_o = 0; v_o < orig_cm->M; v_o++)
2110 printf("orig_psi[%4d]: %.6f\n", v_o, orig_psi[v_o]);
2111
2112 }
2113
2114 v_ct = 0;
2115 root_v_ct = 0;
2116 if(print_flag == TRUE) printf("\n");
2117 for(v_s = 0; v_s < sub_cm->M; v_s++)
2118 {
2119 detached_insert = FALSE;
2120 if(sub_cm->sttype[v_s] == IL_st || sub_cm->sttype[v_s] == IR_st)
2121 is_insert = TRUE;
2122 else
2123 is_insert = FALSE;
2124
2125 if(print_flag) printf("\tv_s: %4d (%.6f) ", v_s, sub_psi[v_s]);
2126 v_o = submap->s2o_smap[v_s][0];
2127 if(sub_cm->sttype[v_s+1] == E_st) detached_insert = TRUE;
2128 if(v_o != -1)
2129 {
2130 if(print_flag) printf("v_o1: %4d (%.6f) ", v_o, orig_psi[v_o]);
2131 temp_psi = orig_psi[v_o];
2132 v_o = submap->s2o_smap[v_s][1];
2133 if(v_o != -1)
2134 {
2135 if(is_insert) /* this insert state maps to 2 orig_cm inserts */
2136 ESL_FAIL(eslFAIL, errbuf, "check_orig_psi_vs_sub_psi(), sub insert state maps to 2 orig_cm inserts.");
2137 temp_psi += orig_psi[v_o];
2138 if(print_flag)
2139 printf("v_o2: %4d (%.6f)\n", v_o, orig_psi[v_o]);
2140 }
2141 else
2142 if(print_flag) printf("\n");
2143
2144 violation = FALSE;
2145 if(detached_insert)
2146 temp_psi = 0.0;
2147
2148 /* 10.20.06 Found an exceedingly rare case (2 cases in all possible sub CM
2149 * models of RMARK) where psi test fails with diff < 0.00002 but > 0.00001
2150 * (our default). Both cases involve insert self loops with p > 0.9,
2151 * the reason (I'm pretty sure) these guys fail is because when the
2152 * contribution of the self insertion loop is included in fill_psi, even
2153 * if the self-insert probs for a sub_cm and orig_cm state are equal, if
2154 * the psi values for that state BEFORE the contribution of the self insert
2155 * is added are > 0.0000001 or so, the self insert contributions amplifies
2156 * that difference above our 0.00001 threshold. This only happens if the
2157 * self insert probs are really high and explains the rareness of this case.
2158 * The approach to fixing it is to subtract out the self loop contribution
2159 * prior to checking if we exceed our threshold.
2160 */
2161 if(is_insert && !detached_insert)
2162 {
2163 diff = (sub_psi[v_s] - (sub_psi[v_s] * sub_cm->t[v_s][0])) -
2164 (temp_psi - (temp_psi * orig_cm->t[submap->s2o_smap[v_s][0]][0]));
2165 }
2166 else
2167 diff = sub_psi[v_s] - temp_psi;
2168 if((diff > threshold) || ((-1. * diff) > threshold))
2169 {
2170 violation = TRUE;
2171 v_ct++;
2172 if((sub_cm->ndidx[v_s] == 0) || (sub_cm->ndidx[v_s] == 1))
2173 {
2174 root_v_ct++;
2175 }
2176 }
2177 if(violation)
2178 printf("sub: %.6f | orig: %.6f | diff: %.6f VIOLATION\n\n", sub_psi[v_s], temp_psi, (sub_psi[v_s]-temp_psi));
2179 else if(detached_insert && print_flag)
2180 printf("sub: %.6f | orig: %.6f | diff: %.6f (DEAD INSERT)\n\n", sub_psi[v_s], temp_psi, (sub_psi[v_s]-temp_psi));
2181 else if(print_flag)
2182 printf("sub: %.6f | orig: %.6f | diff: %.6f\n\n", sub_psi[v_s], temp_psi, (sub_psi[v_s]-temp_psi));
2183 }
2184 else
2185 {
2186 if(!detached_insert &&
2187 sub_cm->sttype[v_s] != E_st &&
2188 sub_cm->sttype[v_s] != B_st &&
2189 sub_cm->sttype[v_s] != S_st &&
2190 sub_cm->sttype[v_s] != EL_st)
2191 ESL_FAIL(eslFAIL, errbuf, "check_orig_psi_vs_sub_psi() state v_s:%d maps to nothing and its not E,B,S,EL\n", v_s);
2192 if(print_flag) printf("E B S or EL\n");
2193 }
2194 }
2195
2196 if(v_ct > 0) ESL_FAIL(eslFAIL, errbuf, "check_orig_psi_vs_sub_psi(): check failed");
2197 else
2198 if(print_flag) printf("v_ct is 0 with thresh: %f!\n", threshold);
2199 if(root_v_ct > 0)
2200 printf("ROOT v_ct is %d with thresh: %f!\n", root_v_ct, threshold);
2201
2202 /* Cleanup and exit. */
2203 free(orig_psi);
2204 free(sub_psi);
2205
2206 return eslOK;
2207 }
2208
2209 /**************************************************************************
2210 * EPN 09.11.06
2211 * cm_trans_check()
2212 *
2213 * Return TRUE if there's a transition in the CM from state a to state b.
2214 *
2215 * Args:
2216 * CM_t cm,
2217 * int a;
2218 * int b;
2219 * Returns: TRUE if b is a child of a (a->b exists)
2220 * FALSE otherwise
2221 */
2222 int
cm_trans_check(CM_t * cm,int a,int b)2223 cm_trans_check(CM_t *cm, int a, int b)
2224 {
2225 /*printf("\t**in cm_trans_check a: %d | b: %d\n", a, b);*/
2226
2227 if((a == -1 || b == -1) || (a > b))
2228 return FALSE;
2229
2230 if((b - cm->cfirst[a]) < cm->cnum[a])
2231 { /*printf("returning TRUE") ;*/ return TRUE; }
2232
2233 return FALSE;
2234
2235 }
2236
2237 /**************************************************************************
2238 * EPN 09.21.06
2239 * Function: cm2sub_cm_subtract_root_subpaths()
2240 *
2241 * Purpose: When building a sub CM (sub_cm) from an original, template
2242 * CM (orig_cm), there's special considerations that must be
2243 * taken involving sub_cm nodes with 2 insert states where there
2244 * isn't an exactly identical node in the original CM. The only
2245 * node that potentially meets this criteria is the ROOT_nd because
2246 * all MATP nodes in sub_cm must necessarily also exist in the
2247 * orig_cm.
2248 *
2249 * The problem is that we have overcounted certain subpaths when
2250 * transitioning out of the sub_cm ROOT_S and ROOT_IL states.
2251 *
2252 * For each sub_cm ROOT_IL state, there are up to 2 orig_cm inserts
2253 * that map to it (orig_il1 and orig_il2), and analagously for sub_cm
2254 * ROOT_IR (orig_ir1 and orig_ir2). Then there are up to 2 sub_cm states
2255 * that map to each of the split set states in sub_cm node 1 (orig_ss1
2256 * and orig_ss2). The subpaths that we have overcounted is dependent
2257 * on the relationship between orig_il*, orig_ir*, and orig_ss*.
2258 * There are six cases of this relationship:
2259 *
2260 * cases 1A, 1B, 1C apply when il < ir.
2261 * case 1A: il < ir < ss (this is correctly handled by cm2sub_cm_sum_subpaths())
2262 * case 1B: il < ss < ir
2263 * case 1C: ss < il < ir
2264 *
2265 * cases 2A, 2B, 2C apply when ir < il.
2266 * case 2A: ir < il < ss
2267 * case 2B: ir < ss < il
2268 * case 2C: ss < ir < il
2269 *
2270 * These cases are not explicitly checked for in the code but were
2271 * useful for determining the correct strategy to use here, and
2272 * are mentioned in comments throughout the code.
2273 *
2274 * This function calls a helper function for each pair of orig_il* and
2275 * orig_ir* (up to 4 possible pairs), which actually subtracts
2276 * the path based on the relationships of orig_il*, orig_ir* and
2277 * orig_ss*.
2278 *
2279 * Importantly, though there are potentially many orig_ss* states
2280 * (up to 2 that map to each sub_cm node 1 split set state),
2281 * they all must be close enough in state indices that we'll
2282 * never have a case where for a specific il* and ir* we get more
2283 * than 1 of the 6 possible cases for all orig_ss*. This is
2284 * because either the orig_ss are in a sub_cm MATP node - in
2285 * which case they MUST map to exactly 1 state each in the orig_cm
2286 * (to the corresponding MATP node) and are contiguous state indices,
2287 * or they are a MATL or MATR which either map to a corresponding
2288 * MATL, MATR (in which case they're contiguous) or each sub_cm
2289 * split state maps to exactly 2 states in the same orig_cm MATP
2290 * node, (for example MATL_ML might map to MATP_MP and MATP_ML)
2291 * which are all in the same node and thus have no insert states
2292 * with a state index between the indices of the 2 states they
2293 * map to (ouch).
2294 *
2295 *
2296 * Args:
2297 * CM_t *orig_cm - the original, template CM
2298 * CM_t *sub_cm - the sub CM
2299 * double *orig_psi - for orig_cm: orig_psi[v] is the expected number of times state v is entered
2300 * in a CM parse
2301 * char ***tmap - the hard-coded transition map
2302 * CMSubMap_t *submap- the map from the sub CM to the template CM
2303 * int print_flag - TRUE to print useful debugging info
2304 *
2305 * Returns: VOID
2306 */
2307 static void
cm2sub_cm_subtract_root_subpaths(CM_t * orig_cm,CM_t * sub_cm,double * orig_psi,char *** tmap,CMSubMap_t * submap,int print_flag)2308 cm2sub_cm_subtract_root_subpaths(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap,
2309 CMSubMap_t *submap, int print_flag)
2310
2311 {
2312 int sub_root_s;
2313 int sub_il;
2314 int sub_ir;
2315 int yoffset;
2316 int sub_y;
2317 int orig_y;
2318 int orig_il;
2319 int orig_ir;
2320 int orig_ss;
2321 int orig_ss1;
2322 int orig_ss2;
2323
2324 sub_root_s = 0; /* sub_cm ROOT_S index */
2325 sub_il = 1; /* sub ROOT_IL index */
2326 sub_ir = 2; /* sub ROOT_IR index */
2327 orig_il = submap->s2o_smap[sub_il][0];
2328 orig_ir = submap->s2o_smap[sub_ir][0];
2329
2330 if(print_flag) printf("\n\nin cm2sum_cm_subtract_root_subpaths_helper: orig_il: %d orig_ir: %d\n", orig_il, orig_ir);
2331
2332 for(yoffset = 0; yoffset < sub_cm->cnum[0]; yoffset++)
2333 if(print_flag) printf("Before t[0][%d] = %f\n", yoffset, sub_cm->t[0][yoffset]);
2334
2335 orig_ss = submap->s2o_smap[3][0]; /* orig_ss is the 1 (of possibly 2) orig_cm states that map to the first
2336 * state in sub_cm node 1 (sub_cm state 3)
2337 */
2338 /* Check to make sure that all the split states meet our guarantee (not necessary) */
2339 for(yoffset = 2; yoffset < sub_cm->cnum[0]; yoffset++) /* note we start at yoffset = 2
2340 * ROOT_S -> first state of next
2341 * node's split set. */
2342 {
2343 sub_y = yoffset + sub_cm->cfirst[0];
2344 orig_y = submap->s2o_smap[sub_y][0];
2345 if((orig_il > orig_ss && orig_il < orig_y) ||
2346 (orig_il < orig_ss && orig_il > orig_y) ||
2347 (orig_ir > orig_ss && orig_ir < orig_y) ||
2348 (orig_ir < orig_ss && orig_ir > orig_y))
2349 cm_Fail("ERROR in cm2sub_cm_subtract_root_subpaths_helper() split set state guarantee violated!\n");
2350 }
2351
2352
2353 /* Check for which of the 6 cases we have (not actually necessary) */
2354 if((orig_il < orig_ir) && (orig_ir < orig_ss))
2355 if(print_flag) printf("ROOT NODE case 1A\n");
2356 if((orig_il < orig_ss) && (orig_ss < orig_ir))
2357 if(print_flag) printf("ROOT NODE case 1B\n");
2358 if((orig_ss < orig_il) && (orig_il < orig_ir))
2359 if(print_flag) printf("ROOT NODE case 1C\n");
2360
2361 if((orig_ir < orig_il) && (orig_il < orig_ss))
2362 if(print_flag) printf("ROOT NODE case 2A\n");
2363 if((orig_ir < orig_ss) && (orig_ss < orig_il))
2364 if(print_flag) printf("ROOT NODE case 2B\n");
2365 if((orig_ss < orig_ir) && (orig_ir < orig_il))
2366 if(print_flag) printf("ROOT NODE case 2C\n");
2367
2368
2369 /* First adjust counts out of sub_cm ROOT_S */
2370
2371 /* if orig_ir < orig_il, we've counted paths from
2372 * ir -> il correctly for ROOT_IL -> ROOT_IR and
2373 * incorrectly for ROOT_S -> ROOT_IR
2374 */
2375 if(orig_ir < orig_il)
2376 {
2377 if(print_flag) printf("0 sub from S->IR, IR -> IL\n"); /* cases 2A, 2B, 2C */
2378 sub_cm->t[sub_root_s][1] -= orig_psi[orig_ir] *
2379 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir, orig_il, 0, tmap, orig_psi);
2380 }
2381
2382 /* All the remaining paths to be subtracted involve the split set state of
2383 * node 1. */
2384 for(yoffset = 0; yoffset < sub_cm->cnum[0]; yoffset++)
2385 {
2386 sub_y = sub_cm->cfirst[0] + yoffset;
2387 orig_ss1 = submap->s2o_smap[sub_y][0];
2388 orig_ss2 = submap->s2o_smap[sub_y][1];
2389 if(sub_cm->ndidx[sub_y] != 0)
2390 {
2391 /* Adjust counts out of ROOT_IL if necessary */
2392
2393 /* if orig_ir < orig_ss < orig_il, we've counted paths from
2394 * ir -> ss -> il correctly for ROOT_IL -> ROOT_IR and
2395 * incorrectly for ROOT_IL -> ROOT_SS
2396 */
2397 if(orig_ir < orig_ss && orig_ss < orig_il) /* case 2B only */
2398 {
2399 if(print_flag) printf("3 (2b) sub from IL, IR -> SS -> IL\n");
2400
2401 /* subtract paths from orig_ir -> orig_ss1 -> orig_il */
2402 sub_cm->t[sub_il][yoffset] -= orig_psi[orig_ir] *
2403 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir, orig_ss1,
2404 sub_il, tmap, orig_psi) *
2405 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ss1, orig_il,
2406 sub_il, tmap, orig_psi);
2407 if(orig_ss2 != -1)
2408 /* subtract paths from orig_ir -> orig_ss2 -> orig_il */
2409 sub_cm->t[1][yoffset] -= orig_psi[orig_ir] *
2410 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir, orig_ss2,
2411 sub_il, tmap, orig_psi) *
2412 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ss2, orig_il,
2413 sub_il, tmap, orig_psi);
2414
2415 }
2416
2417 /* if orig_ss < orig_il < orig_ir, we've counted paths from
2418 * ss -> il -> ir correctly for ROOT_IR -> ROOT_SS and
2419 * incorrectly for ROOT_IL -> ROOT_SS
2420 */
2421 if(orig_ss < orig_il && orig_il < orig_ir) /* case 1C only */
2422 {
2423 if(print_flag) printf("4 (1c) sub from IL, SS -> IL -> IR\n");
2424
2425 /* subtract paths from orig_ss1 -> orig_il (add self insert) -> orig_ir */
2426 sub_cm->t[sub_il][yoffset] -= orig_psi[orig_ss1] *
2427 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ss1, orig_il,
2428 sub_il, tmap, orig_psi) *
2429 (1. + (orig_cm->t[orig_il][0] / (1 - orig_cm->t[orig_il][0]))) *
2430 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_il, orig_ir,
2431 sub_il, tmap, orig_psi);
2432
2433 if(orig_ss2 != -1)
2434 /* subtract paths from orig_ss2 -> orig_il (add self insert) -> orig_ir */
2435 sub_cm->t[sub_il][yoffset] -= orig_psi[orig_ss2] *
2436 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ss2, orig_il,
2437 sub_il, tmap, orig_psi) *
2438 (1. + (orig_cm->t[orig_il][0] / (1 - orig_cm->t[orig_il][0]))) *
2439 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_il, orig_ir,
2440 sub_il, tmap, orig_psi);
2441 }
2442
2443 /* if orig_ir < orig_il < orig_ss, we've counted paths from
2444 * ir -> il -> ss correctly for ROOT_IR -> ROOT_SS and
2445 * incorrectly for ROOT_IL -> ROOT_SS
2446 */
2447 if(orig_ir < orig_il && orig_il < orig_ss) /* case 2A only */
2448 {
2449 if(print_flag) printf("5 (2a) sub from IL, IR -> IL -> SS\n");
2450
2451 /* subtract paths from orig_ir -> orig_il (add self insert) -> orig_ss1 */
2452 sub_cm->t[sub_il][yoffset] -= orig_psi[orig_ir] *
2453 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir, orig_il,
2454 sub_il, tmap, orig_psi) *
2455 (1. + (orig_cm->t[orig_il][0] / (1 - orig_cm->t[orig_il][0]))) *
2456 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_il, orig_ss1,
2457 sub_il, tmap, orig_psi);
2458
2459 if(orig_ss2 != -1)
2460 /* subtract paths from orig_ir -> orig_il (add self insert) -> orig_ss2 */
2461 sub_cm->t[sub_il][yoffset] -= orig_psi[orig_ir] *
2462 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir, orig_il,
2463 sub_il, tmap, orig_psi) *
2464 (1. + (orig_cm->t[orig_il][0] / (1 - orig_cm->t[orig_il][0]))) *
2465 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_il, orig_ss2,
2466 sub_il, tmap, orig_psi);
2467
2468 }
2469
2470 /* if orig_il < orig_ss < orig_ir, we've counted paths from
2471 * il -> ss -> ir correctly for ROOT_IL -> ROOT_IR and
2472 * incorrectly for ROOT_IL -> ROOT_SS
2473 */
2474 if(orig_il < orig_ss && orig_ss < orig_ir) /* case 1B only */
2475 {
2476 if(print_flag) printf("6 (1b) sub from IL, IL -> SS -> IR\n");
2477 if(print_flag) printf("1B before sub 1: sub_cm->t[sub_il:%d][yoffset:%d]: %f\n", sub_il, yoffset, sub_cm->t[sub_il][yoffset]);
2478 /* subtract paths from orig_il -> orig_ss1 -> orig_ir */
2479 sub_cm->t[sub_il][yoffset] -= orig_psi[orig_il] *
2480 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_il, orig_ss1,
2481 sub_il, tmap, orig_psi) *
2482 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ss1, orig_ir,
2483 sub_il, tmap, orig_psi);
2484
2485 if(print_flag) printf("1B before sub 2: sub_cm->t[sub_il:%d][yoffset:%d]: %f\n", sub_il, yoffset, sub_cm->t[sub_il][yoffset]);
2486 if(orig_ss2 != -1)
2487 /* subtract paths from orig_il -> orig_ss2 -> orig_ir */
2488 sub_cm->t[sub_il][yoffset] -= orig_psi[orig_il] *
2489 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_il, orig_ss2,
2490 sub_il, tmap, orig_psi) *
2491 cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ss2, orig_ir,
2492 sub_il, tmap, orig_psi);
2493 if(print_flag) printf("1B after sub 2: sub_cm->t[sub_il:%d][yoffset:%d]: %f\n", sub_il, yoffset, sub_cm->t[sub_il][yoffset]);
2494
2495 }
2496 }
2497 }
2498 for(yoffset = 0; yoffset < sub_cm->cnum[0]; yoffset++)
2499 if(print_flag) printf("After t[0][%d] = %f\n", yoffset, sub_cm->t[0][yoffset]);
2500
2501 return;
2502 }
2503
2504
2505 /**************************************************************************
2506 * EPN 09.11.06
2507 * cm2sub_cm_check_id_next_node()
2508 *
2509 * It's known that sub_nd and orig_nd are identical nodes, in that they
2510 * are of the same type and model the same column of the seed alignment.
2511 * In this function, we check if the next node of the sub_cm and orig_nd
2512 * have the same relationship and if so, we update information in
2513 * the submap->s2o_id array, setting the sub_cm states of this node to
2514 * TRUE.
2515 *
2516 * Returns: void
2517 *
2518 * Args:
2519 * CM_t orig_cm
2520 * CM_t sub_cm
2521 * int orig_nd
2522 * int sub_nd
2523 * CMSubMap_t *submap
2524 * int sub_start
2525 */
2526
2527 static int
cm2sub_cm_check_id_next_node(CM_t * orig_cm,CM_t * sub_cm,int orig_nd,int sub_nd,CMSubMap_t * submap,CP9Map_t * orig_cp9map,CP9Map_t * sub_cp9map,int print_flag)2528 cm2sub_cm_check_id_next_node(CM_t *orig_cm, CM_t *sub_cm, int orig_nd, int sub_nd,
2529 CMSubMap_t *submap, CP9Map_t *orig_cp9map,
2530 CP9Map_t *sub_cp9map, int print_flag)
2531 {
2532 int v_s;
2533 int left_check, right_check;
2534 left_check = FALSE;
2535 right_check = FALSE;
2536
2537 if((orig_nd+1) > (orig_cm->nodes-1))
2538 return FALSE;
2539 if((sub_nd+1) > (sub_cm->nodes-1))
2540 return FALSE;
2541 if(orig_cm->ndtype[orig_nd] != sub_cm->ndtype[sub_nd])
2542 return FALSE;
2543 if(orig_cm->ndtype[orig_nd+1] != sub_cm->ndtype[sub_nd+1])
2544 return FALSE;
2545
2546 if(orig_cp9map->nd2lpos[orig_nd+1] == -1 && sub_cp9map->nd2lpos[sub_nd+1] == -1)
2547 left_check = TRUE;
2548 if(orig_cp9map->nd2lpos[orig_nd+1] == (sub_cp9map->nd2lpos[sub_nd+1] + submap->spos -1))
2549 left_check = TRUE;
2550 if(orig_cp9map->nd2rpos[orig_nd+1] == -1 && sub_cp9map->nd2rpos[sub_nd+1] == -1)
2551 right_check = TRUE;
2552 if(orig_cp9map->nd2rpos[orig_nd+1] == (sub_cp9map->nd2rpos[sub_nd+1] + submap->spos -1))
2553 right_check = TRUE;
2554
2555 if(left_check && right_check)
2556 {
2557 v_s = sub_cm->nodemap[sub_nd];
2558 while(sub_cm->ndidx[v_s] == sub_nd)
2559 {
2560 if(print_flag) printf("setting submap->s2o_id[v_s:%d] to TRUE\n", v_s);
2561 if(sub_cm->sttype[v_s+1] != E_st) submap->s2o_id[v_s] = TRUE; /* if v+1 is an E_st, it's a detached insert don't set s2o_id[v] to TRUE */
2562 v_s++;
2563 }
2564 return TRUE;
2565 }
2566 return FALSE;
2567 }
2568
2569 /**************************************************************************
2570 * EPN 10.05.06
2571 * cm2sub_cm_find_impossible_misc_cases
2572 *
2573 * For certain situations, the conversion of an orig_cm to a sub_cm loses
2574 * some information that makes it impossible for a CP9 trained from the sub_cm
2575 * to exactly match a CP9 trained from the orig_cm for the corresponding
2576 * columns. One case where it is impossible involves start states as follows:
2577 *
2578 * if for any k k=spos..epos-1
2579 * X >= 0 start states exists in the orig_cm in a node between:
2580 * orig_cp9map->pos2nd[k] -> orig_cp9map->pos2nd[k+1]
2581 * AND Y >= 1 start states exist in the sub_cm in a node between:
2582 * sub_cp9map->pos2nd[k-spos+1] -> sub_cp9map->pos2nd[k-spos+1+1]
2583 * where X != Y
2584 *
2585 * AND further one or both of the two nodes in the sub_cm (sub_cp9map->pos2nd[k-spos+1]
2586 * OR sub_cp9map->pos2nd[k-spos+1+1] must be a MATP.
2587 *
2588 * This is because for the sub_cm paths that would go from CP9 node k to k+1
2589 * were forced to go through a start state where as for the orig_cm there
2590 * was not a requirement to go through a start. Therefore when the
2591 * sub_cm was constructed it lost some information about the original
2592 * transitions.
2593 *
2594 * Also, a special situation of this case occurs with transitions out of
2595 * CP9 node k=spos-1 to k=spos, and out of CP9 node k=epos to k=epos+1.
2596 * The sub_cm node that models columns spos-1 and epos-1 is the ROOT node,
2597 * which only really models inserts in those columns. It turns out that
2598 * the transition distributions will nearly always be screwy out of these
2599 * two nodes, except in the case when the sub_cm ROOT_IL and ROOT_IR map
2600 * to an IL and IR state respectivley in the original *within the same
2601 * orig_cm node*. In which case the transitions out of node 0 will be
2602 * identical to those out of node spos-1 in the orig_cm.
2603 * There are a few other rare cases where the transitions will be
2604 * identical, but an exhaustive understanding of them eludes me.
2605 * (see ~nawrockie/notebook/6_0725_inf_sub_cm/00LOG for more).
2606 *
2607 * Returns: void
2608 *
2609 * Args:
2610 * CM_t orig_cm
2611 * CM_t sub_cm
2612 * int *orig_cp9map->pos2nd
2613 * int *sub_cp9map->pos2nd
2614 * int *imp_cc
2615 * int spos;
2616 * int epos;
2617 */
2618
2619 static void
cm2sub_cm_find_impossible_misc_cases(CM_t * orig_cm,CM_t * sub_cm,CMSubMap_t * submap,CMSubInfo_t * subinfo,CP9Map_t * orig_cp9map,CP9Map_t * sub_cp9map,int print_flag)2620 cm2sub_cm_find_impossible_misc_cases(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, CMSubInfo_t *subinfo,
2621 CP9Map_t *orig_cp9map, CP9Map_t *sub_cp9map, int print_flag)
2622
2623 {
2624 int status;
2625 int k;
2626 int sub_starts;
2627 int orig_starts;
2628 int orig_nd1;
2629 int orig_nd2;
2630 int sub_nd1;
2631 int sub_nd2;
2632 int temp;
2633 int nd;
2634 int orig_special_matps;
2635 int sub_special_matl;
2636 int sub_both_matps;
2637 CMEmitMap_t *orig_emap; /* consensus emit map for the original, template CM */
2638 CMEmitMap_t *sub_emap; /* consensus emit map for the sub CM */
2639
2640 int orig_il1;
2641 int orig_il2;
2642 int orig_ir1;
2643 int orig_ir2;
2644
2645 char **nodetypes;
2646 ESL_ALLOC(nodetypes, sizeof(char *) * 8);
2647 nodetypes[0] = "BIF";
2648 nodetypes[1] = "MATP";
2649 nodetypes[2] = "MATL";
2650 nodetypes[3] = "MATR";
2651 nodetypes[4] = "BEGL";
2652 nodetypes[5] = "BEGR";
2653 nodetypes[6] = "ROOT";
2654 nodetypes[7] = "END";
2655
2656 if(print_flag) printf("in cm2sub_find_impossible_misc_cases()\n");
2657
2658 orig_emap = CreateEmitMap(orig_cm);
2659 sub_emap = CreateEmitMap(sub_cm);
2660
2661 /* We know that the sub CM node 0 and node submap->sub_clen are nearly always going to
2662 * get the transition distributions wrong (see comments above in the function
2663 * explanation). There are a couple of cases they should get right
2664 * (see ~nawrockie/notebook/6_0725_inf_sub_cm/00LOG for details), we
2665 * check for these here:
2666 */
2667 subinfo->imp_cc[0] = 1;
2668 subinfo->imp_cc[submap->sub_clen] = 1;
2669
2670 /* check if the orig_cm states that model sub_cm ROOT_IL and ROOT_IR are
2671 * from the same node. */
2672 orig_il1 = submap->s2o_smap[1][0]; /* 1st of up to 2 states that maps to sub_cm's ROOT_IL */
2673 orig_il2 = submap->s2o_smap[1][1]; /* 2nd state that maps to sub_cm's ROOT_IL or -1 if only 1 maps*/
2674 orig_ir1 = submap->s2o_smap[2][0]; /* 1st of up to 2 states that maps to sub_cm's ROOT_IR */
2675 orig_ir2 = submap->s2o_smap[2][1]; /* 2nd state that maps to sub_cm's ROOT_IR or -1 if only 1 maps*/
2676
2677 /* We ASSUME that ambiguities have been removed, i.e. if two insert states map to either ROOT_IL
2678 * or ROOT_IR, one of them has been detached. We exploit this knowledge.
2679 */
2680 if(orig_il2 != -1)
2681 {
2682 if(orig_cm->sttype[orig_il1+1] == E_st)
2683 orig_il1 = orig_il2; /* orig_il1 was detached */
2684 else if(orig_cm->sttype[orig_il2+1] == E_st)
2685 {
2686 /* do nothing */
2687 }
2688 else
2689 cm_Fail("ERROR, can't determine which state was detached\n");
2690 }
2691 if(orig_ir2 != -1)
2692 {
2693 if(orig_cm->sttype[orig_ir1+1] == E_st)
2694 orig_ir1 = orig_ir2; /* orig_ir1 was detached */
2695 else if(orig_cm->sttype[orig_ir2+1] == E_st)
2696 {
2697 /* do nothing */
2698 }
2699 else
2700 cm_Fail("ERROR, can't determine which state was detached\n");
2701 }
2702
2703 /* Now orig_il1 and orig_ir1 map to the ONLY insert states that map to sub_cm
2704 * ROOT_IL and ROOT_IR respectively. */
2705 if(orig_cm->ndidx[orig_il1] == orig_cm->ndidx[orig_ir1])
2706 {
2707 /* we can get the distro out of 0 right in this case */
2708 subinfo->imp_cc[0] = FALSE;
2709 if(submap->spos == submap->sstruct && submap->epos == submap->estruct)
2710 subinfo->imp_cc[submap->sub_clen] = FALSE;
2711 }
2712
2713 for(k = 1; k < submap->sub_clen; k++)
2714 {
2715 if(print_flag) printf("k: %d\n", k);
2716 if((k+submap->spos-1) == 0)
2717 orig_nd1 = 0;
2718 else
2719 orig_nd1 = orig_cp9map->pos2nd[k+submap->spos-1];
2720 orig_nd2 = orig_cp9map->pos2nd[k+submap->spos-1+1];
2721
2722 orig_special_matps = FALSE;
2723 if((orig_cm->ndtype[orig_nd1] == MATP_nd &&
2724 orig_cm->ndtype[orig_nd2] == MATP_nd) &&
2725 (orig_cp9map->nd2rpos[orig_nd1] == (k+submap->spos-1) &&
2726 orig_cp9map->nd2lpos[orig_nd2] == (k+submap->spos)))
2727 {
2728 if((orig_cp9map->nd2lpos[orig_nd1] < submap->spos) ||
2729 (orig_cp9map->nd2rpos[orig_nd2] > submap->epos))
2730 {
2731 /* This is a special case */
2732 orig_special_matps = TRUE;
2733 }
2734 }
2735
2736 if(orig_nd2 < orig_nd1)
2737 {
2738 temp = orig_nd1;
2739 orig_nd1 = orig_nd2;
2740 orig_nd2 = temp;
2741 }
2742 orig_starts = 0;
2743
2744 if(print_flag) printf("orig_nd1: %d | orig_nd2: %d\n", orig_nd1, orig_nd2);
2745 for(nd = orig_nd1; nd <= orig_nd2; nd++)
2746 {
2747 if(print_flag) printf("orig_cm->ndtype[%d]: %s L: %4d R: %4d (submap->spos: %4d) (submap->epos: %4d)\n", nd, nodetypes[(int) orig_cm->ndtype[nd]], orig_emap->lpos[nd], orig_emap->rpos[nd], submap->spos, submap->epos);
2748 if(orig_cm->ndtype[nd] == BEGL_nd ||
2749 orig_cm->ndtype[nd] == BEGR_nd)
2750 { orig_starts++; }
2751 }
2752 sub_nd1 = sub_cp9map->pos2nd[k];
2753 sub_nd2 = sub_cp9map->pos2nd[k+1];
2754
2755 sub_special_matl = FALSE;
2756 if(sub_cm->ndtype[sub_nd1] == MATL_nd)
2757 sub_special_matl = TRUE;
2758
2759 if(sub_nd2 < sub_nd1)
2760 {
2761 temp = sub_nd1;
2762 sub_nd1 = sub_nd2;
2763 sub_nd2 = temp;
2764 }
2765
2766 sub_both_matps = FALSE;
2767 if((sub_cm->ndtype[sub_nd1] == MATP_nd && sub_cm->ndtype[sub_nd2] == MATP_nd) &&
2768 (!(sub_cp9map->nd2lpos[sub_nd1] < sub_cp9map->nd2lpos[sub_nd2] && sub_cp9map->nd2rpos[sub_nd1] > sub_cp9map->nd2rpos[sub_nd2])))
2769 sub_both_matps = TRUE;
2770
2771 sub_starts = 0;
2772 if(print_flag) printf("sub_nd1: %d | sub_nd2: %d\n", sub_nd1, sub_nd2);
2773 for(nd = sub_nd1; nd <= sub_nd2; nd++)
2774 {
2775 if(print_flag) printf("sub_cm->ndtype[%d]: %s L: %4d R: %4d (submap->spos: %4d) (submap->epos: %4d)\n", nd, nodetypes[(int) sub_cm->ndtype[nd]], (sub_emap->lpos[nd]+submap->spos-1), (sub_emap->rpos[nd]+submap->spos-1), submap->spos, submap->epos);
2776 if(sub_cm->ndtype[nd] == BEGL_nd || sub_cm->ndtype[nd] == BEGR_nd)
2777 {
2778 sub_starts++;
2779 if(sub_cm->ndtype[sub_nd1] != MATP_nd && sub_cm->ndtype[sub_nd2] != MATP_nd)
2780 cm_Fail("ERROR in cm2sub_cm_find_impossible_misc_cases() found impossible case not involving any MATP in the sub_cm, k: %d submap->epos-submap->spos+1: %d\n", k, submap->sub_clen);
2781 if(orig_cm->ndtype[orig_nd1] != MATP_nd && orig_cm->ndtype[orig_nd2] != MATP_nd)
2782 cm_Fail("ERROR in cm2sub_cm_find_impossible_misc_cases() found impossible case not involving any MATP in the orig_cm\n, k: %d | submap->epos-submap->spos+1: %d", k, submap->sub_clen);
2783 }
2784 }
2785
2786 if(print_flag) printf("sub_starts: %d | orig_starts: %d\n", sub_starts, orig_starts);
2787
2788 if(sub_starts > 0 && orig_starts == 0)
2789 subinfo->imp_cc[k] = 3;
2790 else if((sub_starts > 0 && sub_starts > orig_starts) &&
2791 sub_both_matps == TRUE)
2792 subinfo->imp_cc[k] = 4;
2793 else if(sub_starts > 0 && orig_special_matps && sub_special_matl)
2794 subinfo->imp_cc[k] = 5;
2795 else if(sub_starts == 0 && orig_starts > 0 && orig_special_matps && sub_special_matl)
2796 subinfo->imp_cc[k] = 6;
2797
2798 }
2799 FreeEmitMap(orig_emap);
2800 FreeEmitMap(sub_emap);
2801 free(nodetypes);
2802 return;
2803
2804 ERROR:
2805 cm_Fail("Memory allocation error.");
2806 }
2807
2808 /**************************************************************************
2809 * EPN 10.18.06
2810 * cm2sub_cm_find_impossible_matr_cases
2811 *
2812 * For certain situations, the conversion of an orig_cm to a sub_cm loses
2813 * some information that makes it impossible for a CP9 trained from the sub_cm
2814 * to exactly match a CP9 trained from the orig_cm for the corresponding
2815 * columns. One case where it is impossible involves MATR nodes as follows:
2816 *
2817 * if for any k k=spos..epos-1:
2818 * orig_cp9map->pos2nd[k] + 1 = orig_cp9map->pos2nd[k+1] AND (NOT TRUE!)
2819 * sub_cp9map->pos2nd[k] + 1 != sub_cp9map->pos2nd[k+1] AND
2820 * orig_cp9map->pos2nd[k] == sub_cp9map->pos2nd[k] == MATL AND
2821 * orig_cp9map->pos2nd[k+1] == sub_cp9map->pos2nd[k+1] == MATP AND
2822 * all nodes between sub_cp9map->pos2nd[k] .. cc_node_map[k+1] are BIF, BEG*, or MATR nodes AND
2823 * the orig_cm nodes that the stretch of MATR nodes map to are all MATP nodes
2824 * for which the other half of the nodes map to positions outside the sub_cm.
2825 *
2826 * Here we explicitly check for these situations and set imp_cc[k] = TRUE for
2827 * any k that satisfy all the criteria.
2828 * Returns: void
2829 *
2830 * Args:
2831 * CM_t orig_cm
2832 * CM_t sub_cm
2833 * int *orig_cp9map->pos2nd
2834 * int *sub_cp9map->pos2nd
2835 * int *imp_cc
2836 * int spos;
2837 * int epos;
2838 */
2839
2840 static void
cm2sub_cm_find_impossible_matr_cases(CM_t * orig_cm,CM_t * sub_cm,CMSubMap_t * submap,CMSubInfo_t * subinfo,CP9Map_t * orig_cp9map,CP9Map_t * sub_cp9map,int print_flag)2841 cm2sub_cm_find_impossible_matr_cases(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, CMSubInfo_t *subinfo,
2842 CP9Map_t *orig_cp9map, CP9Map_t *sub_cp9map, int print_flag)
2843 {
2844 int status;
2845 int sub_k;
2846 int orig_k;
2847 int orig_nd;
2848 int sub_nd;
2849 int next_sub_nd;
2850 int next_orig_nd;
2851 int sub_nd1;
2852 int sub_nd2;
2853 int orig_nd1;
2854 int orig_nd2;
2855 int tmp_k;
2856 int tmp_sub_nd;
2857 int tmp_orig_nd;
2858 int correct_node_types_flag;
2859 int orig_matr_stretch_flag;
2860 int sub_matr_stretch_flag;
2861
2862 char **nodetypes;
2863 ESL_ALLOC(nodetypes, sizeof(char *) * 8);
2864 nodetypes[0] = "BIF";
2865 nodetypes[1] = "MATP";
2866 nodetypes[2] = "MATL";
2867 nodetypes[3] = "MATR";
2868 nodetypes[4] = "BEGL";
2869 nodetypes[5] = "BEGR";
2870 nodetypes[6] = "ROOT";
2871 nodetypes[7] = "END";
2872
2873 for(sub_k = 1; sub_k < submap->sub_clen; sub_k++)
2874 {
2875 orig_k = sub_k + submap->spos - 1;
2876 if(print_flag) printf("CASE 2 k: %d\n", sub_k);
2877 /* Strategy: we check each of our criteria independently */
2878
2879 /* Find out what the node that models the next column is */
2880 sub_nd = sub_cp9map->pos2nd[sub_k];
2881 next_sub_nd = sub_cp9map->pos2nd[sub_k+1];
2882 orig_nd = orig_cp9map->pos2nd[orig_k];
2883 next_orig_nd = orig_cp9map->pos2nd[orig_k+1];
2884 if(sub_nd < next_sub_nd)
2885 {
2886 sub_nd1 = sub_nd;
2887 sub_nd2 = next_sub_nd;
2888 }
2889 else
2890 {
2891 sub_nd1 = next_sub_nd;
2892 sub_nd2 = sub_nd;
2893 }
2894 if(orig_nd < next_orig_nd)
2895 {
2896 orig_nd1 = orig_nd;
2897 orig_nd2 = next_orig_nd;
2898 }
2899 else
2900 {
2901 orig_nd1 = next_orig_nd;
2902 orig_nd2 = orig_nd;
2903 }
2904
2905 if(print_flag) printf("CASE 2 k: %d sub_nd: %d next_sub_nd: %d\n", sub_k, sub_nd, next_sub_nd);
2906 if(print_flag) printf("CASE 2 k: %d orig_nd: %d next_orig_nd: %d\n", sub_k, orig_nd, next_orig_nd);
2907
2908 /* Make sure that next_sub_nd is and sub_nd are not consecutive */
2909 if(sub_nd1 != sub_nd2-1)
2910 {
2911 /* Check if min(sub_nd, next_sub_nd) and min(orig_nd, next_orig_nd) are both MATLs and
2912 if max(sub_nd, next_sub_nd) and max(orig_nd, next_orig_nd) are both MATPs */
2913 if((orig_cm->ndtype[orig_nd1] == MATL_nd && sub_cm->ndtype[sub_nd1] == MATL_nd) &&
2914 (orig_cm->ndtype[orig_nd2] == MATP_nd && sub_cm->ndtype[sub_nd2] == MATP_nd))
2915 correct_node_types_flag = TRUE;
2916 else
2917 correct_node_types_flag = FALSE;
2918 if(print_flag) printf("CASE 2 k: %d correct_node_types_flag: %d\n", sub_k, correct_node_types_flag);
2919
2920 if(correct_node_types_flag == TRUE)
2921 {
2922 /* Determine if the next_orig_nd is the next left emitting
2923 * node of the orig_cm after orig_nd */
2924 orig_matr_stretch_flag = TRUE;
2925 for(tmp_orig_nd = orig_nd1+1; tmp_orig_nd < orig_nd2; tmp_orig_nd++)
2926 {
2927 if(orig_cm->ndtype[tmp_orig_nd] != MATR_nd)
2928 {
2929 orig_matr_stretch_flag = FALSE;
2930 break;
2931 }
2932 }
2933 if(print_flag) printf("CASE 2 k: %d orig_matr_stretch_flag: %d\n", sub_k, orig_matr_stretch_flag);
2934 if(orig_matr_stretch_flag == TRUE)
2935 {
2936 /* Check if all the sub_cm nodes between sub_nd and
2937 * next_sub_nd are MATRs. */
2938 sub_matr_stretch_flag = TRUE;
2939 for(tmp_sub_nd = sub_nd1+1; tmp_sub_nd < sub_nd2; tmp_sub_nd++)
2940 if(sub_cm->ndtype[tmp_sub_nd] != MATR_nd)
2941 {
2942 sub_matr_stretch_flag = FALSE;
2943 break;
2944 }
2945 if(print_flag) printf("CASE 2 k: %d sub_matr_stretch_flag: %d\n", sub_k, sub_matr_stretch_flag);
2946 if(sub_matr_stretch_flag == TRUE)
2947 {
2948 /* This should be a MATR impossible case,
2949 * Check all the criteria we *think* are always true in this situation,
2950 * cm_Fail if what we think is wrong.
2951 *
2952 * The orig_cm nodes that map to the same consensus columns
2953 * as the MATR nodes in the sub_cm must either be:
2954
2955 * as the MATR nodes in the sub_cm must either be:
2956 *
2957 * 1 orig_cm MATPs or MATRs for which the RIGHT half maps to the same column
2958 * modelled by the corresponding sub_cm MATR, and in the case of the
2959 * MATPs the LEFT half maps to consensus columns before spos.
2960 * 2 orig_cm nodes that map to the same consensus columns
2961 * as the MATR nodes in the sub_cm are ALL either orig_cm MATPs
2962 * or MATLs for which the LEFT half maps to the same
2963 * column modelled by the corresponding sub_cm MATR, and in the
2964 * case of the MATPs, the RIGHT half maps to consensus columns
2965 * after epos.
2966 *
2967 * Usually the entire set of orig_cm nodes that map to the sub_cm
2968 * MATRs are either one or the other type, but I've seen rare cases
2969 * where there's a stretch of one type, and then a stretch of the
2970 * other.
2971 */
2972 tmp_sub_nd = sub_cp9map->pos2nd[sub_k] + 1;
2973 tmp_k = sub_cp9map->nd2rpos[tmp_sub_nd] + submap->spos - 1;
2974 tmp_orig_nd = orig_cp9map->pos2nd[tmp_k];
2975
2976 for(tmp_sub_nd = sub_nd1+1; tmp_sub_nd < sub_nd2; tmp_sub_nd++)
2977 {
2978 tmp_k = sub_cp9map->nd2rpos[tmp_sub_nd] + submap->spos - 1;
2979 tmp_orig_nd = orig_cp9map->pos2nd[tmp_k];
2980
2981 if(print_flag) printf("10.18.06: %s %3d %3d | %s %3d %3d (xcc: %3d)\n", nodetypes[(int) sub_cm->ndtype[tmp_sub_nd]], tmp_sub_nd, (sub_cp9map->nd2rpos[tmp_sub_nd]+submap->spos-1), nodetypes[(int) orig_cm->ndtype[tmp_orig_nd]], orig_cp9map->nd2lpos[tmp_orig_nd], orig_cp9map->nd2rpos[tmp_orig_nd], tmp_k);
2982 if(orig_cp9map->nd2rpos[tmp_orig_nd] == (sub_cp9map->nd2rpos[tmp_sub_nd]+submap->spos-1)) /* Case 1 above */
2983 {
2984 if(orig_cm->ndtype[tmp_orig_nd] != MATP_nd && orig_cm->ndtype[tmp_orig_nd] != MATR_nd)
2985 cm_Fail("ERROR 2 in cm2sub_cm_find_impossible_matr_cases() found impossible MATR case that can't be classified as case 1 or case 2, k: %d | submap->spos: %d submap->epos: %d", sub_k, submap->spos, submap->epos);
2986 if(orig_cm->ndtype[tmp_orig_nd] == MATP_nd && orig_cp9map->nd2lpos[tmp_orig_nd] >= submap->spos)
2987 cm_Fail("ERROR 3 in cm2sub_cm_find_impossible_matr_cases() found impossible MATR case that can't be classified as case 1 or case 2, k: %d | submap->spos: %d submap->epos: %d", sub_k, submap->spos, submap->epos);
2988 }
2989 else if(orig_cp9map->nd2lpos[tmp_orig_nd] == (sub_cp9map->nd2rpos[tmp_sub_nd]+submap->spos-1)) /* Case 2 above */
2990 {
2991 if(orig_cm->ndtype[tmp_orig_nd] != MATP_nd && orig_cm->ndtype[tmp_orig_nd] != MATL_nd)
2992 cm_Fail("ERROR 4 in cm2sub_cm_find_impossible_matr_cases() found impossible MATR case that can't be classified as case 1 or case 2, k: %d | submap->spos: %d submap->epos: %d", sub_k, submap->spos, submap->epos);
2993 if(orig_cm->ndtype[tmp_orig_nd] == MATP_nd && orig_cp9map->nd2rpos[tmp_orig_nd] <= submap->epos)
2994 cm_Fail("ERROR 5 in cm2sub_cm_find_impossible_matr_cases() found impossible MATR case that can't be classified as case 1 or case 2, k: %d | submap->spos: %d submap->epos: %d | orig_cp9map->nd2rpos[%d]: %d", sub_k, submap->spos, submap->epos, tmp_orig_nd, orig_cp9map->nd2rpos[tmp_orig_nd]);
2995 }
2996 }
2997 /* if we get here, we've satisfied all of our criteria */
2998 subinfo->imp_cc[sub_k] = 2;
2999 }
3000 }
3001 }
3002 }
3003 }
3004 free(nodetypes);
3005 return;
3006
3007 ERROR:
3008 cm_Fail("Memory allocation error.");
3009 }
3010
3011
3012
3013 /**************************************************************************
3014 * EPN 10.16.06
3015 * Function: check_sub_cm()
3016 *
3017 * Purpose: Given a CM and a sub CM that is supposed to mirror
3018 * the CM as closely as possible between two given consensus
3019 * columns (spos and epos), check that the sub_cm was correctly
3020 * constructed.
3021 *
3022 * 1. Build a CP9 HMM (cp9_1) from the sub_cm.
3023 * 2. Build a CP9 HMM (cp9_2) from the full cm.
3024 * 3. Reconfig cp9_2 so start node is spos and end node is epos.
3025 * 4. Check corresponding parameters of cp9_1 and cp9_2 to make
3026 * sure they're within pthresh, allowing nodes we predict
3027 * to be wrong, as stored in subinfo->imp_cc[] to be wrong,
3028 * but keeping statistics on these cases.
3029 *
3030 * Args:
3031 * orig_cm - the original, template CM
3032 * sub_cm - the sub CM built from the orig_cm
3033 * errbuf - for error messages
3034 * submap - map from orig_cm to sub_cm and vice versa
3035 * subinfo - sub cm information
3036 * pthresh - the allowed difference in probability between HMMs
3037 * print_flag - TRUE to print useful debugging info
3038 *
3039 * Returns: eslOK if CM and sub CM are "close enough" (see code)
3040 * eslFAIL otherwise, errbuf is filled
3041 */
3042 int
check_sub_cm(CM_t * orig_cm,CM_t * sub_cm,char * errbuf,CMSubMap_t * submap,CMSubInfo_t * subinfo,float pthresh,int print_flag)3043 check_sub_cm(CM_t *orig_cm, CM_t *sub_cm, char *errbuf, CMSubMap_t *submap, CMSubInfo_t *subinfo, float pthresh, int print_flag)
3044 {
3045 int status;
3046 CP9_t *sub_hmm; /* constructed CP9 HMM from the sub_cm */
3047 CP9_t *orig_hmm; /* constructed CP9 HMM from the original cm
3048 this will be reconfiged to match the sub_hmm */
3049 CP9Map_t *orig_cp9map; /* maps the orig_cm to the orig_hmm and vice versa */
3050 CP9Map_t *sub_cp9map; /* maps the sub_cm to the sub_hmm and vice versa */
3051
3052 int k;
3053 double **orig_phi;
3054 int *violation;
3055 int v_ct;
3056 int apredict_total_ct;
3057 int awrong_total_ct;
3058 int i;
3059 float diff;
3060 int nd;
3061
3062 v_ct = 0;
3063 apredict_total_ct = 0;
3064 awrong_total_ct = 0;
3065
3066 /* Build two CP9 HMMs, one for the orig_cm and one for the sub_cm */
3067 if((status = build_cp9_hmm(orig_cm, errbuf, FALSE, 0.0001, print_flag, &orig_hmm, &orig_cp9map)) != eslOK) return status;
3068 if((status = build_cp9_hmm(sub_cm, errbuf, FALSE, 0.0001, print_flag, &sub_hmm, &sub_cp9map)) != eslOK) return status;
3069 CP9Logoddsify(orig_hmm);
3070 CP9Logoddsify(sub_hmm);
3071
3072 /* Look for 'impossible' cases where we know the sub_cm
3073 * construction procedure fails, in that the distribution of transitions out of CP9 nodes
3074 * built from the sub_cm will be the same distros out of corresponding CP9 nodes built from
3075 * the full CM. */
3076 cm2sub_cm_find_impossible_misc_cases(orig_cm, sub_cm, submap, subinfo, orig_cp9map, sub_cp9map, print_flag);
3077 cm2sub_cm_find_impossible_matr_cases(orig_cm, sub_cm, submap, subinfo, orig_cp9map, sub_cp9map, print_flag);
3078
3079 /* Reconfig the orig_hmm so that it can only start in the spos node, and end from the epos node */
3080 /* Build the sub CP9 HMM by copying as much of the original cp9_hmm as possible */
3081 fill_phi_cp9(orig_hmm, &orig_phi, 1, FALSE);
3082 CP9_reconfig2sub(orig_hmm, submap->spos, submap->epos, submap->spos, submap->epos, orig_phi);
3083
3084 if(print_flag)
3085 {
3086 printf("PRINTING BUILT SUB HMM PARAMS:\n");
3087 debug_print_cp9_params(stdout, sub_hmm, TRUE);
3088 printf("DONE PRINTING BUILT SUB HMM PARAMS:\n");
3089
3090 printf("PRINTING BUILT & RECONFIGED ORIG HMM PARAMS:\n");
3091 debug_print_cp9_params(stdout, orig_hmm, TRUE);
3092 printf("DONE PRINTING BUILT & RECONFIGED ORIG HMM PARAMS:\n");
3093 }
3094
3095 /* Check the parameters of the two CP9 HMMs */
3096 if(print_flag)
3097 {
3098 printf("COMPARING CP9 HMM parameters in check_sub_cm()\n");
3099 printf("orig | sub\n");
3100 }
3101 ESL_ALLOC(violation, sizeof(int) * (submap->sub_clen+1));
3102 for(k = 0; k <= sub_hmm->M; k++)
3103 {
3104 violation[k] = FALSE;
3105 if(print_flag) printf("Node: %d\n", k);
3106 if(k > 0)
3107 {
3108 for(i = 0; i < orig_cm->abc->K; i++)
3109 {
3110 diff = orig_hmm->mat[(submap->spos+k-1)][i] - sub_hmm->mat[k][i];
3111 if(print_flag) printf("mat[%d][%d] = %8.5f | %8.5f | (%8.5f)\n", 0, i, orig_hmm->mat[(submap->spos+k-1)][i], sub_hmm->mat[k][i], diff);
3112 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3113 {
3114 ESL_FAIL(eslFAIL, errbuf, "check_sub_cm(): emission probability incorrect");
3115 }
3116 }
3117 }
3118 for(i = 0; i < orig_cm->abc->K; i++)
3119 {
3120 diff = orig_hmm->ins[(submap->spos+k-1)][i] - sub_hmm->ins[k][i];
3121 if(print_flag) printf("ins[%d][%d] = %8.5f | %8.5f | (%8.5f)\n", 0, i, orig_hmm->ins[(submap->spos+k-1)][i], sub_hmm->ins[k][i], diff);
3122 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3123 {
3124 ESL_FAIL(eslFAIL, errbuf, "check_sub_cm(): emission probability incorrect");
3125 }
3126 }
3127
3128 /* Transitions */
3129 if(print_flag) printf("\n");
3130 diff = orig_hmm->t[(submap->spos+k-1)][CTMM] - sub_hmm->t[k][CTMM];
3131 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3132 {
3133 violation[k] = TRUE;
3134 if(print_flag) printf("\tCTMM[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTMM], sub_hmm->t[k][CTMM], diff);
3135 }
3136 else
3137 if(print_flag) printf("\tCTMM[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTMM], sub_hmm->t[k][CTMM], diff);
3138
3139 diff = orig_hmm->t[(submap->spos+k-1)][CTMI] - sub_hmm->t[k][CTMI];
3140 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3141 {
3142 violation[k] = TRUE;
3143 if(print_flag) printf("\tCTMI[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTMI], sub_hmm->t[k][CTMI], diff);
3144 }
3145 else
3146 if(print_flag) printf("\tCTMI[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTMI], sub_hmm->t[k][CTMI], diff);
3147
3148 diff = orig_hmm->t[(submap->spos+k-1)][CTMD] - sub_hmm->t[k][CTMD];
3149 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3150 {
3151 violation[k] = TRUE;
3152 if(print_flag) printf("\tCTMD[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTMD], sub_hmm->t[k][CTMD], diff);
3153 }
3154 else
3155 if(print_flag) printf("\tCTMD[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTMD], sub_hmm->t[k][CTMD], diff);
3156
3157 diff = orig_hmm->t[(submap->spos+k-1)][CTIM] - sub_hmm->t[k][CTIM];
3158 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3159 {
3160 violation[k] = TRUE;
3161 if(print_flag) printf("\tCTIM[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTIM], sub_hmm->t[k][CTIM], diff);
3162 }
3163 else
3164 if(print_flag) printf("\tCTIM[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTIM], sub_hmm->t[k][CTIM], diff);
3165
3166 diff = orig_hmm->t[(submap->spos+k-1)][CTII] - sub_hmm->t[k][CTII];
3167 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3168 {
3169 violation[k] = TRUE;
3170 if(print_flag) printf("\tCTII[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTII], sub_hmm->t[k][CTII], diff);
3171 }
3172 else
3173 if(print_flag) printf("\tCTII[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTII], sub_hmm->t[k][CTII], diff);
3174
3175 diff = orig_hmm->t[(submap->spos+k-1)][CTID] - sub_hmm->t[k][CTID];
3176 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3177 {
3178 violation[k] = TRUE;
3179 if(print_flag) printf("\tCTID[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTID], sub_hmm->t[k][CTID], diff);
3180 }
3181 else
3182 if(print_flag) printf("\tCTID[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTID], sub_hmm->t[k][CTID], diff);
3183
3184 diff = orig_hmm->t[(submap->spos+k-1)][CTDM] - sub_hmm->t[k][CTDM];
3185 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3186 {
3187 violation[k] = TRUE;
3188 if(print_flag) printf("\tCTDM[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTDM], sub_hmm->t[k][CTDM], diff);
3189 }
3190 else
3191 if(print_flag) printf("\tCTDM[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTDM], sub_hmm->t[k][CTDM], diff);
3192
3193 diff = orig_hmm->t[(submap->spos+k-1)][CTDI] - sub_hmm->t[k][CTDI];
3194 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3195 {
3196 violation[k] = TRUE;
3197 if(print_flag) printf("\tCTDI[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTDI], sub_hmm->t[k][CTDI], diff);
3198 }
3199 else
3200 if(print_flag) printf("\tCTDI[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTDI], sub_hmm->t[k][CTDI], diff);
3201
3202 diff = orig_hmm->t[(submap->spos+k-1)][CTDD] - sub_hmm->t[k][CTDD];
3203 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3204 {
3205 violation[k] = TRUE;
3206 if(print_flag) printf("\tCTDD[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTDD], sub_hmm->t[k][CTDD], diff);
3207 }
3208 else
3209 if(print_flag) printf("\tCTDD[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTDD], sub_hmm->t[k][CTDD], diff);
3210
3211 if(k > 0)
3212 {
3213 diff = orig_hmm->begin[(submap->spos+k-1)] - sub_hmm->begin[k];
3214 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3215 {
3216 violation[0] = TRUE; /* begin actually has to do with the transition distro out of node 0 */
3217 if(print_flag) printf("\t beg[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->begin[(submap->spos+k-1)], sub_hmm->begin[k], diff);
3218 }
3219 else
3220 if(print_flag) printf("\t beg[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->begin[(submap->spos+k-1)], sub_hmm->begin[k], diff);
3221
3222 diff = orig_hmm->end[(submap->spos+k-1)] - sub_hmm->end[k];
3223 if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3224 {
3225 violation[k] = TRUE;
3226 if(print_flag) printf("\t end[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->end[(submap->spos+k-1)], sub_hmm->end[k], diff);
3227 }
3228 else
3229 if(print_flag) printf("\t end[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->end[(submap->spos+k-1)], sub_hmm->end[k], diff);
3230 }
3231 }
3232
3233 /* Add-up the violations */
3234 for(nd = 0; nd <= (submap->epos-submap->spos+1); nd++)
3235 {
3236 if(violation[nd] && subinfo->imp_cc[nd] == 0)
3237 {
3238 v_ct++;
3239 printf("VIOLATION[%3d]: TRUE | submap->spos: %3d | submap->epos: %3d | subinfo->imp_cc: %d\n", nd, submap->spos, submap->epos, subinfo->imp_cc[nd]);
3240 }
3241 else if(violation[nd] && subinfo->imp_cc[nd] != 0)
3242 {
3243 subinfo->apredict_ct[subinfo->imp_cc[nd]]++;
3244 apredict_total_ct++;
3245 if(print_flag)
3246 printf("PREDICTED VIOLATION[%3d]: TRUE | submap->spos: %3d | submap->epos: %3d | subinfo->imp_cc: %d\n", nd, submap->spos, submap->epos, subinfo->imp_cc[nd]);
3247 }
3248 else if(!violation[nd] && subinfo->imp_cc[nd] != 0)
3249 {
3250 subinfo->apredict_ct[subinfo->imp_cc[nd]]++;
3251 apredict_total_ct++;
3252 subinfo->awrong_ct[subinfo->imp_cc[nd]]++;
3253 awrong_total_ct++;
3254 if(print_flag) printf("NON-VIOLATION[%3d] %3d : submap->spos: %3d | submap->epos: %3d | subinfo->imp_cc: %d\n", nd, awrong_total_ct, submap->spos, submap->epos, subinfo->imp_cc[nd]);
3255 }
3256 }
3257
3258 /* Clean up and return */
3259 for(k = 0; k <= orig_hmm->M; k++)
3260 free(orig_phi[k]);
3261 free(orig_phi);
3262
3263 free(violation);
3264 FreeCPlan9(orig_hmm);
3265 FreeCPlan9(sub_hmm);
3266 FreeCP9Map(orig_cp9map);
3267 FreeCP9Map(sub_cp9map);
3268
3269 if(v_ct > 0) ESL_FAIL(eslFAIL, errbuf, "check_sub_cm(): check failed");
3270 return eslOK;
3271
3272 ERROR:
3273 ESL_FAIL(status, errbuf, "memory allocation error");
3274 return FALSE; /* never reached */
3275 }
3276
3277 /**************************************************************************
3278 * EPN 10.23.06
3279 * Function: sub_cm2cm_parsetree()
3280 * Returns: void
3281 *
3282 * Purpose: Convert a parstree to a sub_cm to a parsetree for an original CM.
3283 * We assume we're NOT in local mode. For any node n in the original
3284 * CM for which 0 states in the sub_cm map to n, we declare that
3285 * the delete state of that node was used in the converted original
3286 * CM parse (or the S, B or E state if n is not MATP, MATL or MATR).
3287 *
3288 * Args:
3289 * CM_t *orig_cm - the original, template CM
3290 * CM_t *sub_cm - the sub CM built from the orig_cm
3291 * Parsetree_t **ret_orig_tr - orig_cm parstree allocated, filled and returned here
3292 * Parsetree_t *sub_tr - the sub_cm parstree already filled
3293 * CMSubMap_t *submap - map from the sub_cm to orig_cm and vice versa
3294 * int print_flag - TRUE to print useful debugging info
3295 *
3296 * Returns: eslOK on success
3297 * eslEMEM if we run out of memory
3298 */
3299
3300 int
sub_cm2cm_parsetree(CM_t * orig_cm,CM_t * sub_cm,Parsetree_t ** ret_orig_tr,Parsetree_t * sub_tr,CMSubMap_t * submap,int print_flag)3301 sub_cm2cm_parsetree(CM_t *orig_cm, CM_t *sub_cm, Parsetree_t **ret_orig_tr, Parsetree_t *sub_tr,
3302 CMSubMap_t *submap, int print_flag)
3303 {
3304 int status;
3305 Parsetree_t *orig_tr; /* the parsetree we're creating for the original CM */
3306 int *ss_used; /* [0..orig_cm->nodes-1], split state idx used in converted parsetree for each orig_cm nd */
3307 int *ss_emitl; /* [0..orig_cm->nodes-1], tr->emitl[n] for each orig_cm node n */
3308 int *ss_emitr; /* [0..orig_cm->nodes-1], tr->emitr[n] for each orig_cm node n */
3309 int *il_ct; /* [0..orig_cm->nodes-1], number of times IL state of orig_cm node n was visited */
3310 int *ir_ct; /* [0..orig_cm->nodes-1], number of times IR state of orig_cm node n was visited */
3311 int *il_used; /* [0..orig_cm->nodes-1], idx of IL state in node n */
3312 int *ir_used; /* [0..orig_cm->nodes-1], idx of IR state in node n */
3313 int *tr_nd_for_bifs; /* [0..orig_cm->nodes-1], if n is a BIF node, tr node of this BIF node, else -1 */
3314 int x;
3315 int nd;
3316 int sub_v;
3317 int orig_v1;
3318 int orig_v2;
3319 int orig_nd1;
3320 int orig_nd2;
3321 int cm_nd;
3322 int i;
3323 int parent_tr_nd;
3324 ESL_STACK *pda;
3325 int pos;
3326 int ss;
3327 int on_right;
3328 int emitl_flag;
3329 int emitr_flag;
3330
3331 char **nodetypes;
3332 ESL_ALLOC(nodetypes, sizeof(char *) * 8);
3333 nodetypes[0] = "BIF";
3334 nodetypes[1] = "MATP";
3335 nodetypes[2] = "MATL";
3336 nodetypes[3] = "MATR";
3337 nodetypes[4] = "BEGL";
3338 nodetypes[5] = "BEGR";
3339 nodetypes[6] = "ROOT";
3340 nodetypes[7] = "END";
3341
3342 char **sttypes;
3343 ESL_ALLOC(sttypes, sizeof(char *) * 10);
3344 sttypes[0] = "D";
3345 sttypes[1] = "MP";
3346 sttypes[2] = "ML";
3347 sttypes[3] = "MR";
3348 sttypes[4] = "IL";
3349 sttypes[5] = "IR";
3350 sttypes[6] = "S";
3351 sttypes[7] = "E";
3352 sttypes[8] = "B";
3353 sttypes[9] = "EL";
3354
3355 if(print_flag) printf("orig_cm nodes: %d\n", orig_cm->nodes);
3356
3357 ESL_ALLOC(ss_used, sizeof(int) * orig_cm->nodes + 1);
3358 ESL_ALLOC(ss_emitl, sizeof(int) * orig_cm->nodes + 1);
3359 ESL_ALLOC(ss_emitr, sizeof(int) * orig_cm->nodes + 1);
3360 ESL_ALLOC(il_used, sizeof(int) * orig_cm->nodes + 1);
3361 ESL_ALLOC(ir_used, sizeof(int) * orig_cm->nodes + 1);
3362 ESL_ALLOC(il_ct, sizeof(int) * orig_cm->nodes + 1);
3363 ESL_ALLOC(ir_ct, sizeof(int) * orig_cm->nodes + 1);
3364 ESL_ALLOC(tr_nd_for_bifs,sizeof(int) * orig_cm->nodes + 1);
3365 /* i*_emitl[nd] is the last residue emitted by the i* state
3366 * of node nd, the first is (il_emitl[nd] - il_ct[nd] + 1)
3367 * or (ir_emitr[nd] + ir_ct[nd] - 1)
3368 */
3369
3370 for(nd = 0; nd < orig_cm->nodes; nd++)
3371 {
3372 ss_used[nd] = -1;
3373 ss_emitl[nd] = -1;
3374 ss_emitr[nd] = -1;
3375 il_ct[nd] = 0; /* the number of times the IL state was used in the sub_cm parse */
3376 ir_ct[nd] = 0; /* the number of times the IR state was used in the sub_cm parse */
3377 il_used[nd] = -1;
3378 ir_used[nd] = -1;
3379 tr_nd_for_bifs[nd] = -1; /* this will remain -1 except for bif nodes */
3380 }
3381
3382 for(x = 0; x < sub_tr->n; x++)
3383 {
3384 sub_v = sub_tr->state[x];
3385 if(print_flag) printf("x: %d sub_v: %d\n", x, sub_v);
3386 orig_v1 = submap->s2o_smap[sub_v][0];
3387 orig_v2 = submap->s2o_smap[sub_v][1];
3388 if(print_flag) printf("orig_v1: %d | orig_v2: %d\n", orig_v1, orig_v2);
3389 if(orig_v1 == -1)
3390 {
3391 if(sub_cm->sttype[sub_v] != S_st &&
3392 sub_cm->sttype[sub_v] != E_st &&
3393 sub_cm->sttype[sub_v] != B_st &&
3394 sub_cm->sttype[sub_v] != EL_st)
3395 cm_Fail("ERROR 0 in sub_cm2cm_parstree()\n");
3396 continue;
3397 }
3398 orig_nd1 = orig_cm->ndidx[orig_v1];
3399 if(orig_v2 != -1)
3400 orig_nd2 = orig_cm->ndidx[orig_v2];
3401 else
3402 orig_nd2 = -1;
3403
3404 /* No sub_cm insert states can map to 2 orig_cm inserts */
3405 if(orig_cm->sttype[orig_v1] == IL_st)
3406 {
3407 il_used[orig_nd1] = orig_v1;
3408 il_ct[orig_nd1]++;
3409 if(orig_v2 != -1)
3410 cm_Fail("ERROR 1 in sub_cm2cm_parstree()\n");
3411 }
3412 else if(orig_cm->sttype[orig_v1] == IR_st)
3413 {
3414 ir_used[orig_nd1] = orig_v1;
3415 ir_ct[orig_nd1]++;
3416 if(orig_v2 != -1)
3417 cm_Fail("ERROR 2 in sub_cm2cm_parstree()\n");
3418 }
3419 else if(sub_cm->ndtype[sub_cm->ndidx[sub_v]] == MATP_nd)
3420 {
3421 ss_used[orig_nd1] = orig_v1;
3422 if(orig_v2 != -1)
3423 cm_Fail("ERROR 3 in sub_cm2cm_parsetree()\n");
3424 }
3425 else if(orig_cm->ndtype[orig_nd1] == MATP_nd)
3426 {
3427 if(orig_v2 == -1)
3428 cm_Fail("ERROR 4 in sub_cm2cm_parsetree()\n");
3429 /* We have to figure out which MATP split state sub_v corresponds to. */
3430 if(sub_cm->ndtype[sub_cm->ndidx[sub_v]] != MATL_nd &&
3431 sub_cm->ndtype[sub_cm->ndidx[sub_v]] != MATR_nd)
3432 cm_Fail("ERROR 5 in sub_cm2cm_parsetree()\n");
3433 if(orig_cm->ndtype[orig_nd2] != MATP_nd)
3434 cm_Fail("ERROR 6 in sub_cm2cm_parsetree()\n");
3435
3436 if(sub_cm->sttype[sub_v] == D_st)
3437 {
3438 if(ss_used[orig_nd1] == -1 ||
3439 orig_cm->sttype[ss_used[orig_nd1]] == D_st)
3440 ss_used[orig_nd1] = orig_cm->nodemap[orig_nd1] + 3; /* MATP_D */
3441
3442 /* Else we do nothing, orig_cm->sttype[ss_used[orig_nd1]] is already
3443 * either a ML_st or an MR_st */
3444 }
3445 else /* sub_cm->sttype[sub_v] != D_st */
3446 {
3447 if(ss_used[orig_nd1] == -1 ||
3448 orig_cm->sttype[ss_used[orig_nd1]] == D_st)
3449 {
3450 /* Figure out if sub_v maps to the left or right half of the MATP node */
3451 if(orig_cm->sttype[orig_v1] == ML_st ||
3452 orig_cm->sttype[orig_v2] == ML_st)
3453 ss_used[orig_nd1] = orig_cm->nodemap[orig_nd1] + 1; /* MATP_ML */
3454 else if(orig_cm->sttype[orig_v1] == MR_st ||
3455 orig_cm->sttype[orig_v2] == MR_st)
3456 ss_used[orig_nd1] = orig_cm->nodemap[orig_nd1] + 2; /* MATP_MR */
3457 else
3458 cm_Fail("ERROR 7 in sub_cm2cm_parsetree()\n");
3459 }
3460 /* below is the only line we really need:
3461 else
3462 ss_used[orig_nd1] = orig_cm->nodemap[orig_nd1]; */
3463
3464 else if(orig_cm->sttype[ss_used[orig_nd1]] == ML_st) /* just for safety; should erase eventually */
3465 {
3466 if(orig_cm->sttype[orig_v1] == MR_st ||
3467 orig_cm->sttype[orig_v2] == MR_st) /* just for safety; should erase eventually */
3468 {
3469 ss_used[orig_nd1] = orig_cm->nodemap[orig_nd1]; /* MATP_MP */
3470 }
3471 else
3472 cm_Fail("ERROR 8 in sub_cm2cm_parsetree()\n");
3473 }
3474 else if(orig_cm->sttype[ss_used[orig_nd1]] == MR_st) /* just for safety; should erase eventually */
3475 {
3476 if(orig_cm->sttype[orig_v1] == ML_st ||
3477 orig_cm->sttype[orig_v2] == ML_st) /* just for safety; should erase eventually */
3478 {
3479 ss_used[orig_nd1] = orig_cm->nodemap[orig_nd1]; /* MATP_MP */
3480 }
3481 else
3482 cm_Fail("ERROR 9 in sub_cm2cm_parsetree()\n");
3483 }
3484 }
3485 }
3486 else
3487 {
3488 if(orig_v2 != -1)
3489 cm_Fail("ERROR 5 in sub_cm2cm_parsetree()\n");
3490 ss_used[orig_nd1] = orig_v1;
3491 }
3492 }
3493
3494 /* Some MATL, MATR, and MATP nodes in the orig_cm might have 0 states that map to any state
3495 * used in the sub_cm parse. If we're not allowing local begins and ends, our strategy is
3496 * to claim that in the orig_cm parse the D state of these nodes was used.
3497 */
3498 for(nd = 0; nd < orig_cm->nodes; nd++)
3499 {
3500 if(ss_used[nd] == -1)
3501 {
3502 if(orig_cm->ndtype[nd] == MATP_nd)
3503 ss_used[nd] = orig_cm->nodemap[nd] + 3; /* MATP_D */
3504 if(orig_cm->ndtype[nd] == MATL_nd ||
3505 orig_cm->ndtype[nd] == MATR_nd)
3506 ss_used[nd] = orig_cm->nodemap[nd] + 1; /* MAT{L,R}_D */
3507 if(orig_cm->ndtype[nd] == BIF_nd ||
3508 orig_cm->ndtype[nd] == BEGL_nd ||
3509 orig_cm->ndtype[nd] == BEGR_nd ||
3510 orig_cm->ndtype[nd] == END_nd)
3511 ss_used[nd] = orig_cm->nodemap[nd]; /* BIF_B, END_E or BEG{L,R}_S */
3512 }
3513 }
3514
3515 /* Determine emitl and emitr for each state in the orig_cm parse */
3516 /* This code is noticeably cleaner than the rest - it's Sean's, from
3517 * CreateEmitMap() adapted for our purposes here.
3518 */
3519
3520 pos = 1;
3521 ss = 0;
3522 if((pda = esl_stack_ICreate()) == NULL) goto ERROR;
3523 if((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR; /* 0 = left side. 1 would = right side. */
3524 if((status = esl_stack_IPush(pda, ss)) != eslOK) goto ERROR;
3525 while (esl_stack_IPop(pda, &ss) != eslEOD)
3526 {
3527 esl_stack_IPop(pda, &on_right);
3528
3529 if (on_right)
3530 {
3531 pos += ir_ct[orig_cm->ndidx[ss]]; /* account for right inserts */
3532 if (orig_cm->sttype[ss] == MP_st || orig_cm->sttype[ss] == MR_st)
3533 pos++;
3534 ss_emitr[orig_cm->ndidx[ss]] = pos - 1;
3535 }
3536 else
3537 {
3538 ss_emitl[orig_cm->ndidx[ss]] = pos;
3539 if (orig_cm->sttype[ss] == MP_st || orig_cm->sttype[ss] == ML_st)
3540 pos++;
3541
3542 if (orig_cm->sttype[ss] == B_st)
3543 {
3544 /* push the BIF back on for its right side */
3545 if((status = esl_stack_IPush(pda, 1)) != eslOK) goto ERROR;
3546 if((status = esl_stack_IPush(pda, ss)) != eslOK) goto ERROR;
3547 /* push node index for right child */
3548 if((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
3549 if((status = esl_stack_IPush(pda, orig_cm->cnum[ss])) != eslOK) goto ERROR;
3550 /* push node index for left child */
3551 if((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
3552 if((status = esl_stack_IPush(pda, orig_cm->cfirst[ss])) != eslOK) goto ERROR;
3553 }
3554 else
3555 {
3556 /* push the node back on for right side */
3557 if((status = esl_stack_IPush(pda, 1)) != eslOK) goto ERROR;
3558 if((status = esl_stack_IPush(pda, ss)) != eslOK) goto ERROR;
3559 /* push split state of child node on */
3560 if (orig_cm->sttype[ss] != E_st) {
3561 if((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
3562 if((status = esl_stack_IPush(pda, ss_used[orig_cm->ndidx[ss]+1])) != eslOK) goto ERROR;
3563 }
3564 }
3565 pos += il_ct[orig_cm->ndidx[ss]]; /* account for left inserts */
3566 }
3567 }
3568
3569 if(print_flag)
3570 {
3571 for(nd = 0; nd < orig_cm->nodes; nd++)
3572 {
3573 printf("ss_used[%4d] (%4s) first state(%4d) | ", nd, nodetypes[(int) orig_cm->ndtype[nd]], orig_cm->nodemap[nd]);
3574 if(ss_used[nd] != -1)
3575 printf("%4d (%2s) | L: %3d R: %3d\n", ss_used[nd], sttypes[(int) orig_cm->sttype[ss_used[nd]]], ss_emitl[nd], ss_emitr[nd]);
3576 else
3577 printf("%4d\n", -1);
3578 }
3579 for(nd = 0; nd < orig_cm->nodes; nd++)
3580 printf("il_ct[%4d] (st used: %4d) ct: %4d | ir_ct[%4d] ct: %4d (st used: %4d)\n", nd, il_used[nd], il_ct[nd], nd, ir_used[nd], ir_ct[nd]);
3581
3582 }
3583
3584 orig_tr = CreateParsetree(100);
3585 for(cm_nd = 0; cm_nd < orig_cm->nodes; cm_nd++)
3586 {
3587 emitl_flag = 0;
3588 emitr_flag = 0;
3589 if(orig_cm->sttype[ss_used[cm_nd]] == MP_st ||
3590 orig_cm->sttype[ss_used[cm_nd]] == ML_st)
3591 emitl_flag = 1;
3592 if(orig_cm->sttype[ss_used[cm_nd]] == MP_st ||
3593 orig_cm->sttype[ss_used[cm_nd]] == MR_st)
3594 emitr_flag = 1;
3595
3596 /* At least 1 state in each node must be visited (if we're not in local mode) */
3597 if(orig_cm->ndtype[cm_nd] == BEGR_nd)
3598 {
3599 parent_tr_nd = tr_nd_for_bifs[orig_cm->ndidx[orig_cm->plast[orig_cm->nodemap[cm_nd]]]];
3600 if(print_flag) printf("tr_nd_for_bifs[%d]\n", (orig_cm->ndidx[orig_cm->plast[orig_cm->nodemap[cm_nd]]]));
3601 if(print_flag) printf("parent_tr_nd for cm_nd %d: %d\n", cm_nd, parent_tr_nd);
3602 InsertTraceNode(orig_tr, parent_tr_nd, TRACE_RIGHT_CHILD, ss_emitl[cm_nd], ss_emitr[cm_nd], ss_used[cm_nd]);
3603 orig_tr->nxtr[parent_tr_nd] = orig_tr->n - 1; /* Go back and fix nxtr for the BIF parent of this BEGR */
3604 }
3605 else
3606 {
3607 InsertTraceNode(orig_tr, orig_tr->n-1, TRACE_LEFT_CHILD, ss_emitl[cm_nd], ss_emitr[cm_nd], ss_used[cm_nd]);
3608 if(print_flag) printf("inserted trace node for orig_cm st %4s | emitl: %d | emitr: %d\n", sttypes[(int) orig_cm->sttype[ss_used[cm_nd]]], ss_emitl[cm_nd], ss_emitr[cm_nd]);
3609 }
3610
3611 /* Note: if we've just added a trace node for a BIF state, it's incomplete, in that it
3612 * doesn't have the nextr correctly set. We'll go back and set this when we get to the
3613 * right child (BEGR) of this BIF */
3614 if(orig_cm->ndtype[cm_nd] == BIF_nd)
3615 {
3616 tr_nd_for_bifs[cm_nd] = orig_tr->n - 1;
3617 if(print_flag) printf("set tr_nd_for_bifs[%d]: %d\n", cm_nd, orig_tr->n);
3618 }
3619
3620 /* Add left inserts, if any */
3621 for(i = 0; i < il_ct[cm_nd]; i++)
3622 {
3623 InsertTraceNode(orig_tr, orig_tr->n-1, TRACE_LEFT_CHILD, (ss_emitl[cm_nd] + emitl_flag + i), (ss_emitr[cm_nd] - emitr_flag), il_used[cm_nd]);
3624 if(print_flag) printf("inserted trace node for orig_cm st %4s | emitl: %d | emitr: %d\n", sttypes[(int) orig_cm->sttype[il_used[cm_nd]]], orig_tr->emitl[orig_tr->n-1], orig_tr->emitr[orig_tr->n+1]);
3625 }
3626 /* Add right inserts, if any */
3627 for(i = 0; i < ir_ct[cm_nd]; i++)
3628 {
3629 InsertTraceNode(orig_tr, orig_tr->n-1, TRACE_LEFT_CHILD, (ss_emitl[cm_nd] + emitl_flag + il_ct[cm_nd]), (ss_emitr[cm_nd] - emitr_flag - i), ir_used[cm_nd]);
3630 if(print_flag) printf("inserted trace node for orig_cm st %4s | emitl: %d | emitr: %d\n", sttypes[(int) orig_cm->sttype[ir_used[cm_nd]]], orig_tr->emitl[orig_tr->n-1], orig_tr->emitr[orig_tr->n+1]);
3631 }
3632 if(print_flag) printf("END nd: %4d\n", cm_nd);
3633 }
3634 *ret_orig_tr = orig_tr;
3635
3636 free(ss_used);
3637 free(ss_emitl);
3638 free(ss_emitr);
3639 free(il_used);
3640 free(ir_used);
3641 free(il_ct);
3642 free(ir_ct);
3643 free(tr_nd_for_bifs);
3644 free(nodetypes);
3645 free(sttypes);
3646 esl_stack_Destroy(pda);
3647 return eslOK;
3648
3649 ERROR:
3650 return eslEMEM;
3651 }
3652
3653 /* Function: SubCMLogoddsify()
3654 * Date: EPN, Wed Aug 20 08:14:18 2008
3655 *
3656 * Purpose: Convert the probabilities in a sub CM built
3657 * from <mother_cm> to log-odds. Copy the <mother_cm>
3658 * parameters where possible to save time.
3659 *
3660 * Returns: eslOK on success;
3661 * Throws: eslEINCOMPAT on contract violation.
3662 * eslFAIL if we can't create CMConsensus_t object
3663 * at end of the function.
3664 */
3665 int
SubCMLogoddsify(CM_t * cm,char * errbuf,CM_t * mother_cm,CMSubMap_t * mother_map)3666 SubCMLogoddsify(CM_t *cm, char *errbuf, CM_t *mother_cm, CMSubMap_t *mother_map)
3667 {
3668 if(!(mother_cm->flags & CMH_BITS)) ESL_FAIL(eslEINCOMPAT, errbuf, "SubCMLogoddsify(), mother_cm's CMH_BITS flag down, it's bit scores are invalid.");
3669
3670 int v, mv, x, y;
3671
3672 for (v = 0; v < cm->M; v++) {
3673 if (mother_map->s2o_id[v] == TRUE) { /* this state maps identically to a mother_cm state, copy parameters */
3674 if (cm->sttype[v] != B_st && cm->sttype[v] != E_st) {
3675 mv = mother_map->s2o_smap[v][0];
3676 esl_vec_FCopy(mother_cm->tsc[mv], cm->cnum[v], cm->tsc[v]);
3677 esl_vec_ICopy(mother_cm->itsc[mv], cm->cnum[v], cm->itsc[v]);
3678 #if eslDEBUGLEVEL >= 1
3679 for(x = 0; x < cm->cnum[v]; x++) {
3680 if(esl_FCompare(mother_cm->t[mv][x], cm->t[v][x], 1E-5) != eslOK) ESL_FAIL(eslEINCONCEIVABLE, errbuf, "You've got it wrong, mother_cm->t[mv:%d][x:%d] %.3f != cm->t[v:%d][x:%d] %.3f\n", mv, x, mother_cm->t[mv][x], v, x, cm->t[v][x]);
3681 }
3682 #endif
3683 }
3684 if (cm->sttype[v] == MP_st) {
3685 esl_vec_FCopy(mother_cm->esc[mv], cm->abc->K * cm->abc->K, cm->esc[v]);
3686 esl_vec_ICopy(mother_cm->iesc[mv], cm->abc->K * cm->abc->K, cm->iesc[v]);
3687 }
3688 if (cm->sttype[v] == ML_st || cm->sttype[v] == MR_st ||
3689 cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) {
3690 esl_vec_FCopy(mother_cm->esc[mv], cm->abc->K, cm->esc[v]);
3691 esl_vec_ICopy(mother_cm->iesc[mv], cm->abc->K, cm->iesc[v]);
3692 }
3693 cm->beginsc[v] = mother_cm->beginsc[mv];
3694 cm->ibeginsc[v] = mother_cm->ibeginsc[mv];
3695
3696 cm->endsc[v] = mother_cm->endsc[mv];
3697 cm->iendsc[v] = mother_cm->iendsc[mv];
3698 }
3699 else { /* this state does not map identically to a mother_cm state, we have to do the work calculate the parameters */
3700 if (cm->sttype[v] != B_st && cm->sttype[v] != E_st) {
3701 for (x = 0; x < cm->cnum[v]; x++) {
3702 cm->tsc[v][x] = sreLOG2(cm->t[v][x]);
3703 cm->itsc[v][x] = Prob2Score(cm->t[v][x], 1.0);
3704 /*printf("cm->t[%4d][%2d]: %f itsc->e: %f itsc: %d\n", v, x, cm->t[v][x], Score2Prob(cm->itsc[v][x], 1.0), cm->itsc[v][x]);*/
3705 }
3706 }
3707 if (cm->sttype[v] == MP_st) {
3708 for (x = 0; x < cm->abc->K; x++) {
3709 for (y = 0; y < cm->abc->K; y++) {
3710 cm->esc[v][x*cm->abc->K+y] = sreLOG2(cm->e[v][x*cm->abc->K+y] / (cm->null[x]*cm->null[y]));
3711 cm->iesc[v][x*cm->abc->K+y] = Prob2Score(cm->e[v][x*cm->abc->K+y], (cm->null[x]*cm->null[y]));
3712 /*printf("cm->e[%4d][%2d]: %f iesc->e: %f iesc: %d\n", v, (x*cm->abc->K+y), cm->e[v][(x*cm->abc->K+y)], Score2Prob(cm->iesc[v][x*cm->abc->K+y], (cm->null[x]*cm->null[y])), cm->iesc[v][(x*cm->abc->K+y)]);*/
3713 }
3714 }
3715 }
3716 if (cm->sttype[v] == ML_st || cm->sttype[v] == MR_st ||
3717 cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) {
3718 for (x = 0; x < cm->abc->K; x++) {
3719 cm->esc[v][x] = sreLOG2(cm->e[v][x] / cm->null[x]);
3720 cm->iesc[v][x] = Prob2Score(cm->e[v][x], cm->null[x]);
3721 /*printf("cm->e[%4d][%2d]: %f esc: %f null[%d]: %f\n", v, x, cm->e[v][x], cm->esc[v][x], x, cm->null[x]);*/
3722 /*printf("cm->e[%4d][%2d]: %f iesc->e: %f iesc: %d\n", v, x, cm->e[v][x], Score2Prob(cm->iesc[v][x], (cm->null[x])), cm->iesc[v][x]);*/
3723 }
3724 }
3725 /* These work even if begin/end distributions are inactive 0's,
3726 * sreLOG2 will set beginsc, endsc to -infinity.
3727 */
3728 cm->beginsc[v] = sreLOG2(cm->begin[v]);
3729 cm->ibeginsc[v] = Prob2Score(cm->begin[v], 1.0);
3730 /*printf("cm->begin[%4d]: %f ibeginsc->e: %f ibeginsc: %d\n", v, cm->begin[v], Score2Prob(cm->ibeginsc[v], 1.0), cm->ibeginsc[v]);*/
3731
3732 cm->endsc[v] = sreLOG2(cm->end[v]);
3733 cm->iendsc[v] = Prob2Score(cm->end[v], 1.0);
3734 /*printf("cm->end[%4d]: %f iendsc->e: %f iendsc: %d\n\n", v, cm->end[v], Score2Prob(cm->iendsc[v], 1.0), cm->iendsc[v]);*/
3735 }
3736 }
3737 cm->iel_selfsc = Prob2Score(sreEXP2(cm->el_selfsc), 1.0);
3738 /*printf("cm->el_selfsc: %f prob: %f cm->iel_selfsc: %d prob: %f\n", cm->el_selfsc,
3739 (sreEXP2(cm->el_selfsc)), cm->iel_selfsc, (Score2Prob(cm->iel_selfsc, 1.0)));
3740 printf("-INFTY: %d prob: %f 2^: %f\n", -INFTY, (Score2Prob(-INFTY, 1.0)), sreEXP2(-INFTY));*/
3741
3742 /* Allocate and fill optimized emission scores for this CM.
3743 * If they already exist, free them and recalculate them, slightly wasteful, oh well.
3744 */
3745 if(cm->oesc != NULL || cm->ioesc != NULL) FreeOptimizedEmitScores(cm->oesc, cm->ioesc, cm->M);
3746
3747 cm->oesc = SubFCalcAndCopyOptimizedEmitScoresFromMother(cm, mother_cm, mother_map);
3748
3749 /* EPN, Wed Aug 20 15:26:31 2008
3750 * old, slow way:
3751 * cm->ioesc = ICalcOptimizedEmitScores(cm);
3752 */
3753 cm->ioesc = ICopyOptimizedEmitScoresFromFloats(cm, cm->oesc);
3754
3755 /* Potentially, overwrite transitions with non-probabilistic
3756 * RSEARCH transitions. Currently only default transition
3757 * parameters are allowed, these are defined as DEFAULT_R*
3758 * in infernal.h
3759 * Note: This is untouched from CMLogoddsify(), we don't try
3760 * to accelerate it, it's unusual that it will be executed.
3761 */
3762 if(cm->flags & CM_RSEARCHTRANS) {
3763 float alpha = DEFAULT_RALPHA;
3764 float beta = DEFAULT_RBETA;
3765 float alphap = DEFAULT_RALPHAP;
3766 float betap = DEFAULT_RBETAP;
3767 float beginsc = DEFAULT_RBEGINSC;
3768 float endsc = DEFAULT_RENDSC;
3769 int nd;
3770 /* First do the normal transitions */
3771 for (v=0; v<cm->M; v++)
3772 {
3773 if (cm->sttype[v] != B_st && cm->sttype[v] != E_st)
3774 {
3775 for (x=0; x<cm->cnum[v]; x++)
3776 {
3777 cm->tsc[v][x] = -1. * rsearch_calculate_gap_penalty
3778 (cm->stid[v], cm->stid[cm->cfirst[v]+x],
3779 cm->ndtype[cm->ndidx[v]], cm->ndtype[cm->ndidx[cm->cfirst[v]+x]],
3780 alpha, beta, alphap, betap);
3781 /* alphas and rbetas were positive -- gap score is a penalty, so
3782 multiply by -1 */
3783 cm->itsc[v][x] = (int) floor(0.5 + INTSCALE * cm->tsc[v][x]);
3784 }
3785 }
3786 }
3787 /* Overwrite local begin and end scores */
3788 for (v=cm->M - 1; v>=0; v--) {
3789 cm->beginsc[v] = IMPOSSIBLE;
3790 cm->endsc[v] = IMPOSSIBLE;
3791 }
3792
3793 /* beginsc states */
3794 for (nd = 2; nd < cm->nodes; nd++) {
3795 if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
3796 cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BIF_nd)
3797 {
3798 cm->beginsc[cm->nodemap[nd]] = beginsc;
3799 cm->ibeginsc[cm->nodemap[nd]] = INTSCALE * beginsc;
3800 }
3801 }
3802
3803 /* endsc states */
3804 for (nd = 1; nd < cm->nodes; nd++) {
3805 if ((cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
3806 cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BEGL_nd ||
3807 cm->ndtype[nd] == BEGR_nd) &&
3808 cm->ndtype[nd+1] != END_nd)
3809 {
3810 cm->endsc[cm->nodemap[nd]] = endsc;
3811 cm->iendsc[cm->nodemap[nd]] = INTSCALE * endsc;
3812 }
3813 }
3814
3815 cm->flags |= CMH_LOCAL_BEGIN;
3816 cm->flags |= CMH_LOCAL_END;
3817 }
3818 /* raise flag saying we have valid log odds scores */
3819 cm->flags |= CMH_BITS;
3820
3821 /* create cm->cmcons, we expect this to be valid if we have valid log odds score */
3822 if(cm->cmcons != NULL) FreeCMConsensus(cm->cmcons);
3823 if((cm->cmcons = CreateCMConsensus(cm, cm->abc)) == NULL) return eslFAIL;
3824
3825 return eslOK;
3826 }
3827
3828 /* Function: SubFCalcAndCopyOptimizedEmitScoresFromMother()
3829 * Date: EPN, Wed Aug 20 15:40:23 2008
3830 *
3831 * Purpose: Allocate, fill and return an optimized emission score vector
3832 * of float scores for fast search/alignment.
3833 * Fill emission scores by copying them when
3834 * possible from a 'mother' CM if the current
3835 * cm is a sub CM of the mother.
3836 *
3837 * Returns: the 2D float emission score vector on success,
3838 * dies immediately on memory allocation error.
3839 */
3840 float **
SubFCalcAndCopyOptimizedEmitScoresFromMother(CM_t * cm,CM_t * mother_cm,CMSubMap_t * mother_map)3841 SubFCalcAndCopyOptimizedEmitScoresFromMother(CM_t *cm, CM_t *mother_cm, CMSubMap_t *mother_map)
3842 {
3843 int status;
3844 float **esc_vAA;
3845 ESL_DSQ a,b;
3846 int v;
3847 int cur_cell;
3848 int npairs = 0;
3849 int nsinglets = 0;
3850 float *ptr_to_start; /* points to block allocated to esc_vAA[0], nec b/c esc_vAA[0] gets set to NULL, because v == 0 is non-emitter */
3851 float **leftAA;
3852 float **rightAA;
3853 int mv;
3854
3855 /* count pairs, singlets */
3856 for(v = 0; v < cm->M; v++) {
3857 switch(cm->sttype[v]) {
3858 case IL_st:
3859 case ML_st:
3860 case IR_st:
3861 case MR_st:
3862 nsinglets++;
3863 break;
3864 case MP_st:
3865 npairs++;
3866 break;
3867 }
3868 }
3869
3870 /* set up our left and right vectors for all possible non-canonical residues,
3871 * these are calc'ed once and passed to FastPairScore*() functions to minimize
3872 * run time.
3873 */
3874 ESL_ALLOC(leftAA, sizeof(float *) * cm->abc->Kp);
3875 ESL_ALLOC(rightAA, sizeof(float *) * cm->abc->Kp);
3876 for(a = 0; a <= cm->abc->K; a++) leftAA[a] = rightAA[a] = NULL; /* canonicals and gap, left/right unnec */
3877 for(a = cm->abc->K+1; a < cm->abc->Kp-1; a++) {
3878 ESL_ALLOC(leftAA[a], sizeof(float) * cm->abc->K);
3879 ESL_ALLOC(rightAA[a], sizeof(float) * cm->abc->K);
3880 esl_vec_FSet(leftAA[a], cm->abc->K, 0.);
3881 esl_vec_FSet(rightAA[a], cm->abc->K, 0.);
3882 esl_abc_FCount(cm->abc, leftAA[a], a, 1.);
3883 esl_abc_FCount(cm->abc, rightAA[a], a, 1.);
3884 }
3885 leftAA[cm->abc->Kp-1] = rightAA[cm->abc->Kp-1] = NULL; /* missing data, left/right unnec */
3886
3887 /* precalculate possible emission scores for each state */
3888 ESL_ALLOC(esc_vAA, sizeof(float *) * (cm->M));
3889 ESL_ALLOC(esc_vAA[0], sizeof(float) * ((cm->abc->Kp * nsinglets) + (cm->abc->Kp * cm->abc->Kp * npairs)));
3890 ptr_to_start = esc_vAA[0];
3891 cur_cell = 0;
3892 for(v = 0; v < cm->M; v++) {
3893 switch(cm->sttype[v]) {
3894 case IL_st:
3895 case ML_st:
3896 case IR_st:
3897 case MR_st:
3898 esc_vAA[v] = ptr_to_start + cur_cell;
3899 cur_cell += cm->abc->Kp;
3900 if (mother_map->s2o_id[v] == TRUE) { /* this state maps identically to a mother_cm state, copy parameters */
3901 mv = mother_map->s2o_smap[v][0];
3902 esl_vec_FCopy(mother_cm->oesc[mv], cm->abc->Kp, esc_vAA[v]);
3903 }
3904 else {
3905 for(a = 0; a < cm->abc->K; a++) /* all canonical residues */
3906 esc_vAA[v][a] = cm->esc[v][a];
3907 esc_vAA[v][cm->abc->K] = IMPOSSIBLE; /* gap symbol is impossible */
3908 for(a = cm->abc->K+1; a < cm->abc->Kp-1; a++) /* all ambiguous residues */
3909 esc_vAA[v][a] = esl_abc_FAvgScore(cm->abc, a, cm->esc[v]);
3910 esc_vAA[v][cm->abc->Kp-1] = IMPOSSIBLE; /* missing data is IMPOSSIBLE */
3911 }
3912 break;
3913 case MP_st:
3914 esc_vAA[v] = ptr_to_start + cur_cell;
3915 esl_vec_FSet(esc_vAA[v], cm->abc->Kp * cm->abc->Kp, IMPOSSIBLE); /* init all cells to IMPOSSIBLE */
3916 cur_cell += cm->abc->Kp * cm->abc->Kp;
3917 if (mother_map->s2o_id[v] == TRUE) { /* this state maps identically to a mother_cm state, copy parameters */
3918 mv = mother_map->s2o_smap[v][0];
3919 esl_vec_FCopy(mother_cm->oesc[mv], cm->abc->Kp * cm->abc->Kp, esc_vAA[v]);
3920 }
3921 else {
3922 /* a is canonical, b is canonical */
3923 for(a = 0; a < cm->abc->K; a++) {
3924 for(b = 0; b < cm->abc->K; b++) {
3925 esc_vAA[v][(a * cm->abc->Kp) + b] = cm->esc[v][(a * cm->abc->K) + b];
3926 }
3927 }
3928 /* a is not canonical, b is canonical */
3929 for(a = cm->abc->K+1; a < cm->abc->Kp-1; a++) {
3930 for(b = 0; b < cm->abc->K; b++) {
3931 esc_vAA[v][(a * cm->abc->Kp) + b] = FastPairScoreLeftOnlyDegenerate(cm->abc->K, cm->esc[v], leftAA[a], b);
3932 }
3933 }
3934 /* a is canonical, b is not canonical */
3935 for(a = 0; a < cm->abc->K; a++) {
3936 for(b = cm->abc->K+1; b < cm->abc->Kp-1; b++) {
3937 esc_vAA[v][(a * cm->abc->Kp) + b] = FastPairScoreRightOnlyDegenerate(cm->abc->K, cm->esc[v], rightAA[b], a);
3938 }
3939 }
3940 /* a is not canonical, b is not canonical */
3941 for(a = cm->abc->K+1; a < cm->abc->Kp-1; a++) {
3942 for(b = cm->abc->K+1; b < cm->abc->Kp-1; b++) {
3943 esc_vAA[v][(a * cm->abc->Kp) + b] = FastPairScoreBothDegenerate(cm->abc->K, cm->esc[v], leftAA[a], rightAA[b]);
3944 }
3945 }
3946 /* everything else, when either a or b is gap or missing data, stays IMPOSSIBLE */
3947 }
3948 break;
3949 default:
3950 esc_vAA[v] = NULL;
3951 break;
3952 }
3953 }
3954 for(a = 0; a < cm->abc->Kp; a++) {
3955 if(leftAA[a] != NULL) free(leftAA[a]);
3956 if(rightAA[a] != NULL) free(rightAA[a]);
3957 }
3958 free(leftAA);
3959 free(rightAA);
3960 return esc_vAA;
3961
3962 ERROR:
3963 cm_Fail("memory allocation error.");
3964 return NULL; /* NEVERREACHED */
3965 }
3966
3967
3968 /* Function: CP9_reconfig2sub()
3969 * EPN 10.16.06
3970 *
3971 * Purpose: Given a CM Plan 9 HMM and a start position
3972 * (spos) and end position (epos) that a sub CM models,
3973 * reconfigure the HMM so that it can only start in the
3974 * node that models spos (spos_nd) end in the node that
3975 * models epos (epos_nd).
3976 *
3977 * If we're reconfiguring a CP9 HMM that ONLY models the
3978 * consensus columns spos to epos, then spos_nd == 1
3979 * and epos_nd == hmm->M, but this is not necessarily true.
3980 * We may be reconfiguring a CP9 HMM that models the
3981 * full alignment including positions before and/or after
3982 * spos and epos. In this case spos_nd == spos and
3983 * epos_nd == epos;
3984 *
3985 * Args: hmm - the CP9 model w/ data-dep prob's valid
3986 * spos - first consensus column modelled by some original
3987 * full length, template CP9 HMM that 'hmm' models.
3988 * epos - final consensus column modelled by some original
3989 * CP9 HMM that 'hmm' models.
3990 * spos_nd - the node of 'hmm' that models spos.
3991 * (1 if 'hmm' only has (epos-spos+1) nodes
3992 * (spos if 'hmm' has a node for each column of original aln)
3993 * epos_nd - the node of the 'hmm' in that models epos.
3994 * (hmm->M if 'hmm' only has (epos-spos+1) nodes
3995 * (epos if 'hmm' has a node for each column of original aln)
3996 * orig_phi - the 2D phi array for the original CP9 HMM.
3997 * Return: (void)
3998 * HMM probabilities are modified.
3999 */
4000 void
CP9_reconfig2sub(CP9_t * hmm,int spos,int epos,int spos_nd,int epos_nd,double ** orig_phi)4001 CP9_reconfig2sub(CP9_t *hmm, int spos, int epos, int spos_nd,
4002 int epos_nd, double **orig_phi)
4003 {
4004 /* Make the necessary modifications. Since in cmalign --sub mode this
4005 * function will be called potentially once for each sequence, we
4006 * don't want to call CP9Logoddsify(), but rather only logoddsify
4007 * the parameters that are different.
4008 */
4009
4010 /* Configure entry.
4011 * Exactly 3 ways to start, B->M_1 (hmm->begin[1]), B->I_0 (hmm->t[0][CTMI]),
4012 * and B->D_1 (hmm->t[0][CTMD])
4013 */
4014 /* prob of starting in M_spos is (1. - prob of starting in I_spos-1) as there is no D_spos-1 -> M_spos trans */
4015
4016 if(spos > 1)
4017 {
4018 hmm->begin[spos_nd] = 1.-((orig_phi[spos-1][HMMINSERT] * (1. - hmm->t[spos-1][CTII])) +
4019 (orig_phi[spos ][HMMDELETE] - (orig_phi[spos-1][HMMINSERT] * hmm->t[spos-1][CTID])));
4020 hmm->t[spos_nd-1][CTMI] = (orig_phi[spos-1][HMMINSERT] * (1. - hmm->t[spos-1][CTII]));
4021 hmm->t[spos_nd-1][CTMD] = orig_phi[spos ][HMMDELETE] - (orig_phi[spos-1][HMMINSERT] * hmm->t[spos-1][CTID]);
4022 hmm->t[spos_nd-1][CTMM] = 0.; /* probability of going from B(M_0) to M_1 is begin[1] */
4023 hmm->t[spos_nd-1][CTMEL] = 0.; /* can't go to EL from B(M_0) */
4024 hmm->t[spos_nd-1][CTDM] = 0.; /* D_0 doesn't exist */
4025 hmm->t[spos_nd-1][CTDI] = 0.; /* D_0 doesn't exist */
4026 hmm->t[spos_nd-1][CTDD] = 0.; /* D_0 doesn't exist */
4027
4028 hmm->bsc[spos_nd] = Prob2Score(hmm->begin[1], 1.0);
4029
4030 hmm->tsc[CTMM][spos_nd-1] = -INFTY; /* probability of going from B(M_0) to M_1 is begin[1] */
4031 hmm->tsc[CTMEL][spos_nd-1] = -INFTY;
4032 hmm->tsc[CTDM][spos_nd-1] = -INFTY; /* D_0 doesn't exist */
4033 hmm->tsc[CTDI][spos_nd-1] = -INFTY; /* D_0 doesn't exist */
4034 hmm->tsc[CTDD][spos_nd-1] = -INFTY; /* D_0 doesn't exist */
4035
4036 hmm->tsc[CTMI][spos_nd-1] = Prob2Score(hmm->t[spos_nd-1][CTMI], 1.0);
4037 hmm->tsc[CTMD][spos_nd-1] = Prob2Score(hmm->t[spos_nd-1][CTMD], 1.0);
4038 }
4039
4040 if(epos < hmm->M)
4041 {
4042 hmm->end[epos_nd] = hmm->t[epos][CTMM] + hmm->t[epos][CTMD];
4043 hmm->t[epos_nd][CTDM] += hmm->t[epos][CTDD];
4044 hmm->t[epos_nd][CTIM] += hmm->t[epos][CTID];
4045 hmm->t[epos_nd][CTMM] = 0.; /* M->E is actually end[M] */
4046 hmm->t[epos_nd][CTMEL] = 0.;
4047 hmm->t[epos_nd][CTMD] = 0.; /* D_M+1 doesn't exist */
4048 hmm->t[epos_nd][CTDD] = 0.; /* D_M+1 doesn't exist */
4049 hmm->t[epos_nd][CTID] = 0.; /* D_M+1 doesn't exist */
4050
4051 hmm->esc[epos_nd] = Prob2Score(hmm->end[epos_nd], 1.0);
4052 hmm->tsc[CTDM][epos_nd] = Prob2Score(hmm->t[epos_nd][CTDM], 1.0);
4053 hmm->tsc[CTIM][epos_nd] = Prob2Score(hmm->t[epos_nd][CTIM], 1.0);
4054 hmm->tsc[CTMM][epos_nd] = -INFTY; /* M->E is actually end[M] */
4055 hmm->tsc[CTMEL][epos_nd] = -INFTY;
4056 hmm->tsc[CTMD][epos_nd] = -INFTY; /* D_M+1 doesn't exist */
4057 hmm->tsc[CTDD][epos_nd] = -INFTY; /* D_M+1 doesn't exist */
4058 hmm->tsc[CTID][epos_nd] = -INFTY; /* D_M+1 doesn't exist */
4059 }
4060 hmm->flags |= CPLAN9_HASBITS; /* raise the log-odds ready flag */
4061
4062 return;
4063 }
4064
4065 #if 0
4066 /* These two functions are not currently used, but could be useful for debugging
4067 * in the future */
4068 static void debug_print_misc_sub_cm_info(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, CP9Map_t *orig_cp9map);
4069 static void debug_sub_cm_check_all_trans(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap);
4070
4071 /**************************************************************************
4072 * EPN 10.06.06
4073 * Function: debug_print_misc_sub_cm_info()
4074 **************************************************************************/
4075 static void
4076 debug_print_misc_sub_cm_info(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, CP9Map_t *orig_cp9map)
4077 {
4078 int status;
4079 int orig_il1;
4080 int orig_il2;
4081 int orig_ir1;
4082 int orig_ir2;
4083 int orig_ss;
4084
4085 char **nodetypes;
4086 char **sttypes;
4087 char **sides;
4088
4089 int orig_n1_type;
4090 int side_idx;
4091
4092 ESL_ALLOC(nodetypes, sizeof(char *) * 8);
4093 nodetypes[0] = "BIF";
4094 nodetypes[1] = "MATP";
4095 nodetypes[2] = "MATL";
4096 nodetypes[3] = "MATR";
4097 nodetypes[4] = "BEGL";
4098 nodetypes[5] = "BEGR";
4099 nodetypes[6] = "ROOT";
4100 nodetypes[7] = "END";
4101
4102 ESL_ALLOC(sttypes, sizeof(char *) * 10);
4103 sttypes[0] = "D";
4104 sttypes[1] = "MP";
4105 sttypes[2] = "ML";
4106 sttypes[3] = "MR";
4107 sttypes[4] = "IL";
4108 sttypes[5] = "IR";
4109 sttypes[6] = "S";
4110 sttypes[7] = "E";
4111 sttypes[8] = "B";
4112 sttypes[9] = "EL";
4113
4114 ESL_ALLOC(sides, sizeof(char *) * 3);
4115 sides[0] = "L";
4116 sides[1] = "R";
4117 sides[2] = "N";
4118
4119
4120 orig_il1 = submap->s2o_smap[1][0]; /* 1st of up to 2 states that maps to sub_cm's ROOT_IL */
4121 orig_il2 = submap->s2o_smap[1][1]; /* 2nd state that maps to sub_cm's ROOT_IL or -1 if only 1 maps*/
4122 orig_ir1 = submap->s2o_smap[2][0]; /* 1st of up to 2 states that maps to sub_cm's ROOT_IR */
4123 orig_ir2 = submap->s2o_smap[2][1]; /* 2nd state that maps to sub_cm's ROOT_IR or -1 if only 1 maps*/
4124
4125 /* We ASSUME that ambiguities have been removed, i.e. if two insert states map to either ROOT_IL
4126 * or ROOT_IR, one of them has been detached. We exploit this knowledge.
4127 */
4128 if(orig_il2 != -1)
4129 {
4130 if(orig_cm->sttype[orig_il1+1] == E_st)
4131 orig_il1 = orig_il2; /* orig_il1 was detached */
4132 else if(orig_cm->sttype[orig_il2+1] == E_st)
4133 {
4134 /* do nothing */
4135 }
4136 else
4137 cm_Fail("ERROR, can't determine which state was detached in debug_print_misc_sub_cm_info\n");
4138 }
4139 if(orig_ir2 != -1)
4140 {
4141 if(orig_cm->sttype[orig_ir1+1] == E_st)
4142 orig_ir1 = orig_ir2; /* orig_ir1 was detached */
4143 else if(orig_cm->sttype[orig_ir2+1] == E_st)
4144 {
4145 /* do nothing */
4146 }
4147 else
4148 cm_Fail("ERROR, can't determine which state was detached in debug_print_misc_sub_cm_info\n");
4149 }
4150
4151 /* Now orig_il1 and orig_ir1 map to the ONLY insert states that map to sub_cm
4152 * ROOT_IL and ROOT_IR respectively.
4153 */
4154 printf("10.16.06 IL1: %3d %4s %2s | start: %3d | end: %3d\n", orig_il1, nodetypes[(int) orig_cm->ndtype[orig_cm->ndidx[orig_il1]]], sttypes[(int) orig_cm->sttype[orig_il1]], submap->spos, submap->epos);
4155
4156 if(sub_cm->ndtype[1] == BIF_nd)
4157 {
4158 orig_n1_type = 0;
4159 }
4160 else
4161 {
4162 orig_n1_type = orig_cm->ndtype[orig_cm->ndidx[(submap->s2o_smap[3][0])]];
4163 }
4164
4165 if(orig_n1_type == MATP_nd && sub_cm->ndtype[1] != MATR_nd)
4166 {
4167 if(orig_cp9map->nd2lpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]] == submap->spos)
4168 side_idx = 0;
4169 else if(orig_cp9map->nd2rpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]] == submap->spos)
4170 side_idx = 1;
4171 else
4172 cm_Fail("ERROR MATP confusion! orig_cm node: %d | left: %d | right: %d | submap->spos: %d\n", (orig_cm->ndidx[(submap->s2o_smap[3][0])]), (orig_cp9map->nd2lpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]]), (orig_cp9map->nd2rpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]]), submap->spos);
4173 }
4174 else if (orig_n1_type == MATP_nd && sub_cm->ndtype[1] == MATR_nd)
4175 {
4176 if(orig_cp9map->nd2lpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]] == submap->epos)
4177 side_idx = 0;
4178 else if(orig_cp9map->nd2rpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]] == submap->epos)
4179 side_idx = 1;
4180 else
4181 cm_Fail("ERROR MATP confusion! orig_cm node: %d | left: %d | right: %d | submap->spos: %d\n", (orig_cm->ndidx[(submap->s2o_smap[3][0])]), (orig_cp9map->nd2lpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]]), (orig_cp9map->nd2rpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]]), submap->spos);
4182 }
4183 else
4184 {
4185 side_idx = 2;
4186 }
4187 printf("10.16.06 IR1: %3d %4s %2s | sub-n1: %4s | orig-n1: %4s%1s | case: ", orig_ir1, nodetypes[(int) orig_cm->ndtype[orig_cm->ndidx[orig_ir1]]], sttypes[(int) orig_cm->sttype[orig_ir1]], nodetypes[(int) sub_cm->ndtype[1]], nodetypes[(int) orig_n1_type], sides[side_idx]);
4188
4189 /* figure out 'case' of ROOT transitions */
4190 orig_ss = submap->s2o_smap[3][0]; /* orig_ss is the 1 (of possibly 2) orig_cm states that map to the first
4191 * state in sub_cm node 1 (sub_cm state 3)
4192 */
4193 if((orig_il1 < orig_ir1) && (orig_ir1 < orig_ss))
4194 printf("1A\n");
4195 if((orig_il1 < orig_ss) && (orig_ss < orig_ir1))
4196 printf("1B\n");
4197 if((orig_ss < orig_il1) && (orig_il1 < orig_ir1))
4198 printf("1C\n");
4199
4200 if((orig_ir1 < orig_il1) && (orig_il1 < orig_ss))
4201 printf("2A\n");
4202 if((orig_ir1 < orig_ss) && (orig_ss < orig_il1))
4203 printf("2B\n");
4204 if((orig_ss < orig_ir1) && (orig_ir1 < orig_il1))
4205 printf("2C\n");
4206
4207 printf("\n");
4208
4209 /* Begin 10.17.06 info */
4210 int ilmap, irmap;
4211 int ildual, irdual;
4212 int iloff, iroff;
4213 int other_insert_il;
4214 int other_insert_ir;
4215
4216 if(orig_cm->sttype[orig_il1] == IL_st)
4217 ilmap = 0; /* sides[0] = "L" */
4218 else
4219 ilmap = 1; /* sides[1] = "R" */
4220 if(orig_cm->sttype[orig_ir1] == IL_st)
4221 irmap = 0; /* sides[0] = "L" */
4222 else
4223 irmap = 1; /* sides[1] = "R" */
4224
4225 if(orig_cm->ndtype[orig_cm->ndidx[orig_il1]] == MATP_nd ||
4226 orig_cm->ndtype[orig_cm->ndidx[orig_il1]] == ROOT_nd)
4227 ildual = TRUE;
4228 else
4229 ildual = FALSE;
4230 if(orig_cm->ndtype[orig_cm->ndidx[orig_ir1]] == MATP_nd ||
4231 orig_cm->ndtype[orig_cm->ndidx[orig_ir1]] == ROOT_nd)
4232 irdual = TRUE;
4233 else
4234 irdual = FALSE;
4235
4236 iloff = -1;
4237 if(ildual == TRUE)
4238 {
4239 /* check if other insert state in orig_cm node that has insert that
4240 * maps to sub_cm ROOT_IL maps to a state in the sub_cm
4241 */
4242 if(ilmap == 0) /* ROOT_IL maps to a IL */
4243 other_insert_il = orig_il1 + 1;
4244 else /* ROOT_IL maps to a IR */
4245 other_insert_il = orig_il1 - 1;
4246 if(submap->o2s_smap[other_insert_il][0] == -1 &&
4247 submap->o2s_smap[other_insert_il][1] == -1)
4248 iloff = TRUE;
4249 else
4250 iloff = FALSE;
4251 }
4252
4253 iroff = -1;
4254 if(irdual == TRUE)
4255 {
4256 if(irmap == 0) /* ROOT_IR maps to a IL */
4257 other_insert_ir = orig_ir1 + 1;
4258 else /* ROOT_IR maps to a IR */
4259 other_insert_ir = orig_ir1 - 1;
4260 if(submap->o2s_smap[other_insert_ir][0] == -1 &&
4261 submap->o2s_smap[other_insert_ir][1] == -1)
4262 iroff = TRUE;
4263 else
4264 iroff = FALSE;
4265 }
4266
4267 CMEmitMap_t *orig_emap; /* consensus emit map for the original, template CM */
4268 int other_cc_il, other_cc_ir;
4269 orig_emap = CreateEmitMap(orig_cm);
4270 other_cc_il = -1;
4271 other_cc_ir = -1;
4272 if(ildual == TRUE)
4273 {
4274 if(orig_cm->sttype[other_insert_il] == IL_st) /* sub ROOT_IL maps to IR, other maps to IL */
4275 {
4276 other_cc_il = orig_emap->lpos[orig_cm->ndidx[other_insert_il]] + 1;
4277 if(other_cc_il > submap->spos)
4278 cm_Fail("ERROR FUNKY\n");
4279 ildual = 4;
4280 }
4281 else /* ROOT_IL maps to IL, other maps to IR */
4282 {
4283 other_cc_il = orig_emap->rpos[orig_cm->ndidx[other_insert_il]] - 1;
4284 if(other_cc_il < submap->epos)
4285 ildual = 1;
4286 if(other_cc_il == submap->epos)
4287 ildual = 2;
4288 if(other_cc_il > submap->epos)
4289 ildual = 3;
4290 }
4291 }
4292 /*printf("10.17.06 other_insert_il: %d other_cc_il: %d submap->spos: %d submap->epos: %d ildual: %d\n", other_insert_il, other_cc_il, submap->spos, submap->epos, ildual);*/
4293 if(irdual == TRUE)
4294 {
4295 if(orig_cm->sttype[other_insert_ir] == IL_st) /* sub ROOT_IR maps to IR, other maps to IL */
4296 {
4297 other_cc_ir = orig_emap->lpos[orig_cm->ndidx[other_insert_ir]] + 1;
4298 if(other_cc_ir > submap->spos)
4299 irdual = 1;
4300 if(other_cc_ir == submap->spos)
4301 irdual = 2;
4302 if(other_cc_ir < submap->spos)
4303 irdual = 3;
4304 }
4305 else /* ROOT_IR maps to IL, other maps to IR */
4306 {
4307 other_cc_ir = orig_emap->rpos[orig_cm->ndidx[other_insert_ir]] - 1;
4308 if(other_cc_ir < submap->epos)
4309 cm_Fail("ERROR FUNKY\n");
4310 irdual = 4;
4311 }
4312 }
4313 /*printf("10.17.06 other_insert_ir: %d other_cc_ir: %d submap->spos: %d submap->epos: %d irdual: %d\n", other_insert_ir, other_cc_ir, submap->spos, submap->epos, irdual);*/
4314
4315 printf("10.17.06 ilmap: %5s ildual: %2d iloff: %2d irmap: %5s irdual: %2d iroff: %2d subn1: %4s orign1: %4s%1s\n", sides[ilmap], ildual, iloff, sides[irmap], irdual, iroff, nodetypes[(int) sub_cm->ndtype[1]], nodetypes[(int) orig_n1_type], sides[side_idx]);
4316
4317 int start_flag;
4318 int v;
4319 int vend;
4320
4321 if(orig_il1 < orig_ir1)
4322 {
4323 v = orig_il1;
4324 vend = orig_ir1;
4325 }
4326 else
4327 {
4328 v = orig_ir1;
4329 vend = orig_il1;
4330 }
4331 start_flag = 0;
4332 for(; v <= vend; v++)
4333 if(orig_cm->sttype[v] == S_st)
4334 start_flag = 1;
4335 return;
4336
4337 ERROR:
4338 cm_Fail("Memory allocation error.");
4339 }
4340
4341 /**************************************************************************
4342 * EPN 11.01.06
4343 * debug_sub_cm_check_all_trans()
4344 */
4345 void
4346 debug_sub_cm_check_all_trans(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap)
4347 {
4348 int status;
4349 int nd;
4350 int v;
4351 int y, yoffset;
4352 float sum, ndsum;
4353 int orig_nd, orig_v1, orig_v2;
4354
4355 char **nodetypes;
4356 ESL_ALLOC(nodetypes, sizeof(char *) * 8);
4357 nodetypes[0] = "BIF";
4358 nodetypes[1] = "MATP";
4359 nodetypes[2] = "MATL";
4360 nodetypes[3] = "MATR";
4361 nodetypes[4] = "BEGL";
4362 nodetypes[5] = "BEGR";
4363 nodetypes[6] = "ROOT";
4364 nodetypes[7] = "END";
4365
4366 char **sttypes;
4367 ESL_ALLOC(sttypes, sizeof(char *) * 10);
4368 sttypes[0] = "D";
4369 sttypes[1] = "MP";
4370 sttypes[2] = "ML";
4371 sttypes[3] = "MR";
4372 sttypes[4] = "IL";
4373 sttypes[5] = "IR";
4374 sttypes[6] = "S";
4375 sttypes[7] = "E";
4376 sttypes[8] = "B";
4377 sttypes[9] = "EL";
4378
4379 for(nd = 0; nd < sub_cm->nodes; nd++)
4380 {
4381 sum = 0.;
4382 if(sub_cm->ndtype[nd] != END_nd && sub_cm->ndtype[nd] != BIF_nd)
4383 {
4384 ndsum = 0.;
4385 v = sub_cm->nodemap[nd];
4386 while(sub_cm->ndidx[v] == nd && sub_cm->sttype[v] != IL_st && sub_cm->sttype[v] != IR_st)
4387 {
4388 sum = 0.;
4389 for(y = sub_cm->cfirst[v]; y < sub_cm->cfirst[v]+sub_cm->cnum[v]; y++)
4390 {
4391 yoffset = y - sub_cm->cfirst[v];
4392 printf("\t\tsub_cm->t[%3d][%3d]: %f\n", v, yoffset, sub_cm->t[v][yoffset]);
4393 sum += sub_cm->t[v][yoffset];
4394 }
4395 orig_v1 = submap->s2o_smap[v][0];
4396 orig_v2 = submap->s2o_smap[v][1];
4397 orig_nd = orig_cm->ndidx[orig_v1];
4398 if(sub_cm->ndtype[nd+1] != END_nd)
4399 {
4400 if(orig_v2 != -1)
4401 printf("sum t[%4d %4s %2s %2s] nd: %4d: %f\n", v, nodetypes[(int) orig_cm->ndtype[orig_nd]], sttypes[(int) orig_cm->sttype[orig_v1]], sttypes[(int) orig_cm->sttype[orig_v2]], nd, sum);
4402 else
4403 printf("sum t[%4d %4s %2s ] nd: %4d: %f\n", v, nodetypes[(int) orig_cm->ndtype[orig_nd]], sttypes[(int) orig_cm->sttype[orig_v1]], nd, sum);
4404 }
4405 ndsum += sum;
4406 v++;
4407 }
4408 if(sub_cm->ndtype[nd+1] != END_nd)
4409 printf("\tndsum t nd (%4s): %4d: %f\n", nodetypes[(int) orig_cm->ndtype[orig_nd]], nd, ndsum);
4410 }
4411 }
4412 free(nodetypes);
4413 free(sttypes);
4414 return;
4415
4416 ERROR:
4417 cm_Fail("Memory allocation error.");
4418 }
4419 #endif
4420