1 /*****************************************************************
2  * cm_submodel.c (formerly sub_cm.c)
3  * EPN 07.25.06 (Benasque)
4  *
5  * Building submodels (sub CMs) from a template CM, that represent
6  * a contiguous subset of the consensus columns that were modelled
7  * by the template CM.
8  *
9  * These functions are still under development. No guarantees.
10  *
11  * NOTE: 'sub CM' here does not correspond to Zasha Weinberg's use
12  *       of the term for filtering with subtrees of the model. To
13  *       be unambiguous, I should have not used 'sub CM', it's bad
14  *       form on my part. However 'sub CM' is so engrained in the
15  *       codebase at this point, I'm wary to change it, so it stays.
16  *
17  */
18 
19 #include "esl_config.h"
20 #include "p7_config.h"
21 #include "config.h"
22 
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <math.h>
27 
28 #include "easel.h"
29 #include "esl_alphabet.h"
30 #include "esl_random.h"
31 #include "esl_stack.h"
32 #include "esl_vectorops.h"
33 #include "esl_wuss.h"
34 
35 #include "hmmer.h"
36 
37 #include "infernal.h"
38 
39 static void  map_orig2sub_cm(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int print_flag);
40 static int   map_orig2sub_cm_helper(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int orig_v, int sub_v);
41 static int   cm2sub_cm_check_id_next_node(CM_t *orig_cm, CM_t *sub_cm, int orig_nd, int sub_nd,
42 					  CMSubMap_t *submap, CP9Map_t *orig_cp9map, CP9Map_t *sub_cp9map,
43 					  int print_flag);
44 static void  cm2sub_cm_emit_probs(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, int v_s, int v_o1, int v_o2,
45 				  CMSubMap_t *submap);
46 static void  cm2sub_cm_trans_probs(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap, int v_s,
47 				   CMSubMap_t *submap);
48 static void  cm2sub_cm_trans_probs_S(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap, int v_start,
49 				     CMSubMap_t *submap);
50 static void  cm2sub_cm_trans_probs_B_E(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap, int v_end,
51 				       CMSubMap_t *submap, int print_flag);
52 static void  cm2sub_cm_add_single_trans(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int orig_v, int orig_y,
53 					int sub_v, int yoffset, double *orig_psi, char ***tmap);
54 static float cm2sub_cm_sum_subpaths(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int start, int end,
55 				    int init_sub_start, char ***tmap, double *orig_psi);
56 static int   cm_trans_check(CM_t *cm, int a, int b);
57 static void  cm2sub_cm_subtract_root_subpaths(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap,
58 					      CMSubMap_t *submap, int print_flag);
59 static void  cm2sub_cm_find_impossible_misc_cases(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, CMSubInfo_t *subinfo,
60 						  CP9Map_t *orig_cp9map, CP9Map_t *sub_cp9map, int print_flag);
61 static void  cm2sub_cm_find_impossible_matr_cases(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, CMSubInfo_t *subinfo,
62 						  CP9Map_t *orig_cp9map, CP9Map_t *sub_cp9map, int print_flag);
63 
64 
65 /**************************************************************************
66  * EPN 10.25.06
67  * Function: AllocSubMap()
68  *
69  * Purpose:  Determine maps between a template CM and a sub CM, which probably
70  *           has less structure (less MATP nodes) than the template. Do this
71  *           a bit indirectly, first get maps from each CM to a CP9 HMM,
72  *           then use these maps to map the CMs to each other.
73  *           See infernal.h for more info on the submap data structure.
74  *
75  * Args
76  * CM_t *sub_cm        - the sub CM
77  * CM_t *orig_cm       - the original template CM
78  * int sstruct         - the first (leftmost)  consensus posn we're keeping structure for
79  * int estruct         - the last  (rightmost) consensus posn we're keeping structure for
80  */
81 
82 CMSubMap_t *
AllocSubMap(CM_t * sub_cm,CM_t * orig_cm,int sstruct,int estruct)83 AllocSubMap(CM_t *sub_cm, CM_t *orig_cm, int sstruct, int estruct)
84 {
85   CMSubMap_t  *submap;
86   int v;
87   int status;
88 
89   ESL_ALLOC(submap, sizeof(struct submap_s));
90 
91   submap->sub_M  = sub_cm->M;
92   submap->orig_M = orig_cm->M;
93   /* Determine clen of the orig_cm */
94   submap->orig_clen = 0;
95   for(v = 0; v <= orig_cm->M; v++)
96     {
97       if(orig_cm->stid[v] ==  MATP_MP)
98 	submap->orig_clen += 2;
99       else if(orig_cm->stid[v] == MATL_ML || orig_cm->stid[v] == MATR_MR)
100 	submap->orig_clen++;
101     }
102   submap->sstruct = sstruct;
103   submap->estruct = estruct;
104   submap->sub_clen = (submap->estruct-submap->sstruct+1);
105   submap->spos     = submap->sstruct;
106   submap->epos     = submap->estruct;
107 
108   /* Allocate and initialize arrays */
109   ESL_ALLOC(submap->s2o_id,   sizeof(int) *   (sub_cm->M+1));
110   ESL_ALLOC(submap->s2o_smap, sizeof(int *) * (sub_cm->M+1));
111   for(v = 0; v <= sub_cm->M; v++)
112     {
113       submap->s2o_id[v]      = FALSE;
114       ESL_ALLOC(submap->s2o_smap[v], sizeof(int) * 2);
115       submap->s2o_smap[v][0] = -1;
116       submap->s2o_smap[v][1] = -1;
117     }
118 
119   ESL_ALLOC(submap->o2s_smap, sizeof(int *) * (orig_cm->M+1));
120   for(v = 0; v <= orig_cm->M; v++)
121     {
122       ESL_ALLOC(submap->o2s_smap[v], sizeof(int) * 2);
123       submap->o2s_smap[v][0] = -1;
124       submap->o2s_smap[v][1] = -1;
125     }
126   return submap;
127 
128  ERROR:
129   cm_Fail("Memory allocation error.");
130   return NULL; /* never reached */
131 }
132 
133 /* Function: FreeSubMap()
134  * Returns: (void)
135  */
136 
137 void
FreeSubMap(CMSubMap_t * submap)138 FreeSubMap(CMSubMap_t *submap)
139 {
140   int v;
141   for(v = 0; v <= submap->sub_M; v++)
142     free(submap->s2o_smap[v]);
143   free(submap->s2o_smap);
144   for(v = 0; v <= submap->orig_M; v++)
145     free(submap->o2s_smap[v]);
146   free(submap->o2s_smap);
147   free(submap->s2o_id);
148   free(submap);
149 }
150 
151 /**************************************************************************
152  * EPN 10.26.06
153  * Function: AllocSubInfo()
154  *
155  * Purpose:  Store information about a sub CM that is useful for testing
156  *           if it constructed correctly.
157  *
158  * Args:
159  * int clen;           - consensus length of the sub_cm
160  * Returns: CMSubInfo_t
161  */
162 
163 CMSubInfo_t *
AllocSubInfo(int clen)164 AllocSubInfo(int clen)
165 {
166   int status;
167   CMSubInfo_t  *subinfo;
168   int i;
169   int ncases;
170 
171   ESL_ALLOC(subinfo, sizeof(struct subinfo_s));
172   /* Allocate and initialize arrays */
173   ESL_ALLOC(subinfo->imp_cc, sizeof(int) * (clen + 2));
174   for(i = 0; i <= clen+1; i++)
175     subinfo->imp_cc[i] = FALSE;
176 
177   /* 6 possible cases for predicting we get HMM distros wrong */
178   ncases = 6;
179   ESL_ALLOC(subinfo->apredict_ct, sizeof(int) * (ncases+1));
180   ESL_ALLOC(subinfo->spredict_ct, sizeof(int) * (ncases+1));
181   ESL_ALLOC(subinfo->awrong_ct,   sizeof(int) * (ncases+1));
182   ESL_ALLOC(subinfo->swrong_ct,   sizeof(int) * (ncases+1));
183   for(i = 0; i <= ncases; i++)
184     {
185       subinfo->apredict_ct[i] = 0;
186       subinfo->spredict_ct[i] = 0;
187       subinfo->awrong_ct[i]   = 0;
188       subinfo->swrong_ct[i]   = 0;
189     }
190   return subinfo;
191 
192  ERROR:
193   cm_Fail("Memory allocation error.");
194   return NULL; /* never reached */
195 }
196 
197 /* Function: FreeSubInfo()
198  * Returns:  void
199  */
200 
201 void
FreeSubInfo(CMSubInfo_t * subinfo)202 FreeSubInfo(CMSubInfo_t *subinfo)
203 {
204   free(subinfo->imp_cc);
205   free(subinfo->apredict_ct);
206   free(subinfo->spredict_ct);
207   free(subinfo->awrong_ct);
208   free(subinfo->swrong_ct);
209   free(subinfo);
210 }
211 
212 /**************************************************************************
213  * EPN 09.22.06
214  * Function: map_orig2sub_cm()
215  *
216  * Purpose:  Determine maps between a template CM and a sub CM, which probably
217  *           has less structure (less MATP nodes) than the template. Do this
218  *           a bit indirectly, first get maps from each CM to a CP9 HMM,
219  *           then use these maps to map the CMs to each other.
220  * Args:
221  * CM_t *orig_cm       - the original template CM
222  * CM_t *sub_cm        - the sub CM
223  * CMSubMap_t *submap  - the sub CM map
224  * int print_flag      - TRUE to print out useful debugging info
225  * Returns: (void)
226  */
227 void
map_orig2sub_cm(CM_t * orig_cm,CM_t * sub_cm,CMSubMap_t * submap,int print_flag)228 map_orig2sub_cm(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int print_flag)
229 {
230   int status;
231   int k_s;  /* HMM node counter */
232   int v_o;
233   int v_s;
234   int v; /* state index in CM */
235   int n_o;
236   int n_s;
237   char **nodetypes;
238   char **sttypes;
239   int sub_k;
240   int orig_k;
241   int sub_nd;
242   int orig_nd;
243   int x, y;
244   CP9Map_t *orig_cp9map;
245   CP9Map_t *sub_cp9map;
246 
247   ESL_ALLOC(sttypes, sizeof(char *) * 10);
248   sttypes[0] = "D";
249   sttypes[1] = "MP";
250   sttypes[2] = "ML";
251   sttypes[3] = "MR";
252   sttypes[4] = "IL";
253   sttypes[5] = "IR";
254   sttypes[6] = "S";
255   sttypes[7] = "E";
256   sttypes[8] = "B";
257   sttypes[9] = "EL";
258 
259   ESL_ALLOC(nodetypes, sizeof(char *) * 8);
260   nodetypes[0] = "BIF";
261   nodetypes[1] = "MATP";
262   nodetypes[2] = "MATL";
263   nodetypes[3] = "MATR";
264   nodetypes[4] = "BEGL";
265   nodetypes[5] = "BEGR";
266   nodetypes[6] = "ROOT";
267   nodetypes[7] = "END";
268 
269   /* sanity check */
270   if(sub_cm->M > orig_cm->M)
271     cm_Fail("ERROR: sub_cm has more states than orig_cm in map_orig2sub_cm()\n");
272 
273   /* We want maps from the orig_cm to a CP9 HMM and from the
274    * sub_cm to a CP9 HMM, but we don't need the actual HMMs, just
275    * the maps. */
276   /* Allocate and initialize the cp9maps */
277   orig_cp9map = AllocCP9Map(orig_cm);
278   sub_cp9map  = AllocCP9Map(sub_cm);
279   /* Map the CM states to CP9 states and nodes, and vice versa */
280   CP9_map_cm2hmm(orig_cm, orig_cp9map, print_flag);
281   CP9_map_cm2hmm(sub_cm,  sub_cp9map,  print_flag);
282 
283   /* Step through the consensus columns, filling in maps */
284   /* ROOT is special: */
285   v_s = 0; /* ROOT_S */
286   v_o = 0; /* ROOT_S */
287   map_orig2sub_cm_helper(orig_cm, sub_cm, submap, v_o, v_s);
288 
289   /* ROOT_IL inserts before orig cc submap->spos */
290   k_s = 1; /* insert */
291   for(x = 0; x <= 1; x++)
292     for(y = 0; y <= 1; y++)
293       map_orig2sub_cm_helper(orig_cm, sub_cm, submap,
294 			     orig_cp9map->hns2cs[submap->spos-1][k_s][x],
295 			     sub_cp9map->hns2cs[0][k_s][y]);
296 
297   /* ROOT_IR inserts after orig cc submap->epos */
298   k_s = 1; /* insert */
299   for(x = 0; x <= 1; x++)
300     for(y = 0; y <= 1; y++)
301       map_orig2sub_cm_helper(orig_cm, sub_cm, submap,
302 			     orig_cp9map->hns2cs[submap->epos][k_s][x],
303 			     sub_cp9map->hns2cs[submap->epos-submap->spos+1][k_s][y]);
304 
305   for(sub_k = 1; sub_k <= sub_cp9map->hmm_M; sub_k++)
306     {
307       orig_k = sub_k + submap->spos - 1;
308       sub_nd  = sub_cp9map->pos2nd[sub_k];
309       orig_nd = orig_cp9map->pos2nd[orig_k];
310 
311       /* Check for an easy case to save time in the future: when
312        * the orig_cm and sub_cm nodes that map to this column are
313        * of the same type, and also the orig_cm and sub_cm nodes
314        * that map to the next column are also of the same type.
315        * We check for this in cm2sub_cm_check_id_next_node() if TRUE,
316        * then submap->s2o_id[sub_v] is set to TRUE for all states
317        * in this node. Later this will save time by just copying
318        * transition and emission parameters from the orig_cm for
319        * these states.
320        */
321       cm2sub_cm_check_id_next_node(orig_cm, sub_cm, orig_nd, sub_nd, submap, orig_cp9map,
322 				   sub_cp9map, print_flag);
323       for(k_s = 0; k_s < 3; k_s++) /* k_s = 0 (match), k_s = 1 (insert),
324 				      k_s = 2 (delete) */
325 	for(x = 0; x <= 1; x++)
326 	  for(y = 0; y <= 1; y++)
327 	    map_orig2sub_cm_helper(orig_cm, sub_cm, submap,
328 				   orig_cp9map->hns2cs[orig_k][k_s][x],
329 				   sub_cp9map->hns2cs[sub_k][k_s][y]);
330     }
331   /* NOTE: We ignore mapping B states, E states and S states (except ROOT_S). We'll handle these guys
332    * specially when we fill in transition probabilities. The reason we don't map them is that
333    * I think its impossible to do it robustly. I saw cases with SSU where there were BIF nodes in the
334    * sub_cm for which I couldn't figure out which BIF nodes (and correspondingly BEGL and BEGR nodes)
335    * in the orig_cm they should map to. Regardless, even if there is a pretty, nice way I'm abandoning
336    * an attempt to find it for a crude way - we will handle the transitions involving these guys
337    * specially, without a need for a mapping between CMs.
338    */
339 
340   /* If we're in debugging mode and print_flag == TRUE, we print and check the map */
341   if(print_flag)
342     {
343       /* First print submap->s2o_id */
344       for(v = 0; v <= sub_cm->M; v++)
345 	if(submap->s2o_id[v] == TRUE)
346 	  printf("submap->s2o_id[%d] TRUE\n", v);
347 	else
348 	  printf("submap->s2o_id[%d] FALSE\n", v);
349 
350       printf("\n\n\nMAP\n\n\n");
351       for(v_s = 0; v_s < sub_cm->M; v_s++)
352 	{
353 	  n_s = sub_cm->ndidx[v_s];
354 	  v_o = submap->s2o_smap[v_s][0];
355 	  if(sub_cm->sttype[v_s] == E_st)
356 	    printf("sub v:%4d   END\n", v_s);
357 	  if(sub_cm->sttype[v_s] == S_st)
358 	    printf("sub v:%4d   START\n", v_s);
359 	  if(sub_cm->sttype[v_s] == B_st)
360 	    printf("sub v:%4d   BIF_B\n", v_s);
361 	  if((sub_cm->sttype[v_s] != B_st &&
362 	      sub_cm->sttype[v_s] != S_st) &&
363 	     sub_cm->sttype[v_s] != E_st)
364 	    {
365 	      if(v_o == -1 && sub_cm->sttype[(v_s+1)] == E_st) /* v_s is a dead insert */
366 		continue;
367 	      if(v_o == -1 && sub_cm->sttype[v_s] != E_st)
368 		cm_Fail("ERROR sub_cm state: %d type: %s node type: %s doesn't map to any state in orig_cm\n", v_s, sttypes[(int) sub_cm->sttype[v_s]], nodetypes[(int) sub_cm->ndtype[n_s]]);
369 
370 	      n_o = orig_cm->ndidx[v_o];
371 	      if(print_flag) printf("sub v:%4d(%4d) %6s%6s | orig v:%4d(%4d) %6s%6s\n", v_s, n_s, nodetypes[(int) sub_cm->ndtype[n_s]], sttypes[(int) sub_cm->sttype[v_s]], v_o, n_o, nodetypes[(int) orig_cm->ndtype[n_o]], sttypes[(int) orig_cm->sttype[v_o]]);
372 	      /* check to make sure submap->o2s_smap is consistent */
373 	      if(submap->o2s_smap[v_o][0] != v_s && submap->o2s_smap[v_o][1] != v_s)
374 		cm_Fail("ERROR inconsistency; neither o2s_smap[%d][0] and [1] is %d\n", v_o, v_s);
375 
376 	      v_o = submap->s2o_smap[v_s][1];
377 	      if(v_o != -1)
378 		{
379 		  n_o = orig_cm->ndidx[v_o];
380 		  if(print_flag) printf("                              | orig v:%4d(%4d) %6s%6s\n", v_o, n_o, nodetypes[(int) orig_cm->ndtype[n_o]], sttypes[(int) orig_cm->sttype[v_o]]);
381 		  /* check to make sure o2s_smap is consistent */
382 		  if(submap->o2s_smap[v_o][0] != v_s && submap->o2s_smap[v_o][1] != v_s)
383 		    {
384 		      cm_Fail("ERROR inconsistency; neither o2s_smap[%d][0] and [1] is %d\n", v_o, v_s);
385 		    }
386 		}
387 	    }
388 	}
389     }
390 
391   /* Clean up and return */
392   FreeCP9Map(orig_cp9map);
393   FreeCP9Map(sub_cp9map);
394   free(sttypes);
395   free(nodetypes);
396   return;
397 
398  ERROR:
399   cm_Fail("Memory allocation error.");
400 }
401 
402 /**************************************************************************
403  * EPN 08.30.06
404  * map_orig2sub_cm_helper()
405  *
406  * helper function for map_orig2sub_cm(),
407  *
408  * Purpose:  Fill in specific part of the map, given orig_v (orig_cm state),
409  *           sub_v (sub_cm state)
410  * Args:
411  * CM_t *orig_cm
412  * CM_t *sub_cm
413  * CMSubMap_t *submap
414  * int   orig_v;
415  * int   sub_v;
416  * Returns: 1 if a new mapping was made, 0 if not
417  */
418 static int
map_orig2sub_cm_helper(CM_t * orig_cm,CM_t * sub_cm,CMSubMap_t * submap,int orig_v,int sub_v)419 map_orig2sub_cm_helper(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int orig_v, int sub_v)
420 {
421   int sub_nd;
422   int orig_nd;
423 
424   /*printf("\nin helper: orig_v: %d sub_v: %d\n", orig_v, sub_v);*/
425 
426   if(orig_v == -1 || sub_v == -1)
427     return 0;
428 
429   /* check to see if we already have this mapping */
430   if(submap->o2s_smap[orig_v][0] == sub_v || submap->o2s_smap[orig_v][1] == sub_v)
431     return 0;
432 
433   orig_nd = orig_cm->ndidx[orig_v];
434   sub_nd  =  sub_cm->ndidx[sub_v];
435 
436   if(sub_cm->sttype[sub_v] == IL_st || sub_cm->sttype[sub_v] == IR_st)
437     {
438       /* Make sure that neither orig_v nor sub_v is a detached insert state,
439        * if either is, we return b/c it's irrelevant, and we don't store that info in the maps */
440       if(orig_cm->sttype[(orig_v+1)] == E_st || sub_cm->sttype[(sub_v+1)] == E_st)
441 	return 0;
442     }
443   /* 09.14.06  I think that some of the code that checks for cases where we wnat to avoid mapping is unnecessary!,
444    * but I'm not sure what...*/
445 
446   /* Check for a case we want to avoid mapping. We exploit the fact that we know a MATP_nd in a sub_cm MUST be mapped
447    * to a corresponding MATP_nd in the original template CM: */
448   if(sub_cm->ndtype[sub_nd] == MATP_nd)
449     if(orig_cm->ndtype[orig_nd] == MATP_nd && sub_cm->sttype[sub_v] != orig_cm->sttype[orig_v])
450       return 0;
451 
452   /* Fill in submap->o2s_smap */
453   if(submap->o2s_smap[orig_v][0] == -1)
454     {
455       if (submap->o2s_smap[orig_v][1] != -1)
456 	cm_Fail("ERROR in map_orig2sub_cm_helper, submap->o2s_smap[%d][0] is -1 but submap->o2s_smap[%d][1] is not, this shouldn't happen.\n", orig_v, orig_v);
457       else
458 	submap->o2s_smap[orig_v][0] = sub_v;
459     }
460   else if (submap->o2s_smap[orig_v][1] != -1)
461     {
462       if(submap->o2s_smap[orig_v][0] == sub_v || submap->o2s_smap[orig_v][1] == sub_v)
463 	/* abort!, we already have this mapping */
464 	return 0;
465       else
466 	cm_Fail("ERROR in map_orig2sub_cm_helper, submap->o2s_smap[%d][0] is not -1 and submap->o2s_smap[%d][1] is not -1, this shouldn't happen.\n", orig_v, orig_v);
467     }
468   else /* submap->o2s_smap[orig_v][0] != -1 && submap->o2s_smap[orig_v][1] == -1 */
469     {
470       if(submap->o2s_smap[orig_v][0] == sub_v || submap->o2s_smap[orig_v][1] == sub_v)
471 	/* abort!, we already have this mapping */
472 	return 0;
473       submap->o2s_smap[orig_v][1] = sub_v;
474     }
475 
476   /* now fill in submap->s2o_smap */
477   if(submap->s2o_smap[sub_v][0] == -1)
478     if (submap->s2o_smap[sub_v][1] != -1)
479       cm_Fail("ERROR in map_sub2orig_cm_helper, submap->s2o_smap[%d][0] is -1 but submap->s2o_smap[%d][1] is not, this shouldn't happen.\n", sub_v, sub_v);
480     else
481       submap->s2o_smap[sub_v][0] = orig_v;
482   else if (submap->s2o_smap[sub_v][1] != -1)
483     cm_Fail("ERROR in map_sub2orig_cm_helper, submap->s2o_smap[%d][0] is not -1 and submap->s2o_smap[%d][1] is not -1, this shouldn't happen.\n", sub_v, sub_v);
484   else /* submap->s2o_smap[sub_v][0] != -1 && submap->s2o_smap[sub_v][1] == -1 */
485     submap->s2o_smap[sub_v][1] = orig_v;
486   return 1;
487 }
488 
489 
490 /****************************************************************************
491  * Function:  build_sub_cm()
492  * EPN 08.28.06
493  *
494  * Purpose:   Given a template CM (orig_cm) built from an MSA of consensus columns
495  *            1..M, build a new, "sub CM" (sub_cm), that only models well-nested base pairs
496  *            that occur (have both left and right halves) between consensus columns
497  *            [sstruct..estruct]. The sub_cm will only model the consensus
498  *            columns between [sstruct..estruct].
499  *
500  *            We attempt to construct the sub CM such that:
501  *
502  *            1. All 'mapped' states in the orig_cm and sub_cm have identical
503  *               'psi' values, i.e. the expected number of times entered in a single parse
504  *               is identical.
505  *
506  *            2. Alignments sampled from it should 'follow the same probability distributions'
507  *               as alignments sampled from the template CM and then truncated,
508  *               removing columns outside [sstruct..estruct] Specifically, 'follow
509  *               the same probability distributions' means if we built a ML HMM (Weinberg
510  *               style) from the resulting infinite alignments, we would have identical
511  *               HMMs for the sub-CM alignments and for the truncated template CMs. It turns
512  *               out this is impossible for certain situations
513  *               that cause differences in the topology of the orig_cm and sub_cm. These
514  *               situations, so-called 'impossible cases', cause the distributions out of a
515  *               single ML HMM node to be different between the orig_cm, but are relatively
516  *               rare (a rough, rough estimate is 1% of ML HMM nodes are affected).  We can
517  *               predict when all of these situations will occur, but sometimes when we
518  *               predict a ML HMM node will be off in this way, it is in fact identical
519  *               between the sub_cm and orig_cm for reasons I can't quite grasp.
520  *
521  *            This function builds a sub_cm and then potentially performs up to 3 checks
522  *            to see if 1 and 2 above are satisfied (it only checks if 2 holds for
523  *            non-impossible cases, and that we can accurately predict the impossible
524  *            cases). The 3 potential checks are:
525  *
526  *            A. Calculate the expected number of times that each state in the orig_cm
527  *               and sub_cm is entered (psi[v] = exp # times state v is entered), and
528  *               make sure these values are within 'threshold' of each other for all
529  *               states that map between the orig_cm and sub_cm. This is performed
530  *               by the 'check_orig_psi_vs_sub_psi' function, which is called within
531  *               this function if either the 'do_acheck' and/or 'do_scheck' parameters
532  *               passed in are TRUE.
533  *
534  *            B. Analytically (not by sampling) building 2 ML HMMs, one from the orig_cm
535  *               and one from the sub_cm, predicting the impossible cases and testing t
536  *               o make sure all the corresponding parameters for non-impossible cases
537  *               are within 'threshold' of each other. This is done within the
538  *               check_sub_cm() function which is called from this function if the
539  *               'do_acheck' parameter passed in is set to TRUE.
540  *
541  *            C. Build a CP9 ML HMM analytically from the sub_cm. Sample a deep MSA
542  *               from the orig_cm and potentially truncate outside [sstruct..estruct]. Use
543  *               counts from the MSA to build a ML HMM for the orig_cm. Then perform
544  *               chi-squared tests to see if the counts in the orig_cm's HMM could
545  *               have been generated by the distributions in the sub_cm's HMM. This
546  *               check is performed if the 'do_scheck' parameter passed in is set
547  *               to TRUE.
548  *
549  * Args:      orig_cm      - the original model, which we're going to (potentially)
550  *                           remove some structure from
551  *            errbuf       - for error messages
552  *            ret_sub_cm   - the new sub_cm built from orig_cm with some structure removed.
553  *            sstruct      - the first position (consensus column) we want to model
554  *            estruct      - the last position we will model
555  *            print_flag   - TRUE to print debugging statements
556  *
557  * Returns:   eslOK on success
558  *            eslEINCONCEIVABLE if something inconceivable happens
559  *            eslEINCOMPAT on contract violation
560  *            eslEMEM on memory error
561  */
562 int
build_sub_cm(CM_t * orig_cm,char * errbuf,CM_t ** ret_cm,int sstruct,int estruct,CMSubMap_t ** ret_submap,int print_flag)563 build_sub_cm(CM_t *orig_cm, char *errbuf, CM_t **ret_cm, int sstruct, int estruct, CMSubMap_t **ret_submap, int print_flag)
564 {
565   int              status;
566   CM_t            *sub_cm;      /* new covariance model, a submodel of the template */
567   CMConsensus_t   *con;         /* growing consensus info for orig_cm               */
568   Parsetree_t     *mtr;         /* master structure tree from the alignment         */
569   char            *sub_cstr;    /* consensus substructure display string            */
570   int             *sub_ct;	/* 0..con->clen-1 base pair partners array          */
571   int              cpos;        /* position counter within orig_cm                  */
572   int              sub_cpos;    /* position counter within sub_cm                   */
573   char          ***tmap;        /* hard-coded transition map, for convenience       */
574   double          *orig_psi;    /* expected num times each state visited in orig_cm */
575   int              v_s;         /* state counter for sub_cm                         */
576   int              n_s;         /* node  counter for sub_cm                         */
577   FILE            *ofp;         /* an open output file                              */
578   int              spos;        /* first consensus (match) column of the orig_cm to
579 				 * model with the sub_cm */
580   int              epos;        /* last consensus (match) column of the orig_cm to
581 				 * model with the sub_cm */
582   CMSubMap_t *submap;
583 
584   /* check to make sure that we can actually build a sub CM of this model */
585   if((orig_cm->flags & CMH_LOCAL_BEGIN) || (orig_cm->flags & CMH_LOCAL_END)) ESL_FAIL(eslEINCOMPAT, errbuf, "build_sub_cm() trying to build a sub CM of a CM already in local mode, not yet supported.\n");
586   if(orig_cm->flags & CM_IS_SUB) ESL_FAIL(eslEINCOMPAT, errbuf, "build_sub_cm(), trying to build a sub CM of a CM that is itself a sub CM.");
587 
588   /* Much of the code for building and checking sub CMs relies on the fact that every insert
589    * state in the sub CM maps exactly 1 insert state in the original CM. This is fine if we
590    * have removed ambiguities by detaching all original CM insert states that are 1 state
591    * before an END_E state. This was probably done when the CM was built, but we redo it here
592    * in case it was not.
593    */
594   cm_find_and_detach_dual_inserts(orig_cm,
595 				  FALSE, /* DON'T check that these states have 0 counts (they may not due to priors) */
596 				  TRUE); /* DO detach END_E-1 insert states, making them unreachable */
597 
598   /* Get the consensus sequence and consensus structure information from the original CM */
599   if((con = CreateCMConsensus(orig_cm, orig_cm->abc)) == NULL) ESL_FAIL(eslFAIL, errbuf, "build_sub_cm(), unable to create cm consensus data");
600   if(print_flag)
601     {
602       printf("con->cseq    : %s\n", con->cseq);
603       printf("con->cstr    : %s\n", con->cstr);
604       printf("clen         : %d\n", con->clen);
605     }
606 
607   spos = sstruct;
608   epos = estruct;
609 
610   /* Fill a new ct array for the sub_cm. The sub_cm will only model the consensus columns
611    * between spos and epos, and only the structure between spos
612    * and epos. First copy the template (original) CMs ct array but only for the
613    * appropriate consensus columns that lie in between both structure and model boundarIes
614    * Next, eliminate any structure that lies outside the structure boundaries.
615    */
616 
617   ESL_ALLOC(sub_ct,  sizeof(int) * (epos - spos + 1));
618   /* First just copy ct array for model boundaries from con->ct */
619   for (cpos = (spos-1); cpos < epos; cpos++)
620     {
621       sub_cpos = cpos - (spos-1);
622       if(con->ct[cpos] != -1 &&
623 	 (con->ct[cpos] <  (spos-1) ||
624 	  con->ct[cpos] >=  epos))
625 	sub_ct[sub_cpos] = -1;
626       else
627 	sub_ct[sub_cpos] = con->ct[cpos];
628     }
629   /* Second remove structure outside structural boundaries */
630   for (cpos = (spos-1); cpos < epos; cpos++)
631     {
632       sub_cpos = cpos - (spos-1);
633       if ((cpos+1) < sstruct || (cpos+1) > estruct) /* cpos goes 1..clen, but ct is indexed
634 						     * 0..clen-1.*/
635 	{
636 	  /* CreateCMConsensus() uses -1 in ct[] to indicate single
637 	   * stranded (different convention than WUSS2ct()). */
638 	  if (sub_ct[sub_cpos] != -1)
639 	    sub_ct[sub_ct[sub_cpos]] = -1;
640 	  sub_ct[sub_cpos] = -1;
641 	}
642     }
643 
644   /* Construct the new structure ss_cons based on the template CM ct array.
645    * We could do this similar to how display.c::CreateCMConsensus()
646    * does it to get the fully formatted WUSS ([{<>}]) string but
647    * lazily we just do <> bps here.
648    */
649   ESL_ALLOC(sub_cstr, sizeof(char) * (epos - spos + 2));
650   for (cpos = (spos-1); cpos < epos; cpos++)
651     {
652       sub_cpos = cpos - (spos-1);
653       if(sub_ct[sub_cpos] == -1)         sub_cstr[sub_cpos] = '.';
654       else if (sub_ct[sub_cpos]  > cpos) sub_cstr[sub_cpos] = '<';
655       else if (sub_ct[sub_cpos]  < cpos) sub_cstr[sub_cpos] = '>';
656       else cm_Fail("ERROR: weird error in build_sub_cm()\n");
657     }
658   sub_cstr[(epos-spos+1)] = '\0';
659 
660   /* Build the new sub_cm given the new consensus structure. But don't
661    * parameterize it yet.
662    */
663   if((status = ConsensusModelmaker(orig_cm->abc, errbuf, sub_cstr, (epos-spos+1), TRUE, &sub_cm, &mtr)) != eslOK) return status;
664   /* TRUE in ConsensusModelmaker() call says 'yes, we're building a sub CM, allow invalid CMs in cm_from_guide() (see comments in that function for details)*/
665 
666   /* Set creation time */
667   if ((status = cm_SetCtime(sub_cm)) != eslOK)  ESL_FAIL(status, errbuf, "Failed to record timestamp");
668 
669   /* Rebalance the CM for optimization of D&C */
670   CM_t *new;
671   if((status = CMRebalance(sub_cm, errbuf, &new)) != eslOK) return status;
672   FreeCM(sub_cm);
673   sub_cm = new;
674 
675   submap = AllocSubMap(sub_cm, orig_cm, sstruct, estruct);
676   if(print_flag)
677     {
678       printf("\n\norig struct: %s\n", con->cstr);
679       printf("\n\nnew struct : %s\n", sub_cstr);
680     }
681 
682   /* Map states from orig_cm to sub_cm and vice versa. */
683   map_orig2sub_cm(orig_cm, sub_cm, submap, print_flag);
684 
685   /* Fill orig_psi, which we need to determine the sub_cm parameters. */
686   orig_psi = cm_ExpectedStateOccupancy(orig_cm);
687   /* we need a transition map too */
688   tmap = cm_CreateTransitionMap();
689 
690   CMZero(sub_cm);
691   CMSetNullModel(sub_cm, orig_cm->null);
692   sub_cm->el_selfsc = orig_cm->el_selfsc;
693   sub_cm->beta_W    = orig_cm->beta_W;
694   sub_cm->tau       = orig_cm->tau;
695 
696   /* copy the options from the template CM, but turn off the CM_ALIGN_SUB and
697    * CM_CONFIG_SUB options and turn on the CM_IS_SUB flag */
698   sub_cm->config_opts      = orig_cm->config_opts;
699   sub_cm->align_opts       = orig_cm->align_opts;
700   sub_cm->search_opts      = orig_cm->search_opts;
701   sub_cm->flags            = 0;
702   if(sub_cm->config_opts & CM_CONFIG_SUB) sub_cm->config_opts &= ~CM_CONFIG_SUB;
703   if(sub_cm->align_opts  & CM_ALIGN_SUB)  sub_cm->align_opts  &= ~CM_ALIGN_SUB;
704   sub_cm->flags |= CM_IS_SUB;
705 
706   /* Fill in emission probabilities */
707   for(v_s = 0; v_s < sub_cm->M; v_s++)
708     {
709       if(sub_cm->sttype[(v_s+1)] == E_st) /* detached insert */
710 	esl_vec_FNorm(sub_cm->e[v_s], orig_cm->abc->K);   /* equiprobable, but irrelevant, this state will never be reached */
711       else if(sub_cm->sttype[v_s] != S_st &&
712 	      sub_cm->sttype[v_s] != D_st &&
713 	      sub_cm->sttype[v_s] != B_st &&
714 	      sub_cm->sttype[v_s] != E_st)
715 	cm2sub_cm_emit_probs(orig_cm, sub_cm, orig_psi, v_s, submap->s2o_smap[v_s][0], submap->s2o_smap[v_s][1], submap);
716     }
717   /* Fill in transition virtual counts.
718    * First handle non-B,S,E states, we'll deal with B,S,Es later.
719    * The reason we have to wait is that we can't (I don't think at least)
720    * unambiguously map the sub_cm B, S, or E states to orig_cm states.
721    */
722   for(v_s = 0; v_s < sub_cm->M; v_s++)
723     {
724       if(sub_cm->sttype[(v_s+1)] == E_st) /* detached insert */
725 	esl_vec_FNorm(sub_cm->t[v_s], sub_cm->cnum[v_s]);   /* equiprobable, but irrelevant, this state will never be reached */
726       else if(v_s == 0 ||
727 	      (sub_cm->sttype[v_s] != S_st &&
728 	       sub_cm->sttype[v_s] != B_st &&
729 	       sub_cm->sttype[v_s] != E_st))
730 	cm2sub_cm_trans_probs(orig_cm, sub_cm, orig_psi, tmap, v_s, submap);
731     }
732 
733 
734   /* Address problem 090806 (in the 00LOG of ~/notebook/6_0725_inf_sub_cm/), by
735    * retraversing the structure and subtracting out subpaths that have been counted twice
736    * for a special situation involving the two inserts of ROOT and MATP states
737    */
738   for(n_s = 0; n_s < sub_cm->nodes; n_s++)
739     {
740       if(sub_cm->ndtype[n_s] == MATP_nd && (sub_cm->sttype[(sub_cm->nodemap[n_s] + 5)+1] != E_st))
741 	{
742 
743 	  if((submap->s2o_smap[sub_cm->nodemap[n_s] + 4][1] != -1) ||
744 	     (submap->s2o_smap[sub_cm->nodemap[n_s] + 5][1] != -1))
745 	    ESL_FAIL(eslEINCONCEIVABLE, errbuf, "build_sub_cm(), MATP_IL or MATP_IR node: %d map to 2 cm states\n", n_s);
746 	  if(submap->s2o_smap[sub_cm->nodemap[n_s] + 4][0] != (submap->s2o_smap[sub_cm->nodemap[n_s] + 5][0] - 1))
747 	    ESL_FAIL(eslEINCONCEIVABLE, errbuf, "build_sub_cm(), MATP_IL or MATP_IR node: %d don't map to adjacent orig_cm states\n", n_s);
748 	}
749       if(sub_cm->ndtype[n_s] == ROOT_nd && sub_cm->ndtype[n_s+1] != BIF_nd) /* ROOT->BIFs are handled special
750 									     * (see next loop) */
751 	cm2sub_cm_subtract_root_subpaths(orig_cm, sub_cm, orig_psi, tmap, submap, print_flag);
752     }
753 
754   /* Go back through and fill in the transitions into E and B states and out of S states */
755   for(v_s = 0; v_s < sub_cm->M; v_s++)
756     {
757       if(sub_cm->sttype[v_s] == S_st)
758 	cm2sub_cm_trans_probs_S(orig_cm, sub_cm, orig_psi, tmap, v_s, submap);
759 
760       if(sub_cm->sttype[v_s] == E_st || sub_cm->sttype[v_s] == B_st)
761 	cm2sub_cm_trans_probs_B_E(orig_cm, sub_cm, orig_psi, tmap, v_s, submap, print_flag);
762       /* convention is to leave transitions out of BIF_B as 0.0, all the code knows they're obligate */
763     }
764 
765   /* Remove sub_cm ambiguities by finding and detaching sub CM insert states
766    * that are 1 state before END_E states by setting transitions into
767    * such states as 0.0.
768    */
769   cm_find_and_detach_dual_inserts(sub_cm,
770 				  FALSE, /* DON'T check that these states have 0 counts (they won't due to priors) */
771 				  TRUE); /* DO detach END_E-1 insert states */
772 
773   /*debug_sub_cm_check_all_trans(orig_cm, sub_cm, submap);*/
774 
775   /* reset sub_cm->qdbinfo->setby, we don't want to have to calculate QDBs for the sub CM unless we have to */
776   if(sub_cm->qdbinfo->setby == CM_QDBINFO_SETBY_INIT) sub_cm->qdbinfo->setby = CM_QDBINFO_SETBY_SUBINIT;
777   if(sub_cm->W == 0) {
778     sub_cm->W = orig_cm->W;
779     sub_cm->W_setby = CM_W_SETBY_SUBCOPY;
780   }
781 
782   /* Finally renormalize the CM */
783   CMRenormalize(sub_cm);
784   /* DO NOT LOGODDSIFY YET, we'll do this when we call ConfigCM() for this CM,
785    * the logsoddsification step takes a significant amount of time.
786    * CMLogoddsify(sub_cm);
787    */
788 
789   if(print_flag)
790     {
791       ofp = fopen("sub.cm", "w");
792       if(print_flag)  printf("%-40s ... ", "Saving model to file"); fflush(stdout);
793       if(print_flag)  cm_file_WriteASCII(ofp, -1, sub_cm);
794       if(print_flag)  printf("done.\n");
795     }
796 
797   if(print_flag)
798     {
799       printf("\nDEBUG PRINT OF ORIG_CM PARAMETERS:\n");
800       debug_print_cm_params(stdout, orig_cm);
801       printf("\nDEBUG PRINT OF SUB_CM PARAMETERS:\n");
802       debug_print_cm_params(stdout, sub_cm);
803     }
804 
805   /* Cleanup and exit. */
806   cm_FreeTransitionMap(tmap);
807   free(sub_cstr);
808   free(sub_ct);
809   FreeCMConsensus(con);
810   FreeParsetree(mtr);
811   free(orig_psi);
812 
813   *ret_cm = sub_cm;
814   *ret_submap = submap;
815 
816   return eslOK;
817 
818  ERROR:
819   ESL_FAIL(eslEMEM, errbuf, "build_sub_cm() memory allocation error.");
820   return FALSE; /* never reached */
821 }
822 
823 /**************************************************************
824  * Function: CP9NodeForPosn()
825  * EPN 07.25.06 Benasque, Spain
826  *
827  * Purpose:  Determine the node of the CP9 HMM that is most likely to
828  *           have emitted (from either its Match or Insert state)
829  *           a given posn in the target sequence.
830  *
831  * Args:     hmm       - the CM plan 9 HMM
832  *           i0        - first posn of target subseq with info in posterior matrix
833  *           j0        - last posn of target subseq with info in posterior matrix
834  *           x         - posn of target subsequence we're interested in
835  *           L         - last position of target sequence
836  *           post      - the posterior matrix for the hmm
837  *           ret_node  - RETURN: index of node with highest probability of emitting x
838  *           ret_type  - RETURN: type of state in ret_node with highest probability
839  *           pmass     - probability mass to require on left of start or right of end
840  *           is_start  - TRUE if we're doing left of start,
841  *           print_flag- TRUE to print out info on most likely node
842  *
843  * Returns:  eslOK on success;
844  *           eslEINVAL on contract violation.
845  */
846 void
CP9NodeForPosn(CP9_t * hmm,int i0,int j0,int x,CP9_MX * post,int * ret_node,int * ret_type,float pmass,int is_start,int print_flag)847 CP9NodeForPosn(CP9_t *hmm, int i0, int j0, int x, CP9_MX *post,
848 	       int *ret_node, int *ret_type, float pmass, int is_start,
849 	       int print_flag)
850 {
851   /* post->mmx[i][k]: posterior probability that posn i was emitted from node k's
852      match state */
853   int  max_k;    /* node index with highest posterior probability of emitting posn x */
854   int  max_type; /* type of state in max_k node with max probability '0' for match,
855 		    '1' for insert */
856   int  max_sc;   /* score (log probability) from post matrix for max_k node max_type state type */
857   int  k;        /* counter over nodes */
858 
859   if(!is_start) pmass = 1. - pmass; /* we move left to right */
860 
861   /*printf("in CP9NodeForPosn is_start: %d pmass: %f\n", is_start, pmass);*/
862   if(x > j0 || x < i0)
863     /*ESL_XFAIL(eslEINVAL, "ERROR in CP9NodeForPosn(), asking for position x: %d outside subseq bounds i0: %d j0: %d\n", x, i0, j0);*/
864     cm_Fail("ERROR in CP9NodeForPosn(), asking for position x: %d outside subseq bounds i0: %d j0: %d\n", x, i0, j0);
865 
866   if(post->mmx[x][0] > post->imx[x][0])
867     {
868       max_sc     = post->mmx[x][0];
869       max_type   = 0; /* match */
870     }
871   else
872     {
873       max_sc     = post->imx[x][0];
874       max_type   = 1; /* insert */
875     }
876   max_k    = 0;
877   /* move left to right through HMM nodes */
878   for(k = 1; k <= hmm->M; k++)
879     {
880       if(post->mmx[x][k] > max_sc)
881 	{
882 	  max_k  = k;
883 	  max_sc = post->mmx[x][k];
884 	  max_type = 0; /* match */
885 	}
886       if(post->imx[x][k] > max_sc)
887 	{
888 	  max_k  = k;
889 	  max_sc = post->imx[x][k];
890 	  max_type = 1; /* insert */
891 	}
892     }
893 
894   if(print_flag)
895     {
896       if(max_type == 0)
897 	printf("MATCH | mx->mmx[%3d][%3d]: %9d | %8f\n", x, max_k, post->mmx[x][max_k], Score2Prob(post->mmx[x][max_k], 1.));
898       else
899 	printf("INSERT | mx->imx[%3d][%3d]: %9d | %8f\n", x, max_k, post->imx[x][max_k], Score2Prob(post->imx[x][max_k], 1.));
900     }
901   *ret_node = max_k;
902   *ret_type = max_type;
903   return;
904 }
905 
906 
907 /**********************************************************
908  * Function:  StripWUSSGivenCC()
909  * EPN 09.07.05
910  *
911  * Purpose:   Strips a secondary structure string in WUSS notation
912  *            of base pair information for specific match (consensus) columns.
913  *            namely those before the first match column given by first_match,
914  *            and after the last match column, given by last_match
915  *            The msa->ss_cons secondary structure string is modified.
916  *
917  *            Characters <([{  are converted to :   (left base of base pairs)
918  *            Characters >)]}  are converted to :   (right base of base pairs)
919  *            Characters _-,   are converted to :   (unpaired bases)
920  *            Characters  .:~  are untouched
921  *            Pseudoknot characters are converted to : as well.
922  *
923  * Args:      msa         - the multiple sequence alignment
924  *            gapthresh   - the gap threshold for calling a match column
925  *            first_match - first match column to keep structure for
926  *            last_match  - last match column to keep structure for
927  * Returns:   (void)
928  */
929 void
StripWUSSGivenCC(ESL_MSA * msa,float gapthresh,int first_match,int last_match)930 StripWUSSGivenCC(ESL_MSA *msa, float gapthresh, int first_match, int last_match)
931 {
932   int status;
933   int *matassign;	/* 0..alen-1 array; 0=insert col, 1=match col */
934   int gaps;
935   int apos;
936   int idx;
937   int cc;
938   int *ct;		/* 0..alen-1 base pair partners array         */
939 
940   /* Contract check */
941   if(msa->flags & eslMSA_DIGITAL)
942     cm_Fail("ERROR in StripWUSSGivenCC, MSA is digitized.\n");
943 
944   /* 1. Determine match/insert assignments
945    *    matassign is 1..alen. Values are 1 if a match column, 0 if insert column.
946    */
947   ESL_ALLOC(matassign, sizeof(int) * (msa->alen+1));
948   matassign[0] = 0; /* no 0th column in MSA */
949   for (apos = 0; apos < msa->alen; apos++)
950     {
951       for (gaps = 0, idx = 0; idx < msa->nseq; idx++)
952 	if (esl_abc_CIsGap(msa->abc, msa->aseq[idx][apos])) gaps++;
953       matassign[apos+1] = ((double) gaps / (double) msa->nseq > gapthresh) ? 0 : 1;
954     }
955 
956   /* 2. Determine a "ct" array, base-pairing partners for each position.
957    *    Disallow/ignore pseudoknots. (That's what the FALSE flag does.)
958    *    ct[] values give the index of a base pairing partner, or 0 for unpaired positions.
959    *    Even though msa->ss_cons is in the 0..alen-1 coord system of msa, ct[]
960    *    comes back in the 1..alen coord system of dsq.
961    */
962   ESL_ALLOC(ct, (msa->alen+1) * sizeof(int));
963   if (esl_wuss2ct(msa->ss_cons, msa->alen, ct) != eslOK)
964     cm_Fail("Consensus structure string is inconsistent");
965 
966   /* 3. Make sure the consensus structure "ct" is consistent with the match assignments.
967    *    Wipe out all structure in insert columns; including the base-paired
968    *    partner of insert-assigned columns.
969    *    Also, remove structure outside of the consensus columns that
970    *    map to the HMM nodes first_match and last_match.
971    */
972   cc = 0;
973   for (apos = 1; apos <= msa->alen; apos++)
974     {
975       if (! matassign[apos])
976 	{
977 	  if (ct[apos] != 0)  ct[ct[apos]] = 0;
978 	  ct[apos] = 0;
979 	}
980       else /* matassign[apos] == 1 */
981 	{
982 	  cc++;
983 	  if(cc < first_match || cc > last_match)
984 	    {
985 	      if (ct[apos] != 0)  ct[ct[apos]] = 0;
986 	      ct[apos] = 0;
987 	    }
988 	}
989     }
990 
991   /* Next construct the new msa->ss_cons based on the ct array.
992    * We should do this similar to display.c::CreateCMConsensus()
993    * does it to get the fully formatted WUSS ([{<>}]) string but
994    * lazily we just do <> bps here.
995    */
996   for (apos = 1; apos <= msa->alen; apos++)
997     {
998       if      (ct[apos] == 0   ) msa->ss_cons[apos-1] = '.';
999       else if (ct[apos]  > apos) msa->ss_cons[apos-1] = '<';
1000       else if (ct[apos]  < apos) msa->ss_cons[apos-1] = '>';
1001       else cm_Fail("ERROR: weird error in StripWUSSGivenCC\n");
1002     }
1003 
1004   return;
1005 
1006  ERROR:
1007   cm_Fail("Memory allocation error.");
1008 }
1009 
1010 
1011 /**************************************************************************
1012  * EPN 08.31.06
1013  * cm2sub_cm_emit_probs()
1014  *
1015  * Purpose:  For a specific sub CM state v_s, determine the
1016  *           emission probabilities using the emission probs
1017  *           of state(s) (up to 2) in the original CM that
1018  *           map to it. We weight the contribution of each
1019  *           state to the emission probability by it's psi
1020  *           value in orig_cm (psi[v] is expected number of
1021  *           times state v is entered in a parse).
1022  *
1023  * Args:
1024  * CM_t *orig_cm     - the original, template CM
1025  * CM_t *sub_cm      - the sub CM
1026  * double *orig_psi  - orig_psi[v] is the expected number of times state v is entered
1027  *                     in a CM parse
1028  * int v_s           - the sub_cm state we're filling emissions for
1029  * int v_o1          - orig_cm state v_s maps to
1030  * int v_o2          - orig_cm state v_s maps to (-1 if v_s maps to only 1 state)
1031  * Returns: void
1032  */
1033 static void
cm2sub_cm_emit_probs(CM_t * orig_cm,CM_t * sub_cm,double * orig_psi,int v_s,int v_o1,int v_o2,CMSubMap_t * submap)1034 cm2sub_cm_emit_probs(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, int v_s, int v_o1, int v_o2,
1035 		     CMSubMap_t *submap)
1036 {
1037   int is_left;
1038   int i, j;
1039 
1040   /*printf("\nin cm2sub_cm_emit_probs v_s: %d, v_o1: %d, v_o2: %d\n", v_s, v_o1, v_o2);*/
1041 
1042   if(v_o1 == -1)
1043     {
1044       cm_Fail("ERROR in cm2sub_cm_emit_probs, sub_cm state %d maps to 0 states in orig_cm (spos: %d epos: %d)\n", v_s, submap->spos, submap->epos);
1045     }
1046 
1047   if(sub_cm->sttype[v_s] == MP_st)
1048     {
1049       for(i = 0; i < (orig_cm->abc->K*orig_cm->abc->K); i++)
1050 	sub_cm->e[v_s][i] = orig_cm->e[v_o1][i];
1051       return;
1052     }
1053 
1054   if(submap->s2o_id[v_s] == TRUE)
1055     {
1056       /* must be a singlet emitter */
1057       for(i = 0; i < orig_cm->abc->K; i++)
1058 	sub_cm->e[v_s][i] = orig_cm->e[v_o1][i];
1059       /* No FNorm's necessary (assuming the orig_cm is normalized), since we're
1060        * building a new CM for each sequence in --sub mode, we skip it for speed.
1061        */
1062       return;
1063     }
1064 
1065   /* If we get here, v_s is a singlet emitter. */
1066 
1067   /* There are two cases when two states can map to v_s.
1068    * Case 1: one of them is an MP_st,
1069    * Case 2: one is an IL_st and one is an IR_st (ambiguity in CM architecture)
1070    * These are the only cases where we need to weight emission probs by orig_psi values,
1071    * and subsequently only cases we need to call FNorm() for */
1072   if(orig_cm->sttype[v_o1] == MP_st)
1073     {
1074       if(orig_cm->sttype[v_o2] == ML_st)
1075 	is_left = TRUE;
1076       else if(orig_cm->sttype[v_o2] == MR_st)
1077 	is_left = FALSE;
1078       else
1079 	cm_Fail("ERROR v_s: %d maps to a MP_st and another non-ML and non-MR state\n");
1080 
1081       for(i = 0; i < orig_cm->abc->K; i++)
1082 	if(is_left)
1083 	  for(j = (i*orig_cm->abc->K); j < ((i+1)*orig_cm->abc->K); j++)
1084 	    sub_cm->e[v_s][i] += orig_psi[v_o1] * orig_cm->e[v_o1][j];
1085 	else
1086 	  for(j = i; j < (orig_cm->abc->K*orig_cm->abc->K); j+=orig_cm->abc->K)
1087 	    sub_cm->e[v_s][i] += orig_psi[v_o1] * orig_cm->e[v_o1][j];
1088       if(orig_cm->sttype[v_o2] == MP_st)
1089 	cm_Fail("ERROR sub_cm state: %d maps to two MATP_MP states\n", v_s);
1090 
1091       /*v_o2 must be ML or MR, which can all be handled identically */
1092       for(i = 0; i < orig_cm->abc->K; i++)
1093 	sub_cm->e[v_s][i] += orig_psi[v_o2] * orig_cm->e[v_o2][i];
1094       esl_vec_FNorm(sub_cm->e[v_s], orig_cm->abc->K);
1095       return;
1096     }
1097   else if(v_o2 != -1)
1098     cm_Fail("ERROR sub_cm state: %d maps to two states (%d and %d), but neither is a MATP_MP\n", v_s, v_o1, v_o2);
1099 
1100   /* If we get here, v_s maps to a single singlet emitter in orig_cm, v_o1 */
1101   for(i = 0; i < orig_cm->abc->K; i++)
1102     sub_cm->e[v_s][i] = orig_cm->e[v_o1][i];
1103 
1104   return;
1105 }
1106 
1107 /**************************************************************************
1108  * EPN 08.31.06
1109  * cm2sub_cm_trans_probs()
1110  *
1111  * Purpose:  For a specific sub CM state v_s, determine the
1112  *           transition probabilities going out of v_s,
1113  *           using the psi values for the original template
1114  *           CM (orig_cm) (psi[v] is expected number of
1115  *           times state v is entered in a parse).
1116  *           We fill the sub_cm->t[v_s] with 'virtual
1117  *           counts', then we'll normalize to probabilities
1118  *           later (outside this function).
1119  *
1120  *           This is based on cp9_modelmaker.c::cm2hmm_trans_probs_cp9()
1121  *           which was based on formulas/ideas in Zasha Weinberg's
1122  *           thesis (p.123).
1123  *
1124  * Args:
1125  * CM_t *orig_cm     - the original, template CM
1126  * CM_t *sub_cm      - the sub CM
1127  * double *orig_psi  - orig_psi[v] is the expected number of times state v is entered
1128  *                     in a CM parse
1129  * char ***tmap      - the hard-coded transition map
1130  * int v_s           - the sub_cm state we're filling transitions for
1131  * submap            - the map from the sub CM to the template CM
1132  * Returns: void
1133  */
1134 static void
cm2sub_cm_trans_probs(CM_t * orig_cm,CM_t * sub_cm,double * orig_psi,char *** tmap,int v_s,CMSubMap_t * submap)1135 cm2sub_cm_trans_probs(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap, int v_s, CMSubMap_t *submap)
1136 {
1137   int v_o;
1138   int yoffset;
1139   int y_s;
1140 
1141   /*printf("in cm2sub_cm_trans_probs: v_s: %d\n", v_s);*/
1142 
1143   if(submap->s2o_id[v_s] == TRUE) /* v_s is identical to submap->s2o_smap[v_s][0] */
1144     {
1145       v_o = submap->s2o_smap[v_s][0];
1146       for(yoffset = 0; yoffset < sub_cm->cnum[v_s]; yoffset++)
1147 	{
1148 	  /* we can just copy the transitions */
1149 	  sub_cm->t[v_s][yoffset] = orig_psi[v_o] * orig_cm->t[v_o][yoffset];
1150 	}
1151       return;
1152     }
1153 
1154   /* start with the first orig_cm state that maps to v_s */
1155   v_o = submap->s2o_smap[v_s][0];
1156   /*printf("\tv_o: %d\n", v_o);*/
1157   if(v_o == -1)
1158     {
1159       if(sub_cm->sttype[v_s] != S_st &&
1160 	 sub_cm->sttype[v_s] != E_st &&
1161 	 sub_cm->sttype[v_s] != B_st)
1162 	/* special cases, S_st, E_st, B_st */
1163 	cm_Fail("ERROR, sub_cm state v_s: %d maps to no state in sub_cm, but it's not a B, E or S state\n", v_s);
1164     }
1165   else
1166     {
1167       if(sub_cm->sttype[v_s] == S_st ||
1168 	 sub_cm->sttype[v_s] == E_st ||
1169 	 sub_cm->sttype[v_s] == B_st)
1170 	if(v_s != 0)
1171 	  cm_Fail("ERROR, sub_cm state v_s: %d is S, E or B but maps to a orig_cm state: v_o:%d\n", v_o);
1172 
1173       for(yoffset = 0; yoffset < sub_cm->cnum[v_s]; yoffset++)
1174 	{
1175 	  y_s = sub_cm->cfirst[v_s] + yoffset;
1176 	  if(sub_cm->sttype[(y_s+1)] != E_st) /* if y_s+1 is an E, y_s is a detached insert state, we want
1177 					       * it to be impossible to reach this guy, leave counts as 0.0 */
1178 	    {
1179 	      cm2sub_cm_add_single_trans(orig_cm, sub_cm, submap, v_o, submap->s2o_smap[y_s][0], v_s, yoffset, orig_psi, tmap);
1180 	      cm2sub_cm_add_single_trans(orig_cm, sub_cm, submap, v_o, submap->s2o_smap[y_s][1], v_s, yoffset, orig_psi, tmap);
1181 	    }
1182 	}
1183     }
1184 
1185   /* move on to the second orig_cm state that maps to v_s */
1186   v_o = submap->s2o_smap[v_s][1];
1187   if(v_o != -1)
1188     {
1189       for(yoffset = 0; yoffset < sub_cm->cnum[v_s]; yoffset++)
1190 	{
1191 	  y_s = sub_cm->cfirst[v_s] + yoffset;
1192 	  if(sub_cm->sttype[(y_s+1)] != E_st) /* if y_s+1 is an E, y_s is a detached insert state, we want
1193 					       * it to be impossible to reach this guy, leave counts as 0.0 */
1194 	    {
1195 	      cm2sub_cm_add_single_trans(orig_cm, sub_cm, submap, v_o, submap->s2o_smap[y_s][0], v_s, yoffset, orig_psi, tmap);
1196 	      cm2sub_cm_add_single_trans(orig_cm, sub_cm, submap, v_o, submap->s2o_smap[y_s][1], v_s, yoffset, orig_psi, tmap);
1197 	    }
1198 	}
1199     }
1200   return;
1201 }
1202 
1203 /**************************************************************************
1204  * EPN 09.15.06
1205  * cm2sub_cm_trans_probs_S()
1206  *
1207  * Purpose:  For a specific sub CM S state v_start, fill in virtual counts
1208  *           for transitions out of v_s. We do this in its own seperate function
1209  *           because we can't robustly map S states in a sub_cm to S states
1210  *           in an orig CM (if its possible - I can't figure out how to do it).
1211  *
1212  * Args:
1213  * CM_t *orig_cm     - the original, template CM
1214  * CM_t *sub_cm      - the sub CM
1215  * double *orig_psi  - orig_psi[v] is the expected number of times state v is entered
1216  *                     in a CM parse
1217  * char ***tmap      - the hard-coded transition map
1218  * int v_start       - the sub_cm start state we're filling virtual counts of transitions into
1219  * submap            - the map from the sub CM to the template CM
1220  * Returns: void
1221  */
1222 static void
cm2sub_cm_trans_probs_S(CM_t * orig_cm,CM_t * sub_cm,double * orig_psi,char *** tmap,int v_start,CMSubMap_t * submap)1223 cm2sub_cm_trans_probs_S(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap, int v_start, CMSubMap_t *submap)
1224 {
1225   int yoffset;
1226   int y_s;
1227 
1228   int sub_nd;
1229 
1230   int v_s_insert;
1231   int v_o_insert;
1232 
1233   float sum;
1234   float il_psi;
1235   float temp_psi;
1236   float temp_psi_sum;
1237   float diff;
1238 
1239   int orig_il1, orig_il2, orig_ir1, orig_ir2;
1240 
1241   /*printf("in cm2sub_cm_trans_probs_S: v_start: %d\n", v_start);*/
1242 
1243   sub_nd     = sub_cm->ndidx[v_start];
1244 
1245   if(sub_cm->ndtype[sub_nd] == BEGL_nd)
1246     {
1247       /* This is the easy case, we have transitions into each of the states in the split-set
1248        * of the next node. The only way to reach each of these states is from the BEGL
1249        * so we just weight each by the psi values for the matching orig_cm states.
1250        */
1251       if(sub_cm->ndtype[sub_nd + 1] == BIF_nd)
1252 	sub_cm->t[v_start][0] = 1.0; /* BEGL_S -> BIF_B */
1253       else
1254 	for(yoffset = 0; yoffset < sub_cm->cnum[v_start]; yoffset++)
1255 	  {
1256 	    y_s = sub_cm->cfirst[v_start] + yoffset;
1257 	    /*printf("updating sub_cm->t[%d][%d]\n", v_start, yoffset);*/
1258 	    sub_cm->t[v_start][yoffset] = orig_psi[submap->s2o_smap[y_s][0]];
1259 	    if(submap->s2o_smap[y_s][1] != -1)
1260 	      sub_cm->t[v_start][yoffset] += orig_psi[submap->s2o_smap[y_s][1]];
1261 	  }
1262     }
1263 
1264   else if(sub_cm->ndtype[sub_nd] == BEGR_nd)
1265     {
1266       /* More complicated than the BEGL case b/c we need to handle the
1267        * BEGR_S -> BEGR_IL transition as well as BEGR_S -> next node
1268        * split set transitions.
1269        * We know the BEGR_IL -> BEGR_IL self transition though because
1270        * we were able to map BEGR_IL to 1 or 2 orig_cm states.
1271        */
1272 
1273       v_s_insert = v_start + 1;
1274       if(sub_cm->ndtype[sub_nd + 1] == BIF_nd)
1275 	{
1276 	  /*printf("!!!SPECIAL CASE BEGR -> BIF! v_ct\n");*/
1277 
1278 	  v_o_insert = submap->s2o_smap[v_s_insert][0];
1279 	  if(submap->s2o_smap[v_s_insert][1] == -1)
1280 	    {
1281 	      diff = sub_cm->t[v_s_insert][0] - (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]);
1282 	      if(diff >= 0. && diff > 0.0001)
1283 		cm_Fail("ERROR, code for calc'ing BEGR_IL -> BIF_B is wrong, sub_cm->t[v_s_insert:%d][0] should be %f (based on your understanding) but its really %f\n", v_s_insert, (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]), (sub_cm->t[v_s_insert][0]));
1284 	      if(diff <= 0. && diff < -0.0001)
1285 		cm_Fail("ERROR, code for calc'ing BEGR_IL -> BIF_B is wrong, sub_cm->t[v_s_insert:%d][0] should be %f (based on your understanding) but its really %f\n", v_s_insert, (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]), (sub_cm->t[v_s_insert][0]));
1286 	    }
1287 
1288 	  /* BEGR_IL -> BEGR_IL will be the sum over possibly two orig_cm states v_o_insert that
1289 	   * map to sub_cm state BEGR_IL of:
1290 	   *    orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0],
1291 	   * so BEGR_IL -> BIF_B is calc'ed as follows */
1292 	  sub_cm->t[v_s_insert][1] = orig_psi[v_o_insert] * (1. - orig_cm->t[v_o_insert][0]);
1293 	  if(submap->s2o_smap[v_s_insert][1] != -1)
1294 	    {
1295 	      /* we have to factor in the other state as well. */
1296 	      cm_Fail("ERROR, BEGR_IL: %d maps to 2 orig_cm states, was hoping this was impossible - need to implement.\n", v_s_insert);
1297 	      v_o_insert = submap->s2o_smap[v_s_insert][1];
1298 	      sub_cm->t[v_s_insert][1] += orig_psi[v_o_insert] * (1. - orig_cm->t[v_o_insert][0]);
1299 	    }
1300 	  /* Now we normalize the transitions out of BEGR_IL, so we can calculate BEGR_S -> BEGR_IL */
1301 	  esl_vec_FNorm(sub_cm->t[v_s_insert], sub_cm->cnum[v_s_insert]);
1302 	  il_psi   = orig_psi[submap->s2o_smap[v_s_insert][0]];
1303 	  if(submap->s2o_smap[v_s_insert][1] != -1)
1304 	    {
1305 	      cm_Fail("ERROR, BEGR_IL: %d maps to 2 orig_cm states, was hoping this was impossible - need to implement.\n", v_s_insert);
1306 	      il_psi+= orig_psi[submap->s2o_smap[v_s_insert][1]];
1307 	    }
1308 	  sub_cm->t[v_start][0] = (1. - sub_cm->t[v_s_insert][0]) * il_psi; /* set BEGR_S -> BEGR_IL */
1309 	  sub_cm->t[v_start][1] = 1. - sub_cm->t[v_start][0]; /* set BEGR_S -> BIF_B */
1310 	}
1311 
1312       else
1313 	{ /* next node is not a BIF node */
1314 	  /* First, normalize the probabilities out of the BEGR_IL, so
1315 	   * we can calculate what BEGR_S -> BEGR_IL should be (we need
1316 	   * to know BEGR_IL -> BEGR_IL probability and orig_psi of
1317 	   * the states that map to BEGR_IL to do this).
1318 	   */
1319 	  sum = 0.;
1320 	  esl_vec_FNorm(sub_cm->t[v_s_insert], sub_cm->cnum[v_s_insert]);
1321 	  il_psi   = orig_psi[submap->s2o_smap[v_s_insert][0]];
1322 	  if(submap->s2o_smap[v_s_insert][1] != -1)
1323 	    {
1324 	      cm_Fail("ERROR, BEGR_IL: %d maps to 2 orig_cm states, was hoping this was impossible - need to implement.\n", v_s_insert);
1325 	      il_psi += orig_psi[submap->s2o_smap[v_s_insert][1]];
1326 	    }
1327 	  sub_cm->t[v_start][0] = (1. - sub_cm->t[v_s_insert][0]) * il_psi; /* set BEGR_S -> BEGR_IL */
1328 	  sum = sub_cm->t[v_start][0];
1329 
1330 	  for(yoffset = 1; yoffset < sub_cm->cnum[v_start]; yoffset++) /* note we start at yoffset = 1
1331 									* BEGR_S -> first state of next
1332 									* node's split set. */
1333 	    {
1334 	      y_s = sub_cm->cfirst[v_start] + yoffset;
1335 	      temp_psi   = orig_psi[submap->s2o_smap[y_s][0]];
1336 	      if(submap->s2o_smap[y_s][1] != -1)
1337 		temp_psi += orig_psi[submap->s2o_smap[y_s][1]];
1338 
1339 	      sub_cm->t[v_start][yoffset] = temp_psi - il_psi * sub_cm->t[v_s_insert][yoffset];
1340 	      sum += sub_cm->t[v_start][yoffset];
1341 	    }
1342 	  /*printf("BEGR->NON BIF  SUM: %f\n", sum);*/
1343 	  if(sum < 1.0 && ((1.0 - sum) > 0.001))
1344 	    cm_Fail("ERROR calculating transitions out of BEGR_S incorrectly\n");
1345 	  if(sum > 1.0 && ((sum - 1.0) > 0.001))
1346 	    cm_Fail("ERROR calculating transitions out of BEGR_S incorrectly\n");
1347 	}
1348     }
1349   else if(sub_cm->ndtype[sub_nd] == ROOT_nd)
1350     {
1351       /*printf("in cm2sub_cm_trans_probs_S(), ROOT_nd\n");*/
1352       /* the only case we have to worry about is if the next node is BIF node,
1353        * otherwise the transitions out of ROOT_S have already been set.
1354        */
1355       if(sub_cm->ndtype[sub_nd + 1] == BIF_nd)
1356 	{
1357 	  /*printf("!!!SPECIAL CASE ROOT -> BIF!\n");*/
1358 	  /* Before we do anything we have to check to see if we need to subtract
1359 	   * any subpaths from ROOT_S -> ROOT_IR that have been double counted:
1360 	   */
1361 	  /* if orig_ir < orig_il, we've counted paths from
1362 	   * ir -> il correctly for ROOT_IL -> ROOT_IR and
1363 	   *        incorrectly for ROOT_S  -> ROOT_IR
1364 	   */
1365 
1366 	  orig_il1 = submap->s2o_smap[1][0];
1367 	  orig_il2 = submap->s2o_smap[1][1];
1368 
1369 	  orig_ir1 = submap->s2o_smap[2][0];
1370 	  orig_ir2 = submap->s2o_smap[2][1];
1371 
1372 	  if(orig_ir1 < orig_il1)
1373 	    {
1374 	      sub_cm->t[0][1] -= orig_psi[orig_ir1] *
1375 		cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir1, orig_il1, 0, tmap, orig_psi);
1376 	    }
1377 	  if(orig_ir2 != -1 && orig_ir2 < orig_il1)
1378 	    {
1379 	      sub_cm->t[0][1] -= orig_psi[orig_ir2] *
1380 		cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir2, orig_il1, 0, tmap, orig_psi);
1381 	    }
1382 	  if(orig_il2 != -1 && orig_ir1 < orig_il2)
1383 	    {
1384 	      sub_cm->t[0][1] -= orig_psi[orig_ir1] *
1385 		cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir1, orig_il2, 0, tmap, orig_psi);
1386 	    }
1387 	  if((orig_ir2 != -1 && orig_il2 != -1) && (orig_ir2 < orig_il2))
1388 	    {
1389 	      sub_cm->t[0][1] -= orig_psi[orig_ir2] *
1390 		cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir2, orig_il2, 0, tmap, orig_psi);
1391 	    }
1392 
1393 	  /* Next, set transition from ROOT_S -> BIF_B, we know that
1394 	   * ROOT_S -> ROOT_IL and ROOT_S -> ROOT_IR were set using
1395 	   * orig_cm subpaths originating at ROOT_S, so we know that
1396 	   * the virtual counts out of ROOT_S should NOT be scaled, in
1397 	   * other words, they can be treated as probabilities and should
1398 	   * sum to 1.0.
1399 	   */
1400 	  sub_cm->t[0][2] = 1.0 - (sub_cm->t[0][0] + sub_cm->t[0][1]); /* set ROOT_S->BIF_B */
1401 
1402 	  v_s_insert = v_start + 1; /* ROOT_IL */
1403 	  v_o_insert = submap->s2o_smap[v_s_insert][0];
1404 	  if(submap->s2o_smap[v_s_insert][1] == -1)
1405 	    {
1406 	      diff = sub_cm->t[v_s_insert][0] - (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]);
1407 	      if(diff >= 0. && diff >= 0.0001)
1408 		cm_Fail("ERROR, code for calc'ing ROOT_IL -> BIF_B is wrong, sub_cm->t[v_s_insert:%d][0] should be %f (based on your understanding) but its really %f\n", v_s_insert, (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]), (sub_cm->t[v_s_insert][0]));
1409 	      if(diff <= 0. && diff <= -0.0001)
1410 		cm_Fail("ERROR, code for calc'ing ROOT_IL -> BIF_B is wrong, sub_cm->t[v_s_insert:%d][0] should be %f (based on your understanding) but its really %f\n", v_s_insert, (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]), (sub_cm->t[v_s_insert][0]));
1411 	    }
1412 	  /* We want to figure out what the virtual counts for the traansition ROOT_IL -> BIF_B should
1413 	   * be, but this depends on how we filled the virtual counts for the other two
1414 	   * transitions (-> ROOT_IL and -> ROOT_IR) out of ROOT_IL, so we SHOULD revisit
1415 	   * their calculation. Here, I'm attempting a trick that SHOULD work: figuring out
1416 	   * what the self_insert probability IL->IL should be based on the self insert probabilities
1417 	   * of the up to 2 orig_cm states that sub_cm ROOT_IL maps to, then using that scaling
1418 	   * factor (the actual probability / the virtual counts currently in sub_cm->t[ROOT_IL][0])
1419 	   * to scale both ROOT_IL->ROOT_IL and ROOT_IL->ROOT_IR, then we can just set ROOT_IL->BIF
1420 	   * as 1.0 - (ROOT_IL->ROOT_IL + ROOT_IL->ROOT_IR).
1421 	   */
1422 
1423 	  temp_psi_sum = orig_psi[v_o_insert];
1424 
1425 	  if(submap->s2o_smap[v_s_insert][1] != -1)
1426 	    {
1427 	      v_o_insert    = submap->s2o_smap[v_s_insert][1];
1428 	      temp_psi_sum += orig_psi[v_o_insert];
1429 	    }
1430 	  sub_cm->t[v_s_insert][0] /= temp_psi_sum; /* ROOT_IL -> ROOT_IL */
1431 	  sub_cm->t[v_s_insert][1] /= temp_psi_sum; /* ROOT_IL -> ROOT_IR */
1432 	  sub_cm->t[v_s_insert][2]  = 1. - (sub_cm->t[v_s_insert][0] + sub_cm->t[v_s_insert][1]);
1433 	  /* ROOT_IL -> BIF_B */
1434 
1435 	  /* move on to calc'ing ROOT_IR -> BIF */
1436 	  v_s_insert = v_start + 2; /* ROOT_IR */
1437 	  v_o_insert = submap->s2o_smap[v_s_insert][0];
1438 	  if(submap->s2o_smap[v_s_insert][1] == -1)
1439 	    {
1440 	      diff = sub_cm->t[v_s_insert][0] - (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]);
1441 	      if(diff >= 0. && diff > 0.0001)
1442 		cm_Fail("ERROR, code for calc'ing ROOT_IR -> BIF_B is wrong, sub_cm->t[v_s_insert:%d][0] should be %f (based on your understanding) but its really %f\n", v_s_insert, (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]), (sub_cm->t[v_s_insert][0]));
1443 	      if(diff <= 0. && diff < -0.0001)
1444 		cm_Fail("ERROR, code for calc'ing ROOT_IR -> BIF_B is wrong, sub_cm->t[v_s_insert:%d][0] should be %f (based on your understanding) but its really %f\n", v_s_insert, (orig_psi[v_o_insert] * orig_cm->t[v_o_insert][0]), (sub_cm->t[v_s_insert][0]));
1445 	    }
1446 	  /* We can calculate what the ROOT_IR -> ROOT_IR probability should be, and then
1447 	   * just take 1.0 minus that probability to set ROOT_IR -> BIF.
1448 	   */
1449 	  temp_psi_sum = orig_psi[v_o_insert];
1450 
1451 	  if(submap->s2o_smap[v_s_insert][1] != -1)
1452 	    {
1453 	      v_o_insert    = submap->s2o_smap[v_s_insert][1];
1454 	      temp_psi_sum += orig_psi[v_o_insert];
1455 	    }
1456 	  sub_cm->t[v_s_insert][0] /= temp_psi_sum; /* ROOT_IR -> ROOT_IR */
1457 	  sub_cm->t[v_s_insert][1]  = 1. - (sub_cm->t[v_s_insert][0]);
1458 	  /* ROOT_IR -> BIF_B */
1459 	}
1460       else
1461 	{
1462 	  /* ROOT -> non-BIF node, we have already handled this, so we return. */
1463 	}
1464     }
1465   /*printf("leaving cm2sub_cm_trans_probs_S\n\n");*/
1466   return;
1467 }
1468 
1469 /**************************************************************************
1470  * EPN 09.21.06
1471  * cm2sub_cm_trans_probs_B_E()
1472  *
1473  * Purpose:  For a specific sub CM B or E state v_be fill in virtual counts
1474  *           for transitions into v_be. We do this in its own seperate function,
1475  *           because we can't robustly map E states in a sub_cm to E states
1476  *           in an orig CM (if its possible - I can't figure out how to do it).
1477  *
1478  * Args:
1479  * CM_t *orig_cm     - the original, template CM
1480  * CM_t *sub_cm      - the sub CM
1481  * double *orig_psi  - orig_psi[v] is the expected number of times state v is entered
1482  *                     in a CM parse
1483  * char ***tmap      - the hard-coded transition map
1484  * int v_be         - the sub_cm END state we're filling virtual counts of transitions into
1485  * submap            - the map from the sub CM to the template CM
1486  * print_flag        - TRUE to print useful debugging info
1487  * Returns: void
1488  */
1489 static void
cm2sub_cm_trans_probs_B_E(CM_t * orig_cm,CM_t * sub_cm,double * orig_psi,char *** tmap,int v_be,CMSubMap_t * submap,int print_flag)1490 cm2sub_cm_trans_probs_B_E(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap, int v_be, CMSubMap_t *submap,
1491 			  int print_flag)
1492 {
1493   int orig_v;
1494   int sub_v;
1495 
1496   int sub_nd;
1497   int psub_nd;
1498   int sub_i;
1499   int bif_end_yoffset;
1500   int orig_v1, orig_v2, orig_i;
1501   float contribution;
1502   int orig_il, orig_ir, sub_il, sub_ir;
1503   int into_end_flag;
1504 
1505   if(print_flag) printf("in cm2sub_cm_trans_probs_B_E: sub_v: %d\n", v_be);
1506 
1507   sub_nd =  sub_cm->ndidx[v_be];
1508   psub_nd = sub_nd - 1;
1509 
1510   into_end_flag = FALSE;
1511   if(sub_cm->sttype[v_be] == E_st)
1512     into_end_flag = TRUE;
1513   /* We're moving from into an END, so the END_E - 1 state is a detached insert,
1514    * and we have to handle this in a special way */
1515 
1516   switch (sub_cm->ndtype[psub_nd]) {
1517   case MATP_nd:
1518     if(print_flag) printf("prev node type MATP\n");
1519     /* psub_nd is a MATP: each state in psub_nd transits to
1520      * exactly 3 states, the MATP_IL, MATP_IR and the BIF or END state v_be, for which
1521      * we are trying to fill virtual transition counts into.
1522      *
1523      * The case for MATP_nd's is actually simpler than for other nodes because we
1524      * can exploit the fact that if a MATP node exists in the sub_cm it necessarily
1525      * must map to a MATP node in the original CM (we're not adding or changing any
1526      * base pairs, just deleting some possibly), so every sub MATP state must correspond
1527      * to exactly 1 original MATP state.
1528      */
1529 
1530     bif_end_yoffset = 2; /* cm->t[][2] goes to BIF_B or END_E */
1531     sub_il = sub_cm->nodemap[psub_nd] + 4;
1532     sub_ir = sub_cm->nodemap[psub_nd] + 5;
1533 
1534     orig_il = submap->s2o_smap[sub_il][0];
1535     orig_ir = submap->s2o_smap[sub_ir][0];
1536     if(into_end_flag && orig_ir != -1)
1537       cm_Fail("ERROR in cm2sub_cm_trans_probs_B_E(), into_end_flag is TRUE but MATP_IR maps to a orig_cm state.\n");
1538     if(orig_ir == -1 && !into_end_flag)
1539       cm_Fail("ERROR in cm2sub_cm_trans_probs_B_E(), into_end_flag is FALSE but MATP_IR doesn't map to a orig_cm state.\n");
1540 
1541     for(sub_v = sub_cm->nodemap[psub_nd]; sub_v < sub_il; sub_v++)
1542       {
1543 	orig_v = submap->s2o_smap[sub_v][0]; /* submap->s2o_smap[sub_v][1] will nec. be -1 in a MATP */
1544 	sub_cm->t[sub_v][bif_end_yoffset] = orig_psi[orig_v] -
1545 	  (sub_cm->t[sub_v][0] + sub_cm->t[sub_v][1]); /* if into_end_flag is TRUE, sub_cm->t[sub_v][1] is 0. */
1546       }
1547     /* now do MATP_IL and MATP_IR */
1548     sub_cm->t[sub_il][bif_end_yoffset] = orig_psi[orig_il] -
1549       (sub_cm->t[sub_il][0] + sub_cm->t[sub_il][1]);
1550 
1551     bif_end_yoffset = 1;
1552     if(into_end_flag)
1553       sub_cm->t[sub_ir][bif_end_yoffset] = 1.0;
1554     else
1555       sub_cm->t[sub_ir][bif_end_yoffset] = orig_psi[orig_ir] - sub_cm->t[sub_ir][0];
1556     break;
1557 
1558   case MATL_nd:
1559   case MATR_nd:
1560     if(print_flag) printf("prev node type MATL or MATR\n");
1561     /* psub_nd is a MATL or MATR and we know each state in psub_nd transits to
1562      * exactly 2 states, the insert state of psub_nd and the BIF or END state v_be, which
1563      * we are trying to fill in transitions to.
1564      */
1565     bif_end_yoffset = 1; /* cm->t[][1] goes to BIF_B or END_E */
1566     sub_i = sub_cm->nodemap[psub_nd] + 2; /* sub_i is MATL_IL or MATR_IR */
1567     orig_i = submap->s2o_smap[sub_i][0];
1568     if(into_end_flag && orig_i != -1)
1569       cm_Fail("ERROR in cm2sub_cm_trans_probs_B_E(), into_end_flag is TRUE but MAT*_I* maps to a orig_cm state.\n");
1570 
1571     for(sub_v = sub_cm->nodemap[psub_nd]; sub_v < sub_i; sub_v++)
1572       {
1573 	orig_v1 = submap->s2o_smap[sub_v][0];
1574 	orig_v2 = submap->s2o_smap[sub_v][1];
1575 	if(into_end_flag)
1576 	  sub_cm->t[sub_v][bif_end_yoffset] = 1.0;
1577 	else
1578 	  {
1579 	    if(orig_v1 < orig_i)
1580 	      {
1581 		contribution = cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap,
1582 						      orig_v1, orig_i, sub_v, tmap, orig_psi);
1583 		sub_cm->t[sub_v][bif_end_yoffset] = orig_psi[orig_v1] * (1. - contribution);
1584 		if(print_flag) printf("curr v1 < i1 sub_cm->t[sub_v:%d][1] now: %f (added: psi:%f * 1-cont: %f (%f))\n", sub_v, sub_cm->t[sub_v][1], orig_psi[orig_v1], (1.-contribution), (orig_psi[orig_v1] * (1. - contribution)));
1585 	      }
1586 	    else
1587 	      {
1588 		contribution = orig_psi[orig_i] * cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap,
1589 									 orig_i, orig_v1, sub_v, tmap, orig_psi);
1590 		sub_cm->t[sub_v][bif_end_yoffset] = orig_psi[orig_v1] * (1. - (contribution / orig_psi[orig_v1]));
1591 		if(print_flag) printf("curr i1 < v1 sub_cm->t[sub_v:%d][1] now: %f (added: psi:%f * 1-cont: %f (%f))\n", sub_v, sub_cm->t[sub_v][1], orig_psi[orig_v1], (1.-contribution), (orig_psi[orig_v1] * (1. - contribution)));
1592 	      }
1593 	    if(orig_v2 != -1)
1594 	      {
1595 		if(orig_v2 < orig_i)
1596 		  {
1597 		    contribution = cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap,
1598 							  orig_v2, orig_i, sub_v, tmap, orig_psi);
1599 		    sub_cm->t[sub_v][bif_end_yoffset] += orig_psi[orig_v2] * (1. - contribution);
1600 		    if(print_flag) printf("curr v2 < i1 sub_cm->t[sub_v:%d][1] now: %f (added: psi:%f * 1-cont: %f (%f))\n", sub_v, sub_cm->t[sub_v][1], orig_psi[orig_v2], (1.-contribution), (orig_psi[orig_v2] * (1. - contribution)));
1601 		  }
1602 		else
1603 		  {
1604 		    contribution = orig_psi[orig_i] * cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap,
1605 									     orig_i, orig_v2, sub_v, tmap, orig_psi);
1606 		    sub_cm->t[sub_v][bif_end_yoffset] += orig_psi[orig_v2] * (1. - (contribution / orig_psi[orig_v2]));
1607 		    if(print_flag) printf("curr i1 < v2 sub_cm->t[sub_v:%d][1] now: %f (added: psi:%f * 1-cont: %f (%f))\n", sub_v, sub_cm->t[sub_v][1], orig_psi[orig_v2], (1.-contribution), (orig_psi[orig_v2] * (1. - contribution)));
1608 		  }
1609 	      }
1610 	  }
1611       }
1612     /* now set the sub_i -> B or E transition prob */
1613     if(into_end_flag) /* The transition probability out of the MAT{L,R}_I{L,R} is irrelevant,
1614 		       * because the state is detached. */
1615       sub_cm->t[sub_i][1] = 1.;
1616     else
1617       sub_cm->t[sub_i][1] = orig_psi[orig_i] *
1618 	(1. - cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap,
1619 				     orig_i, orig_i, sub_i,tmap, orig_psi));
1620     break;
1621 
1622   case ROOT_nd:
1623   case BEGL_nd:
1624   case BEGR_nd:
1625     { /* we handle these cases in cm2sub_cm_trans_probs_S() */ }
1626     break;
1627 
1628   default:
1629     cm_Fail("ERROR bogus node type transiting to END or BIF\n");
1630     break;
1631   }
1632 
1633   if(print_flag) printf("Returning from cm2sub_cm_trans_probs_B_E\n");
1634   return;
1635 }
1636 
1637 /**************************************************************************
1638  * EPN 08.31.06
1639  * Function: cm2sub_cm_add_single_trans()
1640  *
1641  * Purpose:  Add a virtual counts contribution to a single CM transition.
1642  *
1643  * See related functions for explanation of parameters.
1644  * Returns: (void)
1645  */
1646 static void
cm2sub_cm_add_single_trans(CM_t * orig_cm,CM_t * sub_cm,CMSubMap_t * submap,int orig_v,int orig_y,int sub_v,int yoffset,double * orig_psi,char *** tmap)1647 cm2sub_cm_add_single_trans(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int orig_v, int orig_y,
1648 			   int sub_v, int yoffset, double *orig_psi, char ***tmap)
1649 {
1650   int start;
1651   /* check if we've got real CM state ids */
1652   if(orig_v == -1 || orig_y == -1)
1653     return;
1654   start = orig_v;
1655   if(orig_y < start)
1656     start = orig_y;
1657   sub_cm->t[sub_v][yoffset] += orig_psi[start] *
1658     cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_v, orig_y, sub_v, tmap, orig_psi);
1659   return;
1660 }
1661 
1662 /**************************************************************************
1663  * EPN 08.31.06
1664  * Function: cm2sub_cm_sum_subpaths()
1665  *
1666  * Purpose:  Calculate probability of getting from one state (start) to
1667  *           another (end) in a CM, taking special considerations involving
1668  *           insert states.
1669  *
1670  *           This function is similar to CP9_cm2wrhmm::cm_sum_subpaths_cp9()
1671  *           but was written for mapping transitions in a template CM (orig_cm)
1672  *           to those in a sub CM built from that template. The sub_cm conversion
1673  *           process is more complex and this function requires more functionality
1674  *           than the *_cp9() version.
1675  *
1676  *           When getting a transition probability for the sub_cm state 'v' in node
1677  *           'n', we ignore the contribution of subpaths that correspond to other
1678  *           transitions out of states in 'n'.
1679  *           For example, we don't want to include the probability of an
1680  *           IL (nd 'n') -> MATL_ML (nd 'n'+1) sub parse when calculating
1681  *           the transition probability for MATL_ML (nd 'n') -> MATL_ML (nd 'n'+1)
1682  *           because the IL (nd 'n') -> MATL_ML (nd 'n'+1) exists and should
1683  *           contain that probability mass. This is an easy example to skip,
1684  *           there are several other instances where it's more complex to
1685  *           ignore subpaths involving inserts like this.
1686  *
1687  *           Importantly, this function does not correctly calculate the transition
1688  *           virtual counts for only the states in the ROOT_nd. This is because
1689  *           the ROOT_nd has 2 insert states, which makes it much more complex to
1690  *           properly ignore subparses involving both these inserts. The
1691  *           cm2sub_cm_subtract_root_subpaths() function corrects the counts for
1692  *           the ROOT states. MATP_nd's also have 2 insert states but when constructing
1693  *           sub_cm's the only  MATP_nd's that exist have an exact mapping MATP_nd in
1694  *           the orig_cm, which makes them easier to handle.
1695  *
1696  *           In some cases, this function calls itself to determine the probabilities
1697  *           of subparses that it should ignore. It is for these cases that it's
1698  *           necessary to have the init_sub_start parameter passed in, which is
1699  *           the sub_cm state the initial call (non-recursive call) of this function
1700  *           was calculating transitions out of.
1701  *
1702  * Args:
1703  * CM_t   *orig_cm        - the original, template CM
1704  * CM_t   *sub_cm         - the sub CM
1705  * int     orig_v         - orig_cm state that maps to sub_v (1 of potentially 2)
1706  * int     orig_y         - orig_cm state that maps to sub_y (1 of potentially 2)
1707  * int     sub_v          - sub_cm state; we're calc'ing transitions out of sub_v
1708  * int     sub_y          - sub_cm state; we're calc'ing transitions into sub_y
1709  * int     init_sub_start - the sub_cm state the initial (non-recursive call) had as sub_v.
1710  * double *orig_psi       - for orig_cm: orig_psi[v] is the expected number of times state v
1711  *                          is entered in a CM parse
1712  *
1713  * Returns: FLOAT, the summed probability of all subpaths through the CM
1714  *          starting at "start" and ending at "end" that don't pass through
1715  *          orig_cm inserts that map to sub_cm insert states in the same node as init_sub_start.
1716  */
1717 static float
cm2sub_cm_sum_subpaths(CM_t * orig_cm,CM_t * sub_cm,CMSubMap_t * submap,int orig_v,int orig_y,int init_sub_start,char *** tmap,double * orig_psi)1718 cm2sub_cm_sum_subpaths(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, int orig_v, int orig_y,
1719 		       int init_sub_start, char ***tmap, double *orig_psi)
1720 {
1721   int     status;
1722   int     v,x,y;           /* state indices in the orig_cm                             */
1723   int     start, end;      /* min(orig_v, orig_y) and max(orig_v, orig_y) respectively */
1724   float   to_return;       /* the probability mass we're returning                     */
1725   char    tmap_val;        /* a value in the hard-coded tmap                           */
1726   int     is_insert;       /* TRUE if v is insert, FALSE if not                        */
1727   float   insert_to_start; /* prob mass going into start that we must ignore (see code)*/
1728   float   end_to_insert;   /* prob mass going out of end that we must ignore (see code)*/
1729   int     skip_flag;       /* TRUE to skip state v's contribution (see code)           */
1730   int     init_sub_nd;     /* sub_cm node containing init_sub_start                    */
1731   int     sub_insert1;     /* a sub_cm insert state in sub_cm node init_sub_nd, or -1  */
1732   int     sub_insert2;     /* a sub_cm insert state in sub_cm node init_sub_nd, or -1  */
1733   int     orig_insert1;    /* the orig_cm state that map to sub_insert1, or -1         */
1734   int     orig_insert2;    /* the orig_cm state that map to sub_insert2, or -1         */
1735   float   self_loop_factor;/* for dealing with self-insert loops                       */
1736   int     sub_start1;      /* a sub_cm state that maps to orig_cm's 'start'            */
1737   int     sub_start2;      /* a sub_cm state that maps to orig_cm's 'start' or -1      */
1738   int     sub_end1;        /* a sub_cm state that maps to orig_cm's 'end'              */
1739   int     sub_end2;        /* a sub_cm state that maps to orig_cm's 'end' or -1        */
1740   double *sub_psi;         /* sub_psi[v] is the expected number of times state v is    *
1741 			    * entered given we started at state "start",  and we       *
1742 			    * didn't go through orig_insert1 or orig_insert2           */
1743 
1744   if(orig_v == -1 || orig_y == -1)
1745     return 0.;
1746 
1747   start = orig_v;
1748   end   = orig_y;
1749   if(start > end)
1750     { start = orig_y; end = orig_v; }
1751   if(start == end)
1752     return orig_cm->t[start][0]; /* return self-insert probability */
1753 
1754   /*printf("\nin cm2sub_cm_sum_subpaths2: start: %d | end: %d\n", start, end);*/
1755 
1756   ESL_ALLOC(sub_psi, sizeof(double) * (end - start + 1));
1757   sub_psi[0] = 1.; /* Initialize sub_psi[0]. We have to start in "start" */
1758 
1759   /* First we store some useful information for later in the function,
1760    * we do this and use descriptive variable names to make the code
1761    * later easier to follow. */
1762 
1763   /* Determine the 1 or 2 sub_cm states that map to
1764    * the orig_cm state start and end, and store these
1765    * in sub_start{1,2} and sub_end{1,2} respectively. */
1766   sub_start1 = submap->o2s_smap[start][0];
1767   sub_start2 = submap->o2s_smap[start][1];
1768   sub_end1   = submap->o2s_smap[end  ][0];
1769   sub_end2   = submap->o2s_smap[end  ][1];
1770 
1771   /* Determine the 0, 1, or 2 sub_cm insert states in the same
1772    * node as init_sub_start, store these in sub_insert1 and
1773    * sub_insert2.
1774    */
1775   init_sub_nd = sub_cm->ndidx[init_sub_start];
1776   orig_insert1 = orig_insert2 = sub_insert1 = sub_insert2 = -1;
1777   if(sub_cm->ndtype[init_sub_nd] == MATP_nd)
1778     {
1779       sub_insert1 = sub_cm->nodemap[init_sub_nd] + 4; /* MATP_IL */
1780       sub_insert2 = sub_cm->nodemap[init_sub_nd] + 5; /* MATP_IR */
1781     }
1782   else if(sub_cm->ndtype[init_sub_nd] == ROOT_nd)
1783     {
1784       sub_insert1 = 1; /* ROOT_IL */
1785       sub_insert2 = 2; /* ROOT_IR */
1786     }
1787   else if(sub_cm->ndtype[init_sub_nd] == MATL_nd ||
1788 	  sub_cm->ndtype[init_sub_nd] == MATR_nd ||
1789 	  sub_cm->ndtype[init_sub_nd] == BEGR_nd)
1790     sub_insert1 = sub_cm->cfirst[init_sub_start]; /* MAT{L,R}_I{L,R} or BEGR_IL */
1791 
1792   /* Set orig_insert{1,2} as the orig_cm states that map to sub_insert{1,2}. */
1793   if(sub_insert1 != -1) orig_insert1 = submap->s2o_smap[sub_insert1][0];
1794   if(sub_insert2 != -1) orig_insert2 = submap->s2o_smap[sub_insert2][0];
1795 
1796   /* Step through states between start and end, keeping track of prob mass. */
1797   for (v = (start+1); v <= end; v++)
1798     {
1799       sub_psi[v-start] = 0.; /* initialize */
1800       is_insert = FALSE;
1801       if(orig_cm->sttype[v] == IL_st || orig_cm->sttype[v] == IR_st)
1802 	is_insert = TRUE;
1803       if(orig_cm->sttype[v] == S_st)
1804 	{
1805 	  /* Previous state is either a BIF_B or a END_E, and there's no transitions
1806 	   * FROM previous state to this state, so we handle this in a special way.*/
1807 	  sub_psi[v-start] = sub_psi[(v-1)-start];
1808 	}
1809       /* Determine if we should skip the contribution of state v because
1810        * it will be correctly counted in a subsequent call of this function
1811        * for a different sub_cm transition. We want to do this if:
1812        *    (1) v is not end
1813        *    (2) v is an orig_cm insert state (orig_insert{1,2}) that maps
1814        *        to a sub_cm insert state (sub_insert{1,2}) in the same node
1815        *        as init_sub_start.
1816        *    (3) sub_insert{1,2} is equal to or downstream of init_sub_start
1817        *    (4) a sub_cm transition exists between the sub_insert{1,2} and
1818        *        a sub_cm state that maps to orig_cm start (sub_start{1,2})
1819        *        or end (sub_end{1,2}).
1820        */
1821       skip_flag = FALSE;
1822       if (v != end && v == orig_insert1 && sub_insert1 >= init_sub_start)
1823 	{
1824 	  if(cm_trans_check(sub_cm, sub_insert1, sub_end1  ) ||
1825 	     cm_trans_check(sub_cm, sub_insert1, sub_end2  ) ||
1826 	     cm_trans_check(sub_cm, sub_insert1, sub_start1) ||
1827 	     cm_trans_check(sub_cm, sub_insert1, sub_start2))
1828 	    skip_flag = TRUE;
1829 	}
1830       else if (v != end && v == orig_insert2 && sub_insert2 >= init_sub_start)
1831 	{
1832 	  if(cm_trans_check(sub_cm, sub_insert2, sub_end1  ) ||
1833 	     cm_trans_check(sub_cm, sub_insert2, sub_end2  ) ||
1834 	     cm_trans_check(sub_cm, sub_insert2, sub_start1) ||
1835 	     cm_trans_check(sub_cm, sub_insert2, sub_start2))
1836 	    skip_flag = TRUE;
1837 	}
1838       if(!skip_flag)
1839 	{
1840 	  for (y = orig_cm->pnum[v]-1; y >= is_insert; y--)
1841 	    {
1842 	      x = orig_cm->plast[v] - y;
1843 	      /* x is a parent of v, we're adding contribution of a transition from x to v. */
1844 	      tmap_val = tmap[(int) orig_cm->stid[x]][(int) orig_cm->ndtype[orig_cm->ndidx[v]+is_insert]][(int) orig_cm->stid[v]];
1845 	      /* assert(tmap_val != -1); */
1846 	      if((x - start) < 0) sub_psi[v-start] += 0.;
1847 	      else sub_psi[v-start] += sub_psi[x-start] * orig_cm->t[x][(int) tmap_val];
1848 	    }
1849 	  if(v != end && is_insert) /* if v is end, we don't include the self loop contribution */
1850 	    sub_psi[v-start] += sub_psi[v-start] *
1851 	      (orig_cm->t[v][0] / (1-orig_cm->t[v][0])); /* else we include the self loop contribution */
1852 	}
1853     }
1854   to_return = sub_psi[end-start];
1855 
1856   /* If start and end are both not inserts, we need to ignore some of the
1857    * probability mass that comes INTO 'start' and goes OUT OF 'end'.
1858    * Specifically we need to ignore the probability mass that
1859    * is accounted for by transitions either TO OR FROM the sub_cm insert
1860    * states in the same sub_cm node as init_sub_start (sub_insert{1,2}).
1861    * This code block is related to the block with the (for(v = start+1..) block
1862    * above involving the 'skip_flag' which skips states accounted for by
1863    * the sub_cm insert states sub_insert{1,2} between start..end, this
1864    * block handles the case where the orig_cm insert states that map
1865    * to sub_insert{1,2}, namely orig_insert{1,2}, fall outside start..end.
1866    */
1867   insert_to_start = 0.;
1868   end_to_insert = 0.;
1869 
1870   if((orig_cm->sttype[start] != IL_st && orig_cm->sttype[start] != IR_st) &&
1871      (orig_cm->sttype[end]   != IL_st && orig_cm->sttype[end]   != IR_st))
1872     {
1873       if(orig_insert1 != -1 && orig_insert1 < start)
1874 	{
1875 	  if(orig_cm->sttype[start] == IL_st || orig_cm->sttype[start] == IR_st)
1876 	    self_loop_factor = (1. + (orig_cm->t[start][0] / (1. - orig_cm->t[start][0])));
1877 	  else
1878 	    self_loop_factor = 1.0;
1879 	  insert_to_start += self_loop_factor * orig_psi[orig_insert1] *
1880 	    cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_insert1, start, init_sub_start, tmap, orig_psi);
1881 	}
1882       if(orig_insert1 != -1 && orig_insert1 > end)
1883 	end_to_insert += cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, end, orig_insert1, init_sub_start, tmap, orig_psi);
1884 
1885       if(orig_insert2 != -1 && orig_insert2 < start)
1886 	{
1887 	  self_loop_factor = 1.0;
1888 	  if(orig_cm->sttype[start] == IL_st ||
1889 	     orig_cm->sttype[start] == IR_st)
1890 	    self_loop_factor = (1. + (orig_cm->t[start][0] / (1. - orig_cm->t[start][0])));
1891 	  insert_to_start += self_loop_factor * orig_psi[orig_insert2] * cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_insert2, start, init_sub_start, tmap, orig_psi);
1892 	}
1893       if(orig_insert2 != -1 && orig_insert2 > end)
1894 	end_to_insert += cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, end, orig_insert2, init_sub_start, tmap, orig_psi);
1895 
1896       /*printf("\t\tinsert_to_start: %f sub_psi[0]: %f\n", insert_to_start, sub_psi[0]);
1897 	printf("\t\tend_to_insert: %f sub_psi[end-start]: %f\n", end_to_insert, sub_psi[(end-start)]);*/
1898     }
1899   to_return *= (1. - (insert_to_start / orig_psi[start]));
1900   to_return *= (1. - end_to_insert);
1901   /*printf("***returning from cm2sub_cm_sum_subpaths (s: %d | e: %d): %f\n", start, end, to_return);*/
1902 
1903   free(sub_psi);
1904   return (float) to_return;
1905 
1906  ERROR:
1907   cm_Fail("Memory allocation error.");
1908   return 0.; /* never reached */
1909 }
1910 
1911 /**************************************************************************
1912  * EPN 09.01.06
1913  * debug_print_cm_params()
1914  *
1915  * Purpose:  Print out emission and transition probabilities and scores
1916  *           for a CM.
1917  *
1918  * Args:
1919  * fp        stdout often
1920  * CM_t *cm
1921  * Returns: (void)
1922  */
1923 void
debug_print_cm_params(FILE * fp,CM_t * cm)1924 debug_print_cm_params(FILE *fp, CM_t *cm)
1925 {
1926   int status;
1927   int v, i;
1928   int yoffset;
1929 
1930   char **nodetypes;
1931   char **sttypes;
1932 
1933   ESL_ALLOC(nodetypes, sizeof(char *) * 8);
1934   nodetypes[0] = "BIF";
1935   nodetypes[1] = "MATP";
1936   nodetypes[2] = "MATL";
1937   nodetypes[3] = "MATR";
1938   nodetypes[4] = "BEGL";
1939   nodetypes[5] = "BEGR";
1940   nodetypes[6] = "ROOT";
1941   nodetypes[7] = "END";
1942 
1943   ESL_ALLOC(sttypes, sizeof(char *) * 10);
1944   sttypes[0] = "D";
1945   sttypes[1] = "MP";
1946   sttypes[2] = "ML";
1947   sttypes[3] = "MR";
1948   sttypes[4] = "IL";
1949   sttypes[5] = "IR";
1950   sttypes[6] = "S";
1951   sttypes[7] = "E";
1952   sttypes[8] = "B";
1953   sttypes[9] = "EL";
1954 
1955   fprintf(fp, "cm->nodes: %d\n", cm->nodes);
1956   fprintf(fp, "cm->M:     %d\n", cm->M);
1957   for(v = 0; v < cm->M; v++)
1958     {
1959       fprintf(fp, "v:%4d:%4d %4s %2s\n", v, cm->ndidx[v], nodetypes[(int) cm->ndtype[cm->ndidx[v]]], sttypes[(int) cm->sttype[v]]);
1960       if(cm->nodemap[cm->ndidx[v]] == v)
1961 	fprintf(fp, "beg: %0.6f (%.6f %10d)| end %0.6f (%.6f %10d)\n",
1962 		cm->begin[v], cm->beginsc[v], cm->ibeginsc[v],
1963 		cm->end[v], cm->endsc[v], cm->iendsc[v]);
1964       if(cm->sttype[v] == MP_st)
1965 	{
1966 	  fprintf(fp, "\tE: ");
1967 	  for(i = 0; i < cm->abc->K*cm->abc->K; i++)
1968 	    fprintf(fp, "%0.6f (%.6f %6d) ", cm->e[v][i], cm->esc[v][i], cm->iesc[v][i]);
1969 	  fprintf(fp, "\n");
1970 	}
1971       else if(cm->sttype[v] == ML_st ||
1972 	      cm->sttype[v] == MR_st ||
1973 	      cm->sttype[v] == IL_st ||
1974 	      cm->sttype[v] == IR_st)
1975 	{
1976 	  fprintf(fp, "\tE: ");
1977 	  for(i = 0; i < cm->abc->K; i++)
1978 	    fprintf(fp, "%0.6f (%0.6f %10d) ", cm->e[v][i], cm->esc[v][i], cm->iesc[v][i]);
1979 	  fprintf(fp, "\n");
1980 	}
1981       if(cm->sttype[v] != B_st && cm->sttype[v] != E_st)
1982 	{
1983 	  fprintf(fp, "\tT: ");
1984 	  for(yoffset = 0; yoffset < cm->cnum[v]; yoffset++)
1985 	    fprintf(fp, "%0.6f (%0.6f %10d) ", cm->t[v][yoffset], cm->tsc[v][yoffset], cm->itsc[v][yoffset]);
1986 	  fprintf(fp, "\n");
1987 	}
1988       else if(cm->sttype[v] == B_st)
1989 	{
1990 	  fprintf(fp, "\tL: %d | R: %d\n", cm->cfirst[v], cm->cnum[v]);
1991 	}
1992       else if(cm->sttype[v] == E_st)
1993 	fprintf(fp, "\n\n");
1994     }
1995   fprintf(fp, "\n\n");
1996   free(nodetypes);
1997   free(sttypes);
1998   return;
1999 
2000  ERROR:
2001   cm_Fail("Memory allocation error.");
2002 }
2003 
2004 /**************************************************************************
2005  * EPN 09.01.06
2006  * Function: check_sub_cm_by_sampling()
2007  *
2008  * Purpose:  Given a CM and a sub CM that is supposed to mirror
2009  *           the CM as closely as possible between two given consensus
2010  *           columns (spos and epos), check that the sub_cm was correctly
2011  *           constructed.
2012  *
2013  *           The current approach is to build a CM Plan 9 HMM from the
2014  *           sub CM, then sample from the CM and see if the samples
2015  *           were likely drawn from the CM Plan 9 distributions.
2016  *           This is done inside CP9_cm2wrhmm::CP9_check_by_sampling().
2017  *
2018  * Args:
2019  * orig_cm     - the original, template CM
2020  * sub_cm      - the sub CM built from the orig_cm
2021  * errbuf      - for error messages
2022  * r           - source of randomness
2023  * submap      - map from orig_cm to sub_cm and vice versa
2024  * subinfo     - sub cm information
2025  * chi_thresh  - rejection threshold for chi-squared tests
2026  * nsamples    - number of samples to use to build the ML HMM
2027  * print_flag  - TRUE to print useful info for debugging
2028  *
2029  * Returns: eslOK   if CM and sub CM are "close enough" (see code)
2030  *          eslFAIL otherwise, errbuf is filled
2031  */
2032 int
check_sub_cm_by_sampling(CM_t * orig_cm,CM_t * sub_cm,char * errbuf,ESL_RANDOMNESS * r,CMSubMap_t * submap,CMSubInfo_t * subinfo,float chi_thresh,int nsamples,int print_flag)2033 check_sub_cm_by_sampling(CM_t *orig_cm, CM_t *sub_cm, char *errbuf, ESL_RANDOMNESS *r, CMSubMap_t *submap, CMSubInfo_t *subinfo,
2034 			 float chi_thresh, int nsamples, int print_flag)
2035 {
2036   int       status;
2037   CP9_t    *orig_hmm;    /* constructed CP9 HMM from the original cm */
2038   CP9_t    *sub_hmm;     /* constructed CP9 HMM from the sub_cm */
2039   CP9Map_t *orig_cp9map; /* maps the orig_cm to the orig_hmm and vice versa */
2040   CP9Map_t *sub_cp9map;  /* maps the sub_cm to the sub_hmm and vice versa */
2041 
2042   /* Build two CP9 HMMs, one for the orig_cm and one for the sub_cm */
2043   if((status = build_cp9_hmm(orig_cm, errbuf, FALSE, 0.0001, print_flag, &orig_hmm, &orig_cp9map)) != eslOK) return status;
2044   if((status = build_cp9_hmm(sub_cm,  errbuf, FALSE, 0.0001, print_flag, &sub_hmm,  &sub_cp9map)) != eslOK) return status;
2045   CP9Logoddsify(orig_hmm);
2046   CP9Logoddsify(sub_hmm);
2047 
2048   /* Look for 'impossible' cases where we know the sub_cm
2049    * construction procedure fails, in that the distribution of transitions out of CP9 nodes
2050    * built from the sub_cm will be the same distros out of corresponding CP9 nodes built from
2051    * the full CM. */
2052   cm2sub_cm_find_impossible_misc_cases(orig_cm, sub_cm, submap, subinfo, orig_cp9map, sub_cp9map, print_flag);
2053   cm2sub_cm_find_impossible_matr_cases(orig_cm, sub_cm, submap, subinfo, orig_cp9map, sub_cp9map, print_flag);
2054 
2055   if((status = CP9_check_by_sampling(orig_cm, sub_hmm, errbuf, r, subinfo, submap->spos, submap->epos, chi_thresh, nsamples, print_flag)) != eslOK) return status;
2056   if(print_flag) printf("CM Plan 9 built from sub_cm passed sampling check; sub_cm was built correctly.\n");
2057 
2058   FreeCPlan9(orig_hmm);
2059   FreeCPlan9(sub_hmm);
2060   FreeCP9Map(orig_cp9map);
2061   FreeCP9Map(sub_cp9map);
2062   return eslOK;
2063 }
2064 
2065 
2066 /**************************************************************************
2067  * EPN 09.07.06
2068  * check_orig_psi_vs_sub_psi()
2069  *
2070  * Purpose:  Check that the psi values for an original, template CM and
2071  *           a sub CM are withing a given leeway threshold, given maps
2072  *           from the states of the original to the sub and vice versa.
2073  *
2074  * Args:
2075  * orig_cm    - the original, template CM
2076  * sub_cm     - the sub CM
2077  * errbuf     - for error messages
2078  * submap     - the map data structure for the sub CM
2079  * threshold  - the threshold that mapping (potentially summed) psi
2080  *              values are allowed to be different by, without throwing an error.
2081  * print_flag  - TRUE to print out the values, FALSE not to
2082  *
2083  * Returns: eslOK   if CM and sub CM are "close enough" (see code)
2084  *          eslFAIL otherwise, errbuf is filled
2085  */
2086 int
check_orig_psi_vs_sub_psi(CM_t * orig_cm,CM_t * sub_cm,char * errbuf,CMSubMap_t * submap,double threshold,int print_flag)2087 check_orig_psi_vs_sub_psi(CM_t *orig_cm, CM_t *sub_cm, char *errbuf,CMSubMap_t *submap, double threshold,
2088 			  int print_flag)
2089 {
2090   int v_s; /* sub_cm state index*/
2091   int v_o; /* orig_cm state index*/
2092   double temp_psi;
2093   int violation;
2094   int v_ct; /* Number of violations */
2095   int detached_insert;
2096   int is_insert;
2097   int root_v_ct;
2098   float diff;
2099   double     *orig_psi;    /* expected num times each state visited in orig_cm */
2100   double     *sub_psi;     /* expected num times each state visited in sub_cm  */
2101 
2102   /* Fill orig_psi and sub_psi parameters. */
2103   if((orig_psi = cm_ExpectedStateOccupancy(orig_cm)) == NULL) ESL_FAIL(eslFAIL, errbuf, "check_orig_psi_vs_sub_cm(), unable to calculate expected state occupancy for orig_cm()");
2104   if((sub_psi  = cm_ExpectedStateOccupancy(sub_cm))  == NULL) ESL_FAIL(eslFAIL, errbuf, "check_orig_psi_vs_sub_cm(), unable to calculate expected state occupancy for sub_cm()");
2105 
2106   if(print_flag)
2107     {
2108       printf("Printing psi in check_orig_psi_vs_sub_psi():\n");
2109       for(v_o = 0; v_o < orig_cm->M; v_o++)
2110 	printf("orig_psi[%4d]: %.6f\n", v_o, orig_psi[v_o]);
2111 
2112     }
2113 
2114   v_ct = 0;
2115   root_v_ct = 0;
2116   if(print_flag == TRUE) printf("\n");
2117   for(v_s = 0; v_s < sub_cm->M; v_s++)
2118     {
2119       detached_insert         = FALSE;
2120       if(sub_cm->sttype[v_s] == IL_st || sub_cm->sttype[v_s] == IR_st)
2121 	is_insert = TRUE;
2122       else
2123 	is_insert = FALSE;
2124 
2125       if(print_flag) printf("\tv_s: %4d (%.6f) ", v_s, sub_psi[v_s]);
2126       v_o = submap->s2o_smap[v_s][0];
2127       if(sub_cm->sttype[v_s+1] == E_st) detached_insert = TRUE;
2128       if(v_o != -1)
2129 	{
2130 	  if(print_flag) printf("v_o1: %4d (%.6f) ", v_o, orig_psi[v_o]);
2131 	  temp_psi = orig_psi[v_o];
2132 	  v_o = submap->s2o_smap[v_s][1];
2133 	  if(v_o != -1)
2134 	    {
2135 	      if(is_insert) /* this insert state maps to 2 orig_cm inserts */
2136 		ESL_FAIL(eslFAIL, errbuf, "check_orig_psi_vs_sub_psi(), sub insert state maps to 2 orig_cm inserts.");
2137 	      temp_psi += orig_psi[v_o];
2138 	      if(print_flag)
2139 		printf("v_o2: %4d (%.6f)\n", v_o, orig_psi[v_o]);
2140 	    }
2141 	  else
2142 	    if(print_flag) printf("\n");
2143 
2144 	  violation = FALSE;
2145 	  if(detached_insert)
2146 	    temp_psi = 0.0;
2147 
2148 	  /* 10.20.06 Found an exceedingly rare case (2 cases in all possible sub CM
2149 	   * models of RMARK) where psi test fails with diff < 0.00002 but > 0.00001
2150 	   * (our default). Both cases involve insert self loops with p > 0.9,
2151 	   * the reason (I'm pretty sure) these guys fail is because when the
2152 	   * contribution of the self insertion loop is included in fill_psi, even
2153 	   * if the self-insert probs for a sub_cm and orig_cm state are equal, if
2154 	   * the psi values for that state BEFORE the contribution of the self insert
2155 	   * is added are > 0.0000001 or so, the self insert contributions amplifies
2156 	   * that difference above our 0.00001 threshold. This only happens if the
2157 	   * self insert probs are really high and explains the rareness of this case.
2158 	   * The approach to fixing it is to subtract out the self loop contribution
2159 	   * prior to checking if we exceed our threshold.
2160 	   */
2161 	  if(is_insert && !detached_insert)
2162 	    {
2163 	      diff = (sub_psi[v_s] - (sub_psi[v_s] * sub_cm->t[v_s][0])) -
2164 		(temp_psi - (temp_psi * orig_cm->t[submap->s2o_smap[v_s][0]][0]));
2165 	    }
2166 	  else
2167 	    diff = sub_psi[v_s] - temp_psi;
2168 	  if((diff > threshold) || ((-1. * diff) > threshold))
2169 	    {
2170 	      violation = TRUE;
2171 	      v_ct++;
2172 	      if((sub_cm->ndidx[v_s] == 0) || (sub_cm->ndidx[v_s] == 1))
2173 		{
2174 		  root_v_ct++;
2175 		}
2176 	    }
2177 	  if(violation)
2178 	    printf("sub: %.6f | orig: %.6f | diff: %.6f VIOLATION\n\n", sub_psi[v_s], temp_psi, (sub_psi[v_s]-temp_psi));
2179 	  else if(detached_insert && print_flag)
2180 	    printf("sub: %.6f | orig: %.6f | diff: %.6f (DEAD INSERT)\n\n", sub_psi[v_s], temp_psi, (sub_psi[v_s]-temp_psi));
2181 	  else if(print_flag)
2182 	    printf("sub: %.6f | orig: %.6f | diff: %.6f\n\n", sub_psi[v_s], temp_psi, (sub_psi[v_s]-temp_psi));
2183 	}
2184       else
2185 	{
2186 	  if(!detached_insert &&
2187 	     sub_cm->sttype[v_s] != E_st &&
2188 	     sub_cm->sttype[v_s] != B_st &&
2189 	     sub_cm->sttype[v_s] != S_st &&
2190 	     sub_cm->sttype[v_s] != EL_st)
2191 	    ESL_FAIL(eslFAIL, errbuf, "check_orig_psi_vs_sub_psi() state v_s:%d maps to nothing and its not E,B,S,EL\n", v_s);
2192 	  if(print_flag) printf("E B S or EL\n");
2193 	}
2194     }
2195 
2196   if(v_ct > 0) ESL_FAIL(eslFAIL, errbuf, "check_orig_psi_vs_sub_psi(): check failed");
2197   else
2198     if(print_flag) printf("v_ct is 0 with thresh: %f!\n", threshold);
2199   if(root_v_ct > 0)
2200     printf("ROOT v_ct is %d with thresh: %f!\n", root_v_ct, threshold);
2201 
2202   /* Cleanup and exit. */
2203   free(orig_psi);
2204   free(sub_psi);
2205 
2206   return eslOK;
2207 }
2208 
2209 /**************************************************************************
2210  * EPN 09.11.06
2211  * cm_trans_check()
2212  *
2213  * Return TRUE if there's a transition in the CM from state a to state b.
2214  *
2215  * Args:
2216  * CM_t  cm,
2217  * int   a;
2218  * int   b;
2219  * Returns: TRUE if b is a child of a (a->b exists)
2220  *          FALSE otherwise
2221  */
2222 int
cm_trans_check(CM_t * cm,int a,int b)2223 cm_trans_check(CM_t *cm, int a, int b)
2224 {
2225   /*printf("\t**in cm_trans_check a: %d | b: %d\n", a, b);*/
2226 
2227   if((a == -1 || b == -1) || (a > b))
2228     return FALSE;
2229 
2230   if((b - cm->cfirst[a]) < cm->cnum[a])
2231     { /*printf("returning TRUE") ;*/ return TRUE; }
2232 
2233   return FALSE;
2234 
2235 }
2236 
2237 /**************************************************************************
2238  * EPN 09.21.06
2239  * Function: cm2sub_cm_subtract_root_subpaths()
2240  *
2241  * Purpose:  When building a sub CM (sub_cm) from an original, template
2242  *           CM (orig_cm), there's special considerations that must be
2243  *           taken involving sub_cm nodes with 2 insert states where there
2244  *           isn't an exactly identical node in the original CM. The only
2245  *           node that potentially meets this criteria is the ROOT_nd because
2246  *           all MATP nodes in sub_cm must necessarily also exist in the
2247  *           orig_cm.
2248  *
2249  *           The problem is that we have overcounted certain subpaths when
2250  *           transitioning out of the sub_cm ROOT_S and ROOT_IL states.
2251  *
2252  *           For each sub_cm ROOT_IL state, there are up to 2 orig_cm inserts
2253  *           that map to it (orig_il1 and orig_il2), and analagously for sub_cm
2254  *           ROOT_IR (orig_ir1 and orig_ir2). Then there are up to 2 sub_cm states
2255  *           that map to each of the split set states in sub_cm node 1 (orig_ss1
2256  *           and orig_ss2). The subpaths that we have overcounted is dependent
2257  *           on the relationship between orig_il*, orig_ir*, and orig_ss*.
2258  *           There are six cases of this relationship:
2259  *
2260  *             cases 1A, 1B, 1C apply when il < ir.
2261  *             case 1A: il < ir < ss (this is correctly handled by cm2sub_cm_sum_subpaths())
2262  *             case 1B: il < ss < ir
2263  *             case 1C: ss < il < ir
2264  *
2265  *             cases 2A, 2B, 2C apply when ir < il.
2266  *             case 2A: ir < il < ss
2267  *             case 2B: ir < ss < il
2268  *             case 2C: ss < ir < il
2269  *
2270  *           These cases are not explicitly checked for in the code but were
2271  *           useful for determining the correct strategy to use here, and
2272  *           are mentioned in comments throughout the code.
2273  *
2274  *           This function calls a helper function for each pair of orig_il* and
2275  *           orig_ir* (up to 4 possible pairs), which actually subtracts
2276  *           the path based on the relationships of orig_il*, orig_ir* and
2277  *           orig_ss*.
2278  *
2279  *           Importantly, though there are potentially many orig_ss* states
2280  *           (up to 2 that map to each sub_cm node 1 split set state),
2281  *           they all must be close enough in state indices that we'll
2282  *           never have a case where for a specific il* and ir* we get more
2283  *           than 1 of the 6 possible cases for all orig_ss*. This is
2284  *           because either the orig_ss are in a sub_cm MATP node - in
2285  *           which case they MUST map to exactly 1 state each in the orig_cm
2286  *           (to the corresponding MATP node) and are contiguous state indices,
2287  *           or they are a MATL or MATR which either map to a corresponding
2288  *           MATL, MATR (in which case they're contiguous) or each sub_cm
2289  *           split state maps to exactly 2 states in the same orig_cm MATP
2290  *           node, (for example MATL_ML might map to MATP_MP and MATP_ML)
2291  *           which are all in the same node and thus have no insert states
2292  *           with a state index between the indices of the 2 states they
2293  *           map to (ouch).
2294  *
2295  *
2296  * Args:
2297  * CM_t *orig_cm     - the original, template CM
2298  * CM_t *sub_cm      - the sub CM
2299  * double *orig_psi  - for orig_cm: orig_psi[v] is the expected number of times state v is entered
2300  *                     in a CM parse
2301  * char ***tmap      - the hard-coded transition map
2302  * CMSubMap_t *submap- the map from the sub CM to the template CM
2303  * int print_flag    - TRUE to print useful debugging info
2304  *
2305  * Returns: VOID
2306  */
2307 static void
cm2sub_cm_subtract_root_subpaths(CM_t * orig_cm,CM_t * sub_cm,double * orig_psi,char *** tmap,CMSubMap_t * submap,int print_flag)2308 cm2sub_cm_subtract_root_subpaths(CM_t *orig_cm, CM_t *sub_cm, double *orig_psi, char ***tmap,
2309 				 CMSubMap_t *submap, int print_flag)
2310 
2311 {
2312   int sub_root_s;
2313   int sub_il;
2314   int sub_ir;
2315   int yoffset;
2316   int sub_y;
2317   int orig_y;
2318   int orig_il;
2319   int orig_ir;
2320   int orig_ss;
2321   int orig_ss1;
2322   int orig_ss2;
2323 
2324   sub_root_s = 0; /* sub_cm ROOT_S index */
2325   sub_il = 1; /* sub ROOT_IL index */
2326   sub_ir = 2; /* sub ROOT_IR index */
2327   orig_il = submap->s2o_smap[sub_il][0];
2328   orig_ir = submap->s2o_smap[sub_ir][0];
2329 
2330   if(print_flag) printf("\n\nin cm2sum_cm_subtract_root_subpaths_helper: orig_il: %d orig_ir: %d\n", orig_il, orig_ir);
2331 
2332   for(yoffset = 0; yoffset < sub_cm->cnum[0]; yoffset++)
2333     if(print_flag) printf("Before t[0][%d] = %f\n", yoffset, sub_cm->t[0][yoffset]);
2334 
2335   orig_ss = submap->s2o_smap[3][0]; /* orig_ss is the 1 (of possibly 2) orig_cm states that map to the first
2336 				     * state in sub_cm node 1 (sub_cm state 3)
2337 				     */
2338   /* Check to make sure that all the split states meet our guarantee (not necessary) */
2339   for(yoffset = 2; yoffset < sub_cm->cnum[0]; yoffset++) /* note we start at yoffset = 2
2340 							  * ROOT_S -> first state of next
2341 							  * node's split set. */
2342     {
2343       sub_y = yoffset + sub_cm->cfirst[0];
2344       orig_y = submap->s2o_smap[sub_y][0];
2345       if((orig_il > orig_ss && orig_il < orig_y) ||
2346 	 (orig_il < orig_ss && orig_il > orig_y) ||
2347 	 (orig_ir > orig_ss && orig_ir < orig_y) ||
2348 	 (orig_ir < orig_ss && orig_ir > orig_y))
2349 	cm_Fail("ERROR in cm2sub_cm_subtract_root_subpaths_helper() split set state guarantee violated!\n");
2350     }
2351 
2352 
2353   /* Check for which of the 6 cases we have (not actually necessary) */
2354   if((orig_il < orig_ir) && (orig_ir < orig_ss))
2355     if(print_flag) printf("ROOT NODE case 1A\n");
2356   if((orig_il < orig_ss) && (orig_ss < orig_ir))
2357     if(print_flag) printf("ROOT NODE case 1B\n");
2358   if((orig_ss < orig_il) && (orig_il < orig_ir))
2359     if(print_flag) printf("ROOT NODE case 1C\n");
2360 
2361   if((orig_ir < orig_il) && (orig_il < orig_ss))
2362     if(print_flag) printf("ROOT NODE case 2A\n");
2363   if((orig_ir < orig_ss) && (orig_ss < orig_il))
2364     if(print_flag) printf("ROOT NODE case 2B\n");
2365   if((orig_ss < orig_ir) && (orig_ir < orig_il))
2366     if(print_flag) printf("ROOT NODE case 2C\n");
2367 
2368 
2369   /* First adjust counts out of sub_cm ROOT_S */
2370 
2371   /* if orig_ir < orig_il, we've counted paths from
2372    * ir -> il correctly for ROOT_IL -> ROOT_IR and
2373    *        incorrectly for ROOT_S  -> ROOT_IR
2374    */
2375   if(orig_ir < orig_il)
2376     {
2377       if(print_flag) printf("0 sub from S->IR, IR -> IL\n"); /* cases 2A, 2B, 2C */
2378       sub_cm->t[sub_root_s][1] -= orig_psi[orig_ir] *
2379 	cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir, orig_il, 0, tmap, orig_psi);
2380     }
2381 
2382   /* All the remaining paths to be subtracted involve the split set state of
2383    * node 1. */
2384   for(yoffset = 0; yoffset < sub_cm->cnum[0]; yoffset++)
2385     {
2386       sub_y = sub_cm->cfirst[0] + yoffset;
2387       orig_ss1 = submap->s2o_smap[sub_y][0];
2388       orig_ss2 = submap->s2o_smap[sub_y][1];
2389       if(sub_cm->ndidx[sub_y] != 0)
2390 	{
2391 	  /* Adjust counts out of ROOT_IL if necessary */
2392 
2393 	  /* if orig_ir < orig_ss < orig_il, we've counted paths from
2394 	   * ir -> ss -> il correctly for ROOT_IL -> ROOT_IR and
2395 	   *              incorrectly for ROOT_IL -> ROOT_SS
2396 	   */
2397 	  if(orig_ir < orig_ss && orig_ss < orig_il) /* case 2B only */
2398 	    {
2399 	      if(print_flag) printf("3 (2b) sub from IL, IR -> SS -> IL\n");
2400 
2401 	      /* subtract paths from orig_ir -> orig_ss1 -> orig_il */
2402 	      sub_cm->t[sub_il][yoffset] -= orig_psi[orig_ir] *
2403 		cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir, orig_ss1,
2404 				       sub_il, tmap, orig_psi) *
2405 		cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ss1, orig_il,
2406 				       sub_il, tmap, orig_psi);
2407 	      if(orig_ss2 != -1)
2408 		/* subtract paths from orig_ir -> orig_ss2 -> orig_il */
2409 		sub_cm->t[1][yoffset] -= orig_psi[orig_ir] *
2410 		  cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir, orig_ss2,
2411 					 sub_il, tmap, orig_psi) *
2412 		  cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ss2, orig_il,
2413 					 sub_il, tmap, orig_psi);
2414 
2415 	    }
2416 
2417 	  /* if orig_ss < orig_il < orig_ir, we've counted paths from
2418 	   * ss -> il -> ir correctly for ROOT_IR -> ROOT_SS and
2419 	   *              incorrectly for ROOT_IL -> ROOT_SS
2420 	   */
2421 	  if(orig_ss < orig_il && orig_il < orig_ir) /* case 1C only */
2422 	    {
2423 	      if(print_flag) printf("4 (1c) sub from IL, SS -> IL -> IR\n");
2424 
2425 	      /* subtract paths from orig_ss1 -> orig_il (add self insert) -> orig_ir */
2426 	      sub_cm->t[sub_il][yoffset] -= orig_psi[orig_ss1] *
2427 		cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ss1, orig_il,
2428 				       sub_il, tmap, orig_psi) *
2429 		(1. + (orig_cm->t[orig_il][0] / (1 - orig_cm->t[orig_il][0]))) *
2430 		cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_il, orig_ir,
2431 				       sub_il, tmap, orig_psi);
2432 
2433 	      if(orig_ss2 != -1)
2434 		/* subtract paths from orig_ss2 -> orig_il (add self insert) -> orig_ir */
2435 		sub_cm->t[sub_il][yoffset] -= orig_psi[orig_ss2] *
2436 		  cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ss2, orig_il,
2437 					 sub_il, tmap, orig_psi) *
2438 		  (1. + (orig_cm->t[orig_il][0] / (1 - orig_cm->t[orig_il][0]))) *
2439 		  cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_il, orig_ir,
2440 					 sub_il, tmap, orig_psi);
2441 	    }
2442 
2443 	  /* if orig_ir < orig_il < orig_ss, we've counted paths from
2444 	   * ir -> il -> ss correctly for ROOT_IR -> ROOT_SS and
2445 	   *              incorrectly for ROOT_IL -> ROOT_SS
2446 	   */
2447 	  if(orig_ir < orig_il && orig_il < orig_ss) /* case 2A only */
2448 	    {
2449 	      if(print_flag) printf("5 (2a) sub from IL, IR -> IL -> SS\n");
2450 
2451 	      /* subtract paths from orig_ir -> orig_il (add self insert) -> orig_ss1 */
2452 	      sub_cm->t[sub_il][yoffset] -= orig_psi[orig_ir] *
2453 		cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir, orig_il,
2454 				       sub_il, tmap, orig_psi) *
2455 		(1. + (orig_cm->t[orig_il][0] / (1 - orig_cm->t[orig_il][0]))) *
2456 		cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_il, orig_ss1,
2457 				       sub_il, tmap, orig_psi);
2458 
2459 	      if(orig_ss2 != -1)
2460 		/* subtract paths from orig_ir -> orig_il (add self insert) -> orig_ss2 */
2461 		sub_cm->t[sub_il][yoffset] -= orig_psi[orig_ir] *
2462 		  cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ir, orig_il,
2463 					 sub_il, tmap, orig_psi) *
2464 		  (1. + (orig_cm->t[orig_il][0] / (1 - orig_cm->t[orig_il][0]))) *
2465 		  cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_il, orig_ss2,
2466 					 sub_il, tmap, orig_psi);
2467 
2468 	    }
2469 
2470 	  /* if orig_il < orig_ss < orig_ir, we've counted paths from
2471 	   * il -> ss -> ir correctly for ROOT_IL -> ROOT_IR and
2472 	   *              incorrectly for ROOT_IL -> ROOT_SS
2473 	   */
2474 	  if(orig_il < orig_ss && orig_ss < orig_ir) /* case 1B only */
2475 	    {
2476 	      if(print_flag) printf("6 (1b) sub from IL, IL -> SS -> IR\n");
2477 	      if(print_flag) printf("1B before sub 1: sub_cm->t[sub_il:%d][yoffset:%d]: %f\n", sub_il, yoffset, sub_cm->t[sub_il][yoffset]);
2478 	      /* subtract paths from orig_il -> orig_ss1 -> orig_ir */
2479 	      sub_cm->t[sub_il][yoffset] -= orig_psi[orig_il] *
2480 		cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_il, orig_ss1,
2481 				       sub_il, tmap, orig_psi) *
2482 		cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ss1, orig_ir,
2483 				       sub_il, tmap, orig_psi);
2484 
2485 	      if(print_flag) printf("1B before sub 2: sub_cm->t[sub_il:%d][yoffset:%d]: %f\n", sub_il, yoffset, sub_cm->t[sub_il][yoffset]);
2486 	      if(orig_ss2 != -1)
2487 		/* subtract paths from orig_il -> orig_ss2 -> orig_ir */
2488 		sub_cm->t[sub_il][yoffset] -= orig_psi[orig_il] *
2489 		  cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_il, orig_ss2,
2490 					 sub_il, tmap, orig_psi) *
2491 		  cm2sub_cm_sum_subpaths(orig_cm, sub_cm, submap, orig_ss2, orig_ir,
2492 					 sub_il, tmap, orig_psi);
2493 	      if(print_flag) printf("1B after sub 2: sub_cm->t[sub_il:%d][yoffset:%d]: %f\n", sub_il, yoffset, sub_cm->t[sub_il][yoffset]);
2494 
2495 	    }
2496 	}
2497     }
2498   for(yoffset = 0; yoffset < sub_cm->cnum[0]; yoffset++)
2499     if(print_flag) printf("After t[0][%d] = %f\n", yoffset, sub_cm->t[0][yoffset]);
2500 
2501   return;
2502 }
2503 
2504 
2505 /**************************************************************************
2506  * EPN 09.11.06
2507  * cm2sub_cm_check_id_next_node()
2508  *
2509  * It's known that sub_nd and orig_nd are identical nodes, in that they
2510  * are of the same type and model the same column of the seed alignment.
2511  * In this function, we check if the next node of the sub_cm and orig_nd
2512  * have the same relationship and if so, we update information in
2513  * the submap->s2o_id array, setting the sub_cm states of this node to
2514  * TRUE.
2515  *
2516  * Returns: void
2517  *
2518  * Args:
2519  * CM_t  orig_cm
2520  * CM_t  sub_cm
2521  * int   orig_nd
2522  * int   sub_nd
2523  * CMSubMap_t *submap
2524  * int   sub_start
2525  */
2526 
2527 static int
cm2sub_cm_check_id_next_node(CM_t * orig_cm,CM_t * sub_cm,int orig_nd,int sub_nd,CMSubMap_t * submap,CP9Map_t * orig_cp9map,CP9Map_t * sub_cp9map,int print_flag)2528 cm2sub_cm_check_id_next_node(CM_t *orig_cm, CM_t *sub_cm, int orig_nd, int sub_nd,
2529 			     CMSubMap_t *submap, CP9Map_t *orig_cp9map,
2530 			     CP9Map_t *sub_cp9map, int print_flag)
2531 {
2532   int v_s;
2533   int left_check, right_check;
2534   left_check = FALSE;
2535   right_check = FALSE;
2536 
2537   if((orig_nd+1) > (orig_cm->nodes-1))
2538     return FALSE;
2539   if((sub_nd+1)  > (sub_cm->nodes-1))
2540     return FALSE;
2541   if(orig_cm->ndtype[orig_nd] != sub_cm->ndtype[sub_nd])
2542     return FALSE;
2543   if(orig_cm->ndtype[orig_nd+1] != sub_cm->ndtype[sub_nd+1])
2544     return FALSE;
2545 
2546   if(orig_cp9map->nd2lpos[orig_nd+1] == -1 && sub_cp9map->nd2lpos[sub_nd+1] == -1)
2547     left_check = TRUE;
2548   if(orig_cp9map->nd2lpos[orig_nd+1] == (sub_cp9map->nd2lpos[sub_nd+1] + submap->spos -1))
2549     left_check = TRUE;
2550   if(orig_cp9map->nd2rpos[orig_nd+1] == -1 && sub_cp9map->nd2rpos[sub_nd+1] == -1)
2551     right_check = TRUE;
2552   if(orig_cp9map->nd2rpos[orig_nd+1] == (sub_cp9map->nd2rpos[sub_nd+1] + submap->spos -1))
2553     right_check = TRUE;
2554 
2555   if(left_check && right_check)
2556     {
2557       v_s = sub_cm->nodemap[sub_nd];
2558       while(sub_cm->ndidx[v_s] == sub_nd)
2559 	{
2560 	  if(print_flag) printf("setting submap->s2o_id[v_s:%d] to TRUE\n", v_s);
2561 	  if(sub_cm->sttype[v_s+1] != E_st) submap->s2o_id[v_s] = TRUE; /* if v+1 is an E_st, it's a detached insert don't set s2o_id[v] to TRUE */
2562 	  v_s++;
2563 	}
2564       return TRUE;
2565     }
2566   return FALSE;
2567 }
2568 
2569 /**************************************************************************
2570  * EPN 10.05.06
2571  * cm2sub_cm_find_impossible_misc_cases
2572  *
2573  * For certain situations, the conversion of an orig_cm to a sub_cm loses
2574  * some information that makes it impossible for a CP9 trained from the sub_cm
2575  * to exactly match a CP9 trained from the orig_cm for the corresponding
2576  * columns. One case where it is impossible involves start states as follows:
2577  *
2578  * if for any k k=spos..epos-1
2579  * X >= 0 start states exists in the orig_cm in a node between:
2580  *          orig_cp9map->pos2nd[k] -> orig_cp9map->pos2nd[k+1]
2581  * AND Y >= 1 start states exist in the sub_cm in a node between:
2582  *          sub_cp9map->pos2nd[k-spos+1] -> sub_cp9map->pos2nd[k-spos+1+1]
2583  * where X != Y
2584  *
2585  * AND further one or both of the two nodes in the sub_cm (sub_cp9map->pos2nd[k-spos+1]
2586  * OR sub_cp9map->pos2nd[k-spos+1+1] must be a MATP.
2587  *
2588  * This is because for the sub_cm paths that would go from CP9 node k to k+1
2589  * were forced to go through a start state where as for the orig_cm there
2590  * was not a requirement to go through a start. Therefore when the
2591  * sub_cm was constructed it lost some information about the original
2592  * transitions.
2593  *
2594  * Also, a special situation of this case occurs with transitions out of
2595  * CP9 node k=spos-1 to k=spos, and out of CP9 node k=epos to k=epos+1.
2596  * The sub_cm node that models columns spos-1 and epos-1 is the ROOT node,
2597  * which only really models inserts in those columns. It turns out that
2598  * the transition distributions will nearly always be screwy out of these
2599  * two nodes, except in the case when the sub_cm ROOT_IL and ROOT_IR map
2600  * to an IL and IR state respectivley in the original *within the same
2601  * orig_cm node*. In which case the transitions out of node 0 will be
2602  * identical to those out of node spos-1 in the orig_cm.
2603  * There are a few other rare cases where the transitions will be
2604  * identical, but an exhaustive understanding of them eludes me.
2605  * (see ~nawrockie/notebook/6_0725_inf_sub_cm/00LOG for more).
2606  *
2607  * Returns: void
2608  *
2609  * Args:
2610  * CM_t  orig_cm
2611  * CM_t  sub_cm
2612  * int *orig_cp9map->pos2nd
2613  * int *sub_cp9map->pos2nd
2614  * int *imp_cc
2615  * int spos;
2616  * int epos;
2617  */
2618 
2619 static void
cm2sub_cm_find_impossible_misc_cases(CM_t * orig_cm,CM_t * sub_cm,CMSubMap_t * submap,CMSubInfo_t * subinfo,CP9Map_t * orig_cp9map,CP9Map_t * sub_cp9map,int print_flag)2620 cm2sub_cm_find_impossible_misc_cases(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, CMSubInfo_t *subinfo,
2621 				     CP9Map_t *orig_cp9map, CP9Map_t *sub_cp9map, int print_flag)
2622 
2623 {
2624   int status;
2625   int k;
2626   int sub_starts;
2627   int orig_starts;
2628   int orig_nd1;
2629   int orig_nd2;
2630   int sub_nd1;
2631   int sub_nd2;
2632   int temp;
2633   int nd;
2634   int orig_special_matps;
2635   int sub_special_matl;
2636   int sub_both_matps;
2637   CMEmitMap_t *orig_emap;         /* consensus emit map for the original, template CM */
2638   CMEmitMap_t *sub_emap;          /* consensus emit map for the sub CM */
2639 
2640   int orig_il1;
2641   int orig_il2;
2642   int orig_ir1;
2643   int orig_ir2;
2644 
2645   char **nodetypes;
2646   ESL_ALLOC(nodetypes, sizeof(char *) * 8);
2647   nodetypes[0] = "BIF";
2648   nodetypes[1] = "MATP";
2649   nodetypes[2] = "MATL";
2650   nodetypes[3] = "MATR";
2651   nodetypes[4] = "BEGL";
2652   nodetypes[5] = "BEGR";
2653   nodetypes[6] = "ROOT";
2654   nodetypes[7] = "END";
2655 
2656   if(print_flag) printf("in cm2sub_find_impossible_misc_cases()\n");
2657 
2658   orig_emap = CreateEmitMap(orig_cm);
2659   sub_emap = CreateEmitMap(sub_cm);
2660 
2661   /* We know that the sub CM node 0 and node submap->sub_clen are nearly always going to
2662    * get the transition distributions wrong (see comments above in the function
2663    * explanation). There are a couple of cases they should get right
2664    * (see ~nawrockie/notebook/6_0725_inf_sub_cm/00LOG for details), we
2665    * check for these here:
2666    */
2667   subinfo->imp_cc[0] = 1;
2668   subinfo->imp_cc[submap->sub_clen] = 1;
2669 
2670   /* check if the orig_cm states that model sub_cm ROOT_IL and ROOT_IR are
2671    * from the same node. */
2672   orig_il1 = submap->s2o_smap[1][0]; /* 1st of up to 2 states that maps to sub_cm's ROOT_IL */
2673   orig_il2 = submap->s2o_smap[1][1]; /* 2nd state that maps to sub_cm's ROOT_IL or -1 if only 1 maps*/
2674   orig_ir1 = submap->s2o_smap[2][0]; /* 1st of up to 2 states that maps to sub_cm's ROOT_IR */
2675   orig_ir2 = submap->s2o_smap[2][1]; /* 2nd state that maps to sub_cm's ROOT_IR or -1 if only 1 maps*/
2676 
2677   /* We ASSUME that ambiguities have been removed, i.e. if two insert states map to either ROOT_IL
2678    * or ROOT_IR, one of them has been detached. We exploit this knowledge.
2679    */
2680   if(orig_il2 != -1)
2681     {
2682       if(orig_cm->sttype[orig_il1+1] == E_st)
2683 	orig_il1 = orig_il2; /* orig_il1 was detached */
2684       else if(orig_cm->sttype[orig_il2+1] == E_st)
2685 	{
2686 	  /* do nothing */
2687 	}
2688       else
2689 	cm_Fail("ERROR, can't determine which state was detached\n");
2690     }
2691   if(orig_ir2 != -1)
2692     {
2693       if(orig_cm->sttype[orig_ir1+1] == E_st)
2694 	orig_ir1 = orig_ir2; /* orig_ir1 was detached */
2695       else if(orig_cm->sttype[orig_ir2+1] == E_st)
2696 	{
2697 	  /* do nothing */
2698 	}
2699       else
2700 	cm_Fail("ERROR, can't determine which state was detached\n");
2701     }
2702 
2703   /* Now orig_il1 and orig_ir1 map to the ONLY insert states that map to sub_cm
2704    * ROOT_IL and ROOT_IR respectively. */
2705   if(orig_cm->ndidx[orig_il1] == orig_cm->ndidx[orig_ir1])
2706     {
2707       /* we can get the distro out of 0 right in this case */
2708       subinfo->imp_cc[0] = FALSE;
2709       if(submap->spos == submap->sstruct && submap->epos == submap->estruct)
2710 	subinfo->imp_cc[submap->sub_clen] = FALSE;
2711     }
2712 
2713   for(k = 1; k < submap->sub_clen; k++)
2714     {
2715       if(print_flag) printf("k: %d\n", k);
2716       if((k+submap->spos-1) == 0)
2717 	orig_nd1 = 0;
2718       else
2719 	orig_nd1 = orig_cp9map->pos2nd[k+submap->spos-1];
2720       orig_nd2 = orig_cp9map->pos2nd[k+submap->spos-1+1];
2721 
2722       orig_special_matps = FALSE;
2723       if((orig_cm->ndtype[orig_nd1] == MATP_nd &&
2724 	  orig_cm->ndtype[orig_nd2] == MATP_nd) &&
2725 	 (orig_cp9map->nd2rpos[orig_nd1] == (k+submap->spos-1) &&
2726 	  orig_cp9map->nd2lpos[orig_nd2]  == (k+submap->spos)))
2727 	{
2728 	  if((orig_cp9map->nd2lpos[orig_nd1] < submap->spos) ||
2729 	     (orig_cp9map->nd2rpos[orig_nd2] > submap->epos))
2730 	    {
2731 	      /* This is a special case */
2732 	      orig_special_matps = TRUE;
2733 	    }
2734 	}
2735 
2736       if(orig_nd2 < orig_nd1)
2737 	{
2738 	  temp = orig_nd1;
2739 	  orig_nd1 = orig_nd2;
2740 	  orig_nd2 = temp;
2741 	}
2742       orig_starts = 0;
2743 
2744       if(print_flag) printf("orig_nd1: %d | orig_nd2: %d\n", orig_nd1, orig_nd2);
2745       for(nd = orig_nd1; nd <= orig_nd2; nd++)
2746 	{
2747 	  if(print_flag) printf("orig_cm->ndtype[%d]: %s L: %4d R: %4d (submap->spos: %4d) (submap->epos: %4d)\n", nd, nodetypes[(int) orig_cm->ndtype[nd]], orig_emap->lpos[nd], orig_emap->rpos[nd], submap->spos, submap->epos);
2748 	  if(orig_cm->ndtype[nd] == BEGL_nd ||
2749 	     orig_cm->ndtype[nd] == BEGR_nd)
2750 	    { orig_starts++; }
2751 	}
2752       sub_nd1 = sub_cp9map->pos2nd[k];
2753       sub_nd2 = sub_cp9map->pos2nd[k+1];
2754 
2755       sub_special_matl = FALSE;
2756       if(sub_cm->ndtype[sub_nd1] == MATL_nd)
2757 	sub_special_matl = TRUE;
2758 
2759       if(sub_nd2 < sub_nd1)
2760 	{
2761 	  temp = sub_nd1;
2762 	  sub_nd1 = sub_nd2;
2763 	  sub_nd2 = temp;
2764 	}
2765 
2766       sub_both_matps = FALSE;
2767       if((sub_cm->ndtype[sub_nd1] == MATP_nd && sub_cm->ndtype[sub_nd2] == MATP_nd) &&
2768 	 (!(sub_cp9map->nd2lpos[sub_nd1] < sub_cp9map->nd2lpos[sub_nd2] && sub_cp9map->nd2rpos[sub_nd1] > sub_cp9map->nd2rpos[sub_nd2])))
2769 	sub_both_matps = TRUE;
2770 
2771       sub_starts = 0;
2772       if(print_flag) printf("sub_nd1: %d | sub_nd2: %d\n", sub_nd1, sub_nd2);
2773       for(nd = sub_nd1; nd <= sub_nd2; nd++)
2774 	{
2775 	  if(print_flag) printf("sub_cm->ndtype[%d]: %s L: %4d R: %4d (submap->spos: %4d) (submap->epos: %4d)\n", nd, nodetypes[(int) sub_cm->ndtype[nd]], (sub_emap->lpos[nd]+submap->spos-1), (sub_emap->rpos[nd]+submap->spos-1), submap->spos, submap->epos);
2776 	  if(sub_cm->ndtype[nd] == BEGL_nd || sub_cm->ndtype[nd] == BEGR_nd)
2777 	    {
2778 	      sub_starts++;
2779 	      if(sub_cm->ndtype[sub_nd1] != MATP_nd && sub_cm->ndtype[sub_nd2] != MATP_nd)
2780 		cm_Fail("ERROR in cm2sub_cm_find_impossible_misc_cases() found impossible case not involving any MATP in the sub_cm, k: %d submap->epos-submap->spos+1: %d\n", k, submap->sub_clen);
2781 	      if(orig_cm->ndtype[orig_nd1] != MATP_nd && orig_cm->ndtype[orig_nd2] != MATP_nd)
2782 		cm_Fail("ERROR in cm2sub_cm_find_impossible_misc_cases() found impossible case not involving any MATP in the orig_cm\n, k: %d | submap->epos-submap->spos+1: %d", k, submap->sub_clen);
2783 	    }
2784 	}
2785 
2786       if(print_flag) printf("sub_starts: %d | orig_starts: %d\n", sub_starts, orig_starts);
2787 
2788       if(sub_starts > 0 && orig_starts == 0)
2789 	subinfo->imp_cc[k] = 3;
2790       else if((sub_starts > 0 && sub_starts > orig_starts) &&
2791 	      sub_both_matps == TRUE)
2792 	subinfo->imp_cc[k] = 4;
2793       else if(sub_starts > 0 && orig_special_matps && sub_special_matl)
2794 	subinfo->imp_cc[k] = 5;
2795       else if(sub_starts == 0 && orig_starts > 0 && orig_special_matps && sub_special_matl)
2796 	subinfo->imp_cc[k] = 6;
2797 
2798     }
2799   FreeEmitMap(orig_emap);
2800   FreeEmitMap(sub_emap);
2801   free(nodetypes);
2802   return;
2803 
2804  ERROR:
2805   cm_Fail("Memory allocation error.");
2806 }
2807 
2808 /**************************************************************************
2809  * EPN 10.18.06
2810  * cm2sub_cm_find_impossible_matr_cases
2811  *
2812  * For certain situations, the conversion of an orig_cm to a sub_cm loses
2813  * some information that makes it impossible for a CP9 trained from the sub_cm
2814  * to exactly match a CP9 trained from the orig_cm for the corresponding
2815  * columns. One case where it is impossible involves MATR nodes as follows:
2816  *
2817  * if for any k k=spos..epos-1:
2818  * orig_cp9map->pos2nd[k] + 1  = orig_cp9map->pos2nd[k+1] AND  (NOT TRUE!)
2819  * sub_cp9map->pos2nd[k] + 1 != sub_cp9map->pos2nd[k+1] AND
2820  * orig_cp9map->pos2nd[k]     == sub_cp9map->pos2nd[k]   == MATL AND
2821  * orig_cp9map->pos2nd[k+1]   == sub_cp9map->pos2nd[k+1] == MATP AND
2822  * all nodes between sub_cp9map->pos2nd[k] .. cc_node_map[k+1] are BIF, BEG*, or MATR nodes AND
2823  * the orig_cm nodes that the stretch of MATR nodes map to are all MATP nodes
2824  *     for which the other half of the nodes map to positions outside the sub_cm.
2825  *
2826  * Here we explicitly check for these situations and set imp_cc[k] = TRUE for
2827  * any k that satisfy all the criteria.
2828  * Returns: void
2829  *
2830  * Args:
2831  * CM_t  orig_cm
2832  * CM_t  sub_cm
2833  * int *orig_cp9map->pos2nd
2834  * int *sub_cp9map->pos2nd
2835  * int *imp_cc
2836  * int spos;
2837  * int epos;
2838  */
2839 
2840 static void
cm2sub_cm_find_impossible_matr_cases(CM_t * orig_cm,CM_t * sub_cm,CMSubMap_t * submap,CMSubInfo_t * subinfo,CP9Map_t * orig_cp9map,CP9Map_t * sub_cp9map,int print_flag)2841 cm2sub_cm_find_impossible_matr_cases(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, CMSubInfo_t *subinfo,
2842 				     CP9Map_t *orig_cp9map, CP9Map_t *sub_cp9map, int print_flag)
2843 {
2844   int status;
2845   int sub_k;
2846   int orig_k;
2847   int orig_nd;
2848   int sub_nd;
2849   int next_sub_nd;
2850   int next_orig_nd;
2851   int sub_nd1;
2852   int sub_nd2;
2853   int orig_nd1;
2854   int orig_nd2;
2855   int tmp_k;
2856   int tmp_sub_nd;
2857   int tmp_orig_nd;
2858   int correct_node_types_flag;
2859   int orig_matr_stretch_flag;
2860   int sub_matr_stretch_flag;
2861 
2862   char **nodetypes;
2863   ESL_ALLOC(nodetypes, sizeof(char *) * 8);
2864   nodetypes[0] = "BIF";
2865   nodetypes[1] = "MATP";
2866   nodetypes[2] = "MATL";
2867   nodetypes[3] = "MATR";
2868   nodetypes[4] = "BEGL";
2869   nodetypes[5] = "BEGR";
2870   nodetypes[6] = "ROOT";
2871   nodetypes[7] = "END";
2872 
2873   for(sub_k = 1; sub_k < submap->sub_clen; sub_k++)
2874     {
2875       orig_k = sub_k + submap->spos - 1;
2876       if(print_flag) printf("CASE 2 k: %d\n", sub_k);
2877       /* Strategy: we check each of our criteria independently */
2878 
2879       /* Find out what the node that models the next column is */
2880       sub_nd = sub_cp9map->pos2nd[sub_k];
2881       next_sub_nd = sub_cp9map->pos2nd[sub_k+1];
2882       orig_nd = orig_cp9map->pos2nd[orig_k];
2883       next_orig_nd = orig_cp9map->pos2nd[orig_k+1];
2884       if(sub_nd < next_sub_nd)
2885 	{
2886 	  sub_nd1 = sub_nd;
2887 	  sub_nd2 = next_sub_nd;
2888 	}
2889       else
2890 	{
2891 	  sub_nd1 = next_sub_nd;
2892 	  sub_nd2 = sub_nd;
2893 	}
2894       if(orig_nd < next_orig_nd)
2895 	{
2896 	  orig_nd1 = orig_nd;
2897 	  orig_nd2 = next_orig_nd;
2898 	}
2899       else
2900 	{
2901 	  orig_nd1 = next_orig_nd;
2902 	  orig_nd2 = orig_nd;
2903 	}
2904 
2905       if(print_flag) printf("CASE 2 k: %d  sub_nd: %d  next_sub_nd: %d\n", sub_k, sub_nd, next_sub_nd);
2906       if(print_flag) printf("CASE 2 k: %d orig_nd: %d next_orig_nd: %d\n", sub_k, orig_nd, next_orig_nd);
2907 
2908       /* Make sure that next_sub_nd is and sub_nd are not consecutive */
2909       if(sub_nd1 != sub_nd2-1)
2910 	{
2911 	  /* Check if min(sub_nd, next_sub_nd) and min(orig_nd, next_orig_nd) are both MATLs and
2912 	     if max(sub_nd, next_sub_nd) and max(orig_nd, next_orig_nd) are both MATPs */
2913 	  if((orig_cm->ndtype[orig_nd1] == MATL_nd && sub_cm->ndtype[sub_nd1] == MATL_nd) &&
2914 	     (orig_cm->ndtype[orig_nd2] == MATP_nd && sub_cm->ndtype[sub_nd2] == MATP_nd))
2915 	    correct_node_types_flag = TRUE;
2916 	  else
2917 	    correct_node_types_flag = FALSE;
2918 	  if(print_flag) printf("CASE 2 k: %d correct_node_types_flag: %d\n", sub_k, correct_node_types_flag);
2919 
2920 	  if(correct_node_types_flag == TRUE)
2921 	    {
2922 	      /* Determine if the next_orig_nd is the next left emitting
2923 	       * node of the orig_cm after orig_nd */
2924 	      orig_matr_stretch_flag = TRUE;
2925 	      for(tmp_orig_nd = orig_nd1+1; tmp_orig_nd < orig_nd2; tmp_orig_nd++)
2926 		{
2927 		  if(orig_cm->ndtype[tmp_orig_nd] != MATR_nd)
2928 		    {
2929 		      orig_matr_stretch_flag = FALSE;
2930 		      break;
2931 		    }
2932 		}
2933 	      if(print_flag) printf("CASE 2 k: %d orig_matr_stretch_flag: %d\n", sub_k, orig_matr_stretch_flag);
2934 	      if(orig_matr_stretch_flag == TRUE)
2935 		{
2936 		  /* Check if all the sub_cm nodes between sub_nd and
2937 		   * next_sub_nd are MATRs. */
2938 		  sub_matr_stretch_flag = TRUE;
2939 		  for(tmp_sub_nd = sub_nd1+1; tmp_sub_nd < sub_nd2; tmp_sub_nd++)
2940 		    if(sub_cm->ndtype[tmp_sub_nd] != MATR_nd)
2941 		      {
2942 			sub_matr_stretch_flag = FALSE;
2943 			break;
2944 		      }
2945 		  if(print_flag) printf("CASE 2 k: %d sub_matr_stretch_flag: %d\n", sub_k, sub_matr_stretch_flag);
2946 		  if(sub_matr_stretch_flag == TRUE)
2947 		    {
2948 		      /* This should be a MATR impossible case,
2949 		       * Check all the criteria we *think* are always true in this situation,
2950 		       * cm_Fail if what we think is wrong.
2951 		       *
2952 		       * The orig_cm nodes that map to the same consensus columns
2953 		       * as the MATR nodes in the sub_cm must either be:
2954 
2955 		       * as the MATR nodes in the sub_cm must either be:
2956 		       *
2957 		       * 1 orig_cm MATPs or MATRs for which the RIGHT half maps to the same column
2958 		       *   modelled by the corresponding sub_cm MATR, and in the case of the
2959 		       *   MATPs the LEFT half maps to consensus columns before spos.
2960 		       * 2 orig_cm nodes that map to the same consensus columns
2961 		       *   as the MATR nodes in the sub_cm are ALL either orig_cm MATPs
2962 		       *   or MATLs for which the LEFT half maps to the same
2963 		       *   column modelled by the corresponding sub_cm MATR, and in the
2964 		       *   case of the MATPs, the RIGHT half maps to consensus columns
2965 		       *   after epos.
2966 		       *
2967 		       * Usually the entire set of orig_cm nodes that map to the sub_cm
2968 		       * MATRs are either one or the other type, but I've seen rare cases
2969 		       * where there's a stretch of one type, and then a stretch of the
2970 		       * other.
2971 		       */
2972 		      tmp_sub_nd = sub_cp9map->pos2nd[sub_k] + 1;
2973 		      tmp_k       = sub_cp9map->nd2rpos[tmp_sub_nd] + submap->spos - 1;
2974 		      tmp_orig_nd = orig_cp9map->pos2nd[tmp_k];
2975 
2976 		      for(tmp_sub_nd = sub_nd1+1; tmp_sub_nd < sub_nd2; tmp_sub_nd++)
2977 			{
2978 			  tmp_k       = sub_cp9map->nd2rpos[tmp_sub_nd] + submap->spos - 1;
2979 			  tmp_orig_nd = orig_cp9map->pos2nd[tmp_k];
2980 
2981 			  if(print_flag) printf("10.18.06: %s %3d %3d | %s %3d %3d (xcc: %3d)\n", nodetypes[(int) sub_cm->ndtype[tmp_sub_nd]], tmp_sub_nd, (sub_cp9map->nd2rpos[tmp_sub_nd]+submap->spos-1), nodetypes[(int) orig_cm->ndtype[tmp_orig_nd]], orig_cp9map->nd2lpos[tmp_orig_nd], orig_cp9map->nd2rpos[tmp_orig_nd], tmp_k);
2982 			  if(orig_cp9map->nd2rpos[tmp_orig_nd] == (sub_cp9map->nd2rpos[tmp_sub_nd]+submap->spos-1)) /* Case 1 above */
2983 			    {
2984 			      if(orig_cm->ndtype[tmp_orig_nd] != MATP_nd && orig_cm->ndtype[tmp_orig_nd] != MATR_nd)
2985 				cm_Fail("ERROR 2 in cm2sub_cm_find_impossible_matr_cases() found impossible MATR case that can't be classified as case 1 or case 2, k: %d | submap->spos: %d submap->epos: %d", sub_k, submap->spos, submap->epos);
2986 			      if(orig_cm->ndtype[tmp_orig_nd] == MATP_nd && orig_cp9map->nd2lpos[tmp_orig_nd] >= submap->spos)
2987 				cm_Fail("ERROR 3 in cm2sub_cm_find_impossible_matr_cases() found impossible MATR case that can't be classified as case 1 or case 2, k: %d | submap->spos: %d submap->epos: %d", sub_k, submap->spos, submap->epos);
2988 			    }
2989 			  else if(orig_cp9map->nd2lpos[tmp_orig_nd] == (sub_cp9map->nd2rpos[tmp_sub_nd]+submap->spos-1)) /* Case 2 above */
2990 			    {
2991 			      if(orig_cm->ndtype[tmp_orig_nd] != MATP_nd && orig_cm->ndtype[tmp_orig_nd] != MATL_nd)
2992 				cm_Fail("ERROR 4 in cm2sub_cm_find_impossible_matr_cases() found impossible MATR case that can't be classified as case 1 or case 2, k: %d | submap->spos: %d submap->epos: %d", sub_k, submap->spos, submap->epos);
2993 			      if(orig_cm->ndtype[tmp_orig_nd] == MATP_nd && orig_cp9map->nd2rpos[tmp_orig_nd] <= submap->epos)
2994 				cm_Fail("ERROR 5 in cm2sub_cm_find_impossible_matr_cases() found impossible MATR case that can't be classified as case 1 or case 2, k: %d | submap->spos: %d submap->epos: %d | orig_cp9map->nd2rpos[%d]: %d", sub_k, submap->spos, submap->epos, tmp_orig_nd, orig_cp9map->nd2rpos[tmp_orig_nd]);
2995 			    }
2996 			}
2997 		      /* if we get here, we've satisfied all of our criteria */
2998 		      subinfo->imp_cc[sub_k] = 2;
2999 		    }
3000 		}
3001 	    }
3002 	}
3003     }
3004   free(nodetypes);
3005   return;
3006 
3007  ERROR:
3008   cm_Fail("Memory allocation error.");
3009 }
3010 
3011 
3012 
3013 /**************************************************************************
3014  * EPN 10.16.06
3015  * Function: check_sub_cm()
3016  *
3017  * Purpose:  Given a CM and a sub CM that is supposed to mirror
3018  *           the CM as closely as possible between two given consensus
3019  *           columns (spos and epos), check that the sub_cm was correctly
3020  *           constructed.
3021  *
3022  *	     1. Build a CP9 HMM (cp9_1) from the sub_cm.
3023  *	     2. Build a CP9 HMM (cp9_2) from the full cm.
3024  *	     3. Reconfig cp9_2 so start node is spos and end node is epos.
3025  *	     4. Check corresponding parameters of cp9_1 and cp9_2 to make
3026  *	        sure they're within pthresh, allowing nodes we predict
3027  *              to be wrong, as stored in subinfo->imp_cc[] to be wrong,
3028  *              but keeping statistics on these cases.
3029  *
3030  * Args:
3031  * orig_cm     - the original, template CM
3032  * sub_cm      - the sub CM built from the orig_cm
3033  * errbuf      - for error messages
3034  * submap      - map from orig_cm to sub_cm and vice versa
3035  * subinfo     - sub cm information
3036  * pthresh     - the allowed difference in probability between HMMs
3037  * print_flag  - TRUE to print useful debugging info
3038  *
3039  * Returns: eslOK   if CM and sub CM are "close enough" (see code)
3040  *          eslFAIL otherwise, errbuf is filled
3041  */
3042 int
check_sub_cm(CM_t * orig_cm,CM_t * sub_cm,char * errbuf,CMSubMap_t * submap,CMSubInfo_t * subinfo,float pthresh,int print_flag)3043 check_sub_cm(CM_t *orig_cm, CM_t *sub_cm, char *errbuf, CMSubMap_t *submap, CMSubInfo_t *subinfo, float pthresh, int print_flag)
3044 {
3045   int          status;
3046   CP9_t       *sub_hmm;  /* constructed CP9 HMM from the sub_cm */
3047   CP9_t       *orig_hmm; /* constructed CP9 HMM from the original cm
3048 			   this will be reconfiged to match the sub_hmm */
3049   CP9Map_t *orig_cp9map; /* maps the orig_cm to the orig_hmm and vice versa */
3050   CP9Map_t *sub_cp9map;  /* maps the sub_cm to the sub_hmm and vice versa */
3051 
3052   int k;
3053   double **orig_phi;
3054   int *violation;
3055   int v_ct;
3056   int apredict_total_ct;
3057   int awrong_total_ct;
3058   int i;
3059   float diff;
3060   int nd;
3061 
3062   v_ct = 0;
3063   apredict_total_ct = 0;
3064   awrong_total_ct = 0;
3065 
3066   /* Build two CP9 HMMs, one for the orig_cm and one for the sub_cm */
3067   if((status = build_cp9_hmm(orig_cm, errbuf, FALSE, 0.0001, print_flag, &orig_hmm, &orig_cp9map)) != eslOK) return status;
3068   if((status = build_cp9_hmm(sub_cm,  errbuf, FALSE, 0.0001, print_flag, &sub_hmm,  &sub_cp9map)) != eslOK) return status;
3069   CP9Logoddsify(orig_hmm);
3070   CP9Logoddsify(sub_hmm);
3071 
3072   /* Look for 'impossible' cases where we know the sub_cm
3073    * construction procedure fails, in that the distribution of transitions out of CP9 nodes
3074    * built from the sub_cm will be the same distros out of corresponding CP9 nodes built from
3075    * the full CM. */
3076   cm2sub_cm_find_impossible_misc_cases(orig_cm, sub_cm, submap, subinfo, orig_cp9map, sub_cp9map, print_flag);
3077   cm2sub_cm_find_impossible_matr_cases(orig_cm, sub_cm, submap, subinfo, orig_cp9map, sub_cp9map, print_flag);
3078 
3079   /* Reconfig the orig_hmm so that it can only start in the spos node, and end from the epos node */
3080   /* Build the sub CP9 HMM by copying as much of the original cp9_hmm as possible */
3081   fill_phi_cp9(orig_hmm, &orig_phi, 1, FALSE);
3082   CP9_reconfig2sub(orig_hmm, submap->spos, submap->epos, submap->spos, submap->epos, orig_phi);
3083 
3084   if(print_flag)
3085     {
3086       printf("PRINTING BUILT SUB HMM PARAMS:\n");
3087       debug_print_cp9_params(stdout, sub_hmm, TRUE);
3088       printf("DONE PRINTING BUILT SUB HMM PARAMS:\n");
3089 
3090       printf("PRINTING BUILT & RECONFIGED ORIG HMM PARAMS:\n");
3091       debug_print_cp9_params(stdout, orig_hmm, TRUE);
3092       printf("DONE PRINTING BUILT & RECONFIGED ORIG HMM PARAMS:\n");
3093     }
3094 
3095   /* Check the parameters of the two CP9 HMMs */
3096   if(print_flag)
3097     {
3098       printf("COMPARING CP9 HMM parameters in check_sub_cm()\n");
3099       printf("orig | sub\n");
3100     }
3101   ESL_ALLOC(violation, sizeof(int) * (submap->sub_clen+1));
3102   for(k = 0; k <= sub_hmm->M; k++)
3103     {
3104       violation[k] = FALSE;
3105       if(print_flag) printf("Node: %d\n", k);
3106       if(k > 0)
3107 	{
3108 	  for(i = 0; i < orig_cm->abc->K; i++)
3109 	    {
3110 	      diff = orig_hmm->mat[(submap->spos+k-1)][i] - sub_hmm->mat[k][i];
3111 	      if(print_flag) printf("mat[%d][%d] = %8.5f | %8.5f | (%8.5f)\n", 0, i, orig_hmm->mat[(submap->spos+k-1)][i], sub_hmm->mat[k][i], diff);
3112 	      if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3113 		{
3114 		  ESL_FAIL(eslFAIL, errbuf, "check_sub_cm(): emission probability incorrect");
3115 		}
3116 	    }
3117 	}
3118       for(i = 0; i < orig_cm->abc->K; i++)
3119 	{
3120 	  diff = orig_hmm->ins[(submap->spos+k-1)][i] - sub_hmm->ins[k][i];
3121 	  if(print_flag) printf("ins[%d][%d] = %8.5f | %8.5f | (%8.5f)\n", 0, i, orig_hmm->ins[(submap->spos+k-1)][i], sub_hmm->ins[k][i], diff);
3122 	  if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3123 	    {
3124 	      ESL_FAIL(eslFAIL, errbuf, "check_sub_cm(): emission probability incorrect");
3125 	    }
3126 	}
3127 
3128       /* Transitions */
3129       if(print_flag) printf("\n");
3130       diff = orig_hmm->t[(submap->spos+k-1)][CTMM] - sub_hmm->t[k][CTMM];
3131       if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3132 	{
3133 	  violation[k] = TRUE;
3134 	  if(print_flag) printf("\tCTMM[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTMM], sub_hmm->t[k][CTMM], diff);
3135 	}
3136       else
3137 	if(print_flag) printf("\tCTMM[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTMM], sub_hmm->t[k][CTMM], diff);
3138 
3139       diff = orig_hmm->t[(submap->spos+k-1)][CTMI] - sub_hmm->t[k][CTMI];
3140       if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3141 	{
3142 	  violation[k] = TRUE;
3143 	  if(print_flag) printf("\tCTMI[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTMI], sub_hmm->t[k][CTMI], diff);
3144 	}
3145       else
3146 	if(print_flag) printf("\tCTMI[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTMI], sub_hmm->t[k][CTMI], diff);
3147 
3148       diff = orig_hmm->t[(submap->spos+k-1)][CTMD] - sub_hmm->t[k][CTMD];
3149       if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3150 	{
3151 	  violation[k] = TRUE;
3152 	  if(print_flag) printf("\tCTMD[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTMD], sub_hmm->t[k][CTMD], diff);
3153 	}
3154       else
3155 	if(print_flag) printf("\tCTMD[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTMD], sub_hmm->t[k][CTMD], diff);
3156 
3157       diff = orig_hmm->t[(submap->spos+k-1)][CTIM] - sub_hmm->t[k][CTIM];
3158       if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3159 	{
3160 	  violation[k] = TRUE;
3161 	  if(print_flag) printf("\tCTIM[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTIM], sub_hmm->t[k][CTIM], diff);
3162 	}
3163       else
3164 	if(print_flag) printf("\tCTIM[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTIM], sub_hmm->t[k][CTIM], diff);
3165 
3166       diff = orig_hmm->t[(submap->spos+k-1)][CTII] - sub_hmm->t[k][CTII];
3167       if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3168 	{
3169 	  violation[k] = TRUE;
3170 	  if(print_flag) printf("\tCTII[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTII], sub_hmm->t[k][CTII], diff);
3171 	}
3172       else
3173 	if(print_flag) printf("\tCTII[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTII], sub_hmm->t[k][CTII], diff);
3174 
3175       diff = orig_hmm->t[(submap->spos+k-1)][CTID] - sub_hmm->t[k][CTID];
3176       if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3177 	{
3178 	  violation[k] = TRUE;
3179 	  if(print_flag) printf("\tCTID[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTID], sub_hmm->t[k][CTID], diff);
3180 	}
3181       else
3182 	if(print_flag) printf("\tCTID[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTID], sub_hmm->t[k][CTID], diff);
3183 
3184       diff = orig_hmm->t[(submap->spos+k-1)][CTDM] - sub_hmm->t[k][CTDM];
3185       if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3186 	{
3187 	  violation[k] = TRUE;
3188 	  if(print_flag) printf("\tCTDM[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTDM], sub_hmm->t[k][CTDM], diff);
3189 	}
3190       else
3191 	if(print_flag) printf("\tCTDM[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTDM], sub_hmm->t[k][CTDM], diff);
3192 
3193       diff = orig_hmm->t[(submap->spos+k-1)][CTDI] - sub_hmm->t[k][CTDI];
3194       if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3195 	{
3196 	  violation[k] = TRUE;
3197 	  if(print_flag) printf("\tCTDI[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTDI], sub_hmm->t[k][CTDI], diff);
3198 	}
3199       else
3200 	if(print_flag) printf("\tCTDI[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTDI], sub_hmm->t[k][CTDI], diff);
3201 
3202       diff = orig_hmm->t[(submap->spos+k-1)][CTDD] - sub_hmm->t[k][CTDD];
3203       if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3204 	{
3205 	  violation[k] = TRUE;
3206 	  if(print_flag) printf("\tCTDD[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->t[(submap->spos+k-1)][CTDD], sub_hmm->t[k][CTDD], diff);
3207 	}
3208       else
3209 	if(print_flag) printf("\tCTDD[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->t[(submap->spos+k-1)][CTDD], sub_hmm->t[k][CTDD], diff);
3210 
3211       if(k > 0)
3212 	{
3213 	  diff = orig_hmm->begin[(submap->spos+k-1)] - sub_hmm->begin[k];
3214 	  if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3215 	    {
3216 	      violation[0] = TRUE; /* begin actually has to do with the transition distro out of node 0 */
3217 	      if(print_flag) printf("\t beg[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->begin[(submap->spos+k-1)], sub_hmm->begin[k], diff);
3218 	    }
3219 	  else
3220 	    if(print_flag) printf("\t beg[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->begin[(submap->spos+k-1)], sub_hmm->begin[k], diff);
3221 
3222 	  diff = orig_hmm->end[(submap->spos+k-1)] - sub_hmm->end[k];
3223 	  if((diff > 0 && diff > pthresh) || (diff < 0 && diff < (-1. * pthresh)))
3224 	    {
3225 	      violation[k] = TRUE;
3226 	      if(print_flag) printf("\t end[%d] = %8.5f | %8.5f | %8.5f VIOLATION\n", k, orig_hmm->end[(submap->spos+k-1)], sub_hmm->end[k], diff);
3227 	    }
3228 	  else
3229 	    if(print_flag) printf("\t end[%d] = %8.5f | %8.5f | %8.5f\n", k, orig_hmm->end[(submap->spos+k-1)], sub_hmm->end[k], diff);
3230 	}
3231     }
3232 
3233   /* Add-up the violations */
3234   for(nd = 0; nd <= (submap->epos-submap->spos+1); nd++)
3235     {
3236       if(violation[nd] && subinfo->imp_cc[nd] == 0)
3237 	{
3238 	  v_ct++;
3239 	  printf("VIOLATION[%3d]: TRUE | submap->spos: %3d | submap->epos: %3d | subinfo->imp_cc: %d\n", nd, submap->spos, submap->epos, subinfo->imp_cc[nd]);
3240 	}
3241       else if(violation[nd] && subinfo->imp_cc[nd] != 0)
3242 	{
3243 	  subinfo->apredict_ct[subinfo->imp_cc[nd]]++;
3244 	  apredict_total_ct++;
3245 	  if(print_flag)
3246 	    printf("PREDICTED VIOLATION[%3d]: TRUE | submap->spos: %3d | submap->epos: %3d | subinfo->imp_cc: %d\n", nd, submap->spos, submap->epos, subinfo->imp_cc[nd]);
3247 	}
3248       else if(!violation[nd] && subinfo->imp_cc[nd] != 0)
3249 	{
3250 	  subinfo->apredict_ct[subinfo->imp_cc[nd]]++;
3251 	  apredict_total_ct++;
3252 	  subinfo->awrong_ct[subinfo->imp_cc[nd]]++;
3253 	  awrong_total_ct++;
3254 	  if(print_flag) printf("NON-VIOLATION[%3d] %3d : submap->spos: %3d | submap->epos: %3d | subinfo->imp_cc: %d\n", nd, awrong_total_ct, submap->spos, submap->epos, subinfo->imp_cc[nd]);
3255 	}
3256     }
3257 
3258   /* Clean up and return */
3259   for(k = 0; k <= orig_hmm->M; k++)
3260     free(orig_phi[k]);
3261   free(orig_phi);
3262 
3263   free(violation);
3264   FreeCPlan9(orig_hmm);
3265   FreeCPlan9(sub_hmm);
3266   FreeCP9Map(orig_cp9map);
3267   FreeCP9Map(sub_cp9map);
3268 
3269   if(v_ct > 0) ESL_FAIL(eslFAIL, errbuf, "check_sub_cm(): check failed");
3270   return eslOK;
3271 
3272  ERROR:
3273   ESL_FAIL(status, errbuf, "memory allocation error");
3274   return FALSE; /* never reached */
3275 }
3276 
3277 /**************************************************************************
3278  * EPN 10.23.06
3279  * Function: sub_cm2cm_parsetree()
3280  * Returns: void
3281  *
3282  * Purpose: Convert a parstree to a sub_cm to a parsetree for an original CM.
3283  *          We assume we're NOT in local mode. For any node n in the original
3284  *          CM for which 0 states in the sub_cm map to n, we declare that
3285  *          the delete state of that node was used in the converted original
3286  *          CM parse (or the S, B or E state if n is not MATP, MATL or MATR).
3287  *
3288  * Args:
3289  * CM_t *orig_cm             - the original, template CM
3290  * CM_t  *sub_cm             - the sub CM built from the orig_cm
3291  * Parsetree_t **ret_orig_tr - orig_cm parstree allocated, filled and returned here
3292  * Parsetree_t *sub_tr       - the sub_cm parstree already filled
3293  * CMSubMap_t *submap        - map from the sub_cm to orig_cm and vice versa
3294  * int print_flag    - TRUE to print useful debugging info
3295  *
3296  * Returns: eslOK on success
3297  *          eslEMEM if we run out of memory
3298  */
3299 
3300 int
sub_cm2cm_parsetree(CM_t * orig_cm,CM_t * sub_cm,Parsetree_t ** ret_orig_tr,Parsetree_t * sub_tr,CMSubMap_t * submap,int print_flag)3301 sub_cm2cm_parsetree(CM_t *orig_cm, CM_t *sub_cm, Parsetree_t **ret_orig_tr, Parsetree_t *sub_tr,
3302 		    CMSubMap_t *submap, int print_flag)
3303 {
3304   int  status;
3305   Parsetree_t *orig_tr; /* the parsetree we're creating for the original CM */
3306   int *ss_used;     /* [0..orig_cm->nodes-1], split state idx used in converted parsetree for each orig_cm nd */
3307   int *ss_emitl;    /* [0..orig_cm->nodes-1], tr->emitl[n] for each orig_cm node n */
3308   int *ss_emitr;    /* [0..orig_cm->nodes-1], tr->emitr[n] for each orig_cm node n */
3309   int *il_ct;       /* [0..orig_cm->nodes-1], number of times IL state of orig_cm node n was visited */
3310   int *ir_ct;       /* [0..orig_cm->nodes-1], number of times IR state of orig_cm node n was visited */
3311   int *il_used;     /* [0..orig_cm->nodes-1], idx of IL state in node n */
3312   int *ir_used;     /* [0..orig_cm->nodes-1], idx of IR state in node n */
3313   int *tr_nd_for_bifs; /* [0..orig_cm->nodes-1], if n is a BIF node, tr node of this BIF node, else -1 */
3314   int x;
3315   int nd;
3316   int sub_v;
3317   int orig_v1;
3318   int orig_v2;
3319   int orig_nd1;
3320   int orig_nd2;
3321   int cm_nd;
3322   int i;
3323   int parent_tr_nd;
3324   ESL_STACK   *pda;
3325   int          pos;
3326   int          ss;
3327   int          on_right;
3328   int emitl_flag;
3329   int emitr_flag;
3330 
3331   char **nodetypes;
3332   ESL_ALLOC(nodetypes, sizeof(char *) * 8);
3333   nodetypes[0] = "BIF";
3334   nodetypes[1] = "MATP";
3335   nodetypes[2] = "MATL";
3336   nodetypes[3] = "MATR";
3337   nodetypes[4] = "BEGL";
3338   nodetypes[5] = "BEGR";
3339   nodetypes[6] = "ROOT";
3340   nodetypes[7] = "END";
3341 
3342   char **sttypes;
3343   ESL_ALLOC(sttypes, sizeof(char *) * 10);
3344   sttypes[0] = "D";
3345   sttypes[1] = "MP";
3346   sttypes[2] = "ML";
3347   sttypes[3] = "MR";
3348   sttypes[4] = "IL";
3349   sttypes[5] = "IR";
3350   sttypes[6] = "S";
3351   sttypes[7] = "E";
3352   sttypes[8] = "B";
3353   sttypes[9] = "EL";
3354 
3355   if(print_flag) printf("orig_cm nodes: %d\n", orig_cm->nodes);
3356 
3357   ESL_ALLOC(ss_used,       sizeof(int) * orig_cm->nodes + 1);
3358   ESL_ALLOC(ss_emitl,      sizeof(int) * orig_cm->nodes + 1);
3359   ESL_ALLOC(ss_emitr,      sizeof(int) * orig_cm->nodes + 1);
3360   ESL_ALLOC(il_used,       sizeof(int) * orig_cm->nodes + 1);
3361   ESL_ALLOC(ir_used,       sizeof(int) * orig_cm->nodes + 1);
3362   ESL_ALLOC(il_ct,         sizeof(int) * orig_cm->nodes + 1);
3363   ESL_ALLOC(ir_ct,         sizeof(int) * orig_cm->nodes + 1);
3364   ESL_ALLOC(tr_nd_for_bifs,sizeof(int) * orig_cm->nodes + 1);
3365   /* i*_emitl[nd] is the last residue emitted by the i* state
3366    * of node nd, the first is (il_emitl[nd] - il_ct[nd] + 1)
3367    * or (ir_emitr[nd] + ir_ct[nd] - 1)
3368    */
3369 
3370   for(nd = 0; nd < orig_cm->nodes; nd++)
3371     {
3372       ss_used[nd]   = -1;
3373       ss_emitl[nd]  = -1;
3374       ss_emitr[nd]  = -1;
3375       il_ct[nd]     =  0; /* the number of times the IL state was used in the sub_cm parse */
3376       ir_ct[nd]     =  0; /* the number of times the IR state was used in the sub_cm parse */
3377       il_used[nd]   = -1;
3378       ir_used[nd]   = -1;
3379       tr_nd_for_bifs[nd] = -1; /* this will remain -1 except for bif nodes */
3380     }
3381 
3382   for(x = 0; x < sub_tr->n; x++)
3383     {
3384       sub_v    = sub_tr->state[x];
3385       if(print_flag) printf("x: %d sub_v: %d\n", x, sub_v);
3386       orig_v1  = submap->s2o_smap[sub_v][0];
3387       orig_v2  = submap->s2o_smap[sub_v][1];
3388       if(print_flag) printf("orig_v1: %d | orig_v2: %d\n", orig_v1, orig_v2);
3389       if(orig_v1 == -1)
3390 	{
3391 	  if(sub_cm->sttype[sub_v] != S_st &&
3392 	     sub_cm->sttype[sub_v] != E_st &&
3393 	     sub_cm->sttype[sub_v] != B_st &&
3394 	     sub_cm->sttype[sub_v] != EL_st)
3395 	    cm_Fail("ERROR 0 in sub_cm2cm_parstree()\n");
3396 	  continue;
3397 	}
3398       orig_nd1 = orig_cm->ndidx[orig_v1];
3399       if(orig_v2 != -1)
3400 	orig_nd2 = orig_cm->ndidx[orig_v2];
3401       else
3402 	orig_nd2 = -1;
3403 
3404       /* No sub_cm insert states can map to 2 orig_cm inserts */
3405       if(orig_cm->sttype[orig_v1] == IL_st)
3406 	{
3407 	  il_used[orig_nd1] = orig_v1;
3408 	  il_ct[orig_nd1]++;
3409 	  if(orig_v2 != -1)
3410 	    cm_Fail("ERROR 1 in sub_cm2cm_parstree()\n");
3411 	}
3412       else if(orig_cm->sttype[orig_v1] == IR_st)
3413 	{
3414 	  ir_used[orig_nd1] = orig_v1;
3415 	  ir_ct[orig_nd1]++;
3416 	  if(orig_v2 != -1)
3417 	    cm_Fail("ERROR 2 in sub_cm2cm_parstree()\n");
3418 	}
3419       else if(sub_cm->ndtype[sub_cm->ndidx[sub_v]] == MATP_nd)
3420 	{
3421 	  ss_used[orig_nd1] = orig_v1;
3422 	  if(orig_v2 != -1)
3423 	    cm_Fail("ERROR 3 in sub_cm2cm_parsetree()\n");
3424 	}
3425       else if(orig_cm->ndtype[orig_nd1] == MATP_nd)
3426 	{
3427 	  if(orig_v2 == -1)
3428 	    cm_Fail("ERROR 4 in sub_cm2cm_parsetree()\n");
3429 	  /* We have to figure out which MATP split state sub_v corresponds to. */
3430 	  if(sub_cm->ndtype[sub_cm->ndidx[sub_v]] != MATL_nd &&
3431 	     sub_cm->ndtype[sub_cm->ndidx[sub_v]] != MATR_nd)
3432 	    cm_Fail("ERROR 5 in sub_cm2cm_parsetree()\n");
3433 	  if(orig_cm->ndtype[orig_nd2] != MATP_nd)
3434 	    cm_Fail("ERROR 6 in sub_cm2cm_parsetree()\n");
3435 
3436 	  if(sub_cm->sttype[sub_v] == D_st)
3437 	    {
3438 	      if(ss_used[orig_nd1] == -1 ||
3439 		 orig_cm->sttype[ss_used[orig_nd1]] == D_st)
3440 		ss_used[orig_nd1] = orig_cm->nodemap[orig_nd1] + 3; /* MATP_D */
3441 
3442 	      /* Else we do nothing, orig_cm->sttype[ss_used[orig_nd1]] is already
3443 	       * either a ML_st or an MR_st */
3444 	    }
3445 	  else /* sub_cm->sttype[sub_v] != D_st */
3446 	    {
3447 	      if(ss_used[orig_nd1] == -1 ||
3448 		 orig_cm->sttype[ss_used[orig_nd1]] == D_st)
3449 		{
3450 		  /* Figure out if sub_v maps to the left or right half of the MATP node */
3451 		  if(orig_cm->sttype[orig_v1] == ML_st ||
3452 		     orig_cm->sttype[orig_v2] == ML_st)
3453 		    ss_used[orig_nd1] = orig_cm->nodemap[orig_nd1] + 1; /* MATP_ML */
3454 		  else if(orig_cm->sttype[orig_v1] == MR_st ||
3455 			  orig_cm->sttype[orig_v2] == MR_st)
3456 		    ss_used[orig_nd1] = orig_cm->nodemap[orig_nd1] + 2; /* MATP_MR */
3457 		  else
3458 		    cm_Fail("ERROR 7 in sub_cm2cm_parsetree()\n");
3459 		}
3460 	      /* below is the only line we really need:
3461 		 else
3462 		 ss_used[orig_nd1] = orig_cm->nodemap[orig_nd1]; */
3463 
3464 	      else if(orig_cm->sttype[ss_used[orig_nd1]] == ML_st) /* just for safety; should erase eventually */
3465 		{
3466 		  if(orig_cm->sttype[orig_v1] == MR_st ||
3467 		     orig_cm->sttype[orig_v2] == MR_st) /* just for safety; should erase eventually */
3468 		    {
3469 		      ss_used[orig_nd1] = orig_cm->nodemap[orig_nd1]; /* MATP_MP */
3470 		    }
3471 		  else
3472 		    cm_Fail("ERROR 8 in sub_cm2cm_parsetree()\n");
3473 		}
3474 	      else if(orig_cm->sttype[ss_used[orig_nd1]] == MR_st) /* just for safety; should erase eventually */
3475 		{
3476 		  if(orig_cm->sttype[orig_v1] == ML_st ||
3477 		     orig_cm->sttype[orig_v2] == ML_st) /* just for safety; should erase eventually */
3478 		    {
3479 		      ss_used[orig_nd1] = orig_cm->nodemap[orig_nd1]; /* MATP_MP */
3480 		    }
3481 		  else
3482 		    cm_Fail("ERROR 9 in sub_cm2cm_parsetree()\n");
3483 		}
3484 	    }
3485 	}
3486       else
3487 	{
3488 	  if(orig_v2 != -1)
3489 	    cm_Fail("ERROR 5 in sub_cm2cm_parsetree()\n");
3490 	  ss_used[orig_nd1] = orig_v1;
3491 	}
3492     }
3493 
3494   /* Some MATL, MATR, and MATP nodes in the orig_cm might have 0 states that map to any state
3495    * used in the sub_cm parse. If we're not allowing local begins and ends, our strategy is
3496    * to claim that in the orig_cm parse the D state of these nodes was used.
3497    */
3498   for(nd = 0; nd < orig_cm->nodes; nd++)
3499     {
3500       if(ss_used[nd] == -1)
3501 	{
3502 	  if(orig_cm->ndtype[nd] == MATP_nd)
3503 	    ss_used[nd] = orig_cm->nodemap[nd] + 3; /* MATP_D */
3504 	  if(orig_cm->ndtype[nd] == MATL_nd ||
3505 	     orig_cm->ndtype[nd] == MATR_nd)
3506 	    ss_used[nd] = orig_cm->nodemap[nd] + 1; /* MAT{L,R}_D */
3507 	  if(orig_cm->ndtype[nd] == BIF_nd  ||
3508 	     orig_cm->ndtype[nd] == BEGL_nd ||
3509 	     orig_cm->ndtype[nd] == BEGR_nd ||
3510 	     orig_cm->ndtype[nd] == END_nd)
3511 	    ss_used[nd] = orig_cm->nodemap[nd];     /* BIF_B, END_E or BEG{L,R}_S */
3512 	}
3513     }
3514 
3515   /* Determine emitl and emitr for each state in the orig_cm parse */
3516   /* This code is noticeably cleaner than the rest - it's Sean's, from
3517    * CreateEmitMap() adapted for our purposes here.
3518    */
3519 
3520   pos   = 1;
3521   ss    = 0;
3522   if((pda = esl_stack_ICreate()) == NULL) goto ERROR;
3523   if((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;		/* 0 = left side. 1 would = right side. */
3524   if((status = esl_stack_IPush(pda, ss)) != eslOK) goto ERROR;
3525   while (esl_stack_IPop(pda, &ss) != eslEOD)
3526     {
3527       esl_stack_IPop(pda, &on_right);
3528 
3529       if (on_right)
3530 	{
3531 	  pos += ir_ct[orig_cm->ndidx[ss]]; /* account for right inserts */
3532 	  if (orig_cm->sttype[ss] == MP_st || orig_cm->sttype[ss] == MR_st)
3533 	    pos++;
3534 	  ss_emitr[orig_cm->ndidx[ss]] = pos - 1;
3535 	}
3536       else
3537 	{
3538 	  ss_emitl[orig_cm->ndidx[ss]] = pos;
3539 	  if (orig_cm->sttype[ss] == MP_st || orig_cm->sttype[ss] == ML_st)
3540 	    pos++;
3541 
3542 	  if (orig_cm->sttype[ss] == B_st)
3543 	    {
3544 	      /* push the BIF back on for its right side  */
3545 	      if((status = esl_stack_IPush(pda, 1)) != eslOK) goto ERROR;
3546 	      if((status = esl_stack_IPush(pda, ss)) != eslOK) goto ERROR;
3547 	      /* push node index for right child */
3548 	      if((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
3549 	      if((status = esl_stack_IPush(pda, orig_cm->cnum[ss])) != eslOK) goto ERROR;
3550 	      /* push node index for left child */
3551 	      if((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
3552 	      if((status = esl_stack_IPush(pda, orig_cm->cfirst[ss])) != eslOK) goto ERROR;
3553 	    }
3554 	  else
3555 	    {
3556 	      /* push the node back on for right side */
3557 	      if((status = esl_stack_IPush(pda, 1)) != eslOK) goto ERROR;
3558 	      if((status = esl_stack_IPush(pda, ss)) != eslOK) goto ERROR;
3559 	      /* push split state of child node on */
3560 	      if (orig_cm->sttype[ss] != E_st) {
3561 		if((status = esl_stack_IPush(pda, 0)) != eslOK) goto ERROR;
3562 		if((status = esl_stack_IPush(pda, ss_used[orig_cm->ndidx[ss]+1])) != eslOK) goto ERROR;
3563 	      }
3564 	    }
3565 	  pos += il_ct[orig_cm->ndidx[ss]]; /* account for left inserts */
3566 	}
3567     }
3568 
3569   if(print_flag)
3570     {
3571       for(nd = 0; nd < orig_cm->nodes; nd++)
3572 	{
3573 	  printf("ss_used[%4d] (%4s) first state(%4d) | ", nd, nodetypes[(int) orig_cm->ndtype[nd]], orig_cm->nodemap[nd]);
3574 	  if(ss_used[nd] != -1)
3575 	    printf("%4d (%2s) | L: %3d R: %3d\n", ss_used[nd], sttypes[(int) orig_cm->sttype[ss_used[nd]]], ss_emitl[nd], ss_emitr[nd]);
3576 	  else
3577 	    printf("%4d\n", -1);
3578 	}
3579       for(nd = 0; nd < orig_cm->nodes; nd++)
3580 	printf("il_ct[%4d] (st used: %4d) ct: %4d | ir_ct[%4d] ct: %4d (st used: %4d)\n", nd, il_used[nd], il_ct[nd], nd, ir_used[nd], ir_ct[nd]);
3581 
3582     }
3583 
3584   orig_tr = CreateParsetree(100);
3585   for(cm_nd = 0; cm_nd < orig_cm->nodes; cm_nd++)
3586     {
3587       emitl_flag = 0;
3588       emitr_flag = 0;
3589       if(orig_cm->sttype[ss_used[cm_nd]] == MP_st ||
3590 	 orig_cm->sttype[ss_used[cm_nd]] == ML_st)
3591 	emitl_flag = 1;
3592       if(orig_cm->sttype[ss_used[cm_nd]] == MP_st ||
3593 	 orig_cm->sttype[ss_used[cm_nd]] == MR_st)
3594 	emitr_flag = 1;
3595 
3596       /* At least 1 state in each node must be visited (if we're not in local mode) */
3597       if(orig_cm->ndtype[cm_nd] == BEGR_nd)
3598 	{
3599 	  parent_tr_nd =  tr_nd_for_bifs[orig_cm->ndidx[orig_cm->plast[orig_cm->nodemap[cm_nd]]]];
3600 	  if(print_flag) printf("tr_nd_for_bifs[%d]\n", (orig_cm->ndidx[orig_cm->plast[orig_cm->nodemap[cm_nd]]]));
3601 	  if(print_flag) printf("parent_tr_nd for cm_nd %d: %d\n", cm_nd, parent_tr_nd);
3602 	  InsertTraceNode(orig_tr, parent_tr_nd, TRACE_RIGHT_CHILD, ss_emitl[cm_nd], ss_emitr[cm_nd], ss_used[cm_nd]);
3603 	  orig_tr->nxtr[parent_tr_nd]  = orig_tr->n - 1; /* Go back and fix nxtr for the BIF parent of this BEGR */
3604 	}
3605       else
3606 	{
3607 	  InsertTraceNode(orig_tr, orig_tr->n-1, TRACE_LEFT_CHILD, ss_emitl[cm_nd], ss_emitr[cm_nd], ss_used[cm_nd]);
3608 	  if(print_flag) printf("inserted trace node for orig_cm st %4s | emitl: %d | emitr: %d\n", sttypes[(int) orig_cm->sttype[ss_used[cm_nd]]], ss_emitl[cm_nd], ss_emitr[cm_nd]);
3609 	}
3610 
3611       /* Note: if we've just added a trace node for a BIF state, it's incomplete, in that it
3612        * doesn't have the nextr correctly set. We'll go back and set this when we get to the
3613        * right child (BEGR) of this BIF */
3614       if(orig_cm->ndtype[cm_nd] == BIF_nd)
3615 	{
3616 	  tr_nd_for_bifs[cm_nd] = orig_tr->n - 1;
3617 	  if(print_flag) printf("set tr_nd_for_bifs[%d]: %d\n", cm_nd, orig_tr->n);
3618 	}
3619 
3620       /* Add left inserts, if any */
3621       for(i = 0; i < il_ct[cm_nd]; i++)
3622 	{
3623 	  InsertTraceNode(orig_tr, orig_tr->n-1, TRACE_LEFT_CHILD, (ss_emitl[cm_nd] + emitl_flag + i), (ss_emitr[cm_nd] - emitr_flag), il_used[cm_nd]);
3624 	  if(print_flag) printf("inserted trace node for orig_cm st %4s | emitl: %d | emitr: %d\n", sttypes[(int) orig_cm->sttype[il_used[cm_nd]]], orig_tr->emitl[orig_tr->n-1], orig_tr->emitr[orig_tr->n+1]);
3625 	}
3626       /* Add right inserts, if any */
3627       for(i = 0; i < ir_ct[cm_nd]; i++)
3628 	{
3629 	  InsertTraceNode(orig_tr, orig_tr->n-1, TRACE_LEFT_CHILD, (ss_emitl[cm_nd] + emitl_flag + il_ct[cm_nd]), (ss_emitr[cm_nd] - emitr_flag - i), ir_used[cm_nd]);
3630 	  if(print_flag) printf("inserted trace node for orig_cm st %4s | emitl: %d | emitr: %d\n", sttypes[(int) orig_cm->sttype[ir_used[cm_nd]]], orig_tr->emitl[orig_tr->n-1], orig_tr->emitr[orig_tr->n+1]);
3631 	}
3632       if(print_flag) printf("END nd: %4d\n", cm_nd);
3633     }
3634   *ret_orig_tr = orig_tr;
3635 
3636   free(ss_used);
3637   free(ss_emitl);
3638   free(ss_emitr);
3639   free(il_used);
3640   free(ir_used);
3641   free(il_ct);
3642   free(ir_ct);
3643   free(tr_nd_for_bifs);
3644   free(nodetypes);
3645   free(sttypes);
3646   esl_stack_Destroy(pda);
3647   return eslOK;
3648 
3649  ERROR:
3650   return eslEMEM;
3651 }
3652 
3653 /* Function: SubCMLogoddsify()
3654  * Date:     EPN, Wed Aug 20 08:14:18 2008
3655  *
3656  * Purpose:  Convert the probabilities in a sub CM built
3657  *           from <mother_cm> to log-odds. Copy the <mother_cm>
3658  *           parameters where possible to save time.
3659  *
3660  * Returns:  eslOK on success;
3661  * Throws:   eslEINCOMPAT on contract violation.
3662  *           eslFAIL if we can't create CMConsensus_t object
3663  *           at end of the function.
3664  */
3665 int
SubCMLogoddsify(CM_t * cm,char * errbuf,CM_t * mother_cm,CMSubMap_t * mother_map)3666 SubCMLogoddsify(CM_t *cm, char *errbuf, CM_t *mother_cm, CMSubMap_t *mother_map)
3667 {
3668   if(!(mother_cm->flags & CMH_BITS)) ESL_FAIL(eslEINCOMPAT, errbuf, "SubCMLogoddsify(), mother_cm's CMH_BITS flag down, it's bit scores are invalid.");
3669 
3670   int v, mv, x, y;
3671 
3672   for (v = 0; v < cm->M; v++) {
3673     if (mother_map->s2o_id[v] == TRUE) { /* this state maps identically to a mother_cm state, copy parameters */
3674       if (cm->sttype[v] != B_st && cm->sttype[v] != E_st) {
3675 	mv = mother_map->s2o_smap[v][0];
3676 	esl_vec_FCopy(mother_cm->tsc[mv],  cm->cnum[v], cm->tsc[v]);
3677 	esl_vec_ICopy(mother_cm->itsc[mv], cm->cnum[v], cm->itsc[v]);
3678 #if eslDEBUGLEVEL >= 1
3679 	for(x = 0; x < cm->cnum[v]; x++) {
3680 	  if(esl_FCompare(mother_cm->t[mv][x], cm->t[v][x], 1E-5) != eslOK) ESL_FAIL(eslEINCONCEIVABLE, errbuf, "You've got it wrong, mother_cm->t[mv:%d][x:%d] %.3f != cm->t[v:%d][x:%d] %.3f\n", mv, x, mother_cm->t[mv][x], v, x, cm->t[v][x]);
3681 	}
3682 #endif
3683       }
3684       if (cm->sttype[v] == MP_st) {
3685 	esl_vec_FCopy(mother_cm->esc[mv],  cm->abc->K * cm->abc->K, cm->esc[v]);
3686 	esl_vec_ICopy(mother_cm->iesc[mv], cm->abc->K * cm->abc->K, cm->iesc[v]);
3687       }
3688       if (cm->sttype[v] == ML_st || cm->sttype[v] == MR_st ||
3689 	  cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) {
3690 	esl_vec_FCopy(mother_cm->esc[mv],  cm->abc->K, cm->esc[v]);
3691 	esl_vec_ICopy(mother_cm->iesc[mv], cm->abc->K, cm->iesc[v]);
3692       }
3693       cm->beginsc[v]  = mother_cm->beginsc[mv];
3694       cm->ibeginsc[v] = mother_cm->ibeginsc[mv];
3695 
3696       cm->endsc[v]    = mother_cm->endsc[mv];
3697       cm->iendsc[v]   = mother_cm->iendsc[mv];
3698     }
3699     else { /* this state does not map identically to a mother_cm state, we have to do the work calculate the parameters */
3700       if (cm->sttype[v] != B_st && cm->sttype[v] != E_st) {
3701 	for (x = 0; x < cm->cnum[v]; x++) {
3702 	  cm->tsc[v][x]  = sreLOG2(cm->t[v][x]);
3703 	  cm->itsc[v][x] = Prob2Score(cm->t[v][x], 1.0);
3704 	  /*printf("cm->t[%4d][%2d]: %f itsc->e: %f itsc: %d\n", v, x, cm->t[v][x], Score2Prob(cm->itsc[v][x], 1.0), cm->itsc[v][x]);*/
3705 	}
3706       }
3707       if (cm->sttype[v] == MP_st) {
3708 	for (x = 0; x < cm->abc->K; x++) {
3709 	  for (y = 0; y < cm->abc->K; y++) {
3710 	    cm->esc[v][x*cm->abc->K+y]  = sreLOG2(cm->e[v][x*cm->abc->K+y] / (cm->null[x]*cm->null[y]));
3711 	    cm->iesc[v][x*cm->abc->K+y] = Prob2Score(cm->e[v][x*cm->abc->K+y], (cm->null[x]*cm->null[y]));
3712 	    /*printf("cm->e[%4d][%2d]: %f iesc->e: %f iesc: %d\n", v, (x*cm->abc->K+y), cm->e[v][(x*cm->abc->K+y)], Score2Prob(cm->iesc[v][x*cm->abc->K+y], (cm->null[x]*cm->null[y])), cm->iesc[v][(x*cm->abc->K+y)]);*/
3713 	  }
3714 	}
3715       }
3716       if (cm->sttype[v] == ML_st || cm->sttype[v] == MR_st ||
3717 	  cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) {
3718 	for (x = 0; x < cm->abc->K; x++) {
3719 	  cm->esc[v][x]  = sreLOG2(cm->e[v][x] / cm->null[x]);
3720 	  cm->iesc[v][x] = Prob2Score(cm->e[v][x], cm->null[x]);
3721 	  /*printf("cm->e[%4d][%2d]: %f esc: %f null[%d]: %f\n", v, x, cm->e[v][x], cm->esc[v][x], x, cm->null[x]);*/
3722 	  /*printf("cm->e[%4d][%2d]: %f iesc->e: %f iesc: %d\n", v, x, cm->e[v][x], Score2Prob(cm->iesc[v][x], (cm->null[x])), cm->iesc[v][x]);*/
3723 	}
3724       }
3725       /* These work even if begin/end distributions are inactive 0's,
3726        * sreLOG2 will set beginsc, endsc to -infinity.
3727        */
3728       cm->beginsc[v]  = sreLOG2(cm->begin[v]);
3729       cm->ibeginsc[v] = Prob2Score(cm->begin[v], 1.0);
3730       /*printf("cm->begin[%4d]: %f ibeginsc->e: %f ibeginsc: %d\n", v, cm->begin[v], Score2Prob(cm->ibeginsc[v], 1.0), cm->ibeginsc[v]);*/
3731 
3732       cm->endsc[v]    = sreLOG2(cm->end[v]);
3733       cm->iendsc[v]   = Prob2Score(cm->end[v], 1.0);
3734       /*printf("cm->end[%4d]: %f iendsc->e: %f iendsc: %d\n\n", v, cm->end[v], Score2Prob(cm->iendsc[v], 1.0), cm->iendsc[v]);*/
3735     }
3736   }
3737   cm->iel_selfsc = Prob2Score(sreEXP2(cm->el_selfsc), 1.0);
3738   /*printf("cm->el_selfsc: %f prob: %f cm->iel_selfsc: %d prob: %f\n", cm->el_selfsc,
3739 	 (sreEXP2(cm->el_selfsc)), cm->iel_selfsc, (Score2Prob(cm->iel_selfsc, 1.0)));
3740 	 printf("-INFTY: %d prob: %f 2^: %f\n", -INFTY, (Score2Prob(-INFTY, 1.0)), sreEXP2(-INFTY));*/
3741 
3742   /* Allocate and fill optimized emission scores for this CM.
3743    * If they already exist, free them and recalculate them, slightly wasteful, oh well.
3744    */
3745   if(cm->oesc != NULL || cm->ioesc != NULL) FreeOptimizedEmitScores(cm->oesc, cm->ioesc, cm->M);
3746 
3747   cm->oesc  = SubFCalcAndCopyOptimizedEmitScoresFromMother(cm, mother_cm, mother_map);
3748 
3749   /* EPN, Wed Aug 20 15:26:31 2008
3750    * old, slow way:
3751    * cm->ioesc = ICalcOptimizedEmitScores(cm);
3752    */
3753   cm->ioesc = ICopyOptimizedEmitScoresFromFloats(cm, cm->oesc);
3754 
3755   /* Potentially, overwrite transitions with non-probabilistic
3756    * RSEARCH transitions. Currently only default transition
3757    * parameters are allowed, these are defined as DEFAULT_R*
3758    * in infernal.h
3759    * Note: This is untouched from CMLogoddsify(), we don't try
3760    *       to accelerate it, it's unusual that it will be executed.
3761    */
3762   if(cm->flags & CM_RSEARCHTRANS) {
3763       float           alpha =   DEFAULT_RALPHA;
3764       float           beta =    DEFAULT_RBETA;
3765       float           alphap =  DEFAULT_RALPHAP;
3766       float           betap =   DEFAULT_RBETAP;
3767       float           beginsc = DEFAULT_RBEGINSC;
3768       float           endsc =   DEFAULT_RENDSC;
3769       int             nd;
3770       /* First do the normal transitions */
3771       for (v=0; v<cm->M; v++)
3772 	{
3773 	  if (cm->sttype[v] != B_st && cm->sttype[v] != E_st)
3774 	    {
3775 	      for (x=0; x<cm->cnum[v]; x++)
3776 		{
3777 		  cm->tsc[v][x] = -1. * rsearch_calculate_gap_penalty
3778 		    (cm->stid[v], cm->stid[cm->cfirst[v]+x],
3779 		     cm->ndtype[cm->ndidx[v]], cm->ndtype[cm->ndidx[cm->cfirst[v]+x]],
3780 		     alpha, beta, alphap, betap);
3781 		  /* alphas and rbetas were positive -- gap score is a penalty, so
3782 		     multiply by -1 */
3783 		  cm->itsc[v][x] = (int) floor(0.5 + INTSCALE * cm->tsc[v][x]);
3784 		}
3785 	    }
3786 	}
3787       /* Overwrite local begin and end scores */
3788       for (v=cm->M - 1; v>=0; v--) {
3789 	cm->beginsc[v] = IMPOSSIBLE;
3790 	cm->endsc[v] = IMPOSSIBLE;
3791       }
3792 
3793       /* beginsc states */
3794       for (nd = 2; nd < cm->nodes; nd++) {
3795 	if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
3796 	    cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BIF_nd)
3797 	  {
3798 	    cm->beginsc[cm->nodemap[nd]] = beginsc;
3799 	    cm->ibeginsc[cm->nodemap[nd]] = INTSCALE * beginsc;
3800 	  }
3801       }
3802 
3803       /* endsc states */
3804       for (nd = 1; nd < cm->nodes; nd++) {
3805 	if ((cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
3806 	     cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BEGL_nd ||
3807 	     cm->ndtype[nd] == BEGR_nd) &&
3808 	    cm->ndtype[nd+1] != END_nd)
3809 	  {
3810 	  cm->endsc[cm->nodemap[nd]] = endsc;
3811 	  cm->iendsc[cm->nodemap[nd]] = INTSCALE * endsc;
3812 	  }
3813       }
3814 
3815       cm->flags |= CMH_LOCAL_BEGIN;
3816       cm->flags |= CMH_LOCAL_END;
3817     }
3818   /* raise flag saying we have valid log odds scores */
3819   cm->flags |= CMH_BITS;
3820 
3821   /* create cm->cmcons, we expect this to be valid if we have valid log odds score */
3822   if(cm->cmcons != NULL) FreeCMConsensus(cm->cmcons);
3823   if((cm->cmcons = CreateCMConsensus(cm, cm->abc)) == NULL) return eslFAIL;
3824 
3825   return eslOK;
3826 }
3827 
3828 /* Function: SubFCalcAndCopyOptimizedEmitScoresFromMother()
3829  * Date:     EPN, Wed Aug 20 15:40:23 2008
3830  *
3831  * Purpose:  Allocate, fill and return an optimized emission score vector
3832  *           of float scores for fast search/alignment.
3833  *           Fill emission scores by copying them when
3834  *           possible from a 'mother' CM if the current
3835  *           cm is a sub CM of the mother.
3836  *
3837  * Returns:  the 2D float emission score vector on success,
3838  *           dies immediately on memory allocation error.
3839  */
3840 float **
SubFCalcAndCopyOptimizedEmitScoresFromMother(CM_t * cm,CM_t * mother_cm,CMSubMap_t * mother_map)3841 SubFCalcAndCopyOptimizedEmitScoresFromMother(CM_t *cm, CM_t *mother_cm, CMSubMap_t *mother_map)
3842 {
3843   int status;
3844   float **esc_vAA;
3845   ESL_DSQ a,b;
3846   int v;
3847   int cur_cell;
3848   int npairs = 0;
3849   int nsinglets = 0;
3850   float *ptr_to_start; /* points to block allocated to esc_vAA[0], nec b/c esc_vAA[0] gets set to NULL, because v == 0 is non-emitter */
3851   float **leftAA;
3852   float **rightAA;
3853   int mv;
3854 
3855   /* count pairs, singlets */
3856   for(v = 0; v < cm->M; v++) {
3857     switch(cm->sttype[v]) {
3858     case IL_st:
3859     case ML_st:
3860     case IR_st:
3861     case MR_st:
3862       nsinglets++;
3863       break;
3864     case MP_st:
3865       npairs++;
3866       break;
3867     }
3868   }
3869 
3870   /* set up our left and right vectors for all possible non-canonical residues,
3871    * these are calc'ed once and passed to FastPairScore*() functions to minimize
3872    * run time.
3873    */
3874   ESL_ALLOC(leftAA,  sizeof(float *) * cm->abc->Kp);
3875   ESL_ALLOC(rightAA, sizeof(float *) * cm->abc->Kp);
3876   for(a = 0; a <= cm->abc->K; a++) leftAA[a] = rightAA[a] = NULL; /* canonicals and gap, left/right unnec */
3877   for(a = cm->abc->K+1; a < cm->abc->Kp-1; a++) {
3878     ESL_ALLOC(leftAA[a],  sizeof(float) * cm->abc->K);
3879     ESL_ALLOC(rightAA[a], sizeof(float) * cm->abc->K);
3880     esl_vec_FSet(leftAA[a],  cm->abc->K, 0.);
3881     esl_vec_FSet(rightAA[a], cm->abc->K, 0.);
3882     esl_abc_FCount(cm->abc, leftAA[a],  a, 1.);
3883     esl_abc_FCount(cm->abc, rightAA[a], a, 1.);
3884   }
3885   leftAA[cm->abc->Kp-1] = rightAA[cm->abc->Kp-1] = NULL; /* missing data, left/right unnec */
3886 
3887   /* precalculate possible emission scores for each state */
3888   ESL_ALLOC(esc_vAA,     sizeof(float *) * (cm->M));
3889   ESL_ALLOC(esc_vAA[0],  sizeof(float)   * ((cm->abc->Kp * nsinglets) + (cm->abc->Kp * cm->abc->Kp * npairs)));
3890   ptr_to_start = esc_vAA[0];
3891   cur_cell = 0;
3892   for(v = 0; v < cm->M; v++) {
3893     switch(cm->sttype[v]) {
3894     case IL_st:
3895     case ML_st:
3896     case IR_st:
3897     case MR_st:
3898       esc_vAA[v] = ptr_to_start + cur_cell;
3899       cur_cell += cm->abc->Kp;
3900       if (mother_map->s2o_id[v] == TRUE) { /* this state maps identically to a mother_cm state, copy parameters */
3901 	mv = mother_map->s2o_smap[v][0];
3902 	esl_vec_FCopy(mother_cm->oesc[mv], cm->abc->Kp, esc_vAA[v]);
3903       }
3904       else {
3905 	for(a = 0; a < cm->abc->K; a++) /* all canonical residues */
3906 	  esc_vAA[v][a]  = cm->esc[v][a];
3907 	esc_vAA[v][cm->abc->K] = IMPOSSIBLE; /* gap symbol is impossible */
3908 	for(a = cm->abc->K+1; a < cm->abc->Kp-1; a++) /* all ambiguous residues */
3909 	  esc_vAA[v][a]  = esl_abc_FAvgScore(cm->abc, a, cm->esc[v]);
3910 	esc_vAA[v][cm->abc->Kp-1] = IMPOSSIBLE; /* missing data is IMPOSSIBLE */
3911       }
3912       break;
3913     case MP_st:
3914       esc_vAA[v] = ptr_to_start + cur_cell;
3915       esl_vec_FSet(esc_vAA[v], cm->abc->Kp * cm->abc->Kp, IMPOSSIBLE); /* init all cells to IMPOSSIBLE */
3916       cur_cell += cm->abc->Kp * cm->abc->Kp;
3917       if (mother_map->s2o_id[v] == TRUE) { /* this state maps identically to a mother_cm state, copy parameters */
3918 	mv = mother_map->s2o_smap[v][0];
3919 	esl_vec_FCopy(mother_cm->oesc[mv], cm->abc->Kp * cm->abc->Kp, esc_vAA[v]);
3920       }
3921       else {
3922 	/* a is canonical, b is canonical */
3923 	for(a = 0; a < cm->abc->K; a++) {
3924 	  for(b = 0; b < cm->abc->K; b++) {
3925 	    esc_vAA[v][(a * cm->abc->Kp) + b]  = cm->esc[v][(a * cm->abc->K) + b];
3926 	  }
3927 	}
3928 	/* a is not canonical, b is canonical */
3929 	for(a = cm->abc->K+1; a < cm->abc->Kp-1; a++) {
3930 	  for(b = 0; b < cm->abc->K; b++) {
3931 	    esc_vAA[v][(a * cm->abc->Kp) + b]  = FastPairScoreLeftOnlyDegenerate(cm->abc->K, cm->esc[v], leftAA[a], b);
3932 	  }
3933 	}
3934 	/* a is canonical, b is not canonical */
3935 	for(a = 0; a < cm->abc->K; a++) {
3936 	  for(b = cm->abc->K+1; b < cm->abc->Kp-1; b++) {
3937 	    esc_vAA[v][(a * cm->abc->Kp) + b]  = FastPairScoreRightOnlyDegenerate(cm->abc->K, cm->esc[v], rightAA[b], a);
3938 	  }
3939 	}
3940 	/* a is not canonical, b is not canonical */
3941 	for(a = cm->abc->K+1; a < cm->abc->Kp-1; a++) {
3942 	  for(b = cm->abc->K+1; b < cm->abc->Kp-1; b++) {
3943 	    esc_vAA[v][(a * cm->abc->Kp) + b]  = FastPairScoreBothDegenerate(cm->abc->K, cm->esc[v], leftAA[a], rightAA[b]);
3944 	  }
3945 	}
3946 	/* everything else, when either a or b is gap or missing data, stays IMPOSSIBLE */
3947       }
3948       break;
3949     default:
3950       esc_vAA[v] = NULL;
3951       break;
3952     }
3953   }
3954   for(a = 0; a < cm->abc->Kp; a++) {
3955     if(leftAA[a] != NULL)  free(leftAA[a]);
3956     if(rightAA[a] != NULL) free(rightAA[a]);
3957   }
3958   free(leftAA);
3959   free(rightAA);
3960   return esc_vAA;
3961 
3962  ERROR:
3963   cm_Fail("memory allocation error.");
3964   return NULL; /* NEVERREACHED */
3965 }
3966 
3967 
3968 /* Function: CP9_reconfig2sub()
3969  * EPN 10.16.06
3970  *
3971  * Purpose:  Given a CM Plan 9 HMM and a start position
3972  *           (spos) and end position (epos) that a sub CM models,
3973  *           reconfigure the HMM so that it can only start in the
3974  *           node that models spos (spos_nd) end in the node that
3975  *           models epos (epos_nd).
3976  *
3977  *           If we're reconfiguring a CP9 HMM that ONLY models the
3978  *           consensus columns spos to epos, then spos_nd == 1
3979  *           and epos_nd == hmm->M, but this is not necessarily true.
3980  *           We may be reconfiguring a CP9 HMM that models the
3981  *           full alignment including positions before and/or after
3982  *           spos and epos. In this case spos_nd == spos and
3983  *           epos_nd == epos;
3984  *
3985  * Args:     hmm         - the CP9 model w/ data-dep prob's valid
3986  *           spos        - first consensus column modelled by some original
3987  *                         full length, template CP9 HMM that 'hmm' models.
3988  *           epos        - final consensus column modelled by some original
3989  *                         CP9 HMM that 'hmm' models.
3990  *           spos_nd     - the node of 'hmm' that models spos.
3991  *                         (1 if 'hmm' only has (epos-spos+1) nodes
3992  *                         (spos if 'hmm' has a node for each column of original aln)
3993  *           epos_nd     - the node of the 'hmm' in that models epos.
3994  *                         (hmm->M if 'hmm' only has (epos-spos+1) nodes
3995  *                         (epos if 'hmm' has a node for each column of original aln)
3996  *           orig_phi    - the 2D phi array for the original CP9 HMM.
3997  * Return:   (void)
3998  *           HMM probabilities are modified.
3999  */
4000 void
CP9_reconfig2sub(CP9_t * hmm,int spos,int epos,int spos_nd,int epos_nd,double ** orig_phi)4001 CP9_reconfig2sub(CP9_t *hmm, int spos, int epos, int spos_nd,
4002 		 int epos_nd, double **orig_phi)
4003 {
4004   /* Make the necessary modifications. Since in cmalign --sub mode this
4005    * function will be called potentially once for each sequence, we
4006    * don't want to call CP9Logoddsify(), but rather only logoddsify
4007    * the parameters that are different.
4008    */
4009 
4010   /* Configure entry.
4011    * Exactly 3 ways to start, B->M_1 (hmm->begin[1]), B->I_0 (hmm->t[0][CTMI]),
4012    *                      and B->D_1 (hmm->t[0][CTMD])
4013    */
4014   /* prob of starting in M_spos is (1. - prob of starting in I_spos-1) as there is no D_spos-1 -> M_spos trans */
4015 
4016   if(spos > 1)
4017     {
4018       hmm->begin[spos_nd] = 1.-((orig_phi[spos-1][HMMINSERT] * (1. - hmm->t[spos-1][CTII])) +
4019 			        (orig_phi[spos  ][HMMDELETE] - (orig_phi[spos-1][HMMINSERT] * hmm->t[spos-1][CTID])));
4020       hmm->t[spos_nd-1][CTMI] =   (orig_phi[spos-1][HMMINSERT] * (1. - hmm->t[spos-1][CTII]));
4021       hmm->t[spos_nd-1][CTMD] =    orig_phi[spos  ][HMMDELETE] - (orig_phi[spos-1][HMMINSERT] * hmm->t[spos-1][CTID]);
4022       hmm->t[spos_nd-1][CTMM] = 0.; /* probability of going from B(M_0) to M_1 is begin[1] */
4023       hmm->t[spos_nd-1][CTMEL] = 0.; /* can't go to EL from B(M_0) */
4024       hmm->t[spos_nd-1][CTDM] = 0.; /* D_0 doesn't exist */
4025       hmm->t[spos_nd-1][CTDI] = 0.; /* D_0 doesn't exist */
4026       hmm->t[spos_nd-1][CTDD] = 0.; /* D_0 doesn't exist */
4027 
4028       hmm->bsc[spos_nd]       = Prob2Score(hmm->begin[1], 1.0);
4029 
4030       hmm->tsc[CTMM][spos_nd-1] = -INFTY; /* probability of going from B(M_0) to M_1 is begin[1] */
4031       hmm->tsc[CTMEL][spos_nd-1] = -INFTY;
4032       hmm->tsc[CTDM][spos_nd-1] = -INFTY; /* D_0 doesn't exist */
4033       hmm->tsc[CTDI][spos_nd-1] = -INFTY; /* D_0 doesn't exist */
4034       hmm->tsc[CTDD][spos_nd-1] = -INFTY; /* D_0 doesn't exist */
4035 
4036       hmm->tsc[CTMI][spos_nd-1] = Prob2Score(hmm->t[spos_nd-1][CTMI], 1.0);
4037       hmm->tsc[CTMD][spos_nd-1] = Prob2Score(hmm->t[spos_nd-1][CTMD], 1.0);
4038     }
4039 
4040   if(epos < hmm->M)
4041     {
4042       hmm->end[epos_nd]      = hmm->t[epos][CTMM] + hmm->t[epos][CTMD];
4043       hmm->t[epos_nd][CTDM] += hmm->t[epos][CTDD];
4044       hmm->t[epos_nd][CTIM] += hmm->t[epos][CTID];
4045       hmm->t[epos_nd][CTMM]  = 0.; /* M->E is actually end[M] */
4046       hmm->t[epos_nd][CTMEL]  = 0.;
4047       hmm->t[epos_nd][CTMD]  = 0.; /* D_M+1 doesn't exist */
4048       hmm->t[epos_nd][CTDD]  = 0.; /* D_M+1 doesn't exist */
4049       hmm->t[epos_nd][CTID]  = 0.; /* D_M+1 doesn't exist */
4050 
4051       hmm->esc[epos_nd]       = Prob2Score(hmm->end[epos_nd], 1.0);
4052       hmm->tsc[CTDM][epos_nd] = Prob2Score(hmm->t[epos_nd][CTDM], 1.0);
4053       hmm->tsc[CTIM][epos_nd] = Prob2Score(hmm->t[epos_nd][CTIM], 1.0);
4054       hmm->tsc[CTMM][epos_nd] = -INFTY; /* M->E is actually end[M] */
4055       hmm->tsc[CTMEL][epos_nd] = -INFTY;
4056       hmm->tsc[CTMD][epos_nd] = -INFTY; /* D_M+1 doesn't exist */
4057       hmm->tsc[CTDD][epos_nd] = -INFTY; /* D_M+1 doesn't exist */
4058       hmm->tsc[CTID][epos_nd] = -INFTY; /* D_M+1 doesn't exist */
4059     }
4060   hmm->flags |= CPLAN9_HASBITS;	/* raise the log-odds ready flag */
4061 
4062   return;
4063 }
4064 
4065 #if 0
4066 /* These two functions are not currently used, but could be useful for debugging
4067  * in the future */
4068 static void  debug_print_misc_sub_cm_info(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, CP9Map_t *orig_cp9map);
4069 static void  debug_sub_cm_check_all_trans(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap);
4070 
4071 /**************************************************************************
4072  * EPN 10.06.06
4073  * Function: debug_print_misc_sub_cm_info()
4074  **************************************************************************/
4075 static void
4076 debug_print_misc_sub_cm_info(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap, CP9Map_t *orig_cp9map)
4077 {
4078   int status;
4079   int orig_il1;
4080   int orig_il2;
4081   int orig_ir1;
4082   int orig_ir2;
4083   int orig_ss;
4084 
4085   char **nodetypes;
4086   char **sttypes;
4087   char **sides;
4088 
4089   int orig_n1_type;
4090   int side_idx;
4091 
4092   ESL_ALLOC(nodetypes, sizeof(char *) * 8);
4093   nodetypes[0] = "BIF";
4094   nodetypes[1] = "MATP";
4095   nodetypes[2] = "MATL";
4096   nodetypes[3] = "MATR";
4097   nodetypes[4] = "BEGL";
4098   nodetypes[5] = "BEGR";
4099   nodetypes[6] = "ROOT";
4100   nodetypes[7] = "END";
4101 
4102   ESL_ALLOC(sttypes, sizeof(char *) * 10);
4103   sttypes[0] = "D";
4104   sttypes[1] = "MP";
4105   sttypes[2] = "ML";
4106   sttypes[3] = "MR";
4107   sttypes[4] = "IL";
4108   sttypes[5] = "IR";
4109   sttypes[6] = "S";
4110   sttypes[7] = "E";
4111   sttypes[8] = "B";
4112   sttypes[9] = "EL";
4113 
4114   ESL_ALLOC(sides, sizeof(char *) * 3);
4115   sides[0] = "L";
4116   sides[1] = "R";
4117   sides[2] = "N";
4118 
4119 
4120   orig_il1 = submap->s2o_smap[1][0]; /* 1st of up to 2 states that maps to sub_cm's ROOT_IL */
4121   orig_il2 = submap->s2o_smap[1][1]; /* 2nd state that maps to sub_cm's ROOT_IL or -1 if only 1 maps*/
4122   orig_ir1 = submap->s2o_smap[2][0]; /* 1st of up to 2 states that maps to sub_cm's ROOT_IR */
4123   orig_ir2 = submap->s2o_smap[2][1]; /* 2nd state that maps to sub_cm's ROOT_IR or -1 if only 1 maps*/
4124 
4125   /* We ASSUME that ambiguities have been removed, i.e. if two insert states map to either ROOT_IL
4126    * or ROOT_IR, one of them has been detached. We exploit this knowledge.
4127    */
4128   if(orig_il2 != -1)
4129     {
4130       if(orig_cm->sttype[orig_il1+1] == E_st)
4131 	orig_il1 = orig_il2; /* orig_il1 was detached */
4132       else if(orig_cm->sttype[orig_il2+1] == E_st)
4133 	{
4134 	  /* do nothing */
4135 	}
4136       else
4137 	cm_Fail("ERROR, can't determine which state was detached in debug_print_misc_sub_cm_info\n");
4138     }
4139   if(orig_ir2 != -1)
4140     {
4141       if(orig_cm->sttype[orig_ir1+1] == E_st)
4142 	orig_ir1 = orig_ir2; /* orig_ir1 was detached */
4143       else if(orig_cm->sttype[orig_ir2+1] == E_st)
4144 	{
4145 	  /* do nothing */
4146 	}
4147       else
4148 	cm_Fail("ERROR, can't determine which state was detached in debug_print_misc_sub_cm_info\n");
4149     }
4150 
4151   /* Now orig_il1 and orig_ir1 map to the ONLY insert states that map to sub_cm
4152    * ROOT_IL and ROOT_IR respectively.
4153    */
4154   printf("10.16.06 IL1: %3d %4s %2s | start:   %3d | end:   %3d\n", orig_il1, nodetypes[(int) orig_cm->ndtype[orig_cm->ndidx[orig_il1]]], sttypes[(int) orig_cm->sttype[orig_il1]], submap->spos, submap->epos);
4155 
4156   if(sub_cm->ndtype[1] == BIF_nd)
4157     {
4158       orig_n1_type = 0;
4159     }
4160   else
4161     {
4162       orig_n1_type = orig_cm->ndtype[orig_cm->ndidx[(submap->s2o_smap[3][0])]];
4163     }
4164 
4165   if(orig_n1_type == MATP_nd && sub_cm->ndtype[1] != MATR_nd)
4166     {
4167       if(orig_cp9map->nd2lpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]] == submap->spos)
4168 	side_idx = 0;
4169       else if(orig_cp9map->nd2rpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]] == submap->spos)
4170 	side_idx = 1;
4171       else
4172 	cm_Fail("ERROR MATP confusion! orig_cm node: %d | left: %d | right: %d | submap->spos: %d\n", (orig_cm->ndidx[(submap->s2o_smap[3][0])]), (orig_cp9map->nd2lpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]]), (orig_cp9map->nd2rpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]]), submap->spos);
4173     }
4174   else if (orig_n1_type == MATP_nd && sub_cm->ndtype[1] == MATR_nd)
4175     {
4176       if(orig_cp9map->nd2lpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]] == submap->epos)
4177 	side_idx = 0;
4178       else if(orig_cp9map->nd2rpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]] == submap->epos)
4179 	side_idx = 1;
4180       else
4181 	cm_Fail("ERROR MATP confusion! orig_cm node: %d | left: %d | right: %d | submap->spos: %d\n", (orig_cm->ndidx[(submap->s2o_smap[3][0])]), (orig_cp9map->nd2lpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]]), (orig_cp9map->nd2rpos[orig_cm->ndidx[(submap->s2o_smap[3][0])]]), submap->spos);
4182     }
4183   else
4184     {
4185       side_idx = 2;
4186     }
4187   printf("10.16.06 IR1: %3d %4s %2s | sub-n1: %4s | orig-n1: %4s%1s | case:   ", orig_ir1, nodetypes[(int) orig_cm->ndtype[orig_cm->ndidx[orig_ir1]]], sttypes[(int) orig_cm->sttype[orig_ir1]], nodetypes[(int) sub_cm->ndtype[1]], nodetypes[(int) orig_n1_type], sides[side_idx]);
4188 
4189   /* figure out 'case' of ROOT transitions */
4190   orig_ss = submap->s2o_smap[3][0]; /* orig_ss is the 1 (of possibly 2) orig_cm states that map to the first
4191 				     * state in sub_cm node 1 (sub_cm state 3)
4192 				     */
4193   if((orig_il1 < orig_ir1) && (orig_ir1 < orig_ss))
4194     printf("1A\n");
4195   if((orig_il1 < orig_ss) && (orig_ss < orig_ir1))
4196     printf("1B\n");
4197   if((orig_ss < orig_il1) && (orig_il1 < orig_ir1))
4198     printf("1C\n");
4199 
4200   if((orig_ir1 < orig_il1) && (orig_il1 < orig_ss))
4201     printf("2A\n");
4202   if((orig_ir1 < orig_ss) && (orig_ss < orig_il1))
4203     printf("2B\n");
4204   if((orig_ss < orig_ir1) && (orig_ir1 < orig_il1))
4205     printf("2C\n");
4206 
4207   printf("\n");
4208 
4209   /* Begin 10.17.06 info */
4210   int ilmap, irmap;
4211   int ildual, irdual;
4212   int iloff, iroff;
4213   int other_insert_il;
4214   int other_insert_ir;
4215 
4216   if(orig_cm->sttype[orig_il1] == IL_st)
4217     ilmap = 0; /* sides[0] = "L" */
4218   else
4219     ilmap = 1; /* sides[1] = "R" */
4220   if(orig_cm->sttype[orig_ir1] == IL_st)
4221     irmap = 0; /* sides[0] = "L" */
4222   else
4223     irmap = 1; /* sides[1] = "R" */
4224 
4225   if(orig_cm->ndtype[orig_cm->ndidx[orig_il1]] == MATP_nd ||
4226      orig_cm->ndtype[orig_cm->ndidx[orig_il1]] == ROOT_nd)
4227     ildual = TRUE;
4228   else
4229     ildual = FALSE;
4230   if(orig_cm->ndtype[orig_cm->ndidx[orig_ir1]] == MATP_nd ||
4231      orig_cm->ndtype[orig_cm->ndidx[orig_ir1]] == ROOT_nd)
4232     irdual = TRUE;
4233   else
4234     irdual = FALSE;
4235 
4236   iloff = -1;
4237   if(ildual == TRUE)
4238     {
4239       /* check if other insert state in orig_cm node that has insert that
4240        * maps to sub_cm ROOT_IL maps to a state in the sub_cm
4241        */
4242       if(ilmap == 0) /* ROOT_IL maps to a IL */
4243 	other_insert_il = orig_il1 + 1;
4244       else           /* ROOT_IL maps to a IR */
4245 	other_insert_il = orig_il1 - 1;
4246       if(submap->o2s_smap[other_insert_il][0] == -1 &&
4247 	 submap->o2s_smap[other_insert_il][1] == -1)
4248 	iloff = TRUE;
4249       else
4250 	iloff = FALSE;
4251     }
4252 
4253   iroff = -1;
4254   if(irdual == TRUE)
4255     {
4256       if(irmap == 0) /* ROOT_IR maps to a IL */
4257 	other_insert_ir = orig_ir1 + 1;
4258       else           /* ROOT_IR maps to a IR */
4259 	other_insert_ir = orig_ir1 - 1;
4260       if(submap->o2s_smap[other_insert_ir][0] == -1 &&
4261 	 submap->o2s_smap[other_insert_ir][1] == -1)
4262 	iroff = TRUE;
4263       else
4264 	iroff = FALSE;
4265     }
4266 
4267   CMEmitMap_t *orig_emap;         /* consensus emit map for the original, template CM */
4268   int other_cc_il, other_cc_ir;
4269   orig_emap = CreateEmitMap(orig_cm);
4270   other_cc_il = -1;
4271   other_cc_ir = -1;
4272   if(ildual == TRUE)
4273     {
4274       if(orig_cm->sttype[other_insert_il] == IL_st) /* sub ROOT_IL maps to IR, other maps to IL */
4275  	{
4276 	  other_cc_il = orig_emap->lpos[orig_cm->ndidx[other_insert_il]] + 1;
4277 	  if(other_cc_il > submap->spos)
4278 	    cm_Fail("ERROR FUNKY\n");
4279 	  ildual = 4;
4280 	}
4281       else /* ROOT_IL maps to IL, other maps to IR */
4282 	{
4283 	  other_cc_il = orig_emap->rpos[orig_cm->ndidx[other_insert_il]] - 1;
4284 	  if(other_cc_il < submap->epos)
4285 	    ildual = 1;
4286 	  if(other_cc_il == submap->epos)
4287 	    ildual = 2;
4288 	  if(other_cc_il > submap->epos)
4289 	    ildual = 3;
4290 	}
4291     }
4292   /*printf("10.17.06 other_insert_il: %d other_cc_il: %d submap->spos: %d submap->epos: %d ildual: %d\n", other_insert_il, other_cc_il, submap->spos, submap->epos, ildual);*/
4293   if(irdual == TRUE)
4294     {
4295       if(orig_cm->sttype[other_insert_ir] == IL_st) /* sub ROOT_IR maps to IR, other maps to IL */
4296 	{
4297 	  other_cc_ir = orig_emap->lpos[orig_cm->ndidx[other_insert_ir]] + 1;
4298 	  if(other_cc_ir > submap->spos)
4299 	    irdual = 1;
4300 	  if(other_cc_ir == submap->spos)
4301 	    irdual = 2;
4302 	  if(other_cc_ir < submap->spos)
4303 	    irdual = 3;
4304 	}
4305       else /* ROOT_IR maps to IL, other maps to IR */
4306 	{
4307 	  other_cc_ir = orig_emap->rpos[orig_cm->ndidx[other_insert_ir]] - 1;
4308 	  if(other_cc_ir < submap->epos)
4309 	    cm_Fail("ERROR FUNKY\n");
4310 	  irdual = 4;
4311 	}
4312     }
4313   /*printf("10.17.06 other_insert_ir: %d other_cc_ir: %d submap->spos: %d submap->epos: %d irdual: %d\n", other_insert_ir, other_cc_ir, submap->spos, submap->epos, irdual);*/
4314 
4315   printf("10.17.06 ilmap: %5s ildual: %2d iloff: %2d irmap: %5s irdual: %2d iroff: %2d subn1: %4s orign1: %4s%1s\n", sides[ilmap], ildual, iloff, sides[irmap], irdual, iroff, nodetypes[(int) sub_cm->ndtype[1]], nodetypes[(int) orig_n1_type], sides[side_idx]);
4316 
4317   int start_flag;
4318   int v;
4319   int vend;
4320 
4321   if(orig_il1 < orig_ir1)
4322     {
4323       v = orig_il1;
4324       vend = orig_ir1;
4325     }
4326   else
4327     {
4328       v = orig_ir1;
4329       vend = orig_il1;
4330     }
4331   start_flag = 0;
4332   for(; v <= vend; v++)
4333     if(orig_cm->sttype[v] == S_st)
4334       start_flag = 1;
4335   return;
4336 
4337  ERROR:
4338   cm_Fail("Memory allocation error.");
4339 }
4340 
4341 /**************************************************************************
4342  * EPN 11.01.06
4343  * debug_sub_cm_check_all_trans()
4344  */
4345 void
4346 debug_sub_cm_check_all_trans(CM_t *orig_cm, CM_t *sub_cm, CMSubMap_t *submap)
4347 {
4348   int status;
4349   int nd;
4350   int v;
4351   int y, yoffset;
4352   float sum, ndsum;
4353   int orig_nd, orig_v1, orig_v2;
4354 
4355   char **nodetypes;
4356   ESL_ALLOC(nodetypes, sizeof(char *) * 8);
4357   nodetypes[0] = "BIF";
4358   nodetypes[1] = "MATP";
4359   nodetypes[2] = "MATL";
4360   nodetypes[3] = "MATR";
4361   nodetypes[4] = "BEGL";
4362   nodetypes[5] = "BEGR";
4363   nodetypes[6] = "ROOT";
4364   nodetypes[7] = "END";
4365 
4366   char **sttypes;
4367   ESL_ALLOC(sttypes, sizeof(char *) * 10);
4368   sttypes[0] = "D";
4369   sttypes[1] = "MP";
4370   sttypes[2] = "ML";
4371   sttypes[3] = "MR";
4372   sttypes[4] = "IL";
4373   sttypes[5] = "IR";
4374   sttypes[6] = "S";
4375   sttypes[7] = "E";
4376   sttypes[8] = "B";
4377   sttypes[9] = "EL";
4378 
4379   for(nd = 0; nd < sub_cm->nodes; nd++)
4380     {
4381       sum = 0.;
4382       if(sub_cm->ndtype[nd] != END_nd && sub_cm->ndtype[nd] != BIF_nd)
4383 	{
4384 	  ndsum = 0.;
4385 	  v = sub_cm->nodemap[nd];
4386 	  while(sub_cm->ndidx[v] == nd && sub_cm->sttype[v] != IL_st && sub_cm->sttype[v] != IR_st)
4387 	    {
4388 	      sum = 0.;
4389 	      for(y = sub_cm->cfirst[v]; y < sub_cm->cfirst[v]+sub_cm->cnum[v]; y++)
4390 		{
4391 		  yoffset = y - sub_cm->cfirst[v];
4392 		  printf("\t\tsub_cm->t[%3d][%3d]: %f\n", v, yoffset, sub_cm->t[v][yoffset]);
4393 		  sum    += sub_cm->t[v][yoffset];
4394 		}
4395 	      orig_v1  = submap->s2o_smap[v][0];
4396 	      orig_v2  = submap->s2o_smap[v][1];
4397 	      orig_nd = orig_cm->ndidx[orig_v1];
4398 	      if(sub_cm->ndtype[nd+1] != END_nd)
4399 		{
4400 		  if(orig_v2 != -1)
4401 		    printf("sum t[%4d %4s %2s %2s] nd: %4d: %f\n", v, nodetypes[(int) orig_cm->ndtype[orig_nd]], sttypes[(int) orig_cm->sttype[orig_v1]], sttypes[(int) orig_cm->sttype[orig_v2]], nd, sum);
4402 		  else
4403 		    printf("sum t[%4d %4s %2s   ] nd: %4d: %f\n", v, nodetypes[(int) orig_cm->ndtype[orig_nd]], sttypes[(int) orig_cm->sttype[orig_v1]], nd, sum);
4404 		}
4405 	      ndsum += sum;
4406 	      v++;
4407 	    }
4408 	  if(sub_cm->ndtype[nd+1] != END_nd)
4409 	    printf("\tndsum t nd (%4s): %4d: %f\n", nodetypes[(int) orig_cm->ndtype[orig_nd]], nd, ndsum);
4410 	}
4411     }
4412   free(nodetypes);
4413   free(sttypes);
4414   return;
4415 
4416  ERROR:
4417   cm_Fail("Memory allocation error.");
4418 }
4419 #endif
4420