1 /* cm_modelconfig.c
2  * SRE, Wed May  8 14:30:38 2002 [St. Louis]
3  *
4  * Configuring a covariance model. The desideratum is for a CM to be
5  * configured exactly once. This is done via the cm_Configure() or
6  * cm_ConfigureSub() function. Configuration consists of initializing
7  * the matrices and data structures that are considered always valid
8  * in a CM, but are not read during CM input from a file, and putting
9  * the model into local mode if necessary (CMs are always read in from
10  * a file with global parameters). When a CM is about to be
11  * configured, it must be 'non-configured', which means all of the
12  * data not read upon CM input from a file must not be present.
13  *
14  * All functions that aid in the configuration process (and actually
15  * modify the CM, thus removing its 'non-configured' status) are
16  * static local functions in this file. This is by design - we want
17  * the only way to configure a CM to be through cm_Configure() or
18  * cm_ConfigureSub(), which, again, will happen exactly once per
19  * model. By doing this we are limiting the possible execution paths
20  * for model configuration to try and avoid the case that some
21  * execution paths (of which there could be many due to the various
22  * command-line options of Infernal applications) may screw something
23  * up in the model.
24  *
25  */
26 
27 #include "esl_config.h"
28 #include "p7_config.h"
29 #include "config.h"
30 
31 #include <string.h>
32 
33 #include "easel.h"
34 #include "esl_alphabet.h"
35 #include "esl_stack.h"
36 #include "esl_vectorops.h"
37 
38 #include "hmmer.h"
39 
40 #include "infernal.h"
41 
42 static void  cm_localize(CM_t *cm, float p_internal_entry, float p_internal_exit);
43 static  int  cp9_sw_config(CP9_t *hmm, float pentry, float pexit, int do_match_local_cm, int first_cm_ndtype);
44 static  int  cp9_EL_local_ends_config(CP9_t *cp9, CM_t *cm, char *errbuf);
45 static void  cp9_renormalize_exits(CP9_t *hmm);
46 
47 /* Function: cm_Configure()
48  * Date:     EPN, Thu Jan  4 06:36:09 2007
49  *           EPN, Fri Dec  9 15:27:29 2011 [updated prior to 1.1 release]
50  *
51  * Purpose:  Configure a CM. Configuration options are in
52  *           cm->config_opts.
53  *
54  *           CM data structures that are always built or initialized:
55  *
56  *           - emitmap (if not yet built)
57  *
58  *           - all HMM banded matrices for the CM (hb_mx, hb_omx, hb_emx,
59  *             hb_shmx, trhb_mx, trhb_omx, trhb_emx, trhb_shmx)
60  *
61  *           - CP9 HMMs (cp9, Lcp9, Rcp9, Tcp9)
62  *
63  *           - CP9 associated data structures (cp9map, cp9b, cp9_mx,
64  *             cp9_bmx)
65  *
66  *           - maximum-likelihood P7 HMM (mlp7; filter p7 HMM fp7 is
67  *             read from the CM file).
68  *
69  *           Optional configuration:
70  *
71  *           - CM and cp9 HMMs (built in this function) are put
72  *             into local mode if cm->config_opts & CM_CONFIG_LOCAL.
73  *
74  *           - QDBs and W are recalculated if cm->qdbinfo isn't
75  *             already set (cm->qdbinfo->setby =
76  *             CM_QDBINFO_SETBY_INIT) or if cm->config_opts &
77  *             CM_CONFIG_QDB.
78  *
79  *           - W is also recalculated if cm->config_opts &
80  *             CM_CONFIG_W.
81  *
82  *           - all non-banded matrices for the CM (nbmx, onbmx,
83  *             trnbmx, tronbmx, enbmx, trenbmx, shnbmx, trshnbmx)
84  *
85  *           - the scan matrix (smx) is created if cm->config_opts &
86  *             CM_CONFIG_SMX
87  *
88  *
89  * Args:     cm             - the covariance model
90  *           errbuf         - for error messages
91  *           W_from_cmdline - W set on cmdline, -1 if W not set on cmdline (usually -1)
92  *
93  * Returns:   <eslOK> on success.
94  *            <eslEINVAL> on contract violation.
95  *            <eslEMEM> on memory allocation error.
96  */
97 int
cm_Configure(CM_t * cm,char * errbuf,int W_from_cmdline)98 cm_Configure(CM_t *cm, char *errbuf, int W_from_cmdline)
99 {
100   return cm_ConfigureSub(cm, errbuf, W_from_cmdline, NULL, NULL);
101 }
102 
103 /* Function: cm_ConfigureSub()
104  * Date:     EPN, Thu Jan  4 06:36:09 2007
105  *           EPN, Fri Dec  9 15:27:29 2011 [updated prior to 1.1 release]
106  *
107  * Purpose:  See cm_Configure()'s Purpose above. This function
108  *           actually does the work for cm_Configure(). Both functions
109  *           exist so we can take two additional parameters in the
110  *           rare case that we're configuring a model that is a
111  *           sub-model of another CM. If <sub_mother_cm> and
112  *           <sub_mother_map> are non-NULL, <cm> is a sub CM
113  *           constructed from <sub_mother_cm>. In this case, we're
114  *           doing alignment and are constructing a new sub CM for
115  *           each target sequence so running time should be
116  *           minimized. Special functions for building the CP9 HMMs
117  *           and for logoddsifying the model are called that are
118  *           faster than the normal versions b/c they can just copy
119  *           some of the parameters of the mother model instead of
120  *           calc'ing them.
121  *
122  * Args:     cm             - the covariance model
123  *           errbuf         - for error messages
124  *           W_from_cmdline - W set on cmdline, -1 if W not set on cmdline (usually -1)
125  *           mother_cm      - if non-NULL, <cm> is a sub CM construced from
126  *                            <mother_cm>. In this case we use <mother_map>
127  *                            to help streamline the two steps that dominate the
128  *                            running time of this function (b/c speed is an issue):
129  *                            building a cp9 HMM, and logoddsifying the model.
130  *           mother_map     - must be non-NULL iff <mother_cm> is non-NULL, the
131  *                            map from <cm> to <mother_cm>.
132  *
133  * Returns:   <eslOK> on success.
134  *            <eslEINVAL> on contract violation.
135  *            <eslEMEM> on memory allocation error.
136  *            <eslFAIL> on other failure (errbuf filled)
137  */
138 int
cm_ConfigureSub(CM_t * cm,char * errbuf,int W_from_cmdline,CM_t * mother_cm,CMSubMap_t * mother_map)139 cm_ConfigureSub(CM_t *cm, char *errbuf, int W_from_cmdline, CM_t *mother_cm, CMSubMap_t *mother_map)
140 {
141   int   status;
142   float swentry, swexit;
143   int   have_mother;
144 
145   have_mother = (mother_cm != NULL && mother_map != NULL) ? TRUE : FALSE;
146 
147   /* contract check */
148   if(mother_cm != NULL && mother_map == NULL)            ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, mother_cm != NULL but mother_map == NULL (both must be NULL or both non-NULL).");
149   if(mother_cm == NULL && mother_map != NULL)            ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, mother_cm == NULL but mother_map != NULL (both must be NULL or both non-NULL).");
150   if(have_mother && (cm->config_opts & CM_CONFIG_LOCAL)) ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, configuring a sub CM, but CM_CONFIG_LOCAL config flag up.");
151   if((  cm->config_opts & CM_CONFIG_HMMLOCAL) &&
152      (! (cm->config_opts & CM_CONFIG_LOCAL)))    ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, cp9 is to be configured locally, but CM is not");
153   if((  cm->config_opts & CM_CONFIG_HMMEL) &&
154      (! (cm->config_opts & CM_CONFIG_HMMLOCAL))) ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, cp9 is to be configured without local entries exists but with ELs on");
155   if((cm->config_opts & CM_CONFIG_TRUNC) && (cm->config_opts & CM_CONFIG_SUB)) ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, incompatible configuration options: CM_CONFIG_TRUNC and CM_CONFIG_SUB");
156   if((cm->config_opts & CM_CONFIG_LOCAL) && (cm->config_opts & CM_CONFIG_SUB)) ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, incompatible configuration options: CM_CONFIG_LOCAL and CM_CONFIG_SUB");
157 
158   /* validate the CM */
159   if((status = cm_Validate(cm, 0.0001, errbuf)) != eslOK) return status;
160 
161   /* verify we're not already configured */
162   if((status = cm_nonconfigured_Verify(cm, errbuf)) != eslOK) { status = eslEINCOMPAT; return status; }
163   /* cm_nonconfigured_Verify() checked that everything that should be
164    * NULL in <cm> is NULL, so we don't have to check below.
165    */
166 
167   /* Build the emitmap, if necessary */
168   if(cm->emap == NULL) cm->emap = CreateEmitMap(cm);
169 
170   /* Define W and set up query dependent bands. This is confusing. We need
171    * the user to be able to set W on the command line if they want, but W
172    * and the QDBs used to define the CM_SCAN_MX (created later) are dependent
173    * on one another.
174    *
175    * If W was set on the command line (W_from_cmdline != -1):
176    * Set W, and calculate QDBs if nec, without redefining W.
177    *
178    * Else, if W was not set on the command line (W_from_cmdline ==
179    * -1): Calculate QDBs if nec, and redefine W using beta=cm->beta_W,
180    * which may or may not have been changed by caller from what BETA_W
181    * was in the CM file.
182    *
183    * Then, W (regardless of how it was set) is enforced when the scan
184    * matrix is created, i.e. no dmin/dmax values from cm->qdbinfo
185    * that exceed W will be treated as begin equal to W.
186    */
187   if(W_from_cmdline != -1) {
188     cm->W       = W_from_cmdline;
189     cm->W_setby = CM_W_SETBY_CMDLINE;
190   }
191   /* If nec, set up the query dependent bands (it's important to do this before creating the ml p7 HMM (which needs to know cm->W)) */
192   if((cm->config_opts & CM_CONFIG_QDB)    ||
193      (cm->config_opts & CM_CONFIG_W_BETA) ||
194      (cm->qdbinfo->setby == CM_QDBINFO_SETBY_INIT)) {
195     if(W_from_cmdline != -1) {
196       if((status = CalculateQueryDependentBands(cm, errbuf, cm->qdbinfo,
197 						ESL_MIN(cm->qdbinfo->beta1, cm->qdbinfo->beta2), NULL, /* don't redefine W, we just set it above as W_from_cmdline */
198 						NULL, NULL, NULL)) != eslOK) return status;
199     }
200     else {
201       if((status = CalculateQueryDependentBands(cm, errbuf, cm->qdbinfo,
202 						cm->beta_W, &(cm->W), /* do redefine W, as that calc'ed with cm->beta_W */
203 						NULL, NULL, NULL)) != eslOK) return status;
204       cm->W_setby = CM_W_SETBY_BANDCALC;
205     }
206   }
207 
208   /* Allocate the HMM banded matrices, these are originally
209    * very small and only grown as needed. Optionally create
210    * the truncated alignment matrices.
211    */
212   cm->hb_mx    = cm_hb_mx_Create(cm->M);
213   cm->hb_omx   = cm_hb_mx_Create(cm->M);
214   cm->hb_emx   = cm_hb_emit_mx_Create(cm);
215   cm->hb_shmx  = cm_hb_shadow_mx_Create(cm);
216   if(cm->config_opts & CM_CONFIG_TRUNC) {
217     cm->trhb_mx   = cm_tr_hb_mx_Create(cm);
218     cm->trhb_omx  = cm_tr_hb_mx_Create(cm);
219     cm->trhb_emx  = cm_tr_hb_emit_mx_Create(cm);
220     cm->trhb_shmx = cm_tr_hb_shadow_mx_Create(cm);
221   }
222 
223   /* If nec, create the nonbanded matrices (usually we won't, these get real big) */
224   if(cm->config_opts & CM_CONFIG_NONBANDEDMX) {
225     cm->nb_mx     = cm_mx_Create(cm->M);
226     cm->nb_omx    = cm_mx_Create(cm->M);
227     cm->nb_emx    = cm_emit_mx_Create(cm);
228     cm->nb_shmx   = cm_shadow_mx_Create(cm);
229     if(cm->config_opts & CM_CONFIG_TRUNC) {
230       cm->trnb_mx   = cm_tr_mx_Create(cm);
231       cm->trnb_omx  = cm_tr_mx_Create(cm);
232       cm->trnb_emx  = cm_tr_emit_mx_Create(cm);
233       cm->trnb_shmx = cm_tr_shadow_mx_Create(cm);
234     }
235   }
236 
237   /* If nec, create the scan matrix and truncated scan matrix */
238   /* (we could the check size of matrices first, return error if too big, but don't currently) */
239   if(cm->config_opts & CM_CONFIG_SCANMX) {
240     if((status = cm_scan_mx_Create(cm, errbuf, TRUE, TRUE, &(cm->smx)))      != eslOK) return status;
241   }
242   if(cm->config_opts & CM_CONFIG_TRSCANMX) {
243     if((status = cm_tr_scan_mx_Create(cm, errbuf, TRUE, TRUE, &(cm->trsmx))) != eslOK) return status;
244   }
245 
246   /* Build the CP9 HMM and associated data. It's important
247    * to do this before setting up CM for local mode.
248    */
249   if(have_mother) {
250     if((status = sub_build_cp9_hmm_from_mother(cm, errbuf, mother_cm, mother_map, &(cm->cp9), &(cm->cp9map), FALSE, 0.0001, 0)) != eslOK) return status;
251   }
252   else {
253     if((status = build_cp9_hmm(cm, errbuf, FALSE, 0.0001, 0, &(cm->cp9), &(cm->cp9map))) != eslOK) return status;
254   }
255   cm->cp9b = AllocCP9Bands(cm->M, cm->cp9->M);
256   /* create the CP9 matrices, we init to 1 row, which is tiny so it's okay
257    * that we have two of them, we only grow them as needed, cp9_bmx is
258    * only needed if we're doing Forward -> Backward -> Posteriors.
259    */
260   cm->cp9_mx  = CreateCP9Matrix(1, cm->cp9->M);
261   cm->cp9_bmx = CreateCP9Matrix(1, cm->cp9->M);
262   cm->flags |= CMH_CP9; /* raise the CP9 flag */
263 
264   /* Configure for truncated search/alignment. */
265   if(cm->config_opts & CM_CONFIG_TRUNC) {
266     /* (1) Define truncated alignment penalty probabilities (cm->trp).
267      * 'FALSE' informs the function not to ignore inserts, that is to
268      * allow truncated begins into insert states. This is the
269      * hard-coded behavior. We'd only set it to TRUE if we were
270      * testing the code, because in that case a validation is
271      * performed that isn't possible with truncated begins into
272      * inserts allowed.
273      */
274     if((cm->trp = cm_tr_penalties_Create(cm, FALSE, errbuf)) == NULL) ESL_FAIL(eslFAIL, errbuf, "couldn't create truncation penalties for the CM");
275     /* cm_tr_penalties_Dump(stdout, cm, cm->trp); */
276 
277     /* (2) Setup Lcp9, Rcp9, Tcp9 CP9 HMMs
278      * Clone the globally configured CP9 HMM cm->cp9 before its put
279      * into local mode into each of cm->Lcp9, cm->Rcp9, cm->Tcp9 and
280      * configure them for their specific mode of truncated alignment.
281      * cm->cp9 is not yet logoddsified and so bit scores will not
282      * be copied into Lcp9, Rcp9, Tcp9. This is important because
283      * we want to be as efficient as possible, and only want to
284      * logoddsify each cp9 exactly once, which will occur after
285      * they've been configured into their specific locality mode.
286      */
287     if((cm->Lcp9 = cp9_Clone(cm->cp9)) == NULL) ESL_FAIL(eslFAIL, errbuf, "couldn't clone cm->cp9 to get cm->Lcp9");
288     if((cm->Rcp9 = cp9_Clone(cm->cp9)) == NULL) ESL_FAIL(eslFAIL, errbuf, "couldn't clone cm->cp9 to get cm->Rcp9");
289     if((cm->Tcp9 = cp9_Clone(cm->cp9)) == NULL) ESL_FAIL(eslFAIL, errbuf, "couldn't clone cm->cp9 to get cm->Tcp9");
290 
291     /* L mode alignment, 3' truncation: begin into node 1, equiprobable ends */
292     swentry = 0.;
293     swexit  = ((float) cm->cp9->M - 1.) / (float) cm->cp9->M;
294     cp9_sw_config(cm->Lcp9, swentry, swexit, FALSE, cm->ndtype[1]); /* FALSE: let I_0, D_1, I_M be reachable */
295     /* R mode alignment, 5' truncation: equiprobable begins, end out of node M */
296     swentry = ((float) cm->cp9->M - 1.) / (float) cm->cp9->M;
297     swexit  = 0.;
298     cp9_sw_config(cm->Rcp9, swentry, swexit, FALSE, cm->ndtype[1]); /* FALSE: let I_0, D_1, I_M be reachable */
299     /* T mode alignment, 5' and 3' truncation: equiprobable begins, equiprobable ends */
300     swentry = ((float) cm->cp9->M - 1.) / (float) cm->cp9->M;
301     swexit  = ((float) cm->cp9->M - 1.) / (float) cm->cp9->M;
302     cp9_sw_config(cm->Tcp9, swentry, swexit, FALSE, cm->ndtype[1]); /* FALSE: let I_0, D_1, I_M be reachable */
303 
304     cm->flags |= CMH_CP9_TRUNC;
305     /* don't logoddsify Lcp9, Rcp9, Tcp9 yet, wait until the end of
306      * the function, after we potentially turn on EL local ends below
307      */
308   }
309 
310   /* Configure the CM and cm->cp9 for local alignment and cm->Lcp9,
311    * cm->Rcp9, cm->Tcp9 for EL-type local ends, if necessary */
312   if(cm->config_opts & CM_CONFIG_LOCAL) {
313     cm_localize(cm, cm->pbegin, cm->pend);
314     if(cm->config_opts & CM_CONFIG_HMMLOCAL) { /* contract enforced CM_CONFIG_HMMLOCAL only raised if CM_CONFIG_LOCAL raised */
315       cp9_sw_config(cm->cp9, cm->pbegin, cm->pbegin, FALSE, cm->ndtype[1]); /* FALSE: let I_0, D_1, I_M be reachable */
316       /* set up EL-type local ends, if necessary */
317       if(cm->config_opts & CM_CONFIG_HMMEL) {
318 	if((status = cp9_EL_local_ends_config(cm->cp9, cm, errbuf)) != eslOK) return status;
319 	if(cm->flags & CMH_CP9_TRUNC) {
320 	  if((status = cp9_EL_local_ends_config(cm->Lcp9, cm, errbuf)) != eslOK) return status;
321 	  if((status = cp9_EL_local_ends_config(cm->Rcp9, cm, errbuf)) != eslOK) return status;
322 	  if((status = cp9_EL_local_ends_config(cm->Tcp9, cm, errbuf)) != eslOK) return status;
323 	}
324       }
325     }
326   }
327 
328   /* Possibly configure cm->cp9 for submodel alignment (contract
329      enforced that if CM_CONFIG_SUB, CM_CONFIG_LOCAL and
330      CM_CONFIG_TRUNC must both be FALSE) */
331   if(cm->config_opts & CM_CONFIG_SUB) {
332     swentry= ((cm->cp9->M)-1.)/cm->cp9->M; /* all start pts equiprobable, including 1 */
333     swexit = ((cm->cp9->M)-1.)/cm->cp9->M; /* all end   pts equiprobable, including M */
334     cp9_sw_config(cm->cp9, swentry, swexit, FALSE, cm->ndtype[1]); /* FALSE: let I_0, D_1, I_M be reachable */
335   }
336 
337   /* We need to ensure that cm->el_selfsc * W >= IMPOSSIBLE
338    * (cm->el_selfsc is the score for an EL self transition) This is
339    * done because we potentially multiply cm->el_selfsc * W, and add
340    * that to IMPOSSIBLE.
341    */
342   if((cm->el_selfsc * cm->W) < IMPOSSIBLE) {
343     cm->el_selfsc  = (IMPOSSIBLE / (cm->W+1));
344     cm->iel_selfsc = -INFTY;
345   }
346 
347   /* Compute log odds scores. This will create cm->cmcons as well, which requires scores. */
348   if(have_mother) {
349     if((status = SubCMLogoddsify(cm, errbuf, mother_cm, mother_map)) != eslOK) return status;
350   }
351   else {
352     if((status = CMLogoddsify(cm)) != eslOK) ESL_FAIL(status, errbuf, "problem logodisfying CM");
353   }
354   CP9Logoddsify(cm->cp9);
355   if(cm->flags & CMH_CP9_TRUNC) {
356     CP9Logoddsify(cm->Lcp9);
357     CP9Logoddsify(cm->Rcp9);
358     CP9Logoddsify(cm->Tcp9);
359   }
360 
361   /* Finally, build the ml p7 HMM, which requires cm->cmcons */
362   if((status = cm_cp9_to_p7(cm, cm->cp9, errbuf)) != eslOK) return status;
363 
364   /*debug_print_cm_params(stdout, cm);
365     debug_print_cp9_params(stdout, cm->cp9, TRUE);
366     debug_print_cp9_params(stdout, cm->Lcp9, TRUE);
367     debug_print_cp9_params(stdout, cm->Rcp9, TRUE);
368     debug_print_cp9_params(stdout, cm->Tcp9, TRUE);*/
369 
370   cm->flags |= CM_IS_CONFIGURED;
371 
372   return eslOK;
373 }
374 
375 
376 /* Function: cm_CalculateLocalBeginProbs()
377  * Incept:   EPN, Fri Dec  9 05:20:35 2011
378  *
379  * Purpose:
380  *
381  *           Calculate local begin probabilities. The transitions in
382  *           <t> should be for a CM in global mode, not local mode. By
383  *           specifying <t> as not necessarily equal to <cm->t>, we
384  *           can calculate local begin probs for a model already in
385  *           local mode.
386  *
387  * Args:     cm               - the covariance model
388  *           p_internal_start - prob mass to spread for local begins
389  *           t                - [0..M-1][0..MAXCONNECT-1] transition probabilities (not necessarily cm->t)
390  *           begin            - [0..M-1] standard local begin probs to set
391  *
392  * Returns:  eslOK on success
393  *           eslEMEM if out of memory
394  */
395 
396 int
cm_CalculateLocalBeginProbs(CM_t * cm,float p_internal_start,float ** t,float * begin)397 cm_CalculateLocalBeginProbs(CM_t *cm, float p_internal_start, float **t, float *begin)
398 {
399   int nd;			/* counter over nodes */
400   int nstarts;			/* number of possible internal starts */
401   float p;                      /* p_internal_start / nstarts */
402 
403   /* Count "internal" nodes: MATP, MATL, MATR, and BIF nodes.
404    * Ignore all start nodes, and also node 1 (which is always the
405    * "first" node and gets an entry prob of 1-p_internal_start).
406    */
407   nstarts = 0;
408   for (nd = 2; nd < cm->nodes; nd++) {
409     if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
410     	cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BIF_nd)
411       nstarts++;
412   }
413 
414   /* Set begin probs (standard local begins) */
415   esl_vec_FSet(begin, cm->M, 0.);
416   /* Node 1 gets prob 1-p_internal_start. */
417   begin[cm->nodemap[1]] = 1.-p_internal_start;
418   /* Remaining nodes share p_internal_start. */
419   p = p_internal_start / (float) nstarts;
420   for (nd = 2; nd < cm->nodes; nd++) {
421     if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
422     	cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BIF_nd)
423       begin[cm->nodemap[nd]] = p;
424   }
425   return eslOK;
426 }
427 
428 /* Function: cm_localize()
429  * Incept:   EPN, Tue Nov 29 13:46:29 2011 [updated]
430  *
431  * Purpose:  Configure a CM for local alignment by spreading
432  *           <p_internal_start> local entry probability evenly across
433  *           all internal nodes, and by spreading <p_internal_exit>
434  *           local exit probability evenly across all internal nodes.
435  *
436  *           Local entry probabilities for truncated alignment
437  *           (cm->trbegin and cm->trbeginsc) are configured slightly
438  *           differently, to allow inserts at the ends of the
439  *           alignment. As with standard local begins,
440  *           <p_internal_start> is spread evenly across all <nstarts>
441  *           internal nodes, but entries into inserts are also
442  *           allowed. The <p_internal_start>/<nstarts> probability
443  *           for each node is divided between match and insert states
444  *           of that node using a <psi> vector, psi[v] is the expected
445  *           number of times state v is entered.
446  *
447  *           Note: to reproduce how Diana scored truncated alignments
448  *           in the Kolbe and Eddy 2009 paper, set trbegin[v] to
449  *           (2 / (cm->clen * (cm->clen + 1))) for all v.
450  *
451  *           Local end probability is spread evenly across all states
452  *           from which local ends are permitted (see code).
453  *
454  *           We exploit the fact that we know we are a local static
455  *           function called only by cm_ConfigureSub() and don't do
456  *           any checking of the CM because cm_ConfigureSub() already
457  *           has done that. If we were callable by other functions
458  *           we'd want to make sure the CM wasn't already in local
459  *           mode, at least.
460  *
461  * Args:     cm               - the covariance model
462  *           p_internal_start - prob mass to spread for local begins
463  *           p_internal_exit  - prob mass to spread for local ends
464  */
465 
466 void
cm_localize(CM_t * cm,float p_internal_start,float p_internal_exit)467 cm_localize(CM_t *cm, float p_internal_start, float p_internal_exit)
468 {
469   int status;
470   int v;			/* counter over states */
471   int nd;			/* counter over nodes */
472   int nexits;			/* number of possible internal ends */
473   float denom;
474 
475   /* Local begins: */
476   cm_CalculateLocalBeginProbs(cm, p_internal_start, cm->t, cm->begin);
477   /* Erase the previous transition probs from node 0. The only way out
478    * of node 0 in standard scanners/aligners is going to be local
479    * begin transitions from the root v=0 directly to MATP_MP, MATR_MR,
480    * MATL_ML, and BIF_B states. In truncated scanners/aligners we also
481    * allow transitions into inserts, see comments in CalculateLocalBeginProbs().
482    *
483    * First we want to save the transition probs we're about to zero,
484    * so we don't lose that information in case we need it subsequently
485    * cm->root_trans is NULL prior to this.
486    */
487   if(cm->root_trans == NULL) { /* otherwise they've already been set */
488     ESL_ALLOC(cm->root_trans, sizeof(float) * cm->cnum[0]);
489     esl_vec_FCopy(cm->t[0], cm->cnum[0], cm->root_trans);
490   }
491   esl_vec_FSet(cm->t[0], cm->cnum[0], 0.);
492     cm->flags |= CMH_LOCAL_BEGIN;
493 
494   /* Local ends:
495    * Count internal nodes MATP, MATL, MATR, BEGL, BEGR that aren't
496    * adjacent to END nodes.
497    */
498   nexits = 0;
499   for (nd = 1; nd < cm->nodes; nd++) {
500     if ((cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
501 	 cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BEGL_nd ||
502 	 cm->ndtype[nd] == BEGR_nd) &&
503 	cm->ndtype[nd+1] != END_nd)
504       nexits++;
505   }
506   /* Spread the exit probability across internal nodes.
507    * Currently does not compensate for the decreasing probability
508    * of reaching a node, the way HMMER does: therefore the probability
509    * of exiting at later nodes is actually lower than the probability
510    * of exiting at earlier nodes. This should be a small effect.
511    */
512   for (v = 0; v < cm->M; v++) cm->end[v] = 0.;
513   for (nd = 1; nd < cm->nodes; nd++) {
514     if ((cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
515 	 cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BEGL_nd ||
516 	 cm->ndtype[nd] == BEGR_nd) &&
517 	cm->ndtype[nd+1] != END_nd)
518       {
519 	v = cm->nodemap[nd];
520 	cm->end[v] = p_internal_exit / (float) nexits;
521 				/* renormalize the main model transition distribution */
522 	denom = esl_vec_FSum(cm->t[v], cm->cnum[v]);
523 	denom += cm->end[v];
524 	esl_vec_FScale(cm->t[v], cm->cnum[v], 1./denom);
525 	/* cm->t[v] vector will purposefully no longer sum to 1.,
526 	 * if we were to append cm->end[v] as a new number in the vector, it would sum to 1.
527 	 */
528       }
529   }
530   cm->flags |= CMH_LOCAL_END;
531 
532   /* new probs invalidate log odds scores if we had them */
533   cm->flags &= ~CMH_BITS;
534 
535   return;
536 
537  ERROR:
538   cm_Fail("Memory allocation error.");
539 }
540 
541 /* Function: cp9_sw_config()
542  * Incept:   EPN 05.30.06
543  *           based on SRE's Plan7SWConfig() from HMMER's plan7.c
544  *           EPN, Mon Dec 12 04:35:28 2011 [Updated, made local in cm_modelconfig.c]
545  *
546  * Purpose:  Set the alignment independent parameters of
547  *           a CM Plan 9 model to hmmsw (Smith/Waterman) configuration.
548  *
549  * Notes:    The desideratum for begin/end probs is that all fragments ij
550  *           (starting at match i, ending at match j) are
551  *           equiprobable -- there is no information in the choice of
552  *           entry/exit. There are M(M+1)/2 possible choices of ij, so
553  *           each must get a probability of 2/M(M+1). This prob is the
554  *           product of a begin, an end, and all the not-end probs in
555  *           the path between i,j.
556  *
557  *           Thus: entry/exit is asymmetric because of the left/right
558  *           nature of the HMM/profile. Entry probability is distributed
559  *           simply by assigning p_x = pentry / (M-1) to M-1
560  *           internal match states. However, the same approach doesn't
561  *           lead to a flat distribution over exit points. Exit p's
562  *           must be corrected for the probability of a previous exit
563  *           from the model. Requiring a flat distribution over exit
564  *           points leads to an easily solved piece of algebra, giving:
565  *                      p_1 = pexit / (M-1)
566  *                      p_x = p_1 / (1 - (x-1) p_1)
567  *
568  *           Modified EPN, Thu Feb  7 15:54:16 2008, as follows:
569  *           To better match a locally configured CM, if <do_match_local_cm>
570  *           we disallow insertions before the first (emitting) match state,
571  *           (from I_0), and after the final (emitting) match state,
572  *           (from I_M). I_0 maps to ROOT_IL and I_M maps to ROOT_IR
573  *           which can never be entered in a locally configured CM
574  *           (b/c the ROOT_S state MUST jump into a local begin state, which
575  *            are always match states>). Also we disallow a M_0->D_1 transition
576  *           because these would be impossible in a locally configured CM.
577  *
578  *           <do_match_local_cm> is usually TRUE, unless we're configuring
579  *           the CP9 specifically for eventual sub CM alignment, where
580  *           the goal is simply find the most likely start/end point
581  *           of the alignment with this CP9 (in that case we want
582  *           I_0 and I_M reachable).
583  *
584  *           HMM probabilities are modified, but HMM is only
585  *           logoddsified to get valid bit scores before leaving the
586  *           function if it had valid bit scores upon entering.
587  *
588  * Args:     hmm    - the CM Plan 9 model w/ data-dep prob's valid
589  *           pentry - probability of an internal entry somewhere;
590  *                    will be evenly distributed over M-1 match states
591  *           pexit  - probability of an internal exit somewhere;
592  *                    will be distributed over M-1 match states.
593  *           do_match_local_cm - TRUE to make I_0, D_1 and I_M unreachable
594  *                    to better match a locally configured CM.
595  *           first_cm_ndtype - only used if do_match_local_cm is TRUE
596  *                             if it's MATL or MATP then D_1 should be unreachable (it is in the CM)
597  *                             if it's MATR or MATP then D_M should be unreachable (it is in the CM)
598  *
599  * Return:   (void)
600  *           HMM probabilities are modified.
601  */
602 int
cp9_sw_config(CP9_t * hmm,float pentry,float pexit,int do_match_local_cm,int first_cm_ndtype)603 cp9_sw_config(CP9_t *hmm, float pentry, float pexit, int do_match_local_cm, int first_cm_ndtype)
604 {
605   float basep;			/* p1 for exits: the base p */
606   int   k;			/* counter over states      */
607   float d;
608   int   had_bits;
609 
610   had_bits = (hmm->flags & CPLAN9_HASBITS) ? TRUE : FALSE;
611 
612   /* No special (*x* states in Plan 7) states in CM Plan 9 */
613 
614   /* Configure entry.
615    */
616   if(do_match_local_cm) {
617     hmm->t[0][CTMI] = 0.;
618     hmm->t[0][CTMM] = 0.;  /* already was 0.0, transition from M_0 to M_1 is begin[1] */
619     hmm->t[0][CTMEL] = 0.; /* already was 0.0, can never do a local end from M_0 */
620     if((first_cm_ndtype == MATL_nd) || (first_cm_ndtype == MATP_nd)) { /* CM can't possibly reach the CM delete state that maps to D_1, make D_1 unreachable too */
621       hmm->t[0][CTMD] = 0.;
622     }
623 
624     hmm->t[hmm->M][CTMI] = 0.;
625     hmm->t[hmm->M][CTDI] = 0.;
626     if((first_cm_ndtype == MATR_nd) || (first_cm_ndtype == MATP_nd)) { /* CM can't possibly reach the CM delete state that maps to D_M, make D_M unreachable too */
627       hmm->t[hmm->M][CTMD] = 0.;
628     }
629 
630     /* renormalize transitions out of M_M */
631     d = esl_vec_FSum(hmm->t[hmm->M], cp9_TRANS_NMATCH) + hmm->end[hmm->M];
632     esl_vec_FScale(hmm->t[hmm->M], cp9_TRANS_NMATCH, 1./d);
633     hmm->end[hmm->M] /= d;
634 
635     /* renormalize transitions out of D_M */
636     esl_vec_FNorm(hmm->t[hmm->M] + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);	/* delete */
637   }
638 
639   hmm->begin[1] = (1. - pentry) * (1. - (hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL]));
640   esl_vec_FSet(hmm->begin+2, hmm->M-1, (pentry * (1.- (hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL]))) / (float)(hmm->M-1));
641   /* note: hmm->t[0][CTMEL] == 0. (can't locally end from begin)
642    *       and if do_match_local_cm, hmm->t[0][CTMI] and hmm->t[0][CTMD] were just set to 0.
643    */
644 
645   /* Configure exit.
646    * Don't touch hmm->end[hmm->M]
647    */
648 
649   basep = pexit / (float) (hmm->M-1);
650   for (k = 1; k < hmm->M; k++)
651     hmm->end[k] = basep / (1. - basep * (float) (k-1));
652   cp9_renormalize_exits(hmm);
653   /*for (k = 1; k <= hmm->M; k++) printf("after renormalizing: end[%d]: %f\n", k, hmm->end[k]);*/
654 
655   hmm->flags       &= ~CPLAN9_HASBITS; /* reconfig invalidates log-odds scores */
656   hmm->flags       |= CPLAN9_LOCAL_BEGIN; /* local begins now on */
657   hmm->flags       |= CPLAN9_LOCAL_END;   /* local ends now on */
658 
659   /* only call CP9Logoddsify() if we had valid scores upon entering */
660   if(had_bits) CP9Logoddsify(hmm);
661 
662   return eslOK;
663 }
664 
665 /* Function: cp9_EL_local_ends_config()
666  * Incept:   EPN, Tue Jun 19 09:50:52 2007
667  *           EPN, Mon Dec 12 04:38:23 2011 [Updated, made local in cm_modelconfig.c]
668  *
669  * Purpose:  Turn EL local ends in a CM Plan 9 HMM on based on
670  *           the local end probs in the CM.
671  *
672  *           HMM probabilities are modified, but HMM is only
673  *           logoddsified to get valid bit scores before leaving the
674  *           function if it had valid bit scores upon entering.
675 
676  * Args:     cp9 - the cp9 HMM
677  *           cm  - the CM the cp9 was built from
678  *
679  * Return:   eslOK on success.
680  *           eslEINVAL on any error, errbuf is filled.
681  */
682 int
cp9_EL_local_ends_config(CP9_t * cp9,CM_t * cm,char * errbuf)683 cp9_EL_local_ends_config(CP9_t *cp9, CM_t *cm, char *errbuf)
684 {
685   /* Contract checks */
686   if(cp9->M != cm->clen)     ESL_FAIL(eslEINVAL, errbuf, "cp9 and cm model length do not match");
687   if(cm->cp9map == NULL)     ESL_FAIL(eslEINVAL, errbuf, "cm->cp9map is NULL when trying to setup ELs in cp9");
688   if(cp9->flags & CPLAN9_EL) ESL_FAIL(eslEINVAL, errbuf, "trying to setup ELs in a cp9, but CPLAN_EL flag already raised");
689 
690   int v;
691   int k;                     /* counter over HMM nodes */
692   int nd;
693   int seen_exit;
694   float to_el_prob;
695   float norm_factor;
696   int   nexits;
697   int   had_bits = (cp9->flags & CPLAN9_HASBITS) ? TRUE : FALSE;
698 
699   /* If the CM has local ends on, check to make sure all non-zero
700    * local end probabilities in the CM are identical (within reasonable
701    * precision), use that probability to set all HMM transitions to
702    * EL states.
703    */
704   if(cm->flags & CMH_LOCAL_END) {
705     seen_exit  = FALSE;
706     to_el_prob = 0.;
707     for(v = 0; v < cm->M; v++) {
708       nd = cm->ndidx[v];
709       if (((cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
710 	  cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BEGL_nd ||
711 	  cm->ndtype[nd] == BEGR_nd) &&
712 	 cm->ndtype[nd+1] != END_nd) && cm->nodemap[nd] == v) {
713 	/* this should have a non-zero local end probability */
714 	if(fabs(cm->end[v] - 0.) < eslSMALLX1) ESL_FAIL(eslEINVAL, errbuf, "cp9_el_local_ends_config(), CM state %d has local end prob of 0.", v);
715 	if(! seen_exit) {
716 	  to_el_prob = cm->end[v];
717 	  seen_exit  = TRUE;
718 	}
719 	else if(fabs(to_el_prob - cm->end[v]) > eslSMALLX1) {
720 	  ESL_FAIL(eslEINVAL, errbuf, "cp9_el_local_ends_config(), not all CM states EL probs are identical.\n");
721 	}
722       }
723     }
724     if(! seen_exit && cm->nodes != 3) ESL_FAIL(eslEINVAL, errbuf, "cp9_el_local_ends_config(), cm->nodes != 3, but all CM local end probs are zero.");
725   }
726   else {
727     /* CM_LOCAL_END flag is down, local ends are off in the CM
728      * We figure out what the local end prob would be given cm->pend
729      * and set the HMM local end probs based on that.
730      * First, count internal nodes MATP, MATL, MATR, BEGL, BEGR that aren't
731      * adjacent to END nodes.
732      */
733     nexits = 0;
734     for (nd = 1; nd < cm->nodes; nd++) {
735       if ((cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
736 	   cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BEGL_nd ||
737 	   cm->ndtype[nd] == BEGR_nd) &&
738 	  cm->ndtype[nd+1] != END_nd)
739 	nexits++;
740     }
741     to_el_prob = cm->pend / (float) nexits;
742   }
743 
744   /* transitions from HMM node 0 to EL is impossible */
745   cp9->t[0][CTMEL] = 0.;
746   for(k = 1; k <= cp9->M; k++)
747     {
748       if(cp9->has_el[k])
749 	{
750 	  cp9->t[k][CTMEL] = to_el_prob;
751 	  norm_factor = 1. - (cp9->t[k][CTMEL] / (1. - cp9->end[k]));
752 	  cp9->t[k][CTMM] *= norm_factor;
753 	  cp9->t[k][CTMI] *= norm_factor;
754 	  cp9->t[k][CTMD] *= norm_factor;
755 	  /* cp9->end[k] untouched */
756 	}
757     }
758   cp9->flags &= ~CPLAN9_HASBITS;	/* clear the log-odds ready flag */
759 
760   /* only call CP9Logoddsify() if we had valid scores upon entering */
761   if(had_bits) CP9Logoddsify(cp9);
762 
763   cp9->flags |= CPLAN9_EL;          /* EL end locals now on */
764   /*debug_print_cp9_params(cp9);*/
765 
766   return eslOK;
767 }
768 
769 
770 /* Function: cp9_renormalize_exits()
771  * Incept:   EPN 05.30.06
772  *           based on SRE's Plan7RenormalizeExits() from HMMER2's plan7.c.
773  *           EPN, Mon Dec 12 04:49:29 2011 [Updated, made local in cm_modelconfig.c]
774  *
775  * Purpose:  Renormalize just the match state transitions;
776  *           for instance, after a Config() function has
777  *           modified the exit distribution.
778  *
779  * Args:     hmm - hmm to renormalize
780  *
781  * Returns:  void
782  */
783 void
cp9_renormalize_exits(CP9_t * hmm)784 cp9_renormalize_exits(CP9_t *hmm)
785 {
786   int   k;
787   float d;
788 
789   /* We can't exit from node 0 so we start renormalizing at node 1 */
790   for (k = 1; k < hmm->M; k++) {
791     d = esl_vec_FSum(hmm->t[k], 4);
792     /* esl_vec_FScale(hmm->t[k], 4, 1./(d + d*hmm->end[k])); */
793     esl_vec_FScale(hmm->t[k], 4, (1.-hmm->end[k])/d);
794   }
795   /* Take care of hmm->M node, which is special */
796   d = hmm->t[hmm->M][CTMI] + hmm->t[hmm->M][CTMEL]; /* CTMD is IMPOSSIBLE, CTMM is hmm->end[hmm-M] */
797   if(! (fabs(d-0.) < eslSMALLX1)) { /* don't divide by d if it's zero */
798     hmm->t[hmm->M][CTMI] *= (1.-hmm->end[hmm->M])/d;
799     hmm->t[hmm->M][CTMEL] *= (1.-hmm->end[hmm->M])/d;
800   }
801 
802   hmm->flags &= ~CPLAN9_HASBITS; /* reconfig invalidates log-odds scores */
803 
804   return;
805 }
806