1 /* cm_modelconfig.c
2 * SRE, Wed May 8 14:30:38 2002 [St. Louis]
3 *
4 * Configuring a covariance model. The desideratum is for a CM to be
5 * configured exactly once. This is done via the cm_Configure() or
6 * cm_ConfigureSub() function. Configuration consists of initializing
7 * the matrices and data structures that are considered always valid
8 * in a CM, but are not read during CM input from a file, and putting
9 * the model into local mode if necessary (CMs are always read in from
10 * a file with global parameters). When a CM is about to be
11 * configured, it must be 'non-configured', which means all of the
12 * data not read upon CM input from a file must not be present.
13 *
14 * All functions that aid in the configuration process (and actually
15 * modify the CM, thus removing its 'non-configured' status) are
16 * static local functions in this file. This is by design - we want
17 * the only way to configure a CM to be through cm_Configure() or
18 * cm_ConfigureSub(), which, again, will happen exactly once per
19 * model. By doing this we are limiting the possible execution paths
20 * for model configuration to try and avoid the case that some
21 * execution paths (of which there could be many due to the various
22 * command-line options of Infernal applications) may screw something
23 * up in the model.
24 *
25 */
26
27 #include "esl_config.h"
28 #include "p7_config.h"
29 #include "config.h"
30
31 #include <string.h>
32
33 #include "easel.h"
34 #include "esl_alphabet.h"
35 #include "esl_stack.h"
36 #include "esl_vectorops.h"
37
38 #include "hmmer.h"
39
40 #include "infernal.h"
41
42 static void cm_localize(CM_t *cm, float p_internal_entry, float p_internal_exit);
43 static int cp9_sw_config(CP9_t *hmm, float pentry, float pexit, int do_match_local_cm, int first_cm_ndtype);
44 static int cp9_EL_local_ends_config(CP9_t *cp9, CM_t *cm, char *errbuf);
45 static void cp9_renormalize_exits(CP9_t *hmm);
46
47 /* Function: cm_Configure()
48 * Date: EPN, Thu Jan 4 06:36:09 2007
49 * EPN, Fri Dec 9 15:27:29 2011 [updated prior to 1.1 release]
50 *
51 * Purpose: Configure a CM. Configuration options are in
52 * cm->config_opts.
53 *
54 * CM data structures that are always built or initialized:
55 *
56 * - emitmap (if not yet built)
57 *
58 * - all HMM banded matrices for the CM (hb_mx, hb_omx, hb_emx,
59 * hb_shmx, trhb_mx, trhb_omx, trhb_emx, trhb_shmx)
60 *
61 * - CP9 HMMs (cp9, Lcp9, Rcp9, Tcp9)
62 *
63 * - CP9 associated data structures (cp9map, cp9b, cp9_mx,
64 * cp9_bmx)
65 *
66 * - maximum-likelihood P7 HMM (mlp7; filter p7 HMM fp7 is
67 * read from the CM file).
68 *
69 * Optional configuration:
70 *
71 * - CM and cp9 HMMs (built in this function) are put
72 * into local mode if cm->config_opts & CM_CONFIG_LOCAL.
73 *
74 * - QDBs and W are recalculated if cm->qdbinfo isn't
75 * already set (cm->qdbinfo->setby =
76 * CM_QDBINFO_SETBY_INIT) or if cm->config_opts &
77 * CM_CONFIG_QDB.
78 *
79 * - W is also recalculated if cm->config_opts &
80 * CM_CONFIG_W.
81 *
82 * - all non-banded matrices for the CM (nbmx, onbmx,
83 * trnbmx, tronbmx, enbmx, trenbmx, shnbmx, trshnbmx)
84 *
85 * - the scan matrix (smx) is created if cm->config_opts &
86 * CM_CONFIG_SMX
87 *
88 *
89 * Args: cm - the covariance model
90 * errbuf - for error messages
91 * W_from_cmdline - W set on cmdline, -1 if W not set on cmdline (usually -1)
92 *
93 * Returns: <eslOK> on success.
94 * <eslEINVAL> on contract violation.
95 * <eslEMEM> on memory allocation error.
96 */
97 int
cm_Configure(CM_t * cm,char * errbuf,int W_from_cmdline)98 cm_Configure(CM_t *cm, char *errbuf, int W_from_cmdline)
99 {
100 return cm_ConfigureSub(cm, errbuf, W_from_cmdline, NULL, NULL);
101 }
102
103 /* Function: cm_ConfigureSub()
104 * Date: EPN, Thu Jan 4 06:36:09 2007
105 * EPN, Fri Dec 9 15:27:29 2011 [updated prior to 1.1 release]
106 *
107 * Purpose: See cm_Configure()'s Purpose above. This function
108 * actually does the work for cm_Configure(). Both functions
109 * exist so we can take two additional parameters in the
110 * rare case that we're configuring a model that is a
111 * sub-model of another CM. If <sub_mother_cm> and
112 * <sub_mother_map> are non-NULL, <cm> is a sub CM
113 * constructed from <sub_mother_cm>. In this case, we're
114 * doing alignment and are constructing a new sub CM for
115 * each target sequence so running time should be
116 * minimized. Special functions for building the CP9 HMMs
117 * and for logoddsifying the model are called that are
118 * faster than the normal versions b/c they can just copy
119 * some of the parameters of the mother model instead of
120 * calc'ing them.
121 *
122 * Args: cm - the covariance model
123 * errbuf - for error messages
124 * W_from_cmdline - W set on cmdline, -1 if W not set on cmdline (usually -1)
125 * mother_cm - if non-NULL, <cm> is a sub CM construced from
126 * <mother_cm>. In this case we use <mother_map>
127 * to help streamline the two steps that dominate the
128 * running time of this function (b/c speed is an issue):
129 * building a cp9 HMM, and logoddsifying the model.
130 * mother_map - must be non-NULL iff <mother_cm> is non-NULL, the
131 * map from <cm> to <mother_cm>.
132 *
133 * Returns: <eslOK> on success.
134 * <eslEINVAL> on contract violation.
135 * <eslEMEM> on memory allocation error.
136 * <eslFAIL> on other failure (errbuf filled)
137 */
138 int
cm_ConfigureSub(CM_t * cm,char * errbuf,int W_from_cmdline,CM_t * mother_cm,CMSubMap_t * mother_map)139 cm_ConfigureSub(CM_t *cm, char *errbuf, int W_from_cmdline, CM_t *mother_cm, CMSubMap_t *mother_map)
140 {
141 int status;
142 float swentry, swexit;
143 int have_mother;
144
145 have_mother = (mother_cm != NULL && mother_map != NULL) ? TRUE : FALSE;
146
147 /* contract check */
148 if(mother_cm != NULL && mother_map == NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, mother_cm != NULL but mother_map == NULL (both must be NULL or both non-NULL).");
149 if(mother_cm == NULL && mother_map != NULL) ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, mother_cm == NULL but mother_map != NULL (both must be NULL or both non-NULL).");
150 if(have_mother && (cm->config_opts & CM_CONFIG_LOCAL)) ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, configuring a sub CM, but CM_CONFIG_LOCAL config flag up.");
151 if(( cm->config_opts & CM_CONFIG_HMMLOCAL) &&
152 (! (cm->config_opts & CM_CONFIG_LOCAL))) ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, cp9 is to be configured locally, but CM is not");
153 if(( cm->config_opts & CM_CONFIG_HMMEL) &&
154 (! (cm->config_opts & CM_CONFIG_HMMLOCAL))) ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, cp9 is to be configured without local entries exists but with ELs on");
155 if((cm->config_opts & CM_CONFIG_TRUNC) && (cm->config_opts & CM_CONFIG_SUB)) ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, incompatible configuration options: CM_CONFIG_TRUNC and CM_CONFIG_SUB");
156 if((cm->config_opts & CM_CONFIG_LOCAL) && (cm->config_opts & CM_CONFIG_SUB)) ESL_FAIL(eslEINCOMPAT, errbuf, "Configuring CM, incompatible configuration options: CM_CONFIG_LOCAL and CM_CONFIG_SUB");
157
158 /* validate the CM */
159 if((status = cm_Validate(cm, 0.0001, errbuf)) != eslOK) return status;
160
161 /* verify we're not already configured */
162 if((status = cm_nonconfigured_Verify(cm, errbuf)) != eslOK) { status = eslEINCOMPAT; return status; }
163 /* cm_nonconfigured_Verify() checked that everything that should be
164 * NULL in <cm> is NULL, so we don't have to check below.
165 */
166
167 /* Build the emitmap, if necessary */
168 if(cm->emap == NULL) cm->emap = CreateEmitMap(cm);
169
170 /* Define W and set up query dependent bands. This is confusing. We need
171 * the user to be able to set W on the command line if they want, but W
172 * and the QDBs used to define the CM_SCAN_MX (created later) are dependent
173 * on one another.
174 *
175 * If W was set on the command line (W_from_cmdline != -1):
176 * Set W, and calculate QDBs if nec, without redefining W.
177 *
178 * Else, if W was not set on the command line (W_from_cmdline ==
179 * -1): Calculate QDBs if nec, and redefine W using beta=cm->beta_W,
180 * which may or may not have been changed by caller from what BETA_W
181 * was in the CM file.
182 *
183 * Then, W (regardless of how it was set) is enforced when the scan
184 * matrix is created, i.e. no dmin/dmax values from cm->qdbinfo
185 * that exceed W will be treated as begin equal to W.
186 */
187 if(W_from_cmdline != -1) {
188 cm->W = W_from_cmdline;
189 cm->W_setby = CM_W_SETBY_CMDLINE;
190 }
191 /* If nec, set up the query dependent bands (it's important to do this before creating the ml p7 HMM (which needs to know cm->W)) */
192 if((cm->config_opts & CM_CONFIG_QDB) ||
193 (cm->config_opts & CM_CONFIG_W_BETA) ||
194 (cm->qdbinfo->setby == CM_QDBINFO_SETBY_INIT)) {
195 if(W_from_cmdline != -1) {
196 if((status = CalculateQueryDependentBands(cm, errbuf, cm->qdbinfo,
197 ESL_MIN(cm->qdbinfo->beta1, cm->qdbinfo->beta2), NULL, /* don't redefine W, we just set it above as W_from_cmdline */
198 NULL, NULL, NULL)) != eslOK) return status;
199 }
200 else {
201 if((status = CalculateQueryDependentBands(cm, errbuf, cm->qdbinfo,
202 cm->beta_W, &(cm->W), /* do redefine W, as that calc'ed with cm->beta_W */
203 NULL, NULL, NULL)) != eslOK) return status;
204 cm->W_setby = CM_W_SETBY_BANDCALC;
205 }
206 }
207
208 /* Allocate the HMM banded matrices, these are originally
209 * very small and only grown as needed. Optionally create
210 * the truncated alignment matrices.
211 */
212 cm->hb_mx = cm_hb_mx_Create(cm->M);
213 cm->hb_omx = cm_hb_mx_Create(cm->M);
214 cm->hb_emx = cm_hb_emit_mx_Create(cm);
215 cm->hb_shmx = cm_hb_shadow_mx_Create(cm);
216 if(cm->config_opts & CM_CONFIG_TRUNC) {
217 cm->trhb_mx = cm_tr_hb_mx_Create(cm);
218 cm->trhb_omx = cm_tr_hb_mx_Create(cm);
219 cm->trhb_emx = cm_tr_hb_emit_mx_Create(cm);
220 cm->trhb_shmx = cm_tr_hb_shadow_mx_Create(cm);
221 }
222
223 /* If nec, create the nonbanded matrices (usually we won't, these get real big) */
224 if(cm->config_opts & CM_CONFIG_NONBANDEDMX) {
225 cm->nb_mx = cm_mx_Create(cm->M);
226 cm->nb_omx = cm_mx_Create(cm->M);
227 cm->nb_emx = cm_emit_mx_Create(cm);
228 cm->nb_shmx = cm_shadow_mx_Create(cm);
229 if(cm->config_opts & CM_CONFIG_TRUNC) {
230 cm->trnb_mx = cm_tr_mx_Create(cm);
231 cm->trnb_omx = cm_tr_mx_Create(cm);
232 cm->trnb_emx = cm_tr_emit_mx_Create(cm);
233 cm->trnb_shmx = cm_tr_shadow_mx_Create(cm);
234 }
235 }
236
237 /* If nec, create the scan matrix and truncated scan matrix */
238 /* (we could the check size of matrices first, return error if too big, but don't currently) */
239 if(cm->config_opts & CM_CONFIG_SCANMX) {
240 if((status = cm_scan_mx_Create(cm, errbuf, TRUE, TRUE, &(cm->smx))) != eslOK) return status;
241 }
242 if(cm->config_opts & CM_CONFIG_TRSCANMX) {
243 if((status = cm_tr_scan_mx_Create(cm, errbuf, TRUE, TRUE, &(cm->trsmx))) != eslOK) return status;
244 }
245
246 /* Build the CP9 HMM and associated data. It's important
247 * to do this before setting up CM for local mode.
248 */
249 if(have_mother) {
250 if((status = sub_build_cp9_hmm_from_mother(cm, errbuf, mother_cm, mother_map, &(cm->cp9), &(cm->cp9map), FALSE, 0.0001, 0)) != eslOK) return status;
251 }
252 else {
253 if((status = build_cp9_hmm(cm, errbuf, FALSE, 0.0001, 0, &(cm->cp9), &(cm->cp9map))) != eslOK) return status;
254 }
255 cm->cp9b = AllocCP9Bands(cm->M, cm->cp9->M);
256 /* create the CP9 matrices, we init to 1 row, which is tiny so it's okay
257 * that we have two of them, we only grow them as needed, cp9_bmx is
258 * only needed if we're doing Forward -> Backward -> Posteriors.
259 */
260 cm->cp9_mx = CreateCP9Matrix(1, cm->cp9->M);
261 cm->cp9_bmx = CreateCP9Matrix(1, cm->cp9->M);
262 cm->flags |= CMH_CP9; /* raise the CP9 flag */
263
264 /* Configure for truncated search/alignment. */
265 if(cm->config_opts & CM_CONFIG_TRUNC) {
266 /* (1) Define truncated alignment penalty probabilities (cm->trp).
267 * 'FALSE' informs the function not to ignore inserts, that is to
268 * allow truncated begins into insert states. This is the
269 * hard-coded behavior. We'd only set it to TRUE if we were
270 * testing the code, because in that case a validation is
271 * performed that isn't possible with truncated begins into
272 * inserts allowed.
273 */
274 if((cm->trp = cm_tr_penalties_Create(cm, FALSE, errbuf)) == NULL) ESL_FAIL(eslFAIL, errbuf, "couldn't create truncation penalties for the CM");
275 /* cm_tr_penalties_Dump(stdout, cm, cm->trp); */
276
277 /* (2) Setup Lcp9, Rcp9, Tcp9 CP9 HMMs
278 * Clone the globally configured CP9 HMM cm->cp9 before its put
279 * into local mode into each of cm->Lcp9, cm->Rcp9, cm->Tcp9 and
280 * configure them for their specific mode of truncated alignment.
281 * cm->cp9 is not yet logoddsified and so bit scores will not
282 * be copied into Lcp9, Rcp9, Tcp9. This is important because
283 * we want to be as efficient as possible, and only want to
284 * logoddsify each cp9 exactly once, which will occur after
285 * they've been configured into their specific locality mode.
286 */
287 if((cm->Lcp9 = cp9_Clone(cm->cp9)) == NULL) ESL_FAIL(eslFAIL, errbuf, "couldn't clone cm->cp9 to get cm->Lcp9");
288 if((cm->Rcp9 = cp9_Clone(cm->cp9)) == NULL) ESL_FAIL(eslFAIL, errbuf, "couldn't clone cm->cp9 to get cm->Rcp9");
289 if((cm->Tcp9 = cp9_Clone(cm->cp9)) == NULL) ESL_FAIL(eslFAIL, errbuf, "couldn't clone cm->cp9 to get cm->Tcp9");
290
291 /* L mode alignment, 3' truncation: begin into node 1, equiprobable ends */
292 swentry = 0.;
293 swexit = ((float) cm->cp9->M - 1.) / (float) cm->cp9->M;
294 cp9_sw_config(cm->Lcp9, swentry, swexit, FALSE, cm->ndtype[1]); /* FALSE: let I_0, D_1, I_M be reachable */
295 /* R mode alignment, 5' truncation: equiprobable begins, end out of node M */
296 swentry = ((float) cm->cp9->M - 1.) / (float) cm->cp9->M;
297 swexit = 0.;
298 cp9_sw_config(cm->Rcp9, swentry, swexit, FALSE, cm->ndtype[1]); /* FALSE: let I_0, D_1, I_M be reachable */
299 /* T mode alignment, 5' and 3' truncation: equiprobable begins, equiprobable ends */
300 swentry = ((float) cm->cp9->M - 1.) / (float) cm->cp9->M;
301 swexit = ((float) cm->cp9->M - 1.) / (float) cm->cp9->M;
302 cp9_sw_config(cm->Tcp9, swentry, swexit, FALSE, cm->ndtype[1]); /* FALSE: let I_0, D_1, I_M be reachable */
303
304 cm->flags |= CMH_CP9_TRUNC;
305 /* don't logoddsify Lcp9, Rcp9, Tcp9 yet, wait until the end of
306 * the function, after we potentially turn on EL local ends below
307 */
308 }
309
310 /* Configure the CM and cm->cp9 for local alignment and cm->Lcp9,
311 * cm->Rcp9, cm->Tcp9 for EL-type local ends, if necessary */
312 if(cm->config_opts & CM_CONFIG_LOCAL) {
313 cm_localize(cm, cm->pbegin, cm->pend);
314 if(cm->config_opts & CM_CONFIG_HMMLOCAL) { /* contract enforced CM_CONFIG_HMMLOCAL only raised if CM_CONFIG_LOCAL raised */
315 cp9_sw_config(cm->cp9, cm->pbegin, cm->pbegin, FALSE, cm->ndtype[1]); /* FALSE: let I_0, D_1, I_M be reachable */
316 /* set up EL-type local ends, if necessary */
317 if(cm->config_opts & CM_CONFIG_HMMEL) {
318 if((status = cp9_EL_local_ends_config(cm->cp9, cm, errbuf)) != eslOK) return status;
319 if(cm->flags & CMH_CP9_TRUNC) {
320 if((status = cp9_EL_local_ends_config(cm->Lcp9, cm, errbuf)) != eslOK) return status;
321 if((status = cp9_EL_local_ends_config(cm->Rcp9, cm, errbuf)) != eslOK) return status;
322 if((status = cp9_EL_local_ends_config(cm->Tcp9, cm, errbuf)) != eslOK) return status;
323 }
324 }
325 }
326 }
327
328 /* Possibly configure cm->cp9 for submodel alignment (contract
329 enforced that if CM_CONFIG_SUB, CM_CONFIG_LOCAL and
330 CM_CONFIG_TRUNC must both be FALSE) */
331 if(cm->config_opts & CM_CONFIG_SUB) {
332 swentry= ((cm->cp9->M)-1.)/cm->cp9->M; /* all start pts equiprobable, including 1 */
333 swexit = ((cm->cp9->M)-1.)/cm->cp9->M; /* all end pts equiprobable, including M */
334 cp9_sw_config(cm->cp9, swentry, swexit, FALSE, cm->ndtype[1]); /* FALSE: let I_0, D_1, I_M be reachable */
335 }
336
337 /* We need to ensure that cm->el_selfsc * W >= IMPOSSIBLE
338 * (cm->el_selfsc is the score for an EL self transition) This is
339 * done because we potentially multiply cm->el_selfsc * W, and add
340 * that to IMPOSSIBLE.
341 */
342 if((cm->el_selfsc * cm->W) < IMPOSSIBLE) {
343 cm->el_selfsc = (IMPOSSIBLE / (cm->W+1));
344 cm->iel_selfsc = -INFTY;
345 }
346
347 /* Compute log odds scores. This will create cm->cmcons as well, which requires scores. */
348 if(have_mother) {
349 if((status = SubCMLogoddsify(cm, errbuf, mother_cm, mother_map)) != eslOK) return status;
350 }
351 else {
352 if((status = CMLogoddsify(cm)) != eslOK) ESL_FAIL(status, errbuf, "problem logodisfying CM");
353 }
354 CP9Logoddsify(cm->cp9);
355 if(cm->flags & CMH_CP9_TRUNC) {
356 CP9Logoddsify(cm->Lcp9);
357 CP9Logoddsify(cm->Rcp9);
358 CP9Logoddsify(cm->Tcp9);
359 }
360
361 /* Finally, build the ml p7 HMM, which requires cm->cmcons */
362 if((status = cm_cp9_to_p7(cm, cm->cp9, errbuf)) != eslOK) return status;
363
364 /*debug_print_cm_params(stdout, cm);
365 debug_print_cp9_params(stdout, cm->cp9, TRUE);
366 debug_print_cp9_params(stdout, cm->Lcp9, TRUE);
367 debug_print_cp9_params(stdout, cm->Rcp9, TRUE);
368 debug_print_cp9_params(stdout, cm->Tcp9, TRUE);*/
369
370 cm->flags |= CM_IS_CONFIGURED;
371
372 return eslOK;
373 }
374
375
376 /* Function: cm_CalculateLocalBeginProbs()
377 * Incept: EPN, Fri Dec 9 05:20:35 2011
378 *
379 * Purpose:
380 *
381 * Calculate local begin probabilities. The transitions in
382 * <t> should be for a CM in global mode, not local mode. By
383 * specifying <t> as not necessarily equal to <cm->t>, we
384 * can calculate local begin probs for a model already in
385 * local mode.
386 *
387 * Args: cm - the covariance model
388 * p_internal_start - prob mass to spread for local begins
389 * t - [0..M-1][0..MAXCONNECT-1] transition probabilities (not necessarily cm->t)
390 * begin - [0..M-1] standard local begin probs to set
391 *
392 * Returns: eslOK on success
393 * eslEMEM if out of memory
394 */
395
396 int
cm_CalculateLocalBeginProbs(CM_t * cm,float p_internal_start,float ** t,float * begin)397 cm_CalculateLocalBeginProbs(CM_t *cm, float p_internal_start, float **t, float *begin)
398 {
399 int nd; /* counter over nodes */
400 int nstarts; /* number of possible internal starts */
401 float p; /* p_internal_start / nstarts */
402
403 /* Count "internal" nodes: MATP, MATL, MATR, and BIF nodes.
404 * Ignore all start nodes, and also node 1 (which is always the
405 * "first" node and gets an entry prob of 1-p_internal_start).
406 */
407 nstarts = 0;
408 for (nd = 2; nd < cm->nodes; nd++) {
409 if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
410 cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BIF_nd)
411 nstarts++;
412 }
413
414 /* Set begin probs (standard local begins) */
415 esl_vec_FSet(begin, cm->M, 0.);
416 /* Node 1 gets prob 1-p_internal_start. */
417 begin[cm->nodemap[1]] = 1.-p_internal_start;
418 /* Remaining nodes share p_internal_start. */
419 p = p_internal_start / (float) nstarts;
420 for (nd = 2; nd < cm->nodes; nd++) {
421 if (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
422 cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BIF_nd)
423 begin[cm->nodemap[nd]] = p;
424 }
425 return eslOK;
426 }
427
428 /* Function: cm_localize()
429 * Incept: EPN, Tue Nov 29 13:46:29 2011 [updated]
430 *
431 * Purpose: Configure a CM for local alignment by spreading
432 * <p_internal_start> local entry probability evenly across
433 * all internal nodes, and by spreading <p_internal_exit>
434 * local exit probability evenly across all internal nodes.
435 *
436 * Local entry probabilities for truncated alignment
437 * (cm->trbegin and cm->trbeginsc) are configured slightly
438 * differently, to allow inserts at the ends of the
439 * alignment. As with standard local begins,
440 * <p_internal_start> is spread evenly across all <nstarts>
441 * internal nodes, but entries into inserts are also
442 * allowed. The <p_internal_start>/<nstarts> probability
443 * for each node is divided between match and insert states
444 * of that node using a <psi> vector, psi[v] is the expected
445 * number of times state v is entered.
446 *
447 * Note: to reproduce how Diana scored truncated alignments
448 * in the Kolbe and Eddy 2009 paper, set trbegin[v] to
449 * (2 / (cm->clen * (cm->clen + 1))) for all v.
450 *
451 * Local end probability is spread evenly across all states
452 * from which local ends are permitted (see code).
453 *
454 * We exploit the fact that we know we are a local static
455 * function called only by cm_ConfigureSub() and don't do
456 * any checking of the CM because cm_ConfigureSub() already
457 * has done that. If we were callable by other functions
458 * we'd want to make sure the CM wasn't already in local
459 * mode, at least.
460 *
461 * Args: cm - the covariance model
462 * p_internal_start - prob mass to spread for local begins
463 * p_internal_exit - prob mass to spread for local ends
464 */
465
466 void
cm_localize(CM_t * cm,float p_internal_start,float p_internal_exit)467 cm_localize(CM_t *cm, float p_internal_start, float p_internal_exit)
468 {
469 int status;
470 int v; /* counter over states */
471 int nd; /* counter over nodes */
472 int nexits; /* number of possible internal ends */
473 float denom;
474
475 /* Local begins: */
476 cm_CalculateLocalBeginProbs(cm, p_internal_start, cm->t, cm->begin);
477 /* Erase the previous transition probs from node 0. The only way out
478 * of node 0 in standard scanners/aligners is going to be local
479 * begin transitions from the root v=0 directly to MATP_MP, MATR_MR,
480 * MATL_ML, and BIF_B states. In truncated scanners/aligners we also
481 * allow transitions into inserts, see comments in CalculateLocalBeginProbs().
482 *
483 * First we want to save the transition probs we're about to zero,
484 * so we don't lose that information in case we need it subsequently
485 * cm->root_trans is NULL prior to this.
486 */
487 if(cm->root_trans == NULL) { /* otherwise they've already been set */
488 ESL_ALLOC(cm->root_trans, sizeof(float) * cm->cnum[0]);
489 esl_vec_FCopy(cm->t[0], cm->cnum[0], cm->root_trans);
490 }
491 esl_vec_FSet(cm->t[0], cm->cnum[0], 0.);
492 cm->flags |= CMH_LOCAL_BEGIN;
493
494 /* Local ends:
495 * Count internal nodes MATP, MATL, MATR, BEGL, BEGR that aren't
496 * adjacent to END nodes.
497 */
498 nexits = 0;
499 for (nd = 1; nd < cm->nodes; nd++) {
500 if ((cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
501 cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BEGL_nd ||
502 cm->ndtype[nd] == BEGR_nd) &&
503 cm->ndtype[nd+1] != END_nd)
504 nexits++;
505 }
506 /* Spread the exit probability across internal nodes.
507 * Currently does not compensate for the decreasing probability
508 * of reaching a node, the way HMMER does: therefore the probability
509 * of exiting at later nodes is actually lower than the probability
510 * of exiting at earlier nodes. This should be a small effect.
511 */
512 for (v = 0; v < cm->M; v++) cm->end[v] = 0.;
513 for (nd = 1; nd < cm->nodes; nd++) {
514 if ((cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
515 cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BEGL_nd ||
516 cm->ndtype[nd] == BEGR_nd) &&
517 cm->ndtype[nd+1] != END_nd)
518 {
519 v = cm->nodemap[nd];
520 cm->end[v] = p_internal_exit / (float) nexits;
521 /* renormalize the main model transition distribution */
522 denom = esl_vec_FSum(cm->t[v], cm->cnum[v]);
523 denom += cm->end[v];
524 esl_vec_FScale(cm->t[v], cm->cnum[v], 1./denom);
525 /* cm->t[v] vector will purposefully no longer sum to 1.,
526 * if we were to append cm->end[v] as a new number in the vector, it would sum to 1.
527 */
528 }
529 }
530 cm->flags |= CMH_LOCAL_END;
531
532 /* new probs invalidate log odds scores if we had them */
533 cm->flags &= ~CMH_BITS;
534
535 return;
536
537 ERROR:
538 cm_Fail("Memory allocation error.");
539 }
540
541 /* Function: cp9_sw_config()
542 * Incept: EPN 05.30.06
543 * based on SRE's Plan7SWConfig() from HMMER's plan7.c
544 * EPN, Mon Dec 12 04:35:28 2011 [Updated, made local in cm_modelconfig.c]
545 *
546 * Purpose: Set the alignment independent parameters of
547 * a CM Plan 9 model to hmmsw (Smith/Waterman) configuration.
548 *
549 * Notes: The desideratum for begin/end probs is that all fragments ij
550 * (starting at match i, ending at match j) are
551 * equiprobable -- there is no information in the choice of
552 * entry/exit. There are M(M+1)/2 possible choices of ij, so
553 * each must get a probability of 2/M(M+1). This prob is the
554 * product of a begin, an end, and all the not-end probs in
555 * the path between i,j.
556 *
557 * Thus: entry/exit is asymmetric because of the left/right
558 * nature of the HMM/profile. Entry probability is distributed
559 * simply by assigning p_x = pentry / (M-1) to M-1
560 * internal match states. However, the same approach doesn't
561 * lead to a flat distribution over exit points. Exit p's
562 * must be corrected for the probability of a previous exit
563 * from the model. Requiring a flat distribution over exit
564 * points leads to an easily solved piece of algebra, giving:
565 * p_1 = pexit / (M-1)
566 * p_x = p_1 / (1 - (x-1) p_1)
567 *
568 * Modified EPN, Thu Feb 7 15:54:16 2008, as follows:
569 * To better match a locally configured CM, if <do_match_local_cm>
570 * we disallow insertions before the first (emitting) match state,
571 * (from I_0), and after the final (emitting) match state,
572 * (from I_M). I_0 maps to ROOT_IL and I_M maps to ROOT_IR
573 * which can never be entered in a locally configured CM
574 * (b/c the ROOT_S state MUST jump into a local begin state, which
575 * are always match states>). Also we disallow a M_0->D_1 transition
576 * because these would be impossible in a locally configured CM.
577 *
578 * <do_match_local_cm> is usually TRUE, unless we're configuring
579 * the CP9 specifically for eventual sub CM alignment, where
580 * the goal is simply find the most likely start/end point
581 * of the alignment with this CP9 (in that case we want
582 * I_0 and I_M reachable).
583 *
584 * HMM probabilities are modified, but HMM is only
585 * logoddsified to get valid bit scores before leaving the
586 * function if it had valid bit scores upon entering.
587 *
588 * Args: hmm - the CM Plan 9 model w/ data-dep prob's valid
589 * pentry - probability of an internal entry somewhere;
590 * will be evenly distributed over M-1 match states
591 * pexit - probability of an internal exit somewhere;
592 * will be distributed over M-1 match states.
593 * do_match_local_cm - TRUE to make I_0, D_1 and I_M unreachable
594 * to better match a locally configured CM.
595 * first_cm_ndtype - only used if do_match_local_cm is TRUE
596 * if it's MATL or MATP then D_1 should be unreachable (it is in the CM)
597 * if it's MATR or MATP then D_M should be unreachable (it is in the CM)
598 *
599 * Return: (void)
600 * HMM probabilities are modified.
601 */
602 int
cp9_sw_config(CP9_t * hmm,float pentry,float pexit,int do_match_local_cm,int first_cm_ndtype)603 cp9_sw_config(CP9_t *hmm, float pentry, float pexit, int do_match_local_cm, int first_cm_ndtype)
604 {
605 float basep; /* p1 for exits: the base p */
606 int k; /* counter over states */
607 float d;
608 int had_bits;
609
610 had_bits = (hmm->flags & CPLAN9_HASBITS) ? TRUE : FALSE;
611
612 /* No special (*x* states in Plan 7) states in CM Plan 9 */
613
614 /* Configure entry.
615 */
616 if(do_match_local_cm) {
617 hmm->t[0][CTMI] = 0.;
618 hmm->t[0][CTMM] = 0.; /* already was 0.0, transition from M_0 to M_1 is begin[1] */
619 hmm->t[0][CTMEL] = 0.; /* already was 0.0, can never do a local end from M_0 */
620 if((first_cm_ndtype == MATL_nd) || (first_cm_ndtype == MATP_nd)) { /* CM can't possibly reach the CM delete state that maps to D_1, make D_1 unreachable too */
621 hmm->t[0][CTMD] = 0.;
622 }
623
624 hmm->t[hmm->M][CTMI] = 0.;
625 hmm->t[hmm->M][CTDI] = 0.;
626 if((first_cm_ndtype == MATR_nd) || (first_cm_ndtype == MATP_nd)) { /* CM can't possibly reach the CM delete state that maps to D_M, make D_M unreachable too */
627 hmm->t[hmm->M][CTMD] = 0.;
628 }
629
630 /* renormalize transitions out of M_M */
631 d = esl_vec_FSum(hmm->t[hmm->M], cp9_TRANS_NMATCH) + hmm->end[hmm->M];
632 esl_vec_FScale(hmm->t[hmm->M], cp9_TRANS_NMATCH, 1./d);
633 hmm->end[hmm->M] /= d;
634
635 /* renormalize transitions out of D_M */
636 esl_vec_FNorm(hmm->t[hmm->M] + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE); /* delete */
637 }
638
639 hmm->begin[1] = (1. - pentry) * (1. - (hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL]));
640 esl_vec_FSet(hmm->begin+2, hmm->M-1, (pentry * (1.- (hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL]))) / (float)(hmm->M-1));
641 /* note: hmm->t[0][CTMEL] == 0. (can't locally end from begin)
642 * and if do_match_local_cm, hmm->t[0][CTMI] and hmm->t[0][CTMD] were just set to 0.
643 */
644
645 /* Configure exit.
646 * Don't touch hmm->end[hmm->M]
647 */
648
649 basep = pexit / (float) (hmm->M-1);
650 for (k = 1; k < hmm->M; k++)
651 hmm->end[k] = basep / (1. - basep * (float) (k-1));
652 cp9_renormalize_exits(hmm);
653 /*for (k = 1; k <= hmm->M; k++) printf("after renormalizing: end[%d]: %f\n", k, hmm->end[k]);*/
654
655 hmm->flags &= ~CPLAN9_HASBITS; /* reconfig invalidates log-odds scores */
656 hmm->flags |= CPLAN9_LOCAL_BEGIN; /* local begins now on */
657 hmm->flags |= CPLAN9_LOCAL_END; /* local ends now on */
658
659 /* only call CP9Logoddsify() if we had valid scores upon entering */
660 if(had_bits) CP9Logoddsify(hmm);
661
662 return eslOK;
663 }
664
665 /* Function: cp9_EL_local_ends_config()
666 * Incept: EPN, Tue Jun 19 09:50:52 2007
667 * EPN, Mon Dec 12 04:38:23 2011 [Updated, made local in cm_modelconfig.c]
668 *
669 * Purpose: Turn EL local ends in a CM Plan 9 HMM on based on
670 * the local end probs in the CM.
671 *
672 * HMM probabilities are modified, but HMM is only
673 * logoddsified to get valid bit scores before leaving the
674 * function if it had valid bit scores upon entering.
675
676 * Args: cp9 - the cp9 HMM
677 * cm - the CM the cp9 was built from
678 *
679 * Return: eslOK on success.
680 * eslEINVAL on any error, errbuf is filled.
681 */
682 int
cp9_EL_local_ends_config(CP9_t * cp9,CM_t * cm,char * errbuf)683 cp9_EL_local_ends_config(CP9_t *cp9, CM_t *cm, char *errbuf)
684 {
685 /* Contract checks */
686 if(cp9->M != cm->clen) ESL_FAIL(eslEINVAL, errbuf, "cp9 and cm model length do not match");
687 if(cm->cp9map == NULL) ESL_FAIL(eslEINVAL, errbuf, "cm->cp9map is NULL when trying to setup ELs in cp9");
688 if(cp9->flags & CPLAN9_EL) ESL_FAIL(eslEINVAL, errbuf, "trying to setup ELs in a cp9, but CPLAN_EL flag already raised");
689
690 int v;
691 int k; /* counter over HMM nodes */
692 int nd;
693 int seen_exit;
694 float to_el_prob;
695 float norm_factor;
696 int nexits;
697 int had_bits = (cp9->flags & CPLAN9_HASBITS) ? TRUE : FALSE;
698
699 /* If the CM has local ends on, check to make sure all non-zero
700 * local end probabilities in the CM are identical (within reasonable
701 * precision), use that probability to set all HMM transitions to
702 * EL states.
703 */
704 if(cm->flags & CMH_LOCAL_END) {
705 seen_exit = FALSE;
706 to_el_prob = 0.;
707 for(v = 0; v < cm->M; v++) {
708 nd = cm->ndidx[v];
709 if (((cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
710 cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BEGL_nd ||
711 cm->ndtype[nd] == BEGR_nd) &&
712 cm->ndtype[nd+1] != END_nd) && cm->nodemap[nd] == v) {
713 /* this should have a non-zero local end probability */
714 if(fabs(cm->end[v] - 0.) < eslSMALLX1) ESL_FAIL(eslEINVAL, errbuf, "cp9_el_local_ends_config(), CM state %d has local end prob of 0.", v);
715 if(! seen_exit) {
716 to_el_prob = cm->end[v];
717 seen_exit = TRUE;
718 }
719 else if(fabs(to_el_prob - cm->end[v]) > eslSMALLX1) {
720 ESL_FAIL(eslEINVAL, errbuf, "cp9_el_local_ends_config(), not all CM states EL probs are identical.\n");
721 }
722 }
723 }
724 if(! seen_exit && cm->nodes != 3) ESL_FAIL(eslEINVAL, errbuf, "cp9_el_local_ends_config(), cm->nodes != 3, but all CM local end probs are zero.");
725 }
726 else {
727 /* CM_LOCAL_END flag is down, local ends are off in the CM
728 * We figure out what the local end prob would be given cm->pend
729 * and set the HMM local end probs based on that.
730 * First, count internal nodes MATP, MATL, MATR, BEGL, BEGR that aren't
731 * adjacent to END nodes.
732 */
733 nexits = 0;
734 for (nd = 1; nd < cm->nodes; nd++) {
735 if ((cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd ||
736 cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BEGL_nd ||
737 cm->ndtype[nd] == BEGR_nd) &&
738 cm->ndtype[nd+1] != END_nd)
739 nexits++;
740 }
741 to_el_prob = cm->pend / (float) nexits;
742 }
743
744 /* transitions from HMM node 0 to EL is impossible */
745 cp9->t[0][CTMEL] = 0.;
746 for(k = 1; k <= cp9->M; k++)
747 {
748 if(cp9->has_el[k])
749 {
750 cp9->t[k][CTMEL] = to_el_prob;
751 norm_factor = 1. - (cp9->t[k][CTMEL] / (1. - cp9->end[k]));
752 cp9->t[k][CTMM] *= norm_factor;
753 cp9->t[k][CTMI] *= norm_factor;
754 cp9->t[k][CTMD] *= norm_factor;
755 /* cp9->end[k] untouched */
756 }
757 }
758 cp9->flags &= ~CPLAN9_HASBITS; /* clear the log-odds ready flag */
759
760 /* only call CP9Logoddsify() if we had valid scores upon entering */
761 if(had_bits) CP9Logoddsify(cp9);
762
763 cp9->flags |= CPLAN9_EL; /* EL end locals now on */
764 /*debug_print_cp9_params(cp9);*/
765
766 return eslOK;
767 }
768
769
770 /* Function: cp9_renormalize_exits()
771 * Incept: EPN 05.30.06
772 * based on SRE's Plan7RenormalizeExits() from HMMER2's plan7.c.
773 * EPN, Mon Dec 12 04:49:29 2011 [Updated, made local in cm_modelconfig.c]
774 *
775 * Purpose: Renormalize just the match state transitions;
776 * for instance, after a Config() function has
777 * modified the exit distribution.
778 *
779 * Args: hmm - hmm to renormalize
780 *
781 * Returns: void
782 */
783 void
cp9_renormalize_exits(CP9_t * hmm)784 cp9_renormalize_exits(CP9_t *hmm)
785 {
786 int k;
787 float d;
788
789 /* We can't exit from node 0 so we start renormalizing at node 1 */
790 for (k = 1; k < hmm->M; k++) {
791 d = esl_vec_FSum(hmm->t[k], 4);
792 /* esl_vec_FScale(hmm->t[k], 4, 1./(d + d*hmm->end[k])); */
793 esl_vec_FScale(hmm->t[k], 4, (1.-hmm->end[k])/d);
794 }
795 /* Take care of hmm->M node, which is special */
796 d = hmm->t[hmm->M][CTMI] + hmm->t[hmm->M][CTMEL]; /* CTMD is IMPOSSIBLE, CTMM is hmm->end[hmm-M] */
797 if(! (fabs(d-0.) < eslSMALLX1)) { /* don't divide by d if it's zero */
798 hmm->t[hmm->M][CTMI] *= (1.-hmm->end[hmm->M])/d;
799 hmm->t[hmm->M][CTMEL] *= (1.-hmm->end[hmm->M])/d;
800 }
801
802 hmm->flags &= ~CPLAN9_HASBITS; /* reconfig invalidates log-odds scores */
803
804 return;
805 }
806