1 /* eweight.c [EPN 11.07.05]
2 * based on: HMMER 2.4devl's lsj_eweight.c
3 * Most original comments from lsj_eweight.c untouched.
4 *
5 * LSJ, Wed Feb 4 15:03:58 CST 2004
6 *
7 * entropy targeting:
8 * Code for setting effective sequence number (in cmbuild) by
9 * achieving a certain target entropy loss, relative to background
10 * null distribution.
11 */
12
13 #include "esl_config.h"
14 #include "p7_config.h"
15 #include "config.h"
16
17 #include <stdio.h>
18 #include <string.h>
19 #include <math.h>
20
21 #include "easel.h"
22 #include "esl_rootfinder.h"
23 #include "esl_vectorops.h"
24
25 #include "hmmer.h"
26
27 #include "infernal.h"
28
29 struct ew_param_s {
30 CM_t *cm; /* ptr to the original count-based CM, cm->t and cm->e be changed, but we have a copy of the original data in t_orig and e_orig */
31 const Prior_t *pri; /* Dirichlet prior used to parameterize from counts */
32 float **t_orig; /* copy of initial (when this object was created) CM transitions in counts form */
33 float **e_orig; /* copy of initial (when this object was created) CM emissions in counts form */
34 float *begin_orig; /* copy of initial (when this object was created) CM begin transitions in counts form */
35 float *end_orig; /* copy of initial (when this object was created) CM end transitions in counts form */
36 double etarget; /* information content target, in bits */
37 };
38
39
40 /* Evaluate fx = cm rel entropy - etarget, which we want to be = 0,
41 * for effective sequence number <Neff>.
42 */
43 static int
cm_eweight_target_f(double Neff,void * params,double * ret_fx)44 cm_eweight_target_f(double Neff, void *params, double *ret_fx)
45 {
46 struct ew_param_s *p = (struct ew_param_s *) params;
47 int v, i;
48
49 /* copy parameters from to p->*_orig to CM arrays */
50 for (v = 0; v < p->cm->M; v++) {
51 for (i = 0; i < MAXCONNECT; i++) p->cm->t[v][i] = p->t_orig[v][i];
52 for (i = 0; i < p->cm->abc->K * p->cm->abc->K; i++) p->cm->e[v][i] = p->e_orig[v][i];
53 p->cm->begin[v] = p->begin_orig[v];
54 p->cm->end[v] = p->end_orig[v];
55 }
56 cm_Rescale(p->cm, Neff / (double) p->cm->nseq);
57 PriorifyCM(p->cm, p->pri);
58 *ret_fx = cm_MeanMatchRelativeEntropy(p->cm) - p->etarget; /* only diff with hmm_eweight_target_f */
59 return eslOK;
60 }
61
62 /* Evaluate fx = hmm rel entropy - etarget, which we want to be = 0,
63 * for effective sequence number <x>. Differs from cm_eweight_target_f
64 * in that emissions from MATP_MP pair emitting states are marginalized
65 * out, effectively treating the CM like an HMM. This is done with
66 * a cm_MeanMatchRelativeEntropyHMM() instead of cm_MeanMatchRelativeEntropy().
67 *
68 */
69 static int
hmm_eweight_target_f(double Neff,void * params,double * ret_fx)70 hmm_eweight_target_f(double Neff, void *params, double *ret_fx)
71 {
72 struct ew_param_s *p = (struct ew_param_s *) params;
73 int v, i;
74 /*printf("hmm_eweight_target_f() Neff: %f\n", Neff); */
75
76 /* copy parameters from CM to p->*_orig arrays */
77 for (v = 0; v < p->cm->M; v++) {
78 for (i = 0; i < MAXCONNECT; i++) p->cm->t[v][i] = p->t_orig[v][i];
79 for (i = 0; i < p->cm->abc->K * p->cm->abc->K; i++) p->cm->e[v][i] = p->e_orig[v][i];
80 p->cm->begin[v] = p->begin_orig[v];
81 p->cm->end[v] = p->end_orig[v];
82 }
83 cm_Rescale(p->cm, Neff / (double) p->cm->nseq);
84 PriorifyCM(p->cm, p->pri);
85 *ret_fx = cm_MeanMatchRelativeEntropyHMM(p->cm) - p->etarget; /* only diff with cm_eweight_target_f */
86 return eslOK;
87 }
88
89 /* Function: cm_EntropyWeight()
90 * Incept: EPN, Mon Jan 7 07:19:46 2008
91 * based on HMMER3's p7_EntropyWeight()
92 * SRE, Fri May 4 15:32:59 2007 [Janelia]
93 *
94 * Purpose: Use the "entropy weighting" algorithm to determine
95 * what effective sequence number we should use, and
96 * return it in <ret_Neff>.
97 *
98 * Caller provides a count-based <cm>, and the
99 * Dirichlet prior <pri> that's to be used to parameterize
100 * models; neither of these will be modified.
101 * Caller also provides the relative entropy
102 * target in bits in <etarget>.
103 *
104 * <ret_Neff> will range from min_Neff to the true number of
105 * sequences counted into the model, <hmm->nseq>.
106 *
107 * If <pretend_cm_is_hmm> is TRUE the CM's MATP_MP pair
108 * emissions are marginalized, treating pair emitting states
109 * effectively as a pair of singlet emitting states.
110 *
111 * Minimum allowed Neff is <min_Neff>. Maximum allowed Neff
112 * is <max_Neff> (new as of v1.1.3, controllable via
113 * cmbuild cmdline option --emaxseq).)
114 *
115 * Returns: <eslOK> on success.
116 *
117 * Throws: <eslEMEM> on allocation failure.
118 */
119 int
cm_EntropyWeight(CM_t * cm,const Prior_t * pri,double etarget,double min_Neff,double max_Neff,int pretend_cm_is_hmm,double * ret_hmm_re,double * ret_Neff)120 cm_EntropyWeight(CM_t *cm, const Prior_t *pri, double etarget, double min_Neff, double max_Neff, int pretend_cm_is_hmm, double *ret_hmm_re, double *ret_Neff)
121 {
122 int status;
123 ESL_ROOTFINDER *R = NULL;
124 struct ew_param_s p;
125 double Neff;
126 double fx;
127 double hmm_re;
128 int v, i;
129
130 /* Store parameters in the structure we'll pass to the rootfinder
131 */
132 p.cm = cm;
133 /* copy parameters of the CM that will be changed by cm_Rescale() */
134 ESL_ALLOC(p.t_orig, (cm->M) * sizeof(float *));
135 ESL_ALLOC(p.e_orig, (cm->M) * sizeof(float *));
136 ESL_ALLOC(p.begin_orig, (cm->M) * sizeof(float));
137 ESL_ALLOC(p.end_orig, (cm->M) * sizeof(float));
138
139 p.t_orig[0] = NULL;
140 p.e_orig[0] = NULL;
141 ESL_ALLOC(p.t_orig[0], MAXCONNECT * cm->M * sizeof(float));
142 ESL_ALLOC(p.e_orig[0], cm->abc->K * cm->abc->K * cm->M * sizeof(float));
143 for (v = 0; v < cm->M; v++) {
144 p.t_orig[v] = p.t_orig[0] + v * MAXCONNECT;
145 p.e_orig[v] = p.e_orig[0] + v * (cm->abc->K * cm->abc->K);
146 }
147 for (v = 0; v < p.cm->M; v++) {
148 for (i = 0; i < MAXCONNECT; i++) p.t_orig[v][i] = cm->t[v][i];
149 for (i = 0; i < cm->abc->K * cm->abc->K; i++) p.e_orig[v][i] = cm->e[v][i];
150 p.begin_orig[v] = cm->begin[v];
151 p.end_orig[v] = cm->end[v];
152 }
153 p.pri = pri;
154 p.etarget = etarget;
155
156 /* First, check if min_Neff gives a rel entropy >= e.target, if so
157 * set Neff to min_Neff. In this case its impossible to get a Neff
158 * < min_Neff such that a relent == etarget, so we use min_Neff;
159 */
160 Neff = min_Neff;
161 if(pretend_cm_is_hmm) { if ((status = hmm_eweight_target_f(Neff, &p, &fx)) != eslOK) goto ERROR; }
162 else { if ((status = cm_eweight_target_f(Neff, &p, &fx)) != eslOK) goto ERROR; }
163
164 if(fx < 0.) { /* an Neff > min_Neff that gives a rel entropy of p.etarget is achievable, find it */
165
166 /* check if max_Neff gives a rel_entropy < e.target, if so set
167 * Neff to max_Neff. In this case its impossible to get a Neff >
168 * max_Neff such that relent == etarget, so we use max_Neff;
169 * (max_Neff is typically cm->nseq (for cmbuild, this it's
170 * cm->nseq unless --emaxseq is used on cmdline). Using
171 * max_Neff in this way is new in v1.1.3, in versions 1.1.2
172 * and earlier max_Neff was fixed at cm->nseq.
173 */
174 Neff = max_Neff;
175 if(pretend_cm_is_hmm) { if ((status = hmm_eweight_target_f(Neff, &p, &fx)) != eslOK) goto ERROR; }
176 else { if ((status = cm_eweight_target_f(Neff, &p, &fx)) != eslOK) goto ERROR; }
177
178 if (fx > 0.) {
179 if(pretend_cm_is_hmm) { if ((R = esl_rootfinder_Create(hmm_eweight_target_f, &p)) == NULL) {status = eslEMEM; goto ERROR;} }
180 else { if ((R = esl_rootfinder_Create(cm_eweight_target_f, &p)) == NULL) {status = eslEMEM; goto ERROR;} }
181 esl_rootfinder_SetAbsoluteTolerance(R, 0.01); /* getting Neff to ~2 sig digits is fine */
182 if ((status = esl_root_Bisection(R, 0., max_Neff, &Neff)) != eslOK) goto ERROR;
183
184 esl_rootfinder_Destroy(R);
185 }
186 }
187 /* we've found Neff, determine the relative entropy of the CM if we marginalize the MP pair emissions,
188 * this is (ALMOST) the relative entropy of the CP9 HMM we'll eventually construct from it,
189 * (it's only ALMOST b/c the CP9 will have marginalized emissions from the MATP_MP PLUS the MATP_ML
190 * state weighted by the expected number of times each state is visited).
191 */
192 hmm_re = cm_MeanMatchRelativeEntropyHMM(p.cm);
193
194 /* reset CM params to their original values */
195 for (v = 0; v < cm->M; v++) {
196 for (i = 0; i < MAXCONNECT; i++) cm->t[v][i] = p.t_orig[v][i];
197 for (i = 0; i < cm->abc->K * cm->abc->K; i++) cm->e[v][i] = p.e_orig[v][i];
198 cm->begin[v] = p.begin_orig[v];
199 cm->end[v] = p.end_orig[v];
200 }
201 /* free params p */
202 free(p.t_orig[0]);
203 free(p.t_orig);
204 free(p.e_orig[0]);
205 free(p.e_orig);
206 free(p.begin_orig);
207 free(p.end_orig);
208
209 *ret_hmm_re = hmm_re;
210 *ret_Neff = Neff;
211 return eslOK;
212
213 ERROR:
214 if (R != NULL) esl_rootfinder_Destroy(R);
215 *ret_Neff = (double) cm->nseq;
216 return status;
217 }
218
219 /* Function: cm_Rescale()
220 *
221 * Incept: EPN 11.07.05
222 * based on: HMMER's plan7.c's Plan7Rescale() (Steve Johnson)
223 *
224 * Purpose: Scale a counts-based CM by some factor, for
225 * adjusting counts to a new effective sequence number.
226 *
227 * Args: cm - counts based CM.
228 * scale - scaling factor (e.g. eff_nseq/nseq); 1.0= no scaling.
229 *
230 * Returns: (void)
231 */
232 void
cm_Rescale(CM_t * cm,float scale)233 cm_Rescale(CM_t *cm, float scale)
234 {
235 int v;
236
237 for (v = 0; v < cm->M; v++)
238 {
239 /* Scale transition counts vector if not a BIF or E state */
240 if (cm->sttype[v] != B_st && cm->sttype[v] != E_st)
241 {
242 /* Number of transitions is cm->cnum[v] */
243 esl_vec_FScale(cm->t[v], cm->cnum[v], scale);
244 }
245 /* Scale emission counts vectors */
246 if (cm->sttype[v] == MP_st)
247 { /* Consensus base pairs */
248 esl_vec_FScale(cm->e[v], (cm->abc->K*cm->abc->K), scale);
249 }
250 else if ((cm->sttype[v] == ML_st) ||
251 (cm->sttype[v] == MR_st) ||
252 (cm->sttype[v] == IL_st) ||
253 (cm->sttype[v] == IR_st))
254 { /* singlets (some consensus, some not)*/
255 esl_vec_FScale(cm->e[v], cm->abc->K, scale);
256 }
257 }/* end loop over states v */
258
259 /* begin, end transitions; only valid [0..M-1] */
260 esl_vec_FScale(cm->begin, cm->M, scale);
261 esl_vec_FScale(cm->end, cm->M, scale);
262
263 return;
264 }
265
266 /* Function: cp9_Rescale()
267 * EPN based on Steve Johnsons plan 7 version
268 *
269 * Purpose: Scale a counts-based HMM by some factor, for
270 * adjusting counts to a new effective sequence number.
271 *
272 * Args: hmm - counts based HMM.
273 * scale - scaling factor (e.g. eff_nseq/nseq); 1.0= no scaling.
274 *
275 * Returns: (void)
276 */
277 void
cp9_Rescale(CP9_t * hmm,float scale)278 cp9_Rescale(CP9_t *hmm, float scale)
279 {
280 int k;
281
282 /* emissions and transitions in the main model.
283 * Note that match states are 1..M, insert states are 0..M,
284 * and deletes are 0..M-1
285 */
286 for(k = 1; k <= hmm->M; k++)
287 esl_vec_FScale(hmm->mat[k], hmm->abc->K, scale);
288 for(k = 0; k <= hmm->M; k++)
289 esl_vec_FScale(hmm->ins[k], hmm->abc->K, scale);
290 for(k = 0; k < hmm->M; k++)
291 esl_vec_FScale(hmm->t[k], cp9_NTRANS, scale);
292
293 /* begin, end transitions; only valid [1..M] */
294 esl_vec_FScale(hmm->begin+1, hmm->M, scale);
295 esl_vec_FScale(hmm->end+1, hmm->M, scale);
296
297 return;
298 }
299
300 /* Function: cm_MeanMatchInfo()
301 * Incept: SRE, Fri May 4 11:43:56 2007 [Janelia]
302 *
303 * Purpose: Calculate the mean information content per match state
304 * emission distribution, in bits:
305 *
306 * \[
307 * \frac{1}{M} \sum_{k=1}^{M}
308 * \left[
309 * - \sum_x p_k(x) \log_2 p_k(x)
310 * + \sum_x f(x) \log_2 f(x)
311 * \right]
312 * \]
313 *
314 * where $p_k(x)$ is emission probability for symbol $x$
315 * from match state $k$, and $f(x)$ is the null model's
316 * background emission probability for $x$.
317 *
318 */
319 double
cm_MeanMatchInfo(const CM_t * cm)320 cm_MeanMatchInfo(const CM_t *cm)
321 {
322 return esl_vec_FEntropy(cm->null, cm->abc->K) - cm_MeanMatchEntropy(cm);
323 }
324
325 /*
326 * Function: cm_MeanMatchEntropy
327 * Incept: EPN, Tue May 1 14:06:37 2007
328 * Updated to match Sean's analogous p7_MeanMatchEntropy() in
329 * HMMER3's hmmstat.c, EPN, Sat Jan 5 14:48:27 2008.
330 *
331 * Purpose: Calculate the mean entropy per match state emission
332 * distribution, in bits:
333 *
334 * \[
335 * \frac{1}{clen} \sum_{v=0}^{M-1} -\sum_x p_v(x) \log_2 p_v(x)
336 * \]
337 *
338 * where $p_v(x)$ is emission probability for symbol $x$
339 * from MATL\_ML, MATR\_MR or MATP\_MP state $v$. For MATP\_MP
340 * states symbols $x$ are base pairs.
341 */
342 double
cm_MeanMatchEntropy(const CM_t * cm)343 cm_MeanMatchEntropy(const CM_t *cm)
344 {
345 int v;
346 double H = 0.;
347
348 for (v = 0; v < cm->M; v++)
349 {
350 if(cm->stid[v] == MATP_MP)
351 H += esl_vec_FEntropy(cm->e[v], (cm->abc->K * cm->abc->K));
352 else if(cm->stid[v] == MATL_ML ||
353 cm->stid[v] == MATR_MR)
354 H += esl_vec_FEntropy(cm->e[v], cm->abc->K);
355 }
356 H /= (double) cm->clen;
357 return H;
358 }
359
360
361 /* Function: cm_MeanMatchRelativeEntropy()
362 * Incept: SRE, Fri May 11 09:25:01 2007 [Janelia]
363 *
364 * Purpose: Calculate the mean relative entropy per match state emission
365 * distribution, in bits:
366 *
367 * \[
368 * \frac{1}{M} \sum_{v=0}^{M-1} \sum_x p_v(x) \log_2 \frac{p_v(x)}{f(x)}
369 * \]
370 *
371 * where $p_v(x)$ is emission probability for symbol $x$
372 * from MATL\_ML, MATR\_MR, or MATP\_MP state state $v$,
373 * and $f(x)$ is the null model's background emission
374 * probability for $x$. For MATP\_MP states, $x$ is a
375 * base pair.
376 */
377 double
cm_MeanMatchRelativeEntropy(const CM_t * cm)378 cm_MeanMatchRelativeEntropy(const CM_t *cm)
379 {
380 int status;
381 int v;
382 double KL = 0.;
383 float *pair_null;
384 int i,j;
385 int KL_pair_denom = 0;
386 int KL_singlet_denom = 0;
387 float left_e[cm->abc->K];
388 float right_e[cm->abc->K];
389 double KL_pair = 0.;
390 double KL_pair_marg = 0.;
391 double KL_singlet = 0.;
392
393 ESL_ALLOC(pair_null, (sizeof(float) * cm->abc->K * cm->abc->K));
394 for(i = 0; i < cm->abc->K; i++)
395 for(j = 0; j < cm->abc->K; j++)
396 pair_null[(i * cm->abc->K) + j] = cm->null[i] * cm->null[j];
397
398 for (v = 0; v < cm->M; v++) {
399 if(cm->stid[v] == MATP_MP) {
400 KL += esl_vec_FRelEntropy(cm->e[v], pair_null, (cm->abc->K * cm->abc->K));
401 KL_pair += esl_vec_FRelEntropy(cm->e[v], pair_null, (cm->abc->K * cm->abc->K));
402 KL_pair_denom += 2;
403 /*printf("MP (%5d) %6.3f\n", v, esl_vec_FRelEntropy(cm->e[v], pair_null, (cm->abc->K * cm->abc->K)));*/
404
405 /* calculate marginals */
406 /* left half */
407 esl_vec_FSet(left_e, cm->abc->K, 0.);
408 for(i = 0; i < cm->abc->K; i++) {
409 for(j = (i*cm->abc->K); j < ((i+1)*cm->abc->K); j++) {
410 left_e[i] += cm->e[v][j];
411 }
412 }
413 esl_vec_FNorm(left_e, cm->abc->K);
414 KL_pair_marg += esl_vec_FRelEntropy(left_e, cm->null, cm->abc->K);
415 /*printf("cm L %4d (%4s) v: %5d KL: %10.5f (added: %10.5f)\n", cm->ndidx[v], "MATP", v, KL, esl_vec_FRelEntropy(left_e, cm->null, cm->abc->K));*/
416 /* right half */
417 esl_vec_FSet(right_e, cm->abc->K, 0.);
418 for(i = 0; i < cm->abc->K; i++) {
419 for(j = i; j < cm->abc->K * cm->abc->K; j += cm->abc->K) {
420 right_e[i] += cm->e[v][j];
421 }
422 }
423 KL_pair_marg += esl_vec_FRelEntropy(right_e, cm->null, cm->abc->K);
424 }
425 else if(cm->stid[v] == MATL_ML ||
426 cm->stid[v] == MATR_MR) {
427 KL += esl_vec_FRelEntropy(cm->e[v], cm->null, cm->abc->K);
428 KL_singlet += esl_vec_FRelEntropy(cm->e[v], cm->null, cm->abc->K);
429 KL_singlet_denom += 1;
430 /*printf("ML/MR (%5d) %6.3f\n", v, esl_vec_FRelEntropy(cm->e[v], cm->null, cm->abc->K));*/
431 }
432 }
433 free(pair_null);
434
435 /*printf("\n%s KL total %8.3f %8.3f per cpos\n", cm->name, KL, KL / (double) cm->clen);
436 printf("%s KL pair %8.3f %8.3f per paired cpos\n", cm->name, KL_pair, KL_pair / (double) KL_pair_denom);
437 printf("%s KL pair(m) %8.3f %8.3f per paired cpos\n", cm->name, KL_pair_marg, KL_pair_marg / (double) KL_pair_denom);
438 printf("%s KL singlet %8.3f %8.3f per unpaired cpos\n", cm->name, KL_singlet, KL_singlet / (double) KL_singlet_denom);*/
439
440 KL /= (double) cm->clen;
441 return KL;
442
443 ERROR:
444 cm_Fail("Memory allocation error.");
445 return 0.; /* NOTREACHED */
446 }
447
448
449 /* Function: cm_MeanMatchInfoHMM()
450 * Incept: EPN, Mon Feb 18 07:43:01 2008
451 *
452 * Purpose: Calculate the mean information content per match state
453 * emission distribution, in bits:
454 *
455 * \[
456 * \frac{1}{M} \sum_{k=1}^{M}
457 * \left[
458 * - \sum_x p_k(x) \log_2 p_k(x)
459 * + \sum_x f(x) \log_2 f(x)
460 * \right]
461 * \]
462 *
463 * where $p_k(x)$ is emission probability for symbol $x$
464 * from match state $k$, and $f(x)$ is the null model's
465 * background emission probability for $x$.
466 *
467 * Differs from cm_MeanMatchInfo() in that base pair emissions
468 * are marginalized, in effect treating the CM like an HMM that
469 * can only emit 1 residue at a time.
470 *
471 */
472 double
cm_MeanMatchInfoHMM(const CM_t * cm)473 cm_MeanMatchInfoHMM(const CM_t *cm)
474 {
475 return esl_vec_FEntropy(cm->null, cm->abc->K) - cm_MeanMatchEntropyHMM(cm);
476 }
477
478 /* Function: cm_MeanMatchEntropyHMM
479 * Incept: EPN, Mon Feb 18 08:06:20 2008
480 * Updated to match Sean's analogous p7_MeanMatchEntropy() in
481 * HMMER3's hmmstat.c, EPN, Sat Jan 5 14:48:27 2008.
482 *
483 * Purpose: Calculate the mean entropy per match state emission
484 * distribution, in bits:
485 *
486 * \[
487 * \frac{1}{clen} \sum_{v=0}^{M-1} -\sum_x p_v(x) \log_2 p_v(x)
488 * \]
489 *
490 * where $p_v(x)$ is emission probability for symbol $x$
491 * from MATL\_ML, MATR\_MR or MATP\_MP state $v$. For MATP\_MP
492 * states symbols $x$ are base pairs.
493 *
494 * Differs from cm_MeanMatchEntropy() in that base pair emissions
495 * are marginalized, in effect treating the CM like an HMM that
496 * can only emit 1 residue at a time.
497 */
498 double
cm_MeanMatchEntropyHMM(const CM_t * cm)499 cm_MeanMatchEntropyHMM(const CM_t *cm)
500 {
501 int v;
502 double H = 0.;
503 float left_e[cm->abc->K];
504 float right_e[cm->abc->K];
505 int i,j;
506
507 for (v = 0; v < cm->M; v++) {
508 if(cm->stid[v] == MATP_MP) {
509 /* calculate marginals */
510 /* left half */
511 esl_vec_FSet(left_e, cm->abc->K, 0.);
512 for(i = 0; i < cm->abc->K; i++) {
513 for(j = (i*cm->abc->K); j < ((i+1)*cm->abc->K); j++) {
514 left_e[i] += cm->e[v][j];
515 }
516 H += esl_vec_FEntropy(left_e, cm->abc->K);
517 }
518 /* right half */
519 esl_vec_FSet(right_e, cm->abc->K, 0.);
520 for(i = 0; i < cm->abc->K; i++) {
521 for(j = i; j < cm->abc->K * cm->abc->K; j += cm->abc->K) {
522 right_e[i] += cm->e[v][j];
523 }
524 H += esl_vec_FEntropy(right_e, cm->abc->K);
525 }
526 }
527 else if(cm->stid[v] == MATL_ML ||
528 cm->stid[v] == MATR_MR) {
529 H += esl_vec_FEntropy(cm->e[v], cm->abc->K);
530 }
531 }
532 H /= (double) cm->clen;
533 return H;
534 }
535
536 /* Function: cm_MeanMatchRelativeEntropyHMM()
537 * Incept: EPN, Mon Feb 18 08:06:24 2008
538 *
539 * Purpose: Calculate the mean relative entropy per match state emission
540 * distribution, in bits:
541 *
542 * \[
543 * \frac{1}{M} \sum_{v=0}^{M-1} \sum_x p_v(x) \log_2 \frac{p_v(x)}{f(x)}
544 * \]
545 *
546 * where $p_v(x)$ is emission probability for symbol $x$
547 * from MATL\_ML, MATR\_MR, or MATP\_MP state state $v$,
548 * and $f(x)$ is the null model's background emission
549 * probability for $x$. For MATP\_MP states, $x$ is a
550 * base pair.
551 *
552 * Differs from cm_MeanMatchRelativeEntropy() in that base pair
553 * emissions are marginalized, in effect treating the CM like an
554 * HMM that can only emit 1 residue at a time.
555 */
556 double
cm_MeanMatchRelativeEntropyHMM(const CM_t * cm)557 cm_MeanMatchRelativeEntropyHMM(const CM_t *cm)
558 {
559 int v;
560 double KL = 0.;
561 float left_e[cm->abc->K];
562 float right_e[cm->abc->K];
563 int i,j;
564
565 for (v = 0; v < cm->M; v++) {
566 if(cm->stid[v] == MATP_MP) {
567 /* calculate marginals */
568 /* left half */
569 esl_vec_FSet(left_e, cm->abc->K, 0.);
570 for(i = 0; i < cm->abc->K; i++) {
571 for(j = (i*cm->abc->K); j < ((i+1)*cm->abc->K); j++) {
572 left_e[i] += cm->e[v][j];
573 }
574 }
575 esl_vec_FNorm(left_e, cm->abc->K);
576 KL += esl_vec_FRelEntropy(left_e, cm->null, cm->abc->K);
577 /*printf("cm L %4d (%4s) v: %5d KL: %10.5f (added: %10.5f)\n", cm->ndidx[v], "MATP", v, KL, esl_vec_FRelEntropy(left_e, cm->null, cm->abc->K));*/
578 /* right half */
579 esl_vec_FSet(right_e, cm->abc->K, 0.);
580 for(i = 0; i < cm->abc->K; i++) {
581 for(j = i; j < cm->abc->K * cm->abc->K; j += cm->abc->K) {
582 right_e[i] += cm->e[v][j];
583 }
584 }
585 KL += esl_vec_FRelEntropy(right_e, cm->null, cm->abc->K);
586 /*printf("cm R %4d (%4s) v: %5d KL: %10.5f (added: %10.5f)\n", cm->ndidx[v], "MATP", v, KL, esl_vec_FRelEntropy(right_e, cm->null, cm->abc->K));*/
587 }
588 else if(cm->stid[v] == MATL_ML ||
589 cm->stid[v] == MATR_MR) {
590 KL += esl_vec_FRelEntropy(cm->e[v], cm->null, cm->abc->K);
591 /*printf("cm %4d (%4s) v: %5d KL: %10.5f (added %10.5f)\n", cm->ndidx[v], Nodetype(cm->ndtype[cm->ndidx[v]]), v, KL, esl_vec_FRelEntropy(cm->e[v], cm->null, cm->abc->K));*/
592 }
593 }
594
595 KL /= (double) cm->clen;
596 return KL;
597 }
598
599 /* Function: cp9_MeanMatchInfo()
600 * Incept: SRE, Fri May 4 11:43:56 2007 [Janelia]
601 *
602 * Purpose: Calculate the mean information content per match state
603 * emission distribution, in bits:
604 *
605 * \[
606 * \frac{1}{M} \sum_{k=1}^{M}
607 * \left[
608 * - \sum_x p_k(x) \log_2 p_k(x)
609 * + \sum_x f(x) \log_2 f(x)
610 * \right]
611 * \]
612 *
613 * where $p_k(x)$ is emission probability for symbol $x$
614 * from match state $k$, and $f(x)$ is the null model's
615 * background emission probability for $x$.
616 *
617 * This statistic is used in "entropy weighting" to set the
618 * total sequence weight when model building.
619 */
620 double
cp9_MeanMatchInfo(const CP9_t * cp9)621 cp9_MeanMatchInfo(const CP9_t *cp9)
622 {
623 return esl_vec_FEntropy(cp9->null, cp9->abc->K) - cp9_MeanMatchEntropy(cp9);
624 }
625
626 /* Function: cp9_MeanMatchEntropy()
627 * Incept: SRE, Fri May 4 13:37:15 2007 [Janelia]
628 *
629 * Purpose: Calculate the mean entropy per match state emission
630 * distribution, in bits:
631 *
632 * \[
633 * \frac{1}{M} \sum_{k=1}^{M} -\sum_x p_k(x) \log_2 p_k(x)
634 * \]
635 *
636 * where $p_k(x)$ is emission probability for symbol $x$
637 * from match state $k$.
638 */
639 double
cp9_MeanMatchEntropy(const CP9_t * cp9)640 cp9_MeanMatchEntropy(const CP9_t *cp9)
641 {
642 int k;
643 double H = 0.;
644
645 for (k = 1; k <= cp9->M; k++)
646 H += esl_vec_FEntropy(cp9->mat[k], cp9->abc->K);
647 H /= (double) cp9->M;
648 return H;
649 }
650
651
652 /* Function: cp9_MeanMatchRelativeEntropy()
653 * Incept: SRE, Fri May 11 09:25:01 2007 [Janelia]
654 *
655 * Purpose: Calculate the mean relative entropy per match state emission
656 * distribution, in bits:
657 *
658 * \[
659 * \frac{1}{M} \sum_{k=1}^{M} \sum_x p_k(x) \log_2 \frac{p_k(x)}{f(x)}
660 * \]
661 *
662 * where $p_k(x)$ is emission probability for symbol $x$
663 * from match state $k$, and $f(x)$ is the null model's
664 * background emission probability for $x$.
665 */
666 double
cp9_MeanMatchRelativeEntropy(const CP9_t * cp9)667 cp9_MeanMatchRelativeEntropy(const CP9_t *cp9)
668 {
669 int k;
670 double KL = 0.;
671
672 for (k = 1; k <= cp9->M; k++) {
673 KL += esl_vec_FRelEntropy(cp9->mat[k], cp9->null, cp9->abc->K);
674 }
675 KL /= (double) cp9->M;
676 return KL;
677 }
678
679