1 /* eweight.c [EPN 11.07.05]
2  * based on: HMMER 2.4devl's lsj_eweight.c
3  * Most original comments from lsj_eweight.c untouched.
4  *
5  * LSJ, Wed Feb  4 15:03:58 CST 2004
6  *
7  * entropy targeting:
8  * Code for setting effective sequence number (in cmbuild) by
9  * achieving a certain target entropy loss, relative to background
10  * null distribution.
11  */
12 
13 #include "esl_config.h"
14 #include "p7_config.h"
15 #include "config.h"
16 
17 #include <stdio.h>
18 #include <string.h>
19 #include <math.h>
20 
21 #include "easel.h"
22 #include "esl_rootfinder.h"
23 #include "esl_vectorops.h"
24 
25 #include "hmmer.h"
26 
27 #include "infernal.h"
28 
29 struct ew_param_s {
30   CM_t      *cm;		/* ptr to the original count-based CM, cm->t and cm->e be changed, but we have a copy of the original data in t_orig and e_orig */
31   const Prior_t *pri;		/* Dirichlet prior used to parameterize from counts */
32   float           **t_orig;     /* copy of initial (when this object was created) CM transitions in counts form */
33   float           **e_orig;     /* copy of initial (when this object was created) CM emissions   in counts form */
34   float           *begin_orig;  /* copy of initial (when this object was created) CM begin transitions in counts form */
35   float           *end_orig;    /* copy of initial (when this object was created) CM end transitions in counts form */
36   double           etarget;	/* information content target, in bits */
37 };
38 
39 
40 /* Evaluate fx = cm rel entropy - etarget, which we want to be = 0,
41  * for effective sequence number <Neff>.
42  */
43 static int
cm_eweight_target_f(double Neff,void * params,double * ret_fx)44 cm_eweight_target_f(double Neff, void *params, double *ret_fx)
45 {
46   struct ew_param_s *p = (struct ew_param_s *) params;
47   int v, i;
48 
49   /* copy parameters from to p->*_orig to CM arrays */
50   for (v = 0; v < p->cm->M; v++) {
51     for (i = 0; i < MAXCONNECT; i++)                    p->cm->t[v][i] = p->t_orig[v][i];
52     for (i = 0; i < p->cm->abc->K * p->cm->abc->K; i++) p->cm->e[v][i] = p->e_orig[v][i];
53     p->cm->begin[v] = p->begin_orig[v];
54     p->cm->end[v]   = p->end_orig[v];
55   }
56   cm_Rescale(p->cm, Neff / (double) p->cm->nseq);
57   PriorifyCM(p->cm, p->pri);
58   *ret_fx = cm_MeanMatchRelativeEntropy(p->cm) - p->etarget; /* only diff with hmm_eweight_target_f */
59   return eslOK;
60 }
61 
62 /* Evaluate fx = hmm rel entropy - etarget, which we want to be = 0,
63  * for effective sequence number <x>. Differs from cm_eweight_target_f
64  * in that emissions from MATP_MP pair emitting states are marginalized
65  * out, effectively treating the CM like an HMM. This is done with
66  * a cm_MeanMatchRelativeEntropyHMM() instead of cm_MeanMatchRelativeEntropy().
67  *
68  */
69 static int
hmm_eweight_target_f(double Neff,void * params,double * ret_fx)70 hmm_eweight_target_f(double Neff, void *params, double *ret_fx)
71 {
72   struct ew_param_s *p = (struct ew_param_s *) params;
73   int v, i;
74   /*printf("hmm_eweight_target_f() Neff: %f\n", Neff); */
75 
76   /* copy parameters from CM to p->*_orig arrays */
77   for (v = 0; v < p->cm->M; v++) {
78     for (i = 0; i < MAXCONNECT; i++)                    p->cm->t[v][i] = p->t_orig[v][i];
79     for (i = 0; i < p->cm->abc->K * p->cm->abc->K; i++) p->cm->e[v][i] = p->e_orig[v][i];
80     p->cm->begin[v] = p->begin_orig[v];
81     p->cm->end[v]   = p->end_orig[v];
82   }
83   cm_Rescale(p->cm, Neff / (double) p->cm->nseq);
84   PriorifyCM(p->cm, p->pri);
85   *ret_fx = cm_MeanMatchRelativeEntropyHMM(p->cm) - p->etarget; /* only diff with cm_eweight_target_f */
86   return eslOK;
87 }
88 
89 /* Function:  cm_EntropyWeight()
90  * Incept:    EPN, Mon Jan  7 07:19:46 2008
91  *            based on HMMER3's p7_EntropyWeight()
92  *            SRE, Fri May  4 15:32:59 2007 [Janelia]
93  *
94  * Purpose:   Use the "entropy weighting" algorithm to determine
95  *            what effective sequence number we should use, and
96  *            return it in <ret_Neff>.
97  *
98  *            Caller provides a count-based <cm>, and the
99  *            Dirichlet prior <pri> that's to be used to parameterize
100  *            models; neither of these will be modified.
101  *            Caller also provides the relative entropy
102  *            target in bits in <etarget>.
103  *
104  *            <ret_Neff> will range from min_Neff to the true number of
105  *            sequences counted into the model, <hmm->nseq>.
106  *
107  *            If <pretend_cm_is_hmm> is TRUE the CM's MATP_MP pair
108  *            emissions are marginalized, treating pair emitting states
109  *            effectively as a pair of singlet emitting states.
110  *
111  *            Minimum allowed Neff is <min_Neff>. Maximum allowed Neff
112  *            is <max_Neff> (new as of v1.1.3, controllable via
113  *            cmbuild cmdline option --emaxseq).)
114  *
115  * Returns:   <eslOK> on success.
116  *
117  * Throws:    <eslEMEM> on allocation failure.
118  */
119 int
cm_EntropyWeight(CM_t * cm,const Prior_t * pri,double etarget,double min_Neff,double max_Neff,int pretend_cm_is_hmm,double * ret_hmm_re,double * ret_Neff)120 cm_EntropyWeight(CM_t *cm, const Prior_t *pri, double etarget, double min_Neff, double max_Neff, int pretend_cm_is_hmm, double *ret_hmm_re, double *ret_Neff)
121 {
122   int status;
123   ESL_ROOTFINDER *R = NULL;
124   struct ew_param_s p;
125   double Neff;
126   double fx;
127   double hmm_re;
128   int v, i;
129 
130   /* Store parameters in the structure we'll pass to the rootfinder
131    */
132   p.cm = cm;
133   /* copy parameters of the CM that will be changed by cm_Rescale() */
134   ESL_ALLOC(p.t_orig,     (cm->M) * sizeof(float *));
135   ESL_ALLOC(p.e_orig,     (cm->M) * sizeof(float *));
136   ESL_ALLOC(p.begin_orig, (cm->M) * sizeof(float));
137   ESL_ALLOC(p.end_orig,   (cm->M) * sizeof(float));
138 
139   p.t_orig[0]   = NULL;
140   p.e_orig[0]   = NULL;
141   ESL_ALLOC(p.t_orig[0], MAXCONNECT * cm->M * sizeof(float));
142   ESL_ALLOC(p.e_orig[0], cm->abc->K * cm->abc->K * cm->M * sizeof(float));
143   for (v = 0; v < cm->M; v++) {
144     p.t_orig[v]    = p.t_orig[0]    + v * MAXCONNECT;
145     p.e_orig[v]    = p.e_orig[0]    + v * (cm->abc->K * cm->abc->K);
146   }
147   for (v = 0; v < p.cm->M; v++) {
148     for (i = 0; i < MAXCONNECT; i++)              p.t_orig[v][i] = cm->t[v][i];
149     for (i = 0; i < cm->abc->K * cm->abc->K; i++) p.e_orig[v][i] = cm->e[v][i];
150     p.begin_orig[v] = cm->begin[v];
151     p.end_orig[v]   = cm->end[v];
152   }
153   p.pri = pri;
154   p.etarget = etarget;
155 
156   /* First, check if min_Neff gives a rel entropy >= e.target, if so
157    * set Neff to min_Neff.  In this case its impossible to get a Neff
158    * < min_Neff such that a relent == etarget, so we use min_Neff;
159    */
160   Neff = min_Neff;
161   if(pretend_cm_is_hmm) { if ((status = hmm_eweight_target_f(Neff, &p, &fx)) != eslOK)    goto ERROR; }
162   else                  { if ((status = cm_eweight_target_f(Neff, &p, &fx)) != eslOK)     goto ERROR; }
163 
164   if(fx < 0.) { /* an Neff > min_Neff that gives a rel entropy of p.etarget is achievable, find it */
165 
166     /* check if max_Neff gives a rel_entropy < e.target, if so set
167      * Neff to max_Neff.  In this case its impossible to get a Neff >
168      * max_Neff such that relent == etarget, so we use max_Neff;
169      * (max_Neff is typically cm->nseq (for cmbuild, this it's
170      * cm->nseq unless --emaxseq is used on cmdline). Using
171      * max_Neff in this way is new in v1.1.3, in versions 1.1.2
172      * and earlier max_Neff was fixed at cm->nseq.
173      */
174     Neff = max_Neff;
175     if(pretend_cm_is_hmm) { if ((status = hmm_eweight_target_f(Neff, &p, &fx)) != eslOK)    goto ERROR; }
176     else                  { if ((status = cm_eweight_target_f(Neff, &p, &fx)) != eslOK)     goto ERROR; }
177 
178     if (fx > 0.) {
179       if(pretend_cm_is_hmm) { if ((R = esl_rootfinder_Create(hmm_eweight_target_f, &p))    == NULL) {status = eslEMEM; goto ERROR;} }
180       else                  { if ((R = esl_rootfinder_Create(cm_eweight_target_f, &p))     == NULL) {status = eslEMEM; goto ERROR;} }
181       esl_rootfinder_SetAbsoluteTolerance(R, 0.01); /* getting Neff to ~2 sig digits is fine */
182       if ((status = esl_root_Bisection(R, 0., max_Neff, &Neff)) != eslOK) goto ERROR;
183 
184       esl_rootfinder_Destroy(R);
185     }
186   }
187   /* we've found Neff, determine the relative entropy of the CM if we marginalize the MP pair emissions,
188    * this is (ALMOST) the relative entropy of the CP9 HMM we'll eventually construct from it,
189    * (it's only ALMOST b/c the CP9 will have marginalized emissions from the MATP_MP PLUS the MATP_ML
190    * state weighted by the expected number of times each state is visited).
191    */
192   hmm_re = cm_MeanMatchRelativeEntropyHMM(p.cm);
193 
194   /* reset CM params to their original values */
195   for (v = 0; v < cm->M; v++) {
196     for (i = 0; i < MAXCONNECT; i++)              cm->t[v][i] = p.t_orig[v][i];
197     for (i = 0; i < cm->abc->K * cm->abc->K; i++) cm->e[v][i] = p.e_orig[v][i];
198     cm->begin[v] = p.begin_orig[v];
199     cm->end[v]   = p.end_orig[v];
200   }
201   /* free params p */
202   free(p.t_orig[0]);
203   free(p.t_orig);
204   free(p.e_orig[0]);
205   free(p.e_orig);
206   free(p.begin_orig);
207   free(p.end_orig);
208 
209   *ret_hmm_re = hmm_re;
210   *ret_Neff = Neff;
211   return eslOK;
212 
213  ERROR:
214   if (R    != NULL)   esl_rootfinder_Destroy(R);
215   *ret_Neff = (double) cm->nseq;
216   return status;
217 }
218 
219 /* Function:  cm_Rescale()
220  *
221  * Incept:    EPN 11.07.05
222  * based on:  HMMER's plan7.c's Plan7Rescale() (Steve Johnson)
223  *
224  * Purpose:   Scale a counts-based CM by some factor, for
225  *            adjusting counts to a new effective sequence number.
226  *
227  * Args:      cm         - counts based CM.
228  *            scale      - scaling factor (e.g. eff_nseq/nseq); 1.0= no scaling.
229  *
230  * Returns:   (void)
231  */
232 void
cm_Rescale(CM_t * cm,float scale)233 cm_Rescale(CM_t *cm, float scale)
234 {
235   int v;
236 
237   for (v = 0; v < cm->M; v++)
238     {
239       /* Scale transition counts vector if not a BIF or E state */
240       if (cm->sttype[v] != B_st && cm->sttype[v] != E_st)
241 	{
242 	  /* Number of transitions is cm->cnum[v] */
243 	  esl_vec_FScale(cm->t[v], cm->cnum[v], scale);
244 	}
245       /* Scale emission counts vectors */
246       if (cm->sttype[v] == MP_st)
247 	{       /* Consensus base pairs */
248 	  esl_vec_FScale(cm->e[v], (cm->abc->K*cm->abc->K), scale);
249 	}
250       else if ((cm->sttype[v] == ML_st) ||
251 	       (cm->sttype[v] == MR_st) ||
252 	       (cm->sttype[v] == IL_st) ||
253 	       (cm->sttype[v] == IR_st))
254 	{      /* singlets (some consensus, some not)*/
255 	  esl_vec_FScale(cm->e[v], cm->abc->K, scale);
256 	}
257     }/* end loop over states v */
258 
259   /* begin, end transitions; only valid [0..M-1] */
260   esl_vec_FScale(cm->begin, cm->M, scale);
261   esl_vec_FScale(cm->end,   cm->M, scale);
262 
263   return;
264 }
265 
266 /* Function:  cp9_Rescale()
267  *            EPN based on Steve Johnsons plan 7 version
268  *
269  * Purpose:   Scale a counts-based HMM by some factor, for
270  *            adjusting counts to a new effective sequence number.
271  *
272  * Args:      hmm        - counts based HMM.
273  *            scale      - scaling factor (e.g. eff_nseq/nseq); 1.0= no scaling.
274  *
275  * Returns:   (void)
276  */
277 void
cp9_Rescale(CP9_t * hmm,float scale)278 cp9_Rescale(CP9_t *hmm, float scale)
279 {
280   int k;
281 
282   /* emissions and transitions in the main model.
283    * Note that match states are 1..M, insert states are 0..M,
284    * and deletes are 0..M-1
285    */
286   for(k = 1; k <= hmm->M; k++)
287     esl_vec_FScale(hmm->mat[k], hmm->abc->K, scale);
288   for(k = 0; k <=  hmm->M; k++)
289     esl_vec_FScale(hmm->ins[k], hmm->abc->K, scale);
290   for(k = 0; k <  hmm->M; k++)
291     esl_vec_FScale(hmm->t[k],   cp9_NTRANS,             scale);
292 
293   /* begin, end transitions; only valid [1..M] */
294   esl_vec_FScale(hmm->begin+1, hmm->M, scale);
295   esl_vec_FScale(hmm->end+1,   hmm->M, scale);
296 
297   return;
298 }
299 
300 /* Function:  cm_MeanMatchInfo()
301  * Incept:    SRE, Fri May  4 11:43:56 2007 [Janelia]
302  *
303  * Purpose:   Calculate the mean information content per match state
304  *            emission distribution, in bits:
305  *
306  *            \[
307  *              \frac{1}{M} \sum_{k=1}^{M}
308  *                \left[
309  *                   - \sum_x p_k(x) \log_2 p_k(x)
310  *                   + \sum_x f(x) \log_2 f(x)
311  *                \right]
312  *            \]
313  *
314  *            where $p_k(x)$ is emission probability for symbol $x$
315  *            from match state $k$, and $f(x)$ is the null model's
316  *            background emission probability for $x$.
317  *
318  */
319 double
cm_MeanMatchInfo(const CM_t * cm)320 cm_MeanMatchInfo(const CM_t *cm)
321 {
322   return esl_vec_FEntropy(cm->null, cm->abc->K) - cm_MeanMatchEntropy(cm);
323 }
324 
325 /*
326  * Function: cm_MeanMatchEntropy
327  * Incept:   EPN, Tue May  1 14:06:37 2007
328  *           Updated to match Sean's analogous p7_MeanMatchEntropy() in
329  *           HMMER3's hmmstat.c, EPN, Sat Jan  5 14:48:27 2008.
330  *
331  * Purpose:   Calculate the mean entropy per match state emission
332  *            distribution, in bits:
333  *
334  *            \[
335  *              \frac{1}{clen} \sum_{v=0}^{M-1} -\sum_x p_v(x) \log_2 p_v(x)
336  *            \]
337  *
338  *            where $p_v(x)$ is emission probability for symbol $x$
339  *            from MATL\_ML, MATR\_MR or MATP\_MP state $v$. For MATP\_MP
340  *            states symbols $x$ are base pairs.
341  */
342 double
cm_MeanMatchEntropy(const CM_t * cm)343 cm_MeanMatchEntropy(const CM_t *cm)
344 {
345   int    v;
346   double H = 0.;
347 
348   for (v = 0; v < cm->M; v++)
349     {
350       if(cm->stid[v] == MATP_MP)
351        H += esl_vec_FEntropy(cm->e[v], (cm->abc->K * cm->abc->K));
352       else if(cm->stid[v] == MATL_ML ||
353 	      cm->stid[v] == MATR_MR)
354 	H += esl_vec_FEntropy(cm->e[v], cm->abc->K);
355     }
356   H /= (double) cm->clen;
357   return H;
358 }
359 
360 
361 /* Function:  cm_MeanMatchRelativeEntropy()
362  * Incept:    SRE, Fri May 11 09:25:01 2007 [Janelia]
363  *
364  * Purpose:   Calculate the mean relative entropy per match state emission
365  *            distribution, in bits:
366  *
367  *            \[
368  *              \frac{1}{M} \sum_{v=0}^{M-1} \sum_x p_v(x) \log_2 \frac{p_v(x)}{f(x)}
369  *            \]
370  *
371  *            where $p_v(x)$ is emission probability for symbol $x$
372  *            from MATL\_ML, MATR\_MR, or MATP\_MP state state $v$,
373  *            and $f(x)$ is the null model's background emission
374  *            probability for $x$. For MATP\_MP states, $x$ is a
375  *            base pair.
376  */
377 double
cm_MeanMatchRelativeEntropy(const CM_t * cm)378 cm_MeanMatchRelativeEntropy(const CM_t *cm)
379 {
380   int    status;
381   int    v;
382   double KL = 0.;
383   float *pair_null;
384   int i,j;
385   int KL_pair_denom = 0;
386   int KL_singlet_denom = 0;
387   float left_e[cm->abc->K];
388   float right_e[cm->abc->K];
389   double KL_pair = 0.;
390   double KL_pair_marg = 0.;
391   double KL_singlet = 0.;
392 
393   ESL_ALLOC(pair_null, (sizeof(float) * cm->abc->K * cm->abc->K));
394   for(i = 0; i < cm->abc->K; i++)
395     for(j = 0; j < cm->abc->K; j++)
396       pair_null[(i * cm->abc->K) + j] = cm->null[i] * cm->null[j];
397 
398   for (v = 0; v < cm->M; v++) {
399     if(cm->stid[v] == MATP_MP) {
400       KL += esl_vec_FRelEntropy(cm->e[v], pair_null, (cm->abc->K * cm->abc->K));
401       KL_pair += esl_vec_FRelEntropy(cm->e[v], pair_null, (cm->abc->K * cm->abc->K));
402       KL_pair_denom += 2;
403       /*printf("MP    (%5d) %6.3f\n", v, esl_vec_FRelEntropy(cm->e[v], pair_null, (cm->abc->K * cm->abc->K)));*/
404 
405       /* calculate marginals */
406       /* left half */
407       esl_vec_FSet(left_e, cm->abc->K, 0.);
408       for(i = 0; i < cm->abc->K; i++) {
409 	for(j = (i*cm->abc->K); j < ((i+1)*cm->abc->K); j++) {
410 	  left_e[i] += cm->e[v][j];
411 	}
412       }
413       esl_vec_FNorm(left_e, cm->abc->K);
414       KL_pair_marg += esl_vec_FRelEntropy(left_e, cm->null, cm->abc->K);
415       /*printf("cm       L %4d (%4s) v: %5d KL: %10.5f (added: %10.5f)\n", cm->ndidx[v], "MATP", v, KL, esl_vec_FRelEntropy(left_e, cm->null, cm->abc->K));*/
416       /* right half */
417       esl_vec_FSet(right_e, cm->abc->K, 0.);
418       for(i = 0; i < cm->abc->K; i++) {
419 	for(j = i; j < cm->abc->K * cm->abc->K; j += cm->abc->K) {
420 	  right_e[i] += cm->e[v][j];
421 	}
422       }
423       KL_pair_marg += esl_vec_FRelEntropy(right_e, cm->null, cm->abc->K);
424     }
425     else if(cm->stid[v] == MATL_ML ||
426 	    cm->stid[v] == MATR_MR) {
427       KL += esl_vec_FRelEntropy(cm->e[v], cm->null, cm->abc->K);
428       KL_singlet += esl_vec_FRelEntropy(cm->e[v], cm->null, cm->abc->K);
429       KL_singlet_denom += 1;
430       /*printf("ML/MR (%5d) %6.3f\n", v, esl_vec_FRelEntropy(cm->e[v], cm->null, cm->abc->K));*/
431     }
432   }
433   free(pair_null);
434 
435   /*printf("\n%s KL total   %8.3f  %8.3f per          cpos\n", cm->name, KL, KL / (double) cm->clen);
436     printf("%s KL pair    %8.3f  %8.3f per   paired cpos\n", cm->name, KL_pair, KL_pair / (double) KL_pair_denom);
437     printf("%s KL pair(m) %8.3f  %8.3f per   paired cpos\n", cm->name, KL_pair_marg, KL_pair_marg / (double) KL_pair_denom);
438     printf("%s KL singlet %8.3f  %8.3f per unpaired cpos\n", cm->name, KL_singlet, KL_singlet / (double) KL_singlet_denom);*/
439 
440   KL /= (double) cm->clen;
441   return KL;
442 
443  ERROR:
444   cm_Fail("Memory allocation error.");
445   return 0.; /* NOTREACHED */
446 }
447 
448 
449 /* Function:  cm_MeanMatchInfoHMM()
450  * Incept:    EPN, Mon Feb 18 07:43:01 2008
451  *
452  * Purpose:   Calculate the mean information content per match state
453  *            emission distribution, in bits:
454  *
455  *            \[
456  *              \frac{1}{M} \sum_{k=1}^{M}
457  *                \left[
458  *                   - \sum_x p_k(x) \log_2 p_k(x)
459  *                   + \sum_x f(x) \log_2 f(x)
460  *                \right]
461  *            \]
462  *
463  *            where $p_k(x)$ is emission probability for symbol $x$
464  *            from match state $k$, and $f(x)$ is the null model's
465  *            background emission probability for $x$.
466  *
467  *            Differs from cm_MeanMatchInfo() in that base pair emissions
468  *            are marginalized, in effect treating the CM like an HMM that
469  *            can only emit 1 residue at a time.
470  *
471  */
472 double
cm_MeanMatchInfoHMM(const CM_t * cm)473 cm_MeanMatchInfoHMM(const CM_t *cm)
474 {
475   return esl_vec_FEntropy(cm->null, cm->abc->K) - cm_MeanMatchEntropyHMM(cm);
476 }
477 
478 /* Function: cm_MeanMatchEntropyHMM
479  * Incept:   EPN, Mon Feb 18 08:06:20 2008
480  *           Updated to match Sean's analogous p7_MeanMatchEntropy() in
481  *           HMMER3's hmmstat.c, EPN, Sat Jan  5 14:48:27 2008.
482  *
483  * Purpose:   Calculate the mean entropy per match state emission
484  *            distribution, in bits:
485  *
486  *            \[
487  *              \frac{1}{clen} \sum_{v=0}^{M-1} -\sum_x p_v(x) \log_2 p_v(x)
488  *            \]
489  *
490  *            where $p_v(x)$ is emission probability for symbol $x$
491  *            from MATL\_ML, MATR\_MR or MATP\_MP state $v$. For MATP\_MP
492  *            states symbols $x$ are base pairs.
493  *
494  *            Differs from cm_MeanMatchEntropy() in that base pair emissions
495  *            are marginalized, in effect treating the CM like an HMM that
496  *            can only emit 1 residue at a time.
497  */
498 double
cm_MeanMatchEntropyHMM(const CM_t * cm)499 cm_MeanMatchEntropyHMM(const CM_t *cm)
500 {
501   int    v;
502   double H = 0.;
503   float left_e[cm->abc->K];
504   float right_e[cm->abc->K];
505   int i,j;
506 
507   for (v = 0; v < cm->M; v++) {
508       if(cm->stid[v] == MATP_MP) {
509 	/* calculate marginals */
510 	/* left half */
511 	esl_vec_FSet(left_e, cm->abc->K, 0.);
512 	for(i = 0; i < cm->abc->K; i++) {
513 	  for(j = (i*cm->abc->K); j < ((i+1)*cm->abc->K); j++) {
514 	    left_e[i] += cm->e[v][j];
515 	  }
516 	  H += esl_vec_FEntropy(left_e, cm->abc->K);
517 	}
518 	/* right half */
519 	esl_vec_FSet(right_e, cm->abc->K, 0.);
520 	for(i = 0; i < cm->abc->K; i++) {
521 	  for(j = i; j < cm->abc->K * cm->abc->K; j += cm->abc->K) {
522 	    right_e[i] += cm->e[v][j];
523 	  }
524 	  H += esl_vec_FEntropy(right_e, cm->abc->K);
525 	}
526       }
527       else if(cm->stid[v] == MATL_ML ||
528 	      cm->stid[v] == MATR_MR) {
529 	H += esl_vec_FEntropy(cm->e[v], cm->abc->K);
530       }
531   }
532   H /= (double) cm->clen;
533   return H;
534 }
535 
536 /* Function:  cm_MeanMatchRelativeEntropyHMM()
537  * Incept:    EPN, Mon Feb 18 08:06:24 2008
538  *
539  * Purpose:   Calculate the mean relative entropy per match state emission
540  *            distribution, in bits:
541  *
542  *            \[
543  *              \frac{1}{M} \sum_{v=0}^{M-1} \sum_x p_v(x) \log_2 \frac{p_v(x)}{f(x)}
544  *            \]
545  *
546  *            where $p_v(x)$ is emission probability for symbol $x$
547  *            from MATL\_ML, MATR\_MR, or MATP\_MP state state $v$,
548  *            and $f(x)$ is the null model's background emission
549  *            probability for $x$. For MATP\_MP states, $x$ is a
550  *            base pair.
551  *
552  *            Differs from cm_MeanMatchRelativeEntropy() in that base pair
553  *            emissions are marginalized, in effect treating the CM like an
554  *            HMM that can only emit 1 residue at a time.
555  */
556 double
cm_MeanMatchRelativeEntropyHMM(const CM_t * cm)557 cm_MeanMatchRelativeEntropyHMM(const CM_t *cm)
558 {
559   int    v;
560   double KL = 0.;
561   float left_e[cm->abc->K];
562   float right_e[cm->abc->K];
563   int i,j;
564 
565   for (v = 0; v < cm->M; v++) {
566       if(cm->stid[v] == MATP_MP) {
567 	/* calculate marginals */
568 	/* left half */
569 	esl_vec_FSet(left_e, cm->abc->K, 0.);
570 	for(i = 0; i < cm->abc->K; i++) {
571 	  for(j = (i*cm->abc->K); j < ((i+1)*cm->abc->K); j++) {
572 	    left_e[i] += cm->e[v][j];
573 	  }
574 	}
575 	esl_vec_FNorm(left_e, cm->abc->K);
576 	KL += esl_vec_FRelEntropy(left_e, cm->null, cm->abc->K);
577 	/*printf("cm       L %4d (%4s) v: %5d KL: %10.5f (added: %10.5f)\n", cm->ndidx[v], "MATP", v, KL, esl_vec_FRelEntropy(left_e, cm->null, cm->abc->K));*/
578 	/* right half */
579 	esl_vec_FSet(right_e, cm->abc->K, 0.);
580 	for(i = 0; i < cm->abc->K; i++) {
581 	  for(j = i; j < cm->abc->K * cm->abc->K; j += cm->abc->K) {
582 	    right_e[i] += cm->e[v][j];
583 	  }
584 	}
585 	KL += esl_vec_FRelEntropy(right_e, cm->null, cm->abc->K);
586 	/*printf("cm       R %4d (%4s) v: %5d KL: %10.5f (added: %10.5f)\n", cm->ndidx[v], "MATP", v, KL, esl_vec_FRelEntropy(right_e, cm->null, cm->abc->K));*/
587       }
588       else if(cm->stid[v] == MATL_ML ||
589 	      cm->stid[v] == MATR_MR) {
590 	KL += esl_vec_FRelEntropy(cm->e[v], cm->null, cm->abc->K);
591 	/*printf("cm         %4d (%4s) v: %5d KL: %10.5f (added %10.5f)\n", cm->ndidx[v], Nodetype(cm->ndtype[cm->ndidx[v]]), v, KL, esl_vec_FRelEntropy(cm->e[v], cm->null, cm->abc->K));*/
592       }
593   }
594 
595   KL /= (double) cm->clen;
596   return KL;
597 }
598 
599 /* Function:  cp9_MeanMatchInfo()
600  * Incept:    SRE, Fri May  4 11:43:56 2007 [Janelia]
601  *
602  * Purpose:   Calculate the mean information content per match state
603  *            emission distribution, in bits:
604  *
605  *            \[
606  *              \frac{1}{M} \sum_{k=1}^{M}
607  *                \left[
608  *                   - \sum_x p_k(x) \log_2 p_k(x)
609  *                   + \sum_x f(x) \log_2 f(x)
610  *                \right]
611  *            \]
612  *
613  *            where $p_k(x)$ is emission probability for symbol $x$
614  *            from match state $k$, and $f(x)$ is the null model's
615  *            background emission probability for $x$.
616  *
617  *            This statistic is used in "entropy weighting" to set the
618  *            total sequence weight when model building.
619  */
620 double
cp9_MeanMatchInfo(const CP9_t * cp9)621 cp9_MeanMatchInfo(const CP9_t *cp9)
622 {
623   return esl_vec_FEntropy(cp9->null, cp9->abc->K) - cp9_MeanMatchEntropy(cp9);
624 }
625 
626 /* Function:  cp9_MeanMatchEntropy()
627  * Incept:    SRE, Fri May  4 13:37:15 2007 [Janelia]
628  *
629  * Purpose:   Calculate the mean entropy per match state emission
630  *            distribution, in bits:
631  *
632  *            \[
633  *              \frac{1}{M} \sum_{k=1}^{M} -\sum_x p_k(x) \log_2 p_k(x)
634  *            \]
635  *
636  *            where $p_k(x)$ is emission probability for symbol $x$
637  *            from match state $k$.
638  */
639 double
cp9_MeanMatchEntropy(const CP9_t * cp9)640 cp9_MeanMatchEntropy(const CP9_t *cp9)
641 {
642   int    k;
643   double H = 0.;
644 
645   for (k = 1; k <= cp9->M; k++)
646     H += esl_vec_FEntropy(cp9->mat[k], cp9->abc->K);
647   H /= (double) cp9->M;
648   return H;
649 }
650 
651 
652 /* Function:  cp9_MeanMatchRelativeEntropy()
653  * Incept:    SRE, Fri May 11 09:25:01 2007 [Janelia]
654  *
655  * Purpose:   Calculate the mean relative entropy per match state emission
656  *            distribution, in bits:
657  *
658  *            \[
659  *              \frac{1}{M} \sum_{k=1}^{M} \sum_x p_k(x) \log_2 \frac{p_k(x)}{f(x)}
660  *            \]
661  *
662  *            where $p_k(x)$ is emission probability for symbol $x$
663  *            from match state $k$, and $f(x)$ is the null model's
664  *            background emission probability for $x$.
665  */
666 double
cp9_MeanMatchRelativeEntropy(const CP9_t * cp9)667 cp9_MeanMatchRelativeEntropy(const CP9_t *cp9)
668 {
669   int    k;
670   double KL = 0.;
671 
672   for (k = 1; k <= cp9->M; k++) {
673     KL += esl_vec_FRelEntropy(cp9->mat[k], cp9->null, cp9->abc->K);
674   }
675   KL /= (double) cp9->M;
676   return KL;
677 }
678 
679