1 /* Miscellaneous summary statistics calculated for HMMs and profiles.
2  *
3  * SRE, Fri May  4 11:43:20 2007 [Janelia]
4  */
5 
6 #include "p7_config.h"
7 
8 #include "easel.h"
9 #include "esl_vectorops.h"
10 
11 #include "hmmer.h"
12 
13 
14 
15 /* Function:  p7_MeanMatchInfo()
16  * Incept:    SRE, Fri May  4 11:43:56 2007 [Janelia]
17  *
18  * Purpose:   Calculate the mean information content per match state
19  *            emission distribution, in bits:
20  *
21  *            \[
22  *              \frac{1}{M} \sum_{k=1}^{M}
23  *                \left[
24  *                     \sum_x f(x)   \log_2 f(x)
25  *                   - \sum_x p_k(x) \log_2 p_k(x)
26  *                \right]
27  *            \]
28  *
29  *            where $p_k(x)$ is emission probability for symbol $x$
30  *            from match state $k$, and $f(x)$ is the null model's
31  *            background emission probability for $x$.
32  */
33 double
p7_MeanMatchInfo(const P7_HMM * hmm,const P7_BG * bg)34 p7_MeanMatchInfo(const P7_HMM *hmm, const P7_BG *bg)
35 {
36   return esl_vec_FEntropy(bg->f, hmm->abc->K) - p7_MeanMatchEntropy(hmm);
37 }
38 
39 /* Function:  p7_MeanMatchEntropy()
40  * Incept:    SRE, Fri May  4 13:37:15 2007 [Janelia]
41  *
42  * Purpose:   Calculate the mean entropy per match state emission
43  *            distribution, in bits:
44  *
45  *            \[
46  *              - \frac{1}{M} \sum_{k=1}^{M} \sum_x p_k(x) \log_2 p_k(x)
47  *            \]
48  *
49  *            where $p_k(x)$ is emission probability for symbol $x$
50  *            from match state $k$.
51  */
52 double
p7_MeanMatchEntropy(const P7_HMM * hmm)53 p7_MeanMatchEntropy(const P7_HMM *hmm)
54 {
55   int    k;
56   double H = 0.;
57 
58   for (k = 1; k <= hmm->M; k++)
59     H += esl_vec_FEntropy(hmm->mat[k], hmm->abc->K);
60   H /= (double) hmm->M;
61   return H;
62 }
63 
64 
65 /* Function:  p7_MeanMatchRelativeEntropy()
66  * Incept:    SRE, Fri May 11 09:25:01 2007 [Janelia]
67  *
68  * Purpose:   Calculate the mean relative entropy per match state emission
69  *            distribution, in bits:
70  *
71  *            \[
72  *              \frac{1}{M} \sum_{k=1}^{M} \sum_x p_k(x) \log_2 \frac{p_k(x)}{f(x)}
73  *            \]
74  *
75  *            where $p_k(x)$ is emission probability for symbol $x$
76  *            from match state $k$, and $f(x)$ is the null model's
77  *            background emission probability for $x$.
78  */
79 double
p7_MeanMatchRelativeEntropy(const P7_HMM * hmm,const P7_BG * bg)80 p7_MeanMatchRelativeEntropy(const P7_HMM *hmm, const P7_BG *bg)
81 {
82   int    k;
83   double KL = 0.;
84 
85 #if 0
86   p7_bg_Dump(stdout, hmm->bg);
87   for (k = 1; k <= hmm->M; k++)
88     printf("Match %d : %.2f %.2f\n", k,
89 	   esl_vec_FRelEntropy(hmm->mat[k], hmm->bg->f, hmm->abc->K),
90 	   esl_vec_FEntropy(bg->f, hmm->abc->K) - esl_vec_FEntropy(hmm->mat[k], hmm->abc->K));
91 #endif
92 
93   for (k = 1; k <= hmm->M; k++)
94     KL += esl_vec_FRelEntropy(hmm->mat[k], bg->f, hmm->abc->K);
95   KL /= (double) hmm->M;
96   return KL;
97 }
98 
99 
100 
101 double
p7_MeanForwardScore(const P7_HMM * hmm,const P7_BG * bg)102 p7_MeanForwardScore(const P7_HMM *hmm, const P7_BG *bg)
103 {
104   int             L   = 350;
105   int             N   = 100;
106   P7_PROFILE     *gm  = p7_profile_Create(hmm->M, hmm->abc);
107   P7_GMX         *gx  = p7_gmx_Create(gm->M, L);
108   ESL_SQ         *sq  = esl_sq_CreateDigital(hmm->abc);
109   ESL_RANDOMNESS *r   = esl_randomness_CreateFast(0);
110   float           fsc;
111   float           nullsc;
112   double          bitscore;
113   double          sum = 0.;
114   int             i;
115 
116   if (p7_ProfileConfig (hmm, bg, gm, L, p7_LOCAL)        != eslOK) p7_Die("failed to configure profile");
117   for (i = 0; i < N; i++)
118     {
119       if (p7_ReconfigLength(gm, L)                        != eslOK) p7_Die("failed to reconfig profile length");
120       if (p7_ProfileEmit(r, hmm, gm, bg, sq, NULL)        != eslOK) p7_Die("failed to emit sequence");
121       if (p7_ReconfigLength(gm, sq->n)                    != eslOK) p7_Die("failed to reconfig profile length");
122       if (p7_gmx_GrowTo(gx, gm->M, sq->n)                 != eslOK) p7_Die("failed to grow the matrix");
123       if (p7_GForward(sq->dsq, sq->n, gm, gx, &fsc)       != eslOK) p7_Die("failed to run Forward");
124       if (p7_bg_NullOne(bg, sq->dsq, sq->n, &nullsc)      != eslOK) p7_Die("failed to run bg_NullOne()");
125       bitscore = (fsc - nullsc) / eslCONST_LOG2;
126 
127       sum += bitscore;
128     }
129 
130   esl_randomness_Destroy(r);
131   esl_sq_Destroy(sq);
132   p7_gmx_Destroy(gx);
133   p7_profile_Destroy(gm);
134   return (sum / (double) N);
135 }
136 
137 
138 /* Function:  p7_MeanPositionRelativeEntropy()
139  * Synopsis:  Calculate the mean score per match position, including gap cost.
140  * Incept:    SRE, Thu Sep  6 10:26:14 2007 [Janelia]
141  *
142  * Purpose:   Calculate the mean score (relative entropy) in bits per
143  *            match (consensus) position in model <hmm>, given background
144  *            model <bg>.
145  *
146  *            More specifically: the mean bitscore is weighted by
147  *            match state occupancy (match states that aren't used
148  *            much are downweighted), and the log transitions into
149  *            that match state from the previous M, D, or I are
150  *            counted against it, weighted by their probability.
151  *
152  *            This isn't a complete accounting of the average score
153  *            per model position nor per aligned residue; most
154  *            notably, it doesn't include the contribution of
155  *            entry/exit probabilities. So don't expect to approximate
156  *            average scores by multiplying <*ret_entropy> by <M>.
157  *
158  * Returns:   <eslOK> on success, and <*ret_entropy> is the result.
159  *
160  * Throws:    <eslEMEM> on allocation failure, and <*ret_entropy> is 0.
161  */
162 int
p7_MeanPositionRelativeEntropy(const P7_HMM * hmm,const P7_BG * bg,double * ret_entropy)163 p7_MeanPositionRelativeEntropy(const P7_HMM *hmm, const P7_BG *bg, double *ret_entropy)
164 {
165   int     status;
166   float  *mocc = NULL;
167   int     k;
168   double  mre, tre;
169   double  xm, xi, xd;
170 
171   ESL_ALLOC(mocc, sizeof(float) * (hmm->M+1));
172   if ((status = p7_hmm_CalculateOccupancy(hmm, mocc, NULL)) != eslOK) goto ERROR;
173 
174   /* mre = the weighted relative entropy per match emission */
175   for (mre = 0., k = 1; k <= hmm->M; k++)
176     mre += mocc[k] * esl_vec_FRelEntropy(hmm->mat[k], bg->f, hmm->abc->K);
177   mre /= esl_vec_FSum(mocc+1, hmm->M);
178 
179   /* The weighted relative entropy per match entry transition, 2..M
180    */
181   for (tre = 0., k = 2; k <= hmm->M; k++)
182     {
183       xm = mocc[k-1]*hmm->t[k-1][p7H_MM] * log(hmm->t[k-1][p7H_MM] / bg->p1);
184       xi = mocc[k-1]*hmm->t[k-1][p7H_MI] * (log(hmm->t[k-1][p7H_MM] / bg->p1) + log(hmm->t[k-1][p7H_IM] / bg->p1));
185       xd = (1.-mocc[k-1])*hmm->t[k-1][p7H_DM] * log(hmm->t[k-1][p7H_DM] / bg->p1);
186       tre += (xm+xi+xd) / eslCONST_LOG2;
187     }
188   tre /= esl_vec_FSum(mocc+2, hmm->M-1);
189 
190   free(mocc);
191   *ret_entropy = mre+tre;
192   return eslOK;
193 
194  ERROR:
195   if (mocc != NULL) free(mocc);
196   *ret_entropy = 0.;
197   return status;
198 }
199 
200 
201 /* Function:  p7_hmm_CompositionKLD()
202  * Synopsis:  A statistic of model's composition bias.
203  * Incept:    SRE, Mon Jul  2 08:40:12 2007 [Janelia]
204  *
205  * Purpose:   Calculates the KL divergence (relative entropy) between
206  *            the average match state residue composition in model
207  *            <hmm> and the background frequency distribution in <bg>;
208  *            return it in <ret_KL>, in bits.
209  *
210  *            Optionally return the average match state residue
211  *            composition in <opt_avp>. This vector, of length
212  *            <hmm->abc->K> is allocated here and becomes the caller's
213  *            responsibility if <opt_avp> is non-<NULL>.
214  *
215  *            The average match composition is an occupancy-weighted
216  *            average (see <p7_hmm_CalculateOccupancy()>.
217  *
218  *            For average match state residue composition <p> and
219  *            background residue frequencies <q>, the KL divergence
220  *            is:
221  *
222  *            \begin{eqnarray*}
223  *                 \sum_a p_a \log_2 \frac{p_a}{q_a}
224  *            \end{eqnarray*}
225  *
226  * Returns:   <eslOK> on success.
227  *
228  * Throws:    <eslEMEM> on allocation error.
229  */
230 int
p7_hmm_CompositionKLD(const P7_HMM * hmm,const P7_BG * bg,float * ret_KL,float ** opt_avp)231 p7_hmm_CompositionKLD(const P7_HMM *hmm, const P7_BG *bg, float *ret_KL, float **opt_avp)
232 {
233   int    K   = hmm->abc->K;
234   float *occ = NULL;
235   float *p   = NULL;
236   int    status;
237   int    k;
238 
239   ESL_ALLOC(occ, sizeof(float) * (hmm->M+1));
240   ESL_ALLOC(p,   sizeof(float) * K);
241 
242   p7_hmm_CalculateOccupancy(hmm, occ, NULL);         // match state occupancy probabilities
243 
244   esl_vec_FSet(p, K, 0.);                            // average composition over match states
245   for (k = 1; k <= hmm->M; k++)
246     esl_vec_FAddScaled(p, hmm->mat[k], occ[k], K);
247   esl_vec_FNorm(p, K);
248 
249   *ret_KL = esl_vec_FRelEntropy(p, bg->f, K);       // easel returns this value in bits
250   if (opt_avp != NULL) *opt_avp = p;  else free(p);
251   free(occ);
252   return eslOK;
253 
254  ERROR:
255   free(occ);
256   free(p);
257   if (opt_avp) *opt_avp = NULL;
258   *ret_KL = 0.0;
259   return status;
260 }
261 
262 
263