1 /* Miscellaneous summary statistics calculated for HMMs and profiles.
2 *
3 * SRE, Fri May 4 11:43:20 2007 [Janelia]
4 */
5
6 #include "p7_config.h"
7
8 #include "easel.h"
9 #include "esl_vectorops.h"
10
11 #include "hmmer.h"
12
13
14
15 /* Function: p7_MeanMatchInfo()
16 * Incept: SRE, Fri May 4 11:43:56 2007 [Janelia]
17 *
18 * Purpose: Calculate the mean information content per match state
19 * emission distribution, in bits:
20 *
21 * \[
22 * \frac{1}{M} \sum_{k=1}^{M}
23 * \left[
24 * \sum_x f(x) \log_2 f(x)
25 * - \sum_x p_k(x) \log_2 p_k(x)
26 * \right]
27 * \]
28 *
29 * where $p_k(x)$ is emission probability for symbol $x$
30 * from match state $k$, and $f(x)$ is the null model's
31 * background emission probability for $x$.
32 */
33 double
p7_MeanMatchInfo(const P7_HMM * hmm,const P7_BG * bg)34 p7_MeanMatchInfo(const P7_HMM *hmm, const P7_BG *bg)
35 {
36 return esl_vec_FEntropy(bg->f, hmm->abc->K) - p7_MeanMatchEntropy(hmm);
37 }
38
39 /* Function: p7_MeanMatchEntropy()
40 * Incept: SRE, Fri May 4 13:37:15 2007 [Janelia]
41 *
42 * Purpose: Calculate the mean entropy per match state emission
43 * distribution, in bits:
44 *
45 * \[
46 * - \frac{1}{M} \sum_{k=1}^{M} \sum_x p_k(x) \log_2 p_k(x)
47 * \]
48 *
49 * where $p_k(x)$ is emission probability for symbol $x$
50 * from match state $k$.
51 */
52 double
p7_MeanMatchEntropy(const P7_HMM * hmm)53 p7_MeanMatchEntropy(const P7_HMM *hmm)
54 {
55 int k;
56 double H = 0.;
57
58 for (k = 1; k <= hmm->M; k++)
59 H += esl_vec_FEntropy(hmm->mat[k], hmm->abc->K);
60 H /= (double) hmm->M;
61 return H;
62 }
63
64
65 /* Function: p7_MeanMatchRelativeEntropy()
66 * Incept: SRE, Fri May 11 09:25:01 2007 [Janelia]
67 *
68 * Purpose: Calculate the mean relative entropy per match state emission
69 * distribution, in bits:
70 *
71 * \[
72 * \frac{1}{M} \sum_{k=1}^{M} \sum_x p_k(x) \log_2 \frac{p_k(x)}{f(x)}
73 * \]
74 *
75 * where $p_k(x)$ is emission probability for symbol $x$
76 * from match state $k$, and $f(x)$ is the null model's
77 * background emission probability for $x$.
78 */
79 double
p7_MeanMatchRelativeEntropy(const P7_HMM * hmm,const P7_BG * bg)80 p7_MeanMatchRelativeEntropy(const P7_HMM *hmm, const P7_BG *bg)
81 {
82 int k;
83 double KL = 0.;
84
85 #if 0
86 p7_bg_Dump(stdout, hmm->bg);
87 for (k = 1; k <= hmm->M; k++)
88 printf("Match %d : %.2f %.2f\n", k,
89 esl_vec_FRelEntropy(hmm->mat[k], hmm->bg->f, hmm->abc->K),
90 esl_vec_FEntropy(bg->f, hmm->abc->K) - esl_vec_FEntropy(hmm->mat[k], hmm->abc->K));
91 #endif
92
93 for (k = 1; k <= hmm->M; k++)
94 KL += esl_vec_FRelEntropy(hmm->mat[k], bg->f, hmm->abc->K);
95 KL /= (double) hmm->M;
96 return KL;
97 }
98
99
100
101 double
p7_MeanForwardScore(const P7_HMM * hmm,const P7_BG * bg)102 p7_MeanForwardScore(const P7_HMM *hmm, const P7_BG *bg)
103 {
104 int L = 350;
105 int N = 100;
106 P7_PROFILE *gm = p7_profile_Create(hmm->M, hmm->abc);
107 P7_GMX *gx = p7_gmx_Create(gm->M, L);
108 ESL_SQ *sq = esl_sq_CreateDigital(hmm->abc);
109 ESL_RANDOMNESS *r = esl_randomness_CreateFast(0);
110 float fsc;
111 float nullsc;
112 double bitscore;
113 double sum = 0.;
114 int i;
115
116 if (p7_ProfileConfig (hmm, bg, gm, L, p7_LOCAL) != eslOK) p7_Die("failed to configure profile");
117 for (i = 0; i < N; i++)
118 {
119 if (p7_ReconfigLength(gm, L) != eslOK) p7_Die("failed to reconfig profile length");
120 if (p7_ProfileEmit(r, hmm, gm, bg, sq, NULL) != eslOK) p7_Die("failed to emit sequence");
121 if (p7_ReconfigLength(gm, sq->n) != eslOK) p7_Die("failed to reconfig profile length");
122 if (p7_gmx_GrowTo(gx, gm->M, sq->n) != eslOK) p7_Die("failed to grow the matrix");
123 if (p7_GForward(sq->dsq, sq->n, gm, gx, &fsc) != eslOK) p7_Die("failed to run Forward");
124 if (p7_bg_NullOne(bg, sq->dsq, sq->n, &nullsc) != eslOK) p7_Die("failed to run bg_NullOne()");
125 bitscore = (fsc - nullsc) / eslCONST_LOG2;
126
127 sum += bitscore;
128 }
129
130 esl_randomness_Destroy(r);
131 esl_sq_Destroy(sq);
132 p7_gmx_Destroy(gx);
133 p7_profile_Destroy(gm);
134 return (sum / (double) N);
135 }
136
137
138 /* Function: p7_MeanPositionRelativeEntropy()
139 * Synopsis: Calculate the mean score per match position, including gap cost.
140 * Incept: SRE, Thu Sep 6 10:26:14 2007 [Janelia]
141 *
142 * Purpose: Calculate the mean score (relative entropy) in bits per
143 * match (consensus) position in model <hmm>, given background
144 * model <bg>.
145 *
146 * More specifically: the mean bitscore is weighted by
147 * match state occupancy (match states that aren't used
148 * much are downweighted), and the log transitions into
149 * that match state from the previous M, D, or I are
150 * counted against it, weighted by their probability.
151 *
152 * This isn't a complete accounting of the average score
153 * per model position nor per aligned residue; most
154 * notably, it doesn't include the contribution of
155 * entry/exit probabilities. So don't expect to approximate
156 * average scores by multiplying <*ret_entropy> by <M>.
157 *
158 * Returns: <eslOK> on success, and <*ret_entropy> is the result.
159 *
160 * Throws: <eslEMEM> on allocation failure, and <*ret_entropy> is 0.
161 */
162 int
p7_MeanPositionRelativeEntropy(const P7_HMM * hmm,const P7_BG * bg,double * ret_entropy)163 p7_MeanPositionRelativeEntropy(const P7_HMM *hmm, const P7_BG *bg, double *ret_entropy)
164 {
165 int status;
166 float *mocc = NULL;
167 int k;
168 double mre, tre;
169 double xm, xi, xd;
170
171 ESL_ALLOC(mocc, sizeof(float) * (hmm->M+1));
172 if ((status = p7_hmm_CalculateOccupancy(hmm, mocc, NULL)) != eslOK) goto ERROR;
173
174 /* mre = the weighted relative entropy per match emission */
175 for (mre = 0., k = 1; k <= hmm->M; k++)
176 mre += mocc[k] * esl_vec_FRelEntropy(hmm->mat[k], bg->f, hmm->abc->K);
177 mre /= esl_vec_FSum(mocc+1, hmm->M);
178
179 /* The weighted relative entropy per match entry transition, 2..M
180 */
181 for (tre = 0., k = 2; k <= hmm->M; k++)
182 {
183 xm = mocc[k-1]*hmm->t[k-1][p7H_MM] * log(hmm->t[k-1][p7H_MM] / bg->p1);
184 xi = mocc[k-1]*hmm->t[k-1][p7H_MI] * (log(hmm->t[k-1][p7H_MM] / bg->p1) + log(hmm->t[k-1][p7H_IM] / bg->p1));
185 xd = (1.-mocc[k-1])*hmm->t[k-1][p7H_DM] * log(hmm->t[k-1][p7H_DM] / bg->p1);
186 tre += (xm+xi+xd) / eslCONST_LOG2;
187 }
188 tre /= esl_vec_FSum(mocc+2, hmm->M-1);
189
190 free(mocc);
191 *ret_entropy = mre+tre;
192 return eslOK;
193
194 ERROR:
195 if (mocc != NULL) free(mocc);
196 *ret_entropy = 0.;
197 return status;
198 }
199
200
201 /* Function: p7_hmm_CompositionKLD()
202 * Synopsis: A statistic of model's composition bias.
203 * Incept: SRE, Mon Jul 2 08:40:12 2007 [Janelia]
204 *
205 * Purpose: Calculates the KL divergence (relative entropy) between
206 * the average match state residue composition in model
207 * <hmm> and the background frequency distribution in <bg>;
208 * return it in <ret_KL>, in bits.
209 *
210 * Optionally return the average match state residue
211 * composition in <opt_avp>. This vector, of length
212 * <hmm->abc->K> is allocated here and becomes the caller's
213 * responsibility if <opt_avp> is non-<NULL>.
214 *
215 * The average match composition is an occupancy-weighted
216 * average (see <p7_hmm_CalculateOccupancy()>.
217 *
218 * For average match state residue composition <p> and
219 * background residue frequencies <q>, the KL divergence
220 * is:
221 *
222 * \begin{eqnarray*}
223 * \sum_a p_a \log_2 \frac{p_a}{q_a}
224 * \end{eqnarray*}
225 *
226 * Returns: <eslOK> on success.
227 *
228 * Throws: <eslEMEM> on allocation error.
229 */
230 int
p7_hmm_CompositionKLD(const P7_HMM * hmm,const P7_BG * bg,float * ret_KL,float ** opt_avp)231 p7_hmm_CompositionKLD(const P7_HMM *hmm, const P7_BG *bg, float *ret_KL, float **opt_avp)
232 {
233 int K = hmm->abc->K;
234 float *occ = NULL;
235 float *p = NULL;
236 int status;
237 int k;
238
239 ESL_ALLOC(occ, sizeof(float) * (hmm->M+1));
240 ESL_ALLOC(p, sizeof(float) * K);
241
242 p7_hmm_CalculateOccupancy(hmm, occ, NULL); // match state occupancy probabilities
243
244 esl_vec_FSet(p, K, 0.); // average composition over match states
245 for (k = 1; k <= hmm->M; k++)
246 esl_vec_FAddScaled(p, hmm->mat[k], occ[k], K);
247 esl_vec_FNorm(p, K);
248
249 *ret_KL = esl_vec_FRelEntropy(p, bg->f, K); // easel returns this value in bits
250 if (opt_avp != NULL) *opt_avp = p; else free(p);
251 free(occ);
252 return eslOK;
253
254 ERROR:
255 free(occ);
256 free(p);
257 if (opt_avp) *opt_avp = NULL;
258 *ret_KL = 0.0;
259 return status;
260 }
261
262
263