1 /* "Entropy weighting" to determine absolute sequence number to use in hmmbuild.
2 *
3 * Reference:
4 * L. Steven Johnson, "Remote Protein Homology Detection Using Hidden Markov Models",
5 * Ph.D. thesis, Washington University School of Medicine, 2006.
6 *
7 * SRE, Fri May 4 14:01:54 2007 [Janelia] [Tom Waits, Orphans]
8 */
9
10 #include "p7_config.h"
11
12 #include "easel.h"
13 #include "esl_rootfinder.h"
14
15 #include "hmmer.h"
16
17 struct ew_param_s {
18 const P7_HMM *hmm; /* ptr to the original count-based HMM, which remains unchanged */
19 const P7_BG *bg; /* ptr to the null model */
20 const P7_PRIOR *pri; /* Dirichlet prior used to parameterize from counts */
21 P7_HMM *h2; /* our working space: a copy of <hmm> that we can muck with */
22 double etarget; /* information content target, in bits */
23 };
24
25 /* Evaluate fx = rel entropy - etarget, which we want to be = 0,
26 * for effective sequence number <x>.
27 */
28 static int
eweight_target_f(double Neff,void * params,double * ret_fx)29 eweight_target_f(double Neff, void *params, double *ret_fx)
30 {
31 struct ew_param_s *p = (struct ew_param_s *) params;
32
33 p7_hmm_CopyParameters(p->hmm, p->h2);
34 p7_hmm_Scale(p->h2, Neff / (double) p->h2->nseq);
35 p7_ParameterEstimation(p->h2, p->pri);
36 *ret_fx = p7_MeanMatchRelativeEntropy(p->h2, p->bg) - p->etarget;
37 return eslOK;
38 }
39
40 /* Function: p7_EntropyWeight()
41 * Incept: SRE, Fri May 4 15:32:59 2007 [Janelia]
42 *
43 * Purpose: Use the "entropy weighting" algorithm to determine
44 * what effective sequence number we should use, and
45 * return it in <ret_Neff>.
46 *
47 * Caller provides a count-based <hmm>, and the
48 * Dirichlet prior <pri> that's to be used to parameterize
49 * models; neither of these will be modified.
50 * Caller also provides the relative entropy
51 * target in bits in <etarget>.
52 *
53 * <ret_Neff> will range from 0 to the true number of
54 * sequences counted into the model, <hmm->nseq>.
55 *
56 * Returns: <eslOK> on success.
57 *
58 * Throws: <eslEMEM> on allocation failure.
59 */
60 int
p7_EntropyWeight(const P7_HMM * hmm,const P7_BG * bg,const P7_PRIOR * pri,double etarget,double * ret_Neff)61 p7_EntropyWeight(const P7_HMM *hmm, const P7_BG *bg, const P7_PRIOR *pri, double etarget, double *ret_Neff)
62 {
63 int status;
64 ESL_ROOTFINDER *R = NULL;
65 struct ew_param_s p;
66 double Neff;
67 double fx;
68
69 /* Store parameters in the structure we'll pass to the rootfinder
70 */
71 p.hmm = hmm;
72 p.bg = bg;
73 p.pri = pri;
74 if ((p.h2 = p7_hmm_Clone(hmm)) == NULL) return eslEMEM;
75 p.etarget = etarget;
76
77 Neff = (double) hmm->nseq;
78 if ((status = eweight_target_f(Neff, &p, &fx)) != eslOK) goto ERROR;
79 if (fx > 0.)
80 {
81 if ((R = esl_rootfinder_Create(eweight_target_f, &p)) == NULL) {status = eslEMEM; goto ERROR;}
82 esl_rootfinder_SetAbsoluteTolerance(R, 0.01); /* getting Neff to ~2 sig digits is fine */
83 if ((status = esl_root_Bisection(R, 0., (double) hmm->nseq, &Neff)) != eslOK) goto ERROR;
84
85 esl_rootfinder_Destroy(R);
86 }
87
88 p7_hmm_Destroy(p.h2);
89 *ret_Neff = Neff;
90 return eslOK;
91
92 ERROR:
93 if (p.h2 != NULL) p7_hmm_Destroy(p.h2);
94 if (R != NULL) esl_rootfinder_Destroy(R);
95 *ret_Neff = (double) hmm->nseq;
96 return status;
97 }
98
99
100 /* Evaluate fx = rel entropy - etarget, which we want to be = 0,
101 * for effective sequence number <x>.
102 */
103 static int
eweight_target_exp_f(double exp,void * params,double * ret_fx)104 eweight_target_exp_f(double exp, void *params, double *ret_fx)
105 {
106 struct ew_param_s *p = (struct ew_param_s *) params;
107
108 p7_hmm_CopyParameters(p->hmm, p->h2);
109 p7_hmm_ScaleExponential(p->h2, exp);
110 p7_ParameterEstimation(p->h2, p->pri);
111 *ret_fx = p7_MeanMatchRelativeEntropy(p->h2, p->bg) - p->etarget;
112 return eslOK;
113 }
114
115
116
117 /* Function: p7_EntropyWeight_exp()
118 *
119 * Purpose: Use an alternative "entropy weighting" algorithm to
120 * determine the effective observed counts we should
121 * use, and return it in <ret_Neff>.
122 *
123 * Caller provides a count-based <hmm>, and the
124 * Dirichlet prior <pri> that's to be used to parameterize
125 * models; neither of these will be modified.
126 * Caller also provides the relative entropy
127 * target in bits in <etarget>.
128 *
129 * <ret_exp> will range from 0 to 1. This value will be used
130 * for each column to exponentially scale the observed counts.
131 * If a column has K (possibly weighted) observed letters, the
132 * scaled count will be K^ret_exp. This ensures that all scaled
133 * counts are at least 1.
134 *
135 * See p7_hmm_ScaleExponential() for more details.
136 *
137 * Returns: <eslOK> on success.
138 *
139 * Throws: <eslEMEM> on allocation failure.
140 */
141 int
p7_EntropyWeight_exp(const P7_HMM * hmm,const P7_BG * bg,const P7_PRIOR * pri,double etarget,double * ret_exp)142 p7_EntropyWeight_exp(const P7_HMM *hmm, const P7_BG *bg, const P7_PRIOR *pri, double etarget, double *ret_exp)
143 {
144
145 int status;
146 ESL_ROOTFINDER *R = NULL;
147 struct ew_param_s p;
148 double exp = 1.;
149 double fx;
150
151 /* Store parameters in the structure we'll pass to the rootfinder
152 */
153 p.hmm = hmm;
154 p.bg = bg;
155 p.pri = pri;
156 if ((p.h2 = p7_hmm_Clone(hmm)) == NULL) return eslEMEM;
157 p.etarget = etarget;
158
159 //Neff = (double) hmm->nseq;
160 if ((status = eweight_target_exp_f(1.0, &p, &fx)) != eslOK) goto ERROR;
161 if (fx > 0.)
162 {
163 if ((R = esl_rootfinder_Create(eweight_target_exp_f, &p)) == NULL) {status = eslEMEM; goto ERROR;}
164 esl_rootfinder_SetAbsoluteTolerance(R, 0.001); /* getting exp to ~3 sig digits is fine */
165 if ((status = esl_root_Bisection(R, 0., 1.0, &exp)) != eslOK) goto ERROR;
166
167 esl_rootfinder_Destroy(R);
168 }
169
170
171 p7_hmm_Destroy(p.h2);
172
173 *ret_exp = exp;
174 return eslOK;
175
176 ERROR:
177 if (p.h2 != NULL) p7_hmm_Destroy(p.h2);
178 if (R != NULL) esl_rootfinder_Destroy(R);
179
180 return status;
181 }
182
183