1 /* "Entropy weighting" to determine absolute sequence number to use in hmmbuild.
2  *
3  * Reference:
4  *    L. Steven Johnson, "Remote Protein Homology Detection Using Hidden Markov Models",
5  *    Ph.D. thesis, Washington University School of Medicine, 2006.
6  *
7  * SRE, Fri May  4 14:01:54 2007 [Janelia] [Tom Waits, Orphans]
8  */
9 
10 #include "p7_config.h"
11 
12 #include "easel.h"
13 #include "esl_rootfinder.h"
14 
15 #include "hmmer.h"
16 
17 struct ew_param_s {
18   const P7_HMM    *hmm;		/* ptr to the original count-based HMM, which remains unchanged */
19   const P7_BG     *bg;		/* ptr to the null model */
20   const P7_PRIOR  *pri;		/* Dirichlet prior used to parameterize from counts */
21   P7_HMM          *h2;		/* our working space: a copy of <hmm> that we can muck with */
22   double           etarget;	/* information content target, in bits */
23 };
24 
25 /* Evaluate fx = rel entropy - etarget, which we want to be = 0,
26  * for effective sequence number <x>.
27  */
28 static int
eweight_target_f(double Neff,void * params,double * ret_fx)29 eweight_target_f(double Neff, void *params, double *ret_fx)
30 {
31   struct ew_param_s *p = (struct ew_param_s *) params;
32 
33   p7_hmm_CopyParameters(p->hmm, p->h2);
34   p7_hmm_Scale(p->h2, Neff / (double) p->h2->nseq);
35   p7_ParameterEstimation(p->h2, p->pri);
36   *ret_fx = p7_MeanMatchRelativeEntropy(p->h2, p->bg) - p->etarget;
37   return eslOK;
38 }
39 
40 /* Function:  p7_EntropyWeight()
41  * Incept:    SRE, Fri May  4 15:32:59 2007 [Janelia]
42  *
43  * Purpose:   Use the "entropy weighting" algorithm to determine
44  *            what effective sequence number we should use, and
45  *            return it in <ret_Neff>.
46  *
47  *            Caller provides a count-based <hmm>, and the
48  *            Dirichlet prior <pri> that's to be used to parameterize
49  *            models; neither of these will be modified.
50  *            Caller also provides the relative entropy
51  *            target in bits in <etarget>.
52  *
53  *            <ret_Neff> will range from 0 to the true number of
54  *            sequences counted into the model, <hmm->nseq>.
55  *
56  * Returns:   <eslOK> on success.
57  *
58  * Throws:    <eslEMEM> on allocation failure.
59  */
60 int
p7_EntropyWeight(const P7_HMM * hmm,const P7_BG * bg,const P7_PRIOR * pri,double etarget,double * ret_Neff)61 p7_EntropyWeight(const P7_HMM *hmm, const P7_BG *bg, const P7_PRIOR *pri, double etarget, double *ret_Neff)
62 {
63   int status;
64   ESL_ROOTFINDER *R = NULL;
65   struct ew_param_s p;
66   double Neff;
67   double fx;
68 
69   /* Store parameters in the structure we'll pass to the rootfinder
70    */
71   p.hmm = hmm;
72   p.bg  = bg;
73   p.pri = pri;
74   if ((p.h2  = p7_hmm_Clone(hmm)) == NULL) return eslEMEM;
75   p.etarget = etarget;
76 
77   Neff = (double) hmm->nseq;
78   if ((status = eweight_target_f(Neff, &p, &fx)) != eslOK) goto ERROR;
79   if (fx > 0.)
80     {
81       if ((R = esl_rootfinder_Create(eweight_target_f, &p)) == NULL) {status = eslEMEM; goto ERROR;}
82       esl_rootfinder_SetAbsoluteTolerance(R, 0.01); /* getting Neff to ~2 sig digits is fine */
83       if ((status = esl_root_Bisection(R, 0., (double) hmm->nseq, &Neff)) != eslOK) goto ERROR;
84 
85       esl_rootfinder_Destroy(R);
86     }
87 
88   p7_hmm_Destroy(p.h2);
89   *ret_Neff = Neff;
90   return eslOK;
91 
92  ERROR:
93   if (p.h2 != NULL)   p7_hmm_Destroy(p.h2);
94   if (R    != NULL)   esl_rootfinder_Destroy(R);
95   *ret_Neff = (double) hmm->nseq;
96   return status;
97 }
98 
99 
100 /* Evaluate fx = rel entropy - etarget, which we want to be = 0,
101  * for effective sequence number <x>.
102  */
103 static int
eweight_target_exp_f(double exp,void * params,double * ret_fx)104 eweight_target_exp_f(double exp, void *params, double *ret_fx)
105 {
106   struct ew_param_s *p = (struct ew_param_s *) params;
107 
108   p7_hmm_CopyParameters(p->hmm, p->h2);
109   p7_hmm_ScaleExponential(p->h2, exp);
110   p7_ParameterEstimation(p->h2, p->pri);
111   *ret_fx = p7_MeanMatchRelativeEntropy(p->h2, p->bg) - p->etarget;
112   return eslOK;
113 }
114 
115 
116 
117 /* Function:  p7_EntropyWeight_exp()
118  *
119  * Purpose:   Use an alternative "entropy weighting" algorithm to
120  *            determine the effective observed counts we should
121  *            use, and return it in <ret_Neff>.
122  *
123  *            Caller provides a count-based <hmm>, and the
124  *            Dirichlet prior <pri> that's to be used to parameterize
125  *            models; neither of these will be modified.
126  *            Caller also provides the relative entropy
127  *            target in bits in <etarget>.
128  *
129  *            <ret_exp> will range from 0 to 1.  This value will be used
130  *            for each column to exponentially scale the observed counts.
131  *            If a column has K (possibly weighted) observed letters, the
132  *            scaled count will be K^ret_exp. This ensures that all scaled
133  *            counts are at least 1.
134  *
135  *            See p7_hmm_ScaleExponential() for more details.
136  *
137  * Returns:   <eslOK> on success.
138  *
139  * Throws:    <eslEMEM> on allocation failure.
140  */
141 int
p7_EntropyWeight_exp(const P7_HMM * hmm,const P7_BG * bg,const P7_PRIOR * pri,double etarget,double * ret_exp)142 p7_EntropyWeight_exp(const P7_HMM *hmm, const P7_BG *bg, const P7_PRIOR *pri, double etarget, double *ret_exp)
143 {
144 
145   int status;
146   ESL_ROOTFINDER *R = NULL;
147   struct ew_param_s p;
148   double exp = 1.;
149   double fx;
150 
151   /* Store parameters in the structure we'll pass to the rootfinder
152    */
153   p.hmm = hmm;
154   p.bg  = bg;
155   p.pri = pri;
156   if ((p.h2  = p7_hmm_Clone(hmm)) == NULL) return eslEMEM;
157   p.etarget = etarget;
158 
159   //Neff = (double) hmm->nseq;
160   if ((status = eweight_target_exp_f(1.0, &p, &fx)) != eslOK) goto ERROR;
161   if (fx > 0.)
162   {
163       if ((R = esl_rootfinder_Create(eweight_target_exp_f, &p)) == NULL) {status = eslEMEM; goto ERROR;}
164       esl_rootfinder_SetAbsoluteTolerance(R, 0.001); /* getting exp to ~3 sig digits is fine */
165       if ((status = esl_root_Bisection(R, 0., 1.0, &exp)) != eslOK) goto ERROR;
166 
167       esl_rootfinder_Destroy(R);
168   }
169 
170 
171   p7_hmm_Destroy(p.h2);
172 
173   *ret_exp = exp;
174   return eslOK;
175 
176  ERROR:
177   if (p.h2 != NULL)   p7_hmm_Destroy(p.h2);
178   if (R    != NULL)   esl_rootfinder_Destroy(R);
179 
180   return status;
181 }
182 
183