1 /* Posterior decoding algorithms; generic versions.
2  *
3  * Contents:
4  *   1. Posterior decoding algorithms.
5  *   2. Benchmark driver.
6  *   3. Unit tests.
7  *   4. Test driver.
8  *   5. Example.
9  */
10 #include "p7_config.h"
11 
12 #include <math.h>
13 #include "easel.h"
14 #include "hmmer.h"
15 
16 /*****************************************************************
17  * 1. Posterior decoding algorithms.
18  *****************************************************************/
19 
20 /* Function:  p7_GDecoding()
21  * Synopsis:  Posterior decoding of residue assignments.
22  *
23  * Purpose:   Calculates a posterior decoding of the residues in a
24  *            target sequence, given profile <gm> and filled Forward
25  *            and Backward matrices <fwd>, <bck> for the profile
26  *            aligned to that target sequence. The resulting posterior
27  *            decoding is stored in a DP matrix <pp>, provided by the
28  *            caller.
29  *
30  *            Each residue <i> must have been emitted by match state
31  *            <1..M>, insert state <1..M-1>, or an NN, CC, or JJ loop
32  *            transition.  For <dp = pp->dp>, <xmx = pp->xmx>,
33  *            <MMX(i,k)> is the probability that match <k> emitted
34  *            residue <i>; <IMX(i,k)> is the probability that insert
35  *            <k> emitted residue <i>; <XMX(i,N)>,<XMX(i,C)>,
36  *            <XMX(i,J)> are the probabilities that residue <i> was
37  *            emitted on an NN, CC, or JJ transition. The sum over all
38  *            these possibilities for a given residue <i> is 1.0.
39  *
40  *            Thus the only nonzero entries in a posterior decoding matrix
41  *            <pp> are <M_{1..M}>, <I_{1..M-1}>, <N_{1..L-1}> (residue L
42  *            can't be emitted by N), <C_{2..L}> (residue 1 can't be
43  *            emitted by C), and <J_{2..L-1}> (residues 1,L can't be
44  *            emitted by J).
45  *
46  *            In particular, row i=0 is unused (all zeros) in a pp
47  *            matrix; the null2 calculation will take advantage of
48  *            this by using the zero row for workspace.
49  *
50  *            The caller may pass the Backward matrix <bck> as <pp>,
51  *            in which case <bck> will be overwritten with
52  *            <pp>. However, the caller may \emph{not} overwrite <fwd>
53  *            this way; an <(i-1)> dependency in the calculation of
54  *            NN, CC, JJ transitions prevents this.
55  *
56  * Args:      gm   - profile (must be the same that was used to fill <fwd>, <bck>).
57  *            fwd  - filled Forward matrix
58  *            bck  - filled Backward matrix
59  *            pp   - RESULT: posterior decoding matrix.
60  *
61  * Returns:   <eslOK> on success.
62  *
63  * Throws:    (no abnormal error conditions)
64  *
65  * Note:      Burns time renormalizing each row. If you don't do this,
66  *            probabilities will have an error of +/- 0.001 or so, creeping
67  *            in from error in FLogsum()'s table approximation and even
68  *            in log() and exp() themselves; including "probabilities"
69  *            up to  ~1.001. Though this isn't going to break anything
70  *            in normal use, it does drive the unit tests wild; the SSE
71  *            implementation is more accurate, and unit tests that try
72  *            to compare SSE and generic results will see differences,
73  *            some sufficient to alter the choice of OA traceback.
74  *
75  */
76 int
p7_GDecoding(const P7_PROFILE * gm,const P7_GMX * fwd,P7_GMX * bck,P7_GMX * pp)77 p7_GDecoding(const P7_PROFILE *gm, const P7_GMX *fwd, P7_GMX *bck, P7_GMX *pp)
78 {
79   float      **dp   = pp->dp;
80   float       *xmx  = pp->xmx;
81   int          L    = fwd->L;
82   int          M    = gm->M;
83   int          i,k;
84   float        overall_sc = fwd->xmx[p7G_NXCELLS*L + p7G_C] + gm->xsc[p7P_C][p7P_MOVE];
85   float        denom;
86 
87   pp->M = M;
88   pp->L = L;
89 
90   XMX(0, p7G_E) = 0.0;
91   XMX(0, p7G_N) = 0.0;
92   XMX(0, p7G_J) = 0.0;
93   XMX(0, p7G_B) = 0.0;
94   XMX(0, p7G_C) = 0.0;
95   for (k = 0; k <= M; k++)
96     MMX(0,k) = IMX(0,k) = DMX(0,k) = 0.0;
97 
98   for (i = 1; i <= L; i++)
99     {
100       denom = 0.0;
101       MMX(i,0) = IMX(i,0) = DMX(i,0) = 0.0;
102       for (k = 1; k < M; k++)
103 	{
104 	  MMX(i,k) = expf(fwd->dp[i][k*p7G_NSCELLS + p7G_M] + bck->dp[i][k*p7G_NSCELLS + p7G_M] - overall_sc); denom += MMX(i,k);
105 	  IMX(i,k) = expf(fwd->dp[i][k*p7G_NSCELLS + p7G_I] + bck->dp[i][k*p7G_NSCELLS + p7G_I] - overall_sc); denom += IMX(i,k);
106 	  DMX(i,k) = 0.;
107 	}
108       MMX(i,M)     = expf(fwd->dp[i][M*p7G_NSCELLS + p7G_M] + bck->dp[i][M*p7G_NSCELLS + p7G_M] - overall_sc); denom += MMX(i,M);
109       IMX(i,M)     = 0.;
110       DMX(i,M)     = 0.;
111 
112       /* order doesn't matter.  note that this whole function is trivially simd parallel */
113       XMX(i,p7G_E) = 0.;
114       XMX(i,p7G_N) = expf(fwd->xmx[p7G_NXCELLS*(i-1) + p7G_N] + bck->xmx[p7G_NXCELLS*i + p7G_N] + gm->xsc[p7P_N][p7P_LOOP] - overall_sc);
115       XMX(i,p7G_J) = expf(fwd->xmx[p7G_NXCELLS*(i-1) + p7G_J] + bck->xmx[p7G_NXCELLS*i + p7G_J] + gm->xsc[p7P_J][p7P_LOOP] - overall_sc);
116       XMX(i,p7G_B) = 0.;
117       XMX(i,p7G_C) = expf(fwd->xmx[p7G_NXCELLS*(i-1) + p7G_C] + bck->xmx[p7G_NXCELLS*i + p7G_C] + gm->xsc[p7P_C][p7P_LOOP] - overall_sc);
118       denom += XMX(i,p7G_N) + XMX(i,p7G_J) + XMX(i,p7G_C);
119 
120       denom = 1.0 / denom;
121       for (k = 1; k < M; k++) {  MMX(i,k) *= denom; IMX(i,k) *= denom; }
122       MMX(i,M)     *= denom;
123       XMX(i,p7G_N) *= denom;
124       XMX(i,p7G_J) *= denom;
125       XMX(i,p7G_C) *= denom;
126     }
127   return eslOK;
128 }
129 
130 /* Function:  p7_GDomainDecoding()
131  * Synopsis:  Posterior decoding of domain location.
132  *
133  * Purpose:   The caller has calculated Forward and Backward matrices
134  *            <fwd> and <bck> for model <gm> aligned to a target
135  *            sequence. (The target sequence doesn't need to be
136  *            provided, because all we need to know is its length
137  *            <L>, and that's available in either of the two DP
138  *            matrices.)
139  *
140  *            We use this information to calculate the posterior
141  *            probabilities that we're in a begin state B, end state
142  *            E, or any core model state {M,D,I} at each target
143  *            sequence position <i = 1..L>.
144  *
145  *            This information is stored in three arrays in
146  *            <ddef>. This routine expects that this storage has
147  *            already been (re)allocated appropriately for a target
148  *            seq of length <L>.
149  *
150  *            <ddef->btot[i]> stores the cumulative expectation
151  *            $\sum_1^i$ of the number of i's that were emitted (by an
152  *            Mk state) immediately after a B : i.e., the expected
153  *            number of times domains have started at or before
154  *            position i.
155  *
156  *            <ddef->etot[i]> stores the cumulative expectation
157  *            $\sum_1^i$ of the number of i's that were emitted (by
158  *            an Mk or Dk state) and immediately followed by an end
159  *            transition to E : i.e., the expected number of times
160  *            domains have ended at or before position i.
161  *
162  *            <ddef->mocc[i]> stores the probability that residue i is
163  *            emitted by the core model, as opposed to the flanking
164  *            N,C,J states : i.e., the probability that i is in a
165  *            domain.
166  *
167  *            Upon return, each of these arrays has been made, and
168  *            <ddef->L> has * been set.
169  *
170  * Args:      gm   - profile
171  *            fwd  - filled Forward matrix
172  *            bck  - filled Backward matrix
173  *            ddef - container for the results.
174  *
175  * Returns:   <eslOK> on success.
176  *
177  * Throws:    (no abnormal error conditions)
178  *
179  * Notes:    Ideas for future optimization:
180  *
181  *           - The calculations only need to access the xmx[CNJBE][i] special
182  *             states in the fwd, bck matrices, so we could use
183  *             streamlined (checkpointed?) matrices that only maintain
184  *             this info over all i. This would be a step in letting
185  *             us do domain parses in linear memory.
186  *
187  *           - indeed, the <btot>, <etot>, and <mocc> arrays could be made
188  *             sparse; on long target sequences, we expect long
189  *             stretches of negligible posterior probability that
190  *             we're in the model or using a begin or end
191  *             transition.
192  *
193  *           - indeed indeed, we don't really need to store the <btot>, <etot>,
194  *             and <mocc> arrays at all. We can define regions in a
195  *             single pass in O(1) extra memory, straight from the
196  *             <fwd>, <bck> matrices, if we have to (xref
197  *             J2/101). <p7_domaindef_ByPosteriorHeuristics()> is
198  *             already implemented in a way to make this easy. We're
199  *             not doing that for now, partly for clarity in the code,
200  *             and partly because I think we'll want to output the
201  *             <btot>, <etot>, and <mocc> arrays -- this view of the
202  *             posterior decoding of the domain structure of a target
203  *             sequence will be useful. Also, it's a lot easier to
204  *             implement the <is_multidomain_region()> trigger if
205  *             these arrays are available.
206  */
207 int
p7_GDomainDecoding(const P7_PROFILE * gm,const P7_GMX * fwd,const P7_GMX * bck,P7_DOMAINDEF * ddef)208 p7_GDomainDecoding(const P7_PROFILE *gm, const P7_GMX *fwd, const P7_GMX *bck, P7_DOMAINDEF *ddef)
209 {
210   int   L            = fwd->L;
211   float overall_logp = fwd->xmx[p7G_NXCELLS*L + p7G_C] + gm->xsc[p7P_C][p7P_MOVE];
212   float njcp;
213   int   i;
214 
215   for (i = 1; i <= L; i++)
216     {
217       ddef->btot[i] = ddef->btot[i-1] + exp(fwd->xmx[(i-1)*p7G_NXCELLS+p7G_B] + bck->xmx[(i-1)*p7G_NXCELLS+p7G_B] - overall_logp);
218       ddef->etot[i] = ddef->etot[i-1] + exp(fwd->xmx[i    *p7G_NXCELLS+p7G_E] + bck->xmx[i    *p7G_NXCELLS+p7G_E] - overall_logp);
219 
220       njcp  = expf(fwd->xmx[p7G_NXCELLS*(i-1) + p7G_N] + bck->xmx[p7G_NXCELLS*i + p7G_N] + gm->xsc[p7P_N][p7P_LOOP] - overall_logp);
221       njcp += expf(fwd->xmx[p7G_NXCELLS*(i-1) + p7G_J] + bck->xmx[p7G_NXCELLS*i + p7G_J] + gm->xsc[p7P_J][p7P_LOOP] - overall_logp);
222       njcp += expf(fwd->xmx[p7G_NXCELLS*(i-1) + p7G_C] + bck->xmx[p7G_NXCELLS*i + p7G_C] + gm->xsc[p7P_C][p7P_LOOP] - overall_logp);
223       ddef->mocc[i] = 1. - njcp;
224     }
225   ddef->L = gm->L;
226   return eslOK;
227 }
228 /*------------------ end, decoding algorithms -------------------*/
229 
230 
231 
232 /*****************************************************************
233  * 2. Benchmark driver
234  *****************************************************************/
235 #ifdef p7GENERIC_DECODING_BENCHMARK
236 /*
237    icc -O3 -static -o generic_decoding_benchmark -I. -L. -I../easel -L../easel -Dp7GENERIC_DECODING_BENCHMARK generic_decoding.c -lhmmer -leasel -lm
238    ./benchmark-generic-decoding <hmmfile>
239                    RRM_1 (M=72)       Caudal_act (M=136)      SMC_N (M=1151)
240                  -----------------    ------------------     -------------------
241    21 Aug 08      6.62u (21.8 Mc/s)    12.52u (21.7 Mc/s)     106.27u (21.7 Mc/s)
242  */
243 #include "p7_config.h"
244 
245 #include "easel.h"
246 #include "esl_alphabet.h"
247 #include "esl_getopts.h"
248 #include "esl_random.h"
249 #include "esl_randomseq.h"
250 #include "esl_stopwatch.h"
251 
252 #include "hmmer.h"
253 
254 static ESL_OPTIONS options[] = {
255   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
256   { "-h",        eslARG_NONE,   FALSE, NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",           0 },
257   { "-s",        eslARG_INT,     "42", NULL, NULL,  NULL,  NULL, NULL, "set random number seed to <n>",                  0 },
258   { "-L",        eslARG_INT,    "400", NULL, "n>0", NULL,  NULL, NULL, "length of random target seqs",                   0 },
259   { "-N",        eslARG_INT,   "5000", NULL, "n>0", NULL,  NULL, NULL, "number of random target seqs",                   0 },
260   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
261 };
262 static char usage[]  = "[-options] <hmmfile>";
263 static char banner[] = "benchmark driver for posterior residue decoding, generic version";
264 
265 int
main(int argc,char ** argv)266 main(int argc, char **argv)
267 {
268   ESL_GETOPTS    *go      = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage);
269   char           *hmmfile = esl_opt_GetArg(go, 1);
270   ESL_STOPWATCH  *w       = esl_stopwatch_Create();
271   ESL_RANDOMNESS *r       = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
272   ESL_ALPHABET   *abc     = NULL;
273   P7_HMMFILE     *hfp     = NULL;
274   P7_HMM         *hmm     = NULL;
275   P7_BG          *bg      = NULL;
276   P7_PROFILE     *gm      = NULL;
277   P7_GMX         *fwd     = NULL;
278   P7_GMX         *bck     = NULL;
279   P7_GMX         *pp      = NULL;
280   int             L       = esl_opt_GetInteger(go, "-L");
281   int             N       = esl_opt_GetInteger(go, "-N");
282   ESL_DSQ        *dsq     = malloc(sizeof(ESL_DSQ) * (L+2));
283   int             i;
284   float           fsc, bsc;
285   double          Mcs;
286 
287   if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
288   if (p7_hmmfile_Read(hfp, &abc, &hmm)            != eslOK) p7_Fail("Failed to read HMM");
289 
290   bg = p7_bg_Create(abc);                 p7_bg_SetLength(bg, L);
291   gm = p7_profile_Create(hmm->M, abc);    p7_ProfileConfig(hmm, bg, gm, L, p7_LOCAL);
292   fwd = p7_gmx_Create(gm->M, L);
293   bck = p7_gmx_Create(gm->M, L);
294   pp  = p7_gmx_Create(gm->M, L);
295 
296   esl_rsq_xfIID(r, bg->f, abc->K, L, dsq);
297   p7_GForward (dsq, L, gm, fwd, &fsc);
298   p7_GBackward(dsq, L, gm, bck, &bsc);
299 
300   esl_stopwatch_Start(w);
301   for (i = 0; i < N; i++)
302     p7_GDecoding(gm, fwd, bck, pp);
303   esl_stopwatch_Stop(w);
304 
305   Mcs  = (double) N * (double) L * (double) gm->M * 1e-6 / w->user;
306   esl_stopwatch_Display(stdout, w, "# CPU time: ");
307   printf("# M    = %d\n", gm->M);
308   printf("# %.1f Mc/s\n", Mcs);
309 
310   free(dsq);
311   p7_gmx_Destroy(pp);
312   p7_gmx_Destroy(fwd);
313   p7_gmx_Destroy(bck);
314   p7_profile_Destroy(gm);
315   p7_bg_Destroy(bg);
316   p7_hmm_Destroy(hmm);
317   p7_hmmfile_Close(hfp);
318   esl_alphabet_Destroy(abc);
319   esl_stopwatch_Destroy(w);
320   esl_randomness_Destroy(r);
321   esl_getopts_Destroy(go);
322   return 0;
323 }
324 #endif /*p7GENERIC_DECODING_BENCHMARK*/
325 /*------------------ end, benchmark driver ----------------------*/
326 
327 
328 
329 
330 /*****************************************************************
331  * 3. Unit tests
332  *****************************************************************/
333 #ifdef p7GENERIC_DECODING_TESTDRIVE
334 
335 #endif /*p7GENERIC_DECODING_TESTDRIVE*/
336 /*--------------------- end, unit tests -------------------------*/
337 
338 
339 
340 
341 /*****************************************************************
342  * 4. Test driver
343  *****************************************************************/
344 #ifdef p7GENERIC_DECODING_TESTDRIVE
345 
346 #endif /*p7GENERIC_DECODING_TESTDRIVE*/
347 /*-------------------- end, test driver -------------------------*/
348 
349 
350 
351 
352 /*****************************************************************
353  * 5. Example
354  *****************************************************************/
355 #ifdef p7GENERIC_DECODING_EXAMPLE
356 
357 #include "p7_config.h"
358 
359 #include "easel.h"
360 #include "esl_alphabet.h"
361 #include "esl_getopts.h"
362 #include "esl_sq.h"
363 #include "esl_sqio.h"
364 
365 #include "hmmer.h"
366 
367 #define STYLES     "--fs,--sw,--ls,--s"	  /* Exclusive choice for alignment mode     */
368 
369 static ESL_OPTIONS options[] = {
370   /* name           type      default  env  range  toggles reqs incomp  help                                       docgroup*/
371   { "-h",        eslARG_NONE,   FALSE, NULL, NULL,   NULL,  NULL, NULL, "show brief help on version and usage",             0 },
372   { "--fs",      eslARG_NONE,"default",NULL, NULL, STYLES,  NULL, NULL, "multihit local alignment",                         0 },
373   { "--sw",      eslARG_NONE,   FALSE, NULL, NULL, STYLES,  NULL, NULL, "unihit local alignment",                           0 },
374   { "--ls",      eslARG_NONE,   FALSE, NULL, NULL, STYLES,  NULL, NULL, "multihit glocal alignment",                        0 },
375   { "--s",       eslARG_NONE,   FALSE, NULL, NULL, STYLES,  NULL, NULL, "unihit glocal alignment",                          0 },
376   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
377 };
378 static char usage[]  = "[-options] <hmmfile> <seqfile>";
379 static char banner[] = "example of posterior decoding, generic implementation";
380 
381 static void dump_matrix_csv(FILE *fp, P7_GMX *pp, int istart, int iend, int kstart, int kend);
382 
383 int
main(int argc,char ** argv)384 main(int argc, char **argv)
385 {
386   ESL_GETOPTS    *go      = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage);
387   char           *hmmfile = esl_opt_GetArg(go, 1);
388   char           *seqfile = esl_opt_GetArg(go, 2);
389   ESL_ALPHABET   *abc     = NULL;
390   P7_HMMFILE     *hfp     = NULL;
391   P7_HMM         *hmm     = NULL;
392   P7_BG          *bg      = NULL;
393   P7_PROFILE     *gm      = NULL;
394   P7_GMX         *fwd     = NULL;
395   P7_GMX         *bck     = NULL;
396   ESL_SQ         *sq      = NULL;
397   ESL_SQFILE     *sqfp    = NULL;
398   int             format  = eslSQFILE_UNKNOWN;
399   float           fsc, bsc;
400 
401   /* Read in one query profile */
402   if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
403   if (p7_hmmfile_Read(hfp, &abc, &hmm)            != eslOK) p7_Fail("Failed to read HMM");
404   p7_hmmfile_Close(hfp);
405 
406   /* Read in one target sequence */
407   sq     = esl_sq_CreateDigital(abc);
408   if (esl_sqfile_Open(seqfile, format, NULL, &sqfp) != eslOK) p7_Fail("Failed to open sequence file %s", seqfile);
409   if (esl_sqio_Read(sqfp, sq)                       != eslOK) p7_Fail("Failed to read sequence");
410   esl_sqfile_Close(sqfp);
411 
412   /* Configure a profile from the HMM */
413   bg = p7_bg_Create(abc);
414   gm = p7_profile_Create(hmm->M, abc);
415 
416   /* Now reconfig the model however we were asked to */
417   if      (esl_opt_GetBoolean(go, "--fs"))  p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL);
418   else if (esl_opt_GetBoolean(go, "--sw"))  p7_ProfileConfig(hmm, bg, gm, sq->n, p7_UNILOCAL);
419   else if (esl_opt_GetBoolean(go, "--ls"))  p7_ProfileConfig(hmm, bg, gm, sq->n, p7_GLOCAL);
420   else if (esl_opt_GetBoolean(go, "--s"))   p7_ProfileConfig(hmm, bg, gm, sq->n, p7_UNIGLOCAL);
421 
422   /* Allocate matrices */
423   fwd = p7_gmx_Create(gm->M, sq->n);
424   bck = p7_gmx_Create(gm->M, sq->n);
425 
426   /* Set the profile and null model's target length models */
427   p7_bg_SetLength(bg,   sq->n);
428   p7_ReconfigLength(gm, sq->n);
429 
430   /* Run Forward, Backward */
431   p7_GForward (sq->dsq, sq->n, gm, fwd, &fsc);
432   p7_GBackward(sq->dsq, sq->n, gm, bck, &bsc);
433 
434   /* Decoding: <bck> becomes the posterior probability mx */
435   p7_GDecoding(gm, fwd, bck, bck);
436 
437   //p7_gmx_Dump(stdout, bck, p7_DEFAULT);
438   dump_matrix_csv(stdout, bck, 1, sq->n, 1, gm->M);
439 
440   /* Cleanup */
441   esl_sq_Destroy(sq);
442   p7_profile_Destroy(gm);
443   p7_gmx_Destroy(fwd);
444   p7_gmx_Destroy(bck);
445   p7_bg_Destroy(bg);
446   p7_hmm_Destroy(hmm);
447   esl_alphabet_Destroy(abc);
448   esl_getopts_Destroy(go);
449   return 0;
450 }
451 
452 static void
dump_matrix_csv(FILE * fp,P7_GMX * pp,int istart,int iend,int kstart,int kend)453 dump_matrix_csv(FILE *fp, P7_GMX *pp, int istart, int iend, int kstart, int kend)
454 {
455   int   width     = 7;
456   int   precision = 5;
457   int   i, k;
458   float val;
459 
460   printf("i,");
461   for (k = kstart; k <= kend; k++)
462     printf("%-d%s", k, k==kend ? "\n" : ",");
463 
464   for (i = istart; i <= iend; i++)
465     {
466       printf("%-d,", i);
467       for (k = kstart; k <= kend; k++)
468 	{
469 	  val = pp->dp[i][k * p7G_NSCELLS + p7G_M] +
470 	    pp->dp[i][k * p7G_NSCELLS + p7G_I] +
471 	    pp->dp[i][k * p7G_NSCELLS + p7G_D];
472 
473 	  fprintf(fp, "%*.*f%s", width, precision, val, k==kend ? "\n" : ", ");
474 	}
475     }
476 }
477 #endif /*p7GENERIC_DECODING_EXAMPLE*/
478 /*------------------------ example ------------------------------*/
479 
480 
481 
482