1 /* Posterior decoding algorithms; generic versions.
2 *
3 * Contents:
4 * 1. Posterior decoding algorithms.
5 * 2. Benchmark driver.
6 * 3. Unit tests.
7 * 4. Test driver.
8 * 5. Example.
9 */
10 #include "p7_config.h"
11
12 #include <math.h>
13 #include "easel.h"
14 #include "hmmer.h"
15
16 /*****************************************************************
17 * 1. Posterior decoding algorithms.
18 *****************************************************************/
19
20 /* Function: p7_GDecoding()
21 * Synopsis: Posterior decoding of residue assignments.
22 *
23 * Purpose: Calculates a posterior decoding of the residues in a
24 * target sequence, given profile <gm> and filled Forward
25 * and Backward matrices <fwd>, <bck> for the profile
26 * aligned to that target sequence. The resulting posterior
27 * decoding is stored in a DP matrix <pp>, provided by the
28 * caller.
29 *
30 * Each residue <i> must have been emitted by match state
31 * <1..M>, insert state <1..M-1>, or an NN, CC, or JJ loop
32 * transition. For <dp = pp->dp>, <xmx = pp->xmx>,
33 * <MMX(i,k)> is the probability that match <k> emitted
34 * residue <i>; <IMX(i,k)> is the probability that insert
35 * <k> emitted residue <i>; <XMX(i,N)>,<XMX(i,C)>,
36 * <XMX(i,J)> are the probabilities that residue <i> was
37 * emitted on an NN, CC, or JJ transition. The sum over all
38 * these possibilities for a given residue <i> is 1.0.
39 *
40 * Thus the only nonzero entries in a posterior decoding matrix
41 * <pp> are <M_{1..M}>, <I_{1..M-1}>, <N_{1..L-1}> (residue L
42 * can't be emitted by N), <C_{2..L}> (residue 1 can't be
43 * emitted by C), and <J_{2..L-1}> (residues 1,L can't be
44 * emitted by J).
45 *
46 * In particular, row i=0 is unused (all zeros) in a pp
47 * matrix; the null2 calculation will take advantage of
48 * this by using the zero row for workspace.
49 *
50 * The caller may pass the Backward matrix <bck> as <pp>,
51 * in which case <bck> will be overwritten with
52 * <pp>. However, the caller may \emph{not} overwrite <fwd>
53 * this way; an <(i-1)> dependency in the calculation of
54 * NN, CC, JJ transitions prevents this.
55 *
56 * Args: gm - profile (must be the same that was used to fill <fwd>, <bck>).
57 * fwd - filled Forward matrix
58 * bck - filled Backward matrix
59 * pp - RESULT: posterior decoding matrix.
60 *
61 * Returns: <eslOK> on success.
62 *
63 * Throws: (no abnormal error conditions)
64 *
65 * Note: Burns time renormalizing each row. If you don't do this,
66 * probabilities will have an error of +/- 0.001 or so, creeping
67 * in from error in FLogsum()'s table approximation and even
68 * in log() and exp() themselves; including "probabilities"
69 * up to ~1.001. Though this isn't going to break anything
70 * in normal use, it does drive the unit tests wild; the SSE
71 * implementation is more accurate, and unit tests that try
72 * to compare SSE and generic results will see differences,
73 * some sufficient to alter the choice of OA traceback.
74 *
75 */
76 int
p7_GDecoding(const P7_PROFILE * gm,const P7_GMX * fwd,P7_GMX * bck,P7_GMX * pp)77 p7_GDecoding(const P7_PROFILE *gm, const P7_GMX *fwd, P7_GMX *bck, P7_GMX *pp)
78 {
79 float **dp = pp->dp;
80 float *xmx = pp->xmx;
81 int L = fwd->L;
82 int M = gm->M;
83 int i,k;
84 float overall_sc = fwd->xmx[p7G_NXCELLS*L + p7G_C] + gm->xsc[p7P_C][p7P_MOVE];
85 float denom;
86
87 pp->M = M;
88 pp->L = L;
89
90 XMX(0, p7G_E) = 0.0;
91 XMX(0, p7G_N) = 0.0;
92 XMX(0, p7G_J) = 0.0;
93 XMX(0, p7G_B) = 0.0;
94 XMX(0, p7G_C) = 0.0;
95 for (k = 0; k <= M; k++)
96 MMX(0,k) = IMX(0,k) = DMX(0,k) = 0.0;
97
98 for (i = 1; i <= L; i++)
99 {
100 denom = 0.0;
101 MMX(i,0) = IMX(i,0) = DMX(i,0) = 0.0;
102 for (k = 1; k < M; k++)
103 {
104 MMX(i,k) = expf(fwd->dp[i][k*p7G_NSCELLS + p7G_M] + bck->dp[i][k*p7G_NSCELLS + p7G_M] - overall_sc); denom += MMX(i,k);
105 IMX(i,k) = expf(fwd->dp[i][k*p7G_NSCELLS + p7G_I] + bck->dp[i][k*p7G_NSCELLS + p7G_I] - overall_sc); denom += IMX(i,k);
106 DMX(i,k) = 0.;
107 }
108 MMX(i,M) = expf(fwd->dp[i][M*p7G_NSCELLS + p7G_M] + bck->dp[i][M*p7G_NSCELLS + p7G_M] - overall_sc); denom += MMX(i,M);
109 IMX(i,M) = 0.;
110 DMX(i,M) = 0.;
111
112 /* order doesn't matter. note that this whole function is trivially simd parallel */
113 XMX(i,p7G_E) = 0.;
114 XMX(i,p7G_N) = expf(fwd->xmx[p7G_NXCELLS*(i-1) + p7G_N] + bck->xmx[p7G_NXCELLS*i + p7G_N] + gm->xsc[p7P_N][p7P_LOOP] - overall_sc);
115 XMX(i,p7G_J) = expf(fwd->xmx[p7G_NXCELLS*(i-1) + p7G_J] + bck->xmx[p7G_NXCELLS*i + p7G_J] + gm->xsc[p7P_J][p7P_LOOP] - overall_sc);
116 XMX(i,p7G_B) = 0.;
117 XMX(i,p7G_C) = expf(fwd->xmx[p7G_NXCELLS*(i-1) + p7G_C] + bck->xmx[p7G_NXCELLS*i + p7G_C] + gm->xsc[p7P_C][p7P_LOOP] - overall_sc);
118 denom += XMX(i,p7G_N) + XMX(i,p7G_J) + XMX(i,p7G_C);
119
120 denom = 1.0 / denom;
121 for (k = 1; k < M; k++) { MMX(i,k) *= denom; IMX(i,k) *= denom; }
122 MMX(i,M) *= denom;
123 XMX(i,p7G_N) *= denom;
124 XMX(i,p7G_J) *= denom;
125 XMX(i,p7G_C) *= denom;
126 }
127 return eslOK;
128 }
129
130 /* Function: p7_GDomainDecoding()
131 * Synopsis: Posterior decoding of domain location.
132 *
133 * Purpose: The caller has calculated Forward and Backward matrices
134 * <fwd> and <bck> for model <gm> aligned to a target
135 * sequence. (The target sequence doesn't need to be
136 * provided, because all we need to know is its length
137 * <L>, and that's available in either of the two DP
138 * matrices.)
139 *
140 * We use this information to calculate the posterior
141 * probabilities that we're in a begin state B, end state
142 * E, or any core model state {M,D,I} at each target
143 * sequence position <i = 1..L>.
144 *
145 * This information is stored in three arrays in
146 * <ddef>. This routine expects that this storage has
147 * already been (re)allocated appropriately for a target
148 * seq of length <L>.
149 *
150 * <ddef->btot[i]> stores the cumulative expectation
151 * $\sum_1^i$ of the number of i's that were emitted (by an
152 * Mk state) immediately after a B : i.e., the expected
153 * number of times domains have started at or before
154 * position i.
155 *
156 * <ddef->etot[i]> stores the cumulative expectation
157 * $\sum_1^i$ of the number of i's that were emitted (by
158 * an Mk or Dk state) and immediately followed by an end
159 * transition to E : i.e., the expected number of times
160 * domains have ended at or before position i.
161 *
162 * <ddef->mocc[i]> stores the probability that residue i is
163 * emitted by the core model, as opposed to the flanking
164 * N,C,J states : i.e., the probability that i is in a
165 * domain.
166 *
167 * Upon return, each of these arrays has been made, and
168 * <ddef->L> has * been set.
169 *
170 * Args: gm - profile
171 * fwd - filled Forward matrix
172 * bck - filled Backward matrix
173 * ddef - container for the results.
174 *
175 * Returns: <eslOK> on success.
176 *
177 * Throws: (no abnormal error conditions)
178 *
179 * Notes: Ideas for future optimization:
180 *
181 * - The calculations only need to access the xmx[CNJBE][i] special
182 * states in the fwd, bck matrices, so we could use
183 * streamlined (checkpointed?) matrices that only maintain
184 * this info over all i. This would be a step in letting
185 * us do domain parses in linear memory.
186 *
187 * - indeed, the <btot>, <etot>, and <mocc> arrays could be made
188 * sparse; on long target sequences, we expect long
189 * stretches of negligible posterior probability that
190 * we're in the model or using a begin or end
191 * transition.
192 *
193 * - indeed indeed, we don't really need to store the <btot>, <etot>,
194 * and <mocc> arrays at all. We can define regions in a
195 * single pass in O(1) extra memory, straight from the
196 * <fwd>, <bck> matrices, if we have to (xref
197 * J2/101). <p7_domaindef_ByPosteriorHeuristics()> is
198 * already implemented in a way to make this easy. We're
199 * not doing that for now, partly for clarity in the code,
200 * and partly because I think we'll want to output the
201 * <btot>, <etot>, and <mocc> arrays -- this view of the
202 * posterior decoding of the domain structure of a target
203 * sequence will be useful. Also, it's a lot easier to
204 * implement the <is_multidomain_region()> trigger if
205 * these arrays are available.
206 */
207 int
p7_GDomainDecoding(const P7_PROFILE * gm,const P7_GMX * fwd,const P7_GMX * bck,P7_DOMAINDEF * ddef)208 p7_GDomainDecoding(const P7_PROFILE *gm, const P7_GMX *fwd, const P7_GMX *bck, P7_DOMAINDEF *ddef)
209 {
210 int L = fwd->L;
211 float overall_logp = fwd->xmx[p7G_NXCELLS*L + p7G_C] + gm->xsc[p7P_C][p7P_MOVE];
212 float njcp;
213 int i;
214
215 for (i = 1; i <= L; i++)
216 {
217 ddef->btot[i] = ddef->btot[i-1] + exp(fwd->xmx[(i-1)*p7G_NXCELLS+p7G_B] + bck->xmx[(i-1)*p7G_NXCELLS+p7G_B] - overall_logp);
218 ddef->etot[i] = ddef->etot[i-1] + exp(fwd->xmx[i *p7G_NXCELLS+p7G_E] + bck->xmx[i *p7G_NXCELLS+p7G_E] - overall_logp);
219
220 njcp = expf(fwd->xmx[p7G_NXCELLS*(i-1) + p7G_N] + bck->xmx[p7G_NXCELLS*i + p7G_N] + gm->xsc[p7P_N][p7P_LOOP] - overall_logp);
221 njcp += expf(fwd->xmx[p7G_NXCELLS*(i-1) + p7G_J] + bck->xmx[p7G_NXCELLS*i + p7G_J] + gm->xsc[p7P_J][p7P_LOOP] - overall_logp);
222 njcp += expf(fwd->xmx[p7G_NXCELLS*(i-1) + p7G_C] + bck->xmx[p7G_NXCELLS*i + p7G_C] + gm->xsc[p7P_C][p7P_LOOP] - overall_logp);
223 ddef->mocc[i] = 1. - njcp;
224 }
225 ddef->L = gm->L;
226 return eslOK;
227 }
228 /*------------------ end, decoding algorithms -------------------*/
229
230
231
232 /*****************************************************************
233 * 2. Benchmark driver
234 *****************************************************************/
235 #ifdef p7GENERIC_DECODING_BENCHMARK
236 /*
237 icc -O3 -static -o generic_decoding_benchmark -I. -L. -I../easel -L../easel -Dp7GENERIC_DECODING_BENCHMARK generic_decoding.c -lhmmer -leasel -lm
238 ./benchmark-generic-decoding <hmmfile>
239 RRM_1 (M=72) Caudal_act (M=136) SMC_N (M=1151)
240 ----------------- ------------------ -------------------
241 21 Aug 08 6.62u (21.8 Mc/s) 12.52u (21.7 Mc/s) 106.27u (21.7 Mc/s)
242 */
243 #include "p7_config.h"
244
245 #include "easel.h"
246 #include "esl_alphabet.h"
247 #include "esl_getopts.h"
248 #include "esl_random.h"
249 #include "esl_randomseq.h"
250 #include "esl_stopwatch.h"
251
252 #include "hmmer.h"
253
254 static ESL_OPTIONS options[] = {
255 /* name type default env range toggles reqs incomp help docgroup*/
256 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
257 { "-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
258 { "-L", eslARG_INT, "400", NULL, "n>0", NULL, NULL, NULL, "length of random target seqs", 0 },
259 { "-N", eslARG_INT, "5000", NULL, "n>0", NULL, NULL, NULL, "number of random target seqs", 0 },
260 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
261 };
262 static char usage[] = "[-options] <hmmfile>";
263 static char banner[] = "benchmark driver for posterior residue decoding, generic version";
264
265 int
main(int argc,char ** argv)266 main(int argc, char **argv)
267 {
268 ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage);
269 char *hmmfile = esl_opt_GetArg(go, 1);
270 ESL_STOPWATCH *w = esl_stopwatch_Create();
271 ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
272 ESL_ALPHABET *abc = NULL;
273 P7_HMMFILE *hfp = NULL;
274 P7_HMM *hmm = NULL;
275 P7_BG *bg = NULL;
276 P7_PROFILE *gm = NULL;
277 P7_GMX *fwd = NULL;
278 P7_GMX *bck = NULL;
279 P7_GMX *pp = NULL;
280 int L = esl_opt_GetInteger(go, "-L");
281 int N = esl_opt_GetInteger(go, "-N");
282 ESL_DSQ *dsq = malloc(sizeof(ESL_DSQ) * (L+2));
283 int i;
284 float fsc, bsc;
285 double Mcs;
286
287 if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
288 if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM");
289
290 bg = p7_bg_Create(abc); p7_bg_SetLength(bg, L);
291 gm = p7_profile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, L, p7_LOCAL);
292 fwd = p7_gmx_Create(gm->M, L);
293 bck = p7_gmx_Create(gm->M, L);
294 pp = p7_gmx_Create(gm->M, L);
295
296 esl_rsq_xfIID(r, bg->f, abc->K, L, dsq);
297 p7_GForward (dsq, L, gm, fwd, &fsc);
298 p7_GBackward(dsq, L, gm, bck, &bsc);
299
300 esl_stopwatch_Start(w);
301 for (i = 0; i < N; i++)
302 p7_GDecoding(gm, fwd, bck, pp);
303 esl_stopwatch_Stop(w);
304
305 Mcs = (double) N * (double) L * (double) gm->M * 1e-6 / w->user;
306 esl_stopwatch_Display(stdout, w, "# CPU time: ");
307 printf("# M = %d\n", gm->M);
308 printf("# %.1f Mc/s\n", Mcs);
309
310 free(dsq);
311 p7_gmx_Destroy(pp);
312 p7_gmx_Destroy(fwd);
313 p7_gmx_Destroy(bck);
314 p7_profile_Destroy(gm);
315 p7_bg_Destroy(bg);
316 p7_hmm_Destroy(hmm);
317 p7_hmmfile_Close(hfp);
318 esl_alphabet_Destroy(abc);
319 esl_stopwatch_Destroy(w);
320 esl_randomness_Destroy(r);
321 esl_getopts_Destroy(go);
322 return 0;
323 }
324 #endif /*p7GENERIC_DECODING_BENCHMARK*/
325 /*------------------ end, benchmark driver ----------------------*/
326
327
328
329
330 /*****************************************************************
331 * 3. Unit tests
332 *****************************************************************/
333 #ifdef p7GENERIC_DECODING_TESTDRIVE
334
335 #endif /*p7GENERIC_DECODING_TESTDRIVE*/
336 /*--------------------- end, unit tests -------------------------*/
337
338
339
340
341 /*****************************************************************
342 * 4. Test driver
343 *****************************************************************/
344 #ifdef p7GENERIC_DECODING_TESTDRIVE
345
346 #endif /*p7GENERIC_DECODING_TESTDRIVE*/
347 /*-------------------- end, test driver -------------------------*/
348
349
350
351
352 /*****************************************************************
353 * 5. Example
354 *****************************************************************/
355 #ifdef p7GENERIC_DECODING_EXAMPLE
356
357 #include "p7_config.h"
358
359 #include "easel.h"
360 #include "esl_alphabet.h"
361 #include "esl_getopts.h"
362 #include "esl_sq.h"
363 #include "esl_sqio.h"
364
365 #include "hmmer.h"
366
367 #define STYLES "--fs,--sw,--ls,--s" /* Exclusive choice for alignment mode */
368
369 static ESL_OPTIONS options[] = {
370 /* name type default env range toggles reqs incomp help docgroup*/
371 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
372 { "--fs", eslARG_NONE,"default",NULL, NULL, STYLES, NULL, NULL, "multihit local alignment", 0 },
373 { "--sw", eslARG_NONE, FALSE, NULL, NULL, STYLES, NULL, NULL, "unihit local alignment", 0 },
374 { "--ls", eslARG_NONE, FALSE, NULL, NULL, STYLES, NULL, NULL, "multihit glocal alignment", 0 },
375 { "--s", eslARG_NONE, FALSE, NULL, NULL, STYLES, NULL, NULL, "unihit glocal alignment", 0 },
376 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
377 };
378 static char usage[] = "[-options] <hmmfile> <seqfile>";
379 static char banner[] = "example of posterior decoding, generic implementation";
380
381 static void dump_matrix_csv(FILE *fp, P7_GMX *pp, int istart, int iend, int kstart, int kend);
382
383 int
main(int argc,char ** argv)384 main(int argc, char **argv)
385 {
386 ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage);
387 char *hmmfile = esl_opt_GetArg(go, 1);
388 char *seqfile = esl_opt_GetArg(go, 2);
389 ESL_ALPHABET *abc = NULL;
390 P7_HMMFILE *hfp = NULL;
391 P7_HMM *hmm = NULL;
392 P7_BG *bg = NULL;
393 P7_PROFILE *gm = NULL;
394 P7_GMX *fwd = NULL;
395 P7_GMX *bck = NULL;
396 ESL_SQ *sq = NULL;
397 ESL_SQFILE *sqfp = NULL;
398 int format = eslSQFILE_UNKNOWN;
399 float fsc, bsc;
400
401 /* Read in one query profile */
402 if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
403 if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM");
404 p7_hmmfile_Close(hfp);
405
406 /* Read in one target sequence */
407 sq = esl_sq_CreateDigital(abc);
408 if (esl_sqfile_Open(seqfile, format, NULL, &sqfp) != eslOK) p7_Fail("Failed to open sequence file %s", seqfile);
409 if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence");
410 esl_sqfile_Close(sqfp);
411
412 /* Configure a profile from the HMM */
413 bg = p7_bg_Create(abc);
414 gm = p7_profile_Create(hmm->M, abc);
415
416 /* Now reconfig the model however we were asked to */
417 if (esl_opt_GetBoolean(go, "--fs")) p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL);
418 else if (esl_opt_GetBoolean(go, "--sw")) p7_ProfileConfig(hmm, bg, gm, sq->n, p7_UNILOCAL);
419 else if (esl_opt_GetBoolean(go, "--ls")) p7_ProfileConfig(hmm, bg, gm, sq->n, p7_GLOCAL);
420 else if (esl_opt_GetBoolean(go, "--s")) p7_ProfileConfig(hmm, bg, gm, sq->n, p7_UNIGLOCAL);
421
422 /* Allocate matrices */
423 fwd = p7_gmx_Create(gm->M, sq->n);
424 bck = p7_gmx_Create(gm->M, sq->n);
425
426 /* Set the profile and null model's target length models */
427 p7_bg_SetLength(bg, sq->n);
428 p7_ReconfigLength(gm, sq->n);
429
430 /* Run Forward, Backward */
431 p7_GForward (sq->dsq, sq->n, gm, fwd, &fsc);
432 p7_GBackward(sq->dsq, sq->n, gm, bck, &bsc);
433
434 /* Decoding: <bck> becomes the posterior probability mx */
435 p7_GDecoding(gm, fwd, bck, bck);
436
437 //p7_gmx_Dump(stdout, bck, p7_DEFAULT);
438 dump_matrix_csv(stdout, bck, 1, sq->n, 1, gm->M);
439
440 /* Cleanup */
441 esl_sq_Destroy(sq);
442 p7_profile_Destroy(gm);
443 p7_gmx_Destroy(fwd);
444 p7_gmx_Destroy(bck);
445 p7_bg_Destroy(bg);
446 p7_hmm_Destroy(hmm);
447 esl_alphabet_Destroy(abc);
448 esl_getopts_Destroy(go);
449 return 0;
450 }
451
452 static void
dump_matrix_csv(FILE * fp,P7_GMX * pp,int istart,int iend,int kstart,int kend)453 dump_matrix_csv(FILE *fp, P7_GMX *pp, int istart, int iend, int kstart, int kend)
454 {
455 int width = 7;
456 int precision = 5;
457 int i, k;
458 float val;
459
460 printf("i,");
461 for (k = kstart; k <= kend; k++)
462 printf("%-d%s", k, k==kend ? "\n" : ",");
463
464 for (i = istart; i <= iend; i++)
465 {
466 printf("%-d,", i);
467 for (k = kstart; k <= kend; k++)
468 {
469 val = pp->dp[i][k * p7G_NSCELLS + p7G_M] +
470 pp->dp[i][k * p7G_NSCELLS + p7G_I] +
471 pp->dp[i][k * p7G_NSCELLS + p7G_D];
472
473 fprintf(fp, "%*.*f%s", width, precision, val, k==kend ? "\n" : ", ");
474 }
475 }
476 }
477 #endif /*p7GENERIC_DECODING_EXAMPLE*/
478 /*------------------------ example ------------------------------*/
479
480
481
482