1 /* Construction of new HMMs from multiple alignments.
2 *
3 * Two versions:
4 * p7_Handmodelmaker() -- use #=RF annotation to indicate match columns
5 * p7_Fastmodelmaker() -- Krogh/Haussler heuristic
6 *
7 * The maximum likelihood model construction algorithm that was in previous
8 * HMMER versions has been deprecated, at least for the moment.
9 *
10 * The meat of the model construction code is in matassign2hmm().
11 * The two model construction strategies simply label which columns
12 * are supposed to be match states, and then hand this info to
13 * matassign2hmm().
14 *
15 *
16 * Contents:
17 * 1. Exported API: model construction routines.
18 * 2. Private functions used in constructing models.
19 * 3. Unit tests.
20 * 4. Test driver.
21 * 5. Example.
22 */
23 #include "p7_config.h"
24
25 #include <string.h>
26
27 #include "easel.h"
28 #include "esl_alphabet.h"
29 #include "esl_msa.h"
30 #include "esl_msafile.h"
31
32 #include "hmmer.h"
33
34 static int do_modelmask( ESL_MSA *msa);
35 static int matassign2hmm(ESL_MSA *msa, int *matassign, P7_HMM **ret_hmm, P7_TRACE ***opt_tr);
36 static int annotate_model(P7_HMM *hmm, int *matassign, ESL_MSA *msa);
37
38 /*****************************************************************
39 * 1. Exported API: model construction routines.
40 *****************************************************************/
41
42 /* Function: p7_Handmodelmaker()
43 *
44 * Purpose: Manual model construction.
45 * Construct an HMM from a digital alignment, where the
46 * <#=RF> line of the alignment file is used to indicate the
47 * columns assigned to matches vs. inserts.
48 *
49 * The <msa> must be in digital mode, and it must have
50 * a reference annotation line.
51 *
52 * NOTE: <p7_Handmodelmaker()> will slightly revise the
53 * alignment if necessary, if the assignment of columns
54 * implies DI and ID transitions.
55 *
56 * Returns both the HMM in counts form (ready for applying
57 * Dirichlet priors as the next step), and fake tracebacks
58 * for each aligned sequence.
59 *
60 * Models must have at least one node, so if the <msa> defined
61 * no consensus columns, a <eslENORESULT> error is returned.
62 *
63 * Args: msa - multiple sequence alignment
64 * bld - holds information on regions requiring masking, optionally NULL -> no masking
65 * ret_hmm - RETURN: counts-form HMM
66 * opt_tr - optRETURN: array of tracebacks for aseq's
67 *
68 * Return: <eslOK> on success. <ret_hmm> and <opt_tr> are allocated
69 * here, and must be free'd by caller.
70 *
71 * Returns <eslENORESULT> if no consensus columns were annotated;
72 * in this case, <ret_hmm> and <opt_tr> are returned NULL.
73 *
74 * Returns <eslEFORMAT> if the <msa> doesn't have a reference
75 * annotation line.
76 *
77 * Throws: <eslEMEM> on allocation failure. Throws <eslEINVAL> if the <msa>
78 * isn't in digital mode.
79 */
80 int
p7_Handmodelmaker(ESL_MSA * msa,P7_BUILDER * bld,P7_HMM ** ret_hmm,P7_TRACE *** opt_tr)81 p7_Handmodelmaker(ESL_MSA *msa, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
82 {
83 int status;
84 int *matassign = NULL; /* MAT state assignments if 1; 1..alen */
85 int apos; /* counter for aligned columns */
86
87 if (! (msa->flags & eslMSA_DIGITAL)) ESL_XEXCEPTION(eslEINVAL, "need a digital msa");
88 if (msa->rf == NULL) return eslEFORMAT;
89
90 ESL_ALLOC(matassign, sizeof(int) * (msa->alen+1));
91
92 /* Watch for off-by-one. rf is [0..alen-1]; matassign is [1..alen] */
93 for (apos = 1; apos <= msa->alen; apos++)
94 matassign[apos] = (esl_abc_CIsGap(msa->abc, msa->rf[apos-1])? FALSE : TRUE);
95
96 /* matassign2hmm leaves ret_hmm, opt_tr in their proper state: */
97 if ((status = matassign2hmm(msa, matassign, ret_hmm, opt_tr)) != eslOK) goto ERROR;
98
99 free(matassign);
100 return eslOK;
101
102 ERROR:
103 if (matassign != NULL) free(matassign);
104 return status;
105 }
106
107 /* Function: p7_Fastmodelmaker()
108 *
109 * Purpose: Heuristic model construction.
110 * Construct an HMM from an alignment by a simple rule,
111 * based on the fractional occupancy of each columns w/
112 * residues vs gaps. Any column w/ a fractional
113 * occupancy of $\geq$ <symfrac> is assigned as a MATCH column;
114 * for instance, if thresh = 0.5, columns w/ $\geq$ 50\%
115 * residues are assigned to match... roughly speaking.
116 *
117 * "Roughly speaking" because sequences may be weighted
118 * in the input <msa>, and because missing data symbols are
119 * ignored, in order to deal with sequence fragments.
120 *
121 * The <msa> must be in digital mode.
122 *
123 * If the caller wants to designate any sequences as
124 * fragments, it does so by converting all N-terminal and
125 * C-terminal flanking gap symbols to missing data symbols.
126 *
127 * NOTE: p7_Fastmodelmaker() will slightly revise the
128 * alignment if the assignment of columns implies
129 * DI and ID transitions.
130 *
131 * Returns the HMM in counts form (ready for applying Dirichlet
132 * priors as the next step). Also returns fake traceback
133 * for each training sequence.
134 *
135 * Models must have at least one node, so if the <msa> defined
136 * no consensus columns, a <eslENORESULT> error is returned.
137 *
138 * Args: msa - multiple sequence alignment
139 * symfrac - threshold for residue occupancy; >= assigns MATCH
140 * bld - holds information on regions requiring masking, optionally NULL -> no masking
141 * ret_hmm - RETURN: counts-form HMM
142 * opt_tr - optRETURN: array of tracebacks for aseq's
143 *
144 * Return: <eslOK> on success. ret_hmm and opt_tr allocated here,
145 * and must be free'd by the caller (FreeTrace(tr[i]), free(tr),
146 * FreeHMM(hmm)).
147 *
148 * Returns <eslENORESULT> if no consensus columns were annotated;
149 * in this case, <ret_hmm> and <opt_tr> are returned NULL.
150 *
151 * Throws: <eslEMEM> on allocation failure; <eslEINVAL> if the
152 * <msa> isn't in digital mode.
153 */
154 int
p7_Fastmodelmaker(ESL_MSA * msa,float symfrac,P7_BUILDER * bld,P7_HMM ** ret_hmm,P7_TRACE *** opt_tr)155 p7_Fastmodelmaker(ESL_MSA *msa, float symfrac, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
156 {
157 int status; /* return status flag */
158 int *matassign = NULL; /* MAT state assignments if 1; 1..alen */
159 int idx; /* counter over sequences */
160 int apos; /* counter for aligned columns */
161 float r; /* weighted residue count */
162 float totwgt; /* weighted residue+gap count */
163
164 if (! (msa->flags & eslMSA_DIGITAL)) ESL_XEXCEPTION(eslEINVAL, "need digital MSA");
165
166 /* Allocations: matassign is 1..alen array of bit flags.
167 */
168 ESL_ALLOC(matassign, sizeof(int) * (msa->alen+1));
169
170 /* Determine weighted sym freq in each column, set matassign[] accordingly.
171 */
172 for (apos = 1; apos <= msa->alen; apos++)
173 {
174 r = totwgt = 0.;
175 for (idx = 0; idx < msa->nseq; idx++)
176 {
177 if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][apos])) { r += msa->wgt[idx]; totwgt += msa->wgt[idx]; }
178 else if (esl_abc_XIsGap(msa->abc, msa->ax[idx][apos])) { totwgt += msa->wgt[idx]; }
179 else if (esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) continue;
180 }
181 if (r > 0. && r / totwgt >= symfrac) matassign[apos] = TRUE;
182 else matassign[apos] = FALSE;
183 }
184
185
186 /* Once we have matassign calculated, modelmakers behave
187 * the same; matassign2hmm() does this stuff (traceback construction,
188 * trace counting) and sets up ret_hmm and opt_tr.
189 */
190 if ((status = matassign2hmm(msa, matassign, ret_hmm, opt_tr)) != eslOK) {
191 fprintf (stderr, "hmm construction error during trace counting\n");
192 goto ERROR;
193 }
194
195 free(matassign);
196 return eslOK;
197
198 ERROR:
199 if (matassign != NULL) free(matassign);
200 return status;
201 }
202
203 /*-------------------- end, exported API -------------------------*/
204
205
206
207
208 /*****************************************************************
209 * 2. Private functions used in constructing models.
210 *****************************************************************/
211
212
213 /* Function: do_modelmask()
214 *
215 * Purpose: If the given <msa> has a MM CS line, mask (turn to
216 * degenerate) residues in the msa positions associated
217 * with the marked position in the MM (marked with 'm')
218 *
219 * Return: <eslOK> on success.
220 * <eslENORESULT> if error.
221 */
222 static int
do_modelmask(ESL_MSA * msa)223 do_modelmask( ESL_MSA *msa)
224 {
225 int i,j;
226
227 if (msa->mm == NULL) return eslOK; //nothing to do
228
229 for (i = 1; i <= msa->alen; i++) {
230 for (j = 0; j < msa->nseq; j++) {
231 if (msa->mm[i-1] == 'm') {
232 if (msa->ax[j][i] != msa->abc->K && msa->ax[j][i] != msa->abc->Kp-1) // if not gap
233 msa->ax[j][i] = msa->abc->Kp-3; //that's the degenerate "any character" (N for DNA, X for protein)
234 }
235 }
236 }
237 return eslOK;
238 }
239
240 /* Function: matassign2hmm()
241 *
242 * Purpose: Given an assignment of alignment columns to match vs.
243 * insert, finish the final part of the model construction
244 * calculation that is constant between model construction
245 * algorithms.
246 *
247 * Args: msa - multiple sequence alignment
248 * matassign - 1..alen bit flags for column assignments
249 * ret_hmm - RETURN: counts-form HMM
250 * opt_tr - optRETURN: array of tracebacks for aseq's
251 *
252 * Return: <eslOK> on success.
253 * <eslENORESULT> if no consensus columns are identified.
254 *
255 * ret_hmm and opt_tr alloc'ed here.
256 */
257 static int
matassign2hmm(ESL_MSA * msa,int * matassign,P7_HMM ** ret_hmm,P7_TRACE *** opt_tr)258 matassign2hmm(ESL_MSA *msa, int *matassign, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
259 {
260 int status; /* return status */
261 P7_HMM *hmm = NULL; /* RETURN: new hmm */
262 P7_TRACE **tr = NULL; /* RETURN: 0..nseq-1 fake traces */
263 int M; /* length of new model in match states */
264 int idx; /* counter over sequences */
265 int apos; /* counter for aligned columns */
266 char errbuf[eslERRBUFSIZE];
267
268 /* apply the model mask in the 'GC MM' row */
269 do_modelmask(msa);
270
271 /* How many match states in the HMM? */
272 for (M = 0, apos = 1; apos <= msa->alen; apos++)
273 if (matassign[apos]) M++;
274 if (M == 0) { status = eslENORESULT; goto ERROR; }
275
276 /* Make fake tracebacks for each seq */
277 ESL_ALLOC(tr, sizeof(P7_TRACE *) * msa->nseq);
278 if ((status = p7_trace_FauxFromMSA(msa, matassign, p7_MSA_COORDS, tr)) != eslOK) goto ERROR;
279 for (idx = 0; idx < msa->nseq; idx++)
280 {
281 if ((status = p7_trace_Doctor(tr[idx], NULL, NULL)) != eslOK) goto ERROR;
282 if ((status = p7_trace_Validate(tr[idx], msa->abc, msa->ax[idx], errbuf)) != eslOK)
283 ESL_XEXCEPTION(eslFAIL, "validation failed: %s", errbuf);
284 }
285
286 /* Build count model from tracebacks */
287 if ((hmm = p7_hmm_Create(M, msa->abc)) == NULL) { status = eslEMEM; goto ERROR; }
288 if ((status = p7_hmm_Zero(hmm)) != eslOK) goto ERROR;
289 for (idx = 0; idx < msa->nseq; idx++) {
290 if (tr[idx] == NULL) continue; /* skip rare examples of empty sequences */
291 if ((status = p7_trace_Count(hmm, msa->ax[idx], msa->wgt[idx], tr[idx])) != eslOK) goto ERROR;
292 }
293
294 hmm->nseq = msa->nseq;
295 hmm->eff_nseq = msa->nseq;
296
297 /* Transfer annotation from the MSA to the new model
298 */
299 if ((status = annotate_model(hmm, matassign, msa)) != eslOK) goto ERROR;
300
301 /* Reset #=RF line of alignment to reflect our assignment
302 * of match, delete. matassign is valid from 1..alen and is off
303 * by one from msa->rf.
304 */
305 if (msa->rf == NULL) ESL_ALLOC(msa->rf, sizeof(char) * (msa->alen + 1));
306 for (apos = 1; apos <= msa->alen; apos++)
307 msa->rf[apos-1] = matassign[apos] ? 'x' : '.';
308 msa->rf[msa->alen] = '\0';
309
310 if (opt_tr != NULL) *opt_tr = tr;
311 else p7_trace_DestroyArray(tr, msa->nseq);
312 *ret_hmm = hmm;
313 return eslOK;
314
315 ERROR:
316 if (tr != NULL) p7_trace_DestroyArray(tr, msa->nseq);
317 if (hmm != NULL) p7_hmm_Destroy(hmm);
318 if (opt_tr != NULL) *opt_tr = NULL;
319 *ret_hmm = NULL;
320 return status;
321 }
322
323
324
325 /* Function: annotate_model()
326 *
327 * Purpose: Transfer rf, cs, and other optional annotation from the alignment
328 * to the new model.
329 *
330 * Args: hmm - [M] new model to annotate
331 * matassign - which alignment columns are MAT; [1..alen]
332 * msa - alignment, including annotation to transfer
333 *
334 * Return: <eslOK> on success.
335 *
336 * Throws: <eslEMEM> on allocation error.
337 */
338 static int
annotate_model(P7_HMM * hmm,int * matassign,ESL_MSA * msa)339 annotate_model(P7_HMM *hmm, int *matassign, ESL_MSA *msa)
340 {
341 int apos; /* position in matassign, 1.alen */
342 int k; /* position in model, 1.M */
343 int status;
344
345 /* Reference coord annotation */
346 if (msa->rf != NULL) {
347 ESL_ALLOC(hmm->rf, sizeof(char) * (hmm->M+2));
348 hmm->rf[0] = ' ';
349 for (apos = k = 1; apos <= msa->alen; apos++)
350 if (matassign[apos]) hmm->rf[k++] = msa->rf[apos-1]; /* watch off-by-one in msa's rf */
351 hmm->rf[k] = '\0';
352 hmm->flags |= p7H_RF;
353 }
354
355 /* Model mask annotation */
356 if (msa->mm != NULL) {
357 ESL_ALLOC(hmm->mm, sizeof(char) * (hmm->M+2));
358 hmm->mm[0] = ' ';
359 for (apos = k = 1; apos <= msa->alen; apos++)
360 if (matassign[apos]) hmm->mm[k++] = ( msa->mm[apos-1] == '.' ? '-' : msa->mm[apos-1]) ;
361 hmm->mm[k] = '\0';
362 hmm->flags |= p7H_MMASK;
363 }
364
365 /* Consensus structure annotation */
366 if (msa->ss_cons != NULL) {
367 ESL_ALLOC(hmm->cs, sizeof(char) * (hmm->M+2));
368 hmm->cs[0] = ' ';
369 for (apos = k = 1; apos <= msa->alen; apos++)
370 if (matassign[apos]) hmm->cs[k++] = msa->ss_cons[apos-1];
371 hmm->cs[k] = '\0';
372 hmm->flags |= p7H_CS;
373 }
374
375 /* Surface accessibility annotation */
376 if (msa->sa_cons != NULL) {
377 ESL_ALLOC(hmm->ca, sizeof(char) * (hmm->M+2));
378 hmm->ca[0] = ' ';
379 for (apos = k = 1; apos <= msa->alen; apos++)
380 if (matassign[apos]) hmm->ca[k++] = msa->sa_cons[apos-1];
381 hmm->ca[k] = '\0';
382 hmm->flags |= p7H_CA;
383 }
384
385 /* The alignment map (1..M in model, 1..alen in alignment) */
386 ESL_ALLOC(hmm->map, sizeof(int) * (hmm->M+1));
387 hmm->map[0] = 0;
388 for (apos = k = 1; apos <= msa->alen; apos++)
389 if (matassign[apos]) hmm->map[k++] = apos;
390 hmm->flags |= p7H_MAP;
391
392 return eslOK;
393
394 ERROR:
395 return status;
396 }
397
398 /*****************************************************************
399 * 3. Unit tests.
400 *****************************************************************/
401 #ifdef p7BUILD_TESTDRIVE
402
403 /* utest_basic()
404 * An MSA to ex{e,o}rcise past demons.
405 * 1. seq2 gives an I->end transition.
406 * 2. seq1 contains degenerate Z,X, exercising symbol counting
407 * of degenerate residues.
408 */
409 static void
utest_basic(void)410 utest_basic(void)
411 {
412 char *failmsg = "failure in build.c::utest_basic() unit test";
413 char msafile[16] = "p7tmpXXXXXX"; /* tmpfile name template */
414 FILE *ofp = NULL;
415 ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
416 ESL_MSAFILE *afp = NULL;
417 ESL_MSA *msa = NULL;
418 P7_HMM *hmm = NULL;
419 float symfrac = 0.5;
420
421 if (esl_tmpfile_named(msafile, &ofp) != eslOK) esl_fatal(failmsg);
422 fprintf(ofp, "# STOCKHOLM 1.0\n");
423 fprintf(ofp, "#=GC RF --xxxxxxxxxxxxxxxx-xxx-x--\n");
424 fprintf(ofp, "seq1 --ACDEFGHIKLMNPZXS-TVW-Yyy\n");
425 fprintf(ofp, "seq2 aaACDEFGHIKLMNPQRS-TVWw---\n");
426 fprintf(ofp, "seq3 aaAC-EFGHIKLMNPQRS-TVW-Y--\n");
427 fprintf(ofp, "seq4 aaAC-EFGHIKLMNPQRS-TVW-Y--\n");
428 fprintf(ofp, "//\n");
429 fclose(ofp);
430
431 if (esl_msafile_Open(&abc, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp) != eslOK) esl_fatal(failmsg);
432 if (esl_msafile_Read(afp, &msa) != eslOK) esl_fatal(failmsg);
433 if (p7_Fastmodelmaker(msa, symfrac, NULL, &hmm, NULL) != eslOK) esl_fatal(failmsg);
434
435 p7_hmm_Destroy(hmm);
436 esl_msa_Destroy(msa);
437 esl_msafile_Close(afp);
438 esl_alphabet_Destroy(abc);
439 remove(msafile);
440 return;
441 }
442
443 /* utest_fragments()
444 * This exercises the building code that deals with fragments,
445 * creating traces with B->X->{MDI}k and {MDI}k->X->E
446 * transitions, and making sure we can make MSAs correctly
447 * from them using p7_tracealign_MSA(). This code was initially
448 * buggy when first written; bugs first detected by Elena,
449 * Nov 2009
450 */
451 static void
utest_fragments(void)452 utest_fragments(void)
453 {
454 char *failmsg = "failure in build.c::utest_fragments() unit test";
455 char msafile[16] = "p7tmpXXXXXX"; /* tmpfile name template */
456 FILE *ofp = NULL;
457 ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
458 ESL_MSAFILE *afp = NULL;
459 ESL_MSA *msa = NULL;
460 ESL_MSA *dmsa = NULL;
461 ESL_MSA *postmsa = NULL;
462 P7_HMM *hmm = NULL;
463 P7_TRACE **trarr = NULL;
464 int i;
465
466 /* Write an MSA that tests fragment/missing data transitions.
467 * When built with Handmodelmaker (using the RF line):
468 * seq1 forces B->X->Mk and Mk->X->E missing data transitions;
469 * seq2 forces B->X->Ik and Ik->X->E missing data transitions;
470 * seq3 forces B->X->Dk and Dk->X->E missing data transitions.
471 *
472 * The first two cases can arise from fragment definition in
473 * model construction, or in an input file.
474 *
475 * The X->Dk and Dk->X cases should never happen, but we don't
476 * prohibit them. They can only arise in an input file, because
477 * esl_msa_MarkFragments_old() converts everything before/after
478 * first/last residue to ~, and won't leave a gap character in
479 * between.
480 *
481 * There's nothing being tested by seq4 and seq5; they're just there.
482 */
483 if (esl_tmpfile_named(msafile, &ofp) != eslOK) esl_fatal(failmsg);
484 fprintf(ofp, "# STOCKHOLM 1.0\n");
485 fprintf(ofp, "#=GC RF xxxxx.xxxxxxxxxxxx.xxx\n");
486 fprintf(ofp, "seq1 ~~~~~~GHIKLMNPQRST~~~~\n");
487 fprintf(ofp, "seq2 ~~~~~aGHIKLMNPQRSTa~~~\n");
488 fprintf(ofp, "seq3 ~~~~~~-HIKLMNPQRS-~~~~\n");
489 fprintf(ofp, "seq4 ACDEF.GHIKLMNPQRST.VWY\n");
490 fprintf(ofp, "seq5 ACDEF.GHIKLMNPQRST.VWY\n");
491 fprintf(ofp, "//\n");
492 fclose(ofp);
493
494 /* Read the original as text for comparison to postmsa. Make a digital copy for construction */
495 if (esl_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)!= eslOK) esl_fatal(failmsg);
496 if (esl_msafile_Read(afp, &msa) != eslOK) esl_fatal(failmsg);
497 if ((dmsa = esl_msa_Clone(msa)) == NULL) esl_fatal(failmsg);
498 if (esl_msa_Digitize(abc, dmsa, NULL) != eslOK) esl_fatal(failmsg);
499
500 if (p7_Handmodelmaker(dmsa, NULL, &hmm, &trarr) != eslOK) esl_fatal(failmsg);
501 for (i = 0; i < dmsa->nseq; i++)
502 if (p7_trace_Validate(trarr[i], abc, dmsa->ax[i], NULL) != eslOK) esl_fatal(failmsg);
503
504 /* The example is contrived such that the traces should give exactly the
505 * same (text) alignment as the input alignment; no tracedoctoring.
506 * Not a trivial test; for example, sequence 2 has a B->X->I transition that
507 * can be problematic to handle.
508 */
509 if (p7_tracealign_MSA(dmsa, trarr, hmm->M, p7_DEFAULT, &postmsa) != eslOK) esl_fatal(failmsg);
510 for (i = 0; i < msa->nseq; i++)
511 if (strcmp(msa->aseq[i], postmsa->aseq[i]) != 0) esl_fatal(failmsg);
512
513 p7_trace_DestroyArray(trarr, msa->nseq);
514 p7_hmm_Destroy(hmm);
515 esl_msa_Destroy(msa);
516 esl_msa_Destroy(dmsa);
517 esl_msa_Destroy(postmsa);
518 esl_msafile_Close(afp);
519 esl_alphabet_Destroy(abc);
520 remove(msafile);
521 return;
522 }
523
524 #endif /*p7BUILD_TESTDRIVE*/
525 /*---------------------- end of unit tests -----------------------*/
526
527
528
529
530 /*****************************************************************
531 * 4. Test driver.
532 *****************************************************************/
533
534 #ifdef p7BUILD_TESTDRIVE
535 /* gcc -g -Wall -Dp7BUILD_TESTDRIVE -I. -I../easel -L. -L../easel -o build_utest build.c -lhmmer -leasel -lm
536 */
537 #include "easel.h"
538
539 #include "p7_config.h"
540 #include "hmmer.h"
541
542 int
main(int argc,char ** argv)543 main(int argc, char **argv)
544 {
545 utest_basic();
546 utest_fragments();
547
548 return eslOK;
549 }
550
551
552 #endif /*p7BUILD_TESTDRIVE*/
553 /*-------------------- end of test driver ---------------------*/
554
555
556
557 /******************************************************************************
558 * 5. Example.
559 ******************************************************************************/
560 #ifdef p7BUILD_EXAMPLE
561
562 #include "easel.h"
563 #include "esl_alphabet.h"
564 #include "esl_getopts.h"
565
566 static ESL_OPTIONS options[] = {
567 /* name type default env range toggles reqs incomp help docgroup*/
568 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
569 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use DNA alphabet", 0 },
570 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use RNA alphabet", 0 },
571 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use protein alphabet", 0 },
572 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
573 };
574 static char usage[] = "[-options] <msafile>";
575 static char banner[] = "example for the build module";
576
577
578 int
main(int argc,char ** argv)579 main(int argc, char **argv)
580 {
581 ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage);
582 char *msafile = esl_opt_GetArg(go, 1);
583 int fmt = eslMSAFILE_UNKNOWN;
584 ESL_ALPHABET *abc = NULL;
585 ESL_MSAFILE *afp = NULL;
586 ESL_MSA *msa = NULL;
587 P7_HMM *hmm = NULL;
588 P7_PRIOR *prior = NULL;
589 P7_TRACE **trarr = NULL;
590 P7_BG *bg = NULL;
591 P7_PROFILE *gm = NULL;
592 ESL_MSA *postmsa = NULL;
593 int i;
594 int status;
595
596 /* Standard idioms for opening and reading a digital MSA. (See esl_msa.c example). */
597 if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
598 else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
599 else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
600
601 if ((status = esl_msafile_Open(&abc, msafile, NULL, fmt, NULL, &afp)) != eslOK)
602 esl_msafile_OpenFailure(afp, status);
603
604 bg = p7_bg_Create(abc);
605
606 switch (abc->type) {
607 case eslAMINO: prior = p7_prior_CreateAmino(); break;
608 case eslDNA: prior = p7_prior_CreateNucleic(); break;
609 case eslRNA: prior = p7_prior_CreateNucleic(); break;
610 default: prior = p7_prior_CreateLaplace(abc); break;
611 }
612 if (prior == NULL) esl_fatal("Failed to initialize prior");
613
614 while ((status = esl_msafile_Read(afp, &msa)) != eslEOF)
615 {
616 if (status != eslOK) esl_msafile_ReadFailure(afp, status);
617
618 /* The modelmakers collect counts in an HMM structure */
619 status = p7_Handmodelmaker(msa, NULL, &hmm, &trarr);
620 if (status == eslENORESULT) esl_fatal("no consensus columns in alignment %s\n", msa->name);
621 else if (status != eslOK) esl_fatal("failed to build HMM from alignment %s\n", msa->name);
622
623 printf("COUNTS:\n");
624 p7_hmm_Dump(stdout, hmm);
625
626 /* These counts, in combination with a prior, are converted to probability parameters */
627 status = p7_ParameterEstimation(hmm, prior);
628 if (status != eslOK) esl_fatal("failed to parameterize HMM for %s", msa->name);
629
630 printf("PROBABILITIES:\n");
631 p7_hmm_Dump(stdout, hmm);
632
633 /* Just so we can dump a more informatively annotated trace - build a profile */
634 gm = p7_profile_Create(hmm->M, abc);
635 p7_ProfileConfig(hmm, bg, gm, 400, p7_LOCAL);
636
637 /* Dump the individual traces */
638 for (i = 0; i < msa->nseq; i++)
639 {
640 printf("Trace %d: %s\n", i+1, msa->sqname[i]);
641 p7_trace_Dump(stdout, trarr[i], gm, msa->ax[i]);
642 }
643
644 /* Create an MSA from the individual traces */
645 status = p7_tracealign_MSA(msa, trarr, hmm->M, p7_DEFAULT, &postmsa);
646 if (status != eslOK) esl_fatal("failed to create new MSA from traces\n");
647
648 esl_msafile_Write(stdout, postmsa, eslMSAFILE_PFAM);
649
650 p7_profile_Destroy(gm);
651 p7_hmm_Destroy(hmm);
652 p7_trace_DestroyArray(trarr, msa->nseq);
653 esl_msa_Destroy(postmsa);
654 esl_msa_Destroy(msa);
655 }
656
657 esl_msafile_Close(afp);
658 p7_bg_Destroy(bg);
659 esl_alphabet_Destroy(abc);
660 esl_getopts_Destroy(go);
661 return 0;
662 }
663
664 #endif /*p7BUILD_EXAMPLE*/
665
666
667
668