1 /* Construction of new HMMs from multiple alignments.
2  *
3  * Two versions:
4  *    p7_Handmodelmaker() -- use #=RF annotation to indicate match columns
5  *    p7_Fastmodelmaker() -- Krogh/Haussler heuristic
6  *
7  * The maximum likelihood model construction algorithm that was in previous
8  * HMMER versions has been deprecated, at least for the moment.
9  *
10  * The meat of the model construction code is in matassign2hmm().
11  * The two model construction strategies simply label which columns
12  * are supposed to be match states, and then hand this info to
13  * matassign2hmm().
14  *
15  *
16  * Contents:
17  *    1. Exported API: model construction routines.
18  *    2. Private functions used in constructing models.
19  *    3. Unit tests.
20  *    4. Test driver.
21  *    5. Example.
22  */
23 #include "p7_config.h"
24 
25 #include <string.h>
26 
27 #include "easel.h"
28 #include "esl_alphabet.h"
29 #include "esl_msa.h"
30 #include "esl_msafile.h"
31 
32 #include "hmmer.h"
33 
34 static int do_modelmask( ESL_MSA *msa);
35 static int matassign2hmm(ESL_MSA *msa, int *matassign, P7_HMM **ret_hmm, P7_TRACE ***opt_tr);
36 static int annotate_model(P7_HMM *hmm, int *matassign, ESL_MSA *msa);
37 
38 /*****************************************************************
39  * 1. Exported API: model construction routines.
40  *****************************************************************/
41 
42 /* Function: p7_Handmodelmaker()
43  *
44  * Purpose:  Manual model construction.
45  *           Construct an HMM from a digital alignment, where the
46  *           <#=RF> line of the alignment file is used to indicate the
47  *           columns assigned to matches vs. inserts.
48  *
49  *           The <msa> must be in digital mode, and it must have
50  *           a reference annotation line.
51  *
52  *           NOTE: <p7_Handmodelmaker()> will slightly revise the
53  *           alignment if necessary, if the assignment of columns
54  *           implies DI and ID transitions.
55  *
56  *           Returns both the HMM in counts form (ready for applying
57  *           Dirichlet priors as the next step), and fake tracebacks
58  *           for each aligned sequence.
59  *
60  *           Models must have at least one node, so if the <msa> defined
61  *           no consensus columns, a <eslENORESULT> error is returned.
62  *
63  * Args:     msa     - multiple sequence alignment
64  *           bld     - holds information on regions requiring masking, optionally NULL -> no masking
65  *           ret_hmm - RETURN: counts-form HMM
66  *           opt_tr  - optRETURN: array of tracebacks for aseq's
67  *
68  * Return:   <eslOK> on success. <ret_hmm> and <opt_tr> are allocated
69  *           here, and must be free'd by caller.
70  *
71  *           Returns <eslENORESULT> if no consensus columns were annotated;
72  *           in this case, <ret_hmm> and <opt_tr> are returned NULL.
73  *
74  *           Returns <eslEFORMAT> if the <msa> doesn't have a reference
75  *           annotation line.
76  *
77  * Throws:   <eslEMEM> on allocation failure. Throws <eslEINVAL> if the <msa>
78  *           isn't in digital mode.
79  */
80 int
p7_Handmodelmaker(ESL_MSA * msa,P7_BUILDER * bld,P7_HMM ** ret_hmm,P7_TRACE *** opt_tr)81 p7_Handmodelmaker(ESL_MSA *msa, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
82 {
83   int        status;
84   int       *matassign = NULL;    /* MAT state assignments if 1; 1..alen */
85   int        apos;                /* counter for aligned columns         */
86 
87   if (! (msa->flags & eslMSA_DIGITAL)) ESL_XEXCEPTION(eslEINVAL, "need a digital msa");
88   if (msa->rf == NULL)                 return eslEFORMAT;
89 
90   ESL_ALLOC(matassign, sizeof(int) * (msa->alen+1));
91 
92   /* Watch for off-by-one. rf is [0..alen-1]; matassign is [1..alen] */
93   for (apos = 1; apos <= msa->alen; apos++)
94     matassign[apos] = (esl_abc_CIsGap(msa->abc, msa->rf[apos-1])? FALSE : TRUE);
95 
96   /* matassign2hmm leaves ret_hmm, opt_tr in their proper state: */
97   if ((status = matassign2hmm(msa, matassign, ret_hmm, opt_tr)) != eslOK) goto ERROR;
98 
99   free(matassign);
100   return eslOK;
101 
102  ERROR:
103   if (matassign != NULL) free(matassign);
104   return status;
105 }
106 
107 /* Function: p7_Fastmodelmaker()
108  *
109  * Purpose:  Heuristic model construction.
110  *           Construct an HMM from an alignment by a simple rule,
111  *           based on the fractional occupancy of each columns w/
112  *           residues vs gaps. Any column w/ a fractional
113  *           occupancy of $\geq$ <symfrac> is assigned as a MATCH column;
114  *           for instance, if thresh = 0.5, columns w/ $\geq$ 50\%
115  *           residues are assigned to match... roughly speaking.
116  *
117  *           "Roughly speaking" because sequences may be weighted
118  *           in the input <msa>, and because missing data symbols are
119  *           ignored, in order to deal with sequence fragments.
120  *
121  *           The <msa> must be in digital mode.
122  *
123  *           If the caller wants to designate any sequences as
124  *           fragments, it does so by converting all N-terminal and
125  *           C-terminal flanking gap symbols to missing data symbols.
126  *
127  *           NOTE: p7_Fastmodelmaker() will slightly revise the
128  *           alignment if the assignment of columns implies
129  *           DI and ID transitions.
130  *
131  *           Returns the HMM in counts form (ready for applying Dirichlet
132  *           priors as the next step). Also returns fake traceback
133  *           for each training sequence.
134  *
135  *           Models must have at least one node, so if the <msa> defined
136  *           no consensus columns, a <eslENORESULT> error is returned.
137  *
138  * Args:     msa       - multiple sequence alignment
139  *           symfrac   - threshold for residue occupancy; >= assigns MATCH
140  *           bld       - holds information on regions requiring masking, optionally NULL -> no masking
141  *           ret_hmm   - RETURN: counts-form HMM
142  *           opt_tr    - optRETURN: array of tracebacks for aseq's
143  *
144  * Return:   <eslOK> on success. ret_hmm and opt_tr allocated here,
145  *           and must be free'd by the caller (FreeTrace(tr[i]), free(tr),
146  *           FreeHMM(hmm)).
147  *
148  *           Returns <eslENORESULT> if no consensus columns were annotated;
149  *           in this case, <ret_hmm> and <opt_tr> are returned NULL.
150  *
151  * Throws:   <eslEMEM> on allocation failure; <eslEINVAL> if the
152  *           <msa> isn't in digital mode.
153  */
154 int
p7_Fastmodelmaker(ESL_MSA * msa,float symfrac,P7_BUILDER * bld,P7_HMM ** ret_hmm,P7_TRACE *** opt_tr)155 p7_Fastmodelmaker(ESL_MSA *msa, float symfrac, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
156 {
157   int      status;	     /* return status flag                  */
158   int     *matassign = NULL; /* MAT state assignments if 1; 1..alen */
159   int      idx;              /* counter over sequences              */
160   int      apos;             /* counter for aligned columns         */
161   float    r;		         /* weighted residue count              */
162   float    totwgt;	     /* weighted residue+gap count          */
163 
164   if (! (msa->flags & eslMSA_DIGITAL)) ESL_XEXCEPTION(eslEINVAL, "need digital MSA");
165 
166   /* Allocations: matassign is 1..alen array of bit flags.
167    */
168   ESL_ALLOC(matassign, sizeof(int)     * (msa->alen+1));
169 
170   /* Determine weighted sym freq in each column, set matassign[] accordingly.
171    */
172   for (apos = 1; apos <= msa->alen; apos++)
173     {
174       r = totwgt = 0.;
175       for (idx = 0; idx < msa->nseq; idx++)
176       {
177         if       (esl_abc_XIsResidue(msa->abc, msa->ax[idx][apos])) { r += msa->wgt[idx]; totwgt += msa->wgt[idx]; }
178         else if  (esl_abc_XIsGap(msa->abc,     msa->ax[idx][apos])) {                     totwgt += msa->wgt[idx]; }
179         else if  (esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) continue;
180       }
181       if (r > 0. && r / totwgt >= symfrac) matassign[apos] = TRUE;
182       else                                 matassign[apos] = FALSE;
183     }
184 
185 
186   /* Once we have matassign calculated, modelmakers behave
187    * the same; matassign2hmm() does this stuff (traceback construction,
188    * trace counting) and sets up ret_hmm and opt_tr.
189    */
190   if ((status = matassign2hmm(msa, matassign, ret_hmm, opt_tr)) != eslOK) {
191     fprintf (stderr, "hmm construction error during trace counting\n");
192     goto ERROR;
193   }
194 
195   free(matassign);
196   return eslOK;
197 
198  ERROR:
199   if (matassign != NULL) free(matassign);
200   return status;
201 }
202 
203 /*-------------------- end, exported API -------------------------*/
204 
205 
206 
207 
208 /*****************************************************************
209  * 2. Private functions used in constructing models.
210  *****************************************************************/
211 
212 
213 /* Function: do_modelmask()
214  *
215  * Purpose:  If the given <msa> has a MM CS line, mask (turn to
216  *           degenerate) residues in the msa positions associated
217  *           with the marked position in the MM (marked with 'm')
218  *
219  * Return:   <eslOK> on success.
220  *           <eslENORESULT> if error.
221  */
222 static int
do_modelmask(ESL_MSA * msa)223 do_modelmask( ESL_MSA *msa)
224 {
225   int i,j;
226 
227   if (msa->mm == NULL)  return eslOK;  //nothing to do
228 
229   for (i = 1; i <= msa->alen; i++) {
230     for (j = 0; j < msa->nseq; j++) {
231       if (msa->mm[i-1] == 'm') {
232         if (msa->ax[j][i] != msa->abc->K && msa->ax[j][i] != msa->abc->Kp-1) // if not gap
233           msa->ax[j][i] = msa->abc->Kp-3; //that's the degenerate "any character" (N for DNA, X for protein)
234       }
235     }
236   }
237   return eslOK;
238 }
239 
240 /* Function: matassign2hmm()
241  *
242  * Purpose:  Given an assignment of alignment columns to match vs.
243  *           insert, finish the final part of the model construction
244  *           calculation that is constant between model construction
245  *           algorithms.
246  *
247  * Args:     msa       - multiple sequence alignment
248  *           matassign - 1..alen bit flags for column assignments
249  *           ret_hmm   - RETURN: counts-form HMM
250  *           opt_tr    - optRETURN: array of tracebacks for aseq's
251  *
252  * Return:   <eslOK> on success.
253  *           <eslENORESULT> if no consensus columns are identified.
254  *
255  *           ret_hmm and opt_tr alloc'ed here.
256  */
257 static int
matassign2hmm(ESL_MSA * msa,int * matassign,P7_HMM ** ret_hmm,P7_TRACE *** opt_tr)258 matassign2hmm(ESL_MSA *msa, int *matassign, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
259 {
260   int        status;		/* return status                       */
261   P7_HMM    *hmm = NULL;        /* RETURN: new hmm                     */
262   P7_TRACE **tr  = NULL;        /* RETURN: 0..nseq-1 fake traces       */
263   int      M;                   /* length of new model in match states */
264   int      idx;                 /* counter over sequences              */
265   int      apos;                /* counter for aligned columns         */
266   char errbuf[eslERRBUFSIZE];
267 
268   /* apply the model mask in the 'GC MM' row */
269   do_modelmask(msa);
270 
271   /* How many match states in the HMM? */
272   for (M = 0, apos = 1; apos <= msa->alen; apos++)
273     if (matassign[apos]) M++;
274   if (M == 0) { status = eslENORESULT; goto ERROR; }
275 
276   /* Make fake tracebacks for each seq */
277   ESL_ALLOC(tr, sizeof(P7_TRACE *) * msa->nseq);
278   if ((status = p7_trace_FauxFromMSA(msa, matassign, p7_MSA_COORDS, tr))        != eslOK) goto ERROR;
279   for (idx = 0; idx < msa->nseq; idx++)
280     {
281       if ((status = p7_trace_Doctor(tr[idx], NULL, NULL))                       != eslOK) goto ERROR;
282       if ((status = p7_trace_Validate(tr[idx], msa->abc, msa->ax[idx], errbuf)) != eslOK)
283 	ESL_XEXCEPTION(eslFAIL, "validation failed: %s", errbuf);
284     }
285 
286   /* Build count model from tracebacks */
287   if ((hmm    = p7_hmm_Create(M, msa->abc)) == NULL)  { status = eslEMEM; goto ERROR; }
288   if ((status = p7_hmm_Zero(hmm))           != eslOK) goto ERROR;
289   for (idx = 0; idx < msa->nseq; idx++) {
290     if (tr[idx] == NULL) continue; /* skip rare examples of empty sequences */
291     if ((status = p7_trace_Count(hmm, msa->ax[idx], msa->wgt[idx], tr[idx])) != eslOK) goto ERROR;
292   }
293 
294   hmm->nseq     = msa->nseq;
295   hmm->eff_nseq = msa->nseq;
296 
297   /* Transfer annotation from the MSA to the new model
298    */
299   if ((status = annotate_model(hmm, matassign, msa)) != eslOK) goto ERROR;
300 
301   /* Reset #=RF line of alignment to reflect our assignment
302    * of match, delete. matassign is valid from 1..alen and is off
303    * by one from msa->rf.
304    */
305   if (msa->rf == NULL)  ESL_ALLOC(msa->rf, sizeof(char) * (msa->alen + 1));
306   for (apos = 1; apos <= msa->alen; apos++)
307     msa->rf[apos-1] = matassign[apos] ? 'x' : '.';
308   msa->rf[msa->alen] = '\0';
309 
310   if (opt_tr  != NULL) *opt_tr  = tr;
311   else                  p7_trace_DestroyArray(tr, msa->nseq);
312   *ret_hmm = hmm;
313   return eslOK;
314 
315  ERROR:
316   if (tr     != NULL) p7_trace_DestroyArray(tr, msa->nseq);
317   if (hmm    != NULL) p7_hmm_Destroy(hmm);
318   if (opt_tr != NULL) *opt_tr = NULL;
319   *ret_hmm = NULL;
320   return status;
321 }
322 
323 
324 
325 /* Function: annotate_model()
326  *
327  * Purpose:  Transfer rf, cs, and other optional annotation from the alignment
328  *           to the new model.
329  *
330  * Args:     hmm       - [M] new model to annotate
331  *           matassign - which alignment columns are MAT; [1..alen]
332  *           msa       - alignment, including annotation to transfer
333  *
334  * Return:   <eslOK> on success.
335  *
336  * Throws:   <eslEMEM> on allocation error.
337  */
338 static int
annotate_model(P7_HMM * hmm,int * matassign,ESL_MSA * msa)339 annotate_model(P7_HMM *hmm, int *matassign, ESL_MSA *msa)
340 {
341   int   apos;			/* position in matassign, 1.alen  */
342   int   k;			/* position in model, 1.M         */
343   int   status;
344 
345   /* Reference coord annotation  */
346   if (msa->rf != NULL) {
347     ESL_ALLOC(hmm->rf, sizeof(char) * (hmm->M+2));
348     hmm->rf[0] = ' ';
349     for (apos = k = 1; apos <= msa->alen; apos++)
350       if (matassign[apos]) hmm->rf[k++] = msa->rf[apos-1]; /* watch off-by-one in msa's rf */
351     hmm->rf[k] = '\0';
352     hmm->flags |= p7H_RF;
353   }
354 
355   /* Model mask annotation  */
356   if (msa->mm != NULL) {
357     ESL_ALLOC(hmm->mm, sizeof(char) * (hmm->M+2));
358     hmm->mm[0] = ' ';
359     for (apos = k = 1; apos <= msa->alen; apos++)
360       if (matassign[apos]) hmm->mm[k++] = ( msa->mm[apos-1] == '.' ? '-' : msa->mm[apos-1]) ;
361     hmm->mm[k] = '\0';
362     hmm->flags |= p7H_MMASK;
363   }
364 
365   /* Consensus structure annotation */
366   if (msa->ss_cons != NULL) {
367     ESL_ALLOC(hmm->cs, sizeof(char) * (hmm->M+2));
368     hmm->cs[0] = ' ';
369     for (apos = k = 1; apos <= msa->alen; apos++)
370       if (matassign[apos]) hmm->cs[k++] = msa->ss_cons[apos-1];
371     hmm->cs[k] = '\0';
372     hmm->flags |= p7H_CS;
373   }
374 
375   /* Surface accessibility annotation */
376   if (msa->sa_cons != NULL) {
377     ESL_ALLOC(hmm->ca, sizeof(char) * (hmm->M+2));
378     hmm->ca[0] = ' ';
379     for (apos = k = 1; apos <= msa->alen; apos++)
380       if (matassign[apos]) hmm->ca[k++] = msa->sa_cons[apos-1];
381     hmm->ca[k] = '\0';
382     hmm->flags |= p7H_CA;
383   }
384 
385   /* The alignment map (1..M in model, 1..alen in alignment) */
386   ESL_ALLOC(hmm->map, sizeof(int) * (hmm->M+1));
387   hmm->map[0] = 0;
388   for (apos = k = 1; apos <= msa->alen; apos++)
389     if (matassign[apos]) hmm->map[k++] = apos;
390   hmm->flags |= p7H_MAP;
391 
392   return eslOK;
393 
394  ERROR:
395   return status;
396 }
397 
398 /*****************************************************************
399  * 3. Unit tests.
400  *****************************************************************/
401 #ifdef p7BUILD_TESTDRIVE
402 
403 /* utest_basic()
404  * An MSA to ex{e,o}rcise past demons.
405  *   1. seq2 gives an I->end transition.
406  *   2. seq1 contains degenerate Z,X, exercising symbol counting
407  *      of degenerate residues.
408  */
409 static void
utest_basic(void)410 utest_basic(void)
411 {
412   char         *failmsg      = "failure in build.c::utest_basic() unit test";
413   char          msafile[16]  = "p7tmpXXXXXX"; /* tmpfile name template */
414   FILE         *ofp          = NULL;
415   ESL_ALPHABET *abc          = esl_alphabet_Create(eslAMINO);
416   ESL_MSAFILE  *afp          = NULL;
417   ESL_MSA      *msa          = NULL;
418   P7_HMM       *hmm          = NULL;
419   float         symfrac      = 0.5;
420 
421   if (esl_tmpfile_named(msafile, &ofp) != eslOK) esl_fatal(failmsg);
422   fprintf(ofp, "# STOCKHOLM 1.0\n");
423   fprintf(ofp, "#=GC RF --xxxxxxxxxxxxxxxx-xxx-x--\n");
424   fprintf(ofp, "seq1    --ACDEFGHIKLMNPZXS-TVW-Yyy\n");
425   fprintf(ofp, "seq2    aaACDEFGHIKLMNPQRS-TVWw---\n");
426   fprintf(ofp, "seq3    aaAC-EFGHIKLMNPQRS-TVW-Y--\n");
427   fprintf(ofp, "seq4    aaAC-EFGHIKLMNPQRS-TVW-Y--\n");
428   fprintf(ofp, "//\n");
429   fclose(ofp);
430 
431   if (esl_msafile_Open(&abc, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp) != eslOK) esl_fatal(failmsg);
432   if (esl_msafile_Read(afp, &msa)                                           != eslOK) esl_fatal(failmsg);
433   if (p7_Fastmodelmaker(msa, symfrac, NULL, &hmm, NULL)                     != eslOK) esl_fatal(failmsg);
434 
435   p7_hmm_Destroy(hmm);
436   esl_msa_Destroy(msa);
437   esl_msafile_Close(afp);
438   esl_alphabet_Destroy(abc);
439   remove(msafile);
440   return;
441 }
442 
443 /* utest_fragments()
444  * This exercises the building code that deals with fragments,
445  * creating traces with B->X->{MDI}k and {MDI}k->X->E
446  * transitions, and making sure we can make MSAs correctly
447  * from them using p7_tracealign_MSA(). This code was initially
448  * buggy when first written; bugs first detected by Elena,
449  * Nov 2009
450  */
451 static void
utest_fragments(void)452 utest_fragments(void)
453 {
454   char         *failmsg      = "failure in build.c::utest_fragments() unit test";
455   char          msafile[16]  = "p7tmpXXXXXX"; /* tmpfile name template */
456   FILE         *ofp          = NULL;
457   ESL_ALPHABET *abc          = esl_alphabet_Create(eslAMINO);
458   ESL_MSAFILE  *afp          = NULL;
459   ESL_MSA      *msa          = NULL;
460   ESL_MSA      *dmsa         = NULL;
461   ESL_MSA      *postmsa      = NULL;
462   P7_HMM       *hmm          = NULL;
463   P7_TRACE    **trarr        = NULL;
464   int           i;
465 
466   /* Write an MSA that tests fragment/missing data transitions.
467    * When built with Handmodelmaker (using the RF line):
468    *   seq1 forces B->X->Mk and Mk->X->E missing data transitions;
469    *   seq2 forces B->X->Ik and Ik->X->E missing data transitions;
470    *   seq3 forces B->X->Dk and Dk->X->E missing data transitions.
471    *
472    * The first two cases can arise from fragment definition in
473    * model construction, or in an input file.
474    *
475    * The X->Dk and Dk->X cases should never happen, but we don't
476    * prohibit them. They can only arise in an input file, because
477    * esl_msa_MarkFragments_old() converts everything before/after
478    * first/last residue to ~, and won't leave a gap character in
479    * between.
480    *
481    * There's nothing being tested by seq4 and seq5; they're just there.
482    */
483   if (esl_tmpfile_named(msafile, &ofp) != eslOK) esl_fatal(failmsg);
484   fprintf(ofp, "# STOCKHOLM 1.0\n");
485   fprintf(ofp, "#=GC RF xxxxx.xxxxxxxxxxxx.xxx\n");
486   fprintf(ofp, "seq1    ~~~~~~GHIKLMNPQRST~~~~\n");
487   fprintf(ofp, "seq2    ~~~~~aGHIKLMNPQRSTa~~~\n");
488   fprintf(ofp, "seq3    ~~~~~~-HIKLMNPQRS-~~~~\n");
489   fprintf(ofp, "seq4    ACDEF.GHIKLMNPQRST.VWY\n");
490   fprintf(ofp, "seq5    ACDEF.GHIKLMNPQRST.VWY\n");
491   fprintf(ofp, "//\n");
492   fclose(ofp);
493 
494   /* Read the original as text for comparison to postmsa. Make a digital copy for construction */
495   if (esl_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)!= eslOK) esl_fatal(failmsg);
496   if (esl_msafile_Read(afp, &msa)                                          != eslOK) esl_fatal(failmsg);
497   if ((dmsa = esl_msa_Clone(msa))                                          == NULL)  esl_fatal(failmsg);
498   if (esl_msa_Digitize(abc, dmsa, NULL)                                    != eslOK) esl_fatal(failmsg);
499 
500   if (p7_Handmodelmaker(dmsa, NULL, &hmm, &trarr)                                 != eslOK) esl_fatal(failmsg);
501   for (i = 0; i < dmsa->nseq; i++)
502     if (p7_trace_Validate(trarr[i], abc, dmsa->ax[i], NULL)                 != eslOK) esl_fatal(failmsg);
503 
504   /* The example is contrived such that the traces should give exactly the
505    * same (text) alignment as the input alignment; no tracedoctoring.
506    * Not a trivial test; for example, sequence 2 has a B->X->I transition that
507    * can be problematic to handle.
508    */
509   if (p7_tracealign_MSA(dmsa, trarr, hmm->M, p7_DEFAULT, &postmsa)          != eslOK) esl_fatal(failmsg);
510   for (i = 0; i < msa->nseq; i++)
511     if (strcmp(msa->aseq[i], postmsa->aseq[i]) != 0) esl_fatal(failmsg);
512 
513   p7_trace_DestroyArray(trarr, msa->nseq);
514   p7_hmm_Destroy(hmm);
515   esl_msa_Destroy(msa);
516   esl_msa_Destroy(dmsa);
517   esl_msa_Destroy(postmsa);
518   esl_msafile_Close(afp);
519   esl_alphabet_Destroy(abc);
520   remove(msafile);
521   return;
522 }
523 
524 #endif /*p7BUILD_TESTDRIVE*/
525 /*---------------------- end of unit tests -----------------------*/
526 
527 
528 
529 
530 /*****************************************************************
531  * 4. Test driver.
532  *****************************************************************/
533 
534 #ifdef p7BUILD_TESTDRIVE
535 /* gcc -g -Wall -Dp7BUILD_TESTDRIVE -I. -I../easel -L. -L../easel -o build_utest build.c -lhmmer -leasel -lm
536  */
537 #include "easel.h"
538 
539 #include "p7_config.h"
540 #include "hmmer.h"
541 
542 int
main(int argc,char ** argv)543 main(int argc, char **argv)
544 {
545   utest_basic();
546   utest_fragments();
547 
548   return eslOK;
549 }
550 
551 
552 #endif /*p7BUILD_TESTDRIVE*/
553 /*-------------------- end of test driver ---------------------*/
554 
555 
556 
557 /******************************************************************************
558  * 5. Example.
559  ******************************************************************************/
560 #ifdef p7BUILD_EXAMPLE
561 
562 #include "easel.h"
563 #include "esl_alphabet.h"
564 #include "esl_getopts.h"
565 
566 static ESL_OPTIONS options[] = {
567   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
568   { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",    0 },
569   { "--dna",     eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use DNA alphabet",                        0 },
570   { "--rna",     eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use RNA alphabet",                        0 },
571   { "--amino",   eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use protein alphabet",                    0 },
572   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
573 };
574 static char usage[]  = "[-options] <msafile>";
575 static char banner[] = "example for the build module";
576 
577 
578 int
main(int argc,char ** argv)579 main(int argc, char **argv)
580 {
581   ESL_GETOPTS  *go        = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage);
582   char         *msafile   = esl_opt_GetArg(go, 1);
583   int           fmt       = eslMSAFILE_UNKNOWN;
584   ESL_ALPHABET *abc       = NULL;
585   ESL_MSAFILE  *afp       = NULL;
586   ESL_MSA      *msa       = NULL;
587   P7_HMM       *hmm       = NULL;
588   P7_PRIOR     *prior     = NULL;
589   P7_TRACE    **trarr     = NULL;
590   P7_BG        *bg        = NULL;
591   P7_PROFILE   *gm        = NULL;
592   ESL_MSA      *postmsa   = NULL;
593   int           i;
594   int           status;
595 
596   /* Standard idioms for opening and reading a digital MSA. (See esl_msa.c example). */
597   if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
598   else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
599   else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
600 
601   if ((status = esl_msafile_Open(&abc, msafile, NULL, fmt, NULL, &afp)) != eslOK)
602     esl_msafile_OpenFailure(afp, status);
603 
604   bg  = p7_bg_Create(abc);
605 
606   switch (abc->type) {
607   case eslAMINO: prior = p7_prior_CreateAmino();      break;
608   case eslDNA:   prior = p7_prior_CreateNucleic();    break;
609   case eslRNA:   prior = p7_prior_CreateNucleic();    break;
610   default:       prior = p7_prior_CreateLaplace(abc); break;
611   }
612   if (prior == NULL) esl_fatal("Failed to initialize prior");
613 
614   while ((status = esl_msafile_Read(afp, &msa)) != eslEOF)
615     {
616       if (status != eslOK) esl_msafile_ReadFailure(afp, status);
617 
618       /* The modelmakers collect counts in an HMM structure */
619       status = p7_Handmodelmaker(msa, NULL, &hmm, &trarr);
620       if      (status == eslENORESULT) esl_fatal("no consensus columns in alignment %s\n",  msa->name);
621       else if (status != eslOK)        esl_fatal("failed to build HMM from alignment %s\n", msa->name);
622 
623       printf("COUNTS:\n");
624       p7_hmm_Dump(stdout, hmm);
625 
626       /* These counts, in combination with a prior, are converted to probability parameters */
627       status = p7_ParameterEstimation(hmm, prior);
628       if (status != eslOK)             esl_fatal("failed to parameterize HMM for %s", msa->name);
629 
630       printf("PROBABILITIES:\n");
631       p7_hmm_Dump(stdout, hmm);
632 
633       /* Just so we can dump a more informatively annotated trace - build a profile */
634       gm = p7_profile_Create(hmm->M, abc);
635       p7_ProfileConfig(hmm, bg, gm, 400, p7_LOCAL);
636 
637       /* Dump the individual traces */
638       for (i = 0; i < msa->nseq; i++)
639 	{
640 	  printf("Trace %d: %s\n", i+1, msa->sqname[i]);
641 	  p7_trace_Dump(stdout, trarr[i], gm, msa->ax[i]);
642 	}
643 
644       /* Create an MSA from the individual traces */
645       status = p7_tracealign_MSA(msa, trarr, hmm->M, p7_DEFAULT, &postmsa);
646       if (status != eslOK) esl_fatal("failed to create new MSA from traces\n");
647 
648       esl_msafile_Write(stdout, postmsa, eslMSAFILE_PFAM);
649 
650       p7_profile_Destroy(gm);
651       p7_hmm_Destroy(hmm);
652       p7_trace_DestroyArray(trarr, msa->nseq);
653       esl_msa_Destroy(postmsa);
654       esl_msa_Destroy(msa);
655     }
656 
657   esl_msafile_Close(afp);
658   p7_bg_Destroy(bg);
659   esl_alphabet_Destroy(abc);
660   esl_getopts_Destroy(go);
661   return 0;
662 }
663 
664 #endif /*p7BUILD_EXAMPLE*/
665 
666 
667 
668