1 /************************************************************
2  * HMMER - Biological sequence analysis with profile-HMMs
3  * Copyright (C) 1992-1998 Washington University School of Medicine
4  *
5  *   This source code is distributed under the terms of the
6  *   GNU General Public License. See the files COPYING and
7  *   GNULICENSE for details.
8  *
9  ************************************************************/
10 
11 /* hmmsearch.c
12  * SRE, Tue Jan  7 17:19:20 1997
13  *
14  * Search a sequence database with a profile HMM.
15  * RCS $Id: hmmsearch.c,v 1.1.1.1 2001/06/18 13:59:50 birney Exp $
16  */
17 
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <limits.h>
22 #include <float.h>
23 
24 #include "structs.h"		/* data structures, macros, #define's   */
25 #include "config.h"		/* compile-time configuration constants */
26 #include "funcs.h"		/* function declarations                */
27 #include "globals.h"		/* alphabet global variables            */
28 #include "squid.h"		/* general sequence analysis library    */
29 
30 #ifdef MEMDEBUG
31 #include "dbmalloc.h"
32 #endif
33 
34 static char banner[] = "hmmsearch - search a sequence database with a profile HMM";
35 
36 static char usage[]  = "\
37 Usage: hmmsearch [-options] <hmmfile> <sequence file or database>\n\
38   Available options are:\n\
39    -h        : help; print brief help on version and usage\n\
40    -A <n>    : sets alignment output limit to <n> best domain alignments\n\
41    -E <x>    : sets E value cutoff (globE) to <x>\n\
42    -T <x>    : sets T bit threshold (globT) to <x>\n\
43    -Z <n>    : sets Z (# seqs) for E-value calculation\n\
44 ";
45 
46 static char experts[] = "\
47    --domE <x>: sets domain Eval cutoff (2nd threshold) to <x>\n\
48    --domT <x>: sets domain T bit thresh (2nd threshold) to <x>\n\
49    --forward : use the full Forward() algorithm instead of Viterbi\n\
50    --null2   : turn OFF the post hoc second null model\n\
51    --xnu     : turn ON XNU filtering of target protein sequences\n\
52 ";
53 
54 static struct opt_s OPTIONS[] = {
55   { "-h",        TRUE,  sqdARG_NONE },
56   { "-A",        TRUE,  sqdARG_INT  },
57   { "-E",        TRUE,  sqdARG_FLOAT},
58   { "-T",        TRUE,  sqdARG_FLOAT},
59   { "-Z",        TRUE,  sqdARG_INT },
60   { "--domE",    FALSE, sqdARG_FLOAT},
61   { "--domT",    FALSE, sqdARG_FLOAT},
62   { "--forward", FALSE, sqdARG_NONE },
63   { "--null2",   FALSE, sqdARG_NONE },
64   { "--xnu",     FALSE, sqdARG_NONE },
65 };
66 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
67 
68 static void record_domains(struct tophit_s *h,
69 			   struct plan7_s *hmm, char *dsq, SQINFO *sqinfo,
70 			   struct p7trace_s *tr, double whole_pval, float whole_sc,
71 			   int do_null2);
72 
73 int
main(int argc,char ** argv)74 main(int argc, char **argv)
75 {
76   char    *hmmfile;	        /* file to read HMM(s) from                */
77   HMMFILE *hmmfp;               /* opened hmmfile for reading              */
78   char    *seqfile;             /* file to read target sequence(s) from    */
79   SQFILE   *sqfp;               /* opened seqfile for reading              */
80   int       format;	        /* format of seqfile                       */
81   char     *seq;		/* target sequence                         */
82   SQINFO    sqinfo;	        /* optional info for seq                   */
83   char     *dsq;		/* digitized target sequence               */
84   int       i;
85   struct plan7_s  *hmm;         /* HMM to search with                      */
86   struct histogram_s *histogram;/* histogram of all scores                 */
87   struct p7trace_s  *tr;	/* traceback                               */
88   struct fancyali_s *ali;       /* displayed alignment info                */
89   struct tophit_s   *ghit;      /* list of top hits for whole sequences    */
90   struct tophit_s   *dhit;	/* list of top hits for domains            */
91 
92   float     sc;	        	/* score of an HMM search                  */
93   double  pvalue;		/* pvalue of an HMM score                  */
94   double  evalue;		/* evalue of an HMM score                  */
95   double  motherp;		/* pvalue of a whole seq HMM score         */
96   float   mothersc;		/* score of a whole seq parent of domain   */
97   int     sqfrom, sqto;		/* coordinates in sequence                 */
98   int     hmmfrom, hmmto;	/* coordinate in HMM                       */
99   char   *name, *desc;          /* hit sequence name and description       */
100   int     sqlen;		/* length of seq that was hit              */
101   int     nseq;			/* number of sequences searched            */
102   int     Z;			/* # of seqs for purposes of E-val calc    */
103   int     domidx;		/* number of this domain                   */
104   int     ndom;			/* total # of domains in this seq          */
105   int     namewidth;		/* max width of sequence name              */
106 
107   int    Alimit;		/* A parameter limiting output alignments   */
108   float  globT;			/* T parameter: keep only hits > globT bits */
109   double globE;			/* E parameter: keep hits < globE E-value   */
110   float  domT;			/* T parameter for individual domains       */
111   double domE;			/* E parameter for individual domains       */
112 
113   char *optname;                /* name of option found by Getopt()         */
114   char *optarg;                 /* argument found by Getopt()               */
115   int   optind;                 /* index in argv[]                          */
116   int   do_null2;		/* TRUE to adjust scores with null model #2 */
117   int   do_forward;		/* TRUE to use Forward() not Viterbi()      */
118   int   do_xnu;			/* TRUE to filter sequences thru XNU        */
119 
120 #ifdef MEMDEBUG
121   unsigned long histid1, histid2, orig_size, current_size;
122   orig_size = malloc_inuse(&histid1);
123   fprintf(stderr, "[... memory debugging is ON ...]\n");
124 #endif
125 
126   /***********************************************
127    * Parse command line
128    ***********************************************/
129 
130   do_forward  = FALSE;
131   do_null2    = TRUE;
132   do_xnu      = FALSE;
133   Z           = 0;
134 
135   Alimit      = INT_MAX;	/* no limit on alignment output     */
136   globE       = 10.0;		/* use a reasonable Eval threshold; */
137   globT       = -FLT_MAX;	/*   but no bit threshold,          */
138   domT        = -FLT_MAX;	/*   no domain bit threshold,       */
139   domE        = FLT_MAX;        /*   and no domain Eval threshold.  */
140 
141   while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
142                 &optind, &optname, &optarg))  {
143     if      (strcmp(optname, "-A") == 0)        Alimit     = atoi(optarg);
144     else if (strcmp(optname, "-E") == 0)        globE      = atof(optarg);
145     else if (strcmp(optname, "-T") == 0)        globT      = atof(optarg);
146     else if (strcmp(optname, "-Z") == 0)        Z          = atoi(optarg);
147     else if (strcmp(optname, "--domE")    == 0) domE       = atof(optarg);
148     else if (strcmp(optname, "--domT")    == 0) domT       = atof(optarg);
149     else if (strcmp(optname, "--forward") == 0) do_forward = TRUE;
150     else if (strcmp(optname, "--null2")   == 0) do_null2   = FALSE;
151     else if (strcmp(optname, "--xnu")     == 0) do_xnu     = TRUE;
152     else if (strcmp(optname, "-h") == 0) {
153       Banner(stdout, banner);
154       puts(usage);
155       puts(experts);
156       exit(0);
157     }
158   }
159   if (argc - optind != 2)
160     Die("Incorrect number of arguments.\n%s\n", usage);
161 
162   hmmfile = argv[optind++];
163   seqfile = argv[optind++];
164 
165   /***********************************************
166    * Open sequence database (might be in BLASTDB or current directory)
167    ***********************************************/
168 
169   if (! SeqfileFormat(seqfile, &format, "BLASTDB"))
170     switch (squid_errno) {
171     case SQERR_NOFILE:
172       Die("Sequence file %s could not be opened for reading", seqfile); break;
173     case SQERR_FORMAT:
174     default:
175       Die("Failed to determine format of sequence file %s", seqfile);
176     }
177   if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL)
178     Die("Failed to open sequence database file %s\n%s\n", seqfile, usage);
179 
180   /***********************************************
181    * Open HMM file (might be in HMMERDB or current directory).
182    * Read a single HMM from it. (Config HMM, if necessary).
183    ***********************************************/
184 
185   if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
186     Die("Failed to open HMM file %s\n%s", hmmfile, usage);
187   if (!HMMFileRead(hmmfp, &hmm))
188     Die("Failed to read any HMMs from %s\n", hmmfile);
189   if (hmm == NULL)
190     Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile);
191   P7Logoddsify(hmm, !do_forward);
192 
193   /***********************************************
194    * Show the banner
195    ***********************************************/
196 
197   Banner(stdout, banner);
198   printf(   "HMM file:                 %s [%s]\n", hmmfile, hmm->name);
199   printf(   "Sequence database:        %s\n", seqfile);
200   printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n");
201 
202   /***********************************************
203    * Search HMM against each sequence
204    ***********************************************/
205 
206   /* set up structures for storing output
207      */
208   histogram = AllocHistogram(-200, 200, 100);  /* keeps full histogram */
209   ghit      = AllocTophits(200);         /* per-seq hits: 200=lumpsize */
210   dhit      = AllocTophits(200);         /* domain hits:  200=lumpsize */
211 
212   nseq = 0;
213   while (ReadSeq(sqfp, format, &seq, &sqinfo))
214     {
215       /* Silently skip empty sequences, which, God help us, appear
216        * sometimes in wormpep. (AGB bug)
217        */
218       if (sqinfo.len == 0) continue;
219 
220       nseq++;
221       dsq = DigitizeSequence(seq, sqinfo.len);
222 
223       if (do_xnu) XNU(dsq, sqinfo.len);
224 
225       /* 1. Recover a trace by Viterbi.
226        */
227       if (P7ViterbiSize(sqinfo.len, hmm->M) <= RAMLIMIT)
228 	sc = P7Viterbi(dsq, sqinfo.len, hmm, &tr);
229       else
230 	sc = P7SmallViterbi(dsq, sqinfo.len, hmm, &tr);
231 
232       /* 2. If we're using Forward scores, do another DP
233        *    to get it; else, we already have a Viterbi score
234        *    in sc.
235        */
236       if (do_forward) sc = P7Forward(dsq, sqinfo.len, hmm, NULL);
237 
238       if (do_null2)  sc -= TraceScoreCorrection(hmm, tr, dsq);
239 
240 #if DEBUGLEVEL >= DEBUG_LOTS
241       P7PrintTrace(stdout, tr, hmm, dsq);
242 #endif
243 
244       /* 2. Store score/pvalue for global alignment; will sort on score.
245        *    Keep all domains in a significant sequence hit.
246        *    We can only make a lower bound estimate of E-value since
247        *    we don't know the final value of nseq yet.
248        */
249       pvalue = PValue(hmm, sc);
250       evalue = Z ? (double) Z * pvalue : (double) nseq * pvalue;
251       if (sc > globT && evalue < globE)
252 	{
253 	  RegisterHit(ghit, sc, pvalue, sc,
254 		      0., 0.,	                /* no mother seq */
255 		      sqinfo.name,
256 		      sqinfo.flags & SQINFO_DESC ? sqinfo.desc : NULL,
257 		      0,0,0,                	/* seq positions  */
258 		      0,0,0,	                /* HMM positions  */
259 		      0, TraceDomainNumber(tr), /* domain info    */
260 		      NULL);	                /* alignment info */
261 
262 	  record_domains(dhit, hmm, dsq, &sqinfo, tr, pvalue, sc, do_null2);
263 	}
264       AddToHistogram(histogram, sc);
265 
266       FreeSequence(seq, &sqinfo);
267       P7FreeTrace(tr);
268       free(dsq);
269     }
270 
271   /* We're done searching an HMM over the whole sequence database.
272    * Set the theoretical EVD curve in our histogram using
273    * calibration in the HMM, if available.
274    */
275   if (hmm->flags & PLAN7_STATS)
276     ExtremeValueSetHistogram(histogram, hmm->mu, hmm->lambda,
277 			     histogram->lowscore, histogram->highscore,
278 			     1.0, 0);
279   if (!Z) Z = nseq;		/* set Z for good now that we're done. */
280 
281   /* Now format and report our output
282    */
283 
284   /* 1. Report overall sequence hits (sorted on E-value) */
285   printf("\nQuery HMM:  %s  %s\n",
286 	 hmm->name, hmm->desc != NULL ? hmm->desc : "");
287   if (hmm->flags & PLAN7_STATS)
288     printf("  [HMM has been calibrated; E-values are empirical estimates]\n");
289   else
290     printf("  [No calibration for HMM; E-values are upper bounds]\n");
291 
292   FullSortTophits(ghit);
293   namewidth = MAX(8, TophitsMaxName(ghit));
294 
295   printf("\nScores for complete sequences (score includes all domains):\n");
296   printf("%-*s %-*s %7s %10s %3s\n", namewidth, "Sequence", 52-namewidth, "Description", "Score", "E-value", " N ");
297   printf("%-*s %-*s %7s %10s %3s\n", namewidth, "--------", 52-namewidth, "-----------", "-----", "-------", "---");
298   for (i = 0; i < ghit->num; i++)
299     {
300       GetRankedHit(ghit, i,
301 		   &pvalue, &sc, NULL, NULL,
302 		   &name, &desc,
303 		   NULL, NULL, NULL,               /* sequence positions */
304 		   NULL, NULL, NULL,               /* HMM positions      */
305 		   NULL, &ndom,	                   /* domain info        */
306 		   NULL);	                   /* alignment info     */
307       evalue = pvalue * (double) Z;
308       if (evalue < globE && sc > globT)
309 	printf("%-*s %-*.*s %7.1f %10.2g %3d\n",
310 	       namewidth, name,
311 	       52-namewidth, 52-namewidth, desc != NULL ? desc : "",
312 	       sc, evalue, ndom);
313       else if (evalue >= globE)
314 	{
315 	  if (i > 0) printf("\t[no more scores below E threshold]\n");
316 	  break;
317 	}
318       else if (sc <= globT)
319 	{
320 	  if (i > 0) printf("\t[no more scores above T threshold]");
321 	  break;
322 	}
323     }
324   if (i == 0) printf("\t[no hits above thresholds]\n");
325 
326 
327 
328 
329   /* 2. Report domain hits (also sorted on E-value) */
330   FullSortTophits(dhit);
331   namewidth = MAX(8, TophitsMaxName(dhit));
332 
333   printf("\nParsed for domains:\n");
334   printf("%-*s %7s %5s %5s    %5s %5s    %7s %8s\n",
335 	 namewidth, "Sequence", "Domain ", "seq-f", "seq-t", "hmm-f", "hmm-t", "score", "E-value");
336   printf("%-*s %7s %5s %5s    %5s %5s    %7s %8s\n",
337 	 namewidth, "--------", "-------", "-----", "-----", "-----", "-----", "-----", "-------");
338 
339   for (i = 0; i < dhit->num; i++)
340     {
341       GetRankedHit(dhit, i,
342 		   &pvalue, &sc, &motherp, &mothersc,
343 		   &name, NULL,
344 		   &sqfrom, &sqto, &sqlen,            /* seq position info  */
345 		   &hmmfrom, &hmmto, NULL,            /* HMM position info  */
346 		   &domidx, &ndom,                    /* domain info        */
347 		   NULL);	                      /* alignment info     */
348       evalue = pvalue * (double) Z;
349 
350       if (motherp * (double) Z >= globE || mothersc <= globT)
351 	continue;
352       else if (evalue < domE && sc > domT)
353 	printf("%-*s %3d/%-3d %5d %5d %c%c %5d %5d %c%c %7.1f %8.2g\n",
354 	       namewidth, name,
355 	       domidx, ndom,
356 	       sqfrom, sqto,
357 	       sqfrom == 1 ? '[' : '.', sqto == sqlen ? ']' : '.',
358 	       hmmfrom, hmmto,
359 	       hmmfrom == 1 ? '[':'.', hmmto == hmm->M ? ']' : '.',
360 	       sc, evalue);
361       else if (evalue >= domE) {
362 	if (i > 0) printf("\t[no more scores below domE threshold]\n");
363 	break;
364       }
365       else if (sc <= domT) {
366 	if (i > 0) printf("\t[no more scores above domT threshold]\n");
367 	break;
368       }
369     }
370   if (i == 0) printf("\t[no hits above thresholds\n");
371 
372 
373 
374   /* 3. Alignment output, also by domain.
375    *    dhits is already sorted and namewidth is set, from above code.
376    *    Number of displayed alignments is limited by Alimit parameter;
377    *    also by domE (evalue threshold), domT (score theshold).
378    */
379   if (Alimit != 0)
380     {
381       printf("\nAlignments of top-scoring domains:\n");
382       for (i = 0; i < dhit->num; i++)
383 	{
384 	  if (i == Alimit) break; /* limit to Alimit output alignments */
385 	  GetRankedHit(dhit, i,
386 		       &pvalue, &sc, &motherp, &mothersc,
387 		       &name, NULL,
388 		       &sqfrom, &sqto, &sqlen,            /* seq position info  */
389 		       &hmmfrom, &hmmto, NULL,            /* HMM position info  */
390 		       &domidx, &ndom,                    /* domain info        */
391 		       &ali);	                      /* alignment info     */
392 	  evalue = pvalue * (double) Z;
393 
394 	  if (motherp * (double) Z >= globE || mothersc <= globT)
395 	    continue;
396 	  else if (evalue < domE && sc > domT)
397 	    {
398 	      printf("%s: domain %d of %d, from %d to %d: score %.1f, E = %.2g\n",
399 		     name, domidx, ndom, sqfrom, sqto, sc, evalue);
400 	      PrintFancyAli(stdout, ali);
401 	    }
402 	  else if (evalue >= domE) {
403 	    if (i > 0) printf("\t[no more alignments below domE threshold\n");
404 	    break;
405 	  }
406 	  else if (sc <= domT) {
407 	    if (i > 0) printf("\t[no more alignments above domT threshold\n");
408 	    break;
409 	  }
410 	}
411       if (i == 0)      printf("\t[no hits above thresholds\n");
412       if (i == Alimit) printf("\t[output cut off at A = %d top alignments]\n", Alimit);
413     }
414 
415   /* 4. Histogram output */
416   printf("\nHistogram of all scores:\n");
417   PrintASCIIHistogram(stdout, histogram);
418 
419   /* 5. Tophits summaries, while developing...
420    */
421   printf("\nWhole sequence top hits:\n");
422   TophitsReport(ghit, globE, nseq);
423   printf("\nDomain top hits:\n");
424   TophitsReport(dhit, domE, nseq);
425 
426   /***********************************************
427    * Clean-up and exit.
428    ***********************************************/
429 
430   FreeHistogram(histogram);
431   HMMFileClose(hmmfp);
432   SeqfileClose(sqfp);
433   FreeTophits(ghit);
434   FreeTophits(dhit);
435   FreePlan7(hmm);
436   SqdClean();
437 
438 #ifdef MEMDEBUG
439   current_size = malloc_inuse(&histid2);
440   if (current_size != orig_size) malloc_list(2, histid1, histid2);
441   else fprintf(stderr, "[No memory leaks.]\n");
442 #endif
443   return 0;
444 }
445 
446 
447 /* Function: record_domains()
448  * Date:     SRE, Tue Nov  4 11:25:14 1997 [St. Louis]
449  *
450  * Purpose:  Decompose a trace, and register scores, P-values, alignments,
451  *           etc. for individual domains in a hitlist.
452  *
453  * Args:     hmm    - the HMM structure
454  *           dsq    - digitized sequence 1..L
455  *           sqinfo - contains name of sequence, etc.
456  *           tr     - traceback of the whole sequence aligned to HMM
457  *           whole_pval - P-value of complete alignment
458  *           whole_sc   - score of complete alignment (bits)
459  *           do_null2   - TRUE to use post hoc null model correction
460  *
461  * Return:   (void)
462  */
463 static void
record_domains(struct tophit_s * h,struct plan7_s * hmm,char * dsq,SQINFO * sqinfo,struct p7trace_s * tr,double whole_pval,float whole_sc,int do_null2)464 record_domains(struct tophit_s *h,
465 	       struct plan7_s *hmm, char *dsq, SQINFO *sqinfo,
466 	       struct p7trace_s *tr, double whole_pval, float whole_sc,
467 	       int do_null2)
468 {
469   struct p7trace_s **tarr;      /* array of per-domain traces */
470   struct fancyali_s *ali;       /* alignment of a domain      */
471   int ntr;			/* number of domain traces    */
472   int idx;			/* index for traces           */
473   int k1, k2;			/* start, stop coord in model */
474   int i1, i2;			/* start, stop in sequence    */
475   float  score;
476   double pvalue;
477 
478   TraceDecompose(tr, &tarr, &ntr);
479   if (ntr == 0) Die("TraceDecompose() screwup"); /* "can't happen" (!) */
480 
481   for (idx = 0; idx < ntr; idx++)
482     {
483       /* Get the score and bounds of the match.
484        */
485       score  = P7TraceScore(hmm, dsq, tarr[idx]);
486       if (do_null2)
487 	score -= TraceScoreCorrection(hmm, tarr[idx], dsq);
488       pvalue = PValue(hmm, score);
489       TraceSimpleBounds(tarr[idx], &i1, &i2, &k1, &k2);
490 
491 #if DEBUGLEVEL >= DEBUG_LOTS
492       P7PrintTrace(stdout, tarr[idx], hmm, dsq);
493 #endif
494 
495       /* Record the match. Use score as the sort key.
496        */
497       ali = CreateFancyAli(tarr[idx], hmm, dsq, sqinfo->name);
498       RegisterHit(h, score, pvalue, score, whole_pval, whole_sc,
499 		  sqinfo->name,
500 		  sqinfo->flags & SQINFO_DESC ? sqinfo->desc : NULL,
501 		  i1,i2, sqinfo->len,
502 		  k1,k2, hmm->M,
503 		  idx+1, ntr,
504 		  ali);
505     }
506   for (idx = 0; idx < ntr; idx++)
507     P7FreeTrace(tarr[idx]);
508   free(tarr);
509   return;
510 }
511