1 /************************************************************
2 * HMMER - Biological sequence analysis with profile-HMMs
3 * Copyright (C) 1992-1998 Washington University School of Medicine
4 *
5 * This source code is distributed under the terms of the
6 * GNU General Public License. See the files COPYING and
7 * GNULICENSE for details.
8 *
9 ************************************************************/
10
11 /* hmmsearch.c
12 * SRE, Tue Jan 7 17:19:20 1997
13 *
14 * Search a sequence database with a profile HMM.
15 * RCS $Id: hmmsearch.c,v 1.1.1.1 2001/06/18 13:59:50 birney Exp $
16 */
17
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <limits.h>
22 #include <float.h>
23
24 #include "structs.h" /* data structures, macros, #define's */
25 #include "config.h" /* compile-time configuration constants */
26 #include "funcs.h" /* function declarations */
27 #include "globals.h" /* alphabet global variables */
28 #include "squid.h" /* general sequence analysis library */
29
30 #ifdef MEMDEBUG
31 #include "dbmalloc.h"
32 #endif
33
34 static char banner[] = "hmmsearch - search a sequence database with a profile HMM";
35
36 static char usage[] = "\
37 Usage: hmmsearch [-options] <hmmfile> <sequence file or database>\n\
38 Available options are:\n\
39 -h : help; print brief help on version and usage\n\
40 -A <n> : sets alignment output limit to <n> best domain alignments\n\
41 -E <x> : sets E value cutoff (globE) to <x>\n\
42 -T <x> : sets T bit threshold (globT) to <x>\n\
43 -Z <n> : sets Z (# seqs) for E-value calculation\n\
44 ";
45
46 static char experts[] = "\
47 --domE <x>: sets domain Eval cutoff (2nd threshold) to <x>\n\
48 --domT <x>: sets domain T bit thresh (2nd threshold) to <x>\n\
49 --forward : use the full Forward() algorithm instead of Viterbi\n\
50 --null2 : turn OFF the post hoc second null model\n\
51 --xnu : turn ON XNU filtering of target protein sequences\n\
52 ";
53
54 static struct opt_s OPTIONS[] = {
55 { "-h", TRUE, sqdARG_NONE },
56 { "-A", TRUE, sqdARG_INT },
57 { "-E", TRUE, sqdARG_FLOAT},
58 { "-T", TRUE, sqdARG_FLOAT},
59 { "-Z", TRUE, sqdARG_INT },
60 { "--domE", FALSE, sqdARG_FLOAT},
61 { "--domT", FALSE, sqdARG_FLOAT},
62 { "--forward", FALSE, sqdARG_NONE },
63 { "--null2", FALSE, sqdARG_NONE },
64 { "--xnu", FALSE, sqdARG_NONE },
65 };
66 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
67
68 static void record_domains(struct tophit_s *h,
69 struct plan7_s *hmm, char *dsq, SQINFO *sqinfo,
70 struct p7trace_s *tr, double whole_pval, float whole_sc,
71 int do_null2);
72
73 int
main(int argc,char ** argv)74 main(int argc, char **argv)
75 {
76 char *hmmfile; /* file to read HMM(s) from */
77 HMMFILE *hmmfp; /* opened hmmfile for reading */
78 char *seqfile; /* file to read target sequence(s) from */
79 SQFILE *sqfp; /* opened seqfile for reading */
80 int format; /* format of seqfile */
81 char *seq; /* target sequence */
82 SQINFO sqinfo; /* optional info for seq */
83 char *dsq; /* digitized target sequence */
84 int i;
85 struct plan7_s *hmm; /* HMM to search with */
86 struct histogram_s *histogram;/* histogram of all scores */
87 struct p7trace_s *tr; /* traceback */
88 struct fancyali_s *ali; /* displayed alignment info */
89 struct tophit_s *ghit; /* list of top hits for whole sequences */
90 struct tophit_s *dhit; /* list of top hits for domains */
91
92 float sc; /* score of an HMM search */
93 double pvalue; /* pvalue of an HMM score */
94 double evalue; /* evalue of an HMM score */
95 double motherp; /* pvalue of a whole seq HMM score */
96 float mothersc; /* score of a whole seq parent of domain */
97 int sqfrom, sqto; /* coordinates in sequence */
98 int hmmfrom, hmmto; /* coordinate in HMM */
99 char *name, *desc; /* hit sequence name and description */
100 int sqlen; /* length of seq that was hit */
101 int nseq; /* number of sequences searched */
102 int Z; /* # of seqs for purposes of E-val calc */
103 int domidx; /* number of this domain */
104 int ndom; /* total # of domains in this seq */
105 int namewidth; /* max width of sequence name */
106
107 int Alimit; /* A parameter limiting output alignments */
108 float globT; /* T parameter: keep only hits > globT bits */
109 double globE; /* E parameter: keep hits < globE E-value */
110 float domT; /* T parameter for individual domains */
111 double domE; /* E parameter for individual domains */
112
113 char *optname; /* name of option found by Getopt() */
114 char *optarg; /* argument found by Getopt() */
115 int optind; /* index in argv[] */
116 int do_null2; /* TRUE to adjust scores with null model #2 */
117 int do_forward; /* TRUE to use Forward() not Viterbi() */
118 int do_xnu; /* TRUE to filter sequences thru XNU */
119
120 #ifdef MEMDEBUG
121 unsigned long histid1, histid2, orig_size, current_size;
122 orig_size = malloc_inuse(&histid1);
123 fprintf(stderr, "[... memory debugging is ON ...]\n");
124 #endif
125
126 /***********************************************
127 * Parse command line
128 ***********************************************/
129
130 do_forward = FALSE;
131 do_null2 = TRUE;
132 do_xnu = FALSE;
133 Z = 0;
134
135 Alimit = INT_MAX; /* no limit on alignment output */
136 globE = 10.0; /* use a reasonable Eval threshold; */
137 globT = -FLT_MAX; /* but no bit threshold, */
138 domT = -FLT_MAX; /* no domain bit threshold, */
139 domE = FLT_MAX; /* and no domain Eval threshold. */
140
141 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
142 &optind, &optname, &optarg)) {
143 if (strcmp(optname, "-A") == 0) Alimit = atoi(optarg);
144 else if (strcmp(optname, "-E") == 0) globE = atof(optarg);
145 else if (strcmp(optname, "-T") == 0) globT = atof(optarg);
146 else if (strcmp(optname, "-Z") == 0) Z = atoi(optarg);
147 else if (strcmp(optname, "--domE") == 0) domE = atof(optarg);
148 else if (strcmp(optname, "--domT") == 0) domT = atof(optarg);
149 else if (strcmp(optname, "--forward") == 0) do_forward = TRUE;
150 else if (strcmp(optname, "--null2") == 0) do_null2 = FALSE;
151 else if (strcmp(optname, "--xnu") == 0) do_xnu = TRUE;
152 else if (strcmp(optname, "-h") == 0) {
153 Banner(stdout, banner);
154 puts(usage);
155 puts(experts);
156 exit(0);
157 }
158 }
159 if (argc - optind != 2)
160 Die("Incorrect number of arguments.\n%s\n", usage);
161
162 hmmfile = argv[optind++];
163 seqfile = argv[optind++];
164
165 /***********************************************
166 * Open sequence database (might be in BLASTDB or current directory)
167 ***********************************************/
168
169 if (! SeqfileFormat(seqfile, &format, "BLASTDB"))
170 switch (squid_errno) {
171 case SQERR_NOFILE:
172 Die("Sequence file %s could not be opened for reading", seqfile); break;
173 case SQERR_FORMAT:
174 default:
175 Die("Failed to determine format of sequence file %s", seqfile);
176 }
177 if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL)
178 Die("Failed to open sequence database file %s\n%s\n", seqfile, usage);
179
180 /***********************************************
181 * Open HMM file (might be in HMMERDB or current directory).
182 * Read a single HMM from it. (Config HMM, if necessary).
183 ***********************************************/
184
185 if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
186 Die("Failed to open HMM file %s\n%s", hmmfile, usage);
187 if (!HMMFileRead(hmmfp, &hmm))
188 Die("Failed to read any HMMs from %s\n", hmmfile);
189 if (hmm == NULL)
190 Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile);
191 P7Logoddsify(hmm, !do_forward);
192
193 /***********************************************
194 * Show the banner
195 ***********************************************/
196
197 Banner(stdout, banner);
198 printf( "HMM file: %s [%s]\n", hmmfile, hmm->name);
199 printf( "Sequence database: %s\n", seqfile);
200 printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n");
201
202 /***********************************************
203 * Search HMM against each sequence
204 ***********************************************/
205
206 /* set up structures for storing output
207 */
208 histogram = AllocHistogram(-200, 200, 100); /* keeps full histogram */
209 ghit = AllocTophits(200); /* per-seq hits: 200=lumpsize */
210 dhit = AllocTophits(200); /* domain hits: 200=lumpsize */
211
212 nseq = 0;
213 while (ReadSeq(sqfp, format, &seq, &sqinfo))
214 {
215 /* Silently skip empty sequences, which, God help us, appear
216 * sometimes in wormpep. (AGB bug)
217 */
218 if (sqinfo.len == 0) continue;
219
220 nseq++;
221 dsq = DigitizeSequence(seq, sqinfo.len);
222
223 if (do_xnu) XNU(dsq, sqinfo.len);
224
225 /* 1. Recover a trace by Viterbi.
226 */
227 if (P7ViterbiSize(sqinfo.len, hmm->M) <= RAMLIMIT)
228 sc = P7Viterbi(dsq, sqinfo.len, hmm, &tr);
229 else
230 sc = P7SmallViterbi(dsq, sqinfo.len, hmm, &tr);
231
232 /* 2. If we're using Forward scores, do another DP
233 * to get it; else, we already have a Viterbi score
234 * in sc.
235 */
236 if (do_forward) sc = P7Forward(dsq, sqinfo.len, hmm, NULL);
237
238 if (do_null2) sc -= TraceScoreCorrection(hmm, tr, dsq);
239
240 #if DEBUGLEVEL >= DEBUG_LOTS
241 P7PrintTrace(stdout, tr, hmm, dsq);
242 #endif
243
244 /* 2. Store score/pvalue for global alignment; will sort on score.
245 * Keep all domains in a significant sequence hit.
246 * We can only make a lower bound estimate of E-value since
247 * we don't know the final value of nseq yet.
248 */
249 pvalue = PValue(hmm, sc);
250 evalue = Z ? (double) Z * pvalue : (double) nseq * pvalue;
251 if (sc > globT && evalue < globE)
252 {
253 RegisterHit(ghit, sc, pvalue, sc,
254 0., 0., /* no mother seq */
255 sqinfo.name,
256 sqinfo.flags & SQINFO_DESC ? sqinfo.desc : NULL,
257 0,0,0, /* seq positions */
258 0,0,0, /* HMM positions */
259 0, TraceDomainNumber(tr), /* domain info */
260 NULL); /* alignment info */
261
262 record_domains(dhit, hmm, dsq, &sqinfo, tr, pvalue, sc, do_null2);
263 }
264 AddToHistogram(histogram, sc);
265
266 FreeSequence(seq, &sqinfo);
267 P7FreeTrace(tr);
268 free(dsq);
269 }
270
271 /* We're done searching an HMM over the whole sequence database.
272 * Set the theoretical EVD curve in our histogram using
273 * calibration in the HMM, if available.
274 */
275 if (hmm->flags & PLAN7_STATS)
276 ExtremeValueSetHistogram(histogram, hmm->mu, hmm->lambda,
277 histogram->lowscore, histogram->highscore,
278 1.0, 0);
279 if (!Z) Z = nseq; /* set Z for good now that we're done. */
280
281 /* Now format and report our output
282 */
283
284 /* 1. Report overall sequence hits (sorted on E-value) */
285 printf("\nQuery HMM: %s %s\n",
286 hmm->name, hmm->desc != NULL ? hmm->desc : "");
287 if (hmm->flags & PLAN7_STATS)
288 printf(" [HMM has been calibrated; E-values are empirical estimates]\n");
289 else
290 printf(" [No calibration for HMM; E-values are upper bounds]\n");
291
292 FullSortTophits(ghit);
293 namewidth = MAX(8, TophitsMaxName(ghit));
294
295 printf("\nScores for complete sequences (score includes all domains):\n");
296 printf("%-*s %-*s %7s %10s %3s\n", namewidth, "Sequence", 52-namewidth, "Description", "Score", "E-value", " N ");
297 printf("%-*s %-*s %7s %10s %3s\n", namewidth, "--------", 52-namewidth, "-----------", "-----", "-------", "---");
298 for (i = 0; i < ghit->num; i++)
299 {
300 GetRankedHit(ghit, i,
301 &pvalue, &sc, NULL, NULL,
302 &name, &desc,
303 NULL, NULL, NULL, /* sequence positions */
304 NULL, NULL, NULL, /* HMM positions */
305 NULL, &ndom, /* domain info */
306 NULL); /* alignment info */
307 evalue = pvalue * (double) Z;
308 if (evalue < globE && sc > globT)
309 printf("%-*s %-*.*s %7.1f %10.2g %3d\n",
310 namewidth, name,
311 52-namewidth, 52-namewidth, desc != NULL ? desc : "",
312 sc, evalue, ndom);
313 else if (evalue >= globE)
314 {
315 if (i > 0) printf("\t[no more scores below E threshold]\n");
316 break;
317 }
318 else if (sc <= globT)
319 {
320 if (i > 0) printf("\t[no more scores above T threshold]");
321 break;
322 }
323 }
324 if (i == 0) printf("\t[no hits above thresholds]\n");
325
326
327
328
329 /* 2. Report domain hits (also sorted on E-value) */
330 FullSortTophits(dhit);
331 namewidth = MAX(8, TophitsMaxName(dhit));
332
333 printf("\nParsed for domains:\n");
334 printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n",
335 namewidth, "Sequence", "Domain ", "seq-f", "seq-t", "hmm-f", "hmm-t", "score", "E-value");
336 printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n",
337 namewidth, "--------", "-------", "-----", "-----", "-----", "-----", "-----", "-------");
338
339 for (i = 0; i < dhit->num; i++)
340 {
341 GetRankedHit(dhit, i,
342 &pvalue, &sc, &motherp, &mothersc,
343 &name, NULL,
344 &sqfrom, &sqto, &sqlen, /* seq position info */
345 &hmmfrom, &hmmto, NULL, /* HMM position info */
346 &domidx, &ndom, /* domain info */
347 NULL); /* alignment info */
348 evalue = pvalue * (double) Z;
349
350 if (motherp * (double) Z >= globE || mothersc <= globT)
351 continue;
352 else if (evalue < domE && sc > domT)
353 printf("%-*s %3d/%-3d %5d %5d %c%c %5d %5d %c%c %7.1f %8.2g\n",
354 namewidth, name,
355 domidx, ndom,
356 sqfrom, sqto,
357 sqfrom == 1 ? '[' : '.', sqto == sqlen ? ']' : '.',
358 hmmfrom, hmmto,
359 hmmfrom == 1 ? '[':'.', hmmto == hmm->M ? ']' : '.',
360 sc, evalue);
361 else if (evalue >= domE) {
362 if (i > 0) printf("\t[no more scores below domE threshold]\n");
363 break;
364 }
365 else if (sc <= domT) {
366 if (i > 0) printf("\t[no more scores above domT threshold]\n");
367 break;
368 }
369 }
370 if (i == 0) printf("\t[no hits above thresholds\n");
371
372
373
374 /* 3. Alignment output, also by domain.
375 * dhits is already sorted and namewidth is set, from above code.
376 * Number of displayed alignments is limited by Alimit parameter;
377 * also by domE (evalue threshold), domT (score theshold).
378 */
379 if (Alimit != 0)
380 {
381 printf("\nAlignments of top-scoring domains:\n");
382 for (i = 0; i < dhit->num; i++)
383 {
384 if (i == Alimit) break; /* limit to Alimit output alignments */
385 GetRankedHit(dhit, i,
386 &pvalue, &sc, &motherp, &mothersc,
387 &name, NULL,
388 &sqfrom, &sqto, &sqlen, /* seq position info */
389 &hmmfrom, &hmmto, NULL, /* HMM position info */
390 &domidx, &ndom, /* domain info */
391 &ali); /* alignment info */
392 evalue = pvalue * (double) Z;
393
394 if (motherp * (double) Z >= globE || mothersc <= globT)
395 continue;
396 else if (evalue < domE && sc > domT)
397 {
398 printf("%s: domain %d of %d, from %d to %d: score %.1f, E = %.2g\n",
399 name, domidx, ndom, sqfrom, sqto, sc, evalue);
400 PrintFancyAli(stdout, ali);
401 }
402 else if (evalue >= domE) {
403 if (i > 0) printf("\t[no more alignments below domE threshold\n");
404 break;
405 }
406 else if (sc <= domT) {
407 if (i > 0) printf("\t[no more alignments above domT threshold\n");
408 break;
409 }
410 }
411 if (i == 0) printf("\t[no hits above thresholds\n");
412 if (i == Alimit) printf("\t[output cut off at A = %d top alignments]\n", Alimit);
413 }
414
415 /* 4. Histogram output */
416 printf("\nHistogram of all scores:\n");
417 PrintASCIIHistogram(stdout, histogram);
418
419 /* 5. Tophits summaries, while developing...
420 */
421 printf("\nWhole sequence top hits:\n");
422 TophitsReport(ghit, globE, nseq);
423 printf("\nDomain top hits:\n");
424 TophitsReport(dhit, domE, nseq);
425
426 /***********************************************
427 * Clean-up and exit.
428 ***********************************************/
429
430 FreeHistogram(histogram);
431 HMMFileClose(hmmfp);
432 SeqfileClose(sqfp);
433 FreeTophits(ghit);
434 FreeTophits(dhit);
435 FreePlan7(hmm);
436 SqdClean();
437
438 #ifdef MEMDEBUG
439 current_size = malloc_inuse(&histid2);
440 if (current_size != orig_size) malloc_list(2, histid1, histid2);
441 else fprintf(stderr, "[No memory leaks.]\n");
442 #endif
443 return 0;
444 }
445
446
447 /* Function: record_domains()
448 * Date: SRE, Tue Nov 4 11:25:14 1997 [St. Louis]
449 *
450 * Purpose: Decompose a trace, and register scores, P-values, alignments,
451 * etc. for individual domains in a hitlist.
452 *
453 * Args: hmm - the HMM structure
454 * dsq - digitized sequence 1..L
455 * sqinfo - contains name of sequence, etc.
456 * tr - traceback of the whole sequence aligned to HMM
457 * whole_pval - P-value of complete alignment
458 * whole_sc - score of complete alignment (bits)
459 * do_null2 - TRUE to use post hoc null model correction
460 *
461 * Return: (void)
462 */
463 static void
record_domains(struct tophit_s * h,struct plan7_s * hmm,char * dsq,SQINFO * sqinfo,struct p7trace_s * tr,double whole_pval,float whole_sc,int do_null2)464 record_domains(struct tophit_s *h,
465 struct plan7_s *hmm, char *dsq, SQINFO *sqinfo,
466 struct p7trace_s *tr, double whole_pval, float whole_sc,
467 int do_null2)
468 {
469 struct p7trace_s **tarr; /* array of per-domain traces */
470 struct fancyali_s *ali; /* alignment of a domain */
471 int ntr; /* number of domain traces */
472 int idx; /* index for traces */
473 int k1, k2; /* start, stop coord in model */
474 int i1, i2; /* start, stop in sequence */
475 float score;
476 double pvalue;
477
478 TraceDecompose(tr, &tarr, &ntr);
479 if (ntr == 0) Die("TraceDecompose() screwup"); /* "can't happen" (!) */
480
481 for (idx = 0; idx < ntr; idx++)
482 {
483 /* Get the score and bounds of the match.
484 */
485 score = P7TraceScore(hmm, dsq, tarr[idx]);
486 if (do_null2)
487 score -= TraceScoreCorrection(hmm, tarr[idx], dsq);
488 pvalue = PValue(hmm, score);
489 TraceSimpleBounds(tarr[idx], &i1, &i2, &k1, &k2);
490
491 #if DEBUGLEVEL >= DEBUG_LOTS
492 P7PrintTrace(stdout, tarr[idx], hmm, dsq);
493 #endif
494
495 /* Record the match. Use score as the sort key.
496 */
497 ali = CreateFancyAli(tarr[idx], hmm, dsq, sqinfo->name);
498 RegisterHit(h, score, pvalue, score, whole_pval, whole_sc,
499 sqinfo->name,
500 sqinfo->flags & SQINFO_DESC ? sqinfo->desc : NULL,
501 i1,i2, sqinfo->len,
502 k1,k2, hmm->M,
503 idx+1, ntr,
504 ali);
505 }
506 for (idx = 0; idx < ntr; idx++)
507 P7FreeTrace(tarr[idx]);
508 free(tarr);
509 return;
510 }
511