1 /************************************************************
2  * HMMER - Biological sequence analysis with profile-HMMs
3  * Copyright (C) 1992-1998 Washington University School of Medicine
4  *
5  *   This source code is distributed under the terms of the
6  *   GNU General Public License. See the files COPYING and
7  *   GNULICENSE for details.
8  *
9  ************************************************************/
10 
11 /* hmmalign.c
12  * SRE, Thu Dec 18 16:05:29 1997 [St. Louis]
13  *
14  * main() for aligning a set of sequences to an HMM.
15  * RCS $Id: hmmalign.c,v 1.1.1.1 2001/06/18 13:59:49 birney Exp $
16  */
17 
18 #include <stdio.h>
19 #include <stdlib.h>
20 
21 #include "structs.h"		/* data structures, macros, #define's   */
22 #include "config.h"		/* compile-time configuration constants */
23 #include "funcs.h"		/* function declarations                */
24 #include "globals.h"		/* alphabet global variables            */
25 #include "squid.h"		/* general sequence analysis library    */
26 
27 #ifdef MEMDEBUG
28 #include "dbmalloc.h"
29 #endif
30 
31 static char banner[] = "hmmalign - align sequences to an HMM profile";
32 
33 static char usage[]  = "\
34 Usage: hmmalign [-options] <hmm file> <sequence file>\n\
35 Available options are:\n\
36    -h     : help; print brief help on version and usage\n\
37    -m     : only print symbols aligned to match states\n\
38    -o <f> : save alignment in file <f> in SELEX format\n\
39    -q     : quiet - suppress verbose banner\n\
40 ";
41 
42 static char experts[] = "\
43 \n";
44 
45 static struct opt_s OPTIONS[] = {
46   { "-h", TRUE, sqdARG_NONE   },
47   { "-m", TRUE, sqdARG_NONE   } ,
48   { "-o", TRUE, sqdARG_STRING },
49   { "-q", TRUE, sqdARG_NONE   },
50 };
51 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
52 
53 int
main(int argc,char ** argv)54 main(int argc, char **argv)
55 {
56   char            *hmmfile;	/* file to read HMMs from                  */
57   HMMFILE         *hmmfp;       /* opened hmmfile for reading              */
58   char            *seqfile;     /* file to read target sequence from       */
59   int              format;	/* format of seqfile                       */
60   char           **rseq;        /* raw, unaligned sequences                */
61   SQINFO          *sqinfo;      /* info associated with sequences          */
62   char           **dsq;         /* digitized raw sequences                 */
63   int              nseq;        /* number of sequences                     */
64   char           **aseq;        /* aligned sequences                       */
65   AINFO            ainfo;       /* alignment information                   */
66   float           *wgt;         /* per-sequence weights                    */
67   int              i;
68   struct plan7_s    *hmm;       /* HMM to align to                         */
69   struct p7trace_s **tr;        /* traces for aligned sequences            */
70 
71   char *optname;                /* name of option found by Getopt()         */
72   char *optarg;                 /* argument found by Getopt()               */
73   int   optind;                 /* index in argv[]                          */
74   int   be_quiet;		/* TRUE to suppress verbose banner          */
75   int   matchonly;		/* TRUE to show only match state syms       */
76   char *outfile;                /* optional alignment output file           */
77   FILE *ofp;                    /* handle on alignment output file          */
78 
79 
80 #ifdef MEMDEBUG
81   unsigned long histid1, histid2, orig_size, current_size;
82   orig_size = malloc_inuse(&histid1);
83   fprintf(stderr, "[... memory debugging is ON ...]\n");
84 #endif
85 
86   /***********************************************
87    * Parse command line
88    ***********************************************/
89 
90   matchonly = FALSE;
91   outfile   = NULL;
92   be_quiet  = FALSE;
93 
94   while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
95                 &optind, &optname, &optarg))  {
96     if      (strcmp(optname, "-m") == 0) matchonly= TRUE;
97     else if (strcmp(optname, "-o") == 0) outfile  = optarg;
98     else if (strcmp(optname, "-q") == 0) be_quiet = TRUE;
99     else if (strcmp(optname, "-h") == 0)
100       {
101 	Banner(stdout, banner);
102 	puts(usage);
103 	puts(experts);
104 	exit(0);
105       }
106   }
107   if (argc - optind != 2)
108     Die("Incorrect number of arguments.\n%s\n", usage);
109 
110   hmmfile = argv[optind++];
111   seqfile = argv[optind++];
112 
113  /***********************************************
114   * Open HMM file (might be in HMMERDB or current directory).
115   * Read a single HMM from it.
116   ***********************************************/
117 
118   if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
119     Die("Failed to open HMM file %s\n%s", hmmfile, usage);
120   if (!HMMFileRead(hmmfp, &hmm))
121     Die("Failed to read any HMMs from %s\n", hmmfile);
122   HMMFileClose(hmmfp);
123   if (hmm == NULL)
124     Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile);
125   P7Logoddsify(hmm, TRUE);
126 
127   /***********************************************
128    * Open sequence file in current directory.
129    * Read all seqs from it.
130    ***********************************************/
131 
132   if (! SeqfileFormat(seqfile, &format, NULL))
133     switch (squid_errno) {
134     case SQERR_NOFILE:
135       Die("Sequence file %s could not be opened for reading", seqfile); /*FALLTHRU*/
136     case SQERR_FORMAT:
137     default:
138       Die("Failed to determine format of sequence file %s", seqfile);
139     }
140   if (! ReadMultipleRseqs(seqfile, format, &rseq, &sqinfo, &nseq))
141     Die("Failed to read any sequences from file %s", seqfile);
142 
143   /***********************************************
144    * Show the banner
145    ***********************************************/
146 
147   if (! be_quiet)
148     {
149       Banner(stdout, banner);
150       printf(   "HMM file:             %s\n", hmmfile);
151       printf(   "Sequence file:        %s\n", seqfile);
152       printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n");
153     }
154 
155   /***********************************************
156    * Do the work
157    ***********************************************/
158 
159   /* Allocations and initializations.
160    */
161   dsq = MallocOrDie(sizeof(char *) * nseq);
162   tr  = MallocOrDie(sizeof(struct p7trace_s *) * nseq);
163   wgt = MallocOrDie(sizeof(float) * nseq);
164   FSet(wgt, nseq, 1.0);
165 
166   /* Align each sequence to the model, collect traces
167    */
168   for (i = 0; i < nseq; i++)
169     {
170       dsq[i] = DigitizeSequence(rseq[i], sqinfo[i].len);
171 
172       if (P7ViterbiSize(sqinfo[i].len, hmm->M) <= RAMLIMIT)
173 	(void) P7Viterbi(dsq[i], sqinfo[i].len, hmm, &(tr[i]));
174       else
175 	(void) P7SmallViterbi(dsq[i], sqinfo[i].len, hmm, &(tr[i]));
176     }
177 
178   /* Turn traces into a multiple alignment
179    */
180   P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, matchonly,
181 		     &aseq, &ainfo);
182 
183   /***********************************************
184    * Output the alignment
185    ***********************************************/
186 
187   if (outfile != NULL && (ofp = fopen(outfile, "w")) != NULL)
188     {
189       WriteSELEX(ofp, aseq, &ainfo, 50);
190       printf("Alignment saved in file %s\n", outfile);
191       fclose(ofp);
192     }
193   else
194     WriteSELEX(stdout, aseq, &ainfo, 50);
195 
196   /***********************************************
197    * Cleanup and exit
198    ***********************************************/
199 
200   for (i = 0; i < nseq; i++)
201     {
202       P7FreeTrace(tr[i]);
203       FreeSequence(rseq[i], &(sqinfo[i]));
204       free(dsq[i]);
205     }
206   FreeAlignment(aseq, &ainfo);
207   FreePlan7(hmm);
208   free(sqinfo);
209   free(dsq);
210   free(wgt);
211   free(tr);
212 
213   SqdClean();
214 
215 #ifdef MEMDEBUG
216   current_size = malloc_inuse(&histid2);
217   if (current_size != orig_size) malloc_list(2, histid1, histid2);
218   else fprintf(stderr, "[No memory leaks.]\n");
219 #endif
220   return 0;
221 }
222