1 /* cmpress: prepare a CM database for faster cmscan searches.
2  *
3  * EPN, Thu Jul  7 13:08:55 2011
4  * SRE, Fri Oct 17 11:24:26 2008 [Janelia] (hmmpress.c)
5  */
6 #include "esl_config.h"
7 #include "p7_config.h"
8 #include "config.h"
9 
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 
14 #include "easel.h"
15 #include "esl_alphabet.h"
16 #include "esl_getopts.h"
17 
18 #include "hmmer.h"
19 
20 #include "infernal.h"
21 
22 static ESL_OPTIONS options[] = {
23   /* name           type      default  env  range     toggles      reqs   incomp  help   docgroup*/
24   { "-h",        eslARG_NONE,   FALSE, NULL, NULL,      NULL,      NULL,    NULL, "show brief help on version and usage",          0 },
25   { "-F",        eslARG_NONE,   FALSE, NULL, NULL,      NULL,      NULL,    NULL, "force: overwrite any previous pressed files",   0 },
26   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
27 };
28 static char usage[]  = "[-options] <cmfile>";
29 static char banner[] = "prepare an CM database for faster cmscan searches";
30 
31 /* cmpress creates four output files.
32  * Bundling their info into a structure streamlines creation and cleanup.
33  */
34 struct dbfiles {
35   char       *mfile;    // .i1m file: binary core CMs and HMM filters
36   char       *ffile;    // .i1f file: binary vectorized profile HMM filters, MSV filter part only
37   char       *pfile;    // .i1p file: binary vectorized profile HMM filters, remainder (excluding MSV filter part)
38   char       *ssifile;  // .i1i file: SSI index for retrieval from .i1m
39 
40   FILE       *mfp;
41   FILE       *ffp;
42   FILE       *pfp;
43   ESL_NEWSSI *nssi;
44 };
45 
46 static struct dbfiles *open_dbfiles(ESL_GETOPTS *go, char *basename);
47 static void            close_dbfiles(struct dbfiles *dbf, int status);
48 
49 int
main(int argc,char ** argv)50 main(int argc, char **argv)
51 {
52   ESL_GETOPTS    *go         = cm_CreateDefaultApp(options, 1, argc, argv, banner, usage);
53   ESL_ALPHABET   *abc        = NULL;
54   char           *cmfile     = esl_opt_GetArg(go, 1);
55   CM_FILE        *cmfp       = NULL;
56   CM_t           *cm         = NULL;
57   P7_BG          *bg         = NULL;
58   P7_PROFILE     *gm         = NULL;
59   P7_OPROFILE    *om         = NULL;
60   struct dbfiles *dbf        = NULL;
61   uint16_t        fh         = 0;
62   int             ncm        = 0;
63   uint64_t        tot_clen   = 0;
64   off_t           cm_offset  = 0;
65   off_t           fp7_offset = 0;
66   char            errbuf[eslERRBUFSIZE];
67   int             status;
68 
69   if (strcmp(cmfile, "-") == 0) cm_Fail("Can't use - for <cmfile> argument: can't index standard input\n");
70 
71   status = cm_file_OpenNoDB(cmfile, NULL, FALSE, &cmfp, errbuf);
72   if      (status == eslENOTFOUND) cm_Fail("File existence/permissions problem in trying to open CM file %s.\n%s\n", cmfile, errbuf);
73   else if (status == eslEFORMAT)   cm_Fail("File format problem in trying to open CM file %s.\n%s\n",                cmfile, errbuf);
74   else if (status != eslOK)        cm_Fail("Unexpected error %d in opening CM file %s.\n%s\n",                       status, cmfile, errbuf);
75 
76   if (cmfp->do_stdin || cmfp->do_gzip) cm_Fail("CM file %s must be a normal file, not gzipped or a stdin pipe", cmfile);
77 
78   dbf = open_dbfiles(go, cmfile);  // After this, we have to close_dbfiles() before exiting after any error. Don't leave partial/corrupt files.
79 
80   if (esl_newssi_AddFile(dbf->nssi, cmfp->fname, 0, &fh) != eslOK) /* 0 = format code (CMs don't have any yet) */
81     ESL_XFAIL(status, errbuf, "Failed to add CM file %s to new SSI index\n", cmfp->fname);
82 
83   printf("Working...    ");
84   fflush(stdout);
85 
86   while ((status = cm_file_Read(cmfp, TRUE, &abc, &cm)) == eslOK)
87     {
88       if (cm->name == NULL)                              ESL_XFAIL(eslEINVAL, errbuf, "Every CM must have a name to be indexed. Failed to find name of CM #%d\n", ncm+1);
89       if (cm->fp7  == NULL || (! (cm->flags & CMH_FP7))) ESL_XFAIL(eslEINVAL, errbuf, "CM %s (#%d) does not have a filter HMM", cm->name, ncm+1);
90 
91       if (ncm == 0) { /* first time initialization, now that alphabet is known */
92 	bg = p7_bg_Create(abc);
93 	p7_bg_SetLength(bg, 400);
94       }
95 
96       ncm++;
97       tot_clen += cm->clen;
98 
99       gm = p7_profile_Create(cm->fp7->M, abc);
100       p7_ProfileConfig(cm->fp7, bg, gm, 400, p7_LOCAL);
101       om = p7_oprofile_Create(gm->M, abc);
102       p7_oprofile_Convert(gm, om);
103 
104       if ((cm_offset            = ftello(dbf->mfp)) == -1) ESL_XFAIL(eslESYS, errbuf, "Failed to ftello() current disk position of CM db file");
105       if ((om->offs[p7_FOFFSET] = ftello(dbf->ffp)) == -1) ESL_XFAIL(eslESYS, errbuf, "Failed to ftello() current disk position of MSV db file");
106       if ((om->offs[p7_POFFSET] = ftello(dbf->pfp)) == -1) ESL_XFAIL(eslESYS, errbuf, "Failed to ftello() current disk position of profile db file");
107 
108       if ((status = esl_newssi_AddKey(dbf->nssi, cm->name, fh, cm_offset, 0, 0)) != eslOK) ESL_XFAIL(status,  errbuf, "Failed to add key %s to SSI index", cm->name);
109       if (cm->acc) {
110 	if ((status = esl_newssi_AddAlias(dbf->nssi, cm->acc, cm->name))         != eslOK) ESL_XFAIL(status,  errbuf, "Failed to add secondary key %s to SSI index", cm->acc);
111       }
112 
113       cm_file_WriteBinary(dbf->mfp, -1, cm, &fp7_offset);
114       /* write the oprofile after the CM, because we need to know fp7_offset first */
115       om->offs[p7_MOFFSET] = fp7_offset;
116       cm_p7_oprofile_Write(dbf->ffp, dbf->pfp, cm_offset, cm->clen, cm->W, CMCountNodetype(cm, MATP_nd), cm->fp7_evparam[CM_p7_GFMU], cm->fp7_evparam[CM_p7_GFLAMBDA], om);
117 
118       FreeCM(cm);
119       p7_profile_Destroy(gm);
120       p7_oprofile_Destroy(om);
121     }
122   if      (status == eslEFORMAT)   ESL_XFAIL(status, errbuf, "bad file format in CM file %s",             cmfile);
123   else if (status == eslEINCOMPAT) ESL_XFAIL(status, errbuf, "CM file %s contains different alphabets",   cmfile);
124   else if (status != eslEOF)       ESL_XFAIL(status, errbuf, "Unexpected error in reading CMs from %s",   cmfile);
125 
126   status = esl_newssi_Write(dbf->nssi);
127   if      (status == eslEDUP)     ESL_XFAIL(status, errbuf, "SSI index construction failed:\n  %s", dbf->nssi->errbuf);
128   else if (status == eslERANGE)   ESL_XFAIL(status, errbuf, "SSI index file size exceeds maximum allowed by your filesystem");
129   else if (status == eslESYS)     ESL_XFAIL(status, errbuf, "SSI index sort failed:\n  %s", dbf->nssi->errbuf);
130   else if (status != eslOK)       ESL_XFAIL(status, errbuf, "SSI indexing failed:\n  %s", dbf->nssi->errbuf);
131 
132   printf("done.\n");
133   if (dbf->nssi->nsecondary > 0)
134     printf("Pressed and indexed %d CMs and p7 HMM filters (%ld names and %ld accessions).\n", ncm, (long) dbf->nssi->nprimary, (long) dbf->nssi->nsecondary);
135   else
136     printf("Pressed and indexed %d CMs and p7 HMM filters (%ld names).\n", ncm, (long) dbf->nssi->nprimary);
137   printf("Covariance models and p7 filters pressed into binary file:  %s\n", dbf->mfile);
138   printf("SSI index for binary covariance model file:                 %s\n", dbf->ssifile);
139   printf("Optimized p7 filter profiles (MSV part)  pressed into:      %s\n", dbf->ffile);
140   printf("Optimized p7 filter profiles (remainder) pressed into:      %s\n", dbf->pfile);
141 
142   close_dbfiles(dbf, eslOK);
143   p7_bg_Destroy(bg);
144   cm_file_Close(cmfp);
145   esl_alphabet_Destroy(abc);
146   esl_getopts_Destroy(go);
147   exit(0);
148 
149  ERROR:
150   close_dbfiles(dbf, status);
151   p7_bg_Destroy(bg);
152   cm_file_Close(cmfp);
153   esl_alphabet_Destroy(abc);
154   esl_getopts_Destroy(go);
155   exit(1);
156 }
157 
158 
159 static struct dbfiles *
open_dbfiles(ESL_GETOPTS * go,char * basename)160 open_dbfiles(ESL_GETOPTS *go, char *basename)
161 {
162   struct dbfiles *dbf             = NULL;
163   int             allow_overwrite = esl_opt_GetBoolean(go, "-F");
164   char            errbuf[eslERRBUFSIZE];
165   int             status;
166 
167   if ( ( dbf = malloc(sizeof(struct dbfiles))) == NULL) ESL_XEXCEPTION(eslEMEM, "malloc() failed");
168   dbf->mfile   = NULL;
169   dbf->ffile   = NULL;
170   dbf->pfile   = NULL;
171   dbf->ssifile = NULL;
172   dbf->mfp     = NULL;
173   dbf->ffp     = NULL;
174   dbf->pfp     = NULL;
175   dbf->nssi    = NULL;
176 
177   if ( (status = esl_sprintf(&(dbf->ssifile), "%s.i1i", basename)) != eslOK) ESL_XFAIL(status, errbuf, "esl_sprintf() failed");
178   if ( (status = esl_sprintf(&(dbf->mfile),   "%s.i1m", basename)) != eslOK) ESL_XFAIL(status, errbuf, "esl_sprintf() failed");
179   if ( (status = esl_sprintf(&(dbf->ffile),   "%s.i1f", basename)) != eslOK) ESL_XFAIL(status, errbuf, "esl_sprintf() failed");
180   if ( (status = esl_sprintf(&(dbf->pfile),   "%s.i1p", basename)) != eslOK) ESL_XFAIL(status, errbuf, "esl_sprintf() failed");
181 
182   if (! allow_overwrite && esl_FileExists(dbf->ssifile)) ESL_XFAIL(eslEOVERWRITE, errbuf, "SSI index file %s already exists;\nDelete old cmpress indices first",        dbf->ssifile);
183   if (! allow_overwrite && esl_FileExists(dbf->mfile))   ESL_XFAIL(eslEOVERWRITE, errbuf, "Binary HMM file %s already exists;\nDelete old cmpress indices first",       dbf->mfile);
184   if (! allow_overwrite && esl_FileExists(dbf->ffile))   ESL_XFAIL(eslEOVERWRITE, errbuf, "Binary MSV filter file %s already exists\nDelete old cmpress indices first", dbf->ffile);
185   if (! allow_overwrite && esl_FileExists(dbf->pfile))   ESL_XFAIL(eslEOVERWRITE, errbuf, "Binary profile file %s already exists\nDelete old cmpress indices first",    dbf->pfile);
186 
187   status = esl_newssi_Open(dbf->ssifile, allow_overwrite, &(dbf->nssi));
188   if      (status == eslENOTFOUND)   ESL_XFAIL(status, errbuf, "failed to open SSI index %s", dbf->ssifile);
189   else if (status == eslEOVERWRITE)  ESL_XFAIL(status, errbuf, "SSI index file %s already exists;\nDelete old cmpress indices first", basename);
190   else if (status != eslOK)          ESL_XFAIL(status, errbuf, "failed to create a new SSI index");
191 
192   if ((dbf->mfp = fopen(dbf->mfile, "wb")) == NULL)  ESL_XFAIL(eslEWRITE, errbuf, "Failed to open binary CM file %s for writing",         dbf->mfile);
193   if ((dbf->ffp = fopen(dbf->ffile, "wb")) == NULL)  ESL_XFAIL(eslEWRITE, errbuf, "Failed to open binary MSV filter file %s for writing", dbf->ffile);
194   if ((dbf->pfp = fopen(dbf->pfile, "wb")) == NULL)  ESL_XFAIL(eslEWRITE, errbuf, "Failed to open binary profile file %s for writing",    dbf->pfile);
195 
196   return dbf;
197 
198  ERROR:
199   fprintf(stderr, "%s\n", errbuf);
200   close_dbfiles(dbf, status);
201   exit(1);
202 }
203 
204 
205 /* If status != eslOK, then in addition to free'ing memory, also
206  * remove the four output files.
207  */
208 static void
close_dbfiles(struct dbfiles * dbf,int status)209 close_dbfiles(struct dbfiles *dbf, int status)
210 {
211   if (dbf)
212     {
213       /* Close the output files first */
214       if (dbf->mfp)     fclose(dbf->mfp);
215       if (dbf->ffp)     fclose(dbf->ffp);
216       if (dbf->pfp)     fclose(dbf->pfp);
217       if (dbf->nssi)    esl_newssi_Close(dbf->nssi);
218 
219       /* Then remove them, if status isn't OK. esl_newssi_Write() takes care of the ssifile. */
220       if (status != eslOK)
221         {
222           if (esl_FileExists(dbf->mfile))   remove(dbf->mfile);
223           if (esl_FileExists(dbf->ffile))   remove(dbf->ffile);
224           if (esl_FileExists(dbf->pfile))   remove(dbf->pfile);
225         }
226 
227       /* Finally free their names, and the structure. */
228       if (dbf->mfile)   free(dbf->mfile);
229       if (dbf->ffile)   free(dbf->ffile);
230       if (dbf->pfile)   free(dbf->pfile);
231       if (dbf->ssifile) free(dbf->ssifile);
232       free(dbf);
233     }
234 }
235 
236