1 /* cmpress: prepare a CM database for faster cmscan searches.
2 *
3 * EPN, Thu Jul 7 13:08:55 2011
4 * SRE, Fri Oct 17 11:24:26 2008 [Janelia] (hmmpress.c)
5 */
6 #include "esl_config.h"
7 #include "p7_config.h"
8 #include "config.h"
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13
14 #include "easel.h"
15 #include "esl_alphabet.h"
16 #include "esl_getopts.h"
17
18 #include "hmmer.h"
19
20 #include "infernal.h"
21
22 static ESL_OPTIONS options[] = {
23 /* name type default env range toggles reqs incomp help docgroup*/
24 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
25 { "-F", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "force: overwrite any previous pressed files", 0 },
26 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
27 };
28 static char usage[] = "[-options] <cmfile>";
29 static char banner[] = "prepare an CM database for faster cmscan searches";
30
31 /* cmpress creates four output files.
32 * Bundling their info into a structure streamlines creation and cleanup.
33 */
34 struct dbfiles {
35 char *mfile; // .i1m file: binary core CMs and HMM filters
36 char *ffile; // .i1f file: binary vectorized profile HMM filters, MSV filter part only
37 char *pfile; // .i1p file: binary vectorized profile HMM filters, remainder (excluding MSV filter part)
38 char *ssifile; // .i1i file: SSI index for retrieval from .i1m
39
40 FILE *mfp;
41 FILE *ffp;
42 FILE *pfp;
43 ESL_NEWSSI *nssi;
44 };
45
46 static struct dbfiles *open_dbfiles(ESL_GETOPTS *go, char *basename);
47 static void close_dbfiles(struct dbfiles *dbf, int status);
48
49 int
main(int argc,char ** argv)50 main(int argc, char **argv)
51 {
52 ESL_GETOPTS *go = cm_CreateDefaultApp(options, 1, argc, argv, banner, usage);
53 ESL_ALPHABET *abc = NULL;
54 char *cmfile = esl_opt_GetArg(go, 1);
55 CM_FILE *cmfp = NULL;
56 CM_t *cm = NULL;
57 P7_BG *bg = NULL;
58 P7_PROFILE *gm = NULL;
59 P7_OPROFILE *om = NULL;
60 struct dbfiles *dbf = NULL;
61 uint16_t fh = 0;
62 int ncm = 0;
63 uint64_t tot_clen = 0;
64 off_t cm_offset = 0;
65 off_t fp7_offset = 0;
66 char errbuf[eslERRBUFSIZE];
67 int status;
68
69 if (strcmp(cmfile, "-") == 0) cm_Fail("Can't use - for <cmfile> argument: can't index standard input\n");
70
71 status = cm_file_OpenNoDB(cmfile, NULL, FALSE, &cmfp, errbuf);
72 if (status == eslENOTFOUND) cm_Fail("File existence/permissions problem in trying to open CM file %s.\n%s\n", cmfile, errbuf);
73 else if (status == eslEFORMAT) cm_Fail("File format problem in trying to open CM file %s.\n%s\n", cmfile, errbuf);
74 else if (status != eslOK) cm_Fail("Unexpected error %d in opening CM file %s.\n%s\n", status, cmfile, errbuf);
75
76 if (cmfp->do_stdin || cmfp->do_gzip) cm_Fail("CM file %s must be a normal file, not gzipped or a stdin pipe", cmfile);
77
78 dbf = open_dbfiles(go, cmfile); // After this, we have to close_dbfiles() before exiting after any error. Don't leave partial/corrupt files.
79
80 if (esl_newssi_AddFile(dbf->nssi, cmfp->fname, 0, &fh) != eslOK) /* 0 = format code (CMs don't have any yet) */
81 ESL_XFAIL(status, errbuf, "Failed to add CM file %s to new SSI index\n", cmfp->fname);
82
83 printf("Working... ");
84 fflush(stdout);
85
86 while ((status = cm_file_Read(cmfp, TRUE, &abc, &cm)) == eslOK)
87 {
88 if (cm->name == NULL) ESL_XFAIL(eslEINVAL, errbuf, "Every CM must have a name to be indexed. Failed to find name of CM #%d\n", ncm+1);
89 if (cm->fp7 == NULL || (! (cm->flags & CMH_FP7))) ESL_XFAIL(eslEINVAL, errbuf, "CM %s (#%d) does not have a filter HMM", cm->name, ncm+1);
90
91 if (ncm == 0) { /* first time initialization, now that alphabet is known */
92 bg = p7_bg_Create(abc);
93 p7_bg_SetLength(bg, 400);
94 }
95
96 ncm++;
97 tot_clen += cm->clen;
98
99 gm = p7_profile_Create(cm->fp7->M, abc);
100 p7_ProfileConfig(cm->fp7, bg, gm, 400, p7_LOCAL);
101 om = p7_oprofile_Create(gm->M, abc);
102 p7_oprofile_Convert(gm, om);
103
104 if ((cm_offset = ftello(dbf->mfp)) == -1) ESL_XFAIL(eslESYS, errbuf, "Failed to ftello() current disk position of CM db file");
105 if ((om->offs[p7_FOFFSET] = ftello(dbf->ffp)) == -1) ESL_XFAIL(eslESYS, errbuf, "Failed to ftello() current disk position of MSV db file");
106 if ((om->offs[p7_POFFSET] = ftello(dbf->pfp)) == -1) ESL_XFAIL(eslESYS, errbuf, "Failed to ftello() current disk position of profile db file");
107
108 if ((status = esl_newssi_AddKey(dbf->nssi, cm->name, fh, cm_offset, 0, 0)) != eslOK) ESL_XFAIL(status, errbuf, "Failed to add key %s to SSI index", cm->name);
109 if (cm->acc) {
110 if ((status = esl_newssi_AddAlias(dbf->nssi, cm->acc, cm->name)) != eslOK) ESL_XFAIL(status, errbuf, "Failed to add secondary key %s to SSI index", cm->acc);
111 }
112
113 cm_file_WriteBinary(dbf->mfp, -1, cm, &fp7_offset);
114 /* write the oprofile after the CM, because we need to know fp7_offset first */
115 om->offs[p7_MOFFSET] = fp7_offset;
116 cm_p7_oprofile_Write(dbf->ffp, dbf->pfp, cm_offset, cm->clen, cm->W, CMCountNodetype(cm, MATP_nd), cm->fp7_evparam[CM_p7_GFMU], cm->fp7_evparam[CM_p7_GFLAMBDA], om);
117
118 FreeCM(cm);
119 p7_profile_Destroy(gm);
120 p7_oprofile_Destroy(om);
121 }
122 if (status == eslEFORMAT) ESL_XFAIL(status, errbuf, "bad file format in CM file %s", cmfile);
123 else if (status == eslEINCOMPAT) ESL_XFAIL(status, errbuf, "CM file %s contains different alphabets", cmfile);
124 else if (status != eslEOF) ESL_XFAIL(status, errbuf, "Unexpected error in reading CMs from %s", cmfile);
125
126 status = esl_newssi_Write(dbf->nssi);
127 if (status == eslEDUP) ESL_XFAIL(status, errbuf, "SSI index construction failed:\n %s", dbf->nssi->errbuf);
128 else if (status == eslERANGE) ESL_XFAIL(status, errbuf, "SSI index file size exceeds maximum allowed by your filesystem");
129 else if (status == eslESYS) ESL_XFAIL(status, errbuf, "SSI index sort failed:\n %s", dbf->nssi->errbuf);
130 else if (status != eslOK) ESL_XFAIL(status, errbuf, "SSI indexing failed:\n %s", dbf->nssi->errbuf);
131
132 printf("done.\n");
133 if (dbf->nssi->nsecondary > 0)
134 printf("Pressed and indexed %d CMs and p7 HMM filters (%ld names and %ld accessions).\n", ncm, (long) dbf->nssi->nprimary, (long) dbf->nssi->nsecondary);
135 else
136 printf("Pressed and indexed %d CMs and p7 HMM filters (%ld names).\n", ncm, (long) dbf->nssi->nprimary);
137 printf("Covariance models and p7 filters pressed into binary file: %s\n", dbf->mfile);
138 printf("SSI index for binary covariance model file: %s\n", dbf->ssifile);
139 printf("Optimized p7 filter profiles (MSV part) pressed into: %s\n", dbf->ffile);
140 printf("Optimized p7 filter profiles (remainder) pressed into: %s\n", dbf->pfile);
141
142 close_dbfiles(dbf, eslOK);
143 p7_bg_Destroy(bg);
144 cm_file_Close(cmfp);
145 esl_alphabet_Destroy(abc);
146 esl_getopts_Destroy(go);
147 exit(0);
148
149 ERROR:
150 close_dbfiles(dbf, status);
151 p7_bg_Destroy(bg);
152 cm_file_Close(cmfp);
153 esl_alphabet_Destroy(abc);
154 esl_getopts_Destroy(go);
155 exit(1);
156 }
157
158
159 static struct dbfiles *
open_dbfiles(ESL_GETOPTS * go,char * basename)160 open_dbfiles(ESL_GETOPTS *go, char *basename)
161 {
162 struct dbfiles *dbf = NULL;
163 int allow_overwrite = esl_opt_GetBoolean(go, "-F");
164 char errbuf[eslERRBUFSIZE];
165 int status;
166
167 if ( ( dbf = malloc(sizeof(struct dbfiles))) == NULL) ESL_XEXCEPTION(eslEMEM, "malloc() failed");
168 dbf->mfile = NULL;
169 dbf->ffile = NULL;
170 dbf->pfile = NULL;
171 dbf->ssifile = NULL;
172 dbf->mfp = NULL;
173 dbf->ffp = NULL;
174 dbf->pfp = NULL;
175 dbf->nssi = NULL;
176
177 if ( (status = esl_sprintf(&(dbf->ssifile), "%s.i1i", basename)) != eslOK) ESL_XFAIL(status, errbuf, "esl_sprintf() failed");
178 if ( (status = esl_sprintf(&(dbf->mfile), "%s.i1m", basename)) != eslOK) ESL_XFAIL(status, errbuf, "esl_sprintf() failed");
179 if ( (status = esl_sprintf(&(dbf->ffile), "%s.i1f", basename)) != eslOK) ESL_XFAIL(status, errbuf, "esl_sprintf() failed");
180 if ( (status = esl_sprintf(&(dbf->pfile), "%s.i1p", basename)) != eslOK) ESL_XFAIL(status, errbuf, "esl_sprintf() failed");
181
182 if (! allow_overwrite && esl_FileExists(dbf->ssifile)) ESL_XFAIL(eslEOVERWRITE, errbuf, "SSI index file %s already exists;\nDelete old cmpress indices first", dbf->ssifile);
183 if (! allow_overwrite && esl_FileExists(dbf->mfile)) ESL_XFAIL(eslEOVERWRITE, errbuf, "Binary HMM file %s already exists;\nDelete old cmpress indices first", dbf->mfile);
184 if (! allow_overwrite && esl_FileExists(dbf->ffile)) ESL_XFAIL(eslEOVERWRITE, errbuf, "Binary MSV filter file %s already exists\nDelete old cmpress indices first", dbf->ffile);
185 if (! allow_overwrite && esl_FileExists(dbf->pfile)) ESL_XFAIL(eslEOVERWRITE, errbuf, "Binary profile file %s already exists\nDelete old cmpress indices first", dbf->pfile);
186
187 status = esl_newssi_Open(dbf->ssifile, allow_overwrite, &(dbf->nssi));
188 if (status == eslENOTFOUND) ESL_XFAIL(status, errbuf, "failed to open SSI index %s", dbf->ssifile);
189 else if (status == eslEOVERWRITE) ESL_XFAIL(status, errbuf, "SSI index file %s already exists;\nDelete old cmpress indices first", basename);
190 else if (status != eslOK) ESL_XFAIL(status, errbuf, "failed to create a new SSI index");
191
192 if ((dbf->mfp = fopen(dbf->mfile, "wb")) == NULL) ESL_XFAIL(eslEWRITE, errbuf, "Failed to open binary CM file %s for writing", dbf->mfile);
193 if ((dbf->ffp = fopen(dbf->ffile, "wb")) == NULL) ESL_XFAIL(eslEWRITE, errbuf, "Failed to open binary MSV filter file %s for writing", dbf->ffile);
194 if ((dbf->pfp = fopen(dbf->pfile, "wb")) == NULL) ESL_XFAIL(eslEWRITE, errbuf, "Failed to open binary profile file %s for writing", dbf->pfile);
195
196 return dbf;
197
198 ERROR:
199 fprintf(stderr, "%s\n", errbuf);
200 close_dbfiles(dbf, status);
201 exit(1);
202 }
203
204
205 /* If status != eslOK, then in addition to free'ing memory, also
206 * remove the four output files.
207 */
208 static void
close_dbfiles(struct dbfiles * dbf,int status)209 close_dbfiles(struct dbfiles *dbf, int status)
210 {
211 if (dbf)
212 {
213 /* Close the output files first */
214 if (dbf->mfp) fclose(dbf->mfp);
215 if (dbf->ffp) fclose(dbf->ffp);
216 if (dbf->pfp) fclose(dbf->pfp);
217 if (dbf->nssi) esl_newssi_Close(dbf->nssi);
218
219 /* Then remove them, if status isn't OK. esl_newssi_Write() takes care of the ssifile. */
220 if (status != eslOK)
221 {
222 if (esl_FileExists(dbf->mfile)) remove(dbf->mfile);
223 if (esl_FileExists(dbf->ffile)) remove(dbf->ffile);
224 if (esl_FileExists(dbf->pfile)) remove(dbf->pfile);
225 }
226
227 /* Finally free their names, and the structure. */
228 if (dbf->mfile) free(dbf->mfile);
229 if (dbf->ffile) free(dbf->ffile);
230 if (dbf->pfile) free(dbf->pfile);
231 if (dbf->ssifile) free(dbf->ssifile);
232 free(dbf);
233 }
234 }
235
236