1 /* phmmer: search a protein sequence against a protein database
2 */
3 #include "p7_config.h"
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8
9 #include "easel.h"
10 #include "esl_alphabet.h"
11 #include "esl_dmatrix.h"
12 #include "esl_getopts.h"
13 #include "esl_msa.h"
14 #include "esl_msafile.h"
15 #include "esl_scorematrix.h"
16 #include "esl_sq.h"
17 #include "esl_sqio.h"
18 #include "esl_stopwatch.h"
19
20 #ifdef HMMER_MPI
21 #include "mpi.h"
22 #include "esl_mpi.h"
23 #endif
24
25 #ifdef HMMER_THREADS
26 #include <unistd.h>
27 #include "esl_threads.h"
28 #include "esl_workqueue.h"
29 #endif
30
31 #include "hmmer.h"
32
33 typedef struct {
34 #ifdef HMMER_THREADS
35 ESL_WORK_QUEUE *queue;
36 #endif
37 P7_BG *bg;
38 P7_PIPELINE *pli;
39 P7_TOPHITS *th;
40 P7_OPROFILE *om;
41 } WORKER_INFO;
42
43 #define REPOPTS "-E,-T,--cut_ga,--cut_nc,--cut_tc"
44 #define DOMREPOPTS "--domE,--domT,--cut_ga,--cut_nc,--cut_tc"
45 #define INCOPTS "--incE,--incT,--cut_ga,--cut_nc,--cut_tc"
46 #define INCDOMOPTS "--incdomE,--incdomT,--cut_ga,--cut_nc,--cut_tc"
47 #define THRESHOPTS "-E,-T,--domE,--domT,--incE,--incT,--incdomE,--incdomT,--cut_ga,--cut_nc,--cut_tc"
48
49 #if defined (HMMER_THREADS) && defined (HMMER_MPI)
50 #define CPUOPTS "--mpi"
51 #define MPIOPTS "--cpu"
52 #else
53 #define CPUOPTS NULL
54 #define MPIOPTS NULL
55 #endif
56
57 static ESL_OPTIONS options[] = {
58 /* name type default env range toggles reqs incomp help docgroup*/
59 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 1 },
60 /* Control of output */
61 { "-o", eslARG_OUTFILE, NULL, NULL, NULL, NULL, NULL, NULL, "direct output to file <f>, not stdout", 2 },
62 { "-A", eslARG_OUTFILE, NULL, NULL, NULL, NULL, NULL, NULL, "save multiple alignment of hits to file <f>", 2 },
63 { "--tblout", eslARG_OUTFILE, NULL, NULL, NULL, NULL, NULL, NULL, "save parseable table of per-sequence hits to file <f>", 2 },
64 { "--domtblout", eslARG_OUTFILE, NULL, NULL, NULL, NULL, NULL, NULL, "save parseable table of per-domain hits to file <f>", 2 },
65 { "--pfamtblout", eslARG_OUTFILE, NULL, NULL, NULL, NULL, NULL, NULL, "save table of hits and domains to file, in Pfam format <f>", 2 },
66 { "--acc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "prefer accessions over names in output", 2 },
67 { "--noali", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "don't output alignments, so output is smaller", 2 },
68 { "--notextw", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, "--textw", "unlimit ASCII text output line width", 2 },
69 { "--textw", eslARG_INT, "120", NULL, "n>=120", NULL, NULL, "--notextw", "set max width of ASCII text output lines", 2 },
70 /* Control of scoring system */
71 { "--popen", eslARG_REAL, "0.02", NULL, "0<=x<0.5",NULL, NULL, NULL, "gap open probability", 3 },
72 { "--pextend", eslARG_REAL, "0.4", NULL, "0<=x<1", NULL, NULL, NULL, "gap extend probability", 3 },
73 { "--mx", eslARG_STRING, "BLOSUM62", NULL, NULL, NULL, NULL, "--mxfile", "substitution score matrix choice (of some built-in matrices)", 3 },
74 { "--mxfile", eslARG_INFILE, NULL, NULL, NULL, NULL, NULL, "--mx", "read substitution score matrix from file <f>", 3 },
75 /* Control of reporting thresholds */
76 { "-E", eslARG_REAL, "10.0", NULL, "x>0", NULL, NULL, REPOPTS, "report sequences <= this E-value threshold in output", 4 },
77 { "-T", eslARG_REAL, FALSE, NULL, NULL, NULL, NULL, REPOPTS, "report sequences >= this score threshold in output", 4 },
78 { "--domE", eslARG_REAL, "10.0", NULL, "x>0", NULL, NULL, DOMREPOPTS, "report domains <= this E-value threshold in output", 4 },
79 { "--domT", eslARG_REAL, FALSE, NULL, NULL, NULL, NULL, DOMREPOPTS, "report domains >= this score cutoff in output", 4 },
80 /* Control of inclusion thresholds */
81 { "--incE", eslARG_REAL, "0.01", NULL, "x>0", NULL, NULL, INCOPTS, "consider sequences <= this E-value threshold as significant", 5 },
82 { "--incT", eslARG_REAL, FALSE, NULL, NULL, NULL, NULL, INCOPTS, "consider sequences >= this score threshold as significant", 5 },
83 { "--incdomE", eslARG_REAL, "0.01", NULL, "x>0", NULL, NULL, INCDOMOPTS, "consider domains <= this E-value threshold as significant", 5 },
84 { "--incdomT", eslARG_REAL, FALSE, NULL, NULL, NULL, NULL, INCDOMOPTS, "consider domains >= this score threshold as significant", 5 },
85 /* Model-specific thresholding for both reporting and inclusion (unused in phmmer)*/
86 { "--cut_ga", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, THRESHOPTS, "use profile's GA gathering cutoffs to set all thresholding", 99 },
87 { "--cut_nc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, THRESHOPTS, "use profile's NC noise cutoffs to set all thresholding", 99 },
88 { "--cut_tc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, THRESHOPTS, "use profile's TC trusted cutoffs to set all thresholding", 99 },
89 /* Control of filter pipeline */
90 { "--max", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "--F1,--F2,--F3", "Turn all heuristic filters off (less speed, more power)", 7 },
91 { "--F1", eslARG_REAL, "0.02", NULL, NULL, NULL, NULL, "--max", "Stage 1 (MSV) threshold: promote hits w/ P <= F1", 7 },
92 { "--F2", eslARG_REAL, "1e-3", NULL, NULL, NULL, NULL, "--max", "Stage 2 (Vit) threshold: promote hits w/ P <= F2", 7 },
93 { "--F3", eslARG_REAL, "1e-5", NULL, NULL, NULL, NULL, "--max", "Stage 3 (Fwd) threshold: promote hits w/ P <= F3", 7 },
94 { "--nobias", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, "--max", "turn off composition bias filter", 7 },
95 /* Control of E-value calibration */
96 { "--EmL", eslARG_INT, "200", NULL,"n>0", NULL, NULL, NULL, "length of sequences for MSV Gumbel mu fit", 11 },
97 { "--EmN", eslARG_INT, "200", NULL,"n>0", NULL, NULL, NULL, "number of sequences for MSV Gumbel mu fit", 11 },
98 { "--EvL", eslARG_INT, "200", NULL,"n>0", NULL, NULL, NULL, "length of sequences for Viterbi Gumbel mu fit", 11 },
99 { "--EvN", eslARG_INT, "200", NULL,"n>0", NULL, NULL, NULL, "number of sequences for Viterbi Gumbel mu fit", 11 },
100 { "--EfL", eslARG_INT, "100", NULL,"n>0", NULL, NULL, NULL, "length of sequences for Forward exp tail tau fit", 11 },
101 { "--EfN", eslARG_INT, "200", NULL,"n>0", NULL, NULL, NULL, "number of sequences for Forward exp tail tau fit", 11 },
102 { "--Eft", eslARG_REAL, "0.04", NULL,"0<x<1", NULL, NULL, NULL, "tail mass for Forward exponential tail tau fit", 11 },
103 /* other options */
104 { "--nonull2", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, NULL, "turn off biased composition score corrections", 12 },
105 { "-Z", eslARG_REAL, FALSE, NULL, "x>0", NULL, NULL, NULL, "set # of comparisons done, for E-value calculation", 12 },
106 { "--domZ", eslARG_REAL, FALSE, NULL, "x>0", NULL, NULL, NULL, "set # of significant seqs, for domain E-value calculation", 12 },
107 { "--seed", eslARG_INT, "42", NULL, "n>=0", NULL, NULL, NULL, "set RNG seed to <n> (if 0: one-time arbitrary seed)", 12 },
108 { "--qformat", eslARG_STRING, NULL, NULL, NULL, NULL, NULL, NULL, "assert query <seqfile> is in format <s>: no autodetection", 12 },
109 { "--tformat", eslARG_STRING, NULL, NULL, NULL, NULL, NULL, NULL, "assert target <seqdb> is in format <s>>: no autodetection", 12 },
110 #ifdef HMMER_THREADS
111 { "--cpu", eslARG_INT, p7_NCPU,"HMMER_NCPU", "n>=0",NULL, NULL, CPUOPTS, "number of parallel CPU workers to use for multithreads", 12 },
112 #endif
113 #ifdef HMMER_MPI
114 { "--stall", eslARG_NONE, FALSE, NULL, NULL, NULL,"--mpi", NULL, "arrest after start: for debugging MPI under gdb", 12 },
115 { "--mpi", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, MPIOPTS, "run as an MPI parallel program", 12 },
116 #endif
117
118 /* Restrict search to subset of database - hidden because these flags are
119 * (a) currently for internal use
120 * (b) probably going to change
121 * Doesn't work with MPI
122 */
123 { "--restrictdb_stkey", eslARG_STRING, "0", NULL, NULL, NULL, NULL, NULL, "Search starts at the sequence with name <s> (not with MPI)", 99 },
124 { "--restrictdb_n",eslARG_INT, "-1", NULL, NULL, NULL, NULL, NULL, "Search <j> target sequences (starting at --restrictdb_stkey)", 99 },
125 { "--ssifile", eslARG_STRING, NULL, NULL, NULL, NULL, NULL, NULL, "restrictdb_x values require ssi file. Override default to <s>", 99 },
126
127 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
128 };
129 static char usage[] = "[-options] <seqfile> <seqdb>";
130 static char banner[] = "search a protein sequence against a protein database";
131
132 /* struct cfg_s : "Global" application configuration shared by all threads/processes
133 *
134 * This structure is passed to routines within main.c, as a means of semi-encapsulation
135 * of shared data amongst different parallel processes (threads or MPI processes).
136 */
137 struct cfg_s {
138 char *qfile; /* query sequence file */
139 char *dbfile; /* database file */
140
141 int do_mpi; /* TRUE if we're doing MPI parallelization */
142 int nproc; /* how many MPI processes, total */
143 int my_rank; /* who am I, in 0..nproc-1 */
144
145 char *firstseq_key; /* name of the first sequence in the restricted db range */
146 int n_targetseq; /* number of sequences in the restricted range */
147 };
148
149 static int serial_master(ESL_GETOPTS *go, struct cfg_s *cfg);
150 static int serial_loop (WORKER_INFO *info, ESL_SQFILE *dbfp, int n_targetseqs);
151
152 #ifdef HMMER_THREADS
153 #define BLOCK_SIZE 1000
154
155 static int thread_loop(ESL_THREADS *obj, ESL_WORK_QUEUE *queue, ESL_SQFILE *dbfp, int n_targetseqs);
156 static void pipeline_thread(void *arg);
157 #endif
158
159 #ifdef HMMER_MPI
160 static int mpi_master (ESL_GETOPTS *go, struct cfg_s *cfg);
161 static int mpi_worker (ESL_GETOPTS *go, struct cfg_s *cfg);
162 #endif
163
164 /* process_commandline()
165 * Take argc, argv, and options; parse the command line;
166 * display help/usage info.
167 */
168 static int
process_commandline(int argc,char ** argv,ESL_GETOPTS ** ret_go,char ** ret_qfile,char ** ret_dbfile)169 process_commandline(int argc, char **argv, ESL_GETOPTS **ret_go, char **ret_qfile, char **ret_dbfile)
170 {
171 ESL_GETOPTS *go = esl_getopts_Create(options);
172 int status;
173
174 if (esl_opt_ProcessEnvironment(go) != eslOK) { if (printf("Failed to process environment: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }
175 if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) { if (printf("Failed to parse command line: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }
176 if (esl_opt_VerifyConfig(go) != eslOK) { if (printf("Failed to parse command line: %s\n", go->errbuf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }
177
178 /* help format: */
179 if (esl_opt_GetBoolean(go, "-h") == TRUE)
180 {
181 p7_banner(stdout, argv[0], banner);
182 esl_usage(stdout, argv[0], usage);
183 if (puts("\nBasic options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
184 esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1= group; 2 = indentation; 120=textwidth*/
185
186 if (puts("\nOptions directing output:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
187 esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
188
189 if (puts("\nOptions controlling scoring system:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
190 esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
191
192 if (puts("\nOptions controlling reporting thresholds:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
193 esl_opt_DisplayHelp(stdout, go, 4, 2, 80);
194
195 if (puts("\nOptions controlling inclusion (significance) thresholds:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
196 esl_opt_DisplayHelp(stdout, go, 5, 2, 80);
197
198 if (puts("\nOptions controlling acceleration heuristics:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
199 esl_opt_DisplayHelp(stdout, go, 7, 2, 80);
200
201 if (puts("\nOptions controlling E value calibration:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
202 esl_opt_DisplayHelp(stdout, go, 11, 2, 80);
203
204 if (puts("\nOther expert options:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
205 esl_opt_DisplayHelp(stdout, go, 12, 2, 80);
206 exit(0);
207 }
208
209 if (esl_opt_ArgNumber(go) != 2) { if (puts("Incorrect number of command line arguments.") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }
210 if ((*ret_qfile = esl_opt_GetArg(go, 1)) == NULL) { if (puts("Failed to get <seqfile> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }
211 if ((*ret_dbfile = esl_opt_GetArg(go, 2)) == NULL) { if (puts("Failed to get <seqdb> argument on command line") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }
212
213 /* Validate any attempted use of stdin streams */
214 if (strcmp(*ret_qfile, "-") == 0 && strcmp(*ret_dbfile, "-") == 0)
215 { if (puts("Either <seqfile> or <seqdb> may be '-' (to read from stdin), but not both.") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed"); goto FAILURE; }
216
217 *ret_go = go;
218 return eslOK;
219
220 FAILURE: /* all errors handled here are user errors, so be polite. */
221 esl_usage(stdout, argv[0], usage);
222 if (puts("\nwhere basic options are:") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
223 esl_opt_DisplayHelp(stdout, go, 1, 2, 120); /* 1= group; 2 = indentation; 120=textwidth*/
224 if (printf("\nTo see more help on available options, do %s -h\n\n", argv[0]) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "write failed");
225 esl_getopts_Destroy(go);
226 exit(1);
227
228 ERROR:
229 if (go) esl_getopts_Destroy(go);
230 exit(status);
231 }
232
233
234 static int
output_header(FILE * ofp,ESL_GETOPTS * go,char * qfile,char * dbfile)235 output_header(FILE *ofp, ESL_GETOPTS *go, char *qfile, char *dbfile)
236 {
237 p7_banner(ofp, go->argv[0], banner);
238
239 if (fprintf(ofp, "# query sequence file: %s\n", qfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
240 if (fprintf(ofp, "# target sequence database: %s\n", dbfile) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
241 if (esl_opt_IsUsed(go, "-o") && fprintf(ofp, "# output directed to file: %s\n", esl_opt_GetString(go, "-o")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
242 if (esl_opt_IsUsed(go, "-A") && fprintf(ofp, "# MSA of hits saved to file: %s\n", esl_opt_GetString(go, "-A")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
243 if (esl_opt_IsUsed(go, "--tblout") && fprintf(ofp, "# per-seq hits tabular output: %s\n", esl_opt_GetString(go, "--tblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
244 if (esl_opt_IsUsed(go, "--domtblout") && fprintf(ofp, "# per-dom hits tabular output: %s\n", esl_opt_GetString(go, "--domtblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
245 if (esl_opt_IsUsed(go, "--pfamtblout")&& fprintf(ofp, "# pfam-style tabular hit output: %s\n", esl_opt_GetString(go, "--pfamtblout")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
246 if (esl_opt_IsUsed(go, "--acc") && fprintf(ofp, "# prefer accessions over names: yes\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
247 if (esl_opt_IsUsed(go, "--noali") && fprintf(ofp, "# show alignments in output: no\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
248 if (esl_opt_IsUsed(go, "--notextw") && fprintf(ofp, "# max ASCII text line length: unlimited\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
249 if (esl_opt_IsUsed(go, "--textw") && fprintf(ofp, "# max ASCII text line length: %d\n", esl_opt_GetInteger(go, "--textw")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
250 if (esl_opt_IsUsed(go, "--popen") && fprintf(ofp, "# gap open probability: %f\n", esl_opt_GetReal (go, "--popen")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
251 if (esl_opt_IsUsed(go, "--pextend") && fprintf(ofp, "# gap extend probability: %f\n", esl_opt_GetReal (go, "--pextend")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
252 if (esl_opt_IsUsed(go, "--mx") && fprintf(ofp, "# subst score matrix (built-in): %s\n", esl_opt_GetString(go, "--mx")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
253 if (esl_opt_IsUsed(go, "--mxfile") && fprintf(ofp, "# subst score matrix (file): %s\n", esl_opt_GetString(go, "--mxfile")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
254 if (esl_opt_IsUsed(go, "-E") && fprintf(ofp, "# sequence reporting threshold: E-value <= %g\n", esl_opt_GetReal(go, "-E")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
255 if (esl_opt_IsUsed(go, "-T") && fprintf(ofp, "# sequence reporting threshold: score >= %g\n", esl_opt_GetReal(go, "-T")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
256 if (esl_opt_IsUsed(go, "--domE") && fprintf(ofp, "# domain reporting threshold: E-value <= %g\n", esl_opt_GetReal(go, "--domE")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
257 if (esl_opt_IsUsed(go, "--domT") && fprintf(ofp, "# domain reporting threshold: score >= %g\n", esl_opt_GetReal(go, "--domT")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
258 if (esl_opt_IsUsed(go, "--incE") && fprintf(ofp, "# sequence inclusion threshold: E-value <= %g\n", esl_opt_GetReal(go, "--incE")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
259 if (esl_opt_IsUsed(go, "--incT") && fprintf(ofp, "# sequence inclusion threshold: score >= %g\n", esl_opt_GetReal(go, "--incT")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
260 if (esl_opt_IsUsed(go, "--incdomE") && fprintf(ofp, "# domain inclusion threshold: E-value <= %g\n", esl_opt_GetReal(go, "--incdomE")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
261 if (esl_opt_IsUsed(go, "--incdomT") && fprintf(ofp, "# domain inclusion threshold: score >= %g\n", esl_opt_GetReal(go, "--incdomT")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
262 //if (esl_opt_IsUsed(go, "--cut_ga") && fprintf(ofp, "# model-specific thresholding: GA cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
263 //if (esl_opt_IsUsed(go, "--cut_nc") && fprintf(ofp, "# model-specific thresholding: NC cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
264 //if (esl_opt_IsUsed(go, "--cut_tc") && fprintf(ofp, "# model-specific thresholding: TC cutoffs\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
265 if (esl_opt_IsUsed(go, "--max") && fprintf(ofp, "# Max sensitivity mode: on [all heuristic filters off]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
266 if (esl_opt_IsUsed(go, "--F1") && fprintf(ofp, "# MSV filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F1")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
267 if (esl_opt_IsUsed(go, "--F2") && fprintf(ofp, "# Vit filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F2")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
268 if (esl_opt_IsUsed(go, "--F3") && fprintf(ofp, "# Fwd filter P threshold: <= %g\n", esl_opt_GetReal(go, "--F3")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
269 if (esl_opt_IsUsed(go, "--nobias") && fprintf(ofp, "# biased composition HMM filter: off\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
270 if (esl_opt_IsUsed(go, "--restrictdb_stkey") && fprintf(ofp, "# Restrict db to start at seq key: %s\n", esl_opt_GetString(go, "--restrictdb_stkey")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
271 if (esl_opt_IsUsed(go, "--restrictdb_n") && fprintf(ofp, "# Restrict db to # target seqs: %d\n", esl_opt_GetInteger(go, "--restrictdb_n")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
272 if (esl_opt_IsUsed(go, "--ssifile") && fprintf(ofp, "# Override ssi file to: %s\n", esl_opt_GetString(go, "--ssifile")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
273
274 if (esl_opt_IsUsed(go, "--nonull2") && fprintf(ofp, "# null2 bias corrections: off\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
275 if (esl_opt_IsUsed(go, "--EmL") && fprintf(ofp, "# seq length, MSV Gumbel mu fit: %d\n", esl_opt_GetInteger(go, "--EmL")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
276 if (esl_opt_IsUsed(go, "--EmN") && fprintf(ofp, "# seq number, MSV Gumbel mu fit: %d\n", esl_opt_GetInteger(go, "--EmN")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
277 if (esl_opt_IsUsed(go, "--EvL") && fprintf(ofp, "# seq length, Vit Gumbel mu fit: %d\n", esl_opt_GetInteger(go, "--EvL")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
278 if (esl_opt_IsUsed(go, "--EvN") && fprintf(ofp, "# seq number, Vit Gumbel mu fit: %d\n", esl_opt_GetInteger(go, "--EvN")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
279 if (esl_opt_IsUsed(go, "--EfL") && fprintf(ofp, "# seq length, Fwd exp tau fit: %d\n", esl_opt_GetInteger(go, "--EfL")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
280 if (esl_opt_IsUsed(go, "--EfN") && fprintf(ofp, "# seq number, Fwd exp tau fit: %d\n", esl_opt_GetInteger(go, "--EfN")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
281 if (esl_opt_IsUsed(go, "--Eft") && fprintf(ofp, "# tail mass for Fwd exp tau fit: %f\n", esl_opt_GetReal (go, "--Eft")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
282 if (esl_opt_IsUsed(go, "-Z") && fprintf(ofp, "# sequence search space set to: %.0f\n", esl_opt_GetReal(go, "-Z")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
283 if (esl_opt_IsUsed(go, "--domZ") && fprintf(ofp, "# domain search space set to: %.0f\n", esl_opt_GetReal(go, "--domZ")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
284 if (esl_opt_IsUsed(go, "--seed")) {
285 if (esl_opt_GetInteger(go, "--seed") == 0 && fprintf(ofp, "# random number seed: one-time arbitrary\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
286 else if ( fprintf(ofp, "# random number seed set to: %d\n", esl_opt_GetInteger(go, "--seed")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
287 }
288 if (esl_opt_IsUsed(go, "--qformat") && fprintf(ofp, "# query <seqfile> format asserted: %s\n", esl_opt_GetString(go, "--qformat")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
289 if (esl_opt_IsUsed(go, "--tformat") && fprintf(ofp, "# target <seqdb> format asserted: %s\n", esl_opt_GetString(go, "--tformat")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
290 #ifdef HMMER_THREADS
291 if (esl_opt_IsUsed(go, "--cpu") && fprintf(ofp, "# number of worker threads: %d\n", esl_opt_GetInteger(go, "--cpu")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
292 #endif
293 #ifdef HMMER_MPI
294 if (esl_opt_IsUsed(go, "--mpi") && fprintf(ofp, "# MPI: on\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
295 #endif
296 if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
297 return eslOK;
298 }
299
300
301 int
main(int argc,char ** argv)302 main(int argc, char **argv)
303 {
304 int status = eslOK;
305
306 ESL_GETOPTS *go = NULL; /* command line processing */
307 struct cfg_s cfg; /* configuration data */
308
309 /* Set processor specific flags */
310 impl_Init();
311
312 /* Initialize what we can in the config structure (without knowing the alphabet yet)
313 */
314 cfg.qfile = NULL;
315 cfg.dbfile = NULL;
316
317 cfg.do_mpi = FALSE; /* this gets reset below, if we init MPI */
318 cfg.nproc = 0; /* this gets reset below, if we init MPI */
319 cfg.my_rank = 0; /* this gets reset below, if we init MPI */
320 cfg.firstseq_key = NULL;
321 cfg.n_targetseq = -1;
322
323 /* Initializations */
324 p7_FLogsumInit(); /* we're going to use table-driven Logsum() approximations at times */
325 process_commandline(argc, argv, &go, &cfg.qfile, &cfg.dbfile);
326
327
328 /* is the range restricted? */
329 if (esl_opt_IsUsed(go, "--restrictdb_stkey") )
330 if ((cfg.firstseq_key = esl_opt_GetString(go, "--restrictdb_stkey")) == NULL) p7_Fail("Failure capturing --restrictdb_stkey\n");
331
332 if (esl_opt_IsUsed(go, "--restrictdb_n") )
333 cfg.n_targetseq = esl_opt_GetInteger(go, "--restrictdb_n");
334
335 if ( cfg.n_targetseq != -1 && cfg.n_targetseq < 1 )
336 p7_Fail("--restrictdb_n must be >= 1\n");
337
338 /* Figure out who we are, and send control there:
339 * we might be an MPI master, an MPI worker, or a serial program.
340 */
341 #ifdef HMMER_MPI
342 /* pause the execution of the programs execution until the user has a
343 * chance to attach with a debugger and send a signal to resume execution
344 * i.e. (gdb) signal SIGCONT
345 */
346 if (esl_opt_GetBoolean(go, "--stall")) pause();
347
348 if (esl_opt_GetBoolean(go, "--mpi"))
349 {
350 cfg.do_mpi = TRUE;
351 MPI_Init(&argc, &argv);
352 MPI_Comm_rank(MPI_COMM_WORLD, &(cfg.my_rank));
353 MPI_Comm_size(MPI_COMM_WORLD, &(cfg.nproc));
354
355 if (cfg.my_rank > 0) status = mpi_worker(go, &cfg);
356 else status = mpi_master(go, &cfg);
357
358 MPI_Finalize();
359 }
360 else
361 #endif /*HMMER_MPI*/
362 {
363 status = serial_master(go, &cfg);
364 }
365
366 esl_getopts_Destroy(go);
367
368 return status;
369 }
370
371 /* serial_master()
372 * For each query sequence in <seqfile> search the database for hits.
373 *
374 * A master can only return if it's successful. All errors are handled
375 * immediately and fatally with p7_Fail(). Where we use the
376 * ESL_EXCEPTION mechanism and ERROR: block, it's for convenience; we
377 * know we're using a fatal exception handler.
378 */
379 static int
serial_master(ESL_GETOPTS * go,struct cfg_s * cfg)380 serial_master(ESL_GETOPTS *go, struct cfg_s *cfg)
381 {
382 FILE *ofp = stdout; /* output file for results (default stdout) */
383 FILE *afp = NULL; /* alignment output file (-A option) */
384 FILE *tblfp = NULL; /* output stream for tabular per-seq (--tblout) */
385 FILE *domtblfp = NULL; /* output stream for tabular per-seq (--domtblout) */
386 FILE *pfamtblfp= NULL; /* output stream for pfam tabular output (--pfamtblout) */
387 int qformat = eslSQFILE_UNKNOWN; /* format of qfile */
388 ESL_SQFILE *qfp = NULL; /* open qfile */
389 ESL_SQ *qsq = NULL; /* query sequence */
390 int dbformat = eslSQFILE_UNKNOWN; /* format of dbfile */
391 ESL_SQFILE *dbfp = NULL; /* open dbfile */
392 ESL_ALPHABET *abc = NULL; /* sequence alphabet */
393 P7_BG *bg = NULL; /* null model (copies made of this into threads) */
394 P7_BUILDER *bld = NULL; /* HMM construction configuration */
395 ESL_STOPWATCH *w = NULL; /* for timing */
396 int nquery = 0;
397 int seed;
398 int textw;
399 int status = eslOK;
400 int qstatus = eslOK;
401 int sstatus = eslOK;
402 int i;
403 int ncpus = 0;
404 int infocnt = 0;
405 WORKER_INFO *info = NULL;
406 #ifdef HMMER_THREADS
407 ESL_SQ_BLOCK *block = NULL;
408 ESL_THREADS *threadObj= NULL;
409 ESL_WORK_QUEUE *queue = NULL;
410 #endif
411
412 /* Initializations */
413 abc = esl_alphabet_Create(eslAMINO);
414 w = esl_stopwatch_Create();
415 textw = (esl_opt_GetBoolean(go, "--notextw") ? 0 : esl_opt_GetInteger(go, "--textw"));
416 bg = p7_bg_Create(abc);
417
418 /* If caller declared input formats, decode them */
419 if (esl_opt_IsOn(go, "--qformat")) {
420 qformat = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--qformat"));
421 if (qformat == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--qformat"));
422 }
423 if (esl_opt_IsOn(go, "--tformat")) {
424 dbformat = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--tformat"));
425 if (dbformat == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized sequence database file format\n", esl_opt_GetString(go, "--tformat"));
426 }
427
428 /* Initialize a default builder configuration,
429 * then set only the options we need for single sequence search
430 */
431 bld = p7_builder_Create(NULL, abc);
432 if ((seed = esl_opt_GetInteger(go, "--seed")) > 0)
433 { /* a little wasteful - we're blowing a couple of usec by reinitializing */
434 esl_randomness_Init(bld->r, seed);
435 bld->do_reseeding = TRUE;
436 }
437 bld->EmL = esl_opt_GetInteger(go, "--EmL");
438 bld->EmN = esl_opt_GetInteger(go, "--EmN");
439 bld->EvL = esl_opt_GetInteger(go, "--EvL");
440 bld->EvN = esl_opt_GetInteger(go, "--EvN");
441 bld->EfL = esl_opt_GetInteger(go, "--EfL");
442 bld->EfN = esl_opt_GetInteger(go, "--EfN");
443 bld->Eft = esl_opt_GetReal (go, "--Eft");
444
445 /* Default is stored in the --mx option, so it's always IsOn(). Check --mxfile first; then go to the --mx option and the default. */
446 if (esl_opt_IsOn(go, "--mxfile")) status = p7_builder_SetScoreSystem (bld, esl_opt_GetString(go, "--mxfile"), NULL, esl_opt_GetReal(go, "--popen"), esl_opt_GetReal(go, "--pextend"), bg);
447 else status = p7_builder_LoadScoreSystem(bld, esl_opt_GetString(go, "--mx"), esl_opt_GetReal(go, "--popen"), esl_opt_GetReal(go, "--pextend"), bg);
448 if (status != eslOK) p7_Fail("Failed to set single query seq score system:\n%s\n", bld->errbuf);
449
450 /* Open results output files */
451 if (esl_opt_IsOn(go, "-o")) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) p7_Fail("Failed to open output file %s for writing\n", esl_opt_GetString(go, "-o")); }
452 if (esl_opt_IsOn(go, "-A")) { if ((afp = fopen(esl_opt_GetString(go, "-A"), "w")) == NULL) p7_Fail("Failed to open alignment output file %s for writing\n", esl_opt_GetString(go, "-A")); }
453 if (esl_opt_IsOn(go, "--tblout")) { if ((tblfp = fopen(esl_opt_GetString(go, "--tblout"), "w")) == NULL) p7_Fail("Failed to open tabular per-seq output file %s for writing\n", esl_opt_GetString(go, "--tblfp")); }
454 if (esl_opt_IsOn(go, "--domtblout")) { if ((domtblfp = fopen(esl_opt_GetString(go, "--domtblout"), "w")) == NULL) p7_Fail("Failed to open tabular per-dom output file %s for writing\n", esl_opt_GetString(go, "--domtblfp")); }
455 if (esl_opt_IsOn(go, "--pfamtblout")){ if ((pfamtblfp = fopen(esl_opt_GetString(go, "--pfamtblout"), "w")) == NULL) esl_fatal("Failed to open pfam-style tabular output file %s for writing\n", esl_opt_GetString(go, "--pfamtblout")); }
456
457 /* Open the target sequence database for sequential access. */
458 status = esl_sqfile_OpenDigital(abc, cfg->dbfile, dbformat, p7_SEQDBENV, &dbfp);
459 if (status == eslENOTFOUND) p7_Fail("Failed to open target sequence database %s for reading\n", cfg->dbfile);
460 else if (status == eslEFORMAT) p7_Fail("Target sequence database file %s is empty or misformatted\n", cfg->dbfile);
461 else if (status == eslEINVAL) p7_Fail("Can't autodetect format of a stdin or .gz seqfile");
462 else if (status != eslOK) p7_Fail("Unexpected error %d opening target sequence database file %s\n", status, cfg->dbfile);
463
464
465 if (esl_opt_IsUsed(go, "--restrictdb_stkey") || esl_opt_IsUsed(go, "--restrictdb_n")) {
466 if (esl_opt_IsUsed(go, "--ssifile"))
467 esl_sqfile_OpenSSI(dbfp, esl_opt_GetString(go, "--ssifile"));
468 else
469 esl_sqfile_OpenSSI(dbfp, NULL);
470 }
471
472
473 /* Open the query sequence file */
474 status = esl_sqfile_OpenDigital(abc, cfg->qfile, qformat, NULL, &qfp);
475 if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", cfg->qfile);
476 else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", cfg->qfile);
477 else if (status == eslEINVAL) p7_Fail("Can't autodetect format of a stdin or .gz seqfile");
478 else if (status != eslOK) p7_Fail ("Unexpected error %d opening sequence file %s\n", status, cfg->qfile);
479 qsq = esl_sq_CreateDigital(abc);
480
481 #ifdef HMMER_THREADS
482 /* initialize thread data */
483 ncpus = ESL_MIN( esl_opt_GetInteger(go, "--cpu"), esl_threads_GetCPUCount());
484 if (ncpus > 0)
485 {
486 threadObj = esl_threads_Create(&pipeline_thread);
487 queue = esl_workqueue_Create(ncpus * 2);
488 }
489 #endif
490
491 infocnt = (ncpus == 0) ? 1 : ncpus;
492 ESL_ALLOC(info, sizeof(*info) * infocnt);
493
494 /* Show header output */
495 output_header(ofp, go, cfg->qfile, cfg->dbfile);
496
497 for (i = 0; i < infocnt; ++i)
498 {
499 info[i].pli = NULL;
500 info[i].th = NULL;
501 info[i].om = NULL;
502 info[i].bg = p7_bg_Clone(bg);
503 #ifdef HMMER_THREADS
504 info[i].queue = queue;
505 #endif
506 }
507
508 #ifdef HMMER_THREADS
509 for (i = 0; i < ncpus * 2; ++i)
510 {
511 block = esl_sq_CreateDigitalBlock(BLOCK_SIZE, abc);
512 if (block == NULL)
513 {
514 p7_Fail("Failed to allocate sequence block");
515 }
516
517 status = esl_workqueue_Init(queue, block);
518 if (status != eslOK)
519 {
520 p7_Fail("Failed to add block to work queue");
521 }
522 }
523 #endif
524
525 /* Outer loop over sequence queries */
526 while ((qstatus = esl_sqio_Read(qfp, qsq)) == eslOK)
527 {
528 P7_OPROFILE *om = NULL; /* optimized query profile */
529
530 nquery++;
531 if (qsq->n == 0) continue; /* skip zero length seqs as if they aren't even present */
532
533 esl_stopwatch_Start(w);
534
535 /* seqfile may need to be rewound (multiquery mode) */
536 if (nquery > 1)
537 {
538 if (! esl_sqfile_IsRewindable(dbfp)) p7_Fail("Target sequence file %s isn't rewindable; can't search it with multiple queries", cfg->dbfile);
539
540 if ( cfg->firstseq_key == NULL )
541 esl_sqfile_Position(dbfp, 0); //only re-set current position to 0 if we're not planning to set it in a moment
542 }
543
544 if ( cfg->firstseq_key != NULL ) { //it's tempting to want to do this once and capture the offset position for future passes, but ncbi files make this non-trivial, so this keeps it general
545 sstatus = esl_sqfile_PositionByKey(dbfp, cfg->firstseq_key);
546 if (sstatus != eslOK)
547 p7_Fail("Failure setting restrictdb_stkey to %d\n", cfg->firstseq_key);
548 }
549
550
551 if (fprintf(ofp, "Query: %s [L=%ld]\n", qsq->name, (long) qsq->n) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
552 if (qsq->acc[0] != '\0' && fprintf(ofp, "Accession: %s\n", qsq->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
553 if (qsq->desc[0] != '\0' && fprintf(ofp, "Description: %s\n", qsq->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
554
555
556 /* Build the model */
557 p7_SingleBuilder(bld, qsq, info[0].bg, NULL, NULL, NULL, &om); /* bypass HMM - only need model */
558
559 for (i = 0; i < infocnt; ++i)
560 {
561 /* Create processing pipeline and hit list */
562 info[i].th = p7_tophits_Create();
563 info[i].om = p7_oprofile_Clone(om);
564 info[i].pli = p7_pipeline_Create(go, om->M, 100, FALSE, p7_SEARCH_SEQS); /* L_hint = 100 is just a dummy for now */
565 p7_pli_NewModel(info[i].pli, info[i].om, info[i].bg);
566
567 #ifdef HMMER_THREADS
568 if (ncpus > 0) esl_threads_AddThread(threadObj, &info[i]);
569 #endif
570 }
571
572 #ifdef HMMER_THREADS
573 if (ncpus > 0) sstatus = thread_loop(threadObj, queue, dbfp, cfg->n_targetseq);
574 else sstatus = serial_loop(info, dbfp, cfg->n_targetseq);
575 #else
576 sstatus = serial_loop(info, dbfp, cfg->n_targetseq);
577 #endif
578 switch(sstatus)
579 {
580 case eslEFORMAT:
581 p7_Fail("Parse failed (sequence file %s):\n%s\n",
582 dbfp->filename, esl_sqfile_GetErrorBuf(dbfp));
583 break;
584 case eslEOF:
585 /* do nothing */
586 break;
587 default:
588 p7_Fail("Unexpected error %d reading sequence file %s",
589 sstatus, dbfp->filename);
590 }
591
592
593 /* merge the results of the search results */
594 for (i = 1; i < infocnt; ++i)
595 {
596 p7_tophits_Merge(info[0].th, info[i].th);
597 p7_pipeline_Merge(info[0].pli, info[i].pli);
598
599 p7_pipeline_Destroy(info[i].pli);
600 p7_tophits_Destroy(info[i].th);
601 p7_oprofile_Destroy(info[i].om);
602 }
603
604 /* Print the results. */
605 p7_tophits_SortBySortkey(info->th);
606 p7_tophits_Threshold(info->th, info->pli);
607 p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
608 p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
609
610 if (tblfp) p7_tophits_TabularTargets(tblfp, qsq->name, qsq->acc, info->th, info->pli, (nquery == 1));
611 if (domtblfp) p7_tophits_TabularDomains(domtblfp, qsq->name, qsq->acc, info->th, info->pli, (nquery == 1));
612 if (pfamtblfp) p7_tophits_TabularXfam(pfamtblfp, qsq->name, qsq->acc, info->th, info->pli);
613
614 esl_stopwatch_Stop(w);
615 p7_pli_Statistics(ofp, info->pli, w);
616 if (fprintf(ofp, "//\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
617 fflush(ofp);
618
619 /* Output the results in an MSA (-A option) */
620 if (afp) {
621 ESL_MSA *msa = NULL;
622
623 if ( p7_tophits_Alignment(info->th, abc, NULL, NULL, 0, p7_ALL_CONSENSUS_COLS, &msa) == eslOK)
624 {
625 esl_msa_SetName (msa, om->name, -1); // don't use qsq->name; it's optional in a ESL_SQ, and SingleBuilder took care of naming model.
626 if (qsq->acc[0] != '\0') esl_msa_SetAccession(msa, qsq->acc, -1);
627 if (qsq->desc[0] != '\0') esl_msa_SetDesc (msa, qsq->desc, -1);
628 esl_msa_FormatAuthor(msa, "phmmer (HMMER %s)", HMMER_VERSION);
629
630 if (textw > 0) esl_msafile_Write(afp, msa, eslMSAFILE_STOCKHOLM);
631 else esl_msafile_Write(afp, msa, eslMSAFILE_PFAM);
632
633 if (fprintf(ofp, "# Alignment of %d hits satisfying inclusion thresholds saved to: %s\n", msa->nseq, esl_opt_GetString(go, "-A")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
634 }
635 else if (fprintf(ofp, "# No hits satisfy inclusion thresholds; no alignment saved\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
636
637 esl_msa_Destroy(msa);
638 }
639
640 p7_tophits_Destroy(info->th);
641 p7_pipeline_Destroy(info->pli);
642 p7_oprofile_Destroy(info->om);
643 p7_oprofile_Destroy(om);
644 esl_sq_Reuse(qsq);
645 } /* end outer loop over query sequences */
646 if (qstatus == eslEFORMAT) p7_Fail("Parse failed (sequence file %s):\n%s\n",
647 qfp->filename, esl_sqfile_GetErrorBuf(qfp));
648 else if (qstatus != eslEOF) p7_Fail("Unexpected error %d reading sequence file %s",
649 qstatus, qfp->filename);
650
651
652 /* Terminate outputs - any last words?
653 */
654 if (tblfp) p7_tophits_TabularTail(tblfp, "phmmer", p7_SEARCH_SEQS, cfg->qfile, cfg->dbfile, go);
655 if (domtblfp) p7_tophits_TabularTail(domtblfp, "phmmer", p7_SEARCH_SEQS, cfg->qfile, cfg->dbfile, go);
656 if (pfamtblfp) p7_tophits_TabularTail(pfamtblfp,"phmmer", p7_SEARCH_SEQS, cfg->qfile, cfg->dbfile, go);
657 if (ofp) { if (fprintf(ofp, "[ok]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); }
658
659 /* Cleanup - prepare for successful exit
660 */
661 for (i = 0; i < infocnt; ++i)
662 p7_bg_Destroy(info[i].bg);
663
664 #ifdef HMMER_THREADS
665 if (ncpus > 0)
666 {
667 esl_workqueue_Reset(queue);
668 while (esl_workqueue_Remove(queue, (void **) &block) == eslOK)
669 esl_sq_DestroyBlock(block);
670 esl_workqueue_Destroy(queue);
671 esl_threads_Destroy(threadObj);
672 }
673 #endif
674
675 free(info);
676 esl_sqfile_Close(dbfp);
677 esl_sqfile_Close(qfp);
678 esl_stopwatch_Destroy(w);
679 esl_sq_Destroy(qsq);
680 p7_bg_Destroy(bg);
681 p7_builder_Destroy(bld);
682 esl_alphabet_Destroy(abc);
683
684 if (ofp != stdout) fclose(ofp);
685 if (afp != NULL) fclose(afp);
686 if (tblfp != NULL) fclose(tblfp);
687 if (domtblfp != NULL) fclose(domtblfp);
688 if (pfamtblfp) fclose(pfamtblfp);
689 return eslOK;
690
691 ERROR:
692 return status;
693 }
694
695 #ifdef HMMER_MPI
696
697 /* Define common tags used by the MPI master/slave processes */
698 #define HMMER_ERROR_TAG 1
699 #define HMMER_HMM_TAG 2
700 #define HMMER_SEQUENCE_TAG 3
701 #define HMMER_BLOCK_TAG 4
702 #define HMMER_PIPELINE_TAG 5
703 #define HMMER_TOPHITS_TAG 6
704 #define HMMER_HIT_TAG 7
705 #define HMMER_TERMINATING_TAG 8
706 #define HMMER_READY_TAG 9
707
708 /* mpi_failure()
709 * Generate an error message. If the clients rank is not 0, a
710 * message is created with the error message and sent to the
711 * master process for handling.
712 */
713 static void
mpi_failure(char * format,...)714 mpi_failure(char *format, ...)
715 {
716 va_list argp;
717 int status = eslFAIL;
718 int len;
719 int rank;
720 char str[512];
721
722 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
723
724 /* format the error mesg */
725 va_start(argp, format);
726 len = vsnprintf(str, sizeof(str), format, argp);
727 va_end(argp);
728
729 /* make sure the error string is terminated */
730 str[sizeof(str)-1] = '\0';
731
732 /* if the caller is the master, print the results and abort */
733 if (rank == 0)
734 {
735 if (fprintf(stderr, "\nError: ") < 0) exit(eslEWRITE);
736 if (fprintf(stderr, "%s", str) < 0) exit(eslEWRITE);
737 if (fprintf(stderr, "\n") < 0) exit(eslEWRITE);
738 fflush(stderr);
739
740 MPI_Abort(MPI_COMM_WORLD, status);
741 exit(1);
742 }
743 else
744 {
745 MPI_Send(str, len, MPI_CHAR, 0, HMMER_ERROR_TAG, MPI_COMM_WORLD);
746 pause();
747 }
748 }
749
750 #define MAX_BLOCK_SIZE (512*1024)
751
752 typedef struct {
753 uint64_t offset;
754 uint64_t length;
755 uint64_t count;
756 } SEQ_BLOCK;
757
758 typedef struct {
759 int complete;
760 int size;
761 int current;
762 int last;
763 SEQ_BLOCK *blocks;
764 } BLOCK_LIST;
765
766 /* this routine parses the database keeping track of the blocks
767 * offset within the file, number of sequences and the length
768 * of the block. These blocks are passed as work units to the
769 * MPI workers. If multiple hmm's are in the query file, the
770 * blocks are reused without parsing the database a second time.
771 */
next_block(ESL_SQFILE * sqfp,ESL_SQ * sq,BLOCK_LIST * list,SEQ_BLOCK * block,int n_targetseqs)772 int next_block(ESL_SQFILE *sqfp, ESL_SQ *sq, BLOCK_LIST *list, SEQ_BLOCK *block, int n_targetseqs)
773 {
774 int status = eslOK;
775
776 /* if the list has been calculated, use it instead of parsing the database */
777 if (list->complete)
778 {
779 if (list->current == list->last)
780 {
781 block->offset = 0;
782 block->length = 0;
783 block->count = 0;
784
785 status = eslEOF;
786 }
787 else
788 {
789 int inx = list->current++;
790
791 block->offset = list->blocks[inx].offset;
792 block->length = list->blocks[inx].length;
793 block->count = list->blocks[inx].count;
794
795 status = eslOK;
796 }
797
798 return status;
799 }
800
801 block->offset = 0;
802 block->length = 0;
803 block->count = 0;
804
805 esl_sq_Reuse(sq);
806 if (n_targetseqs == 0) status = eslEOF; //this is to handle the end-case of a restrictdb scenario, where no more targets are required, and we want to mark the list as complete
807 while (block->length < MAX_BLOCK_SIZE && (n_targetseqs < 0 || block->count < n_targetseqs) && (status = esl_sqio_ReadInfo(sqfp, sq)) == eslOK)
808 {
809 if (block->count == 0) block->offset = sq->roff;
810 block->length = sq->eoff - block->offset + 1;
811 block->count++;
812 esl_sq_Reuse(sq);
813 }
814
815 if (block->count > 0)
816 if (status == eslEOF || block->count == n_targetseqs)
817 status = eslOK;
818 if (status == eslEOF) list->complete = 1;
819
820 /* add the block to the list of known blocks */
821 if (status == eslOK)
822 {
823 int inx;
824
825 if (list->last >= list->size)
826 {
827 void *tmp;
828 list->size += 500;
829 ESL_RALLOC(list->blocks, tmp, sizeof(SEQ_BLOCK) * list->size);
830 }
831
832 inx = list->last++;
833 list->blocks[inx].offset = block->offset;
834 list->blocks[inx].length = block->length;
835 list->blocks[inx].count = block->count;
836 }
837
838 return status;
839
840 ERROR:
841 return eslEMEM;
842 }
843
844 /* mpi_master()
845 * The MPI version of hmmbuild.
846 * Follows standard pattern for a master/worker load-balanced MPI program (J1/78-79).
847 *
848 * A master can only return if it's successful.
849 * Where we use EXCEPTION mechanism and ERROR block, it's for convenience;
850 * we know we're using a fatal exception handler.
851 *
852 * Errors in an MPI master come in two classes: recoverable and nonrecoverable.
853 *
854 * Recoverable errors include all worker-side errors, and any
855 * master-side error that do not affect MPI communication. Error
856 * messages from recoverable messages are delayed until we've cleanly
857 * shut down the workers.
858 *
859 * Unrecoverable errors are master-side errors that may affect MPI
860 * communication, meaning we cannot count on being able to reach the
861 * workers and shut them down. Unrecoverable errors result in immediate
862 * p7_Fail()'s, which will cause MPI to shut down the worker processes
863 * uncleanly.
864 */
865 static int
mpi_master(ESL_GETOPTS * go,struct cfg_s * cfg)866 mpi_master(ESL_GETOPTS *go, struct cfg_s *cfg)
867 {
868 FILE *ofp = stdout; /* output file for results (default stdout) */
869 FILE *afp = NULL; /* alignment output file (-A option) */
870 FILE *tblfp = NULL; /* output stream for tabular per-seq (--tblout) */
871 FILE *domtblfp = NULL; /* output stream for tabular per-seq (--domtblout) */
872 FILE *pfamtblfp= NULL; /* output stream for pfam-style tabular output (--pfamtblout) */
873 int qformat = eslSQFILE_UNKNOWN; /* format of qfile */
874 P7_BG *bg = NULL; /* null model */
875 ESL_SQFILE *qfp = NULL; /* open qfile */
876 ESL_SQ *qsq = NULL; /* query sequence */
877 int dbformat = eslSQFILE_UNKNOWN; /* format of dbfile */
878 ESL_SQFILE *dbfp = NULL; /* open dbfile */
879 ESL_SQ *dbsq = NULL; /* target sequence */
880 ESL_ALPHABET *abc = NULL; /* sequence alphabet */
881 P7_BUILDER *bld = NULL; /* HMM construction configuration */
882 ESL_STOPWATCH *w = NULL; /* for timing */
883 int nquery = 0;
884 int seed;
885 int textw;
886 int status = eslOK;
887 int qstatus = eslOK;
888 int sstatus = eslOK;
889 int dest;
890
891 char *mpi_buf = NULL; /* buffer used to pack/unpack structures */
892 int mpi_size = 0; /* size of the allocated buffer */
893 BLOCK_LIST *list = NULL;
894 SEQ_BLOCK block;
895
896 int i;
897 int size;
898 MPI_Status mpistatus;
899
900 int n_targets;
901
902 /* Initializations */
903 abc = esl_alphabet_Create(eslAMINO);
904 w = esl_stopwatch_Create();
905 if (esl_opt_GetBoolean(go, "--notextw")) textw = 0;
906 else textw = esl_opt_GetInteger(go, "--textw");
907 esl_stopwatch_Start(w);
908
909 /* If caller declared input formats, decode them */
910 if (esl_opt_IsOn(go, "--qformat")) {
911 qformat = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--qformat"));
912 if (qformat == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--qformat"));
913 }
914 if (esl_opt_IsOn(go, "--tformat")) {
915 dbformat = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--tformat"));
916 if (dbformat == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized sequence database file format\n", esl_opt_GetString(go, "--tformat"));
917 }
918
919 bg = p7_bg_Create(abc);
920
921 /* Initialize a default builder configuration,
922 * then set only the options we need for single sequence search
923 */
924 bld = p7_builder_Create(NULL, abc);
925 if ((seed = esl_opt_GetInteger(go, "--seed")) > 0)
926 { /* a little wasteful - we're blowing a couple of usec by reinitializing */
927 esl_randomness_Init(bld->r, seed);
928 bld->do_reseeding = TRUE;
929 }
930 bld->EmL = esl_opt_GetInteger(go, "--EmL");
931 bld->EmN = esl_opt_GetInteger(go, "--EmN");
932 bld->EvL = esl_opt_GetInteger(go, "--EvL");
933 bld->EvN = esl_opt_GetInteger(go, "--EvN");
934 bld->EfL = esl_opt_GetInteger(go, "--EfL");
935 bld->EfN = esl_opt_GetInteger(go, "--EfN");
936 bld->Eft = esl_opt_GetReal (go, "--Eft");
937
938 if (esl_opt_IsOn(go, "--mxfile")) status = p7_builder_SetScoreSystem (bld, esl_opt_GetString(go, "--mxfile"), NULL, esl_opt_GetReal(go, "--popen"), esl_opt_GetReal(go, "--pextend"), bg);
939 else status = p7_builder_LoadScoreSystem(bld, esl_opt_GetString(go, "--mx"), esl_opt_GetReal(go, "--popen"), esl_opt_GetReal(go, "--pextend"), bg);
940 if (status != eslOK) mpi_failure("Failed to set single query seq score system:\n%s\n", bld->errbuf);
941
942 /* Open results output files */
943 if (esl_opt_IsOn(go, "-o") && (ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL)
944 mpi_failure("Failed to open output file %s for writing\n", esl_opt_GetString(go, "-o"));
945 if (esl_opt_IsOn(go, "-A") && (afp = fopen(esl_opt_GetString(go, "-A"), "w")) == NULL)
946 mpi_failure("Failed to open alignment output file %s for writing\n", esl_opt_GetString(go, "-A"));
947 if (esl_opt_IsOn(go, "--tblout") && (tblfp = fopen(esl_opt_GetString(go, "--tblout"), "w")) == NULL)
948 mpi_failure("Failed to open tabular per-seq output file %s for writing\n", esl_opt_GetString(go, "--tblfp"));
949 if (esl_opt_IsOn(go, "--domtblout") && (domtblfp = fopen(esl_opt_GetString(go, "--domtblout"), "w")) == NULL)
950 mpi_failure("Failed to open tabular per-dom output file %s for writing\n", esl_opt_GetString(go, "--domtblfp"));
951 if (esl_opt_IsOn(go, "--pfamtblout") && (pfamtblfp = fopen(esl_opt_GetString(go, "--pfamtblout"), "w")) == NULL)
952 mpi_failure("Failed to open pfam-style tabular output file %s for writing\n", esl_opt_GetString(go, "--pfamtblout"));
953
954 /* Open the target sequence database for sequential access. */
955 status = esl_sqfile_OpenDigital(abc, cfg->dbfile, dbformat, p7_SEQDBENV, &dbfp);
956 if (status == eslENOTFOUND) mpi_failure("Failed to open target sequence database %s for reading\n", cfg->dbfile);
957 else if (status == eslEFORMAT) mpi_failure("Target sequence database file %s is empty or misformatted\n", cfg->dbfile);
958 else if (status == eslEINVAL) mpi_failure("Can't autodetect format of a stdin or .gz seqfile");
959 else if (status != eslOK) mpi_failure("Unexpected error %d opening target sequence database file %s\n", status, cfg->dbfile);
960 dbsq = esl_sq_CreateDigital(abc);
961
962 if (esl_opt_IsUsed(go, "--restrictdb_stkey") || esl_opt_IsUsed(go, "--restrictdb_n")) {
963 if (esl_opt_IsUsed(go, "--ssifile"))
964 esl_sqfile_OpenSSI(dbfp, esl_opt_GetString(go, "--ssifile"));
965 else
966 esl_sqfile_OpenSSI(dbfp, NULL);
967 }
968
969
970
971 /* Open the query sequence file */
972 status = esl_sqfile_OpenDigital(abc, cfg->qfile, qformat, NULL, &qfp);
973 if (status == eslENOTFOUND) mpi_failure("Failed to open sequence file %s for reading\n", cfg->qfile);
974 else if (status == eslEFORMAT) mpi_failure("Sequence file %s is empty or misformatted\n", cfg->qfile);
975 else if (status == eslEINVAL) mpi_failure("Can't autodetect format of a stdin or .gz seqfile");
976 else if (status != eslOK) mpi_failure ("Unexpected error %d opening sequence file %s\n", status, cfg->qfile);
977 qsq = esl_sq_CreateDigital(abc);
978
979 ESL_ALLOC(list, sizeof(SEQ_BLOCK));
980 list->complete = 0;
981 list->size = 0;
982 list->current = 0;
983 list->last = 0;
984 list->blocks = NULL;
985
986
987 /* Show header output */
988 output_header(ofp, go, cfg->qfile, cfg->dbfile);
989
990 if ( cfg->firstseq_key != NULL ) { //it's tempting to want to do this once and capture the offset position for future passes, but ncbi files make this non-trivial, so this keeps it general
991 sstatus = esl_sqfile_PositionByKey(dbfp, cfg->firstseq_key);
992 if (sstatus != eslOK)
993 p7_Fail("Failure setting restrictdb_stkey to %d\n", cfg->firstseq_key);
994 }
995
996
997 /* Outer loop over sequence queries */
998 while ((qstatus = esl_sqio_Read(qfp, qsq)) == eslOK)
999 {
1000 P7_PIPELINE *pli = NULL; /* processing pipeline */
1001 P7_TOPHITS *th = NULL; /* top-scoring sequence hits */
1002 P7_OPROFILE *om = NULL; /* optimized query profile */
1003 int seq_cnt = 0;
1004
1005 nquery++;
1006 if (qsq->n == 0) continue; /* skip zero length seqs as if they aren't even present */
1007
1008 esl_stopwatch_Start(w);
1009
1010 n_targets = cfg->n_targetseq;
1011
1012 /* seqfile may need to be rewound (multiquery mode) */
1013 if (nquery > 1) list->current = 0;
1014
1015 if (fprintf(ofp, "Query: %s [L=%ld]\n", qsq->name, (long) qsq->n) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
1016 if (qsq->acc[0] != '\0' && fprintf(ofp, "Accession: %s\n", qsq->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
1017 if (qsq->desc[0] != '\0' && fprintf(ofp, "Description: %s\n", qsq->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
1018
1019 /* Build the model */
1020 p7_SingleBuilder(bld, qsq, bg, NULL, NULL, NULL, &om); /* bypass HMM - only need model */
1021
1022 /* Create processing pipeline and hit list */
1023 th = p7_tophits_Create();
1024 pli = p7_pipeline_Create(go, om->M, 100, FALSE, p7_SEARCH_SEQS); /* L_hint = 100 is just a dummy for now */
1025 p7_pli_NewModel(pli, om, bg);
1026
1027 /* Main loop: */
1028 while ((n_targets==-1 || seq_cnt<=n_targets) && (sstatus = next_block(dbfp, dbsq, list, &block, n_targets-seq_cnt)) == eslOK )
1029 {
1030 seq_cnt += block.count;
1031
1032 if (MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &mpistatus) != 0)
1033 mpi_failure("MPI error %d receiving message from %d\n", mpistatus.MPI_SOURCE);
1034
1035 MPI_Get_count(&mpistatus, MPI_PACKED, &size);
1036 if (mpi_buf == NULL || size > mpi_size) {
1037 void *tmp;
1038 ESL_RALLOC(mpi_buf, tmp, sizeof(char) * size);
1039 mpi_size = size;
1040 }
1041
1042 dest = mpistatus.MPI_SOURCE;
1043 MPI_Recv(mpi_buf, size, MPI_PACKED, dest, mpistatus.MPI_TAG, MPI_COMM_WORLD, &mpistatus);
1044
1045 if (mpistatus.MPI_TAG == HMMER_ERROR_TAG)
1046 mpi_failure("MPI client %d raised error:\n%s\n", dest, mpi_buf);
1047 if (mpistatus.MPI_TAG != HMMER_READY_TAG)
1048 mpi_failure("Unexpected tag %d from %d\n", mpistatus.MPI_TAG, dest);
1049
1050 MPI_Send(&block, 3, MPI_LONG_LONG_INT, dest, HMMER_BLOCK_TAG, MPI_COMM_WORLD);
1051 }
1052
1053 if (n_targets!=-1 && seq_cnt==n_targets)
1054 sstatus = eslEOF;
1055
1056 switch(sstatus)
1057 {
1058 case eslEFORMAT:
1059 mpi_failure("Parse failed (sequence file %s):\n%s\n",
1060 dbfp->filename, esl_sqfile_GetErrorBuf(dbfp));
1061 break;
1062 case eslEOF:
1063 break;
1064 default:
1065 mpi_failure("Unexpected error %d reading sequence file %s", sstatus, dbfp->filename);
1066 }
1067
1068 block.offset = 0;
1069 block.length = 0;
1070 block.count = 0;
1071
1072 /* wait for all workers to finish up their work blocks */
1073 for (i = 1; i < cfg->nproc; ++i)
1074 {
1075 if (MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &mpistatus) != 0)
1076 mpi_failure("MPI error %d receiving message from %d\n", mpistatus.MPI_SOURCE);
1077
1078 MPI_Get_count(&mpistatus, MPI_PACKED, &size);
1079 if (mpi_buf == NULL || size > mpi_size) {
1080 void *tmp;
1081 ESL_RALLOC(mpi_buf, tmp, sizeof(char) * size);
1082 mpi_size = size;
1083 }
1084
1085 dest = mpistatus.MPI_SOURCE;
1086 MPI_Recv(mpi_buf, size, MPI_PACKED, dest, mpistatus.MPI_TAG, MPI_COMM_WORLD, &mpistatus);
1087
1088 if (mpistatus.MPI_TAG == HMMER_ERROR_TAG)
1089 mpi_failure("MPI client %d raised error:\n%s\n", dest, mpi_buf);
1090 if (mpistatus.MPI_TAG != HMMER_READY_TAG)
1091 mpi_failure("Unexpected tag %d from %d\n", mpistatus.MPI_TAG, dest);
1092 }
1093
1094 /* merge the results of the search results */
1095 for (dest = 1; dest < cfg->nproc; ++dest)
1096 {
1097 P7_PIPELINE *mpi_pli = NULL;
1098 P7_TOPHITS *mpi_th = NULL;
1099
1100 /* send an empty block to signal the worker they are done */
1101 MPI_Send(&block, 3, MPI_LONG_LONG_INT, dest, HMMER_BLOCK_TAG, MPI_COMM_WORLD);
1102
1103 /* wait for the results */
1104 if ((status = p7_tophits_MPIRecv(dest, HMMER_TOPHITS_TAG, MPI_COMM_WORLD, &mpi_buf, &mpi_size, &mpi_th)) != eslOK)
1105 mpi_failure("Unexpected error %d receiving tophits from %d", status, dest);
1106
1107 if ((status = p7_pipeline_MPIRecv(dest, HMMER_PIPELINE_TAG, MPI_COMM_WORLD, &mpi_buf, &mpi_size, go, &mpi_pli)) != eslOK)
1108 mpi_failure("Unexpected error %d receiving pipeline from %d", status, dest);
1109
1110 p7_tophits_Merge(th, mpi_th);
1111 p7_pipeline_Merge(pli, mpi_pli);
1112
1113 p7_pipeline_Destroy(mpi_pli);
1114 p7_tophits_Destroy(mpi_th);
1115 }
1116
1117 /* Print the results. */
1118 p7_tophits_SortBySortkey(th);
1119 p7_tophits_Threshold(th, pli);
1120 p7_tophits_Targets(ofp, th, pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
1121 p7_tophits_Domains(ofp, th, pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
1122
1123 if (tblfp) p7_tophits_TabularTargets(tblfp, qsq->name, qsq->acc, th, pli, (nquery == 1));
1124 if (domtblfp) p7_tophits_TabularDomains(domtblfp, qsq->name, qsq->acc, th, pli, (nquery == 1));
1125 if (pfamtblfp) p7_tophits_TabularXfam(pfamtblfp, qsq->name, qsq->acc, th, pli);
1126
1127 esl_stopwatch_Stop(w);
1128 p7_pli_Statistics(ofp, pli, w);
1129 if (fprintf(ofp, "//\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
1130
1131 /* Output the results in an MSA (-A option) */
1132 if (afp) {
1133 ESL_MSA *msa = NULL;
1134
1135 if ( p7_tophits_Alignment(th, abc, NULL, NULL, 0, p7_ALL_CONSENSUS_COLS, &msa) == eslOK)
1136 {
1137 esl_msa_SetName (msa, om->name, -1); // don't use qsq->name; it's optional in a ESL_SQ, and SingleBuilder took care of naming model.
1138 if (qsq->acc[0] != '\0') esl_msa_SetAccession(msa, qsq->acc, -1);
1139 if (qsq->desc[0] != '\0') esl_msa_SetDesc (msa, qsq->desc, -1);
1140 esl_msa_FormatAuthor(msa, "phmmer (HMMER %s)", HMMER_VERSION);
1141
1142 esl_msa_FormatAuthor(msa, "phmmer (HMMER %s)", HMMER_VERSION);
1143
1144 if (textw > 0) esl_msafile_Write(afp, msa, eslMSAFILE_STOCKHOLM);
1145 else esl_msafile_Write(afp, msa, eslMSAFILE_PFAM);
1146
1147 if (fprintf(ofp, "# Alignment of %d hits satisfying inclusion thresholds saved to: %s\n", msa->nseq, esl_opt_GetString(go, "-A")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
1148 }
1149 else { if (fprintf(ofp, "# No hits satisfy inclusion thresholds; no alignment saved\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); }
1150 esl_msa_Destroy(msa);
1151 }
1152
1153 p7_tophits_Destroy(th);
1154 p7_pipeline_Destroy(pli);
1155 p7_oprofile_Destroy(om);
1156 esl_sq_Reuse(qsq);
1157 } /* end outer loop over query sequences */
1158 if (qstatus == eslEFORMAT) mpi_failure("Parse failed (sequence file %s):\n%s\n",
1159 qfp->filename, esl_sqfile_GetErrorBuf(qfp));
1160 else if (qstatus != eslEOF) mpi_failure("Unexpected error %d reading sequence file %s",
1161 qstatus, qfp->filename);
1162
1163 /* monitor all the workers to make sure they have ended */
1164 for (i = 1; i < cfg->nproc; ++i)
1165 {
1166 if (MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &mpistatus) != 0)
1167 mpi_failure("MPI error %d receiving message from %d\n", mpistatus.MPI_SOURCE);
1168
1169 MPI_Get_count(&mpistatus, MPI_PACKED, &size);
1170 if (mpi_buf == NULL || size > mpi_size) {
1171 void *tmp;
1172 ESL_RALLOC(mpi_buf, tmp, sizeof(char) * size);
1173 mpi_size = size;
1174 }
1175
1176 dest = mpistatus.MPI_SOURCE;
1177 MPI_Recv(mpi_buf, size, MPI_PACKED, dest, mpistatus.MPI_TAG, MPI_COMM_WORLD, &mpistatus);
1178
1179 if (mpistatus.MPI_TAG == HMMER_ERROR_TAG)
1180 mpi_failure("MPI client %d raised error:\n%s\n", dest, mpi_buf);
1181 if (mpistatus.MPI_TAG != HMMER_TERMINATING_TAG)
1182 mpi_failure("Unexpected tag %d from %d\n", mpistatus.MPI_TAG, dest);
1183 }
1184
1185 /* Terminate outputs - any last words?
1186 */
1187 if (tblfp) p7_tophits_TabularTail(tblfp, "phmmer", p7_SEARCH_SEQS, cfg->qfile, cfg->dbfile, go);
1188 if (domtblfp) p7_tophits_TabularTail(domtblfp, "phmmer", p7_SEARCH_SEQS, cfg->qfile, cfg->dbfile, go);
1189 if (pfamtblfp)p7_tophits_TabularTail(pfamtblfp, "phmmer", p7_SEARCH_SEQS, cfg->qfile, cfg->dbfile, go);
1190 if (ofp) { if (fprintf(ofp, "[ok]\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); }
1191
1192 /* Cleanup - prepare for successful exit
1193 */
1194 free(list);
1195 if (mpi_buf != NULL) free(mpi_buf);
1196
1197 p7_bg_Destroy(bg);
1198
1199 esl_sqfile_Close(dbfp);
1200 esl_sqfile_Close(qfp);
1201 esl_stopwatch_Destroy(w);
1202 esl_sq_Destroy(dbsq);
1203 esl_sq_Destroy(qsq);
1204 p7_builder_Destroy(bld);
1205 esl_alphabet_Destroy(abc);
1206
1207 if (ofp != stdout) fclose(ofp);
1208 if (afp != NULL) fclose(afp);
1209 if (tblfp != NULL) fclose(tblfp);
1210 if (domtblfp != NULL) fclose(domtblfp);
1211 if (pfamtblfp) fclose(pfamtblfp);
1212 return eslOK;
1213
1214 ERROR:
1215 return status;
1216 }
1217
1218
1219 static int
mpi_worker(ESL_GETOPTS * go,struct cfg_s * cfg)1220 mpi_worker(ESL_GETOPTS *go, struct cfg_s *cfg)
1221 {
1222 int qformat = eslSQFILE_UNKNOWN; /* format of qfile */
1223 P7_BG *bg = NULL; /* null model */
1224 ESL_SQFILE *qfp = NULL; /* open qfile */
1225 ESL_SQ *qsq = NULL; /* query sequence */
1226 int dbformat = eslSQFILE_UNKNOWN; /* format of dbfile */
1227 ESL_SQFILE *dbfp = NULL; /* open dbfile */
1228 ESL_SQ *dbsq = NULL; /* target sequence */
1229 ESL_ALPHABET *abc = NULL; /* sequence alphabet */
1230 P7_BUILDER *bld = NULL; /* HMM construction configuration */
1231 ESL_STOPWATCH *w = NULL; /* for timing */
1232 int seed;
1233 int status = eslOK;
1234 int qstatus = eslOK;
1235 int sstatus = eslOK;
1236
1237 char *mpi_buf = NULL; /* buffer used to pack/unpack structures */
1238 int mpi_size = 0; /* size of the allocated buffer */
1239
1240 MPI_Status mpistatus;
1241
1242 /* Initializations */
1243 abc = esl_alphabet_Create(eslAMINO);
1244 w = esl_stopwatch_Create();
1245 bg = p7_bg_Create(abc);
1246
1247 /* If caller declared input formats, decode them */
1248 if (esl_opt_IsOn(go, "--qformat")) {
1249 qformat = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--qformat"));
1250 if (qformat == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized input sequence file format\n", esl_opt_GetString(go, "--qformat"));
1251 }
1252 if (esl_opt_IsOn(go, "--tformat")) {
1253 dbformat = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--tformat"));
1254 if (dbformat == eslSQFILE_UNKNOWN) p7_Fail("%s is not a recognized sequence database file format\n", esl_opt_GetString(go, "--tformat"));
1255 }
1256
1257 /* Initialize a default builder configuration,
1258 * then set only the options we need for single sequence search
1259 */
1260 bld = p7_builder_Create(NULL, abc);
1261 if ((seed = esl_opt_GetInteger(go, "--seed")) > 0)
1262 { /* a little wasteful - we're blowing a couple of usec by reinitializing */
1263 esl_randomness_Init(bld->r, seed);
1264 bld->do_reseeding = TRUE;
1265 }
1266 bld->EmL = esl_opt_GetInteger(go, "--EmL");
1267 bld->EmN = esl_opt_GetInteger(go, "--EmN");
1268 bld->EvL = esl_opt_GetInteger(go, "--EvL");
1269 bld->EvN = esl_opt_GetInteger(go, "--EvN");
1270 bld->EfL = esl_opt_GetInteger(go, "--EfL");
1271 bld->EfN = esl_opt_GetInteger(go, "--EfN");
1272 bld->Eft = esl_opt_GetReal (go, "--Eft");
1273
1274 if (esl_opt_IsOn(go, "--mxfile")) status = p7_builder_SetScoreSystem (bld, esl_opt_GetString(go, "--mxfile"), NULL, esl_opt_GetReal(go, "--popen"), esl_opt_GetReal(go, "--pextend"), bg);
1275 else status = p7_builder_LoadScoreSystem(bld, esl_opt_GetString(go, "--mx"), esl_opt_GetReal(go, "--popen"), esl_opt_GetReal(go, "--pextend"), bg);
1276 if (status != eslOK) mpi_failure("Failed to set single query seq score system:\n%s\n", bld->errbuf);
1277
1278 /* Open the target sequence database for sequential access. */
1279 status = esl_sqfile_OpenDigital(abc, cfg->dbfile, dbformat, p7_SEQDBENV, &dbfp);
1280 if (status == eslENOTFOUND) mpi_failure("Failed to open target sequence database %s for reading\n", cfg->dbfile);
1281 else if (status == eslEFORMAT) mpi_failure("Target sequence database file %s is empty or misformatted\n", cfg->dbfile);
1282 else if (status == eslEINVAL) mpi_failure("Can't autodetect format of a stdin or .gz seqfile");
1283 else if (status != eslOK) mpi_failure("Unexpected error %d opening target sequence database file %s\n", status, cfg->dbfile);
1284 dbsq = esl_sq_CreateDigital(abc);
1285
1286 /* Open the query sequence file */
1287 status = esl_sqfile_OpenDigital(abc, cfg->qfile, qformat, NULL, &qfp);
1288 if (status == eslENOTFOUND) mpi_failure("Failed to open sequence file %s for reading\n", cfg->qfile);
1289 else if (status == eslEFORMAT) mpi_failure("Sequence file %s is empty or misformatted\n", cfg->qfile);
1290 else if (status == eslEINVAL) mpi_failure("Can't autodetect format of a stdin or .gz seqfile");
1291 else if (status != eslOK) mpi_failure ("Unexpected error %d opening sequence file %s\n", status, cfg->qfile);
1292 qsq = esl_sq_CreateDigital(abc);
1293
1294
1295
1296 /* Outer loop over sequence queries */
1297 while ((qstatus = esl_sqio_Read(qfp, qsq)) == eslOK)
1298 {
1299 P7_PIPELINE *pli = NULL; /* processing pipeline */
1300 P7_TOPHITS *th = NULL; /* top-scoring sequence hits */
1301 P7_OPROFILE *om = NULL; /* optimized query profile */
1302
1303 SEQ_BLOCK block;
1304
1305 status = 0;
1306 MPI_Send(&status, 1, MPI_INT, 0, HMMER_READY_TAG, MPI_COMM_WORLD);
1307
1308 if (qsq->n == 0) continue; /* skip zero length seqs as if they aren't even present */
1309
1310 esl_stopwatch_Start(w);
1311
1312 /* Build the model */
1313 p7_SingleBuilder(bld, qsq, bg, NULL, NULL, NULL, &om); /* bypass HMM - only need model */
1314
1315 /* Create processing pipeline and hit list */
1316 th = p7_tophits_Create();
1317 pli = p7_pipeline_Create(go, om->M, 100, FALSE, p7_SEARCH_SEQS); /* L_hint = 100 is just a dummy for now */
1318 p7_pli_NewModel(pli, om, bg);
1319
1320 /* receive a sequence block from the master */
1321 MPI_Recv(&block, 3, MPI_LONG_LONG_INT, 0, HMMER_BLOCK_TAG, MPI_COMM_WORLD, &mpistatus);
1322 while (block.count > 0)
1323 {
1324 uint64_t length = 0;
1325 uint64_t count = block.count;
1326
1327 status = esl_sqfile_Position(dbfp, block.offset);
1328 if (status != eslOK) mpi_failure("Cannot position sequence database to %ld\n", block.offset);
1329
1330 while (count > 0 && (sstatus = esl_sqio_Read(dbfp, dbsq)) == eslOK)
1331 {
1332 length = dbsq->eoff - block.offset + 1;
1333
1334 p7_pli_NewSeq(pli, dbsq);
1335 p7_bg_SetLength(bg, dbsq->n);
1336 p7_oprofile_ReconfigLength(om, dbsq->n);
1337
1338 p7_Pipeline(pli, om, bg, dbsq, NULL, th);
1339
1340 esl_sq_Reuse(dbsq);
1341 p7_pipeline_Reuse(pli);
1342
1343 --count;
1344 }
1345
1346 /* lets do a little bit of sanity checking here to make sure the blocks are the same */
1347 if (count > 0) mpi_failure("Block count mismatch - expected %ld found %ld at offset %ld\n", block.count, block.count - count, block.offset);
1348 if (block.length != length) mpi_failure("Block length mismatch - expected %ld found %ld at offset %ld\n", block.length, length, block.offset);
1349
1350 /* inform the master we need another block of sequences */
1351 status = 0;
1352 MPI_Send(&status, 1, MPI_INT, 0, HMMER_READY_TAG, MPI_COMM_WORLD);
1353
1354 /* wait for the next block of sequences */
1355 MPI_Recv(&block, 3, MPI_LONG_LONG_INT, 0, HMMER_BLOCK_TAG, MPI_COMM_WORLD, &mpistatus);
1356 }
1357
1358 esl_stopwatch_Stop(w);
1359
1360 /* Send the top hits back to the master. */
1361 p7_tophits_MPISend(th, 0, HMMER_TOPHITS_TAG, MPI_COMM_WORLD, &mpi_buf, &mpi_size);
1362 p7_pipeline_MPISend(pli, 0, HMMER_PIPELINE_TAG, MPI_COMM_WORLD, &mpi_buf, &mpi_size);
1363
1364 p7_tophits_Destroy(th);
1365 p7_pipeline_Destroy(pli);
1366 p7_oprofile_Destroy(om);
1367 esl_sq_Reuse(qsq);
1368 } /* end outer loop over query sequences */
1369 if (qstatus == eslEFORMAT) mpi_failure("Parse failed (sequence file %s):\n%s\n",
1370 qfp->filename, esl_sqfile_GetErrorBuf(qfp));
1371 else if (qstatus != eslEOF) mpi_failure("Unexpected error %d reading sequence file %s",
1372 qstatus, qfp->filename);
1373
1374 status = 0;
1375 MPI_Send(&status, 1, MPI_INT, 0, HMMER_TERMINATING_TAG, MPI_COMM_WORLD);
1376
1377 if (mpi_buf != NULL) free(mpi_buf);
1378
1379 p7_bg_Destroy(bg);
1380
1381 esl_sqfile_Close(dbfp);
1382 esl_sqfile_Close(qfp);
1383 esl_stopwatch_Destroy(w);
1384 esl_sq_Destroy(dbsq);
1385 esl_sq_Destroy(qsq);
1386 p7_builder_Destroy(bld);
1387 esl_alphabet_Destroy(abc);
1388 return eslOK;
1389 }
1390 #endif /*HMMER_MPI*/
1391
1392
1393 static int
serial_loop(WORKER_INFO * info,ESL_SQFILE * dbfp,int n_targetseqs)1394 serial_loop(WORKER_INFO *info, ESL_SQFILE *dbfp, int n_targetseqs)
1395 {
1396 int sstatus = eslOK;
1397 ESL_SQ *dbsq = NULL; /* one target sequence (digital) */
1398 int seq_cnt = 0;
1399
1400 dbsq = esl_sq_CreateDigital(info->om->abc);
1401
1402 /* Main loop: */
1403 while ((n_targetseqs==-1 || seq_cnt<n_targetseqs) && (sstatus = esl_sqio_Read(dbfp, dbsq)) == eslOK)
1404 {
1405 p7_pli_NewSeq(info->pli, dbsq);
1406 p7_bg_SetLength(info->bg, dbsq->n);
1407 p7_oprofile_ReconfigLength(info->om, dbsq->n);
1408
1409 p7_Pipeline(info->pli, info->om, info->bg, dbsq, NULL, info->th);
1410
1411 seq_cnt++;
1412 esl_sq_Reuse(dbsq);
1413 p7_pipeline_Reuse(info->pli);
1414 }
1415
1416 if (n_targetseqs!=-1 && seq_cnt==n_targetseqs)
1417 sstatus = eslEOF;
1418
1419 esl_sq_Destroy(dbsq);
1420
1421 return sstatus;
1422 }
1423
1424 #ifdef HMMER_THREADS
1425 static int
thread_loop(ESL_THREADS * obj,ESL_WORK_QUEUE * queue,ESL_SQFILE * dbfp,int n_targetseqs)1426 thread_loop(ESL_THREADS *obj, ESL_WORK_QUEUE *queue, ESL_SQFILE *dbfp, int n_targetseqs)
1427 {
1428 int status = eslOK;
1429 int sstatus = eslOK;
1430 int eofCount = 0;
1431 ESL_SQ_BLOCK *block;
1432 void *newBlock;
1433
1434 esl_workqueue_Reset(queue);
1435 esl_threads_WaitForStart(obj);
1436
1437 status = esl_workqueue_ReaderUpdate(queue, NULL, &newBlock);
1438 if (status != eslOK) p7_Fail("Work queue reader failed");
1439
1440 /* Main loop: */
1441 while (sstatus == eslOK)
1442 {
1443 block = (ESL_SQ_BLOCK *) newBlock;
1444
1445 if (n_targetseqs == 0)
1446 {
1447 block->count = 0;
1448 sstatus = eslEOF;
1449 } else {
1450 sstatus = esl_sqio_ReadBlock(dbfp, block, -1, n_targetseqs, FALSE);
1451 n_targetseqs -= block->count;
1452 }
1453
1454 if (sstatus == eslEOF)
1455 {
1456 if (eofCount < esl_threads_GetWorkerCount(obj)) sstatus = eslOK;
1457 ++eofCount;
1458 }
1459
1460 if (sstatus == eslOK)
1461 {
1462 status = esl_workqueue_ReaderUpdate(queue, block, &newBlock);
1463 if (status != eslOK) p7_Fail("Work queue reader failed");
1464 }
1465 }
1466
1467 status = esl_workqueue_ReaderUpdate(queue, block, NULL);
1468 if (status != eslOK) p7_Fail("Work queue reader failed");
1469
1470 if (sstatus == eslEOF)
1471 {
1472 /* wait for all the threads to complete */
1473 esl_threads_WaitForFinish(obj);
1474 esl_workqueue_Complete(queue);
1475 }
1476
1477 return sstatus;
1478 }
1479
1480 static void
pipeline_thread(void * arg)1481 pipeline_thread(void *arg)
1482 {
1483 int i;
1484 int status;
1485 int workeridx;
1486 WORKER_INFO *info;
1487 ESL_THREADS *obj;
1488 ESL_SQ_BLOCK *block = NULL;
1489 void *newBlock;
1490
1491 impl_Init();
1492
1493 obj = (ESL_THREADS *) arg;
1494 esl_threads_Started(obj, &workeridx);
1495
1496 info = (WORKER_INFO *) esl_threads_GetData(obj, workeridx);
1497
1498 status = esl_workqueue_WorkerUpdate(info->queue, NULL, &newBlock);
1499 if (status != eslOK) p7_Fail("Work queue worker failed");
1500
1501 /* loop until all blocks have been processed */
1502 block = (ESL_SQ_BLOCK *) newBlock;
1503 while (block->count > 0)
1504 {
1505 /* Main loop: */
1506 for (i = 0; i < block->count; ++i)
1507 {
1508 ESL_SQ *dbsq = block->list + i;
1509
1510 p7_pli_NewSeq(info->pli, dbsq);
1511 p7_bg_SetLength(info->bg, dbsq->n);
1512 p7_oprofile_ReconfigLength(info->om, dbsq->n);
1513
1514 p7_Pipeline(info->pli, info->om, info->bg, dbsq, NULL, info->th);
1515
1516 esl_sq_Reuse(dbsq);
1517 p7_pipeline_Reuse(info->pli);
1518 }
1519
1520 status = esl_workqueue_WorkerUpdate(info->queue, block, &newBlock);
1521 if (status != eslOK) p7_Fail("Work queue worker failed");
1522
1523 block = (ESL_SQ_BLOCK *) newBlock;
1524 }
1525
1526 status = esl_workqueue_WorkerUpdate(info->queue, block, NULL);
1527 if (status != eslOK) p7_Fail("Work queue worker failed");
1528
1529 esl_threads_Finished(obj, workeridx);
1530 return;
1531 }
1532 #endif /* HMMER_THREADS */
1533
1534
1535