1 /* hmmpgmd: hmmer deamon searchs against a sequence database.
2 *
3 * MSF, Thu Aug 12, 2010 [Janelia]
4 */
5 #include "p7_config.h"
6
7 #ifdef HMMER_THREADS
8
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <errno.h>
13 #include <unistd.h>
14 #include <signal.h>
15 #include <pthread.h>
16 #include <setjmp.h>
17 #include <sys/socket.h>
18 #include <arpa/inet.h>
19 #include <syslog.h>
20 #include <assert.h>
21
22 #ifndef HMMER_THREADS
23 #error "Program requires pthreads be enabled."
24 #endif /*HMMER_THREADS*/
25
26 #include "easel.h"
27 #include "esl_alphabet.h"
28 #include "esl_getopts.h"
29 #include "esl_sq.h"
30 #include "esl_sqio.h"
31 #include "esl_stopwatch.h"
32 #include "esl_threads.h"
33 #include "esl_regexp.h"
34
35 #include "hmmer.h"
36 #include "hmmpgmd.h"
37 #include "cachedb.h"
38
39 #define MAX_WORKERS 64
40 #define MAX_BUFFER 4096
41
42 #define CONF_FILE "/etc/hmmpgmd.conf"
43
44 #define REPOPTS "-E,-T,--cut_ga,--cut_nc,--cut_tc"
45 #define DOMREPOPTS "--domE,--domT,--cut_ga,--cut_nc,--cut_tc"
46 #define INCOPTS "--incE,--incT,--cut_ga,--cut_nc,--cut_tc"
47 #define INCDOMOPTS "--incdomE,--incdomT,--cut_ga,--cut_nc,--cut_tc"
48 #define THRESHOPTS "-E,-T,--domE,--domT,--incE,--incT,--incdomE,--incdomT,--cut_ga,--cut_nc,--cut_tc"
49 #define STAGEOPTS "--F1,--F2,--F3"
50
51 static ESL_OPTIONS searchOpts[] = {
52 /* Control of output */
53 { "--acc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "prefer accessions over names in output", 2 },
54 { "--noali", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "don't output alignments, so output is smaller", 2 },
55 /* Control of scoring system */
56 { "--popen", eslARG_REAL, "0.02", NULL, "0<=x<0.5",NULL, NULL, NULL, "gap open probability", 3 },
57 { "--pextend", eslARG_REAL, "0.4", NULL, "0<=x<1", NULL, NULL, NULL, "gap extend probability", 3 },
58 { "--mx", eslARG_STRING, "BLOSUM62", NULL, NULL, NULL, NULL, "--mxfile", "substitution score matrix choice (of some built-in matrices)", 3 },
59 { "--mxfile", eslARG_INFILE, NULL, NULL, NULL, NULL, NULL, "--mx", "read substitution score matrix from file <f>", 3 },
60 /* Control of reporting thresholds */
61 { "-E", eslARG_REAL, "10.0", NULL, "x>0", NULL, NULL, REPOPTS, "report sequences <= this E-value threshold in output", 4 },
62 { "-T", eslARG_REAL, FALSE, NULL, NULL, NULL, NULL, REPOPTS, "report sequences >= this score threshold in output", 4 },
63 { "--domE", eslARG_REAL, "10.0", NULL, "x>0", NULL, NULL, DOMREPOPTS, "report domains <= this E-value threshold in output", 4 },
64 { "--domT", eslARG_REAL, FALSE, NULL, NULL, NULL, NULL, DOMREPOPTS, "report domains >= this score cutoff in output", 4 },
65 /* Control of inclusion (significance) thresholds */
66 { "--incE", eslARG_REAL, "0.01", NULL, "x>0", NULL, NULL, INCOPTS, "consider sequences <= this E-value threshold as significant", 5 },
67 { "--incT", eslARG_REAL, FALSE, NULL, NULL, NULL, NULL, INCOPTS, "consider sequences >= this score threshold as significant", 5 },
68 { "--incdomE", eslARG_REAL, "0.01", NULL, "x>0", NULL, NULL, INCDOMOPTS, "consider domains <= this E-value threshold as significant", 5 },
69 { "--incdomT", eslARG_REAL, FALSE, NULL, NULL, NULL, NULL, INCDOMOPTS, "consider domains >= this score threshold as significant", 5 },
70 /* Model-specific thresholding for both reporting and inclusion */
71 { "--cut_ga", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, THRESHOPTS, "use profile's GA gathering cutoffs to set all thresholding", 6 },
72 { "--cut_nc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, THRESHOPTS, "use profile's NC noise cutoffs to set all thresholding", 6 },
73 { "--cut_tc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, THRESHOPTS, "use profile's TC trusted cutoffs to set all thresholding", 6 },
74 /* Control of acceleration pipeline */
75 { "--max", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, STAGEOPTS, "Turn all heuristic filters off (less speed, more power)", 7 },
76 { "--F1", eslARG_REAL, "0.02", NULL, NULL, NULL, NULL, "--max", "Stage 1 (MSV) threshold: promote hits w/ P <= F1", 7 },
77 { "--F2", eslARG_REAL, "1e-3", NULL, NULL, NULL, NULL, "--max", "Stage 2 (Vit) threshold: promote hits w/ P <= F2", 7 },
78 { "--F3", eslARG_REAL, "1e-5", NULL, NULL, NULL, NULL, "--max", "Stage 3 (Fwd) threshold: promote hits w/ P <= F3", 7 },
79 { "--nobias", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, "--max", "turn off composition bias filter", 7 },
80 /* Control of E-value calibration */
81 { "--EmL", eslARG_INT, "200", NULL,"n>0", NULL, NULL, NULL, "length of sequences for MSV Gumbel mu fit", 11 },
82 { "--EmN", eslARG_INT, "200", NULL,"n>0", NULL, NULL, NULL, "number of sequences for MSV Gumbel mu fit", 11 },
83 { "--EvL", eslARG_INT, "200", NULL,"n>0", NULL, NULL, NULL, "length of sequences for Viterbi Gumbel mu fit", 11 },
84 { "--EvN", eslARG_INT, "200", NULL,"n>0", NULL, NULL, NULL, "number of sequences for Viterbi Gumbel mu fit", 11 },
85 { "--EfL", eslARG_INT, "100", NULL,"n>0", NULL, NULL, NULL, "length of sequences for Forward exp tail tau fit", 11 },
86 { "--EfN", eslARG_INT, "200", NULL,"n>0", NULL, NULL, NULL, "number of sequences for Forward exp tail tau fit", 11 },
87 { "--Eft", eslARG_REAL, "0.04", NULL,"0<x<1", NULL, NULL, NULL, "tail mass for Forward exponential tail tau fit", 11 },
88 /* Other options */
89 { "--seed", eslARG_INT, "42", NULL, "n>=0", NULL, NULL, NULL, "set RNG seed to <n> (if 0: one-time arbitrary seed)", 12 },
90 { "--nonull2", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, NULL, "turn off biased composition score corrections", 12 },
91 { "-Z", eslARG_REAL, FALSE, NULL, "x>0", NULL, NULL, NULL, "set # of comparisons done, for E-value calculation", 12 },
92 { "--domZ", eslARG_REAL, FALSE, NULL, "x>0", NULL, NULL, NULL, "set # of significant seqs, for domain E-value calculation", 12 },
93 { "--hmmdb", eslARG_INT, NULL, NULL, "n>0", NULL, NULL, "--seqdb", "hmm database to search", 12 },
94 { "--seqdb", eslARG_INT, NULL, NULL, "n>0", NULL, NULL, "--hmmdb", "protein database to search", 12 },
95 { "--seqdb_ranges",eslARG_STRING, NULL, NULL, NULL, NULL, "--seqdb", NULL, "range(s) of sequences within --seqdb that will be searched", 12 },
96
97
98 /* name type default env range toggles reqs incomp help docgroup*/
99 { "-c", eslARG_INT, "1", NULL, NULL, NULL, NULL, NULL, "use alt genetic code of NCBI transl table <n>", 99 },
100 { "-l", eslARG_INT, "20", NULL, NULL, NULL, NULL, NULL, "minimum ORF length", 99 },
101 { "-m", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-M", "ORFs must initiate with AUG only", 99 },
102 { "-M", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-m", "ORFs must start with allowed initiation codon", 99 },
103 // { "-W", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use windowed, memory-efficient seq reading", 99 },
104 { "--informat", eslARG_STRING, FALSE, NULL, NULL, NULL, NULL, NULL, "specify that input file is in format <s>", 99 },
105 { "--watson", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "only translate top strand", 99 },
106 { "--crick", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "only translate bottom strand", 99 },
107
108
109 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
110 };
111
112 size_t
writen(int fd,const void * vptr,size_t n)113 writen(int fd, const void *vptr, size_t n)
114 {
115 ssize_t remaining;
116 ssize_t outn;
117 const char *ptr;
118
119 ptr = vptr;
120 remaining = n;
121 while (remaining > 0) {
122 if ((outn = write(fd, ptr, remaining)) <= 0) {
123 if (outn < 0 && errno == EINTR) {
124 outn = 0;
125 } else {
126 return -1;
127 }
128 }
129
130 remaining -= outn;
131 ptr += outn;
132 }
133
134 return n;
135 }
136
137 size_t
readn(int fd,void * vptr,size_t n)138 readn(int fd, void *vptr, size_t n)
139 {
140 size_t remaining;
141 size_t bytes;
142 char *ptr;
143
144 ptr = vptr;
145 remaining = n;
146 while (remaining > 0) {
147 if ((bytes = read(fd, ptr, remaining)) <= 0) {
148 if (errno == EINTR) {
149 bytes = 0;
150 } else {
151 return -1;
152 }
153 }
154
155 remaining -= bytes;
156 ptr += bytes;
157 }
158
159 return n - remaining;
160 }
161
162 #define LOG_TO_STDOUT
163 #ifdef LOG_TO_STDOUT
p7_openlog(const char * ident,int option,int facility)164 void p7_openlog(const char *ident, int option, int facility)
165 {
166 /* do nothing */
167 return;
168 }
169
p7_syslog(int priority,const char * format,...)170 void p7_syslog(int priority, const char *format, ...)
171 {
172 va_list ap;
173
174 printf("\n*** ERROR ***\n");
175
176 va_start(ap, format);
177 vprintf(format, ap);
178 va_end(ap);
179
180 printf("\n");
181 fflush(stdout);
182
183 return;
184 }
p7_closelog(void)185 void p7_closelog(void)
186 {
187 /* do nothing */
188 return;
189 }
190 #endif
191
192 int
process_searchopts(int fd,char * cmdstr,ESL_GETOPTS ** ret_opts)193 process_searchopts(int fd, char *cmdstr, ESL_GETOPTS **ret_opts)
194 {
195 int status;
196
197 ESL_GETOPTS *go = NULL;
198
199 if ((go = esl_getopts_Create(searchOpts)) == NULL) return eslEMEM;
200 if ((status = esl_opt_ProcessSpoof(go, cmdstr)) != eslOK) return status;
201 if ((status = esl_opt_VerifyConfig(go)) != eslOK) return status;
202
203 *ret_opts = go;
204 return eslOK;
205 }
206
207 void
free_QueueData(QUEUE_DATA * data)208 free_QueueData(QUEUE_DATA *data)
209 {
210 /* free the query data */
211 esl_getopts_Destroy(data->opts);
212
213 if (data->abc != NULL) esl_alphabet_Destroy(data->abc);
214 if (data->hmm != NULL) p7_hmm_Destroy(data->hmm);
215 if (data->seq != NULL) esl_sq_Destroy(data->seq);
216 if (data->cmd != NULL) free(data->cmd);
217 memset(data, 0, sizeof(*data));
218 free(data);
219 }
220
221 /* Function: hmmpgmd_IsWithinRanges()
222 * Synopsis: Test if the given id falls within one of a collection of ranges
223 *
224 * Purpose: Given an index <sq_idx> and a number <N> of ranges stored in two
225 * parallel arrays of start (<range_starts>) and end (<range_ends>)
226 * positions, return TRUE if sq_idx falls in one of the ranges.
227 * Otherwise return FALSE;
228 *
229 * Returns: <TRUE> if within range(s), otherwise <FALSE>
230 */
231 int
hmmpgmd_IsWithinRanges(int64_t sq_idx,RANGE_LIST * list)232 hmmpgmd_IsWithinRanges (int64_t sq_idx, RANGE_LIST *list ) {
233 int i;
234 for (i=0; i<list->N; i++) {
235 if (sq_idx >= list->starts[i] && sq_idx <= list->ends[i] )
236 return TRUE;
237 }
238 return FALSE;
239 }
240
241
242 /* Function: hmmpgmd_GetRanges()
243 * Synopsis: Parse command flag into range(s)
244 *
245 * Purpose: Given a command flag string <rangestr> of the form
246 * <start1>..<end1>,<start2>..<end2>...
247 * parse the string into a RANGE_LIST <list>
248 *
249 * Returns: <eslOK> on success <TRUE>, <eslEMEM> on memory allocation failure,
250 * otherwise <eslESYNTAX> or <eslFAIL> on parsing errors.
251 */
252 int
hmmpgmd_GetRanges(RANGE_LIST * list,char * rangestr)253 hmmpgmd_GetRanges (RANGE_LIST *list, char *rangestr) {
254 char *range;
255 char *rangestr_cpy;
256 char *rangestr_cpy_ptr;
257 int status;
258
259 list->N = 0;
260 list->starts = NULL;
261 list->ends = NULL;
262
263 //first pass to figure out how much to allocate
264 esl_strdup(rangestr, -1, &rangestr_cpy); // do this because esl_strtok modifies the string, and we shouldn't change the opts value
265 rangestr_cpy_ptr = rangestr_cpy; // do this because esl_strtok advances the pointer on the target string, but we need to free it
266 while ( (status = esl_strtok(&rangestr_cpy, ",", &range) ) == eslOK) list->N++;
267 ESL_ALLOC(list->starts, list->N * sizeof(int));
268 ESL_ALLOC(list->ends, list->N * sizeof(int));
269 free(rangestr_cpy_ptr);
270
271 //2nd pass to get the values
272 list->N = 0;
273 esl_strdup(rangestr, -1, &rangestr_cpy);
274 rangestr_cpy_ptr = rangestr_cpy;
275 while ( (status = esl_strtok(&rangestr_cpy, ",", &range) ) == eslOK) {
276 status = esl_regexp_ParseCoordString(range, list->starts + list->N, list->ends + list->N);
277 if (status == eslESYNTAX) esl_fatal("--seqdb_ranges takes coords <from>..<to>; %s not recognized", range);
278 if (status == eslFAIL) esl_fatal("Failed to find <from> or <to> coord in %s", range);
279 list->N++;
280 }
281 free(rangestr_cpy_ptr);
282
283 return eslOK;
284
285 ERROR:
286 return eslEMEM;
287 }
288
289 #endif /*HMMER_THREADS*/
290