1 /* hmmpgmd: hmmer deamon searchs against a sequence database.
2  *
3  * MSF, Thu Aug 12, 2010 [Janelia]
4  */
5 #include "p7_config.h"
6 
7 #ifdef HMMER_THREADS
8 
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <errno.h>
13 #include <unistd.h>
14 #include <signal.h>
15 #include <pthread.h>
16 #include <setjmp.h>
17 #include <sys/socket.h>
18 #include <arpa/inet.h>
19 #include <syslog.h>
20 #include <assert.h>
21 
22 #ifndef HMMER_THREADS
23 #error "Program requires pthreads be enabled."
24 #endif /*HMMER_THREADS*/
25 
26 #include "easel.h"
27 #include "esl_alphabet.h"
28 #include "esl_getopts.h"
29 #include "esl_sq.h"
30 #include "esl_sqio.h"
31 #include "esl_stopwatch.h"
32 #include "esl_threads.h"
33 #include "esl_regexp.h"
34 
35 #include "hmmer.h"
36 #include "hmmpgmd.h"
37 #include "cachedb.h"
38 
39 #define MAX_WORKERS  64
40 #define MAX_BUFFER   4096
41 
42 #define CONF_FILE "/etc/hmmpgmd.conf"
43 
44 #define REPOPTS     "-E,-T,--cut_ga,--cut_nc,--cut_tc"
45 #define DOMREPOPTS  "--domE,--domT,--cut_ga,--cut_nc,--cut_tc"
46 #define INCOPTS     "--incE,--incT,--cut_ga,--cut_nc,--cut_tc"
47 #define INCDOMOPTS  "--incdomE,--incdomT,--cut_ga,--cut_nc,--cut_tc"
48 #define THRESHOPTS  "-E,-T,--domE,--domT,--incE,--incT,--incdomE,--incdomT,--cut_ga,--cut_nc,--cut_tc"
49 #define STAGEOPTS   "--F1,--F2,--F3"
50 
51 static ESL_OPTIONS searchOpts[] = {
52   /* Control of output */
53   { "--acc",        eslARG_NONE,        FALSE, NULL, NULL,      NULL,  NULL, NULL,        "prefer accessions over names in output",                       2 },
54   { "--noali",      eslARG_NONE,        FALSE, NULL, NULL,      NULL,  NULL, NULL,        "don't output alignments, so output is smaller",                2 },
55   /* Control of scoring system */
56   { "--popen",      eslARG_REAL,       "0.02", NULL, "0<=x<0.5",NULL,  NULL, NULL,        "gap open probability",                                         3 },
57   { "--pextend",    eslARG_REAL,        "0.4", NULL, "0<=x<1",  NULL,  NULL, NULL,        "gap extend probability",                                       3 },
58   { "--mx",         eslARG_STRING, "BLOSUM62", NULL, NULL,      NULL,  NULL,  "--mxfile", "substitution score matrix choice (of some built-in matrices)", 3 },
59   { "--mxfile",     eslARG_INFILE,       NULL, NULL, NULL,      NULL,  NULL,  "--mx",     "read substitution score matrix from file <f>",                 3 },
60   /* Control of reporting thresholds */
61   { "-E",           eslARG_REAL,     "10.0", NULL, "x>0",     NULL,  NULL, REPOPTS,     "report sequences <= this E-value threshold in output",         4 },
62   { "-T",           eslARG_REAL,      FALSE, NULL, NULL,      NULL,  NULL, REPOPTS,     "report sequences >= this score threshold in output",           4 },
63   { "--domE",       eslARG_REAL,     "10.0", NULL, "x>0",     NULL,  NULL, DOMREPOPTS,  "report domains <= this E-value threshold in output",           4 },
64   { "--domT",       eslARG_REAL,      FALSE, NULL, NULL,      NULL,  NULL, DOMREPOPTS,  "report domains >= this score cutoff in output",                4 },
65   /* Control of inclusion (significance) thresholds */
66   { "--incE",       eslARG_REAL,     "0.01", NULL, "x>0",     NULL,  NULL, INCOPTS,     "consider sequences <= this E-value threshold as significant",  5 },
67   { "--incT",       eslARG_REAL,      FALSE, NULL, NULL,      NULL,  NULL, INCOPTS,     "consider sequences >= this score threshold as significant",    5 },
68   { "--incdomE",    eslARG_REAL,     "0.01", NULL, "x>0",     NULL,  NULL, INCDOMOPTS,  "consider domains <= this E-value threshold as significant",    5 },
69   { "--incdomT",    eslARG_REAL,      FALSE, NULL, NULL,      NULL,  NULL, INCDOMOPTS,  "consider domains >= this score threshold as significant",      5 },
70   /* Model-specific thresholding for both reporting and inclusion */
71   { "--cut_ga",     eslARG_NONE,      FALSE, NULL, NULL,      NULL,  NULL, THRESHOPTS,  "use profile's GA gathering cutoffs to set all thresholding",   6 },
72   { "--cut_nc",     eslARG_NONE,      FALSE, NULL, NULL,      NULL,  NULL, THRESHOPTS,  "use profile's NC noise cutoffs to set all thresholding",       6 },
73   { "--cut_tc",     eslARG_NONE,      FALSE, NULL, NULL,      NULL,  NULL, THRESHOPTS,  "use profile's TC trusted cutoffs to set all thresholding",     6 },
74   /* Control of acceleration pipeline */
75   { "--max",        eslARG_NONE,      FALSE, NULL, NULL,      NULL,  NULL, STAGEOPTS,   "Turn all heuristic filters off (less speed, more power)",      7 },
76   { "--F1",         eslARG_REAL,     "0.02", NULL, NULL,      NULL,  NULL, "--max",     "Stage 1 (MSV) threshold: promote hits w/ P <= F1",             7 },
77   { "--F2",         eslARG_REAL,     "1e-3", NULL, NULL,      NULL,  NULL, "--max",     "Stage 2 (Vit) threshold: promote hits w/ P <= F2",             7 },
78   { "--F3",         eslARG_REAL,     "1e-5", NULL, NULL,      NULL,  NULL, "--max",     "Stage 3 (Fwd) threshold: promote hits w/ P <= F3",             7 },
79   { "--nobias",     eslARG_NONE,       NULL, NULL, NULL,      NULL,  NULL, "--max",     "turn off composition bias filter",                             7 },
80   /* Control of E-value calibration */
81   { "--EmL",        eslARG_INT,       "200", NULL,"n>0",      NULL,  NULL, NULL,        "length of sequences for MSV Gumbel mu fit",                   11 },
82   { "--EmN",        eslARG_INT,       "200", NULL,"n>0",      NULL,  NULL, NULL,        "number of sequences for MSV Gumbel mu fit",                   11 },
83   { "--EvL",        eslARG_INT,       "200", NULL,"n>0",      NULL,  NULL, NULL,        "length of sequences for Viterbi Gumbel mu fit",               11 },
84   { "--EvN",        eslARG_INT,       "200", NULL,"n>0",      NULL,  NULL, NULL,        "number of sequences for Viterbi Gumbel mu fit",               11 },
85   { "--EfL",        eslARG_INT,       "100", NULL,"n>0",      NULL,  NULL, NULL,        "length of sequences for Forward exp tail tau fit",            11 },
86   { "--EfN",        eslARG_INT,       "200", NULL,"n>0",      NULL,  NULL, NULL,        "number of sequences for Forward exp tail tau fit",            11 },
87   { "--Eft",        eslARG_REAL,     "0.04", NULL,"0<x<1",    NULL,  NULL, NULL,        "tail mass for Forward exponential tail tau fit",              11 },
88   /* Other options */
89   { "--seed",       eslARG_INT,        "42", NULL, "n>=0",    NULL,  NULL, NULL,        "set RNG seed to <n> (if 0: one-time arbitrary seed)",         12 },
90   { "--nonull2",    eslARG_NONE,       NULL, NULL, NULL,      NULL,  NULL, NULL,        "turn off biased composition score corrections",               12 },
91   { "-Z",           eslARG_REAL,      FALSE, NULL, "x>0",     NULL,  NULL, NULL,        "set # of comparisons done, for E-value calculation",          12 },
92   { "--domZ",       eslARG_REAL,      FALSE, NULL, "x>0",     NULL,  NULL, NULL,        "set # of significant seqs, for domain E-value calculation",   12 },
93   { "--hmmdb",      eslARG_INT,       NULL,  NULL, "n>0",   NULL,  NULL,  "--seqdb",       "hmm database to search",                                      12 },
94   { "--seqdb",      eslARG_INT,         NULL,  NULL, "n>0",   NULL,  NULL,  "--hmmdb",       "protein database to search",                                  12 },
95   { "--seqdb_ranges",eslARG_STRING,     NULL,  NULL,  NULL,   NULL, "--seqdb", NULL,         "range(s) of sequences within --seqdb that will be searched",  12 },
96 
97 
98   /* name           type        default  env  range toggles reqs incomp  help                                          docgroup*/
99   { "-c",         eslARG_INT,       "1", NULL, NULL, NULL,  NULL, NULL,  "use alt genetic code of NCBI transl table <n>", 99 },
100   { "-l",         eslARG_INT,      "20", NULL, NULL, NULL,  NULL, NULL,  "minimum ORF length",                            99 },
101   { "-m",         eslARG_NONE,    FALSE, NULL, NULL, NULL,  NULL, "-M",  "ORFs must initiate with AUG only",              99 },
102   { "-M",         eslARG_NONE,    FALSE, NULL, NULL, NULL,  NULL, "-m",  "ORFs must start with allowed initiation codon", 99 },
103 //  { "-W",         eslARG_NONE,    FALSE, NULL, NULL, NULL,  NULL, NULL,  "use windowed, memory-efficient seq reading",    99 },
104   { "--informat", eslARG_STRING,  FALSE, NULL, NULL, NULL,  NULL, NULL,  "specify that input file is in format <s>",      99 },
105   { "--watson",   eslARG_NONE,    FALSE, NULL, NULL, NULL,  NULL, NULL,  "only translate top strand",                     99 },
106   { "--crick",    eslARG_NONE,    FALSE, NULL, NULL, NULL,  NULL, NULL,  "only translate bottom strand",                  99 },
107 
108 
109   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
110 };
111 
112 size_t
writen(int fd,const void * vptr,size_t n)113 writen(int fd, const void *vptr, size_t n)
114 {
115   ssize_t     remaining;
116   ssize_t     outn;
117   const char *ptr;
118 
119   ptr = vptr;
120   remaining = n;
121   while (remaining > 0) {
122     if ((outn = write(fd, ptr, remaining)) <= 0) {
123       if (outn < 0 && errno == EINTR) {
124         outn = 0;
125       } else {
126         return -1;
127       }
128     }
129 
130     remaining -= outn;
131     ptr += outn;
132   }
133 
134   return n;
135 }
136 
137 size_t
readn(int fd,void * vptr,size_t n)138 readn(int fd, void *vptr, size_t n)
139 {
140   size_t      remaining;
141   size_t      bytes;
142   char       *ptr;
143 
144   ptr = vptr;
145   remaining = n;
146   while (remaining > 0) {
147     if ((bytes = read(fd, ptr, remaining)) <= 0) {
148       if (errno == EINTR) {
149         bytes = 0;
150       } else {
151         return -1;
152       }
153     }
154 
155     remaining -= bytes;
156     ptr += bytes;
157   }
158 
159   return n - remaining;
160 }
161 
162 #define LOG_TO_STDOUT
163 #ifdef LOG_TO_STDOUT
p7_openlog(const char * ident,int option,int facility)164 void p7_openlog(const char *ident, int option, int facility)
165 {
166   /* do nothing */
167   return;
168 }
169 
p7_syslog(int priority,const char * format,...)170 void p7_syslog(int priority, const char *format, ...)
171 {
172   va_list ap;
173 
174   printf("\n*** ERROR ***\n");
175 
176   va_start(ap, format);
177   vprintf(format, ap);
178   va_end(ap);
179 
180   printf("\n");
181   fflush(stdout);
182 
183   return;
184 }
p7_closelog(void)185 void p7_closelog(void)
186 {
187   /* do nothing */
188   return;
189 }
190 #endif
191 
192 int
process_searchopts(int fd,char * cmdstr,ESL_GETOPTS ** ret_opts)193 process_searchopts(int fd, char *cmdstr, ESL_GETOPTS **ret_opts)
194 {
195   int status;
196 
197   ESL_GETOPTS *go = NULL;
198 
199   if ((go = esl_getopts_Create(searchOpts))       == NULL)  return eslEMEM;
200   if ((status = esl_opt_ProcessSpoof(go, cmdstr)) != eslOK) return status;
201   if ((status = esl_opt_VerifyConfig(go))         != eslOK) return status;
202 
203   *ret_opts = go;
204   return eslOK;
205 }
206 
207 void
free_QueueData(QUEUE_DATA * data)208 free_QueueData(QUEUE_DATA *data)
209 {
210   /* free the query data */
211   esl_getopts_Destroy(data->opts);
212 
213   if (data->abc != NULL) esl_alphabet_Destroy(data->abc);
214   if (data->hmm != NULL) p7_hmm_Destroy(data->hmm);
215   if (data->seq != NULL) esl_sq_Destroy(data->seq);
216   if (data->cmd != NULL) free(data->cmd);
217   memset(data, 0, sizeof(*data));
218   free(data);
219 }
220 
221 /* Function:  hmmpgmd_IsWithinRanges()
222  * Synopsis:  Test if the given id falls within one of a collection of ranges
223  *
224  * Purpose:   Given an index <sq_idx> and a number <N> of ranges stored in two
225  *            parallel arrays of start (<range_starts>) and end (<range_ends>)
226  *            positions, return TRUE if sq_idx falls in one of the ranges.
227  *            Otherwise return FALSE;
228  *
229  * Returns:   <TRUE> if within range(s), otherwise <FALSE>
230  */
231 int
hmmpgmd_IsWithinRanges(int64_t sq_idx,RANGE_LIST * list)232 hmmpgmd_IsWithinRanges (int64_t sq_idx, RANGE_LIST *list )  {
233   int i;
234   for (i=0; i<list->N; i++) {
235     if (sq_idx >= list->starts[i] && sq_idx <= list->ends[i] )
236       return TRUE;
237   }
238   return FALSE;
239 }
240 
241 
242 /* Function:  hmmpgmd_GetRanges()
243  * Synopsis:  Parse command flag into range(s)
244  *
245  * Purpose:   Given a command flag string <rangestr> of the form
246  *            <start1>..<end1>,<start2>..<end2>...
247  *            parse the string into a RANGE_LIST <list>
248  *
249  * Returns:   <eslOK> on success <TRUE>, <eslEMEM> on memory allocation failure,
250  *            otherwise <eslESYNTAX> or <eslFAIL> on parsing errors.
251  */
252 int
hmmpgmd_GetRanges(RANGE_LIST * list,char * rangestr)253 hmmpgmd_GetRanges (RANGE_LIST *list, char *rangestr)  {
254   char *range;
255   char *rangestr_cpy;
256   char *rangestr_cpy_ptr;
257   int status;
258 
259   list->N      = 0;
260   list->starts = NULL;
261   list->ends   = NULL;
262 
263   //first pass to figure out how much to allocate
264   esl_strdup(rangestr, -1, &rangestr_cpy); // do this because esl_strtok modifies the string, and we shouldn't change the opts value
265   rangestr_cpy_ptr = rangestr_cpy;         // do this because esl_strtok advances the pointer on the target string, but we need to free it
266   while ( (status = esl_strtok(&rangestr_cpy, ",", &range) ) == eslOK)  list->N++;
267   ESL_ALLOC(list->starts, list->N * sizeof(int));
268   ESL_ALLOC(list->ends,   list->N * sizeof(int));
269   free(rangestr_cpy_ptr);
270 
271   //2nd pass to get the values
272   list->N = 0;
273   esl_strdup(rangestr, -1, &rangestr_cpy);
274   rangestr_cpy_ptr = rangestr_cpy;
275   while ( (status = esl_strtok(&rangestr_cpy, ",", &range) ) == eslOK) {
276     status = esl_regexp_ParseCoordString(range, list->starts + list->N, list->ends + list->N);
277     if (status == eslESYNTAX) esl_fatal("--seqdb_ranges takes coords <from>..<to>; %s not recognized", range);
278     if (status == eslFAIL)    esl_fatal("Failed to find <from> or <to> coord in %s", range);
279     list->N++;
280   }
281   free(rangestr_cpy_ptr);
282 
283   return eslOK;
284 
285 ERROR:
286   return eslEMEM;
287 }
288 
289 #endif /*HMMER_THREADS*/
290