1 /* Fetch a sequence (or part of one) from a sequence flatfile.
2  *
3  * From squid's sfetch and ffetch
4  * SRE, Mon Mar 31 16:12:50 2008 [Janelia]
5  */
6 #include "esl_config.h"
7 
8 #include <stdlib.h>
9 #include <stdio.h>
10 #include <string.h>
11 
12 #include "easel.h"
13 #include "esl_getopts.h"
14 #include "esl_fileparser.h"
15 #include "esl_keyhash.h"
16 #include "esl_regexp.h"
17 #include "esl_ssi.h"
18 #include "esl_sq.h"
19 #include "esl_sqio.h"
20 
21 static char banner[] = "retrieve sequence(s) from a file";
22 static char usage1[] = "[options] <sqfile> <name>        (one seq named <name>)";
23 static char usage2[] = "[options] -f <sqfile> <namefile> (all seqs in <namefile>)";
24 static char usage3[] = "[options] --index <sqfile>       (index <sqfile>)";
25 
26 static void
cmdline_failure(char * argv0,char * format,...)27 cmdline_failure(char *argv0, char *format, ...)
28 {
29   va_list argp;
30   va_start(argp, format);
31   vfprintf(stderr, format, argp);
32   va_end(argp);
33   esl_usage(stdout, argv0, usage1);
34   esl_usage(stdout, argv0, usage2);
35   esl_usage(stdout, argv0, usage3);
36   printf("\nTo see more help on available options, do %s -h\n\n", argv0);
37   exit(1);
38 }
39 
40 static void
cmdline_help(char * argv0,ESL_GETOPTS * go)41 cmdline_help(char *argv0, ESL_GETOPTS *go)
42 {
43   esl_banner(stdout, argv0, banner);
44   esl_usage (stdout, argv0, usage1);
45   esl_usage (stdout, argv0, usage2);
46   esl_usage (stdout, argv0, usage3);
47   puts("\n where general options are:");
48   esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
49   puts("\n Options for retrieving subsequences:");
50   esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
51   puts("\n  On command line, subseq coords are separated by any nonnumeric, nonspace character(s).");
52   puts("  for example, -c 23..100 or -c 23/100 or -c 23-100 all work.\n");
53   puts("  Additionally, to retrieve a suffix to the end, omit the end coord or set it to zero; -c 23.. ");
54   puts("  will work, as will -c 23..0\n");
55   puts("  By default, the subseq will be named <source name>/<from>-<to>. To assign a name of");
56   puts("  your choice, use -n <newname>.\n");
57   puts("  In retrieving subsequences listed in a file (-C -f, or just -Cf), each line of the file");
58   puts("  is in GDF format: <newname> <from> <to> <source seqname>, space/tab delimited.\n");
59   puts("  When <start> coordinate is greater than <end>, for DNA or RNA, the reverse complement is");
60   puts("  retrieved; in protein sequence, this is an error. The -r option is another way to revcomp.");
61   puts("\n other options:");
62   esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
63   exit(0);
64 }
65 
66 static ESL_OPTIONS options[] = {
67   /* name          type           default env   range togs  reqs               incomp                help                                                 docgroup */
68   { "-h",          eslARG_NONE,   FALSE,  NULL, NULL, NULL, NULL,              NULL,                 "help; show brief info on version and usage",        1 },
69   { "-o",          eslARG_OUTFILE,FALSE,  NULL, NULL, NULL, NULL,              "-O,--index",         "output sequences to file <f> instead of stdout",    1 },
70   { "-O",          eslARG_NONE,   FALSE,  NULL, NULL, NULL, NULL,              "-o,-f,--index",      "output sequence to file named <key>",               1 },
71   { "-n",          eslARG_STRING, FALSE,  NULL, NULL, NULL, NULL,              "-f,--index",         "rename the sequence <s>",                           1 },
72   { "-r",          eslARG_NONE,   FALSE,  NULL, NULL, NULL, NULL,              "--index",            "reverse complement the seq(s)",                     1 },
73 
74 
75   { "-c",          eslARG_STRING, FALSE,  NULL, NULL, NULL, NULL,              "-f,--index",         "retrieve subsequence coords <from>..<to>",          2 },
76   { "-C",          eslARG_NONE,   FALSE,  NULL, NULL, NULL, "-f",              "--index",            "<namefile> in <f> contains subseq coords too",      2 },
77 
78   { "--informat",  eslARG_STRING, FALSE,  NULL, NULL, NULL, NULL,              NULL,                 "specify that input file is in format <s>",          3 },
79 
80   /* undocumented as options, because they're documented as alternative invocations: */
81   { "-f",          eslARG_NONE,  FALSE,   NULL, NULL, NULL, NULL,              "--index",           "second cmdline arg is a file of names to retrieve", 99 },
82   { "--index",     eslARG_NONE,  FALSE,   NULL, NULL, NULL, NULL,               NULL,               "index <sqfile>, creating <sqfile>.ssi",             99 },
83 
84  { 0,0,0,0,0,0,0,0,0,0 },
85 };
86 
87 static void create_ssi_index(ESL_GETOPTS *go, ESL_SQFILE *sqfp);
88 static void multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp);
89 static void onefetch(ESL_GETOPTS *go, FILE *ofp, char *key, ESL_SQFILE *sqfp);
90 static void multifetch_subseq(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp);
91 static void onefetch_subseq(ESL_GETOPTS *go, FILE *ofp, ESL_SQFILE *sqfp, char *newname,
92 			    char *key, uint32_t given_start, uint32_t given_end);
93 
94 int
main(int argc,char ** argv)95 main(int argc, char **argv)
96 {
97   ESL_GETOPTS  *go      = NULL;	                        /* application configuration       */
98   char         *seqfile = NULL;	                        /* sequence file name              */
99   int           infmt   = eslSQFILE_UNKNOWN;		/* format code for seqfile         */
100   ESL_SQFILE   *sqfp    = NULL;                         /* open sequence file              */
101   FILE         *ofp     = NULL;	                        /* output stream for sequences     */
102   int           status;		                        /* easel return code               */
103 
104   /***********************************************
105    * Parse command line
106    ***********************************************/
107 
108   go = esl_getopts_Create(options);
109   if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
110   if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n",       go->errbuf);
111   if (esl_opt_GetBoolean(go, "-h") )                   cmdline_help   (argv[0], go);
112   if (esl_opt_ArgNumber(go) < 1)                       cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");
113 
114   /* Open the sequence file */
115   seqfile = esl_opt_GetArg(go, 1);
116   if (esl_opt_GetString(go, "--informat") != NULL) {
117     infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
118     if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat"));
119   }
120   status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
121   if      (status == eslENOTFOUND) cmdline_failure(argv[0], "Sequence file %s not found.\n",     seqfile);
122   else if (status == eslEFORMAT)   cmdline_failure(argv[0], "Format of file %s unrecognized.\n", seqfile);
123   else if (status == eslEINVAL)    cmdline_failure(argv[0], "Can't autodetect stdin or .gz.\n");
124   else if (status != eslOK)        cmdline_failure(argv[0], "Open failed, code %d.\n", status);
125 
126   /* Open the output file, if any */
127   if (esl_opt_GetBoolean(go, "-O"))
128     {
129       if ((ofp = fopen(esl_opt_GetArg(go, 2), "w")) == NULL)
130 	cmdline_failure(argv[0], "Failed to open output file %s\n", esl_opt_GetArg(go, 2));
131     }
132   else if (esl_opt_GetString(go, "-o") != NULL)
133     {
134       if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL)
135 	cmdline_failure(argv[0], "Failed to open output file %s\n", esl_opt_GetString(go, "-o"));
136     }
137   else ofp = stdout;
138 
139   /* Indexing  mode */
140   if (esl_opt_GetBoolean(go, "--index"))
141     {
142       if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");
143       if (sqfp->data.ascii.do_gzip)   cmdline_failure(argv[0], "Can't index a .gz compressed file");
144       if (sqfp->data.ascii.do_stdin)  cmdline_failure(argv[0], "Can't index a standard input pipe");
145 
146       create_ssi_index(go, sqfp);
147     }
148 
149   /* List retrieval mode */
150   else if (esl_opt_GetBoolean(go, "-f"))
151     {
152       if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");
153 
154       /* Open the SSI index for retrieval */
155       if (! sqfp->data.ascii.do_gzip && ! sqfp->data.ascii.do_stdin &&  ! esl_sqio_IsAlignment(sqfp->format))
156 	{
157 	  status = esl_sqfile_OpenSSI(sqfp, NULL);
158 	  if      (status == eslEFORMAT)   cmdline_failure(argv[0], "SSI index is in incorrect format\n");
159 	  else if (status == eslERANGE)    cmdline_failure(argv[0], "SSI index is in 64-bit format and we can't read it\n");
160 	  else if (status != eslOK)        cmdline_failure(argv[0], "Failed to open SSI index\n");
161 	}
162 
163       if (esl_opt_GetBoolean(go, "-C")) multifetch_subseq(go, ofp, esl_opt_GetArg(go, 2), sqfp);
164       else              	        multifetch       (go, ofp, esl_opt_GetArg(go, 2), sqfp);
165     }
166 
167   /* Single sequence retrieval mode */
168   else
169     {
170       if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");
171       char *key     = esl_opt_GetArg(go, 2);
172       char *cstring = esl_opt_GetString(go, "-c");
173       char *newname = esl_opt_GetString(go, "-n");
174 
175       /* Open the SSI index for retrieval */
176       if (! sqfp->data.ascii.do_gzip && ! sqfp->data.ascii.do_stdin &&  ! esl_sqio_IsAlignment(sqfp->format))
177 	{
178 	  status = esl_sqfile_OpenSSI(sqfp, NULL);
179 	  if      (status == eslEFORMAT)   cmdline_failure(argv[0], "SSI index is in incorrect format\n");
180 	  else if (status == eslERANGE)    cmdline_failure(argv[0], "SSI index is in 64-bit format and we can't read it\n");
181 	  else if (status != eslOK)        cmdline_failure(argv[0], "Failed to open SSI index\n");
182 	}
183 
184       /* -c: subsequence retrieval; else full sequence retrieval */
185       if (cstring != NULL)
186 	{
187 	  uint32_t start, end;
188 
189 	  status = esl_regexp_ParseCoordString(cstring, &start, &end);
190 	  if (status == eslESYNTAX) esl_fatal("-c takes arg of subseq coords <from>..<to>; %s not recognized", cstring);
191 	  if (status == eslFAIL)    esl_fatal("Failed to find <from> or <to> coord in %s", cstring);
192 
193 	  onefetch_subseq(go, ofp, sqfp, newname, key, start, end);
194 	  if (ofp != stdout) printf("\n\nRetrieved subsequence %s/%d-%d.\n",  key, start, end);
195 	}
196       else
197 	{
198 	  onefetch(go, ofp, esl_opt_GetArg(go, 2), sqfp);
199 	  if (ofp != stdout) printf("\n\nRetrieved sequence %s.\n",  esl_opt_GetArg(go, 2));
200 	}
201     }
202 
203   esl_sqfile_Close(sqfp);
204   esl_getopts_Destroy(go);
205   return 0;
206 }
207 
208 
209 /* Create an SSI index file for open sequence file <sqfp>.
210  * Both name and accession of sequences are stored as keys.
211  */
212 static void
create_ssi_index(ESL_GETOPTS * go,ESL_SQFILE * sqfp)213 create_ssi_index(ESL_GETOPTS *go, ESL_SQFILE *sqfp)
214 {
215   ESL_NEWSSI *ns      = NULL;
216   ESL_SQ     *sq      = esl_sq_Create();
217   int         nseq    = 0;
218   char       *ssifile = NULL;
219   uint16_t    fh;
220   int         status;
221 
222   esl_strdup(sqfp->filename, -1, &ssifile);
223   esl_strcat(&ssifile, -1, ".ssi", 4);
224   status = esl_newssi_Open(ssifile, TRUE, &ns); /* TRUE is for allowing overwrite. */
225   if      (status == eslENOTFOUND)   esl_fatal("failed to open SSI index %s", ssifile);
226   else if (status == eslEOVERWRITE)  esl_fatal("SSI index %s already exists; delete or rename it", ssifile); /* won't happen, see TRUE above... */
227   else if (status != eslOK)          esl_fatal("failed to create a new SSI index");
228 
229   if (esl_newssi_AddFile(ns, sqfp->filename, sqfp->format, &fh) != eslOK)
230     esl_fatal("Failed to add sequence file %s to new SSI index\n", sqfp->filename);
231 
232   printf("Creating SSI index for %s...    ", sqfp->filename);
233   fflush(stdout);
234 
235   while ((status = esl_sqio_ReadInfo(sqfp, sq)) == eslOK)
236     {
237       nseq++;
238       if (sq->name == NULL) esl_fatal("Every sequence must have a name to be indexed. Failed to find name of seq #%d\n", nseq);
239 
240       if (esl_newssi_AddKey(ns, sq->name, fh, sq->roff, sq->doff, sq->L) != eslOK)
241 	esl_fatal("Failed to add key %s to SSI index", sq->name);
242 
243       if (sq->acc[0] != '\0') {
244 	if (esl_newssi_AddAlias(ns, sq->acc, sq->name) != eslOK)
245 	  esl_fatal("Failed to add secondary key %s to SSI index", sq->acc);
246       }
247       esl_sq_Reuse(sq);
248     }
249   if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
250 					   sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
251   else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
252 					    status, sqfp->filename);
253 
254   /* Determine if the file was suitable for fast subseq lookup. */
255   if (sqfp->data.ascii.bpl > 0 && sqfp->data.ascii.rpl > 0) {
256     if ((status = esl_newssi_SetSubseq(ns, fh, sqfp->data.ascii.bpl, sqfp->data.ascii.rpl)) != eslOK)
257       esl_fatal("Failed to set %s for fast subseq lookup.");
258   }
259 
260   /* Save the SSI file to disk */
261   if (esl_newssi_Write(ns) != eslOK)
262     esl_fatal("\nFailed to write keys to ssi file %s:\n  %s", ssifile, ns->errbuf);
263 
264   /* Done - output and exit. */
265   printf("done.\n");
266   if (ns->nsecondary > 0)
267     printf("Indexed %d sequences (%ld names and %ld accessions).\n", nseq, (long) ns->nprimary, (long) ns->nsecondary);
268   else
269     printf("Indexed %d sequences (%ld names).\n", nseq, (long) ns->nprimary);
270   printf("SSI index written to file %s\n", ssifile);
271 
272   free(ssifile);
273   esl_sq_Destroy(sq);
274   esl_newssi_Close(ns);
275   return;
276 }
277 
278 /* multifetch:
279  * given a file containing lines with one name or key per line;
280  * parse the file line-by-line;
281  * if we have an SSI index available, retrieve the seqs by key
282  * as we see each line;
283  * else, without an SSI index, store the keys in a hash, then
284  * read the entire seq file in a single pass, outputting seqs
285  * that are in our keylist.
286  *
287  * Note that with an SSI index, you get the seqs in the order they
288  * appear in the <keyfile>, but without an SSI index, you get seqs in
289  * the order they occur in the seq file.
290  */
291 static void
multifetch(ESL_GETOPTS * go,FILE * ofp,char * keyfile,ESL_SQFILE * sqfp)292 multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp)
293 {
294   ESL_KEYHASH    *keys   = esl_keyhash_Create();
295   ESL_FILEPARSER *efp    = NULL;
296   int             nseq   = 0;
297   int             nkeys  = 0;
298   char           *key;
299   int             keylen;
300   int             keyidx;
301   int             status;
302 
303 
304   if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK)  esl_fatal("Failed to open key file %s\n", keyfile);
305   esl_fileparser_SetCommentChar(efp, '#');
306 
307   while (esl_fileparser_NextLine(efp) == eslOK)
308     {
309       if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
310 	esl_fatal("Failed to read seq name on line %d of file %s\n", efp->linenumber, keyfile);
311 
312       status = esl_keyhash_Store(keys, key, keylen, &keyidx);
313       if (status == eslEDUP) esl_fatal("seq key %s occurs more than once in file %s\n", key, keyfile);
314 
315       /* if we have an SSI index, just fetch them as we go. */
316       if (sqfp->data.ascii.ssi != NULL) { onefetch(go, ofp, key, sqfp);  nseq++; }
317       nkeys++;
318     }
319 
320   /* If we don't have an SSI index, we haven't fetched anything yet; do it now. */
321   if (sqfp->data.ascii.ssi == NULL)
322     {
323       ESL_SQ *sq     = esl_sq_Create();
324 
325       while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
326 	{
327 	  if ( (sq->name[0] != '\0' && esl_keyhash_Lookup(keys, sq->name, -1, NULL) == eslOK) ||
328 	       (sq->acc[0]  != '\0' && esl_keyhash_Lookup(keys, sq->acc,  -1, NULL) == eslOK))
329 	    {
330 	      if (esl_opt_GetBoolean(go, "-r") )
331 		if (esl_sq_ReverseComplement(sq) != eslOK)
332 		  esl_fatal("Failed to reverse complement %s\n", sq->name);
333 	      esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
334 	      nseq++;
335 	    }
336 	  esl_sq_Reuse(sq);
337 	}
338       if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
339 					       sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
340       else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
341 					       status, sqfp->filename);
342       esl_sq_Destroy(sq);
343     }
344 
345   if (nkeys != nseq) esl_fatal("Tried to retrieve %d keys, but only retrieved %d sequences\n", nkeys, nseq);
346 
347   if (ofp != stdout) printf("\nRetrieved %d sequences.\n", nseq);
348 
349   esl_keyhash_Destroy(keys);
350   esl_fileparser_Close(efp);
351   return;
352 }
353 
354 
355 
356 /* onefetch():
357  * Given one <key> (a seq name or accession), retrieve the corresponding sequence.
358  * In SSI mode, we can do this quickly by positioning the file, then regurgitating
359  * every line until the end-of-record marker; we don't even have to parse.
360  * Without an SSI index, we have to parse the file sequentially 'til we find
361  * the one we're after.
362  */
363 static void
onefetch(ESL_GETOPTS * go,FILE * ofp,char * key,ESL_SQFILE * sqfp)364 onefetch(ESL_GETOPTS *go, FILE *ofp, char *key, ESL_SQFILE *sqfp)
365 {
366   ESL_SQ  *sq            = esl_sq_Create();
367   int      do_revcomp    = esl_opt_GetBoolean(go, "-r");
368   char    *newname       = esl_opt_GetString(go, "-n");
369   int      status;
370 
371   /* Try to position the file at the desired sequence with SSI. */
372   if (sqfp->data.ascii.ssi != NULL)
373     {
374       status = esl_sqfile_PositionByKey(sqfp, key);
375       if      (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", key, sqfp->filename);
376       else if (status == eslEFORMAT)   esl_fatal("Failed to parse SSI index for %s\n", sqfp->filename);
377       else if (status != eslOK)        esl_fatal("Failed to look up location of seq %s in SSI index of file %s\n", key, sqfp->filename);
378 
379       status = esl_sqio_Read(sqfp, sq);
380       if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
381 					       sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
382       else if (status == eslEOF)     esl_fatal("Unexpected EOF reading sequence file %s",
383 					       status, sqfp->filename);
384       else if (status != eslOK)      esl_fatal("Unexpected error %d reading sequence file %s",
385 					       status, sqfp->filename);
386 
387       if (strcmp(key, sq->name) != 0 && strcmp(key, sq->acc) != 0)
388 	esl_fatal("whoa, internal error; found the wrong sequence %s, not %s", sq->name, key);
389     }
390   else
391     { /* Else, we have to read the whole damn file sequentially until we find the seq */
392       while ((status = esl_sqio_Read(sqfp, sq)) != eslEOF) {
393 	if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
394 						 sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
395 	else if (status != eslOK)      esl_fatal("Unexpected error %d reading sequence file %s",
396 						 status, sqfp->filename);
397 
398 	if (strcmp(key, sq->name) == 0 || strcmp(key, sq->acc) == 0) break;
399 	esl_sq_Reuse(sq);
400       }
401       if (status == eslEOF) esl_fatal("Failed to find sequence %s in file %s\n", key, sqfp->filename);
402 
403     }
404 
405   if (do_revcomp == FALSE && newname == NULL && ! esl_sqio_IsAlignment(sqfp->format))
406     { /* If we're not manipulating the sequence in any way, and it's not from an alignment file, we can Echo() it. */
407       if (esl_sqio_Echo(sqfp, sq, ofp) != eslOK) esl_fatal("Echo failed: %s\n", esl_sqfile_GetErrorBuf(sqfp));
408     }
409   else
410     { /* Otherwise we Write() the parsed version. */
411       if (do_revcomp && esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);
412       if (newname != NULL) esl_sq_SetName(sq, newname);
413       esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
414     }
415 
416   esl_sq_Destroy(sq);
417 }
418 
419 static void
multifetch_subseq(ESL_GETOPTS * go,FILE * ofp,char * gdffile,ESL_SQFILE * sqfp)420 multifetch_subseq(ESL_GETOPTS *go, FILE *ofp, char *gdffile, ESL_SQFILE *sqfp)
421 {
422   ESL_FILEPARSER *efp    = NULL;
423   char           *newname;
424   char           *s;
425   int             n1, n2;
426   int             start, end;
427   char           *source;
428 
429   if (esl_fileparser_Open(gdffile, NULL, &efp) != eslOK)  esl_fatal("Failed to open key file %s\n", gdffile);
430   esl_fileparser_SetCommentChar(efp, '#');
431 
432   while (esl_fileparser_NextLine(efp) == eslOK)
433     {
434       if (esl_fileparser_GetTokenOnLine(efp, &newname, &n1) != eslOK)
435 	esl_fatal("Failed to read subseq name on line %d of file %s\n", efp->linenumber, gdffile);
436 
437       if (esl_fileparser_GetTokenOnLine(efp, &s, NULL) != eslOK)
438 	esl_fatal("Failed to read start coord on line %d of file %s\n", efp->linenumber, gdffile);
439       start = atoi(s);
440       if(start <= 0)
441 	esl_fatal("Read invalid start coord %d on line %d of file %s (must be positive integer)\n", start, efp->linenumber, gdffile);
442 
443       if (esl_fileparser_GetTokenOnLine(efp, &s, NULL) != eslOK)
444 	esl_fatal("Failed to read end coord on line %d of file %s\n", efp->linenumber, gdffile);
445       end   = atoi(s);
446       if(end < 0)
447 	esl_fatal("Read invalid end coord %d on line %d of file %s (must be positive integer, or 0 for full length)\n", end, efp->linenumber, gdffile);
448 
449       if (esl_fileparser_GetTokenOnLine(efp, &source, &n2) != eslOK)
450 	esl_fatal("Failed to read source seq name on line %d of file %s\n", efp->linenumber, gdffile);
451 
452       onefetch_subseq(go, ofp, sqfp, newname, source, start, end);
453     }
454   esl_fileparser_Close(efp);
455 }
456 
457 static void
onefetch_subseq(ESL_GETOPTS * go,FILE * ofp,ESL_SQFILE * sqfp,char * newname,char * key,uint32_t given_start,uint32_t given_end)458 onefetch_subseq(ESL_GETOPTS *go, FILE *ofp, ESL_SQFILE *sqfp, char *newname, char *key, uint32_t given_start, uint32_t given_end)
459 {
460   int    start, end;
461   int    do_revcomp;
462   ESL_SQ *sq = esl_sq_Create();
463 
464   if (sqfp->data.ascii.ssi == NULL) esl_fatal("no ssi index");
465 
466   /* reverse complement indicated by coords. */
467   /* -c 52: would be 52,0, so watch out for given_end = 0 case */
468   if (given_end != 0 && given_start > given_end)
469     { start = given_end;   end = given_start; do_revcomp = TRUE;  }
470   else
471     { start = given_start; end = given_end;   do_revcomp = FALSE; }
472 
473   if (esl_sqio_FetchSubseq(sqfp, key, start, end, sq) != eslOK) esl_fatal(esl_sqfile_GetErrorBuf(sqfp));
474 
475   if      (newname != NULL) esl_sq_SetName(sq, newname);
476   else                      esl_sq_FormatName(sq, "%s/%u-%" PRId64, key, given_start, (given_end == 0) ? sq->L : given_end);
477 
478   /* Two ways we might have been asked to revcomp: by coord, or by -r option */
479   /* (If both happen, they'll cancel each other out) */
480   if (do_revcomp)
481     if (esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);
482   if (esl_opt_GetBoolean(go, "-r"))
483     if (esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);
484 
485   esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
486   esl_sq_Destroy(sq);
487 }
488 
489 
490