1 /* Fetch a sequence (or part of one) from a sequence flatfile.
2 *
3 * From squid's sfetch and ffetch
4 * SRE, Mon Mar 31 16:12:50 2008 [Janelia]
5 */
6 #include "esl_config.h"
7
8 #include <stdlib.h>
9 #include <stdio.h>
10 #include <string.h>
11
12 #include "easel.h"
13 #include "esl_getopts.h"
14 #include "esl_fileparser.h"
15 #include "esl_keyhash.h"
16 #include "esl_regexp.h"
17 #include "esl_ssi.h"
18 #include "esl_sq.h"
19 #include "esl_sqio.h"
20
21 static char banner[] = "retrieve sequence(s) from a file";
22 static char usage1[] = "[options] <sqfile> <name> (one seq named <name>)";
23 static char usage2[] = "[options] -f <sqfile> <namefile> (all seqs in <namefile>)";
24 static char usage3[] = "[options] --index <sqfile> (index <sqfile>)";
25
26 static void
cmdline_failure(char * argv0,char * format,...)27 cmdline_failure(char *argv0, char *format, ...)
28 {
29 va_list argp;
30 va_start(argp, format);
31 vfprintf(stderr, format, argp);
32 va_end(argp);
33 esl_usage(stdout, argv0, usage1);
34 esl_usage(stdout, argv0, usage2);
35 esl_usage(stdout, argv0, usage3);
36 printf("\nTo see more help on available options, do %s -h\n\n", argv0);
37 exit(1);
38 }
39
40 static void
cmdline_help(char * argv0,ESL_GETOPTS * go)41 cmdline_help(char *argv0, ESL_GETOPTS *go)
42 {
43 esl_banner(stdout, argv0, banner);
44 esl_usage (stdout, argv0, usage1);
45 esl_usage (stdout, argv0, usage2);
46 esl_usage (stdout, argv0, usage3);
47 puts("\n where general options are:");
48 esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
49 puts("\n Options for retrieving subsequences:");
50 esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
51 puts("\n On command line, subseq coords are separated by any nonnumeric, nonspace character(s).");
52 puts(" for example, -c 23..100 or -c 23/100 or -c 23-100 all work.\n");
53 puts(" Additionally, to retrieve a suffix to the end, omit the end coord or set it to zero; -c 23.. ");
54 puts(" will work, as will -c 23..0\n");
55 puts(" By default, the subseq will be named <source name>/<from>-<to>. To assign a name of");
56 puts(" your choice, use -n <newname>.\n");
57 puts(" In retrieving subsequences listed in a file (-C -f, or just -Cf), each line of the file");
58 puts(" is in GDF format: <newname> <from> <to> <source seqname>, space/tab delimited.\n");
59 puts(" When <start> coordinate is greater than <end>, for DNA or RNA, the reverse complement is");
60 puts(" retrieved; in protein sequence, this is an error. The -r option is another way to revcomp.");
61 puts("\n other options:");
62 esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
63 exit(0);
64 }
65
66 static ESL_OPTIONS options[] = {
67 /* name type default env range togs reqs incomp help docgroup */
68 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "help; show brief info on version and usage", 1 },
69 { "-o", eslARG_OUTFILE,FALSE, NULL, NULL, NULL, NULL, "-O,--index", "output sequences to file <f> instead of stdout", 1 },
70 { "-O", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-o,-f,--index", "output sequence to file named <key>", 1 },
71 { "-n", eslARG_STRING, FALSE, NULL, NULL, NULL, NULL, "-f,--index", "rename the sequence <s>", 1 },
72 { "-r", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "--index", "reverse complement the seq(s)", 1 },
73
74
75 { "-c", eslARG_STRING, FALSE, NULL, NULL, NULL, NULL, "-f,--index", "retrieve subsequence coords <from>..<to>", 2 },
76 { "-C", eslARG_NONE, FALSE, NULL, NULL, NULL, "-f", "--index", "<namefile> in <f> contains subseq coords too", 2 },
77
78 { "--informat", eslARG_STRING, FALSE, NULL, NULL, NULL, NULL, NULL, "specify that input file is in format <s>", 3 },
79
80 /* undocumented as options, because they're documented as alternative invocations: */
81 { "-f", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "--index", "second cmdline arg is a file of names to retrieve", 99 },
82 { "--index", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "index <sqfile>, creating <sqfile>.ssi", 99 },
83
84 { 0,0,0,0,0,0,0,0,0,0 },
85 };
86
87 static void create_ssi_index(ESL_GETOPTS *go, ESL_SQFILE *sqfp);
88 static void multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp);
89 static void onefetch(ESL_GETOPTS *go, FILE *ofp, char *key, ESL_SQFILE *sqfp);
90 static void multifetch_subseq(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp);
91 static void onefetch_subseq(ESL_GETOPTS *go, FILE *ofp, ESL_SQFILE *sqfp, char *newname,
92 char *key, uint32_t given_start, uint32_t given_end);
93
94 int
main(int argc,char ** argv)95 main(int argc, char **argv)
96 {
97 ESL_GETOPTS *go = NULL; /* application configuration */
98 char *seqfile = NULL; /* sequence file name */
99 int infmt = eslSQFILE_UNKNOWN; /* format code for seqfile */
100 ESL_SQFILE *sqfp = NULL; /* open sequence file */
101 FILE *ofp = NULL; /* output stream for sequences */
102 int status; /* easel return code */
103
104 /***********************************************
105 * Parse command line
106 ***********************************************/
107
108 go = esl_getopts_Create(options);
109 if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
110 if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in configuration: %s\n", go->errbuf);
111 if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go);
112 if (esl_opt_ArgNumber(go) < 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");
113
114 /* Open the sequence file */
115 seqfile = esl_opt_GetArg(go, 1);
116 if (esl_opt_GetString(go, "--informat") != NULL) {
117 infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
118 if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat"));
119 }
120 status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
121 if (status == eslENOTFOUND) cmdline_failure(argv[0], "Sequence file %s not found.\n", seqfile);
122 else if (status == eslEFORMAT) cmdline_failure(argv[0], "Format of file %s unrecognized.\n", seqfile);
123 else if (status == eslEINVAL) cmdline_failure(argv[0], "Can't autodetect stdin or .gz.\n");
124 else if (status != eslOK) cmdline_failure(argv[0], "Open failed, code %d.\n", status);
125
126 /* Open the output file, if any */
127 if (esl_opt_GetBoolean(go, "-O"))
128 {
129 if ((ofp = fopen(esl_opt_GetArg(go, 2), "w")) == NULL)
130 cmdline_failure(argv[0], "Failed to open output file %s\n", esl_opt_GetArg(go, 2));
131 }
132 else if (esl_opt_GetString(go, "-o") != NULL)
133 {
134 if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL)
135 cmdline_failure(argv[0], "Failed to open output file %s\n", esl_opt_GetString(go, "-o"));
136 }
137 else ofp = stdout;
138
139 /* Indexing mode */
140 if (esl_opt_GetBoolean(go, "--index"))
141 {
142 if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");
143 if (sqfp->data.ascii.do_gzip) cmdline_failure(argv[0], "Can't index a .gz compressed file");
144 if (sqfp->data.ascii.do_stdin) cmdline_failure(argv[0], "Can't index a standard input pipe");
145
146 create_ssi_index(go, sqfp);
147 }
148
149 /* List retrieval mode */
150 else if (esl_opt_GetBoolean(go, "-f"))
151 {
152 if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");
153
154 /* Open the SSI index for retrieval */
155 if (! sqfp->data.ascii.do_gzip && ! sqfp->data.ascii.do_stdin && ! esl_sqio_IsAlignment(sqfp->format))
156 {
157 status = esl_sqfile_OpenSSI(sqfp, NULL);
158 if (status == eslEFORMAT) cmdline_failure(argv[0], "SSI index is in incorrect format\n");
159 else if (status == eslERANGE) cmdline_failure(argv[0], "SSI index is in 64-bit format and we can't read it\n");
160 else if (status != eslOK) cmdline_failure(argv[0], "Failed to open SSI index\n");
161 }
162
163 if (esl_opt_GetBoolean(go, "-C")) multifetch_subseq(go, ofp, esl_opt_GetArg(go, 2), sqfp);
164 else multifetch (go, ofp, esl_opt_GetArg(go, 2), sqfp);
165 }
166
167 /* Single sequence retrieval mode */
168 else
169 {
170 if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");
171 char *key = esl_opt_GetArg(go, 2);
172 char *cstring = esl_opt_GetString(go, "-c");
173 char *newname = esl_opt_GetString(go, "-n");
174
175 /* Open the SSI index for retrieval */
176 if (! sqfp->data.ascii.do_gzip && ! sqfp->data.ascii.do_stdin && ! esl_sqio_IsAlignment(sqfp->format))
177 {
178 status = esl_sqfile_OpenSSI(sqfp, NULL);
179 if (status == eslEFORMAT) cmdline_failure(argv[0], "SSI index is in incorrect format\n");
180 else if (status == eslERANGE) cmdline_failure(argv[0], "SSI index is in 64-bit format and we can't read it\n");
181 else if (status != eslOK) cmdline_failure(argv[0], "Failed to open SSI index\n");
182 }
183
184 /* -c: subsequence retrieval; else full sequence retrieval */
185 if (cstring != NULL)
186 {
187 uint32_t start, end;
188
189 status = esl_regexp_ParseCoordString(cstring, &start, &end);
190 if (status == eslESYNTAX) esl_fatal("-c takes arg of subseq coords <from>..<to>; %s not recognized", cstring);
191 if (status == eslFAIL) esl_fatal("Failed to find <from> or <to> coord in %s", cstring);
192
193 onefetch_subseq(go, ofp, sqfp, newname, key, start, end);
194 if (ofp != stdout) printf("\n\nRetrieved subsequence %s/%d-%d.\n", key, start, end);
195 }
196 else
197 {
198 onefetch(go, ofp, esl_opt_GetArg(go, 2), sqfp);
199 if (ofp != stdout) printf("\n\nRetrieved sequence %s.\n", esl_opt_GetArg(go, 2));
200 }
201 }
202
203 esl_sqfile_Close(sqfp);
204 esl_getopts_Destroy(go);
205 return 0;
206 }
207
208
209 /* Create an SSI index file for open sequence file <sqfp>.
210 * Both name and accession of sequences are stored as keys.
211 */
212 static void
create_ssi_index(ESL_GETOPTS * go,ESL_SQFILE * sqfp)213 create_ssi_index(ESL_GETOPTS *go, ESL_SQFILE *sqfp)
214 {
215 ESL_NEWSSI *ns = NULL;
216 ESL_SQ *sq = esl_sq_Create();
217 int nseq = 0;
218 char *ssifile = NULL;
219 uint16_t fh;
220 int status;
221
222 esl_strdup(sqfp->filename, -1, &ssifile);
223 esl_strcat(&ssifile, -1, ".ssi", 4);
224 status = esl_newssi_Open(ssifile, TRUE, &ns); /* TRUE is for allowing overwrite. */
225 if (status == eslENOTFOUND) esl_fatal("failed to open SSI index %s", ssifile);
226 else if (status == eslEOVERWRITE) esl_fatal("SSI index %s already exists; delete or rename it", ssifile); /* won't happen, see TRUE above... */
227 else if (status != eslOK) esl_fatal("failed to create a new SSI index");
228
229 if (esl_newssi_AddFile(ns, sqfp->filename, sqfp->format, &fh) != eslOK)
230 esl_fatal("Failed to add sequence file %s to new SSI index\n", sqfp->filename);
231
232 printf("Creating SSI index for %s... ", sqfp->filename);
233 fflush(stdout);
234
235 while ((status = esl_sqio_ReadInfo(sqfp, sq)) == eslOK)
236 {
237 nseq++;
238 if (sq->name == NULL) esl_fatal("Every sequence must have a name to be indexed. Failed to find name of seq #%d\n", nseq);
239
240 if (esl_newssi_AddKey(ns, sq->name, fh, sq->roff, sq->doff, sq->L) != eslOK)
241 esl_fatal("Failed to add key %s to SSI index", sq->name);
242
243 if (sq->acc[0] != '\0') {
244 if (esl_newssi_AddAlias(ns, sq->acc, sq->name) != eslOK)
245 esl_fatal("Failed to add secondary key %s to SSI index", sq->acc);
246 }
247 esl_sq_Reuse(sq);
248 }
249 if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
250 sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
251 else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s",
252 status, sqfp->filename);
253
254 /* Determine if the file was suitable for fast subseq lookup. */
255 if (sqfp->data.ascii.bpl > 0 && sqfp->data.ascii.rpl > 0) {
256 if ((status = esl_newssi_SetSubseq(ns, fh, sqfp->data.ascii.bpl, sqfp->data.ascii.rpl)) != eslOK)
257 esl_fatal("Failed to set %s for fast subseq lookup.");
258 }
259
260 /* Save the SSI file to disk */
261 if (esl_newssi_Write(ns) != eslOK)
262 esl_fatal("\nFailed to write keys to ssi file %s:\n %s", ssifile, ns->errbuf);
263
264 /* Done - output and exit. */
265 printf("done.\n");
266 if (ns->nsecondary > 0)
267 printf("Indexed %d sequences (%ld names and %ld accessions).\n", nseq, (long) ns->nprimary, (long) ns->nsecondary);
268 else
269 printf("Indexed %d sequences (%ld names).\n", nseq, (long) ns->nprimary);
270 printf("SSI index written to file %s\n", ssifile);
271
272 free(ssifile);
273 esl_sq_Destroy(sq);
274 esl_newssi_Close(ns);
275 return;
276 }
277
278 /* multifetch:
279 * given a file containing lines with one name or key per line;
280 * parse the file line-by-line;
281 * if we have an SSI index available, retrieve the seqs by key
282 * as we see each line;
283 * else, without an SSI index, store the keys in a hash, then
284 * read the entire seq file in a single pass, outputting seqs
285 * that are in our keylist.
286 *
287 * Note that with an SSI index, you get the seqs in the order they
288 * appear in the <keyfile>, but without an SSI index, you get seqs in
289 * the order they occur in the seq file.
290 */
291 static void
multifetch(ESL_GETOPTS * go,FILE * ofp,char * keyfile,ESL_SQFILE * sqfp)292 multifetch(ESL_GETOPTS *go, FILE *ofp, char *keyfile, ESL_SQFILE *sqfp)
293 {
294 ESL_KEYHASH *keys = esl_keyhash_Create();
295 ESL_FILEPARSER *efp = NULL;
296 int nseq = 0;
297 int nkeys = 0;
298 char *key;
299 int keylen;
300 int keyidx;
301 int status;
302
303
304 if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK) esl_fatal("Failed to open key file %s\n", keyfile);
305 esl_fileparser_SetCommentChar(efp, '#');
306
307 while (esl_fileparser_NextLine(efp) == eslOK)
308 {
309 if (esl_fileparser_GetTokenOnLine(efp, &key, &keylen) != eslOK)
310 esl_fatal("Failed to read seq name on line %d of file %s\n", efp->linenumber, keyfile);
311
312 status = esl_keyhash_Store(keys, key, keylen, &keyidx);
313 if (status == eslEDUP) esl_fatal("seq key %s occurs more than once in file %s\n", key, keyfile);
314
315 /* if we have an SSI index, just fetch them as we go. */
316 if (sqfp->data.ascii.ssi != NULL) { onefetch(go, ofp, key, sqfp); nseq++; }
317 nkeys++;
318 }
319
320 /* If we don't have an SSI index, we haven't fetched anything yet; do it now. */
321 if (sqfp->data.ascii.ssi == NULL)
322 {
323 ESL_SQ *sq = esl_sq_Create();
324
325 while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
326 {
327 if ( (sq->name[0] != '\0' && esl_keyhash_Lookup(keys, sq->name, -1, NULL) == eslOK) ||
328 (sq->acc[0] != '\0' && esl_keyhash_Lookup(keys, sq->acc, -1, NULL) == eslOK))
329 {
330 if (esl_opt_GetBoolean(go, "-r") )
331 if (esl_sq_ReverseComplement(sq) != eslOK)
332 esl_fatal("Failed to reverse complement %s\n", sq->name);
333 esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
334 nseq++;
335 }
336 esl_sq_Reuse(sq);
337 }
338 if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
339 sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
340 else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s",
341 status, sqfp->filename);
342 esl_sq_Destroy(sq);
343 }
344
345 if (nkeys != nseq) esl_fatal("Tried to retrieve %d keys, but only retrieved %d sequences\n", nkeys, nseq);
346
347 if (ofp != stdout) printf("\nRetrieved %d sequences.\n", nseq);
348
349 esl_keyhash_Destroy(keys);
350 esl_fileparser_Close(efp);
351 return;
352 }
353
354
355
356 /* onefetch():
357 * Given one <key> (a seq name or accession), retrieve the corresponding sequence.
358 * In SSI mode, we can do this quickly by positioning the file, then regurgitating
359 * every line until the end-of-record marker; we don't even have to parse.
360 * Without an SSI index, we have to parse the file sequentially 'til we find
361 * the one we're after.
362 */
363 static void
onefetch(ESL_GETOPTS * go,FILE * ofp,char * key,ESL_SQFILE * sqfp)364 onefetch(ESL_GETOPTS *go, FILE *ofp, char *key, ESL_SQFILE *sqfp)
365 {
366 ESL_SQ *sq = esl_sq_Create();
367 int do_revcomp = esl_opt_GetBoolean(go, "-r");
368 char *newname = esl_opt_GetString(go, "-n");
369 int status;
370
371 /* Try to position the file at the desired sequence with SSI. */
372 if (sqfp->data.ascii.ssi != NULL)
373 {
374 status = esl_sqfile_PositionByKey(sqfp, key);
375 if (status == eslENOTFOUND) esl_fatal("seq %s not found in SSI index for file %s\n", key, sqfp->filename);
376 else if (status == eslEFORMAT) esl_fatal("Failed to parse SSI index for %s\n", sqfp->filename);
377 else if (status != eslOK) esl_fatal("Failed to look up location of seq %s in SSI index of file %s\n", key, sqfp->filename);
378
379 status = esl_sqio_Read(sqfp, sq);
380 if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
381 sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
382 else if (status == eslEOF) esl_fatal("Unexpected EOF reading sequence file %s",
383 status, sqfp->filename);
384 else if (status != eslOK) esl_fatal("Unexpected error %d reading sequence file %s",
385 status, sqfp->filename);
386
387 if (strcmp(key, sq->name) != 0 && strcmp(key, sq->acc) != 0)
388 esl_fatal("whoa, internal error; found the wrong sequence %s, not %s", sq->name, key);
389 }
390 else
391 { /* Else, we have to read the whole damn file sequentially until we find the seq */
392 while ((status = esl_sqio_Read(sqfp, sq)) != eslEOF) {
393 if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
394 sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
395 else if (status != eslOK) esl_fatal("Unexpected error %d reading sequence file %s",
396 status, sqfp->filename);
397
398 if (strcmp(key, sq->name) == 0 || strcmp(key, sq->acc) == 0) break;
399 esl_sq_Reuse(sq);
400 }
401 if (status == eslEOF) esl_fatal("Failed to find sequence %s in file %s\n", key, sqfp->filename);
402
403 }
404
405 if (do_revcomp == FALSE && newname == NULL && ! esl_sqio_IsAlignment(sqfp->format))
406 { /* If we're not manipulating the sequence in any way, and it's not from an alignment file, we can Echo() it. */
407 if (esl_sqio_Echo(sqfp, sq, ofp) != eslOK) esl_fatal("Echo failed: %s\n", esl_sqfile_GetErrorBuf(sqfp));
408 }
409 else
410 { /* Otherwise we Write() the parsed version. */
411 if (do_revcomp && esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);
412 if (newname != NULL) esl_sq_SetName(sq, newname);
413 esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
414 }
415
416 esl_sq_Destroy(sq);
417 }
418
419 static void
multifetch_subseq(ESL_GETOPTS * go,FILE * ofp,char * gdffile,ESL_SQFILE * sqfp)420 multifetch_subseq(ESL_GETOPTS *go, FILE *ofp, char *gdffile, ESL_SQFILE *sqfp)
421 {
422 ESL_FILEPARSER *efp = NULL;
423 char *newname;
424 char *s;
425 int n1, n2;
426 int start, end;
427 char *source;
428
429 if (esl_fileparser_Open(gdffile, NULL, &efp) != eslOK) esl_fatal("Failed to open key file %s\n", gdffile);
430 esl_fileparser_SetCommentChar(efp, '#');
431
432 while (esl_fileparser_NextLine(efp) == eslOK)
433 {
434 if (esl_fileparser_GetTokenOnLine(efp, &newname, &n1) != eslOK)
435 esl_fatal("Failed to read subseq name on line %d of file %s\n", efp->linenumber, gdffile);
436
437 if (esl_fileparser_GetTokenOnLine(efp, &s, NULL) != eslOK)
438 esl_fatal("Failed to read start coord on line %d of file %s\n", efp->linenumber, gdffile);
439 start = atoi(s);
440 if(start <= 0)
441 esl_fatal("Read invalid start coord %d on line %d of file %s (must be positive integer)\n", start, efp->linenumber, gdffile);
442
443 if (esl_fileparser_GetTokenOnLine(efp, &s, NULL) != eslOK)
444 esl_fatal("Failed to read end coord on line %d of file %s\n", efp->linenumber, gdffile);
445 end = atoi(s);
446 if(end < 0)
447 esl_fatal("Read invalid end coord %d on line %d of file %s (must be positive integer, or 0 for full length)\n", end, efp->linenumber, gdffile);
448
449 if (esl_fileparser_GetTokenOnLine(efp, &source, &n2) != eslOK)
450 esl_fatal("Failed to read source seq name on line %d of file %s\n", efp->linenumber, gdffile);
451
452 onefetch_subseq(go, ofp, sqfp, newname, source, start, end);
453 }
454 esl_fileparser_Close(efp);
455 }
456
457 static void
onefetch_subseq(ESL_GETOPTS * go,FILE * ofp,ESL_SQFILE * sqfp,char * newname,char * key,uint32_t given_start,uint32_t given_end)458 onefetch_subseq(ESL_GETOPTS *go, FILE *ofp, ESL_SQFILE *sqfp, char *newname, char *key, uint32_t given_start, uint32_t given_end)
459 {
460 int start, end;
461 int do_revcomp;
462 ESL_SQ *sq = esl_sq_Create();
463
464 if (sqfp->data.ascii.ssi == NULL) esl_fatal("no ssi index");
465
466 /* reverse complement indicated by coords. */
467 /* -c 52: would be 52,0, so watch out for given_end = 0 case */
468 if (given_end != 0 && given_start > given_end)
469 { start = given_end; end = given_start; do_revcomp = TRUE; }
470 else
471 { start = given_start; end = given_end; do_revcomp = FALSE; }
472
473 if (esl_sqio_FetchSubseq(sqfp, key, start, end, sq) != eslOK) esl_fatal(esl_sqfile_GetErrorBuf(sqfp));
474
475 if (newname != NULL) esl_sq_SetName(sq, newname);
476 else esl_sq_FormatName(sq, "%s/%u-%" PRId64, key, given_start, (given_end == 0) ? sq->L : given_end);
477
478 /* Two ways we might have been asked to revcomp: by coord, or by -r option */
479 /* (If both happen, they'll cancel each other out) */
480 if (do_revcomp)
481 if (esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);
482 if (esl_opt_GetBoolean(go, "-r"))
483 if (esl_sq_ReverseComplement(sq) != eslOK) esl_fatal("Failed to reverse complement %s; is it a protein?\n", sq->name);
484
485 esl_sqio_Write(ofp, sq, eslSQFILE_FASTA, FALSE);
486 esl_sq_Destroy(sq);
487 }
488
489
490