1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 /* Implement fastx_getseq, fastx_getseqs and fastx_getsubseq as described here:
62 https://drive5.com/usearch/manual/cmd_fastx_getseqs.html */
63
64 #include "vsearch.h"
65
66 static int labels_alloc = 0;
67 static int labels_count = 0;
68 static int labels_longest = 0;
69 static char * * labels_data = nullptr;
70
read_labels_file(char * filename)71 void read_labels_file(char * filename)
72 {
73 FILE * fp_labels = fopen_input(filename);
74 if (! fp_labels) {
75 fatal("Unable to open labels file (%s)", filename);
76
77 }
78
79 xstat_t fs;
80 if (xfstat(fileno(fp_labels), & fs)) {
81 fatal("Unable to get status for labels file (%s)", filename);
82
83 }
84
85 bool is_pipe = S_ISFIFO(fs.st_mode);
86 uint64_t file_size = 0;
87 if (! is_pipe) {
88 file_size = fs.st_size;
89
90 }
91
92 progress_init("Reading labels", file_size);
93
94 while(true)
95 {
96 const int buffer_size = 1024;
97 char buffer[buffer_size];
98 char * ret = fgets(buffer, buffer_size, fp_labels);
99 if (ret)
100 {
101 int len = strlen(buffer);
102 if ((len > 0) && (buffer[len - 1] == '\n'))
103 {
104 buffer[len - 1] = 0;
105 len--;
106 }
107
108 if (len > labels_longest) {
109 labels_longest = len;
110
111 }
112
113 if (labels_count + 1 > labels_alloc)
114 {
115 labels_alloc += 1024;
116 labels_data = (char * *) realloc(labels_data,
117 labels_alloc * sizeof (char*));
118 if (! labels_data) {
119 fatal("Unable to allocate memory for labels");
120
121 }
122 }
123 labels_data[labels_count++] = strdup(buffer);
124 }
125 else {
126 break;
127
128 }
129 }
130
131 fclose(fp_labels);
132 progress_done();
133
134 if (labels_longest >= 1023)
135 {
136 if (!opt_quiet) {
137 fprintf(stderr, "WARNING: Labels longer than 1023 characters are not supported\n");
138
139 }
140
141 if (opt_log) {
142 fprintf(fp_log, "WARNING: Labels longer than 1023 characters are not supported\n");
143
144 }
145 }
146 }
147
free_labels()148 void free_labels()
149 {
150 for(int i=0; i < labels_count; i++) {
151 free(labels_data[i]);
152
153 }
154 free(labels_data);
155 labels_data = nullptr;
156 }
157
test_label_match(fastx_handle h)158 bool test_label_match(fastx_handle h)
159 {
160 char * header = fastx_get_header(h);
161 int hlen = fastx_get_header_length(h);
162 char * field_buffer = nullptr;
163 int field_len = 0;
164 if (opt_label_field)
165 {
166 field_len = strlen(opt_label_field);
167 int field_buffer_size = field_len + 2;
168 if (opt_label_word) {
169 field_buffer_size += strlen(opt_label_word);
170 } else {
171 field_buffer_size += labels_longest;
172
173 }
174 field_buffer = (char *) xmalloc(field_buffer_size);
175 sprintf(field_buffer, "%s=", opt_label_field);
176 }
177
178 if (opt_label)
179 {
180 char * needle = opt_label;
181 int wlen = strlen(needle);
182 if (opt_label_substr_match) {
183 return xstrcasestr(header, needle);
184 } else {
185 return (hlen == wlen) && ! strcasecmp(header, needle);
186
187 }
188 }
189 else if (opt_labels)
190 {
191 if (opt_label_substr_match)
192 {
193 for (int i = 0; i < labels_count; i++) {
194 if (xstrcasestr(header, labels_data[i])) {
195 return true;
196
197 }
198
199 }
200 }
201 else
202 {
203 for (int i = 0; i < labels_count; i++)
204 {
205 char * needle = labels_data[i];
206 int wlen = strlen(needle);
207 if ((hlen == wlen) && ! strcasecmp(header, needle)) {
208 return true;
209
210 }
211 }
212 }
213 }
214 else if (opt_label_word)
215 {
216 char * needle = opt_label_word;
217 if (opt_label_field)
218 {
219 strcpy(field_buffer + field_len + 1, needle);
220 needle = field_buffer;
221 }
222 int wlen = strlen(needle);
223 char * hit = header;
224 while (true)
225 {
226 hit = strstr(hit, needle);
227 if (hit)
228 {
229 if (opt_label_field)
230 {
231 /* check of field */
232 if (((hit == header) ||
233 (*(hit - 1) == ';')) &&
234 ((hit + wlen == header + hlen) ||
235 (*(hit + wlen) == ';'))) {
236 return true;
237
238 }
239 }
240 else
241 {
242 /* check of full word */
243 if (((hit == header) ||
244 (!isalnum(*(hit - 1)))) &&
245 ((hit + wlen == header + hlen) ||
246 (!isalnum(*(hit + wlen))))) {
247 return true;
248
249 }
250 }
251 hit++;
252 }
253 else {
254 break;
255
256 }
257 }
258 }
259 else if (opt_label_words)
260 {
261 for (int i = 0; i < labels_count; i++)
262 {
263 char * needle = labels_data[i];
264 if (opt_label_field)
265 {
266 strcpy(field_buffer + field_len + 1, needle);
267 needle = field_buffer;
268 }
269 int wlen = strlen(needle);
270 char * hit = header;
271 while (true)
272 {
273 hit = strstr(hit, needle);
274 if (hit)
275 {
276 if (opt_label_field)
277 {
278 /* check of field */
279 if (((hit == header) ||
280 (*(hit - 1) == ';')) &&
281 ((hit + wlen == header + hlen) ||
282 (*(hit + wlen) == ';'))) {
283 return true;
284
285 }
286 }
287 else
288 {
289 /* check of full word */
290 if (((hit == header) ||
291 (!isalnum(*(hit - 1)))) &&
292 ((hit + wlen == header + hlen) ||
293 (!isalnum(*(hit + wlen))))) {
294 return true;
295
296 }
297 }
298 hit++;
299 }
300 else {
301 break;
302
303 }
304 }
305 }
306 }
307 return false;
308 }
309
getseq(char * filename)310 void getseq(char * filename)
311 {
312 if ((!opt_fastqout) && (!opt_fastaout) &&
313 (!opt_notmatched) && (!opt_notmatchedfq)) {
314 fatal("No output files specified");
315
316 }
317
318 if (opt_fastx_getseq)
319 {
320 if (! opt_label) {
321 fatal("Missing label option");
322
323 }
324 }
325 else if (opt_fastx_getsubseq)
326 {
327 if (! opt_label) {
328 fatal("Missing label option");
329
330 }
331
332 if ((opt_subseq_start < 1) || (opt_subseq_end < 1)) {
333 fatal("The argument to options subseq_start and subseq_end must be at least 1");
334
335 }
336
337 if (opt_subseq_start > opt_subseq_end) {
338 fatal("The argument to option subseq_start must be equal or less than to subseq_end");
339
340 }
341 }
342 else if (opt_fastx_getseqs)
343 {
344 int label_options = 0;
345 if (opt_label) {
346 label_options++;
347
348 }
349 if (opt_labels) {
350 label_options++;
351
352 }
353 if (opt_label_word) {
354 label_options++;
355
356 }
357 if (opt_label_words) {
358 label_options++;
359
360 }
361
362 if (label_options != 1) {
363 fatal("Specify one label option (label, labels, label_word or label_words)");
364
365 }
366
367 if (opt_labels) {
368 read_labels_file(opt_labels);
369
370 }
371
372 if (opt_label_words) {
373 read_labels_file(opt_label_words);
374
375 }
376 }
377
378 fastx_handle h1 = nullptr;
379
380 h1 = fastx_open(filename);
381
382 if (!h1) {
383 fatal("Unrecognized file type (not proper FASTA or FASTQ format)");
384
385 }
386
387 if ((opt_fastqout || opt_notmatchedfq) && ! (h1->is_fastq || h1->is_empty)) {
388 fatal("Cannot write FASTQ output from FASTA input");
389
390 }
391
392 uint64_t filesize = fastx_get_size(h1);
393
394 FILE * fp_fastaout = nullptr;
395 FILE * fp_fastqout = nullptr;
396 FILE * fp_notmatched = nullptr;
397 FILE * fp_notmatchedfq = nullptr;
398
399 if (opt_fastaout)
400 {
401 fp_fastaout = fopen_output(opt_fastaout);
402 if (!fp_fastaout) {
403 fatal("Unable to open FASTA output file for writing");
404
405 }
406 }
407
408 if (opt_fastqout)
409 {
410 fp_fastqout = fopen_output(opt_fastqout);
411 if (!fp_fastqout) {
412 fatal("Unable to open FASTQ output file for writing");
413
414 }
415 }
416
417 if (opt_notmatched)
418 {
419 fp_notmatched = fopen_output(opt_notmatched);
420 if (!fp_notmatched) {
421 fatal("Unable to open FASTA output file (notmatched) for writing");
422
423 }
424 }
425
426 if (opt_notmatchedfq)
427 {
428 fp_notmatchedfq = fopen_output(opt_notmatchedfq);
429 if (!fp_notmatchedfq) {
430 fatal("Unable to open FASTQ output file (notmatchedfq) for writing");
431
432 }
433 }
434
435 progress_init("Extracting sequences", filesize);
436
437 int64_t kept = 0;
438 int64_t discarded = 0;
439
440 while(fastx_next(h1, ! opt_notrunclabels, chrmap_no_change))
441 {
442 bool match = test_label_match(h1);
443
444 int64_t start = 1;
445 int64_t end = fastx_get_sequence_length(h1);
446 if (opt_fastx_getsubseq)
447 {
448 if (opt_subseq_start > start) {
449 start = opt_subseq_start;
450
451 }
452 if (opt_subseq_end < end) {
453 end = opt_subseq_end;
454
455 }
456 }
457 int64_t length = end - start + 1;
458
459 if (match)
460 {
461 /* keep the sequence(s) */
462
463 kept++;
464
465 if (opt_fastaout) {
466 fasta_print_general(fp_fastaout,
467 nullptr,
468 fastx_get_sequence(h1) + start - 1,
469 length,
470 fastx_get_header(h1),
471 fastx_get_header_length(h1),
472 fastx_get_abundance(h1),
473 kept,
474 -1.0,
475 -1,
476 -1,
477 nullptr,
478 0.0);
479
480 }
481
482 if (opt_fastqout) {
483 fastq_print_general(fp_fastqout,
484 fastx_get_sequence(h1) + start - 1,
485 length,
486 fastx_get_header(h1),
487 fastx_get_header_length(h1),
488 fastx_get_quality(h1) + start - 1,
489 fastx_get_abundance(h1),
490 kept,
491 -1.0);
492
493 }
494
495 }
496 else
497 {
498 /* discard the sequence */
499
500 discarded++;
501
502 if (opt_notmatched) {
503 fasta_print_general(fp_notmatched,
504 nullptr,
505 fastx_get_sequence(h1) + start - 1,
506 length,
507 fastx_get_header(h1),
508 fastx_get_header_length(h1),
509 fastx_get_abundance(h1),
510 discarded,
511 -1.0,
512 -1,
513 -1,
514 nullptr,
515 0.0);
516
517 }
518
519 if (opt_notmatchedfq) {
520 fastq_print_general(fp_notmatchedfq,
521 fastx_get_sequence(h1) + start - 1,
522 length,
523 fastx_get_header(h1),
524 fastx_get_header_length(h1),
525 fastx_get_quality(h1) + start - 1,
526 fastx_get_abundance(h1),
527 discarded,
528 -1.0);
529
530 }
531 }
532
533 progress_update(fastx_get_position(h1));
534 }
535
536 progress_done();
537
538 if (! opt_quiet)
539 {
540 fprintf(stderr,
541 "%" PRId64 " of %" PRId64 " sequences extracted",
542 kept,
543 kept + discarded);
544 if (kept + discarded > 0) {
545 fprintf(stderr,
546 " (%.1lf%%)",
547 100.0 * kept / (kept + discarded));
548
549 }
550 fprintf(stderr, "\n");
551 }
552
553 if (opt_log)
554 {
555 fprintf(fp_log,
556 "%" PRId64 " of %" PRId64 " sequences extracted",
557 kept,
558 kept + discarded);
559 if (kept + discarded > 0) {
560 fprintf(fp_log,
561 " (%.1lf%%)",
562 100.0 * kept / (kept + discarded));
563
564 }
565 fprintf(fp_log, "\n");
566 }
567
568 if (opt_fastaout) {
569 fclose(fp_fastaout);
570
571 }
572
573 if (opt_fastqout) {
574 fclose(fp_fastqout);
575
576 }
577
578 if (opt_notmatched) {
579 fclose(fp_notmatched);
580
581 }
582
583 if (opt_notmatchedfq) {
584 fclose(fp_notmatchedfq);
585
586 }
587
588 fastx_close(h1);
589
590 if (opt_labels || opt_label_words) {
591 free_labels();
592
593 }
594 }
595
fastx_getseq()596 void fastx_getseq()
597 {
598 getseq(opt_fastx_getseq);
599 }
600
fastx_getseqs()601 void fastx_getseqs()
602 {
603 getseq(opt_fastx_getseqs);
604 }
605
fastx_getsubseq()606 void fastx_getsubseq()
607 {
608 getseq(opt_fastx_getsubseq);
609 }
610