1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 /* Implement fastx_getseq, fastx_getseqs and fastx_getsubseq as described here:
62    https://drive5.com/usearch/manual/cmd_fastx_getseqs.html                  */
63 
64 #include "vsearch.h"
65 
66 static int labels_alloc = 0;
67 static int labels_count = 0;
68 static int labels_longest = 0;
69 static char * * labels_data = nullptr;
70 
read_labels_file(char * filename)71 void read_labels_file(char * filename)
72 {
73   FILE * fp_labels = fopen_input(filename);
74   if (! fp_labels) {
75     fatal("Unable to open labels file (%s)", filename);
76 
77         }
78 
79   xstat_t fs;
80   if (xfstat(fileno(fp_labels), & fs)) {
81     fatal("Unable to get status for labels file (%s)", filename);
82 
83         }
84 
85   bool is_pipe = S_ISFIFO(fs.st_mode);
86   uint64_t file_size = 0;
87   if (! is_pipe) {
88     file_size = fs.st_size;
89 
90         }
91 
92   progress_init("Reading labels", file_size);
93 
94   while(true)
95     {
96       const int buffer_size = 1024;
97       char buffer[buffer_size];
98       char * ret = fgets(buffer, buffer_size, fp_labels);
99       if (ret)
100         {
101           int len = strlen(buffer);
102           if ((len > 0) && (buffer[len - 1] == '\n'))
103             {
104               buffer[len - 1] = 0;
105               len--;
106             }
107 
108           if (len > labels_longest) {
109             labels_longest = len;
110 
111         }
112 
113           if (labels_count + 1 > labels_alloc)
114             {
115               labels_alloc += 1024;
116               labels_data = (char * *) realloc(labels_data,
117                                                labels_alloc * sizeof (char*));
118               if (! labels_data) {
119                 fatal("Unable to allocate memory for labels");
120 
121         }
122             }
123           labels_data[labels_count++] = strdup(buffer);
124         }
125       else {
126         break;
127 
128         }
129     }
130 
131   fclose(fp_labels);
132   progress_done();
133 
134   if (labels_longest >= 1023)
135     {
136       if (!opt_quiet) {
137         fprintf(stderr, "WARNING: Labels longer than 1023 characters are not supported\n");
138 
139         }
140 
141       if (opt_log) {
142         fprintf(fp_log, "WARNING: Labels longer than 1023 characters are not supported\n");
143 
144         }
145     }
146 }
147 
free_labels()148 void free_labels()
149 {
150   for(int i=0; i < labels_count; i++) {
151     free(labels_data[i]);
152 
153         }
154   free(labels_data);
155   labels_data = nullptr;
156 }
157 
test_label_match(fastx_handle h)158 bool test_label_match(fastx_handle h)
159 {
160   char * header = fastx_get_header(h);
161   int hlen = fastx_get_header_length(h);
162   char * field_buffer = nullptr;
163   int field_len = 0;
164   if (opt_label_field)
165     {
166       field_len = strlen(opt_label_field);
167       int field_buffer_size = field_len + 2;
168       if (opt_label_word) {
169         field_buffer_size += strlen(opt_label_word);
170       } else {
171         field_buffer_size += labels_longest;
172 
173         }
174       field_buffer = (char *) xmalloc(field_buffer_size);
175       sprintf(field_buffer, "%s=", opt_label_field);
176     }
177 
178   if (opt_label)
179     {
180       char * needle = opt_label;
181       int wlen = strlen(needle);
182       if (opt_label_substr_match) {
183         return xstrcasestr(header, needle);
184       } else {
185         return (hlen == wlen) && ! strcasecmp(header, needle);
186 
187         }
188     }
189   else if (opt_labels)
190     {
191       if (opt_label_substr_match)
192         {
193           for (int i = 0; i < labels_count; i++) {
194             if (xstrcasestr(header, labels_data[i])) {
195               return true;
196 
197         }
198 
199         }
200         }
201       else
202         {
203           for (int i = 0; i < labels_count; i++)
204             {
205               char * needle = labels_data[i];
206               int wlen = strlen(needle);
207               if ((hlen == wlen) && ! strcasecmp(header, needle)) {
208                 return true;
209 
210         }
211             }
212         }
213     }
214   else if (opt_label_word)
215     {
216       char * needle = opt_label_word;
217       if (opt_label_field)
218         {
219           strcpy(field_buffer + field_len + 1, needle);
220           needle = field_buffer;
221         }
222       int wlen = strlen(needle);
223       char * hit = header;
224       while (true)
225         {
226           hit = strstr(hit, needle);
227           if (hit)
228             {
229               if (opt_label_field)
230                 {
231                   /* check of field */
232                   if (((hit == header) ||
233                        (*(hit - 1) == ';')) &&
234                       ((hit + wlen == header + hlen) ||
235                        (*(hit + wlen) == ';'))) {
236                     return true;
237 
238         }
239                 }
240               else
241                 {
242                   /* check of full word */
243                   if (((hit == header) ||
244                        (!isalnum(*(hit - 1)))) &&
245                       ((hit + wlen == header + hlen) ||
246                        (!isalnum(*(hit + wlen))))) {
247                     return true;
248 
249         }
250                 }
251               hit++;
252             }
253           else {
254             break;
255 
256         }
257         }
258     }
259   else if (opt_label_words)
260     {
261       for (int i = 0; i < labels_count; i++)
262         {
263           char * needle = labels_data[i];
264           if (opt_label_field)
265             {
266               strcpy(field_buffer + field_len + 1, needle);
267               needle = field_buffer;
268             }
269           int wlen = strlen(needle);
270           char * hit = header;
271           while (true)
272             {
273               hit = strstr(hit, needle);
274               if (hit)
275                 {
276                   if (opt_label_field)
277                     {
278                       /* check of field */
279                       if (((hit == header) ||
280                            (*(hit - 1) == ';')) &&
281                           ((hit + wlen == header + hlen) ||
282                            (*(hit + wlen) == ';'))) {
283                         return true;
284 
285         }
286                     }
287                   else
288                     {
289                       /* check of full word */
290                       if (((hit == header) ||
291                            (!isalnum(*(hit - 1)))) &&
292                           ((hit + wlen == header + hlen) ||
293                            (!isalnum(*(hit + wlen))))) {
294                         return true;
295 
296         }
297                     }
298                   hit++;
299                 }
300               else {
301                 break;
302 
303         }
304             }
305         }
306     }
307   return false;
308 }
309 
getseq(char * filename)310 void getseq(char * filename)
311 {
312   if ((!opt_fastqout) && (!opt_fastaout) &&
313       (!opt_notmatched) && (!opt_notmatchedfq)) {
314     fatal("No output files specified");
315 
316         }
317 
318   if (opt_fastx_getseq)
319     {
320       if (! opt_label) {
321         fatal("Missing label option");
322 
323         }
324     }
325   else if (opt_fastx_getsubseq)
326     {
327       if (! opt_label) {
328         fatal("Missing label option");
329 
330         }
331 
332       if ((opt_subseq_start < 1) || (opt_subseq_end < 1)) {
333         fatal("The argument to options subseq_start and subseq_end must be at least 1");
334 
335         }
336 
337       if (opt_subseq_start > opt_subseq_end) {
338         fatal("The argument to option subseq_start must be equal or less than to subseq_end");
339 
340         }
341     }
342   else if (opt_fastx_getseqs)
343     {
344       int label_options = 0;
345       if (opt_label) {
346         label_options++;
347 
348         }
349       if (opt_labels) {
350         label_options++;
351 
352         }
353       if (opt_label_word) {
354         label_options++;
355 
356         }
357       if (opt_label_words) {
358         label_options++;
359 
360         }
361 
362       if (label_options != 1) {
363         fatal("Specify one label option (label, labels, label_word or label_words)");
364 
365         }
366 
367       if (opt_labels) {
368         read_labels_file(opt_labels);
369 
370         }
371 
372       if (opt_label_words) {
373         read_labels_file(opt_label_words);
374 
375         }
376     }
377 
378   fastx_handle h1 = nullptr;
379 
380   h1 = fastx_open(filename);
381 
382   if (!h1) {
383     fatal("Unrecognized file type (not proper FASTA or FASTQ format)");
384 
385         }
386 
387   if ((opt_fastqout || opt_notmatchedfq) && ! (h1->is_fastq || h1->is_empty)) {
388     fatal("Cannot write FASTQ output from FASTA input");
389 
390         }
391 
392   uint64_t filesize = fastx_get_size(h1);
393 
394   FILE * fp_fastaout = nullptr;
395   FILE * fp_fastqout = nullptr;
396   FILE * fp_notmatched = nullptr;
397   FILE * fp_notmatchedfq = nullptr;
398 
399   if (opt_fastaout)
400     {
401       fp_fastaout = fopen_output(opt_fastaout);
402       if (!fp_fastaout) {
403         fatal("Unable to open FASTA output file for writing");
404 
405         }
406     }
407 
408   if (opt_fastqout)
409     {
410       fp_fastqout = fopen_output(opt_fastqout);
411       if (!fp_fastqout) {
412         fatal("Unable to open FASTQ output file for writing");
413 
414         }
415     }
416 
417   if (opt_notmatched)
418     {
419       fp_notmatched = fopen_output(opt_notmatched);
420       if (!fp_notmatched) {
421         fatal("Unable to open FASTA output file (notmatched) for writing");
422 
423         }
424     }
425 
426   if (opt_notmatchedfq)
427     {
428       fp_notmatchedfq = fopen_output(opt_notmatchedfq);
429       if (!fp_notmatchedfq) {
430         fatal("Unable to open FASTQ output file (notmatchedfq) for writing");
431 
432         }
433     }
434 
435   progress_init("Extracting sequences", filesize);
436 
437   int64_t kept = 0;
438   int64_t discarded = 0;
439 
440   while(fastx_next(h1, ! opt_notrunclabels, chrmap_no_change))
441     {
442       bool match = test_label_match(h1);
443 
444       int64_t start = 1;
445       int64_t end = fastx_get_sequence_length(h1);
446       if (opt_fastx_getsubseq)
447         {
448           if (opt_subseq_start > start) {
449             start = opt_subseq_start;
450 
451         }
452           if (opt_subseq_end < end) {
453             end = opt_subseq_end;
454 
455         }
456         }
457       int64_t length = end - start + 1;
458 
459       if (match)
460         {
461           /* keep the sequence(s) */
462 
463           kept++;
464 
465           if (opt_fastaout) {
466             fasta_print_general(fp_fastaout,
467                                 nullptr,
468                                 fastx_get_sequence(h1) + start - 1,
469                                 length,
470                                 fastx_get_header(h1),
471                                 fastx_get_header_length(h1),
472                                 fastx_get_abundance(h1),
473                                 kept,
474                                 -1.0,
475                                 -1,
476                                 -1,
477                                 nullptr,
478                                 0.0);
479 
480         }
481 
482           if (opt_fastqout) {
483             fastq_print_general(fp_fastqout,
484                                 fastx_get_sequence(h1) + start - 1,
485                                 length,
486                                 fastx_get_header(h1),
487                                 fastx_get_header_length(h1),
488                                 fastx_get_quality(h1) + start - 1,
489                                 fastx_get_abundance(h1),
490                                 kept,
491                                 -1.0);
492 
493         }
494 
495         }
496       else
497         {
498           /* discard the sequence */
499 
500           discarded++;
501 
502           if (opt_notmatched) {
503             fasta_print_general(fp_notmatched,
504                                 nullptr,
505                                 fastx_get_sequence(h1) + start - 1,
506                                 length,
507                                 fastx_get_header(h1),
508                                 fastx_get_header_length(h1),
509                                 fastx_get_abundance(h1),
510                                 discarded,
511                                 -1.0,
512                                 -1,
513                                 -1,
514                                 nullptr,
515                                 0.0);
516 
517         }
518 
519           if (opt_notmatchedfq) {
520             fastq_print_general(fp_notmatchedfq,
521                                 fastx_get_sequence(h1) + start - 1,
522                                 length,
523                                 fastx_get_header(h1),
524                                 fastx_get_header_length(h1),
525                                 fastx_get_quality(h1) + start - 1,
526                                 fastx_get_abundance(h1),
527                                 discarded,
528                                 -1.0);
529 
530         }
531         }
532 
533       progress_update(fastx_get_position(h1));
534     }
535 
536   progress_done();
537 
538   if (! opt_quiet)
539     {
540       fprintf(stderr,
541               "%" PRId64 " of %" PRId64 " sequences extracted",
542               kept,
543               kept + discarded);
544       if (kept + discarded > 0) {
545         fprintf(stderr,
546                 " (%.1lf%%)",
547                 100.0 * kept / (kept + discarded));
548 
549         }
550       fprintf(stderr, "\n");
551     }
552 
553   if (opt_log)
554     {
555       fprintf(fp_log,
556               "%" PRId64 " of %" PRId64 " sequences extracted",
557               kept,
558               kept + discarded);
559       if (kept + discarded > 0) {
560         fprintf(fp_log,
561                 " (%.1lf%%)",
562                 100.0 * kept / (kept + discarded));
563 
564         }
565       fprintf(fp_log, "\n");
566     }
567 
568   if (opt_fastaout) {
569     fclose(fp_fastaout);
570 
571         }
572 
573   if (opt_fastqout) {
574     fclose(fp_fastqout);
575 
576         }
577 
578   if (opt_notmatched) {
579     fclose(fp_notmatched);
580 
581         }
582 
583   if (opt_notmatchedfq) {
584     fclose(fp_notmatchedfq);
585 
586         }
587 
588   fastx_close(h1);
589 
590   if (opt_labels || opt_label_words) {
591     free_labels();
592 
593         }
594 }
595 
fastx_getseq()596 void fastx_getseq()
597 {
598   getseq(opt_fastx_getseq);
599 }
600 
fastx_getseqs()601 void fastx_getseqs()
602 {
603   getseq(opt_fastx_getseqs);
604 }
605 
fastx_getsubseq()606 void fastx_getsubseq()
607 {
608   getseq(opt_fastx_getsubseq);
609 }
610