1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 #include "vsearch.h"
62 
fastq_get_qual(char q)63 inline int fastq_get_qual(char q)
64 {
65   int qual = q - opt_fastq_ascii;
66 
67   if (qual < opt_fastq_qmin)
68     {
69       fprintf(stderr,
70               "\n\nFatal error: FASTQ quality value (%d) below qmin (%"
71               PRId64 ")\n",
72               qual, opt_fastq_qmin);
73       if (fp_log)
74         {
75           fprintf(stderr,
76                   "\n\nFatal error: FASTQ quality value (%d) below qmin (%"
77                   PRId64 ")\n",
78                   qual, opt_fastq_qmin);
79         }
80       exit(EXIT_FAILURE);
81     }
82   else if (qual > opt_fastq_qmax)
83     {
84       fprintf(stderr,
85               "\n\nFatal error: FASTQ quality value (%d) above qmax (%"
86               PRId64 ")\n",
87               qual, opt_fastq_qmax);
88       fprintf(stderr,
89               "By default, quality values range from 0 to 41.\n"
90               "To allow higher quality values, "
91               "please use the option --fastq_qmax %d\n", qual);
92       if (fp_log)
93         {
94           fprintf(fp_log,
95                   "\n\nFatal error: FASTQ quality value (%d) above qmax (%"
96                   PRId64 ")\n",
97                   qual, opt_fastq_qmax);
98           fprintf(fp_log,
99                   "By default, quality values range from 0 to 41.\n"
100                   "To allow higher quality values, "
101                   "please use the option --fastq_qmax %d\n", qual);
102         }
103       exit(EXIT_FAILURE);
104     }
105   return qual;
106 }
107 
108 struct analysis_res
109 {
110   bool discarded;
111   bool truncated;
112   int start;
113   int length;
114   double ee;
115 };
116 
analyse(fastx_handle h)117 struct analysis_res analyse(fastx_handle h)
118 {
119   struct analysis_res res = { false, false, 0, 0, -1.0 };
120   res.length = fastx_get_sequence_length(h);
121   int64_t old_length = res.length;
122 
123   /* strip left (5') end */
124   if (opt_fastq_stripleft < res.length)
125     {
126       res.start += opt_fastq_stripleft;
127       res.length -= opt_fastq_stripleft;
128     }
129   else
130     {
131       res.start = res.length;
132       res.length = 0;
133     }
134 
135   /* strip right (3') end */
136   if (opt_fastq_stripright < res.length) {
137     res.length -= opt_fastq_stripright;
138   } else {
139     res.length = 0;
140 
141         }
142 
143   /* truncate trailing (3') part */
144   if (opt_fastq_trunclen >= 0) {
145     if (res.length > opt_fastq_trunclen) {
146       res.length = opt_fastq_trunclen;
147 
148         }
149 
150         }
151 
152   /* truncate trailing (3') part, but keep if short */
153   if (opt_fastq_trunclen_keep >= 0) {
154     if (res.length > opt_fastq_trunclen_keep) {
155       res.length = opt_fastq_trunclen_keep;
156 
157         }
158 
159         }
160 
161   if (h->is_fastq)
162     {
163       /* truncate by quality and expected errors (ee) */
164       res.ee = 0.0;
165       char * q = fastx_get_quality(h) + res.start;
166       for (int64_t i = 0; i < res.length; i++)
167         {
168           int qual = fastq_get_qual(q[i]);
169           double e = exp10(-0.1 * qual);
170           res.ee += e;
171 
172           if ((qual <= opt_fastq_truncqual) ||
173               (res.ee > opt_fastq_truncee))
174             {
175               res.ee -= e;
176               res.length = i;
177               break;
178             }
179         }
180 
181       /* filter by expected errors (ee) */
182       if (res.ee > opt_fastq_maxee) {
183         res.discarded = true;
184 
185         }
186       if ((res.length > 0) && (res.ee / res.length > opt_fastq_maxee_rate)) {
187         res.discarded = true;
188 
189         }
190     }
191 
192   /* filter by length */
193   if ((opt_fastq_trunclen >= 0) && (res.length < opt_fastq_trunclen)) {
194     res.discarded = true;
195 
196         }
197   if (res.length < opt_fastq_minlen) {
198     res.discarded = true;
199 
200         }
201   if (res.length > opt_fastq_maxlen) {
202     res.discarded = true;
203 
204         }
205 
206   /* filter by n's */
207   int64_t ncount = 0;
208   char * p = fastx_get_sequence(h) + res.start;
209   for (int64_t i = 0; i < res.length; i++)
210     {
211       int pc = p[i];
212       if ((pc == 'N') || (pc == 'n')) {
213         ncount++;
214 
215         }
216     }
217   if (ncount > opt_fastq_maxns) {
218     res.discarded = true;
219 
220         }
221 
222   /* filter by abundance */
223   int64_t abundance = fastx_get_abundance(h);
224   if (abundance < opt_minsize) {
225     res.discarded = true;
226 
227         }
228   if (abundance > opt_maxsize) {
229     res.discarded = true;
230 
231         }
232 
233   res.truncated = res.length < old_length;
234 
235   return res;
236 }
237 
filter(bool fastq_only,char * filename)238 void filter(bool fastq_only, char * filename)
239 {
240   if ((!opt_fastqout) && (!opt_fastaout) &&
241       (!opt_fastqout_discarded) && (!opt_fastaout_discarded) &&
242       (!opt_fastqout_rev) && (!opt_fastaout_rev) &&
243       (!opt_fastqout_discarded_rev) && (!opt_fastaout_discarded_rev)) {
244     fatal("No output files specified");
245 
246         }
247 
248   fastx_handle h1 = nullptr;
249   fastx_handle h2 = nullptr;
250 
251   h1 = fastx_open(filename);
252 
253   if (!h1) {
254     fatal("Unrecognized file type (not proper FASTA or FASTQ format)");
255 
256         }
257 
258   if (! (h1->is_fastq || h1->is_empty))
259     {
260       if (fastq_only)
261         {
262           fatal("FASTA input files not allowed with fastq_filter, consider using fastx_filter command instead");
263         }
264       else if (opt_eeout ||
265                (opt_fastq_ascii != 33) ||
266                opt_fastq_eeout ||
267                (opt_fastq_maxee < DBL_MAX) ||
268                (opt_fastq_maxee_rate < DBL_MAX) ||
269                opt_fastqout ||
270                (opt_fastq_qmax < 41) ||
271                (opt_fastq_qmin > 0) ||
272                (opt_fastq_truncee < DBL_MAX) ||
273                (opt_fastq_truncqual < LONG_MIN) ||
274                opt_fastqout_discarded ||
275                opt_fastqout_discarded_rev ||
276                opt_fastqout_rev)
277         {
278           fatal("The following options are not accepted with the fastx_filter command when the input is a FASTA file, because quality scores are not available: eeout, fastq_ascii, fastq_eeout, fastq_maxee, fastq_maxee_rate, fastq_out, fastq_qmax, fastq_qmin, fastq_truncee, fastq_truncqual,  fastqout_discarded, fastqout_discarded_rev, fastqout_rev");
279         }
280     }
281 
282   uint64_t filesize = fastx_get_size(h1);
283 
284   if (opt_reverse)
285     {
286       h2 = fastx_open(opt_reverse);
287 
288       if (!h2) {
289         fatal("Unrecognized file type (not proper FASTA or FASTQ format) for reverse reads");
290 
291         }
292 
293       if (h1->is_fastq != h2->is_fastq) {
294         fatal("The forward and reverse input sequence must in the same format, either FASTA or FASTQ");
295 
296         }
297 
298       if (! (h2->is_fastq || h2->is_empty))
299         {
300           if (fastq_only)
301             {
302               fatal("FASTA input files not allowed with fastq_filter, consider using fastx_filter command instead");
303             }
304           else if (opt_eeout ||
305                    (opt_fastq_ascii != 33) ||
306                    opt_fastq_eeout ||
307                    (opt_fastq_maxee < DBL_MAX) ||
308                    (opt_fastq_maxee_rate < DBL_MAX) ||
309                    opt_fastqout ||
310                    (opt_fastq_qmax < 41) ||
311                    (opt_fastq_qmin > 0) ||
312                    (opt_fastq_truncee < DBL_MAX) ||
313                    (opt_fastq_truncqual < LONG_MIN) ||
314                    opt_fastqout_discarded ||
315                    opt_fastqout_discarded_rev ||
316                    opt_fastqout_rev)
317             {
318               fatal("The following options are not accepted with the fastx_filter command when the input is a FASTA file, because quality scores are not available: eeout, fastq_ascii, fastq_eeout, fastq_maxee, fastq_maxee_rate, fastq_out, fastq_qmax, fastq_qmin, fastq_truncee, fastq_truncqual,  fastqout_discarded, fastqout_discarded_rev, fastqout_rev");
319             }
320         }
321     }
322 
323   FILE * fp_fastaout = nullptr;
324   FILE * fp_fastqout = nullptr;
325   FILE * fp_fastaout_discarded = nullptr;
326   FILE * fp_fastqout_discarded = nullptr;
327 
328   FILE * fp_fastaout_rev = nullptr;
329   FILE * fp_fastqout_rev = nullptr;
330   FILE * fp_fastaout_discarded_rev = nullptr;
331   FILE * fp_fastqout_discarded_rev = nullptr;
332 
333   if (opt_fastaout)
334     {
335       fp_fastaout = fopen_output(opt_fastaout);
336       if (!fp_fastaout) {
337         fatal("Unable to open FASTA output file for writing");
338 
339         }
340     }
341 
342   if (opt_fastqout)
343     {
344       fp_fastqout = fopen_output(opt_fastqout);
345       if (!fp_fastqout) {
346         fatal("Unable to open FASTQ output file for writing");
347 
348         }
349     }
350 
351   if (opt_fastaout_discarded)
352     {
353       fp_fastaout_discarded = fopen_output(opt_fastaout_discarded);
354       if (!fp_fastaout_discarded) {
355         fatal("Unable to open FASTA output file for writing");
356 
357         }
358     }
359 
360   if (opt_fastqout_discarded)
361     {
362       fp_fastqout_discarded = fopen_output(opt_fastqout_discarded);
363       if (!fp_fastqout_discarded) {
364         fatal("Unable to open FASTQ output file for writing");
365 
366         }
367     }
368 
369   if (h2)
370     {
371       if (opt_fastaout_rev)
372         {
373           fp_fastaout_rev = fopen_output(opt_fastaout_rev);
374           if (!fp_fastaout_rev) {
375             fatal("Unable to open FASTA output file for writing");
376 
377         }
378         }
379 
380       if (opt_fastqout_rev)
381         {
382           fp_fastqout_rev = fopen_output(opt_fastqout_rev);
383           if (!fp_fastqout_rev) {
384             fatal("Unable to open FASTQ output file for writing");
385 
386         }
387         }
388 
389       if (opt_fastaout_discarded_rev)
390         {
391           fp_fastaout_discarded_rev = fopen_output(opt_fastaout_discarded_rev);
392           if (!fp_fastaout_discarded_rev) {
393             fatal("Unable to open FASTA output file for writing");
394 
395         }
396         }
397 
398       if (opt_fastqout_discarded_rev)
399         {
400           fp_fastqout_discarded_rev = fopen_output(opt_fastqout_discarded_rev);
401           if (!fp_fastqout_discarded_rev) {
402             fatal("Unable to open FASTQ output file for writing");
403 
404         }
405         }
406     }
407 
408   progress_init("Reading input file", filesize);
409 
410   int64_t kept = 0;
411   int64_t discarded = 0;
412   int64_t truncated = 0;
413 
414   while(fastx_next(h1, false, chrmap_no_change))
415     {
416       if (h2 && ! fastx_next(h2, false, chrmap_no_change)) {
417         fatal("More forward reads than reverse reads");
418 
419         }
420 
421       struct analysis_res res1 = { false, false, 0, 0, 0.0 } ;
422       struct analysis_res res2 = { false, false, 0, 0, -1.0 } ;
423 
424       res1 = analyse(h1);
425       if (h2) {
426         res2 = analyse(h2);
427 
428         }
429 
430       if (res1.discarded || res2.discarded)
431         {
432           /* discard the sequence(s) */
433 
434           discarded++;
435 
436           if (opt_fastaout_discarded) {
437             fasta_print_general(fp_fastaout_discarded,
438                                 nullptr,
439                                 fastx_get_sequence(h1) + res1.start,
440                                 res1.length,
441                                 fastx_get_header(h1),
442                                 fastx_get_header_length(h1),
443                                 fastx_get_abundance(h1),
444                                 discarded,
445                                 res1.ee,
446                                 -1,
447                                 -1,
448                                 nullptr,
449                                 0.0);
450 
451         }
452 
453           if (opt_fastqout_discarded) {
454             fastq_print_general(fp_fastqout_discarded,
455                                 fastx_get_sequence(h1) + res1.start,
456                                 res1.length,
457                                 fastx_get_header(h1),
458                                 fastx_get_header_length(h1),
459                                 fastx_get_quality(h1) + res1.start,
460                                 fastx_get_abundance(h1),
461                                 discarded,
462                                 res1.ee);
463 
464         }
465 
466           if (h2)
467             {
468               if (opt_fastaout_discarded_rev) {
469                 fasta_print_general(fp_fastaout_discarded_rev,
470                                     nullptr,
471                                     fastx_get_sequence(h2) + res2.start,
472                                     res2.length,
473                                     fastx_get_header(h2),
474                                     fastx_get_header_length(h2),
475                                     fastx_get_abundance(h2),
476                                     discarded,
477                                     res2.ee,
478                                     -1,
479                                     -1,
480                                     nullptr,
481                                     0.0);
482 
483         }
484 
485               if (opt_fastqout_discarded_rev) {
486                 fastq_print_general(fp_fastqout_discarded_rev,
487                                     fastx_get_sequence(h2) + res2.start,
488                                     res2.length,
489                                     fastx_get_header(h2),
490                                     fastx_get_header_length(h2),
491                                     fastx_get_quality(h2) + res2.start,
492                                     fastx_get_abundance(h2),
493                                     discarded,
494                                     res2.ee);
495 
496         }
497             }
498         }
499       else
500         {
501           /* keep the sequence(s) */
502 
503           kept++;
504 
505           if (res1.truncated || res2.truncated) {
506             truncated++;
507 
508         }
509 
510           if (opt_fastaout) {
511             fasta_print_general(fp_fastaout,
512                                 nullptr,
513                                 fastx_get_sequence(h1) + res1.start,
514                                 res1.length,
515                                 fastx_get_header(h1),
516                                 fastx_get_header_length(h1),
517                                 fastx_get_abundance(h1),
518                                 kept,
519                                 res1.ee,
520                                 -1,
521                                 -1,
522                                 nullptr,
523                                 0.0);
524 
525         }
526 
527           if (opt_fastqout) {
528             fastq_print_general(fp_fastqout,
529                                 fastx_get_sequence(h1) + res1.start,
530                                 res1.length,
531                                 fastx_get_header(h1),
532                                 fastx_get_header_length(h1),
533                                 fastx_get_quality(h1) + res1.start,
534                                 fastx_get_abundance(h1),
535                                 kept,
536                                 res1.ee);
537 
538         }
539 
540           if (h2)
541             {
542               if (opt_fastaout_rev) {
543                 fasta_print_general(fp_fastaout_rev,
544                                     nullptr,
545                                     fastx_get_sequence(h2) + res2.start,
546                                     res2.length,
547                                     fastx_get_header(h2),
548                                     fastx_get_header_length(h2),
549                                     fastx_get_abundance(h2),
550                                     kept,
551                                     res2.ee,
552                                     -1,
553                                     -1,
554                                     nullptr,
555                                     0.0);
556 
557         }
558 
559               if (opt_fastqout_rev) {
560                 fastq_print_general(fp_fastqout_rev,
561                                     fastx_get_sequence(h2) + res2.start,
562                                     res2.length,
563                                     fastx_get_header(h2),
564                                     fastx_get_header_length(h2),
565                                     fastx_get_quality(h2) + res2.start,
566                                     fastx_get_abundance(h2),
567                                     kept,
568                                     res2.ee);
569 
570         }
571             }
572         }
573 
574       progress_update(fastx_get_position(h1));
575     }
576 
577   progress_done();
578 
579   if (h2 && fastx_next(h2, false, chrmap_no_change)) {
580     fatal("More reverse reads than forward reads");
581 
582         }
583 
584   if (! opt_quiet) {
585     fprintf(stderr,
586             "%" PRId64 " sequences kept (of which %" PRId64 " truncated), %" PRId64 " sequences discarded.\n",
587             kept,
588             truncated,
589             discarded);
590 
591         }
592 
593   if (opt_log) {
594     fprintf(fp_log,
595             "%" PRId64 " sequences kept (of which %" PRId64 " truncated), %" PRId64 " sequences discarded.\n",
596             kept,
597             truncated,
598             discarded);
599 
600         }
601 
602   if (h2)
603     {
604       if (opt_fastaout_rev) {
605         fclose(fp_fastaout_rev);
606 
607         }
608 
609       if (opt_fastqout_rev) {
610         fclose(fp_fastqout_rev);
611 
612         }
613 
614       if (opt_fastaout_discarded_rev) {
615         fclose(fp_fastaout_discarded_rev);
616 
617         }
618 
619       if (opt_fastqout_discarded_rev) {
620         fclose(fp_fastqout_discarded_rev);
621 
622         }
623 
624       fastx_close(h2);
625     }
626 
627   if (opt_fastaout) {
628     fclose(fp_fastaout);
629 
630         }
631 
632   if (opt_fastqout) {
633     fclose(fp_fastqout);
634 
635         }
636 
637   if (opt_fastaout_discarded) {
638     fclose(fp_fastaout_discarded);
639 
640         }
641 
642   if (opt_fastqout_discarded) {
643     fclose(fp_fastqout_discarded);
644 
645         }
646 
647   fastx_close(h1);
648 }
649 
fastq_filter()650 void fastq_filter()
651 {
652   filter(true, opt_fastq_filter);
653 }
654 
fastx_filter()655 void fastx_filter()
656 {
657   filter(false, opt_fastx_filter);
658 }
659