1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
fastq_get_qual(char q)63 inline int fastq_get_qual(char q)
64 {
65 int qual = q - opt_fastq_ascii;
66
67 if (qual < opt_fastq_qmin)
68 {
69 fprintf(stderr,
70 "\n\nFatal error: FASTQ quality value (%d) below qmin (%"
71 PRId64 ")\n",
72 qual, opt_fastq_qmin);
73 if (fp_log)
74 {
75 fprintf(stderr,
76 "\n\nFatal error: FASTQ quality value (%d) below qmin (%"
77 PRId64 ")\n",
78 qual, opt_fastq_qmin);
79 }
80 exit(EXIT_FAILURE);
81 }
82 else if (qual > opt_fastq_qmax)
83 {
84 fprintf(stderr,
85 "\n\nFatal error: FASTQ quality value (%d) above qmax (%"
86 PRId64 ")\n",
87 qual, opt_fastq_qmax);
88 fprintf(stderr,
89 "By default, quality values range from 0 to 41.\n"
90 "To allow higher quality values, "
91 "please use the option --fastq_qmax %d\n", qual);
92 if (fp_log)
93 {
94 fprintf(fp_log,
95 "\n\nFatal error: FASTQ quality value (%d) above qmax (%"
96 PRId64 ")\n",
97 qual, opt_fastq_qmax);
98 fprintf(fp_log,
99 "By default, quality values range from 0 to 41.\n"
100 "To allow higher quality values, "
101 "please use the option --fastq_qmax %d\n", qual);
102 }
103 exit(EXIT_FAILURE);
104 }
105 return qual;
106 }
107
108 struct analysis_res
109 {
110 bool discarded;
111 bool truncated;
112 int start;
113 int length;
114 double ee;
115 };
116
analyse(fastx_handle h)117 struct analysis_res analyse(fastx_handle h)
118 {
119 struct analysis_res res = { false, false, 0, 0, -1.0 };
120 res.length = fastx_get_sequence_length(h);
121 int64_t old_length = res.length;
122
123 /* strip left (5') end */
124 if (opt_fastq_stripleft < res.length)
125 {
126 res.start += opt_fastq_stripleft;
127 res.length -= opt_fastq_stripleft;
128 }
129 else
130 {
131 res.start = res.length;
132 res.length = 0;
133 }
134
135 /* strip right (3') end */
136 if (opt_fastq_stripright < res.length) {
137 res.length -= opt_fastq_stripright;
138 } else {
139 res.length = 0;
140
141 }
142
143 /* truncate trailing (3') part */
144 if (opt_fastq_trunclen >= 0) {
145 if (res.length > opt_fastq_trunclen) {
146 res.length = opt_fastq_trunclen;
147
148 }
149
150 }
151
152 /* truncate trailing (3') part, but keep if short */
153 if (opt_fastq_trunclen_keep >= 0) {
154 if (res.length > opt_fastq_trunclen_keep) {
155 res.length = opt_fastq_trunclen_keep;
156
157 }
158
159 }
160
161 if (h->is_fastq)
162 {
163 /* truncate by quality and expected errors (ee) */
164 res.ee = 0.0;
165 char * q = fastx_get_quality(h) + res.start;
166 for (int64_t i = 0; i < res.length; i++)
167 {
168 int qual = fastq_get_qual(q[i]);
169 double e = exp10(-0.1 * qual);
170 res.ee += e;
171
172 if ((qual <= opt_fastq_truncqual) ||
173 (res.ee > opt_fastq_truncee))
174 {
175 res.ee -= e;
176 res.length = i;
177 break;
178 }
179 }
180
181 /* filter by expected errors (ee) */
182 if (res.ee > opt_fastq_maxee) {
183 res.discarded = true;
184
185 }
186 if ((res.length > 0) && (res.ee / res.length > opt_fastq_maxee_rate)) {
187 res.discarded = true;
188
189 }
190 }
191
192 /* filter by length */
193 if ((opt_fastq_trunclen >= 0) && (res.length < opt_fastq_trunclen)) {
194 res.discarded = true;
195
196 }
197 if (res.length < opt_fastq_minlen) {
198 res.discarded = true;
199
200 }
201 if (res.length > opt_fastq_maxlen) {
202 res.discarded = true;
203
204 }
205
206 /* filter by n's */
207 int64_t ncount = 0;
208 char * p = fastx_get_sequence(h) + res.start;
209 for (int64_t i = 0; i < res.length; i++)
210 {
211 int pc = p[i];
212 if ((pc == 'N') || (pc == 'n')) {
213 ncount++;
214
215 }
216 }
217 if (ncount > opt_fastq_maxns) {
218 res.discarded = true;
219
220 }
221
222 /* filter by abundance */
223 int64_t abundance = fastx_get_abundance(h);
224 if (abundance < opt_minsize) {
225 res.discarded = true;
226
227 }
228 if (abundance > opt_maxsize) {
229 res.discarded = true;
230
231 }
232
233 res.truncated = res.length < old_length;
234
235 return res;
236 }
237
filter(bool fastq_only,char * filename)238 void filter(bool fastq_only, char * filename)
239 {
240 if ((!opt_fastqout) && (!opt_fastaout) &&
241 (!opt_fastqout_discarded) && (!opt_fastaout_discarded) &&
242 (!opt_fastqout_rev) && (!opt_fastaout_rev) &&
243 (!opt_fastqout_discarded_rev) && (!opt_fastaout_discarded_rev)) {
244 fatal("No output files specified");
245
246 }
247
248 fastx_handle h1 = nullptr;
249 fastx_handle h2 = nullptr;
250
251 h1 = fastx_open(filename);
252
253 if (!h1) {
254 fatal("Unrecognized file type (not proper FASTA or FASTQ format)");
255
256 }
257
258 if (! (h1->is_fastq || h1->is_empty))
259 {
260 if (fastq_only)
261 {
262 fatal("FASTA input files not allowed with fastq_filter, consider using fastx_filter command instead");
263 }
264 else if (opt_eeout ||
265 (opt_fastq_ascii != 33) ||
266 opt_fastq_eeout ||
267 (opt_fastq_maxee < DBL_MAX) ||
268 (opt_fastq_maxee_rate < DBL_MAX) ||
269 opt_fastqout ||
270 (opt_fastq_qmax < 41) ||
271 (opt_fastq_qmin > 0) ||
272 (opt_fastq_truncee < DBL_MAX) ||
273 (opt_fastq_truncqual < LONG_MIN) ||
274 opt_fastqout_discarded ||
275 opt_fastqout_discarded_rev ||
276 opt_fastqout_rev)
277 {
278 fatal("The following options are not accepted with the fastx_filter command when the input is a FASTA file, because quality scores are not available: eeout, fastq_ascii, fastq_eeout, fastq_maxee, fastq_maxee_rate, fastq_out, fastq_qmax, fastq_qmin, fastq_truncee, fastq_truncqual, fastqout_discarded, fastqout_discarded_rev, fastqout_rev");
279 }
280 }
281
282 uint64_t filesize = fastx_get_size(h1);
283
284 if (opt_reverse)
285 {
286 h2 = fastx_open(opt_reverse);
287
288 if (!h2) {
289 fatal("Unrecognized file type (not proper FASTA or FASTQ format) for reverse reads");
290
291 }
292
293 if (h1->is_fastq != h2->is_fastq) {
294 fatal("The forward and reverse input sequence must in the same format, either FASTA or FASTQ");
295
296 }
297
298 if (! (h2->is_fastq || h2->is_empty))
299 {
300 if (fastq_only)
301 {
302 fatal("FASTA input files not allowed with fastq_filter, consider using fastx_filter command instead");
303 }
304 else if (opt_eeout ||
305 (opt_fastq_ascii != 33) ||
306 opt_fastq_eeout ||
307 (opt_fastq_maxee < DBL_MAX) ||
308 (opt_fastq_maxee_rate < DBL_MAX) ||
309 opt_fastqout ||
310 (opt_fastq_qmax < 41) ||
311 (opt_fastq_qmin > 0) ||
312 (opt_fastq_truncee < DBL_MAX) ||
313 (opt_fastq_truncqual < LONG_MIN) ||
314 opt_fastqout_discarded ||
315 opt_fastqout_discarded_rev ||
316 opt_fastqout_rev)
317 {
318 fatal("The following options are not accepted with the fastx_filter command when the input is a FASTA file, because quality scores are not available: eeout, fastq_ascii, fastq_eeout, fastq_maxee, fastq_maxee_rate, fastq_out, fastq_qmax, fastq_qmin, fastq_truncee, fastq_truncqual, fastqout_discarded, fastqout_discarded_rev, fastqout_rev");
319 }
320 }
321 }
322
323 FILE * fp_fastaout = nullptr;
324 FILE * fp_fastqout = nullptr;
325 FILE * fp_fastaout_discarded = nullptr;
326 FILE * fp_fastqout_discarded = nullptr;
327
328 FILE * fp_fastaout_rev = nullptr;
329 FILE * fp_fastqout_rev = nullptr;
330 FILE * fp_fastaout_discarded_rev = nullptr;
331 FILE * fp_fastqout_discarded_rev = nullptr;
332
333 if (opt_fastaout)
334 {
335 fp_fastaout = fopen_output(opt_fastaout);
336 if (!fp_fastaout) {
337 fatal("Unable to open FASTA output file for writing");
338
339 }
340 }
341
342 if (opt_fastqout)
343 {
344 fp_fastqout = fopen_output(opt_fastqout);
345 if (!fp_fastqout) {
346 fatal("Unable to open FASTQ output file for writing");
347
348 }
349 }
350
351 if (opt_fastaout_discarded)
352 {
353 fp_fastaout_discarded = fopen_output(opt_fastaout_discarded);
354 if (!fp_fastaout_discarded) {
355 fatal("Unable to open FASTA output file for writing");
356
357 }
358 }
359
360 if (opt_fastqout_discarded)
361 {
362 fp_fastqout_discarded = fopen_output(opt_fastqout_discarded);
363 if (!fp_fastqout_discarded) {
364 fatal("Unable to open FASTQ output file for writing");
365
366 }
367 }
368
369 if (h2)
370 {
371 if (opt_fastaout_rev)
372 {
373 fp_fastaout_rev = fopen_output(opt_fastaout_rev);
374 if (!fp_fastaout_rev) {
375 fatal("Unable to open FASTA output file for writing");
376
377 }
378 }
379
380 if (opt_fastqout_rev)
381 {
382 fp_fastqout_rev = fopen_output(opt_fastqout_rev);
383 if (!fp_fastqout_rev) {
384 fatal("Unable to open FASTQ output file for writing");
385
386 }
387 }
388
389 if (opt_fastaout_discarded_rev)
390 {
391 fp_fastaout_discarded_rev = fopen_output(opt_fastaout_discarded_rev);
392 if (!fp_fastaout_discarded_rev) {
393 fatal("Unable to open FASTA output file for writing");
394
395 }
396 }
397
398 if (opt_fastqout_discarded_rev)
399 {
400 fp_fastqout_discarded_rev = fopen_output(opt_fastqout_discarded_rev);
401 if (!fp_fastqout_discarded_rev) {
402 fatal("Unable to open FASTQ output file for writing");
403
404 }
405 }
406 }
407
408 progress_init("Reading input file", filesize);
409
410 int64_t kept = 0;
411 int64_t discarded = 0;
412 int64_t truncated = 0;
413
414 while(fastx_next(h1, false, chrmap_no_change))
415 {
416 if (h2 && ! fastx_next(h2, false, chrmap_no_change)) {
417 fatal("More forward reads than reverse reads");
418
419 }
420
421 struct analysis_res res1 = { false, false, 0, 0, 0.0 } ;
422 struct analysis_res res2 = { false, false, 0, 0, -1.0 } ;
423
424 res1 = analyse(h1);
425 if (h2) {
426 res2 = analyse(h2);
427
428 }
429
430 if (res1.discarded || res2.discarded)
431 {
432 /* discard the sequence(s) */
433
434 discarded++;
435
436 if (opt_fastaout_discarded) {
437 fasta_print_general(fp_fastaout_discarded,
438 nullptr,
439 fastx_get_sequence(h1) + res1.start,
440 res1.length,
441 fastx_get_header(h1),
442 fastx_get_header_length(h1),
443 fastx_get_abundance(h1),
444 discarded,
445 res1.ee,
446 -1,
447 -1,
448 nullptr,
449 0.0);
450
451 }
452
453 if (opt_fastqout_discarded) {
454 fastq_print_general(fp_fastqout_discarded,
455 fastx_get_sequence(h1) + res1.start,
456 res1.length,
457 fastx_get_header(h1),
458 fastx_get_header_length(h1),
459 fastx_get_quality(h1) + res1.start,
460 fastx_get_abundance(h1),
461 discarded,
462 res1.ee);
463
464 }
465
466 if (h2)
467 {
468 if (opt_fastaout_discarded_rev) {
469 fasta_print_general(fp_fastaout_discarded_rev,
470 nullptr,
471 fastx_get_sequence(h2) + res2.start,
472 res2.length,
473 fastx_get_header(h2),
474 fastx_get_header_length(h2),
475 fastx_get_abundance(h2),
476 discarded,
477 res2.ee,
478 -1,
479 -1,
480 nullptr,
481 0.0);
482
483 }
484
485 if (opt_fastqout_discarded_rev) {
486 fastq_print_general(fp_fastqout_discarded_rev,
487 fastx_get_sequence(h2) + res2.start,
488 res2.length,
489 fastx_get_header(h2),
490 fastx_get_header_length(h2),
491 fastx_get_quality(h2) + res2.start,
492 fastx_get_abundance(h2),
493 discarded,
494 res2.ee);
495
496 }
497 }
498 }
499 else
500 {
501 /* keep the sequence(s) */
502
503 kept++;
504
505 if (res1.truncated || res2.truncated) {
506 truncated++;
507
508 }
509
510 if (opt_fastaout) {
511 fasta_print_general(fp_fastaout,
512 nullptr,
513 fastx_get_sequence(h1) + res1.start,
514 res1.length,
515 fastx_get_header(h1),
516 fastx_get_header_length(h1),
517 fastx_get_abundance(h1),
518 kept,
519 res1.ee,
520 -1,
521 -1,
522 nullptr,
523 0.0);
524
525 }
526
527 if (opt_fastqout) {
528 fastq_print_general(fp_fastqout,
529 fastx_get_sequence(h1) + res1.start,
530 res1.length,
531 fastx_get_header(h1),
532 fastx_get_header_length(h1),
533 fastx_get_quality(h1) + res1.start,
534 fastx_get_abundance(h1),
535 kept,
536 res1.ee);
537
538 }
539
540 if (h2)
541 {
542 if (opt_fastaout_rev) {
543 fasta_print_general(fp_fastaout_rev,
544 nullptr,
545 fastx_get_sequence(h2) + res2.start,
546 res2.length,
547 fastx_get_header(h2),
548 fastx_get_header_length(h2),
549 fastx_get_abundance(h2),
550 kept,
551 res2.ee,
552 -1,
553 -1,
554 nullptr,
555 0.0);
556
557 }
558
559 if (opt_fastqout_rev) {
560 fastq_print_general(fp_fastqout_rev,
561 fastx_get_sequence(h2) + res2.start,
562 res2.length,
563 fastx_get_header(h2),
564 fastx_get_header_length(h2),
565 fastx_get_quality(h2) + res2.start,
566 fastx_get_abundance(h2),
567 kept,
568 res2.ee);
569
570 }
571 }
572 }
573
574 progress_update(fastx_get_position(h1));
575 }
576
577 progress_done();
578
579 if (h2 && fastx_next(h2, false, chrmap_no_change)) {
580 fatal("More reverse reads than forward reads");
581
582 }
583
584 if (! opt_quiet) {
585 fprintf(stderr,
586 "%" PRId64 " sequences kept (of which %" PRId64 " truncated), %" PRId64 " sequences discarded.\n",
587 kept,
588 truncated,
589 discarded);
590
591 }
592
593 if (opt_log) {
594 fprintf(fp_log,
595 "%" PRId64 " sequences kept (of which %" PRId64 " truncated), %" PRId64 " sequences discarded.\n",
596 kept,
597 truncated,
598 discarded);
599
600 }
601
602 if (h2)
603 {
604 if (opt_fastaout_rev) {
605 fclose(fp_fastaout_rev);
606
607 }
608
609 if (opt_fastqout_rev) {
610 fclose(fp_fastqout_rev);
611
612 }
613
614 if (opt_fastaout_discarded_rev) {
615 fclose(fp_fastaout_discarded_rev);
616
617 }
618
619 if (opt_fastqout_discarded_rev) {
620 fclose(fp_fastqout_discarded_rev);
621
622 }
623
624 fastx_close(h2);
625 }
626
627 if (opt_fastaout) {
628 fclose(fp_fastaout);
629
630 }
631
632 if (opt_fastqout) {
633 fclose(fp_fastqout);
634
635 }
636
637 if (opt_fastaout_discarded) {
638 fclose(fp_fastaout_discarded);
639
640 }
641
642 if (opt_fastqout_discarded) {
643 fclose(fp_fastqout_discarded);
644
645 }
646
647 fastx_close(h1);
648 }
649
fastq_filter()650 void fastq_filter()
651 {
652 filter(true, opt_fastq_filter);
653 }
654
fastx_filter()655 void fastx_filter()
656 {
657 filter(false, opt_fastx_filter);
658 }
659