1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
fastq_chars()63 void fastq_chars()
64 {
65 uint64_t sequence_chars[256];
66 uint64_t quality_chars[256];
67 uint64_t tail_chars[256];
68 uint64_t total_chars = 0;
69 int maxrun[256];
70
71 for(int c=0; c<256; c++)
72 {
73 sequence_chars[c] = 0;
74 quality_chars[c] = 0;
75 tail_chars[c] = 0;
76 maxrun[c] = 0;
77 }
78
79 fastx_handle h = fastq_open(opt_fastq_chars);
80
81 uint64_t filesize = fastq_get_size(h);
82
83 progress_init("Reading FASTQ file", filesize);
84
85 uint64_t seq_count = 0;
86
87 int qmin_n = 255, qmax_n = 0;
88
89 while(fastq_next(h, false, chrmap_upcase))
90 {
91 int64_t len = fastq_get_sequence_length(h);
92 char * p = fastq_get_sequence(h);
93 char * q = fastq_get_quality(h);
94
95 seq_count++;
96 total_chars += len;
97
98 int run_char = -1;
99 int run = 0;
100
101 int64_t i = 0;
102 while(i<len)
103 {
104 int pc = *p++;
105 int qc = *q++;
106 sequence_chars[pc]++;
107 quality_chars[qc]++;
108
109 if ((pc == 'N') || (pc == 'n'))
110 {
111 if (qc < qmin_n) {
112 qmin_n = qc;
113
114 }
115 if (qc > qmax_n) {
116 qmax_n = qc;
117
118 }
119 }
120
121 if (pc == run_char)
122 {
123 run++;
124 if (run > maxrun[run_char]) {
125 maxrun[run_char] = run;
126
127 }
128 }
129 else
130 {
131 run_char = pc;
132 run = 0;
133 }
134
135 i++;
136 }
137
138 if (len >= opt_fastq_tail)
139 {
140 q = fastq_get_quality(h) + len - 1;
141 int tail_char = *q--;
142 int tail_len = 1;
143 while(*q-- == tail_char)
144 {
145 tail_len++;
146 if (tail_len >= opt_fastq_tail) {
147 break;
148
149 }
150 }
151 if (tail_len >= opt_fastq_tail) {
152 tail_chars[tail_char]++;
153
154 }
155 }
156
157 progress_update(fastq_get_position(h));
158 }
159 progress_done();
160
161 fastq_close(h);
162
163 char qmin = 0;
164 char qmax = 0;
165
166 for(int c=0; c<=255; c++)
167 {
168 if (quality_chars[c])
169 {
170 qmin = c;
171 break;
172 }
173 }
174
175 for(int c=255; c>=0; c--)
176 {
177 if (quality_chars[c])
178 {
179 qmax = c;
180 break;
181 }
182 }
183
184 char fastq_ascii, fastq_qmin, fastq_qmax;
185
186 if ((qmin < 59) || (qmax < 75)) {
187 fastq_ascii = 33;
188 } else {
189 fastq_ascii = 64;
190
191 }
192
193 fastq_qmax = qmax - fastq_ascii;
194 fastq_qmin = qmin - fastq_ascii;
195
196 if (!opt_quiet)
197 {
198 fprintf(stderr, "Read %" PRIu64 " sequences.\n", seq_count);
199
200 if (seq_count > 0)
201 {
202 fprintf(stderr, "Qmin %d, QMax %d, Range %d\n",
203 qmin, qmax, qmax-qmin+1);
204
205 fprintf(stderr, "Guess: -fastq_qmin %d -fastq_qmax %d -fastq_ascii %d\n",
206 fastq_qmin, fastq_qmax, fastq_ascii);
207
208 if (fastq_ascii == 64)
209 {
210 if (qmin < 64) {
211 fprintf(stderr, "Guess: Solexa format (phred+64)\n");
212 } else if (qmin < 66) {
213 fprintf(stderr, "Guess: Illumina 1.3+ format (phred+64)\n");
214 } else {
215 fprintf(stderr, "Guess: Illumina 1.5+ format (phred+64)\n");
216
217 }
218 }
219 else
220 {
221 if (qmax > 73) {
222 fprintf(stderr, "Guess: Illumina 1.8+ format (phred+33)\n");
223 } else {
224 fprintf(stderr, "Guess: Original Sanger format (phred+33)\n");
225
226 }
227 }
228
229 fprintf(stderr, "\n");
230 fprintf(stderr, "Letter N Freq MaxRun\n");
231 fprintf(stderr, "------ ---------- ------ ------\n");
232
233 for(int c=0; c<256; c++)
234 {
235 if (sequence_chars[c] > 0)
236 {
237 fprintf(stderr, " %c %10" PRIu64 " %5.1f%% %6d",
238 c,
239 sequence_chars[c],
240 100.0 * sequence_chars[c] / total_chars,
241 maxrun[c]);
242 if ((c == 'N') || (c == 'n'))
243 {
244 if (qmin_n < qmax_n) {
245 fprintf(stderr, " Q=%c..%c", qmin_n, qmax_n);
246 } else {
247 fprintf(stderr, " Q=%c", qmin_n);
248
249 }
250 }
251 fprintf(stderr, "\n");
252 }
253 }
254
255 fprintf(stderr, "\n");
256 fprintf(stderr, "Char ASCII Freq Tails\n");
257 fprintf(stderr, "---- ----- ------ ----------\n");
258
259 for(int c=qmin; c<=qmax; c++)
260 {
261 if (quality_chars[c] > 0)
262 {
263 fprintf(stderr, " '%c' %5d %5.1f%% %10" PRIu64 "\n",
264 c,
265 c,
266 100.0 * quality_chars[c] / total_chars,
267 tail_chars[c]);
268 }
269 }
270 }
271 }
272
273 if (opt_log)
274 {
275 fprintf(fp_log, "Read %" PRIu64 " sequences.\n", seq_count);
276
277 if (seq_count > 0)
278 {
279 fprintf(fp_log, "Qmin %d, QMax %d, Range %d\n",
280 qmin, qmax, qmax-qmin+1);
281
282 fprintf(fp_log, "Guess: -fastq_qmin %d -fastq_qmax %d -fastq_ascii %d\n",
283 fastq_qmin, fastq_qmax, fastq_ascii);
284
285 if (fastq_ascii == 64)
286 {
287 if (qmin < 64) {
288 fprintf(fp_log, "Guess: Solexa format (phred+64)\n");
289 } else if (qmin < 66) {
290 fprintf(fp_log, "Guess: Illumina 1.3+ format (phred+64)\n");
291 } else {
292 fprintf(fp_log, "Guess: Illumina 1.5+ format (phred+64)\n");
293
294 }
295 }
296 else
297 {
298 if (qmax > 73) {
299 fprintf(fp_log, "Guess: Illumina 1.8+ format (phred+33)\n");
300 } else {
301 fprintf(fp_log, "Guess: Original Sanger format (phred+33)\n");
302
303 }
304 }
305
306 fprintf(fp_log, "\n");
307 fprintf(fp_log, "Letter N Freq MaxRun\n");
308 fprintf(fp_log, "------ ---------- ------ ------\n");
309
310 for(int c=0; c<256; c++)
311 {
312 if (sequence_chars[c] > 0)
313 {
314 fprintf(fp_log, " %c %10" PRIu64 " %5.1f%% %6d",
315 c,
316 sequence_chars[c],
317 100.0 * sequence_chars[c] / total_chars,
318 maxrun[c]);
319 if ((c == 'N') || (c == 'n'))
320 {
321 if (qmin_n < qmax_n) {
322 fprintf(fp_log, " Q=%c..%c", qmin_n, qmax_n);
323 } else {
324 fprintf(fp_log, " Q=%c", qmin_n);
325
326 }
327 }
328 fprintf(fp_log, "\n");
329 }
330 }
331
332 fprintf(fp_log, "\n");
333 fprintf(fp_log, "Char ASCII Freq Tails\n");
334 fprintf(fp_log, "---- ----- ------ ----------\n");
335
336 for(int c=qmin; c<=qmax; c++)
337 {
338 if (quality_chars[c] > 0)
339 {
340 fprintf(fp_log, " '%c' %5d %5.1f%% %10" PRIu64 "\n",
341 c,
342 c,
343 100.0 * quality_chars[c] / total_chars,
344 tail_chars[c]);
345 }
346 }
347 }
348 }
349 }
350
q2p(double q)351 double q2p(double q)
352 {
353 return exp10(- q / 10.0);
354 }
355
fastq_stats()356 void fastq_stats()
357 {
358 fastx_handle h = fastq_open(opt_fastq_stats);
359
360 uint64_t filesize = fastq_get_size(h);
361
362 progress_init("Reading FASTQ file", filesize);
363
364 uint64_t seq_count = 0;
365 uint64_t symbols = 0;
366
367 int64_t read_length_alloc = 512;
368
369 auto * read_length_table = (uint64_t*) xmalloc(sizeof(uint64_t) * read_length_alloc);
370 memset(read_length_table, 0, sizeof(uint64_t) * read_length_alloc);
371
372 auto * qual_length_table = (uint64_t*) xmalloc(sizeof(uint64_t) * read_length_alloc * 256);
373 memset(qual_length_table, 0, sizeof(uint64_t) * read_length_alloc * 256);
374
375 auto * ee_length_table = (uint64_t *) xmalloc(sizeof(uint64_t) * read_length_alloc * 4);
376 memset(ee_length_table, 0, sizeof(uint64_t) * read_length_alloc * 4);
377
378 auto * q_length_table = (uint64_t *) xmalloc(sizeof(uint64_t) * read_length_alloc * 4);
379 memset(q_length_table, 0, sizeof(uint64_t) * read_length_alloc * 4);
380
381 auto * sumee_length_table = (double *) xmalloc(sizeof(double) * read_length_alloc);
382 memset(sumee_length_table, 0, sizeof(double) * read_length_alloc);
383
384 int64_t len_min = LONG_MAX;
385 int64_t len_max = 0;
386
387 int qmin = +1000;
388 int qmax = -1000;
389
390 uint64_t quality_chars[256];
391 for(uint64_t & quality_char : quality_chars)
392 {
393 quality_char = 0;
394 }
395
396 while(fastq_next(h, false, chrmap_upcase))
397 {
398 seq_count++;
399
400 int64_t len = fastq_get_sequence_length(h);
401 char * q = fastq_get_quality(h);
402
403 /* update length statistics */
404
405 if (len+1 > read_length_alloc)
406 {
407 read_length_table = (uint64_t*) xrealloc(read_length_table,
408 sizeof(uint64_t) * (len+1));
409 memset(read_length_table + read_length_alloc, 0,
410 sizeof(uint64_t) * (len + 1 - read_length_alloc));
411
412 qual_length_table = (uint64_t*) xrealloc(qual_length_table,
413 sizeof(uint64_t) * (len+1) * 256);
414 memset(qual_length_table + 256 * read_length_alloc, 0,
415 sizeof(uint64_t) * (len + 1 - read_length_alloc) * 256);
416
417 ee_length_table = (uint64_t*) xrealloc(ee_length_table,
418 sizeof(uint64_t) * (len+1) * 4);
419 memset(ee_length_table + 4 * read_length_alloc, 0,
420 sizeof(uint64_t) * (len + 1 - read_length_alloc) * 4);
421
422 q_length_table = (uint64_t*) xrealloc(q_length_table,
423 sizeof(uint64_t) * (len+1) * 4);
424 memset(q_length_table + 4 * read_length_alloc, 0,
425 sizeof(uint64_t) * (len + 1 - read_length_alloc) * 4);
426
427 sumee_length_table = (double *) xrealloc(sumee_length_table,
428 sizeof(double) * (len+1));
429 memset(sumee_length_table + read_length_alloc, 0,
430 sizeof(double) * (len + 1 - read_length_alloc));
431
432 read_length_alloc = len + 1;
433 }
434
435 read_length_table[len]++;
436
437 if (len < len_min) {
438 len_min = len;
439
440 }
441 if (len > len_max) {
442 len_max = len;
443
444 }
445
446 /* update quality statistics */
447
448 symbols += len;
449
450 double ee_limit[4] = { 1.0, 0.5, 0.25, 0.1 };
451
452 double ee = 0.0;
453 int qmin_this = 1000;
454 for(int64_t i=0; i < len; i++)
455 {
456 int qc = q[i];
457
458 int qual = qc - opt_fastq_ascii;
459 if ((qual < opt_fastq_qmin) || (qual > opt_fastq_qmax))
460 {
461 char * msg;
462 if (xsprintf(& msg,
463 "FASTQ quality value (%d) out of range (%" PRId64 "-%" PRId64 ").\n"
464 "Please adjust the FASTQ quality base character or range with the\n"
465 "--fastq_ascii, --fastq_qmin or --fastq_qmax options. For a complete\n"
466 "diagnosis with suggested values, please run vsearch --fastq_chars file.",
467 qual, opt_fastq_qmin, opt_fastq_qmax) > 0) {
468 fatal(msg);
469 } else {
470 fatal("Out of memory");
471
472 }
473 xfree(msg);
474 }
475
476 quality_chars[qc]++;
477 if (qc < qmin) {
478 qmin = qc;
479
480 }
481 if (qc > qmax) {
482 qmax = qc;
483
484 }
485
486 qual_length_table[256*i + qc]++;
487
488 ee += q2p(qual);
489
490 sumee_length_table[i] += ee;
491
492 for(int z=0; z<4; z++)
493 {
494 if (ee <= ee_limit[z]) {
495 ee_length_table[4*i+z]++;
496 } else {
497 break;
498
499 }
500 }
501
502 if (qual < qmin_this) {
503 qmin_this = qual;
504
505 }
506
507 for(int z=0; z<4; z++)
508 {
509 if (qmin_this > 5*(z+1)) {
510 q_length_table[4*i+z]++;
511 } else {
512 break;
513
514 }
515 }
516 }
517
518 progress_update(fastq_get_position(h));
519 }
520 progress_done();
521
522 /* compute various distributions */
523
524 auto * length_dist = (uint64_t*) xmalloc(sizeof(uint64_t) * (len_max+1));
525 auto * symb_dist = (int64_t*) xmalloc(sizeof(int64_t) * (len_max+1));
526
527 auto * rate_dist = (double*) xmalloc(sizeof(double) * (len_max+1));
528 auto * avgq_dist = (double*) xmalloc(sizeof(double) * (len_max+1));
529 auto * avgee_dist = (double*) xmalloc(sizeof(double) * (len_max+1));
530 auto * avgp_dist = (double*) xmalloc(sizeof(double) * (len_max+1));
531
532 int64_t length_accum = 0;
533 int64_t symb_accum = 0;
534
535 for(int64_t i = 0; i <= len_max; i++)
536 {
537 length_accum += read_length_table[i];
538 length_dist[i] = length_accum;
539
540 symb_accum += seq_count - length_accum;
541 symb_dist[i] = symb_accum;
542
543 int64_t q = 0;
544 int64_t x = 0;
545 double e_sum = 0.0;
546 for(int c=qmin; c<=qmax; c++)
547 {
548 int qual = c - opt_fastq_ascii;
549 x += qual_length_table[256*i + c];
550 q += qual_length_table[256*i + c] * qual;
551 e_sum += qual_length_table[256*i + c] * q2p(qual);
552 }
553 avgq_dist[i] = 1.0 * q / x;
554 avgp_dist[i] = e_sum / x;
555 avgee_dist[i] = sumee_length_table[i] / x;
556 rate_dist[i] = avgee_dist[i] / (i+1);
557 }
558
559 if (fp_log)
560 {
561 fprintf(fp_log, "\n");
562 fprintf(fp_log, "Read length distribution\n");
563 fprintf(fp_log, " L N Pct AccPct\n");
564 fprintf(fp_log, "------- ---------- ------- -------\n");
565
566 for(int64_t i = len_max; i >= len_min; i--)
567 {
568 if (read_length_table[i] > 0) {
569 fprintf(fp_log, "%2s%5" PRId64 " %10" PRIu64 " %5.1lf%% %5.1lf%%\n",
570 (i == len_max ? ">=" : " "),
571 i,
572 read_length_table[i],
573 read_length_table[i] * 100.0 / seq_count,
574 100.0 * (seq_count - length_dist[i-1]) / seq_count);
575
576 }
577 }
578
579 fprintf(fp_log, "\n");
580 fprintf(fp_log, "Q score distribution\n");
581 fprintf(fp_log, "ASCII Q Pe N Pct AccPct\n");
582 fprintf(fp_log, "----- --- ------- ---------- ------- -------\n");
583
584 int64_t qual_accum = 0;
585 for(int c = qmax ; c >= qmin ; c--)
586 {
587 if (quality_chars[c] > 0)
588 {
589 qual_accum += quality_chars[c];
590 fprintf(fp_log,
591 " %c %3" PRId64 " %7.5lf %10" PRIu64 " %6.1lf%% %6.1lf%%\n",
592 c,
593 c - opt_fastq_ascii,
594 q2p(c - opt_fastq_ascii),
595 quality_chars[c],
596 100.0 * quality_chars[c] / symbols,
597 100.0 * qual_accum / symbols);
598 }
599 }
600
601 fprintf(fp_log, "\n");
602 fprintf(fp_log, " L PctRecs AvgQ P(AvgQ) AvgP AvgEE Rate RatePct\n");
603 fprintf(fp_log, "----- ------- ---- ------- -------- ----- --------- --------\n");
604
605 for(int64_t i = 2; i <= len_max; i++)
606 {
607 double PctRecs = 100.0 * (seq_count - length_dist[i-1]) / seq_count;
608 double AvgQ = avgq_dist[i-1];
609 double AvgP = avgp_dist[i-1];
610 double AvgEE = avgee_dist[i-1];
611 double Rate = rate_dist[i-1];
612
613 fprintf(fp_log,
614 "%5" PRId64 " %6.1lf%% %4.1lf %7.5lf %8.6lf %5.2lf %9.6lf %7.3lf%%\n",
615 i,
616 PctRecs,
617 AvgQ,
618 q2p(AvgQ),
619 AvgP,
620 AvgEE,
621 Rate,
622 100.0 * Rate);
623 }
624
625 fprintf(fp_log, "\n");
626 fprintf(fp_log, " L 1.0000 0.5000 0.2500 0.1000 1.0000 0.5000 0.2500 0.1000\n");
627 fprintf(fp_log, "----- ------- ------- ------- ------- ------- ------- ------- -------\n");
628
629 for(int64_t i = len_max; i >= 1; i--)
630 {
631 int64_t read_count[4];
632 double read_percentage[4];
633
634 for(int z=0; z<4; z++)
635 {
636 read_count[z] = ee_length_table[4*(i-1)+z];
637 read_percentage[z] = 100.0 * read_count[z] / seq_count;
638 }
639
640 if (read_count[0] > 0)
641 {
642 fprintf(fp_log,
643 "%5" PRId64 " %7" PRId64 " %7" PRId64 " %7" PRId64 " %7" PRId64 " "
644 "%6.2lf%% %6.2lf%% %6.2lf%% %6.2lf%%\n",
645 i,
646 read_count[0], read_count[1],
647 read_count[2], read_count[3],
648 read_percentage[0], read_percentage[1],
649 read_percentage[2], read_percentage[3]);
650 }
651 }
652
653
654 fprintf(fp_log, "\n");
655 fprintf(fp_log, "Truncate at first Q\n");
656 fprintf(fp_log, " Len Q=5 Q=10 Q=15 Q=20\n");
657 fprintf(fp_log, "----- ------ ------ ------ ------\n");
658
659 for(int64_t i = len_max; i >= MAX(1, len_max/2); i--)
660 {
661 double read_percentage[4];
662
663 for(int z=0; z<4; z++) {
664 read_percentage[z] = 100.0 * q_length_table[4*(i-1)+z] / seq_count;
665
666 }
667
668 fprintf(fp_log, "%5" PRId64 " %5.1lf%% %5.1lf%% %5.1lf%% %5.1lf%%\n",
669 i,
670 read_percentage[0], read_percentage[1],
671 read_percentage[2], read_percentage[3]);
672 }
673
674 fprintf(fp_log, "\n");
675 fprintf(fp_log, "%10" PRIu64 " Recs (%.1lfM), 0 too long\n",
676 seq_count, seq_count / 1.0e6);
677 if (seq_count > 0) {
678 fprintf(fp_log, "%10.1lf Avg length\n", 1.0 * symbols / seq_count);
679
680 }
681 fprintf(fp_log, "%9.1lfM Bases\n", symbols / 1.0e6);
682 }
683
684 xfree(read_length_table);
685 xfree(qual_length_table);
686 xfree(ee_length_table);
687 xfree(q_length_table);
688 xfree(sumee_length_table);
689
690 xfree(length_dist);
691 xfree(symb_dist);
692 xfree(rate_dist);
693 xfree(avgq_dist);
694 xfree(avgee_dist);
695 xfree(avgp_dist);
696
697 fastq_close(h);
698
699 if (!opt_quiet)
700 {
701 fprintf(stderr, "Read %" PRIu64 " sequences.\n", seq_count);
702 }
703 }
704
fastx_revcomp()705 void fastx_revcomp()
706 {
707 uint64_t buffer_alloc = 512;
708 char * seq_buffer = (char*) xmalloc(buffer_alloc);
709 char * qual_buffer = (char*) xmalloc(buffer_alloc);
710
711 fastx_handle h = fastx_open(opt_fastx_revcomp);
712
713 if (!h) {
714 fatal("Unrecognized file type (not proper FASTA or FASTQ format)");
715
716 }
717
718 if (opt_fastqout && ! (h->is_fastq || h->is_empty)) {
719 fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores");
720
721 }
722
723 uint64_t filesize = fastx_get_size(h);
724
725 FILE * fp_fastaout = nullptr;
726 FILE * fp_fastqout = nullptr;
727
728 if (opt_fastaout)
729 {
730 fp_fastaout = fopen_output(opt_fastaout);
731 if (!fp_fastaout) {
732 fatal("Unable to open FASTA output file for writing");
733
734 }
735 }
736
737 if (opt_fastqout)
738 {
739 fp_fastqout = fopen_output(opt_fastqout);
740 if (!fp_fastqout) {
741 fatal("Unable to open FASTQ output file for writing");
742
743 }
744 }
745
746 if (h->is_fastq) {
747 progress_init("Reading FASTQ file", filesize);
748 } else {
749 progress_init("Reading FASTA file", filesize);
750
751 }
752
753 int count = 0;
754
755 while(fastx_next(h, false, chrmap_no_change))
756 {
757 count++;
758
759 /* header */
760
761 uint64_t hlen = fastx_get_header_length(h);
762 char * header = fastx_get_header(h);
763 int64_t abundance = fastx_get_abundance(h);
764
765
766 /* sequence */
767
768 uint64_t length = fastx_get_sequence_length(h);
769
770 if (length + 1 > buffer_alloc)
771 {
772 buffer_alloc = length + 1;
773 seq_buffer = (char *) xrealloc(seq_buffer, buffer_alloc);
774 qual_buffer = (char *) xrealloc(qual_buffer, buffer_alloc);
775 }
776
777 char * p = fastx_get_sequence(h);
778 reverse_complement(seq_buffer, p, length);
779
780
781 /* quality values */
782
783 char * q = fastx_get_quality(h);
784
785 if (fastx_is_fastq(h))
786 {
787 /* reverse quality values */
788 for(uint64_t i=0; i<length; i++) {
789 qual_buffer[i] = q[length-1-i];
790
791 }
792 qual_buffer[length] = 0;
793 }
794
795 if (opt_fastaout) {
796 fasta_print_general(fp_fastaout,
797 nullptr,
798 seq_buffer,
799 length,
800 header,
801 hlen,
802 abundance,
803 count,
804 -1.0,
805 -1, -1, nullptr, 0.0);
806
807 }
808
809 if (opt_fastqout) {
810 fastq_print_general(fp_fastqout,
811 seq_buffer,
812 length,
813 header,
814 hlen,
815 qual_buffer,
816 abundance,
817 count,
818 -1.0);
819
820 }
821
822 progress_update(fastx_get_position(h));
823 }
824 progress_done();
825
826 if (opt_fastaout) {
827 fclose(fp_fastaout);
828
829 }
830
831 if (opt_fastqout) {
832 fclose(fp_fastqout);
833
834 }
835
836 fastx_close(h);
837
838 xfree(seq_buffer);
839 xfree(qual_buffer);
840 }
841
fastq_convert()842 void fastq_convert()
843 {
844 fastx_handle h = fastq_open(opt_fastq_convert);
845
846 if (!h) {
847 fatal("Unable to open FASTQ file");
848
849 }
850
851 uint64_t filesize = fastq_get_size(h);
852
853 FILE * fp_fastqout = nullptr;
854
855 fp_fastqout = fopen_output(opt_fastqout);
856 if (!fp_fastqout) {
857 fatal("Unable to open FASTQ output file for writing");
858
859 }
860
861 progress_init("Reading FASTQ file", filesize);
862
863 int j = 1;
864 while(fastq_next(h, false, chrmap_no_change))
865 {
866 /* header */
867
868 char * header = fastq_get_header(h);
869 int64_t abundance = fastq_get_abundance(h);
870
871 /* sequence */
872
873 uint64_t length = fastq_get_sequence_length(h);
874 char * sequence = fastq_get_sequence(h);
875
876 /* convert quality values */
877
878 char * quality = fastq_get_quality(h);
879 for(uint64_t i=0; i<length; i++)
880 {
881 int q = quality[i] - opt_fastq_ascii;
882 if (q < opt_fastq_qmin)
883 {
884 fprintf(stderr,
885 "\nFASTQ quality score (%d) below minimum (%" PRId64
886 ") in entry no %" PRIu64
887 " starting on line %" PRIu64 "\n",
888 q,
889 opt_fastq_qmin,
890 fastq_get_seqno(h) + 1,
891 fastq_get_lineno(h));
892 fatal("FASTQ quality score too low");
893 }
894 if (q > opt_fastq_qmax)
895 {
896 fprintf(stderr,
897 "\nFASTQ quality score (%d) above maximum (%" PRId64
898 ") in entry no %" PRIu64
899 " starting on line %" PRIu64 "\n",
900 q,
901 opt_fastq_qmax,
902 fastq_get_seqno(h) + 1,
903 fastq_get_lineno(h));
904 fatal("FASTQ quality score too high");
905 }
906 if (q < opt_fastq_qminout) {
907 q = opt_fastq_qminout;
908
909 }
910 if (q > opt_fastq_qmaxout) {
911 q = opt_fastq_qmaxout;
912
913 }
914 q += opt_fastq_asciiout;
915 if (q < 33) {
916 q = 33;
917
918 }
919 if (q > 126) {
920 q = 126;
921
922 }
923 quality[i] = q;
924 }
925 quality[length] = 0;
926
927 int hlen = fastq_get_header_length(h);
928 fastq_print_general(fp_fastqout,
929 sequence,
930 length,
931 header,
932 hlen,
933 quality,
934 abundance,
935 j,
936 -1.0);
937
938 j++;
939 progress_update(fastq_get_position(h));
940 }
941
942 progress_done();
943
944 fclose(fp_fastqout);
945 fastq_close(h);
946 }
947