1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 #include "vsearch.h"
62 
fastq_chars()63 void fastq_chars()
64 {
65   uint64_t sequence_chars[256];
66   uint64_t quality_chars[256];
67   uint64_t tail_chars[256];
68   uint64_t total_chars = 0;
69   int maxrun[256];
70 
71   for(int c=0; c<256; c++)
72     {
73       sequence_chars[c] = 0;
74       quality_chars[c] = 0;
75       tail_chars[c] = 0;
76       maxrun[c] = 0;
77     }
78 
79   fastx_handle h = fastq_open(opt_fastq_chars);
80 
81   uint64_t filesize = fastq_get_size(h);
82 
83   progress_init("Reading FASTQ file", filesize);
84 
85   uint64_t seq_count = 0;
86 
87   int qmin_n = 255, qmax_n = 0;
88 
89   while(fastq_next(h, false, chrmap_upcase))
90     {
91       int64_t len = fastq_get_sequence_length(h);
92       char * p = fastq_get_sequence(h);
93       char * q = fastq_get_quality(h);
94 
95       seq_count++;
96       total_chars += len;
97 
98       int run_char = -1;
99       int run = 0;
100 
101       int64_t i = 0;
102       while(i<len)
103         {
104           int pc = *p++;
105           int qc = *q++;
106           sequence_chars[pc]++;
107           quality_chars[qc]++;
108 
109           if ((pc == 'N') || (pc == 'n'))
110             {
111               if (qc < qmin_n) {
112                 qmin_n = qc;
113 
114         }
115               if (qc > qmax_n) {
116                 qmax_n = qc;
117 
118         }
119             }
120 
121           if (pc == run_char)
122             {
123               run++;
124               if (run > maxrun[run_char]) {
125                 maxrun[run_char] = run;
126 
127         }
128             }
129           else
130             {
131               run_char = pc;
132               run = 0;
133             }
134 
135           i++;
136         }
137 
138       if (len >= opt_fastq_tail)
139         {
140           q = fastq_get_quality(h) + len - 1;
141           int tail_char = *q--;
142           int tail_len = 1;
143           while(*q-- == tail_char)
144             {
145               tail_len++;
146               if (tail_len >= opt_fastq_tail) {
147                 break;
148 
149         }
150             }
151           if (tail_len >= opt_fastq_tail) {
152             tail_chars[tail_char]++;
153 
154         }
155         }
156 
157       progress_update(fastq_get_position(h));
158     }
159   progress_done();
160 
161   fastq_close(h);
162 
163   char qmin = 0;
164   char qmax = 0;
165 
166   for(int c=0; c<=255; c++)
167     {
168       if (quality_chars[c])
169         {
170           qmin = c;
171           break;
172         }
173     }
174 
175   for(int c=255; c>=0; c--)
176     {
177       if (quality_chars[c])
178         {
179           qmax = c;
180           break;
181         }
182     }
183 
184   char fastq_ascii, fastq_qmin, fastq_qmax;
185 
186   if ((qmin < 59) || (qmax < 75)) {
187     fastq_ascii = 33;
188   } else {
189     fastq_ascii = 64;
190 
191         }
192 
193   fastq_qmax = qmax - fastq_ascii;
194   fastq_qmin = qmin - fastq_ascii;
195 
196   if (!opt_quiet)
197     {
198       fprintf(stderr, "Read %" PRIu64 " sequences.\n", seq_count);
199 
200       if (seq_count > 0)
201         {
202           fprintf(stderr, "Qmin %d, QMax %d, Range %d\n",
203                   qmin, qmax, qmax-qmin+1);
204 
205           fprintf(stderr, "Guess: -fastq_qmin %d -fastq_qmax %d -fastq_ascii %d\n",
206                   fastq_qmin, fastq_qmax, fastq_ascii);
207 
208           if (fastq_ascii == 64)
209             {
210               if (qmin < 64) {
211                 fprintf(stderr, "Guess: Solexa format (phred+64)\n");
212               } else if (qmin < 66) {
213                 fprintf(stderr, "Guess: Illumina 1.3+ format (phred+64)\n");
214               } else {
215                 fprintf(stderr, "Guess: Illumina 1.5+ format (phred+64)\n");
216 
217         }
218             }
219           else
220             {
221               if (qmax > 73) {
222                 fprintf(stderr, "Guess: Illumina 1.8+ format (phred+33)\n");
223               } else {
224                 fprintf(stderr, "Guess: Original Sanger format (phred+33)\n");
225 
226         }
227             }
228 
229           fprintf(stderr, "\n");
230           fprintf(stderr, "Letter          N   Freq MaxRun\n");
231           fprintf(stderr, "------ ---------- ------ ------\n");
232 
233           for(int c=0; c<256; c++)
234             {
235               if (sequence_chars[c] > 0)
236                 {
237                   fprintf(stderr, "     %c %10" PRIu64 " %5.1f%% %6d",
238                           c,
239                           sequence_chars[c],
240                           100.0 * sequence_chars[c] / total_chars,
241                           maxrun[c]);
242                   if ((c == 'N') || (c == 'n'))
243                     {
244                       if (qmin_n < qmax_n) {
245                         fprintf(stderr, "  Q=%c..%c", qmin_n, qmax_n);
246                       } else {
247                         fprintf(stderr, "  Q=%c", qmin_n);
248 
249         }
250                     }
251                   fprintf(stderr, "\n");
252                 }
253             }
254 
255           fprintf(stderr, "\n");
256           fprintf(stderr, "Char  ASCII    Freq       Tails\n");
257           fprintf(stderr, "----  -----  ------  ----------\n");
258 
259           for(int c=qmin; c<=qmax; c++)
260             {
261               if (quality_chars[c] > 0)
262                 {
263                   fprintf(stderr, " '%c'  %5d  %5.1f%%  %10" PRIu64 "\n",
264                           c,
265                           c,
266                           100.0 * quality_chars[c] / total_chars,
267                           tail_chars[c]);
268                 }
269             }
270         }
271     }
272 
273   if (opt_log)
274     {
275       fprintf(fp_log, "Read %" PRIu64 " sequences.\n", seq_count);
276 
277       if (seq_count > 0)
278         {
279           fprintf(fp_log, "Qmin %d, QMax %d, Range %d\n",
280                   qmin, qmax, qmax-qmin+1);
281 
282           fprintf(fp_log, "Guess: -fastq_qmin %d -fastq_qmax %d -fastq_ascii %d\n",
283                   fastq_qmin, fastq_qmax, fastq_ascii);
284 
285           if (fastq_ascii == 64)
286             {
287               if (qmin < 64) {
288                 fprintf(fp_log, "Guess: Solexa format (phred+64)\n");
289               } else if (qmin < 66) {
290                 fprintf(fp_log, "Guess: Illumina 1.3+ format (phred+64)\n");
291               } else {
292                 fprintf(fp_log, "Guess: Illumina 1.5+ format (phred+64)\n");
293 
294         }
295             }
296           else
297             {
298               if (qmax > 73) {
299                 fprintf(fp_log, "Guess: Illumina 1.8+ format (phred+33)\n");
300               } else {
301                 fprintf(fp_log, "Guess: Original Sanger format (phred+33)\n");
302 
303         }
304             }
305 
306           fprintf(fp_log, "\n");
307           fprintf(fp_log, "Letter          N   Freq MaxRun\n");
308           fprintf(fp_log, "------ ---------- ------ ------\n");
309 
310           for(int c=0; c<256; c++)
311             {
312               if (sequence_chars[c] > 0)
313                 {
314                   fprintf(fp_log, "     %c %10" PRIu64 " %5.1f%% %6d",
315                           c,
316                           sequence_chars[c],
317                           100.0 * sequence_chars[c] / total_chars,
318                           maxrun[c]);
319                   if ((c == 'N') || (c == 'n'))
320                     {
321                       if (qmin_n < qmax_n) {
322                         fprintf(fp_log, "  Q=%c..%c", qmin_n, qmax_n);
323                       } else {
324                         fprintf(fp_log, "  Q=%c", qmin_n);
325 
326         }
327                     }
328                   fprintf(fp_log, "\n");
329                 }
330             }
331 
332           fprintf(fp_log, "\n");
333           fprintf(fp_log, "Char  ASCII    Freq       Tails\n");
334           fprintf(fp_log, "----  -----  ------  ----------\n");
335 
336           for(int c=qmin; c<=qmax; c++)
337             {
338               if (quality_chars[c] > 0)
339                 {
340                   fprintf(fp_log, " '%c'  %5d  %5.1f%%  %10" PRIu64 "\n",
341                           c,
342                           c,
343                           100.0 * quality_chars[c] / total_chars,
344                           tail_chars[c]);
345                 }
346             }
347         }
348     }
349 }
350 
q2p(double q)351 double q2p(double q)
352 {
353   return exp10(- q / 10.0);
354 }
355 
fastq_stats()356 void fastq_stats()
357 {
358   fastx_handle h = fastq_open(opt_fastq_stats);
359 
360   uint64_t filesize = fastq_get_size(h);
361 
362   progress_init("Reading FASTQ file", filesize);
363 
364   uint64_t seq_count = 0;
365   uint64_t symbols = 0;
366 
367   int64_t read_length_alloc = 512;
368 
369   auto * read_length_table = (uint64_t*) xmalloc(sizeof(uint64_t) * read_length_alloc);
370   memset(read_length_table, 0, sizeof(uint64_t) * read_length_alloc);
371 
372   auto * qual_length_table = (uint64_t*) xmalloc(sizeof(uint64_t) * read_length_alloc * 256);
373   memset(qual_length_table, 0, sizeof(uint64_t) * read_length_alloc * 256);
374 
375   auto * ee_length_table = (uint64_t *) xmalloc(sizeof(uint64_t) * read_length_alloc * 4);
376   memset(ee_length_table, 0, sizeof(uint64_t) * read_length_alloc * 4);
377 
378   auto * q_length_table = (uint64_t *) xmalloc(sizeof(uint64_t) * read_length_alloc * 4);
379   memset(q_length_table, 0, sizeof(uint64_t) * read_length_alloc * 4);
380 
381   auto * sumee_length_table = (double *) xmalloc(sizeof(double) * read_length_alloc);
382   memset(sumee_length_table, 0, sizeof(double) * read_length_alloc);
383 
384   int64_t len_min = LONG_MAX;
385   int64_t len_max = 0;
386 
387   int qmin = +1000;
388   int qmax = -1000;
389 
390   uint64_t quality_chars[256];
391   for(uint64_t & quality_char : quality_chars)
392     {
393       quality_char = 0;
394     }
395 
396   while(fastq_next(h, false, chrmap_upcase))
397     {
398       seq_count++;
399 
400       int64_t len = fastq_get_sequence_length(h);
401       char * q = fastq_get_quality(h);
402 
403       /* update length statistics */
404 
405       if (len+1 > read_length_alloc)
406         {
407           read_length_table = (uint64_t*) xrealloc(read_length_table,
408                                               sizeof(uint64_t) * (len+1));
409           memset(read_length_table + read_length_alloc, 0,
410                  sizeof(uint64_t) * (len + 1 - read_length_alloc));
411 
412           qual_length_table = (uint64_t*) xrealloc(qual_length_table,
413                                               sizeof(uint64_t) * (len+1) * 256);
414           memset(qual_length_table + 256 * read_length_alloc, 0,
415                  sizeof(uint64_t) * (len + 1 - read_length_alloc) * 256);
416 
417           ee_length_table = (uint64_t*) xrealloc(ee_length_table,
418                                             sizeof(uint64_t) * (len+1) * 4);
419           memset(ee_length_table + 4 * read_length_alloc, 0,
420                  sizeof(uint64_t) * (len + 1 - read_length_alloc) * 4);
421 
422           q_length_table = (uint64_t*) xrealloc(q_length_table,
423                                            sizeof(uint64_t) * (len+1) * 4);
424           memset(q_length_table + 4 * read_length_alloc, 0,
425                  sizeof(uint64_t) * (len + 1 - read_length_alloc) * 4);
426 
427           sumee_length_table = (double *) xrealloc(sumee_length_table,
428                                                    sizeof(double) * (len+1));
429           memset(sumee_length_table + read_length_alloc, 0,
430                  sizeof(double) * (len + 1 - read_length_alloc));
431 
432           read_length_alloc = len + 1;
433         }
434 
435       read_length_table[len]++;
436 
437       if (len < len_min) {
438         len_min = len;
439 
440         }
441       if (len > len_max) {
442         len_max = len;
443 
444         }
445 
446       /* update quality statistics */
447 
448       symbols += len;
449 
450       double ee_limit[4] = { 1.0, 0.5, 0.25, 0.1 };
451 
452       double ee = 0.0;
453       int qmin_this = 1000;
454       for(int64_t i=0; i < len; i++)
455         {
456           int qc = q[i];
457 
458           int qual = qc - opt_fastq_ascii;
459           if ((qual < opt_fastq_qmin) || (qual > opt_fastq_qmax))
460             {
461               char * msg;
462               if (xsprintf(& msg,
463                            "FASTQ quality value (%d) out of range (%" PRId64 "-%" PRId64 ").\n"
464                            "Please adjust the FASTQ quality base character or range with the\n"
465                            "--fastq_ascii, --fastq_qmin or --fastq_qmax options. For a complete\n"
466                            "diagnosis with suggested values, please run vsearch --fastq_chars file.",
467                            qual, opt_fastq_qmin, opt_fastq_qmax) > 0) {
468                 fatal(msg);
469               } else {
470                 fatal("Out of memory");
471 
472         }
473               xfree(msg);
474             }
475 
476           quality_chars[qc]++;
477           if (qc < qmin) {
478             qmin = qc;
479 
480         }
481           if (qc > qmax) {
482             qmax = qc;
483 
484         }
485 
486           qual_length_table[256*i + qc]++;
487 
488           ee += q2p(qual);
489 
490           sumee_length_table[i] += ee;
491 
492           for(int z=0; z<4; z++)
493             {
494               if (ee <= ee_limit[z]) {
495                 ee_length_table[4*i+z]++;
496               } else {
497                 break;
498 
499         }
500             }
501 
502           if (qual < qmin_this) {
503             qmin_this = qual;
504 
505         }
506 
507           for(int z=0; z<4; z++)
508             {
509               if (qmin_this > 5*(z+1)) {
510                 q_length_table[4*i+z]++;
511               } else {
512                 break;
513 
514         }
515             }
516         }
517 
518       progress_update(fastq_get_position(h));
519     }
520   progress_done();
521 
522   /* compute various distributions */
523 
524   auto * length_dist = (uint64_t*) xmalloc(sizeof(uint64_t) * (len_max+1));
525   auto * symb_dist = (int64_t*) xmalloc(sizeof(int64_t) * (len_max+1));
526 
527   auto * rate_dist = (double*) xmalloc(sizeof(double) * (len_max+1));
528   auto * avgq_dist = (double*) xmalloc(sizeof(double) * (len_max+1));
529   auto * avgee_dist = (double*) xmalloc(sizeof(double) * (len_max+1));
530   auto * avgp_dist = (double*) xmalloc(sizeof(double) * (len_max+1));
531 
532   int64_t length_accum = 0;
533   int64_t symb_accum = 0;
534 
535   for(int64_t i = 0; i <= len_max; i++)
536     {
537       length_accum += read_length_table[i];
538       length_dist[i] = length_accum;
539 
540       symb_accum += seq_count - length_accum;
541       symb_dist[i] = symb_accum;
542 
543       int64_t q = 0;
544       int64_t x = 0;
545       double e_sum = 0.0;
546       for(int c=qmin; c<=qmax; c++)
547         {
548           int qual = c - opt_fastq_ascii;
549           x += qual_length_table[256*i + c];
550           q += qual_length_table[256*i + c] * qual;
551           e_sum += qual_length_table[256*i + c] * q2p(qual);
552         }
553       avgq_dist[i] = 1.0 * q / x;
554       avgp_dist[i] = e_sum / x;
555       avgee_dist[i] = sumee_length_table[i] / x;
556       rate_dist[i] = avgee_dist[i] / (i+1);
557     }
558 
559   if (fp_log)
560     {
561       fprintf(fp_log, "\n");
562       fprintf(fp_log, "Read length distribution\n");
563       fprintf(fp_log, "      L           N      Pct   AccPct\n");
564       fprintf(fp_log, "-------  ----------  -------  -------\n");
565 
566       for(int64_t i = len_max; i >= len_min; i--)
567         {
568           if (read_length_table[i] > 0) {
569             fprintf(fp_log, "%2s%5" PRId64 "  %10" PRIu64 "   %5.1lf%%   %5.1lf%%\n",
570                     (i == len_max ? ">=" : "  "),
571                     i,
572                     read_length_table[i],
573                     read_length_table[i] * 100.0 / seq_count,
574                     100.0 * (seq_count - length_dist[i-1]) / seq_count);
575 
576         }
577         }
578 
579       fprintf(fp_log, "\n");
580       fprintf(fp_log, "Q score distribution\n");
581       fprintf(fp_log, "ASCII    Q       Pe           N      Pct   AccPct\n");
582       fprintf(fp_log, "-----  ---  -------  ----------  -------  -------\n");
583 
584       int64_t qual_accum = 0;
585       for(int c = qmax ; c >= qmin ; c--)
586         {
587           if (quality_chars[c] > 0)
588             {
589               qual_accum += quality_chars[c];
590               fprintf(fp_log,
591                       "    %c  %3" PRId64 "  %7.5lf  %10" PRIu64 "  %6.1lf%%  %6.1lf%%\n",
592                       c,
593                       c - opt_fastq_ascii,
594                       q2p(c - opt_fastq_ascii),
595                       quality_chars[c],
596                       100.0 * quality_chars[c] / symbols,
597                       100.0 * qual_accum / symbols);
598             }
599         }
600 
601       fprintf(fp_log, "\n");
602       fprintf(fp_log, "    L  PctRecs  AvgQ  P(AvgQ)      AvgP  AvgEE       Rate   RatePct\n");
603       fprintf(fp_log, "-----  -------  ----  -------  --------  -----  ---------  --------\n");
604 
605       for(int64_t i = 2; i <= len_max; i++)
606         {
607           double PctRecs = 100.0 * (seq_count - length_dist[i-1]) / seq_count;
608           double AvgQ = avgq_dist[i-1];
609           double AvgP = avgp_dist[i-1];
610           double AvgEE = avgee_dist[i-1];
611           double Rate = rate_dist[i-1];
612 
613           fprintf(fp_log,
614                   "%5" PRId64 "  %6.1lf%%  %4.1lf  %7.5lf  %8.6lf  %5.2lf  %9.6lf  %7.3lf%%\n",
615                   i,
616                   PctRecs,
617                   AvgQ,
618                   q2p(AvgQ),
619                   AvgP,
620                   AvgEE,
621                   Rate,
622                   100.0 * Rate);
623         }
624 
625       fprintf(fp_log, "\n");
626       fprintf(fp_log, "    L   1.0000   0.5000   0.2500   0.1000   1.0000   0.5000   0.2500   0.1000\n");
627       fprintf(fp_log, "-----  -------  -------  -------  -------  -------  -------  -------  -------\n");
628 
629       for(int64_t i = len_max; i >= 1; i--)
630         {
631           int64_t read_count[4];
632           double read_percentage[4];
633 
634           for(int z=0; z<4; z++)
635             {
636               read_count[z] = ee_length_table[4*(i-1)+z];
637               read_percentage[z] = 100.0 * read_count[z] / seq_count;
638             }
639 
640           if (read_count[0] > 0)
641             {
642               fprintf(fp_log,
643                       "%5" PRId64 "  %7" PRId64 "  %7" PRId64 "  %7" PRId64 "  %7" PRId64 "  "
644                       "%6.2lf%%  %6.2lf%%  %6.2lf%%  %6.2lf%%\n",
645                       i,
646                       read_count[0], read_count[1],
647                       read_count[2], read_count[3],
648                       read_percentage[0], read_percentage[1],
649                       read_percentage[2], read_percentage[3]);
650             }
651         }
652 
653 
654       fprintf(fp_log, "\n");
655       fprintf(fp_log, "Truncate at first Q\n");
656       fprintf(fp_log, "  Len     Q=5    Q=10    Q=15    Q=20\n");
657       fprintf(fp_log, "-----  ------  ------  ------  ------\n");
658 
659       for(int64_t i = len_max; i >= MAX(1, len_max/2); i--)
660         {
661           double read_percentage[4];
662 
663           for(int z=0; z<4; z++) {
664             read_percentage[z] = 100.0 * q_length_table[4*(i-1)+z] / seq_count;
665 
666         }
667 
668           fprintf(fp_log, "%5" PRId64 "  %5.1lf%%  %5.1lf%%  %5.1lf%%  %5.1lf%%\n",
669                   i,
670                   read_percentage[0], read_percentage[1],
671                   read_percentage[2], read_percentage[3]);
672         }
673 
674       fprintf(fp_log, "\n");
675       fprintf(fp_log, "%10" PRIu64 "  Recs (%.1lfM), 0 too long\n",
676               seq_count, seq_count / 1.0e6);
677       if (seq_count > 0) {
678         fprintf(fp_log, "%10.1lf  Avg length\n", 1.0 * symbols / seq_count);
679 
680         }
681       fprintf(fp_log, "%9.1lfM  Bases\n", symbols / 1.0e6);
682     }
683 
684   xfree(read_length_table);
685   xfree(qual_length_table);
686   xfree(ee_length_table);
687   xfree(q_length_table);
688   xfree(sumee_length_table);
689 
690   xfree(length_dist);
691   xfree(symb_dist);
692   xfree(rate_dist);
693   xfree(avgq_dist);
694   xfree(avgee_dist);
695   xfree(avgp_dist);
696 
697   fastq_close(h);
698 
699   if (!opt_quiet)
700     {
701         fprintf(stderr, "Read %" PRIu64 " sequences.\n", seq_count);
702     }
703 }
704 
fastx_revcomp()705 void fastx_revcomp()
706 {
707   uint64_t buffer_alloc = 512;
708   char * seq_buffer = (char*) xmalloc(buffer_alloc);
709   char * qual_buffer = (char*) xmalloc(buffer_alloc);
710 
711   fastx_handle h = fastx_open(opt_fastx_revcomp);
712 
713   if (!h) {
714     fatal("Unrecognized file type (not proper FASTA or FASTQ format)");
715 
716         }
717 
718   if (opt_fastqout && ! (h->is_fastq || h->is_empty)) {
719     fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores");
720 
721         }
722 
723   uint64_t filesize = fastx_get_size(h);
724 
725   FILE * fp_fastaout = nullptr;
726   FILE * fp_fastqout = nullptr;
727 
728   if (opt_fastaout)
729     {
730       fp_fastaout = fopen_output(opt_fastaout);
731       if (!fp_fastaout) {
732         fatal("Unable to open FASTA output file for writing");
733 
734         }
735     }
736 
737   if (opt_fastqout)
738     {
739       fp_fastqout = fopen_output(opt_fastqout);
740       if (!fp_fastqout) {
741         fatal("Unable to open FASTQ output file for writing");
742 
743         }
744     }
745 
746   if (h->is_fastq) {
747     progress_init("Reading FASTQ file", filesize);
748   } else {
749     progress_init("Reading FASTA file", filesize);
750 
751         }
752 
753   int count = 0;
754 
755   while(fastx_next(h, false, chrmap_no_change))
756     {
757       count++;
758 
759       /* header */
760 
761       uint64_t hlen = fastx_get_header_length(h);
762       char * header = fastx_get_header(h);
763       int64_t abundance = fastx_get_abundance(h);
764 
765 
766       /* sequence */
767 
768       uint64_t length = fastx_get_sequence_length(h);
769 
770       if (length + 1 > buffer_alloc)
771         {
772           buffer_alloc = length + 1;
773           seq_buffer = (char *) xrealloc(seq_buffer, buffer_alloc);
774           qual_buffer = (char *) xrealloc(qual_buffer, buffer_alloc);
775         }
776 
777       char * p = fastx_get_sequence(h);
778       reverse_complement(seq_buffer, p, length);
779 
780 
781       /* quality values */
782 
783       char * q = fastx_get_quality(h);
784 
785       if (fastx_is_fastq(h))
786         {
787           /* reverse quality values */
788           for(uint64_t i=0; i<length; i++) {
789             qual_buffer[i] = q[length-1-i];
790 
791         }
792           qual_buffer[length] = 0;
793         }
794 
795       if (opt_fastaout) {
796         fasta_print_general(fp_fastaout,
797                             nullptr,
798                             seq_buffer,
799                             length,
800                             header,
801                             hlen,
802                             abundance,
803                             count,
804                             -1.0,
805                             -1, -1, nullptr, 0.0);
806 
807         }
808 
809       if (opt_fastqout) {
810         fastq_print_general(fp_fastqout,
811                             seq_buffer,
812                             length,
813                             header,
814                             hlen,
815                             qual_buffer,
816                             abundance,
817                             count,
818                             -1.0);
819 
820         }
821 
822       progress_update(fastx_get_position(h));
823     }
824   progress_done();
825 
826   if (opt_fastaout) {
827     fclose(fp_fastaout);
828 
829         }
830 
831   if (opt_fastqout) {
832     fclose(fp_fastqout);
833 
834         }
835 
836   fastx_close(h);
837 
838   xfree(seq_buffer);
839   xfree(qual_buffer);
840 }
841 
fastq_convert()842 void fastq_convert()
843 {
844   fastx_handle h = fastq_open(opt_fastq_convert);
845 
846   if (!h) {
847     fatal("Unable to open FASTQ file");
848 
849         }
850 
851   uint64_t filesize = fastq_get_size(h);
852 
853   FILE * fp_fastqout = nullptr;
854 
855   fp_fastqout = fopen_output(opt_fastqout);
856   if (!fp_fastqout) {
857     fatal("Unable to open FASTQ output file for writing");
858 
859         }
860 
861   progress_init("Reading FASTQ file", filesize);
862 
863   int j = 1;
864   while(fastq_next(h, false, chrmap_no_change))
865     {
866       /* header */
867 
868       char * header = fastq_get_header(h);
869       int64_t abundance = fastq_get_abundance(h);
870 
871       /* sequence */
872 
873       uint64_t length = fastq_get_sequence_length(h);
874       char * sequence = fastq_get_sequence(h);
875 
876       /* convert quality values */
877 
878       char * quality = fastq_get_quality(h);
879       for(uint64_t i=0; i<length; i++)
880         {
881           int q = quality[i] - opt_fastq_ascii;
882           if (q < opt_fastq_qmin)
883             {
884               fprintf(stderr,
885                       "\nFASTQ quality score (%d) below minimum (%" PRId64
886                       ") in entry no %" PRIu64
887                       " starting on line %" PRIu64 "\n",
888                       q,
889                       opt_fastq_qmin,
890                       fastq_get_seqno(h) + 1,
891                       fastq_get_lineno(h));
892               fatal("FASTQ quality score too low");
893             }
894           if (q > opt_fastq_qmax)
895             {
896               fprintf(stderr,
897                       "\nFASTQ quality score (%d) above maximum (%" PRId64
898                       ") in entry no %" PRIu64
899                       " starting on line %" PRIu64 "\n",
900                       q,
901                       opt_fastq_qmax,
902                       fastq_get_seqno(h) + 1,
903                       fastq_get_lineno(h));
904               fatal("FASTQ quality score too high");
905             }
906           if (q < opt_fastq_qminout) {
907             q = opt_fastq_qminout;
908 
909         }
910           if (q > opt_fastq_qmaxout) {
911             q = opt_fastq_qmaxout;
912 
913         }
914           q += opt_fastq_asciiout;
915           if (q < 33) {
916             q = 33;
917 
918         }
919           if (q > 126) {
920             q = 126;
921 
922         }
923           quality[i] = q;
924         }
925       quality[length] = 0;
926 
927       int hlen = fastq_get_header_length(h);
928       fastq_print_general(fp_fastqout,
929                           sequence,
930                           length,
931                           header,
932                           hlen,
933                           quality,
934                           abundance,
935                           j,
936                           -1.0);
937 
938       j++;
939       progress_update(fastq_get_position(h));
940     }
941 
942   progress_done();
943 
944   fclose(fp_fastqout);
945   fastq_close(h);
946 }
947