1 /*  bam_fastq.c -- FASTA and FASTQ file generation
2 
3     Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd.
4     Portions copyright (C) 2009, 2011, 2012 Broad Institute.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notices and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <stdlib.h>
29 #include <string.h>
30 #include <strings.h>
31 #include <stdbool.h>
32 #include <ctype.h>
33 #include <assert.h>
34 #include <inttypes.h>
35 #include <unistd.h>
36 
37 #include "htslib/sam.h"
38 #include "htslib/klist.h"
39 #include "htslib/kstring.h"
40 #include "htslib/bgzf.h"
41 #include "htslib/thread_pool.h"
42 #include "samtools.h"
43 #include "sam_opts.h"
44 
45 #define DEFAULT_BARCODE_TAG "BC"
46 #define DEFAULT_QUALITY_TAG "QT"
47 #define INDEX_SEPARATOR "+"
48 
49 int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
bam2fq_usage(FILE * to,const char * command)50 static void bam2fq_usage(FILE *to, const char *command)
51 {
52     int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0;
53     fprintf(to,
54 "Usage: samtools %s [options...] <in.bam>\n", command);
55     fprintf(to,
56 "\n"
57 "Description:\n"
58 "Converts a SAM, BAM or CRAM to %s format.\n"
59 "\n"
60 "Options:\n"
61 "  -0 FILE      write reads designated READ_OTHER to FILE\n"
62 "  -1 FILE      write reads designated READ1 to FILE\n"
63 "  -2 FILE      write reads designated READ2 to FILE\n"
64 "  -o FILE      write reads designated READ1 or READ2 to FILE\n"
65 "               note: if a singleton file is specified with -s, only\n"
66 "               paired reads will be written to the -1 and -2 files.\n"
67 "  -f INT       only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
68 "  -F INT       only include reads with none of the FLAGS in INT present [0x900]\n"       //   F&x == 0
69 "  -G INT       only EXCLUDE reads with all  of the FLAGs in INT present [0]\n"       // !(F&x == x)
70 "  -n           don't append /1 and /2 to the read name\n"
71 "  -N           always append /1 and /2 to the read name\n",
72     fq ? "FASTQ" : "FASTA");
73     if (fq) fprintf(to,
74 "  -O           output quality in the OQ tag if present\n");
75     fprintf(to,
76 "  -s FILE      write singleton reads designated READ1 or READ2 to FILE\n"
77 "  -t           copy RG, BC and QT tags to the %s header line\n",
78     fq ? "FASTQ" : "FASTA");
79     fprintf(to,
80 "  -T TAGLIST   copy arbitrary tags to the %s header line\n",
81     fq ? "FASTQ" : "FASTA");
82     if (fq) fprintf(to,
83 "  -v INT       default quality score if not given in file [1]\n"
84 "  -i           add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n"
85 "  -c INT       compression level [0..9] to use when writing bgzf files [1]\n"
86 "  --i1 FILE    write first index reads to FILE\n"
87 "  --i2 FILE    write second index reads to FILE\n"
88 "  --barcode-tag TAG\n"
89 "               Barcode tag [" DEFAULT_BARCODE_TAG "]\n"
90 "  --quality-tag TAG\n"
91 "               Quality tag [" DEFAULT_QUALITY_TAG "]\n"
92 "  --index-format STR\n"
93 "               How to parse barcode and quality tags\n\n");
94     sam_global_opt_help(to, "-.--.@-.");
95     fprintf(to,
96 "\n"
97 "The files will be automatically compressed if the file names have a .gz\n"
98 "or .bgzf extension.  The input to this program must be collated by name.\n"
99 "Run 'samtools collate' or 'samtools sort -n' to achieve this.\n"
100 "\n"
101 "Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n"
102 "Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n"
103 "Otherwise reads are designated READ_OTHER (both flags set or both flags unset).\n"
104 "Run 'samtools flags' for more information on flag codes and meanings.\n");
105     fprintf(to,
106 "\n"
107 "The index-format string describes how to parse the barcode and quality tags.\n"
108 "It is made up of 'i' or 'n' followed by a length or '*'.  For example:\n"
109 "   i14i8       The first 14 characters are index 1, the next 8 are index 2\n"
110 "   n8i14       Ignore the first 8 characters, and use the next 14 for index 1\n\n"
111 "If the tag contains a separator, then the numeric part can be replaced with\n"
112 "'*' to mean 'read until the separator or end of tag', for example:\n"
113 "   i*i*        Break the tag at the separator into index 1 and index 2\n"
114 "   n*i*        Ignore the left part of the tag until the separator,\n"
115 "               then use the second part of the tag as index 1\n");
116     fprintf(to,
117 "\n"
118 "Examples:\n"
119 "To get just the paired reads in separate files, use:\n"
120 "   samtools %s -1 pair1.%s -2 pair2.%s -0 /dev/null -s /dev/null -n in.bam\n"
121 "\nTo get all non-supplementary/secondary reads in a single file, redirect\n"
122 "the output:\n"
123 "   samtools %s in.bam > all_reads.%s\n",
124             command, fq ? "fq" : "fa", fq ? "fq" : "fa",
125             command, fq ? "fq" : "fa");
126 }
127 
128 typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart;
129 typedef enum { FASTA, FASTQ } fastfile;
130 typedef struct bam2fq_opts {
131     char *fnse;
132     char *fnr[3];
133     char *fn_input; // pointer to input filename in argv do not free
134     bool has12, has12always, use_oq, copy_tags, illumina_tag;
135     int flag_on, flag_off, flag_alloff;
136     sam_global_args ga;
137     fastfile filetype;
138     int def_qual;
139     char *barcode_tag;
140     char *quality_tag;
141     char *index_file[2];
142     char *index_format;
143     char *extra_tags;
144     char compression_level;
145 } bam2fq_opts_t;
146 
147 typedef struct bam2fq_state {
148     samFile *fp;
149     samFile *fpse;
150     samFile *fpr[3];
151     samFile *fpi[3];
152     samFile *hstdout;
153     sam_hdr_t *h;
154     bool has12, use_oq, copy_tags, illumina_tag;
155     int flag_on, flag_off, flag_alloff;
156     fastfile filetype;
157     int def_qual;
158     char *index_sequence;
159     char compression_level;
160     htsThreadPool p;
161 } bam2fq_state_t;
162 
which_readpart(const bam1_t * b)163 static readpart which_readpart(const bam1_t *b)
164 {
165     if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
166         return READ_1;
167     } else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) {
168         return READ_2;
169     } else {
170         return READ_UNKNOWN;
171     }
172 }
173 
free_opts(bam2fq_opts_t * opts)174 static void free_opts(bam2fq_opts_t *opts)
175 {
176     free(opts);
177 }
178 
179 // return true if valid
parse_opts(int argc,char * argv[],bam2fq_opts_t ** opts_out)180 static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
181 {
182     // Parse args
183     bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t));
184     opts->has12 = true;
185     opts->has12always = false;
186     opts->filetype = FASTQ;
187     opts->def_qual = 1;
188     opts->barcode_tag = NULL;
189     opts->quality_tag = NULL;
190     opts->index_format = NULL;
191     opts->index_file[0] = NULL;
192     opts->index_file[1] = NULL;
193     opts->extra_tags = NULL;
194     opts->compression_level = 1;
195     opts->flag_off = BAM_FSECONDARY|BAM_FSUPPLEMENTARY;
196     int flag_off_set = 0;
197 
198     int c;
199     sam_global_args_init(&opts->ga);
200     static const struct option lopts[] = {
201         SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
202         {"i1", required_argument, NULL, 1},
203         {"I1", required_argument, NULL, 1},
204         {"i2", required_argument, NULL, 2},
205         {"I2", required_argument, NULL, 2},
206         {"if", required_argument, NULL, 3},
207         {"IF", required_argument, NULL, 3},
208         {"index-format", required_argument, NULL, 3},
209         {"barcode-tag", required_argument, NULL, 'b'},
210         {"quality-tag", required_argument, NULL, 'q'},
211         { NULL, 0, NULL, 0 }
212     };
213     while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:",
214                             lopts, NULL)) > 0) {
215         switch (c) {
216             case 'b': opts->barcode_tag = optarg; break;
217             case 'q': opts->quality_tag = optarg; break;
218             case  1 : opts->index_file[0] = optarg; break;
219             case  2 : opts->index_file[1] = optarg; break;
220             case  3 : opts->index_format = optarg; break;
221             case '0': opts->fnr[0] = optarg; break;
222             case '1': opts->fnr[1] = optarg; break;
223             case '2': opts->fnr[2] = optarg; break;
224             case 'o': opts->fnr[1] = optarg; opts->fnr[2] = optarg; break;
225             case 'f': opts->flag_on |= strtol(optarg, 0, 0); break;
226             case 'F':
227                 if (!flag_off_set) {
228                     flag_off_set = 1;
229                     opts->flag_off = 0;
230                 }
231                 opts->flag_off |= strtol(optarg, 0, 0);
232                 break;
233             case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
234             case 'n': opts->has12 = false; break;
235             case 'N': opts->has12always = true; break;
236             case 'O': opts->use_oq = true; break;
237             case 's': opts->fnse = optarg; break;
238             case 't': opts->copy_tags = true; break;
239             case 'i': opts->illumina_tag = true; break;
240             case 'c':
241                 opts->compression_level = atoi(optarg);
242                 if (opts->compression_level < 0)
243                     opts->compression_level = 0;
244                 if (opts->compression_level > 9)
245                     opts->compression_level = 9;
246                 break;
247             case 'T': opts->extra_tags = optarg; break;
248             case 'v': opts->def_qual = atoi(optarg); break;
249 
250             case '?':
251                 bam2fq_usage(stderr, argv[0]);
252                 free_opts(opts);
253                 return false;
254             default:
255                 if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
256                     bam2fq_usage(stderr, argv[0]);
257                     free_opts(opts);
258                     return false;
259                 }
260                 break;
261         }
262     }
263 
264     if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
265     if (opts->has12always) opts->has12 = true;
266 
267     if (!opts->barcode_tag) opts->barcode_tag = DEFAULT_BARCODE_TAG;
268     if (!opts->quality_tag) opts->quality_tag = DEFAULT_QUALITY_TAG;
269 
270     int nIndex = 0;
271     if (opts->index_format) {
272         char *s;
273         for (s = opts->index_format; *s; s++) {
274             if (*s == 'i') nIndex++;
275         }
276     }
277     if (nIndex>2) {
278         fprintf(stderr,"Invalid index format: more than 2 indexes\n");
279         bam2fq_usage(stderr, argv[0]);
280         free_opts(opts);
281         return false;
282     }
283 
284     if (opts->index_file[1] && !opts->index_file[0]) {
285         fprintf(stderr, "Index one specified, but index two not given\n");
286         bam2fq_usage(stderr, argv[0]);
287         free_opts(opts);
288         return false;
289     }
290 
291     if (opts->illumina_tag && !nIndex) {
292         fprintf(stderr, "You must specify an index format (--index-format) with the Illumina Casava (-i) option\n");
293         bam2fq_usage(stderr, argv[0]);
294         free_opts(opts);
295         return false;
296     }
297 
298     if (nIndex==0 && opts->index_file[0]) {
299         fprintf(stderr, "index_format not specified, but index file given\n");
300         bam2fq_usage(stderr, argv[0]);
301         free_opts(opts);
302         return false;
303     }
304 
305     if (opts->def_qual < 0 || 93 < opts->def_qual) {
306         fprintf(stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual);
307         bam2fq_usage(stderr, argv[0]);
308         free_opts(opts);
309         return false;
310     }
311 
312     const char* type_str = argv[0];
313     if (strcasecmp("fastq", type_str) == 0 ||
314         strcasecmp("bam2fq", type_str) == 0) {
315         opts->filetype = FASTQ;
316     } else if (strcasecmp("fasta", type_str) == 0) {
317         opts->filetype = FASTA;
318     } else {
319         print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str);
320         bam2fq_usage(stderr, argv[0]);
321         free_opts(opts);
322         return false;
323     }
324 
325     if (argc == optind && isatty(STDIN_FILENO)) {
326         bam2fq_usage(stdout, argv[0]);
327         free_opts(opts);
328         return true;
329     }
330 
331     if (argc - optind > 1) {
332         fprintf(stderr, "Too many arguments.\n");
333         bam2fq_usage(stderr, argv[0]);
334         free_opts(opts);
335         return false;
336     }
337     opts->fn_input = argc > optind ? argv[optind] : "-";
338     *opts_out = opts;
339     return true;
340 }
341 
set_sam_opts(samFile * fp,bam2fq_state_t * state,const bam2fq_opts_t * opts)342 void set_sam_opts(samFile *fp, bam2fq_state_t *state,
343                   const bam2fq_opts_t *opts) {
344     if (state->has12)
345         hts_set_opt(fp, FASTQ_OPT_RNUM, 1);
346 
347     if (state->illumina_tag)
348         hts_set_opt(fp, FASTQ_OPT_CASAVA, 1);
349 
350     hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag);
351 
352     kstring_t tag_list = {0,0};
353     if (state->copy_tags)
354         kputs("RG,BC,QT", &tag_list);
355     if (opts->extra_tags) {
356         if (tag_list.l)
357             kputc(',', &tag_list);
358         kputs(opts->extra_tags, &tag_list);
359     }
360     if (tag_list.l)
361         hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s);
362     ks_free(&tag_list);
363 }
364 
365 // Open a file as normal or gzipped based on filename.
366 // Note we always use bgzf and don't bother to attempt non-blocked
367 // gzip streams.  This is a departure from the old fastq code.
sam_open_z(char * fn,char * mode,bam2fq_state_t * state)368 static samFile *sam_open_z(char *fn, char *mode, bam2fq_state_t *state) {
369     char modez[6];
370     strcpy(modez, mode);
371 
372     size_t l = strlen(fn);
373     if ((l > 3 && strcmp(fn+l-3, ".gz") == 0) ||
374         (l > 4 && strcmp(fn+l-4, ".bgz") == 0) ||
375         (l > 5 && strcmp(fn+l-5, ".bgzf") == 0)) {
376         char m[3] = {'z', state->compression_level+'0', '\0'};
377         strcat(modez, m);
378     }
379 
380     samFile *fp = sam_open(fn, modez);
381     if (!fp)
382         return NULL;
383 
384     if (state->p.pool)
385         hts_set_thread_pool(fp, &state->p);
386 
387     return fp;
388 }
389 
init_state(const bam2fq_opts_t * opts,bam2fq_state_t ** state_out)390 static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
391 {
392     char *mode = opts->filetype == FASTA ? "wF" : "wf";
393 
394     bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
395     if (!state)
396         return false;
397     state->flag_on = opts->flag_on;
398     state->flag_off = opts->flag_off;
399     state->flag_alloff = opts->flag_alloff;
400     state->has12 = opts->has12;
401     state->use_oq = opts->use_oq;
402     state->illumina_tag = opts->illumina_tag;
403     state->copy_tags = opts->copy_tags;
404     state->filetype = opts->filetype;
405     state->def_qual = opts->def_qual;
406     state->index_sequence = NULL;
407     state->hstdout = NULL;
408     state->compression_level = opts->compression_level;
409 
410     state->fp = sam_open(opts->fn_input, "r");
411     if (state->fp == NULL) {
412         print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
413         free(state);
414         return false;
415     }
416 
417     state->p.pool = NULL;
418     if (opts->ga.nthreads > 0) {
419         if (!(state->p.pool = hts_tpool_init(opts->ga.nthreads))) {
420             fprintf(stderr, "Failed to create thread pool\n");
421             free(state);
422             return false;
423         }
424         state->p.qsize = opts->ga.nthreads*2;
425         hts_set_thread_pool(state->fp, &state->p);
426     }
427 
428     uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
429     if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX;
430     if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
431         fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
432         free(state);
433         return false;
434     }
435     if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) {
436         fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
437         free(state);
438         return false;
439     }
440     if (opts->fnse) {
441         if (!(state->fpse = sam_open_z(opts->fnse, mode, state))) {
442             print_error_errno("bam2fq", "Cannot open singleton file \"%s\"", opts->fnse);
443             free(state);
444             return false;
445         }
446         set_sam_opts(state->fpse, state, opts);
447     }
448 
449     if (opts->ga.reference) {
450         if (hts_set_fai_filename(state->fp, opts->ga.reference) != 0) {
451             print_error_errno("bam2fq", "cannot load reference \"%s\"", opts->ga.reference);
452             free(state);
453             return false;
454         }
455     }
456 
457     // single, read1, read2
458     int i, j;
459     for (i = 0; i < 3; ++i) {
460         if (opts->fnr[i]) {
461             for (j = 0; j < i; j++)
462                 if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0)
463                     break;
464             if (j == i) {
465                 if (!(state->fpr[i] = sam_open_z(opts->fnr[i], mode, state))) {
466                     print_error_errno("bam2fq", "Cannot open r%d file \"%s\"",
467                                       i, opts->fnr[i]);
468                     free(state);
469                     return false;
470                 }
471                 set_sam_opts(state->fpr[i], state, opts);
472             } else {
473                 state->fpr[i] = state->fpr[j];
474             }
475         } else {
476             if (!state->hstdout) {
477                 if (!(state->hstdout = sam_open_z("-", mode, state))) {
478                     print_error_errno("bam2fq", "Cannot open STDOUT");
479                     free(state);
480                     return false;
481                 }
482                 set_sam_opts(state->hstdout, state, opts);
483                 autoflush_if_stdout(state->hstdout, "-");
484             }
485             state->fpr[i] = state->hstdout;
486         }
487     }
488 
489     // index 1, index 2
490     for (i = 0; i < 2; i++) {
491         state->fpi[i] = NULL;
492         if (opts->index_file[i]) {
493             for (j = 0; j < 3; j++)
494                 if (opts->fnr[j] && strcmp(opts->fnr[j], opts->index_file[i]) == 0)
495                     break;
496             for (j -= 3; j >= 0 && j < i; j++)
497                 if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0)
498                     break;
499             if (i == j) {
500                 if (!(state->fpi[i] = sam_open_z(opts->index_file[i], mode,
501                                                  state))) {
502                     print_error_errno("bam2fq", "Cannot open i%d file \"%s\"",
503                                       i+1, opts->index_file[i]);
504                     free(state);
505                     return false;
506                 }
507                 set_sam_opts(state->fpi[i], state, opts);
508             } else if (j < 0) {
509                 state->fpi[i] = state->fpr[j+3];
510             } else {
511                 state->fpi[i] = state->fpi[j];
512             }
513         }
514     }
515 
516     state->h = sam_hdr_read(state->fp);
517     if (state->h == NULL) {
518         fprintf(stderr, "Failed to read header for \"%s\"\n", opts->fn_input);
519         free(state);
520         return false;
521     }
522 
523     *state_out = state;
524     return true;
525 }
526 
destroy_state(const bam2fq_opts_t * opts,bam2fq_state_t * state,int * status)527 static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status)
528 {
529     bool valid = true;
530     sam_hdr_destroy(state->h);
531     check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status);
532     if (state->fpse && sam_close(state->fpse) < 0) {
533         print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse);
534         valid = false;
535     }
536 
537     int i, j;
538     for (i = 0; i < 3; ++i) {
539         if (state->fpr[i] != state->hstdout) {
540             for (j = 0; j < i; j++)
541                 if (state->fpr[i] == state->fpr[j])
542                     break;
543             if (j == i && sam_close(state->fpr[i])) {
544                 print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]);
545                 valid = false;
546             }
547         }
548     }
549     if (state->hstdout) {
550         release_autoflush(state->hstdout);
551         if (sam_close(state->hstdout) < 0) {
552             print_error_errno("bam2fq", "Error closing STDOUT");
553             valid = false;
554         }
555     }
556     for (i = 0; i < 2; i++) {
557         for (j = 0; j < 3; j++)
558             if (state->fpi[i] == state->fpr[j])
559                 break;
560         for (j -= 3; j >= 0 && j < i; j++)
561             if (state->fpi[i] == state->fpi[j])
562                 break;
563         if (j == i && state->fpi[i] && sam_close(state->fpi[i]) < 0) {
564             print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
565             valid = false;
566         }
567     }
568     free(state->index_sequence);
569     if (state->p.pool)
570         hts_tpool_destroy(state->p.pool);
571     free(state);
572     return valid;
573 }
574 
filter_it_out(const bam1_t * b,const bam2fq_state_t * state)575 static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
576 {
577     return ((b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
578         ||  (b->core.flag&(state->flag_off)) != 0
579         ||  (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff));
580 
581 }
582 
write_index_rec(samFile * fp,bam1_t * b,bam2fq_state_t * state,bam2fq_opts_t * opts,char * seq,int seq_len,char * qual,int qual_len)583 int write_index_rec(samFile *fp, bam1_t *b, bam2fq_state_t *state,
584                     bam2fq_opts_t* opts, char *seq, int seq_len,
585                     char *qual, int qual_len) {
586     if (!fp || !b || !seq_len)
587         return 0;
588 
589     int ret = -1;
590     bam1_t *b2 = bam_init1(); // FIXME: reuse
591     if (!b2)
592         return -1;
593 
594     size_t aux_len = b->data + b->l_data - bam_get_aux(b);
595     if (bam_set1(b2, b->core.l_qname, bam_get_qname(b),
596                  (b->core.flag | BAM_FUNMAP) & ~BAM_FREVERSE,
597                  -1, -1, 0,    // refid, pos, mapq
598                  0, NULL,      // cigar
599                  -1, -1, 0,    // rnext, pnext, tlen
600                  seq_len, seq, qual,
601                  aux_len) < 0)
602         goto err;
603 
604     uint8_t *q = bam_get_qual(b2);
605     if (qual) {
606         int i;
607         for (i = 0; i < seq_len; i++)
608             q[i] -= '!';
609     } else {
610         memset(q, opts->def_qual, seq_len);
611     }
612 
613     memcpy(bam_get_aux(b2), bam_get_aux(b), aux_len);
614     b2->l_data += aux_len;
615     if (sam_write1(fp, state->h, b2) < 0)
616         goto err;
617 
618     ret = 0;
619  err:
620     if (b2)
621         bam_destroy1(b2);
622     return ret;
623 }
624 
output_index(bam1_t * b1,bam1_t * b2,bam2fq_state_t * state,bam2fq_opts_t * opts)625 int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state,
626                  bam2fq_opts_t* opts) {
627     bam1_t *b = b1 ? b1 : b2;
628 
629     char *ifmt = opts->index_format;
630     if (!ifmt)
631         ifmt = "i*i*";
632 
633     // Get seq / qual elements
634     char *bc = NULL, *qt = NULL;
635     if (b1)
636         bc = (char *)bam_aux_get(b1, opts->barcode_tag);
637     if (b2 && !bc)
638         bc = (char *)bam_aux_get(b2, opts->barcode_tag);
639     if (!bc)
640         return 0;
641     else
642         bc++; // skip Z
643 
644     if (b1)
645         qt = (char *)bam_aux_get(b1, opts->quality_tag);
646     if (b2 && !qt)
647         qt = (char *)bam_aux_get(b2, opts->quality_tag);
648     if (qt && strlen(bc) != strlen(qt)-1)
649         qt = NULL;
650     else if (qt)
651         qt++;
652 
653     int inum = 0;
654     while (inum < 2) {
655         char fc = *ifmt++;
656         if (!fc)
657             break; // ran out of index-format
658 
659         long len, rem = 0;
660         if (isdigit(*ifmt)) {
661             rem = len = strtol(ifmt, &ifmt, 10);
662         } else {
663             ifmt++;
664             len = 0;
665         }
666 
667         char *bc_end = bc, *qt_end = qt;
668         while (len ? *bc_end && rem-- : isalpha(*bc_end))
669             bc_end++, qt_end += qt != NULL;
670 
671         switch (fc) {
672         case 'n':
673             // skip
674             bc = bc_end + (len==0);
675             if (qt)
676                 qt = qt_end + (len==0);
677             break;
678 
679         case 'i':
680             if (write_index_rec(state->fpi[inum], b, state, opts,
681                                 bc, bc_end-bc, qt, qt_end-qt) < 0)
682                 return -1;
683             bc = bc_end + (len==0);
684             if (qt)
685                 qt = qt_end + (len==0);
686             inum++;
687             break;
688 
689         default:
690             fprintf(stderr, "Unknown index-format code\n");
691             return -1;
692         }
693     }
694 
695     return 0;
696 }
697 
flush_rec(bam2fq_state_t * state,bam2fq_opts_t * opts,bam1_t * b[4],int score[3],int best[3],int64_t * n_singletons)698 static int flush_rec(bam2fq_state_t *state, bam2fq_opts_t* opts,
699                      bam1_t *b[4], int score[3], int best[3],
700                      int64_t *n_singletons) {
701     // Paired data, with 1 or 2 ends present.
702     if (score[1] > 0 && score[2] > 0) {
703         // If CASAVA tag is required and barcode is only on R1,
704         // copy it to R2
705         if (state->illumina_tag) {
706             char *tag;
707             if ((tag = (char *)bam_aux_get(b[best[1]],
708                                            opts->barcode_tag)))
709                 if (bam_aux_update_str(b[best[2]],
710                                        opts->barcode_tag,
711                                        strlen(tag), tag+1) < 0)
712                     goto err;
713             if ((tag = (char *)bam_aux_get(b[best[1]],
714                                            opts->quality_tag)))
715                 if (bam_aux_update_str(b[best[2]],
716                                        opts->quality_tag,
717                                        strlen(tag), tag+1) < 0)
718                     goto err;
719 
720         }
721         if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0)
722             goto err;
723         if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0)
724             goto err;
725 
726         if (output_index(b[best[1]], b[best[2]], state, opts) < 0)
727             goto err;
728     } else if (score[1] > 0 || score[2] > 0) {
729         if (state->fpse) {
730             // print whichever one exists to fpse
731             if (score[1] > 0) {
732                 if (sam_write1(state->fpse, state->h, b[best[1]]) < 0)
733                     goto err;
734             } else {
735                 if (sam_write1(state->fpse, state->h, b[best[2]]) < 0)
736                     goto err;
737             }
738             ++(*n_singletons);
739         } else {
740             if (score[1] > 0) {
741                 if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0)
742                     goto err;
743             } else {
744                 if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0)
745                     goto err;
746             }
747         }
748 
749         if (output_index(score[1] > 0 ? b[best[1]] : NULL,
750                          score[2] > 0 ? b[best[2]] : NULL,
751                          state, opts) < 0)
752             goto err;
753     }
754 
755     if (score[0]) { // single ended data (neither READ1 nor READ2)
756         if (sam_write1(state->fpr[0], state->h, b[best[0]]) < 0)
757             goto err;
758 
759         if (output_index(b[best[0]], NULL, state, opts) < 0)
760             goto err;
761     }
762 
763     return 0;
764 
765  err:
766     return -1;
767 }
768 
bam2fq_mainloop(bam2fq_state_t * state,bam2fq_opts_t * opts)769 static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
770 {
771     int n;
772     char *current_qname = NULL;
773     int64_t n_reads = 0, n_singletons = 0; // Statistics
774     int score[3];
775     int at_eof;
776     bool valid = false;
777     int best[3] = {-1, -1, -1}; // map R0, R1, single to b[] indices;
778                                 // indexed by [readpart]
779     bam1_t *b[4];               // 3 readparts, plus current record
780 
781     for (n = 0; n < 4; n++) {
782         if (!(b[n] = bam_init1())) {
783             perror("[bam2fq_mainloop] Malloc error for bam record buffer.");
784             return false;
785         }
786     }
787 
788     n = 0;
789     while (true) {
790         int res = sam_read1(state->fp, state->h, b[n]);
791         if (res < -1) {
792             print_error("bam2fq", "Failed to read bam record");
793             goto err;
794         }
795         at_eof = res < 0;
796 
797         if (!at_eof && filter_it_out(b[n], state))
798             continue;
799         if (!at_eof) {
800             ++n_reads;
801 
802             // Handle -O option: use OQ for qual
803             uint8_t *oq;
804             if (state->use_oq && (oq = bam_aux_get(b[n],"OQ")) && *oq == 'Z') {
805                 int i, l = strlen((char *)++oq);
806                 uint8_t *qual = bam_get_qual(b[n]);
807                 for (i = 0; i < l && i < b[n]->core.l_qseq; i++)
808                     qual[i] = oq[i] - '!';
809             }
810         }
811 
812         if (at_eof
813             || !current_qname
814             || (strcmp(current_qname, bam_get_qname(b[n])) != 0)) {
815             // New name, so flush best examples of previous name.
816             if (current_qname)
817                 if (flush_rec(state, opts, b, score, best, &n_singletons) < 0)
818                     goto err;
819 
820             current_qname = bam_get_qname(b[n]);
821             score[0] = score[1] = score[2] = 0;
822 
823             if (at_eof) { break; }
824         }
825 
826         // Prefer a copy of the read that has base qualities
827         int b_score = bam_get_qual(b[n])[0] != 0xff? 2 : 1;
828         readpart rp = which_readpart(b[n]);
829         if (score[rp] < b_score) {
830             score[rp] = b_score;
831             // Record b[n] slot for best copy of readpair and find a new
832             // slot for next bam read
833             best[rp] = n;
834             int used_slot[4] = {0}, i;
835             for (i = 0; i < 3; i++)
836                 if (best[i] >= 0)
837                     used_slot[best[i]] = 1;
838             for (i = 0; i < 4 && used_slot[i]; i++)
839                 ;
840             n = i;
841         }
842     }
843 
844     valid = true;
845  err:
846     if (!valid)
847         print_error_errno("bam2fq", "Error writing to FASTx files.");
848 
849     for (n = 0; n < 4; n++)
850         bam_destroy1(b[n]);
851 
852     fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n",
853             __func__, n_singletons);
854     fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n",
855             __func__, n_reads);
856 
857     return valid;
858 }
859 
main_bam2fq(int argc,char * argv[])860 int main_bam2fq(int argc, char *argv[])
861 {
862     int status = EXIT_FAILURE;
863     bam2fq_opts_t* opts = NULL;
864     bam2fq_state_t* state = NULL;
865 
866     bool valid = parse_opts(argc, argv, &opts);
867     if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE;
868 
869     if (!init_state(opts, &state)) goto err;
870 
871     if (!bam2fq_mainloop(state,opts)) goto err;
872 
873     if (!destroy_state(opts, state, &status)) goto err;
874 
875     status = EXIT_SUCCESS;
876  err:
877     sam_global_args_free(&opts->ga);
878     free_opts(opts);
879 
880     return status;
881 }
882