1 /* bam_fastq.c -- FASTA and FASTQ file generation
2
3 Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd.
4 Portions copyright (C) 2009, 2011, 2012 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notices and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <stdlib.h>
29 #include <string.h>
30 #include <strings.h>
31 #include <stdbool.h>
32 #include <ctype.h>
33 #include <assert.h>
34 #include <inttypes.h>
35 #include <unistd.h>
36
37 #include "htslib/sam.h"
38 #include "htslib/klist.h"
39 #include "htslib/kstring.h"
40 #include "htslib/bgzf.h"
41 #include "htslib/thread_pool.h"
42 #include "samtools.h"
43 #include "sam_opts.h"
44
45 #define DEFAULT_BARCODE_TAG "BC"
46 #define DEFAULT_QUALITY_TAG "QT"
47 #define INDEX_SEPARATOR "+"
48
49 int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
bam2fq_usage(FILE * to,const char * command)50 static void bam2fq_usage(FILE *to, const char *command)
51 {
52 int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0;
53 fprintf(to,
54 "Usage: samtools %s [options...] <in.bam>\n", command);
55 fprintf(to,
56 "\n"
57 "Description:\n"
58 "Converts a SAM, BAM or CRAM to %s format.\n"
59 "\n"
60 "Options:\n"
61 " -0 FILE write reads designated READ_OTHER to FILE\n"
62 " -1 FILE write reads designated READ1 to FILE\n"
63 " -2 FILE write reads designated READ2 to FILE\n"
64 " -o FILE write reads designated READ1 or READ2 to FILE\n"
65 " note: if a singleton file is specified with -s, only\n"
66 " paired reads will be written to the -1 and -2 files.\n"
67 " -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
68 " -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0
69 " -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
70 " -n don't append /1 and /2 to the read name\n"
71 " -N always append /1 and /2 to the read name\n",
72 fq ? "FASTQ" : "FASTA");
73 if (fq) fprintf(to,
74 " -O output quality in the OQ tag if present\n");
75 fprintf(to,
76 " -s FILE write singleton reads designated READ1 or READ2 to FILE\n"
77 " -t copy RG, BC and QT tags to the %s header line\n",
78 fq ? "FASTQ" : "FASTA");
79 fprintf(to,
80 " -T TAGLIST copy arbitrary tags to the %s header line\n",
81 fq ? "FASTQ" : "FASTA");
82 if (fq) fprintf(to,
83 " -v INT default quality score if not given in file [1]\n"
84 " -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n"
85 " -c INT compression level [0..9] to use when writing bgzf files [1]\n"
86 " --i1 FILE write first index reads to FILE\n"
87 " --i2 FILE write second index reads to FILE\n"
88 " --barcode-tag TAG\n"
89 " Barcode tag [" DEFAULT_BARCODE_TAG "]\n"
90 " --quality-tag TAG\n"
91 " Quality tag [" DEFAULT_QUALITY_TAG "]\n"
92 " --index-format STR\n"
93 " How to parse barcode and quality tags\n\n");
94 sam_global_opt_help(to, "-.--.@-.");
95 fprintf(to,
96 "\n"
97 "The files will be automatically compressed if the file names have a .gz\n"
98 "or .bgzf extension. The input to this program must be collated by name.\n"
99 "Run 'samtools collate' or 'samtools sort -n' to achieve this.\n"
100 "\n"
101 "Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n"
102 "Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n"
103 "Otherwise reads are designated READ_OTHER (both flags set or both flags unset).\n"
104 "Run 'samtools flags' for more information on flag codes and meanings.\n");
105 fprintf(to,
106 "\n"
107 "The index-format string describes how to parse the barcode and quality tags.\n"
108 "It is made up of 'i' or 'n' followed by a length or '*'. For example:\n"
109 " i14i8 The first 14 characters are index 1, the next 8 are index 2\n"
110 " n8i14 Ignore the first 8 characters, and use the next 14 for index 1\n\n"
111 "If the tag contains a separator, then the numeric part can be replaced with\n"
112 "'*' to mean 'read until the separator or end of tag', for example:\n"
113 " i*i* Break the tag at the separator into index 1 and index 2\n"
114 " n*i* Ignore the left part of the tag until the separator,\n"
115 " then use the second part of the tag as index 1\n");
116 fprintf(to,
117 "\n"
118 "Examples:\n"
119 "To get just the paired reads in separate files, use:\n"
120 " samtools %s -1 pair1.%s -2 pair2.%s -0 /dev/null -s /dev/null -n in.bam\n"
121 "\nTo get all non-supplementary/secondary reads in a single file, redirect\n"
122 "the output:\n"
123 " samtools %s in.bam > all_reads.%s\n",
124 command, fq ? "fq" : "fa", fq ? "fq" : "fa",
125 command, fq ? "fq" : "fa");
126 }
127
128 typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart;
129 typedef enum { FASTA, FASTQ } fastfile;
130 typedef struct bam2fq_opts {
131 char *fnse;
132 char *fnr[3];
133 char *fn_input; // pointer to input filename in argv do not free
134 bool has12, has12always, use_oq, copy_tags, illumina_tag;
135 int flag_on, flag_off, flag_alloff;
136 sam_global_args ga;
137 fastfile filetype;
138 int def_qual;
139 char *barcode_tag;
140 char *quality_tag;
141 char *index_file[2];
142 char *index_format;
143 char *extra_tags;
144 char compression_level;
145 } bam2fq_opts_t;
146
147 typedef struct bam2fq_state {
148 samFile *fp;
149 samFile *fpse;
150 samFile *fpr[3];
151 samFile *fpi[3];
152 samFile *hstdout;
153 sam_hdr_t *h;
154 bool has12, use_oq, copy_tags, illumina_tag;
155 int flag_on, flag_off, flag_alloff;
156 fastfile filetype;
157 int def_qual;
158 char *index_sequence;
159 char compression_level;
160 htsThreadPool p;
161 } bam2fq_state_t;
162
which_readpart(const bam1_t * b)163 static readpart which_readpart(const bam1_t *b)
164 {
165 if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
166 return READ_1;
167 } else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) {
168 return READ_2;
169 } else {
170 return READ_UNKNOWN;
171 }
172 }
173
free_opts(bam2fq_opts_t * opts)174 static void free_opts(bam2fq_opts_t *opts)
175 {
176 free(opts);
177 }
178
179 // return true if valid
parse_opts(int argc,char * argv[],bam2fq_opts_t ** opts_out)180 static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
181 {
182 // Parse args
183 bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t));
184 opts->has12 = true;
185 opts->has12always = false;
186 opts->filetype = FASTQ;
187 opts->def_qual = 1;
188 opts->barcode_tag = NULL;
189 opts->quality_tag = NULL;
190 opts->index_format = NULL;
191 opts->index_file[0] = NULL;
192 opts->index_file[1] = NULL;
193 opts->extra_tags = NULL;
194 opts->compression_level = 1;
195 opts->flag_off = BAM_FSECONDARY|BAM_FSUPPLEMENTARY;
196 int flag_off_set = 0;
197
198 int c;
199 sam_global_args_init(&opts->ga);
200 static const struct option lopts[] = {
201 SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
202 {"i1", required_argument, NULL, 1},
203 {"I1", required_argument, NULL, 1},
204 {"i2", required_argument, NULL, 2},
205 {"I2", required_argument, NULL, 2},
206 {"if", required_argument, NULL, 3},
207 {"IF", required_argument, NULL, 3},
208 {"index-format", required_argument, NULL, 3},
209 {"barcode-tag", required_argument, NULL, 'b'},
210 {"quality-tag", required_argument, NULL, 'q'},
211 { NULL, 0, NULL, 0 }
212 };
213 while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:",
214 lopts, NULL)) > 0) {
215 switch (c) {
216 case 'b': opts->barcode_tag = optarg; break;
217 case 'q': opts->quality_tag = optarg; break;
218 case 1 : opts->index_file[0] = optarg; break;
219 case 2 : opts->index_file[1] = optarg; break;
220 case 3 : opts->index_format = optarg; break;
221 case '0': opts->fnr[0] = optarg; break;
222 case '1': opts->fnr[1] = optarg; break;
223 case '2': opts->fnr[2] = optarg; break;
224 case 'o': opts->fnr[1] = optarg; opts->fnr[2] = optarg; break;
225 case 'f': opts->flag_on |= strtol(optarg, 0, 0); break;
226 case 'F':
227 if (!flag_off_set) {
228 flag_off_set = 1;
229 opts->flag_off = 0;
230 }
231 opts->flag_off |= strtol(optarg, 0, 0);
232 break;
233 case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
234 case 'n': opts->has12 = false; break;
235 case 'N': opts->has12always = true; break;
236 case 'O': opts->use_oq = true; break;
237 case 's': opts->fnse = optarg; break;
238 case 't': opts->copy_tags = true; break;
239 case 'i': opts->illumina_tag = true; break;
240 case 'c':
241 opts->compression_level = atoi(optarg);
242 if (opts->compression_level < 0)
243 opts->compression_level = 0;
244 if (opts->compression_level > 9)
245 opts->compression_level = 9;
246 break;
247 case 'T': opts->extra_tags = optarg; break;
248 case 'v': opts->def_qual = atoi(optarg); break;
249
250 case '?':
251 bam2fq_usage(stderr, argv[0]);
252 free_opts(opts);
253 return false;
254 default:
255 if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
256 bam2fq_usage(stderr, argv[0]);
257 free_opts(opts);
258 return false;
259 }
260 break;
261 }
262 }
263
264 if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
265 if (opts->has12always) opts->has12 = true;
266
267 if (!opts->barcode_tag) opts->barcode_tag = DEFAULT_BARCODE_TAG;
268 if (!opts->quality_tag) opts->quality_tag = DEFAULT_QUALITY_TAG;
269
270 int nIndex = 0;
271 if (opts->index_format) {
272 char *s;
273 for (s = opts->index_format; *s; s++) {
274 if (*s == 'i') nIndex++;
275 }
276 }
277 if (nIndex>2) {
278 fprintf(stderr,"Invalid index format: more than 2 indexes\n");
279 bam2fq_usage(stderr, argv[0]);
280 free_opts(opts);
281 return false;
282 }
283
284 if (opts->index_file[1] && !opts->index_file[0]) {
285 fprintf(stderr, "Index one specified, but index two not given\n");
286 bam2fq_usage(stderr, argv[0]);
287 free_opts(opts);
288 return false;
289 }
290
291 if (opts->illumina_tag && !nIndex) {
292 fprintf(stderr, "You must specify an index format (--index-format) with the Illumina Casava (-i) option\n");
293 bam2fq_usage(stderr, argv[0]);
294 free_opts(opts);
295 return false;
296 }
297
298 if (nIndex==0 && opts->index_file[0]) {
299 fprintf(stderr, "index_format not specified, but index file given\n");
300 bam2fq_usage(stderr, argv[0]);
301 free_opts(opts);
302 return false;
303 }
304
305 if (opts->def_qual < 0 || 93 < opts->def_qual) {
306 fprintf(stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual);
307 bam2fq_usage(stderr, argv[0]);
308 free_opts(opts);
309 return false;
310 }
311
312 const char* type_str = argv[0];
313 if (strcasecmp("fastq", type_str) == 0 ||
314 strcasecmp("bam2fq", type_str) == 0) {
315 opts->filetype = FASTQ;
316 } else if (strcasecmp("fasta", type_str) == 0) {
317 opts->filetype = FASTA;
318 } else {
319 print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str);
320 bam2fq_usage(stderr, argv[0]);
321 free_opts(opts);
322 return false;
323 }
324
325 if (argc == optind && isatty(STDIN_FILENO)) {
326 bam2fq_usage(stdout, argv[0]);
327 free_opts(opts);
328 return true;
329 }
330
331 if (argc - optind > 1) {
332 fprintf(stderr, "Too many arguments.\n");
333 bam2fq_usage(stderr, argv[0]);
334 free_opts(opts);
335 return false;
336 }
337 opts->fn_input = argc > optind ? argv[optind] : "-";
338 *opts_out = opts;
339 return true;
340 }
341
set_sam_opts(samFile * fp,bam2fq_state_t * state,const bam2fq_opts_t * opts)342 void set_sam_opts(samFile *fp, bam2fq_state_t *state,
343 const bam2fq_opts_t *opts) {
344 if (state->has12)
345 hts_set_opt(fp, FASTQ_OPT_RNUM, 1);
346
347 if (state->illumina_tag)
348 hts_set_opt(fp, FASTQ_OPT_CASAVA, 1);
349
350 hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag);
351
352 kstring_t tag_list = {0,0};
353 if (state->copy_tags)
354 kputs("RG,BC,QT", &tag_list);
355 if (opts->extra_tags) {
356 if (tag_list.l)
357 kputc(',', &tag_list);
358 kputs(opts->extra_tags, &tag_list);
359 }
360 if (tag_list.l)
361 hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s);
362 ks_free(&tag_list);
363 }
364
365 // Open a file as normal or gzipped based on filename.
366 // Note we always use bgzf and don't bother to attempt non-blocked
367 // gzip streams. This is a departure from the old fastq code.
sam_open_z(char * fn,char * mode,bam2fq_state_t * state)368 static samFile *sam_open_z(char *fn, char *mode, bam2fq_state_t *state) {
369 char modez[6];
370 strcpy(modez, mode);
371
372 size_t l = strlen(fn);
373 if ((l > 3 && strcmp(fn+l-3, ".gz") == 0) ||
374 (l > 4 && strcmp(fn+l-4, ".bgz") == 0) ||
375 (l > 5 && strcmp(fn+l-5, ".bgzf") == 0)) {
376 char m[3] = {'z', state->compression_level+'0', '\0'};
377 strcat(modez, m);
378 }
379
380 samFile *fp = sam_open(fn, modez);
381 if (!fp)
382 return NULL;
383
384 if (state->p.pool)
385 hts_set_thread_pool(fp, &state->p);
386
387 return fp;
388 }
389
init_state(const bam2fq_opts_t * opts,bam2fq_state_t ** state_out)390 static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
391 {
392 char *mode = opts->filetype == FASTA ? "wF" : "wf";
393
394 bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
395 if (!state)
396 return false;
397 state->flag_on = opts->flag_on;
398 state->flag_off = opts->flag_off;
399 state->flag_alloff = opts->flag_alloff;
400 state->has12 = opts->has12;
401 state->use_oq = opts->use_oq;
402 state->illumina_tag = opts->illumina_tag;
403 state->copy_tags = opts->copy_tags;
404 state->filetype = opts->filetype;
405 state->def_qual = opts->def_qual;
406 state->index_sequence = NULL;
407 state->hstdout = NULL;
408 state->compression_level = opts->compression_level;
409
410 state->fp = sam_open(opts->fn_input, "r");
411 if (state->fp == NULL) {
412 print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
413 free(state);
414 return false;
415 }
416
417 state->p.pool = NULL;
418 if (opts->ga.nthreads > 0) {
419 if (!(state->p.pool = hts_tpool_init(opts->ga.nthreads))) {
420 fprintf(stderr, "Failed to create thread pool\n");
421 free(state);
422 return false;
423 }
424 state->p.qsize = opts->ga.nthreads*2;
425 hts_set_thread_pool(state->fp, &state->p);
426 }
427
428 uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
429 if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX;
430 if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
431 fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
432 free(state);
433 return false;
434 }
435 if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) {
436 fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
437 free(state);
438 return false;
439 }
440 if (opts->fnse) {
441 if (!(state->fpse = sam_open_z(opts->fnse, mode, state))) {
442 print_error_errno("bam2fq", "Cannot open singleton file \"%s\"", opts->fnse);
443 free(state);
444 return false;
445 }
446 set_sam_opts(state->fpse, state, opts);
447 }
448
449 if (opts->ga.reference) {
450 if (hts_set_fai_filename(state->fp, opts->ga.reference) != 0) {
451 print_error_errno("bam2fq", "cannot load reference \"%s\"", opts->ga.reference);
452 free(state);
453 return false;
454 }
455 }
456
457 // single, read1, read2
458 int i, j;
459 for (i = 0; i < 3; ++i) {
460 if (opts->fnr[i]) {
461 for (j = 0; j < i; j++)
462 if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0)
463 break;
464 if (j == i) {
465 if (!(state->fpr[i] = sam_open_z(opts->fnr[i], mode, state))) {
466 print_error_errno("bam2fq", "Cannot open r%d file \"%s\"",
467 i, opts->fnr[i]);
468 free(state);
469 return false;
470 }
471 set_sam_opts(state->fpr[i], state, opts);
472 } else {
473 state->fpr[i] = state->fpr[j];
474 }
475 } else {
476 if (!state->hstdout) {
477 if (!(state->hstdout = sam_open_z("-", mode, state))) {
478 print_error_errno("bam2fq", "Cannot open STDOUT");
479 free(state);
480 return false;
481 }
482 set_sam_opts(state->hstdout, state, opts);
483 autoflush_if_stdout(state->hstdout, "-");
484 }
485 state->fpr[i] = state->hstdout;
486 }
487 }
488
489 // index 1, index 2
490 for (i = 0; i < 2; i++) {
491 state->fpi[i] = NULL;
492 if (opts->index_file[i]) {
493 for (j = 0; j < 3; j++)
494 if (opts->fnr[j] && strcmp(opts->fnr[j], opts->index_file[i]) == 0)
495 break;
496 for (j -= 3; j >= 0 && j < i; j++)
497 if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0)
498 break;
499 if (i == j) {
500 if (!(state->fpi[i] = sam_open_z(opts->index_file[i], mode,
501 state))) {
502 print_error_errno("bam2fq", "Cannot open i%d file \"%s\"",
503 i+1, opts->index_file[i]);
504 free(state);
505 return false;
506 }
507 set_sam_opts(state->fpi[i], state, opts);
508 } else if (j < 0) {
509 state->fpi[i] = state->fpr[j+3];
510 } else {
511 state->fpi[i] = state->fpi[j];
512 }
513 }
514 }
515
516 state->h = sam_hdr_read(state->fp);
517 if (state->h == NULL) {
518 fprintf(stderr, "Failed to read header for \"%s\"\n", opts->fn_input);
519 free(state);
520 return false;
521 }
522
523 *state_out = state;
524 return true;
525 }
526
destroy_state(const bam2fq_opts_t * opts,bam2fq_state_t * state,int * status)527 static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status)
528 {
529 bool valid = true;
530 sam_hdr_destroy(state->h);
531 check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status);
532 if (state->fpse && sam_close(state->fpse) < 0) {
533 print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse);
534 valid = false;
535 }
536
537 int i, j;
538 for (i = 0; i < 3; ++i) {
539 if (state->fpr[i] != state->hstdout) {
540 for (j = 0; j < i; j++)
541 if (state->fpr[i] == state->fpr[j])
542 break;
543 if (j == i && sam_close(state->fpr[i])) {
544 print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]);
545 valid = false;
546 }
547 }
548 }
549 if (state->hstdout) {
550 release_autoflush(state->hstdout);
551 if (sam_close(state->hstdout) < 0) {
552 print_error_errno("bam2fq", "Error closing STDOUT");
553 valid = false;
554 }
555 }
556 for (i = 0; i < 2; i++) {
557 for (j = 0; j < 3; j++)
558 if (state->fpi[i] == state->fpr[j])
559 break;
560 for (j -= 3; j >= 0 && j < i; j++)
561 if (state->fpi[i] == state->fpi[j])
562 break;
563 if (j == i && state->fpi[i] && sam_close(state->fpi[i]) < 0) {
564 print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
565 valid = false;
566 }
567 }
568 free(state->index_sequence);
569 if (state->p.pool)
570 hts_tpool_destroy(state->p.pool);
571 free(state);
572 return valid;
573 }
574
filter_it_out(const bam1_t * b,const bam2fq_state_t * state)575 static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
576 {
577 return ((b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
578 || (b->core.flag&(state->flag_off)) != 0
579 || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff));
580
581 }
582
write_index_rec(samFile * fp,bam1_t * b,bam2fq_state_t * state,bam2fq_opts_t * opts,char * seq,int seq_len,char * qual,int qual_len)583 int write_index_rec(samFile *fp, bam1_t *b, bam2fq_state_t *state,
584 bam2fq_opts_t* opts, char *seq, int seq_len,
585 char *qual, int qual_len) {
586 if (!fp || !b || !seq_len)
587 return 0;
588
589 int ret = -1;
590 bam1_t *b2 = bam_init1(); // FIXME: reuse
591 if (!b2)
592 return -1;
593
594 size_t aux_len = b->data + b->l_data - bam_get_aux(b);
595 if (bam_set1(b2, b->core.l_qname, bam_get_qname(b),
596 (b->core.flag | BAM_FUNMAP) & ~BAM_FREVERSE,
597 -1, -1, 0, // refid, pos, mapq
598 0, NULL, // cigar
599 -1, -1, 0, // rnext, pnext, tlen
600 seq_len, seq, qual,
601 aux_len) < 0)
602 goto err;
603
604 uint8_t *q = bam_get_qual(b2);
605 if (qual) {
606 int i;
607 for (i = 0; i < seq_len; i++)
608 q[i] -= '!';
609 } else {
610 memset(q, opts->def_qual, seq_len);
611 }
612
613 memcpy(bam_get_aux(b2), bam_get_aux(b), aux_len);
614 b2->l_data += aux_len;
615 if (sam_write1(fp, state->h, b2) < 0)
616 goto err;
617
618 ret = 0;
619 err:
620 if (b2)
621 bam_destroy1(b2);
622 return ret;
623 }
624
output_index(bam1_t * b1,bam1_t * b2,bam2fq_state_t * state,bam2fq_opts_t * opts)625 int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state,
626 bam2fq_opts_t* opts) {
627 bam1_t *b = b1 ? b1 : b2;
628
629 char *ifmt = opts->index_format;
630 if (!ifmt)
631 ifmt = "i*i*";
632
633 // Get seq / qual elements
634 char *bc = NULL, *qt = NULL;
635 if (b1)
636 bc = (char *)bam_aux_get(b1, opts->barcode_tag);
637 if (b2 && !bc)
638 bc = (char *)bam_aux_get(b2, opts->barcode_tag);
639 if (!bc)
640 return 0;
641 else
642 bc++; // skip Z
643
644 if (b1)
645 qt = (char *)bam_aux_get(b1, opts->quality_tag);
646 if (b2 && !qt)
647 qt = (char *)bam_aux_get(b2, opts->quality_tag);
648 if (qt && strlen(bc) != strlen(qt)-1)
649 qt = NULL;
650 else if (qt)
651 qt++;
652
653 int inum = 0;
654 while (inum < 2) {
655 char fc = *ifmt++;
656 if (!fc)
657 break; // ran out of index-format
658
659 long len, rem = 0;
660 if (isdigit(*ifmt)) {
661 rem = len = strtol(ifmt, &ifmt, 10);
662 } else {
663 ifmt++;
664 len = 0;
665 }
666
667 char *bc_end = bc, *qt_end = qt;
668 while (len ? *bc_end && rem-- : isalpha(*bc_end))
669 bc_end++, qt_end += qt != NULL;
670
671 switch (fc) {
672 case 'n':
673 // skip
674 bc = bc_end + (len==0);
675 if (qt)
676 qt = qt_end + (len==0);
677 break;
678
679 case 'i':
680 if (write_index_rec(state->fpi[inum], b, state, opts,
681 bc, bc_end-bc, qt, qt_end-qt) < 0)
682 return -1;
683 bc = bc_end + (len==0);
684 if (qt)
685 qt = qt_end + (len==0);
686 inum++;
687 break;
688
689 default:
690 fprintf(stderr, "Unknown index-format code\n");
691 return -1;
692 }
693 }
694
695 return 0;
696 }
697
flush_rec(bam2fq_state_t * state,bam2fq_opts_t * opts,bam1_t * b[4],int score[3],int best[3],int64_t * n_singletons)698 static int flush_rec(bam2fq_state_t *state, bam2fq_opts_t* opts,
699 bam1_t *b[4], int score[3], int best[3],
700 int64_t *n_singletons) {
701 // Paired data, with 1 or 2 ends present.
702 if (score[1] > 0 && score[2] > 0) {
703 // If CASAVA tag is required and barcode is only on R1,
704 // copy it to R2
705 if (state->illumina_tag) {
706 char *tag;
707 if ((tag = (char *)bam_aux_get(b[best[1]],
708 opts->barcode_tag)))
709 if (bam_aux_update_str(b[best[2]],
710 opts->barcode_tag,
711 strlen(tag), tag+1) < 0)
712 goto err;
713 if ((tag = (char *)bam_aux_get(b[best[1]],
714 opts->quality_tag)))
715 if (bam_aux_update_str(b[best[2]],
716 opts->quality_tag,
717 strlen(tag), tag+1) < 0)
718 goto err;
719
720 }
721 if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0)
722 goto err;
723 if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0)
724 goto err;
725
726 if (output_index(b[best[1]], b[best[2]], state, opts) < 0)
727 goto err;
728 } else if (score[1] > 0 || score[2] > 0) {
729 if (state->fpse) {
730 // print whichever one exists to fpse
731 if (score[1] > 0) {
732 if (sam_write1(state->fpse, state->h, b[best[1]]) < 0)
733 goto err;
734 } else {
735 if (sam_write1(state->fpse, state->h, b[best[2]]) < 0)
736 goto err;
737 }
738 ++(*n_singletons);
739 } else {
740 if (score[1] > 0) {
741 if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0)
742 goto err;
743 } else {
744 if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0)
745 goto err;
746 }
747 }
748
749 if (output_index(score[1] > 0 ? b[best[1]] : NULL,
750 score[2] > 0 ? b[best[2]] : NULL,
751 state, opts) < 0)
752 goto err;
753 }
754
755 if (score[0]) { // single ended data (neither READ1 nor READ2)
756 if (sam_write1(state->fpr[0], state->h, b[best[0]]) < 0)
757 goto err;
758
759 if (output_index(b[best[0]], NULL, state, opts) < 0)
760 goto err;
761 }
762
763 return 0;
764
765 err:
766 return -1;
767 }
768
bam2fq_mainloop(bam2fq_state_t * state,bam2fq_opts_t * opts)769 static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
770 {
771 int n;
772 char *current_qname = NULL;
773 int64_t n_reads = 0, n_singletons = 0; // Statistics
774 int score[3];
775 int at_eof;
776 bool valid = false;
777 int best[3] = {-1, -1, -1}; // map R0, R1, single to b[] indices;
778 // indexed by [readpart]
779 bam1_t *b[4]; // 3 readparts, plus current record
780
781 for (n = 0; n < 4; n++) {
782 if (!(b[n] = bam_init1())) {
783 perror("[bam2fq_mainloop] Malloc error for bam record buffer.");
784 return false;
785 }
786 }
787
788 n = 0;
789 while (true) {
790 int res = sam_read1(state->fp, state->h, b[n]);
791 if (res < -1) {
792 print_error("bam2fq", "Failed to read bam record");
793 goto err;
794 }
795 at_eof = res < 0;
796
797 if (!at_eof && filter_it_out(b[n], state))
798 continue;
799 if (!at_eof) {
800 ++n_reads;
801
802 // Handle -O option: use OQ for qual
803 uint8_t *oq;
804 if (state->use_oq && (oq = bam_aux_get(b[n],"OQ")) && *oq == 'Z') {
805 int i, l = strlen((char *)++oq);
806 uint8_t *qual = bam_get_qual(b[n]);
807 for (i = 0; i < l && i < b[n]->core.l_qseq; i++)
808 qual[i] = oq[i] - '!';
809 }
810 }
811
812 if (at_eof
813 || !current_qname
814 || (strcmp(current_qname, bam_get_qname(b[n])) != 0)) {
815 // New name, so flush best examples of previous name.
816 if (current_qname)
817 if (flush_rec(state, opts, b, score, best, &n_singletons) < 0)
818 goto err;
819
820 current_qname = bam_get_qname(b[n]);
821 score[0] = score[1] = score[2] = 0;
822
823 if (at_eof) { break; }
824 }
825
826 // Prefer a copy of the read that has base qualities
827 int b_score = bam_get_qual(b[n])[0] != 0xff? 2 : 1;
828 readpart rp = which_readpart(b[n]);
829 if (score[rp] < b_score) {
830 score[rp] = b_score;
831 // Record b[n] slot for best copy of readpair and find a new
832 // slot for next bam read
833 best[rp] = n;
834 int used_slot[4] = {0}, i;
835 for (i = 0; i < 3; i++)
836 if (best[i] >= 0)
837 used_slot[best[i]] = 1;
838 for (i = 0; i < 4 && used_slot[i]; i++)
839 ;
840 n = i;
841 }
842 }
843
844 valid = true;
845 err:
846 if (!valid)
847 print_error_errno("bam2fq", "Error writing to FASTx files.");
848
849 for (n = 0; n < 4; n++)
850 bam_destroy1(b[n]);
851
852 fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n",
853 __func__, n_singletons);
854 fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n",
855 __func__, n_reads);
856
857 return valid;
858 }
859
main_bam2fq(int argc,char * argv[])860 int main_bam2fq(int argc, char *argv[])
861 {
862 int status = EXIT_FAILURE;
863 bam2fq_opts_t* opts = NULL;
864 bam2fq_state_t* state = NULL;
865
866 bool valid = parse_opts(argc, argv, &opts);
867 if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE;
868
869 if (!init_state(opts, &state)) goto err;
870
871 if (!bam2fq_mainloop(state,opts)) goto err;
872
873 if (!destroy_state(opts, state, &status)) goto err;
874
875 status = EXIT_SUCCESS;
876 err:
877 sam_global_args_free(&opts->ga);
878 free_opts(opts);
879
880 return status;
881 }
882