1 /* bam_cat.c -- efficiently concatenates bam files.
2
3 Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd.
4 Modified SAMtools work copyright (C) 2010 Illumina, Inc.
5
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notices and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 DEALINGS IN THE SOFTWARE. */
23
24 /*
25 bam_cat can be used to concatenate BAM files. Under special
26 circumstances, it can be used as an alternative to 'samtools merge' to
27 concatenate multiple sorted files into a single sorted file. For this
28 to work each file must be sorted, and the sorted files must be given
29 as command line arguments in order such that the final read in file i
30 is less than or equal to the first read in file i+1.
31
32 This code is derived from the bam_reheader function in samtools 0.1.8
33 and modified to perform concatenation by Chris Saunders on behalf of
34 Illumina.
35 */
36
37 #include <config.h>
38
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <unistd.h>
42 #include <string.h>
43 #include <strings.h>
44
45 #include "htslib/bgzf.h"
46 #include "htslib/sam.h"
47 #include "htslib/cram.h"
48 #include "htslib/kstring.h"
49 #include "samtools.h"
50 #include "sam_opts.h"
51
52 /*
53 * Check the files are consistent and capable of being concatenated.
54 * Also fills out the version numbers and produces a new sam_hdr_t
55 * structure with merged RG lines.
56 * Note it is only a simple merge.
57 *
58 * Returns updated header on success;
59 * NULL on failure.
60 */
cram_cat_check_hdr(int nfn,char * const * fn,const sam_hdr_t * h,int * vers_maj_p,int * vers_min_p)61 static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t *h,
62 int *vers_maj_p, int *vers_min_p) {
63 int i, vers_maj = -1, vers_min = -1;
64 sam_hdr_t *new_h = NULL, *old_h = NULL;
65 samFile *in = NULL;
66 kstring_t ks = KS_INITIALIZE;
67
68 if (h) {
69 new_h = sam_hdr_dup(h);
70 if (!new_h) {
71 fprintf(stderr, "[%s] ERROR: header duplication failed.\n",
72 __func__);
73 goto fail;
74 }
75 }
76
77 for (i = 0; i < nfn; ++i) {
78 cram_fd *in_c;
79 int ki;
80
81 in = sam_open(fn[i], "rc");
82 if (in == 0) {
83 print_error_errno("cat", "fail to open file '%s'", fn[i]);
84 goto fail;
85 }
86 in_c = in->fp.cram;
87
88 int vmaj = cram_major_vers(in_c);
89 int vmin = cram_minor_vers(in_c);
90 if ((vers_maj != -1 && vers_maj != vmaj) ||
91 (vers_min != -1 && vers_min != vmin)) {
92 fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n",
93 __func__);
94 goto fail;
95 }
96 vers_maj = vmaj;
97 vers_min = vmin;
98
99 old_h = sam_hdr_read(in);
100 if (!old_h) {
101 fprintf(stderr, "[%s] ERROR: header reading for file '%s' filed.\n",
102 __func__, fn[i]);
103 goto fail;
104 }
105
106 if (!new_h) {
107 new_h = sam_hdr_dup(old_h);
108 if (!new_h) {
109 fprintf(stderr, "[%s] ERROR: header duplication for file '%s' failed.\n",
110 __func__, fn[i]);
111 goto fail;
112 }
113 sam_hdr_destroy(old_h);
114 sam_close(in);
115 continue;
116 }
117
118 int old_count = sam_hdr_count_lines(old_h, "RG");
119 for (ki = 0; ki < old_count; ki++) {
120 const char *old_name = sam_hdr_line_name(old_h, "RG", ki);
121 if (old_name) {
122 int new_i = sam_hdr_line_index(new_h, "RG", old_name);
123 if (-1 == new_i) { // line does not exist in the new header
124 if (sam_hdr_find_line_pos(old_h, "RG", ki, &ks) ||
125 !ks.s || sam_hdr_add_lines(new_h, ks.s, ks.l)) {
126 fprintf(stderr, "[%s] ERROR: failed to add @RG line 'ID:%s' from file '%s'\n",
127 __func__, old_name, fn[i]);
128 goto fail;
129 }
130 ks_free(&ks);
131 }
132 } else {
133 fprintf(stderr, "[%s] ERROR: failed to read %d @RG line from file '%s'\n",
134 __func__, ki, fn[i]);
135 goto fail;
136 }
137 }
138
139 if (old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) {
140 for (ki = 0; ki < old_count; ki++) {
141 const char *old_name = sam_hdr_line_name(old_h, "RG", ki);
142 const char *new_name = sam_hdr_line_name(new_h, "RG", ki);
143 if (!old_name || !new_name || strcmp(old_name, new_name)) {
144 fprintf(stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n",
145 __func__);
146 goto fail;
147 }
148 }
149 }
150
151 sam_hdr_destroy(old_h);
152 sam_close(in);
153 }
154
155 ks_free(&ks);
156
157 *vers_maj_p = vers_maj;
158 *vers_min_p = vers_min;
159
160 return new_h;
161
162 fail:
163 ks_free(&ks);
164 if (old_h) sam_hdr_destroy(old_h);
165 if (new_h) sam_hdr_destroy(new_h);
166 if (in) sam_close(in);
167
168 return NULL;
169 }
170
171
172 /*
173 * CRAM files don't store the RG:Z:ID per read in the aux field.
174 * Instead they have a numerical data series (RG) to point each read
175 * back to the Nth @RG line in the file. This means that we may need
176 * to edit the RG data series (if the files were produced from
177 * "samtools split" for example).
178 *
179 * The encoding method is stored in the compression header. Typical
180 * examples:
181 *
182 * RG => EXTERNAL {18} # Block content-id 18 holds RG values
183 * # as a series of ITF8 encoded values
184 *
185 * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0}
186 * # One RG value #-1. (No RG)
187 *
188 * RG => HUFFMAN {1, 0, 1, 0} # One RG value #0 (always first RG)
189 *
190 * RG => HUFFMAN {2, 0, 1, 2, 1, 1}
191 * # Two RG values, #0 and #1, written
192 * # to the CORE block and possibly
193 * # mixed with other data series.
194 *
195 * A single value can (but may not be) implemented as a zero bit
196 * huffman code. In this situation we can change the meta-data in the
197 * compression header to renumber an RG value..
198 */
cram_cat(int nfn,char * const * fn,const sam_hdr_t * h,const char * outcram,sam_global_args * ga,char * arg_list,int no_pg)199 int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_global_args *ga, char *arg_list, int no_pg)
200 {
201 samFile *out;
202 cram_fd *out_c;
203 int i, vers_maj, vers_min;
204 sam_hdr_t *new_h = NULL;
205
206 /* Check consistent versioning and compatible headers */
207 if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &vers_maj, &vers_min)))
208 return -1;
209
210 /* Open the file with cram_vers */
211 char vers[100];
212 sprintf(vers, "%d.%d", vers_maj, vers_min);
213 out = sam_open_format(outcram, "wc", &ga->out);
214 if (out == 0) {
215 print_error_errno("cat", "fail to open output file '%s'", outcram);
216 return -1;
217 }
218 out_c = out->fp.cram;
219 cram_set_option(out_c, CRAM_OPT_VERSION, vers);
220 //fprintf(stderr, "Creating cram vers %s\n", vers);
221
222 if (!no_pg && sam_hdr_add_pg(new_h, "samtools",
223 "VN", samtools_version(),
224 arg_list ? "CL": NULL,
225 arg_list ? arg_list : NULL,
226 NULL))
227 return -1;
228
229 if (sam_hdr_write(out, new_h) < 0) {
230 print_error_errno("cat", "Couldn't write header");
231 return -1;
232 }
233
234 for (i = 0; i < nfn; ++i) {
235 samFile *in;
236 cram_fd *in_c;
237 cram_container *c;
238 sam_hdr_t *old_h;
239 int new_rg = -1;
240
241 in = sam_open(fn[i], "rc");
242 if (in == 0) {
243 print_error_errno("cat", "fail to open file '%s'", fn[i]);
244 return -1;
245 }
246 in_c = in->fp.cram;
247
248 old_h = sam_hdr_read(in);
249 if (!old_h) {
250 print_error("cat", "fail to read the header of file '%s'", fn[i]);
251 return -1;
252 }
253
254 // Compute RG mapping if suitable for changing.
255 if (sam_hdr_count_lines(old_h, "RG") == 1) {
256 const char *old_name = sam_hdr_line_name(old_h, "RG", 0);
257 if (old_name) {
258 new_rg = sam_hdr_line_index(new_h, "RG", old_name);
259 if (new_rg < 0) {
260 print_error("cat", "fail to find @RG line '%s' in the new header", old_name);
261 return -1;
262 }
263 } else {
264 print_error("cat", "fail to find @RG line in file '%s'", fn[i]);
265 return -1;
266 }
267 } else {
268 new_rg = 0;
269 }
270
271 // Copy contains and blocks within them
272 while ((c = cram_read_container(in_c))) {
273 if (cram_container_is_empty(in_c)) {
274 cram_block *blk;
275 // Container compression header
276 if (!(blk = cram_read_block(in_c)))
277 return -1;
278 cram_free_block(blk);
279 cram_free_container(c);
280 continue;
281 }
282
283 // If we have just one RG key and new_rg != 0 then
284 // we need to edit the compression header. IF WE CAN.
285 if (new_rg) {
286 int zero = 0;
287 //fprintf(stderr, "Transcode RG %d to %d\n", 0, new_rg);
288 cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
289 } else {
290 int32_t num_slices;
291 cram_block *blk;
292
293 // Not switching rg so do the usual read/write loop
294 if (cram_write_container(out_c, c) != 0)
295 return -1;
296
297 // Container compression header
298 if (!(blk = cram_read_block(in_c)))
299 return -1;
300 if (cram_write_block(out_c, blk) != 0) {
301 cram_free_block(blk);
302 return -1;
303 }
304 cram_free_block(blk);
305
306
307 // Container num_blocks can be invalid, due to a bug.
308 // Instead we iterate in slice context instead.
309 (void)cram_container_get_landmarks(c, &num_slices);
310 cram_copy_slice(in_c, out_c, num_slices);
311 }
312
313 cram_free_container(c);
314 }
315
316 sam_hdr_destroy(old_h);
317 sam_close(in);
318 }
319 sam_close(out);
320 sam_hdr_destroy(new_h);
321
322 return 0;
323 }
324
325
326 #define BUF_SIZE 0x10000
327
328 #define GZIPID1 31
329 #define GZIPID2 139
330
331 #define BGZF_EMPTY_BLOCK_SIZE 28
332
bam_cat(int nfn,char * const * fn,sam_hdr_t * h,const char * outbam,char * arg_list,int no_pg)333 int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg)
334 {
335 BGZF *fp, *in = NULL;
336 uint8_t *buf = NULL;
337 uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
338 const int es=BGZF_EMPTY_BLOCK_SIZE;
339 int i;
340
341 fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
342 if (fp == 0) {
343 print_error_errno("cat", "fail to open output file '%s'", outbam);
344 return -1;
345 }
346 if (h) {
347 if (!no_pg && sam_hdr_add_pg(h, "samtools",
348 "VN", samtools_version(),
349 arg_list ? "CL": NULL,
350 arg_list ? arg_list : NULL,
351 NULL))
352 goto fail;
353
354 if (bam_hdr_write(fp, h) < 0) {
355 print_error_errno("cat", "Couldn't write header");
356 goto fail;
357 }
358 }
359
360 buf = (uint8_t*) malloc(BUF_SIZE);
361 if (!buf) {
362 fprintf(stderr, "[%s] Couldn't allocate buffer\n", __func__);
363 goto fail;
364 }
365 for(i = 0; i < nfn; ++i){
366 sam_hdr_t *old;
367 int len,j;
368
369 in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r");
370 if (in == 0) {
371 print_error_errno("cat", "fail to open file '%s'", fn[i]);
372 goto fail;
373 }
374 if (in->is_write) return -1;
375
376 old = bam_hdr_read(in);
377 if (old == NULL) {
378 fprintf(stderr, "[%s] ERROR: couldn't read header for '%s'.\n",
379 __func__, fn[i]);
380 goto fail;
381 }
382 if (h == 0 && i == 0) {
383 if (!no_pg && sam_hdr_add_pg(old, "samtools",
384 "VN", samtools_version(),
385 arg_list ? "CL": NULL,
386 arg_list ? arg_list : NULL,
387 NULL))
388 goto fail;
389
390 if (bam_hdr_write(fp, old) < 0) {
391 print_error_errno("cat", "Couldn't write header");
392 goto fail;
393 }
394 }
395
396 if (in->block_offset < in->block_length) {
397 if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
398 if (bgzf_flush(fp) != 0) goto write_fail;
399 }
400
401 j=0;
402 while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
403 if(len<es){
404 int diff=es-len;
405 if(j==0) {
406 fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);
407 goto fail;
408 }
409 if (bgzf_raw_write(fp, ebuf, len) < 0) goto write_fail;
410
411 memcpy(ebuf,ebuf+len,diff);
412 memcpy(ebuf+diff,buf,len);
413 } else {
414 if(j!=0) {
415 if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail;
416 }
417 len-= es;
418 memcpy(ebuf,buf+len,es);
419 if (bgzf_raw_write(fp, buf, len) < 0) goto write_fail;
420 }
421 j=1;
422 }
423
424 /* check final gzip block */
425 {
426 const uint8_t gzip1=ebuf[0];
427 const uint8_t gzip2=ebuf[1];
428 const uint32_t isize=*((uint32_t*)(ebuf+es-4));
429 if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
430 fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);
431 fprintf(stderr, " Possible output corruption.\n");
432 if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail;
433 }
434 }
435 sam_hdr_destroy(old);
436 bgzf_close(in);
437 in = NULL;
438 }
439 free(buf);
440 if (bgzf_close(fp) < 0) {
441 fprintf(stderr, "[%s] Error on closing '%s'.\n", __func__, outbam);
442 return -1;
443 }
444 return 0;
445
446 write_fail:
447 fprintf(stderr, "[%s] Error writing to '%s'.\n", __func__, outbam);
448 fail:
449 if (in) bgzf_close(in);
450 if (fp) bgzf_close(fp);
451 free(buf);
452 return -1;
453 }
454
455
main_cat(int argc,char * argv[])456 int main_cat(int argc, char *argv[])
457 {
458 sam_hdr_t *h = 0;
459 char *outfn = 0;
460 char **infns = NULL; // files to concatenate
461 int infns_size = 0;
462 int c, ret = 0, no_pg = 0, usage = 0;
463 samFile *in;
464 sam_global_args ga;
465
466 static const struct option lopts[] = {
467 SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', 0, '-', '@'),
468 {"no-PG", no_argument, NULL, 1},
469 { NULL, 0, NULL, 0 }
470 };
471
472 char *arg_list = NULL;
473
474 sam_global_args_init(&ga);
475
476 while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) {
477 switch (c) {
478 case 'h': {
479 samFile *fph = sam_open(optarg, "r");
480 if (fph == 0) {
481 fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, optarg);
482 return 1;
483 }
484 h = sam_hdr_read(fph);
485 if (h == NULL) {
486 fprintf(stderr,
487 "[%s] ERROR: failed to read the header from '%s'.\n",
488 __func__, optarg);
489 return 1;
490 }
491 sam_close(fph);
492 break;
493 }
494 case 'o': outfn = strdup(optarg); break;
495 case 'b': {
496 // add file names in "optarg" to the list
497 // of files to concatenate
498 int nfns;
499 char **fns_read = hts_readlines(optarg, &nfns);
500 if (fns_read) {
501 infns = realloc(infns, (infns_size + nfns) * sizeof(char*));
502 if (infns == NULL) { ret = 1; goto end; }
503 memcpy(infns+infns_size, fns_read, nfns * sizeof(char*));
504 infns_size += nfns;
505 free(fns_read);
506 } else {
507 print_error("cat", "Invalid file list \"%s\"", optarg);
508 ret = 1;
509 }
510 break;
511 }
512 case 1:
513 no_pg = 1;
514 break;
515 default:
516 if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
517 /* else fall-through */
518 case '?': usage=1; break;
519 }
520 }
521
522 if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) {
523 print_error("cat", "failed to create arg_list");
524 return 1;
525 }
526
527 // Append files specified in argv to the list.
528 int nargv_fns = argc - optind;
529 if (nargv_fns > 0) {
530 infns = realloc(infns, (infns_size + nargv_fns) * sizeof(char*));
531 if (infns == NULL) { ret = 1; goto end; }
532 memcpy(infns + infns_size, argv + optind, nargv_fns * sizeof(char*));
533 }
534
535 // Require at least one input file
536 if (infns_size + nargv_fns == 0 || usage) {
537 fprintf(stderr, "Usage: samtools cat [options] <in1.bam> [... <inN.bam>]\n");
538 fprintf(stderr, " samtools cat [options] <in1.cram> [... <inN.cram>]\n\n");
539 fprintf(stderr, "Concatenate BAM or CRAM files, first those in <bamlist.fofn>, then those\non the command line.\n\n");
540 fprintf(stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n");
541 fprintf(stderr, " -h FILE copy the header from FILE [default is 1st input file]\n");
542 fprintf(stderr, " -o FILE output BAM/CRAM\n");
543 fprintf(stderr, " --no-PG do not add a PG line\n");
544 sam_global_opt_help(stderr, "--..-@-.");
545 return 1;
546 }
547
548 in = sam_open(infns[0], "r");
549 if (!in) {
550 print_error_errno("cat", "failed to open file '%s'", infns[0]);
551 return 1;
552 }
553
554 switch (hts_get_format(in)->format) {
555 case bam:
556 sam_close(in);
557 if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0)
558 ret = 1;
559 break;
560
561 case cram:
562 sam_close(in);
563 if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", &ga, arg_list, no_pg) < 0)
564 ret = 1;
565 break;
566
567 default:
568 sam_close(in);
569 fprintf(stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
570 return 1;
571 }
572
573 end:
574 if (infns_size > 0) {
575 int i;
576 for (i=0; i<infns_size; i++)
577 free(infns[i]);
578 }
579
580 free(outfn);
581 free(infns);
582 free(arg_list);
583 if (h)
584 sam_hdr_destroy(h);
585
586 return ret;
587 }
588