1 /*  bam_cat.c -- efficiently concatenates bam files.
2 
3     Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd.
4     Modified SAMtools work copyright (C) 2010 Illumina, Inc.
5 
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12 
13 The above copyright notices and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15 
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 DEALINGS IN THE SOFTWARE.  */
23 
24 /*
25 bam_cat can be used to concatenate BAM files. Under special
26 circumstances, it can be used as an alternative to 'samtools merge' to
27 concatenate multiple sorted files into a single sorted file. For this
28 to work each file must be sorted, and the sorted files must be given
29 as command line arguments in order such that the final read in file i
30 is less than or equal to the first read in file i+1.
31 
32 This code is derived from the bam_reheader function in samtools 0.1.8
33 and modified to perform concatenation by Chris Saunders on behalf of
34 Illumina.
35 */
36 
37 #include <config.h>
38 
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <unistd.h>
42 #include <string.h>
43 #include <strings.h>
44 
45 #include "htslib/bgzf.h"
46 #include "htslib/sam.h"
47 #include "htslib/cram.h"
48 #include "htslib/kstring.h"
49 #include "samtools.h"
50 #include "sam_opts.h"
51 
52 /*
53  * Check the files are consistent and capable of being concatenated.
54  * Also fills out the version numbers and produces a new sam_hdr_t
55  * structure with merged RG lines.
56  * Note it is only a simple merge.
57  *
58  * Returns updated header on success;
59  *        NULL on failure.
60  */
cram_cat_check_hdr(int nfn,char * const * fn,const sam_hdr_t * h,int * vers_maj_p,int * vers_min_p)61 static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t *h,
62                                      int *vers_maj_p, int *vers_min_p) {
63     int i, vers_maj = -1, vers_min = -1;
64     sam_hdr_t *new_h = NULL, *old_h = NULL;
65     samFile *in = NULL;
66     kstring_t ks = KS_INITIALIZE;
67 
68     if (h) {
69         new_h = sam_hdr_dup(h);
70         if (!new_h) {
71             fprintf(stderr, "[%s] ERROR: header duplication failed.\n",
72                     __func__);
73             goto fail;
74         }
75     }
76 
77     for (i = 0; i < nfn; ++i) {
78         cram_fd *in_c;
79         int ki;
80 
81         in = sam_open(fn[i], "rc");
82         if (in == 0) {
83             print_error_errno("cat", "fail to open file '%s'", fn[i]);
84             goto fail;
85         }
86         in_c = in->fp.cram;
87 
88         int vmaj = cram_major_vers(in_c);
89         int vmin = cram_minor_vers(in_c);
90         if ((vers_maj != -1 && vers_maj != vmaj) ||
91             (vers_min != -1 && vers_min != vmin)) {
92             fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n",
93                     __func__);
94             goto fail;
95         }
96         vers_maj = vmaj;
97         vers_min = vmin;
98 
99         old_h = sam_hdr_read(in);
100         if (!old_h) {
101             fprintf(stderr, "[%s] ERROR: header reading for file '%s' filed.\n",
102                     __func__, fn[i]);
103             goto fail;
104         }
105 
106         if (!new_h) {
107             new_h = sam_hdr_dup(old_h);
108             if (!new_h) {
109                 fprintf(stderr, "[%s] ERROR: header duplication for file '%s' failed.\n",
110                         __func__, fn[i]);
111                 goto fail;
112             }
113             sam_hdr_destroy(old_h);
114             sam_close(in);
115             continue;
116         }
117 
118         int old_count = sam_hdr_count_lines(old_h, "RG");
119         for (ki = 0; ki < old_count; ki++) {
120             const char *old_name = sam_hdr_line_name(old_h, "RG", ki);
121             if (old_name) {
122                 int new_i = sam_hdr_line_index(new_h, "RG", old_name);
123                 if (-1 == new_i) { // line does not exist in the new header
124                     if (sam_hdr_find_line_pos(old_h, "RG", ki, &ks) ||
125                         !ks.s || sam_hdr_add_lines(new_h, ks.s, ks.l)) {
126                         fprintf(stderr, "[%s] ERROR: failed to add @RG line 'ID:%s' from file '%s'\n",
127                                 __func__, old_name, fn[i]);
128                         goto fail;
129                     }
130                     ks_free(&ks);
131                 }
132             } else {
133                 fprintf(stderr, "[%s] ERROR: failed to read %d @RG line from file '%s'\n",
134                         __func__, ki, fn[i]);
135                 goto fail;
136             }
137         }
138 
139         if (old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) {
140             for (ki = 0; ki < old_count; ki++) {
141                 const char *old_name = sam_hdr_line_name(old_h, "RG", ki);
142                 const char *new_name = sam_hdr_line_name(new_h, "RG", ki);
143                 if (!old_name || !new_name || strcmp(old_name, new_name)) {
144                     fprintf(stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n",
145                             __func__);
146                     goto fail;
147                 }
148             }
149         }
150 
151         sam_hdr_destroy(old_h);
152         sam_close(in);
153     }
154 
155     ks_free(&ks);
156 
157     *vers_maj_p = vers_maj;
158     *vers_min_p = vers_min;
159 
160     return new_h;
161 
162 fail:
163     ks_free(&ks);
164     if (old_h) sam_hdr_destroy(old_h);
165     if (new_h) sam_hdr_destroy(new_h);
166     if (in) sam_close(in);
167 
168     return NULL;
169 }
170 
171 
172 /*
173  * CRAM files don't store the RG:Z:ID per read in the aux field.
174  * Instead they have a numerical data series (RG) to point each read
175  * back to the Nth @RG line in the file.  This means that we may need
176  * to edit the RG data series (if the files were produced from
177  * "samtools split" for example).
178  *
179  * The encoding method is stored in the compression header. Typical
180  * examples:
181  *
182  * RG => EXTERNAL {18}           # Block content-id 18 holds RG values
183  *                               # as a series of ITF8 encoded values
184  *
185  * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0}
186  *                               # One RG value #-1.  (No RG)
187  *
188  * RG => HUFFMAN {1, 0, 1, 0}    # One RG value #0 (always first RG)
189  *
190  * RG => HUFFMAN {2, 0, 1, 2, 1, 1}
191  *                               # Two RG values, #0 and #1, written
192  *                               # to the CORE block and possibly
193  *                               # mixed with other data series.
194  *
195  * A single value can (but may not be) implemented as a zero bit
196  * huffman code.  In this situation we can change the meta-data in the
197  * compression header to renumber an RG value..
198  */
cram_cat(int nfn,char * const * fn,const sam_hdr_t * h,const char * outcram,sam_global_args * ga,char * arg_list,int no_pg)199 int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_global_args *ga, char *arg_list, int no_pg)
200 {
201     samFile *out;
202     cram_fd *out_c;
203     int i, vers_maj, vers_min;
204     sam_hdr_t *new_h = NULL;
205 
206     /* Check consistent versioning and compatible headers */
207     if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &vers_maj, &vers_min)))
208         return -1;
209 
210     /* Open the file with cram_vers */
211     char vers[100];
212     sprintf(vers, "%d.%d", vers_maj, vers_min);
213     out = sam_open_format(outcram, "wc", &ga->out);
214     if (out == 0) {
215         print_error_errno("cat", "fail to open output file '%s'", outcram);
216         return -1;
217     }
218     out_c = out->fp.cram;
219     cram_set_option(out_c, CRAM_OPT_VERSION, vers);
220     //fprintf(stderr, "Creating cram vers %s\n", vers);
221 
222     if (!no_pg && sam_hdr_add_pg(new_h, "samtools",
223                                  "VN", samtools_version(),
224                                  arg_list ? "CL": NULL,
225                                  arg_list ? arg_list : NULL,
226                                  NULL))
227         return -1;
228 
229     if (sam_hdr_write(out, new_h) < 0) {
230         print_error_errno("cat", "Couldn't write header");
231         return -1;
232     }
233 
234     for (i = 0; i < nfn; ++i) {
235         samFile *in;
236         cram_fd *in_c;
237         cram_container *c;
238         sam_hdr_t *old_h;
239         int new_rg = -1;
240 
241         in = sam_open(fn[i], "rc");
242         if (in == 0) {
243             print_error_errno("cat", "fail to open file '%s'", fn[i]);
244             return -1;
245         }
246         in_c = in->fp.cram;
247 
248         old_h = sam_hdr_read(in);
249         if (!old_h) {
250             print_error("cat", "fail to read the header of file '%s'", fn[i]);
251             return -1;
252         }
253 
254         // Compute RG mapping if suitable for changing.
255         if (sam_hdr_count_lines(old_h, "RG") == 1) {
256             const char *old_name = sam_hdr_line_name(old_h, "RG", 0);
257             if (old_name) {
258                 new_rg = sam_hdr_line_index(new_h, "RG", old_name);
259                 if (new_rg < 0) {
260                     print_error("cat", "fail to find @RG line '%s' in the new header", old_name);
261                     return -1;
262                 }
263             } else {
264                 print_error("cat", "fail to find @RG line in file '%s'", fn[i]);
265                 return -1;
266             }
267         } else {
268             new_rg = 0;
269         }
270 
271         // Copy contains and blocks within them
272         while ((c = cram_read_container(in_c))) {
273             if (cram_container_is_empty(in_c)) {
274                 cram_block *blk;
275                 // Container compression header
276                 if (!(blk = cram_read_block(in_c)))
277                     return -1;
278                 cram_free_block(blk);
279                 cram_free_container(c);
280                 continue;
281             }
282 
283             // If we have just one RG key and new_rg != 0 then
284             // we need to edit the compression header. IF WE CAN.
285             if (new_rg) {
286                 int zero = 0;
287                 //fprintf(stderr, "Transcode RG %d to %d\n", 0, new_rg);
288                 cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
289             } else {
290                 int32_t num_slices;
291                 cram_block *blk;
292 
293                 // Not switching rg so do the usual read/write loop
294                 if (cram_write_container(out_c, c) != 0)
295                     return -1;
296 
297                 // Container compression header
298                 if (!(blk = cram_read_block(in_c)))
299                     return -1;
300                 if (cram_write_block(out_c, blk) != 0) {
301                     cram_free_block(blk);
302                     return -1;
303                 }
304                 cram_free_block(blk);
305 
306 
307                 // Container num_blocks can be invalid, due to a bug.
308                 // Instead we iterate in slice context instead.
309                 (void)cram_container_get_landmarks(c, &num_slices);
310                 cram_copy_slice(in_c, out_c, num_slices);
311             }
312 
313             cram_free_container(c);
314         }
315 
316         sam_hdr_destroy(old_h);
317         sam_close(in);
318     }
319     sam_close(out);
320     sam_hdr_destroy(new_h);
321 
322     return 0;
323 }
324 
325 
326 #define BUF_SIZE 0x10000
327 
328 #define GZIPID1 31
329 #define GZIPID2 139
330 
331 #define BGZF_EMPTY_BLOCK_SIZE 28
332 
bam_cat(int nfn,char * const * fn,sam_hdr_t * h,const char * outbam,char * arg_list,int no_pg)333 int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg)
334 {
335     BGZF *fp, *in = NULL;
336     uint8_t *buf = NULL;
337     uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
338     const int es=BGZF_EMPTY_BLOCK_SIZE;
339     int i;
340 
341     fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
342     if (fp == 0) {
343         print_error_errno("cat", "fail to open output file '%s'", outbam);
344         return -1;
345     }
346     if (h) {
347         if (!no_pg && sam_hdr_add_pg(h, "samtools",
348                                      "VN", samtools_version(),
349                                      arg_list ? "CL": NULL,
350                                      arg_list ? arg_list : NULL,
351                                      NULL))
352             goto fail;
353 
354         if (bam_hdr_write(fp, h) < 0) {
355             print_error_errno("cat", "Couldn't write header");
356             goto fail;
357         }
358     }
359 
360     buf = (uint8_t*) malloc(BUF_SIZE);
361     if (!buf) {
362         fprintf(stderr, "[%s] Couldn't allocate buffer\n", __func__);
363         goto fail;
364     }
365     for(i = 0; i < nfn; ++i){
366         sam_hdr_t *old;
367         int len,j;
368 
369         in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r");
370         if (in == 0) {
371             print_error_errno("cat", "fail to open file '%s'", fn[i]);
372             goto fail;
373         }
374         if (in->is_write) return -1;
375 
376         old = bam_hdr_read(in);
377         if (old == NULL) {
378             fprintf(stderr, "[%s] ERROR: couldn't read header for '%s'.\n",
379                     __func__, fn[i]);
380             goto fail;
381         }
382         if (h == 0 && i == 0) {
383             if (!no_pg && sam_hdr_add_pg(old, "samtools",
384                                          "VN", samtools_version(),
385                                          arg_list ? "CL": NULL,
386                                          arg_list ? arg_list : NULL,
387                                          NULL))
388                 goto fail;
389 
390             if (bam_hdr_write(fp, old) < 0) {
391                 print_error_errno("cat", "Couldn't write header");
392                 goto fail;
393             }
394         }
395 
396         if (in->block_offset < in->block_length) {
397             if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
398             if (bgzf_flush(fp) != 0) goto write_fail;
399         }
400 
401         j=0;
402         while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
403             if(len<es){
404                 int diff=es-len;
405                 if(j==0) {
406                     fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);
407                     goto fail;
408                 }
409                 if (bgzf_raw_write(fp, ebuf, len) < 0) goto write_fail;
410 
411                 memcpy(ebuf,ebuf+len,diff);
412                 memcpy(ebuf+diff,buf,len);
413             } else {
414                 if(j!=0) {
415                     if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail;
416                 }
417                 len-= es;
418                 memcpy(ebuf,buf+len,es);
419                 if (bgzf_raw_write(fp, buf, len) < 0) goto write_fail;
420             }
421             j=1;
422         }
423 
424         /* check final gzip block */
425         {
426             const uint8_t gzip1=ebuf[0];
427             const uint8_t gzip2=ebuf[1];
428             const uint32_t isize=*((uint32_t*)(ebuf+es-4));
429             if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
430                 fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);
431                 fprintf(stderr, " Possible output corruption.\n");
432                 if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail;
433             }
434         }
435         sam_hdr_destroy(old);
436         bgzf_close(in);
437         in = NULL;
438     }
439     free(buf);
440     if (bgzf_close(fp) < 0) {
441         fprintf(stderr, "[%s] Error on closing '%s'.\n", __func__, outbam);
442         return -1;
443     }
444     return 0;
445 
446  write_fail:
447     fprintf(stderr, "[%s] Error writing to '%s'.\n", __func__, outbam);
448  fail:
449     if (in) bgzf_close(in);
450     if (fp) bgzf_close(fp);
451     free(buf);
452     return -1;
453 }
454 
455 
main_cat(int argc,char * argv[])456 int main_cat(int argc, char *argv[])
457 {
458     sam_hdr_t *h = 0;
459     char *outfn = 0;
460     char **infns = NULL; // files to concatenate
461     int infns_size = 0;
462     int c, ret = 0, no_pg = 0, usage = 0;
463     samFile *in;
464     sam_global_args ga;
465 
466     static const struct option lopts[] = {
467         SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', 0, '-', '@'),
468         {"no-PG", no_argument, NULL, 1},
469         { NULL, 0, NULL, 0 }
470     };
471 
472     char *arg_list = NULL;
473 
474     sam_global_args_init(&ga);
475 
476     while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) {
477         switch (c) {
478             case 'h': {
479                 samFile *fph = sam_open(optarg, "r");
480                 if (fph == 0) {
481                     fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, optarg);
482                     return 1;
483                 }
484                 h = sam_hdr_read(fph);
485                 if (h == NULL) {
486                     fprintf(stderr,
487                             "[%s] ERROR: failed to read the header from '%s'.\n",
488                             __func__, optarg);
489                     return 1;
490                 }
491                 sam_close(fph);
492                 break;
493             }
494             case 'o': outfn = strdup(optarg); break;
495             case 'b': {
496                 // add file names in "optarg" to the list
497                 // of files to concatenate
498                 int nfns;
499                 char **fns_read = hts_readlines(optarg, &nfns);
500                 if (fns_read) {
501                     infns = realloc(infns, (infns_size + nfns) * sizeof(char*));
502                     if (infns == NULL) { ret = 1; goto end; }
503                     memcpy(infns+infns_size, fns_read, nfns * sizeof(char*));
504                     infns_size += nfns;
505                     free(fns_read);
506                 } else {
507                     print_error("cat", "Invalid file list \"%s\"", optarg);
508                     ret = 1;
509                 }
510                 break;
511             }
512             case 1:
513                 no_pg = 1;
514                 break;
515             default:
516                 if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
517                 /* else fall-through */
518             case '?': usage=1; break;
519         }
520     }
521 
522     if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) {
523         print_error("cat", "failed to create arg_list");
524         return 1;
525     }
526 
527     // Append files specified in argv to the list.
528     int nargv_fns = argc - optind;
529     if (nargv_fns > 0) {
530         infns = realloc(infns, (infns_size + nargv_fns) * sizeof(char*));
531         if (infns == NULL) { ret = 1; goto end; }
532         memcpy(infns + infns_size, argv + optind, nargv_fns * sizeof(char*));
533     }
534 
535     // Require at least one input file
536     if (infns_size + nargv_fns == 0 || usage) {
537         fprintf(stderr, "Usage: samtools cat [options] <in1.bam>  [... <inN.bam>]\n");
538         fprintf(stderr, "       samtools cat [options] <in1.cram> [... <inN.cram>]\n\n");
539         fprintf(stderr, "Concatenate BAM or CRAM files, first those in <bamlist.fofn>, then those\non the command line.\n\n");
540         fprintf(stderr, "Options: -b FILE  list of input BAM/CRAM file names, one per line\n");
541         fprintf(stderr, "         -h FILE  copy the header from FILE [default is 1st input file]\n");
542         fprintf(stderr, "         -o FILE  output BAM/CRAM\n");
543         fprintf(stderr, "         --no-PG  do not add a PG line\n");
544         sam_global_opt_help(stderr, "--..-@-.");
545         return 1;
546     }
547 
548     in = sam_open(infns[0], "r");
549     if (!in) {
550         print_error_errno("cat", "failed to open file '%s'", infns[0]);
551         return 1;
552     }
553 
554     switch (hts_get_format(in)->format) {
555     case bam:
556         sam_close(in);
557         if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0)
558             ret = 1;
559         break;
560 
561     case cram:
562         sam_close(in);
563         if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", &ga, arg_list, no_pg) < 0)
564             ret = 1;
565         break;
566 
567     default:
568         sam_close(in);
569         fprintf(stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
570         return 1;
571     }
572 
573  end:
574     if (infns_size > 0) {
575         int i;
576         for (i=0; i<infns_size; i++)
577             free(infns[i]);
578     }
579 
580     free(outfn);
581     free(infns);
582     free(arg_list);
583     if (h)
584         sam_hdr_destroy(h);
585 
586     return ret;
587 }
588