1 /*  bam_split.c -- split subcommand.
2 
3     Copyright (C) 2013-2016,2018-2019 Genome Research Ltd.
4 
5     Author: Martin Pollard <mp15@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE.  */
24 
25 #include <config.h>
26 
27 #include <string.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <stdbool.h>
31 #include <limits.h>
32 #include <unistd.h>
33 #include <regex.h>
34 #include <assert.h>
35 #include <htslib/sam.h>
36 #include <htslib/khash.h>
37 #include <htslib/kstring.h>
38 #include <htslib/cram.h>
39 #include "htslib/thread_pool.h"
40 #include "sam_opts.h"
41 #include "samtools.h"
42 
43 
44 KHASH_MAP_INIT_STR(c2i, int)
45 
46 struct parsed_opts {
47     const char *merged_input_name;
48     const char *unaccounted_header_name;
49     const char *unaccounted_name;
50     const char *output_format_string;
51     bool verbose;
52     int no_pg;
53     sam_global_args ga;
54 };
55 
56 typedef struct parsed_opts parsed_opts_t;
57 
58 struct state {
59     samFile* merged_input_file;
60     sam_hdr_t* merged_input_header;
61     samFile* unaccounted_file;
62     sam_hdr_t* unaccounted_header;
63     size_t output_count;
64     char** rg_id;
65     char **rg_index_file_name;
66     char **rg_output_file_name;
67     samFile** rg_output_file;
68     sam_hdr_t** rg_output_header;
69     kh_c2i_t* rg_hash;
70     htsThreadPool p;
71     int write_index;
72 };
73 
74 typedef struct state state_t;
75 
76 static int cleanup_state(state_t* status, bool check_close);
77 static void cleanup_opts(parsed_opts_t* opts);
78 
usage(FILE * write_to)79 static void usage(FILE *write_to)
80 {
81     fprintf(write_to,
82 "Usage: samtools split [-u <unaccounted.bam>] [-h <unaccounted_header.sam>]\n"
83 "                      [-f <format_string>] [-v] <merged.bam>\n"
84 "Options:\n"
85 "  -f STRING       output filename format string [\"%%*_%%#.%%.\"]\n"
86 "  -u FILE1        put reads with no RG tag or an unrecognised RG tag in FILE1\n"
87 "  -h FILE2        ... and override the header with FILE2 (-u file only)\n"
88 "  -v              verbose output\n"
89 "  --no-PG         do not add a PG line\n");
90     sam_global_opt_help(write_to, "-....@..");
91     fprintf(write_to,
92 "\n"
93 "Format string expansions:\n"
94 "  %%%%     %%\n"
95 "  %%*     basename\n"
96 "  %%#     @RG index\n"
97 "  %%!     @RG ID\n"
98 "  %%.     filename extension for output format\n"
99       );
100 }
101 
102 // Takes the command line options and turns them into something we can understand
parse_args(int argc,char ** argv)103 static parsed_opts_t* parse_args(int argc, char** argv)
104 {
105     if (argc == 1) { usage(stdout); return NULL; }
106 
107     const char *optstring = "vf:h:u:@:";
108 
109     static const struct option lopts[] = {
110         SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
111         {"no-PG", no_argument, NULL, 1},
112         { NULL, 0, NULL, 0 }
113     };
114 
115     parsed_opts_t* retval = calloc(sizeof(parsed_opts_t), 1);
116     if (! retval ) { perror("cannot allocate option parsing memory"); return NULL; }
117 
118     sam_global_args_init(&retval->ga);
119 
120     int opt;
121     while ((opt = getopt_long(argc, argv, optstring, lopts, NULL)) != -1) {
122         switch (opt) {
123         case 'f':
124             retval->output_format_string = optarg;
125             break;
126         case 'h':
127             retval->unaccounted_header_name = optarg;
128             break;
129         case 'v':
130             retval->verbose = true;
131             break;
132         case 'u':
133             retval->unaccounted_name = optarg;
134             break;
135         case 1:
136             retval->no_pg = 1;
137             break;
138         default:
139             if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break;
140             /* else fall-through */
141         case '?':
142             usage(stdout);
143             free(retval);
144             return NULL;
145         }
146     }
147 
148     if (retval->output_format_string == NULL) retval->output_format_string = "%*_%#.%.";
149 
150     argc -= optind;
151     argv += optind;
152 
153     if (argc != 1) {
154         print_error("split", "Invalid number of arguments: %d", argc);
155         usage(stderr);
156         free(retval);
157         return NULL;
158     }
159 
160     retval->merged_input_name = argv[0];
161 
162     return retval;
163 }
164 
165 // Expands a output filename format string
expand_format_string(const char * format_string,const char * basename,const char * rg_id,const int rg_idx,const htsFormat * format)166 static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx, const htsFormat *format)
167 {
168     kstring_t str = { 0, 0, NULL };
169     const char* pointer = format_string;
170     const char* next;
171     while ((next = strchr(pointer, '%')) != NULL) {
172         if (kputsn(pointer, next-pointer, &str) < 0) goto memfail;
173         ++next;
174         switch (*next) {
175             case '%':
176                 if (kputc('%', &str) < 0) goto memfail;
177                 break;
178             case '*':
179                 if (kputs(basename, &str) < 0) goto memfail;
180                 break;
181             case '#':
182                 if (kputl(rg_idx, &str) < 0) goto memfail;
183                 break;
184             case '!':
185                 if (kputs(rg_id, &str) < 0) goto memfail;
186                 break;
187             case '.':
188                 // Only really need to cope with sam, bam, cram
189                 if (format->format != unknown_format) {
190                     if (kputs(hts_format_file_extension(format), &str) < 0)
191                         goto memfail;
192                 } else {
193                     if (kputs("bam", &str) < 0) goto memfail;
194                 }
195                 break;
196             case '\0':
197                 print_error("split", "Trailing %% in filename format string");
198                 goto fail;
199             default:
200                 // Error is: fprintf(stderr, "bad format string, unknown format specifier\n");
201                 print_error("split", "Unknown specifier %%%c in filename format string", *next);
202                 goto fail;
203         }
204         pointer = next + 1;
205     }
206     if (kputs(pointer, &str) < 0) goto memfail;
207     return ks_release(&str);
208 
209  memfail:
210     print_error_errno("split", "Couldn't build output filename");
211  fail:
212     free(str.s);
213     return NULL;
214 }
215 
216 // Parse the header, count the number of RG tags and return a list of their names
count_RG(sam_hdr_t * hdr,size_t * count,char *** output_name)217 static bool count_RG(sam_hdr_t* hdr, size_t* count, char*** output_name)
218 {
219     char **names = NULL;
220     kstring_t id_val = KS_INITIALIZE;
221     int i, n_rg = sam_hdr_count_lines(hdr, "RG");
222 
223     if (n_rg < 0) {
224         print_error("split", "Failed to get @RG IDs");
225         *count = 0;
226         *output_name = NULL;
227         return false;
228     }
229 
230     if (n_rg == 0) {
231         *count = 0;
232         *output_name = NULL;
233         return true;
234     }
235 
236     names = calloc(n_rg, sizeof(names[0]));
237     if (!names) goto memfail;
238 
239     for (i = 0; i < n_rg; i++) {
240         if (sam_hdr_find_tag_pos(hdr, "RG", i, "ID", &id_val) < 0) goto memfail;
241         names[i] = ks_release(&id_val);
242     }
243 
244     *count = n_rg;
245     *output_name = names;
246     return true;
247 
248  memfail:
249     print_error_errno("split", "Failed to get @RG IDs");
250     *count = 0;
251     *output_name = NULL;
252     ks_free(&id_val);
253     free(names);
254     return false;
255 }
256 
header_compatible(sam_hdr_t * hdr1,sam_hdr_t * hdr2)257 static int header_compatible(sam_hdr_t *hdr1, sam_hdr_t *hdr2)
258 {
259     size_t n;
260     if (sam_hdr_nref(hdr1) != sam_hdr_nref(hdr2)) {
261         print_error("split",
262                     "Unaccounted header contains wrong number of references");
263         return -1;
264     }
265     for (n = 0; n < sam_hdr_nref(hdr1); n++) {
266         hts_pos_t h1_len = sam_hdr_tid2len(hdr1, n);
267         hts_pos_t h2_len = sam_hdr_tid2len(hdr2, n);
268         if (h1_len != h2_len) {
269             print_error("split",
270                         "Unaccounted header reference %zu \"%s\" is not the same length as in the input file",
271                         n + 1, sam_hdr_tid2name(hdr2, n));
272             return -1;
273         }
274     }
275     return 0;
276 }
277 
278 // Set the initial state
init(parsed_opts_t * opts,const char * arg_list)279 static state_t* init(parsed_opts_t* opts, const char *arg_list)
280 {
281     state_t* retval = calloc(sizeof(state_t), 1);
282     if (!retval) {
283         print_error_errno("split", "Initialisation failed");
284         return NULL;
285     }
286 
287     if (opts->ga.nthreads > 0) {
288         if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) {
289             fprintf(stderr, "Error creating thread pool\n");
290             cleanup_state(retval, false);
291             return NULL;
292         }
293     }
294 
295     retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
296     if (!retval->merged_input_file) {
297         print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name);
298         cleanup_state(retval, false);
299         return NULL;
300     }
301     if (retval->p.pool)
302         hts_set_opt(retval->merged_input_file, HTS_OPT_THREAD_POOL, &retval->p);
303     retval->merged_input_header = sam_hdr_read(retval->merged_input_file);
304     if (retval->merged_input_header == NULL) {
305         print_error("split", "Could not read header from \"%s\"", opts->merged_input_name);
306         cleanup_state(retval, false);
307         return NULL;
308     }
309 
310     if (opts->unaccounted_name) {
311         if (opts->unaccounted_header_name) {
312             samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in);
313             if (!hdr_load) {
314                 print_error_errno("split", "Could not open unaccounted header file \"%s\"", opts->unaccounted_header_name);
315                 cleanup_state(retval, false);
316                 return NULL;
317             }
318             retval->unaccounted_header = sam_hdr_read(hdr_load);
319             if (retval->unaccounted_header == NULL) {
320                 print_error("split", "Could not read header from \"%s\"", opts->unaccounted_header_name);
321                 cleanup_state(retval, false);
322                 sam_close(hdr_load);
323                 return NULL;
324             }
325             sam_close(hdr_load);
326             if (header_compatible(retval->merged_input_header,
327                                   retval->unaccounted_header) != 0) {
328                 cleanup_state(retval, false);
329                 return NULL;
330             }
331         } else {
332             retval->unaccounted_header = sam_hdr_dup(retval->merged_input_header);
333             if (!opts->no_pg && sam_hdr_add_pg(retval->unaccounted_header, "samtools",
334                                     "VN", samtools_version(),
335                                     arg_list ? "CL": NULL,
336                                     arg_list ? arg_list : NULL,
337                                     NULL)) {
338                 print_error("split", "Could not rewrite header for \"%s\"", opts->unaccounted_name);
339                 cleanup_state(retval, false);
340                 return NULL;
341             }
342         }
343 
344         retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
345         if (retval->unaccounted_file == NULL) {
346             print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name);
347             cleanup_state(retval, false);
348             return NULL;
349         }
350         if (retval->p.pool)
351             hts_set_opt(retval->unaccounted_file, HTS_OPT_THREAD_POOL, &retval->p);
352     }
353 
354     // Open output files for RGs
355     if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL;
356     if (opts->verbose) fprintf(stderr, "@RG's found %zu\n",retval->output_count);
357     // Prevent calloc(0, size);
358     size_t num = retval->output_count ? retval->output_count : 1;
359     retval->rg_index_file_name = (char **)calloc(num, sizeof(char *));
360     retval->rg_output_file_name = (char **)calloc(num, sizeof(char *));
361     retval->rg_output_file = (samFile**)calloc(num, sizeof(samFile*));
362     retval->rg_output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*));
363     retval->rg_hash = kh_init_c2i();
364     if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header ||
365         !retval->rg_hash || !retval->rg_index_file_name) {
366         print_error_errno("split", "Could not initialise output file array");
367         cleanup_state(retval, false);
368         return NULL;
369     }
370 
371     char* dirsep = strrchr(opts->merged_input_name, '/');
372     char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
373     if (!input_base_name) {
374         print_error_errno("split", "Filename manipulation failed");
375         cleanup_state(retval, false);
376         return NULL;
377     }
378     char* extension = strrchr(input_base_name, '.');
379     if (extension) *extension = '\0';
380 
381     size_t i;
382     for (i = 0; i < retval->output_count; i++) {
383         char* output_filename = NULL;
384 
385         output_filename = expand_format_string(opts->output_format_string,
386                                                input_base_name,
387                                                retval->rg_id[i], i,
388                                                &opts->ga.out);
389 
390         if ( output_filename == NULL ) {
391             cleanup_state(retval, false);
392             free(input_base_name);
393             return NULL;
394         }
395 
396         retval->rg_output_file_name[i] = output_filename;
397         retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
398         if (retval->rg_output_file[i] == NULL) {
399             print_error_errno("split", "Could not open \"%s\"", output_filename);
400             cleanup_state(retval, false);
401             free(input_base_name);
402             return NULL;
403         }
404         if (retval->p.pool)
405             hts_set_opt(retval->rg_output_file[i], HTS_OPT_THREAD_POOL, &retval->p);
406 
407         // Record index in hash
408         int ret;
409         khiter_t iter = kh_put_c2i(retval->rg_hash, retval->rg_id[i], &ret);
410         if (ret < 0) {
411             print_error_errno("split", "Couldn't add @RG ID to look-up table");
412             cleanup_state(retval, false);
413             free(input_base_name);
414             return NULL;
415         }
416         kh_val(retval->rg_hash,iter) = i;
417 
418         // Set and edit header
419         retval->rg_output_header[i] = sam_hdr_dup(retval->merged_input_header);
420         if (sam_hdr_remove_except(retval->rg_output_header[i], "RG", "ID", retval->rg_id[i]) ||
421            (!opts->no_pg &&
422             sam_hdr_add_pg(retval->rg_output_header[i], "samtools",
423                         "VN", samtools_version(),
424                         arg_list ? "CL": NULL,
425                         arg_list ? arg_list : NULL,
426                         NULL))) {
427             print_error("split", "Could not rewrite header for \"%s\"", output_filename);
428             cleanup_state(retval, false);
429             free(input_base_name);
430             return NULL;
431         }
432     }
433 
434     free(input_base_name);
435     retval->write_index = opts->ga.write_index;
436 
437     return retval;
438 }
439 
split(state_t * state)440 static bool split(state_t* state)
441 {
442     if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
443         print_error_errno("split", "Could not write output file header");
444         return false;
445     }
446     size_t i;
447     for (i = 0; i < state->output_count; i++) {
448         if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
449             print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]);
450             return false;
451         }
452         if (state->write_index) {
453             state->rg_index_file_name[i] = auto_index(state->rg_output_file[i],
454                                                       state->rg_output_file_name[i],
455                                                       state->rg_output_header[i]);
456             if (!state->rg_index_file_name[i]) {
457                 print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]);
458                 return false;
459             }
460         }
461     }
462 
463     bam1_t* file_read = bam_init1();
464     // Read the first record
465     int r;
466     if ((r=sam_read1(state->merged_input_file, state->merged_input_header, file_read)) < 0) {
467         // Nothing more to read?  Ignore this file
468         bam_destroy1(file_read);
469         file_read = NULL;
470         if (r < -1) {
471             print_error("split", "Could not read first input record");
472             return false;
473         }
474     }
475 
476     while (file_read != NULL) {
477         // Get RG tag from read and look it up in hash to find file to output it to
478         uint8_t* tag = bam_aux_get(file_read, "RG");
479         khiter_t iter;
480         if ( tag != NULL ) {
481             char* rg = bam_aux2Z(tag);
482             iter = kh_get_c2i(state->rg_hash, rg);
483         } else {
484             iter = kh_end(state->rg_hash);
485         }
486 
487         // Write the read out to correct file
488         if (iter != kh_end(state->rg_hash)) {
489             // if found write to the appropriate untangled bam
490             int i = kh_val(state->rg_hash,iter);
491             if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
492                 print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]);
493                 bam_destroy1(file_read);
494                 return false;
495             }
496         } else {
497             // otherwise write to the unaccounted bam if there is one or fail
498             if (state->unaccounted_file == NULL) {
499                 if (tag) {
500                     fprintf(stderr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), bam_aux2Z(tag));
501                 } else {
502                     fprintf(stderr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read));
503                 }
504                 bam_destroy1(file_read);
505                 return false;
506             } else {
507                 if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
508                     print_error_errno("split", "Could not write to unaccounted output file");
509                     bam_destroy1(file_read);
510                     return false;
511                 }
512             }
513         }
514 
515         // Replace written read with the next one to process
516         if ((r=sam_read1(state->merged_input_file, state->merged_input_header, file_read)) < 0) {
517             // Nothing more to read?  Ignore this file in future
518             bam_destroy1(file_read);
519             file_read = NULL;
520             if (r < -1) {
521                 print_error("split", "Could not read input record");
522                 return false;
523             }
524         }
525     }
526 
527     if (state->write_index) {
528         for (i = 0; i < state->output_count; i++) {
529             if (sam_idx_save(state->rg_output_file[i]) < 0) {
530                 print_error_errno("split", "writing index failed");
531                 return false;
532             }
533             free(state->rg_index_file_name[i]);
534         }
535     }
536 
537     return true;
538 }
539 
cleanup_state(state_t * status,bool check_close)540 static int cleanup_state(state_t* status, bool check_close)
541 {
542     int ret = 0;
543 
544     if (!status) return 0;
545     if (status->unaccounted_header) sam_hdr_destroy(status->unaccounted_header);
546     if (status->unaccounted_file) {
547         if (sam_close(status->unaccounted_file) < 0 && check_close) {
548             print_error("split", "Error on closing unaccounted file");
549             ret = -1;
550         }
551     }
552     sam_close(status->merged_input_file);
553     size_t i;
554     for (i = 0; i < status->output_count; i++) {
555         if (status->rg_output_header && status->rg_output_header[i])
556             sam_hdr_destroy(status->rg_output_header[i]);
557         if (status->rg_output_file && status->rg_output_file[i]) {
558             if (sam_close(status->rg_output_file[i]) < 0 && check_close) {
559                 print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]);
560                 ret = -1;
561             }
562         }
563         if (status->rg_id) free(status->rg_id[i]);
564         if (status->rg_output_file_name) free(status->rg_output_file_name[i]);
565     }
566     if (status->merged_input_header)
567         sam_hdr_destroy(status->merged_input_header);
568     free(status->rg_output_header);
569     free(status->rg_output_file);
570     free(status->rg_output_file_name);
571     free(status->rg_index_file_name);
572     kh_destroy_c2i(status->rg_hash);
573     free(status->rg_id);
574     if (status->p.pool)
575         hts_tpool_destroy(status->p.pool);
576     free(status);
577 
578     return ret;
579 }
580 
cleanup_opts(parsed_opts_t * opts)581 static void cleanup_opts(parsed_opts_t* opts)
582 {
583     if (!opts) return;
584     sam_global_args_free(&opts->ga);
585     free(opts);
586 }
587 
main_split(int argc,char ** argv)588 int main_split(int argc, char** argv)
589 {
590     int ret = 1;
591     char *arg_list = NULL;
592     parsed_opts_t* opts = parse_args(argc, argv);
593     if (!opts) goto cleanup_opts;
594     if (!opts->no_pg && !(arg_list = stringify_argv(argc+1, argv-1)))
595         goto cleanup_opts;
596     state_t* status = init(opts, arg_list);
597     if (!status) goto cleanup_opts;
598 
599     if (!split(status)) {
600         cleanup_state(status, false);
601         goto cleanup_opts;
602     }
603 
604     ret = cleanup_state(status, true);
605 
606 cleanup_opts:
607     cleanup_opts(opts);
608     free(arg_list);
609 
610     return ret;
611 }
612