1 /* bam_split.c -- split subcommand.
2
3 Copyright (C) 2013-2016,2018-2019 Genome Research Ltd.
4
5 Author: Martin Pollard <mp15@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
24
25 #include <config.h>
26
27 #include <string.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <stdbool.h>
31 #include <limits.h>
32 #include <unistd.h>
33 #include <regex.h>
34 #include <assert.h>
35 #include <htslib/sam.h>
36 #include <htslib/khash.h>
37 #include <htslib/kstring.h>
38 #include <htslib/cram.h>
39 #include "htslib/thread_pool.h"
40 #include "sam_opts.h"
41 #include "samtools.h"
42
43
44 KHASH_MAP_INIT_STR(c2i, int)
45
46 struct parsed_opts {
47 const char *merged_input_name;
48 const char *unaccounted_header_name;
49 const char *unaccounted_name;
50 const char *output_format_string;
51 bool verbose;
52 int no_pg;
53 sam_global_args ga;
54 };
55
56 typedef struct parsed_opts parsed_opts_t;
57
58 struct state {
59 samFile* merged_input_file;
60 sam_hdr_t* merged_input_header;
61 samFile* unaccounted_file;
62 sam_hdr_t* unaccounted_header;
63 size_t output_count;
64 char** rg_id;
65 char **rg_index_file_name;
66 char **rg_output_file_name;
67 samFile** rg_output_file;
68 sam_hdr_t** rg_output_header;
69 kh_c2i_t* rg_hash;
70 htsThreadPool p;
71 int write_index;
72 };
73
74 typedef struct state state_t;
75
76 static int cleanup_state(state_t* status, bool check_close);
77 static void cleanup_opts(parsed_opts_t* opts);
78
usage(FILE * write_to)79 static void usage(FILE *write_to)
80 {
81 fprintf(write_to,
82 "Usage: samtools split [-u <unaccounted.bam>] [-h <unaccounted_header.sam>]\n"
83 " [-f <format_string>] [-v] <merged.bam>\n"
84 "Options:\n"
85 " -f STRING output filename format string [\"%%*_%%#.%%.\"]\n"
86 " -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n"
87 " -h FILE2 ... and override the header with FILE2 (-u file only)\n"
88 " -v verbose output\n"
89 " --no-PG do not add a PG line\n");
90 sam_global_opt_help(write_to, "-....@..");
91 fprintf(write_to,
92 "\n"
93 "Format string expansions:\n"
94 " %%%% %%\n"
95 " %%* basename\n"
96 " %%# @RG index\n"
97 " %%! @RG ID\n"
98 " %%. filename extension for output format\n"
99 );
100 }
101
102 // Takes the command line options and turns them into something we can understand
parse_args(int argc,char ** argv)103 static parsed_opts_t* parse_args(int argc, char** argv)
104 {
105 if (argc == 1) { usage(stdout); return NULL; }
106
107 const char *optstring = "vf:h:u:@:";
108
109 static const struct option lopts[] = {
110 SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
111 {"no-PG", no_argument, NULL, 1},
112 { NULL, 0, NULL, 0 }
113 };
114
115 parsed_opts_t* retval = calloc(sizeof(parsed_opts_t), 1);
116 if (! retval ) { perror("cannot allocate option parsing memory"); return NULL; }
117
118 sam_global_args_init(&retval->ga);
119
120 int opt;
121 while ((opt = getopt_long(argc, argv, optstring, lopts, NULL)) != -1) {
122 switch (opt) {
123 case 'f':
124 retval->output_format_string = optarg;
125 break;
126 case 'h':
127 retval->unaccounted_header_name = optarg;
128 break;
129 case 'v':
130 retval->verbose = true;
131 break;
132 case 'u':
133 retval->unaccounted_name = optarg;
134 break;
135 case 1:
136 retval->no_pg = 1;
137 break;
138 default:
139 if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break;
140 /* else fall-through */
141 case '?':
142 usage(stdout);
143 free(retval);
144 return NULL;
145 }
146 }
147
148 if (retval->output_format_string == NULL) retval->output_format_string = "%*_%#.%.";
149
150 argc -= optind;
151 argv += optind;
152
153 if (argc != 1) {
154 print_error("split", "Invalid number of arguments: %d", argc);
155 usage(stderr);
156 free(retval);
157 return NULL;
158 }
159
160 retval->merged_input_name = argv[0];
161
162 return retval;
163 }
164
165 // Expands a output filename format string
expand_format_string(const char * format_string,const char * basename,const char * rg_id,const int rg_idx,const htsFormat * format)166 static char* expand_format_string(const char* format_string, const char* basename, const char* rg_id, const int rg_idx, const htsFormat *format)
167 {
168 kstring_t str = { 0, 0, NULL };
169 const char* pointer = format_string;
170 const char* next;
171 while ((next = strchr(pointer, '%')) != NULL) {
172 if (kputsn(pointer, next-pointer, &str) < 0) goto memfail;
173 ++next;
174 switch (*next) {
175 case '%':
176 if (kputc('%', &str) < 0) goto memfail;
177 break;
178 case '*':
179 if (kputs(basename, &str) < 0) goto memfail;
180 break;
181 case '#':
182 if (kputl(rg_idx, &str) < 0) goto memfail;
183 break;
184 case '!':
185 if (kputs(rg_id, &str) < 0) goto memfail;
186 break;
187 case '.':
188 // Only really need to cope with sam, bam, cram
189 if (format->format != unknown_format) {
190 if (kputs(hts_format_file_extension(format), &str) < 0)
191 goto memfail;
192 } else {
193 if (kputs("bam", &str) < 0) goto memfail;
194 }
195 break;
196 case '\0':
197 print_error("split", "Trailing %% in filename format string");
198 goto fail;
199 default:
200 // Error is: fprintf(stderr, "bad format string, unknown format specifier\n");
201 print_error("split", "Unknown specifier %%%c in filename format string", *next);
202 goto fail;
203 }
204 pointer = next + 1;
205 }
206 if (kputs(pointer, &str) < 0) goto memfail;
207 return ks_release(&str);
208
209 memfail:
210 print_error_errno("split", "Couldn't build output filename");
211 fail:
212 free(str.s);
213 return NULL;
214 }
215
216 // Parse the header, count the number of RG tags and return a list of their names
count_RG(sam_hdr_t * hdr,size_t * count,char *** output_name)217 static bool count_RG(sam_hdr_t* hdr, size_t* count, char*** output_name)
218 {
219 char **names = NULL;
220 kstring_t id_val = KS_INITIALIZE;
221 int i, n_rg = sam_hdr_count_lines(hdr, "RG");
222
223 if (n_rg < 0) {
224 print_error("split", "Failed to get @RG IDs");
225 *count = 0;
226 *output_name = NULL;
227 return false;
228 }
229
230 if (n_rg == 0) {
231 *count = 0;
232 *output_name = NULL;
233 return true;
234 }
235
236 names = calloc(n_rg, sizeof(names[0]));
237 if (!names) goto memfail;
238
239 for (i = 0; i < n_rg; i++) {
240 if (sam_hdr_find_tag_pos(hdr, "RG", i, "ID", &id_val) < 0) goto memfail;
241 names[i] = ks_release(&id_val);
242 }
243
244 *count = n_rg;
245 *output_name = names;
246 return true;
247
248 memfail:
249 print_error_errno("split", "Failed to get @RG IDs");
250 *count = 0;
251 *output_name = NULL;
252 ks_free(&id_val);
253 free(names);
254 return false;
255 }
256
header_compatible(sam_hdr_t * hdr1,sam_hdr_t * hdr2)257 static int header_compatible(sam_hdr_t *hdr1, sam_hdr_t *hdr2)
258 {
259 size_t n;
260 if (sam_hdr_nref(hdr1) != sam_hdr_nref(hdr2)) {
261 print_error("split",
262 "Unaccounted header contains wrong number of references");
263 return -1;
264 }
265 for (n = 0; n < sam_hdr_nref(hdr1); n++) {
266 hts_pos_t h1_len = sam_hdr_tid2len(hdr1, n);
267 hts_pos_t h2_len = sam_hdr_tid2len(hdr2, n);
268 if (h1_len != h2_len) {
269 print_error("split",
270 "Unaccounted header reference %zu \"%s\" is not the same length as in the input file",
271 n + 1, sam_hdr_tid2name(hdr2, n));
272 return -1;
273 }
274 }
275 return 0;
276 }
277
278 // Set the initial state
init(parsed_opts_t * opts,const char * arg_list)279 static state_t* init(parsed_opts_t* opts, const char *arg_list)
280 {
281 state_t* retval = calloc(sizeof(state_t), 1);
282 if (!retval) {
283 print_error_errno("split", "Initialisation failed");
284 return NULL;
285 }
286
287 if (opts->ga.nthreads > 0) {
288 if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) {
289 fprintf(stderr, "Error creating thread pool\n");
290 cleanup_state(retval, false);
291 return NULL;
292 }
293 }
294
295 retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
296 if (!retval->merged_input_file) {
297 print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name);
298 cleanup_state(retval, false);
299 return NULL;
300 }
301 if (retval->p.pool)
302 hts_set_opt(retval->merged_input_file, HTS_OPT_THREAD_POOL, &retval->p);
303 retval->merged_input_header = sam_hdr_read(retval->merged_input_file);
304 if (retval->merged_input_header == NULL) {
305 print_error("split", "Could not read header from \"%s\"", opts->merged_input_name);
306 cleanup_state(retval, false);
307 return NULL;
308 }
309
310 if (opts->unaccounted_name) {
311 if (opts->unaccounted_header_name) {
312 samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in);
313 if (!hdr_load) {
314 print_error_errno("split", "Could not open unaccounted header file \"%s\"", opts->unaccounted_header_name);
315 cleanup_state(retval, false);
316 return NULL;
317 }
318 retval->unaccounted_header = sam_hdr_read(hdr_load);
319 if (retval->unaccounted_header == NULL) {
320 print_error("split", "Could not read header from \"%s\"", opts->unaccounted_header_name);
321 cleanup_state(retval, false);
322 sam_close(hdr_load);
323 return NULL;
324 }
325 sam_close(hdr_load);
326 if (header_compatible(retval->merged_input_header,
327 retval->unaccounted_header) != 0) {
328 cleanup_state(retval, false);
329 return NULL;
330 }
331 } else {
332 retval->unaccounted_header = sam_hdr_dup(retval->merged_input_header);
333 if (!opts->no_pg && sam_hdr_add_pg(retval->unaccounted_header, "samtools",
334 "VN", samtools_version(),
335 arg_list ? "CL": NULL,
336 arg_list ? arg_list : NULL,
337 NULL)) {
338 print_error("split", "Could not rewrite header for \"%s\"", opts->unaccounted_name);
339 cleanup_state(retval, false);
340 return NULL;
341 }
342 }
343
344 retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
345 if (retval->unaccounted_file == NULL) {
346 print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name);
347 cleanup_state(retval, false);
348 return NULL;
349 }
350 if (retval->p.pool)
351 hts_set_opt(retval->unaccounted_file, HTS_OPT_THREAD_POOL, &retval->p);
352 }
353
354 // Open output files for RGs
355 if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL;
356 if (opts->verbose) fprintf(stderr, "@RG's found %zu\n",retval->output_count);
357 // Prevent calloc(0, size);
358 size_t num = retval->output_count ? retval->output_count : 1;
359 retval->rg_index_file_name = (char **)calloc(num, sizeof(char *));
360 retval->rg_output_file_name = (char **)calloc(num, sizeof(char *));
361 retval->rg_output_file = (samFile**)calloc(num, sizeof(samFile*));
362 retval->rg_output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*));
363 retval->rg_hash = kh_init_c2i();
364 if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header ||
365 !retval->rg_hash || !retval->rg_index_file_name) {
366 print_error_errno("split", "Could not initialise output file array");
367 cleanup_state(retval, false);
368 return NULL;
369 }
370
371 char* dirsep = strrchr(opts->merged_input_name, '/');
372 char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
373 if (!input_base_name) {
374 print_error_errno("split", "Filename manipulation failed");
375 cleanup_state(retval, false);
376 return NULL;
377 }
378 char* extension = strrchr(input_base_name, '.');
379 if (extension) *extension = '\0';
380
381 size_t i;
382 for (i = 0; i < retval->output_count; i++) {
383 char* output_filename = NULL;
384
385 output_filename = expand_format_string(opts->output_format_string,
386 input_base_name,
387 retval->rg_id[i], i,
388 &opts->ga.out);
389
390 if ( output_filename == NULL ) {
391 cleanup_state(retval, false);
392 free(input_base_name);
393 return NULL;
394 }
395
396 retval->rg_output_file_name[i] = output_filename;
397 retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
398 if (retval->rg_output_file[i] == NULL) {
399 print_error_errno("split", "Could not open \"%s\"", output_filename);
400 cleanup_state(retval, false);
401 free(input_base_name);
402 return NULL;
403 }
404 if (retval->p.pool)
405 hts_set_opt(retval->rg_output_file[i], HTS_OPT_THREAD_POOL, &retval->p);
406
407 // Record index in hash
408 int ret;
409 khiter_t iter = kh_put_c2i(retval->rg_hash, retval->rg_id[i], &ret);
410 if (ret < 0) {
411 print_error_errno("split", "Couldn't add @RG ID to look-up table");
412 cleanup_state(retval, false);
413 free(input_base_name);
414 return NULL;
415 }
416 kh_val(retval->rg_hash,iter) = i;
417
418 // Set and edit header
419 retval->rg_output_header[i] = sam_hdr_dup(retval->merged_input_header);
420 if (sam_hdr_remove_except(retval->rg_output_header[i], "RG", "ID", retval->rg_id[i]) ||
421 (!opts->no_pg &&
422 sam_hdr_add_pg(retval->rg_output_header[i], "samtools",
423 "VN", samtools_version(),
424 arg_list ? "CL": NULL,
425 arg_list ? arg_list : NULL,
426 NULL))) {
427 print_error("split", "Could not rewrite header for \"%s\"", output_filename);
428 cleanup_state(retval, false);
429 free(input_base_name);
430 return NULL;
431 }
432 }
433
434 free(input_base_name);
435 retval->write_index = opts->ga.write_index;
436
437 return retval;
438 }
439
split(state_t * state)440 static bool split(state_t* state)
441 {
442 if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
443 print_error_errno("split", "Could not write output file header");
444 return false;
445 }
446 size_t i;
447 for (i = 0; i < state->output_count; i++) {
448 if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
449 print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]);
450 return false;
451 }
452 if (state->write_index) {
453 state->rg_index_file_name[i] = auto_index(state->rg_output_file[i],
454 state->rg_output_file_name[i],
455 state->rg_output_header[i]);
456 if (!state->rg_index_file_name[i]) {
457 print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]);
458 return false;
459 }
460 }
461 }
462
463 bam1_t* file_read = bam_init1();
464 // Read the first record
465 int r;
466 if ((r=sam_read1(state->merged_input_file, state->merged_input_header, file_read)) < 0) {
467 // Nothing more to read? Ignore this file
468 bam_destroy1(file_read);
469 file_read = NULL;
470 if (r < -1) {
471 print_error("split", "Could not read first input record");
472 return false;
473 }
474 }
475
476 while (file_read != NULL) {
477 // Get RG tag from read and look it up in hash to find file to output it to
478 uint8_t* tag = bam_aux_get(file_read, "RG");
479 khiter_t iter;
480 if ( tag != NULL ) {
481 char* rg = bam_aux2Z(tag);
482 iter = kh_get_c2i(state->rg_hash, rg);
483 } else {
484 iter = kh_end(state->rg_hash);
485 }
486
487 // Write the read out to correct file
488 if (iter != kh_end(state->rg_hash)) {
489 // if found write to the appropriate untangled bam
490 int i = kh_val(state->rg_hash,iter);
491 if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
492 print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]);
493 bam_destroy1(file_read);
494 return false;
495 }
496 } else {
497 // otherwise write to the unaccounted bam if there is one or fail
498 if (state->unaccounted_file == NULL) {
499 if (tag) {
500 fprintf(stderr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), bam_aux2Z(tag));
501 } else {
502 fprintf(stderr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read));
503 }
504 bam_destroy1(file_read);
505 return false;
506 } else {
507 if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
508 print_error_errno("split", "Could not write to unaccounted output file");
509 bam_destroy1(file_read);
510 return false;
511 }
512 }
513 }
514
515 // Replace written read with the next one to process
516 if ((r=sam_read1(state->merged_input_file, state->merged_input_header, file_read)) < 0) {
517 // Nothing more to read? Ignore this file in future
518 bam_destroy1(file_read);
519 file_read = NULL;
520 if (r < -1) {
521 print_error("split", "Could not read input record");
522 return false;
523 }
524 }
525 }
526
527 if (state->write_index) {
528 for (i = 0; i < state->output_count; i++) {
529 if (sam_idx_save(state->rg_output_file[i]) < 0) {
530 print_error_errno("split", "writing index failed");
531 return false;
532 }
533 free(state->rg_index_file_name[i]);
534 }
535 }
536
537 return true;
538 }
539
cleanup_state(state_t * status,bool check_close)540 static int cleanup_state(state_t* status, bool check_close)
541 {
542 int ret = 0;
543
544 if (!status) return 0;
545 if (status->unaccounted_header) sam_hdr_destroy(status->unaccounted_header);
546 if (status->unaccounted_file) {
547 if (sam_close(status->unaccounted_file) < 0 && check_close) {
548 print_error("split", "Error on closing unaccounted file");
549 ret = -1;
550 }
551 }
552 sam_close(status->merged_input_file);
553 size_t i;
554 for (i = 0; i < status->output_count; i++) {
555 if (status->rg_output_header && status->rg_output_header[i])
556 sam_hdr_destroy(status->rg_output_header[i]);
557 if (status->rg_output_file && status->rg_output_file[i]) {
558 if (sam_close(status->rg_output_file[i]) < 0 && check_close) {
559 print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]);
560 ret = -1;
561 }
562 }
563 if (status->rg_id) free(status->rg_id[i]);
564 if (status->rg_output_file_name) free(status->rg_output_file_name[i]);
565 }
566 if (status->merged_input_header)
567 sam_hdr_destroy(status->merged_input_header);
568 free(status->rg_output_header);
569 free(status->rg_output_file);
570 free(status->rg_output_file_name);
571 free(status->rg_index_file_name);
572 kh_destroy_c2i(status->rg_hash);
573 free(status->rg_id);
574 if (status->p.pool)
575 hts_tpool_destroy(status->p.pool);
576 free(status);
577
578 return ret;
579 }
580
cleanup_opts(parsed_opts_t * opts)581 static void cleanup_opts(parsed_opts_t* opts)
582 {
583 if (!opts) return;
584 sam_global_args_free(&opts->ga);
585 free(opts);
586 }
587
main_split(int argc,char ** argv)588 int main_split(int argc, char** argv)
589 {
590 int ret = 1;
591 char *arg_list = NULL;
592 parsed_opts_t* opts = parse_args(argc, argv);
593 if (!opts) goto cleanup_opts;
594 if (!opts->no_pg && !(arg_list = stringify_argv(argc+1, argv-1)))
595 goto cleanup_opts;
596 state_t* status = init(opts, arg_list);
597 if (!status) goto cleanup_opts;
598
599 if (!split(status)) {
600 cleanup_state(status, false);
601 goto cleanup_opts;
602 }
603
604 ret = cleanup_state(status, true);
605
606 cleanup_opts:
607 cleanup_opts(opts);
608 free(arg_list);
609
610 return ret;
611 }
612