1 /* bam_addrprg.c -- samtools command to add or replace readgroups.
2
3 Copyright (c) 2013, 2015-2017, 2019-2021 Genome Research Limited.
4
5 Author: Martin O. Pollard <mp15@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
24
25 #include <config.h>
26
27 #include <htslib/sam.h>
28 #include <htslib/kstring.h>
29 #include "samtools.h"
30 #include "htslib/thread_pool.h"
31 #include "sam_opts.h"
32 #include <string.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <stdbool.h>
36 #include <limits.h>
37 #include <assert.h>
38 #include <unistd.h>
39
40 typedef enum {
41 overwrite_all,
42 orphan_only,
43 } rg_mode;
44
45 struct parsed_opts {
46 char* input_name;
47 char* output_name;
48 char* rg_id;
49 char* rg_line;
50 int no_pg;
51 rg_mode mode;
52 sam_global_args ga;
53 htsThreadPool p;
54 int uncompressed;
55 int overwrite_hdr_rg;
56 };
57
58 struct state;
59 typedef struct parsed_opts parsed_opts_t;
60 typedef struct state state_t;
61
62 struct state {
63 samFile* input_file;
64 sam_hdr_t* input_header;
65 samFile* output_file;
66 sam_hdr_t* output_header;
67 char* rg_id;
68 void (*mode_func)(const state_t*, bam1_t*);
69 };
70
cleanup_opts(parsed_opts_t * opts)71 static void cleanup_opts(parsed_opts_t* opts)
72 {
73 if (!opts) return;
74 free(opts->rg_id);
75 free(opts->output_name);
76 free(opts->input_name);
77 free(opts->rg_line);
78 if (opts->p.pool) hts_tpool_destroy(opts->p.pool);
79 sam_global_args_free(&opts->ga);
80 free(opts);
81 }
82
cleanup_state(state_t * state)83 static void cleanup_state(state_t* state)
84 {
85 if (!state) return;
86 free(state->rg_id);
87 if (state->output_file) sam_close(state->output_file);
88 sam_hdr_destroy(state->output_header);
89 if (state->input_file) sam_close(state->input_file);
90 sam_hdr_destroy(state->input_header);
91 free(state);
92 }
93
94 // Converts \t and \n into real tabs and newlines
basic_unescape(const char * in)95 static char* basic_unescape(const char* in)
96 {
97 assert(in);
98 char *ptr, *out;
99 out = ptr = malloc(strlen(in)+1);
100 size_t size = 0;
101 while (*in) {
102 if (*in == '\\') {
103 ++in;
104 if (*in == '\0') {
105 fprintf(stderr, "[%s] Unterminated escape sequence.\n", __func__);
106 free(out);
107 return NULL;
108 }
109 switch (*in) {
110 case '\\':
111 *ptr = '\\';
112 break;
113 case 't':
114 *ptr = '\t';
115 break;
116 case 'n':
117 fprintf(stderr, "[%s] \\n in escape sequence is not supported.\n", __func__);
118 free(out);
119 return NULL;
120 default:
121 fprintf(stderr, "[%s] Unsupported escape sequence.\n", __func__);
122 free(out);
123 return NULL;
124 }
125 } else {
126 *ptr = *in;
127 }
128 ++in;
129 ++ptr;
130 ++size;
131 }
132 *ptr = '\0';
133 ++size;
134 char* tmp = (char*)realloc(out, size);
135 if (!tmp) {
136 free(out);
137 }
138 return tmp;
139 }
140
141 // Malloc a string containing [s,slim) or to the end of s if slim is NULL.
142 // If lenp is non-NULL, stores the length of the resulting string there.
dup_substring(const char * s,const char * slim,size_t * lenp)143 static char *dup_substring(const char *s, const char *slim, size_t *lenp)
144 {
145 size_t len = slim? (slim - s) : strlen(s);
146 char *ns = malloc(len+1);
147 if (ns == NULL) return NULL;
148 memcpy(ns, s, len);
149 ns[len] = '\0';
150 if (lenp) *lenp = len;
151 return ns;
152 }
153
154
155 // Given a @RG line return the id
get_rg_id(const char * line)156 static char* get_rg_id(const char *line)
157 {
158 const char *id = strstr(line, "\tID:");
159 if (! id) return NULL;
160
161 id += 4;
162 return dup_substring(id, strchr(id, '\t'), NULL);
163 }
164
165
usage(FILE * fp)166 static void usage(FILE *fp)
167 {
168 fprintf(fp,
169 "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-m orphan_only|overwrite_all] [-o <output.bam>] <input.bam>\n"
170 "\n"
171 "Options:\n"
172 " -m MODE Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n"
173 " -o FILE Where to write output to [stdout]\n"
174 " -r STRING @RG line text\n"
175 " -R STRING ID of @RG line in existing header to use\n"
176 " -u Output uncompressed data\n"
177 " -w Overwrite an existing @RG line\n"
178 " --no-PG Do not add a PG line\n"
179 );
180 sam_global_opt_help(fp, "..O..@..");
181 }
182
parse_args(int argc,char ** argv,parsed_opts_t ** opts)183 static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
184 {
185 *opts = NULL;
186 int n;
187
188 if (argc == 1) { usage(stdout); return true; }
189
190 parsed_opts_t* retval = calloc(1, sizeof(parsed_opts_t));
191 if (! retval ) {
192 fprintf(stderr, "[%s] Out of memory allocating parsed_opts_t\n", __func__);
193 return false;
194 }
195 // Set defaults
196 retval->mode = overwrite_all;
197 sam_global_args_init(&retval->ga);
198 static const struct option lopts[] = {
199 SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, '@'),
200 {"no-PG", no_argument, NULL, 1},
201 { NULL, 0, NULL, 0 }
202 };
203 kstring_t rg_line = {0,0,NULL};
204
205 while ((n = getopt_long(argc, argv, "r:R:m:o:O:h@:uw", lopts, NULL)) >= 0) {
206 switch (n) {
207 case 'r':
208 // Are we adding to existing rg line?
209 if (ks_len(&rg_line) == 0) {
210 if (strlen(optarg)<3 || (optarg[0] != '@' && optarg[1] != 'R' && optarg[2] != 'G')) {
211 kputs("@RG\t", &rg_line);
212 }
213 } else {
214 kputs("\t", &rg_line);
215 }
216 kputs(optarg, &rg_line);
217 break;
218 case 'R':
219 retval->rg_id = strdup(optarg);
220 break;
221 case 'm': {
222 if (strcmp(optarg, "overwrite_all") == 0) {
223 retval->mode = overwrite_all;
224 } else if (strcmp(optarg, "orphan_only") == 0) {
225 retval->mode = orphan_only;
226 } else {
227 usage(stderr);
228 return false;
229 }
230 break;
231 }
232 case 'o':
233 retval->output_name = strdup(optarg);
234 break;
235 case 'h':
236 usage(stdout);
237 free(retval);
238 return true;
239 case 1:
240 retval->no_pg = 1;
241 break;
242 case 'u':
243 retval->uncompressed = 1;
244 break;
245 case 'w':
246 retval->overwrite_hdr_rg = 1;
247 break;
248 case '?':
249 usage(stderr);
250 free(retval);
251 return false;
252 case 'O':
253 default:
254 if (parse_sam_global_opt(n, optarg, lopts, &retval->ga) == 0) break;
255 usage(stderr);
256 free(retval);
257 return false;
258 }
259 }
260 retval->rg_line = ks_release(&rg_line);
261
262 if (argc-optind < 1) {
263 fprintf(stderr, "You must specify an input file.\n");
264 usage(stderr);
265 cleanup_opts(retval);
266 return false;
267 }
268 if (retval->rg_id && retval->rg_line) {
269 fprintf(stderr, "The options -r and -R are mutually exclusive.\n");
270 cleanup_opts(retval);
271 return false;
272 }
273
274 if (retval->rg_line)
275 {
276 char* tmp = basic_unescape(retval->rg_line);
277
278 if ((retval->rg_id = get_rg_id(tmp)) == NULL) {
279 fprintf(stderr, "[%s] The supplied RG line lacks an ID tag.\n", __func__);
280 free(tmp);
281 cleanup_opts(retval);
282 return false;
283 }
284 free(retval->rg_line);
285 retval->rg_line = tmp;
286 }
287 retval->input_name = strdup(argv[optind+0]);
288
289 if (retval->ga.nthreads > 0) {
290 if (!(retval->p.pool = hts_tpool_init(retval->ga.nthreads))) {
291 fprintf(stderr, "Error creating thread pool\n");
292 return false;
293 }
294 }
295
296 *opts = retval;
297 return true;
298 }
299
overwrite_all_func(const state_t * state,bam1_t * file_read)300 static void overwrite_all_func(const state_t* state, bam1_t* file_read)
301 {
302 uint8_t* data = (uint8_t*)strdup(state->rg_id);
303 int len = strlen(state->rg_id)+1;
304 // If the old exists delete it
305 uint8_t* old = bam_aux_get(file_read, "RG");
306 if (old != NULL) {
307 bam_aux_del(file_read, old);
308 }
309
310 bam_aux_append(file_read, "RG", 'Z', len, data);
311 free(data);
312 }
313
orphan_only_func(const state_t * state,bam1_t * file_read)314 static void orphan_only_func(const state_t* state, bam1_t* file_read)
315 {
316 uint8_t* data = (uint8_t*)strdup(state->rg_id);
317 int len = strlen(state->rg_id)+1;
318 // If the old exists don't do anything
319 uint8_t* old = bam_aux_get(file_read, "RG");
320 if (old == NULL) {
321 bam_aux_append(file_read, "RG",'Z',len,data);
322 }
323 free(data);
324 }
325
init(const parsed_opts_t * opts,state_t ** state_out)326 static bool init(const parsed_opts_t* opts, state_t** state_out) {
327 char output_mode[9] = "w";
328 state_t* retval = (state_t*) calloc(1, sizeof(state_t));
329
330 if (retval == NULL) {
331 fprintf(stderr, "[init] Out of memory allocating state struct.\n");
332 return false;
333 }
334 *state_out = retval;
335
336 // Open files
337 retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
338 if (retval->input_file == NULL) {
339 print_error_errno("addreplacerg", "could not open \"%s\"", opts->input_name);
340 return false;
341 }
342 retval->input_header = sam_hdr_read(retval->input_file);
343
344 retval->output_header = sam_hdr_dup(retval->input_header);
345
346 if (opts->uncompressed)
347 strcat(output_mode, "0");
348 if (opts->output_name) // File format auto-detection
349 sam_open_mode(output_mode + strlen(output_mode),
350 opts->output_name, NULL);
351 retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out);
352
353 if (retval->output_file == NULL) {
354 print_error_errno("addreplacerg", "could not create \"%s\"", opts->output_name);
355 return false;
356 }
357
358 if (opts->p.pool) {
359 hts_set_opt(retval->input_file, HTS_OPT_THREAD_POOL, &opts->p);
360 hts_set_opt(retval->output_file, HTS_OPT_THREAD_POOL, &opts->p);
361 }
362
363 if (opts->rg_line) {
364 // Append new RG line to header.
365 // Check does not already exist
366 kstring_t hdr_line = { 0, 0, NULL };
367 if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) {
368 if (opts->overwrite_hdr_rg) {
369 if(-1 == sam_hdr_remove_line_id(retval->output_header, "RG", "ID", opts->rg_id)) {
370 fprintf(stderr, "[init] Error removing the RG line with ID:%s from the output header.\n", opts->rg_id);
371 ks_free(&hdr_line);
372 return false;
373 }
374 } else {
375 fprintf(stderr, "[init] RG line with ID:%s already present in the header. Use -w to overwrite.\n", opts->rg_id);
376 ks_free(&hdr_line);
377 return false;
378 }
379 }
380 ks_free(&hdr_line);
381
382 if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) {
383 fprintf(stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id);
384 return false;
385 }
386 if (opts->mode == overwrite_all &&
387 -1 == sam_hdr_remove_except(retval->output_header, "RG", "ID", opts->rg_id)) {
388 fprintf(stderr, "[init] Error removing the old RG lines from the output header.\n");
389 return false;
390 }
391 retval->rg_id = strdup(opts->rg_id);
392 } else {
393 if (opts->rg_id) {
394 // Confirm what has been supplied exists
395 kstring_t hdr_line = { 0, 0, NULL };
396 if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) < 0) {
397 fprintf(stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n");
398 return false;
399 }
400 retval->rg_id = strdup(opts->rg_id);
401 ks_free(&hdr_line);
402 } else {
403 kstring_t rg_id = { 0, 0, NULL };
404 if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) {
405 fprintf(stderr, "No RG specified on command line or in existing header.\n");
406 return false;
407 }
408 retval->rg_id = ks_release(&rg_id);
409 }
410 }
411
412 switch (opts->mode) {
413 case overwrite_all:
414 retval->mode_func = &overwrite_all_func;
415 break;
416 case orphan_only:
417 retval->mode_func = &orphan_only_func;
418 break;
419 }
420
421 return true;
422 }
423
readgroupise(parsed_opts_t * opts,state_t * state,char * arg_list)424 static bool readgroupise(parsed_opts_t *opts, state_t* state, char *arg_list)
425 {
426 if (!opts->no_pg && sam_hdr_add_pg(state->output_header, "samtools",
427 "VN", samtools_version(),
428 arg_list ? "CL": NULL,
429 arg_list ? arg_list : NULL,
430 NULL))
431 return false;
432
433 if (sam_hdr_write(state->output_file, state->output_header) != 0) {
434 print_error_errno("addreplacerg", "[%s] Could not write header to output file", __func__);
435 return false;
436 }
437 char *idx_fn = NULL;
438 if (opts->ga.write_index) {
439 if (!(idx_fn = auto_index(state->output_file, opts->output_name, state->output_header)))
440 return false;
441 }
442
443 bam1_t* file_read = bam_init1();
444 int ret;
445 while ((ret = sam_read1(state->input_file, state->input_header, file_read)) >= 0) {
446 state->mode_func(state, file_read);
447
448 if (sam_write1(state->output_file, state->output_header, file_read) < 0) {
449 print_error_errno("addreplacerg", "[%s] Could not write read to output file", __func__);
450 bam_destroy1(file_read);
451 free(idx_fn);
452 return false;
453 }
454 }
455 bam_destroy1(file_read);
456 if (ret != -1) {
457 print_error_errno("addreplacerg", "[%s] Error reading from input file", __func__);
458 free(idx_fn);
459 return false;
460 } else {
461
462 if (opts->ga.write_index) {
463 if (sam_idx_save(state->output_file) < 0) {
464 print_error_errno("addreplacerg", "[%s] Writing index failed", __func__);
465 free(idx_fn);
466 return false;
467 }
468 }
469 free(idx_fn);
470 return true;
471 }
472 }
473
main_addreplacerg(int argc,char ** argv)474 int main_addreplacerg(int argc, char** argv)
475 {
476 parsed_opts_t* opts = NULL;
477 state_t* state = NULL;
478 char *arg_list = stringify_argv(argc+1, argv-1);
479 if (!arg_list)
480 return EXIT_FAILURE;
481
482 if (!parse_args(argc, argv, &opts)) goto error;
483 if (opts) { // Not an error but user doesn't want us to proceed
484 if (!init(opts, &state) || !readgroupise(opts, state, arg_list))
485 goto error;
486 }
487
488 cleanup_state(state);
489 cleanup_opts(opts);
490 free(arg_list);
491
492 return EXIT_SUCCESS;
493 error:
494 cleanup_state(state);
495 cleanup_opts(opts);
496 free(arg_list);
497
498 return EXIT_FAILURE;
499 }
500