1 #include "samtools.pysam.h"
2
3 /* bam_addrprg.c -- samtools command to add or replace readgroups.
4
5 Copyright (c) 2013, 2015-2017, 2019-2021 Genome Research Limited.
6
7 Author: Martin O. Pollard <mp15@sanger.ac.uk>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE. */
26
27 #include <config.h>
28
29 #include <htslib/sam.h>
30 #include <htslib/kstring.h>
31 #include "samtools.h"
32 #include "htslib/thread_pool.h"
33 #include "sam_opts.h"
34 #include <string.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <stdbool.h>
38 #include <limits.h>
39 #include <assert.h>
40 #include <unistd.h>
41
42 typedef enum {
43 overwrite_all,
44 orphan_only,
45 } rg_mode;
46
47 struct parsed_opts {
48 char* input_name;
49 char* output_name;
50 char* rg_id;
51 char* rg_line;
52 int no_pg;
53 rg_mode mode;
54 sam_global_args ga;
55 htsThreadPool p;
56 int uncompressed;
57 int overwrite_hdr_rg;
58 };
59
60 struct state;
61 typedef struct parsed_opts parsed_opts_t;
62 typedef struct state state_t;
63
64 struct state {
65 samFile* input_file;
66 sam_hdr_t* input_header;
67 samFile* output_file;
68 sam_hdr_t* output_header;
69 char* rg_id;
70 void (*mode_func)(const state_t*, bam1_t*);
71 };
72
cleanup_opts(parsed_opts_t * opts)73 static void cleanup_opts(parsed_opts_t* opts)
74 {
75 if (!opts) return;
76 free(opts->rg_id);
77 free(opts->output_name);
78 free(opts->input_name);
79 free(opts->rg_line);
80 if (opts->p.pool) hts_tpool_destroy(opts->p.pool);
81 sam_global_args_free(&opts->ga);
82 free(opts);
83 }
84
cleanup_state(state_t * state)85 static void cleanup_state(state_t* state)
86 {
87 if (!state) return;
88 free(state->rg_id);
89 if (state->output_file) sam_close(state->output_file);
90 sam_hdr_destroy(state->output_header);
91 if (state->input_file) sam_close(state->input_file);
92 sam_hdr_destroy(state->input_header);
93 free(state);
94 }
95
96 // Converts \t and \n into real tabs and newlines
basic_unescape(const char * in)97 static char* basic_unescape(const char* in)
98 {
99 assert(in);
100 char *ptr, *out;
101 out = ptr = malloc(strlen(in)+1);
102 size_t size = 0;
103 while (*in) {
104 if (*in == '\\') {
105 ++in;
106 if (*in == '\0') {
107 fprintf(samtools_stderr, "[%s] Unterminated escape sequence.\n", __func__);
108 free(out);
109 return NULL;
110 }
111 switch (*in) {
112 case '\\':
113 *ptr = '\\';
114 break;
115 case 't':
116 *ptr = '\t';
117 break;
118 case 'n':
119 fprintf(samtools_stderr, "[%s] \\n in escape sequence is not supported.\n", __func__);
120 free(out);
121 return NULL;
122 default:
123 fprintf(samtools_stderr, "[%s] Unsupported escape sequence.\n", __func__);
124 free(out);
125 return NULL;
126 }
127 } else {
128 *ptr = *in;
129 }
130 ++in;
131 ++ptr;
132 ++size;
133 }
134 *ptr = '\0';
135 ++size;
136 char* tmp = (char*)realloc(out, size);
137 if (!tmp) {
138 free(out);
139 }
140 return tmp;
141 }
142
143 // Malloc a string containing [s,slim) or to the end of s if slim is NULL.
144 // If lenp is non-NULL, stores the length of the resulting string there.
dup_substring(const char * s,const char * slim,size_t * lenp)145 static char *dup_substring(const char *s, const char *slim, size_t *lenp)
146 {
147 size_t len = slim? (slim - s) : strlen(s);
148 char *ns = malloc(len+1);
149 if (ns == NULL) return NULL;
150 memcpy(ns, s, len);
151 ns[len] = '\0';
152 if (lenp) *lenp = len;
153 return ns;
154 }
155
156
157 // Given a @RG line return the id
get_rg_id(const char * line)158 static char* get_rg_id(const char *line)
159 {
160 const char *id = strstr(line, "\tID:");
161 if (! id) return NULL;
162
163 id += 4;
164 return dup_substring(id, strchr(id, '\t'), NULL);
165 }
166
167
usage(FILE * fp)168 static void usage(FILE *fp)
169 {
170 fprintf(fp,
171 "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-m orphan_only|overwrite_all] [-o <output.bam>] <input.bam>\n"
172 "\n"
173 "Options:\n"
174 " -m MODE Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n"
175 " -o FILE Where to write output to [samtools_stdout]\n"
176 " -r STRING @RG line text\n"
177 " -R STRING ID of @RG line in existing header to use\n"
178 " -u Output uncompressed data\n"
179 " -w Overwrite an existing @RG line\n"
180 " --no-PG Do not add a PG line\n"
181 );
182 sam_global_opt_help(fp, "..O..@..");
183 }
184
parse_args(int argc,char ** argv,parsed_opts_t ** opts)185 static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
186 {
187 *opts = NULL;
188 int n;
189
190 if (argc == 1) { usage(samtools_stdout); return true; }
191
192 parsed_opts_t* retval = calloc(1, sizeof(parsed_opts_t));
193 if (! retval ) {
194 fprintf(samtools_stderr, "[%s] Out of memory allocating parsed_opts_t\n", __func__);
195 return false;
196 }
197 // Set defaults
198 retval->mode = overwrite_all;
199 sam_global_args_init(&retval->ga);
200 static const struct option lopts[] = {
201 SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, '@'),
202 {"no-PG", no_argument, NULL, 1},
203 { NULL, 0, NULL, 0 }
204 };
205 kstring_t rg_line = {0,0,NULL};
206
207 while ((n = getopt_long(argc, argv, "r:R:m:o:O:h@:uw", lopts, NULL)) >= 0) {
208 switch (n) {
209 case 'r':
210 // Are we adding to existing rg line?
211 if (ks_len(&rg_line) == 0) {
212 if (strlen(optarg)<3 || (optarg[0] != '@' && optarg[1] != 'R' && optarg[2] != 'G')) {
213 kputs("@RG\t", &rg_line);
214 }
215 } else {
216 kputs("\t", &rg_line);
217 }
218 kputs(optarg, &rg_line);
219 break;
220 case 'R':
221 retval->rg_id = strdup(optarg);
222 break;
223 case 'm': {
224 if (strcmp(optarg, "overwrite_all") == 0) {
225 retval->mode = overwrite_all;
226 } else if (strcmp(optarg, "orphan_only") == 0) {
227 retval->mode = orphan_only;
228 } else {
229 usage(samtools_stderr);
230 return false;
231 }
232 break;
233 }
234 case 'o':
235 retval->output_name = strdup(optarg);
236 break;
237 case 'h':
238 usage(samtools_stdout);
239 free(retval);
240 return true;
241 case 1:
242 retval->no_pg = 1;
243 break;
244 case 'u':
245 retval->uncompressed = 1;
246 break;
247 case 'w':
248 retval->overwrite_hdr_rg = 1;
249 break;
250 case '?':
251 usage(samtools_stderr);
252 free(retval);
253 return false;
254 case 'O':
255 default:
256 if (parse_sam_global_opt(n, optarg, lopts, &retval->ga) == 0) break;
257 usage(samtools_stderr);
258 free(retval);
259 return false;
260 }
261 }
262 retval->rg_line = ks_release(&rg_line);
263
264 if (argc-optind < 1) {
265 fprintf(samtools_stderr, "You must specify an input file.\n");
266 usage(samtools_stderr);
267 cleanup_opts(retval);
268 return false;
269 }
270 if (retval->rg_id && retval->rg_line) {
271 fprintf(samtools_stderr, "The options -r and -R are mutually exclusive.\n");
272 cleanup_opts(retval);
273 return false;
274 }
275
276 if (retval->rg_line)
277 {
278 char* tmp = basic_unescape(retval->rg_line);
279
280 if ((retval->rg_id = get_rg_id(tmp)) == NULL) {
281 fprintf(samtools_stderr, "[%s] The supplied RG line lacks an ID tag.\n", __func__);
282 free(tmp);
283 cleanup_opts(retval);
284 return false;
285 }
286 free(retval->rg_line);
287 retval->rg_line = tmp;
288 }
289 retval->input_name = strdup(argv[optind+0]);
290
291 if (retval->ga.nthreads > 0) {
292 if (!(retval->p.pool = hts_tpool_init(retval->ga.nthreads))) {
293 fprintf(samtools_stderr, "Error creating thread pool\n");
294 return false;
295 }
296 }
297
298 *opts = retval;
299 return true;
300 }
301
overwrite_all_func(const state_t * state,bam1_t * file_read)302 static void overwrite_all_func(const state_t* state, bam1_t* file_read)
303 {
304 uint8_t* data = (uint8_t*)strdup(state->rg_id);
305 int len = strlen(state->rg_id)+1;
306 // If the old exists delete it
307 uint8_t* old = bam_aux_get(file_read, "RG");
308 if (old != NULL) {
309 bam_aux_del(file_read, old);
310 }
311
312 bam_aux_append(file_read, "RG", 'Z', len, data);
313 free(data);
314 }
315
orphan_only_func(const state_t * state,bam1_t * file_read)316 static void orphan_only_func(const state_t* state, bam1_t* file_read)
317 {
318 uint8_t* data = (uint8_t*)strdup(state->rg_id);
319 int len = strlen(state->rg_id)+1;
320 // If the old exists don't do anything
321 uint8_t* old = bam_aux_get(file_read, "RG");
322 if (old == NULL) {
323 bam_aux_append(file_read, "RG",'Z',len,data);
324 }
325 free(data);
326 }
327
init(const parsed_opts_t * opts,state_t ** state_out)328 static bool init(const parsed_opts_t* opts, state_t** state_out) {
329 char output_mode[9] = "w";
330 state_t* retval = (state_t*) calloc(1, sizeof(state_t));
331
332 if (retval == NULL) {
333 fprintf(samtools_stderr, "[init] Out of memory allocating state struct.\n");
334 return false;
335 }
336 *state_out = retval;
337
338 // Open files
339 retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
340 if (retval->input_file == NULL) {
341 print_error_errno("addreplacerg", "could not open \"%s\"", opts->input_name);
342 return false;
343 }
344 retval->input_header = sam_hdr_read(retval->input_file);
345
346 retval->output_header = sam_hdr_dup(retval->input_header);
347
348 if (opts->uncompressed)
349 strcat(output_mode, "0");
350 if (opts->output_name) // File format auto-detection
351 sam_open_mode(output_mode + strlen(output_mode),
352 opts->output_name, NULL);
353 retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out);
354
355 if (retval->output_file == NULL) {
356 print_error_errno("addreplacerg", "could not create \"%s\"", opts->output_name);
357 return false;
358 }
359
360 if (opts->p.pool) {
361 hts_set_opt(retval->input_file, HTS_OPT_THREAD_POOL, &opts->p);
362 hts_set_opt(retval->output_file, HTS_OPT_THREAD_POOL, &opts->p);
363 }
364
365 if (opts->rg_line) {
366 // Append new RG line to header.
367 // Check does not already exist
368 kstring_t hdr_line = { 0, 0, NULL };
369 if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) {
370 if (opts->overwrite_hdr_rg) {
371 if(-1 == sam_hdr_remove_line_id(retval->output_header, "RG", "ID", opts->rg_id)) {
372 fprintf(samtools_stderr, "[init] Error removing the RG line with ID:%s from the output header.\n", opts->rg_id);
373 ks_free(&hdr_line);
374 return false;
375 }
376 } else {
377 fprintf(samtools_stderr, "[init] RG line with ID:%s already present in the header. Use -w to overwrite.\n", opts->rg_id);
378 ks_free(&hdr_line);
379 return false;
380 }
381 }
382 ks_free(&hdr_line);
383
384 if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) {
385 fprintf(samtools_stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id);
386 return false;
387 }
388 if (opts->mode == overwrite_all &&
389 -1 == sam_hdr_remove_except(retval->output_header, "RG", "ID", opts->rg_id)) {
390 fprintf(samtools_stderr, "[init] Error removing the old RG lines from the output header.\n");
391 return false;
392 }
393 retval->rg_id = strdup(opts->rg_id);
394 } else {
395 if (opts->rg_id) {
396 // Confirm what has been supplied exists
397 kstring_t hdr_line = { 0, 0, NULL };
398 if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) < 0) {
399 fprintf(samtools_stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n");
400 return false;
401 }
402 retval->rg_id = strdup(opts->rg_id);
403 ks_free(&hdr_line);
404 } else {
405 kstring_t rg_id = { 0, 0, NULL };
406 if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) {
407 fprintf(samtools_stderr, "No RG specified on command line or in existing header.\n");
408 return false;
409 }
410 retval->rg_id = ks_release(&rg_id);
411 }
412 }
413
414 switch (opts->mode) {
415 case overwrite_all:
416 retval->mode_func = &overwrite_all_func;
417 break;
418 case orphan_only:
419 retval->mode_func = &orphan_only_func;
420 break;
421 }
422
423 return true;
424 }
425
readgroupise(parsed_opts_t * opts,state_t * state,char * arg_list)426 static bool readgroupise(parsed_opts_t *opts, state_t* state, char *arg_list)
427 {
428 if (!opts->no_pg && sam_hdr_add_pg(state->output_header, "samtools",
429 "VN", samtools_version(),
430 arg_list ? "CL": NULL,
431 arg_list ? arg_list : NULL,
432 NULL))
433 return false;
434
435 if (sam_hdr_write(state->output_file, state->output_header) != 0) {
436 print_error_errno("addreplacerg", "[%s] Could not write header to output file", __func__);
437 return false;
438 }
439 char *idx_fn = NULL;
440 if (opts->ga.write_index) {
441 if (!(idx_fn = auto_index(state->output_file, opts->output_name, state->output_header)))
442 return false;
443 }
444
445 bam1_t* file_read = bam_init1();
446 int ret;
447 while ((ret = sam_read1(state->input_file, state->input_header, file_read)) >= 0) {
448 state->mode_func(state, file_read);
449
450 if (sam_write1(state->output_file, state->output_header, file_read) < 0) {
451 print_error_errno("addreplacerg", "[%s] Could not write read to output file", __func__);
452 bam_destroy1(file_read);
453 free(idx_fn);
454 return false;
455 }
456 }
457 bam_destroy1(file_read);
458 if (ret != -1) {
459 print_error_errno("addreplacerg", "[%s] Error reading from input file", __func__);
460 free(idx_fn);
461 return false;
462 } else {
463
464 if (opts->ga.write_index) {
465 if (sam_idx_save(state->output_file) < 0) {
466 print_error_errno("addreplacerg", "[%s] Writing index failed", __func__);
467 free(idx_fn);
468 return false;
469 }
470 }
471 free(idx_fn);
472 return true;
473 }
474 }
475
main_addreplacerg(int argc,char ** argv)476 int main_addreplacerg(int argc, char** argv)
477 {
478 parsed_opts_t* opts = NULL;
479 state_t* state = NULL;
480 char *arg_list = stringify_argv(argc+1, argv-1);
481 if (!arg_list)
482 return EXIT_FAILURE;
483
484 if (!parse_args(argc, argv, &opts)) goto error;
485 if (opts) { // Not an error but user doesn't want us to proceed
486 if (!init(opts, &state) || !readgroupise(opts, state, arg_list))
487 goto error;
488 }
489
490 cleanup_state(state);
491 cleanup_opts(opts);
492 free(arg_list);
493
494 return EXIT_SUCCESS;
495 error:
496 cleanup_state(state);
497 cleanup_opts(opts);
498 free(arg_list);
499
500 return EXIT_FAILURE;
501 }
502