1 /* bam_addrprg.c -- samtools command to add or replace readgroups.
2 
3    Copyright (c) 2013, 2015-2017, 2019-2021 Genome Research Limited.
4 
5    Author: Martin O. Pollard <mp15@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE.  */
24 
25 #include <config.h>
26 
27 #include <htslib/sam.h>
28 #include <htslib/kstring.h>
29 #include "samtools.h"
30 #include "htslib/thread_pool.h"
31 #include "sam_opts.h"
32 #include <string.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <stdbool.h>
36 #include <limits.h>
37 #include <assert.h>
38 #include <unistd.h>
39 
40 typedef enum {
41     overwrite_all,
42     orphan_only,
43 } rg_mode;
44 
45 struct parsed_opts {
46     char* input_name;
47     char* output_name;
48     char* rg_id;
49     char* rg_line;
50     int no_pg;
51     rg_mode mode;
52     sam_global_args ga;
53     htsThreadPool p;
54     int uncompressed;
55     int overwrite_hdr_rg;
56 };
57 
58 struct state;
59 typedef struct parsed_opts parsed_opts_t;
60 typedef struct state state_t;
61 
62 struct state {
63     samFile* input_file;
64     sam_hdr_t* input_header;
65     samFile* output_file;
66     sam_hdr_t* output_header;
67     char* rg_id;
68     void (*mode_func)(const state_t*, bam1_t*);
69 };
70 
cleanup_opts(parsed_opts_t * opts)71 static void cleanup_opts(parsed_opts_t* opts)
72 {
73     if (!opts) return;
74     free(opts->rg_id);
75     free(opts->output_name);
76     free(opts->input_name);
77     free(opts->rg_line);
78     if (opts->p.pool) hts_tpool_destroy(opts->p.pool);
79     sam_global_args_free(&opts->ga);
80     free(opts);
81 }
82 
cleanup_state(state_t * state)83 static void cleanup_state(state_t* state)
84 {
85     if (!state) return;
86     free(state->rg_id);
87     if (state->output_file) sam_close(state->output_file);
88     sam_hdr_destroy(state->output_header);
89     if (state->input_file) sam_close(state->input_file);
90     sam_hdr_destroy(state->input_header);
91     free(state);
92 }
93 
94 // Converts \t and \n into real tabs and newlines
basic_unescape(const char * in)95 static char* basic_unescape(const char* in)
96 {
97     assert(in);
98     char *ptr, *out;
99     out = ptr = malloc(strlen(in)+1);
100     size_t size = 0;
101     while (*in) {
102         if (*in == '\\') {
103             ++in;
104             if (*in == '\0') {
105                 fprintf(stderr, "[%s] Unterminated escape sequence.\n", __func__);
106                 free(out);
107                 return NULL;
108             }
109             switch (*in) {
110             case '\\':
111                 *ptr = '\\';
112                 break;
113             case 't':
114                 *ptr = '\t';
115                 break;
116             case 'n':
117                 fprintf(stderr, "[%s] \\n in escape sequence is not supported.\n", __func__);
118                 free(out);
119                 return NULL;
120             default:
121                 fprintf(stderr, "[%s] Unsupported escape sequence.\n", __func__);
122                 free(out);
123                 return NULL;
124             }
125         } else {
126             *ptr = *in;
127         }
128         ++in;
129         ++ptr;
130         ++size;
131     }
132     *ptr = '\0';
133     ++size;
134     char* tmp = (char*)realloc(out, size);
135     if (!tmp) {
136         free(out);
137     }
138     return tmp;
139 }
140 
141 // Malloc a string containing [s,slim) or to the end of s if slim is NULL.
142 // If lenp is non-NULL, stores the length of the resulting string there.
dup_substring(const char * s,const char * slim,size_t * lenp)143 static char *dup_substring(const char *s, const char *slim, size_t *lenp)
144 {
145     size_t len = slim? (slim - s) : strlen(s);
146     char *ns = malloc(len+1);
147     if (ns == NULL) return NULL;
148     memcpy(ns, s, len);
149     ns[len] = '\0';
150     if (lenp) *lenp = len;
151     return ns;
152 }
153 
154 
155 // Given a @RG line return the id
get_rg_id(const char * line)156 static char* get_rg_id(const char *line)
157 {
158     const char *id = strstr(line, "\tID:");
159     if (! id) return NULL;
160 
161     id += 4;
162     return dup_substring(id, strchr(id, '\t'), NULL);
163 }
164 
165 
usage(FILE * fp)166 static void usage(FILE *fp)
167 {
168     fprintf(fp,
169             "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-m orphan_only|overwrite_all] [-o <output.bam>] <input.bam>\n"
170             "\n"
171             "Options:\n"
172             "  -m MODE   Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n"
173             "  -o FILE   Where to write output to [stdout]\n"
174             "  -r STRING @RG line text\n"
175             "  -R STRING ID of @RG line in existing header to use\n"
176             "  -u        Output uncompressed data\n"
177             "  -w        Overwrite an existing @RG line\n"
178             "  --no-PG   Do not add a PG line\n"
179             );
180     sam_global_opt_help(fp, "..O..@..");
181 }
182 
parse_args(int argc,char ** argv,parsed_opts_t ** opts)183 static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
184 {
185     *opts = NULL;
186     int n;
187 
188     if (argc == 1) { usage(stdout); return true; }
189 
190     parsed_opts_t* retval = calloc(1, sizeof(parsed_opts_t));
191     if (! retval ) {
192         fprintf(stderr, "[%s] Out of memory allocating parsed_opts_t\n", __func__);
193         return false;
194     }
195     // Set defaults
196     retval->mode = overwrite_all;
197     sam_global_args_init(&retval->ga);
198     static const struct option lopts[] = {
199         SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, '@'),
200         {"no-PG", no_argument, NULL, 1},
201         { NULL, 0, NULL, 0 }
202     };
203     kstring_t rg_line = {0,0,NULL};
204 
205     while ((n = getopt_long(argc, argv, "r:R:m:o:O:h@:uw", lopts, NULL)) >= 0) {
206         switch (n) {
207             case 'r':
208                 // Are we adding to existing rg line?
209                 if (ks_len(&rg_line) == 0) {
210                     if (strlen(optarg)<3 || (optarg[0] != '@' && optarg[1] != 'R' && optarg[2] != 'G')) {
211                         kputs("@RG\t", &rg_line);
212                     }
213                 } else {
214                     kputs("\t", &rg_line);
215                 }
216                 kputs(optarg, &rg_line);
217                 break;
218             case 'R':
219                 retval->rg_id = strdup(optarg);
220                 break;
221             case 'm': {
222                 if (strcmp(optarg, "overwrite_all") == 0) {
223                     retval->mode = overwrite_all;
224                 } else if (strcmp(optarg, "orphan_only") == 0) {
225                     retval->mode = orphan_only;
226                 } else {
227                     usage(stderr);
228                     return false;
229                 }
230                 break;
231             }
232             case 'o':
233                 retval->output_name = strdup(optarg);
234                 break;
235             case 'h':
236                 usage(stdout);
237                 free(retval);
238                 return true;
239             case 1:
240                 retval->no_pg = 1;
241                 break;
242             case 'u':
243                 retval->uncompressed = 1;
244                 break;
245             case 'w':
246                 retval->overwrite_hdr_rg = 1;
247                 break;
248             case '?':
249                 usage(stderr);
250                 free(retval);
251                 return false;
252             case 'O':
253             default:
254                 if (parse_sam_global_opt(n, optarg, lopts, &retval->ga) == 0) break;
255                 usage(stderr);
256                 free(retval);
257                 return false;
258         }
259     }
260     retval->rg_line = ks_release(&rg_line);
261 
262     if (argc-optind < 1) {
263         fprintf(stderr, "You must specify an input file.\n");
264         usage(stderr);
265         cleanup_opts(retval);
266         return false;
267     }
268     if (retval->rg_id && retval->rg_line) {
269         fprintf(stderr, "The options -r and -R are mutually exclusive.\n");
270         cleanup_opts(retval);
271         return false;
272     }
273 
274     if (retval->rg_line)
275     {
276         char* tmp = basic_unescape(retval->rg_line);
277 
278         if ((retval->rg_id = get_rg_id(tmp)) == NULL) {
279             fprintf(stderr, "[%s] The supplied RG line lacks an ID tag.\n", __func__);
280             free(tmp);
281             cleanup_opts(retval);
282             return false;
283         }
284         free(retval->rg_line);
285         retval->rg_line = tmp;
286     }
287     retval->input_name = strdup(argv[optind+0]);
288 
289     if (retval->ga.nthreads > 0) {
290         if (!(retval->p.pool = hts_tpool_init(retval->ga.nthreads))) {
291             fprintf(stderr, "Error creating thread pool\n");
292             return false;
293         }
294     }
295 
296     *opts = retval;
297     return true;
298 }
299 
overwrite_all_func(const state_t * state,bam1_t * file_read)300 static void overwrite_all_func(const state_t* state, bam1_t* file_read)
301 {
302     uint8_t* data = (uint8_t*)strdup(state->rg_id);
303     int len = strlen(state->rg_id)+1;
304     // If the old exists delete it
305     uint8_t* old = bam_aux_get(file_read, "RG");
306     if (old != NULL) {
307         bam_aux_del(file_read, old);
308     }
309 
310     bam_aux_append(file_read, "RG", 'Z', len, data);
311     free(data);
312 }
313 
orphan_only_func(const state_t * state,bam1_t * file_read)314 static void orphan_only_func(const state_t* state, bam1_t* file_read)
315 {
316     uint8_t* data = (uint8_t*)strdup(state->rg_id);
317     int len = strlen(state->rg_id)+1;
318     // If the old exists don't do anything
319     uint8_t* old = bam_aux_get(file_read, "RG");
320     if (old == NULL) {
321         bam_aux_append(file_read, "RG",'Z',len,data);
322     }
323     free(data);
324 }
325 
init(const parsed_opts_t * opts,state_t ** state_out)326 static bool init(const parsed_opts_t* opts, state_t** state_out) {
327     char output_mode[9] = "w";
328     state_t* retval = (state_t*) calloc(1, sizeof(state_t));
329 
330     if (retval == NULL) {
331         fprintf(stderr, "[init] Out of memory allocating state struct.\n");
332         return false;
333     }
334     *state_out = retval;
335 
336     // Open files
337     retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
338     if (retval->input_file == NULL) {
339         print_error_errno("addreplacerg", "could not open \"%s\"", opts->input_name);
340         return false;
341     }
342     retval->input_header = sam_hdr_read(retval->input_file);
343 
344     retval->output_header = sam_hdr_dup(retval->input_header);
345 
346     if (opts->uncompressed)
347         strcat(output_mode, "0");
348     if (opts->output_name) // File format auto-detection
349         sam_open_mode(output_mode + strlen(output_mode),
350                       opts->output_name, NULL);
351     retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out);
352 
353     if (retval->output_file == NULL) {
354         print_error_errno("addreplacerg", "could not create \"%s\"", opts->output_name);
355         return false;
356     }
357 
358     if (opts->p.pool) {
359         hts_set_opt(retval->input_file,  HTS_OPT_THREAD_POOL, &opts->p);
360         hts_set_opt(retval->output_file, HTS_OPT_THREAD_POOL, &opts->p);
361     }
362 
363     if (opts->rg_line) {
364         // Append new RG line to header.
365         // Check does not already exist
366         kstring_t hdr_line = { 0, 0, NULL };
367         if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) {
368             if (opts->overwrite_hdr_rg) {
369                 if(-1 == sam_hdr_remove_line_id(retval->output_header, "RG", "ID", opts->rg_id)) {
370                     fprintf(stderr, "[init] Error removing the RG line with ID:%s from the output header.\n", opts->rg_id);
371                     ks_free(&hdr_line);
372                     return false;
373                 }
374             } else {
375                 fprintf(stderr, "[init] RG line with ID:%s already present in the header. Use -w to overwrite.\n", opts->rg_id);
376                 ks_free(&hdr_line);
377                 return false;
378             }
379         }
380         ks_free(&hdr_line);
381 
382         if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) {
383             fprintf(stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id);
384             return false;
385         }
386         if (opts->mode == overwrite_all &&
387             -1 == sam_hdr_remove_except(retval->output_header, "RG", "ID", opts->rg_id)) {
388             fprintf(stderr, "[init] Error removing the old RG lines from the output header.\n");
389             return false;
390         }
391         retval->rg_id = strdup(opts->rg_id);
392     } else {
393         if (opts->rg_id) {
394             // Confirm what has been supplied exists
395             kstring_t hdr_line = { 0, 0, NULL };
396             if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) < 0) {
397                 fprintf(stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n");
398                 return false;
399             }
400             retval->rg_id = strdup(opts->rg_id);
401             ks_free(&hdr_line);
402         } else {
403             kstring_t rg_id = { 0, 0, NULL };
404             if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) {
405                 fprintf(stderr, "No RG specified on command line or in existing header.\n");
406                 return false;
407             }
408             retval->rg_id = ks_release(&rg_id);
409         }
410     }
411 
412     switch (opts->mode) {
413         case overwrite_all:
414             retval->mode_func = &overwrite_all_func;
415             break;
416         case orphan_only:
417             retval->mode_func = &orphan_only_func;
418             break;
419     }
420 
421     return true;
422 }
423 
readgroupise(parsed_opts_t * opts,state_t * state,char * arg_list)424 static bool readgroupise(parsed_opts_t *opts, state_t* state, char *arg_list)
425 {
426     if (!opts->no_pg && sam_hdr_add_pg(state->output_header, "samtools",
427                                        "VN", samtools_version(),
428                                        arg_list ? "CL": NULL,
429                                        arg_list ? arg_list : NULL,
430                                        NULL))
431         return false;
432 
433     if (sam_hdr_write(state->output_file, state->output_header) != 0) {
434         print_error_errno("addreplacerg", "[%s] Could not write header to output file", __func__);
435         return false;
436     }
437     char *idx_fn = NULL;
438     if (opts->ga.write_index) {
439         if (!(idx_fn = auto_index(state->output_file, opts->output_name, state->output_header)))
440             return false;
441     }
442 
443     bam1_t* file_read = bam_init1();
444     int ret;
445     while ((ret = sam_read1(state->input_file, state->input_header, file_read)) >= 0) {
446         state->mode_func(state, file_read);
447 
448         if (sam_write1(state->output_file, state->output_header, file_read) < 0) {
449             print_error_errno("addreplacerg", "[%s] Could not write read to output file", __func__);
450             bam_destroy1(file_read);
451             free(idx_fn);
452             return false;
453         }
454     }
455     bam_destroy1(file_read);
456     if (ret != -1) {
457         print_error_errno("addreplacerg", "[%s] Error reading from input file", __func__);
458         free(idx_fn);
459         return false;
460     } else {
461 
462         if (opts->ga.write_index) {
463             if (sam_idx_save(state->output_file) < 0) {
464                 print_error_errno("addreplacerg", "[%s] Writing index failed", __func__);
465                 free(idx_fn);
466                 return false;
467             }
468         }
469         free(idx_fn);
470         return true;
471     }
472 }
473 
main_addreplacerg(int argc,char ** argv)474 int main_addreplacerg(int argc, char** argv)
475 {
476     parsed_opts_t* opts = NULL;
477     state_t* state = NULL;
478     char *arg_list = stringify_argv(argc+1, argv-1);
479     if (!arg_list)
480         return EXIT_FAILURE;
481 
482     if (!parse_args(argc, argv, &opts)) goto error;
483     if (opts) { // Not an error but user doesn't want us to proceed
484         if (!init(opts, &state) || !readgroupise(opts, state, arg_list))
485             goto error;
486     }
487 
488     cleanup_state(state);
489     cleanup_opts(opts);
490     free(arg_list);
491 
492     return EXIT_SUCCESS;
493 error:
494     cleanup_state(state);
495     cleanup_opts(opts);
496     free(arg_list);
497 
498     return EXIT_FAILURE;
499 }
500