1 #include "samtools.pysam.h"
2 
3 /* bam_addrprg.c -- samtools command to add or replace readgroups.
4 
5    Copyright (c) 2013, 2015-2017, 2019-2021 Genome Research Limited.
6 
7    Author: Martin O. Pollard <mp15@sanger.ac.uk>
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE.  */
26 
27 #include <config.h>
28 
29 #include <htslib/sam.h>
30 #include <htslib/kstring.h>
31 #include "samtools.h"
32 #include "htslib/thread_pool.h"
33 #include "sam_opts.h"
34 #include <string.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <stdbool.h>
38 #include <limits.h>
39 #include <assert.h>
40 #include <unistd.h>
41 
42 typedef enum {
43     overwrite_all,
44     orphan_only,
45 } rg_mode;
46 
47 struct parsed_opts {
48     char* input_name;
49     char* output_name;
50     char* rg_id;
51     char* rg_line;
52     int no_pg;
53     rg_mode mode;
54     sam_global_args ga;
55     htsThreadPool p;
56     int uncompressed;
57     int overwrite_hdr_rg;
58 };
59 
60 struct state;
61 typedef struct parsed_opts parsed_opts_t;
62 typedef struct state state_t;
63 
64 struct state {
65     samFile* input_file;
66     sam_hdr_t* input_header;
67     samFile* output_file;
68     sam_hdr_t* output_header;
69     char* rg_id;
70     void (*mode_func)(const state_t*, bam1_t*);
71 };
72 
cleanup_opts(parsed_opts_t * opts)73 static void cleanup_opts(parsed_opts_t* opts)
74 {
75     if (!opts) return;
76     free(opts->rg_id);
77     free(opts->output_name);
78     free(opts->input_name);
79     free(opts->rg_line);
80     if (opts->p.pool) hts_tpool_destroy(opts->p.pool);
81     sam_global_args_free(&opts->ga);
82     free(opts);
83 }
84 
cleanup_state(state_t * state)85 static void cleanup_state(state_t* state)
86 {
87     if (!state) return;
88     free(state->rg_id);
89     if (state->output_file) sam_close(state->output_file);
90     sam_hdr_destroy(state->output_header);
91     if (state->input_file) sam_close(state->input_file);
92     sam_hdr_destroy(state->input_header);
93     free(state);
94 }
95 
96 // Converts \t and \n into real tabs and newlines
basic_unescape(const char * in)97 static char* basic_unescape(const char* in)
98 {
99     assert(in);
100     char *ptr, *out;
101     out = ptr = malloc(strlen(in)+1);
102     size_t size = 0;
103     while (*in) {
104         if (*in == '\\') {
105             ++in;
106             if (*in == '\0') {
107                 fprintf(samtools_stderr, "[%s] Unterminated escape sequence.\n", __func__);
108                 free(out);
109                 return NULL;
110             }
111             switch (*in) {
112             case '\\':
113                 *ptr = '\\';
114                 break;
115             case 't':
116                 *ptr = '\t';
117                 break;
118             case 'n':
119                 fprintf(samtools_stderr, "[%s] \\n in escape sequence is not supported.\n", __func__);
120                 free(out);
121                 return NULL;
122             default:
123                 fprintf(samtools_stderr, "[%s] Unsupported escape sequence.\n", __func__);
124                 free(out);
125                 return NULL;
126             }
127         } else {
128             *ptr = *in;
129         }
130         ++in;
131         ++ptr;
132         ++size;
133     }
134     *ptr = '\0';
135     ++size;
136     char* tmp = (char*)realloc(out, size);
137     if (!tmp) {
138         free(out);
139     }
140     return tmp;
141 }
142 
143 // Malloc a string containing [s,slim) or to the end of s if slim is NULL.
144 // If lenp is non-NULL, stores the length of the resulting string there.
dup_substring(const char * s,const char * slim,size_t * lenp)145 static char *dup_substring(const char *s, const char *slim, size_t *lenp)
146 {
147     size_t len = slim? (slim - s) : strlen(s);
148     char *ns = malloc(len+1);
149     if (ns == NULL) return NULL;
150     memcpy(ns, s, len);
151     ns[len] = '\0';
152     if (lenp) *lenp = len;
153     return ns;
154 }
155 
156 
157 // Given a @RG line return the id
get_rg_id(const char * line)158 static char* get_rg_id(const char *line)
159 {
160     const char *id = strstr(line, "\tID:");
161     if (! id) return NULL;
162 
163     id += 4;
164     return dup_substring(id, strchr(id, '\t'), NULL);
165 }
166 
167 
usage(FILE * fp)168 static void usage(FILE *fp)
169 {
170     fprintf(fp,
171             "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-m orphan_only|overwrite_all] [-o <output.bam>] <input.bam>\n"
172             "\n"
173             "Options:\n"
174             "  -m MODE   Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n"
175             "  -o FILE   Where to write output to [samtools_stdout]\n"
176             "  -r STRING @RG line text\n"
177             "  -R STRING ID of @RG line in existing header to use\n"
178             "  -u        Output uncompressed data\n"
179             "  -w        Overwrite an existing @RG line\n"
180             "  --no-PG   Do not add a PG line\n"
181             );
182     sam_global_opt_help(fp, "..O..@..");
183 }
184 
parse_args(int argc,char ** argv,parsed_opts_t ** opts)185 static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
186 {
187     *opts = NULL;
188     int n;
189 
190     if (argc == 1) { usage(samtools_stdout); return true; }
191 
192     parsed_opts_t* retval = calloc(1, sizeof(parsed_opts_t));
193     if (! retval ) {
194         fprintf(samtools_stderr, "[%s] Out of memory allocating parsed_opts_t\n", __func__);
195         return false;
196     }
197     // Set defaults
198     retval->mode = overwrite_all;
199     sam_global_args_init(&retval->ga);
200     static const struct option lopts[] = {
201         SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, '@'),
202         {"no-PG", no_argument, NULL, 1},
203         { NULL, 0, NULL, 0 }
204     };
205     kstring_t rg_line = {0,0,NULL};
206 
207     while ((n = getopt_long(argc, argv, "r:R:m:o:O:h@:uw", lopts, NULL)) >= 0) {
208         switch (n) {
209             case 'r':
210                 // Are we adding to existing rg line?
211                 if (ks_len(&rg_line) == 0) {
212                     if (strlen(optarg)<3 || (optarg[0] != '@' && optarg[1] != 'R' && optarg[2] != 'G')) {
213                         kputs("@RG\t", &rg_line);
214                     }
215                 } else {
216                     kputs("\t", &rg_line);
217                 }
218                 kputs(optarg, &rg_line);
219                 break;
220             case 'R':
221                 retval->rg_id = strdup(optarg);
222                 break;
223             case 'm': {
224                 if (strcmp(optarg, "overwrite_all") == 0) {
225                     retval->mode = overwrite_all;
226                 } else if (strcmp(optarg, "orphan_only") == 0) {
227                     retval->mode = orphan_only;
228                 } else {
229                     usage(samtools_stderr);
230                     return false;
231                 }
232                 break;
233             }
234             case 'o':
235                 retval->output_name = strdup(optarg);
236                 break;
237             case 'h':
238                 usage(samtools_stdout);
239                 free(retval);
240                 return true;
241             case 1:
242                 retval->no_pg = 1;
243                 break;
244             case 'u':
245                 retval->uncompressed = 1;
246                 break;
247             case 'w':
248                 retval->overwrite_hdr_rg = 1;
249                 break;
250             case '?':
251                 usage(samtools_stderr);
252                 free(retval);
253                 return false;
254             case 'O':
255             default:
256                 if (parse_sam_global_opt(n, optarg, lopts, &retval->ga) == 0) break;
257                 usage(samtools_stderr);
258                 free(retval);
259                 return false;
260         }
261     }
262     retval->rg_line = ks_release(&rg_line);
263 
264     if (argc-optind < 1) {
265         fprintf(samtools_stderr, "You must specify an input file.\n");
266         usage(samtools_stderr);
267         cleanup_opts(retval);
268         return false;
269     }
270     if (retval->rg_id && retval->rg_line) {
271         fprintf(samtools_stderr, "The options -r and -R are mutually exclusive.\n");
272         cleanup_opts(retval);
273         return false;
274     }
275 
276     if (retval->rg_line)
277     {
278         char* tmp = basic_unescape(retval->rg_line);
279 
280         if ((retval->rg_id = get_rg_id(tmp)) == NULL) {
281             fprintf(samtools_stderr, "[%s] The supplied RG line lacks an ID tag.\n", __func__);
282             free(tmp);
283             cleanup_opts(retval);
284             return false;
285         }
286         free(retval->rg_line);
287         retval->rg_line = tmp;
288     }
289     retval->input_name = strdup(argv[optind+0]);
290 
291     if (retval->ga.nthreads > 0) {
292         if (!(retval->p.pool = hts_tpool_init(retval->ga.nthreads))) {
293             fprintf(samtools_stderr, "Error creating thread pool\n");
294             return false;
295         }
296     }
297 
298     *opts = retval;
299     return true;
300 }
301 
overwrite_all_func(const state_t * state,bam1_t * file_read)302 static void overwrite_all_func(const state_t* state, bam1_t* file_read)
303 {
304     uint8_t* data = (uint8_t*)strdup(state->rg_id);
305     int len = strlen(state->rg_id)+1;
306     // If the old exists delete it
307     uint8_t* old = bam_aux_get(file_read, "RG");
308     if (old != NULL) {
309         bam_aux_del(file_read, old);
310     }
311 
312     bam_aux_append(file_read, "RG", 'Z', len, data);
313     free(data);
314 }
315 
orphan_only_func(const state_t * state,bam1_t * file_read)316 static void orphan_only_func(const state_t* state, bam1_t* file_read)
317 {
318     uint8_t* data = (uint8_t*)strdup(state->rg_id);
319     int len = strlen(state->rg_id)+1;
320     // If the old exists don't do anything
321     uint8_t* old = bam_aux_get(file_read, "RG");
322     if (old == NULL) {
323         bam_aux_append(file_read, "RG",'Z',len,data);
324     }
325     free(data);
326 }
327 
init(const parsed_opts_t * opts,state_t ** state_out)328 static bool init(const parsed_opts_t* opts, state_t** state_out) {
329     char output_mode[9] = "w";
330     state_t* retval = (state_t*) calloc(1, sizeof(state_t));
331 
332     if (retval == NULL) {
333         fprintf(samtools_stderr, "[init] Out of memory allocating state struct.\n");
334         return false;
335     }
336     *state_out = retval;
337 
338     // Open files
339     retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
340     if (retval->input_file == NULL) {
341         print_error_errno("addreplacerg", "could not open \"%s\"", opts->input_name);
342         return false;
343     }
344     retval->input_header = sam_hdr_read(retval->input_file);
345 
346     retval->output_header = sam_hdr_dup(retval->input_header);
347 
348     if (opts->uncompressed)
349         strcat(output_mode, "0");
350     if (opts->output_name) // File format auto-detection
351         sam_open_mode(output_mode + strlen(output_mode),
352                       opts->output_name, NULL);
353     retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out);
354 
355     if (retval->output_file == NULL) {
356         print_error_errno("addreplacerg", "could not create \"%s\"", opts->output_name);
357         return false;
358     }
359 
360     if (opts->p.pool) {
361         hts_set_opt(retval->input_file,  HTS_OPT_THREAD_POOL, &opts->p);
362         hts_set_opt(retval->output_file, HTS_OPT_THREAD_POOL, &opts->p);
363     }
364 
365     if (opts->rg_line) {
366         // Append new RG line to header.
367         // Check does not already exist
368         kstring_t hdr_line = { 0, 0, NULL };
369         if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) {
370             if (opts->overwrite_hdr_rg) {
371                 if(-1 == sam_hdr_remove_line_id(retval->output_header, "RG", "ID", opts->rg_id)) {
372                     fprintf(samtools_stderr, "[init] Error removing the RG line with ID:%s from the output header.\n", opts->rg_id);
373                     ks_free(&hdr_line);
374                     return false;
375                 }
376             } else {
377                 fprintf(samtools_stderr, "[init] RG line with ID:%s already present in the header. Use -w to overwrite.\n", opts->rg_id);
378                 ks_free(&hdr_line);
379                 return false;
380             }
381         }
382         ks_free(&hdr_line);
383 
384         if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) {
385             fprintf(samtools_stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id);
386             return false;
387         }
388         if (opts->mode == overwrite_all &&
389             -1 == sam_hdr_remove_except(retval->output_header, "RG", "ID", opts->rg_id)) {
390             fprintf(samtools_stderr, "[init] Error removing the old RG lines from the output header.\n");
391             return false;
392         }
393         retval->rg_id = strdup(opts->rg_id);
394     } else {
395         if (opts->rg_id) {
396             // Confirm what has been supplied exists
397             kstring_t hdr_line = { 0, 0, NULL };
398             if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) < 0) {
399                 fprintf(samtools_stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n");
400                 return false;
401             }
402             retval->rg_id = strdup(opts->rg_id);
403             ks_free(&hdr_line);
404         } else {
405             kstring_t rg_id = { 0, 0, NULL };
406             if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) {
407                 fprintf(samtools_stderr, "No RG specified on command line or in existing header.\n");
408                 return false;
409             }
410             retval->rg_id = ks_release(&rg_id);
411         }
412     }
413 
414     switch (opts->mode) {
415         case overwrite_all:
416             retval->mode_func = &overwrite_all_func;
417             break;
418         case orphan_only:
419             retval->mode_func = &orphan_only_func;
420             break;
421     }
422 
423     return true;
424 }
425 
readgroupise(parsed_opts_t * opts,state_t * state,char * arg_list)426 static bool readgroupise(parsed_opts_t *opts, state_t* state, char *arg_list)
427 {
428     if (!opts->no_pg && sam_hdr_add_pg(state->output_header, "samtools",
429                                        "VN", samtools_version(),
430                                        arg_list ? "CL": NULL,
431                                        arg_list ? arg_list : NULL,
432                                        NULL))
433         return false;
434 
435     if (sam_hdr_write(state->output_file, state->output_header) != 0) {
436         print_error_errno("addreplacerg", "[%s] Could not write header to output file", __func__);
437         return false;
438     }
439     char *idx_fn = NULL;
440     if (opts->ga.write_index) {
441         if (!(idx_fn = auto_index(state->output_file, opts->output_name, state->output_header)))
442             return false;
443     }
444 
445     bam1_t* file_read = bam_init1();
446     int ret;
447     while ((ret = sam_read1(state->input_file, state->input_header, file_read)) >= 0) {
448         state->mode_func(state, file_read);
449 
450         if (sam_write1(state->output_file, state->output_header, file_read) < 0) {
451             print_error_errno("addreplacerg", "[%s] Could not write read to output file", __func__);
452             bam_destroy1(file_read);
453             free(idx_fn);
454             return false;
455         }
456     }
457     bam_destroy1(file_read);
458     if (ret != -1) {
459         print_error_errno("addreplacerg", "[%s] Error reading from input file", __func__);
460         free(idx_fn);
461         return false;
462     } else {
463 
464         if (opts->ga.write_index) {
465             if (sam_idx_save(state->output_file) < 0) {
466                 print_error_errno("addreplacerg", "[%s] Writing index failed", __func__);
467                 free(idx_fn);
468                 return false;
469             }
470         }
471         free(idx_fn);
472         return true;
473     }
474 }
475 
main_addreplacerg(int argc,char ** argv)476 int main_addreplacerg(int argc, char** argv)
477 {
478     parsed_opts_t* opts = NULL;
479     state_t* state = NULL;
480     char *arg_list = stringify_argv(argc+1, argv-1);
481     if (!arg_list)
482         return EXIT_FAILURE;
483 
484     if (!parse_args(argc, argv, &opts)) goto error;
485     if (opts) { // Not an error but user doesn't want us to proceed
486         if (!init(opts, &state) || !readgroupise(opts, state, arg_list))
487             goto error;
488     }
489 
490     cleanup_state(state);
491     cleanup_opts(opts);
492     free(arg_list);
493 
494     return EXIT_SUCCESS;
495 error:
496     cleanup_state(state);
497     cleanup_opts(opts);
498     free(arg_list);
499 
500     return EXIT_FAILURE;
501 }
502