1 /*  bam_sample.c -- group data by sample.
2 
3     Copyright (C) 2010, 2011 Broad Institute.
4     Copyright (C) 2013, 2016-2018 Genome Research Ltd.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>, Petr Danecek <pd3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include <htslib/hts.h>
30 #include <htslib/kstring.h>
31 #include <htslib/khash_str2int.h>
32 #include <khash_str2str.h>
33 #include "bam_sample.h"
34 #include "bcftools.h"
35 
36 
37 typedef struct
38 {
39     char *fname;
40     void *rg2idx;       // hash: read group name to BCF output sample index. Maintained by bsmpl_add_readgroup
41     int default_idx;    // default BCF output sample index, set only when all readgroups are treated as one sample
42 }
43 file_t;
44 
45 struct _bam_smpl_t
46 {
47     kstring_t tmp;
48     file_t *files;
49     int ignore_rg, nsmpl, nfiles;
50     char **smpl;        // list of BCF output sample names. Maintained by bsmpl_add_readgroup
51     void *sample_list;  // hash: BAM input sample name to BCF output sample name. This is the -s/-S list
52     int sample_logic;   // the -s/-S logic, 1: include, 0: exclude
53     void *rg_list;      // hash: BAM/rg_id to sample name or */rg_id for global ids. This is the -G list
54     int rg_logic;       // the -G logic, 1: include, 0: exclude
55     void *name2idx;     // hash: BCF output sample name to BCF output sample index. Maintained by bsmpl_add_readgroup
56 };
57 
bam_smpl_init(void)58 bam_smpl_t *bam_smpl_init(void)
59 {
60     bam_smpl_t *bsmpl;
61     bsmpl = (bam_smpl_t*) calloc(1, sizeof(bam_smpl_t));
62     bsmpl->name2idx = khash_str2int_init();
63     return bsmpl;
64 }
65 
bam_smpl_destroy(bam_smpl_t * bsmpl)66 void bam_smpl_destroy(bam_smpl_t *bsmpl)
67 {
68     if ( !bsmpl ) return;
69     if ( bsmpl->name2idx ) khash_str2int_destroy_free(bsmpl->name2idx);
70     if ( bsmpl->sample_list ) khash_str2str_destroy_free_all(bsmpl->sample_list);
71     if ( bsmpl->rg_list ) khash_str2str_destroy_free_all(bsmpl->rg_list);
72     int i;
73     for (i=0; i<bsmpl->nfiles; i++)
74     {
75         file_t *file = &bsmpl->files[i];
76         if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
77         free(file->fname);
78     }
79     free(bsmpl->smpl);
80     free(bsmpl->files);
81     free(bsmpl->tmp.s);
82     free(bsmpl);
83 }
84 
bam_smpl_ignore_readgroups(bam_smpl_t * bsmpl)85 void bam_smpl_ignore_readgroups(bam_smpl_t* bsmpl)
86 {
87     bsmpl->ignore_rg = 1;
88 }
89 
bsmpl_add_readgroup(bam_smpl_t * bsmpl,file_t * file,const char * rg_id,const char * smpl_name)90 static void bsmpl_add_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char *smpl_name)
91 {
92     int ismpl = -1;
93     if ( smpl_name )
94     {
95         if ( khash_str2int_get(bsmpl->name2idx,smpl_name,&ismpl) < 0 )
96         {
97             // new sample
98             bsmpl->nsmpl++;
99             bsmpl->smpl = (char**) realloc(bsmpl->smpl,sizeof(char*)*bsmpl->nsmpl);
100             bsmpl->smpl[bsmpl->nsmpl-1] = strdup(smpl_name);
101             ismpl = khash_str2int_inc(bsmpl->name2idx,bsmpl->smpl[bsmpl->nsmpl-1]);
102         }
103     }
104     if ( !strcmp("*",rg_id) )
105     {
106         // all read groups in the bam treated as the same sample
107         file->default_idx = ismpl;
108         return;
109     }
110     if ( !file->rg2idx ) file->rg2idx = khash_str2int_init();
111     if ( khash_str2int_has_key(file->rg2idx,rg_id) ) return;    // duplicate @RG:ID
112     khash_str2int_set(file->rg2idx, strdup(rg_id), ismpl);
113 }
bsmpl_keep_readgroup(bam_smpl_t * bsmpl,file_t * file,const char * rg_id,const char ** smpl_name)114 static int bsmpl_keep_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char **smpl_name)
115 {
116     char *rg_smpl = khash_str2str_get(bsmpl->rg_list,rg_id);    // unique read group present in one bam only
117     if ( !rg_smpl )
118     {
119         // read group specific to this bam
120         bsmpl->tmp.l = 0;
121         ksprintf(&bsmpl->tmp,"%s\t%s",rg_id,file->fname);
122         rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
123     }
124     if ( !rg_smpl )
125     {
126         // any read group in this file?
127         bsmpl->tmp.l = 0;
128         ksprintf(&bsmpl->tmp,"*\t%s",file->fname);
129         rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
130     }
131     if ( !rg_smpl && bsmpl->rg_logic ) return 0;
132     if ( rg_smpl && !bsmpl->rg_logic ) return 0;
133 
134     if ( rg_smpl && rg_smpl[0]!='\t' ) *smpl_name = rg_smpl;    // rename the sample
135     return 1;
136 }
137 
138 /*
139     The logic of this function is a bit complicated because we want to work
140     also with broken bams containing read groups that are not listed in the
141     header. The desired behavior is as follows:
142         - when -G is given, read groups which are not listed in the header must
143           be given explicitly using the "?" symbol in -G.
144           Otherwise:
145         - if the bam has no header, all reads in the file are assigned to a
146           single sample named after the file
147         - if there is at least one sample defined in the header, reads with no
148           read group id or with a read group id not listed in the header are
149           assigned to the first sample encountered in the header
150 */
bam_smpl_add_bam(bam_smpl_t * bsmpl,char * bam_hdr,const char * fname)151 int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
152 {
153     bsmpl->nfiles++;
154     bsmpl->files = (file_t*) realloc(bsmpl->files,bsmpl->nfiles*sizeof(file_t));
155     file_t *file = &bsmpl->files[bsmpl->nfiles-1];
156     memset(file,0,sizeof(file_t));
157     file->fname  = strdup(fname);
158     file->default_idx = -1;
159 
160     if ( bsmpl->ignore_rg || !bam_hdr )
161     {
162         // The option --ignore-RG is set or there is no BAM header: use the file name as the sample name
163         bsmpl_add_readgroup(bsmpl,file,"*",file->fname);
164         return bsmpl->nfiles-1;
165     }
166 
167     void *bam_smpls = khash_str2int_init();
168     int first_smpl = -1, nskipped = 0;
169     const char *p = bam_hdr, *q, *r;
170     while (p != NULL && (q = strstr(p, "@RG")) != 0)
171     {
172         char *eol = strchr(q + 3, '\n');
173         if (q > bam_hdr && *(q - 1) != '\n') { // @RG must be at start of line
174             p = eol;
175             continue;
176         }
177         p = q + 3;
178         if ((q = strstr(p, "\tID:")) != 0) q += 4;
179         if ((r = strstr(p, "\tSM:")) != 0) r += 4;
180         if (r && q)
181         {
182             char *u, *v;
183             int ioq, ior;
184             for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
185             for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
186             ioq = *u; ior = *v; *u = *v = '\0';
187 
188             // q now points to a null terminated read group id
189             // r points to a null terminated sample name
190             if ( !strcmp("*",q) || !strcmp("?",q) )
191                 error("Error: the read group IDs \"*\" and \"?\" have a special meaning in the mpileup code. Please fix the code or the bam: %s\n", fname);
192 
193             int accept_rg = 1;
194             if ( bsmpl->sample_list )
195             {
196                 // restrict samples based on the -s/-S options
197                 char *name = khash_str2str_get(bsmpl->sample_list,r);
198                 if ( bsmpl->sample_logic==0 )
199                     accept_rg = name ? 0 : 1;
200                 else if ( !name )
201                     accept_rg = 0;
202                 else
203                     r = name;
204             }
205             if ( accept_rg && bsmpl->rg_list )
206             {
207                 // restrict readgroups based on the -G option, possibly renaming the sample
208                 accept_rg = bsmpl_keep_readgroup(bsmpl,file,q,&r);
209             }
210             if ( accept_rg )
211                 bsmpl_add_readgroup(bsmpl,file,q,r);
212             else
213             {
214                 bsmpl_add_readgroup(bsmpl,file,q,NULL); // ignore this RG but note that it was seen in the header
215                 nskipped++;
216             }
217 
218             if ( first_smpl<0 )
219                 khash_str2int_get(bsmpl->name2idx,r,&first_smpl);
220             if ( !khash_str2int_has_key(bam_smpls,r) )
221                 khash_str2int_inc(bam_smpls,strdup(r));
222 
223             *u = ioq; *v = ior;
224         }
225         else
226             break;
227         p = eol;
228     }
229     int nsmpls = khash_str2int_size(bam_smpls);
230     khash_str2int_destroy_free(bam_smpls);
231 
232     const char *smpl_name = NULL;
233     int accept_null_rg = 1;
234     if ( bsmpl->rg_list && !bsmpl_keep_readgroup(bsmpl,file,"?",&smpl_name) ) accept_null_rg = 0;
235     if ( bsmpl->sample_list && first_smpl==-1 ) accept_null_rg = 0;
236 
237     if ( !accept_null_rg && first_smpl==-1 )
238     {
239         // no suitable read group is available in this bam: ignore the whole file.
240         free(file->fname);
241         if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
242         bsmpl->nfiles--;
243         return -1;
244     }
245     if ( !accept_null_rg ) return bsmpl->nfiles-1;
246     if ( nsmpls==1 && !nskipped )
247     {
248         file->default_idx = first_smpl;
249         return bsmpl->nfiles-1;
250     }
251     if ( !smpl_name ) smpl_name = first_smpl==-1 ? file->fname : bsmpl->smpl[first_smpl];
252 
253     bsmpl_add_readgroup(bsmpl,file,"?",smpl_name);
254     return bsmpl->nfiles-1;
255 }
256 
bam_smpl_get_samples(bam_smpl_t * bsmpl,int * nsmpl)257 const char **bam_smpl_get_samples(bam_smpl_t *bsmpl, int *nsmpl)
258 {
259     *nsmpl = bsmpl->nsmpl;
260     return (const char**)bsmpl->smpl;
261 }
262 
bam_smpl_get_sample_id(bam_smpl_t * bsmpl,int bam_id,bam1_t * bam_rec)263 int bam_smpl_get_sample_id(bam_smpl_t *bsmpl, int bam_id, bam1_t *bam_rec)
264 {
265     file_t *file = &bsmpl->files[bam_id];
266     if ( file->default_idx >= 0 ) return file->default_idx;
267 
268     char *aux_rg = (char*) bam_aux_get(bam_rec, "RG");
269     aux_rg = aux_rg ? aux_rg+1 : "?";
270 
271     int rg_id;
272     if ( khash_str2int_get(file->rg2idx, aux_rg, &rg_id)==0 ) return rg_id;
273     if ( khash_str2int_get(file->rg2idx, "?", &rg_id)==0 ) return rg_id;
274     return -1;
275 }
276 
bam_smpl_add_samples(bam_smpl_t * bsmpl,char * list,int is_file)277 int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file)
278 {
279     if ( list[0]!='^' ) bsmpl->sample_logic = 1;
280     else list++;
281 
282     int i, nsamples = 0;
283     char **samples = hts_readlist(list, is_file, &nsamples);
284     if ( !nsamples ) return 0;
285 
286     kstring_t ori = {0,0,0};
287     kstring_t ren = {0,0,0};
288 
289     bsmpl->sample_list = khash_str2str_init();
290     for (i=0; i<nsamples; i++)
291     {
292         char *ptr = samples[i];
293         ori.l = ren.l = 0;
294         int escaped = 0;
295         while ( *ptr )
296         {
297             if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
298             if ( isspace(*ptr) && !escaped ) break;
299             kputc(*ptr, &ori);
300             escaped = 0;
301             ptr++;
302         }
303         if ( *ptr )
304         {
305             while ( *ptr && isspace(*ptr) ) ptr++;
306             while ( *ptr )
307             {
308                 if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
309                 if ( isspace(*ptr) && !escaped ) break;
310                 kputc(*ptr, &ren);
311                 escaped = 0;
312                 ptr++;
313             }
314         }
315         khash_str2str_set(bsmpl->sample_list,strdup(ori.s),strdup(ren.l?ren.s:ori.s));
316         free(samples[i]);
317     }
318     free(samples);
319     free(ori.s);
320     free(ren.s);
321     return nsamples;
322 }
323 
bam_smpl_add_readgroups(bam_smpl_t * bsmpl,char * list,int is_file)324 int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file)
325 {
326     if ( list[0]!='^' ) bsmpl->rg_logic = 1;
327     else list++;
328 
329     int i, nrows  = 0;
330     char **rows = hts_readlist(list, is_file, &nrows);
331     if ( !nrows ) return 0;
332 
333     kstring_t fld1 = {0,0,0};
334     kstring_t fld2 = {0,0,0};
335     kstring_t fld3 = {0,0,0};
336 
337     bsmpl->rg_list = khash_str2str_init();
338     for (i=0; i<nrows; i++)
339     {
340         char *ptr = rows[i];
341         fld1.l = fld2.l = fld3.l = 0;
342         int escaped = 0;
343         while ( *ptr )
344         {
345             if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
346             if ( isspace(*ptr) && !escaped ) break;
347             kputc(*ptr, &fld1);
348             escaped = 0;
349             ptr++;
350         }
351         if ( *ptr )
352         {
353             while ( *ptr && isspace(*ptr) ) ptr++;
354             while ( *ptr )
355             {
356                 if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
357                 if ( isspace(*ptr) && !escaped ) break;
358                 kputc(*ptr, &fld2);
359                 escaped = 0;
360                 ptr++;
361             }
362         }
363         if ( *ptr )
364         {
365             while ( *ptr && isspace(*ptr) ) ptr++;
366             while ( *ptr )
367             {
368                 if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
369                 if ( isspace(*ptr) && !escaped ) break;
370                 kputc(*ptr, &fld3);
371                 escaped = 0;
372                 ptr++;
373             }
374         }
375         if ( fld3.l )
376         {
377             // ID FILE SAMPLE
378             kputc('\t',&fld1);
379             kputs(fld2.s,&fld1);
380             fld2.l = 0;
381             kputs(fld3.s,&fld2);
382         }
383         // fld2.s now contains a new sample name. If NULL, use \t to keep the bam header name
384         char *value = khash_str2str_get(bsmpl->rg_list,fld1.s);
385         if ( !value )
386             khash_str2str_set(bsmpl->rg_list,strdup(fld1.s),strdup(fld2.l?fld2.s:"\t"));
387         else if ( strcmp(value,fld2.l?fld2.s:"\t") )
388             error("Error: The read group \"%s\" was assigned to two different samples: \"%s\" and \"%s\"\n", fld1.s,value,fld2.l?fld2.s:"\t");
389         free(rows[i]);
390     }
391     free(rows);
392     free(fld1.s);
393     free(fld2.s);
394     free(fld3.s);
395     return nrows;
396 }
397 
398 
399