1 /* bam_sample.c -- group data by sample.
2
3 Copyright (C) 2010, 2011 Broad Institute.
4 Copyright (C) 2013, 2016-2018 Genome Research Ltd.
5
6 Author: Heng Li <lh3@sanger.ac.uk>, Petr Danecek <pd3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include <htslib/hts.h>
30 #include <htslib/kstring.h>
31 #include <htslib/khash_str2int.h>
32 #include <khash_str2str.h>
33 #include "bam_sample.h"
34 #include "bcftools.h"
35
36
37 typedef struct
38 {
39 char *fname;
40 void *rg2idx; // hash: read group name to BCF output sample index. Maintained by bsmpl_add_readgroup
41 int default_idx; // default BCF output sample index, set only when all readgroups are treated as one sample
42 }
43 file_t;
44
45 struct _bam_smpl_t
46 {
47 kstring_t tmp;
48 file_t *files;
49 int ignore_rg, nsmpl, nfiles;
50 char **smpl; // list of BCF output sample names. Maintained by bsmpl_add_readgroup
51 void *sample_list; // hash: BAM input sample name to BCF output sample name. This is the -s/-S list
52 int sample_logic; // the -s/-S logic, 1: include, 0: exclude
53 void *rg_list; // hash: BAM/rg_id to sample name or */rg_id for global ids. This is the -G list
54 int rg_logic; // the -G logic, 1: include, 0: exclude
55 void *name2idx; // hash: BCF output sample name to BCF output sample index. Maintained by bsmpl_add_readgroup
56 };
57
bam_smpl_init(void)58 bam_smpl_t *bam_smpl_init(void)
59 {
60 bam_smpl_t *bsmpl;
61 bsmpl = (bam_smpl_t*) calloc(1, sizeof(bam_smpl_t));
62 bsmpl->name2idx = khash_str2int_init();
63 return bsmpl;
64 }
65
bam_smpl_destroy(bam_smpl_t * bsmpl)66 void bam_smpl_destroy(bam_smpl_t *bsmpl)
67 {
68 if ( !bsmpl ) return;
69 if ( bsmpl->name2idx ) khash_str2int_destroy_free(bsmpl->name2idx);
70 if ( bsmpl->sample_list ) khash_str2str_destroy_free_all(bsmpl->sample_list);
71 if ( bsmpl->rg_list ) khash_str2str_destroy_free_all(bsmpl->rg_list);
72 int i;
73 for (i=0; i<bsmpl->nfiles; i++)
74 {
75 file_t *file = &bsmpl->files[i];
76 if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
77 free(file->fname);
78 }
79 free(bsmpl->smpl);
80 free(bsmpl->files);
81 free(bsmpl->tmp.s);
82 free(bsmpl);
83 }
84
bam_smpl_ignore_readgroups(bam_smpl_t * bsmpl)85 void bam_smpl_ignore_readgroups(bam_smpl_t* bsmpl)
86 {
87 bsmpl->ignore_rg = 1;
88 }
89
bsmpl_add_readgroup(bam_smpl_t * bsmpl,file_t * file,const char * rg_id,const char * smpl_name)90 static void bsmpl_add_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char *smpl_name)
91 {
92 int ismpl = -1;
93 if ( smpl_name )
94 {
95 if ( khash_str2int_get(bsmpl->name2idx,smpl_name,&ismpl) < 0 )
96 {
97 // new sample
98 bsmpl->nsmpl++;
99 bsmpl->smpl = (char**) realloc(bsmpl->smpl,sizeof(char*)*bsmpl->nsmpl);
100 bsmpl->smpl[bsmpl->nsmpl-1] = strdup(smpl_name);
101 ismpl = khash_str2int_inc(bsmpl->name2idx,bsmpl->smpl[bsmpl->nsmpl-1]);
102 }
103 }
104 if ( !strcmp("*",rg_id) )
105 {
106 // all read groups in the bam treated as the same sample
107 file->default_idx = ismpl;
108 return;
109 }
110 if ( !file->rg2idx ) file->rg2idx = khash_str2int_init();
111 if ( khash_str2int_has_key(file->rg2idx,rg_id) ) return; // duplicate @RG:ID
112 khash_str2int_set(file->rg2idx, strdup(rg_id), ismpl);
113 }
bsmpl_keep_readgroup(bam_smpl_t * bsmpl,file_t * file,const char * rg_id,const char ** smpl_name)114 static int bsmpl_keep_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char **smpl_name)
115 {
116 char *rg_smpl = khash_str2str_get(bsmpl->rg_list,rg_id); // unique read group present in one bam only
117 if ( !rg_smpl )
118 {
119 // read group specific to this bam
120 bsmpl->tmp.l = 0;
121 ksprintf(&bsmpl->tmp,"%s\t%s",rg_id,file->fname);
122 rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
123 }
124 if ( !rg_smpl )
125 {
126 // any read group in this file?
127 bsmpl->tmp.l = 0;
128 ksprintf(&bsmpl->tmp,"*\t%s",file->fname);
129 rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
130 }
131 if ( !rg_smpl && bsmpl->rg_logic ) return 0;
132 if ( rg_smpl && !bsmpl->rg_logic ) return 0;
133
134 if ( rg_smpl && rg_smpl[0]!='\t' ) *smpl_name = rg_smpl; // rename the sample
135 return 1;
136 }
137
138 /*
139 The logic of this function is a bit complicated because we want to work
140 also with broken bams containing read groups that are not listed in the
141 header. The desired behavior is as follows:
142 - when -G is given, read groups which are not listed in the header must
143 be given explicitly using the "?" symbol in -G.
144 Otherwise:
145 - if the bam has no header, all reads in the file are assigned to a
146 single sample named after the file
147 - if there is at least one sample defined in the header, reads with no
148 read group id or with a read group id not listed in the header are
149 assigned to the first sample encountered in the header
150 */
bam_smpl_add_bam(bam_smpl_t * bsmpl,char * bam_hdr,const char * fname)151 int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
152 {
153 bsmpl->nfiles++;
154 bsmpl->files = (file_t*) realloc(bsmpl->files,bsmpl->nfiles*sizeof(file_t));
155 file_t *file = &bsmpl->files[bsmpl->nfiles-1];
156 memset(file,0,sizeof(file_t));
157 file->fname = strdup(fname);
158 file->default_idx = -1;
159
160 if ( bsmpl->ignore_rg || !bam_hdr )
161 {
162 // The option --ignore-RG is set or there is no BAM header: use the file name as the sample name
163 bsmpl_add_readgroup(bsmpl,file,"*",file->fname);
164 return bsmpl->nfiles-1;
165 }
166
167 void *bam_smpls = khash_str2int_init();
168 int first_smpl = -1, nskipped = 0;
169 const char *p = bam_hdr, *q, *r;
170 while (p != NULL && (q = strstr(p, "@RG")) != 0)
171 {
172 char *eol = strchr(q + 3, '\n');
173 if (q > bam_hdr && *(q - 1) != '\n') { // @RG must be at start of line
174 p = eol;
175 continue;
176 }
177 p = q + 3;
178 if ((q = strstr(p, "\tID:")) != 0) q += 4;
179 if ((r = strstr(p, "\tSM:")) != 0) r += 4;
180 if (r && q)
181 {
182 char *u, *v;
183 int ioq, ior;
184 for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
185 for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
186 ioq = *u; ior = *v; *u = *v = '\0';
187
188 // q now points to a null terminated read group id
189 // r points to a null terminated sample name
190 if ( !strcmp("*",q) || !strcmp("?",q) )
191 error("Error: the read group IDs \"*\" and \"?\" have a special meaning in the mpileup code. Please fix the code or the bam: %s\n", fname);
192
193 int accept_rg = 1;
194 if ( bsmpl->sample_list )
195 {
196 // restrict samples based on the -s/-S options
197 char *name = khash_str2str_get(bsmpl->sample_list,r);
198 if ( bsmpl->sample_logic==0 )
199 accept_rg = name ? 0 : 1;
200 else if ( !name )
201 accept_rg = 0;
202 else
203 r = name;
204 }
205 if ( accept_rg && bsmpl->rg_list )
206 {
207 // restrict readgroups based on the -G option, possibly renaming the sample
208 accept_rg = bsmpl_keep_readgroup(bsmpl,file,q,&r);
209 }
210 if ( accept_rg )
211 bsmpl_add_readgroup(bsmpl,file,q,r);
212 else
213 {
214 bsmpl_add_readgroup(bsmpl,file,q,NULL); // ignore this RG but note that it was seen in the header
215 nskipped++;
216 }
217
218 if ( first_smpl<0 )
219 khash_str2int_get(bsmpl->name2idx,r,&first_smpl);
220 if ( !khash_str2int_has_key(bam_smpls,r) )
221 khash_str2int_inc(bam_smpls,strdup(r));
222
223 *u = ioq; *v = ior;
224 }
225 else
226 break;
227 p = eol;
228 }
229 int nsmpls = khash_str2int_size(bam_smpls);
230 khash_str2int_destroy_free(bam_smpls);
231
232 const char *smpl_name = NULL;
233 int accept_null_rg = 1;
234 if ( bsmpl->rg_list && !bsmpl_keep_readgroup(bsmpl,file,"?",&smpl_name) ) accept_null_rg = 0;
235 if ( bsmpl->sample_list && first_smpl==-1 ) accept_null_rg = 0;
236
237 if ( !accept_null_rg && first_smpl==-1 )
238 {
239 // no suitable read group is available in this bam: ignore the whole file.
240 free(file->fname);
241 if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
242 bsmpl->nfiles--;
243 return -1;
244 }
245 if ( !accept_null_rg ) return bsmpl->nfiles-1;
246 if ( nsmpls==1 && !nskipped )
247 {
248 file->default_idx = first_smpl;
249 return bsmpl->nfiles-1;
250 }
251 if ( !smpl_name ) smpl_name = first_smpl==-1 ? file->fname : bsmpl->smpl[first_smpl];
252
253 bsmpl_add_readgroup(bsmpl,file,"?",smpl_name);
254 return bsmpl->nfiles-1;
255 }
256
bam_smpl_get_samples(bam_smpl_t * bsmpl,int * nsmpl)257 const char **bam_smpl_get_samples(bam_smpl_t *bsmpl, int *nsmpl)
258 {
259 *nsmpl = bsmpl->nsmpl;
260 return (const char**)bsmpl->smpl;
261 }
262
bam_smpl_get_sample_id(bam_smpl_t * bsmpl,int bam_id,bam1_t * bam_rec)263 int bam_smpl_get_sample_id(bam_smpl_t *bsmpl, int bam_id, bam1_t *bam_rec)
264 {
265 file_t *file = &bsmpl->files[bam_id];
266 if ( file->default_idx >= 0 ) return file->default_idx;
267
268 char *aux_rg = (char*) bam_aux_get(bam_rec, "RG");
269 aux_rg = aux_rg ? aux_rg+1 : "?";
270
271 int rg_id;
272 if ( khash_str2int_get(file->rg2idx, aux_rg, &rg_id)==0 ) return rg_id;
273 if ( khash_str2int_get(file->rg2idx, "?", &rg_id)==0 ) return rg_id;
274 return -1;
275 }
276
bam_smpl_add_samples(bam_smpl_t * bsmpl,char * list,int is_file)277 int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file)
278 {
279 if ( list[0]!='^' ) bsmpl->sample_logic = 1;
280 else list++;
281
282 int i, nsamples = 0;
283 char **samples = hts_readlist(list, is_file, &nsamples);
284 if ( !nsamples ) return 0;
285
286 kstring_t ori = {0,0,0};
287 kstring_t ren = {0,0,0};
288
289 bsmpl->sample_list = khash_str2str_init();
290 for (i=0; i<nsamples; i++)
291 {
292 char *ptr = samples[i];
293 ori.l = ren.l = 0;
294 int escaped = 0;
295 while ( *ptr )
296 {
297 if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
298 if ( isspace(*ptr) && !escaped ) break;
299 kputc(*ptr, &ori);
300 escaped = 0;
301 ptr++;
302 }
303 if ( *ptr )
304 {
305 while ( *ptr && isspace(*ptr) ) ptr++;
306 while ( *ptr )
307 {
308 if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
309 if ( isspace(*ptr) && !escaped ) break;
310 kputc(*ptr, &ren);
311 escaped = 0;
312 ptr++;
313 }
314 }
315 khash_str2str_set(bsmpl->sample_list,strdup(ori.s),strdup(ren.l?ren.s:ori.s));
316 free(samples[i]);
317 }
318 free(samples);
319 free(ori.s);
320 free(ren.s);
321 return nsamples;
322 }
323
bam_smpl_add_readgroups(bam_smpl_t * bsmpl,char * list,int is_file)324 int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file)
325 {
326 if ( list[0]!='^' ) bsmpl->rg_logic = 1;
327 else list++;
328
329 int i, nrows = 0;
330 char **rows = hts_readlist(list, is_file, &nrows);
331 if ( !nrows ) return 0;
332
333 kstring_t fld1 = {0,0,0};
334 kstring_t fld2 = {0,0,0};
335 kstring_t fld3 = {0,0,0};
336
337 bsmpl->rg_list = khash_str2str_init();
338 for (i=0; i<nrows; i++)
339 {
340 char *ptr = rows[i];
341 fld1.l = fld2.l = fld3.l = 0;
342 int escaped = 0;
343 while ( *ptr )
344 {
345 if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
346 if ( isspace(*ptr) && !escaped ) break;
347 kputc(*ptr, &fld1);
348 escaped = 0;
349 ptr++;
350 }
351 if ( *ptr )
352 {
353 while ( *ptr && isspace(*ptr) ) ptr++;
354 while ( *ptr )
355 {
356 if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
357 if ( isspace(*ptr) && !escaped ) break;
358 kputc(*ptr, &fld2);
359 escaped = 0;
360 ptr++;
361 }
362 }
363 if ( *ptr )
364 {
365 while ( *ptr && isspace(*ptr) ) ptr++;
366 while ( *ptr )
367 {
368 if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
369 if ( isspace(*ptr) && !escaped ) break;
370 kputc(*ptr, &fld3);
371 escaped = 0;
372 ptr++;
373 }
374 }
375 if ( fld3.l )
376 {
377 // ID FILE SAMPLE
378 kputc('\t',&fld1);
379 kputs(fld2.s,&fld1);
380 fld2.l = 0;
381 kputs(fld3.s,&fld2);
382 }
383 // fld2.s now contains a new sample name. If NULL, use \t to keep the bam header name
384 char *value = khash_str2str_get(bsmpl->rg_list,fld1.s);
385 if ( !value )
386 khash_str2str_set(bsmpl->rg_list,strdup(fld1.s),strdup(fld2.l?fld2.s:"\t"));
387 else if ( strcmp(value,fld2.l?fld2.s:"\t") )
388 error("Error: The read group \"%s\" was assigned to two different samples: \"%s\" and \"%s\"\n", fld1.s,value,fld2.l?fld2.s:"\t");
389 free(rows[i]);
390 }
391 free(rows);
392 free(fld1.s);
393 free(fld2.s);
394 free(fld3.s);
395 return nrows;
396 }
397
398
399