1 /* vcfutils.c -- allele-related utility functions.
2
3 Copyright (C) 2012-2016 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
24
25 #include <config.h>
26
27 #include "htslib/vcfutils.h"
28 #include "htslib/kbitset.h"
29
bcf_calc_ac(const bcf_hdr_t * header,bcf1_t * line,int * ac,int which)30 int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
31 {
32 int i;
33 for (i=0; i<line->n_allele; i++) ac[i]=0;
34
35 // Use INFO/AC,AN field only when asked
36 if ( which&BCF_UN_INFO )
37 {
38 bcf_unpack(line, BCF_UN_INFO);
39 int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN");
40 int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC");
41 int i, an=-1, ac_len=0, ac_type=0;
42 uint8_t *ac_ptr=NULL;
43 if ( an_id>=0 && ac_id>=0 )
44 {
45 for (i=0; i<line->n_info; i++)
46 {
47 bcf_info_t *z = &line->d.info[i];
48 if ( z->key == an_id ) an = z->v1.i;
49 else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; }
50 }
51 }
52 if ( an>=0 && ac_ptr )
53 {
54 int nac = 0;
55 #define BRANCH_INT(type_t) { \
56 type_t *p = (type_t *) ac_ptr; \
57 for (i=0; i<ac_len; i++) \
58 { \
59 ac[i+1] = p[i]; \
60 nac += p[i]; \
61 } \
62 }
63 switch (ac_type) {
64 case BCF_BT_INT8: BRANCH_INT(int8_t); break;
65 case BCF_BT_INT16: BRANCH_INT(int16_t); break;
66 case BCF_BT_INT32: BRANCH_INT(int32_t); break;
67 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
68 }
69 #undef BRANCH_INT
70 if ( an<nac )
71 {
72 fprintf(stderr,"[E::%s] Incorrect AN/AC counts at %s:%d\n", __func__,header->id[BCF_DT_CTG][line->rid].key, line->pos+1);
73 exit(1);
74 }
75 ac[0] = an - nac;
76 return 1;
77 }
78 }
79
80 // Split genotype fields only when asked
81 if ( which&BCF_UN_FMT )
82 {
83 int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT");
84 if ( gt_id<0 ) return 0;
85 bcf_unpack(line, BCF_UN_FMT);
86 bcf_fmt_t *fmt_gt = NULL;
87 for (i=0; i<(int)line->n_fmt; i++)
88 if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
89 if ( !fmt_gt ) return 0;
90 #define BRANCH_INT(type_t,vector_end) { \
91 for (i=0; i<line->n_sample; i++) \
92 { \
93 type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
94 int ial; \
95 for (ial=0; ial<fmt_gt->n; ial++) \
96 { \
97 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
98 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
99 if ( p[ial]>>1 > line->n_allele ) \
100 { \
101 fprintf(stderr,"[E::%s] Incorrect allele (\"%d\") in %s at %s:%d\n", __func__,(p[ial]>>1)-1, header->samples[i],header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \
102 exit(1); \
103 } \
104 ac[(p[ial]>>1)-1]++; \
105 } \
106 } \
107 }
108 switch (fmt_gt->type) {
109 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
110 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
111 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
112 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
113 }
114 #undef BRANCH_INT
115 return 1;
116 }
117 return 0;
118 }
119
bcf_gt_type(bcf_fmt_t * fmt_ptr,int isample,int * _ial,int * _jal)120 int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal)
121 {
122 int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0;
123 #define BRANCH_INT(type_t,vector_end) { \
124 type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
125 for (i=0; i<fmt_ptr->n; i++) \
126 { \
127 if ( p[i] == vector_end ) break; /* smaller ploidy */ \
128 if ( bcf_gt_is_missing(p[i]) ) return GT_UNKN; /* missing allele */ \
129 int tmp = p[i]>>1; \
130 if ( tmp>1 ) \
131 { \
132 if ( !ial ) { ial = tmp; has_alt = 1; } \
133 else if ( tmp!=ial ) \
134 { \
135 if ( tmp<ial ) \
136 { \
137 jal = ial; \
138 ial = tmp; \
139 } \
140 else \
141 { \
142 jal = tmp; \
143 } \
144 has_alt = 2; \
145 } \
146 } \
147 else has_ref = 1; \
148 nals++; \
149 } \
150 }
151 switch (fmt_ptr->type) {
152 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
153 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
154 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
155 default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
156 }
157 #undef BRANCH_INT
158
159 if ( _ial ) *_ial = ial>0 ? ial-1 : ial;
160 if ( _jal ) *_jal = jal>0 ? jal-1 : jal;
161 if ( !nals ) return GT_UNKN;
162 if ( nals==1 )
163 return has_ref ? GT_HAPL_R : GT_HAPL_A;
164 if ( !has_ref )
165 return has_alt==1 ? GT_HOM_AA : GT_HET_AA;
166 if ( !has_alt )
167 return GT_HOM_RR;
168 return GT_HET_RA;
169 }
170
bcf_trim_alleles(const bcf_hdr_t * header,bcf1_t * line)171 int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
172 {
173 int i, ret = 0, nrm = 0;
174 kbitset_t *rm_set = NULL;
175 bcf_fmt_t *gt = bcf_get_fmt(header, line, "GT");
176 if ( !gt ) return 0;
177
178 int *ac = (int*) calloc(line->n_allele,sizeof(int));
179
180 // check if all alleles are populated
181 #define BRANCH(type_t,vector_end) { \
182 for (i=0; i<line->n_sample; i++) \
183 { \
184 type_t *p = (type_t*) (gt->p + i*gt->size); \
185 int ial; \
186 for (ial=0; ial<gt->n; ial++) \
187 { \
188 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
189 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
190 if ( (p[ial]>>1)-1 >= line->n_allele ) { \
191 if (hts_verbose>1) { fprintf(stderr, "[E::%s] allele index is out of bounds at %s:%d\n", __func__, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); } \
192 ret = -1; \
193 goto clean; \
194 } \
195 ac[(p[ial]>>1)-1]++; \
196 } \
197 } \
198 }
199 switch (gt->type) {
200 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break;
201 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
202 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
203 default: if (hts_verbose>1) { fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); } goto clean; break;
204 }
205 #undef BRANCH
206
207 rm_set = kbs_init(line->n_allele);
208 for (i=1; i<line->n_allele; i++)
209 {
210 if ( !ac[i] ) { kbs_insert(rm_set, i); nrm++; }
211 }
212
213 if ( nrm )
214 if ( bcf_remove_allele_set(header, line, rm_set) )
215 ret = -2;
216
217 clean:
218 free(ac);
219 if (rm_set) kbs_destroy(rm_set);
220 return ret ? ret : nrm;
221 }
222
bcf_remove_alleles(const bcf_hdr_t * header,bcf1_t * line,int rm_mask)223 void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
224 {
225 int i;
226 kbitset_t *rm_set = kbs_init(line->n_allele);
227 for (i=1; i<line->n_allele; i++)
228 if ( rm_mask & 1<<i ) kbs_insert(rm_set, i);
229
230 bcf_remove_allele_set(header, line, rm_set);
231 kbs_destroy(rm_set);
232 }
233
bcf_remove_allele_set(const bcf_hdr_t * header,bcf1_t * line,const struct kbitset_t * rm_set)234 int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kbitset_t *rm_set)
235 {
236 int *map = (int*) calloc(line->n_allele, sizeof(int));
237 uint8_t *dat = NULL;
238
239 // create map of indexes from old to new ALT numbering and modify ALT
240 kstring_t str = {0,0,0};
241 kputs(line->d.allele[0], &str);
242
243 int nrm = 0, i,j; // i: ori alleles, j: new alleles
244 for (i=1, j=1; i<line->n_allele; i++)
245 {
246 if ( kbs_exists(rm_set, i) )
247 {
248 // remove this allele
249 line->d.allele[i] = NULL;
250 nrm++;
251 continue;
252 }
253 kputc(',', &str);
254 kputs(line->d.allele[i], &str);
255 map[i] = j;
256 j++;
257 }
258 if ( !nrm ) goto clean;
259
260 int nR_ori = line->n_allele;
261 int nR_new = line->n_allele-nrm;
262 if ( nR_new<=0 ) // should not be able to remove reference allele
263 {
264 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Cannot remove reference allele at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
265 bcf_seqname(header,line), line->pos+1, nR_new);
266 goto err;
267 }
268 int nA_ori = nR_ori-1;
269 int nA_new = nR_new-1;
270
271 int nG_ori = nR_ori*(nR_ori + 1)/2;
272 int nG_new = nR_new*(nR_new + 1)/2;
273
274 bcf_update_alleles_str(header, line, str.s);
275
276 // remove from Number=G, Number=R and Number=A INFO fields.
277 int mdat = 0, ndat = 0, mdat_bytes = 0, nret;
278 for (i=0; i<line->n_info; i++)
279 {
280 bcf_info_t *info = &line->d.info[i];
281 int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);
282
283 if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
284
285 int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
286 if ( type==BCF_HT_FLAG ) continue;
287 int size = 1;
288 if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
289
290 mdat = mdat_bytes / size;
291 nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
292 mdat_bytes = mdat * size;
293 if ( nret<0 )
294 {
295 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
296 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
297 goto err;
298 }
299 if ( nret==0 ) continue; // no data for this tag
300
301 if ( type==BCF_HT_STR )
302 {
303 str.l = 0;
304 char *ss = (char*) dat, *se = (char*) dat, s = ss[0];
305 if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
306 {
307 int nexp, inc = 0;
308 if ( vlen==BCF_VL_A )
309 {
310 nexp = nA_ori;
311 inc = 1;
312 }
313 else
314 nexp = nR_ori;
315 for (j=0; j<nexp; j++)
316 {
317 if ( !*se ) break;
318 while ( *se && *se!=',' ) se++;
319 if ( kbs_exists(rm_set, j+inc) )
320 {
321 if ( *se ) se++;
322 ss = se;
323 continue;
324 }
325 if ( str.l ) kputc(',',&str);
326 kputsn(ss,se-ss,&str);
327 if ( *se ) se++;
328 ss = se;
329 }
330 if ( j==1 && s == '.' ) continue; // missing
331 if ( j!=nexp )
332 {
333 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in INFO/%s at %s:%d; expected Number=%c=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
334 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, j);
335 goto err;
336 }
337 }
338 else // Number=G, assuming diploid genotype
339 {
340 int k = 0, n = 0;
341 for (j=0; j<nR_ori; j++)
342 {
343 for (k=0; k<=j; k++)
344 {
345 if ( !*se ) break;
346 while ( *se && *se!=',' ) se++;
347 n++;
348 if ( kbs_exists(rm_set, j) || kbs_exists(rm_set, k) )
349 {
350 if ( *se ) se++;
351 ss = se;
352 continue;
353 }
354 if ( str.l ) kputc(',',&str);
355 kputsn(ss,se-ss,&str);
356 if ( *se ) se++;
357 ss = se;
358 }
359 if ( !*se ) break;
360 }
361 if ( n==1 && s == '.' ) continue; // missing
362 if ( n!=nG_ori )
363 {
364 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in INFO/%s at %s:%d; expected Number=G=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
365 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, n);
366 goto err;
367 }
368 }
369
370 nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
371 if ( nret<0 )
372 {
373 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
374 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
375 goto err;
376 }
377 continue;
378 }
379
380 if (nret==1) // could be missing - check
381 {
382 int missing = 0;
383 #define BRANCH(type_t, is_missing) { \
384 type_t *p = (type_t *) info->vptr; \
385 if ( is_missing ) missing = 1; \
386 }
387 switch (info->type) {
388 case BCF_BT_INT8: BRANCH(int8_t, p[0]==bcf_int8_missing); break;
389 case BCF_BT_INT16: BRANCH(int16_t, p[0]==bcf_int16_missing); break;
390 case BCF_BT_INT32: BRANCH(int32_t, p[0]==bcf_int32_missing); break;
391 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[0])); break;
392 default: if (hts_verbose>1) { fprintf(stderr,"todo: type %d\n", info->type); } goto err;; break;
393 }
394 #undef BRANCH
395 if (missing) continue; // could remove this INFO tag?
396 }
397
398 if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
399 {
400 int inc = 0, ntop;
401 if ( vlen==BCF_VL_A )
402 {
403 if ( nret!=nA_ori )
404 {
405 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in INFO/%s at %s:%d; expected Number=A=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
406 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nA_ori, nret);
407 goto err;
408 }
409 ntop = nA_ori;
410 ndat = nA_new;
411 inc = 1;
412 }
413 else
414 {
415 if ( nret!=nR_ori )
416 {
417 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
418 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nR_ori, nret);
419 goto err;
420 }
421 ntop = nR_ori;
422 ndat = nR_new;
423 }
424 int k = 0;
425
426 #define BRANCH(type_t,is_vector_end) \
427 { \
428 type_t *ptr = (type_t*) dat; \
429 int size = sizeof(type_t); \
430 for (j=0; j<ntop; j++) /* j:ori, k:new */ \
431 { \
432 if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
433 if ( kbs_exists(rm_set, j+inc) ) continue; \
434 if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
435 k++; \
436 } \
437 }
438 switch (type)
439 {
440 case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
441 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
442 }
443 #undef BRANCH
444 }
445 else // Number=G
446 {
447 if ( nret!=nG_ori )
448 {
449 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
450 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, nret);
451 goto err;
452 }
453 int k, l_ori = -1, l_new = 0;
454 ndat = nG_new;
455
456 #define BRANCH(type_t,is_vector_end) \
457 { \
458 type_t *ptr = (type_t*) dat; \
459 int size = sizeof(type_t); \
460 for (j=0; j<nR_ori; j++) \
461 { \
462 for (k=0; k<=j; k++) \
463 { \
464 l_ori++; \
465 if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
466 if ( kbs_exists(rm_set, j) || kbs_exists(rm_set, k) ) continue; \
467 if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
468 l_new++; \
469 } \
470 } \
471 }
472 switch (type)
473 {
474 case BCF_HT_INT: BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
475 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
476 }
477 #undef BRANCH
478 }
479
480 nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
481 if ( nret<0 )
482 {
483 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
484 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
485 goto err;
486 }
487 }
488
489 // Update GT fields, the allele indexes might have changed
490 for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
491 if ( i<line->n_allele )
492 {
493 mdat = mdat_bytes / 4; // sizeof(int32_t)
494 nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
495 mdat_bytes = mdat * 4;
496 if ( nret>0 )
497 {
498 nret /= line->n_sample;
499 int32_t *ptr = (int32_t*) dat;
500 for (i=0; i<line->n_sample; i++)
501 {
502 for (j=0; j<nret; j++)
503 {
504 if ( bcf_gt_is_missing(ptr[j]) ) continue;
505 if ( ptr[j]==bcf_int32_vector_end ) break;
506 int al = bcf_gt_allele(ptr[j]);
507 if ( !( al<nR_ori && map[al]>=0 ) )
508 {
509 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Problem updating genotypes at %s:%d [ al<nR_ori && map[al]>=0 :: al=%d,nR_ori=%d,map[al]=%d ]\n", __FILE__,__LINE__,__FUNCTION__,
510 bcf_seqname(header,line), line->pos+1, al, nR_ori, map[al]);
511 goto err;
512 }
513 ptr[j] = (map[al]+1)<<1 | (ptr[j]&1);
514 }
515 ptr += nret;
516 }
517 nret = bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
518 if ( nret<0 )
519 {
520 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not update FORMAT/GT at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
521 bcf_seqname(header,line), line->pos+1, nret);
522 goto err;
523 }
524 }
525 }
526
527 // Remove from Number=G, Number=R and Number=A FORMAT fields.
528 // Assuming haploid or diploid GTs
529 for (i=0; i<line->n_fmt; i++)
530 {
531 bcf_fmt_t *fmt = &line->d.fmt[i];
532 int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);
533
534 if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
535
536 int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
537 if ( type==BCF_HT_FLAG ) continue;
538
539 int size = 1;
540 if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
541
542 mdat = mdat_bytes / size;
543 nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
544 mdat_bytes = mdat * size;
545 if ( nret<0 )
546 {
547 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
548 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
549 goto err;
550 }
551 if ( nret == 0 ) continue; // no data for this tag
552
553 if ( type==BCF_HT_STR )
554 {
555 int size = nret/line->n_sample; // number of bytes per sample
556 str.l = 0;
557 if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
558 {
559 int nexp, inc = 0;
560 if ( vlen==BCF_VL_A )
561 {
562 nexp = nA_ori;
563 inc = 1;
564 }
565 else
566 nexp = nR_ori;
567 for (j=0; j<line->n_sample; j++)
568 {
569 char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss, s = ss[0];
570 int k_src = 0, k_dst = 0, l = str.l;
571 for (k_src=0; k_src<nexp; k_src++)
572 {
573 if ( ptr>=se || !*ptr) break;
574 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
575 if ( kbs_exists(rm_set, k_src+inc) )
576 {
577 ss = ++ptr;
578 continue;
579 }
580 if ( k_dst ) kputc(',',&str);
581 kputsn(ss,ptr-ss,&str);
582 ss = ++ptr;
583 k_dst++;
584 }
585 if ( k_src==1 && s == '.' ) continue; // missing
586 if ( k_src!=nexp )
587 {
588 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in FORMAT/%s at %s:%d; expected Number=%c=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
589 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, k_src);
590 goto err;
591 }
592 l = str.l - l;
593 for (; l<size; l++) kputc(0, &str);
594 }
595 }
596 else // Number=G, diploid or haploid
597 {
598 for (j=0; j<line->n_sample; j++)
599 {
600 char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss, s = ss[0];
601 int k_src = 0, k_dst = 0, l = str.l;
602 int nexp = 0; // diploid or haploid?
603 while ( ptr<se )
604 {
605 if ( !*ptr ) break;
606 if ( *ptr==',' ) nexp++;
607 ptr++;
608 }
609 if ( ptr!=ss ) nexp++;
610 if ( nexp==1 && s == '.' ) continue; // missing
611 if ( nexp!=nG_ori && nexp!=nR_ori )
612 {
613 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(diploid) or %d(haploid), but found %d\n", __FILE__,__LINE__,__FUNCTION__,
614 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nR_ori, nexp);
615 goto err;
616 }
617 ptr = ss;
618 if ( nexp==nG_ori ) // diploid
619 {
620 int ia, ib;
621 for (ia=0; ia<nR_ori; ia++)
622 {
623 for (ib=0; ib<=ia; ib++)
624 {
625 if ( ptr>=se || !*ptr ) break;
626 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
627 if ( kbs_exists(rm_set, ia) || kbs_exists(rm_set, ib) )
628 {
629 ss = ++ptr;
630 continue;
631 }
632 if ( k_dst ) kputc(',',&str);
633 kputsn(ss,ptr-ss,&str);
634 ss = ++ptr;
635 k_dst++;
636 }
637 if ( ptr>=se || !*ptr ) break;
638 }
639 }
640 else // haploid
641 {
642 for (k_src=0; k_src<nR_ori; k_src++)
643 {
644 if ( ptr>=se || !*ptr ) break;
645 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
646 if ( kbs_exists(rm_set, k_src) )
647 {
648 ss = ++ptr;
649 continue;
650 }
651 if ( k_dst ) kputc(',',&str);
652 kputsn(ss,ptr-ss,&str);
653 ss = ++ptr;
654 k_dst++;
655 }
656 if ( k_src!=nR_ori )
657 {
658 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(haploid), but found %d\n", __FILE__,__LINE__,__FUNCTION__,
659 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, k_src);
660 goto err;
661 }
662 l = str.l - l;
663 for (; l<size; l++) kputc(0, &str);
664 }
665 }
666 }
667 nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
668 if ( nret<0 )
669 {
670 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
671 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
672 goto err;
673 }
674 continue;
675 }
676
677 int nori = nret / line->n_sample;
678 if ( nori==1 && !(vlen==BCF_VL_A && nori==nA_ori) ) // all values may be missing - check
679 {
680 int all_missing = 1;
681 #define BRANCH(type_t, is_missing) { \
682 for (j=0; j<line->n_sample; j++) \
683 { \
684 type_t *p = (type_t*) (fmt->p + j*fmt->size); \
685 if ( !(is_missing)) { all_missing = 0; break; } \
686 } \
687 }
688 switch (fmt->type) {
689 case BCF_BT_INT8: BRANCH(int8_t, p[0]==bcf_int8_missing); break;
690 case BCF_BT_INT16: BRANCH(int16_t, p[0]==bcf_int16_missing); break;
691 case BCF_BT_INT32: BRANCH(int32_t, p[0]==bcf_int32_missing); break;
692 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[0])); break;
693 default: if (hts_verbose>1) { fprintf(stderr,"todo: type %d\n", fmt->type); } goto err; break;
694 }
695 #undef BRANCH
696 if (all_missing) continue; // could remove this FORMAT tag?
697 }
698
699 if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
700 {
701 int inc = 0, nnew;
702 if ( vlen==BCF_VL_A )
703 {
704 if ( nori!=nA_ori )
705 {
706 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in FORMAT/%s at %s:%d; expected Number=A=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
707 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nA_ori, nori);
708 goto err;
709 }
710 ndat = nA_new*line->n_sample;
711 nnew = nA_new;
712 inc = 1;
713 }
714 else
715 {
716 if ( nori!=nR_ori )
717 {
718 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in FORMAT/%s at %s:%d; expected Number=R=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
719 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, nori);
720 goto err;
721 }
722 ndat = nR_new*line->n_sample;
723 nnew = nR_new;
724 }
725
726 #define BRANCH(type_t,is_vector_end) \
727 { \
728 for (j=0; j<line->n_sample; j++) \
729 { \
730 type_t *ptr_src = ((type_t*)dat) + j*nori; \
731 type_t *ptr_dst = ((type_t*)dat) + j*nnew; \
732 int size = sizeof(type_t); \
733 int k_src, k_dst = 0; \
734 for (k_src=0; k_src<nori; k_src++) \
735 { \
736 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
737 if ( kbs_exists(rm_set, k_src+inc) ) continue; \
738 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
739 k_dst++; \
740 } \
741 } \
742 }
743 switch (type)
744 {
745 case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
746 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
747 }
748 #undef BRANCH
749 }
750 else // Number=G, diploid or mixture of haploid+diploid
751 {
752 if ( nori!=nG_ori )
753 {
754 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
755 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nori);
756 goto err;
757 }
758 ndat = nG_new*line->n_sample;
759
760 #define BRANCH(type_t,is_vector_end) \
761 { \
762 for (j=0; j<line->n_sample; j++) \
763 { \
764 type_t *ptr_src = ((type_t*)dat) + j*nori; \
765 type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
766 int size = sizeof(type_t); \
767 int ia, ib, k_dst = 0, k_src; \
768 int nset = 0; /* haploid or diploid? */ \
769 for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
770 if ( nset==nR_ori ) /* haploid */ \
771 { \
772 for (k_src=0; k_src<nR_ori; k_src++) \
773 { \
774 if ( kbs_exists(rm_set, k_src) ) continue; \
775 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
776 k_dst++; \
777 } \
778 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
779 } \
780 else /* diploid */ \
781 { \
782 k_src = -1; \
783 for (ia=0; ia<nR_ori; ia++) \
784 { \
785 for (ib=0; ib<=ia; ib++) \
786 { \
787 k_src++; \
788 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
789 if ( kbs_exists(rm_set, ia) || kbs_exists(rm_set, ib) ) continue; \
790 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
791 k_dst++; \
792 } \
793 } \
794 } \
795 } \
796 }
797 switch (type)
798 {
799 case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
800 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
801 }
802 #undef BRANCH
803 }
804 nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
805 if ( nret<0 )
806 {
807 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
808 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
809 goto err;
810 }
811 }
812
813 clean:
814 free(str.s);
815 free(map);
816 free(dat);
817 return 0;
818
819 err:
820 free(str.s);
821 free(map);
822 free(dat);
823 return -1;
824 }
825
826