1 /*  vcfutils.c -- allele-related utility functions.
2 
3     Copyright (C) 2012-2016 Genome Research Ltd.
4 
5     Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE.  */
24 
25 #include <config.h>
26 
27 #include "htslib/vcfutils.h"
28 #include "htslib/kbitset.h"
29 
bcf_calc_ac(const bcf_hdr_t * header,bcf1_t * line,int * ac,int which)30 int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
31 {
32     int i;
33     for (i=0; i<line->n_allele; i++) ac[i]=0;
34 
35     // Use INFO/AC,AN field only when asked
36     if ( which&BCF_UN_INFO )
37     {
38         bcf_unpack(line, BCF_UN_INFO);
39         int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN");
40         int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC");
41         int i, an=-1, ac_len=0, ac_type=0;
42         uint8_t *ac_ptr=NULL;
43         if ( an_id>=0 && ac_id>=0 )
44         {
45             for (i=0; i<line->n_info; i++)
46             {
47                 bcf_info_t *z = &line->d.info[i];
48                 if ( z->key == an_id ) an = z->v1.i;
49                 else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; }
50             }
51         }
52         if ( an>=0 && ac_ptr )
53         {
54             int nac = 0;
55             #define BRANCH_INT(type_t) {        \
56                 type_t *p = (type_t *) ac_ptr;  \
57                 for (i=0; i<ac_len; i++)        \
58                 {                               \
59                     ac[i+1] = p[i];             \
60                     nac += p[i];                \
61                 }                               \
62             }
63             switch (ac_type) {
64                 case BCF_BT_INT8:  BRANCH_INT(int8_t); break;
65                 case BCF_BT_INT16: BRANCH_INT(int16_t); break;
66                 case BCF_BT_INT32: BRANCH_INT(int32_t); break;
67                 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
68             }
69             #undef BRANCH_INT
70             if ( an<nac )
71             {
72                 fprintf(stderr,"[E::%s] Incorrect AN/AC counts at %s:%d\n", __func__,header->id[BCF_DT_CTG][line->rid].key, line->pos+1);
73                 exit(1);
74             }
75             ac[0] = an - nac;
76             return 1;
77         }
78     }
79 
80     // Split genotype fields only when asked
81     if ( which&BCF_UN_FMT )
82     {
83         int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT");
84         if ( gt_id<0 ) return 0;
85         bcf_unpack(line, BCF_UN_FMT);
86         bcf_fmt_t *fmt_gt = NULL;
87         for (i=0; i<(int)line->n_fmt; i++)
88             if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
89         if ( !fmt_gt ) return 0;
90         #define BRANCH_INT(type_t,vector_end) { \
91             for (i=0; i<line->n_sample; i++) \
92             { \
93                 type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
94                 int ial; \
95                 for (ial=0; ial<fmt_gt->n; ial++) \
96                 { \
97                     if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
98                     if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
99                     if ( p[ial]>>1 > line->n_allele ) \
100                     { \
101                         fprintf(stderr,"[E::%s] Incorrect allele (\"%d\") in %s at %s:%d\n", __func__,(p[ial]>>1)-1, header->samples[i],header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \
102                         exit(1); \
103                     } \
104                     ac[(p[ial]>>1)-1]++; \
105                 } \
106             } \
107         }
108         switch (fmt_gt->type) {
109             case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
110             case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
111             case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
112             default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
113         }
114         #undef BRANCH_INT
115         return 1;
116     }
117     return 0;
118 }
119 
bcf_gt_type(bcf_fmt_t * fmt_ptr,int isample,int * _ial,int * _jal)120 int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal)
121 {
122     int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0;
123     #define BRANCH_INT(type_t,vector_end) { \
124         type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
125         for (i=0; i<fmt_ptr->n; i++) \
126         { \
127             if ( p[i] == vector_end ) break; /* smaller ploidy */ \
128             if ( bcf_gt_is_missing(p[i]) ) return GT_UNKN; /* missing allele */ \
129             int tmp = p[i]>>1; \
130             if ( tmp>1 ) \
131             { \
132                 if ( !ial ) { ial = tmp; has_alt = 1; } \
133                 else if ( tmp!=ial ) \
134                 { \
135                     if ( tmp<ial ) \
136                     { \
137                         jal = ial; \
138                         ial = tmp; \
139                     } \
140                     else \
141                     { \
142                         jal = tmp; \
143                     } \
144                     has_alt = 2; \
145                 } \
146             } \
147             else has_ref = 1; \
148             nals++; \
149         } \
150     }
151     switch (fmt_ptr->type) {
152         case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
153         case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
154         case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
155         default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
156     }
157     #undef BRANCH_INT
158 
159     if ( _ial ) *_ial = ial>0 ? ial-1 : ial;
160     if ( _jal ) *_jal = jal>0 ? jal-1 : jal;
161     if ( !nals ) return GT_UNKN;
162     if ( nals==1 )
163         return has_ref ? GT_HAPL_R : GT_HAPL_A;
164     if ( !has_ref )
165         return has_alt==1 ? GT_HOM_AA : GT_HET_AA;
166     if ( !has_alt )
167         return GT_HOM_RR;
168     return GT_HET_RA;
169 }
170 
bcf_trim_alleles(const bcf_hdr_t * header,bcf1_t * line)171 int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
172 {
173     int i, ret = 0, nrm = 0;
174     kbitset_t *rm_set = NULL;
175     bcf_fmt_t *gt = bcf_get_fmt(header, line, "GT");
176     if ( !gt ) return 0;
177 
178     int *ac = (int*) calloc(line->n_allele,sizeof(int));
179 
180     // check if all alleles are populated
181     #define BRANCH(type_t,vector_end) { \
182         for (i=0; i<line->n_sample; i++) \
183         { \
184             type_t *p = (type_t*) (gt->p + i*gt->size); \
185             int ial; \
186             for (ial=0; ial<gt->n; ial++) \
187             { \
188                 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
189                 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
190                 if ( (p[ial]>>1)-1 >= line->n_allele ) { \
191                     if (hts_verbose>1) { fprintf(stderr, "[E::%s] allele index is out of bounds at %s:%d\n", __func__, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); } \
192                     ret = -1; \
193                     goto clean; \
194                 } \
195                 ac[(p[ial]>>1)-1]++; \
196             } \
197         } \
198     }
199     switch (gt->type) {
200         case BCF_BT_INT8:  BRANCH(int8_t,  bcf_int8_vector_end); break;
201         case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
202         case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
203         default: if (hts_verbose>1) { fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); } goto clean; break;
204     }
205     #undef BRANCH
206 
207     rm_set = kbs_init(line->n_allele);
208     for (i=1; i<line->n_allele; i++)
209     {
210         if ( !ac[i] ) { kbs_insert(rm_set, i); nrm++; }
211     }
212 
213     if ( nrm )
214         if ( bcf_remove_allele_set(header, line, rm_set) )
215             ret = -2;
216 
217 clean:
218     free(ac);
219     if (rm_set) kbs_destroy(rm_set);
220     return ret ? ret : nrm;
221 }
222 
bcf_remove_alleles(const bcf_hdr_t * header,bcf1_t * line,int rm_mask)223 void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
224 {
225     int i;
226     kbitset_t *rm_set = kbs_init(line->n_allele);
227     for (i=1; i<line->n_allele; i++)
228         if ( rm_mask & 1<<i ) kbs_insert(rm_set, i);
229 
230     bcf_remove_allele_set(header, line, rm_set);
231     kbs_destroy(rm_set);
232 }
233 
bcf_remove_allele_set(const bcf_hdr_t * header,bcf1_t * line,const struct kbitset_t * rm_set)234 int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kbitset_t *rm_set)
235 {
236     int *map = (int*) calloc(line->n_allele, sizeof(int));
237     uint8_t *dat = NULL;
238 
239     // create map of indexes from old to new ALT numbering and modify ALT
240     kstring_t str = {0,0,0};
241     kputs(line->d.allele[0], &str);
242 
243     int nrm = 0, i,j;  // i: ori alleles, j: new alleles
244     for (i=1, j=1; i<line->n_allele; i++)
245     {
246         if ( kbs_exists(rm_set, i) )
247         {
248             // remove this allele
249             line->d.allele[i] = NULL;
250             nrm++;
251             continue;
252         }
253         kputc(',', &str);
254         kputs(line->d.allele[i], &str);
255         map[i] = j;
256         j++;
257     }
258     if ( !nrm ) goto clean;
259 
260     int nR_ori = line->n_allele;
261     int nR_new = line->n_allele-nrm;
262     if ( nR_new<=0 ) // should not be able to remove reference allele
263     {
264         if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Cannot remove reference allele at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
265             bcf_seqname(header,line), line->pos+1, nR_new);
266         goto err;
267     }
268     int nA_ori = nR_ori-1;
269     int nA_new = nR_new-1;
270 
271     int nG_ori = nR_ori*(nR_ori + 1)/2;
272     int nG_new = nR_new*(nR_new + 1)/2;
273 
274     bcf_update_alleles_str(header, line, str.s);
275 
276     // remove from Number=G, Number=R and Number=A INFO fields.
277     int mdat = 0, ndat = 0, mdat_bytes = 0, nret;
278     for (i=0; i<line->n_info; i++)
279     {
280         bcf_info_t *info = &line->d.info[i];
281         int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);
282 
283         if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
284 
285         int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
286         if ( type==BCF_HT_FLAG ) continue;
287         int size = 1;
288         if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
289 
290         mdat = mdat_bytes / size;
291         nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
292         mdat_bytes = mdat * size;
293         if ( nret<0 )
294         {
295             if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
296                 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
297             goto err;
298         }
299         if ( nret==0 ) continue; // no data for this tag
300 
301         if ( type==BCF_HT_STR )
302         {
303             str.l = 0;
304             char *ss = (char*) dat, *se = (char*) dat, s = ss[0];
305             if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
306             {
307                 int nexp, inc = 0;
308                 if ( vlen==BCF_VL_A )
309                 {
310                     nexp = nA_ori;
311                     inc  = 1;
312                 }
313                 else
314                     nexp = nR_ori;
315                 for (j=0; j<nexp; j++)
316                 {
317                     if ( !*se ) break;
318                     while ( *se && *se!=',' ) se++;
319                     if ( kbs_exists(rm_set, j+inc) )
320                     {
321                         if ( *se ) se++;
322                         ss = se;
323                         continue;
324                     }
325                     if ( str.l ) kputc(',',&str);
326                     kputsn(ss,se-ss,&str);
327                     if ( *se ) se++;
328                     ss = se;
329                 }
330                 if ( j==1 && s == '.' ) continue; // missing
331                 if ( j!=nexp )
332                 {
333                     if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in INFO/%s at %s:%d; expected Number=%c=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
334                         bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, j);
335                     goto err;
336                 }
337             }
338             else    // Number=G, assuming diploid genotype
339             {
340                 int k = 0, n = 0;
341                 for (j=0; j<nR_ori; j++)
342                 {
343                     for (k=0; k<=j; k++)
344                     {
345                         if ( !*se ) break;
346                         while ( *se && *se!=',' ) se++;
347                         n++;
348                         if ( kbs_exists(rm_set, j) || kbs_exists(rm_set, k) )
349                         {
350                             if ( *se ) se++;
351                             ss = se;
352                             continue;
353                         }
354                         if ( str.l ) kputc(',',&str);
355                         kputsn(ss,se-ss,&str);
356                         if ( *se ) se++;
357                         ss = se;
358                     }
359                     if ( !*se ) break;
360                 }
361                 if ( n==1 && s == '.' ) continue; // missing
362                 if ( n!=nG_ori )
363                 {
364                     if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in INFO/%s at %s:%d; expected Number=G=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
365                         bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, n);
366                     goto err;
367                 }
368             }
369 
370             nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
371             if ( nret<0 )
372             {
373                 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
374                         bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
375                 goto err;
376             }
377             continue;
378         }
379 
380         if (nret==1) // could be missing - check
381         {
382             int missing = 0;
383             #define BRANCH(type_t, is_missing) { \
384                 type_t *p = (type_t *) info->vptr; \
385                 if ( is_missing ) missing = 1; \
386             }
387             switch (info->type) {
388                 case BCF_BT_INT8:  BRANCH(int8_t,  p[0]==bcf_int8_missing); break;
389                 case BCF_BT_INT16: BRANCH(int16_t, p[0]==bcf_int16_missing); break;
390                 case BCF_BT_INT32: BRANCH(int32_t, p[0]==bcf_int32_missing); break;
391                 case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(p[0])); break;
392                 default: if (hts_verbose>1) { fprintf(stderr,"todo: type %d\n", info->type); } goto err;; break;
393             }
394             #undef BRANCH
395             if (missing) continue; // could remove this INFO tag?
396         }
397 
398         if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
399         {
400             int inc = 0, ntop;
401             if ( vlen==BCF_VL_A )
402             {
403                 if ( nret!=nA_ori )
404                 {
405                     if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in INFO/%s at %s:%d; expected Number=A=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
406                         bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nA_ori, nret);
407                     goto err;
408                 }
409                 ntop = nA_ori;
410                 ndat = nA_new;
411                 inc  = 1;
412             }
413             else
414             {
415                 if ( nret!=nR_ori )
416                 {
417                     if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
418                         bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nR_ori, nret);
419                     goto err;
420                 }
421                 ntop = nR_ori;
422                 ndat = nR_new;
423             }
424             int k = 0;
425 
426             #define BRANCH(type_t,is_vector_end) \
427             { \
428                 type_t *ptr = (type_t*) dat; \
429                 int size = sizeof(type_t); \
430                 for (j=0; j<ntop; j++) /* j:ori, k:new */ \
431                 { \
432                     if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
433                     if ( kbs_exists(rm_set, j+inc) ) continue; \
434                     if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
435                     k++; \
436                 } \
437             }
438             switch (type)
439             {
440                 case BCF_HT_INT:  BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
441                 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
442             }
443             #undef BRANCH
444         }
445         else    // Number=G
446         {
447             if ( nret!=nG_ori )
448             {
449                 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
450                     bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, nret);
451                 goto err;
452             }
453             int k, l_ori = -1, l_new = 0;
454             ndat = nG_new;
455 
456             #define BRANCH(type_t,is_vector_end) \
457             { \
458                 type_t *ptr = (type_t*) dat; \
459                 int size = sizeof(type_t); \
460                 for (j=0; j<nR_ori; j++) \
461                 { \
462                     for (k=0; k<=j; k++) \
463                     { \
464                         l_ori++; \
465                         if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
466                         if ( kbs_exists(rm_set, j) || kbs_exists(rm_set, k) ) continue; \
467                         if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
468                         l_new++; \
469                     } \
470                 } \
471             }
472             switch (type)
473             {
474                 case BCF_HT_INT:  BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
475                 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
476             }
477             #undef BRANCH
478         }
479 
480         nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
481         if ( nret<0 )
482         {
483             if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
484                     bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
485             goto err;
486         }
487     }
488 
489     // Update GT fields, the allele indexes might have changed
490     for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
491     if ( i<line->n_allele )
492     {
493         mdat = mdat_bytes / 4;  // sizeof(int32_t)
494         nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
495         mdat_bytes = mdat * 4;
496         if ( nret>0 )
497         {
498             nret /= line->n_sample;
499             int32_t *ptr = (int32_t*) dat;
500             for (i=0; i<line->n_sample; i++)
501             {
502                 for (j=0; j<nret; j++)
503                 {
504                     if ( bcf_gt_is_missing(ptr[j]) ) continue;
505                     if ( ptr[j]==bcf_int32_vector_end ) break;
506                     int al = bcf_gt_allele(ptr[j]);
507                     if ( !( al<nR_ori && map[al]>=0 ) )
508                     {
509                         if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Problem updating genotypes at %s:%d [ al<nR_ori && map[al]>=0 :: al=%d,nR_ori=%d,map[al]=%d ]\n", __FILE__,__LINE__,__FUNCTION__,
510                             bcf_seqname(header,line), line->pos+1, al, nR_ori, map[al]);
511                         goto err;
512                     }
513                     ptr[j] = (map[al]+1)<<1 | (ptr[j]&1);
514                 }
515                 ptr += nret;
516             }
517             nret = bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
518             if ( nret<0 )
519             {
520                 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not update FORMAT/GT at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
521                         bcf_seqname(header,line), line->pos+1, nret);
522                 goto err;
523             }
524         }
525     }
526 
527     // Remove from Number=G, Number=R and Number=A FORMAT fields.
528     // Assuming haploid or diploid GTs
529     for (i=0; i<line->n_fmt; i++)
530     {
531         bcf_fmt_t *fmt = &line->d.fmt[i];
532         int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);
533 
534         if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
535 
536         int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
537         if ( type==BCF_HT_FLAG ) continue;
538 
539         int size = 1;
540         if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
541 
542         mdat = mdat_bytes / size;
543         nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
544         mdat_bytes = mdat * size;
545         if ( nret<0 )
546         {
547             if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
548                     bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
549             goto err;
550         }
551         if ( nret == 0 ) continue; // no data for this tag
552 
553         if ( type==BCF_HT_STR )
554         {
555             int size = nret/line->n_sample;     // number of bytes per sample
556             str.l = 0;
557             if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
558             {
559                 int nexp, inc = 0;
560                 if ( vlen==BCF_VL_A )
561                 {
562                     nexp = nA_ori;
563                     inc  = 1;
564                 }
565                 else
566                     nexp = nR_ori;
567                 for (j=0; j<line->n_sample; j++)
568                 {
569                     char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss, s = ss[0];
570                     int k_src = 0, k_dst = 0, l = str.l;
571                     for (k_src=0; k_src<nexp; k_src++)
572                     {
573                         if ( ptr>=se || !*ptr) break;
574                         while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
575                         if ( kbs_exists(rm_set, k_src+inc) )
576                         {
577                             ss = ++ptr;
578                             continue;
579                         }
580                         if ( k_dst ) kputc(',',&str);
581                         kputsn(ss,ptr-ss,&str);
582                         ss = ++ptr;
583                         k_dst++;
584                     }
585                     if ( k_src==1 && s == '.' ) continue; // missing
586                     if ( k_src!=nexp )
587                     {
588                         if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in FORMAT/%s at %s:%d; expected Number=%c=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
589                             bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, k_src);
590                         goto err;
591                     }
592                     l = str.l - l;
593                     for (; l<size; l++) kputc(0, &str);
594                 }
595             }
596             else    // Number=G, diploid or haploid
597             {
598                 for (j=0; j<line->n_sample; j++)
599                 {
600                     char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss, s = ss[0];
601                     int k_src = 0, k_dst = 0, l = str.l;
602                     int nexp = 0; // diploid or haploid?
603                     while ( ptr<se )
604                     {
605                         if ( !*ptr ) break;
606                         if ( *ptr==',' ) nexp++;
607                         ptr++;
608                     }
609                     if ( ptr!=ss ) nexp++;
610                     if ( nexp==1 && s == '.' ) continue; // missing
611                     if ( nexp!=nG_ori && nexp!=nR_ori )
612                     {
613                         if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(diploid) or %d(haploid), but found %d\n", __FILE__,__LINE__,__FUNCTION__,
614                             bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nR_ori, nexp);
615                         goto err;
616                     }
617                     ptr = ss;
618                     if ( nexp==nG_ori ) // diploid
619                     {
620                         int ia, ib;
621                         for (ia=0; ia<nR_ori; ia++)
622                         {
623                             for (ib=0; ib<=ia; ib++)
624                             {
625                                 if ( ptr>=se || !*ptr ) break;
626                                 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
627                                 if ( kbs_exists(rm_set, ia) || kbs_exists(rm_set, ib) )
628                                 {
629                                     ss = ++ptr;
630                                     continue;
631                                 }
632                                 if ( k_dst ) kputc(',',&str);
633                                 kputsn(ss,ptr-ss,&str);
634                                 ss = ++ptr;
635                                 k_dst++;
636                             }
637                             if ( ptr>=se || !*ptr ) break;
638                         }
639                     }
640                     else    // haploid
641                     {
642                         for (k_src=0; k_src<nR_ori; k_src++)
643                         {
644                             if ( ptr>=se || !*ptr ) break;
645                             while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
646                             if ( kbs_exists(rm_set, k_src) )
647                             {
648                                 ss = ++ptr;
649                                 continue;
650                             }
651                             if ( k_dst ) kputc(',',&str);
652                             kputsn(ss,ptr-ss,&str);
653                             ss = ++ptr;
654                             k_dst++;
655                         }
656                         if ( k_src!=nR_ori )
657                         {
658                             if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(haploid), but found %d\n", __FILE__,__LINE__,__FUNCTION__,
659                                 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, k_src);
660                             goto err;
661                         }
662                         l = str.l - l;
663                         for (; l<size; l++) kputc(0, &str);
664                     }
665                 }
666             }
667             nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
668             if ( nret<0 )
669             {
670                 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
671                         bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
672                 goto err;
673             }
674             continue;
675         }
676 
677         int nori = nret / line->n_sample;
678         if ( nori==1 && !(vlen==BCF_VL_A && nori==nA_ori) ) // all values may be missing - check
679         {
680             int all_missing = 1;
681             #define BRANCH(type_t, is_missing) { \
682                 for (j=0; j<line->n_sample; j++) \
683                 { \
684                     type_t *p = (type_t*) (fmt->p + j*fmt->size); \
685                     if ( !(is_missing)) { all_missing = 0; break; } \
686                 } \
687             }
688             switch (fmt->type) {
689                 case BCF_BT_INT8:  BRANCH(int8_t,  p[0]==bcf_int8_missing); break;
690                 case BCF_BT_INT16: BRANCH(int16_t, p[0]==bcf_int16_missing); break;
691                 case BCF_BT_INT32: BRANCH(int32_t, p[0]==bcf_int32_missing); break;
692                 case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(p[0])); break;
693                 default: if (hts_verbose>1) { fprintf(stderr,"todo: type %d\n", fmt->type); } goto err; break;
694             }
695             #undef BRANCH
696             if (all_missing) continue; // could remove this FORMAT tag?
697         }
698 
699         if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
700         {
701             int inc = 0, nnew;
702             if ( vlen==BCF_VL_A )
703             {
704                 if ( nori!=nA_ori )
705                 {
706                     if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in FORMAT/%s at %s:%d; expected Number=A=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
707                         bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nA_ori, nori);
708                     goto err;
709                 }
710                 ndat = nA_new*line->n_sample;
711                 nnew = nA_new;
712                 inc  = 1;
713             }
714             else
715             {
716                 if ( nori!=nR_ori )
717                 {
718                     if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in FORMAT/%s at %s:%d; expected Number=R=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
719                         bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, nori);
720                     goto err;
721                 }
722                 ndat = nR_new*line->n_sample;
723                 nnew = nR_new;
724             }
725 
726             #define BRANCH(type_t,is_vector_end) \
727             { \
728                 for (j=0; j<line->n_sample; j++) \
729                 { \
730                     type_t *ptr_src = ((type_t*)dat) + j*nori; \
731                     type_t *ptr_dst = ((type_t*)dat) + j*nnew; \
732                     int size = sizeof(type_t); \
733                     int k_src, k_dst = 0; \
734                     for (k_src=0; k_src<nori; k_src++) \
735                     { \
736                         if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
737                         if ( kbs_exists(rm_set, k_src+inc) ) continue; \
738                         memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
739                         k_dst++; \
740                     } \
741                 } \
742             }
743             switch (type)
744             {
745                 case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
746                 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
747             }
748             #undef BRANCH
749         }
750         else    // Number=G, diploid or mixture of haploid+diploid
751         {
752             if ( nori!=nG_ori )
753             {
754                 if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d, but found %d\n", __FILE__,__LINE__,__FUNCTION__,
755                     bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nori);
756                 goto err;
757             }
758             ndat = nG_new*line->n_sample;
759 
760             #define BRANCH(type_t,is_vector_end) \
761             { \
762                 for (j=0; j<line->n_sample; j++) \
763                 { \
764                     type_t *ptr_src = ((type_t*)dat) + j*nori; \
765                     type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
766                     int size = sizeof(type_t); \
767                     int ia, ib, k_dst = 0, k_src; \
768                     int nset = 0;   /* haploid or diploid? */ \
769                     for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
770                     if ( nset==nR_ori ) /* haploid */ \
771                     { \
772                         for (k_src=0; k_src<nR_ori; k_src++) \
773                         { \
774                             if ( kbs_exists(rm_set, k_src) ) continue; \
775                             memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
776                             k_dst++; \
777                         } \
778                         memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
779                     } \
780                     else /* diploid */ \
781                     { \
782                         k_src = -1; \
783                         for (ia=0; ia<nR_ori; ia++) \
784                         { \
785                             for (ib=0; ib<=ia; ib++) \
786                             { \
787                                 k_src++; \
788                                 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; }  \
789                                 if ( kbs_exists(rm_set, ia) || kbs_exists(rm_set, ib) ) continue; \
790                                 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
791                                 k_dst++; \
792                             } \
793                         } \
794                     } \
795                 } \
796             }
797             switch (type)
798             {
799                 case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
800                 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
801             }
802             #undef BRANCH
803         }
804         nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
805         if ( nret<0 )
806         {
807             if (hts_verbose>1) fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
808                     bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
809             goto err;
810         }
811     }
812 
813 clean:
814     free(str.s);
815     free(map);
816     free(dat);
817     return 0;
818 
819 err:
820     free(str.s);
821     free(map);
822     free(dat);
823     return -1;
824 }
825 
826