1 /*  vcf.c -- VCF/BCF API functions.
2 
3     Copyright (C) 2012, 2013 Broad Institute.
4     Copyright (C) 2012-2020 Genome Research Ltd.
5     Portions copyright (C) 2014 Intel Corporation.
6 
7     Author: Heng Li <lh3@sanger.ac.uk>
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE.  */
26 
27 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28 #include <config.h>
29 
30 #include <stdio.h>
31 #include <assert.h>
32 #include <string.h>
33 #include <strings.h>
34 #include <stdlib.h>
35 #include <limits.h>
36 #include <stdint.h>
37 #include <inttypes.h>
38 #include <errno.h>
39 
40 #include "htslib/vcf.h"
41 #include "htslib/bgzf.h"
42 #include "htslib/tbx.h"
43 #include "htslib/hfile.h"
44 #include "hts_internal.h"
45 #include "htslib/hts_endian.h"
46 #include "htslib/khash_str2int.h"
47 #include "htslib/kstring.h"
48 #include "htslib/sam.h"
49 
50 #include "htslib/khash.h"
51 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
52 typedef khash_t(vdict) vdict_t;
53 
54 #include "htslib/kseq.h"
55 HTSLIB_EXPORT
56 uint32_t bcf_float_missing    = 0x7F800001;
57 
58 HTSLIB_EXPORT
59 uint32_t bcf_float_vector_end = 0x7F800002;
60 
61 HTSLIB_EXPORT
62 uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
63 
64 static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
65 
66 /*
67     Partial support for 64-bit POS and Number=1 INFO tags.
68     Notes:
69      - the support for 64-bit values is motivated by POS and INFO/END for large genomes
70      - the use of 64-bit values does not conform to the specification
71      - cannot output 64-bit BCF and if it does, it is not compatible with anything
72      - experimental, use at your risk
73 */
74 #ifdef VCF_ALLOW_INT64
75     #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
76     #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
77 #endif
78 
79 #define BCF_IS_64BIT (1<<30)
80 
81 
find_chrom_header_line(char * s)82 static char *find_chrom_header_line(char *s)
83 {
84     char *nl;
85     if (strncmp(s, "#CHROM\t", 7) == 0) return s;
86     else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
87     else return NULL;
88 }
89 
90 /*************************
91  *** VCF header parser ***
92  *************************/
93 
bcf_hdr_add_sample_len(bcf_hdr_t * h,const char * s,size_t len)94 static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
95 {
96     if ( !s ) return 0;
97     if (len == 0) len = strlen(s);
98 
99     const char *ss = s;
100     while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
101     if ( !*ss || ss - s == len)
102     {
103         hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
104         return -1;
105     }
106 
107     vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
108     int ret;
109     char *sdup = malloc(len + 1);
110     if (!sdup) return -1;
111     memcpy(sdup, s, len);
112     sdup[len] = 0;
113 
114     // Ensure space is available in h->samples
115     size_t n = kh_size(d);
116     char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
117     if (!new_samples) {
118         free(sdup);
119         return -1;
120     }
121     h->samples = new_samples;
122 
123     int k = kh_put(vdict, d, sdup, &ret);
124     if (ret < 0) {
125         free(sdup);
126         return -1;
127     }
128     if (ret) { // absent
129         kh_val(d, k) = bcf_idinfo_def;
130         kh_val(d, k).id = n;
131     } else {
132         hts_log_error("Duplicated sample name '%s'", s);
133         free(sdup);
134         return -1;
135     }
136     h->samples[n] = sdup;
137     h->dirty = 1;
138     return 0;
139 }
140 
bcf_hdr_add_sample(bcf_hdr_t * h,const char * s)141 int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
142 {
143     return bcf_hdr_add_sample_len(h, s, 0);
144 }
145 
bcf_hdr_parse_sample_line(bcf_hdr_t * h,const char * str)146 int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str)
147 {
148     int ret = 0;
149     int i = 0;
150     const char *p, *q;
151     // add samples
152     for (p = q = str;; ++q) {
153         if (*q > '\n') continue;
154         if (++i > 9) {
155             if ( bcf_hdr_add_sample_len(h, p, q - p) < 0 ) ret = -1;
156         }
157         if (*q == 0 || *q == '\n' || ret < 0) break;
158         p = q + 1;
159     }
160 
161     return ret;
162 }
163 
bcf_hdr_sync(bcf_hdr_t * h)164 int bcf_hdr_sync(bcf_hdr_t *h)
165 {
166     int i;
167     for (i = 0; i < 3; i++)
168     {
169         vdict_t *d = (vdict_t*)h->dict[i];
170         khint_t k;
171         if ( h->n[i] < kh_size(d) )
172         {
173             bcf_idpair_t *new_idpair;
174             // this should be true only for i=2, BCF_DT_SAMPLE
175             new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
176             if (!new_idpair) return -1;
177             h->n[i] = kh_size(d);
178             h->id[i] = new_idpair;
179         }
180         for (k=kh_begin(d); k<kh_end(d); k++)
181         {
182             if (!kh_exist(d,k)) continue;
183             h->id[i][kh_val(d,k).id].key = kh_key(d,k);
184             h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
185         }
186     }
187     h->dirty = 0;
188     return 0;
189 }
190 
bcf_hrec_destroy(bcf_hrec_t * hrec)191 void bcf_hrec_destroy(bcf_hrec_t *hrec)
192 {
193     if (!hrec) return;
194     free(hrec->key);
195     if ( hrec->value ) free(hrec->value);
196     int i;
197     for (i=0; i<hrec->nkeys; i++)
198     {
199         free(hrec->keys[i]);
200         free(hrec->vals[i]);
201     }
202     free(hrec->keys);
203     free(hrec->vals);
204     free(hrec);
205 }
206 
207 // Copies all fields except IDX.
bcf_hrec_dup(bcf_hrec_t * hrec)208 bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
209 {
210     int save_errno;
211     bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
212     if (!out) return NULL;
213 
214     out->type = hrec->type;
215     if ( hrec->key ) {
216         out->key = strdup(hrec->key);
217         if (!out->key) goto fail;
218     }
219     if ( hrec->value ) {
220         out->value = strdup(hrec->value);
221         if (!out->value) goto fail;
222     }
223     out->nkeys = hrec->nkeys;
224     out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
225     if (!out->keys) goto fail;
226     out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
227     if (!out->vals) goto fail;
228     int i, j = 0;
229     for (i=0; i<hrec->nkeys; i++)
230     {
231         if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
232         if ( hrec->keys[i] ) {
233             out->keys[j] = strdup(hrec->keys[i]);
234             if (!out->keys[j]) goto fail;
235         }
236         if ( hrec->vals[i] ) {
237             out->vals[j] = strdup(hrec->vals[i]);
238             if (!out->vals[j]) goto fail;
239         }
240         j++;
241     }
242     if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
243     return out;
244 
245  fail:
246     save_errno = errno;
247     hts_log_error("%s", strerror(errno));
248     bcf_hrec_destroy(out);
249     errno = save_errno;
250     return NULL;
251 }
252 
bcf_hrec_debug(FILE * fp,bcf_hrec_t * hrec)253 void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
254 {
255     fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
256     int i;
257     for (i=0; i<hrec->nkeys; i++)
258         fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
259     fprintf(fp, "\n");
260 }
261 
bcf_header_debug(bcf_hdr_t * hdr)262 void bcf_header_debug(bcf_hdr_t *hdr)
263 {
264     int i, j;
265     for (i=0; i<hdr->nhrec; i++)
266     {
267         if ( !hdr->hrec[i]->value )
268         {
269             fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
270             fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
271             for (j=1; j<hdr->hrec[i]->nkeys; j++)
272                 fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
273             fprintf(stderr,">\n");
274         }
275         else
276             fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
277     }
278 }
279 
bcf_hrec_add_key(bcf_hrec_t * hrec,const char * str,size_t len)280 int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
281 {
282     char **tmp;
283     size_t n = hrec->nkeys + 1;
284     assert(len > 0 && len < SIZE_MAX);
285     tmp = realloc(hrec->keys, sizeof(char*)*n);
286     if (!tmp) return -1;
287     hrec->keys = tmp;
288     tmp = realloc(hrec->vals, sizeof(char*)*n);
289     if (!tmp) return -1;
290     hrec->vals = tmp;
291 
292     hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char));
293     if (!hrec->keys[hrec->nkeys]) return -1;
294     memcpy(hrec->keys[hrec->nkeys],str,len);
295     hrec->keys[hrec->nkeys][len] = 0;
296     hrec->vals[hrec->nkeys] = NULL;
297     hrec->nkeys = n;
298     return 0;
299 }
300 
bcf_hrec_set_val(bcf_hrec_t * hrec,int i,const char * str,size_t len,int is_quoted)301 int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
302 {
303     if ( hrec->vals[i] ) {
304         free(hrec->vals[i]);
305         hrec->vals[i] = NULL;
306     }
307     if ( !str ) return 0;
308     if ( is_quoted )
309     {
310         if (len >= SIZE_MAX - 3) {
311             errno = ENOMEM;
312             return -1;
313         }
314         hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
315         if (!hrec->vals[i]) return -1;
316         hrec->vals[i][0] = '"';
317         memcpy(&hrec->vals[i][1],str,len);
318         hrec->vals[i][len+1] = '"';
319         hrec->vals[i][len+2] = 0;
320     }
321     else
322     {
323         if (len == SIZE_MAX) {
324             errno = ENOMEM;
325             return -1;
326         }
327         hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
328         if (!hrec->vals[i]) return -1;
329         memcpy(hrec->vals[i],str,len);
330         hrec->vals[i][len] = 0;
331     }
332     return 0;
333 }
334 
hrec_add_idx(bcf_hrec_t * hrec,int idx)335 int hrec_add_idx(bcf_hrec_t *hrec, int idx)
336 {
337     int n = hrec->nkeys + 1;
338     char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n);
339     if (!tmp) return -1;
340     hrec->keys = tmp;
341 
342     tmp = (char**) realloc(hrec->vals, sizeof(char*)*n);
343     if (!tmp) return -1;
344     hrec->vals = tmp;
345 
346     hrec->keys[hrec->nkeys] = strdup("IDX");
347     if (!hrec->keys[hrec->nkeys]) return -1;
348 
349     kstring_t str = {0,0,0};
350     if (kputw(idx, &str) < 0) {
351         free(hrec->keys[hrec->nkeys]);
352         return -1;
353     }
354     hrec->vals[hrec->nkeys] = str.s;
355     hrec->nkeys = n;
356     return 0;
357 }
358 
bcf_hrec_find_key(bcf_hrec_t * hrec,const char * key)359 int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
360 {
361     int i;
362     for (i=0; i<hrec->nkeys; i++)
363         if ( !strcasecmp(key,hrec->keys[i]) ) return i;
364     return -1;
365 }
366 
is_escaped(const char * min,const char * str)367 static inline int is_escaped(const char *min, const char *str)
368 {
369     int n = 0;
370     while ( --str>=min && *str=='\\' ) n++;
371     return n%2;
372 }
373 
bcf_hdr_parse_line(const bcf_hdr_t * h,const char * line,int * len)374 bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
375 {
376     bcf_hrec_t *hrec = NULL;
377     const char *p = line;
378     if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
379     p += 2;
380 
381     const char *q = p;
382     while ( *q && *q!='=' && *q != '\n' ) q++;
383     ptrdiff_t n = q-p;
384     if ( *q!='=' || !n ) // wrong format
385         goto malformed_line;
386 
387     hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
388     if (!hrec) { *len = -1; return NULL; }
389     hrec->key = (char*) malloc(sizeof(char)*(n+1));
390     if (!hrec->key) goto fail;
391     memcpy(hrec->key,p,n);
392     hrec->key[n] = 0;
393 
394     p = ++q;
395     if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
396     {
397         while ( *q && *q!='\n' ) q++;
398         hrec->value = (char*) malloc((q-p+1)*sizeof(char));
399         if (!hrec->value) goto fail;
400         memcpy(hrec->value, p, q-p);
401         hrec->value[q-p] = 0;
402         *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
403         return hrec;
404     }
405 
406     // structured line, e.g.
407     // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
408     // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
409     int nopen = 1;
410     while ( *q && *q!='\n' && nopen>0 )
411     {
412         p = ++q;
413         while ( *q && *q==' ' ) { p++; q++; }
414         // ^[A-Za-z_][0-9A-Za-z_.]*$
415         if (p==q && *q && (isalpha_c(*q) || *q=='_'))
416         {
417             q++;
418             while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
419         }
420         n = q-p;
421         int m = 0;
422         while ( *q && *q==' ' ) { q++; m++; }
423         if ( *q!='=' || !n )
424             goto malformed_line;
425 
426         if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
427         p = ++q;
428         while ( *q && *q==' ' ) { p++; q++; }
429         int quoted = *p=='"' ? 1 : 0;
430         if ( quoted ) p++, q++;
431         while ( *q && *q != '\n' )
432         {
433             if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; }
434             else
435             {
436                 if ( *q=='<' ) nopen++;
437                 if ( *q=='>' ) nopen--;
438                 if ( !nopen ) break;
439                 if ( *q==',' && nopen==1 ) break;
440             }
441             q++;
442         }
443         const char *r = q;
444         while ( r > p && r[-1] == ' ' ) r--;
445         if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
446             goto fail;
447         if ( quoted && *q=='"' ) q++;
448         if ( *q=='>' ) { nopen--; q++; }
449     }
450 
451     // Skip to end of line
452     int nonspace = 0;
453     p = q;
454     while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
455     if (nonspace) {
456         char buffer[320];
457         hts_log_warning("Dropped trailing junk from header line '%s'",
458                         hts_strprint(buffer, sizeof(buffer),
459                                      '"', line, q - line));
460     }
461 
462     *len = q - line + (*q ? 1 : 0);
463     return hrec;
464 
465  fail:
466     *len = -1;
467     bcf_hrec_destroy(hrec);
468     return NULL;
469 
470  malformed_line:
471     {
472         char buffer[320];
473         while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
474         hts_log_error("Could not parse the header line: %s",
475                       hts_strprint(buffer, sizeof(buffer),
476                                    '"', line, q - line));
477         *len = q - line + (*q ? 1 : 0);
478         bcf_hrec_destroy(hrec);
479         return NULL;
480     }
481 }
482 
bcf_hdr_set_idx(bcf_hdr_t * hdr,int dict_type,const char * tag,bcf_idinfo_t * idinfo)483 static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
484 {
485     size_t new_n;
486 
487     // If available, preserve existing IDX
488     if ( idinfo->id==-1 )
489         idinfo->id = hdr->n[dict_type];
490     else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
491     {
492         hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
493             idinfo->id, tag);
494         errno = EINVAL;
495         return -1;
496     }
497 
498     new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
499     if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
500                    &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
501         return -1;
502     }
503     hdr->n[dict_type] = new_n;
504 
505     // NB: the next kh_put call can invalidate the idinfo pointer, therefore
506     // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
507     hdr->id[dict_type][idinfo->id].key = tag;
508 
509     return 0;
510 }
511 
512 // returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
bcf_hdr_register_hrec(bcf_hdr_t * hdr,bcf_hrec_t * hrec)513 static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
514 {
515     // contig
516     int i, ret, replacing = 0;
517     khint_t k;
518     char *str = NULL;
519 
520     if ( !strcmp(hrec->key, "contig") )
521     {
522         hts_pos_t len = 0;
523         hrec->type = BCF_HL_CTG;
524 
525         // Get the contig ID ($str) and length ($j)
526         i = bcf_hrec_find_key(hrec,"length");
527         if ( i<0 ) len = 0;
528         else {
529             char *end = hrec->vals[i];
530             len = strtoll(hrec->vals[i], &end, 10);
531             if (end == hrec->vals[i] || len < 0) return 0;
532         }
533 
534         i = bcf_hrec_find_key(hrec,"ID");
535         if ( i<0 ) return 0;
536         str = strdup(hrec->vals[i]);
537         if (!str) return -1;
538 
539         // Register in the dictionary
540         vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
541         khint_t k = kh_get(vdict, d, str);
542         if ( k != kh_end(d) ) { // already present
543             free(str); str=NULL;
544             if (kh_val(d, k).hrec[0] != NULL) // and not removed
545                 return 0;
546             replacing = 1;
547         } else {
548             k = kh_put(vdict, d, str, &ret);
549             if (ret < 0) { free(str); return -1; }
550         }
551 
552         int idx = bcf_hrec_find_key(hrec,"IDX");
553         if ( idx!=-1 )
554         {
555             char *tmp = hrec->vals[idx];
556             idx = strtol(hrec->vals[idx], &tmp, 10);
557             if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
558             {
559                 if (!replacing) {
560                     kh_del(vdict, d, k);
561                     free(str);
562                 }
563                 hts_log_warning("Error parsing the IDX tag, skipping");
564                 return 0;
565             }
566         }
567 
568         kh_val(d, k) = bcf_idinfo_def;
569         kh_val(d, k).id = idx;
570         kh_val(d, k).info[0] = len;
571         kh_val(d, k).hrec[0] = hrec;
572         if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
573             if (!replacing) {
574                 kh_del(vdict, d, k);
575                 free(str);
576             }
577             return -1;
578         }
579         if ( idx==-1 ) {
580             if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
581                return -1;
582             }
583         }
584 
585         return 1;
586     }
587 
588     if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
589     else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
590     else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
591     else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; }
592     else return 0;
593 
594     // INFO/FILTER/FORMAT
595     char *id = NULL;
596     uint32_t type = UINT32_MAX, var = UINT32_MAX;
597     int num = -1, idx = -1;
598     for (i=0; i<hrec->nkeys; i++)
599     {
600         if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
601         else if ( !strcmp(hrec->keys[i], "IDX") )
602         {
603             char *tmp = hrec->vals[i];
604             idx = strtol(hrec->vals[i], &tmp, 10);
605             if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
606             {
607                 hts_log_warning("Error parsing the IDX tag, skipping");
608                 return 0;
609             }
610         }
611         else if ( !strcmp(hrec->keys[i], "Type") )
612         {
613             if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
614             else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
615             else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
616             else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
617             else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
618             else
619             {
620                 hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
621                 type = BCF_HT_STR;
622             }
623         }
624         else if ( !strcmp(hrec->keys[i], "Number") )
625         {
626             if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
627             else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
628             else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
629             else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
630             else
631             {
632                 sscanf(hrec->vals[i],"%d",&num);
633                 var = BCF_VL_FIXED;
634             }
635             if (var != BCF_VL_FIXED) num = 0xfffff;
636         }
637     }
638     if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
639         if (type == -1) {
640             hts_log_warning("%s %s field has no Type defined. Assuming String",
641                 *hrec->key == 'I' ? "An" : "A", hrec->key);
642             type = BCF_HT_STR;
643         }
644         if (var == -1) {
645             hts_log_warning("%s %s field has no Number defined. Assuming '.'",
646                 *hrec->key == 'I' ? "An" : "A", hrec->key);
647             var = BCF_VL_VAR;
648         }
649     }
650     uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
651                      (var & 0xf) << 8 |
652                      (type & 0xf) << 4 |
653                      (((uint32_t) hrec->type) & 0xf));
654 
655     if ( !id ) return 0;
656     str = strdup(id);
657     if (!str) return -1;
658 
659     vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
660     k = kh_get(vdict, d, str);
661     if ( k != kh_end(d) )
662     {
663         // already present
664         free(str);
665         if ( kh_val(d, k).hrec[info&0xf] ) return 0;
666         kh_val(d, k).info[info&0xf] = info;
667         kh_val(d, k).hrec[info&0xf] = hrec;
668         if ( idx==-1 ) {
669             if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
670                 return -1;
671             }
672         }
673         return 1;
674     }
675     k = kh_put(vdict, d, str, &ret);
676     if (ret < 0) {
677         free(str);
678         return -1;
679     }
680     kh_val(d, k) = bcf_idinfo_def;
681     kh_val(d, k).info[info&0xf] = info;
682     kh_val(d, k).hrec[info&0xf] = hrec;
683     kh_val(d, k).id = idx;
684     if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
685         kh_del(vdict, d, k);
686         free(str);
687         return -1;
688     }
689     if ( idx==-1 ) {
690         if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
691             return -1;
692         }
693     }
694 
695     return 1;
696 }
697 
bcf_hdr_add_hrec(bcf_hdr_t * hdr,bcf_hrec_t * hrec)698 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
699 {
700     int res;
701     if ( !hrec ) return 0;
702 
703     hrec->type = BCF_HL_GEN;
704     res = bcf_hdr_register_hrec(hdr,hrec);
705     if (res < 0) return -1;
706     if ( !res )
707     {
708         // If one of the hashed field, then it is already present
709         if ( hrec->type != BCF_HL_GEN )
710         {
711             bcf_hrec_destroy(hrec);
712             return 0;
713         }
714 
715         // Is one of the generic fields and already present?
716         int i;
717         for (i=0; i<hdr->nhrec; i++)
718         {
719             if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue;
720             if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break;
721             if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break;
722         }
723         if ( i<hdr->nhrec )
724         {
725             bcf_hrec_destroy(hrec);
726             return 0;
727         }
728     }
729 
730     // New record, needs to be added
731     int n = hdr->nhrec + 1;
732     bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
733     if (!new_hrec) return -1;
734     hdr->hrec = new_hrec;
735     hdr->hrec[hdr->nhrec] = hrec;
736     hdr->dirty = 1;
737     hdr->nhrec = n;
738 
739     return hrec->type==BCF_HL_GEN ? 0 : 1;
740 }
741 
742 /*
743  *  Note that while querying of FLT,INFO,FMT,CTG lines is fast (the keys are hashed),
744  *  the STR,GEN lines are searched for linearly in a linked list of all header lines.
745  *  This may become a problem for VCFs with huge headers, we might need to build a
746  *  dictionary for these lines as well.
747  */
bcf_hdr_get_hrec(const bcf_hdr_t * hdr,int type,const char * key,const char * value,const char * str_class)748 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
749 {
750     int i;
751     if ( type==BCF_HL_GEN )
752     {
753         for (i=0; i<hdr->nhrec; i++)
754         {
755             if ( hdr->hrec[i]->type!=type ) continue;
756             if ( strcmp(hdr->hrec[i]->key,key) ) continue;
757             if ( !value || !strcmp(hdr->hrec[i]->value,value) ) return hdr->hrec[i];
758         }
759         return NULL;
760     }
761     else if ( type==BCF_HL_STR )
762     {
763         for (i=0; i<hdr->nhrec; i++)
764         {
765             if ( hdr->hrec[i]->type!=type ) continue;
766             if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
767             int j = bcf_hrec_find_key(hdr->hrec[i],key);
768             if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
769         }
770         return NULL;
771     }
772     vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
773     khint_t k = kh_get(vdict, d, value);
774     if ( k == kh_end(d) ) return NULL;
775     return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
776 }
777 
bcf_hdr_check_sanity(bcf_hdr_t * hdr)778 void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
779 {
780     static int PL_warned = 0, GL_warned = 0;
781 
782     if ( !PL_warned )
783     {
784         int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
785         if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
786         {
787             hts_log_warning("PL should be declared as Number=G");
788             PL_warned = 1;
789         }
790     }
791     if ( !GL_warned )
792     {
793         int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL");
794         if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
795         {
796             hts_log_warning("GL should be declared as Number=G");
797             GL_warned = 1;
798         }
799     }
800 }
801 
bcf_hdr_parse(bcf_hdr_t * hdr,char * htxt)802 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
803 {
804     int len, done = 0;
805     char *p = htxt;
806 
807     // Check sanity: "fileformat" string must come as first
808     bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
809     if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
810         hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
811     if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
812         bcf_hrec_destroy(hrec);
813         return -1;
814     }
815 
816     // The filter PASS must appear first in the dictionary
817     hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
818     if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
819         bcf_hrec_destroy(hrec);
820         return -1;
821     }
822 
823     // Parse the whole header
824     do {
825         while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
826             if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
827                 bcf_hrec_destroy(hrec);
828                 return -1;
829             }
830             p += len;
831         }
832         assert(hrec == NULL);
833         if (len < 0) {
834             // len < 0 indicates out-of-memory, or similar error
835             hts_log_error("Could not parse header line: %s", strerror(errno));
836             return -1;
837         } else if (len > 0) {
838             // Bad header line.  bcf_hdr_parse_line() will have logged it.
839             // Skip and try again on the next line (p + len will be the start
840             // of the next one).
841             p += len;
842             continue;
843         }
844 
845         // Next should be the sample line.  If not, it was a malformed
846         // header, in which case print a warning and skip (many VCF
847         // operations do not really care about a few malformed lines).
848         // In the future we may want to add a strict mode that errors in
849         // this case.
850         if ( strncmp("#CHROM\tPOS",p,10) != 0 ) {
851             char *eol = strchr(p, '\n');
852             if (*p != '\0') {
853                 char buffer[320];
854                 hts_log_warning("Could not parse header line: %s",
855                                 hts_strprint(buffer, sizeof(buffer),
856                                                '"', p,
857                                                eol ? (eol - p) : SIZE_MAX));
858             }
859             if (eol) {
860                 p = eol + 1; // Try from the next line.
861             } else {
862                 done = -1; // No more lines left, give up.
863             }
864         } else {
865             done = 1; // Sample line found
866         }
867     } while (!done);
868 
869     if (done < 0) {
870         // No sample line is fatal.
871         hts_log_error("Could not parse the header, sample line not found");
872         return -1;
873     }
874 
875     if (bcf_hdr_parse_sample_line(hdr,p) < 0)
876         return -1;
877     if (bcf_hdr_sync(hdr) < 0)
878         return -1;
879     bcf_hdr_check_sanity(hdr);
880     return 0;
881 }
882 
bcf_hdr_append(bcf_hdr_t * hdr,const char * line)883 int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
884 {
885     int len;
886     bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
887     if ( !hrec ) return -1;
888     if (bcf_hdr_add_hrec(hdr, hrec) < 0)
889         return -1;
890     return 0;
891 }
892 
bcf_hdr_remove(bcf_hdr_t * hdr,int type,const char * key)893 void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
894 {
895     int i = 0;
896     bcf_hrec_t *hrec;
897     if ( !key )
898     {
899         while ( i<hdr->nhrec )
900         {
901             if ( hdr->hrec[i]->type!=type ) { i++; continue; }
902             hrec = hdr->hrec[i];
903 
904             if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
905             {
906                 int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
907                 if ( j>=0 )
908                 {
909                     vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
910                     khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[j]);
911                     kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
912                 }
913             }
914 
915             hdr->dirty = 1;
916             hdr->nhrec--;
917             if ( i < hdr->nhrec )
918                 memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
919             bcf_hrec_destroy(hrec);
920         }
921         return;
922     }
923     while (1)
924     {
925         if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
926         {
927             hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
928             if ( !hrec ) return;
929 
930             for (i=0; i<hdr->nhrec; i++)
931                 if ( hdr->hrec[i]==hrec ) break;
932             assert( i<hdr->nhrec );
933 
934             vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
935             khint_t k = kh_get(vdict, d, key);
936             kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
937         }
938         else
939         {
940             for (i=0; i<hdr->nhrec; i++)
941             {
942                 if ( hdr->hrec[i]->type!=type ) continue;
943                 if ( type==BCF_HL_GEN )
944                 {
945                     if ( !strcmp(hdr->hrec[i]->key,key) ) break;
946                 }
947                 else
948                 {
949                     // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
950                     int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
951                     if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
952                 }
953             }
954             if ( i==hdr->nhrec ) return;
955             hrec = hdr->hrec[i];
956         }
957 
958         hdr->nhrec--;
959         if ( i < hdr->nhrec )
960             memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
961         bcf_hrec_destroy(hrec);
962         hdr->dirty = 1;
963     }
964 }
965 
bcf_hdr_printf(bcf_hdr_t * hdr,const char * fmt,...)966 int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
967 {
968     char tmp[256], *line = tmp;
969     va_list ap;
970     va_start(ap, fmt);
971     int n = vsnprintf(line, sizeof(tmp), fmt, ap);
972     va_end(ap);
973 
974     if (n >= sizeof(tmp)) {
975         n++; // For trailing NUL
976         line = (char*)malloc(n);
977         if (!line)
978             return -1;
979 
980         va_start(ap, fmt);
981         vsnprintf(line, n, fmt, ap);
982         va_end(ap);
983     }
984 
985     int ret = bcf_hdr_append(hdr, line);
986 
987     if (line != tmp) free(line);
988     return ret;
989 }
990 
991 
992 /**********************
993  *** BCF header I/O ***
994  **********************/
995 
bcf_hdr_get_version(const bcf_hdr_t * hdr)996 const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
997 {
998     bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
999     if ( !hrec )
1000     {
1001         hts_log_warning("No version string found, assuming VCFv4.2");
1002         return "VCFv4.2";
1003     }
1004     return hrec->value;
1005 }
1006 
bcf_hdr_set_version(bcf_hdr_t * hdr,const char * version)1007 int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1008 {
1009     bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1010     if ( !hrec )
1011     {
1012         int len;
1013         kstring_t str = {0,0,0};
1014         ksprintf(&str,"##fileformat=%s", version);
1015         hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1016         free(str.s);
1017     }
1018     else
1019     {
1020         free(hrec->value);
1021         hrec->value = strdup(version);
1022     }
1023     hdr->dirty = 1;
1024     return 0; // FIXME: check for errs in this function (return < 0 if so)
1025 }
1026 
bcf_hdr_init(const char * mode)1027 bcf_hdr_t *bcf_hdr_init(const char *mode)
1028 {
1029     int i;
1030     bcf_hdr_t *h;
1031     h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1032     if (!h) return NULL;
1033     for (i = 0; i < 3; ++i)
1034         if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1035     if ( strchr(mode,'w') )
1036     {
1037         bcf_hdr_append(h, "##fileformat=VCFv4.2");
1038         // The filter PASS must appear first in the dictionary
1039         bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1040     }
1041     return h;
1042 
1043  fail:
1044     for (i = 0; i < 3; ++i)
1045         kh_destroy(vdict, h->dict[i]);
1046     free(h);
1047     return NULL;
1048 }
1049 
bcf_hdr_destroy(bcf_hdr_t * h)1050 void bcf_hdr_destroy(bcf_hdr_t *h)
1051 {
1052     int i;
1053     khint_t k;
1054     if (!h) return;
1055     for (i = 0; i < 3; ++i) {
1056         vdict_t *d = (vdict_t*)h->dict[i];
1057         if (d == 0) continue;
1058         for (k = kh_begin(d); k != kh_end(d); ++k)
1059             if (kh_exist(d, k)) free((char*)kh_key(d, k));
1060         kh_destroy(vdict, d);
1061         free(h->id[i]);
1062     }
1063     for (i=0; i<h->nhrec; i++)
1064         bcf_hrec_destroy(h->hrec[i]);
1065     if (h->nhrec) free(h->hrec);
1066     if (h->samples) free(h->samples);
1067     free(h->keep_samples);
1068     free(h->transl[0]); free(h->transl[1]);
1069     free(h->mem.s);
1070     free(h);
1071 }
1072 
bcf_hdr_read(htsFile * hfp)1073 bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1074 {
1075     if (hfp->format.format == vcf)
1076         return vcf_hdr_read(hfp);
1077     if (hfp->format.format != bcf) {
1078         hts_log_error("Input is not detected as bcf or vcf format");
1079         return NULL;
1080     }
1081 
1082     assert(hfp->is_bgzf);
1083 
1084     BGZF *fp = hfp->fp.bgzf;
1085     uint8_t magic[5];
1086     bcf_hdr_t *h;
1087     h = bcf_hdr_init("r");
1088     if (!h) {
1089         hts_log_error("Failed to allocate bcf header");
1090         return NULL;
1091     }
1092     if (bgzf_read(fp, magic, 5) != 5)
1093     {
1094         hts_log_error("Failed to read the header (reading BCF in text mode?)");
1095         bcf_hdr_destroy(h);
1096         return NULL;
1097     }
1098     if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1099     {
1100         if (!strncmp((char*)magic, "BCF", 3))
1101             hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1102         else
1103             hts_log_error("Invalid BCF2 magic string");
1104         bcf_hdr_destroy(h);
1105         return NULL;
1106     }
1107     uint8_t buf[4];
1108     size_t hlen;
1109     char *htxt = NULL;
1110     if (bgzf_read(fp, buf, 4) != 4) goto fail;
1111     hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1112     if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1113     htxt = (char*)malloc(hlen + 1);
1114     if (!htxt) goto fail;
1115     if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1116     htxt[hlen] = '\0'; // Ensure htxt is terminated
1117     if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1118     free(htxt);
1119     return h;
1120  fail:
1121     hts_log_error("Failed to read BCF header");
1122     free(htxt);
1123     bcf_hdr_destroy(h);
1124     return NULL;
1125 }
1126 
bcf_hdr_write(htsFile * hfp,bcf_hdr_t * h)1127 int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1128 {
1129     if (!h) {
1130         errno = EINVAL;
1131         return -1;
1132     }
1133     if ( h->dirty ) {
1134         if (bcf_hdr_sync(h) < 0) return -1;
1135     }
1136     hfp->format.category = variant_data;
1137     if (hfp->format.format == vcf || hfp->format.format == text_format) {
1138         hfp->format.format = vcf;
1139         return vcf_hdr_write(hfp, h);
1140     }
1141 
1142     if (hfp->format.format == binary_format)
1143         hfp->format.format = bcf;
1144 
1145     kstring_t htxt = {0,0,0};
1146     if (bcf_hdr_format(h, 1, &htxt) < 0) {
1147         free(htxt.s);
1148         return -1;
1149     }
1150     kputc('\0', &htxt); // include the \0 byte
1151 
1152     BGZF *fp = hfp->fp.bgzf;
1153     if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1154     uint8_t hlen[4];
1155     u32_to_le(htxt.l, hlen);
1156     if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1157     if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1158 
1159     free(htxt.s);
1160     return 0;
1161 }
1162 
1163 /********************
1164  *** BCF site I/O ***
1165  ********************/
1166 
bcf_init()1167 bcf1_t *bcf_init()
1168 {
1169     bcf1_t *v;
1170     v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1171     return v;
1172 }
1173 
bcf_clear(bcf1_t * v)1174 void bcf_clear(bcf1_t *v)
1175 {
1176     int i;
1177     for (i=0; i<v->d.m_info; i++)
1178     {
1179         if ( v->d.info[i].vptr_free )
1180         {
1181             free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1182             v->d.info[i].vptr_free = 0;
1183         }
1184     }
1185     for (i=0; i<v->d.m_fmt; i++)
1186     {
1187         if ( v->d.fmt[i].p_free )
1188         {
1189             free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1190             v->d.fmt[i].p_free = 0;
1191         }
1192     }
1193     v->rid = v->pos = v->rlen = v->unpacked = 0;
1194     bcf_float_set_missing(v->qual);
1195     v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1196     v->shared.l = v->indiv.l = 0;
1197     v->d.var_type = -1;
1198     v->d.shared_dirty = 0;
1199     v->d.indiv_dirty  = 0;
1200     v->d.n_flt = 0;
1201     v->errcode = 0;
1202     if (v->d.m_als) v->d.als[0] = 0;
1203     if (v->d.m_id) v->d.id[0] = 0;
1204 }
1205 
bcf_empty(bcf1_t * v)1206 void bcf_empty(bcf1_t *v)
1207 {
1208     bcf_clear1(v);
1209     free(v->d.id);
1210     free(v->d.als);
1211     free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1212     if (v->d.var ) free(v->d.var);
1213     free(v->shared.s); free(v->indiv.s);
1214     memset(&v->d,0,sizeof(v->d));
1215     memset(&v->shared,0,sizeof(v->shared));
1216     memset(&v->indiv,0,sizeof(v->indiv));
1217 }
1218 
bcf_destroy(bcf1_t * v)1219 void bcf_destroy(bcf1_t *v)
1220 {
1221     if (!v) return;
1222     bcf_empty1(v);
1223     free(v);
1224 }
1225 
bcf_read1_core(BGZF * fp,bcf1_t * v)1226 static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1227 {
1228     uint8_t x[32];
1229     ssize_t ret;
1230     uint32_t shared_len, indiv_len;
1231     if ((ret = bgzf_read(fp, x, 32)) != 32) {
1232         if (ret == 0) return -1;
1233         return -2;
1234     }
1235     bcf_clear1(v);
1236     shared_len = le_to_u32(x);
1237     if (shared_len < 24) return -2;
1238     shared_len -= 24; // to exclude six 32-bit integers
1239     if (ks_resize(&v->shared, shared_len) != 0) return -2;
1240     indiv_len = le_to_u32(x + 4);
1241     if (ks_resize(&v->indiv, indiv_len) != 0) return -2;
1242     v->rid  = le_to_i32(x + 8);
1243     v->pos  = le_to_u32(x + 12);
1244     v->rlen = le_to_i32(x + 16);
1245     v->qual = le_to_float(x + 20);
1246     v->n_info = le_to_u16(x + 24);
1247     v->n_allele = le_to_u16(x + 26);
1248     v->n_sample = le_to_u32(x + 28) & 0xffffff;
1249     v->n_fmt = x[31];
1250     v->shared.l = shared_len;
1251     v->indiv.l = indiv_len;
1252     // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1253     if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1254 
1255     if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1256     if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1257     return 0;
1258 }
1259 
1260 #define bit_array_size(n) ((n)/8+1)
1261 #define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1262 #define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1263 #define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1264 
bcf_dec_typed_int1_safe(uint8_t * p,uint8_t * end,uint8_t ** q,int32_t * val)1265 static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1266                                    int32_t *val) {
1267     uint32_t t;
1268     if (end - p < 2) return -1;
1269     t = *p++ & 0xf;
1270     /* Use if .. else if ... else instead of switch to force order.  Assumption
1271        is that small integers are more frequent than big ones. */
1272     if (t == BCF_BT_INT8) {
1273         *val = *(int8_t *) p++;
1274     } else {
1275         if (end - p < (1<<bcf_type_shift[t])) return -1;
1276         if (t == BCF_BT_INT16) {
1277             *val = le_to_i16(p);
1278             p += 2;
1279         } else if (t == BCF_BT_INT32) {
1280             *val = le_to_i32(p);
1281             p += 4;
1282 #ifdef VCF_ALLOW_INT64
1283         } else if (t == BCF_BT_INT64) {
1284             // This case should never happen because there should be no
1285             // 64-bit BCFs at all, definitely not coming from htslib
1286             *val = le_to_i64(p);
1287             p += 8;
1288 #endif
1289         } else {
1290             return -1;
1291         }
1292     }
1293     *q = p;
1294     return 0;
1295 }
1296 
bcf_dec_size_safe(uint8_t * p,uint8_t * end,uint8_t ** q,int * num,int * type)1297 static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1298                              int *num, int *type) {
1299     int r;
1300     if (p >= end) return -1;
1301     *type = *p & 0xf;
1302     if (*p>>4 != 15) {
1303         *q = p + 1;
1304         *num = *p >> 4;
1305         return 0;
1306     }
1307     r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1308     if (r) return r;
1309     return *num >= 0 ? 0 : -1;
1310 }
1311 
get_type_name(int type)1312 static const char *get_type_name(int type) {
1313     const char *types[9] = {
1314         "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1315         "unknown", "float", "unknown", "char", "unknown"
1316     };
1317     int t = (type >= 0 && type < 8) ? type : 8;
1318     return types[t];
1319 }
1320 
bcf_record_check_err(const bcf_hdr_t * hdr,bcf1_t * rec,char * type,uint32_t * reports,int i)1321 static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
1322                                  char *type, uint32_t *reports, int i) {
1323     if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
1324         hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
1325                         ": Invalid FORMAT %s %d",
1326                         bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
1327     (*reports)++;
1328 }
1329 
bcf_record_check(const bcf_hdr_t * hdr,bcf1_t * rec)1330 static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
1331     uint8_t *ptr, *end;
1332     size_t bytes;
1333     uint32_t err = 0;
1334     int type = 0;
1335     int num  = 0;
1336     int reflen = 0;
1337     uint32_t i, reports;
1338     const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
1339                                  (1 << BCF_BT_INT16) |
1340 #ifdef VCF_ALLOW_INT64
1341                                  (1 << BCF_BT_INT64) |
1342 #endif
1343                                  (1 << BCF_BT_INT32));
1344     const uint32_t is_valid_type = (is_integer          |
1345                                     (1 << BCF_BT_NULL)  |
1346                                     (1 << BCF_BT_FLOAT) |
1347                                     (1 << BCF_BT_CHAR));
1348     int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
1349 
1350     // Check for valid contig ID
1351     if (rec->rid < 0
1352         || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
1353                     || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
1354         hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
1355         err |= BCF_ERR_CTG_INVALID;
1356     }
1357 
1358     // Check ID
1359     ptr = (uint8_t *) rec->shared.s;
1360     end = ptr + rec->shared.l;
1361     if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1362     if (type != BCF_BT_CHAR) {
1363         hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
1364         err |= BCF_ERR_TAG_INVALID;
1365     }
1366     bytes = (size_t) num << bcf_type_shift[type];
1367     if (end - ptr < bytes) goto bad_shared;
1368     ptr += bytes;
1369 
1370     // Check REF and ALT
1371     reports = 0;
1372     for (i = 0; i < rec->n_allele; i++) {
1373         if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1374         if (type != BCF_BT_CHAR) {
1375             if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1376                 hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
1377             err |= BCF_ERR_CHAR;
1378         }
1379         if (i == 0) reflen = num;
1380         bytes = (size_t) num << bcf_type_shift[type];
1381         if (end - ptr < bytes) goto bad_shared;
1382         ptr += bytes;
1383     }
1384 
1385     // Check FILTER
1386     reports = 0;
1387     if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1388     if (num > 0) {
1389         bytes = (size_t) num << bcf_type_shift[type];
1390         if (((1 << type) & is_integer) == 0) {
1391             hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
1392             err |= BCF_ERR_TAG_INVALID;
1393             if (end - ptr < bytes) goto bad_shared;
1394             ptr += bytes;
1395         } else {
1396             if (end - ptr < bytes) goto bad_shared;
1397             for (i = 0; i < num; i++) {
1398                 int32_t key = bcf_dec_int1(ptr, type, &ptr);
1399                 if (key < 0
1400                     || (hdr && (key >= max_id
1401                                 || hdr->id[BCF_DT_ID][key].key == NULL))) {
1402                     if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1403                         hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
1404                     err |= BCF_ERR_TAG_UNDEF;
1405                 }
1406             }
1407         }
1408     }
1409 
1410     // Check INFO
1411     reports = 0;
1412     bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
1413     for (i = 0; i < rec->n_info; i++) {
1414         int32_t key = -1;
1415         if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
1416         if (key < 0 || (hdr && (key >= max_id
1417                                 || id_tmp[key].key == NULL))) {
1418             if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1419                 hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
1420             err |= BCF_ERR_TAG_UNDEF;
1421         }
1422         if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1423         if (((1 << type) & is_valid_type) == 0) {
1424             if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1425                 hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
1426             err |= BCF_ERR_TAG_INVALID;
1427         }
1428         bytes = (size_t) num << bcf_type_shift[type];
1429         if (end - ptr < bytes) goto bad_shared;
1430         ptr += bytes;
1431     }
1432 
1433     // Check FORMAT and individual information
1434     ptr = (uint8_t *) rec->indiv.s;
1435     end = ptr + rec->indiv.l;
1436     reports = 0;
1437     for (i = 0; i < rec->n_fmt; i++) {
1438         int32_t key = -1;
1439         if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
1440         if (key < 0
1441             || (hdr && (key >= max_id
1442                         || id_tmp[key].key == NULL))) {
1443             bcf_record_check_err(hdr, rec, "id", &reports, key);
1444             err |= BCF_ERR_TAG_UNDEF;
1445         }
1446         if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
1447         if (((1 << type) & is_valid_type) == 0) {
1448             bcf_record_check_err(hdr, rec, "type", &reports, type);
1449             err |= BCF_ERR_TAG_INVALID;
1450         }
1451         bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
1452         if (end - ptr < bytes) goto bad_indiv;
1453         ptr += bytes;
1454     }
1455 
1456     if (!err && rec->rlen < 0) {
1457         // Treat bad rlen as a warning instead of an error, and try to
1458         // fix up by using the length of the stored REF allele.
1459         static int warned = 0;
1460         if (!warned) {
1461             hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
1462                             "Only one invalid RLEN will be reported.",
1463                             bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
1464             warned = 1;
1465         }
1466         rec->rlen = reflen >= 0 ? reflen : 0;
1467     }
1468 
1469     rec->errcode |= err;
1470 
1471     return err ? -2 : 0; // Return -2 so bcf_read() reports an error
1472 
1473  bad_shared:
1474     hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1475     return -2;
1476 
1477  bad_indiv:
1478     hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1479     return -2;
1480 }
1481 
1482 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
bcf_subset_format(const bcf_hdr_t * hdr,bcf1_t * rec)1483 int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
1484 {
1485     if ( !hdr->keep_samples ) return 0;
1486     if ( !bcf_hdr_nsamples(hdr) )
1487     {
1488         rec->indiv.l = rec->n_sample = 0;
1489         return 0;
1490     }
1491 
1492     int i, j;
1493     uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
1494     bcf_dec_t *dec = &rec->d;
1495     hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
1496     for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
1497 
1498     for (i=0; i<rec->n_fmt; i++)
1499     {
1500         ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
1501         src = dec->fmt[i].p - dec->fmt[i].size;
1502         if ( dst )
1503         {
1504             memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
1505             dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
1506         }
1507         dst = dec->fmt[i].p;
1508         for (j=0; j<hdr->nsamples_ori; j++)
1509         {
1510             src += dec->fmt[i].size;
1511             if ( !bit_array_test(hdr->keep_samples,j) ) continue;
1512             memmove(dst, src, dec->fmt[i].size);
1513             dst += dec->fmt[i].size;
1514         }
1515         rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
1516         dec->fmt[i].p_len = dst - dec->fmt[i].p;
1517     }
1518     rec->unpacked |= BCF_UN_FMT;
1519 
1520     rec->n_sample = bcf_hdr_nsamples(hdr);
1521     return 0;
1522 }
1523 
bcf_read(htsFile * fp,const bcf_hdr_t * h,bcf1_t * v)1524 int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1525 {
1526     if (fp->format.format == vcf) return vcf_read(fp,h,v);
1527     int ret = bcf_read1_core(fp->fp.bgzf, v);
1528     if (ret == 0) ret = bcf_record_check(h, v);
1529     if ( ret!=0 || !h->keep_samples ) return ret;
1530     return bcf_subset_format(h,v);
1531 }
1532 
bcf_readrec(BGZF * fp,void * null,void * vv,int * tid,hts_pos_t * beg,hts_pos_t * end)1533 int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1534 {
1535     bcf1_t *v = (bcf1_t *) vv;
1536     int ret = bcf_read1_core(fp, v);
1537     if (ret == 0) ret = bcf_record_check(NULL, v);
1538     if (ret  >= 0)
1539         *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
1540     return ret;
1541 }
1542 
bcf1_sync_id(bcf1_t * line,kstring_t * str)1543 static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
1544 {
1545     // single typed string
1546     if ( line->d.id && strcmp(line->d.id, ".") ) {
1547         return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
1548     } else {
1549         return bcf_enc_size(str, 0, BCF_BT_CHAR);
1550     }
1551 }
bcf1_sync_alleles(bcf1_t * line,kstring_t * str)1552 static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
1553 {
1554     // list of typed strings
1555     int i;
1556     for (i=0; i<line->n_allele; i++) {
1557         if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
1558             return -1;
1559     }
1560     if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1561     return 0;
1562 }
bcf1_sync_filter(bcf1_t * line,kstring_t * str)1563 static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
1564 {
1565     // typed vector of integers
1566     if ( line->d.n_flt ) {
1567         return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
1568     } else {
1569         return bcf_enc_vint(str, 0, 0, -1);
1570     }
1571 }
1572 
bcf1_sync_info(bcf1_t * line,kstring_t * str)1573 static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
1574 {
1575     // pairs of typed vectors
1576     int i, irm = -1, e = 0;
1577     for (i=0; i<line->n_info; i++)
1578     {
1579         bcf_info_t *info = &line->d.info[i];
1580         if ( !info->vptr )
1581         {
1582             // marked for removal
1583             if ( irm < 0 ) irm = i;
1584             continue;
1585         }
1586         e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
1587         if ( irm >=0 )
1588         {
1589             bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
1590             while ( irm<=i && line->d.info[irm].vptr ) irm++;
1591         }
1592     }
1593     if ( irm>=0 ) line->n_info = irm;
1594     return e == 0 ? 0 : -1;
1595 }
1596 
bcf1_sync(bcf1_t * line)1597 static int bcf1_sync(bcf1_t *line)
1598 {
1599     char *shared_ori = line->shared.s;
1600     size_t prev_len;
1601 
1602     kstring_t tmp = {0,0,0};
1603     if ( !line->shared.l )
1604     {
1605         // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
1606         tmp = line->shared;
1607         bcf1_sync_id(line, &tmp);
1608         line->unpack_size[0] = tmp.l; prev_len = tmp.l;
1609 
1610         bcf1_sync_alleles(line, &tmp);
1611         line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
1612 
1613         bcf1_sync_filter(line, &tmp);
1614         line->unpack_size[2] = tmp.l - prev_len;
1615 
1616         bcf1_sync_info(line, &tmp);
1617         line->shared = tmp;
1618     }
1619     else if ( line->d.shared_dirty )
1620     {
1621         // The line was edited, update the BCF data block.
1622 
1623         if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
1624 
1625         // ptr_ori points to the original unchanged BCF data.
1626         uint8_t *ptr_ori = (uint8_t *) line->shared.s;
1627 
1628         // ID: single typed string
1629         if ( line->d.shared_dirty & BCF1_DIRTY_ID )
1630             bcf1_sync_id(line, &tmp);
1631         else
1632             kputsn_(ptr_ori, line->unpack_size[0], &tmp);
1633         ptr_ori += line->unpack_size[0];
1634         line->unpack_size[0] = tmp.l; prev_len = tmp.l;
1635 
1636         // REF+ALT: list of typed strings
1637         if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
1638             bcf1_sync_alleles(line, &tmp);
1639         else
1640         {
1641             kputsn_(ptr_ori, line->unpack_size[1], &tmp);
1642             if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1643         }
1644         ptr_ori += line->unpack_size[1];
1645         line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
1646 
1647         if ( line->unpacked & BCF_UN_FLT )
1648         {
1649             // FILTER: typed vector of integers
1650             if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
1651                 bcf1_sync_filter(line, &tmp);
1652             else if ( line->d.n_flt )
1653                 kputsn_(ptr_ori, line->unpack_size[2], &tmp);
1654             else
1655                 bcf_enc_vint(&tmp, 0, 0, -1);
1656             ptr_ori += line->unpack_size[2];
1657             line->unpack_size[2] = tmp.l - prev_len;
1658 
1659             if ( line->unpacked & BCF_UN_INFO )
1660             {
1661                 // INFO: pairs of typed vectors
1662                 if ( line->d.shared_dirty & BCF1_DIRTY_INF )
1663                 {
1664                     bcf1_sync_info(line, &tmp);
1665                     ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
1666                 }
1667             }
1668         }
1669 
1670         int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
1671         if ( size ) kputsn_(ptr_ori, size, &tmp);
1672 
1673         free(line->shared.s);
1674         line->shared = tmp;
1675     }
1676     if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
1677     {
1678         // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
1679         size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
1680         int i;
1681         for (i=0; i<line->n_info; i++)
1682         {
1683             uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
1684             line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
1685             off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
1686             if ( vptr_free )
1687             {
1688                 free(vptr_free);
1689                 line->d.info[i].vptr_free = 0;
1690             }
1691         }
1692     }
1693 
1694     if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
1695     {
1696         // The genotype fields changed or are not present
1697         tmp.l = tmp.m = 0; tmp.s = NULL;
1698         int i, irm = -1;
1699         for (i=0; i<line->n_fmt; i++)
1700         {
1701             bcf_fmt_t *fmt = &line->d.fmt[i];
1702             if ( !fmt->p )
1703             {
1704                 // marked for removal
1705                 if ( irm < 0 ) irm = i;
1706                 continue;
1707             }
1708             kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
1709             if ( irm >=0 )
1710             {
1711                 bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
1712                 while ( irm<=i && line->d.fmt[irm].p ) irm++;
1713             }
1714 
1715         }
1716         if ( irm>=0 ) line->n_fmt = irm;
1717         free(line->indiv.s);
1718         line->indiv = tmp;
1719 
1720         // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
1721         size_t off_new = 0;
1722         for (i=0; i<line->n_fmt; i++)
1723         {
1724             uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
1725             line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
1726             off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
1727             if ( p_free )
1728             {
1729                 free(p_free);
1730                 line->d.fmt[i].p_free = 0;
1731             }
1732         }
1733     }
1734     if ( !line->n_sample ) line->n_fmt = 0;
1735     line->d.shared_dirty = line->d.indiv_dirty = 0;
1736     return 0;
1737 }
1738 
bcf_copy(bcf1_t * dst,bcf1_t * src)1739 bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
1740 {
1741     bcf1_sync(src);
1742 
1743     bcf_clear(dst);
1744     dst->rid  = src->rid;
1745     dst->pos  = src->pos;
1746     dst->rlen = src->rlen;
1747     dst->qual = src->qual;
1748     dst->n_info = src->n_info; dst->n_allele = src->n_allele;
1749     dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
1750 
1751     if ( dst->shared.m < src->shared.l )
1752     {
1753         dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
1754         dst->shared.m = src->shared.l;
1755     }
1756     dst->shared.l = src->shared.l;
1757     memcpy(dst->shared.s,src->shared.s,dst->shared.l);
1758 
1759     if ( dst->indiv.m < src->indiv.l )
1760     {
1761         dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
1762         dst->indiv.m = src->indiv.l;
1763     }
1764     dst->indiv.l = src->indiv.l;
1765     memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
1766 
1767     return dst;
1768 }
bcf_dup(bcf1_t * src)1769 bcf1_t *bcf_dup(bcf1_t *src)
1770 {
1771     bcf1_t *out = bcf_init1();
1772     return bcf_copy(out, src);
1773 }
1774 
bcf_write(htsFile * hfp,bcf_hdr_t * h,bcf1_t * v)1775 int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
1776 {
1777     if ( h->dirty ) {
1778         if (bcf_hdr_sync(h) < 0) return -1;
1779     }
1780     if ( bcf_hdr_nsamples(h)!=v->n_sample )
1781     {
1782         hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
1783             bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
1784         return -1;
1785     }
1786 
1787     if ( hfp->format.format == vcf || hfp->format.format == text_format )
1788         return vcf_write(hfp,h,v);
1789 
1790     if ( v->errcode )
1791     {
1792         // vcf_parse1() encountered a new contig or tag, undeclared in the
1793         // header.  At this point, the header must have been printed,
1794         // proceeding would lead to a broken BCF file. Errors must be checked
1795         // and cleared by the caller before we can proceed.
1796         hts_log_error("Unchecked error (%d) at %s:%"PRIhts_pos, v->errcode, bcf_seqname_safe(h,v), v->pos+1);
1797         return -1;
1798     }
1799     bcf1_sync(v);   // check if the BCF record was modified
1800 
1801     if ( v->unpacked & BCF_IS_64BIT )
1802     {
1803         hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
1804         return -1;
1805     }
1806 
1807     BGZF *fp = hfp->fp.bgzf;
1808     uint8_t x[32];
1809     u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
1810     u32_to_le(v->indiv.l, x + 4);
1811     i32_to_le(v->rid, x + 8);
1812     u32_to_le(v->pos, x + 12);
1813     u32_to_le(v->rlen, x + 16);
1814     float_to_le(v->qual, x + 20);
1815     u16_to_le(v->n_info, x + 24);
1816     u16_to_le(v->n_allele, x + 26);
1817     u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
1818     if ( bgzf_write(fp, x, 32) != 32 ) return -1;
1819     if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
1820     if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
1821 
1822     if (hfp->idx) {
1823         if (hts_idx_push(hfp->idx, v->rid, v->pos, v->pos + v->rlen, bgzf_tell(fp), 1) < 0)
1824             return -1;
1825     }
1826 
1827     return 0;
1828 }
1829 
1830 /**********************
1831  *** VCF header I/O ***
1832  **********************/
1833 
add_missing_contig_hrec(bcf_hdr_t * h,const char * name)1834 static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
1835     bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
1836     int save_errno;
1837     if (!hrec) goto fail;
1838 
1839     hrec->key = strdup("contig");
1840     if (!hrec->key) goto fail;
1841 
1842     if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
1843     if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
1844         goto fail;
1845     if (bcf_hdr_add_hrec(h, hrec) < 0)
1846         goto fail;
1847     return 0;
1848 
1849  fail:
1850     save_errno = errno;
1851     hts_log_error("%s", strerror(errno));
1852     if (hrec) bcf_hrec_destroy(hrec);
1853     errno = save_errno;
1854     return -1;
1855 }
1856 
vcf_hdr_read(htsFile * fp)1857 bcf_hdr_t *vcf_hdr_read(htsFile *fp)
1858 {
1859     kstring_t txt, *s = &fp->line;
1860     int ret;
1861     bcf_hdr_t *h;
1862     tbx_t *idx = NULL;
1863     const char **names = NULL;
1864     h = bcf_hdr_init("r");
1865     if (!h) {
1866         hts_log_error("Failed to allocate bcf header");
1867         return NULL;
1868     }
1869     txt.l = txt.m = 0; txt.s = 0;
1870     while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
1871         int e = 0;
1872         if (s->l == 0) continue;
1873         if (s->s[0] != '#') {
1874             hts_log_error("No sample line");
1875             goto error;
1876         }
1877         if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
1878             kstring_t tmp = { 0, 0, NULL };
1879             hFILE *f = hopen(fp->fn_aux, "r");
1880             if (f == NULL) {
1881                 hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
1882                 goto error;
1883             }
1884             while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
1885                 char *tab = strchr(tmp.s, '\t');
1886                 if (tab == NULL) continue;
1887                 e |= (kputs("##contig=<ID=", &txt) < 0);
1888                 e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
1889                 e |= (kputs(",length=", &txt) < 0);
1890                 e |= (kputl(atol(tab), &txt) < 0);
1891                 e |= (kputsn(">\n", 2, &txt) < 0);
1892             }
1893             free(tmp.s);
1894             if (hclose(f) != 0) {
1895                 hts_log_error("Error on closing %s", fp->fn_aux);
1896                 goto error;
1897             }
1898             if (e) goto error;
1899         }
1900         if (kputsn(s->s, s->l, &txt) < 0) goto error;
1901         if (kputc('\n', &txt) < 0) goto error;
1902         if (s->s[1] != '#') break;
1903     }
1904     if ( ret < -1 ) goto error;
1905     if ( !txt.s )
1906     {
1907         hts_log_error("Could not read the header");
1908         goto error;
1909     }
1910     if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
1911 
1912     // check tabix index, are all contigs listed in the header? add the missing ones
1913     idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL);
1914     if ( idx )
1915     {
1916         int i, n, need_sync = 0;
1917         names = tbx_seqnames(idx, &n);
1918         if (!names) goto error;
1919         for (i=0; i<n; i++)
1920         {
1921             bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
1922             if ( hrec ) continue;
1923             if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
1924             need_sync = 1;
1925         }
1926         if ( need_sync ) {
1927             if (bcf_hdr_sync(h) < 0) goto error;
1928         }
1929         free(names);
1930         tbx_destroy(idx);
1931     }
1932     free(txt.s);
1933     return h;
1934 
1935  error:
1936     if (idx) tbx_destroy(idx);
1937     free(names);
1938     free(txt.s);
1939     if (h) bcf_hdr_destroy(h);
1940     return NULL;
1941 }
1942 
bcf_hdr_set(bcf_hdr_t * hdr,const char * fname)1943 int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
1944 {
1945     int i = 0, n = 0, save_errno;
1946     char **lines = hts_readlines(fname, &n);
1947     if ( !lines ) return 1;
1948     for (i=0; i<n-1; i++)
1949     {
1950         int k;
1951         bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
1952         if (!hrec) goto fail;
1953         if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1954             bcf_hrec_destroy(hrec);
1955             goto fail;
1956         }
1957         free(lines[i]);
1958         lines[i] = NULL;
1959     }
1960     if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
1961     if (bcf_hdr_sync(hdr) < 0) goto fail;
1962     free(lines[n-1]);
1963     free(lines);
1964     return 0;
1965 
1966  fail:
1967     save_errno = errno;
1968     for (; i < n; i++)
1969         free(lines[i]);
1970     free(lines);
1971     errno = save_errno;
1972     return 1;
1973 }
1974 
_bcf_hrec_format(const bcf_hrec_t * hrec,int is_bcf,kstring_t * str)1975 static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
1976 {
1977     uint32_t e = 0;
1978     if ( !hrec->value )
1979     {
1980         int j, nout = 0;
1981         e |= ksprintf(str, "##%s=<", hrec->key) < 0;
1982         for (j=0; j<hrec->nkeys; j++)
1983         {
1984             // do not output IDX if output is VCF
1985             if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
1986             if ( nout ) e |= kputc(',',str) < 0;
1987             e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
1988             nout++;
1989         }
1990         e |= ksprintf(str,">\n") < 0;
1991     }
1992     else
1993         e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
1994 
1995     return e == 0 ? 0 : -1;
1996 }
1997 
bcf_hrec_format(const bcf_hrec_t * hrec,kstring_t * str)1998 int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
1999 {
2000     return _bcf_hrec_format(hrec,0,str);
2001 }
2002 
bcf_hdr_format(const bcf_hdr_t * hdr,int is_bcf,kstring_t * str)2003 int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2004 {
2005     int i, r = 0;
2006     for (i=0; i<hdr->nhrec; i++)
2007         r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2008 
2009     r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2010     if ( bcf_hdr_nsamples(hdr) )
2011     {
2012         r |= ksprintf(str, "\tFORMAT") < 0;
2013         for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2014             r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2015     }
2016     r |= ksprintf(str, "\n") < 0;
2017 
2018     return r ? -1 : 0;
2019 }
2020 
bcf_hdr_fmt_text(const bcf_hdr_t * hdr,int is_bcf,int * len)2021 char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2022 {
2023     kstring_t txt = {0,0,0};
2024     if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2025         return NULL;
2026     if ( len ) *len = txt.l;
2027     return txt.s;
2028 }
2029 
bcf_hdr_seqnames(const bcf_hdr_t * h,int * n)2030 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2031 {
2032     vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2033     int tid, m = kh_size(d);
2034     const char **names = (const char**) calloc(m,sizeof(const char*));
2035     khint_t k;
2036     for (k=kh_begin(d); k<kh_end(d); k++)
2037     {
2038         if ( !kh_exist(d,k) ) continue;
2039         tid = kh_val(d,k).id;
2040         assert( tid<m );
2041         names[tid] = kh_key(d,k);
2042     }
2043     // sanity check: there should be no gaps
2044     for (tid=0; tid<m; tid++)
2045         assert(names[tid]);
2046     *n = m;
2047     return names;
2048 }
2049 
vcf_hdr_write(htsFile * fp,const bcf_hdr_t * h)2050 int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2051 {
2052     kstring_t htxt = {0,0,0};
2053     if (bcf_hdr_format(h, 0, &htxt) < 0) {
2054         free(htxt.s);
2055         return -1;
2056     }
2057     while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2058     int ret;
2059     if ( fp->format.compression!=no_compression )
2060         ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2061     else
2062         ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2063     free(htxt.s);
2064     return ret<0 ? -1 : 0;
2065 }
2066 
2067 /***********************
2068  *** Typed value I/O ***
2069  ***********************/
2070 
bcf_enc_vint(kstring_t * s,int n,int32_t * a,int wsize)2071 int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2072 {
2073     int32_t max = INT32_MIN, min = INT32_MAX;
2074     int i;
2075     if (n <= 0) bcf_enc_size(s, 0, BCF_BT_NULL);
2076     else if (n == 1) bcf_enc_int1(s, a[0]);
2077     else {
2078         if (wsize <= 0) wsize = n;
2079         for (i = 0; i < n; ++i) {
2080             if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue;
2081             if (max < a[i]) max = a[i];
2082             if (min > a[i]) min = a[i];
2083         }
2084         if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2085             bcf_enc_size(s, wsize, BCF_BT_INT8);
2086             for (i = 0; i < n; ++i)
2087                 if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s);
2088                 else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s);
2089                 else kputc(a[i], s);
2090         } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2091             uint8_t *p;
2092             bcf_enc_size(s, wsize, BCF_BT_INT16);
2093             ks_resize(s, s->l + n * sizeof(int16_t));
2094             p = (uint8_t *) s->s + s->l;
2095             for (i = 0; i < n; ++i)
2096             {
2097                 int16_t x;
2098                 if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2099                 else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2100                 else x = a[i];
2101                 i16_to_le(x, p);
2102                 p += sizeof(int16_t);
2103             }
2104             s->l += n * sizeof(int16_t);
2105         } else {
2106             uint8_t *p;
2107             bcf_enc_size(s, wsize, BCF_BT_INT32);
2108             ks_resize(s, s->l + n * sizeof(int32_t));
2109             p = (uint8_t *) s->s + s->l;
2110             for (i = 0; i < n; ++i) {
2111                 i32_to_le(a[i], p);
2112                 p += sizeof(int32_t);
2113             }
2114             s->l += n * sizeof(int32_t);
2115         }
2116     }
2117 
2118     return 0; // FIXME: check for errs in this function
2119 }
2120 
2121 #ifdef VCF_ALLOW_INT64
bcf_enc_long1(kstring_t * s,int64_t x)2122 static int bcf_enc_long1(kstring_t *s, int64_t x) {
2123     uint32_t e = 0;
2124     if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2125         return bcf_enc_int1(s, x);
2126     if (x == bcf_int64_vector_end) {
2127         e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2128         e |= kputc(bcf_int8_vector_end, s) < 0;
2129     } else if (x == bcf_int64_missing) {
2130         e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2131         e |= kputc(bcf_int8_missing, s) < 0;
2132     } else {
2133         e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2134         e |= ks_expand(s, 8);
2135         if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2136     }
2137     return e == 0 ? 0 : -1;
2138 }
2139 #endif
2140 
serialize_float_array(kstring_t * s,size_t n,const float * a)2141 static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2142     uint8_t *p;
2143     size_t i;
2144     size_t bytes = n * sizeof(float);
2145 
2146     if (bytes / sizeof(float) != n) return -1;
2147     if (ks_resize(s, s->l + bytes) < 0) return -1;
2148 
2149     p = (uint8_t *) s->s + s->l;
2150     for (i = 0; i < n; i++) {
2151         float_to_le(a[i], p);
2152         p += sizeof(float);
2153     }
2154     s->l += bytes;
2155 
2156     return 0;
2157 }
2158 
bcf_enc_vfloat(kstring_t * s,int n,float * a)2159 int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2160 {
2161     assert(n >= 0);
2162     bcf_enc_size(s, n, BCF_BT_FLOAT);
2163     serialize_float_array(s, n, a);
2164     return 0; // FIXME: check for errs in this function
2165 }
2166 
bcf_enc_vchar(kstring_t * s,int l,const char * a)2167 int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2168 {
2169     bcf_enc_size(s, l, BCF_BT_CHAR);
2170     kputsn(a, l, s);
2171     return 0; // FIXME: check for errs in this function
2172 }
2173 
bcf_fmt_array(kstring_t * s,int n,int type,void * data)2174 int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
2175 {
2176     int j = 0;
2177     uint32_t e = 0;
2178     if (n == 0) {
2179         return kputc('.', s) >= 0 ? 0 : -1;
2180     }
2181     if (type == BCF_BT_CHAR)
2182     {
2183         char *p = (char*)data;
2184         for (j = 0; j < n && *p; ++j, ++p)
2185         {
2186             if ( *p==bcf_str_missing ) e |= kputc('.', s) < 0;
2187             else e |= kputc(*p, s) < 0;
2188         }
2189     }
2190     else
2191     {
2192         #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
2193             uint8_t *p = (uint8_t *) data; \
2194             for (j=0; j<n; j++, p += sizeof(type_t))    \
2195             { \
2196                 type_t v = convert(p); \
2197                 if ( is_vector_end ) break; \
2198                 if ( j ) kputc(',', s); \
2199                 if ( is_missing ) kputc('.', s); \
2200                 else e |= kprint < 0; \
2201             } \
2202         }
2203         switch (type) {
2204             case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
2205             case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
2206             case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
2207             case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
2208             default: hts_log_error("Unexpected type %d", type); exit(1); break;
2209         }
2210         #undef BRANCH
2211     }
2212     return e == 0 ? 0 : -1;
2213 }
2214 
bcf_fmt_sized_array(kstring_t * s,uint8_t * ptr)2215 uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
2216 {
2217     int x, type;
2218     x = bcf_dec_size(ptr, &ptr, &type);
2219     bcf_fmt_array(s, x, type, ptr);
2220     return ptr + (x << bcf_type_shift[type]);
2221 }
2222 
2223 /********************
2224  *** VCF site I/O ***
2225  ********************/
2226 
2227 typedef struct {
2228     int key, max_m, size, offset;
2229     uint32_t is_gt:1, max_g:31;
2230     uint32_t max_l;
2231     uint32_t y;
2232     uint8_t *buf;
2233 } fmt_aux_t;
2234 
align_mem(kstring_t * s)2235 static inline int align_mem(kstring_t *s)
2236 {
2237     int e = 0;
2238     if (s->l&7) {
2239         uint64_t zero = 0;
2240         e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
2241     }
2242     return e == 0 ? 0 : -1;
2243 }
2244 
2245 // p,q is the start and the end of the FORMAT field
2246 #define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
vcf_parse_format(kstring_t * s,const bcf_hdr_t * h,bcf1_t * v,char * p,char * q)2247 static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
2248 {
2249     if ( !bcf_hdr_nsamples(h) ) return 0;
2250 
2251     static int extreme_val_warned = 0;
2252     char *r, *t;
2253     int j, l, m, g, overflow = 0;
2254     khint_t k;
2255     ks_tokaux_t aux1;
2256     vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2257     kstring_t *mem = (kstring_t*)&h->mem;
2258     fmt_aux_t fmt[MAX_N_FMT];
2259     mem->l = 0;
2260 
2261     char *end = s->s + s->l;
2262     if ( q>=end )
2263     {
2264         hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
2265         v->errcode |= BCF_ERR_NCOLS;
2266         return -1;
2267     }
2268 
2269     v->n_fmt = 0;
2270     if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
2271     {
2272         v->n_sample = bcf_hdr_nsamples(h);
2273         return 0;
2274     }
2275 
2276     // get format information from the dictionary
2277     for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
2278         if (j >= MAX_N_FMT) {
2279             v->errcode |= BCF_ERR_LIMITS;
2280             hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
2281                 bcf_seqname_safe(h,v), v->pos+1);
2282             return -1;
2283         }
2284 
2285         *(char*)aux1.p = 0;
2286         k = kh_get(vdict, d, t);
2287         if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
2288             if ( t[0]=='.' && t[1]==0 )
2289             {
2290                 hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2291                 v->errcode |= BCF_ERR_TAG_INVALID;
2292                 return -1;
2293             }
2294             hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
2295             kstring_t tmp = {0,0,0};
2296             int l;
2297             ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
2298             bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2299             free(tmp.s);
2300             int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2301             if (res < 0) bcf_hrec_destroy(hrec);
2302             if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2303 
2304             k = kh_get(vdict, d, t);
2305             v->errcode = BCF_ERR_TAG_UNDEF;
2306             if (res || k == kh_end(d)) {
2307                 hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
2308                 v->errcode |= BCF_ERR_TAG_INVALID;
2309                 return -1;
2310             }
2311         }
2312         fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
2313         fmt[j].key = kh_val(d, k).id;
2314         fmt[j].is_gt = !strcmp(t, "GT");
2315         fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
2316         v->n_fmt++;
2317     }
2318     // compute max
2319     int n_sample_ori = -1;
2320     r = q + 1;  // r: position in the format string
2321     l = 0, m = g = 1, v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
2322     while ( r<end )
2323     {
2324         // can we skip some samples?
2325         if ( h->keep_samples )
2326         {
2327             n_sample_ori++;
2328             if ( !bit_array_test(h->keep_samples,n_sample_ori) )
2329             {
2330                 while ( *r!='\t' && r<end ) r++;
2331                 if ( *r=='\t' ) { *r = 0; r++; }
2332                 continue;
2333             }
2334         }
2335 
2336         // collect fmt stats: max vector size, length, number of alleles
2337         j = 0;  // j-th format field
2338         fmt_aux_t *f = fmt;
2339         for (;;) {
2340             switch (*r) {
2341             case ',':
2342                 m++;
2343                 break;
2344 
2345             case '|':
2346             case '/':
2347                 if (f->is_gt) g++;
2348                 break;
2349 
2350             case '\t':
2351                 *r = 0; // fall through
2352 
2353             case '\0':
2354             case ':':
2355                 if (f->max_m < m) f->max_m = m;
2356                 if (f->max_l < l) f->max_l = l;
2357                 if (f->is_gt && f->max_g < g) f->max_g = g;
2358                 l = 0, m = g = 1;
2359                 if ( *r==':' ) {
2360                     j++; f++;
2361                     if ( j>=v->n_fmt ) {
2362                         hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
2363                                       h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
2364                         v->errcode |= BCF_ERR_NCOLS;
2365                         return -1;
2366                     }
2367                 } else goto end_for;
2368                 break;
2369             }
2370             if ( r>=end ) break;
2371             r++; l++;
2372         }
2373     end_for:
2374         v->n_sample++;
2375         if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
2376         r++;
2377     }
2378 
2379     // allocate memory for arrays
2380     for (j = 0; j < v->n_fmt; ++j) {
2381         fmt_aux_t *f = &fmt[j];
2382         if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
2383         if ((f->y>>4&0xf) == BCF_HT_STR) {
2384             f->size = f->is_gt? f->max_g << 2 : f->max_l;
2385         } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
2386             f->size = f->max_m << 2;
2387         } else
2388         {
2389             hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2390             v->errcode |= BCF_ERR_TAG_INVALID;
2391             return -1;
2392         }
2393         if (align_mem(mem) < 0) {
2394             hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2395             v->errcode |= BCF_ERR_LIMITS;
2396             return -1;
2397         }
2398 
2399         // Limit the total memory to ~2Gb per VCF row.  This should mean
2400         // malformed VCF data is less likely to take excessive memory and/or
2401         // time.
2402         if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
2403             hts_log_error("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2404             v->errcode |= BCF_ERR_LIMITS;
2405             return -1;
2406         }
2407 
2408         f->offset = mem->l;
2409         if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
2410             hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2411             v->errcode |= BCF_ERR_LIMITS;
2412             return -1;
2413         }
2414         mem->l += v->n_sample * f->size;
2415     }
2416     for (j = 0; j < v->n_fmt; ++j)
2417         fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
2418     // fill the sample fields; at beginning of the loop, t points to the first char of a format
2419     n_sample_ori = -1;
2420     t = q + 1; m = 0;   // m: sample id
2421     while ( t<end )
2422     {
2423         // can we skip some samples?
2424         if ( h->keep_samples )
2425         {
2426             n_sample_ori++;
2427             if ( !bit_array_test(h->keep_samples,n_sample_ori) )
2428             {
2429                 while ( *t && t<end ) t++;
2430                 t++;
2431                 continue;
2432             }
2433         }
2434         if ( m == bcf_hdr_nsamples(h) ) break;
2435 
2436         j = 0; // j-th format field, m-th sample
2437         while ( t < end )
2438         {
2439             fmt_aux_t *z = &fmt[j++];
2440             if (!z->buf) {
2441                 hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
2442                               z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2443                 v->errcode |= BCF_ERR_LIMITS;
2444                 return -1;
2445             }
2446             if ((z->y>>4&0xf) == BCF_HT_STR) {
2447                 if (z->is_gt) { // genotypes
2448                     int32_t is_phased = 0;
2449                     uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
2450                     uint32_t unreadable = 0;
2451                     uint32_t max = 0;
2452                     overflow = 0;
2453                     for (l = 0;; ++t) {
2454                         if (*t == '.') {
2455                             ++t, x[l++] = is_phased;
2456                         } else {
2457                             char *tt = t;
2458                             uint32_t val = hts_str2uint(t, &t, sizeof(val) * CHAR_MAX - 2, &overflow);
2459                             unreadable |= tt == t;
2460                             if (max < val) max = val;
2461                             x[l++] = (val + 1) << 1 | is_phased;
2462                         }
2463                         is_phased = (*t == '|');
2464                         if (*t != '|' && *t != '/') break;
2465                     }
2466                     // Possibly check max against v->n_allele instead?
2467                     if (overflow || max > (INT32_MAX >> 1) - 1) {
2468                         hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2469                         return -1;
2470                     }
2471                     if (unreadable) {
2472                         hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2473                         return -1;
2474                     }
2475                     if ( !l ) x[l++] = 0;   // An empty field, insert missing value
2476                     for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2477                 } else {
2478                     char *x = (char*)z->buf + z->size * (size_t)m;
2479                     for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
2480                     for (; l < z->size; ++l) x[l] = 0;
2481                 }
2482             } else if ((z->y>>4&0xf) == BCF_HT_INT) {
2483                 int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
2484                 for (l = 0;; ++t) {
2485                     if (*t == '.') {
2486                         x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
2487                     } else {
2488                         overflow = 0;
2489                         char *te;
2490                         long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
2491                         if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
2492                         {
2493                             if ( !extreme_val_warned )
2494                             {
2495                                 hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
2496                                 extreme_val_warned = 1;
2497                             }
2498                             tmp_val = bcf_int32_missing;
2499                         }
2500                         x[l++] = tmp_val;
2501                         t = te;
2502                     }
2503                     if (*t != ',') break;
2504                 }
2505                 if ( !l ) x[l++] = bcf_int32_missing;
2506                 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2507             } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
2508                 float *x = (float*)(z->buf + z->size * (size_t)m);
2509                 for (l = 0;; ++t) {
2510                     if (*t == '.' && !isdigit_c(t[1])) {
2511                         bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
2512                     } else {
2513                         overflow = 0;
2514                         char *te;
2515                         float tmp_val = hts_str2dbl(t, &te, &overflow);
2516                         if ( (te==t || overflow) && !extreme_val_warned )
2517                         {
2518                             hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
2519                             extreme_val_warned = 1;
2520                         }
2521                         x[l++] = tmp_val;
2522                         t = te;
2523                     }
2524                     if (*t != ',') break;
2525                 }
2526                 if ( !l ) bcf_float_set_missing(x[l++]);    // An empty field, insert missing value
2527                 for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
2528             } else {
2529                 hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2530                 v->errcode |= BCF_ERR_TAG_INVALID;
2531                 return -1;
2532             }
2533 
2534             if (*t == '\0') {
2535                 break;
2536             }
2537             else if (*t == ':') {
2538                 t++;
2539             }
2540             else {
2541                 char buffer[8];
2542                 hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
2543                     hts_strprint(buffer, sizeof buffer, '\'', t, 1),
2544                     h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
2545                 v->errcode |= BCF_ERR_CHAR;
2546                 return -1;
2547             }
2548         }
2549 
2550         for (; j < v->n_fmt; ++j) { // fill end-of-vector values
2551             fmt_aux_t *z = &fmt[j];
2552             if ((z->y>>4&0xf) == BCF_HT_STR) {
2553                 if (z->is_gt) {
2554                     int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
2555                     if (z->size) x[0] = bcf_int32_missing;
2556                     for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2557                 } else {
2558                     char *x = (char*)z->buf + z->size * (size_t)m;
2559                     if ( z->size ) x[0] = '.';
2560                     for (l = 1; l < z->size; ++l) x[l] = 0;
2561                 }
2562             } else if ((z->y>>4&0xf) == BCF_HT_INT) {
2563                 int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
2564                 x[0] = bcf_int32_missing;
2565                 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2566             } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
2567                 float *x = (float*)(z->buf + z->size * (size_t)m);
2568                 bcf_float_set_missing(x[0]);
2569                 for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
2570             }
2571         }
2572 
2573         m++; t++;
2574     }
2575 
2576     // write individual genotype information
2577     kstring_t *str = &v->indiv;
2578     int i;
2579     if (v->n_sample > 0) {
2580         for (i = 0; i < v->n_fmt; ++i) {
2581             fmt_aux_t *z = &fmt[i];
2582             bcf_enc_int1(str, z->key);
2583             if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
2584                 bcf_enc_size(str, z->size, BCF_BT_CHAR);
2585                 kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
2586             } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
2587                 bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
2588             } else {
2589                 bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
2590                 if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
2591                                           (float *) z->buf) != 0) {
2592                     v->errcode |= BCF_ERR_LIMITS;
2593                     hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2594                     return -1;
2595                 }
2596             }
2597         }
2598     }
2599 
2600     if ( v->n_sample!=bcf_hdr_nsamples(h) )
2601     {
2602         hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2603             bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2604         v->errcode |= BCF_ERR_NCOLS;
2605         return -1;
2606     }
2607     if ( v->indiv.l > 0xffffffff )
2608     {
2609         hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
2610         v->errcode |= BCF_ERR_LIMITS;
2611 
2612         // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
2613         v->n_fmt = 0;
2614         return -1;
2615     }
2616 
2617     return 0;
2618 }
2619 
fix_chromosome(const bcf_hdr_t * h,vdict_t * d,const char * p)2620 static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
2621     // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
2622     // been already printed, but will enable tools like vcfcheck to proceed.
2623 
2624     kstring_t tmp = {0,0,0};
2625     khint_t k;
2626     int l;
2627     if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
2628         return kh_end(d);
2629     bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2630     free(tmp.s);
2631     int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2632     if (res < 0) bcf_hrec_destroy(hrec);
2633     if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2634     k = kh_get(vdict, d, p);
2635 
2636     return k;
2637 }
2638 
vcf_parse_filter(kstring_t * str,const bcf_hdr_t * h,bcf1_t * v,char * p,char * q)2639 static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
2640     int i, n_flt = 1, max_n_flt = 0;
2641     char *r, *t;
2642     int32_t *a_flt = NULL;
2643     ks_tokaux_t aux1;
2644     khint_t k;
2645     vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2646     // count the number of filters
2647     if (*(q-1) == ';') *(q-1) = 0;
2648     for (r = p; *r; ++r)
2649         if (*r == ';') ++n_flt;
2650     if (n_flt > max_n_flt) {
2651         a_flt = malloc(n_flt * sizeof(*a_flt));
2652         if (!a_flt) {
2653             hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2654             v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
2655             return -1;
2656         }
2657         max_n_flt = n_flt;
2658     }
2659     // add filters
2660     for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
2661         *(char*)aux1.p = 0;
2662         k = kh_get(vdict, d, t);
2663         if (k == kh_end(d))
2664         {
2665             // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
2666             // been already printed, but will enable tools like vcfcheck to proceed.
2667             hts_log_warning("FILTER '%s' is not defined in the header", t);
2668             kstring_t tmp = {0,0,0};
2669             int l;
2670             ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
2671             bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2672             free(tmp.s);
2673             int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2674             if (res < 0) bcf_hrec_destroy(hrec);
2675             if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2676             k = kh_get(vdict, d, t);
2677             v->errcode |= BCF_ERR_TAG_UNDEF;
2678             if (res || k == kh_end(d)) {
2679                 hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
2680                 v->errcode |= BCF_ERR_TAG_INVALID;
2681                 free(a_flt);
2682                 return -1;
2683             }
2684         }
2685         a_flt[i++] = kh_val(d, k).id;
2686     }
2687 
2688     bcf_enc_vint(str, n_flt, a_flt, -1);
2689     free(a_flt);
2690 
2691     return 0;
2692 }
2693 
vcf_parse_info(kstring_t * str,const bcf_hdr_t * h,bcf1_t * v,char * p,char * q)2694 static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
2695     static int extreme_int_warned = 0, negative_rlen_warned = 0;
2696     int max_n_val = 0, overflow = 0;
2697     char *r, *key;
2698     khint_t k;
2699     vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2700     int32_t *a_val = NULL;
2701 
2702     v->n_info = 0;
2703     if (*(q-1) == ';') *(q-1) = 0;
2704     for (r = key = p;; ++r) {
2705         int c;
2706         char *val, *end;
2707         if (*r != ';' && *r != '=' && *r != 0) continue;
2708         if (v->n_info == UINT16_MAX) {
2709             hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
2710                           bcf_seqname_safe(h,v), v->pos+1);
2711             v->errcode |= BCF_ERR_LIMITS;
2712             goto fail;
2713         }
2714         val = end = 0;
2715         c = *r; *r = 0;
2716         if (c == '=') {
2717             val = r + 1;
2718             for (end = val; *end != ';' && *end != 0; ++end);
2719             c = *end; *end = 0;
2720         } else end = r;
2721         if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
2722         k = kh_get(vdict, d, key);
2723         if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
2724         {
2725             hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
2726             kstring_t tmp = {0,0,0};
2727             int l;
2728             ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
2729             bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2730             free(tmp.s);
2731             int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2732             if (res < 0) bcf_hrec_destroy(hrec);
2733             if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2734             k = kh_get(vdict, d, key);
2735             v->errcode = BCF_ERR_TAG_UNDEF;
2736             if (res || k == kh_end(d)) {
2737                 hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
2738                 v->errcode |= BCF_ERR_TAG_INVALID;
2739                 goto fail;
2740             }
2741         }
2742         uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
2743         ++v->n_info;
2744         bcf_enc_int1(str, kh_val(d, k).id);
2745         if (val == 0) {
2746             bcf_enc_size(str, 0, BCF_BT_NULL);
2747         } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
2748             bcf_enc_vchar(str, end - val, val);
2749         } else { // int/float value/array
2750             int i, n_val;
2751             char *t, *te;
2752             for (t = val, n_val = 1; *t; ++t) // count the number of values
2753                 if (*t == ',') ++n_val;
2754             // Check both int and float size in one step for simplicity
2755             if (n_val > max_n_val) {
2756                 int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
2757                 if (!a_tmp) {
2758                     hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2759                     v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
2760                     goto fail;
2761                 }
2762                 a_val = a_tmp;
2763                 max_n_val = n_val;
2764             }
2765             if ((y>>4&0xf) == BCF_HT_INT) {
2766                 i = 0, t = val;
2767                 int64_t val1;
2768                 int is_int64 = 0;
2769 #ifdef VCF_ALLOW_INT64
2770                 if ( n_val==1 )
2771                 {
2772                     overflow = 0;
2773                     long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
2774                     if ( te==val ) tmp_val = bcf_int32_missing;
2775                     else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
2776                     {
2777                         if ( !extreme_int_warned )
2778                         {
2779                             hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
2780                             extreme_int_warned = 1;
2781                         }
2782                         tmp_val = bcf_int32_missing;
2783                     }
2784                     else
2785                         is_int64 = 1;
2786                     val1 = tmp_val;
2787                     t = te;
2788                     i = 1;  // this is just to avoid adding another nested block...
2789                 }
2790 #endif
2791                 for (; i < n_val; ++i, ++t)
2792                 {
2793                     overflow = 0;
2794                     long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
2795                     if ( te==t ) tmp_val = bcf_int32_missing;
2796                     else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
2797                     {
2798                         if ( !extreme_int_warned )
2799                         {
2800                             hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
2801                             extreme_int_warned = 1;
2802                         }
2803                         tmp_val = bcf_int32_missing;
2804                     }
2805                     a_val[i] = tmp_val;
2806                     for (t = te; *t && *t != ','; t++);
2807                 }
2808                 if (n_val == 1) {
2809 #ifdef VCF_ALLOW_INT64
2810                     if ( is_int64 )
2811                     {
2812                         v->unpacked |= BCF_IS_64BIT;
2813                         bcf_enc_long1(str, val1);
2814                     }
2815                     else
2816                         bcf_enc_int1(str, (int32_t)val1);
2817 #else
2818                     val1 = a_val[0];
2819                     bcf_enc_int1(str, (int32_t)val1);
2820 #endif
2821                 } else {
2822                     bcf_enc_vint(str, n_val, a_val, -1);
2823                 }
2824                 if (n_val==1 && (val1!=bcf_int32_missing || is_int64) && strcmp(key, "END") == 0)
2825                 {
2826                     if ( val1 <= v->pos )
2827                     {
2828                         if ( !negative_rlen_warned )
2829                         {
2830                             hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
2831                             negative_rlen_warned = 1;
2832                         }
2833                     }
2834                     else
2835                         v->rlen = val1 - v->pos;
2836                 }
2837             } else if ((y>>4&0xf) == BCF_HT_REAL) {
2838                 float *val_f = (float *)a_val;
2839                 for (i = 0, t = val; i < n_val; ++i, ++t)
2840                 {
2841                     overflow = 0;
2842                     val_f[i] = hts_str2dbl(t, &te, &overflow);
2843                     if ( te==t || overflow ) // conversion failed
2844                         bcf_float_set_missing(val_f[i]);
2845                     for (t = te; *t && *t != ','; t++);
2846                 }
2847                 bcf_enc_vfloat(str, n_val, val_f);
2848             }
2849         }
2850         if (c == 0) break;
2851         r = end;
2852         key = r + 1;
2853     }
2854 
2855     free(a_val);
2856     return 0;
2857 
2858  fail:
2859     free(a_val);
2860     return -1;
2861 }
2862 
vcf_parse(kstring_t * s,const bcf_hdr_t * h,bcf1_t * v)2863 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
2864 {
2865     int i = 0, ret = -2, overflow = 0;
2866     char *p, *q, *r, *t;
2867     kstring_t *str;
2868     khint_t k;
2869     ks_tokaux_t aux;
2870 
2871     if (!s || !h || !v || !(s->s))
2872         return ret;
2873 
2874     // Assumed in lots of places, but we may as well spot this early
2875     assert(sizeof(float) == sizeof(int32_t));
2876 
2877     bcf_clear1(v);
2878     str = &v->shared;
2879     memset(&aux, 0, sizeof(ks_tokaux_t));
2880     for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) {
2881         q = (char*)aux.p;
2882         *q = 0;
2883         if (i == 0) { // CHROM
2884             vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2885             k = kh_get(vdict, d, p);
2886             if (k == kh_end(d))
2887             {
2888                 hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
2889                 v->errcode = BCF_ERR_CTG_UNDEF;
2890                 if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
2891                     hts_log_error("Could not add dummy header for contig '%s'", p);
2892                     v->errcode |= BCF_ERR_CTG_INVALID;
2893                     goto err;
2894                 }
2895             }
2896             v->rid = kh_val(d, k).id;
2897         } else if (i == 1) { // POS
2898             overflow = 0;
2899             v->pos = hts_str2uint(p, &p, 63, &overflow);
2900             if (overflow) {
2901                 hts_log_error("Position value '%s' is too large", p);
2902                 goto err;
2903             } else {
2904                 v->pos -= 1;
2905             }
2906             if (v->pos >= INT32_MAX)
2907                 v->unpacked |= BCF_IS_64BIT;
2908         } else if (i == 2) { // ID
2909             if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
2910             else bcf_enc_size(str, 0, BCF_BT_CHAR);
2911         } else if (i == 3) { // REF
2912             bcf_enc_vchar(str, q - p, p);
2913             v->n_allele = 1, v->rlen = q - p;
2914         } else if (i == 4) { // ALT
2915             if (strcmp(p, ".")) {
2916                 for (r = t = p;; ++r) {
2917                     if (*r == ',' || *r == 0) {
2918                         if (v->n_allele == UINT16_MAX) {
2919                             hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
2920                                           bcf_seqname_safe(h,v), v->pos+1);
2921                             v->errcode |= BCF_ERR_LIMITS;
2922                             goto err;
2923                         }
2924                         bcf_enc_vchar(str, r - t, t);
2925                         t = r + 1;
2926                         ++v->n_allele;
2927                     }
2928                     if (r == q) break;
2929                 }
2930             }
2931         } else if (i == 5) { // QUAL
2932             if (strcmp(p, ".")) v->qual = atof(p);
2933             else bcf_float_set_missing(v->qual);
2934             if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
2935         } else if (i == 6) { // FILTER
2936             if (strcmp(p, ".")) {
2937                 if (vcf_parse_filter(str, h, v, p, q)) goto err;
2938             } else bcf_enc_vint(str, 0, 0, -1);
2939             if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
2940         } else if (i == 7) { // INFO
2941             if (strcmp(p, ".")) {
2942                 if (vcf_parse_info(str, h, v, p, q)) goto err;
2943             }
2944             if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
2945         } else if (i == 8) {// FORMAT
2946             return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
2947         }
2948     }
2949 
2950  end:
2951     ret = 0;
2952 
2953  err:
2954     return ret;
2955 }
2956 
vcf_open_mode(char * mode,const char * fn,const char * format)2957 int vcf_open_mode(char *mode, const char *fn, const char *format)
2958 {
2959     if (format == NULL) {
2960         // Try to pick a format based on the filename extension
2961         char extension[HTS_MAX_EXT_LEN];
2962         if (find_file_extension(fn, extension) < 0) return -1;
2963         return vcf_open_mode(mode, fn, extension);
2964     }
2965     else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
2966     else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
2967     else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
2968     else return -1;
2969 
2970     return 0;
2971 }
2972 
vcf_read(htsFile * fp,const bcf_hdr_t * h,bcf1_t * v)2973 int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2974 {
2975     int ret;
2976     ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
2977     if (ret < 0) return ret;
2978     return vcf_parse1(&fp->line, h, v);
2979 }
2980 
bcf_unpack_fmt_core1(uint8_t * ptr,int n_sample,bcf_fmt_t * fmt)2981 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
2982 {
2983     uint8_t *ptr_start = ptr;
2984     fmt->id = bcf_dec_typed_int1(ptr, &ptr);
2985     fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
2986     fmt->size = fmt->n << bcf_type_shift[fmt->type];
2987     fmt->p = ptr;
2988     fmt->p_off  = ptr - ptr_start;
2989     fmt->p_free = 0;
2990     ptr += n_sample * fmt->size;
2991     fmt->p_len = ptr - fmt->p;
2992     return ptr;
2993 }
2994 
bcf_unpack_info_core1(uint8_t * ptr,bcf_info_t * info)2995 static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
2996 {
2997     uint8_t *ptr_start = ptr;
2998     info->key = bcf_dec_typed_int1(ptr, &ptr);
2999     info->len = bcf_dec_size(ptr, &ptr, &info->type);
3000     info->vptr = ptr;
3001     info->vptr_off  = ptr - ptr_start;
3002     info->vptr_free = 0;
3003     info->v1.i = 0;
3004     if (info->len == 1) {
3005         if (info->type == BCF_BT_INT8 || info->type == BCF_BT_CHAR) info->v1.i = *(int8_t*)ptr;
3006         else if (info->type == BCF_BT_INT32) info->v1.i = le_to_i32(ptr);
3007         else if (info->type == BCF_BT_FLOAT) info->v1.f = le_to_float(ptr);
3008         else if (info->type == BCF_BT_INT16) info->v1.i = le_to_i16(ptr);
3009         else if (info->type == BCF_BT_INT64) info->v1.i = le_to_i64(ptr);
3010     }
3011     ptr += info->len << bcf_type_shift[info->type];
3012     info->vptr_len = ptr - info->vptr;
3013     return ptr;
3014 }
3015 
bcf_unpack(bcf1_t * b,int which)3016 int bcf_unpack(bcf1_t *b, int which)
3017 {
3018     if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
3019     uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
3020     int i;
3021     bcf_dec_t *d = &b->d;
3022     if (which & BCF_UN_FLT) which |= BCF_UN_STR;
3023     if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
3024     if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
3025     {
3026         kstring_t tmp;
3027 
3028         // ID
3029         tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
3030         ptr_ori = ptr;
3031         ptr = bcf_fmt_sized_array(&tmp, ptr);
3032         b->unpack_size[0] = ptr - ptr_ori;
3033         kputc('\0', &tmp);
3034         d->id = tmp.s; d->m_id = tmp.m;
3035 
3036         // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
3037         hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
3038         tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
3039         ptr_ori = ptr;
3040         for (i = 0; i < b->n_allele; ++i) {
3041             // Use offset within tmp.s as realloc may change pointer
3042             d->allele[i] = (char *)(intptr_t)tmp.l;
3043             ptr = bcf_fmt_sized_array(&tmp, ptr);
3044             kputc('\0', &tmp);
3045         }
3046         b->unpack_size[1] = ptr - ptr_ori;
3047         d->als = tmp.s; d->m_als = tmp.m;
3048 
3049         // Convert our offsets within tmp.s back to pointers again
3050         for (i = 0; i < b->n_allele; ++i)
3051             d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
3052         b->unpacked |= BCF_UN_STR;
3053     }
3054     if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
3055         ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
3056         ptr_ori = ptr;
3057         if (*ptr>>4) {
3058             int type;
3059             d->n_flt = bcf_dec_size(ptr, &ptr, &type);
3060             hts_expand(int, d->n_flt, d->m_flt, d->flt);
3061             for (i = 0; i < d->n_flt; ++i)
3062                 d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
3063         } else ++ptr, d->n_flt = 0;
3064         b->unpack_size[2] = ptr - ptr_ori;
3065         b->unpacked |= BCF_UN_FLT;
3066     }
3067     if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
3068         ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
3069         hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
3070         for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
3071         for (i = 0; i < b->n_info; ++i)
3072             ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
3073         b->unpacked |= BCF_UN_INFO;
3074     }
3075     if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
3076         ptr = (uint8_t*)b->indiv.s;
3077         hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
3078         for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
3079         for (i = 0; i < b->n_fmt; ++i)
3080             ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
3081         b->unpacked |= BCF_UN_FMT;
3082     }
3083     return 0;
3084 }
3085 
vcf_format(const bcf_hdr_t * h,const bcf1_t * v,kstring_t * s)3086 int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
3087 {
3088     int i;
3089     int32_t max_dt_id = h->n[BCF_DT_ID];
3090     const char *chrom = bcf_seqname(h, v);
3091     if (!chrom) {
3092         hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
3093                       v->rid);
3094         errno = EINVAL;
3095         return -1;
3096     }
3097     bcf_unpack((bcf1_t*)v, BCF_UN_ALL);
3098     kputs(chrom, s); // CHROM
3099     kputc('\t', s); kputll(v->pos + 1, s); // POS
3100     kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
3101     kputc('\t', s); // REF
3102     if (v->n_allele > 0) kputs(v->d.allele[0], s);
3103     else kputc('.', s);
3104     kputc('\t', s); // ALT
3105     if (v->n_allele > 1) {
3106         for (i = 1; i < v->n_allele; ++i) {
3107             if (i > 1) kputc(',', s);
3108             kputs(v->d.allele[i], s);
3109         }
3110     } else kputc('.', s);
3111     kputc('\t', s); // QUAL
3112     if ( bcf_float_is_missing(v->qual) ) kputc('.', s); // QUAL
3113     else kputd(v->qual, s);
3114     kputc('\t', s); // FILTER
3115     if (v->d.n_flt) {
3116         for (i = 0; i < v->d.n_flt; ++i) {
3117             int32_t idx = v->d.flt[i];
3118             if (idx < 0 || idx >= max_dt_id
3119                 || h->id[BCF_DT_ID][idx].key == NULL) {
3120                 hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
3121                               idx, bcf_seqname_safe(h, v), v->pos + 1);
3122                 errno = EINVAL;
3123                 return -1;
3124             }
3125             if (i) kputc(';', s);
3126             kputs(h->id[BCF_DT_ID][idx].key, s);
3127         }
3128     } else kputc('.', s);
3129     kputc('\t', s); // INFO
3130     if (v->n_info) {
3131         int first = 1;
3132         for (i = 0; i < v->n_info; ++i) {
3133             bcf_info_t *z = &v->d.info[i];
3134             if ( !z->vptr ) continue;
3135             if ( !first ) kputc(';', s);
3136             first = 0;
3137             if (z->key < 0 || z->key >= max_dt_id
3138                 || h->id[BCF_DT_ID][z->key].key == NULL) {
3139                 hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
3140                               z->key,
3141                               z->key < 0 ? "negative"
3142                               : (z->key >= max_dt_id ? "too large" : "not present in the header"),
3143                               bcf_seqname_safe(h, v), v->pos+1);
3144                 errno = EINVAL;
3145                 return -1;
3146             }
3147             kputs(h->id[BCF_DT_ID][z->key].key, s);
3148             if (z->len <= 0) continue;
3149             kputc('=', s);
3150             if (z->len == 1)
3151             {
3152                 switch (z->type)
3153                 {
3154                     case BCF_BT_INT8:  if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
3155                     case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
3156                     case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
3157                     case BCF_BT_INT64: if ( z->v1.i==bcf_int64_missing ) kputc('.', s); else kputll(z->v1.i, s); break;
3158                     case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else kputd(z->v1.f, s); break;
3159                     case BCF_BT_CHAR:  kputc(z->v1.i, s); break;
3160                     default:
3161                         hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
3162                         errno = EINVAL;
3163                         return -1;
3164                 }
3165             }
3166             else bcf_fmt_array(s, z->len, z->type, z->vptr);
3167         }
3168         if ( first ) kputc('.', s);
3169     } else kputc('.', s);
3170     // FORMAT and individual information
3171     if (v->n_sample)
3172     {
3173         int i,j;
3174         if ( v->n_fmt)
3175         {
3176             int gt_i = -1;
3177             bcf_fmt_t *fmt = v->d.fmt;
3178             int first = 1;
3179             for (i = 0; i < (int)v->n_fmt; ++i) {
3180                 if ( !fmt[i].p ) continue;
3181                 kputc(!first ? ':' : '\t', s); first = 0;
3182                 if (fmt[i].id < 0 || fmt[i].id >= max_dt_id
3183                     || h->id[BCF_DT_ID][fmt[i].id].key == NULL) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
3184                 {
3185                     hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h, v), v->pos+1);
3186                     errno = EINVAL;
3187                     return -1;
3188                 }
3189                 kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
3190                 if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
3191             }
3192             if ( first ) kputs("\t.", s);
3193             for (j = 0; j < v->n_sample; ++j) {
3194                 kputc('\t', s);
3195                 first = 1;
3196                 for (i = 0; i < (int)v->n_fmt; ++i) {
3197                     bcf_fmt_t *f = &fmt[i];
3198                     if ( !f->p ) continue;
3199                     if (!first) kputc(':', s);
3200                     first = 0;
3201                     if (gt_i == i)
3202                         bcf_format_gt(f,j,s);
3203                     else
3204                         bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
3205                 }
3206                 if ( first ) kputc('.', s);
3207             }
3208         }
3209         else
3210             for (j=0; j<=v->n_sample; j++)
3211                 kputs("\t.", s);
3212     }
3213     kputc('\n', s);
3214     return 0;
3215 }
3216 
vcf_write_line(htsFile * fp,kstring_t * line)3217 int vcf_write_line(htsFile *fp, kstring_t *line)
3218 {
3219     int ret;
3220     if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
3221     if ( fp->format.compression!=no_compression )
3222         ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
3223     else
3224         ret = hwrite(fp->fp.hfile, line->s, line->l);
3225     return ret==line->l ? 0 : -1;
3226 }
3227 
vcf_write(htsFile * fp,const bcf_hdr_t * h,bcf1_t * v)3228 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
3229 {
3230     int ret;
3231     fp->line.l = 0;
3232     if (vcf_format1(h, v, &fp->line) != 0)
3233         return -1;
3234     if ( fp->format.compression!=no_compression )
3235         ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
3236     else
3237         ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
3238 
3239     if (fp->idx) {
3240         int tid;
3241         if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
3242             return -1;
3243 
3244         if (hts_idx_push(fp->idx, tid, v->pos, v->pos + v->rlen, bgzf_tell(fp->fp.bgzf), 1) < 0)
3245             return -1;
3246     }
3247 
3248     return ret==fp->line.l ? 0 : -1;
3249 }
3250 
3251 /************************
3252  * Data access routines *
3253  ************************/
3254 
bcf_hdr_id2int(const bcf_hdr_t * h,int which,const char * id)3255 int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
3256 {
3257     khint_t k;
3258     vdict_t *d = (vdict_t*)h->dict[which];
3259     k = kh_get(vdict, d, id);
3260     return k == kh_end(d)? -1 : kh_val(d, k).id;
3261 }
3262 
3263 
3264 /********************
3265  *** BCF indexing ***
3266  ********************/
3267 
3268 // Calculate number of index levels given min_shift and the header contig
3269 // list.  Also returns number of contigs in *nids_out.
idx_calc_n_lvls_ids(const bcf_hdr_t * h,int min_shift,int starting_n_lvls,int * nids_out)3270 static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
3271                                int starting_n_lvls, int *nids_out)
3272 {
3273     int n_lvls, i, nids = 0;
3274     int64_t max_len = 0, s;
3275 
3276     for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
3277     {
3278         if ( !h->id[BCF_DT_CTG][i].val ) continue;
3279         if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
3280             max_len = h->id[BCF_DT_CTG][i].val->info[0];
3281         nids++;
3282     }
3283     if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
3284     max_len += 256;
3285     s = 1LL << (min_shift + starting_n_lvls * 3);
3286     for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
3287 
3288     if (nids_out) *nids_out = nids;
3289     return n_lvls;
3290 }
3291 
bcf_index(htsFile * fp,int min_shift)3292 hts_idx_t *bcf_index(htsFile *fp, int min_shift)
3293 {
3294     int n_lvls;
3295     bcf1_t *b = NULL;
3296     hts_idx_t *idx = NULL;
3297     bcf_hdr_t *h;
3298     int r;
3299     h = bcf_hdr_read(fp);
3300     if ( !h ) return NULL;
3301     int nids = 0;
3302     n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
3303     idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
3304     if (!idx) goto fail;
3305     b = bcf_init1();
3306     if (!b) goto fail;
3307     while ((r = bcf_read1(fp,h, b)) >= 0) {
3308         int ret;
3309         ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
3310         if (ret < 0) goto fail;
3311     }
3312     if (r < -1) goto fail;
3313     hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
3314     bcf_destroy1(b);
3315     bcf_hdr_destroy(h);
3316     return idx;
3317 
3318  fail:
3319     hts_idx_destroy(idx);
3320     bcf_destroy1(b);
3321     bcf_hdr_destroy(h);
3322     return NULL;
3323 }
3324 
bcf_index_load2(const char * fn,const char * fnidx)3325 hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
3326 {
3327     return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
3328 }
3329 
bcf_index_load3(const char * fn,const char * fnidx,int flags)3330 hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
3331 {
3332     return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
3333 }
3334 
bcf_index_build3(const char * fn,const char * fnidx,int min_shift,int n_threads)3335 int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
3336 {
3337     htsFile *fp;
3338     hts_idx_t *idx;
3339     tbx_t *tbx;
3340     int ret;
3341     if ((fp = hts_open(fn, "rb")) == 0) return -2;
3342     if (n_threads)
3343         hts_set_threads(fp, n_threads);
3344     if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
3345     switch (fp->format.format) {
3346         case bcf:
3347             if (!min_shift) {
3348                 hts_log_error("TBI indices for BCF files are not supported");
3349                 ret = -1;
3350             } else {
3351                 idx = bcf_index(fp, min_shift);
3352                 if (idx) {
3353                     ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
3354                     if (ret < 0) ret = -4;
3355                     hts_idx_destroy(idx);
3356                 }
3357                 else ret = -1;
3358             }
3359             break;
3360 
3361         case vcf:
3362             tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
3363             if (tbx) {
3364                 ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
3365                 if (ret < 0) ret = -4;
3366                 tbx_destroy(tbx);
3367             }
3368             else ret = -1;
3369             break;
3370 
3371         default:
3372             ret = -3;
3373             break;
3374     }
3375     hts_close(fp);
3376     return ret;
3377 }
3378 
bcf_index_build2(const char * fn,const char * fnidx,int min_shift)3379 int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
3380 {
3381     return bcf_index_build3(fn, fnidx, min_shift, 0);
3382 }
3383 
bcf_index_build(const char * fn,int min_shift)3384 int bcf_index_build(const char *fn, int min_shift)
3385 {
3386     return bcf_index_build3(fn, NULL, min_shift, 0);
3387 }
3388 
3389 // Initialise fp->idx for the current format type.
3390 // This must be called after the header has been written but no other data.
vcf_idx_init(htsFile * fp,bcf_hdr_t * h,int min_shift,const char * fnidx)3391 static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
3392     int n_lvls, fmt;
3393 
3394     if (min_shift == 0) {
3395         min_shift = 14;
3396         n_lvls = 5;
3397         fmt = HTS_FMT_TBI;
3398     } else {
3399         // Set initial n_lvls to match tbx_index()
3400         int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
3401         // Increase if necessary
3402         n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL);
3403         fmt = HTS_FMT_CSI;
3404     }
3405 
3406     fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
3407     if (!fp->idx) return -1;
3408 
3409     // Tabix meta data, added even in CSI for VCF
3410     uint8_t conf[4*7];
3411     u32_to_le(TBX_VCF, conf+0);  // fmt
3412     u32_to_le(1,       conf+4);  // name col
3413     u32_to_le(2,       conf+8);  // beg col
3414     u32_to_le(0,       conf+12); // end col
3415     u32_to_le('#',     conf+16); // comment
3416     u32_to_le(0,       conf+20); // n.skip
3417     u32_to_le(0,       conf+24); // ref name len
3418     if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
3419         hts_idx_destroy(fp->idx);
3420         fp->idx = NULL;
3421         return -1;
3422     }
3423     fp->fnidx = fnidx;
3424 
3425     return 0;
3426 }
3427 
3428 // Initialise fp->idx for the current format type.
3429 // This must be called after the header has been written but no other data.
bcf_idx_init(htsFile * fp,bcf_hdr_t * h,int min_shift,const char * fnidx)3430 int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
3431     int n_lvls, nids = 0;
3432 
3433     if (fp->format.format == vcf)
3434         return vcf_idx_init(fp, h, min_shift, fnidx);
3435 
3436     if (!min_shift)
3437         min_shift = 14;
3438 
3439     n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
3440 
3441     fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
3442     if (!fp->idx) return -1;
3443     fp->fnidx = fnidx;
3444 
3445     return 0;
3446 }
3447 
3448 // Finishes an index. Call after the last record has been written.
3449 // Returns 0 on success, <0 on failure.
3450 //
3451 // NB: same format as SAM/BAM as it uses bgzf.
bcf_idx_save(htsFile * fp)3452 int bcf_idx_save(htsFile *fp) {
3453     return sam_idx_save(fp);
3454 }
3455 
3456 /*****************
3457  *** Utilities ***
3458  *****************/
3459 
bcf_hdr_combine(bcf_hdr_t * dst,const bcf_hdr_t * src)3460 int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
3461 {
3462     int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
3463     for (i=0; i<src->nhrec; i++)
3464     {
3465         if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
3466         {
3467             int j;
3468             for (j=0; j<ndst_ori; j++)
3469             {
3470                 if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
3471 
3472                 // Checking only the key part of generic lines, otherwise
3473                 // the VCFs are too verbose. Should we perhaps add a flag
3474                 // to bcf_hdr_combine() and make this optional?
3475                 if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
3476             }
3477             if ( j>=ndst_ori ) {
3478                 res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3479                 if (res < 0) return -1;
3480                 need_sync += res;
3481             }
3482         }
3483         else if ( src->hrec[i]->type==BCF_HL_STR )
3484         {
3485             // NB: we are ignoring fields without ID
3486             int j = bcf_hrec_find_key(src->hrec[i],"ID");
3487             if ( j>=0 )
3488             {
3489                 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
3490                 if ( !rec ) {
3491                     res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3492                     if (res < 0) return -1;
3493                     need_sync += res;
3494                 }
3495             }
3496         }
3497         else
3498         {
3499             int j = bcf_hrec_find_key(src->hrec[i],"ID");
3500             assert( j>=0 ); // this should always be true for valid VCFs
3501 
3502             bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
3503             if ( !rec ) {
3504                 res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3505                 if (res < 0) return -1;
3506                 need_sync += res;
3507             } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
3508             {
3509                 // Check that both records are of the same type. The bcf_hdr_id2length
3510                 // macro cannot be used here because dst header is not synced yet.
3511                 vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
3512                 vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
3513                 khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
3514                 khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
3515                 if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
3516                 {
3517                     hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
3518                         src->hrec[i]->vals[0]);
3519                     ret |= 1;
3520                 }
3521                 if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
3522                 {
3523                     hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
3524                         src->hrec[i]->vals[0]);
3525                     ret |= 1;
3526                 }
3527             }
3528         }
3529     }
3530     if ( need_sync ) {
3531         if (bcf_hdr_sync(dst) < 0) return -1;
3532     }
3533     return ret;
3534 }
3535 
bcf_hdr_merge(bcf_hdr_t * dst,const bcf_hdr_t * src)3536 bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
3537 {
3538     if ( !dst )
3539     {
3540         // this will effectively strip existing IDX attributes from src to become dst
3541         dst = bcf_hdr_init("r");
3542         kstring_t htxt = {0,0,0};
3543         if (bcf_hdr_format(src, 0, &htxt) < 0) {
3544             free(htxt.s);
3545             return NULL;
3546         }
3547         if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
3548             bcf_hdr_destroy(dst);
3549             dst = NULL;
3550         }
3551         free(htxt.s);
3552         return dst;
3553     }
3554 
3555     int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
3556     for (i=0; i<src->nhrec; i++)
3557     {
3558         if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
3559         {
3560             int j;
3561             for (j=0; j<ndst_ori; j++)
3562             {
3563                 if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
3564 
3565                 // Checking only the key part of generic lines, otherwise
3566                 // the VCFs are too verbose. Should we perhaps add a flag
3567                 // to bcf_hdr_combine() and make this optional?
3568                 if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
3569             }
3570             if ( j>=ndst_ori ) {
3571                 res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3572                 if (res < 0) return NULL;
3573                 need_sync += res;
3574             }
3575         }
3576         else if ( src->hrec[i]->type==BCF_HL_STR )
3577         {
3578             // NB: we are ignoring fields without ID
3579             int j = bcf_hrec_find_key(src->hrec[i],"ID");
3580             if ( j>=0 )
3581             {
3582                 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
3583                 if ( !rec ) {
3584                     res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3585                     if (res < 0) return NULL;
3586                     need_sync += res;
3587                 }
3588             }
3589         }
3590         else
3591         {
3592             int j = bcf_hrec_find_key(src->hrec[i],"ID");
3593             assert( j>=0 ); // this should always be true for valid VCFs
3594 
3595             bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
3596             if ( !rec ) {
3597                 res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3598                 if (res < 0) return NULL;
3599                 need_sync += res;
3600             } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
3601             {
3602                 // Check that both records are of the same type. The bcf_hdr_id2length
3603                 // macro cannot be used here because dst header is not synced yet.
3604                 vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
3605                 vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
3606                 khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
3607                 khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
3608                 if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
3609                 {
3610                     hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
3611                         src->hrec[i]->vals[0]);
3612                     ret |= 1;
3613                 }
3614                 if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
3615                 {
3616                     hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
3617                         src->hrec[i]->vals[0]);
3618                     ret |= 1;
3619                 }
3620             }
3621         }
3622     }
3623     if ( need_sync ) {
3624         if (bcf_hdr_sync(dst) < 0) return NULL;
3625     }
3626     return dst;
3627 }
3628 
bcf_translate(const bcf_hdr_t * dst_hdr,bcf_hdr_t * src_hdr,bcf1_t * line)3629 int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
3630 {
3631     int i;
3632     if ( line->errcode )
3633     {
3634         hts_log_error("Unchecked error (%d) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_seqname_safe(src_hdr,line), line->pos+1);
3635         exit(1);
3636     }
3637     if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
3638     if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
3639     {
3640         int dict;
3641         for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
3642         {
3643             src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
3644             for (i=0; i<src_hdr->n[dict]; i++)
3645             {
3646                 if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
3647                 {
3648                     src_hdr->transl[dict][i] = -1;
3649                     continue;
3650                 }
3651                 src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
3652                 if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
3653             }
3654         }
3655         if ( !src_hdr->ntransl )
3656         {
3657             free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
3658             free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
3659             src_hdr->ntransl = -1;
3660         }
3661         if ( src_hdr->ntransl==-1 ) return 0;
3662     }
3663     bcf_unpack(line,BCF_UN_ALL);
3664 
3665     // CHROM
3666     if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
3667 
3668     // FILTER
3669     for (i=0; i<line->d.n_flt; i++)
3670     {
3671         int src_id = line->d.flt[i];
3672         if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
3673             line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
3674         line->d.shared_dirty |= BCF1_DIRTY_FLT;
3675     }
3676 
3677     // INFO
3678     for (i=0; i<line->n_info; i++)
3679     {
3680         int src_id = line->d.info[i].key;
3681         int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
3682         if ( dst_id<0 ) continue;
3683         line->d.info[i].key = dst_id;
3684         if ( !line->d.info[i].vptr ) continue;  // skip deleted
3685         int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
3686         int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
3687         if ( src_size==dst_size )   // can overwrite
3688         {
3689             uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
3690             if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
3691             else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
3692             else { *(uint32_t*)vptr = (uint32_t)dst_id; }
3693         }
3694         else    // must realloc
3695         {
3696             bcf_info_t *info = &line->d.info[i];
3697             kstring_t str = {0,0,0};
3698             bcf_enc_int1(&str, dst_id);
3699             bcf_enc_size(&str, info->len,info->type);
3700             uint32_t vptr_off = str.l;
3701             kputsn((char*)info->vptr, info->vptr_len, &str);
3702             if( info->vptr_free ) free(info->vptr - info->vptr_off);
3703             info->vptr_off = vptr_off;
3704             info->vptr = (uint8_t*)str.s + info->vptr_off;
3705             info->vptr_free = 1;
3706             line->d.shared_dirty |= BCF1_DIRTY_INF;
3707         }
3708     }
3709 
3710     // FORMAT
3711     for (i=0; i<line->n_fmt; i++)
3712     {
3713         int src_id = line->d.fmt[i].id;
3714         int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
3715         if ( dst_id<0 ) continue;
3716         line->d.fmt[i].id = dst_id;
3717         if( !line->d.fmt[i].p ) continue;  // skip deleted
3718         int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
3719         int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
3720         if ( src_size==dst_size )   // can overwrite
3721         {
3722             uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
3723             if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
3724             else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
3725             else { i32_to_le(dst_id, p + 1); }
3726         }
3727         else    // must realloc
3728         {
3729             bcf_fmt_t *fmt = &line->d.fmt[i];
3730             kstring_t str = {0,0,0};
3731             bcf_enc_int1(&str, dst_id);
3732             bcf_enc_size(&str, fmt->n, fmt->type);
3733             uint32_t p_off = str.l;
3734             kputsn((char*)fmt->p, fmt->p_len, &str);
3735             if( fmt->p_free ) free(fmt->p - fmt->p_off);
3736             fmt->p_off = p_off;
3737             fmt->p = (uint8_t*)str.s + fmt->p_off;
3738             fmt->p_free = 1;
3739             line->d.indiv_dirty = 1;
3740         }
3741     }
3742     return 0;
3743 }
3744 
bcf_hdr_dup(const bcf_hdr_t * hdr)3745 bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
3746 {
3747     bcf_hdr_t *hout = bcf_hdr_init("r");
3748     if (!hout) {
3749         hts_log_error("Failed to allocate bcf header");
3750         return NULL;
3751     }
3752     kstring_t htxt = {0,0,0};
3753     if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
3754         free(htxt.s);
3755         return NULL;
3756     }
3757     if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
3758         bcf_hdr_destroy(hout);
3759         hout = NULL;
3760     }
3761     free(htxt.s);
3762     return hout;
3763 }
3764 
bcf_hdr_subset(const bcf_hdr_t * h0,int n,char * const * samples,int * imap)3765 bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
3766 {
3767     void *names_hash = khash_str2int_init();
3768     kstring_t htxt = {0,0,0};
3769     kstring_t str = {0,0,0};
3770     bcf_hdr_t *h = bcf_hdr_init("w");
3771     int r = 0;
3772     if (!h || !names_hash) {
3773         hts_log_error("Failed to allocate bcf header");
3774         goto err;
3775     }
3776     if (bcf_hdr_format(h0, 1, &htxt) < 0) {
3777         hts_log_error("Failed to get header text");
3778         goto err;
3779     }
3780     bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
3781     int j;
3782     for (j=0; j<n; j++) imap[j] = -1;
3783     if ( bcf_hdr_nsamples(h0) > 0) {
3784         char *p = find_chrom_header_line(htxt.s);
3785         int i = 0, end = n? 8 : 7;
3786         while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
3787         if (i != end) {
3788             hts_log_error("Wrong number of columns in header #CHROM line");
3789             goto err;
3790         }
3791         r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
3792         for (i = 0; i < n; ++i) {
3793             if ( khash_str2int_has_key(names_hash,samples[i]) )
3794             {
3795                 hts_log_error("Duplicate sample name \"%s\"", samples[i]);
3796                 goto err;
3797             }
3798             imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
3799             if (imap[i] < 0) continue;
3800             r |= kputc('\t', &str) < 0;
3801             r |= kputs(samples[i], &str) < 0;
3802             r |= khash_str2int_inc(names_hash,samples[i]) < 0;
3803         }
3804     } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
3805     while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
3806     r |= kputc('\n',&str) < 0;
3807     if (r) {
3808         hts_log_error("%s", strerror(errno));
3809         goto err;
3810     }
3811     if ( bcf_hdr_parse(h, str.s) < 0 ) {
3812         bcf_hdr_destroy(h);
3813         h = NULL;
3814     }
3815     free(str.s);
3816     free(htxt.s);
3817     khash_str2int_destroy(names_hash);
3818     return h;
3819 
3820  err:
3821     ks_free(&str);
3822     ks_free(&htxt);
3823     khash_str2int_destroy(names_hash);
3824     bcf_hdr_destroy(h);
3825     return NULL;
3826 }
3827 
bcf_hdr_set_samples(bcf_hdr_t * hdr,const char * samples,int is_file)3828 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
3829 {
3830     if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
3831 
3832     int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
3833     hdr->keep_samples = (uint8_t*) calloc(narr,1);
3834     if (!hdr->keep_samples) return -1;
3835 
3836     hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
3837     if ( !samples )
3838     {
3839         // exclude all samples
3840         khint_t k;
3841         vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
3842         new_dict = kh_init(vdict);
3843         if (!new_dict) return -1;
3844 
3845         bcf_hdr_nsamples(hdr) = 0;
3846 
3847         for (k = kh_begin(d); k != kh_end(d); ++k)
3848             if (kh_exist(d, k)) free((char*)kh_key(d, k));
3849         kh_destroy(vdict, d);
3850         hdr->dict[BCF_DT_SAMPLE] = new_dict;
3851         if (bcf_hdr_sync(hdr) < 0) return -1;
3852 
3853         return 0;
3854     }
3855 
3856     if ( samples[0]=='^' )
3857         for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
3858 
3859     int idx, n, ret = 0;
3860     char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
3861     if ( !smpls ) return -1;
3862     for (i=0; i<n; i++)
3863     {
3864         idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
3865         if ( idx<0 )
3866         {
3867             if ( !ret ) ret = i+1;
3868             continue;
3869         }
3870         assert( idx<bcf_hdr_nsamples(hdr) );
3871         if (  samples[0]=='^' )
3872             bit_array_clear(hdr->keep_samples, idx);
3873         else
3874             bit_array_set(hdr->keep_samples, idx);
3875     }
3876     for (i=0; i<n; i++) free(smpls[i]);
3877     free(smpls);
3878 
3879     bcf_hdr_nsamples(hdr) = 0;
3880     for (i=0; i<hdr->nsamples_ori; i++)
3881         if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
3882 
3883     if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
3884     else
3885     {
3886         // Make new list and dictionary with desired samples
3887         char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
3888         vdict_t *new_dict, *d;
3889         int k, res;
3890         if (!samples) return -1;
3891 
3892         new_dict = kh_init(vdict);
3893         if (!new_dict) {
3894             free(samples);
3895             return -1;
3896         }
3897         idx = 0;
3898         for (i=0; i<hdr->nsamples_ori; i++) {
3899             if ( bit_array_test(hdr->keep_samples,i) ) {
3900                 samples[idx] = hdr->samples[i];
3901                 k = kh_put(vdict, new_dict, hdr->samples[i], &res);
3902                 if (res < 0) {
3903                     free(samples);
3904                     kh_destroy(vdict, new_dict);
3905                     return -1;
3906                 }
3907                 kh_val(new_dict, k) = bcf_idinfo_def;
3908                 kh_val(new_dict, k).id = idx;
3909                 idx++;
3910             }
3911         }
3912 
3913         // Delete desired samples from old dictionary, so we don't free them
3914         d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
3915         for (i=0; i < idx; i++) {
3916             int k = kh_get(vdict, d, samples[i]);
3917             if (k < kh_end(d)) kh_del(vdict, d, k);
3918         }
3919 
3920         // Free everything else
3921         for (k = kh_begin(d); k != kh_end(d); ++k)
3922             if (kh_exist(d, k)) free((char*)kh_key(d, k));
3923         kh_destroy(vdict, d);
3924         hdr->dict[BCF_DT_SAMPLE] = new_dict;
3925 
3926         free(hdr->samples);
3927         hdr->samples = samples;
3928 
3929         if (bcf_hdr_sync(hdr) < 0)
3930             return -1;
3931     }
3932 
3933     return ret;
3934 }
3935 
bcf_subset(const bcf_hdr_t * h,bcf1_t * v,int n,int * imap)3936 int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
3937 {
3938     kstring_t ind;
3939     ind.s = 0; ind.l = ind.m = 0;
3940     if (n) {
3941         bcf_fmt_t fmt[MAX_N_FMT];
3942         int i, j;
3943         uint8_t *ptr = (uint8_t*)v->indiv.s;
3944         for (i = 0; i < v->n_fmt; ++i)
3945             ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
3946         for (i = 0; i < (int)v->n_fmt; ++i) {
3947             bcf_fmt_t *f = &fmt[i];
3948             bcf_enc_int1(&ind, f->id);
3949             bcf_enc_size(&ind, f->n, f->type);
3950             for (j = 0; j < n; ++j)
3951                 if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
3952         }
3953         for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
3954         v->n_sample = i;
3955     } else v->n_sample = 0;
3956     if ( !v->n_sample ) v->n_fmt = 0;
3957     free(v->indiv.s);
3958     v->indiv = ind;
3959     v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
3960     return 0;
3961 }
3962 
bcf_is_snp(bcf1_t * v)3963 int bcf_is_snp(bcf1_t *v)
3964 {
3965     int i;
3966     bcf_unpack(v, BCF_UN_STR);
3967     for (i = 0; i < v->n_allele; ++i)
3968     {
3969         if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
3970 
3971         // mpileup's <X> allele, see also below. This is not completely satisfactory,
3972         // a general library is here narrowly tailored to fit samtools.
3973         if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
3974         if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
3975 
3976         break;
3977     }
3978     return i == v->n_allele;
3979 }
3980 
bcf_set_variant_type(const char * ref,const char * alt,bcf_variant_t * var)3981 static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
3982 {
3983     if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; }  // overlapping variant
3984 
3985     // The most frequent case
3986     if ( !ref[1] && !alt[1] )
3987     {
3988         if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
3989         if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
3990         var->n = 1; var->type = VCF_SNP; return;
3991     }
3992     if ( alt[0]=='<' )
3993     {
3994         if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
3995         if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
3996         if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
3997         var->type = VCF_OTHER;
3998         return;
3999     }
4000 
4001     const char *r = ref, *a = alt;
4002     while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed
4003 
4004     if ( *a && !*r )
4005     {
4006         if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; }
4007         while ( *a ) a++;
4008         var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
4009     }
4010     else if ( *r && !*a )
4011     {
4012         while ( *r ) r++;
4013         var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
4014     }
4015     else if ( !*r && !*a )
4016     {
4017         var->n = 0; var->type = VCF_REF; return;
4018     }
4019 
4020     const char *re = r, *ae = a;
4021     while ( re[1] ) re++;
4022     while ( ae[1] ) ae++;
4023     while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
4024     if ( ae==a )
4025     {
4026         if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
4027         var->n = -(re-r);
4028         if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; }
4029         var->type = VCF_OTHER; return;
4030     }
4031     else if ( re==r )
4032     {
4033         var->n = ae-a;
4034         if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; }
4035         var->type = VCF_OTHER; return;
4036     }
4037 
4038     var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
4039     var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
4040 
4041     // should do also complex events, SVs, etc...
4042 }
4043 
bcf_set_variant_types(bcf1_t * b)4044 static int bcf_set_variant_types(bcf1_t *b)
4045 {
4046     if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
4047     bcf_dec_t *d = &b->d;
4048     if ( d->n_var < b->n_allele )
4049     {
4050         d->var = (bcf_variant_t *) realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
4051         d->n_var = b->n_allele;
4052     }
4053     int i;
4054     b->d.var_type = 0;
4055     d->var[0].type = VCF_REF;
4056     d->var[0].n    = 0;
4057     for (i=1; i<b->n_allele; i++)
4058     {
4059         bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
4060         b->d.var_type |= d->var[i].type;
4061         //fprintf(stderr,"[set_variant_type] %d   %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
4062     }
4063     return 0;
4064 }
4065 
bcf_get_variant_types(bcf1_t * rec)4066 int bcf_get_variant_types(bcf1_t *rec)
4067 {
4068     if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
4069     return rec->d.var_type;
4070 }
bcf_get_variant_type(bcf1_t * rec,int ith_allele)4071 int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
4072 {
4073     if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
4074     return rec->d.var[ith_allele].type;
4075 }
4076 
bcf_update_info(const bcf_hdr_t * hdr,bcf1_t * line,const char * key,const void * values,int n,int type)4077 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
4078 {
4079     static int negative_rlen_warned = 0;
4080     int is_end_tag;
4081 
4082     // Is the field already present?
4083     int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
4084     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
4085     if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
4086 
4087     is_end_tag = strcmp(key, "END") == 0;
4088 
4089     for (i=0; i<line->n_info; i++)
4090         if ( inf_id==line->d.info[i].key ) break;
4091     bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
4092 
4093     if ( !n || (type==BCF_HT_STR && !values) )
4094     {
4095         if ( n==0 && is_end_tag )
4096             line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
4097         if ( inf )
4098         {
4099             // Mark the tag for removal, free existing memory if necessary
4100             if ( inf->vptr_free )
4101             {
4102                 free(inf->vptr - inf->vptr_off);
4103                 inf->vptr_free = 0;
4104             }
4105             line->d.shared_dirty |= BCF1_DIRTY_INF;
4106             inf->vptr = NULL;
4107             inf->vptr_off = inf->vptr_len = 0;
4108         }
4109         return 0;
4110     }
4111 
4112     if (is_end_tag)
4113     {
4114         if (n != 1)
4115         {
4116             hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
4117             line->errcode |= BCF_ERR_TAG_INVALID;
4118             return -1;
4119         }
4120         if (type != BCF_HT_INT && type != BCF_HT_LONG)
4121         {
4122             hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
4123             line->errcode |= BCF_ERR_TAG_INVALID;
4124             return -1;
4125         }
4126     }
4127 
4128     // Encode the values and determine the size required to accommodate the values
4129     kstring_t str = {0,0,0};
4130     bcf_enc_int1(&str, inf_id);
4131     if ( type==BCF_HT_INT )
4132         bcf_enc_vint(&str, n, (int32_t*)values, -1);
4133     else if ( type==BCF_HT_REAL )
4134         bcf_enc_vfloat(&str, n, (float*)values);
4135     else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
4136     {
4137         if ( values==NULL )
4138             bcf_enc_size(&str, 0, BCF_BT_NULL);
4139         else
4140             bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
4141     }
4142 #ifdef VCF_ALLOW_INT64
4143     else if ( type==BCF_HT_LONG )
4144     {
4145         if (n != 1) {
4146             hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
4147             abort();
4148         }
4149         bcf_enc_long1(&str, *(int64_t *) values);
4150     }
4151 #endif
4152     else
4153     {
4154         hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
4155         abort();
4156     }
4157 
4158     // Is the INFO tag already present
4159     if ( inf )
4160     {
4161         // Is it big enough to accommodate new block?
4162         if ( str.l <= inf->vptr_len + inf->vptr_off )
4163         {
4164             if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
4165             uint8_t *ptr = inf->vptr - inf->vptr_off;
4166             memcpy(ptr, str.s, str.l);
4167             free(str.s);
4168             int vptr_free = inf->vptr_free;
4169             bcf_unpack_info_core1(ptr, inf);
4170             inf->vptr_free = vptr_free;
4171         }
4172         else
4173         {
4174             if ( inf->vptr_free )
4175                 free(inf->vptr - inf->vptr_off);
4176             bcf_unpack_info_core1((uint8_t*)str.s, inf);
4177             inf->vptr_free = 1;
4178             line->d.shared_dirty |= BCF1_DIRTY_INF;
4179         }
4180     }
4181     else
4182     {
4183         // The tag is not present, create new one
4184         line->n_info++;
4185         hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
4186         inf = &line->d.info[line->n_info-1];
4187         bcf_unpack_info_core1((uint8_t*)str.s, inf);
4188         inf->vptr_free = 1;
4189         line->d.shared_dirty |= BCF1_DIRTY_INF;
4190     }
4191     line->unpacked |= BCF_UN_INFO;
4192 
4193    if ( n==1 && is_end_tag) {
4194         hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
4195         if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
4196         {
4197             if ( end <= line->pos )
4198             {
4199                 if ( !negative_rlen_warned )
4200                 {
4201                     hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
4202                     negative_rlen_warned = 1;
4203                 }
4204                 line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
4205             }
4206             else
4207                 line->rlen = end - line->pos;
4208         }
4209     }
4210     return 0;
4211 }
4212 
bcf_update_format_string(const bcf_hdr_t * hdr,bcf1_t * line,const char * key,const char ** values,int n)4213 int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
4214 {
4215     if ( !n )
4216         return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
4217 
4218     int i, max_len = 0;
4219     for (i=0; i<n; i++)
4220     {
4221         int len = strlen(values[i]);
4222         if ( len > max_len ) max_len = len;
4223     }
4224     char *out = (char*) malloc(max_len*n);
4225     if ( !out ) return -2;
4226     for (i=0; i<n; i++)
4227     {
4228         char *dst = out+i*max_len;
4229         const char *src = values[i];
4230         int j = 0;
4231         while ( src[j] ) { dst[j] = src[j]; j++; }
4232         for (; j<max_len; j++) dst[j] = 0;
4233     }
4234     int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
4235     free(out);
4236     return ret;
4237 }
4238 
bcf_update_format(const bcf_hdr_t * hdr,bcf1_t * line,const char * key,const void * values,int n,int type)4239 int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
4240 {
4241     // Is the field already present?
4242     int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
4243     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
4244     {
4245         if ( !n ) return 0;
4246         return -1;  // the key not present in the header
4247     }
4248 
4249     if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
4250 
4251     for (i=0; i<line->n_fmt; i++)
4252         if ( line->d.fmt[i].id==fmt_id ) break;
4253     bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
4254 
4255     if ( !n )
4256     {
4257         if ( fmt )
4258         {
4259             // Mark the tag for removal, free existing memory if necessary
4260             if ( fmt->p_free )
4261             {
4262                 free(fmt->p - fmt->p_off);
4263                 fmt->p_free = 0;
4264             }
4265             line->d.indiv_dirty = 1;
4266             fmt->p = NULL;
4267         }
4268         return 0;
4269     }
4270 
4271     line->n_sample = bcf_hdr_nsamples(hdr);
4272     int nps = n / line->n_sample;  // number of values per sample
4273     assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample
4274 
4275     // Encode the values and determine the size required to accommodate the values
4276     kstring_t str = {0,0,0};
4277     bcf_enc_int1(&str, fmt_id);
4278     assert(values != NULL);
4279     if ( type==BCF_HT_INT )
4280         bcf_enc_vint(&str, n, (int32_t*)values, nps);
4281     else if ( type==BCF_HT_REAL )
4282     {
4283         bcf_enc_size(&str, nps, BCF_BT_FLOAT);
4284         serialize_float_array(&str, nps*line->n_sample, (float *) values);
4285     }
4286     else if ( type==BCF_HT_STR )
4287     {
4288         bcf_enc_size(&str, nps, BCF_BT_CHAR);
4289         kputsn((char*)values, nps*line->n_sample, &str);
4290     }
4291     else
4292     {
4293         hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
4294         abort();
4295     }
4296 
4297     if ( !fmt )
4298     {
4299         // Not present, new format field
4300         line->n_fmt++;
4301         hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
4302 
4303         // Special case: VCF specification requires that GT is always first
4304         if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
4305         {
4306             for (i=line->n_fmt-1; i>0; i--)
4307                 line->d.fmt[i] = line->d.fmt[i-1];
4308             fmt = &line->d.fmt[0];
4309         }
4310         else
4311             fmt = &line->d.fmt[line->n_fmt-1];
4312         bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
4313         line->d.indiv_dirty = 1;
4314         fmt->p_free = 1;
4315     }
4316     else
4317     {
4318         // The tag is already present, check if it is big enough to accommodate the new block
4319         if ( str.l <= fmt->p_len + fmt->p_off )
4320         {
4321             // good, the block is big enough
4322             if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
4323             uint8_t *ptr = fmt->p - fmt->p_off;
4324             memcpy(ptr, str.s, str.l);
4325             free(str.s);
4326             int p_free = fmt->p_free;
4327             bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
4328             fmt->p_free = p_free;
4329         }
4330         else
4331         {
4332             if ( fmt->p_free )
4333                 free(fmt->p - fmt->p_off);
4334             bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
4335             fmt->p_free = 1;
4336             line->d.indiv_dirty = 1;
4337         }
4338     }
4339     line->unpacked |= BCF_UN_FMT;
4340     return 0;
4341 }
4342 
4343 
bcf_update_filter(const bcf_hdr_t * hdr,bcf1_t * line,int * flt_ids,int n)4344 int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
4345 {
4346     if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
4347     line->d.shared_dirty |= BCF1_DIRTY_FLT;
4348     line->d.n_flt = n;
4349     if ( !n ) return 0;
4350     hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
4351     int i;
4352     for (i=0; i<n; i++)
4353         line->d.flt[i] = flt_ids[i];
4354     return 0;
4355 }
4356 
bcf_add_filter(const bcf_hdr_t * hdr,bcf1_t * line,int flt_id)4357 int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
4358 {
4359     if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
4360     int i;
4361     for (i=0; i<line->d.n_flt; i++)
4362         if ( flt_id==line->d.flt[i] ) break;
4363     if ( i<line->d.n_flt ) return 0;    // this filter is already set
4364     line->d.shared_dirty |= BCF1_DIRTY_FLT;
4365     if ( flt_id==0 )    // set to PASS
4366         line->d.n_flt = 1;
4367     else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
4368         line->d.n_flt = 1;
4369     else
4370         line->d.n_flt++;
4371     hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
4372     line->d.flt[line->d.n_flt-1] = flt_id;
4373     return 1;
4374 }
bcf_remove_filter(const bcf_hdr_t * hdr,bcf1_t * line,int flt_id,int pass)4375 int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
4376 {
4377     if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
4378     int i;
4379     for (i=0; i<line->d.n_flt; i++)
4380         if ( flt_id==line->d.flt[i] ) break;
4381     if ( i==line->d.n_flt ) return 0;   // the filter is not present
4382     line->d.shared_dirty |= BCF1_DIRTY_FLT;
4383     if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
4384     line->d.n_flt--;
4385     if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
4386     return 0;
4387 }
4388 
bcf_has_filter(const bcf_hdr_t * hdr,bcf1_t * line,char * filter)4389 int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
4390 {
4391     if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
4392     int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
4393     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header
4394 
4395     if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
4396     if ( id==0 && !line->d.n_flt) return 1; // PASS
4397 
4398     int i;
4399     for (i=0; i<line->d.n_flt; i++)
4400         if ( line->d.flt[i]==id ) return 1;
4401     return 0;
4402 }
4403 
_bcf1_sync_alleles(const bcf_hdr_t * hdr,bcf1_t * line,int nals)4404 static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
4405 {
4406     line->d.shared_dirty |= BCF1_DIRTY_ALS;
4407 
4408     line->n_allele = nals;
4409     hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
4410 
4411     char *als = line->d.als;
4412     int n = 0;
4413     while (n<nals)
4414     {
4415         line->d.allele[n] = als;
4416         while ( *als ) als++;
4417         als++;
4418         n++;
4419     }
4420 
4421     // Update REF length. Note that END is 1-based while line->pos 0-based
4422     bcf_info_t *end_info = bcf_get_info(hdr,line,"END");
4423     if ( end_info )
4424     {
4425         if ( end_info->type==BCF_HT_INT && end_info->v1.i==bcf_int32_missing ) end_info = NULL;
4426         else if ( end_info->type==BCF_HT_LONG && end_info->v1.i==bcf_int64_missing ) end_info = NULL;
4427     }
4428     if ( end_info && end_info->v1.i > line->pos )
4429         line->rlen = end_info->v1.i - line->pos;
4430     else if ( nals > 0 )
4431         line->rlen = strlen(line->d.allele[0]);
4432     else
4433         line->rlen = 0;
4434 
4435     return 0;
4436 }
bcf_update_alleles(const bcf_hdr_t * hdr,bcf1_t * line,const char ** alleles,int nals)4437 int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
4438 {
4439     if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
4440     kstring_t tmp = {0,0,0};
4441     char *free_old = NULL;
4442 
4443     // If the supplied alleles are not pointers to line->d.als, the existing block can be reused.
4444     int i;
4445     for (i=0; i<nals; i++)
4446         if ( alleles[i]>=line->d.als && alleles[i]<line->d.als+line->d.m_als ) break;
4447     if ( i==nals )
4448     {
4449         // all alleles point elsewhere, reuse the existing block
4450         tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
4451     }
4452     else
4453         free_old = line->d.als;
4454 
4455     for (i=0; i<nals; i++)
4456     {
4457         kputs(alleles[i], &tmp);
4458         kputc(0, &tmp);
4459     }
4460     line->d.als = tmp.s; line->d.m_als = tmp.m;
4461     free(free_old);
4462     return _bcf1_sync_alleles(hdr,line,nals);
4463 }
4464 
bcf_update_alleles_str(const bcf_hdr_t * hdr,bcf1_t * line,const char * alleles_string)4465 int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
4466 {
4467     if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
4468     kstring_t tmp;
4469     tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
4470     kputs(alleles_string, &tmp);
4471     line->d.als = tmp.s; line->d.m_als = tmp.m;
4472 
4473     int nals = 1;
4474     char *t = line->d.als;
4475     while (*t)
4476     {
4477         if ( *t==',' ) { *t = 0; nals++; }
4478         t++;
4479     }
4480     return _bcf1_sync_alleles(hdr, line, nals);
4481 }
4482 
bcf_update_id(const bcf_hdr_t * hdr,bcf1_t * line,const char * id)4483 int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
4484 {
4485     if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
4486     kstring_t tmp;
4487     tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
4488     if ( id )
4489         kputs(id, &tmp);
4490     else
4491         kputs(".", &tmp);
4492     line->d.id = tmp.s; line->d.m_id = tmp.m;
4493     line->d.shared_dirty |= BCF1_DIRTY_ID;
4494     return 0;
4495 }
4496 
bcf_add_id(const bcf_hdr_t * hdr,bcf1_t * line,const char * id)4497 int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
4498 {
4499     if ( !id ) return 0;
4500     if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
4501 
4502     kstring_t tmp;
4503     tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
4504 
4505     int len = strlen(id);
4506     char *dst = line->d.id;
4507     while ( *dst && (dst=strstr(dst,id)) )
4508     {
4509         if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
4510         else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
4511         dst++;  // a suffix, not a match
4512     }
4513     if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
4514     {
4515         tmp.l = strlen(line->d.id);
4516         kputc(';',&tmp);
4517     }
4518     kputs(id,&tmp);
4519 
4520     line->d.id = tmp.s; line->d.m_id = tmp.m;
4521     line->d.shared_dirty |= BCF1_DIRTY_ID;
4522     return 0;
4523 
4524 }
4525 
bcf_get_fmt(const bcf_hdr_t * hdr,bcf1_t * line,const char * key)4526 bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
4527 {
4528     int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
4529     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL;   // no such FMT field in the header
4530     return bcf_get_fmt_id(line, id);
4531 }
4532 
bcf_get_info(const bcf_hdr_t * hdr,bcf1_t * line,const char * key)4533 bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
4534 {
4535     int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
4536     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL;   // no such INFO field in the header
4537     return bcf_get_info_id(line, id);
4538 }
4539 
bcf_get_fmt_id(bcf1_t * line,const int id)4540 bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
4541 {
4542     int i;
4543     if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
4544     for (i=0; i<line->n_fmt; i++)
4545     {
4546         if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
4547     }
4548     return NULL;
4549 }
4550 
bcf_get_info_id(bcf1_t * line,const int id)4551 bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
4552 {
4553     int i;
4554     if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
4555     for (i=0; i<line->n_info; i++)
4556     {
4557         if ( line->d.info[i].key==id ) return &line->d.info[i];
4558     }
4559     return NULL;
4560 }
4561 
4562 
bcf_get_info_values(const bcf_hdr_t * hdr,bcf1_t * line,const char * tag,void ** dst,int * ndst,int type)4563 int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
4564 {
4565     int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
4566     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
4567     if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type
4568 
4569     if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
4570 
4571     for (i=0; i<line->n_info; i++)
4572         if ( line->d.info[i].key==tag_id ) break;
4573     if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
4574     if ( type==BCF_HT_FLAG ) return 1;
4575 
4576     bcf_info_t *info = &line->d.info[i];
4577     if ( !info->vptr ) return -3;           // the tag was marked for removal
4578     if ( type==BCF_HT_STR )
4579     {
4580         if ( *ndst < info->len+1 )
4581         {
4582             *ndst = info->len + 1;
4583             *dst  = realloc(*dst, *ndst);
4584         }
4585         memcpy(*dst,info->vptr,info->len);
4586         ((uint8_t*)*dst)[info->len] = 0;
4587         return info->len;
4588     }
4589 
4590     // Make sure the buffer is big enough
4591     int size1;
4592     switch (type) {
4593         case BCF_HT_INT:  size1 = sizeof(int32_t); break;
4594         case BCF_HT_LONG: size1 = sizeof(int64_t); break;
4595         case BCF_HT_REAL: size1 = sizeof(float); break;
4596         default:
4597             hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
4598             return -2;
4599     }
4600     if ( *ndst < info->len )
4601     {
4602         *ndst = info->len;
4603         *dst  = realloc(*dst, *ndst * size1);
4604     }
4605 
4606     #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
4607         out_type_t *tmp = (out_type_t *) *dst; \
4608         int j; \
4609         for (j=0; j<info->len; j++) \
4610         { \
4611             type_t p = convert(info->vptr + j * sizeof(type_t)); \
4612             if ( is_vector_end ) break; \
4613             if ( is_missing ) set_missing; \
4614             else set_regular; \
4615             tmp++; \
4616         } \
4617         ret = j; \
4618     } while (0)
4619     switch (info->type) {
4620         case BCF_BT_INT8:
4621             if (type == BCF_HT_LONG) {
4622                 BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
4623             } else {
4624                 BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
4625             }
4626             break;
4627         case BCF_BT_INT16:
4628             if (type == BCF_HT_LONG) {
4629                 BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
4630             } else {
4631                 BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
4632             }
4633             break;
4634         case BCF_BT_INT32:
4635             if (type == BCF_HT_LONG) {
4636                 BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
4637             } else {
4638                 BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
4639             }
4640         case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
4641         default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
4642     }
4643     #undef BRANCH
4644     return ret;  // set by BRANCH
4645 }
4646 
bcf_get_format_string(const bcf_hdr_t * hdr,bcf1_t * line,const char * tag,char *** dst,int * ndst)4647 int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
4648 {
4649     int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
4650     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
4651     if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type
4652 
4653     if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
4654 
4655     for (i=0; i<line->n_fmt; i++)
4656         if ( line->d.fmt[i].id==tag_id ) break;
4657     if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
4658     bcf_fmt_t *fmt = &line->d.fmt[i];
4659     if ( !fmt->p ) return -3;                                      // the tag was marked for removal
4660 
4661     int nsmpl = bcf_hdr_nsamples(hdr);
4662     if ( !*dst )
4663     {
4664         *dst = (char**) malloc(sizeof(char*)*nsmpl);
4665         if ( !*dst ) return -4;     // could not alloc
4666         (*dst)[0] = NULL;
4667     }
4668     int n = (fmt->n+1)*nsmpl;
4669     if ( *ndst < n )
4670     {
4671         (*dst)[0] = realloc((*dst)[0], n);
4672         if ( !(*dst)[0] ) return -4;    // could not alloc
4673         *ndst = n;
4674     }
4675     for (i=0; i<nsmpl; i++)
4676     {
4677         uint8_t *src = fmt->p + i*fmt->n;
4678         uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
4679         memcpy(tmp,src,fmt->n);
4680         tmp[fmt->n] = 0;
4681         (*dst)[i] = (char*) tmp;
4682     }
4683     return n;
4684 }
4685 
bcf_get_format_values(const bcf_hdr_t * hdr,bcf1_t * line,const char * tag,void ** dst,int * ndst,int type)4686 int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
4687 {
4688     int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
4689     if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
4690     if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
4691     {
4692         // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
4693         if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
4694     }
4695     else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type
4696 
4697     if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
4698 
4699     for (i=0; i<line->n_fmt; i++)
4700         if ( line->d.fmt[i].id==tag_id ) break;
4701     if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
4702     bcf_fmt_t *fmt = &line->d.fmt[i];
4703     if ( !fmt->p ) return -3;                                      // the tag was marked for removal
4704 
4705     if ( type==BCF_HT_STR )
4706     {
4707         int n = fmt->n*bcf_hdr_nsamples(hdr);
4708         if ( *ndst < n )
4709         {
4710             *dst  = realloc(*dst, n);
4711             if ( !*dst ) return -4;     // could not alloc
4712             *ndst = n;
4713         }
4714         memcpy(*dst,fmt->p,n);
4715         return n;
4716     }
4717 
4718     // Make sure the buffer is big enough
4719     int nsmpl = bcf_hdr_nsamples(hdr);
4720     int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
4721     if ( *ndst < fmt->n*nsmpl )
4722     {
4723         *ndst = fmt->n*nsmpl;
4724         *dst  = realloc(*dst, *ndst*size1);
4725         if ( !*dst ) return -4;     // could not alloc
4726     }
4727 
4728     #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
4729         out_type_t *tmp = (out_type_t *) *dst; \
4730         uint8_t *fmt_p = fmt->p; \
4731         for (i=0; i<nsmpl; i++) \
4732         { \
4733             for (j=0; j<fmt->n; j++) \
4734             { \
4735                 type_t p = convert(fmt_p + j * sizeof(type_t)); \
4736                 if ( is_missing ) set_missing; \
4737                 else if ( is_vector_end ) { set_vector_end; break; } \
4738                 else set_regular; \
4739                 tmp++; \
4740             } \
4741             for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
4742             fmt_p += fmt->size; \
4743         } \
4744     }
4745     switch (fmt->type) {
4746         case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
4747         case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
4748         case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
4749         case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
4750         default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
4751     }
4752     #undef BRANCH
4753     return nsmpl*fmt->n;
4754 }
4755