1 /* vcf.c -- VCF/BCF API functions.
2
3 Copyright (C) 2012, 2013 Broad Institute.
4 Copyright (C) 2012-2020 Genome Research Ltd.
5 Portions copyright (C) 2014 Intel Corporation.
6
7 Author: Heng Li <lh3@sanger.ac.uk>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE. */
26
27 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28 #include <config.h>
29
30 #include <stdio.h>
31 #include <assert.h>
32 #include <string.h>
33 #include <strings.h>
34 #include <stdlib.h>
35 #include <limits.h>
36 #include <stdint.h>
37 #include <inttypes.h>
38 #include <errno.h>
39
40 #include "htslib/vcf.h"
41 #include "htslib/bgzf.h"
42 #include "htslib/tbx.h"
43 #include "htslib/hfile.h"
44 #include "hts_internal.h"
45 #include "htslib/hts_endian.h"
46 #include "htslib/khash_str2int.h"
47 #include "htslib/kstring.h"
48 #include "htslib/sam.h"
49
50 #include "htslib/khash.h"
51 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
52 typedef khash_t(vdict) vdict_t;
53
54 #include "htslib/kseq.h"
55 HTSLIB_EXPORT
56 uint32_t bcf_float_missing = 0x7F800001;
57
58 HTSLIB_EXPORT
59 uint32_t bcf_float_vector_end = 0x7F800002;
60
61 HTSLIB_EXPORT
62 uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
63
64 static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
65
66 /*
67 Partial support for 64-bit POS and Number=1 INFO tags.
68 Notes:
69 - the support for 64-bit values is motivated by POS and INFO/END for large genomes
70 - the use of 64-bit values does not conform to the specification
71 - cannot output 64-bit BCF and if it does, it is not compatible with anything
72 - experimental, use at your risk
73 */
74 #ifdef VCF_ALLOW_INT64
75 #define BCF_MAX_BT_INT64 (0x7fffffffffffffff) /* INT64_MAX, for internal use only */
76 #define BCF_MIN_BT_INT64 -9223372036854775800LL /* INT64_MIN + 8, for internal use only */
77 #endif
78
79 #define BCF_IS_64BIT (1<<30)
80
81
find_chrom_header_line(char * s)82 static char *find_chrom_header_line(char *s)
83 {
84 char *nl;
85 if (strncmp(s, "#CHROM\t", 7) == 0) return s;
86 else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
87 else return NULL;
88 }
89
90 /*************************
91 *** VCF header parser ***
92 *************************/
93
bcf_hdr_add_sample_len(bcf_hdr_t * h,const char * s,size_t len)94 static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
95 {
96 if ( !s ) return 0;
97 if (len == 0) len = strlen(s);
98
99 const char *ss = s;
100 while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
101 if ( !*ss || ss - s == len)
102 {
103 hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
104 return -1;
105 }
106
107 vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
108 int ret;
109 char *sdup = malloc(len + 1);
110 if (!sdup) return -1;
111 memcpy(sdup, s, len);
112 sdup[len] = 0;
113
114 // Ensure space is available in h->samples
115 size_t n = kh_size(d);
116 char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
117 if (!new_samples) {
118 free(sdup);
119 return -1;
120 }
121 h->samples = new_samples;
122
123 int k = kh_put(vdict, d, sdup, &ret);
124 if (ret < 0) {
125 free(sdup);
126 return -1;
127 }
128 if (ret) { // absent
129 kh_val(d, k) = bcf_idinfo_def;
130 kh_val(d, k).id = n;
131 } else {
132 hts_log_error("Duplicated sample name '%s'", s);
133 free(sdup);
134 return -1;
135 }
136 h->samples[n] = sdup;
137 h->dirty = 1;
138 return 0;
139 }
140
bcf_hdr_add_sample(bcf_hdr_t * h,const char * s)141 int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
142 {
143 return bcf_hdr_add_sample_len(h, s, 0);
144 }
145
bcf_hdr_parse_sample_line(bcf_hdr_t * h,const char * str)146 int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str)
147 {
148 int ret = 0;
149 int i = 0;
150 const char *p, *q;
151 // add samples
152 for (p = q = str;; ++q) {
153 if (*q > '\n') continue;
154 if (++i > 9) {
155 if ( bcf_hdr_add_sample_len(h, p, q - p) < 0 ) ret = -1;
156 }
157 if (*q == 0 || *q == '\n' || ret < 0) break;
158 p = q + 1;
159 }
160
161 return ret;
162 }
163
bcf_hdr_sync(bcf_hdr_t * h)164 int bcf_hdr_sync(bcf_hdr_t *h)
165 {
166 int i;
167 for (i = 0; i < 3; i++)
168 {
169 vdict_t *d = (vdict_t*)h->dict[i];
170 khint_t k;
171 if ( h->n[i] < kh_size(d) )
172 {
173 bcf_idpair_t *new_idpair;
174 // this should be true only for i=2, BCF_DT_SAMPLE
175 new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
176 if (!new_idpair) return -1;
177 h->n[i] = kh_size(d);
178 h->id[i] = new_idpair;
179 }
180 for (k=kh_begin(d); k<kh_end(d); k++)
181 {
182 if (!kh_exist(d,k)) continue;
183 h->id[i][kh_val(d,k).id].key = kh_key(d,k);
184 h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
185 }
186 }
187 h->dirty = 0;
188 return 0;
189 }
190
bcf_hrec_destroy(bcf_hrec_t * hrec)191 void bcf_hrec_destroy(bcf_hrec_t *hrec)
192 {
193 if (!hrec) return;
194 free(hrec->key);
195 if ( hrec->value ) free(hrec->value);
196 int i;
197 for (i=0; i<hrec->nkeys; i++)
198 {
199 free(hrec->keys[i]);
200 free(hrec->vals[i]);
201 }
202 free(hrec->keys);
203 free(hrec->vals);
204 free(hrec);
205 }
206
207 // Copies all fields except IDX.
bcf_hrec_dup(bcf_hrec_t * hrec)208 bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
209 {
210 int save_errno;
211 bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
212 if (!out) return NULL;
213
214 out->type = hrec->type;
215 if ( hrec->key ) {
216 out->key = strdup(hrec->key);
217 if (!out->key) goto fail;
218 }
219 if ( hrec->value ) {
220 out->value = strdup(hrec->value);
221 if (!out->value) goto fail;
222 }
223 out->nkeys = hrec->nkeys;
224 out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
225 if (!out->keys) goto fail;
226 out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
227 if (!out->vals) goto fail;
228 int i, j = 0;
229 for (i=0; i<hrec->nkeys; i++)
230 {
231 if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
232 if ( hrec->keys[i] ) {
233 out->keys[j] = strdup(hrec->keys[i]);
234 if (!out->keys[j]) goto fail;
235 }
236 if ( hrec->vals[i] ) {
237 out->vals[j] = strdup(hrec->vals[i]);
238 if (!out->vals[j]) goto fail;
239 }
240 j++;
241 }
242 if ( i!=j ) out->nkeys -= i-j; // IDX was omitted
243 return out;
244
245 fail:
246 save_errno = errno;
247 hts_log_error("%s", strerror(errno));
248 bcf_hrec_destroy(out);
249 errno = save_errno;
250 return NULL;
251 }
252
bcf_hrec_debug(FILE * fp,bcf_hrec_t * hrec)253 void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
254 {
255 fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
256 int i;
257 for (i=0; i<hrec->nkeys; i++)
258 fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
259 fprintf(fp, "\n");
260 }
261
bcf_header_debug(bcf_hdr_t * hdr)262 void bcf_header_debug(bcf_hdr_t *hdr)
263 {
264 int i, j;
265 for (i=0; i<hdr->nhrec; i++)
266 {
267 if ( !hdr->hrec[i]->value )
268 {
269 fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
270 fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
271 for (j=1; j<hdr->hrec[i]->nkeys; j++)
272 fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
273 fprintf(stderr,">\n");
274 }
275 else
276 fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
277 }
278 }
279
bcf_hrec_add_key(bcf_hrec_t * hrec,const char * str,size_t len)280 int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
281 {
282 char **tmp;
283 size_t n = hrec->nkeys + 1;
284 assert(len > 0 && len < SIZE_MAX);
285 tmp = realloc(hrec->keys, sizeof(char*)*n);
286 if (!tmp) return -1;
287 hrec->keys = tmp;
288 tmp = realloc(hrec->vals, sizeof(char*)*n);
289 if (!tmp) return -1;
290 hrec->vals = tmp;
291
292 hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char));
293 if (!hrec->keys[hrec->nkeys]) return -1;
294 memcpy(hrec->keys[hrec->nkeys],str,len);
295 hrec->keys[hrec->nkeys][len] = 0;
296 hrec->vals[hrec->nkeys] = NULL;
297 hrec->nkeys = n;
298 return 0;
299 }
300
bcf_hrec_set_val(bcf_hrec_t * hrec,int i,const char * str,size_t len,int is_quoted)301 int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
302 {
303 if ( hrec->vals[i] ) {
304 free(hrec->vals[i]);
305 hrec->vals[i] = NULL;
306 }
307 if ( !str ) return 0;
308 if ( is_quoted )
309 {
310 if (len >= SIZE_MAX - 3) {
311 errno = ENOMEM;
312 return -1;
313 }
314 hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
315 if (!hrec->vals[i]) return -1;
316 hrec->vals[i][0] = '"';
317 memcpy(&hrec->vals[i][1],str,len);
318 hrec->vals[i][len+1] = '"';
319 hrec->vals[i][len+2] = 0;
320 }
321 else
322 {
323 if (len == SIZE_MAX) {
324 errno = ENOMEM;
325 return -1;
326 }
327 hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
328 if (!hrec->vals[i]) return -1;
329 memcpy(hrec->vals[i],str,len);
330 hrec->vals[i][len] = 0;
331 }
332 return 0;
333 }
334
hrec_add_idx(bcf_hrec_t * hrec,int idx)335 int hrec_add_idx(bcf_hrec_t *hrec, int idx)
336 {
337 int n = hrec->nkeys + 1;
338 char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n);
339 if (!tmp) return -1;
340 hrec->keys = tmp;
341
342 tmp = (char**) realloc(hrec->vals, sizeof(char*)*n);
343 if (!tmp) return -1;
344 hrec->vals = tmp;
345
346 hrec->keys[hrec->nkeys] = strdup("IDX");
347 if (!hrec->keys[hrec->nkeys]) return -1;
348
349 kstring_t str = {0,0,0};
350 if (kputw(idx, &str) < 0) {
351 free(hrec->keys[hrec->nkeys]);
352 return -1;
353 }
354 hrec->vals[hrec->nkeys] = str.s;
355 hrec->nkeys = n;
356 return 0;
357 }
358
bcf_hrec_find_key(bcf_hrec_t * hrec,const char * key)359 int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
360 {
361 int i;
362 for (i=0; i<hrec->nkeys; i++)
363 if ( !strcasecmp(key,hrec->keys[i]) ) return i;
364 return -1;
365 }
366
is_escaped(const char * min,const char * str)367 static inline int is_escaped(const char *min, const char *str)
368 {
369 int n = 0;
370 while ( --str>=min && *str=='\\' ) n++;
371 return n%2;
372 }
373
bcf_hdr_parse_line(const bcf_hdr_t * h,const char * line,int * len)374 bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
375 {
376 bcf_hrec_t *hrec = NULL;
377 const char *p = line;
378 if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
379 p += 2;
380
381 const char *q = p;
382 while ( *q && *q!='=' && *q != '\n' ) q++;
383 ptrdiff_t n = q-p;
384 if ( *q!='=' || !n ) // wrong format
385 goto malformed_line;
386
387 hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
388 if (!hrec) { *len = -1; return NULL; }
389 hrec->key = (char*) malloc(sizeof(char)*(n+1));
390 if (!hrec->key) goto fail;
391 memcpy(hrec->key,p,n);
392 hrec->key[n] = 0;
393
394 p = ++q;
395 if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
396 {
397 while ( *q && *q!='\n' ) q++;
398 hrec->value = (char*) malloc((q-p+1)*sizeof(char));
399 if (!hrec->value) goto fail;
400 memcpy(hrec->value, p, q-p);
401 hrec->value[q-p] = 0;
402 *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
403 return hrec;
404 }
405
406 // structured line, e.g.
407 // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
408 // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
409 int nopen = 1;
410 while ( *q && *q!='\n' && nopen>0 )
411 {
412 p = ++q;
413 while ( *q && *q==' ' ) { p++; q++; }
414 // ^[A-Za-z_][0-9A-Za-z_.]*$
415 if (p==q && *q && (isalpha_c(*q) || *q=='_'))
416 {
417 q++;
418 while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
419 }
420 n = q-p;
421 int m = 0;
422 while ( *q && *q==' ' ) { q++; m++; }
423 if ( *q!='=' || !n )
424 goto malformed_line;
425
426 if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
427 p = ++q;
428 while ( *q && *q==' ' ) { p++; q++; }
429 int quoted = *p=='"' ? 1 : 0;
430 if ( quoted ) p++, q++;
431 while ( *q && *q != '\n' )
432 {
433 if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; }
434 else
435 {
436 if ( *q=='<' ) nopen++;
437 if ( *q=='>' ) nopen--;
438 if ( !nopen ) break;
439 if ( *q==',' && nopen==1 ) break;
440 }
441 q++;
442 }
443 const char *r = q;
444 while ( r > p && r[-1] == ' ' ) r--;
445 if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
446 goto fail;
447 if ( quoted && *q=='"' ) q++;
448 if ( *q=='>' ) { nopen--; q++; }
449 }
450
451 // Skip to end of line
452 int nonspace = 0;
453 p = q;
454 while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
455 if (nonspace) {
456 char buffer[320];
457 hts_log_warning("Dropped trailing junk from header line '%s'",
458 hts_strprint(buffer, sizeof(buffer),
459 '"', line, q - line));
460 }
461
462 *len = q - line + (*q ? 1 : 0);
463 return hrec;
464
465 fail:
466 *len = -1;
467 bcf_hrec_destroy(hrec);
468 return NULL;
469
470 malformed_line:
471 {
472 char buffer[320];
473 while ( *q && *q!='\n' ) q++; // Ensure *len includes full line
474 hts_log_error("Could not parse the header line: %s",
475 hts_strprint(buffer, sizeof(buffer),
476 '"', line, q - line));
477 *len = q - line + (*q ? 1 : 0);
478 bcf_hrec_destroy(hrec);
479 return NULL;
480 }
481 }
482
bcf_hdr_set_idx(bcf_hdr_t * hdr,int dict_type,const char * tag,bcf_idinfo_t * idinfo)483 static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
484 {
485 size_t new_n;
486
487 // If available, preserve existing IDX
488 if ( idinfo->id==-1 )
489 idinfo->id = hdr->n[dict_type];
490 else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
491 {
492 hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
493 idinfo->id, tag);
494 errno = EINVAL;
495 return -1;
496 }
497
498 new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
499 if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
500 &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
501 return -1;
502 }
503 hdr->n[dict_type] = new_n;
504
505 // NB: the next kh_put call can invalidate the idinfo pointer, therefore
506 // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
507 hdr->id[dict_type][idinfo->id].key = tag;
508
509 return 0;
510 }
511
512 // returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
bcf_hdr_register_hrec(bcf_hdr_t * hdr,bcf_hrec_t * hrec)513 static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
514 {
515 // contig
516 int i, ret, replacing = 0;
517 khint_t k;
518 char *str = NULL;
519
520 if ( !strcmp(hrec->key, "contig") )
521 {
522 hts_pos_t len = 0;
523 hrec->type = BCF_HL_CTG;
524
525 // Get the contig ID ($str) and length ($j)
526 i = bcf_hrec_find_key(hrec,"length");
527 if ( i<0 ) len = 0;
528 else {
529 char *end = hrec->vals[i];
530 len = strtoll(hrec->vals[i], &end, 10);
531 if (end == hrec->vals[i] || len < 0) return 0;
532 }
533
534 i = bcf_hrec_find_key(hrec,"ID");
535 if ( i<0 ) return 0;
536 str = strdup(hrec->vals[i]);
537 if (!str) return -1;
538
539 // Register in the dictionary
540 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
541 khint_t k = kh_get(vdict, d, str);
542 if ( k != kh_end(d) ) { // already present
543 free(str); str=NULL;
544 if (kh_val(d, k).hrec[0] != NULL) // and not removed
545 return 0;
546 replacing = 1;
547 } else {
548 k = kh_put(vdict, d, str, &ret);
549 if (ret < 0) { free(str); return -1; }
550 }
551
552 int idx = bcf_hrec_find_key(hrec,"IDX");
553 if ( idx!=-1 )
554 {
555 char *tmp = hrec->vals[idx];
556 idx = strtol(hrec->vals[idx], &tmp, 10);
557 if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
558 {
559 if (!replacing) {
560 kh_del(vdict, d, k);
561 free(str);
562 }
563 hts_log_warning("Error parsing the IDX tag, skipping");
564 return 0;
565 }
566 }
567
568 kh_val(d, k) = bcf_idinfo_def;
569 kh_val(d, k).id = idx;
570 kh_val(d, k).info[0] = len;
571 kh_val(d, k).hrec[0] = hrec;
572 if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
573 if (!replacing) {
574 kh_del(vdict, d, k);
575 free(str);
576 }
577 return -1;
578 }
579 if ( idx==-1 ) {
580 if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
581 return -1;
582 }
583 }
584
585 return 1;
586 }
587
588 if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
589 else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
590 else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
591 else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; }
592 else return 0;
593
594 // INFO/FILTER/FORMAT
595 char *id = NULL;
596 uint32_t type = UINT32_MAX, var = UINT32_MAX;
597 int num = -1, idx = -1;
598 for (i=0; i<hrec->nkeys; i++)
599 {
600 if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
601 else if ( !strcmp(hrec->keys[i], "IDX") )
602 {
603 char *tmp = hrec->vals[i];
604 idx = strtol(hrec->vals[i], &tmp, 10);
605 if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
606 {
607 hts_log_warning("Error parsing the IDX tag, skipping");
608 return 0;
609 }
610 }
611 else if ( !strcmp(hrec->keys[i], "Type") )
612 {
613 if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
614 else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
615 else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
616 else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
617 else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
618 else
619 {
620 hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
621 type = BCF_HT_STR;
622 }
623 }
624 else if ( !strcmp(hrec->keys[i], "Number") )
625 {
626 if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
627 else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
628 else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
629 else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
630 else
631 {
632 sscanf(hrec->vals[i],"%d",&num);
633 var = BCF_VL_FIXED;
634 }
635 if (var != BCF_VL_FIXED) num = 0xfffff;
636 }
637 }
638 if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
639 if (type == -1) {
640 hts_log_warning("%s %s field has no Type defined. Assuming String",
641 *hrec->key == 'I' ? "An" : "A", hrec->key);
642 type = BCF_HT_STR;
643 }
644 if (var == -1) {
645 hts_log_warning("%s %s field has no Number defined. Assuming '.'",
646 *hrec->key == 'I' ? "An" : "A", hrec->key);
647 var = BCF_VL_VAR;
648 }
649 }
650 uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
651 (var & 0xf) << 8 |
652 (type & 0xf) << 4 |
653 (((uint32_t) hrec->type) & 0xf));
654
655 if ( !id ) return 0;
656 str = strdup(id);
657 if (!str) return -1;
658
659 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
660 k = kh_get(vdict, d, str);
661 if ( k != kh_end(d) )
662 {
663 // already present
664 free(str);
665 if ( kh_val(d, k).hrec[info&0xf] ) return 0;
666 kh_val(d, k).info[info&0xf] = info;
667 kh_val(d, k).hrec[info&0xf] = hrec;
668 if ( idx==-1 ) {
669 if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
670 return -1;
671 }
672 }
673 return 1;
674 }
675 k = kh_put(vdict, d, str, &ret);
676 if (ret < 0) {
677 free(str);
678 return -1;
679 }
680 kh_val(d, k) = bcf_idinfo_def;
681 kh_val(d, k).info[info&0xf] = info;
682 kh_val(d, k).hrec[info&0xf] = hrec;
683 kh_val(d, k).id = idx;
684 if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
685 kh_del(vdict, d, k);
686 free(str);
687 return -1;
688 }
689 if ( idx==-1 ) {
690 if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
691 return -1;
692 }
693 }
694
695 return 1;
696 }
697
bcf_hdr_add_hrec(bcf_hdr_t * hdr,bcf_hrec_t * hrec)698 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
699 {
700 int res;
701 if ( !hrec ) return 0;
702
703 hrec->type = BCF_HL_GEN;
704 res = bcf_hdr_register_hrec(hdr,hrec);
705 if (res < 0) return -1;
706 if ( !res )
707 {
708 // If one of the hashed field, then it is already present
709 if ( hrec->type != BCF_HL_GEN )
710 {
711 bcf_hrec_destroy(hrec);
712 return 0;
713 }
714
715 // Is one of the generic fields and already present?
716 int i;
717 for (i=0; i<hdr->nhrec; i++)
718 {
719 if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue;
720 if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break;
721 if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break;
722 }
723 if ( i<hdr->nhrec )
724 {
725 bcf_hrec_destroy(hrec);
726 return 0;
727 }
728 }
729
730 // New record, needs to be added
731 int n = hdr->nhrec + 1;
732 bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
733 if (!new_hrec) return -1;
734 hdr->hrec = new_hrec;
735 hdr->hrec[hdr->nhrec] = hrec;
736 hdr->dirty = 1;
737 hdr->nhrec = n;
738
739 return hrec->type==BCF_HL_GEN ? 0 : 1;
740 }
741
742 /*
743 * Note that while querying of FLT,INFO,FMT,CTG lines is fast (the keys are hashed),
744 * the STR,GEN lines are searched for linearly in a linked list of all header lines.
745 * This may become a problem for VCFs with huge headers, we might need to build a
746 * dictionary for these lines as well.
747 */
bcf_hdr_get_hrec(const bcf_hdr_t * hdr,int type,const char * key,const char * value,const char * str_class)748 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
749 {
750 int i;
751 if ( type==BCF_HL_GEN )
752 {
753 for (i=0; i<hdr->nhrec; i++)
754 {
755 if ( hdr->hrec[i]->type!=type ) continue;
756 if ( strcmp(hdr->hrec[i]->key,key) ) continue;
757 if ( !value || !strcmp(hdr->hrec[i]->value,value) ) return hdr->hrec[i];
758 }
759 return NULL;
760 }
761 else if ( type==BCF_HL_STR )
762 {
763 for (i=0; i<hdr->nhrec; i++)
764 {
765 if ( hdr->hrec[i]->type!=type ) continue;
766 if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
767 int j = bcf_hrec_find_key(hdr->hrec[i],key);
768 if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
769 }
770 return NULL;
771 }
772 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
773 khint_t k = kh_get(vdict, d, value);
774 if ( k == kh_end(d) ) return NULL;
775 return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
776 }
777
bcf_hdr_check_sanity(bcf_hdr_t * hdr)778 void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
779 {
780 static int PL_warned = 0, GL_warned = 0;
781
782 if ( !PL_warned )
783 {
784 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
785 if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
786 {
787 hts_log_warning("PL should be declared as Number=G");
788 PL_warned = 1;
789 }
790 }
791 if ( !GL_warned )
792 {
793 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL");
794 if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
795 {
796 hts_log_warning("GL should be declared as Number=G");
797 GL_warned = 1;
798 }
799 }
800 }
801
bcf_hdr_parse(bcf_hdr_t * hdr,char * htxt)802 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
803 {
804 int len, done = 0;
805 char *p = htxt;
806
807 // Check sanity: "fileformat" string must come as first
808 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
809 if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
810 hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
811 if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
812 bcf_hrec_destroy(hrec);
813 return -1;
814 }
815
816 // The filter PASS must appear first in the dictionary
817 hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
818 if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
819 bcf_hrec_destroy(hrec);
820 return -1;
821 }
822
823 // Parse the whole header
824 do {
825 while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
826 if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
827 bcf_hrec_destroy(hrec);
828 return -1;
829 }
830 p += len;
831 }
832 assert(hrec == NULL);
833 if (len < 0) {
834 // len < 0 indicates out-of-memory, or similar error
835 hts_log_error("Could not parse header line: %s", strerror(errno));
836 return -1;
837 } else if (len > 0) {
838 // Bad header line. bcf_hdr_parse_line() will have logged it.
839 // Skip and try again on the next line (p + len will be the start
840 // of the next one).
841 p += len;
842 continue;
843 }
844
845 // Next should be the sample line. If not, it was a malformed
846 // header, in which case print a warning and skip (many VCF
847 // operations do not really care about a few malformed lines).
848 // In the future we may want to add a strict mode that errors in
849 // this case.
850 if ( strncmp("#CHROM\tPOS",p,10) != 0 ) {
851 char *eol = strchr(p, '\n');
852 if (*p != '\0') {
853 char buffer[320];
854 hts_log_warning("Could not parse header line: %s",
855 hts_strprint(buffer, sizeof(buffer),
856 '"', p,
857 eol ? (eol - p) : SIZE_MAX));
858 }
859 if (eol) {
860 p = eol + 1; // Try from the next line.
861 } else {
862 done = -1; // No more lines left, give up.
863 }
864 } else {
865 done = 1; // Sample line found
866 }
867 } while (!done);
868
869 if (done < 0) {
870 // No sample line is fatal.
871 hts_log_error("Could not parse the header, sample line not found");
872 return -1;
873 }
874
875 if (bcf_hdr_parse_sample_line(hdr,p) < 0)
876 return -1;
877 if (bcf_hdr_sync(hdr) < 0)
878 return -1;
879 bcf_hdr_check_sanity(hdr);
880 return 0;
881 }
882
bcf_hdr_append(bcf_hdr_t * hdr,const char * line)883 int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
884 {
885 int len;
886 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
887 if ( !hrec ) return -1;
888 if (bcf_hdr_add_hrec(hdr, hrec) < 0)
889 return -1;
890 return 0;
891 }
892
bcf_hdr_remove(bcf_hdr_t * hdr,int type,const char * key)893 void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
894 {
895 int i = 0;
896 bcf_hrec_t *hrec;
897 if ( !key )
898 {
899 while ( i<hdr->nhrec )
900 {
901 if ( hdr->hrec[i]->type!=type ) { i++; continue; }
902 hrec = hdr->hrec[i];
903
904 if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
905 {
906 int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
907 if ( j>=0 )
908 {
909 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
910 khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[j]);
911 kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
912 }
913 }
914
915 hdr->dirty = 1;
916 hdr->nhrec--;
917 if ( i < hdr->nhrec )
918 memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
919 bcf_hrec_destroy(hrec);
920 }
921 return;
922 }
923 while (1)
924 {
925 if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
926 {
927 hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
928 if ( !hrec ) return;
929
930 for (i=0; i<hdr->nhrec; i++)
931 if ( hdr->hrec[i]==hrec ) break;
932 assert( i<hdr->nhrec );
933
934 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
935 khint_t k = kh_get(vdict, d, key);
936 kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
937 }
938 else
939 {
940 for (i=0; i<hdr->nhrec; i++)
941 {
942 if ( hdr->hrec[i]->type!=type ) continue;
943 if ( type==BCF_HL_GEN )
944 {
945 if ( !strcmp(hdr->hrec[i]->key,key) ) break;
946 }
947 else
948 {
949 // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
950 int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
951 if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
952 }
953 }
954 if ( i==hdr->nhrec ) return;
955 hrec = hdr->hrec[i];
956 }
957
958 hdr->nhrec--;
959 if ( i < hdr->nhrec )
960 memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
961 bcf_hrec_destroy(hrec);
962 hdr->dirty = 1;
963 }
964 }
965
bcf_hdr_printf(bcf_hdr_t * hdr,const char * fmt,...)966 int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
967 {
968 char tmp[256], *line = tmp;
969 va_list ap;
970 va_start(ap, fmt);
971 int n = vsnprintf(line, sizeof(tmp), fmt, ap);
972 va_end(ap);
973
974 if (n >= sizeof(tmp)) {
975 n++; // For trailing NUL
976 line = (char*)malloc(n);
977 if (!line)
978 return -1;
979
980 va_start(ap, fmt);
981 vsnprintf(line, n, fmt, ap);
982 va_end(ap);
983 }
984
985 int ret = bcf_hdr_append(hdr, line);
986
987 if (line != tmp) free(line);
988 return ret;
989 }
990
991
992 /**********************
993 *** BCF header I/O ***
994 **********************/
995
bcf_hdr_get_version(const bcf_hdr_t * hdr)996 const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
997 {
998 bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
999 if ( !hrec )
1000 {
1001 hts_log_warning("No version string found, assuming VCFv4.2");
1002 return "VCFv4.2";
1003 }
1004 return hrec->value;
1005 }
1006
bcf_hdr_set_version(bcf_hdr_t * hdr,const char * version)1007 int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1008 {
1009 bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1010 if ( !hrec )
1011 {
1012 int len;
1013 kstring_t str = {0,0,0};
1014 ksprintf(&str,"##fileformat=%s", version);
1015 hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1016 free(str.s);
1017 }
1018 else
1019 {
1020 free(hrec->value);
1021 hrec->value = strdup(version);
1022 }
1023 hdr->dirty = 1;
1024 return 0; // FIXME: check for errs in this function (return < 0 if so)
1025 }
1026
bcf_hdr_init(const char * mode)1027 bcf_hdr_t *bcf_hdr_init(const char *mode)
1028 {
1029 int i;
1030 bcf_hdr_t *h;
1031 h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1032 if (!h) return NULL;
1033 for (i = 0; i < 3; ++i)
1034 if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1035 if ( strchr(mode,'w') )
1036 {
1037 bcf_hdr_append(h, "##fileformat=VCFv4.2");
1038 // The filter PASS must appear first in the dictionary
1039 bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1040 }
1041 return h;
1042
1043 fail:
1044 for (i = 0; i < 3; ++i)
1045 kh_destroy(vdict, h->dict[i]);
1046 free(h);
1047 return NULL;
1048 }
1049
bcf_hdr_destroy(bcf_hdr_t * h)1050 void bcf_hdr_destroy(bcf_hdr_t *h)
1051 {
1052 int i;
1053 khint_t k;
1054 if (!h) return;
1055 for (i = 0; i < 3; ++i) {
1056 vdict_t *d = (vdict_t*)h->dict[i];
1057 if (d == 0) continue;
1058 for (k = kh_begin(d); k != kh_end(d); ++k)
1059 if (kh_exist(d, k)) free((char*)kh_key(d, k));
1060 kh_destroy(vdict, d);
1061 free(h->id[i]);
1062 }
1063 for (i=0; i<h->nhrec; i++)
1064 bcf_hrec_destroy(h->hrec[i]);
1065 if (h->nhrec) free(h->hrec);
1066 if (h->samples) free(h->samples);
1067 free(h->keep_samples);
1068 free(h->transl[0]); free(h->transl[1]);
1069 free(h->mem.s);
1070 free(h);
1071 }
1072
bcf_hdr_read(htsFile * hfp)1073 bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1074 {
1075 if (hfp->format.format == vcf)
1076 return vcf_hdr_read(hfp);
1077 if (hfp->format.format != bcf) {
1078 hts_log_error("Input is not detected as bcf or vcf format");
1079 return NULL;
1080 }
1081
1082 assert(hfp->is_bgzf);
1083
1084 BGZF *fp = hfp->fp.bgzf;
1085 uint8_t magic[5];
1086 bcf_hdr_t *h;
1087 h = bcf_hdr_init("r");
1088 if (!h) {
1089 hts_log_error("Failed to allocate bcf header");
1090 return NULL;
1091 }
1092 if (bgzf_read(fp, magic, 5) != 5)
1093 {
1094 hts_log_error("Failed to read the header (reading BCF in text mode?)");
1095 bcf_hdr_destroy(h);
1096 return NULL;
1097 }
1098 if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1099 {
1100 if (!strncmp((char*)magic, "BCF", 3))
1101 hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1102 else
1103 hts_log_error("Invalid BCF2 magic string");
1104 bcf_hdr_destroy(h);
1105 return NULL;
1106 }
1107 uint8_t buf[4];
1108 size_t hlen;
1109 char *htxt = NULL;
1110 if (bgzf_read(fp, buf, 4) != 4) goto fail;
1111 hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1112 if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1113 htxt = (char*)malloc(hlen + 1);
1114 if (!htxt) goto fail;
1115 if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1116 htxt[hlen] = '\0'; // Ensure htxt is terminated
1117 if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1118 free(htxt);
1119 return h;
1120 fail:
1121 hts_log_error("Failed to read BCF header");
1122 free(htxt);
1123 bcf_hdr_destroy(h);
1124 return NULL;
1125 }
1126
bcf_hdr_write(htsFile * hfp,bcf_hdr_t * h)1127 int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1128 {
1129 if (!h) {
1130 errno = EINVAL;
1131 return -1;
1132 }
1133 if ( h->dirty ) {
1134 if (bcf_hdr_sync(h) < 0) return -1;
1135 }
1136 hfp->format.category = variant_data;
1137 if (hfp->format.format == vcf || hfp->format.format == text_format) {
1138 hfp->format.format = vcf;
1139 return vcf_hdr_write(hfp, h);
1140 }
1141
1142 if (hfp->format.format == binary_format)
1143 hfp->format.format = bcf;
1144
1145 kstring_t htxt = {0,0,0};
1146 if (bcf_hdr_format(h, 1, &htxt) < 0) {
1147 free(htxt.s);
1148 return -1;
1149 }
1150 kputc('\0', &htxt); // include the \0 byte
1151
1152 BGZF *fp = hfp->fp.bgzf;
1153 if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1154 uint8_t hlen[4];
1155 u32_to_le(htxt.l, hlen);
1156 if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1157 if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1158
1159 free(htxt.s);
1160 return 0;
1161 }
1162
1163 /********************
1164 *** BCF site I/O ***
1165 ********************/
1166
bcf_init()1167 bcf1_t *bcf_init()
1168 {
1169 bcf1_t *v;
1170 v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1171 return v;
1172 }
1173
bcf_clear(bcf1_t * v)1174 void bcf_clear(bcf1_t *v)
1175 {
1176 int i;
1177 for (i=0; i<v->d.m_info; i++)
1178 {
1179 if ( v->d.info[i].vptr_free )
1180 {
1181 free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1182 v->d.info[i].vptr_free = 0;
1183 }
1184 }
1185 for (i=0; i<v->d.m_fmt; i++)
1186 {
1187 if ( v->d.fmt[i].p_free )
1188 {
1189 free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1190 v->d.fmt[i].p_free = 0;
1191 }
1192 }
1193 v->rid = v->pos = v->rlen = v->unpacked = 0;
1194 bcf_float_set_missing(v->qual);
1195 v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1196 v->shared.l = v->indiv.l = 0;
1197 v->d.var_type = -1;
1198 v->d.shared_dirty = 0;
1199 v->d.indiv_dirty = 0;
1200 v->d.n_flt = 0;
1201 v->errcode = 0;
1202 if (v->d.m_als) v->d.als[0] = 0;
1203 if (v->d.m_id) v->d.id[0] = 0;
1204 }
1205
bcf_empty(bcf1_t * v)1206 void bcf_empty(bcf1_t *v)
1207 {
1208 bcf_clear1(v);
1209 free(v->d.id);
1210 free(v->d.als);
1211 free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1212 if (v->d.var ) free(v->d.var);
1213 free(v->shared.s); free(v->indiv.s);
1214 memset(&v->d,0,sizeof(v->d));
1215 memset(&v->shared,0,sizeof(v->shared));
1216 memset(&v->indiv,0,sizeof(v->indiv));
1217 }
1218
bcf_destroy(bcf1_t * v)1219 void bcf_destroy(bcf1_t *v)
1220 {
1221 if (!v) return;
1222 bcf_empty1(v);
1223 free(v);
1224 }
1225
bcf_read1_core(BGZF * fp,bcf1_t * v)1226 static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1227 {
1228 uint8_t x[32];
1229 ssize_t ret;
1230 uint32_t shared_len, indiv_len;
1231 if ((ret = bgzf_read(fp, x, 32)) != 32) {
1232 if (ret == 0) return -1;
1233 return -2;
1234 }
1235 bcf_clear1(v);
1236 shared_len = le_to_u32(x);
1237 if (shared_len < 24) return -2;
1238 shared_len -= 24; // to exclude six 32-bit integers
1239 if (ks_resize(&v->shared, shared_len) != 0) return -2;
1240 indiv_len = le_to_u32(x + 4);
1241 if (ks_resize(&v->indiv, indiv_len) != 0) return -2;
1242 v->rid = le_to_i32(x + 8);
1243 v->pos = le_to_u32(x + 12);
1244 v->rlen = le_to_i32(x + 16);
1245 v->qual = le_to_float(x + 20);
1246 v->n_info = le_to_u16(x + 24);
1247 v->n_allele = le_to_u16(x + 26);
1248 v->n_sample = le_to_u32(x + 28) & 0xffffff;
1249 v->n_fmt = x[31];
1250 v->shared.l = shared_len;
1251 v->indiv.l = indiv_len;
1252 // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1253 if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1254
1255 if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1256 if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1257 return 0;
1258 }
1259
1260 #define bit_array_size(n) ((n)/8+1)
1261 #define bit_array_set(a,i) ((a)[(i)/8] |= 1 << ((i)%8))
1262 #define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1263 #define bit_array_test(a,i) ((a)[(i)/8] & (1 << ((i)%8)))
1264
bcf_dec_typed_int1_safe(uint8_t * p,uint8_t * end,uint8_t ** q,int32_t * val)1265 static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1266 int32_t *val) {
1267 uint32_t t;
1268 if (end - p < 2) return -1;
1269 t = *p++ & 0xf;
1270 /* Use if .. else if ... else instead of switch to force order. Assumption
1271 is that small integers are more frequent than big ones. */
1272 if (t == BCF_BT_INT8) {
1273 *val = *(int8_t *) p++;
1274 } else {
1275 if (end - p < (1<<bcf_type_shift[t])) return -1;
1276 if (t == BCF_BT_INT16) {
1277 *val = le_to_i16(p);
1278 p += 2;
1279 } else if (t == BCF_BT_INT32) {
1280 *val = le_to_i32(p);
1281 p += 4;
1282 #ifdef VCF_ALLOW_INT64
1283 } else if (t == BCF_BT_INT64) {
1284 // This case should never happen because there should be no
1285 // 64-bit BCFs at all, definitely not coming from htslib
1286 *val = le_to_i64(p);
1287 p += 8;
1288 #endif
1289 } else {
1290 return -1;
1291 }
1292 }
1293 *q = p;
1294 return 0;
1295 }
1296
bcf_dec_size_safe(uint8_t * p,uint8_t * end,uint8_t ** q,int * num,int * type)1297 static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1298 int *num, int *type) {
1299 int r;
1300 if (p >= end) return -1;
1301 *type = *p & 0xf;
1302 if (*p>>4 != 15) {
1303 *q = p + 1;
1304 *num = *p >> 4;
1305 return 0;
1306 }
1307 r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1308 if (r) return r;
1309 return *num >= 0 ? 0 : -1;
1310 }
1311
get_type_name(int type)1312 static const char *get_type_name(int type) {
1313 const char *types[9] = {
1314 "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1315 "unknown", "float", "unknown", "char", "unknown"
1316 };
1317 int t = (type >= 0 && type < 8) ? type : 8;
1318 return types[t];
1319 }
1320
bcf_record_check_err(const bcf_hdr_t * hdr,bcf1_t * rec,char * type,uint32_t * reports,int i)1321 static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
1322 char *type, uint32_t *reports, int i) {
1323 if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
1324 hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
1325 ": Invalid FORMAT %s %d",
1326 bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
1327 (*reports)++;
1328 }
1329
bcf_record_check(const bcf_hdr_t * hdr,bcf1_t * rec)1330 static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
1331 uint8_t *ptr, *end;
1332 size_t bytes;
1333 uint32_t err = 0;
1334 int type = 0;
1335 int num = 0;
1336 int reflen = 0;
1337 uint32_t i, reports;
1338 const uint32_t is_integer = ((1 << BCF_BT_INT8) |
1339 (1 << BCF_BT_INT16) |
1340 #ifdef VCF_ALLOW_INT64
1341 (1 << BCF_BT_INT64) |
1342 #endif
1343 (1 << BCF_BT_INT32));
1344 const uint32_t is_valid_type = (is_integer |
1345 (1 << BCF_BT_NULL) |
1346 (1 << BCF_BT_FLOAT) |
1347 (1 << BCF_BT_CHAR));
1348 int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
1349
1350 // Check for valid contig ID
1351 if (rec->rid < 0
1352 || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
1353 || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
1354 hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
1355 err |= BCF_ERR_CTG_INVALID;
1356 }
1357
1358 // Check ID
1359 ptr = (uint8_t *) rec->shared.s;
1360 end = ptr + rec->shared.l;
1361 if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1362 if (type != BCF_BT_CHAR) {
1363 hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
1364 err |= BCF_ERR_TAG_INVALID;
1365 }
1366 bytes = (size_t) num << bcf_type_shift[type];
1367 if (end - ptr < bytes) goto bad_shared;
1368 ptr += bytes;
1369
1370 // Check REF and ALT
1371 reports = 0;
1372 for (i = 0; i < rec->n_allele; i++) {
1373 if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1374 if (type != BCF_BT_CHAR) {
1375 if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1376 hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
1377 err |= BCF_ERR_CHAR;
1378 }
1379 if (i == 0) reflen = num;
1380 bytes = (size_t) num << bcf_type_shift[type];
1381 if (end - ptr < bytes) goto bad_shared;
1382 ptr += bytes;
1383 }
1384
1385 // Check FILTER
1386 reports = 0;
1387 if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1388 if (num > 0) {
1389 bytes = (size_t) num << bcf_type_shift[type];
1390 if (((1 << type) & is_integer) == 0) {
1391 hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
1392 err |= BCF_ERR_TAG_INVALID;
1393 if (end - ptr < bytes) goto bad_shared;
1394 ptr += bytes;
1395 } else {
1396 if (end - ptr < bytes) goto bad_shared;
1397 for (i = 0; i < num; i++) {
1398 int32_t key = bcf_dec_int1(ptr, type, &ptr);
1399 if (key < 0
1400 || (hdr && (key >= max_id
1401 || hdr->id[BCF_DT_ID][key].key == NULL))) {
1402 if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1403 hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
1404 err |= BCF_ERR_TAG_UNDEF;
1405 }
1406 }
1407 }
1408 }
1409
1410 // Check INFO
1411 reports = 0;
1412 bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
1413 for (i = 0; i < rec->n_info; i++) {
1414 int32_t key = -1;
1415 if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
1416 if (key < 0 || (hdr && (key >= max_id
1417 || id_tmp[key].key == NULL))) {
1418 if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1419 hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
1420 err |= BCF_ERR_TAG_UNDEF;
1421 }
1422 if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1423 if (((1 << type) & is_valid_type) == 0) {
1424 if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1425 hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
1426 err |= BCF_ERR_TAG_INVALID;
1427 }
1428 bytes = (size_t) num << bcf_type_shift[type];
1429 if (end - ptr < bytes) goto bad_shared;
1430 ptr += bytes;
1431 }
1432
1433 // Check FORMAT and individual information
1434 ptr = (uint8_t *) rec->indiv.s;
1435 end = ptr + rec->indiv.l;
1436 reports = 0;
1437 for (i = 0; i < rec->n_fmt; i++) {
1438 int32_t key = -1;
1439 if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
1440 if (key < 0
1441 || (hdr && (key >= max_id
1442 || id_tmp[key].key == NULL))) {
1443 bcf_record_check_err(hdr, rec, "id", &reports, key);
1444 err |= BCF_ERR_TAG_UNDEF;
1445 }
1446 if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
1447 if (((1 << type) & is_valid_type) == 0) {
1448 bcf_record_check_err(hdr, rec, "type", &reports, type);
1449 err |= BCF_ERR_TAG_INVALID;
1450 }
1451 bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
1452 if (end - ptr < bytes) goto bad_indiv;
1453 ptr += bytes;
1454 }
1455
1456 if (!err && rec->rlen < 0) {
1457 // Treat bad rlen as a warning instead of an error, and try to
1458 // fix up by using the length of the stored REF allele.
1459 static int warned = 0;
1460 if (!warned) {
1461 hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
1462 "Only one invalid RLEN will be reported.",
1463 bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
1464 warned = 1;
1465 }
1466 rec->rlen = reflen >= 0 ? reflen : 0;
1467 }
1468
1469 rec->errcode |= err;
1470
1471 return err ? -2 : 0; // Return -2 so bcf_read() reports an error
1472
1473 bad_shared:
1474 hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1475 return -2;
1476
1477 bad_indiv:
1478 hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1479 return -2;
1480 }
1481
1482 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
bcf_subset_format(const bcf_hdr_t * hdr,bcf1_t * rec)1483 int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
1484 {
1485 if ( !hdr->keep_samples ) return 0;
1486 if ( !bcf_hdr_nsamples(hdr) )
1487 {
1488 rec->indiv.l = rec->n_sample = 0;
1489 return 0;
1490 }
1491
1492 int i, j;
1493 uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
1494 bcf_dec_t *dec = &rec->d;
1495 hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
1496 for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
1497
1498 for (i=0; i<rec->n_fmt; i++)
1499 {
1500 ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
1501 src = dec->fmt[i].p - dec->fmt[i].size;
1502 if ( dst )
1503 {
1504 memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
1505 dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
1506 }
1507 dst = dec->fmt[i].p;
1508 for (j=0; j<hdr->nsamples_ori; j++)
1509 {
1510 src += dec->fmt[i].size;
1511 if ( !bit_array_test(hdr->keep_samples,j) ) continue;
1512 memmove(dst, src, dec->fmt[i].size);
1513 dst += dec->fmt[i].size;
1514 }
1515 rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
1516 dec->fmt[i].p_len = dst - dec->fmt[i].p;
1517 }
1518 rec->unpacked |= BCF_UN_FMT;
1519
1520 rec->n_sample = bcf_hdr_nsamples(hdr);
1521 return 0;
1522 }
1523
bcf_read(htsFile * fp,const bcf_hdr_t * h,bcf1_t * v)1524 int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1525 {
1526 if (fp->format.format == vcf) return vcf_read(fp,h,v);
1527 int ret = bcf_read1_core(fp->fp.bgzf, v);
1528 if (ret == 0) ret = bcf_record_check(h, v);
1529 if ( ret!=0 || !h->keep_samples ) return ret;
1530 return bcf_subset_format(h,v);
1531 }
1532
bcf_readrec(BGZF * fp,void * null,void * vv,int * tid,hts_pos_t * beg,hts_pos_t * end)1533 int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1534 {
1535 bcf1_t *v = (bcf1_t *) vv;
1536 int ret = bcf_read1_core(fp, v);
1537 if (ret == 0) ret = bcf_record_check(NULL, v);
1538 if (ret >= 0)
1539 *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
1540 return ret;
1541 }
1542
bcf1_sync_id(bcf1_t * line,kstring_t * str)1543 static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
1544 {
1545 // single typed string
1546 if ( line->d.id && strcmp(line->d.id, ".") ) {
1547 return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
1548 } else {
1549 return bcf_enc_size(str, 0, BCF_BT_CHAR);
1550 }
1551 }
bcf1_sync_alleles(bcf1_t * line,kstring_t * str)1552 static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
1553 {
1554 // list of typed strings
1555 int i;
1556 for (i=0; i<line->n_allele; i++) {
1557 if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
1558 return -1;
1559 }
1560 if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1561 return 0;
1562 }
bcf1_sync_filter(bcf1_t * line,kstring_t * str)1563 static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
1564 {
1565 // typed vector of integers
1566 if ( line->d.n_flt ) {
1567 return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
1568 } else {
1569 return bcf_enc_vint(str, 0, 0, -1);
1570 }
1571 }
1572
bcf1_sync_info(bcf1_t * line,kstring_t * str)1573 static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
1574 {
1575 // pairs of typed vectors
1576 int i, irm = -1, e = 0;
1577 for (i=0; i<line->n_info; i++)
1578 {
1579 bcf_info_t *info = &line->d.info[i];
1580 if ( !info->vptr )
1581 {
1582 // marked for removal
1583 if ( irm < 0 ) irm = i;
1584 continue;
1585 }
1586 e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
1587 if ( irm >=0 )
1588 {
1589 bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
1590 while ( irm<=i && line->d.info[irm].vptr ) irm++;
1591 }
1592 }
1593 if ( irm>=0 ) line->n_info = irm;
1594 return e == 0 ? 0 : -1;
1595 }
1596
bcf1_sync(bcf1_t * line)1597 static int bcf1_sync(bcf1_t *line)
1598 {
1599 char *shared_ori = line->shared.s;
1600 size_t prev_len;
1601
1602 kstring_t tmp = {0,0,0};
1603 if ( !line->shared.l )
1604 {
1605 // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
1606 tmp = line->shared;
1607 bcf1_sync_id(line, &tmp);
1608 line->unpack_size[0] = tmp.l; prev_len = tmp.l;
1609
1610 bcf1_sync_alleles(line, &tmp);
1611 line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
1612
1613 bcf1_sync_filter(line, &tmp);
1614 line->unpack_size[2] = tmp.l - prev_len;
1615
1616 bcf1_sync_info(line, &tmp);
1617 line->shared = tmp;
1618 }
1619 else if ( line->d.shared_dirty )
1620 {
1621 // The line was edited, update the BCF data block.
1622
1623 if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
1624
1625 // ptr_ori points to the original unchanged BCF data.
1626 uint8_t *ptr_ori = (uint8_t *) line->shared.s;
1627
1628 // ID: single typed string
1629 if ( line->d.shared_dirty & BCF1_DIRTY_ID )
1630 bcf1_sync_id(line, &tmp);
1631 else
1632 kputsn_(ptr_ori, line->unpack_size[0], &tmp);
1633 ptr_ori += line->unpack_size[0];
1634 line->unpack_size[0] = tmp.l; prev_len = tmp.l;
1635
1636 // REF+ALT: list of typed strings
1637 if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
1638 bcf1_sync_alleles(line, &tmp);
1639 else
1640 {
1641 kputsn_(ptr_ori, line->unpack_size[1], &tmp);
1642 if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1643 }
1644 ptr_ori += line->unpack_size[1];
1645 line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
1646
1647 if ( line->unpacked & BCF_UN_FLT )
1648 {
1649 // FILTER: typed vector of integers
1650 if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
1651 bcf1_sync_filter(line, &tmp);
1652 else if ( line->d.n_flt )
1653 kputsn_(ptr_ori, line->unpack_size[2], &tmp);
1654 else
1655 bcf_enc_vint(&tmp, 0, 0, -1);
1656 ptr_ori += line->unpack_size[2];
1657 line->unpack_size[2] = tmp.l - prev_len;
1658
1659 if ( line->unpacked & BCF_UN_INFO )
1660 {
1661 // INFO: pairs of typed vectors
1662 if ( line->d.shared_dirty & BCF1_DIRTY_INF )
1663 {
1664 bcf1_sync_info(line, &tmp);
1665 ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
1666 }
1667 }
1668 }
1669
1670 int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
1671 if ( size ) kputsn_(ptr_ori, size, &tmp);
1672
1673 free(line->shared.s);
1674 line->shared = tmp;
1675 }
1676 if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
1677 {
1678 // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
1679 size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
1680 int i;
1681 for (i=0; i<line->n_info; i++)
1682 {
1683 uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
1684 line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
1685 off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
1686 if ( vptr_free )
1687 {
1688 free(vptr_free);
1689 line->d.info[i].vptr_free = 0;
1690 }
1691 }
1692 }
1693
1694 if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
1695 {
1696 // The genotype fields changed or are not present
1697 tmp.l = tmp.m = 0; tmp.s = NULL;
1698 int i, irm = -1;
1699 for (i=0; i<line->n_fmt; i++)
1700 {
1701 bcf_fmt_t *fmt = &line->d.fmt[i];
1702 if ( !fmt->p )
1703 {
1704 // marked for removal
1705 if ( irm < 0 ) irm = i;
1706 continue;
1707 }
1708 kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
1709 if ( irm >=0 )
1710 {
1711 bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
1712 while ( irm<=i && line->d.fmt[irm].p ) irm++;
1713 }
1714
1715 }
1716 if ( irm>=0 ) line->n_fmt = irm;
1717 free(line->indiv.s);
1718 line->indiv = tmp;
1719
1720 // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
1721 size_t off_new = 0;
1722 for (i=0; i<line->n_fmt; i++)
1723 {
1724 uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
1725 line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
1726 off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
1727 if ( p_free )
1728 {
1729 free(p_free);
1730 line->d.fmt[i].p_free = 0;
1731 }
1732 }
1733 }
1734 if ( !line->n_sample ) line->n_fmt = 0;
1735 line->d.shared_dirty = line->d.indiv_dirty = 0;
1736 return 0;
1737 }
1738
bcf_copy(bcf1_t * dst,bcf1_t * src)1739 bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
1740 {
1741 bcf1_sync(src);
1742
1743 bcf_clear(dst);
1744 dst->rid = src->rid;
1745 dst->pos = src->pos;
1746 dst->rlen = src->rlen;
1747 dst->qual = src->qual;
1748 dst->n_info = src->n_info; dst->n_allele = src->n_allele;
1749 dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
1750
1751 if ( dst->shared.m < src->shared.l )
1752 {
1753 dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
1754 dst->shared.m = src->shared.l;
1755 }
1756 dst->shared.l = src->shared.l;
1757 memcpy(dst->shared.s,src->shared.s,dst->shared.l);
1758
1759 if ( dst->indiv.m < src->indiv.l )
1760 {
1761 dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
1762 dst->indiv.m = src->indiv.l;
1763 }
1764 dst->indiv.l = src->indiv.l;
1765 memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
1766
1767 return dst;
1768 }
bcf_dup(bcf1_t * src)1769 bcf1_t *bcf_dup(bcf1_t *src)
1770 {
1771 bcf1_t *out = bcf_init1();
1772 return bcf_copy(out, src);
1773 }
1774
bcf_write(htsFile * hfp,bcf_hdr_t * h,bcf1_t * v)1775 int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
1776 {
1777 if ( h->dirty ) {
1778 if (bcf_hdr_sync(h) < 0) return -1;
1779 }
1780 if ( bcf_hdr_nsamples(h)!=v->n_sample )
1781 {
1782 hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
1783 bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
1784 return -1;
1785 }
1786
1787 if ( hfp->format.format == vcf || hfp->format.format == text_format )
1788 return vcf_write(hfp,h,v);
1789
1790 if ( v->errcode )
1791 {
1792 // vcf_parse1() encountered a new contig or tag, undeclared in the
1793 // header. At this point, the header must have been printed,
1794 // proceeding would lead to a broken BCF file. Errors must be checked
1795 // and cleared by the caller before we can proceed.
1796 hts_log_error("Unchecked error (%d) at %s:%"PRIhts_pos, v->errcode, bcf_seqname_safe(h,v), v->pos+1);
1797 return -1;
1798 }
1799 bcf1_sync(v); // check if the BCF record was modified
1800
1801 if ( v->unpacked & BCF_IS_64BIT )
1802 {
1803 hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
1804 return -1;
1805 }
1806
1807 BGZF *fp = hfp->fp.bgzf;
1808 uint8_t x[32];
1809 u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
1810 u32_to_le(v->indiv.l, x + 4);
1811 i32_to_le(v->rid, x + 8);
1812 u32_to_le(v->pos, x + 12);
1813 u32_to_le(v->rlen, x + 16);
1814 float_to_le(v->qual, x + 20);
1815 u16_to_le(v->n_info, x + 24);
1816 u16_to_le(v->n_allele, x + 26);
1817 u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
1818 if ( bgzf_write(fp, x, 32) != 32 ) return -1;
1819 if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
1820 if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
1821
1822 if (hfp->idx) {
1823 if (hts_idx_push(hfp->idx, v->rid, v->pos, v->pos + v->rlen, bgzf_tell(fp), 1) < 0)
1824 return -1;
1825 }
1826
1827 return 0;
1828 }
1829
1830 /**********************
1831 *** VCF header I/O ***
1832 **********************/
1833
add_missing_contig_hrec(bcf_hdr_t * h,const char * name)1834 static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
1835 bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
1836 int save_errno;
1837 if (!hrec) goto fail;
1838
1839 hrec->key = strdup("contig");
1840 if (!hrec->key) goto fail;
1841
1842 if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
1843 if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
1844 goto fail;
1845 if (bcf_hdr_add_hrec(h, hrec) < 0)
1846 goto fail;
1847 return 0;
1848
1849 fail:
1850 save_errno = errno;
1851 hts_log_error("%s", strerror(errno));
1852 if (hrec) bcf_hrec_destroy(hrec);
1853 errno = save_errno;
1854 return -1;
1855 }
1856
vcf_hdr_read(htsFile * fp)1857 bcf_hdr_t *vcf_hdr_read(htsFile *fp)
1858 {
1859 kstring_t txt, *s = &fp->line;
1860 int ret;
1861 bcf_hdr_t *h;
1862 tbx_t *idx = NULL;
1863 const char **names = NULL;
1864 h = bcf_hdr_init("r");
1865 if (!h) {
1866 hts_log_error("Failed to allocate bcf header");
1867 return NULL;
1868 }
1869 txt.l = txt.m = 0; txt.s = 0;
1870 while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
1871 int e = 0;
1872 if (s->l == 0) continue;
1873 if (s->s[0] != '#') {
1874 hts_log_error("No sample line");
1875 goto error;
1876 }
1877 if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
1878 kstring_t tmp = { 0, 0, NULL };
1879 hFILE *f = hopen(fp->fn_aux, "r");
1880 if (f == NULL) {
1881 hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
1882 goto error;
1883 }
1884 while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
1885 char *tab = strchr(tmp.s, '\t');
1886 if (tab == NULL) continue;
1887 e |= (kputs("##contig=<ID=", &txt) < 0);
1888 e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
1889 e |= (kputs(",length=", &txt) < 0);
1890 e |= (kputl(atol(tab), &txt) < 0);
1891 e |= (kputsn(">\n", 2, &txt) < 0);
1892 }
1893 free(tmp.s);
1894 if (hclose(f) != 0) {
1895 hts_log_error("Error on closing %s", fp->fn_aux);
1896 goto error;
1897 }
1898 if (e) goto error;
1899 }
1900 if (kputsn(s->s, s->l, &txt) < 0) goto error;
1901 if (kputc('\n', &txt) < 0) goto error;
1902 if (s->s[1] != '#') break;
1903 }
1904 if ( ret < -1 ) goto error;
1905 if ( !txt.s )
1906 {
1907 hts_log_error("Could not read the header");
1908 goto error;
1909 }
1910 if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
1911
1912 // check tabix index, are all contigs listed in the header? add the missing ones
1913 idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL);
1914 if ( idx )
1915 {
1916 int i, n, need_sync = 0;
1917 names = tbx_seqnames(idx, &n);
1918 if (!names) goto error;
1919 for (i=0; i<n; i++)
1920 {
1921 bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
1922 if ( hrec ) continue;
1923 if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
1924 need_sync = 1;
1925 }
1926 if ( need_sync ) {
1927 if (bcf_hdr_sync(h) < 0) goto error;
1928 }
1929 free(names);
1930 tbx_destroy(idx);
1931 }
1932 free(txt.s);
1933 return h;
1934
1935 error:
1936 if (idx) tbx_destroy(idx);
1937 free(names);
1938 free(txt.s);
1939 if (h) bcf_hdr_destroy(h);
1940 return NULL;
1941 }
1942
bcf_hdr_set(bcf_hdr_t * hdr,const char * fname)1943 int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
1944 {
1945 int i = 0, n = 0, save_errno;
1946 char **lines = hts_readlines(fname, &n);
1947 if ( !lines ) return 1;
1948 for (i=0; i<n-1; i++)
1949 {
1950 int k;
1951 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
1952 if (!hrec) goto fail;
1953 if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1954 bcf_hrec_destroy(hrec);
1955 goto fail;
1956 }
1957 free(lines[i]);
1958 lines[i] = NULL;
1959 }
1960 if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
1961 if (bcf_hdr_sync(hdr) < 0) goto fail;
1962 free(lines[n-1]);
1963 free(lines);
1964 return 0;
1965
1966 fail:
1967 save_errno = errno;
1968 for (; i < n; i++)
1969 free(lines[i]);
1970 free(lines);
1971 errno = save_errno;
1972 return 1;
1973 }
1974
_bcf_hrec_format(const bcf_hrec_t * hrec,int is_bcf,kstring_t * str)1975 static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
1976 {
1977 uint32_t e = 0;
1978 if ( !hrec->value )
1979 {
1980 int j, nout = 0;
1981 e |= ksprintf(str, "##%s=<", hrec->key) < 0;
1982 for (j=0; j<hrec->nkeys; j++)
1983 {
1984 // do not output IDX if output is VCF
1985 if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
1986 if ( nout ) e |= kputc(',',str) < 0;
1987 e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
1988 nout++;
1989 }
1990 e |= ksprintf(str,">\n") < 0;
1991 }
1992 else
1993 e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
1994
1995 return e == 0 ? 0 : -1;
1996 }
1997
bcf_hrec_format(const bcf_hrec_t * hrec,kstring_t * str)1998 int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
1999 {
2000 return _bcf_hrec_format(hrec,0,str);
2001 }
2002
bcf_hdr_format(const bcf_hdr_t * hdr,int is_bcf,kstring_t * str)2003 int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2004 {
2005 int i, r = 0;
2006 for (i=0; i<hdr->nhrec; i++)
2007 r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2008
2009 r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2010 if ( bcf_hdr_nsamples(hdr) )
2011 {
2012 r |= ksprintf(str, "\tFORMAT") < 0;
2013 for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2014 r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2015 }
2016 r |= ksprintf(str, "\n") < 0;
2017
2018 return r ? -1 : 0;
2019 }
2020
bcf_hdr_fmt_text(const bcf_hdr_t * hdr,int is_bcf,int * len)2021 char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2022 {
2023 kstring_t txt = {0,0,0};
2024 if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2025 return NULL;
2026 if ( len ) *len = txt.l;
2027 return txt.s;
2028 }
2029
bcf_hdr_seqnames(const bcf_hdr_t * h,int * n)2030 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2031 {
2032 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2033 int tid, m = kh_size(d);
2034 const char **names = (const char**) calloc(m,sizeof(const char*));
2035 khint_t k;
2036 for (k=kh_begin(d); k<kh_end(d); k++)
2037 {
2038 if ( !kh_exist(d,k) ) continue;
2039 tid = kh_val(d,k).id;
2040 assert( tid<m );
2041 names[tid] = kh_key(d,k);
2042 }
2043 // sanity check: there should be no gaps
2044 for (tid=0; tid<m; tid++)
2045 assert(names[tid]);
2046 *n = m;
2047 return names;
2048 }
2049
vcf_hdr_write(htsFile * fp,const bcf_hdr_t * h)2050 int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2051 {
2052 kstring_t htxt = {0,0,0};
2053 if (bcf_hdr_format(h, 0, &htxt) < 0) {
2054 free(htxt.s);
2055 return -1;
2056 }
2057 while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2058 int ret;
2059 if ( fp->format.compression!=no_compression )
2060 ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2061 else
2062 ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2063 free(htxt.s);
2064 return ret<0 ? -1 : 0;
2065 }
2066
2067 /***********************
2068 *** Typed value I/O ***
2069 ***********************/
2070
bcf_enc_vint(kstring_t * s,int n,int32_t * a,int wsize)2071 int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2072 {
2073 int32_t max = INT32_MIN, min = INT32_MAX;
2074 int i;
2075 if (n <= 0) bcf_enc_size(s, 0, BCF_BT_NULL);
2076 else if (n == 1) bcf_enc_int1(s, a[0]);
2077 else {
2078 if (wsize <= 0) wsize = n;
2079 for (i = 0; i < n; ++i) {
2080 if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue;
2081 if (max < a[i]) max = a[i];
2082 if (min > a[i]) min = a[i];
2083 }
2084 if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2085 bcf_enc_size(s, wsize, BCF_BT_INT8);
2086 for (i = 0; i < n; ++i)
2087 if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s);
2088 else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s);
2089 else kputc(a[i], s);
2090 } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2091 uint8_t *p;
2092 bcf_enc_size(s, wsize, BCF_BT_INT16);
2093 ks_resize(s, s->l + n * sizeof(int16_t));
2094 p = (uint8_t *) s->s + s->l;
2095 for (i = 0; i < n; ++i)
2096 {
2097 int16_t x;
2098 if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2099 else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2100 else x = a[i];
2101 i16_to_le(x, p);
2102 p += sizeof(int16_t);
2103 }
2104 s->l += n * sizeof(int16_t);
2105 } else {
2106 uint8_t *p;
2107 bcf_enc_size(s, wsize, BCF_BT_INT32);
2108 ks_resize(s, s->l + n * sizeof(int32_t));
2109 p = (uint8_t *) s->s + s->l;
2110 for (i = 0; i < n; ++i) {
2111 i32_to_le(a[i], p);
2112 p += sizeof(int32_t);
2113 }
2114 s->l += n * sizeof(int32_t);
2115 }
2116 }
2117
2118 return 0; // FIXME: check for errs in this function
2119 }
2120
2121 #ifdef VCF_ALLOW_INT64
bcf_enc_long1(kstring_t * s,int64_t x)2122 static int bcf_enc_long1(kstring_t *s, int64_t x) {
2123 uint32_t e = 0;
2124 if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2125 return bcf_enc_int1(s, x);
2126 if (x == bcf_int64_vector_end) {
2127 e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2128 e |= kputc(bcf_int8_vector_end, s) < 0;
2129 } else if (x == bcf_int64_missing) {
2130 e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2131 e |= kputc(bcf_int8_missing, s) < 0;
2132 } else {
2133 e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2134 e |= ks_expand(s, 8);
2135 if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2136 }
2137 return e == 0 ? 0 : -1;
2138 }
2139 #endif
2140
serialize_float_array(kstring_t * s,size_t n,const float * a)2141 static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2142 uint8_t *p;
2143 size_t i;
2144 size_t bytes = n * sizeof(float);
2145
2146 if (bytes / sizeof(float) != n) return -1;
2147 if (ks_resize(s, s->l + bytes) < 0) return -1;
2148
2149 p = (uint8_t *) s->s + s->l;
2150 for (i = 0; i < n; i++) {
2151 float_to_le(a[i], p);
2152 p += sizeof(float);
2153 }
2154 s->l += bytes;
2155
2156 return 0;
2157 }
2158
bcf_enc_vfloat(kstring_t * s,int n,float * a)2159 int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2160 {
2161 assert(n >= 0);
2162 bcf_enc_size(s, n, BCF_BT_FLOAT);
2163 serialize_float_array(s, n, a);
2164 return 0; // FIXME: check for errs in this function
2165 }
2166
bcf_enc_vchar(kstring_t * s,int l,const char * a)2167 int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2168 {
2169 bcf_enc_size(s, l, BCF_BT_CHAR);
2170 kputsn(a, l, s);
2171 return 0; // FIXME: check for errs in this function
2172 }
2173
bcf_fmt_array(kstring_t * s,int n,int type,void * data)2174 int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
2175 {
2176 int j = 0;
2177 uint32_t e = 0;
2178 if (n == 0) {
2179 return kputc('.', s) >= 0 ? 0 : -1;
2180 }
2181 if (type == BCF_BT_CHAR)
2182 {
2183 char *p = (char*)data;
2184 for (j = 0; j < n && *p; ++j, ++p)
2185 {
2186 if ( *p==bcf_str_missing ) e |= kputc('.', s) < 0;
2187 else e |= kputc(*p, s) < 0;
2188 }
2189 }
2190 else
2191 {
2192 #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
2193 uint8_t *p = (uint8_t *) data; \
2194 for (j=0; j<n; j++, p += sizeof(type_t)) \
2195 { \
2196 type_t v = convert(p); \
2197 if ( is_vector_end ) break; \
2198 if ( j ) kputc(',', s); \
2199 if ( is_missing ) kputc('.', s); \
2200 else e |= kprint < 0; \
2201 } \
2202 }
2203 switch (type) {
2204 case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, v==bcf_int8_missing, v==bcf_int8_vector_end, kputw(v, s)); break;
2205 case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
2206 case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
2207 case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
2208 default: hts_log_error("Unexpected type %d", type); exit(1); break;
2209 }
2210 #undef BRANCH
2211 }
2212 return e == 0 ? 0 : -1;
2213 }
2214
bcf_fmt_sized_array(kstring_t * s,uint8_t * ptr)2215 uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
2216 {
2217 int x, type;
2218 x = bcf_dec_size(ptr, &ptr, &type);
2219 bcf_fmt_array(s, x, type, ptr);
2220 return ptr + (x << bcf_type_shift[type]);
2221 }
2222
2223 /********************
2224 *** VCF site I/O ***
2225 ********************/
2226
2227 typedef struct {
2228 int key, max_m, size, offset;
2229 uint32_t is_gt:1, max_g:31;
2230 uint32_t max_l;
2231 uint32_t y;
2232 uint8_t *buf;
2233 } fmt_aux_t;
2234
align_mem(kstring_t * s)2235 static inline int align_mem(kstring_t *s)
2236 {
2237 int e = 0;
2238 if (s->l&7) {
2239 uint64_t zero = 0;
2240 e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
2241 }
2242 return e == 0 ? 0 : -1;
2243 }
2244
2245 // p,q is the start and the end of the FORMAT field
2246 #define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */
vcf_parse_format(kstring_t * s,const bcf_hdr_t * h,bcf1_t * v,char * p,char * q)2247 static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
2248 {
2249 if ( !bcf_hdr_nsamples(h) ) return 0;
2250
2251 static int extreme_val_warned = 0;
2252 char *r, *t;
2253 int j, l, m, g, overflow = 0;
2254 khint_t k;
2255 ks_tokaux_t aux1;
2256 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2257 kstring_t *mem = (kstring_t*)&h->mem;
2258 fmt_aux_t fmt[MAX_N_FMT];
2259 mem->l = 0;
2260
2261 char *end = s->s + s->l;
2262 if ( q>=end )
2263 {
2264 hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
2265 v->errcode |= BCF_ERR_NCOLS;
2266 return -1;
2267 }
2268
2269 v->n_fmt = 0;
2270 if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
2271 {
2272 v->n_sample = bcf_hdr_nsamples(h);
2273 return 0;
2274 }
2275
2276 // get format information from the dictionary
2277 for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
2278 if (j >= MAX_N_FMT) {
2279 v->errcode |= BCF_ERR_LIMITS;
2280 hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
2281 bcf_seqname_safe(h,v), v->pos+1);
2282 return -1;
2283 }
2284
2285 *(char*)aux1.p = 0;
2286 k = kh_get(vdict, d, t);
2287 if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
2288 if ( t[0]=='.' && t[1]==0 )
2289 {
2290 hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2291 v->errcode |= BCF_ERR_TAG_INVALID;
2292 return -1;
2293 }
2294 hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
2295 kstring_t tmp = {0,0,0};
2296 int l;
2297 ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
2298 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2299 free(tmp.s);
2300 int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2301 if (res < 0) bcf_hrec_destroy(hrec);
2302 if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2303
2304 k = kh_get(vdict, d, t);
2305 v->errcode = BCF_ERR_TAG_UNDEF;
2306 if (res || k == kh_end(d)) {
2307 hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
2308 v->errcode |= BCF_ERR_TAG_INVALID;
2309 return -1;
2310 }
2311 }
2312 fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
2313 fmt[j].key = kh_val(d, k).id;
2314 fmt[j].is_gt = !strcmp(t, "GT");
2315 fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
2316 v->n_fmt++;
2317 }
2318 // compute max
2319 int n_sample_ori = -1;
2320 r = q + 1; // r: position in the format string
2321 l = 0, m = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles
2322 while ( r<end )
2323 {
2324 // can we skip some samples?
2325 if ( h->keep_samples )
2326 {
2327 n_sample_ori++;
2328 if ( !bit_array_test(h->keep_samples,n_sample_ori) )
2329 {
2330 while ( *r!='\t' && r<end ) r++;
2331 if ( *r=='\t' ) { *r = 0; r++; }
2332 continue;
2333 }
2334 }
2335
2336 // collect fmt stats: max vector size, length, number of alleles
2337 j = 0; // j-th format field
2338 fmt_aux_t *f = fmt;
2339 for (;;) {
2340 switch (*r) {
2341 case ',':
2342 m++;
2343 break;
2344
2345 case '|':
2346 case '/':
2347 if (f->is_gt) g++;
2348 break;
2349
2350 case '\t':
2351 *r = 0; // fall through
2352
2353 case '\0':
2354 case ':':
2355 if (f->max_m < m) f->max_m = m;
2356 if (f->max_l < l) f->max_l = l;
2357 if (f->is_gt && f->max_g < g) f->max_g = g;
2358 l = 0, m = g = 1;
2359 if ( *r==':' ) {
2360 j++; f++;
2361 if ( j>=v->n_fmt ) {
2362 hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
2363 h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
2364 v->errcode |= BCF_ERR_NCOLS;
2365 return -1;
2366 }
2367 } else goto end_for;
2368 break;
2369 }
2370 if ( r>=end ) break;
2371 r++; l++;
2372 }
2373 end_for:
2374 v->n_sample++;
2375 if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
2376 r++;
2377 }
2378
2379 // allocate memory for arrays
2380 for (j = 0; j < v->n_fmt; ++j) {
2381 fmt_aux_t *f = &fmt[j];
2382 if ( !f->max_m ) f->max_m = 1; // omitted trailing format field
2383 if ((f->y>>4&0xf) == BCF_HT_STR) {
2384 f->size = f->is_gt? f->max_g << 2 : f->max_l;
2385 } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
2386 f->size = f->max_m << 2;
2387 } else
2388 {
2389 hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2390 v->errcode |= BCF_ERR_TAG_INVALID;
2391 return -1;
2392 }
2393 if (align_mem(mem) < 0) {
2394 hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2395 v->errcode |= BCF_ERR_LIMITS;
2396 return -1;
2397 }
2398
2399 // Limit the total memory to ~2Gb per VCF row. This should mean
2400 // malformed VCF data is less likely to take excessive memory and/or
2401 // time.
2402 if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
2403 hts_log_error("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2404 v->errcode |= BCF_ERR_LIMITS;
2405 return -1;
2406 }
2407
2408 f->offset = mem->l;
2409 if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
2410 hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2411 v->errcode |= BCF_ERR_LIMITS;
2412 return -1;
2413 }
2414 mem->l += v->n_sample * f->size;
2415 }
2416 for (j = 0; j < v->n_fmt; ++j)
2417 fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
2418 // fill the sample fields; at beginning of the loop, t points to the first char of a format
2419 n_sample_ori = -1;
2420 t = q + 1; m = 0; // m: sample id
2421 while ( t<end )
2422 {
2423 // can we skip some samples?
2424 if ( h->keep_samples )
2425 {
2426 n_sample_ori++;
2427 if ( !bit_array_test(h->keep_samples,n_sample_ori) )
2428 {
2429 while ( *t && t<end ) t++;
2430 t++;
2431 continue;
2432 }
2433 }
2434 if ( m == bcf_hdr_nsamples(h) ) break;
2435
2436 j = 0; // j-th format field, m-th sample
2437 while ( t < end )
2438 {
2439 fmt_aux_t *z = &fmt[j++];
2440 if (!z->buf) {
2441 hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
2442 z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2443 v->errcode |= BCF_ERR_LIMITS;
2444 return -1;
2445 }
2446 if ((z->y>>4&0xf) == BCF_HT_STR) {
2447 if (z->is_gt) { // genotypes
2448 int32_t is_phased = 0;
2449 uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
2450 uint32_t unreadable = 0;
2451 uint32_t max = 0;
2452 overflow = 0;
2453 for (l = 0;; ++t) {
2454 if (*t == '.') {
2455 ++t, x[l++] = is_phased;
2456 } else {
2457 char *tt = t;
2458 uint32_t val = hts_str2uint(t, &t, sizeof(val) * CHAR_MAX - 2, &overflow);
2459 unreadable |= tt == t;
2460 if (max < val) max = val;
2461 x[l++] = (val + 1) << 1 | is_phased;
2462 }
2463 is_phased = (*t == '|');
2464 if (*t != '|' && *t != '/') break;
2465 }
2466 // Possibly check max against v->n_allele instead?
2467 if (overflow || max > (INT32_MAX >> 1) - 1) {
2468 hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2469 return -1;
2470 }
2471 if (unreadable) {
2472 hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2473 return -1;
2474 }
2475 if ( !l ) x[l++] = 0; // An empty field, insert missing value
2476 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2477 } else {
2478 char *x = (char*)z->buf + z->size * (size_t)m;
2479 for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
2480 for (; l < z->size; ++l) x[l] = 0;
2481 }
2482 } else if ((z->y>>4&0xf) == BCF_HT_INT) {
2483 int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
2484 for (l = 0;; ++t) {
2485 if (*t == '.') {
2486 x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
2487 } else {
2488 overflow = 0;
2489 char *te;
2490 long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
2491 if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
2492 {
2493 if ( !extreme_val_warned )
2494 {
2495 hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
2496 extreme_val_warned = 1;
2497 }
2498 tmp_val = bcf_int32_missing;
2499 }
2500 x[l++] = tmp_val;
2501 t = te;
2502 }
2503 if (*t != ',') break;
2504 }
2505 if ( !l ) x[l++] = bcf_int32_missing;
2506 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2507 } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
2508 float *x = (float*)(z->buf + z->size * (size_t)m);
2509 for (l = 0;; ++t) {
2510 if (*t == '.' && !isdigit_c(t[1])) {
2511 bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
2512 } else {
2513 overflow = 0;
2514 char *te;
2515 float tmp_val = hts_str2dbl(t, &te, &overflow);
2516 if ( (te==t || overflow) && !extreme_val_warned )
2517 {
2518 hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
2519 extreme_val_warned = 1;
2520 }
2521 x[l++] = tmp_val;
2522 t = te;
2523 }
2524 if (*t != ',') break;
2525 }
2526 if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value
2527 for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
2528 } else {
2529 hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2530 v->errcode |= BCF_ERR_TAG_INVALID;
2531 return -1;
2532 }
2533
2534 if (*t == '\0') {
2535 break;
2536 }
2537 else if (*t == ':') {
2538 t++;
2539 }
2540 else {
2541 char buffer[8];
2542 hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
2543 hts_strprint(buffer, sizeof buffer, '\'', t, 1),
2544 h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
2545 v->errcode |= BCF_ERR_CHAR;
2546 return -1;
2547 }
2548 }
2549
2550 for (; j < v->n_fmt; ++j) { // fill end-of-vector values
2551 fmt_aux_t *z = &fmt[j];
2552 if ((z->y>>4&0xf) == BCF_HT_STR) {
2553 if (z->is_gt) {
2554 int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
2555 if (z->size) x[0] = bcf_int32_missing;
2556 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2557 } else {
2558 char *x = (char*)z->buf + z->size * (size_t)m;
2559 if ( z->size ) x[0] = '.';
2560 for (l = 1; l < z->size; ++l) x[l] = 0;
2561 }
2562 } else if ((z->y>>4&0xf) == BCF_HT_INT) {
2563 int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
2564 x[0] = bcf_int32_missing;
2565 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2566 } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
2567 float *x = (float*)(z->buf + z->size * (size_t)m);
2568 bcf_float_set_missing(x[0]);
2569 for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
2570 }
2571 }
2572
2573 m++; t++;
2574 }
2575
2576 // write individual genotype information
2577 kstring_t *str = &v->indiv;
2578 int i;
2579 if (v->n_sample > 0) {
2580 for (i = 0; i < v->n_fmt; ++i) {
2581 fmt_aux_t *z = &fmt[i];
2582 bcf_enc_int1(str, z->key);
2583 if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
2584 bcf_enc_size(str, z->size, BCF_BT_CHAR);
2585 kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
2586 } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
2587 bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
2588 } else {
2589 bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
2590 if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
2591 (float *) z->buf) != 0) {
2592 v->errcode |= BCF_ERR_LIMITS;
2593 hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2594 return -1;
2595 }
2596 }
2597 }
2598 }
2599
2600 if ( v->n_sample!=bcf_hdr_nsamples(h) )
2601 {
2602 hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2603 bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2604 v->errcode |= BCF_ERR_NCOLS;
2605 return -1;
2606 }
2607 if ( v->indiv.l > 0xffffffff )
2608 {
2609 hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
2610 v->errcode |= BCF_ERR_LIMITS;
2611
2612 // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
2613 v->n_fmt = 0;
2614 return -1;
2615 }
2616
2617 return 0;
2618 }
2619
fix_chromosome(const bcf_hdr_t * h,vdict_t * d,const char * p)2620 static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
2621 // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
2622 // been already printed, but will enable tools like vcfcheck to proceed.
2623
2624 kstring_t tmp = {0,0,0};
2625 khint_t k;
2626 int l;
2627 if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
2628 return kh_end(d);
2629 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2630 free(tmp.s);
2631 int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2632 if (res < 0) bcf_hrec_destroy(hrec);
2633 if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2634 k = kh_get(vdict, d, p);
2635
2636 return k;
2637 }
2638
vcf_parse_filter(kstring_t * str,const bcf_hdr_t * h,bcf1_t * v,char * p,char * q)2639 static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
2640 int i, n_flt = 1, max_n_flt = 0;
2641 char *r, *t;
2642 int32_t *a_flt = NULL;
2643 ks_tokaux_t aux1;
2644 khint_t k;
2645 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2646 // count the number of filters
2647 if (*(q-1) == ';') *(q-1) = 0;
2648 for (r = p; *r; ++r)
2649 if (*r == ';') ++n_flt;
2650 if (n_flt > max_n_flt) {
2651 a_flt = malloc(n_flt * sizeof(*a_flt));
2652 if (!a_flt) {
2653 hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2654 v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
2655 return -1;
2656 }
2657 max_n_flt = n_flt;
2658 }
2659 // add filters
2660 for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
2661 *(char*)aux1.p = 0;
2662 k = kh_get(vdict, d, t);
2663 if (k == kh_end(d))
2664 {
2665 // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
2666 // been already printed, but will enable tools like vcfcheck to proceed.
2667 hts_log_warning("FILTER '%s' is not defined in the header", t);
2668 kstring_t tmp = {0,0,0};
2669 int l;
2670 ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
2671 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2672 free(tmp.s);
2673 int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2674 if (res < 0) bcf_hrec_destroy(hrec);
2675 if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2676 k = kh_get(vdict, d, t);
2677 v->errcode |= BCF_ERR_TAG_UNDEF;
2678 if (res || k == kh_end(d)) {
2679 hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
2680 v->errcode |= BCF_ERR_TAG_INVALID;
2681 free(a_flt);
2682 return -1;
2683 }
2684 }
2685 a_flt[i++] = kh_val(d, k).id;
2686 }
2687
2688 bcf_enc_vint(str, n_flt, a_flt, -1);
2689 free(a_flt);
2690
2691 return 0;
2692 }
2693
vcf_parse_info(kstring_t * str,const bcf_hdr_t * h,bcf1_t * v,char * p,char * q)2694 static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
2695 static int extreme_int_warned = 0, negative_rlen_warned = 0;
2696 int max_n_val = 0, overflow = 0;
2697 char *r, *key;
2698 khint_t k;
2699 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2700 int32_t *a_val = NULL;
2701
2702 v->n_info = 0;
2703 if (*(q-1) == ';') *(q-1) = 0;
2704 for (r = key = p;; ++r) {
2705 int c;
2706 char *val, *end;
2707 if (*r != ';' && *r != '=' && *r != 0) continue;
2708 if (v->n_info == UINT16_MAX) {
2709 hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
2710 bcf_seqname_safe(h,v), v->pos+1);
2711 v->errcode |= BCF_ERR_LIMITS;
2712 goto fail;
2713 }
2714 val = end = 0;
2715 c = *r; *r = 0;
2716 if (c == '=') {
2717 val = r + 1;
2718 for (end = val; *end != ';' && *end != 0; ++end);
2719 c = *end; *end = 0;
2720 } else end = r;
2721 if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO
2722 k = kh_get(vdict, d, key);
2723 if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
2724 {
2725 hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
2726 kstring_t tmp = {0,0,0};
2727 int l;
2728 ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
2729 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2730 free(tmp.s);
2731 int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2732 if (res < 0) bcf_hrec_destroy(hrec);
2733 if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2734 k = kh_get(vdict, d, key);
2735 v->errcode = BCF_ERR_TAG_UNDEF;
2736 if (res || k == kh_end(d)) {
2737 hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
2738 v->errcode |= BCF_ERR_TAG_INVALID;
2739 goto fail;
2740 }
2741 }
2742 uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
2743 ++v->n_info;
2744 bcf_enc_int1(str, kh_val(d, k).id);
2745 if (val == 0) {
2746 bcf_enc_size(str, 0, BCF_BT_NULL);
2747 } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
2748 bcf_enc_vchar(str, end - val, val);
2749 } else { // int/float value/array
2750 int i, n_val;
2751 char *t, *te;
2752 for (t = val, n_val = 1; *t; ++t) // count the number of values
2753 if (*t == ',') ++n_val;
2754 // Check both int and float size in one step for simplicity
2755 if (n_val > max_n_val) {
2756 int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
2757 if (!a_tmp) {
2758 hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2759 v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
2760 goto fail;
2761 }
2762 a_val = a_tmp;
2763 max_n_val = n_val;
2764 }
2765 if ((y>>4&0xf) == BCF_HT_INT) {
2766 i = 0, t = val;
2767 int64_t val1;
2768 int is_int64 = 0;
2769 #ifdef VCF_ALLOW_INT64
2770 if ( n_val==1 )
2771 {
2772 overflow = 0;
2773 long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
2774 if ( te==val ) tmp_val = bcf_int32_missing;
2775 else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
2776 {
2777 if ( !extreme_int_warned )
2778 {
2779 hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
2780 extreme_int_warned = 1;
2781 }
2782 tmp_val = bcf_int32_missing;
2783 }
2784 else
2785 is_int64 = 1;
2786 val1 = tmp_val;
2787 t = te;
2788 i = 1; // this is just to avoid adding another nested block...
2789 }
2790 #endif
2791 for (; i < n_val; ++i, ++t)
2792 {
2793 overflow = 0;
2794 long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
2795 if ( te==t ) tmp_val = bcf_int32_missing;
2796 else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
2797 {
2798 if ( !extreme_int_warned )
2799 {
2800 hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
2801 extreme_int_warned = 1;
2802 }
2803 tmp_val = bcf_int32_missing;
2804 }
2805 a_val[i] = tmp_val;
2806 for (t = te; *t && *t != ','; t++);
2807 }
2808 if (n_val == 1) {
2809 #ifdef VCF_ALLOW_INT64
2810 if ( is_int64 )
2811 {
2812 v->unpacked |= BCF_IS_64BIT;
2813 bcf_enc_long1(str, val1);
2814 }
2815 else
2816 bcf_enc_int1(str, (int32_t)val1);
2817 #else
2818 val1 = a_val[0];
2819 bcf_enc_int1(str, (int32_t)val1);
2820 #endif
2821 } else {
2822 bcf_enc_vint(str, n_val, a_val, -1);
2823 }
2824 if (n_val==1 && (val1!=bcf_int32_missing || is_int64) && strcmp(key, "END") == 0)
2825 {
2826 if ( val1 <= v->pos )
2827 {
2828 if ( !negative_rlen_warned )
2829 {
2830 hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
2831 negative_rlen_warned = 1;
2832 }
2833 }
2834 else
2835 v->rlen = val1 - v->pos;
2836 }
2837 } else if ((y>>4&0xf) == BCF_HT_REAL) {
2838 float *val_f = (float *)a_val;
2839 for (i = 0, t = val; i < n_val; ++i, ++t)
2840 {
2841 overflow = 0;
2842 val_f[i] = hts_str2dbl(t, &te, &overflow);
2843 if ( te==t || overflow ) // conversion failed
2844 bcf_float_set_missing(val_f[i]);
2845 for (t = te; *t && *t != ','; t++);
2846 }
2847 bcf_enc_vfloat(str, n_val, val_f);
2848 }
2849 }
2850 if (c == 0) break;
2851 r = end;
2852 key = r + 1;
2853 }
2854
2855 free(a_val);
2856 return 0;
2857
2858 fail:
2859 free(a_val);
2860 return -1;
2861 }
2862
vcf_parse(kstring_t * s,const bcf_hdr_t * h,bcf1_t * v)2863 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
2864 {
2865 int i = 0, ret = -2, overflow = 0;
2866 char *p, *q, *r, *t;
2867 kstring_t *str;
2868 khint_t k;
2869 ks_tokaux_t aux;
2870
2871 if (!s || !h || !v || !(s->s))
2872 return ret;
2873
2874 // Assumed in lots of places, but we may as well spot this early
2875 assert(sizeof(float) == sizeof(int32_t));
2876
2877 bcf_clear1(v);
2878 str = &v->shared;
2879 memset(&aux, 0, sizeof(ks_tokaux_t));
2880 for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) {
2881 q = (char*)aux.p;
2882 *q = 0;
2883 if (i == 0) { // CHROM
2884 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2885 k = kh_get(vdict, d, p);
2886 if (k == kh_end(d))
2887 {
2888 hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
2889 v->errcode = BCF_ERR_CTG_UNDEF;
2890 if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
2891 hts_log_error("Could not add dummy header for contig '%s'", p);
2892 v->errcode |= BCF_ERR_CTG_INVALID;
2893 goto err;
2894 }
2895 }
2896 v->rid = kh_val(d, k).id;
2897 } else if (i == 1) { // POS
2898 overflow = 0;
2899 v->pos = hts_str2uint(p, &p, 63, &overflow);
2900 if (overflow) {
2901 hts_log_error("Position value '%s' is too large", p);
2902 goto err;
2903 } else {
2904 v->pos -= 1;
2905 }
2906 if (v->pos >= INT32_MAX)
2907 v->unpacked |= BCF_IS_64BIT;
2908 } else if (i == 2) { // ID
2909 if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
2910 else bcf_enc_size(str, 0, BCF_BT_CHAR);
2911 } else if (i == 3) { // REF
2912 bcf_enc_vchar(str, q - p, p);
2913 v->n_allele = 1, v->rlen = q - p;
2914 } else if (i == 4) { // ALT
2915 if (strcmp(p, ".")) {
2916 for (r = t = p;; ++r) {
2917 if (*r == ',' || *r == 0) {
2918 if (v->n_allele == UINT16_MAX) {
2919 hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
2920 bcf_seqname_safe(h,v), v->pos+1);
2921 v->errcode |= BCF_ERR_LIMITS;
2922 goto err;
2923 }
2924 bcf_enc_vchar(str, r - t, t);
2925 t = r + 1;
2926 ++v->n_allele;
2927 }
2928 if (r == q) break;
2929 }
2930 }
2931 } else if (i == 5) { // QUAL
2932 if (strcmp(p, ".")) v->qual = atof(p);
2933 else bcf_float_set_missing(v->qual);
2934 if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
2935 } else if (i == 6) { // FILTER
2936 if (strcmp(p, ".")) {
2937 if (vcf_parse_filter(str, h, v, p, q)) goto err;
2938 } else bcf_enc_vint(str, 0, 0, -1);
2939 if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
2940 } else if (i == 7) { // INFO
2941 if (strcmp(p, ".")) {
2942 if (vcf_parse_info(str, h, v, p, q)) goto err;
2943 }
2944 if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
2945 } else if (i == 8) {// FORMAT
2946 return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
2947 }
2948 }
2949
2950 end:
2951 ret = 0;
2952
2953 err:
2954 return ret;
2955 }
2956
vcf_open_mode(char * mode,const char * fn,const char * format)2957 int vcf_open_mode(char *mode, const char *fn, const char *format)
2958 {
2959 if (format == NULL) {
2960 // Try to pick a format based on the filename extension
2961 char extension[HTS_MAX_EXT_LEN];
2962 if (find_file_extension(fn, extension) < 0) return -1;
2963 return vcf_open_mode(mode, fn, extension);
2964 }
2965 else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
2966 else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
2967 else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
2968 else return -1;
2969
2970 return 0;
2971 }
2972
vcf_read(htsFile * fp,const bcf_hdr_t * h,bcf1_t * v)2973 int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2974 {
2975 int ret;
2976 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
2977 if (ret < 0) return ret;
2978 return vcf_parse1(&fp->line, h, v);
2979 }
2980
bcf_unpack_fmt_core1(uint8_t * ptr,int n_sample,bcf_fmt_t * fmt)2981 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
2982 {
2983 uint8_t *ptr_start = ptr;
2984 fmt->id = bcf_dec_typed_int1(ptr, &ptr);
2985 fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
2986 fmt->size = fmt->n << bcf_type_shift[fmt->type];
2987 fmt->p = ptr;
2988 fmt->p_off = ptr - ptr_start;
2989 fmt->p_free = 0;
2990 ptr += n_sample * fmt->size;
2991 fmt->p_len = ptr - fmt->p;
2992 return ptr;
2993 }
2994
bcf_unpack_info_core1(uint8_t * ptr,bcf_info_t * info)2995 static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
2996 {
2997 uint8_t *ptr_start = ptr;
2998 info->key = bcf_dec_typed_int1(ptr, &ptr);
2999 info->len = bcf_dec_size(ptr, &ptr, &info->type);
3000 info->vptr = ptr;
3001 info->vptr_off = ptr - ptr_start;
3002 info->vptr_free = 0;
3003 info->v1.i = 0;
3004 if (info->len == 1) {
3005 if (info->type == BCF_BT_INT8 || info->type == BCF_BT_CHAR) info->v1.i = *(int8_t*)ptr;
3006 else if (info->type == BCF_BT_INT32) info->v1.i = le_to_i32(ptr);
3007 else if (info->type == BCF_BT_FLOAT) info->v1.f = le_to_float(ptr);
3008 else if (info->type == BCF_BT_INT16) info->v1.i = le_to_i16(ptr);
3009 else if (info->type == BCF_BT_INT64) info->v1.i = le_to_i64(ptr);
3010 }
3011 ptr += info->len << bcf_type_shift[info->type];
3012 info->vptr_len = ptr - info->vptr;
3013 return ptr;
3014 }
3015
bcf_unpack(bcf1_t * b,int which)3016 int bcf_unpack(bcf1_t *b, int which)
3017 {
3018 if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
3019 uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
3020 int i;
3021 bcf_dec_t *d = &b->d;
3022 if (which & BCF_UN_FLT) which |= BCF_UN_STR;
3023 if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
3024 if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
3025 {
3026 kstring_t tmp;
3027
3028 // ID
3029 tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
3030 ptr_ori = ptr;
3031 ptr = bcf_fmt_sized_array(&tmp, ptr);
3032 b->unpack_size[0] = ptr - ptr_ori;
3033 kputc('\0', &tmp);
3034 d->id = tmp.s; d->m_id = tmp.m;
3035
3036 // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
3037 hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
3038 tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
3039 ptr_ori = ptr;
3040 for (i = 0; i < b->n_allele; ++i) {
3041 // Use offset within tmp.s as realloc may change pointer
3042 d->allele[i] = (char *)(intptr_t)tmp.l;
3043 ptr = bcf_fmt_sized_array(&tmp, ptr);
3044 kputc('\0', &tmp);
3045 }
3046 b->unpack_size[1] = ptr - ptr_ori;
3047 d->als = tmp.s; d->m_als = tmp.m;
3048
3049 // Convert our offsets within tmp.s back to pointers again
3050 for (i = 0; i < b->n_allele; ++i)
3051 d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
3052 b->unpacked |= BCF_UN_STR;
3053 }
3054 if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
3055 ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
3056 ptr_ori = ptr;
3057 if (*ptr>>4) {
3058 int type;
3059 d->n_flt = bcf_dec_size(ptr, &ptr, &type);
3060 hts_expand(int, d->n_flt, d->m_flt, d->flt);
3061 for (i = 0; i < d->n_flt; ++i)
3062 d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
3063 } else ++ptr, d->n_flt = 0;
3064 b->unpack_size[2] = ptr - ptr_ori;
3065 b->unpacked |= BCF_UN_FLT;
3066 }
3067 if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
3068 ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
3069 hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
3070 for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
3071 for (i = 0; i < b->n_info; ++i)
3072 ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
3073 b->unpacked |= BCF_UN_INFO;
3074 }
3075 if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
3076 ptr = (uint8_t*)b->indiv.s;
3077 hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
3078 for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
3079 for (i = 0; i < b->n_fmt; ++i)
3080 ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
3081 b->unpacked |= BCF_UN_FMT;
3082 }
3083 return 0;
3084 }
3085
vcf_format(const bcf_hdr_t * h,const bcf1_t * v,kstring_t * s)3086 int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
3087 {
3088 int i;
3089 int32_t max_dt_id = h->n[BCF_DT_ID];
3090 const char *chrom = bcf_seqname(h, v);
3091 if (!chrom) {
3092 hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
3093 v->rid);
3094 errno = EINVAL;
3095 return -1;
3096 }
3097 bcf_unpack((bcf1_t*)v, BCF_UN_ALL);
3098 kputs(chrom, s); // CHROM
3099 kputc('\t', s); kputll(v->pos + 1, s); // POS
3100 kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
3101 kputc('\t', s); // REF
3102 if (v->n_allele > 0) kputs(v->d.allele[0], s);
3103 else kputc('.', s);
3104 kputc('\t', s); // ALT
3105 if (v->n_allele > 1) {
3106 for (i = 1; i < v->n_allele; ++i) {
3107 if (i > 1) kputc(',', s);
3108 kputs(v->d.allele[i], s);
3109 }
3110 } else kputc('.', s);
3111 kputc('\t', s); // QUAL
3112 if ( bcf_float_is_missing(v->qual) ) kputc('.', s); // QUAL
3113 else kputd(v->qual, s);
3114 kputc('\t', s); // FILTER
3115 if (v->d.n_flt) {
3116 for (i = 0; i < v->d.n_flt; ++i) {
3117 int32_t idx = v->d.flt[i];
3118 if (idx < 0 || idx >= max_dt_id
3119 || h->id[BCF_DT_ID][idx].key == NULL) {
3120 hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
3121 idx, bcf_seqname_safe(h, v), v->pos + 1);
3122 errno = EINVAL;
3123 return -1;
3124 }
3125 if (i) kputc(';', s);
3126 kputs(h->id[BCF_DT_ID][idx].key, s);
3127 }
3128 } else kputc('.', s);
3129 kputc('\t', s); // INFO
3130 if (v->n_info) {
3131 int first = 1;
3132 for (i = 0; i < v->n_info; ++i) {
3133 bcf_info_t *z = &v->d.info[i];
3134 if ( !z->vptr ) continue;
3135 if ( !first ) kputc(';', s);
3136 first = 0;
3137 if (z->key < 0 || z->key >= max_dt_id
3138 || h->id[BCF_DT_ID][z->key].key == NULL) {
3139 hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
3140 z->key,
3141 z->key < 0 ? "negative"
3142 : (z->key >= max_dt_id ? "too large" : "not present in the header"),
3143 bcf_seqname_safe(h, v), v->pos+1);
3144 errno = EINVAL;
3145 return -1;
3146 }
3147 kputs(h->id[BCF_DT_ID][z->key].key, s);
3148 if (z->len <= 0) continue;
3149 kputc('=', s);
3150 if (z->len == 1)
3151 {
3152 switch (z->type)
3153 {
3154 case BCF_BT_INT8: if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
3155 case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
3156 case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
3157 case BCF_BT_INT64: if ( z->v1.i==bcf_int64_missing ) kputc('.', s); else kputll(z->v1.i, s); break;
3158 case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else kputd(z->v1.f, s); break;
3159 case BCF_BT_CHAR: kputc(z->v1.i, s); break;
3160 default:
3161 hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
3162 errno = EINVAL;
3163 return -1;
3164 }
3165 }
3166 else bcf_fmt_array(s, z->len, z->type, z->vptr);
3167 }
3168 if ( first ) kputc('.', s);
3169 } else kputc('.', s);
3170 // FORMAT and individual information
3171 if (v->n_sample)
3172 {
3173 int i,j;
3174 if ( v->n_fmt)
3175 {
3176 int gt_i = -1;
3177 bcf_fmt_t *fmt = v->d.fmt;
3178 int first = 1;
3179 for (i = 0; i < (int)v->n_fmt; ++i) {
3180 if ( !fmt[i].p ) continue;
3181 kputc(!first ? ':' : '\t', s); first = 0;
3182 if (fmt[i].id < 0 || fmt[i].id >= max_dt_id
3183 || h->id[BCF_DT_ID][fmt[i].id].key == NULL) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
3184 {
3185 hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h, v), v->pos+1);
3186 errno = EINVAL;
3187 return -1;
3188 }
3189 kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
3190 if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
3191 }
3192 if ( first ) kputs("\t.", s);
3193 for (j = 0; j < v->n_sample; ++j) {
3194 kputc('\t', s);
3195 first = 1;
3196 for (i = 0; i < (int)v->n_fmt; ++i) {
3197 bcf_fmt_t *f = &fmt[i];
3198 if ( !f->p ) continue;
3199 if (!first) kputc(':', s);
3200 first = 0;
3201 if (gt_i == i)
3202 bcf_format_gt(f,j,s);
3203 else
3204 bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
3205 }
3206 if ( first ) kputc('.', s);
3207 }
3208 }
3209 else
3210 for (j=0; j<=v->n_sample; j++)
3211 kputs("\t.", s);
3212 }
3213 kputc('\n', s);
3214 return 0;
3215 }
3216
vcf_write_line(htsFile * fp,kstring_t * line)3217 int vcf_write_line(htsFile *fp, kstring_t *line)
3218 {
3219 int ret;
3220 if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
3221 if ( fp->format.compression!=no_compression )
3222 ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
3223 else
3224 ret = hwrite(fp->fp.hfile, line->s, line->l);
3225 return ret==line->l ? 0 : -1;
3226 }
3227
vcf_write(htsFile * fp,const bcf_hdr_t * h,bcf1_t * v)3228 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
3229 {
3230 int ret;
3231 fp->line.l = 0;
3232 if (vcf_format1(h, v, &fp->line) != 0)
3233 return -1;
3234 if ( fp->format.compression!=no_compression )
3235 ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
3236 else
3237 ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
3238
3239 if (fp->idx) {
3240 int tid;
3241 if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
3242 return -1;
3243
3244 if (hts_idx_push(fp->idx, tid, v->pos, v->pos + v->rlen, bgzf_tell(fp->fp.bgzf), 1) < 0)
3245 return -1;
3246 }
3247
3248 return ret==fp->line.l ? 0 : -1;
3249 }
3250
3251 /************************
3252 * Data access routines *
3253 ************************/
3254
bcf_hdr_id2int(const bcf_hdr_t * h,int which,const char * id)3255 int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
3256 {
3257 khint_t k;
3258 vdict_t *d = (vdict_t*)h->dict[which];
3259 k = kh_get(vdict, d, id);
3260 return k == kh_end(d)? -1 : kh_val(d, k).id;
3261 }
3262
3263
3264 /********************
3265 *** BCF indexing ***
3266 ********************/
3267
3268 // Calculate number of index levels given min_shift and the header contig
3269 // list. Also returns number of contigs in *nids_out.
idx_calc_n_lvls_ids(const bcf_hdr_t * h,int min_shift,int starting_n_lvls,int * nids_out)3270 static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
3271 int starting_n_lvls, int *nids_out)
3272 {
3273 int n_lvls, i, nids = 0;
3274 int64_t max_len = 0, s;
3275
3276 for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
3277 {
3278 if ( !h->id[BCF_DT_CTG][i].val ) continue;
3279 if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
3280 max_len = h->id[BCF_DT_CTG][i].val->info[0];
3281 nids++;
3282 }
3283 if ( !max_len ) max_len = (1LL<<31) - 1; // In case contig line is broken.
3284 max_len += 256;
3285 s = 1LL << (min_shift + starting_n_lvls * 3);
3286 for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
3287
3288 if (nids_out) *nids_out = nids;
3289 return n_lvls;
3290 }
3291
bcf_index(htsFile * fp,int min_shift)3292 hts_idx_t *bcf_index(htsFile *fp, int min_shift)
3293 {
3294 int n_lvls;
3295 bcf1_t *b = NULL;
3296 hts_idx_t *idx = NULL;
3297 bcf_hdr_t *h;
3298 int r;
3299 h = bcf_hdr_read(fp);
3300 if ( !h ) return NULL;
3301 int nids = 0;
3302 n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
3303 idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
3304 if (!idx) goto fail;
3305 b = bcf_init1();
3306 if (!b) goto fail;
3307 while ((r = bcf_read1(fp,h, b)) >= 0) {
3308 int ret;
3309 ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
3310 if (ret < 0) goto fail;
3311 }
3312 if (r < -1) goto fail;
3313 hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
3314 bcf_destroy1(b);
3315 bcf_hdr_destroy(h);
3316 return idx;
3317
3318 fail:
3319 hts_idx_destroy(idx);
3320 bcf_destroy1(b);
3321 bcf_hdr_destroy(h);
3322 return NULL;
3323 }
3324
bcf_index_load2(const char * fn,const char * fnidx)3325 hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
3326 {
3327 return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
3328 }
3329
bcf_index_load3(const char * fn,const char * fnidx,int flags)3330 hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
3331 {
3332 return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
3333 }
3334
bcf_index_build3(const char * fn,const char * fnidx,int min_shift,int n_threads)3335 int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
3336 {
3337 htsFile *fp;
3338 hts_idx_t *idx;
3339 tbx_t *tbx;
3340 int ret;
3341 if ((fp = hts_open(fn, "rb")) == 0) return -2;
3342 if (n_threads)
3343 hts_set_threads(fp, n_threads);
3344 if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
3345 switch (fp->format.format) {
3346 case bcf:
3347 if (!min_shift) {
3348 hts_log_error("TBI indices for BCF files are not supported");
3349 ret = -1;
3350 } else {
3351 idx = bcf_index(fp, min_shift);
3352 if (idx) {
3353 ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
3354 if (ret < 0) ret = -4;
3355 hts_idx_destroy(idx);
3356 }
3357 else ret = -1;
3358 }
3359 break;
3360
3361 case vcf:
3362 tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
3363 if (tbx) {
3364 ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
3365 if (ret < 0) ret = -4;
3366 tbx_destroy(tbx);
3367 }
3368 else ret = -1;
3369 break;
3370
3371 default:
3372 ret = -3;
3373 break;
3374 }
3375 hts_close(fp);
3376 return ret;
3377 }
3378
bcf_index_build2(const char * fn,const char * fnidx,int min_shift)3379 int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
3380 {
3381 return bcf_index_build3(fn, fnidx, min_shift, 0);
3382 }
3383
bcf_index_build(const char * fn,int min_shift)3384 int bcf_index_build(const char *fn, int min_shift)
3385 {
3386 return bcf_index_build3(fn, NULL, min_shift, 0);
3387 }
3388
3389 // Initialise fp->idx for the current format type.
3390 // This must be called after the header has been written but no other data.
vcf_idx_init(htsFile * fp,bcf_hdr_t * h,int min_shift,const char * fnidx)3391 static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
3392 int n_lvls, fmt;
3393
3394 if (min_shift == 0) {
3395 min_shift = 14;
3396 n_lvls = 5;
3397 fmt = HTS_FMT_TBI;
3398 } else {
3399 // Set initial n_lvls to match tbx_index()
3400 int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
3401 // Increase if necessary
3402 n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL);
3403 fmt = HTS_FMT_CSI;
3404 }
3405
3406 fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
3407 if (!fp->idx) return -1;
3408
3409 // Tabix meta data, added even in CSI for VCF
3410 uint8_t conf[4*7];
3411 u32_to_le(TBX_VCF, conf+0); // fmt
3412 u32_to_le(1, conf+4); // name col
3413 u32_to_le(2, conf+8); // beg col
3414 u32_to_le(0, conf+12); // end col
3415 u32_to_le('#', conf+16); // comment
3416 u32_to_le(0, conf+20); // n.skip
3417 u32_to_le(0, conf+24); // ref name len
3418 if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
3419 hts_idx_destroy(fp->idx);
3420 fp->idx = NULL;
3421 return -1;
3422 }
3423 fp->fnidx = fnidx;
3424
3425 return 0;
3426 }
3427
3428 // Initialise fp->idx for the current format type.
3429 // This must be called after the header has been written but no other data.
bcf_idx_init(htsFile * fp,bcf_hdr_t * h,int min_shift,const char * fnidx)3430 int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
3431 int n_lvls, nids = 0;
3432
3433 if (fp->format.format == vcf)
3434 return vcf_idx_init(fp, h, min_shift, fnidx);
3435
3436 if (!min_shift)
3437 min_shift = 14;
3438
3439 n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
3440
3441 fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
3442 if (!fp->idx) return -1;
3443 fp->fnidx = fnidx;
3444
3445 return 0;
3446 }
3447
3448 // Finishes an index. Call after the last record has been written.
3449 // Returns 0 on success, <0 on failure.
3450 //
3451 // NB: same format as SAM/BAM as it uses bgzf.
bcf_idx_save(htsFile * fp)3452 int bcf_idx_save(htsFile *fp) {
3453 return sam_idx_save(fp);
3454 }
3455
3456 /*****************
3457 *** Utilities ***
3458 *****************/
3459
bcf_hdr_combine(bcf_hdr_t * dst,const bcf_hdr_t * src)3460 int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
3461 {
3462 int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
3463 for (i=0; i<src->nhrec; i++)
3464 {
3465 if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
3466 {
3467 int j;
3468 for (j=0; j<ndst_ori; j++)
3469 {
3470 if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
3471
3472 // Checking only the key part of generic lines, otherwise
3473 // the VCFs are too verbose. Should we perhaps add a flag
3474 // to bcf_hdr_combine() and make this optional?
3475 if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
3476 }
3477 if ( j>=ndst_ori ) {
3478 res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3479 if (res < 0) return -1;
3480 need_sync += res;
3481 }
3482 }
3483 else if ( src->hrec[i]->type==BCF_HL_STR )
3484 {
3485 // NB: we are ignoring fields without ID
3486 int j = bcf_hrec_find_key(src->hrec[i],"ID");
3487 if ( j>=0 )
3488 {
3489 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
3490 if ( !rec ) {
3491 res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3492 if (res < 0) return -1;
3493 need_sync += res;
3494 }
3495 }
3496 }
3497 else
3498 {
3499 int j = bcf_hrec_find_key(src->hrec[i],"ID");
3500 assert( j>=0 ); // this should always be true for valid VCFs
3501
3502 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
3503 if ( !rec ) {
3504 res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3505 if (res < 0) return -1;
3506 need_sync += res;
3507 } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
3508 {
3509 // Check that both records are of the same type. The bcf_hdr_id2length
3510 // macro cannot be used here because dst header is not synced yet.
3511 vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
3512 vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
3513 khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
3514 khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
3515 if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
3516 {
3517 hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
3518 src->hrec[i]->vals[0]);
3519 ret |= 1;
3520 }
3521 if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
3522 {
3523 hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
3524 src->hrec[i]->vals[0]);
3525 ret |= 1;
3526 }
3527 }
3528 }
3529 }
3530 if ( need_sync ) {
3531 if (bcf_hdr_sync(dst) < 0) return -1;
3532 }
3533 return ret;
3534 }
3535
bcf_hdr_merge(bcf_hdr_t * dst,const bcf_hdr_t * src)3536 bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
3537 {
3538 if ( !dst )
3539 {
3540 // this will effectively strip existing IDX attributes from src to become dst
3541 dst = bcf_hdr_init("r");
3542 kstring_t htxt = {0,0,0};
3543 if (bcf_hdr_format(src, 0, &htxt) < 0) {
3544 free(htxt.s);
3545 return NULL;
3546 }
3547 if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
3548 bcf_hdr_destroy(dst);
3549 dst = NULL;
3550 }
3551 free(htxt.s);
3552 return dst;
3553 }
3554
3555 int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
3556 for (i=0; i<src->nhrec; i++)
3557 {
3558 if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
3559 {
3560 int j;
3561 for (j=0; j<ndst_ori; j++)
3562 {
3563 if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
3564
3565 // Checking only the key part of generic lines, otherwise
3566 // the VCFs are too verbose. Should we perhaps add a flag
3567 // to bcf_hdr_combine() and make this optional?
3568 if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
3569 }
3570 if ( j>=ndst_ori ) {
3571 res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3572 if (res < 0) return NULL;
3573 need_sync += res;
3574 }
3575 }
3576 else if ( src->hrec[i]->type==BCF_HL_STR )
3577 {
3578 // NB: we are ignoring fields without ID
3579 int j = bcf_hrec_find_key(src->hrec[i],"ID");
3580 if ( j>=0 )
3581 {
3582 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
3583 if ( !rec ) {
3584 res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3585 if (res < 0) return NULL;
3586 need_sync += res;
3587 }
3588 }
3589 }
3590 else
3591 {
3592 int j = bcf_hrec_find_key(src->hrec[i],"ID");
3593 assert( j>=0 ); // this should always be true for valid VCFs
3594
3595 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
3596 if ( !rec ) {
3597 res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3598 if (res < 0) return NULL;
3599 need_sync += res;
3600 } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
3601 {
3602 // Check that both records are of the same type. The bcf_hdr_id2length
3603 // macro cannot be used here because dst header is not synced yet.
3604 vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
3605 vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
3606 khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
3607 khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
3608 if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
3609 {
3610 hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
3611 src->hrec[i]->vals[0]);
3612 ret |= 1;
3613 }
3614 if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
3615 {
3616 hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
3617 src->hrec[i]->vals[0]);
3618 ret |= 1;
3619 }
3620 }
3621 }
3622 }
3623 if ( need_sync ) {
3624 if (bcf_hdr_sync(dst) < 0) return NULL;
3625 }
3626 return dst;
3627 }
3628
bcf_translate(const bcf_hdr_t * dst_hdr,bcf_hdr_t * src_hdr,bcf1_t * line)3629 int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
3630 {
3631 int i;
3632 if ( line->errcode )
3633 {
3634 hts_log_error("Unchecked error (%d) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_seqname_safe(src_hdr,line), line->pos+1);
3635 exit(1);
3636 }
3637 if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id
3638 if ( !src_hdr->ntransl ) // called for the first time, see what needs translating
3639 {
3640 int dict;
3641 for (dict=0; dict<2; dict++) // BCF_DT_ID and BCF_DT_CTG
3642 {
3643 src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
3644 for (i=0; i<src_hdr->n[dict]; i++)
3645 {
3646 if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
3647 {
3648 src_hdr->transl[dict][i] = -1;
3649 continue;
3650 }
3651 src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
3652 if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
3653 }
3654 }
3655 if ( !src_hdr->ntransl )
3656 {
3657 free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
3658 free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
3659 src_hdr->ntransl = -1;
3660 }
3661 if ( src_hdr->ntransl==-1 ) return 0;
3662 }
3663 bcf_unpack(line,BCF_UN_ALL);
3664
3665 // CHROM
3666 if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
3667
3668 // FILTER
3669 for (i=0; i<line->d.n_flt; i++)
3670 {
3671 int src_id = line->d.flt[i];
3672 if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
3673 line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
3674 line->d.shared_dirty |= BCF1_DIRTY_FLT;
3675 }
3676
3677 // INFO
3678 for (i=0; i<line->n_info; i++)
3679 {
3680 int src_id = line->d.info[i].key;
3681 int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
3682 if ( dst_id<0 ) continue;
3683 line->d.info[i].key = dst_id;
3684 if ( !line->d.info[i].vptr ) continue; // skip deleted
3685 int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
3686 int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
3687 if ( src_size==dst_size ) // can overwrite
3688 {
3689 uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
3690 if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
3691 else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
3692 else { *(uint32_t*)vptr = (uint32_t)dst_id; }
3693 }
3694 else // must realloc
3695 {
3696 bcf_info_t *info = &line->d.info[i];
3697 kstring_t str = {0,0,0};
3698 bcf_enc_int1(&str, dst_id);
3699 bcf_enc_size(&str, info->len,info->type);
3700 uint32_t vptr_off = str.l;
3701 kputsn((char*)info->vptr, info->vptr_len, &str);
3702 if( info->vptr_free ) free(info->vptr - info->vptr_off);
3703 info->vptr_off = vptr_off;
3704 info->vptr = (uint8_t*)str.s + info->vptr_off;
3705 info->vptr_free = 1;
3706 line->d.shared_dirty |= BCF1_DIRTY_INF;
3707 }
3708 }
3709
3710 // FORMAT
3711 for (i=0; i<line->n_fmt; i++)
3712 {
3713 int src_id = line->d.fmt[i].id;
3714 int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
3715 if ( dst_id<0 ) continue;
3716 line->d.fmt[i].id = dst_id;
3717 if( !line->d.fmt[i].p ) continue; // skip deleted
3718 int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
3719 int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
3720 if ( src_size==dst_size ) // can overwrite
3721 {
3722 uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off; // pointer to the vector size (4bits) and BT type (4bits)
3723 if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
3724 else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
3725 else { i32_to_le(dst_id, p + 1); }
3726 }
3727 else // must realloc
3728 {
3729 bcf_fmt_t *fmt = &line->d.fmt[i];
3730 kstring_t str = {0,0,0};
3731 bcf_enc_int1(&str, dst_id);
3732 bcf_enc_size(&str, fmt->n, fmt->type);
3733 uint32_t p_off = str.l;
3734 kputsn((char*)fmt->p, fmt->p_len, &str);
3735 if( fmt->p_free ) free(fmt->p - fmt->p_off);
3736 fmt->p_off = p_off;
3737 fmt->p = (uint8_t*)str.s + fmt->p_off;
3738 fmt->p_free = 1;
3739 line->d.indiv_dirty = 1;
3740 }
3741 }
3742 return 0;
3743 }
3744
bcf_hdr_dup(const bcf_hdr_t * hdr)3745 bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
3746 {
3747 bcf_hdr_t *hout = bcf_hdr_init("r");
3748 if (!hout) {
3749 hts_log_error("Failed to allocate bcf header");
3750 return NULL;
3751 }
3752 kstring_t htxt = {0,0,0};
3753 if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
3754 free(htxt.s);
3755 return NULL;
3756 }
3757 if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
3758 bcf_hdr_destroy(hout);
3759 hout = NULL;
3760 }
3761 free(htxt.s);
3762 return hout;
3763 }
3764
bcf_hdr_subset(const bcf_hdr_t * h0,int n,char * const * samples,int * imap)3765 bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
3766 {
3767 void *names_hash = khash_str2int_init();
3768 kstring_t htxt = {0,0,0};
3769 kstring_t str = {0,0,0};
3770 bcf_hdr_t *h = bcf_hdr_init("w");
3771 int r = 0;
3772 if (!h || !names_hash) {
3773 hts_log_error("Failed to allocate bcf header");
3774 goto err;
3775 }
3776 if (bcf_hdr_format(h0, 1, &htxt) < 0) {
3777 hts_log_error("Failed to get header text");
3778 goto err;
3779 }
3780 bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
3781 int j;
3782 for (j=0; j<n; j++) imap[j] = -1;
3783 if ( bcf_hdr_nsamples(h0) > 0) {
3784 char *p = find_chrom_header_line(htxt.s);
3785 int i = 0, end = n? 8 : 7;
3786 while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
3787 if (i != end) {
3788 hts_log_error("Wrong number of columns in header #CHROM line");
3789 goto err;
3790 }
3791 r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
3792 for (i = 0; i < n; ++i) {
3793 if ( khash_str2int_has_key(names_hash,samples[i]) )
3794 {
3795 hts_log_error("Duplicate sample name \"%s\"", samples[i]);
3796 goto err;
3797 }
3798 imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
3799 if (imap[i] < 0) continue;
3800 r |= kputc('\t', &str) < 0;
3801 r |= kputs(samples[i], &str) < 0;
3802 r |= khash_str2int_inc(names_hash,samples[i]) < 0;
3803 }
3804 } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
3805 while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
3806 r |= kputc('\n',&str) < 0;
3807 if (r) {
3808 hts_log_error("%s", strerror(errno));
3809 goto err;
3810 }
3811 if ( bcf_hdr_parse(h, str.s) < 0 ) {
3812 bcf_hdr_destroy(h);
3813 h = NULL;
3814 }
3815 free(str.s);
3816 free(htxt.s);
3817 khash_str2int_destroy(names_hash);
3818 return h;
3819
3820 err:
3821 ks_free(&str);
3822 ks_free(&htxt);
3823 khash_str2int_destroy(names_hash);
3824 bcf_hdr_destroy(h);
3825 return NULL;
3826 }
3827
bcf_hdr_set_samples(bcf_hdr_t * hdr,const char * samples,int is_file)3828 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
3829 {
3830 if ( samples && !strcmp("-",samples) ) return 0; // keep all samples
3831
3832 int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
3833 hdr->keep_samples = (uint8_t*) calloc(narr,1);
3834 if (!hdr->keep_samples) return -1;
3835
3836 hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
3837 if ( !samples )
3838 {
3839 // exclude all samples
3840 khint_t k;
3841 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
3842 new_dict = kh_init(vdict);
3843 if (!new_dict) return -1;
3844
3845 bcf_hdr_nsamples(hdr) = 0;
3846
3847 for (k = kh_begin(d); k != kh_end(d); ++k)
3848 if (kh_exist(d, k)) free((char*)kh_key(d, k));
3849 kh_destroy(vdict, d);
3850 hdr->dict[BCF_DT_SAMPLE] = new_dict;
3851 if (bcf_hdr_sync(hdr) < 0) return -1;
3852
3853 return 0;
3854 }
3855
3856 if ( samples[0]=='^' )
3857 for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
3858
3859 int idx, n, ret = 0;
3860 char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
3861 if ( !smpls ) return -1;
3862 for (i=0; i<n; i++)
3863 {
3864 idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
3865 if ( idx<0 )
3866 {
3867 if ( !ret ) ret = i+1;
3868 continue;
3869 }
3870 assert( idx<bcf_hdr_nsamples(hdr) );
3871 if ( samples[0]=='^' )
3872 bit_array_clear(hdr->keep_samples, idx);
3873 else
3874 bit_array_set(hdr->keep_samples, idx);
3875 }
3876 for (i=0; i<n; i++) free(smpls[i]);
3877 free(smpls);
3878
3879 bcf_hdr_nsamples(hdr) = 0;
3880 for (i=0; i<hdr->nsamples_ori; i++)
3881 if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
3882
3883 if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
3884 else
3885 {
3886 // Make new list and dictionary with desired samples
3887 char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
3888 vdict_t *new_dict, *d;
3889 int k, res;
3890 if (!samples) return -1;
3891
3892 new_dict = kh_init(vdict);
3893 if (!new_dict) {
3894 free(samples);
3895 return -1;
3896 }
3897 idx = 0;
3898 for (i=0; i<hdr->nsamples_ori; i++) {
3899 if ( bit_array_test(hdr->keep_samples,i) ) {
3900 samples[idx] = hdr->samples[i];
3901 k = kh_put(vdict, new_dict, hdr->samples[i], &res);
3902 if (res < 0) {
3903 free(samples);
3904 kh_destroy(vdict, new_dict);
3905 return -1;
3906 }
3907 kh_val(new_dict, k) = bcf_idinfo_def;
3908 kh_val(new_dict, k).id = idx;
3909 idx++;
3910 }
3911 }
3912
3913 // Delete desired samples from old dictionary, so we don't free them
3914 d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
3915 for (i=0; i < idx; i++) {
3916 int k = kh_get(vdict, d, samples[i]);
3917 if (k < kh_end(d)) kh_del(vdict, d, k);
3918 }
3919
3920 // Free everything else
3921 for (k = kh_begin(d); k != kh_end(d); ++k)
3922 if (kh_exist(d, k)) free((char*)kh_key(d, k));
3923 kh_destroy(vdict, d);
3924 hdr->dict[BCF_DT_SAMPLE] = new_dict;
3925
3926 free(hdr->samples);
3927 hdr->samples = samples;
3928
3929 if (bcf_hdr_sync(hdr) < 0)
3930 return -1;
3931 }
3932
3933 return ret;
3934 }
3935
bcf_subset(const bcf_hdr_t * h,bcf1_t * v,int n,int * imap)3936 int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
3937 {
3938 kstring_t ind;
3939 ind.s = 0; ind.l = ind.m = 0;
3940 if (n) {
3941 bcf_fmt_t fmt[MAX_N_FMT];
3942 int i, j;
3943 uint8_t *ptr = (uint8_t*)v->indiv.s;
3944 for (i = 0; i < v->n_fmt; ++i)
3945 ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
3946 for (i = 0; i < (int)v->n_fmt; ++i) {
3947 bcf_fmt_t *f = &fmt[i];
3948 bcf_enc_int1(&ind, f->id);
3949 bcf_enc_size(&ind, f->n, f->type);
3950 for (j = 0; j < n; ++j)
3951 if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
3952 }
3953 for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
3954 v->n_sample = i;
3955 } else v->n_sample = 0;
3956 if ( !v->n_sample ) v->n_fmt = 0;
3957 free(v->indiv.s);
3958 v->indiv = ind;
3959 v->unpacked &= ~BCF_UN_FMT; // only BCF is ready for output, VCF will need to unpack again
3960 return 0;
3961 }
3962
bcf_is_snp(bcf1_t * v)3963 int bcf_is_snp(bcf1_t *v)
3964 {
3965 int i;
3966 bcf_unpack(v, BCF_UN_STR);
3967 for (i = 0; i < v->n_allele; ++i)
3968 {
3969 if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
3970
3971 // mpileup's <X> allele, see also below. This is not completely satisfactory,
3972 // a general library is here narrowly tailored to fit samtools.
3973 if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
3974 if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
3975
3976 break;
3977 }
3978 return i == v->n_allele;
3979 }
3980
bcf_set_variant_type(const char * ref,const char * alt,bcf_variant_t * var)3981 static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
3982 {
3983 if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; } // overlapping variant
3984
3985 // The most frequent case
3986 if ( !ref[1] && !alt[1] )
3987 {
3988 if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
3989 if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
3990 var->n = 1; var->type = VCF_SNP; return;
3991 }
3992 if ( alt[0]=='<' )
3993 {
3994 if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
3995 if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
3996 if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
3997 var->type = VCF_OTHER;
3998 return;
3999 }
4000
4001 const char *r = ref, *a = alt;
4002 while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; } // unfortunately, matching REF,ALT case is not guaranteed
4003
4004 if ( *a && !*r )
4005 {
4006 if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; }
4007 while ( *a ) a++;
4008 var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
4009 }
4010 else if ( *r && !*a )
4011 {
4012 while ( *r ) r++;
4013 var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
4014 }
4015 else if ( !*r && !*a )
4016 {
4017 var->n = 0; var->type = VCF_REF; return;
4018 }
4019
4020 const char *re = r, *ae = a;
4021 while ( re[1] ) re++;
4022 while ( ae[1] ) ae++;
4023 while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
4024 if ( ae==a )
4025 {
4026 if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
4027 var->n = -(re-r);
4028 if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; }
4029 var->type = VCF_OTHER; return;
4030 }
4031 else if ( re==r )
4032 {
4033 var->n = ae-a;
4034 if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; }
4035 var->type = VCF_OTHER; return;
4036 }
4037
4038 var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
4039 var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
4040
4041 // should do also complex events, SVs, etc...
4042 }
4043
bcf_set_variant_types(bcf1_t * b)4044 static int bcf_set_variant_types(bcf1_t *b)
4045 {
4046 if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
4047 bcf_dec_t *d = &b->d;
4048 if ( d->n_var < b->n_allele )
4049 {
4050 d->var = (bcf_variant_t *) realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
4051 d->n_var = b->n_allele;
4052 }
4053 int i;
4054 b->d.var_type = 0;
4055 d->var[0].type = VCF_REF;
4056 d->var[0].n = 0;
4057 for (i=1; i<b->n_allele; i++)
4058 {
4059 bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
4060 b->d.var_type |= d->var[i].type;
4061 //fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
4062 }
4063 return 0;
4064 }
4065
bcf_get_variant_types(bcf1_t * rec)4066 int bcf_get_variant_types(bcf1_t *rec)
4067 {
4068 if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
4069 return rec->d.var_type;
4070 }
bcf_get_variant_type(bcf1_t * rec,int ith_allele)4071 int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
4072 {
4073 if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
4074 return rec->d.var[ith_allele].type;
4075 }
4076
bcf_update_info(const bcf_hdr_t * hdr,bcf1_t * line,const char * key,const void * values,int n,int type)4077 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
4078 {
4079 static int negative_rlen_warned = 0;
4080 int is_end_tag;
4081
4082 // Is the field already present?
4083 int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
4084 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1; // No such INFO field in the header
4085 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
4086
4087 is_end_tag = strcmp(key, "END") == 0;
4088
4089 for (i=0; i<line->n_info; i++)
4090 if ( inf_id==line->d.info[i].key ) break;
4091 bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
4092
4093 if ( !n || (type==BCF_HT_STR && !values) )
4094 {
4095 if ( n==0 && is_end_tag )
4096 line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
4097 if ( inf )
4098 {
4099 // Mark the tag for removal, free existing memory if necessary
4100 if ( inf->vptr_free )
4101 {
4102 free(inf->vptr - inf->vptr_off);
4103 inf->vptr_free = 0;
4104 }
4105 line->d.shared_dirty |= BCF1_DIRTY_INF;
4106 inf->vptr = NULL;
4107 inf->vptr_off = inf->vptr_len = 0;
4108 }
4109 return 0;
4110 }
4111
4112 if (is_end_tag)
4113 {
4114 if (n != 1)
4115 {
4116 hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
4117 line->errcode |= BCF_ERR_TAG_INVALID;
4118 return -1;
4119 }
4120 if (type != BCF_HT_INT && type != BCF_HT_LONG)
4121 {
4122 hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
4123 line->errcode |= BCF_ERR_TAG_INVALID;
4124 return -1;
4125 }
4126 }
4127
4128 // Encode the values and determine the size required to accommodate the values
4129 kstring_t str = {0,0,0};
4130 bcf_enc_int1(&str, inf_id);
4131 if ( type==BCF_HT_INT )
4132 bcf_enc_vint(&str, n, (int32_t*)values, -1);
4133 else if ( type==BCF_HT_REAL )
4134 bcf_enc_vfloat(&str, n, (float*)values);
4135 else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
4136 {
4137 if ( values==NULL )
4138 bcf_enc_size(&str, 0, BCF_BT_NULL);
4139 else
4140 bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
4141 }
4142 #ifdef VCF_ALLOW_INT64
4143 else if ( type==BCF_HT_LONG )
4144 {
4145 if (n != 1) {
4146 hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
4147 abort();
4148 }
4149 bcf_enc_long1(&str, *(int64_t *) values);
4150 }
4151 #endif
4152 else
4153 {
4154 hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
4155 abort();
4156 }
4157
4158 // Is the INFO tag already present
4159 if ( inf )
4160 {
4161 // Is it big enough to accommodate new block?
4162 if ( str.l <= inf->vptr_len + inf->vptr_off )
4163 {
4164 if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
4165 uint8_t *ptr = inf->vptr - inf->vptr_off;
4166 memcpy(ptr, str.s, str.l);
4167 free(str.s);
4168 int vptr_free = inf->vptr_free;
4169 bcf_unpack_info_core1(ptr, inf);
4170 inf->vptr_free = vptr_free;
4171 }
4172 else
4173 {
4174 if ( inf->vptr_free )
4175 free(inf->vptr - inf->vptr_off);
4176 bcf_unpack_info_core1((uint8_t*)str.s, inf);
4177 inf->vptr_free = 1;
4178 line->d.shared_dirty |= BCF1_DIRTY_INF;
4179 }
4180 }
4181 else
4182 {
4183 // The tag is not present, create new one
4184 line->n_info++;
4185 hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
4186 inf = &line->d.info[line->n_info-1];
4187 bcf_unpack_info_core1((uint8_t*)str.s, inf);
4188 inf->vptr_free = 1;
4189 line->d.shared_dirty |= BCF1_DIRTY_INF;
4190 }
4191 line->unpacked |= BCF_UN_INFO;
4192
4193 if ( n==1 && is_end_tag) {
4194 hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
4195 if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
4196 {
4197 if ( end <= line->pos )
4198 {
4199 if ( !negative_rlen_warned )
4200 {
4201 hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
4202 negative_rlen_warned = 1;
4203 }
4204 line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
4205 }
4206 else
4207 line->rlen = end - line->pos;
4208 }
4209 }
4210 return 0;
4211 }
4212
bcf_update_format_string(const bcf_hdr_t * hdr,bcf1_t * line,const char * key,const char ** values,int n)4213 int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
4214 {
4215 if ( !n )
4216 return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
4217
4218 int i, max_len = 0;
4219 for (i=0; i<n; i++)
4220 {
4221 int len = strlen(values[i]);
4222 if ( len > max_len ) max_len = len;
4223 }
4224 char *out = (char*) malloc(max_len*n);
4225 if ( !out ) return -2;
4226 for (i=0; i<n; i++)
4227 {
4228 char *dst = out+i*max_len;
4229 const char *src = values[i];
4230 int j = 0;
4231 while ( src[j] ) { dst[j] = src[j]; j++; }
4232 for (; j<max_len; j++) dst[j] = 0;
4233 }
4234 int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
4235 free(out);
4236 return ret;
4237 }
4238
bcf_update_format(const bcf_hdr_t * hdr,bcf1_t * line,const char * key,const void * values,int n,int type)4239 int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
4240 {
4241 // Is the field already present?
4242 int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
4243 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
4244 {
4245 if ( !n ) return 0;
4246 return -1; // the key not present in the header
4247 }
4248
4249 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
4250
4251 for (i=0; i<line->n_fmt; i++)
4252 if ( line->d.fmt[i].id==fmt_id ) break;
4253 bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
4254
4255 if ( !n )
4256 {
4257 if ( fmt )
4258 {
4259 // Mark the tag for removal, free existing memory if necessary
4260 if ( fmt->p_free )
4261 {
4262 free(fmt->p - fmt->p_off);
4263 fmt->p_free = 0;
4264 }
4265 line->d.indiv_dirty = 1;
4266 fmt->p = NULL;
4267 }
4268 return 0;
4269 }
4270
4271 line->n_sample = bcf_hdr_nsamples(hdr);
4272 int nps = n / line->n_sample; // number of values per sample
4273 assert( nps && nps*line->n_sample==n ); // must be divisible by n_sample
4274
4275 // Encode the values and determine the size required to accommodate the values
4276 kstring_t str = {0,0,0};
4277 bcf_enc_int1(&str, fmt_id);
4278 assert(values != NULL);
4279 if ( type==BCF_HT_INT )
4280 bcf_enc_vint(&str, n, (int32_t*)values, nps);
4281 else if ( type==BCF_HT_REAL )
4282 {
4283 bcf_enc_size(&str, nps, BCF_BT_FLOAT);
4284 serialize_float_array(&str, nps*line->n_sample, (float *) values);
4285 }
4286 else if ( type==BCF_HT_STR )
4287 {
4288 bcf_enc_size(&str, nps, BCF_BT_CHAR);
4289 kputsn((char*)values, nps*line->n_sample, &str);
4290 }
4291 else
4292 {
4293 hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
4294 abort();
4295 }
4296
4297 if ( !fmt )
4298 {
4299 // Not present, new format field
4300 line->n_fmt++;
4301 hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
4302
4303 // Special case: VCF specification requires that GT is always first
4304 if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
4305 {
4306 for (i=line->n_fmt-1; i>0; i--)
4307 line->d.fmt[i] = line->d.fmt[i-1];
4308 fmt = &line->d.fmt[0];
4309 }
4310 else
4311 fmt = &line->d.fmt[line->n_fmt-1];
4312 bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
4313 line->d.indiv_dirty = 1;
4314 fmt->p_free = 1;
4315 }
4316 else
4317 {
4318 // The tag is already present, check if it is big enough to accommodate the new block
4319 if ( str.l <= fmt->p_len + fmt->p_off )
4320 {
4321 // good, the block is big enough
4322 if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
4323 uint8_t *ptr = fmt->p - fmt->p_off;
4324 memcpy(ptr, str.s, str.l);
4325 free(str.s);
4326 int p_free = fmt->p_free;
4327 bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
4328 fmt->p_free = p_free;
4329 }
4330 else
4331 {
4332 if ( fmt->p_free )
4333 free(fmt->p - fmt->p_off);
4334 bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
4335 fmt->p_free = 1;
4336 line->d.indiv_dirty = 1;
4337 }
4338 }
4339 line->unpacked |= BCF_UN_FMT;
4340 return 0;
4341 }
4342
4343
bcf_update_filter(const bcf_hdr_t * hdr,bcf1_t * line,int * flt_ids,int n)4344 int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
4345 {
4346 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
4347 line->d.shared_dirty |= BCF1_DIRTY_FLT;
4348 line->d.n_flt = n;
4349 if ( !n ) return 0;
4350 hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
4351 int i;
4352 for (i=0; i<n; i++)
4353 line->d.flt[i] = flt_ids[i];
4354 return 0;
4355 }
4356
bcf_add_filter(const bcf_hdr_t * hdr,bcf1_t * line,int flt_id)4357 int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
4358 {
4359 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
4360 int i;
4361 for (i=0; i<line->d.n_flt; i++)
4362 if ( flt_id==line->d.flt[i] ) break;
4363 if ( i<line->d.n_flt ) return 0; // this filter is already set
4364 line->d.shared_dirty |= BCF1_DIRTY_FLT;
4365 if ( flt_id==0 ) // set to PASS
4366 line->d.n_flt = 1;
4367 else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
4368 line->d.n_flt = 1;
4369 else
4370 line->d.n_flt++;
4371 hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
4372 line->d.flt[line->d.n_flt-1] = flt_id;
4373 return 1;
4374 }
bcf_remove_filter(const bcf_hdr_t * hdr,bcf1_t * line,int flt_id,int pass)4375 int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
4376 {
4377 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
4378 int i;
4379 for (i=0; i<line->d.n_flt; i++)
4380 if ( flt_id==line->d.flt[i] ) break;
4381 if ( i==line->d.n_flt ) return 0; // the filter is not present
4382 line->d.shared_dirty |= BCF1_DIRTY_FLT;
4383 if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
4384 line->d.n_flt--;
4385 if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
4386 return 0;
4387 }
4388
bcf_has_filter(const bcf_hdr_t * hdr,bcf1_t * line,char * filter)4389 int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
4390 {
4391 if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
4392 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
4393 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1; // not defined in the header
4394
4395 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
4396 if ( id==0 && !line->d.n_flt) return 1; // PASS
4397
4398 int i;
4399 for (i=0; i<line->d.n_flt; i++)
4400 if ( line->d.flt[i]==id ) return 1;
4401 return 0;
4402 }
4403
_bcf1_sync_alleles(const bcf_hdr_t * hdr,bcf1_t * line,int nals)4404 static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
4405 {
4406 line->d.shared_dirty |= BCF1_DIRTY_ALS;
4407
4408 line->n_allele = nals;
4409 hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
4410
4411 char *als = line->d.als;
4412 int n = 0;
4413 while (n<nals)
4414 {
4415 line->d.allele[n] = als;
4416 while ( *als ) als++;
4417 als++;
4418 n++;
4419 }
4420
4421 // Update REF length. Note that END is 1-based while line->pos 0-based
4422 bcf_info_t *end_info = bcf_get_info(hdr,line,"END");
4423 if ( end_info )
4424 {
4425 if ( end_info->type==BCF_HT_INT && end_info->v1.i==bcf_int32_missing ) end_info = NULL;
4426 else if ( end_info->type==BCF_HT_LONG && end_info->v1.i==bcf_int64_missing ) end_info = NULL;
4427 }
4428 if ( end_info && end_info->v1.i > line->pos )
4429 line->rlen = end_info->v1.i - line->pos;
4430 else if ( nals > 0 )
4431 line->rlen = strlen(line->d.allele[0]);
4432 else
4433 line->rlen = 0;
4434
4435 return 0;
4436 }
bcf_update_alleles(const bcf_hdr_t * hdr,bcf1_t * line,const char ** alleles,int nals)4437 int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
4438 {
4439 if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
4440 kstring_t tmp = {0,0,0};
4441 char *free_old = NULL;
4442
4443 // If the supplied alleles are not pointers to line->d.als, the existing block can be reused.
4444 int i;
4445 for (i=0; i<nals; i++)
4446 if ( alleles[i]>=line->d.als && alleles[i]<line->d.als+line->d.m_als ) break;
4447 if ( i==nals )
4448 {
4449 // all alleles point elsewhere, reuse the existing block
4450 tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
4451 }
4452 else
4453 free_old = line->d.als;
4454
4455 for (i=0; i<nals; i++)
4456 {
4457 kputs(alleles[i], &tmp);
4458 kputc(0, &tmp);
4459 }
4460 line->d.als = tmp.s; line->d.m_als = tmp.m;
4461 free(free_old);
4462 return _bcf1_sync_alleles(hdr,line,nals);
4463 }
4464
bcf_update_alleles_str(const bcf_hdr_t * hdr,bcf1_t * line,const char * alleles_string)4465 int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
4466 {
4467 if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
4468 kstring_t tmp;
4469 tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
4470 kputs(alleles_string, &tmp);
4471 line->d.als = tmp.s; line->d.m_als = tmp.m;
4472
4473 int nals = 1;
4474 char *t = line->d.als;
4475 while (*t)
4476 {
4477 if ( *t==',' ) { *t = 0; nals++; }
4478 t++;
4479 }
4480 return _bcf1_sync_alleles(hdr, line, nals);
4481 }
4482
bcf_update_id(const bcf_hdr_t * hdr,bcf1_t * line,const char * id)4483 int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
4484 {
4485 if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
4486 kstring_t tmp;
4487 tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
4488 if ( id )
4489 kputs(id, &tmp);
4490 else
4491 kputs(".", &tmp);
4492 line->d.id = tmp.s; line->d.m_id = tmp.m;
4493 line->d.shared_dirty |= BCF1_DIRTY_ID;
4494 return 0;
4495 }
4496
bcf_add_id(const bcf_hdr_t * hdr,bcf1_t * line,const char * id)4497 int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
4498 {
4499 if ( !id ) return 0;
4500 if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
4501
4502 kstring_t tmp;
4503 tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
4504
4505 int len = strlen(id);
4506 char *dst = line->d.id;
4507 while ( *dst && (dst=strstr(dst,id)) )
4508 {
4509 if ( dst[len]!=0 && dst[len]!=';' ) dst++; // a prefix, not a match
4510 else if ( dst==line->d.id || dst[-1]==';' ) return 0; // already present
4511 dst++; // a suffix, not a match
4512 }
4513 if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
4514 {
4515 tmp.l = strlen(line->d.id);
4516 kputc(';',&tmp);
4517 }
4518 kputs(id,&tmp);
4519
4520 line->d.id = tmp.s; line->d.m_id = tmp.m;
4521 line->d.shared_dirty |= BCF1_DIRTY_ID;
4522 return 0;
4523
4524 }
4525
bcf_get_fmt(const bcf_hdr_t * hdr,bcf1_t * line,const char * key)4526 bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
4527 {
4528 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
4529 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL; // no such FMT field in the header
4530 return bcf_get_fmt_id(line, id);
4531 }
4532
bcf_get_info(const bcf_hdr_t * hdr,bcf1_t * line,const char * key)4533 bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
4534 {
4535 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
4536 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL; // no such INFO field in the header
4537 return bcf_get_info_id(line, id);
4538 }
4539
bcf_get_fmt_id(bcf1_t * line,const int id)4540 bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
4541 {
4542 int i;
4543 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
4544 for (i=0; i<line->n_fmt; i++)
4545 {
4546 if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
4547 }
4548 return NULL;
4549 }
4550
bcf_get_info_id(bcf1_t * line,const int id)4551 bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
4552 {
4553 int i;
4554 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
4555 for (i=0; i<line->n_info; i++)
4556 {
4557 if ( line->d.info[i].key==id ) return &line->d.info[i];
4558 }
4559 return NULL;
4560 }
4561
4562
bcf_get_info_values(const bcf_hdr_t * hdr,bcf1_t * line,const char * tag,void ** dst,int * ndst,int type)4563 int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
4564 {
4565 int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
4566 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header
4567 if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type
4568
4569 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
4570
4571 for (i=0; i<line->n_info; i++)
4572 if ( line->d.info[i].key==tag_id ) break;
4573 if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3; // the tag is not present in this record
4574 if ( type==BCF_HT_FLAG ) return 1;
4575
4576 bcf_info_t *info = &line->d.info[i];
4577 if ( !info->vptr ) return -3; // the tag was marked for removal
4578 if ( type==BCF_HT_STR )
4579 {
4580 if ( *ndst < info->len+1 )
4581 {
4582 *ndst = info->len + 1;
4583 *dst = realloc(*dst, *ndst);
4584 }
4585 memcpy(*dst,info->vptr,info->len);
4586 ((uint8_t*)*dst)[info->len] = 0;
4587 return info->len;
4588 }
4589
4590 // Make sure the buffer is big enough
4591 int size1;
4592 switch (type) {
4593 case BCF_HT_INT: size1 = sizeof(int32_t); break;
4594 case BCF_HT_LONG: size1 = sizeof(int64_t); break;
4595 case BCF_HT_REAL: size1 = sizeof(float); break;
4596 default:
4597 hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
4598 return -2;
4599 }
4600 if ( *ndst < info->len )
4601 {
4602 *ndst = info->len;
4603 *dst = realloc(*dst, *ndst * size1);
4604 }
4605
4606 #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
4607 out_type_t *tmp = (out_type_t *) *dst; \
4608 int j; \
4609 for (j=0; j<info->len; j++) \
4610 { \
4611 type_t p = convert(info->vptr + j * sizeof(type_t)); \
4612 if ( is_vector_end ) break; \
4613 if ( is_missing ) set_missing; \
4614 else set_regular; \
4615 tmp++; \
4616 } \
4617 ret = j; \
4618 } while (0)
4619 switch (info->type) {
4620 case BCF_BT_INT8:
4621 if (type == BCF_HT_LONG) {
4622 BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
4623 } else {
4624 BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
4625 }
4626 break;
4627 case BCF_BT_INT16:
4628 if (type == BCF_HT_LONG) {
4629 BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
4630 } else {
4631 BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
4632 }
4633 break;
4634 case BCF_BT_INT32:
4635 if (type == BCF_HT_LONG) {
4636 BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
4637 } else {
4638 BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
4639 }
4640 case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
4641 default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
4642 }
4643 #undef BRANCH
4644 return ret; // set by BRANCH
4645 }
4646
bcf_get_format_string(const bcf_hdr_t * hdr,bcf1_t * line,const char * tag,char *** dst,int * ndst)4647 int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
4648 {
4649 int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
4650 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
4651 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type
4652
4653 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
4654
4655 for (i=0; i<line->n_fmt; i++)
4656 if ( line->d.fmt[i].id==tag_id ) break;
4657 if ( i==line->n_fmt ) return -3; // the tag is not present in this record
4658 bcf_fmt_t *fmt = &line->d.fmt[i];
4659 if ( !fmt->p ) return -3; // the tag was marked for removal
4660
4661 int nsmpl = bcf_hdr_nsamples(hdr);
4662 if ( !*dst )
4663 {
4664 *dst = (char**) malloc(sizeof(char*)*nsmpl);
4665 if ( !*dst ) return -4; // could not alloc
4666 (*dst)[0] = NULL;
4667 }
4668 int n = (fmt->n+1)*nsmpl;
4669 if ( *ndst < n )
4670 {
4671 (*dst)[0] = realloc((*dst)[0], n);
4672 if ( !(*dst)[0] ) return -4; // could not alloc
4673 *ndst = n;
4674 }
4675 for (i=0; i<nsmpl; i++)
4676 {
4677 uint8_t *src = fmt->p + i*fmt->n;
4678 uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
4679 memcpy(tmp,src,fmt->n);
4680 tmp[fmt->n] = 0;
4681 (*dst)[i] = (char*) tmp;
4682 }
4683 return n;
4684 }
4685
bcf_get_format_values(const bcf_hdr_t * hdr,bcf1_t * line,const char * tag,void ** dst,int * ndst,int type)4686 int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
4687 {
4688 int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
4689 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
4690 if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
4691 {
4692 // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
4693 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
4694 }
4695 else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type
4696
4697 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
4698
4699 for (i=0; i<line->n_fmt; i++)
4700 if ( line->d.fmt[i].id==tag_id ) break;
4701 if ( i==line->n_fmt ) return -3; // the tag is not present in this record
4702 bcf_fmt_t *fmt = &line->d.fmt[i];
4703 if ( !fmt->p ) return -3; // the tag was marked for removal
4704
4705 if ( type==BCF_HT_STR )
4706 {
4707 int n = fmt->n*bcf_hdr_nsamples(hdr);
4708 if ( *ndst < n )
4709 {
4710 *dst = realloc(*dst, n);
4711 if ( !*dst ) return -4; // could not alloc
4712 *ndst = n;
4713 }
4714 memcpy(*dst,fmt->p,n);
4715 return n;
4716 }
4717
4718 // Make sure the buffer is big enough
4719 int nsmpl = bcf_hdr_nsamples(hdr);
4720 int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
4721 if ( *ndst < fmt->n*nsmpl )
4722 {
4723 *ndst = fmt->n*nsmpl;
4724 *dst = realloc(*dst, *ndst*size1);
4725 if ( !*dst ) return -4; // could not alloc
4726 }
4727
4728 #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
4729 out_type_t *tmp = (out_type_t *) *dst; \
4730 uint8_t *fmt_p = fmt->p; \
4731 for (i=0; i<nsmpl; i++) \
4732 { \
4733 for (j=0; j<fmt->n; j++) \
4734 { \
4735 type_t p = convert(fmt_p + j * sizeof(type_t)); \
4736 if ( is_missing ) set_missing; \
4737 else if ( is_vector_end ) { set_vector_end; break; } \
4738 else set_regular; \
4739 tmp++; \
4740 } \
4741 for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
4742 fmt_p += fmt->size; \
4743 } \
4744 }
4745 switch (fmt->type) {
4746 case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
4747 case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
4748 case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
4749 case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
4750 default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
4751 }
4752 #undef BRANCH
4753 return nsmpl*fmt->n;
4754 }
4755