1 /// @file htslib/vcf.h
2 /// High-level VCF/BCF variant calling file operations.
3 /*
4 Copyright (C) 2012, 2013 Broad Institute.
5 Copyright (C) 2012-2020 Genome Research Ltd.
6
7 Author: Heng Li <lh3@sanger.ac.uk>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE. */
26
27 /*
28 todo:
29 - make the function names consistent
30 - provide calls to abstract away structs as much as possible
31 */
32
33 #ifndef HTSLIB_VCF_H
34 #define HTSLIB_VCF_H
35
36 #include <stdint.h>
37 #include <limits.h>
38 #include <errno.h>
39 #include "hts.h"
40 #include "kstring.h"
41 #include "hts_defs.h"
42 #include "hts_endian.h"
43
44 /* Included only for backwards compatibility with e.g. bcftools 1.10 */
45 #include <assert.h>
46
47 #ifdef __cplusplus
48 extern "C" {
49 #endif
50
51 /*****************
52 * Header struct *
53 *****************/
54
55 #define BCF_HL_FLT 0 // header line
56 #define BCF_HL_INFO 1
57 #define BCF_HL_FMT 2
58 #define BCF_HL_CTG 3
59 #define BCF_HL_STR 4 // structured header line TAG=<A=..,B=..>
60 #define BCF_HL_GEN 5 // generic header line
61
62 #define BCF_HT_FLAG 0 // header type
63 #define BCF_HT_INT 1
64 #define BCF_HT_REAL 2
65 #define BCF_HT_STR 3
66 #define BCF_HT_LONG (BCF_HT_INT | 0x100) // BCF_HT_INT, but for int64_t values; VCF only!
67
68 #define BCF_VL_FIXED 0 // variable length
69 #define BCF_VL_VAR 1
70 #define BCF_VL_A 2
71 #define BCF_VL_G 3
72 #define BCF_VL_R 4
73
74 /* === Dictionary ===
75
76 The header keeps three dictionaries. The first keeps IDs in the
77 "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
78 in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
79 is the actual hash table, which is opaque to the end users. In the hash
80 table, the key is the ID or sample name as a C string and the value is a
81 bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
82 table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
83 size of the hash table or, equivalently, the length of the id[] arrays.
84 */
85
86 #define BCF_DT_ID 0 // dictionary type
87 #define BCF_DT_CTG 1
88 #define BCF_DT_SAMPLE 2
89
90 // Complete textual representation of a header line
91 typedef struct bcf_hrec_t {
92 int type; // One of the BCF_HL_* type
93 char *key; // The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
94 char *value; // Set only for generic lines, NULL for FILTER/INFO, etc.
95 int nkeys; // Number of structured fields
96 char **keys, **vals; // The key=value pairs
97 } bcf_hrec_t;
98
99 typedef struct bcf_idinfo_t {
100 uint64_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
101 // for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
102 bcf_hrec_t *hrec[3];
103 int id;
104 } bcf_idinfo_t;
105
106 typedef struct bcf_idpair_t {
107 const char *key;
108 const bcf_idinfo_t *val;
109 } bcf_idpair_t;
110
111 // Note that bcf_hdr_t structs must always be created via bcf_hdr_init()
112 typedef struct bcf_hdr_t {
113 int32_t n[3]; // n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI)
114 bcf_idpair_t *id[3];
115 void *dict[3]; // ID dictionary, contig dict and sample dict
116 char **samples;
117 bcf_hrec_t **hrec;
118 int nhrec, dirty;
119 int ntransl, *transl[2]; // for bcf_translate()
120 int nsamples_ori; // for bcf_hdr_set_samples()
121 uint8_t *keep_samples;
122 kstring_t mem;
123 int32_t m[3]; // m: allocated size of the dictionary block in use (see n above)
124 } bcf_hdr_t;
125
126 extern uint8_t bcf_type_shift[];
127
128 /**************
129 * VCF record *
130 **************/
131
132 #define BCF_BT_NULL 0
133 #define BCF_BT_INT8 1
134 #define BCF_BT_INT16 2
135 #define BCF_BT_INT32 3
136 #define BCF_BT_INT64 4 // Unofficial, for internal use only.
137 #define BCF_BT_FLOAT 5
138 #define BCF_BT_CHAR 7
139
140 #define VCF_REF 0
141 #define VCF_SNP 1
142 #define VCF_MNP 2
143 #define VCF_INDEL 4
144 #define VCF_OTHER 8
145 #define VCF_BND 16 // breakend
146 #define VCF_OVERLAP 32 // overlapping deletion, ALT=*
147
148 typedef struct bcf_variant_t {
149 int type, n; // variant type and the number of bases affected, negative for deletions
150 } bcf_variant_t;
151
152 typedef struct bcf_fmt_t {
153 int id; // id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
154 int n, size, type; // n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
155 uint8_t *p; // same as vptr and vptr_* in bcf_info_t below
156 uint32_t p_len;
157 uint32_t p_off:31, p_free:1;
158 } bcf_fmt_t;
159
160 typedef struct bcf_info_t {
161 int key; // key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
162 int type; // type: one of BCF_BT_* types
163 union {
164 int64_t i; // integer value
165 float f; // float value
166 } v1; // only set if $len==1; for easier access
167 uint8_t *vptr; // pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
168 uint32_t vptr_len; // length of the vptr block or, when set, of the vptr_mod block, excluding offset
169 uint32_t vptr_off:31, // vptr offset, i.e., the size of the INFO key plus size+type bytes
170 vptr_free:1; // indicates that vptr-vptr_off must be freed; set only when modified and the new
171 // data block is bigger than the original
172 int len; // vector length, 1 for scalars
173 } bcf_info_t;
174
175
176 #define BCF1_DIRTY_ID 1
177 #define BCF1_DIRTY_ALS 2
178 #define BCF1_DIRTY_FLT 4
179 #define BCF1_DIRTY_INF 8
180
181 typedef struct bcf_dec_t {
182 int m_fmt, m_info, m_id, m_als, m_allele, m_flt; // allocated size (high-water mark); do not change
183 int n_flt; // Number of FILTER fields
184 int *flt; // FILTER keys in the dictionary
185 char *id, *als; // ID and REF+ALT block (\0-separated)
186 char **allele; // allele[0] is the REF (allele[] pointers to the als block); all null terminated
187 bcf_info_t *info; // INFO
188 bcf_fmt_t *fmt; // FORMAT and individual sample
189 bcf_variant_t *var; // $var and $var_type set only when set_variant_types called
190 int n_var, var_type;
191 int shared_dirty; // if set, shared.s must be recreated on BCF output
192 int indiv_dirty; // if set, indiv.s must be recreated on BCF output
193 } bcf_dec_t;
194
195
196 #define BCF_ERR_CTG_UNDEF 1
197 #define BCF_ERR_TAG_UNDEF 2
198 #define BCF_ERR_NCOLS 4
199 #define BCF_ERR_LIMITS 8
200 #define BCF_ERR_CHAR 16
201 #define BCF_ERR_CTG_INVALID 32
202 #define BCF_ERR_TAG_INVALID 64
203
204 /*
205 The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file
206 is slower because the string is first to be parsed, packed into BCF line
207 (done in vcf_parse), then unpacked into internal bcf1_t structure. If it
208 is known in advance that some of the fields will not be required (notably
209 the sample columns), parsing of these can be skipped by setting max_unpack
210 appropriately.
211 Similarly, it is fast to output a BCF line because the columns (kept in
212 shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
213 line must be formatted in vcf_format.
214 */
215 typedef struct bcf1_t {
216 hts_pos_t pos; // POS
217 hts_pos_t rlen; // length of REF
218 int32_t rid; // CHROM
219 float qual; // QUAL
220 uint32_t n_info:16, n_allele:16;
221 uint32_t n_fmt:8, n_sample:24;
222 kstring_t shared, indiv;
223 bcf_dec_t d; // lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
224 int max_unpack; // Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
225 int unpacked; // remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
226 int unpack_size[3]; // the original block size of ID, REF+ALT and FILTER
227 int errcode; // one of BCF_ERR_* codes
228 } bcf1_t;
229
230 /*******
231 * API *
232 *******/
233
234 /***********************************************************************
235 * BCF and VCF I/O
236 *
237 * A note about naming conventions: htslib internally represents VCF
238 * records as bcf1_t data structures, therefore most functions are
239 * prefixed with bcf_. There are a few exceptions where the functions must
240 * be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
241 * these cases, functions prefixed with bcf_ are more general and work
242 * with both BCF and VCF.
243 *
244 ***********************************************************************/
245
246 /** These macros are defined only for consistency with other parts of htslib */
247 #define bcf_init1() bcf_init()
248 #define bcf_read1(fp,h,v) bcf_read((fp),(h),(v))
249 #define vcf_read1(fp,h,v) vcf_read((fp),(h),(v))
250 #define bcf_write1(fp,h,v) bcf_write((fp),(h),(v))
251 #define vcf_write1(fp,h,v) vcf_write((fp),(h),(v))
252 #define bcf_destroy1(v) bcf_destroy(v)
253 #define bcf_empty1(v) bcf_empty(v)
254 #define vcf_parse1(s,h,v) vcf_parse((s),(h),(v))
255 #define bcf_clear1(v) bcf_clear(v)
256 #define vcf_format1(h,v,s) vcf_format((h),(v),(s))
257
258 /**
259 * bcf_hdr_init() - create an empty BCF header.
260 * @param mode "r" or "w"
261 *
262 * When opened for writing, the mandatory fileFormat and
263 * FILTER=PASS lines are added automatically.
264 *
265 * The bcf_hdr_t struct returned by a successful call should be freed
266 * via bcf_hdr_destroy() when it is no longer needed.
267 */
268 HTSLIB_EXPORT
269 bcf_hdr_t *bcf_hdr_init(const char *mode);
270
271 /** Destroy a BCF header struct */
272 HTSLIB_EXPORT
273 void bcf_hdr_destroy(bcf_hdr_t *h);
274
275 /** Allocate and initialize a bcf1_t object.
276 *
277 * The bcf1_t struct returned by a successful call should be freed
278 * via bcf_destroy() when it is no longer needed.
279 */
280 HTSLIB_EXPORT
281 bcf1_t *bcf_init(void);
282
283 /** Deallocate a bcf1_t object */
284 HTSLIB_EXPORT
285 void bcf_destroy(bcf1_t *v);
286
287 /**
288 * Same as bcf_destroy() but frees only the memory allocated by bcf1_t,
289 * not the bcf1_t object itself.
290 */
291 HTSLIB_EXPORT
292 void bcf_empty(bcf1_t *v);
293
294 /**
295 * Make the bcf1_t object ready for next read. Intended mostly for
296 * internal use, the user should rarely need to call this function
297 * directly.
298 */
299 HTSLIB_EXPORT
300 void bcf_clear(bcf1_t *v);
301
302
303 /** bcf_open and vcf_open mode: please see hts_open() in hts.h */
304 typedef htsFile vcfFile;
305 #define bcf_open(fn, mode) hts_open((fn), (mode))
306 #define vcf_open(fn, mode) hts_open((fn), (mode))
307 #define bcf_flush(fp) hts_flush((fp))
308 #define bcf_close(fp) hts_close(fp)
309 #define vcf_close(fp) hts_close(fp)
310
311 /// Read a VCF or BCF header
312 /** @param fp The file to read the header from
313 @return Pointer to a populated header structure on success;
314 NULL on failure
315
316 The bcf_hdr_t struct returned by a successful call should be freed
317 via bcf_hdr_destroy() when it is no longer needed.
318 */
319 HTSLIB_EXPORT
320 bcf_hdr_t *bcf_hdr_read(htsFile *fp) HTS_RESULT_USED;
321
322 /**
323 * bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed
324 * @param samples samples to include or exclude from file or as a comma-separated string.
325 * LIST|FILE .. select samples in list/file
326 * ^LIST|FILE .. exclude samples from list/file
327 * - .. include all samples
328 * NULL .. exclude all samples
329 * @param is_file @p samples is a file (1) or a comma-separated list (0)
330 *
331 * The bottleneck of VCF reading is parsing of genotype fields. If the
332 * reader knows in advance that only subset of samples is needed (possibly
333 * no samples at all), the performance of bcf_read() can be significantly
334 * improved by calling bcf_hdr_set_samples after bcf_hdr_read().
335 * The function bcf_read() will subset the VCF/BCF records automatically
336 * with the notable exception when reading records via bcf_itr_next().
337 * In this case, bcf_subset_format() must be called explicitly, because
338 * bcf_readrec() does not see the header.
339 *
340 * Returns 0 on success, -1 on error or a positive integer if the list
341 * contains samples not present in the VCF header. In such a case, the
342 * return value is the index of the offending sample.
343 */
344 HTSLIB_EXPORT
345 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) HTS_RESULT_USED;
346
347 HTSLIB_EXPORT
348 int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec);
349
350 /// Write a VCF or BCF header
351 /** @param fp Output file
352 @param h The header to write
353 @return 0 on success; -1 on failure
354 */
355 HTSLIB_EXPORT
356 int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h) HTS_RESULT_USED;
357
358 /**
359 * Parse VCF line contained in kstring and populate the bcf1_t struct
360 * The line must not end with \n or \r characters.
361 */
362 HTSLIB_EXPORT
363 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v);
364
365 /**
366 * Complete the file opening mode, according to its extension.
367 * @param mode Preallocated mode string to be completed.
368 * @param fn File name to be opened.
369 * @param format Format string (vcf|bcf|vcf.gz)
370 * @return 0 on success; -1 on failure
371 */
372 HTSLIB_EXPORT
373 int vcf_open_mode(char *mode, const char *fn, const char *format);
374
375 /** The opposite of vcf_parse. It should rarely be called directly, see vcf_write */
376 HTSLIB_EXPORT
377 int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s);
378
379 /// Read next VCF or BCF record
380 /** @param fp The file to read the record from
381 @param h The header for the vcf/bcf file
382 @param v The bcf1_t structure to populate
383 @return 0 on success; -1 on end of file; < -1 on critical error
384
385 On errors which are not critical for reading, such as missing header
386 definitions in vcf files, zero will be returned but v->errcode will have been
387 set to one of BCF_ERR* codes and must be checked before calling bcf_write().
388 */
389 HTSLIB_EXPORT
390 int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) HTS_RESULT_USED;
391
392 /**
393 * bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field)
394 *
395 * Note that bcf_unpack() must be called even when reading VCF. It is safe
396 * to call the function repeatedly, it will not unpack the same field
397 * twice.
398 */
399 #define BCF_UN_STR 1 // up to ALT inclusive
400 #define BCF_UN_FLT 2 // up to FILTER
401 #define BCF_UN_INFO 4 // up to INFO
402 #define BCF_UN_SHR (BCF_UN_STR|BCF_UN_FLT|BCF_UN_INFO) // all shared information
403 #define BCF_UN_FMT 8 // unpack format and each sample
404 #define BCF_UN_IND BCF_UN_FMT // a synonym of BCF_UN_FMT
405 #define BCF_UN_ALL (BCF_UN_SHR|BCF_UN_FMT) // everything
406 HTSLIB_EXPORT
407 int bcf_unpack(bcf1_t *b, int which);
408
409 /*
410 * bcf_dup() - create a copy of BCF record.
411 *
412 * Note that bcf_unpack() must be called on the returned copy as if it was
413 * obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
414 * internally to reflect any changes made by bcf_update_* functions.
415 *
416 * The bcf1_t struct returned by a successful call should be freed
417 * via bcf_destroy() when it is no longer needed.
418 */
419 HTSLIB_EXPORT
420 bcf1_t *bcf_dup(bcf1_t *src);
421
422 HTSLIB_EXPORT
423 bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src);
424
425 /// Write one VCF or BCF record. The type is determined at the open() call.
426 /** @param fp The file to write to
427 @param h The header for the vcf/bcf file
428 @param v The bcf1_t structure to write
429 @return 0 on success; -1 on error
430 */
431 HTSLIB_EXPORT
432 int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v) HTS_RESULT_USED;
433
434 /**
435 * The following functions work only with VCFs and should rarely be called
436 * directly. Usually one wants to use their bcf_* alternatives, which work
437 * transparently with both VCFs and BCFs.
438 */
439 /// Read a VCF format header
440 /** @param fp The file to read the header from
441 @return Pointer to a populated header structure on success;
442 NULL on failure
443
444 Use bcf_hdr_read() instead.
445
446 The bcf_hdr_t struct returned by a successful call should be freed
447 via bcf_hdr_destroy() when it is no longer needed.
448 */
449 HTSLIB_EXPORT
450 bcf_hdr_t *vcf_hdr_read(htsFile *fp) HTS_RESULT_USED;
451
452 /// Write a VCF format header
453 /** @param fp Output file
454 @param h The header to write
455 @return 0 on success; -1 on failure
456
457 Use bcf_hdr_write() instead
458 */
459 HTSLIB_EXPORT
460 int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) HTS_RESULT_USED;
461
462 /// Read a record from a VCF file
463 /** @param fp The file to read the record from
464 @param h The header for the vcf file
465 @param v The bcf1_t structure to populate
466 @return 0 on success; -1 on end of file; < -1 on error
467
468 Use bcf_read() instead
469 */
470 HTSLIB_EXPORT
471 int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) HTS_RESULT_USED;
472
473 /// Write a record to a VCF file
474 /** @param fp The file to write to
475 @param h The header for the vcf file
476 @param v The bcf1_t structure to write
477 @return 0 on success; -1 on error
478
479 Use bcf_write() instead
480 */
481 HTSLIB_EXPORT
482 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) HTS_RESULT_USED;
483
484 /** Helper function for the bcf_itr_next() macro; internal use, ignore it */
485 HTSLIB_EXPORT
486 int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, hts_pos_t *beg, hts_pos_t *end);
487
488 /// Write a line to a VCF file
489 /** @param line Line to write
490 @param fp File to write it to
491 @return 0 on success; -1 on failure
492
493 @note No checks are done on the line being added, apart from
494 ensuring that it ends with a newline. This function
495 should therefore be used with care.
496 */
497 HTSLIB_EXPORT
498 int vcf_write_line(htsFile *fp, kstring_t *line);
499
500 /**************************************************************************
501 * Header querying and manipulation routines
502 **************************************************************************/
503
504 /** Create a new header using the supplied template
505 *
506 * The bcf_hdr_t struct returned by a successful call should be freed
507 * via bcf_hdr_destroy() when it is no longer needed.
508 * @return NULL on failure, header otherwise
509 */
510 HTSLIB_EXPORT
511 bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr);
512
513 /**
514 * Copy header lines from src to dst if not already present in dst. See also bcf_translate().
515 * Returns 0 on success or sets a bit on error:
516 * 1 .. conflicting definitions of tag length
517 * // todo
518 */
519 HTSLIB_EXPORT
520 int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) HTS_DEPRECATED("Please use bcf_hdr_merge instead");
521
522 /**
523 * bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate()
524 * @param dst: the destination header to be merged into, NULL on the first pass
525 * @param src: the source header
526 * @return NULL on failure, header otherwise
527 *
528 * Notes:
529 * - use as:
530 * bcf_hdr_t *dst = NULL;
531 * for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]);
532 *
533 * - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when
534 * combining multiple BCF headers. The current bcf_hdr_combine()
535 * does not have this problem, but became slow when used for many files.
536 */
537 HTSLIB_EXPORT
538 bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src);
539
540 /**
541 * bcf_hdr_add_sample() - add a new sample.
542 * @param sample: sample name to be added
543 *
544 * Note:
545 * After all samples have been added, the internal header structure must be updated
546 * by calling bcf_hdr_sync(). This is normally done automatically by the first bcf_hdr_write()
547 * or bcf_write() call. Otherwise, the caller must force the update by calling bcf_hdr_sync()
548 * explicitly.
549 */
550 HTSLIB_EXPORT
551 int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample);
552
553 /** Read VCF header from a file and update the header */
554 HTSLIB_EXPORT
555 int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname);
556
557 /// Appends formatted header text to _str_.
558 /** If _is_bcf_ is zero, `IDX` fields are discarded.
559 * @return 0 if successful, or negative if an error occurred
560 * @since 1.4
561 */
562 HTSLIB_EXPORT
563 int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str);
564
565 /** Returns formatted header (newly allocated string) and its length,
566 * excluding the terminating \0. If is_bcf parameter is unset, IDX
567 * fields are discarded.
568 * @deprecated Use bcf_hdr_format() instead as it can handle huge headers.
569 */
570 HTSLIB_EXPORT
571 char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
572 HTS_DEPRECATED("use bcf_hdr_format() instead");
573
574 /** Append new VCF header line, returns 0 on success */
575 HTSLIB_EXPORT
576 int bcf_hdr_append(bcf_hdr_t *h, const char *line);
577
578 HTSLIB_EXPORT
579 int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...);
580
581 /** VCF version, e.g. VCFv4.2 */
582 HTSLIB_EXPORT
583 const char *bcf_hdr_get_version(const bcf_hdr_t *hdr);
584
585 /// Set version in bcf header
586 /**
587 @param hdr BCF header struct
588 @param version Version to set, e.g. "VCFv4.3"
589 @return 0 on success; < 0 on error
590 */
591 HTSLIB_EXPORT
592 int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version);
593
594 /**
595 * bcf_hdr_remove() - remove VCF header tag
596 * @param type: one of BCF_HL_*
597 * @param key: tag name or NULL to remove all tags of the given type
598 */
599 HTSLIB_EXPORT
600 void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key);
601
602 /**
603 * bcf_hdr_subset() - creates a new copy of the header removing unwanted samples
604 * @param n: number of samples to keep
605 * @param samples: names of the samples to keep
606 * @param imap: mapping from index in @samples to the sample index in the original file
607 * @return NULL on failure, header otherwise
608 *
609 * Sample names not present in h0 are ignored. The number of unmatched samples can be checked
610 * by comparing n and bcf_hdr_nsamples(out_hdr).
611 * This function can be used to reorder samples.
612 * See also bcf_subset() which subsets individual records.
613 * The bcf_hdr_t struct returned by a successful call should be freed
614 * via bcf_hdr_destroy() when it is no longer needed.
615 */
616 HTSLIB_EXPORT
617 bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap);
618
619 /** Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names) */
620 HTSLIB_EXPORT
621 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs);
622
623 /** Get number of samples */
624 #define bcf_hdr_nsamples(hdr) (hdr)->n[BCF_DT_SAMPLE]
625
626
627 /** The following functions are for internal use and should rarely be called directly */
628 HTSLIB_EXPORT
629 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt);
630
631 /// Synchronize internal header structures
632 /** @param h Header
633 @return 0 on success, -1 on failure
634
635 This function updates the id, sample and contig arrays in the
636 bcf_hdr_t structure so that they point to the same locations as
637 the id, sample and contig dictionaries.
638 */
639 HTSLIB_EXPORT
640 int bcf_hdr_sync(bcf_hdr_t *h) HTS_RESULT_USED;
641
642 /**
643 * bcf_hdr_parse_line() - parse a single line of VCF textual header
644 * @param h BCF header struct
645 * @param line One or more lines of header text
646 * @param len Filled out with length data parsed from 'line'.
647 * @return bcf_hrec_t* on success;
648 * NULL on error or on end of header text.
649 * NB: to distinguish error from end-of-header, check *len:
650 * *len == 0 indicates @p line did not start with "##"
651 * *len == -1 indicates failure, likely due to out of memory
652 * *len > 0 indicates a malformed header line
653 *
654 * If *len > 0 on exit, it will contain the full length of the line
655 * including any trailing newline (this includes cases where NULL was
656 * returned due to a malformed line). Callers can use this to skip to
657 * the next header line.
658 */
659 HTSLIB_EXPORT
660 bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len);
661 /// Convert a bcf header record to string form
662 /**
663 * @param hrec Header record
664 * @param str Destination kstring
665 * @return 0 on success; < 0 on error
666 */
667 HTSLIB_EXPORT
668 int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str);
669
670 HTSLIB_EXPORT
671 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec);
672
673 /**
674 * bcf_hdr_get_hrec() - get header line info
675 * @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN
676 * @param key: the header key for generic lines (e.g. "fileformat"), any field
677 * for structured lines, typically "ID".
678 * @param value: the value which pairs with key. Can be be NULL for BCF_HL_GEN
679 * @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL
680 */
681 HTSLIB_EXPORT
682 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class);
683
684 /// Duplicate a header record
685 /** @param hrec Header record to copy
686 @return A new header record on success; NULL on failure
687
688 The bcf_hrec_t struct returned by a successful call should be freed
689 via bcf_hrec_destroy() when it is no longer needed.
690 */
691 HTSLIB_EXPORT
692 bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec);
693
694 /// Add a new header record key
695 /** @param hrec Header record
696 @param str Key name
697 @param len Length of @p str
698 @return 0 on success; -1 on failure
699 */
700 HTSLIB_EXPORT
701 int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len) HTS_RESULT_USED;
702
703 /// Set a header record value
704 /** @param hrec Header record
705 @param i Index of value
706 @param str Value to set
707 @param len Length of @p str
708 @param is_quoted Value should be quoted
709 @return 0 on success; -1 on failure
710 */
711 HTSLIB_EXPORT
712 int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted) HTS_RESULT_USED;
713
714 HTSLIB_EXPORT
715 int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key);
716
717
718 /// Add an IDX header record
719 /** @param hrec Header record
720 @param idx IDX value to add
721 @return 0 on success; -1 on failure
722 */
723 HTSLIB_EXPORT
724 int hrec_add_idx(bcf_hrec_t *hrec, int idx) HTS_RESULT_USED;
725
726 /// Free up a header record and associated structures
727 /** @param hrec Header record
728 */
729 HTSLIB_EXPORT
730 void bcf_hrec_destroy(bcf_hrec_t *hrec);
731
732
733
734 /**************************************************************************
735 * Individual record querying and manipulation routines
736 **************************************************************************/
737
738 /** See the description of bcf_hdr_subset() */
739 HTSLIB_EXPORT
740 int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap);
741
742 /**
743 * bcf_translate() - translate tags ids to be consistent with different header. This function
744 * is useful when lines from multiple VCF need to be combined.
745 * @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine()
746 * @src_hdr: the source header, used in bcf_read()
747 * @src_line: line obtained by bcf_read()
748 */
749 HTSLIB_EXPORT
750 int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line);
751
752 /**
753 * bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc
754 */
755 HTSLIB_EXPORT
756 int bcf_get_variant_types(bcf1_t *rec);
757
758 HTSLIB_EXPORT
759 int bcf_get_variant_type(bcf1_t *rec, int ith_allele);
760
761 HTSLIB_EXPORT
762 int bcf_is_snp(bcf1_t *v);
763
764 /**
765 * bcf_update_filter() - sets the FILTER column
766 * @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
767 * @n: Number of filters. If n==0, all filters are removed
768 */
769 HTSLIB_EXPORT
770 int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n);
771 /**
772 * bcf_add_filter() - adds to the FILTER column
773 * @flt_id: filter ID to add, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
774 *
775 * If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed.
776 */
777 HTSLIB_EXPORT
778 int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id);
779 /**
780 * bcf_remove_filter() - removes from the FILTER column
781 * @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
782 * @pass: when set to 1 and no filters are present, set to PASS
783 */
784 HTSLIB_EXPORT
785 int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass);
786 /**
787 * Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably.
788 */
789 HTSLIB_EXPORT
790 int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter);
791 /**
792 * bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT column
793 * @alleles: Array of alleles
794 * @nals: Number of alleles
795 * @alleles_string: Comma-separated alleles, starting with the REF allele
796 */
797 HTSLIB_EXPORT
798 int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals);
799
800 HTSLIB_EXPORT
801 int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string);
802
803 /**
804 * bcf_update_id() - sets new ID string
805 * bcf_add_id() - adds to the ID string checking for duplicates
806 */
807 HTSLIB_EXPORT
808 int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id);
809
810 HTSLIB_EXPORT
811 int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id);
812
813 /**
814 * bcf_update_info_*() - functions for updating INFO fields
815 * @param hdr: the BCF header
816 * @param line: VCF line to be edited
817 * @param key: the INFO tag to be updated
818 * @param values: pointer to the array of values. Pass NULL to remove the tag.
819 * @param n: number of values in the array. When set to 0, the INFO tag is removed
820 * @return 0 on success or negative value on error.
821 *
822 * The @p string in bcf_update_info_flag() is optional,
823 * @p n indicates whether the flag is set or removed.
824 *
825 * Note that updating an END info tag will cause line->rlen to be
826 * updated as a side-effect (removing the tag will set it to the
827 * string length of the REF allele). If line->pos is being changed as
828 * well, it is important that this is done before calling
829 * bcf_update_info_int32() to update the END tag, otherwise rlen will be
830 * set incorrectly. If the new END value is less than or equal to
831 * line->pos, a warning will be printed and line->rlen will be set to
832 * the length of the REF allele.
833 */
834 #define bcf_update_info_int32(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_INT)
835 #define bcf_update_info_float(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_REAL)
836 #define bcf_update_info_flag(hdr,line,key,string,n) bcf_update_info((hdr),(line),(key),(string),(n),BCF_HT_FLAG)
837 #define bcf_update_info_string(hdr,line,key,string) bcf_update_info((hdr),(line),(key),(string),1,BCF_HT_STR)
838 HTSLIB_EXPORT
839 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type);
840
841 /// Set or update 64-bit integer INFO values
842 /**
843 * @param hdr: the BCF header
844 * @param line: VCF line to be edited
845 * @param key: the INFO tag to be updated
846 * @param values: pointer to the array of values. Pass NULL to remove the tag.
847 * @param n: number of values in the array. When set to 0, the INFO tag is removed
848 * @return 0 on success or negative value on error.
849 *
850 * This function takes an int64_t values array as input. The data
851 * actually stored will be shrunk to the minimum size that can
852 * accept all of the values.
853 *
854 * INFO values outside of the range BCF_MIN_BT_INT32 to BCF_MAX_BT_INT32
855 * can only be written to VCF files.
856 */
bcf_update_info_int64(const bcf_hdr_t * hdr,bcf1_t * line,const char * key,const int64_t * values,int n)857 static inline int bcf_update_info_int64(const bcf_hdr_t *hdr, bcf1_t *line,
858 const char *key,
859 const int64_t *values, int n)
860 {
861 return bcf_update_info(hdr, line, key, values, n, BCF_HT_LONG);
862 }
863
864 /*
865 * bcf_update_format_*() - functions for updating FORMAT fields
866 * @values: pointer to the array of values, the same number of elements
867 * is expected for each sample. Missing values must be padded
868 * with bcf_*_missing or bcf_*_vector_end values.
869 * @n: number of values in the array. If n==0, existing tag is removed.
870 *
871 * The function bcf_update_format_string() is a higher-level (slower) variant of
872 * bcf_update_format_char(). The former accepts array of \0-terminated strings
873 * whereas the latter requires that the strings are collapsed into a single array
874 * of fixed-length strings. In case of strings with variable length, shorter strings
875 * can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
876 * are not \0-terminated.
877 *
878 * Returns 0 on success or negative value on error.
879 */
880 #define bcf_update_format_int32(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_INT)
881 #define bcf_update_format_float(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_REAL)
882 #define bcf_update_format_char(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_STR)
883 #define bcf_update_genotypes(hdr,line,gts,n) bcf_update_format((hdr),(line),"GT",(gts),(n),BCF_HT_INT) // See bcf_gt_ macros below
884
885 HTSLIB_EXPORT
886 int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n);
887
888 HTSLIB_EXPORT
889 int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type);
890
891 // Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds
892 // to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained
893 // from bcf_get_genotypes() below.
894 #define bcf_gt_phased(idx) (((idx)+1)<<1|1)
895 #define bcf_gt_unphased(idx) (((idx)+1)<<1)
896 #define bcf_gt_missing 0
897 #define bcf_gt_is_missing(val) ((val)>>1 ? 0 : 1)
898 #define bcf_gt_is_phased(idx) ((idx)&1)
899 #define bcf_gt_allele(val) (((val)>>1)-1)
900
901 /** Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based) */
902 #define bcf_alleles2gt(a,b) ((a)>(b)?((a)*((a)+1)/2+(b)):((b)*((b)+1)/2+(a)))
bcf_gt2alleles(int igt,int * a,int * b)903 static inline void bcf_gt2alleles(int igt, int *a, int *b)
904 {
905 int k = 0, dk = 1;
906 while ( k<igt ) { dk++; k += dk; }
907 *b = dk - 1; *a = igt - k + *b;
908 }
909
910 /**
911 * bcf_get_fmt() - returns pointer to FORMAT's field data
912 * @header: for access to BCF_DT_ID dictionary
913 * @line: VCF line obtained from vcf_parse1
914 * @fmt: one of GT,PL,...
915 *
916 * Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field
917 * is not available.
918 */
919 HTSLIB_EXPORT
920 bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key);
921
922 HTSLIB_EXPORT
923 bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key);
924
925 /**
926 * bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID
927 * @line: VCF line obtained from vcf_parse1
928 * @id: The header index for the tag, obtained from bcf_hdr_id2int()
929 *
930 * Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid
931 * as their goal is to avoid the header lookup.
932 */
933 HTSLIB_EXPORT
934 bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id);
935
936 HTSLIB_EXPORT
937 bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id);
938
939 /**
940 * bcf_get_info_*() - get INFO values, integers or floats
941 * @param hdr: BCF header
942 * @param line: BCF record
943 * @param tag: INFO tag to retrieve
944 * @param dst: *dst is pointer to a memory location, can point to NULL
945 * @param ndst: pointer to the size of allocated memory
946 * @return >=0 on success
947 * -1 .. no such INFO tag defined in the header
948 * -2 .. clash between types defined in the header and encountered in the VCF record
949 * -3 .. tag is not present in the VCF record
950 * -4 .. the operation could not be completed (e.g. out of memory)
951 *
952 * Returns negative value on error or the number of values (including
953 * missing values) put in *dst on success. bcf_get_info_string() returns
954 * on success the number of characters stored excluding the nul-
955 * terminating byte. bcf_get_info_flag() does not store anything in *dst
956 * but returns 1 if the flag is set or 0 if not.
957 *
958 * *dst will be reallocated if it is not big enough (i.e. *ndst is too
959 * small) or NULL on entry. The new size will be stored in *ndst.
960 */
961 #define bcf_get_info_int32(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT)
962 #define bcf_get_info_float(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL)
963 #define bcf_get_info_string(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_STR)
964 #define bcf_get_info_flag(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_FLAG)
965
966 HTSLIB_EXPORT
967 int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type);
968
969 /// Put integer INFO values into an int64_t array
970 /**
971 * @param hdr: BCF header
972 * @param line: BCF record
973 * @param tag: INFO tag to retrieve
974 * @param dst: *dst is pointer to a memory location, can point to NULL
975 * @param ndst: pointer to the size of allocated memory
976 * @return >=0 on success
977 * -1 .. no such INFO tag defined in the header
978 * -2 .. clash between types defined in the header and encountered in the VCF record
979 * -3 .. tag is not present in the VCF record
980 * -4 .. the operation could not be completed (e.g. out of memory)
981 *
982 * Returns negative value on error or the number of values (including
983 * missing values) put in *dst on success.
984 *
985 * *dst will be reallocated if it is not big enough (i.e. *ndst is too
986 * small) or NULL on entry. The new size will be stored in *ndst.
987 */
bcf_get_info_int64(const bcf_hdr_t * hdr,bcf1_t * line,const char * tag,int64_t ** dst,int * ndst)988 static inline int bcf_get_info_int64(const bcf_hdr_t *hdr, bcf1_t *line,
989 const char *tag, int64_t **dst,
990 int *ndst)
991 {
992 return bcf_get_info_values(hdr, line, tag,
993 (void **) dst, ndst, BCF_HT_LONG);
994 }
995
996 /**
997 * bcf_get_format_*() - same as bcf_get_info*() above
998 *
999 * The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char().
1000 * see the description of bcf_update_format_string() and bcf_update_format_char() above.
1001 * Unlike other bcf_get_format__*() functions, bcf_get_format_string() allocates two arrays:
1002 * a single block of \0-terminated strings collapsed into a single array and an array of pointers
1003 * to these strings. Both arrays must be cleaned by the user.
1004 *
1005 * Returns negative value on error or the number of written values on success.
1006 *
1007 * Use the returned number of written values for accessing valid entries of dst, as ndst is only a
1008 * watermark that can be higher than the returned value, i.e. the end of dst can contain carry-over
1009 * values from previous calls to bcf_get_format_*() on lines with more values per sample.
1010 *
1011 * Example:
1012 * int ndst = 0; char **dst = NULL;
1013 * if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 )
1014 * for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i]);
1015 * free(dst[0]); free(dst);
1016 *
1017 * Example:
1018 * int i, j, ngt, nsmpl = bcf_hdr_nsamples(hdr);
1019 * int32_t *gt_arr = NULL, ngt_arr = 0;
1020 *
1021 * ngt = bcf_get_genotypes(hdr, line, >_arr, &ngt_arr);
1022 * if ( ngt<=0 ) return; // GT not present
1023 *
1024 * int max_ploidy = ngt/nsmpl;
1025 * for (i=0; i<nsmpl; i++)
1026 * {
1027 * int32_t *ptr = gt_arr + i*max_ploidy;
1028 * for (j=0; j<max_ploidy; j++)
1029 * {
1030 * // if true, the sample has smaller ploidy
1031 * if ( ptr[j]==bcf_int32_vector_end ) break;
1032 *
1033 * // missing allele
1034 * if ( bcf_gt_is_missing(ptr[j]) ) continue;
1035 *
1036 * // the VCF 0-based allele index
1037 * int allele_index = bcf_gt_allele(ptr[j]);
1038 *
1039 * // is phased?
1040 * int is_phased = bcf_gt_is_phased(ptr[j]);
1041 *
1042 * // .. do something ..
1043 * }
1044 * }
1045 * free(gt_arr);
1046 *
1047 */
1048 #define bcf_get_format_int32(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT)
1049 #define bcf_get_format_float(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL)
1050 #define bcf_get_format_char(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_STR)
1051 #define bcf_get_genotypes(hdr,line,dst,ndst) bcf_get_format_values(hdr,line,"GT",(void**)(dst),ndst,BCF_HT_INT)
1052
1053 HTSLIB_EXPORT
1054 int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst);
1055
1056 HTSLIB_EXPORT
1057 int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type);
1058
1059
1060
1061 /**************************************************************************
1062 * Helper functions
1063 **************************************************************************/
1064
1065 /**
1066 * bcf_hdr_id2int() - Translates string into numeric ID
1067 * bcf_hdr_int2id() - Translates numeric ID into string
1068 * @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE
1069 * @id: tag name, such as: PL, DP, GT, etc.
1070 *
1071 * Returns -1 if string is not in dictionary, otherwise numeric ID which identifies
1072 * fields in BCF records.
1073 */
1074 HTSLIB_EXPORT
1075 int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id);
1076 #define bcf_hdr_int2id(hdr,type,int_id) ((hdr)->id[type][int_id].key)
1077
1078 /**
1079 * bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID
1080 * bcf_hdr_id2name() - Translates numeric ID to sequence name
1081 */
bcf_hdr_name2id(const bcf_hdr_t * hdr,const char * id)1082 static inline int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id) { return bcf_hdr_id2int(hdr, BCF_DT_CTG, id); }
bcf_hdr_id2name(const bcf_hdr_t * hdr,int rid)1083 static inline const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid)
1084 {
1085 if ( !hdr || rid<0 || rid>=hdr->n[BCF_DT_CTG] ) return NULL;
1086 return hdr->id[BCF_DT_CTG][rid].key;
1087 }
bcf_seqname(const bcf_hdr_t * hdr,const bcf1_t * rec)1088 static inline const char *bcf_seqname(const bcf_hdr_t *hdr, const bcf1_t *rec) {
1089 return bcf_hdr_id2name(hdr, rec ? rec->rid : -1);
1090 }
1091
1092 /** Return CONTIG name, or "(unknown)"
1093
1094 Like bcf_seqname(), but this function will never return NULL. If
1095 the contig name cannot be found (either because @p hdr was not
1096 supplied or rec->rid was out of range) it returns the string
1097 "(unknown)".
1098 */
bcf_seqname_safe(const bcf_hdr_t * hdr,const bcf1_t * rec)1099 static inline const char *bcf_seqname_safe(const bcf_hdr_t *hdr, const bcf1_t *rec) {
1100 const char *name = bcf_seqname(hdr, rec);
1101 return name ? name : "(unknown)";
1102 }
1103
1104 /**
1105 * bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t
1106 * @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
1107 * @int_id: return value of bcf_hdr_id2int, must be >=0
1108 *
1109 * The returned values are:
1110 * bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_*
1111 * bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields
1112 * bcf_hdr_id2type .. the field type, one of BCF_HT_*
1113 * bcf_hdr_id2coltype .. the column type, one of BCF_HL_*
1114 *
1115 * Notes: Prior to using the macros, the presence of the info should be
1116 * tested with bcf_hdr_idinfo_exists().
1117 */
1118 #define bcf_hdr_id2length(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>8 & 0xf)
1119 #define bcf_hdr_id2number(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>12)
1120 #define bcf_hdr_id2type(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf)
1121 #define bcf_hdr_id2coltype(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf)
1122 #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id)>=0 && bcf_hdr_id2coltype((hdr),(type),(int_id))!=0xf)
1123 #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)])
1124 /// Convert BCF FORMAT data to string form
1125 /**
1126 * @param s kstring to write into
1127 * @param n number of items in @p data
1128 * @param type type of items in @p data
1129 * @param data BCF format data
1130 * @return 0 on success
1131 * -1 if out of memory
1132 */
1133 HTSLIB_EXPORT
1134 int bcf_fmt_array(kstring_t *s, int n, int type, void *data);
1135
1136 HTSLIB_EXPORT
1137 uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr);
1138
1139 /// Encode a variable-length char array in BCF format
1140 /**
1141 * @param s kstring to write into
1142 * @param l length of input
1143 * @param a input data to encode
1144 * @return 0 on success; < 0 on error
1145 */
1146 HTSLIB_EXPORT
1147 int bcf_enc_vchar(kstring_t *s, int l, const char *a);
1148
1149 /// Encode a variable-length integer array in BCF format
1150 /**
1151 * @param s kstring to write into
1152 * @param n total number of items in @p a (<= 0 to encode BCF_BT_NULL)
1153 * @param a input data to encode
1154 * @param wsize vector length (<= 0 is equivalent to @p n)
1155 * @return 0 on success; < 0 on error
1156 * @note @p n should be an exact multiple of @p wsize
1157 */
1158 HTSLIB_EXPORT
1159 int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize);
1160
1161 /// Encode a variable-length float array in BCF format
1162 /**
1163 * @param s kstring to write into
1164 * @param n total number of items in @p a (<= 0 to encode BCF_BT_NULL)
1165 * @param a input data to encode
1166 * @return 0 on success; < 0 on error
1167 */
1168 HTSLIB_EXPORT
1169 int bcf_enc_vfloat(kstring_t *s, int n, float *a);
1170
1171
1172 /**************************************************************************
1173 * BCF index
1174 *
1175 * Note that these functions work with BCFs only. See synced_bcf_reader.h
1176 * which provides (amongst other things) an API to work transparently with
1177 * both indexed BCFs and VCFs.
1178 **************************************************************************/
1179
1180 #define bcf_itr_destroy(iter) hts_itr_destroy(iter)
1181 #define bcf_itr_queryi(idx, tid, beg, end) hts_itr_query((idx), (tid), (beg), (end), bcf_readrec)
1182 #define bcf_itr_querys(idx, hdr, s) hts_itr_querys((idx), (s), (hts_name2id_f)(bcf_hdr_name2id), (hdr), hts_itr_query, bcf_readrec)
1183
bcf_itr_next(htsFile * htsfp,hts_itr_t * itr,void * r)1184 static inline int bcf_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) {
1185 if (htsfp->is_bgzf)
1186 return hts_itr_next(htsfp->fp.bgzf, itr, r, 0);
1187
1188 hts_log_error("Only bgzf compressed files can be used with iterators");
1189 errno = EINVAL;
1190 return -2;
1191 }
1192 /// Load a BCF index
1193 /** @param fn BCF file name
1194 @return The index, or NULL if an error occurred.
1195 @note This only works for BCF files. Consider synced_bcf_reader instead
1196 which works for both BCF and VCF.
1197 */
1198 #define bcf_index_load(fn) hts_idx_load(fn, HTS_FMT_CSI)
1199 #define bcf_index_seqnames(idx, hdr, nptr) hts_idx_seqnames((idx),(nptr),(hts_id2name_f)(bcf_hdr_id2name),(hdr))
1200
1201 /// Load a BCF index from a given index file name
1202 /** @param fn Input BAM/BCF/etc filename
1203 @param fnidx The input index filename
1204 @return The index, or NULL if an error occurred.
1205 @note This only works for BCF files. Consider synced_bcf_reader instead
1206 which works for both BCF and VCF.
1207 */
1208 HTSLIB_EXPORT
1209 hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx);
1210
1211 /// Load a BCF index from a given index file name
1212 /** @param fn Input BAM/BCF/etc filename
1213 @param fnidx The input index filename
1214 @param flags Flags to alter behaviour (see description)
1215 @return The index, or NULL if an error occurred.
1216 @note This only works for BCF files. Consider synced_bcf_reader instead
1217 which works for both BCF and VCF.
1218
1219 The @p flags parameter can be set to a combination of the following
1220 values:
1221
1222 HTS_IDX_SAVE_REMOTE Save a local copy of any remote indexes
1223 HTS_IDX_SILENT_FAIL Fail silently if the index is not present
1224
1225 Equivalent to hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
1226 */
1227 HTSLIB_EXPORT
1228 hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags);
1229
1230 /**
1231 * bcf_index_build() - Generate and save an index file
1232 * @fn: Input VCF(compressed)/BCF filename
1233 * @min_shift: log2(width of the smallest bin), e.g. a value of 14
1234 * imposes a 16k base lower limit on the width of index bins.
1235 * Positive to generate CSI, or 0 to generate TBI. However, a small
1236 * value of min_shift would create a large index, which would lead to
1237 * reduced performance when using the index. A recommended value is 14.
1238 * For BCF files, only the CSI index can be generated.
1239 *
1240 * Returns 0 if successful, or negative if an error occurred.
1241 *
1242 * List of error codes:
1243 * -1 .. indexing failed
1244 * -2 .. opening @fn failed
1245 * -3 .. format not indexable
1246 * -4 .. failed to create and/or save the index
1247 */
1248 HTSLIB_EXPORT
1249 int bcf_index_build(const char *fn, int min_shift);
1250
1251 /**
1252 * bcf_index_build2() - Generate and save an index to a specific file
1253 * @fn: Input VCF/BCF filename
1254 * @fnidx: Output filename, or NULL to add .csi/.tbi to @fn
1255 * @min_shift: Positive to generate CSI, or 0 to generate TBI
1256 *
1257 * Returns 0 if successful, or negative if an error occurred.
1258 *
1259 * List of error codes:
1260 * -1 .. indexing failed
1261 * -2 .. opening @fn failed
1262 * -3 .. format not indexable
1263 * -4 .. failed to create and/or save the index
1264 */
1265 HTSLIB_EXPORT
1266 int bcf_index_build2(const char *fn, const char *fnidx, int min_shift);
1267
1268 /**
1269 * bcf_index_build3() - Generate and save an index to a specific file
1270 * @fn: Input VCF/BCF filename
1271 * @fnidx: Output filename, or NULL to add .csi/.tbi to @fn
1272 * @min_shift: Positive to generate CSI, or 0 to generate TBI
1273 * @n_threads: Number of VCF/BCF decoder threads
1274 *
1275 * Returns 0 if successful, or negative if an error occurred.
1276 *
1277 * List of error codes:
1278 * -1 .. indexing failed
1279 * -2 .. opening @fn failed
1280 * -3 .. format not indexable
1281 * -4 .. failed to create and/or save the index
1282 */
1283 HTSLIB_EXPORT
1284 int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads);
1285
1286 /// Initialise fp->idx for the current format type, for VCF and BCF files.
1287 /** @param fp File handle for the data file being written.
1288 @param h BCF header structured (needed for BAI and CSI).
1289 @param min_shift CSI bin size (CSI default is 14).
1290 @param fnidx Filename to write index to. This pointer must remain valid
1291 until after bcf_idx_save is called.
1292 @return 0 on success, <0 on failure.
1293 @note This must be called after the header has been written, but before
1294 any other data.
1295 */
1296 HTSLIB_EXPORT
1297 int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx);
1298
1299 /// Writes the index initialised with bcf_idx_init to disk.
1300 /** @param fp File handle for the data file being written.
1301 @return 0 on success, <0 on failure.
1302 */
1303 HTSLIB_EXPORT
1304 int bcf_idx_save(htsFile *fp);
1305
1306 /*******************
1307 * Typed value I/O *
1308 *******************/
1309
1310 /*
1311 Note that in contrast with BCFv2.1 specification, HTSlib implementation
1312 allows missing values in vectors. For integer types, the values 0x80,
1313 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
1314 0x80000001 as end-of-vector indicators. Similarly for floats, the value of
1315 0x7F800001 is interpreted as a missing value and 0x7F800002 as an
1316 end-of-vector indicator.
1317 Note that the end-of-vector byte is not part of the vector.
1318
1319 This trial BCF version (v2.2) is compatible with the VCF specification and
1320 enables to handle correctly vectors with different ploidy in presence of
1321 missing values.
1322 */
1323 #define bcf_int8_vector_end (-127) /* INT8_MIN + 1 */
1324 #define bcf_int16_vector_end (-32767) /* INT16_MIN + 1 */
1325 #define bcf_int32_vector_end (-2147483647) /* INT32_MIN + 1 */
1326 #define bcf_int64_vector_end (-9223372036854775807LL) /* INT64_MIN + 1 */
1327 #define bcf_str_vector_end 0
1328 #define bcf_int8_missing (-128) /* INT8_MIN */
1329 #define bcf_int16_missing (-32767-1) /* INT16_MIN */
1330 #define bcf_int32_missing (-2147483647-1) /* INT32_MIN */
1331 #define bcf_int64_missing (-9223372036854775807LL - 1LL) /* INT64_MIN */
1332 #define bcf_str_missing 0x07
1333
1334 // Limits on BCF values stored in given types. Max values are the same
1335 // as for the underlying type. Min values are slightly different as
1336 // the last 8 values for each type were reserved by BCFv2.2.
1337 #define BCF_MAX_BT_INT8 (0x7f) /* INT8_MAX */
1338 #define BCF_MAX_BT_INT16 (0x7fff) /* INT16_MAX */
1339 #define BCF_MAX_BT_INT32 (0x7fffffff) /* INT32_MAX */
1340 #define BCF_MIN_BT_INT8 (-120) /* INT8_MIN + 8 */
1341 #define BCF_MIN_BT_INT16 (-32760) /* INT16_MIN + 8 */
1342 #define BCF_MIN_BT_INT32 (-2147483640) /* INT32_MIN + 8 */
1343
1344 extern uint32_t bcf_float_vector_end;
1345 extern uint32_t bcf_float_missing;
bcf_float_set(float * ptr,uint32_t value)1346 static inline void bcf_float_set(float *ptr, uint32_t value)
1347 {
1348 union { uint32_t i; float f; } u;
1349 u.i = value;
1350 *ptr = u.f;
1351 }
1352 #define bcf_float_set_vector_end(x) bcf_float_set(&(x),bcf_float_vector_end)
1353 #define bcf_float_set_missing(x) bcf_float_set(&(x),bcf_float_missing)
bcf_float_is_missing(float f)1354 static inline int bcf_float_is_missing(float f)
1355 {
1356 union { uint32_t i; float f; } u;
1357 u.f = f;
1358 return u.i==bcf_float_missing ? 1 : 0;
1359 }
bcf_float_is_vector_end(float f)1360 static inline int bcf_float_is_vector_end(float f)
1361 {
1362 union { uint32_t i; float f; } u;
1363 u.f = f;
1364 return u.i==bcf_float_vector_end ? 1 : 0;
1365 }
1366
bcf_format_gt(bcf_fmt_t * fmt,int isample,kstring_t * str)1367 static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
1368 {
1369 uint32_t e = 0;
1370 #define BRANCH(type_t, missing, vector_end) { \
1371 type_t *ptr = (type_t*) (fmt->p + isample*fmt->size); \
1372 int i; \
1373 for (i=0; i<fmt->n && ptr[i]!=vector_end; i++) \
1374 { \
1375 if ( i ) e |= kputc("/|"[ptr[i]&1], str) < 0; \
1376 if ( !(ptr[i]>>1) ) e |= kputc('.', str) < 0; \
1377 else e |= kputw((ptr[i]>>1) - 1, str) < 0; \
1378 } \
1379 if (i == 0) e |= kputc('.', str) < 0; \
1380 }
1381 switch (fmt->type) {
1382 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
1383 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
1384 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
1385 case BCF_BT_NULL: e |= kputc('.', str) < 0; break;
1386 default: hts_log_error("Unexpected type %d", fmt->type); return -2;
1387 }
1388 #undef BRANCH
1389 return e == 0 ? 0 : -1;
1390 }
1391
bcf_enc_size(kstring_t * s,int size,int type)1392 static inline int bcf_enc_size(kstring_t *s, int size, int type)
1393 {
1394 uint32_t e = 0;
1395 uint8_t x[4];
1396 if (size >= 15) {
1397 e |= kputc(15<<4|type, s) < 0;
1398 if (size >= 128) {
1399 if (size >= 32768) {
1400 i32_to_le(size, x);
1401 e |= kputc(1<<4|BCF_BT_INT32, s) < 0;
1402 e |= kputsn((char*)&x, 4, s) < 0;
1403 } else {
1404 i16_to_le(size, x);
1405 e |= kputc(1<<4|BCF_BT_INT16, s) < 0;
1406 e |= kputsn((char*)&x, 2, s) < 0;
1407 }
1408 } else {
1409 e |= kputc(1<<4|BCF_BT_INT8, s) < 0;
1410 e |= kputc(size, s) < 0;
1411 }
1412 } else e |= kputc(size<<4|type, s) < 0;
1413 return e == 0 ? 0 : -1;
1414 }
1415
bcf_enc_inttype(long x)1416 static inline int bcf_enc_inttype(long x)
1417 {
1418 if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) return BCF_BT_INT8;
1419 if (x <= BCF_MAX_BT_INT16 && x >= BCF_MIN_BT_INT16) return BCF_BT_INT16;
1420 return BCF_BT_INT32;
1421 }
1422
bcf_enc_int1(kstring_t * s,int32_t x)1423 static inline int bcf_enc_int1(kstring_t *s, int32_t x)
1424 {
1425 uint32_t e = 0;
1426 uint8_t z[4];
1427 if (x == bcf_int32_vector_end) {
1428 e |= bcf_enc_size(s, 1, BCF_BT_INT8);
1429 e |= kputc(bcf_int8_vector_end, s) < 0;
1430 } else if (x == bcf_int32_missing) {
1431 e |= bcf_enc_size(s, 1, BCF_BT_INT8);
1432 e |= kputc(bcf_int8_missing, s) < 0;
1433 } else if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) {
1434 e |= bcf_enc_size(s, 1, BCF_BT_INT8);
1435 e |= kputc(x, s) < 0;
1436 } else if (x <= BCF_MAX_BT_INT16 && x >= BCF_MIN_BT_INT16) {
1437 i16_to_le(x, z);
1438 e |= bcf_enc_size(s, 1, BCF_BT_INT16);
1439 e |= kputsn((char*)&z, 2, s) < 0;
1440 } else {
1441 i32_to_le(x, z);
1442 e |= bcf_enc_size(s, 1, BCF_BT_INT32);
1443 e |= kputsn((char*)&z, 4, s) < 0;
1444 }
1445 return e == 0 ? 0 : -1;
1446 }
1447
1448 /// Return the value of a single typed integer.
1449 /** @param p Pointer to input data block.
1450 @param type One of the BCF_BT_INT* type codes
1451 @param[out] q Location to store an updated value for p
1452 @return The integer value, or zero if @p type is not valid.
1453
1454 If @p type is not one of BCF_BT_INT8, BCF_BT_INT16, BCF_BT_INT32 or
1455 BCF_BT_INT64, zero will be returned and @p *q will not be updated.
1456 Otherwise, the integer value will be returned and @p *q will be set
1457 to the memory location immediately following the integer value.
1458
1459 Cautious callers can detect invalid type codes by checking that *q has
1460 actually been updated.
1461 */
1462
bcf_dec_int1(const uint8_t * p,int type,uint8_t ** q)1463 static inline int64_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
1464 {
1465 if (type == BCF_BT_INT8) {
1466 *q = (uint8_t*)p + 1;
1467 return le_to_i8(p);
1468 } else if (type == BCF_BT_INT16) {
1469 *q = (uint8_t*)p + 2;
1470 return le_to_i16(p);
1471 } else if (type == BCF_BT_INT32) {
1472 *q = (uint8_t*)p + 4;
1473 return le_to_i32(p);
1474 } else if (type == BCF_BT_INT64) {
1475 *q = (uint8_t*)p + 8;
1476 return le_to_i64(p);
1477 } else { // Invalid type.
1478 return 0;
1479 }
1480 }
1481
1482 /// Return the value of a single typed integer from a byte stream.
1483 /** @param p Pointer to input data block.
1484 @param[out] q Location to store an updated value for p
1485 @return The integer value, or zero if the type code was not valid.
1486
1487 Reads a one-byte type code from @p p, and uses it to decode an integer
1488 value from the following bytes in @p p.
1489
1490 If the type is not one of BCF_BT_INT8, BCF_BT_INT16 or BCF_BT_INT32, zero
1491 will be returned and @p *q will unchanged. Otherwise, the integer value will
1492 be returned and @p *q will be set to the memory location immediately following
1493 the integer value.
1494
1495 Cautious callers can detect invalid type codes by checking that *q has
1496 actually been updated.
1497 */
bcf_dec_typed_int1(const uint8_t * p,uint8_t ** q)1498 static inline int64_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
1499 {
1500 return bcf_dec_int1(p + 1, *p&0xf, q);
1501 }
1502
bcf_dec_size(const uint8_t * p,uint8_t ** q,int * type)1503 static inline int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
1504 {
1505 *type = *p & 0xf;
1506 if (*p>>4 != 15) {
1507 *q = (uint8_t*)p + 1;
1508 return *p>>4;
1509 } else return bcf_dec_typed_int1(p + 1, q);
1510 }
1511
1512 #ifdef __cplusplus
1513 }
1514 #endif
1515
1516 #endif
1517