1 /* The MIT License
2 
3    Copyright (c) 2010 Broad Institute
4 
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12 
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15 
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25 
26 /* Contact: Heng Li <lh3@live.co.uk> */
27 
28 #ifndef BCF_H
29 #define BCF_H
30 
31 #include <stdint.h>
32 #include <zlib.h>
33 
34 #ifndef BCF_LITE
35 #include "bgzf.h"
36 typedef BGZF *bcfFile;
37 #else
38 typedef gzFile bcfFile;
39 #define bgzf_open(fn, mode) gzopen(fn, mode)
40 #define bgzf_fdopen(fd, mode) gzdopen(fd, mode)
41 #define bgzf_close(fp) gzclose(fp)
42 #define bgzf_read(fp, buf, len) gzread(fp, buf, len)
43 #define bgzf_write(fp, buf, len)
44 #define bgzf_flush(fp)
45 #endif
46 
/*
  A member in the structs below is said to be "primary" if its content
  cannot be inferred from other members in any of the structs below; a
  member is said to be "derived" if its content can be derived from
  other members. For example, bcf1_t::str is primary as this comes from
  the input data, while bcf1_t::info is derived as it can always be
  correctly set if we know bcf1_t::str. Derived members are for quick
  access to the content and must be synchronized with the primary data.
 */
56 
/* One per-genotype field block of a record (an element of bcf1_t::gi). */
typedef struct {
	// format of the block, as an integer produced by bcf_str2int()
	uint32_t fmt;
	// length of data for each individual
	int len;
	// concatenated data of all individuals
	void *data;
	// derived info: fmt, len (<-bcf1_t::fmt)
} bcf_ginfo_t;
63 
64 typedef struct {
65 	int32_t tid, pos; // refID and 0-based position
66 	int32_t l_str, m_str; // length and the allocated size of ->str
67 	float qual; // SNP quality
68 	char *str; // concatenated string of variable length strings in VCF (from col.2 to col.7)
69 	char *ref, *alt, *flt, *info, *fmt; // they all point to ->str; no memory allocation
70 	int n_gi, m_gi; // number and the allocated size of geno fields
71 	bcf_ginfo_t *gi; // array of geno fields
72 	int n_alleles, n_smpl; // number of alleles and samples
73 	// derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl)
74 } bcf1_t;
75 
/* BCF/VCF header: reference sequence names, sample names and header text. */
typedef struct {
	int32_t n_ref; // number of reference sequences
	int32_t n_smpl; // number of samples
	int32_t l_nm; // length of concatenated sequence names; 0 padded
	int32_t l_smpl; // length of concatenated sample names; 0 padded
	int32_t l_txt; // length of header text (lines started with ##)
	char *name; // concatenated sequence names
	char *sname; // concatenated sample names
	char *txt; // header text
	char **ns; // array of sequence names; point to ->name
	char **sns; // array of sample names; point to ->sname
	// derived info: n_ref (<-name), n_smpl (<-sname), ns (<-name), sns (<-sname)
} bcf_hdr_t;
85 
86 typedef struct {
87 	int is_vcf; // if the file in operation is a VCF
88 	void *v; // auxillary data structure for VCF
89 	bcfFile fp; // file handler for BCF
90 } bcf_t;
91 
// opaque index type: only the tag is declared here, so it is used through pointers
typedef struct __bcf_idx_t bcf_idx_t;
94 
95 #ifdef __cplusplus
96 extern "C" {
97 #endif
98 
99 	// open a BCF file; for BCF file only
100 	bcf_t *bcf_open(const char *fn, const char *mode);
101 	// close file
102 	int bcf_close(bcf_t *b);
103 	// read one record from BCF; return -1 on end-of-file, and <-1 for errors
104 	int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b);
105 	// call this function if b->str is changed
106 	int bcf_sync(bcf1_t *b);
107 	// write a BCF record
108 	int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b);
109 	// read the BCF header; BCF only
110 	bcf_hdr_t *bcf_hdr_read(bcf_t *b);
111 	// write the BCF header
112 	int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h);
113 	// set bcf_hdr_t::ns and bcf_hdr_t::sns
114 	int bcf_hdr_sync(bcf_hdr_t *b);
115 	// destroy the header
116 	void bcf_hdr_destroy(bcf_hdr_t *h);
117 	// destroy a record
118 	int bcf_destroy(bcf1_t *b);
119 	// BCF->VCF conversion
120 	char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b);
121 	// append more info
122 	int bcf_append_info(bcf1_t *b, const char *info, int l);
123 	// copy
124 	int bcf_cpy(bcf1_t *r, const bcf1_t *b);
125 
126 	// open a VCF or BCF file if "b" is set in "mode"
127 	bcf_t *vcf_open(const char *fn, const char *mode);
128 	// close a VCF/BCF file
129 	int vcf_close(bcf_t *bp);
130 	// read the VCF/BCF header
131 	bcf_hdr_t *vcf_hdr_read(bcf_t *bp);
132 	// read a VCF/BCF record; return -1 on end-of-file and <-1 for errors
133 	int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
134 	// write the VCF header
135 	int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h);
136 	// write a VCF record
137 	int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
138 
139 	// keep the first n alleles and discard the rest
140 	int bcf_shrink_alt(bcf1_t *b, int n);
141 	// convert GL to PL
142 	int bcf_gl2pl(bcf1_t *b);
143 	// if the site is an indel
144 	int bcf_is_indel(const bcf1_t *b);
145 
146 	// string hash table
147 	void *bcf_build_refhash(bcf_hdr_t *h);
148 	void bcf_str2id_destroy(void *_hash);
149 	int bcf_str2id_add(void *_hash, const char *str);
150 	int bcf_str2id(void *_hash, const char *str);
151 	void *bcf_str2id_init();
152 
153 	// indexing related functions
154 	int bcf_idx_build(const char *fn);
155 	uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg);
156 	int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end);
157 	bcf_idx_t *bcf_idx_load(const char *fn);
158 	void bcf_idx_destroy(bcf_idx_t *idx);
159 
160 #ifdef __cplusplus
161 }
162 #endif
163 
/*
  Pack the leading characters of str into an integer, most significant
  character first (e.g. "PL" with l=2 gives 0x504C).

  str: characters to pack; does not need to be NUL terminated as long as
       at least min(l, 4) bytes are readable
  l:   number of characters available in str; a value <= 0 gives 0

  At most four characters are consumed, and packing stops early at the
  first NUL byte. Returns the packed value.
 */
static inline uint32_t bcf_str2int(const char *str, int l)
{
	int i;
	uint32_t x = 0;
	for (i = 0; i < l && i < 4; ++i) {
		if (str[i] == 0) return x;
		// go through unsigned char: where plain char is signed, a byte >= 0x80
		// would otherwise be sign-extended and overwrite the bytes packed so far
		x = x<<8 | (unsigned char)str[i];
	}
	return x;
}
174 
175 #endif
176