1 #include <zlib.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include "bcf.h"
6 #include "kstring.h"
7 #include "kseq.h"
/* Instantiate the kseq.h stream reader for gzFile handles read through
 * gzread(); this provides the kstream_t type and the ks_init(),
 * ks_getuntil() and ks_destroy() routines used below.
 * NOTE(review): 4096 is presumably the read-buffer size in bytes - confirm
 * against kseq.h. */
KSTREAM_INIT(gzFile, gzread, 4096)
9 
/* Per-handle state of a text VCF stream; hung off bcf_t::v when
 * bcf_t::is_vcf is set (see vcf_open()). */
typedef struct {
	gzFile fp;       /* input handle, opened by vcf_open() in read mode */
	FILE *fpout;     /* output handle, opened by vcf_open() in write mode */
	kstream_t *ks;   /* line reader created over fp with ks_init() */
	void *refhash;   /* reference-name dictionary used via the bcf_str2id_*() calls */
	kstring_t line;  /* scratch buffer holding the line being read or written */
	int max_ref;     /* not referenced anywhere in the visible part of this file */
} vcf_t;
18 
vcf_hdr_read(bcf_t * bp)19 bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
20 {
21 	kstring_t meta, smpl;
22 	int dret;
23 	vcf_t *v;
24 	bcf_hdr_t *h;
25 	if (!bp->is_vcf) return bcf_hdr_read(bp);
26 	h = calloc(1, sizeof(bcf_hdr_t));
27 	v = (vcf_t*)bp->v;
28 	v->line.l = 0;
29 	memset(&meta, 0, sizeof(kstring_t));
30 	memset(&smpl, 0, sizeof(kstring_t));
31 	while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) {
32 		if (v->line.l < 2) continue;
33 		if (v->line.s[0] != '#') return 0; // no sample line
34 		if (v->line.s[0] == '#' && v->line.s[1] == '#') {
35 			kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta);
36 		} else if (v->line.s[0] == '#') {
37 			int k;
38 			ks_tokaux_t aux;
39 			char *p;
40 			for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
41 				if (k >= 9) {
42 					kputsn(p, aux.p - p, &smpl);
43 					kputc('\0', &smpl);
44 				}
45 			}
46 			break;
47 		}
48 	}
49 	kputc('\0', &meta);
50 	h->name = 0;
51 	h->sname = smpl.s; h->l_smpl = smpl.l;
52 	h->txt = meta.s; h->l_txt = meta.l;
53 	bcf_hdr_sync(h);
54 	return h;
55 }
56 
vcf_open(const char * fn,const char * mode)57 bcf_t *vcf_open(const char *fn, const char *mode)
58 {
59 	bcf_t *bp;
60 	vcf_t *v;
61 	if (strchr(mode, 'b')) return bcf_open(fn, mode);
62 	bp = calloc(1, sizeof(bcf_t));
63 	v = calloc(1, sizeof(vcf_t));
64 	bp->is_vcf = 1;
65 	bp->v = v;
66 	v->refhash = bcf_str2id_init();
67 	if (strchr(mode, 'r')) {
68 		v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
69 		v->ks = ks_init(v->fp);
70 	} else if (strchr(mode, 'w'))
71 		v->fpout = strcmp(fn, "-")? fopen(fn, "w") : stdout;
72 	return bp;
73 }
74 
/*
 * Release a handle obtained from vcf_open().
 *
 * A null handle yields -1; a non-VCF handle is passed on to bcf_close().
 * For a VCF handle the reader (if an input file is attached), the output
 * stream (if any), the line buffer, the reference dictionary and the two
 * handle structures are torn down, and 0 is returned.
 */
int vcf_close(bcf_t *bp)
{
	vcf_t *vcf;

	if (!bp)
		return -1;
	if (!bp->is_vcf)
		return bcf_close(bp);

	vcf = (vcf_t*)bp->v;

	/* input side: the stream reader goes first, then the gz handle under it */
	if (vcf->fp != 0) {
		ks_destroy(vcf->ks);
		gzclose(vcf->fp);
	}

	/* output side */
	if (vcf->fpout != 0)
		fclose(vcf->fpout);

	/* remaining owned memory */
	free(vcf->line.s);
	bcf_str2id_destroy(vcf->refhash);
	free(vcf);
	free(bp);

	return 0;
}
92 
/*
 * Write a VCF header (non-VCF handles are forwarded to bcf_hdr_write()).
 *
 * Output, in order:
 *   1. "##fileformat=VCFv4.0", only when h->txt is empty or does not
 *      already contain a "##fileformat=" entry - so the version line is
 *      emitted at most once and always ahead of the other meta lines;
 *   2. the stored meta text h->txt, minus its final byte (the terminating
 *      NUL that is counted in h->l_txt);
 *   3. the "#CHROM ... FORMAT" column line followed by one tab-separated
 *      column per sample name in h->sns.
 *
 * Always returns 0 for VCF handles; stream write errors are not checked.
 */
int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h)
{
	vcf_t *v = (vcf_t*)bp->v;
	int i, has_txt;
	if (!bp->is_vcf) return bcf_hdr_write(bp, h);
	has_txt = h->l_txt > 0;
	// h->txt is only inspected when non-empty; it may be null otherwise
	if (!has_txt || strstr(h->txt, "##fileformat=") == 0)
		fputs("##fileformat=VCFv4.0\n", v->fpout);
	if (has_txt)
		fwrite(h->txt, 1, h->l_txt - 1, v->fpout);
	fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", v->fpout);
	for (i = 0; i < h->n_smpl; ++i)
		fprintf(v->fpout, "\t%s", h->sns[i]);
	fputc('\n', v->fpout);
	return 0;
}
111 
/*
 * Write one record as a line of VCF text (non-VCF handles are forwarded to
 * bcf_write()).
 *
 * The record is rendered by bcf_fmt_core() into the handle's line buffer,
 * which is then written to the output stream followed by a newline.
 * Returns the number of bytes in the rendered line plus one for the
 * newline; stream write errors are not checked.
 */
int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
{
	extern void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s);
	vcf_t *vcf;
	kstring_t *text;

	if (!bp->is_vcf)
		return bcf_write(bp, h, b);

	vcf = (vcf_t*)bp->v;
	text = &vcf->line;

	bcf_fmt_core(h, b, text);
	fwrite(text->s, 1, text->l, vcf->fpout);
	fputc('\n', vcf->fpout);

	return text->l + 1;
}
122 
/*
 * Read one line of VCF text into b (non-VCF handles are forwarded to
 * bcf_read()).
 *
 * The line is split on tabs and handled column by column (0-based k):
 *   k == 0 (CHROM)  looked up in the handle's reference dictionary; an
 *                   unknown name is added to it and appended to h->name,
 *                   and the header is re-synced before returning
 *   k == 1 (POS)    stored in b->pos as the parsed value minus one
 *   k == 5 (QUAL)   stored in b->qual; 0 if the field does not start with
 *                   a digit
 *   other k <= 8    (ID, REF, ALT, FILTER, INFO, FORMAT) appended to b->str
 *                   as NUL-terminated strings; bcf_sync(b) runs after the
 *                   last of them
 *   k >= 9          one column per sample; its ':'-separated sub-fields
 *                   are decoded into b->gi[i].data at sample index k-9
 *                   according to b->gi[i].fmt (GT, GQ, SP, DP, PL, GL).
 *                   A column starting with "./." is stored as missing.
 *
 * Sample columns beyond the number of samples in the header are ignored.
 *
 * Returns -1 when no further line can be read, otherwise the line length
 * plus one.
 */
int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
{
	int dret, k, i, sync = 0;
	vcf_t *v = (vcf_t*)bp->v;
	char *p, *q;
	kstring_t str, rn;
	ks_tokaux_t aux, a2;
	if (!bp->is_vcf) return bcf_read(bp, h, b);
	v->line.l = 0;
	// build on the buffers already owned by the record and the header
	str.l = 0; str.m = b->m_str; str.s = b->str;
	rn.l = rn.m = h->l_nm; rn.s = h->name;
	if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1;
	b->n_smpl = h->n_smpl;
	for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
		*(char*)aux.p = 0; // terminate the current column in place
		if (k == 0) { // ref
			int tid = bcf_str2id(v->refhash, p);
			if (tid < 0) {
				tid = bcf_str2id_add(v->refhash, p);
				kputs(p, &rn); kputc('\0', &rn);
				sync = 1;
			}
			b->tid = tid;
		} else if (k == 1) { // pos
			b->pos = atoi(p) - 1;
		} else if (k == 5) { // qual
			b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0;
		} else if (k <= 8) { // variable length strings
			kputs(p, &str); kputc('\0', &str);
			b->l_str = str.l; b->m_str = str.m; b->str = str.s;
			if (k == 8) bcf_sync(b);
		} else { // k >= 9: per-sample column
			// Every store below is indexed by the sample number k-9.  The
			// per-sample buffers are presumably sized for b->n_smpl samples
			// by bcf_sync() (TODO confirm), so a line with more columns than
			// the header has samples must not be written past them.
			if (k - 9 >= b->n_smpl) break;
			if (strncmp(p, "./.", 3) == 0) { // missing genotype
				for (i = 0; i < b->n_gi; ++i) {
					if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
						((uint8_t*)b->gi[i].data)[k-9] = 1<<7;
					} else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) {
						((uint8_t*)b->gi[i].data)[k-9] = 0;
					} else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
						((uint16_t*)b->gi[i].data)[k-9] = 0;
					} else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
						int y = b->n_alleles * (b->n_alleles + 1) / 2;
						memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y);
					} else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
						int y = b->n_alleles * (b->n_alleles + 1) / 2;
						memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4);
					}
				}
				continue;
			}
			for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) {
				if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
					// expects single-digit alleles in the form "a/b" or "a|b"
					((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6;
				} else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) {
					double _x = strtod(q, &q);
					int x = (int)(_x + .499);
					if (x > 255) x = 255;
					((uint8_t*)b->gi[i].data)[k-9] = x;
				} else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
					int x = strtol(q, &q, 10);
					if (x > 0xffff) x = 0xffff;
					((uint16_t*)b->gi[i].data)[k-9] = x;
				} else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
					int x, y, j;
					uint8_t *data = (uint8_t*)b->gi[i].data;
					y = b->n_alleles * (b->n_alleles + 1) / 2;
					for (j = 0; j < y; ++j) {
						x = strtol(q, &q, 10);
						if (x > 255) x = 255;
						data[(k-9) * y + j] = x;
						++q; // step over the separator after the number
					}
				} else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
					int j, y;
					float x, *data = (float*)b->gi[i].data;
					y = b->n_alleles * (b->n_alleles + 1) / 2;
					for (j = 0; j < y; ++j) {
						x = strtod(q, &q);
						data[(k-9) * y + j] = x;
						++q; // step over the separator after the number
					}
				}
			}
		}
	}
	// kputs() may have reallocated the name buffer; hand it back to the header
	h->l_nm = rn.l; h->name = rn.s;
	if (sync) bcf_hdr_sync(h);
	return v->line.l + 1;
}
213