1 #include <ctime>
2
3 #include "Vcf.h"
4
5 const VcfMeta VcfMeta::predefs::info_AD ("INFO","<ID=AD,Number=R,Type=Integer,Description=\"Total Depth for Each Allele\">");
6 const VcfMeta VcfMeta::predefs::info_AF ("INFO","<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">");
7 const VcfMeta VcfMeta::predefs::info_DP ("INFO","<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">");
8 const VcfMeta VcfMeta::predefs::info_NS ("INFO","<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">");
9
10 const VcfMeta VcfMeta::predefs::format_AD ("FORMAT","<ID=AD,Number=R,Type=Integer,Description=\"Allele Depth\">");
11 const VcfMeta VcfMeta::predefs::format_DP ("FORMAT","<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">");
12 const VcfMeta VcfMeta::predefs::format_HQ ("FORMAT","<ID=HQ,Number=2,Type=Integer,Description=\"Haplotype Quality\">");
13 const VcfMeta VcfMeta::predefs::format_GL ("FORMAT","<ID=GL,Number=G,Type=Float,Description=\"Genotype Likelihood\">");
14 const VcfMeta VcfMeta::predefs::format_GQ ("FORMAT","<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
15 const VcfMeta VcfMeta::predefs::format_GT ("FORMAT","<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
16
17 const VcfMeta VcfMeta::predefs::info_loc_strand (
18 "INFO",
19 "<ID=loc_strand,Number=1,Type=Character,Description=\"Genomic strand the corresponding Stacks locus aligns on\">"
20 );
21
22 const string VcfHeader::std_fields = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
23
add_std_meta(const string & version)24 void VcfHeader::add_std_meta(const string& version) {
25 add_meta(VcfMeta("fileformat", version));
26
27 time_t t;
28 time(&t);
29 char date[9];
30 strftime(date, 9, "%Y%m%d", localtime(&t));
31 add_meta(VcfMeta("fileDate", date));
32
33 add_meta(VcfMeta("source", string("\"Stacks v") + VERSION + "\""));
34
35 add_meta(VcfMeta::predefs::info_AD);
36 add_meta(VcfMeta::predefs::info_AF);
37 add_meta(VcfMeta::predefs::info_DP);
38 add_meta(VcfMeta::predefs::info_NS);
39 add_meta(VcfMeta::predefs::format_AD);
40 add_meta(VcfMeta::predefs::format_DP);
41 add_meta(VcfMeta::predefs::format_HQ);
42 add_meta(VcfMeta::predefs::format_GL);
43 add_meta(VcfMeta::predefs::format_GQ);
44 add_meta(VcfMeta::predefs::format_GT);
45 }
46
assign(const char * rec,size_t len,const VcfHeader & header)47 void VcfRecord::assign(const char* rec, size_t len, const VcfHeader& header) {
48 assert(rec[len-1] != '\n');
49
50 clear();
51 buffer_.assign(rec, rec+len+1);
52
53 // Parse the record.
54 size_t n_exp_fields = header.samples().empty() ? 8 : 9 + header.samples().size();
55 auto wrong_n_fields = [&rec, &len, &n_exp_fields](size_t obs){
56 cerr << "Error: Expected VCF record to have "
57 << n_exp_fields << " fields, not " << obs << "."
58 << "Offending record (" << len << " characters) is:\n"
59 << rec << '\n';
60 throw exception();
61 };
62
63 size_t n_fields = 1;
64 char* q = NULL; // The start of the current field.
65 char* p = buffer_.data(); // The start of the next field.
66 auto next_field = [&p, &q, &n_fields, &wrong_n_fields]() {
67 q = p;
68 p = strchr(p, '\t');
69 if (!p)
70 wrong_n_fields(n_fields);
71 ++n_fields;
72 *p = '\0';
73 ++p;
74 };
75
76 // chrom
77 next_field();
78 assert(q == buffer_.data());
79
80 // pos
81 next_field();
82 pos_ = q - buffer_.data();
83
84 // id
85 next_field();
86
87 // alleles (ref + alt)
88 next_field();
89 allele0_ = q - buffer_.data();
90 next_field();
91 while((q = strchr(q, ','))) {
92 *q = '\0'; // ALT alleles become null-separated.
93 ++q;
94 }
95
96 // qual
97 next_field();
98 qual_ = q - buffer_.data();
99
100 // filters
101 next_field();
102
103 info0_ = p - buffer_.data();
104 if (header.samples().empty()) {
105 // info (last field)
106 while((p = strchr(p, ';'))) {
107 *p = '\0'; // INFO fields become null-separated.
108 ++p;
109 }
110 } else {
111 // info
112 next_field();
113 while((q = strchr(q, ';'))) {
114 *q = '\0';
115 ++q;
116 }
117
118 // format
119 next_field();
120 format0_ = q - buffer_.data();
121 while((q = strchr(q, ':'))) {
122 *q = '\0'; // FORMAT fields become null-separated.
123 ++q;
124 }
125
126 // samples (last fields)
127 sample0_ = p - buffer_.data();
128 while((p = strchr(p, '\t'))) {
129 *p = '\0';
130 ++p;
131 ++n_fields;
132 }
133 }
134 if (n_fields != n_exp_fields)
135 wrong_n_fields(n_fields);
136 }
137
fmt_info_af(const vector<double> & alt_freqs)138 string VcfRecord::util::fmt_info_af(const vector<double>& alt_freqs) {
139 stringstream ss;
140 ss << std::setprecision(3);
141 join(alt_freqs, ',', ss);
142 return string("AF=") + ss.str();
143 }
144
fmt_gt_gl(const vector<Nt2> & alleles,const GtLiks & liks)145 string VcfRecord::util::fmt_gt_gl(const vector<Nt2>& alleles, const GtLiks& liks) {
146 stringstream ss;
147 ss << std::fixed << std::setprecision(2);
148
149 assert(!alleles.empty());
150 vector<double> v;
151 v.reserve(n_possible_genotypes(alleles.size()));
152 for (size_t a2=0; a2<alleles.size(); ++a2) {
153 Nt2 a2nt = alleles[a2];
154 for (size_t a1=0; a1<=a2; ++a1) {
155 Nt2 a1nt = alleles[a1];
156 v.push_back(liks.at(a1nt, a2nt) / log(10));
157 }
158 }
159 join(v, ',', ss);
160 return ss.str();
161 }
162
parse_gt_dp(const char * gt_str,size_t dp_index)163 size_t VcfRecord::util::parse_gt_dp(
164 const char* gt_str,
165 size_t dp_index
166 ) {
167 const char* p = VcfRecord::util::find_gt_subfield(gt_str, dp_index);
168 if (p == NULL || *p == '.') {
169 cerr << "Error: DP field is missing.\n";
170 throw exception();
171 }
172 char* tmp;
173 long dp = strtol(p, &tmp, 10);
174 if (tmp == p || dp < 0) {
175 cerr << "Error: Bad DP field.\n";
176 throw exception();
177 }
178 return dp;
179 }
180
181 Counts<Nt2>
parse_gt_ad(const char * gt_str,size_t ad_index,const vector<Nt2> & alleles)182 VcfRecord::util::parse_gt_ad(
183 const char* gt_str,
184 size_t ad_index,
185 const vector<Nt2>& alleles
186 ) {
187 Counts<Nt2> depths;
188 const char* p = VcfRecord::util::find_gt_subfield(gt_str, ad_index);
189 if (p == NULL || *p == '.') {
190 cerr << "Error: AD field is missing.\n";
191 throw exception();
192 }
193 try {
194 size_t i = 0;
195 char* end;
196 --p;
197 do {
198 ++p;
199 long ad = strtol(p, &end, 10);
200 if (end == p || ad < 0)
201 throw exception();
202 depths.increment(alleles.at(i), ad);
203 ++i;
204 p = end;
205 } while (*p == ',');
206 if (i != alleles.size()) {
207 cerr << "Error: Wrong number of values (expected "
208 << alleles.size() << ", found" << i << ").\n";
209 throw exception();
210 }
211 return depths;
212 } catch (exception&) {
213 cerr << "Error: Bad AD field in '" << gt_str << "'.\n";
214 throw;
215 }}
216
parse_gt_gq(const char * gt_str,size_t gq_index)217 uint8_t VcfRecord::util::parse_gt_gq(
218 const char* gt_str,
219 size_t gq_index
220 ) {
221 const char* p = VcfRecord::util::find_gt_subfield(gt_str, gq_index);
222 if (p == NULL || *p == '.') {
223 cerr << "Error: GQ field is missing.\n";
224 throw exception();
225 }
226 char* tmp;
227 long gq = strtol(p, &tmp, 10);
228 if (tmp == p || gq < 0 || gq > UINT8_MAX) {
229 cerr << "Error: Bad GQ field.\n";
230 throw exception();
231 }
232 return gq;
233 }
234
parse_gt_gl(const char * gt_str,size_t gl_index,const vector<Nt2> & alleles)235 GtLiks VcfRecord::util::parse_gt_gl(
236 const char* gt_str,
237 size_t gl_index,
238 const vector<Nt2>& alleles
239 ) {
240 GtLiks liks;
241 const char* p = VcfRecord::util::find_gt_subfield(gt_str, gl_index);
242 if (p == NULL) {
243 cerr << "Error: GL field is missing.\n";
244 throw exception();
245 }
246 try {
247 size_t n_values_found = 0;
248 char* end;
249 for (size_t a2=0; a2<alleles.size(); ++a2) {
250 Nt2 a2nt = alleles[a2];
251 for (size_t a1=0; a1<=a2; ++a1) {
252 Nt2 a1nt = alleles[a1];
253 double lik = strtod(p, &end);
254 if (end == p)
255 throw exception();
256 lik *= log(10); // Back to base e.
257 liks.set(a1nt, a2nt, lik);
258 // Move to the next field.
259 ++n_values_found;
260 p = end;
261 if (*p == ',') {
262 ++p;
263 } else if (*p != ':' && *p != '\0') {
264 throw exception();
265 } else if (n_values_found != n_possible_genotypes(alleles.size())) {
266 cerr << "Error: Wrong number of values (expected "
267 << n_possible_genotypes(alleles.size()) << ", found "
268 << n_values_found << ").\n";
269 throw exception();
270 }
271 }
272 }
273 return liks;
274 } catch (exception&) {
275 cerr << "Error: VCF: Illegal GL format: "
276 << (gl_index+1) << "th field in '" << gt_str << "'.\n";
277 throw;
278 }}
279
VcfParser(const string & path)280 VcfParser::VcfParser(const string& path) : file_(path), header_() {
281 FileT ftype = guess_file_type(path);
282 if (ftype != FileT::vcf && ftype != FileT::gzvcf) {
283 cerr << "Error: File '" << path << "' : expected '.vcf(.gz)' suffix.\n";
284 throw exception();
285 }
286 read_header();
287 }
288
289 int
open(string & path)290 VcfParser::open(string &path) {
291 this->file_.open(path);
292
293 FileT ftype = guess_file_type(path);
294
295 if (ftype != FileT::vcf && ftype != FileT::gzvcf) {
296 cerr << "Error: File '" << path << "' : expected '.vcf(.gz)' suffix.\n";
297 throw exception();
298 }
299
300 read_header();
301
302 return 0;
303 }
304
305 void
read_header()306 VcfParser::read_header()
307 {
308 const char* line = NULL;
309 size_t len;
310
311 auto malformed = [this, &line] () {
312 cerr << "Error: Malformed VCF header."
313 << " At line " << file_.line_number() << " in file '" << file_.path() << "'";
314 if (line)
315 cerr << ":\n" << line << "\n";
316 else
317 cerr << ".\n";
318 throw exception();
319 };
320
321 // Meta header lines.
322 bool not_eof;
323 while((not_eof = file_.getline(line, len)) && strncmp(line, "##", 2) == 0) {
324 const char* equal = strchr(line+2, '=');
325 if(!equal) {
326 // Skip header lines missing the '='.
327 continue;
328 }
329 header_.add_meta(VcfMeta(string(line+2, equal), string(equal+1)));
330 }
331 if (!not_eof)
332 malformed();
333
334 // Final header line.
335 if(strncmp(line, VcfHeader::std_fields.c_str(), VcfHeader::std_fields.length()) != 0)
336 malformed();
337
338 // Parse sample names, if any.
339 if(len > VcfHeader::std_fields.length()) {
340 const char* p = line + VcfHeader::std_fields.length();
341 const char* end = line + len;
342
343 // Check that FORMAT is present.
344 const char format[] = "\tFORMAT";
345 const size_t format_len = sizeof(format) - 1;
346 if(strncmp(p, format, format_len) != 0)
347 malformed();
348 p += format_len;
349 if (*p != '\t')
350 malformed();
351
352 do {
353 ++p;
354 const char* next = strchr(p, '\t');
355 if (!next)
356 next = end;
357 header_.add_sample(string(p, next));
358 p = next;
359 } while (p != end);
360 }
361 }
362