1 #include <ctime>
2 
3 #include "Vcf.h"
4 
5 const VcfMeta VcfMeta::predefs::info_AD ("INFO","<ID=AD,Number=R,Type=Integer,Description=\"Total Depth for Each Allele\">");
6 const VcfMeta VcfMeta::predefs::info_AF ("INFO","<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">");
7 const VcfMeta VcfMeta::predefs::info_DP ("INFO","<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">");
8 const VcfMeta VcfMeta::predefs::info_NS ("INFO","<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">");
9 
10 const VcfMeta VcfMeta::predefs::format_AD ("FORMAT","<ID=AD,Number=R,Type=Integer,Description=\"Allele Depth\">");
11 const VcfMeta VcfMeta::predefs::format_DP ("FORMAT","<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">");
12 const VcfMeta VcfMeta::predefs::format_HQ ("FORMAT","<ID=HQ,Number=2,Type=Integer,Description=\"Haplotype Quality\">");
13 const VcfMeta VcfMeta::predefs::format_GL ("FORMAT","<ID=GL,Number=G,Type=Float,Description=\"Genotype Likelihood\">");
14 const VcfMeta VcfMeta::predefs::format_GQ ("FORMAT","<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
15 const VcfMeta VcfMeta::predefs::format_GT ("FORMAT","<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
16 
17 const VcfMeta VcfMeta::predefs::info_loc_strand (
18         "INFO",
19         "<ID=loc_strand,Number=1,Type=Character,Description=\"Genomic strand the corresponding Stacks locus aligns on\">"
20         );
21 
22 const string VcfHeader::std_fields = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
23 
add_std_meta(const string & version)24 void VcfHeader::add_std_meta(const string& version) {
25     add_meta(VcfMeta("fileformat", version));
26 
27     time_t t;
28     time(&t);
29     char date[9];
30     strftime(date, 9, "%Y%m%d", localtime(&t));
31     add_meta(VcfMeta("fileDate", date));
32 
33     add_meta(VcfMeta("source", string("\"Stacks v") + VERSION + "\""));
34 
35     add_meta(VcfMeta::predefs::info_AD);
36     add_meta(VcfMeta::predefs::info_AF);
37     add_meta(VcfMeta::predefs::info_DP);
38     add_meta(VcfMeta::predefs::info_NS);
39     add_meta(VcfMeta::predefs::format_AD);
40     add_meta(VcfMeta::predefs::format_DP);
41     add_meta(VcfMeta::predefs::format_HQ);
42     add_meta(VcfMeta::predefs::format_GL);
43     add_meta(VcfMeta::predefs::format_GQ);
44     add_meta(VcfMeta::predefs::format_GT);
45 }
46 
assign(const char * rec,size_t len,const VcfHeader & header)47 void VcfRecord::assign(const char* rec, size_t len, const VcfHeader& header) {
48     assert(rec[len-1] != '\n');
49 
50     clear();
51     buffer_.assign(rec, rec+len+1);
52 
53     // Parse the record.
54     size_t n_exp_fields = header.samples().empty() ? 8 : 9 + header.samples().size();
55     auto wrong_n_fields = [&rec, &len, &n_exp_fields](size_t obs){
56         cerr << "Error: Expected VCF record to have "
57              << n_exp_fields << " fields, not " << obs << "."
58              << "Offending record (" << len << " characters) is:\n"
59              << rec << '\n';
60         throw exception();
61     };
62 
63     size_t n_fields = 1;
64     char* q = NULL; // The start of the current field.
65     char* p = buffer_.data(); // The start of the next field.
66     auto next_field = [&p, &q, &n_fields, &wrong_n_fields]() {
67         q = p;
68         p = strchr(p, '\t');
69         if (!p)
70             wrong_n_fields(n_fields);
71         ++n_fields;
72         *p = '\0';
73         ++p;
74     };
75 
76     // chrom
77     next_field();
78     assert(q == buffer_.data());
79 
80     // pos
81     next_field();
82     pos_ = q - buffer_.data();
83 
84     // id
85     next_field();
86 
87     // alleles (ref + alt)
88     next_field();
89     allele0_ = q - buffer_.data();
90     next_field();
91     while((q = strchr(q, ','))) {
92         *q = '\0'; // ALT alleles become null-separated.
93         ++q;
94     }
95 
96     // qual
97     next_field();
98     qual_ = q - buffer_.data();
99 
100     // filters
101     next_field();
102 
103     info0_ = p - buffer_.data();
104     if (header.samples().empty()) {
105         // info (last field)
106         while((p = strchr(p, ';'))) {
107             *p = '\0'; // INFO fields become null-separated.
108             ++p;
109         }
110     } else {
111         // info
112         next_field();
113         while((q = strchr(q, ';'))) {
114             *q = '\0';
115             ++q;
116         }
117 
118         // format
119         next_field();
120         format0_ = q - buffer_.data();
121         while((q = strchr(q, ':'))) {
122             *q = '\0'; // FORMAT fields become null-separated.
123             ++q;
124         }
125 
126         // samples (last fields)
127         sample0_ = p - buffer_.data();
128         while((p = strchr(p, '\t'))) {
129             *p = '\0';
130             ++p;
131             ++n_fields;
132         }
133     }
134     if (n_fields != n_exp_fields)
135         wrong_n_fields(n_fields);
136 }
137 
fmt_info_af(const vector<double> & alt_freqs)138 string VcfRecord::util::fmt_info_af(const vector<double>& alt_freqs) {
139     stringstream ss;
140     ss << std::setprecision(3);
141     join(alt_freqs, ',', ss);
142     return string("AF=") + ss.str();
143 }
144 
fmt_gt_gl(const vector<Nt2> & alleles,const GtLiks & liks)145 string VcfRecord::util::fmt_gt_gl(const vector<Nt2>& alleles, const GtLiks& liks) {
146     stringstream ss;
147     ss << std::fixed << std::setprecision(2);
148 
149     assert(!alleles.empty());
150     vector<double> v;
151     v.reserve(n_possible_genotypes(alleles.size()));
152     for (size_t a2=0; a2<alleles.size(); ++a2) {
153         Nt2 a2nt = alleles[a2];
154         for (size_t a1=0; a1<=a2; ++a1) {
155             Nt2 a1nt = alleles[a1];
156             v.push_back(liks.at(a1nt, a2nt) / log(10));
157         }
158     }
159     join(v, ',', ss);
160     return ss.str();
161 }
162 
parse_gt_dp(const char * gt_str,size_t dp_index)163 size_t VcfRecord::util::parse_gt_dp(
164     const char* gt_str,
165     size_t dp_index
166 ) {
167     const char* p = VcfRecord::util::find_gt_subfield(gt_str, dp_index);
168     if (p == NULL || *p == '.') {
169         cerr << "Error: DP field is missing.\n";
170         throw exception();
171     }
172     char* tmp;
173     long dp = strtol(p, &tmp, 10);
174     if (tmp == p || dp < 0) {
175         cerr << "Error: Bad DP field.\n";
176         throw exception();
177     }
178     return dp;
179 }
180 
181 Counts<Nt2>
parse_gt_ad(const char * gt_str,size_t ad_index,const vector<Nt2> & alleles)182 VcfRecord::util::parse_gt_ad(
183     const char* gt_str,
184     size_t ad_index,
185     const vector<Nt2>& alleles
186 ) {
187     Counts<Nt2> depths;
188     const char* p = VcfRecord::util::find_gt_subfield(gt_str, ad_index);
189     if (p == NULL || *p == '.') {
190         cerr << "Error: AD field is missing.\n";
191         throw exception();
192     }
193 try {
194     size_t i = 0;
195     char* end;
196     --p;
197     do {
198         ++p;
199         long ad = strtol(p, &end, 10);
200         if (end == p || ad < 0)
201             throw exception();
202         depths.increment(alleles.at(i), ad);
203         ++i;
204         p = end;
205     } while (*p == ',');
206     if (i != alleles.size()) {
207         cerr << "Error: Wrong number of values (expected "
208              << alleles.size() << ", found" << i << ").\n";
209         throw exception();
210     }
211     return depths;
212 } catch (exception&) {
213     cerr << "Error: Bad AD field in '" << gt_str << "'.\n";
214     throw;
215 }}
216 
parse_gt_gq(const char * gt_str,size_t gq_index)217 uint8_t VcfRecord::util::parse_gt_gq(
218     const char* gt_str,
219     size_t gq_index
220 ) {
221     const char* p = VcfRecord::util::find_gt_subfield(gt_str, gq_index);
222     if (p == NULL || *p == '.') {
223         cerr << "Error: GQ field is missing.\n";
224         throw exception();
225     }
226     char* tmp;
227     long gq = strtol(p, &tmp, 10);
228     if (tmp == p || gq < 0 || gq > UINT8_MAX) {
229         cerr << "Error: Bad GQ field.\n";
230         throw exception();
231     }
232     return gq;
233 }
234 
parse_gt_gl(const char * gt_str,size_t gl_index,const vector<Nt2> & alleles)235 GtLiks VcfRecord::util::parse_gt_gl(
236     const char* gt_str,
237     size_t gl_index,
238     const vector<Nt2>& alleles
239 ) {
240     GtLiks liks;
241     const char* p = VcfRecord::util::find_gt_subfield(gt_str, gl_index);
242     if (p == NULL) {
243         cerr << "Error: GL field is missing.\n";
244         throw exception();
245     }
246 try {
247     size_t n_values_found = 0;
248     char* end;
249     for (size_t a2=0; a2<alleles.size(); ++a2) {
250         Nt2 a2nt = alleles[a2];
251         for (size_t a1=0; a1<=a2; ++a1) {
252             Nt2 a1nt = alleles[a1];
253             double lik = strtod(p, &end);
254             if (end == p)
255                 throw exception();
256             lik *= log(10); // Back to base e.
257             liks.set(a1nt, a2nt, lik);
258             // Move to the next field.
259             ++n_values_found;
260             p = end;
261             if (*p == ',') {
262                 ++p;
263             } else if (*p != ':' && *p != '\0') {
264                 throw exception();
265             } else if (n_values_found != n_possible_genotypes(alleles.size())) {
266                 cerr << "Error: Wrong number of values (expected "
267                      << n_possible_genotypes(alleles.size()) << ", found "
268                      << n_values_found << ").\n";
269                 throw exception();
270             }
271         }
272     }
273     return liks;
274 } catch (exception&) {
275     cerr << "Error: VCF: Illegal GL format: "
276         << (gl_index+1) << "th field in '" << gt_str << "'.\n";
277     throw;
278 }}
279 
VcfParser(const string & path)280 VcfParser::VcfParser(const string& path) : file_(path), header_() {
281     FileT ftype = guess_file_type(path);
282     if (ftype != FileT::vcf && ftype != FileT::gzvcf) {
283         cerr << "Error: File '" << path << "' : expected '.vcf(.gz)' suffix.\n";
284         throw exception();
285     }
286     read_header();
287 }
288 
289 int
open(string & path)290 VcfParser::open(string &path) {
291     this->file_.open(path);
292 
293     FileT ftype = guess_file_type(path);
294 
295     if (ftype != FileT::vcf && ftype != FileT::gzvcf) {
296         cerr << "Error: File '" << path << "' : expected '.vcf(.gz)' suffix.\n";
297         throw exception();
298     }
299 
300     read_header();
301 
302     return 0;
303 }
304 
305 void
read_header()306 VcfParser::read_header()
307 {
308     const char* line = NULL;
309     size_t len;
310 
311     auto malformed = [this, &line] () {
312         cerr << "Error: Malformed VCF header."
313              << " At line " << file_.line_number() << " in file '" << file_.path() << "'";
314         if (line)
315             cerr << ":\n" << line << "\n";
316         else
317             cerr << ".\n";
318         throw exception();
319     };
320 
321     // Meta header lines.
322     bool not_eof;
323     while((not_eof = file_.getline(line, len)) && strncmp(line, "##", 2) == 0) {
324         const char* equal = strchr(line+2, '=');
325         if(!equal) {
326             // Skip header lines missing the '='.
327             continue;
328         }
329         header_.add_meta(VcfMeta(string(line+2, equal), string(equal+1)));
330     }
331     if (!not_eof)
332         malformed();
333 
334     // Final header line.
335     if(strncmp(line, VcfHeader::std_fields.c_str(), VcfHeader::std_fields.length()) != 0)
336         malformed();
337 
338     // Parse sample names, if any.
339     if(len > VcfHeader::std_fields.length()) {
340         const char* p = line + VcfHeader::std_fields.length();
341         const char* end = line + len;
342 
343         // Check that FORMAT is present.
344         const char format[] = "\tFORMAT";
345         const size_t format_len = sizeof(format) - 1;
346         if(strncmp(p, format, format_len) != 0)
347             malformed();
348         p += format_len;
349         if (*p != '\t')
350             malformed();
351 
352         do {
353             ++p;
354             const char* next = strchr(p, '\t');
355             if (!next)
356                 next = end;
357             header_.add_sample(string(p, next));
358             p = next;
359         } while (p != end);
360     }
361 }
362