1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010-2015, Julian Catchen <jcatchen@uoregon.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 #ifndef __VCF_H__
22 #define __VCF_H__
23
24 #include "constants.h"
25 #include "nucleotides.h"
26 #include "utils.h"
27
28 /*
29 * VcfMeta
30 * ==========
31 * Represents one line of VCF metainformation.
32 */
33 class VcfMeta {
34 string key_;
35 string value_;
36 public:
VcfMeta(const string & k,const string & p)37 VcfMeta(const string& k, const string& p) : key_(k), value_(p) {}
38
key()39 const string& key() const {return key_;}
value()40 const string& value() const {return value_;}
41
42 struct predefs {
43 static const VcfMeta info_AD;
44 static const VcfMeta info_AF;
45 static const VcfMeta info_DP;
46 static const VcfMeta info_NS;
47
48 static const VcfMeta format_AD;
49 static const VcfMeta format_DP;
50 static const VcfMeta format_GL;
51 static const VcfMeta format_GQ;
52 static const VcfMeta format_GT;
53 static const VcfMeta format_HQ;
54
55 // Custom.
56 static const VcfMeta info_loc_strand;
57 };
58 };
59
60 /*
61 * VcfHeader
62 * ==========
63 * Stores the contents of a VCF header.
64 */
65 class VcfHeader {
66 vector<string> samples_;
67 vector<VcfMeta> meta_;
68 map<string, size_t> sample_indexes_;
69
70 template<typename OStream> void print(OStream& os) const; // Template because VersatileWriter isn't a proper std::ostream.
71
72 public:
VcfHeader()73 VcfHeader() : samples_(), meta_(), sample_indexes_() {}
74
meta()75 const vector<VcfMeta>& meta() const {return meta_;}
samples()76 const vector<string>& samples() const {return samples_;}
sample_index(const string & sample)77 size_t sample_index(const string& sample) const {return sample_indexes_.at(sample);}
78
add_meta(const VcfMeta & m)79 void add_meta(const VcfMeta& m) {meta_.push_back(m);}
add_sample(const string & s)80 void add_sample(const string& s) {samples_.push_back(s); sample_indexes_.insert({s, samples_.size()-1});}
81
82 // Creates a standard header.
83 void add_std_meta(const string& version = "VCFv4.2");
84
85 static const string std_fields;
86
87 friend ostream& operator<< (ostream& os, const VcfHeader& h) {h.print(os); return os;}
88 friend VersatileWriter& operator<< (VersatileWriter& w, const VcfHeader& h) {h.print(w); return w;}
89 };
90
91 /*
92 * VcfRecord
93 * ==========
94 * Datastructure to store VCF records
95 */
96 class VcfRecord {
97 vector<char> buffer_;
data()98 const char* data() const {return buffer_.data();}
99
100 //size_t chrom_; //0
101 size_t pos_;
102 //size_t id_;
103 size_t allele0_;
104 size_t qual_;
105 //size_t filters_;
106 size_t info0_;
107 size_t format0_;
108 size_t sample0_;
109
append(const char * str,size_t len)110 void append(const char* str, size_t len) {buffer_.insert(buffer_.end(), str, str+len+1);}
append(const string & s)111 void append(const string& s) {append(s.c_str(), s.length());}
112
has_infos()113 bool has_infos() const {return info0_ != SIZE_MAX;}
has_samples()114 bool has_samples() const {return sample0_ != SIZE_MAX;}
115
116 template<typename OStream> void print(OStream& os) const; // Template because VersatileWriter isn't a proper std::ostream.
117
118 public:
VcfRecord()119 VcfRecord()
120 : buffer_(), pos_(SIZE_MAX), allele0_(SIZE_MAX), qual_(SIZE_MAX),
121 info0_(SIZE_MAX), format0_(SIZE_MAX), sample0_(SIZE_MAX)
122 {}
123 void assign(const char* rec, size_t len, const VcfHeader& header);
124
125 class iterator {
126 const char* p_;
127 public:
iterator(const char * p)128 iterator(const char* p) : p_(p) {}
129 iterator& operator++ () {while(*p_!='\0') ++p_; ++p_; return *this;}
130 bool operator!= (iterator other) const {return p_ != other.p_;}
131 bool operator== (iterator other) const {return !operator!=(other);}
132 const char* operator* () const {return p_;}
133 };
134
chrom()135 const char* chrom() const {assert(has_infos()); return data();}
pos()136 size_t pos() const {assert(has_infos()); return atol(data() + pos_) - 1;} // 0-based.
id()137 const char* id() const {assert(has_infos()); return *++iterator(data() + pos_);}
allele0()138 const char* allele0() const {assert(has_infos()); return data() + allele0_;}
qual()139 const char* qual() const {assert(has_infos()); return data() + qual_;}
filters()140 const char* filters() const {assert(has_infos()); return *++iterator(qual());}
info0()141 const char* info0() const {assert(has_infos()); return data() + info0_;}
format0()142 const char* format0() const {assert(has_samples()); return data() + format0_;}
sample0()143 const char* sample0() const {assert(has_samples()); return data() + sample0_;}
144
begin_alleles()145 iterator begin_alleles() const {return iterator(allele0());}
end_alleles()146 iterator end_alleles() const {auto itr=begin_alleles(); ++itr; return **itr=='.' ? itr : iterator(qual());}
begin_infos()147 iterator begin_infos() const {return iterator(info0());}
end_infos()148 iterator end_infos() const {return *info0()=='.' ? begin_infos() : begin_formats();}
begin_formats()149 iterator begin_formats() const {return iterator(format0());}
end_formats()150 iterator end_formats() const {return *format0()=='.' ? begin_formats() : begin_samples();}
begin_samples()151 iterator begin_samples() const {return iterator(sample0());}
end_samples()152 iterator end_samples() const {return iterator(data() + buffer_.size());}
153
count_alleles()154 size_t count_alleles() const {size_t n=0; for(auto itr=begin_alleles();itr!=end_alleles(); ++itr) ++n; return n;}
count_infos()155 size_t count_infos() const {size_t n=0; for(auto itr=begin_infos();itr!=end_infos(); ++itr) ++n; return n;}
count_formats()156 size_t count_formats() const {size_t n=0; for(auto itr=begin_formats();itr!=end_formats(); ++itr) ++n; return n;}
count_samples()157 size_t count_samples() const {size_t n=0; for(auto itr=begin_samples();itr!=end_samples(); ++itr) ++n; return n;}
158
find_allele(size_t i)159 const char* find_allele(size_t i) const
160 {auto a=begin_alleles(); for(size_t j=0; j<i; ++j) {assert(a!=end_alleles()); ++a;} return *a;}
find_sample(size_t i)161 const char* find_sample(size_t i) const // N.B. Inefficient.
162 {auto a=begin_samples(); for(size_t j=0; j<i; ++j) {assert(a!=end_samples()); ++a;} return *a;}
163
is_monomorphic()164 bool is_monomorphic() const {return **++begin_alleles() == '.';}
165 bool is_snp() const;
166 size_t index_of_gt_subfield(const char* format_key) const; // SIZE_MAX if not found.
167
168 // Record creation functions.
169 void clear();
170 void append_chrom(const string& s);
171 void append_pos(size_t pos0);
172 void append_id(const string& s = ".");
173 void append_allele(Nt2 nt);
174 void append_allele(const string& s);
175 void append_qual(long phred_qual = -1);
176 void append_filters(const string& s = ".");
177 void append_info(const string& s);
178 void append_format(const string& s);
179 void append_sample(const string& s);
180
181 friend ostream& operator<< (ostream& os, const VcfRecord& r) {r.print(os); return os;}
182 friend VersatileWriter& operator<< (VersatileWriter& w, const VcfRecord& r) {r.print(w); return w;}
183
184 static const char allele_sep = ',';
185 static const char filter_sep = ';';
186 static const char info_sep = ';';
187 static const char format_sep = ':';
188 static const char sample_sep = ':';
189
190 struct util {
191 static string fmt_info_af(const vector<double>& alt_freqs);
192 static string fmt_gt_gl(const vector<Nt2>& alleles, const GtLiks& liks);
193
194 static pair<long,long> parse_gt_gt(const char* gt_str);
195 static size_t parse_gt_dp(const char* gt_str, size_t dp_index);
196 static Counts<Nt2> parse_gt_ad(const char* gt_str, size_t ad_index, const vector<Nt2>& alleles);
197 static uint8_t parse_gt_gq(const char* gt_str, size_t gq_index);
198 static GtLiks parse_gt_gl(const char* gt_str, size_t gl_index, const vector<Nt2>& alleles);
199
200 static const char* find_gt_subfield(const char* sample, size_t n);
201 static void skip_gt_subfields(const char** start, size_t n);
n_possible_genotypesutil202 static size_t n_possible_genotypes(size_t n_alleles) {return (n_alleles*(n_alleles+1))/2;}
203 };
204 };
205
206 /*
207 * VcfParser
208 * ==========
209 */
210 class VcfParser {
211 VersatileLineReader file_;
212 VcfHeader header_;
213
214 // Parse the header.
215 void read_header();
216
217 public:
218 VcfParser(const string& path);
VcfParser()219 VcfParser(): file_(), header_() {};
220
next_record(VcfRecord & rec)221 bool next_record(VcfRecord& rec) {
222 try {
223 const char* line;
224 size_t len;
225 if (!file_.getline(line, len))
226 return false;
227 rec.assign(line, len, header_);
228 return true;
229 } catch (const exception& e) {
230 cerr << "Error: At line " << file_.line_number()
231 << " in file '" << file_.path () << "'.\n";
232 throw e;
233 }
234 }
235
236 int open(string &path);
header()237 const VcfHeader& header() const {return header_;};
path()238 const string& path() const {return file_.path();};
line_number()239 size_t line_number() const {return file_.line_number();}
240 };
241
242 /*
243 * VcfWriter
244 * ==========
245 * (This has become an empty shell...)
246 */
247 class VcfWriter {
248 private:
249 VersatileWriter file_;
250 const VcfHeader header_;
251
252 public:
VcfWriter(const string & path,VcfHeader && header)253 VcfWriter(const string& path, VcfHeader&& header)
254 : file_(path), header_(header)
255 {file_ << header_;}
256
header()257 const VcfHeader& header() const {return header_;}
write_record(const VcfRecord & r)258 void write_record(const VcfRecord& r)
259 {assert(r.count_samples()==header_.samples().size()); file_ << r;}
260
file()261 VersatileWriter& file() {return file_;}
262 };
263
264 /*
265 * Inline methods.
266 * ==========
267 */
268
269 template<typename OStream>
print(OStream & os)270 void VcfHeader::print(OStream& os) const {
271 for(const VcfMeta& m : meta())
272 os << "##" << m.key() << "=" << m.value() << "\n";
273
274 os << VcfHeader::std_fields;
275 if(!samples().empty())
276 os << "\tFORMAT";
277 for(const string& s : samples())
278 os << '\t' << s;
279 os << '\n';
280 }
281
282 template<typename OStream>
print(OStream & os)283 void VcfRecord::print(OStream& os) const {
284
285 os << chrom()
286 << '\t' << pos() + 1
287 << '\t' << id();
288
289 // REF & ALT
290 iterator itr = begin_alleles();
291 iterator end = end_alleles();
292 assert(itr!=end);
293 os << '\t' << *itr;
294 ++itr;
295 if (itr == end) {
296 assert(strcmp(*itr, ".")==0);
297 os << '\t' << '.';
298 } else {
299 os << '\t' << *itr;
300 ++itr;
301 for(; itr!=end; ++itr)
302 os << allele_sep << *itr;
303 }
304
305 //QUAL
306 os << '\t' << qual();
307
308 //FILTER
309 os << '\t' << filters();
310
311 //INFO
312 itr = begin_infos();
313 end = end_infos();
314 if (itr == end) {
315 assert(strcmp(*itr, ".")==0);
316 os << '\t' << '.';
317 } else {
318 os << '\t' << *itr;
319 ++itr;
320 for(; itr!=end; ++itr)
321 os << info_sep << *itr;
322 }
323
324 if (begin_samples() != end_samples()) {
325 //FORMAT
326 itr = begin_formats();
327 end = end_formats();
328 if (itr == end) {
329 assert(strcmp(*itr, ".")==0);
330 os << '\t' << '.';
331 } else {
332 os << '\t' << *itr;
333 ++itr;
334 for(; itr!=end; ++itr)
335 os << format_sep << *itr;
336 }
337
338 //SAMPLES
339 for (itr=begin_samples(); itr!=end_samples(); ++itr)
340 os << '\t' << *itr;
341 }
342
343 os << '\n';
344 }
345
346 inline
append_chrom(const string & s)347 void VcfRecord::append_chrom(const string& s) {
348 assert(buffer_.empty());
349 append(s);
350 }
351
352 inline
append_pos(size_t pos0)353 void VcfRecord::append_pos(size_t pos0) {
354 assert(pos_ == SIZE_MAX);
355 pos_ = buffer_.size();
356 char s[32];
357 size_t len = sprintf(s, "%zu", pos0 + 1);
358 append(s, len);
359 }
360
361 inline
append_id(const string & s)362 void VcfRecord::append_id(const string& s) {
363 assert(pos_ != SIZE_MAX);
364 assert(allele0_ == SIZE_MAX);
365 append(s);
366 }
367
368 inline
append_allele(Nt2 nt)369 void VcfRecord::append_allele(Nt2 nt) {
370 assert(pos_ != SIZE_MAX);
371 assert(buffer_.size() > pos_ + strlen(buffer_.data() + pos_) + 1); // id.
372 assert(qual_ == SIZE_MAX);
373 if (allele0_ == SIZE_MAX)
374 allele0_ = buffer_.size();
375 char s[] = {char(nt), '\0'};
376 append(s, 1);
377 }
378
379 inline
append_allele(const string & s)380 void VcfRecord::append_allele(const string& s) {
381 assert(pos_ != SIZE_MAX);
382 assert(buffer_.size() > pos_ + strlen(buffer_.data() + pos_) + 1); // id.
383 assert(qual_ == SIZE_MAX);
384 if (allele0_ == SIZE_MAX)
385 allele0_ = buffer_.size();
386 append(s);
387 }
388
389 inline
append_qual(long phred_qual)390 void VcfRecord::append_qual(long phred_qual) {
391 assert(allele0_ != SIZE_MAX);
392 assert(qual_ == SIZE_MAX);
393 qual_ = buffer_.size();
394 if (phred_qual >= 0) {
395 char s[32];
396 size_t len = sprintf(s, "%ld", phred_qual);
397 append(s, len);
398 } else {
399 append(".", 1);
400 }
401 }
402
403 inline
append_filters(const string & s)404 void VcfRecord::append_filters(const string& s) {
405 assert(qual_ != SIZE_MAX);
406 assert(info0_ == SIZE_MAX);
407 append(s);
408 }
409
410 inline
append_info(const string & s)411 void VcfRecord::append_info(const string& s) {
412 assert(qual_ != SIZE_MAX);
413 assert(buffer_.size() > qual_ + strlen(buffer_.data() + qual_) + 1); // filters.
414 assert(format0_ == SIZE_MAX);
415 if (info0_ == SIZE_MAX)
416 info0_ = buffer_.size();
417 append(s);
418 }
419
420 inline
append_format(const string & s)421 void VcfRecord::append_format(const string& s) {
422 assert(info0_ != SIZE_MAX);
423 assert(sample0_ == SIZE_MAX);
424 if (format0_ == SIZE_MAX)
425 format0_ = buffer_.size();
426 append(s);
427 }
428
429 inline
append_sample(const string & s)430 void VcfRecord::append_sample(const string& s) {
431 assert(format0_ != SIZE_MAX);
432 if (sample0_ == SIZE_MAX)
433 sample0_ = buffer_.size();
434 append(s);
435 }
436
437 inline
clear()438 void VcfRecord::clear() {
439 buffer_.resize(0);
440 pos_ = SIZE_MAX;
441 allele0_ = SIZE_MAX;
442 qual_ = SIZE_MAX;
443 info0_ = SIZE_MAX;
444 format0_ = SIZE_MAX;
445 sample0_ = SIZE_MAX;
446 }
447
448 inline
index_of_gt_subfield(const char * key)449 size_t VcfRecord::index_of_gt_subfield(const char* key) const {
450 size_t i=0;
451 for (auto f=begin_formats(); f!=end_formats(); ++f) {
452 if (strcmp(*f, key) == 0)
453 return i;
454 ++i;
455 }
456 return SIZE_MAX;
457 }
458
459 inline
parse_gt_gt(const char * sample)460 pair<long,long> VcfRecord::util::parse_gt_gt(const char* sample)
461 { try {
462 assert(sample != NULL && sample[0] != '\0');
463 if (*sample == '.')
464 return {-1, -1};
465 char* end;
466 long first = strtol(sample, &end, 10);
467 if (end == sample || (*end != '/' && *end != '|'))
468 throw exception();
469 sample = end;
470 ++sample;
471 if (*sample == '.')
472 // Incomplete, e.g. "1/."
473 return {-1, -1};
474 long second = strtol(sample, &end, 10);
475 if (end == sample || (*end != ':' && *end != '\0'))
476 throw exception();
477 if (first < 0 || second < 0)
478 throw exception();
479 return {first, second};
480 } catch (exception&) {
481 cerr << "Error: Malformed VCF sample field '" << sample << "'.\n";
482 throw;
483 }}
484
485 inline
is_snp()486 bool VcfRecord::is_snp() const {
487 iterator a = begin_alleles();
488 const iterator end = end_alleles();
489 if (strlen(*a) > 1)
490 return false;
491 ++a;
492 if (a == end)
493 return false;
494 assert(**a != '.');
495 if (**a == '*')
496 return false;
497 for (; a != end; ++a)
498 if (strlen(*a) > 1)
499 return false;
500 return true;
501 }
502
503 inline
find_gt_subfield(const char * sample,size_t n)504 const char* VcfRecord::util::find_gt_subfield(const char* sample, size_t n) {
505 const char* subf = sample;
506 for(size_t i=0; i<n; ++i) {
507 subf = strchr(subf, ':');
508 if (subf == NULL)
509 break;
510 ++subf;
511 }
512 return subf;
513 }
514
515 #endif // __VCF_H__
516