1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010-2015, Julian Catchen <jcatchen@uoregon.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 #ifndef __VCF_H__
22 #define __VCF_H__
23 
24 #include "constants.h"
25 #include "nucleotides.h"
26 #include "utils.h"
27 
28 /*
29  * VcfMeta
30  * ==========
31  * Represents one line of VCF metainformation.
32  */
33 class VcfMeta {
34     string key_;
35     string value_;
36 public:
VcfMeta(const string & k,const string & p)37     VcfMeta(const string& k, const string& p) : key_(k), value_(p) {}
38 
key()39     const string& key() const {return key_;}
value()40     const string& value() const {return value_;}
41 
42     struct predefs {
43         static const VcfMeta info_AD;
44         static const VcfMeta info_AF;
45         static const VcfMeta info_DP;
46         static const VcfMeta info_NS;
47 
48         static const VcfMeta format_AD;
49         static const VcfMeta format_DP;
50         static const VcfMeta format_GL;
51         static const VcfMeta format_GQ;
52         static const VcfMeta format_GT;
53         static const VcfMeta format_HQ;
54 
55         // Custom.
56         static const VcfMeta info_loc_strand;
57     };
58 };
59 
60 /*
61  * VcfHeader
62  * ==========
63  * Stores the contents of a VCF header.
64  */
65 class VcfHeader {
66     vector<string> samples_;
67     vector<VcfMeta> meta_;
68     map<string, size_t> sample_indexes_;
69 
70     template<typename OStream> void print(OStream& os) const; // Template because VersatileWriter isn't a proper std::ostream.
71 
72 public:
VcfHeader()73     VcfHeader() : samples_(), meta_(), sample_indexes_() {}
74 
meta()75     const vector<VcfMeta>& meta() const {return meta_;}
samples()76     const vector<string>& samples() const {return samples_;}
sample_index(const string & sample)77     size_t sample_index(const string& sample) const {return sample_indexes_.at(sample);}
78 
add_meta(const VcfMeta & m)79     void add_meta(const VcfMeta& m) {meta_.push_back(m);}
add_sample(const string & s)80     void add_sample(const string& s) {samples_.push_back(s); sample_indexes_.insert({s, samples_.size()-1});}
81 
82     // Creates a standard header.
83     void add_std_meta(const string& version = "VCFv4.2");
84 
85     static const string std_fields;
86 
87     friend ostream& operator<< (ostream& os, const VcfHeader& h) {h.print(os); return os;}
88     friend VersatileWriter& operator<< (VersatileWriter& w, const VcfHeader& h) {h.print(w); return w;}
89 };
90 
91 /*
92  * VcfRecord
93  * ==========
94  * Datastructure to store VCF records
95  */
96 class VcfRecord {
97     vector<char> buffer_;
data()98     const char* data() const {return buffer_.data();}
99 
100     //size_t chrom_; //0
101     size_t pos_;
102     //size_t id_;
103     size_t allele0_;
104     size_t qual_;
105     //size_t filters_;
106     size_t info0_;
107     size_t format0_;
108     size_t sample0_;
109 
append(const char * str,size_t len)110     void append(const char* str, size_t len) {buffer_.insert(buffer_.end(), str, str+len+1);}
append(const string & s)111     void append(const string& s) {append(s.c_str(), s.length());}
112 
has_infos()113     bool has_infos() const {return info0_ != SIZE_MAX;}
has_samples()114     bool has_samples() const {return sample0_ != SIZE_MAX;}
115 
116     template<typename OStream> void print(OStream& os) const; // Template because VersatileWriter isn't a proper std::ostream.
117 
118 public:
VcfRecord()119     VcfRecord()
120     : buffer_(), pos_(SIZE_MAX), allele0_(SIZE_MAX), qual_(SIZE_MAX),
121       info0_(SIZE_MAX), format0_(SIZE_MAX), sample0_(SIZE_MAX)
122     {}
123     void assign(const char* rec, size_t len, const VcfHeader& header);
124 
125     class iterator {
126         const char* p_;
127     public:
iterator(const char * p)128         iterator(const char* p) : p_(p) {}
129         iterator& operator++ () {while(*p_!='\0') ++p_; ++p_; return *this;}
130         bool operator!= (iterator other) const {return p_ != other.p_;}
131         bool operator== (iterator other) const {return !operator!=(other);}
132         const char* operator* () const {return p_;}
133     };
134 
chrom()135     const char* chrom()   const {assert(has_infos()); return data();}
pos()136          size_t pos()     const {assert(has_infos()); return atol(data() + pos_) - 1;} // 0-based.
id()137     const char* id()      const {assert(has_infos()); return *++iterator(data() + pos_);}
allele0()138     const char* allele0() const {assert(has_infos()); return data() + allele0_;}
qual()139     const char* qual()    const {assert(has_infos()); return data() + qual_;}
filters()140     const char* filters() const {assert(has_infos()); return *++iterator(qual());}
info0()141     const char* info0()   const {assert(has_infos()); return data() + info0_;}
format0()142     const char* format0() const {assert(has_samples()); return data() + format0_;}
sample0()143     const char* sample0() const {assert(has_samples()); return data() + sample0_;}
144 
begin_alleles()145     iterator begin_alleles() const {return iterator(allele0());}
end_alleles()146     iterator end_alleles()   const {auto itr=begin_alleles(); ++itr; return **itr=='.' ? itr : iterator(qual());}
begin_infos()147     iterator begin_infos()   const {return iterator(info0());}
end_infos()148     iterator end_infos()     const {return *info0()=='.' ? begin_infos() : begin_formats();}
begin_formats()149     iterator begin_formats() const {return iterator(format0());}
end_formats()150     iterator end_formats()   const {return *format0()=='.' ? begin_formats() : begin_samples();}
begin_samples()151     iterator begin_samples() const {return iterator(sample0());}
end_samples()152     iterator end_samples()   const {return iterator(data() + buffer_.size());}
153 
count_alleles()154     size_t count_alleles() const {size_t n=0; for(auto itr=begin_alleles();itr!=end_alleles(); ++itr) ++n; return n;}
count_infos()155     size_t count_infos()   const {size_t n=0; for(auto itr=begin_infos();itr!=end_infos(); ++itr) ++n; return n;}
count_formats()156     size_t count_formats() const {size_t n=0; for(auto itr=begin_formats();itr!=end_formats(); ++itr) ++n; return n;}
count_samples()157     size_t count_samples() const {size_t n=0; for(auto itr=begin_samples();itr!=end_samples(); ++itr) ++n; return n;}
158 
find_allele(size_t i)159     const char* find_allele(size_t i) const
160         {auto a=begin_alleles(); for(size_t j=0; j<i; ++j) {assert(a!=end_alleles()); ++a;} return *a;}
find_sample(size_t i)161     const char* find_sample(size_t i) const // N.B. Inefficient.
162         {auto a=begin_samples(); for(size_t j=0; j<i; ++j) {assert(a!=end_samples()); ++a;} return *a;}
163 
is_monomorphic()164     bool is_monomorphic() const {return **++begin_alleles() == '.';}
165     bool is_snp() const;
166     size_t index_of_gt_subfield(const char* format_key) const; // SIZE_MAX if not found.
167 
168     // Record creation functions.
169     void clear();
170     void append_chrom(const string& s);
171     void append_pos(size_t pos0);
172     void append_id(const string& s = ".");
173     void append_allele(Nt2 nt);
174     void append_allele(const string& s);
175     void append_qual(long phred_qual = -1);
176     void append_filters(const string& s = ".");
177     void append_info(const string& s);
178     void append_format(const string& s);
179     void append_sample(const string& s);
180 
181     friend ostream& operator<< (ostream& os, const VcfRecord& r) {r.print(os); return os;}
182     friend VersatileWriter& operator<< (VersatileWriter& w, const VcfRecord& r) {r.print(w); return w;}
183 
184     static const char allele_sep = ',';
185     static const char filter_sep = ';';
186     static const char info_sep = ';';
187     static const char format_sep = ':';
188     static const char sample_sep = ':';
189 
190     struct util {
191         static string fmt_info_af(const vector<double>& alt_freqs);
192         static string fmt_gt_gl(const vector<Nt2>& alleles, const GtLiks& liks);
193 
194         static pair<long,long> parse_gt_gt(const char* gt_str);
195         static size_t parse_gt_dp(const char* gt_str, size_t dp_index);
196         static Counts<Nt2> parse_gt_ad(const char* gt_str, size_t ad_index, const vector<Nt2>& alleles);
197         static uint8_t parse_gt_gq(const char* gt_str, size_t gq_index);
198         static GtLiks parse_gt_gl(const char* gt_str, size_t gl_index, const vector<Nt2>& alleles);
199 
200         static const char* find_gt_subfield(const char* sample, size_t n);
201         static void skip_gt_subfields(const char** start, size_t n);
n_possible_genotypesutil202         static size_t n_possible_genotypes(size_t n_alleles) {return (n_alleles*(n_alleles+1))/2;}
203     };
204 };
205 
206 /*
207  * VcfParser
208  * ==========
209  */
210 class VcfParser {
211     VersatileLineReader file_;
212     VcfHeader header_;
213 
214     // Parse the header.
215     void read_header();
216 
217 public:
218     VcfParser(const string& path);
VcfParser()219     VcfParser(): file_(), header_() {};
220 
next_record(VcfRecord & rec)221     bool next_record(VcfRecord& rec) {
222         try {
223             const char* line;
224             size_t len;
225             if (!file_.getline(line, len))
226                 return false;
227             rec.assign(line, len, header_);
228             return true;
229         } catch (const exception& e) {
230             cerr << "Error: At line " << file_.line_number()
231                  << " in file '" << file_.path () << "'.\n";
232             throw e;
233         }
234     }
235 
236     int open(string &path);
header()237     const VcfHeader& header() const {return header_;};
path()238     const string& path() const {return file_.path();};
line_number()239     size_t line_number() const {return file_.line_number();}
240 };
241 
242 /*
243  * VcfWriter
244  * ==========
245  * (This has become an empty shell...)
246  */
247 class VcfWriter {
248 private:
249     VersatileWriter file_;
250     const VcfHeader header_;
251 
252 public:
VcfWriter(const string & path,VcfHeader && header)253     VcfWriter(const string& path, VcfHeader&& header)
254         : file_(path), header_(header)
255         {file_ << header_;}
256 
header()257     const VcfHeader& header() const {return header_;}
write_record(const VcfRecord & r)258     void write_record(const VcfRecord& r)
259         {assert(r.count_samples()==header_.samples().size()); file_ << r;}
260 
file()261     VersatileWriter& file() {return file_;}
262 };
263 
264 /*
265  * Inline methods.
266  * ==========
267  */
268 
269 template<typename OStream>
print(OStream & os)270 void VcfHeader::print(OStream& os) const {
271     for(const VcfMeta& m : meta())
272         os << "##" << m.key() << "=" << m.value() << "\n";
273 
274     os << VcfHeader::std_fields;
275     if(!samples().empty())
276         os << "\tFORMAT";
277     for(const string& s : samples())
278         os << '\t' << s;
279     os << '\n';
280 }
281 
282 template<typename OStream>
print(OStream & os)283 void VcfRecord::print(OStream& os) const {
284 
285     os << chrom()
286        << '\t' << pos() + 1
287        << '\t' << id();
288 
289     // REF & ALT
290     iterator itr = begin_alleles();
291     iterator end = end_alleles();
292     assert(itr!=end);
293     os << '\t' << *itr;
294     ++itr;
295     if (itr == end) {
296         assert(strcmp(*itr, ".")==0);
297         os << '\t' << '.';
298     } else {
299         os << '\t' << *itr;
300         ++itr;
301         for(; itr!=end; ++itr)
302             os << allele_sep << *itr;
303     }
304 
305     //QUAL
306     os << '\t' << qual();
307 
308     //FILTER
309     os << '\t' << filters();
310 
311     //INFO
312     itr = begin_infos();
313     end = end_infos();
314     if (itr == end) {
315         assert(strcmp(*itr, ".")==0);
316         os << '\t' << '.';
317     } else {
318         os << '\t' << *itr;
319         ++itr;
320         for(; itr!=end; ++itr)
321             os << info_sep << *itr;
322     }
323 
324     if (begin_samples() != end_samples()) {
325         //FORMAT
326         itr = begin_formats();
327         end = end_formats();
328         if (itr == end) {
329             assert(strcmp(*itr, ".")==0);
330             os << '\t' << '.';
331         } else {
332             os << '\t' << *itr;
333             ++itr;
334             for(; itr!=end; ++itr)
335                 os << format_sep << *itr;
336         }
337 
338         //SAMPLES
339         for (itr=begin_samples(); itr!=end_samples(); ++itr)
340             os << '\t' << *itr;
341     }
342 
343     os << '\n';
344 }
345 
346 inline
append_chrom(const string & s)347 void VcfRecord::append_chrom(const string& s) {
348     assert(buffer_.empty());
349     append(s);
350 }
351 
352 inline
append_pos(size_t pos0)353 void VcfRecord::append_pos(size_t pos0) {
354     assert(pos_ == SIZE_MAX);
355     pos_ = buffer_.size();
356     char s[32];
357     size_t len = sprintf(s, "%zu", pos0 + 1);
358     append(s, len);
359 }
360 
361 inline
append_id(const string & s)362 void VcfRecord::append_id(const string& s) {
363     assert(pos_ != SIZE_MAX);
364     assert(allele0_ == SIZE_MAX);
365     append(s);
366 }
367 
368 inline
append_allele(Nt2 nt)369 void VcfRecord::append_allele(Nt2 nt) {
370     assert(pos_ != SIZE_MAX);
371     assert(buffer_.size() > pos_ + strlen(buffer_.data() + pos_) + 1); // id.
372     assert(qual_ == SIZE_MAX);
373     if (allele0_ == SIZE_MAX)
374         allele0_ = buffer_.size();
375     char s[] = {char(nt), '\0'};
376     append(s, 1);
377 }
378 
379 inline
append_allele(const string & s)380 void VcfRecord::append_allele(const string& s) {
381     assert(pos_ != SIZE_MAX);
382     assert(buffer_.size() > pos_ + strlen(buffer_.data() + pos_) + 1); // id.
383     assert(qual_ == SIZE_MAX);
384     if (allele0_ == SIZE_MAX)
385         allele0_ = buffer_.size();
386     append(s);
387 }
388 
389 inline
append_qual(long phred_qual)390 void VcfRecord::append_qual(long phred_qual) {
391     assert(allele0_ != SIZE_MAX);
392     assert(qual_ == SIZE_MAX);
393     qual_ = buffer_.size();
394     if (phred_qual >= 0) {
395         char s[32];
396         size_t len = sprintf(s, "%ld", phred_qual);
397         append(s, len);
398     } else {
399         append(".", 1);
400     }
401 }
402 
403 inline
append_filters(const string & s)404 void VcfRecord::append_filters(const string& s) {
405     assert(qual_ != SIZE_MAX);
406     assert(info0_ == SIZE_MAX);
407     append(s);
408 }
409 
410 inline
append_info(const string & s)411 void VcfRecord::append_info(const string& s) {
412     assert(qual_ != SIZE_MAX);
413     assert(buffer_.size() > qual_ + strlen(buffer_.data() + qual_) + 1); // filters.
414     assert(format0_ == SIZE_MAX);
415     if (info0_ == SIZE_MAX)
416         info0_ = buffer_.size();
417     append(s);
418 }
419 
420 inline
append_format(const string & s)421 void VcfRecord::append_format(const string& s) {
422     assert(info0_ != SIZE_MAX);
423     assert(sample0_ == SIZE_MAX);
424     if (format0_ == SIZE_MAX)
425         format0_ = buffer_.size();
426     append(s);
427 }
428 
429 inline
append_sample(const string & s)430 void VcfRecord::append_sample(const string& s) {
431     assert(format0_ != SIZE_MAX);
432     if (sample0_ == SIZE_MAX)
433         sample0_ = buffer_.size();
434     append(s);
435 }
436 
437 inline
clear()438 void VcfRecord::clear() {
439     buffer_.resize(0);
440     pos_ = SIZE_MAX;
441     allele0_ = SIZE_MAX;
442     qual_ = SIZE_MAX;
443     info0_ = SIZE_MAX;
444     format0_ = SIZE_MAX;
445     sample0_ = SIZE_MAX;
446 }
447 
448 inline
index_of_gt_subfield(const char * key)449 size_t VcfRecord::index_of_gt_subfield(const char* key) const {
450     size_t i=0;
451     for (auto f=begin_formats(); f!=end_formats(); ++f) {
452         if (strcmp(*f, key) == 0)
453             return i;
454         ++i;
455     }
456     return SIZE_MAX;
457 }
458 
459 inline
parse_gt_gt(const char * sample)460 pair<long,long> VcfRecord::util::parse_gt_gt(const char* sample)
461 { try {
462     assert(sample != NULL && sample[0] != '\0');
463     if (*sample == '.')
464         return {-1, -1};
465     char* end;
466     long first = strtol(sample, &end, 10);
467     if (end == sample || (*end != '/' && *end != '|'))
468         throw exception();
469     sample = end;
470     ++sample;
471     if (*sample == '.')
472         // Incomplete, e.g. "1/."
473         return {-1, -1};
474     long second = strtol(sample, &end, 10);
475     if (end == sample || (*end != ':' && *end != '\0'))
476         throw exception();
477     if (first < 0 || second < 0)
478         throw exception();
479     return {first, second};
480 } catch (exception&) {
481     cerr << "Error: Malformed VCF sample field '" << sample << "'.\n";
482     throw;
483 }}
484 
485 inline
is_snp()486 bool VcfRecord::is_snp() const {
487     iterator a = begin_alleles();
488     const iterator end = end_alleles();
489     if (strlen(*a) > 1)
490         return false;
491     ++a;
492     if (a == end)
493         return false;
494     assert(**a != '.');
495     if (**a == '*')
496         return false;
497     for (; a != end; ++a)
498         if (strlen(*a) > 1)
499             return false;
500     return true;
501 }
502 
503 inline
find_gt_subfield(const char * sample,size_t n)504 const char* VcfRecord::util::find_gt_subfield(const char* sample, size_t n) {
505     const char* subf = sample;
506     for(size_t i=0; i<n; ++i) {
507         subf = strchr(subf, ':');
508         if (subf == NULL)
509             break;
510         ++subf;
511     }
512     return subf;
513 }
514 
515 #endif // __VCF_H__
516