1 // Author: Derek Barnett
2 
3 #include <pbbam/vcf/VcfFormat.h>
4 
5 #include <cassert>
6 #include <fstream>
7 #include <iostream>
8 #include <sstream>
9 #include <stdexcept>
10 #include <string>
11 
12 #include <htslib/vcf.h>
13 
14 #include <pbbam/StringUtilities.h>
15 #include <pbbam/vcf/VcfHeader.h>
16 
17 namespace PacBio {
18 namespace VCF {
19 
20 namespace {  // anonymous
21 
22 // using htslib's current version for better compatibility
23 static constexpr const char current_version[] = "VCFv4.2";
24 
25 namespace Tokens {
26 
27 static constexpr const char file_format[] = "fileformat";
28 
29 static constexpr const char double_hash[] = "##";
30 static constexpr const char contig_lead[] = "##contig=<";
31 static constexpr const char filter_lead[] = "##FILTER=<";
32 static constexpr const char format_lead[] = "##FORMAT=<";
33 static constexpr const char info_lead[] = "##INFO=<";
34 static constexpr const char chrom_lead[] = "#CHROM";
35 
36 static constexpr const char id[] = "ID";
37 static constexpr const char number[] = "Number";
38 static constexpr const char type[] = "Type";
39 static constexpr const char description[] = "Description";
40 static constexpr const char source[] = "Source";
41 static constexpr const char version[] = "Version";
42 
43 }  // namespace Tokens
44 
QuotedText(const std::string & d)45 std::string QuotedText(const std::string& d) { return "\"" + d + "\""; }
46 
UnquotedText(const std::string & d)47 std::string UnquotedText(const std::string& d)
48 {
49     if (d.size() < 2 || d.front() != '"' || d.back() != '"')
50         throw std::runtime_error{"VCF format error: description text not quoted: " + d};
51     return d.substr(1, d.size() - 2);
52 }
53 
54 }  // namespace anonymous
55 
CurrentVersion()56 const char* VcfFormat::CurrentVersion() { return current_version; }
57 
FormattedContigDefinition(const ContigDefinition & def)58 std::string VcfFormat::FormattedContigDefinition(const ContigDefinition& def)
59 {
60     std::ostringstream text;
61 
62     // ID
63     text << Tokens::contig_lead << Tokens::id << '=' << def.Id();
64 
65     // attributes
66     if (!def.Attributes().empty()) {
67         text << ',';
68         bool first = true;
69         for (const auto& attr : def.Attributes()) {
70             if (!first) text << ',';
71             text << attr.first << '=' << attr.second;
72             first = false;
73         }
74     }
75     text << '>';
76     return text.str();
77 }
78 
FormattedFilterDefinition(const FilterDefinition & def)79 std::string VcfFormat::FormattedFilterDefinition(const FilterDefinition& def)
80 {
81     std::ostringstream text;
82     text << Tokens::filter_lead << Tokens::id << '=' << def.Id() << ',' << Tokens::description
83          << '=' << QuotedText(def.Description()) << '>';
84     return text.str();
85 }
86 
FormattedFormatDefinition(const FormatDefinition & def)87 std::string VcfFormat::FormattedFormatDefinition(const FormatDefinition& def)
88 {
89     std::ostringstream text;
90     text << Tokens::format_lead << Tokens::id << '=' << def.Id() << ',' << Tokens::number << '='
91          << def.Number() << ',' << Tokens::type << '=' << def.Type() << ',' << Tokens::description
92          << '=' << QuotedText(def.Description()) << '>';
93     return text.str();
94 }
95 
FormattedGeneralDefinition(const GeneralDefinition & def)96 std::string VcfFormat::FormattedGeneralDefinition(const GeneralDefinition& def)
97 {
98     std::ostringstream text;
99     text << Tokens::double_hash << def.Id() << '=' << def.Text();
100     return text.str();
101 }
102 
FormattedInfoDefinition(const InfoDefinition & def)103 std::string VcfFormat::FormattedInfoDefinition(const InfoDefinition& def)
104 {
105     std::ostringstream text;
106     text << Tokens::info_lead << Tokens::id << '=' << def.Id() << ',' << Tokens::number << '='
107          << def.Number() << ',' << Tokens::type << '=' << def.Type() << ',' << Tokens::description
108          << '=' << QuotedText(def.Description());
109 
110     if (def.Source().is_initialized() && !def.Source().get().empty())
111         text << ',' << Tokens::source << '=' << QuotedText(def.Source().get());
112 
113     if (def.Version().is_initialized() && !def.Version().get().empty())
114         text << ',' << Tokens::version << '=' << QuotedText(def.Version().get());
115 
116     text << '>';
117     return text.str();
118 }
119 
FormattedHeader(const VcfHeader & header)120 std::string VcfFormat::FormattedHeader(const VcfHeader& header)
121 {
122     std::ostringstream out;
123 
124     const auto& fileformat = header.GeneralDefinition(Tokens::file_format);
125     out << FormattedGeneralDefinition(fileformat) << '\n';
126 
127     // remaining general definiitions
128     for (const auto& def : header.GeneralDefinitions()) {
129         if (def.Id() != Tokens::file_format) out << FormattedGeneralDefinition(def) << '\n';
130     }
131 
132     // ##contig
133     for (const auto& contig : header.ContigDefinitions())
134         out << FormattedContigDefinition(contig) << '\n';
135 
136     // ##FILTER
137     for (const auto& filter : header.FilterDefinitions())
138         out << FormattedFilterDefinition(filter) << '\n';
139 
140     // ##INFO
141     for (const auto& info : header.InfoDefinitions())
142         out << FormattedInfoDefinition(info) << '\n';
143 
144     // ##FORMAT
145     for (const auto& format : header.FormatDefinitions())
146         out << FormattedFormatDefinition(format) << '\n';
147 
148     // fixed headers
149     out << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
150 
151     // samples
152     const auto& samples = header.Samples();
153     if (!samples.empty()) {
154         out << "\tFORMAT";
155         for (const auto& sample : samples)
156             out << '\t' << sample;
157     }
158 
159     return out.str();
160 }
161 
ParsedContigDefinition(std::string line)162 ContigDefinition VcfFormat::ParsedContigDefinition(std::string line)
163 {
164     // should already be checked by "normal" code path
165     assert(line.find(Tokens::contig_lead) == 0);
166 
167     // substring between brackets
168     const auto lastBracketPos = line.find_last_of('>');
169     if (lastBracketPos == std::string::npos)
170         throw std::runtime_error{"VCF format error: malformed ##contig line: " + line};
171     line = std::string(line.cbegin() + 10, line.cbegin() + lastBracketPos);
172 
173     std::string id;
174     std::vector<std::pair<std::string, std::string>> attributes;
175 
176     const auto fields = PacBio::BAM::Split(line, ',');
177     for (const auto& field : fields) {
178         const auto tokens = PacBio::BAM::Split(field, '=');
179         if (tokens.size() != 2) {
180             throw std::runtime_error{"VCF format error: malformed ##contig line: " + line};
181         }
182         if (tokens[0] == Tokens::id)
183             id = tokens[1];
184         else
185             attributes.push_back(std::make_pair(tokens[0], tokens[1]));
186     }
187 
188     return ContigDefinition{std::move(id), std::move(attributes)};
189 }
190 
ParsedFilterDefinition(std::string line)191 FilterDefinition VcfFormat::ParsedFilterDefinition(std::string line)
192 {
193     // should already be checked by "normal" code path
194     assert(line.find(Tokens::filter_lead) == 0);
195 
196     // substring between brackets
197     const auto lastBracketPos = line.find_last_of('>');
198     if (lastBracketPos == std::string::npos)
199         throw std::runtime_error{"VCF format error: malformed FILTER line: " + line};
200     line = std::string(line.cbegin() + 10, line.cbegin() + lastBracketPos);
201 
202     std::string id;
203     std::string description;
204 
205     const auto fields = PacBio::BAM::Split(line, ',');
206     for (const auto& field : fields) {
207         const auto tokens = PacBio::BAM::Split(field, '=');
208         if (tokens.size() != 2) {
209             throw std::runtime_error{"VCF format error: malformed FILTER line: " + line};
210         }
211         if (tokens[0] == Tokens::id)
212             id = tokens[1];
213         else if (tokens[0] == Tokens::description) {
214             description = UnquotedText(tokens[1]);
215         } else
216             throw std::runtime_error{"VCF format error: unrecognized FILTER field: " + tokens[0]};
217     }
218 
219     return FilterDefinition{std::move(id), std::move(description)};
220 }
221 
ParsedFormatDefinition(std::string line)222 FormatDefinition VcfFormat::ParsedFormatDefinition(std::string line)
223 {
224     // should already be checked by "normal" code path
225     assert(line.find(Tokens::format_lead) == 0);
226 
227     // substring between brackets
228     const auto lastBracketPos = line.find_last_of('>');
229     if (lastBracketPos == std::string::npos)
230         throw std::runtime_error{"VCF format error: malformed FORMAT line: " + line};
231     line = std::string(line.cbegin() + 10, line.cbegin() + lastBracketPos);
232 
233     std::string id;
234     std::string number;
235     std::string type;
236     std::string description;
237 
238     const auto fields = PacBio::BAM::Split(line, ',');
239     for (const auto& field : fields) {
240         const auto tokens = PacBio::BAM::Split(field, '=');
241         if (tokens.size() != 2) {
242             throw std::runtime_error{"VCF format error: malformed FORMAT line: " + line};
243         }
244         if (tokens[0] == Tokens::id)
245             id = tokens[1];
246         else if (tokens[0] == Tokens::number)
247             number = tokens[1];
248         else if (tokens[0] == Tokens::type)
249             type = tokens[1];
250         else if (tokens[0] == Tokens::description) {
251             description = UnquotedText(tokens[1]);
252         } else
253             throw std::runtime_error{"VCF format error: unrecognized FORMAT field: " + tokens[0]};
254     }
255 
256     return FormatDefinition{std::move(id), std::move(number), std::move(type),
257                             std::move(description)};
258 }
259 
ParsedGeneralDefinition(const std::string & line)260 GeneralDefinition VcfFormat::ParsedGeneralDefinition(const std::string& line)
261 {
262     const auto tokens = PacBio::BAM::Split(line, '=');
263     if (tokens.size() != 2 || tokens[0].find(Tokens::double_hash) != 0) {
264         throw std::runtime_error{"VCF format error: malformed header line: " + line};
265     }
266 
267     return GeneralDefinition{tokens[0].substr(2), tokens[1]};
268 }
269 
ParsedInfoDefinition(std::string line)270 InfoDefinition VcfFormat::ParsedInfoDefinition(std::string line)
271 {
272     // should already be checked by "normal" code path
273     assert(line.find(Tokens::info_lead) == 0);
274 
275     // substring between brackets
276     const auto lastBracketPos = line.find_last_of('>');
277     if (lastBracketPos == std::string::npos)
278         throw std::runtime_error{"VCF format error: malformed INFO line: " + line};
279     line = std::string(line.cbegin() + 8, line.cbegin() + lastBracketPos);
280 
281     std::string id;
282     std::string number;
283     std::string type;
284     std::string description;
285     std::string source;
286     std::string version;
287 
288     const auto fields = PacBio::BAM::Split(line, ',');
289     for (const auto& field : fields) {
290         const auto tokens = PacBio::BAM::Split(field, '=');
291         if (tokens.size() != 2) {
292             throw std::runtime_error{"VCF format error: malformed INFO line: " + line};
293         }
294         if (tokens[0] == Tokens::id)
295             id = tokens[1];
296         else if (tokens[0] == Tokens::number)
297             number = tokens[1];
298         else if (tokens[0] == Tokens::type)
299             type = tokens[1];
300         else if (tokens[0] == Tokens::description) {
301             description = UnquotedText(tokens[1]);
302         } else if (tokens[0] == Tokens::source) {
303             source = UnquotedText(tokens[1]);
304         } else if (tokens[0] == Tokens::version) {
305             version = UnquotedText(tokens[1]);
306         } else
307             throw std::runtime_error{"VCF format error: unrecognized INFO field: " + tokens[0]};
308     }
309 
310     return InfoDefinition{std::move(id),          std::move(number), std::move(type),
311                           std::move(description), std::move(source), std::move(version)};
312 }
313 
ParsedHeader(const std::string & hdrText)314 VcfHeader VcfFormat::ParsedHeader(const std::string& hdrText)
315 {
316     VcfHeader hdr;
317 
318     std::istringstream text{hdrText};
319     std::string line;
320 
321     // quick check for fileformat - should be the first line
322     std::getline(text, line);
323     {
324         auto genDef = ParsedGeneralDefinition(line);
325         if (genDef.Id() != Tokens::file_format)
326             throw std::runtime_error{"VCF format error: file must begin with #fileformat line"};
327         hdr.AddGeneralDefinition(std::move(genDef));
328     }
329 
330     // read through rest of header
331     bool chromLineFound = false;
332     for (; std::getline(text, line);) {
333         if (line.empty()) continue;
334 
335         // info line
336         if (line.find(Tokens::info_lead) == 0) hdr.AddInfoDefinition(ParsedInfoDefinition(line));
337 
338         // filter line
339         else if (line.find(Tokens::filter_lead) == 0)
340             hdr.AddFilterDefinition(ParsedFilterDefinition(line));
341 
342         // format line
343         else if (line.find(Tokens::format_lead) == 0)
344             hdr.AddFormatDefinition(ParsedFormatDefinition(line));
345 
346         // contig line
347         else if (line.find(Tokens::contig_lead) == 0)
348             hdr.AddContigDefinition(ParsedContigDefinition(line));
349 
350         // general comment line
351         //
352         // NOTE: Check this after all other specific header line types. This
353         //       catches all remaining lines starting with "##"
354         //
355         else if (line.find(Tokens::double_hash) == 0)
356             hdr.AddGeneralDefinition(ParsedGeneralDefinition(line));
357 
358         // CHROM line (maybe w/ samples)
359         else if (line.find(Tokens::chrom_lead) == 0) {
360             std::vector<Sample> samples;
361 
362             // If samples are present, skip the fixed colums & FORMAT column (9)
363             // and read the remaining column labels as sample names.
364             //
365             auto columns = PacBio::BAM::Split(line, '\t');
366             for (size_t i = 9; i < columns.size(); ++i)
367                 samples.push_back(std::move(columns[i]));
368             hdr.Samples(std::move(samples));
369 
370             // quit header parsing after CHROM line
371             chromLineFound = true;
372             break;
373         } else
374             throw std::runtime_error{"VCF format error: unexpected line found in header:\n" + line};
375     }
376 
377     if (!chromLineFound) throw std::runtime_error{"VCF format error: CHROM column line is missing"};
378 
379     return hdr;
380 }
381 
HeaderFromFile(const std::string & fn)382 VcfHeader VcfFormat::HeaderFromFile(const std::string& fn)
383 {
384     std::ifstream in(fn);
385     return HeaderFromStream(in);
386 }
387 
HeaderFromStream(std::istream & in)388 VcfHeader VcfFormat::HeaderFromStream(std::istream& in)
389 {
390     std::stringstream text;
391 
392     std::string line;
393     while (std::getline(in, line)) {
394         if (line.empty()) continue;
395         if (line.front() == '#')
396             text << line << '\n';
397         else
398             break;
399     }
400 
401     return ParsedHeader(text.str());
402 }
403 
ParsedInfoField(const std::string & text)404 InfoField VcfFormat::ParsedInfoField(const std::string& text)
405 {
406     const auto& tokens = PacBio::BAM::Split(text, '=');
407     if (tokens.empty()) throw std::runtime_error{"VCF format error: malformed INFO field: " + text};
408 
409     // required ID
410     InfoField result;
411     result.id = tokens.at(0);
412     if (tokens.size() == 1) return result;
413 
414     // optional value or values
415     const auto& valueStr = tokens.at(1);
416     const auto commaFound = valueStr.find(',');
417     if (commaFound != std::string::npos) {
418         std::vector<std::string> values;
419         for (auto&& value : PacBio::BAM::Split(valueStr, ','))
420             values.push_back(std::move(value));
421         result.values = std::move(values);
422     } else
423         result.value = valueStr;
424 
425     return result;
426 }
427 
ParsedInfoFields(const std::string & text)428 std::vector<InfoField> VcfFormat::ParsedInfoFields(const std::string& text)
429 {
430     std::vector<InfoField> result;
431     const auto& fields = PacBio::BAM::Split(text, ';');
432     for (const auto& field : fields)
433         result.push_back(ParsedInfoField(field));
434     return result;
435 }
436 
ParsedVariant(const std::string & line)437 VcfVariant VcfFormat::ParsedVariant(const std::string& line)
438 {
439     const auto fields = PacBio::BAM::Split(line, '\t');
440     if (fields.size() < 7)
441         throw std::runtime_error{"VCF format error: record is missing required fields: " + line};
442 
443     // CHROM POS ID REF ALT REF
444     auto chrom = fields.at(0);
445     auto pos = std::stoi(fields.at(1));
446     auto id = fields.at(2);
447     auto ref = fields.at(3);
448     auto alt = fields.at(4);
449 
450     VcfVariant var{std::move(id), std::move(chrom), std::move(pos), std::move(ref), std::move(alt)};
451 
452     // QUAL
453     const auto& qualStr = fields.at(5);
454     const float qual = (qualStr == "." ? NAN : stof(qualStr));
455     var.Quality(qual);
456 
457     // FILTER
458     auto filter = fields.at(6);
459     var.Filter(std::move(filter));
460 
461     // INFO (allow empty)
462     if (fields.size() >= 8) var.InfoFields(ParsedInfoFields(fields.at(7)));
463 
464     // GENOTYPE (samples)
465     if (fields.size() > 9) {
466         std::vector<std::string> genotypeIds;
467         const auto& formatField = fields.at(8);
468         const auto formatFields = PacBio::BAM::Split(formatField, ':');
469         for (auto&& genotypeId : formatFields)
470             genotypeIds.push_back(genotypeId);
471         var.GenotypeIds(std::move(genotypeIds));
472 
473         // per-sample
474         std::vector<GenotypeField> sampleGenotypes;
475         for (size_t i = 9; i < fields.size(); ++i) {
476 
477             GenotypeField g;
478 
479             const auto& sampleField = fields.at(i);
480             const auto fieldValues = PacBio::BAM::Split(sampleField, ':');
481             for (const auto fieldValue : fieldValues) {
482                 GenotypeData data;
483                 const auto genotypeDataValues = PacBio::BAM::Split(fieldValue, ',');
484                 if (genotypeDataValues.size() == 1)
485                     data.value = genotypeDataValues.at(0);
486                 else
487                     data.values = genotypeDataValues;
488                 g.data.push_back(std::move(data));
489             }
490 
491             sampleGenotypes.push_back(std::move(g));
492         }
493         var.Genotypes(std::move(sampleGenotypes));
494     }
495 
496     return var;
497 }
498 
FormattedInfoField(const InfoField & field)499 std::string VcfFormat::FormattedInfoField(const InfoField& field)
500 {
501     std::ostringstream out;
502     out << field.id;
503     if (field.value.is_initialized()) {
504         out << '=' << field.value.get();
505     } else if (field.values.is_initialized()) {
506         out << '=';
507         bool first = true;
508         for (const auto& value : field.values.get()) {
509             if (!first) out << ',';
510             out << value;
511             first = false;
512         }
513     }
514     return out.str();
515 }
516 
FormattedInfoFields(const std::vector<InfoField> & fields)517 std::string VcfFormat::FormattedInfoFields(const std::vector<InfoField>& fields)
518 {
519     std::ostringstream out;
520     bool first = true;
521     for (const auto field : fields) {
522         if (!first) out << ';';
523         out << FormattedInfoField(field);
524         first = false;
525     }
526     return out.str();
527 }
528 
FormattedVariant(const VcfVariant & var)529 std::string VcfFormat::FormattedVariant(const VcfVariant& var)
530 {
531     std::ostringstream out;
532     out << var.Chrom() << '\t' << var.Position() << '\t' << var.Id() << '\t' << var.RefAllele()
533         << '\t' << var.AltAllele() << '\t'
534         << (var.IsQualityMissing() ? "." : std::to_string(var.Quality())) << '\t' << var.Filter()
535         << '\t' << FormattedInfoFields(var.InfoFields());
536 
537     const auto& genotypeIds = var.GenotypeIds();
538     if (!genotypeIds.empty()) {
539         out << '\t';
540         bool firstId = true;
541         for (const auto id : genotypeIds) {
542             if (!firstId) out << ':';
543             out << id;
544             firstId = false;
545         }
546 
547         const auto& sampleGenotypes = var.Genotypes();
548         for (const auto sampleGenotype : sampleGenotypes) {
549             out << '\t';
550             bool firstDataEntry = true;
551             for (const auto& d : sampleGenotype.data) {
552                 if (!firstDataEntry) out << ':';
553                 if (d.value.is_initialized())
554                     out << d.value.get();
555                 else {
556                     assert(d.values.is_initialized());
557                     bool firstDatapoint = true;
558                     for (const auto& datapoint : d.values.get()) {
559                         if (!firstDatapoint) out << ',';
560                         out << datapoint;
561                         firstDatapoint = false;
562                     }
563                 }
564                 firstDataEntry = false;
565             }
566         }
567     }
568     return out.str();
569 }
570 
571 }  // namespace VCF
572 }  // namespace PacBio
573