1 // Author: Derek Barnett
2
3 #include <pbbam/vcf/VcfFormat.h>
4
5 #include <cassert>
6 #include <fstream>
7 #include <iostream>
8 #include <sstream>
9 #include <stdexcept>
10 #include <string>
11
12 #include <htslib/vcf.h>
13
14 #include <pbbam/StringUtilities.h>
15 #include <pbbam/vcf/VcfHeader.h>
16
17 namespace PacBio {
18 namespace VCF {
19
20 namespace { // anonymous
21
22 // using htslib's current version for better compatibility
23 static constexpr const char current_version[] = "VCFv4.2";
24
25 namespace Tokens {
26
27 static constexpr const char file_format[] = "fileformat";
28
29 static constexpr const char double_hash[] = "##";
30 static constexpr const char contig_lead[] = "##contig=<";
31 static constexpr const char filter_lead[] = "##FILTER=<";
32 static constexpr const char format_lead[] = "##FORMAT=<";
33 static constexpr const char info_lead[] = "##INFO=<";
34 static constexpr const char chrom_lead[] = "#CHROM";
35
36 static constexpr const char id[] = "ID";
37 static constexpr const char number[] = "Number";
38 static constexpr const char type[] = "Type";
39 static constexpr const char description[] = "Description";
40 static constexpr const char source[] = "Source";
41 static constexpr const char version[] = "Version";
42
43 } // namespace Tokens
44
QuotedText(const std::string & d)45 std::string QuotedText(const std::string& d) { return "\"" + d + "\""; }
46
UnquotedText(const std::string & d)47 std::string UnquotedText(const std::string& d)
48 {
49 if (d.size() < 2 || d.front() != '"' || d.back() != '"')
50 throw std::runtime_error{"VCF format error: description text not quoted: " + d};
51 return d.substr(1, d.size() - 2);
52 }
53
54 } // namespace anonymous
55
CurrentVersion()56 const char* VcfFormat::CurrentVersion() { return current_version; }
57
FormattedContigDefinition(const ContigDefinition & def)58 std::string VcfFormat::FormattedContigDefinition(const ContigDefinition& def)
59 {
60 std::ostringstream text;
61
62 // ID
63 text << Tokens::contig_lead << Tokens::id << '=' << def.Id();
64
65 // attributes
66 if (!def.Attributes().empty()) {
67 text << ',';
68 bool first = true;
69 for (const auto& attr : def.Attributes()) {
70 if (!first) text << ',';
71 text << attr.first << '=' << attr.second;
72 first = false;
73 }
74 }
75 text << '>';
76 return text.str();
77 }
78
FormattedFilterDefinition(const FilterDefinition & def)79 std::string VcfFormat::FormattedFilterDefinition(const FilterDefinition& def)
80 {
81 std::ostringstream text;
82 text << Tokens::filter_lead << Tokens::id << '=' << def.Id() << ',' << Tokens::description
83 << '=' << QuotedText(def.Description()) << '>';
84 return text.str();
85 }
86
FormattedFormatDefinition(const FormatDefinition & def)87 std::string VcfFormat::FormattedFormatDefinition(const FormatDefinition& def)
88 {
89 std::ostringstream text;
90 text << Tokens::format_lead << Tokens::id << '=' << def.Id() << ',' << Tokens::number << '='
91 << def.Number() << ',' << Tokens::type << '=' << def.Type() << ',' << Tokens::description
92 << '=' << QuotedText(def.Description()) << '>';
93 return text.str();
94 }
95
FormattedGeneralDefinition(const GeneralDefinition & def)96 std::string VcfFormat::FormattedGeneralDefinition(const GeneralDefinition& def)
97 {
98 std::ostringstream text;
99 text << Tokens::double_hash << def.Id() << '=' << def.Text();
100 return text.str();
101 }
102
FormattedInfoDefinition(const InfoDefinition & def)103 std::string VcfFormat::FormattedInfoDefinition(const InfoDefinition& def)
104 {
105 std::ostringstream text;
106 text << Tokens::info_lead << Tokens::id << '=' << def.Id() << ',' << Tokens::number << '='
107 << def.Number() << ',' << Tokens::type << '=' << def.Type() << ',' << Tokens::description
108 << '=' << QuotedText(def.Description());
109
110 if (def.Source().is_initialized() && !def.Source().get().empty())
111 text << ',' << Tokens::source << '=' << QuotedText(def.Source().get());
112
113 if (def.Version().is_initialized() && !def.Version().get().empty())
114 text << ',' << Tokens::version << '=' << QuotedText(def.Version().get());
115
116 text << '>';
117 return text.str();
118 }
119
FormattedHeader(const VcfHeader & header)120 std::string VcfFormat::FormattedHeader(const VcfHeader& header)
121 {
122 std::ostringstream out;
123
124 const auto& fileformat = header.GeneralDefinition(Tokens::file_format);
125 out << FormattedGeneralDefinition(fileformat) << '\n';
126
127 // remaining general definiitions
128 for (const auto& def : header.GeneralDefinitions()) {
129 if (def.Id() != Tokens::file_format) out << FormattedGeneralDefinition(def) << '\n';
130 }
131
132 // ##contig
133 for (const auto& contig : header.ContigDefinitions())
134 out << FormattedContigDefinition(contig) << '\n';
135
136 // ##FILTER
137 for (const auto& filter : header.FilterDefinitions())
138 out << FormattedFilterDefinition(filter) << '\n';
139
140 // ##INFO
141 for (const auto& info : header.InfoDefinitions())
142 out << FormattedInfoDefinition(info) << '\n';
143
144 // ##FORMAT
145 for (const auto& format : header.FormatDefinitions())
146 out << FormattedFormatDefinition(format) << '\n';
147
148 // fixed headers
149 out << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
150
151 // samples
152 const auto& samples = header.Samples();
153 if (!samples.empty()) {
154 out << "\tFORMAT";
155 for (const auto& sample : samples)
156 out << '\t' << sample;
157 }
158
159 return out.str();
160 }
161
ParsedContigDefinition(std::string line)162 ContigDefinition VcfFormat::ParsedContigDefinition(std::string line)
163 {
164 // should already be checked by "normal" code path
165 assert(line.find(Tokens::contig_lead) == 0);
166
167 // substring between brackets
168 const auto lastBracketPos = line.find_last_of('>');
169 if (lastBracketPos == std::string::npos)
170 throw std::runtime_error{"VCF format error: malformed ##contig line: " + line};
171 line = std::string(line.cbegin() + 10, line.cbegin() + lastBracketPos);
172
173 std::string id;
174 std::vector<std::pair<std::string, std::string>> attributes;
175
176 const auto fields = PacBio::BAM::Split(line, ',');
177 for (const auto& field : fields) {
178 const auto tokens = PacBio::BAM::Split(field, '=');
179 if (tokens.size() != 2) {
180 throw std::runtime_error{"VCF format error: malformed ##contig line: " + line};
181 }
182 if (tokens[0] == Tokens::id)
183 id = tokens[1];
184 else
185 attributes.push_back(std::make_pair(tokens[0], tokens[1]));
186 }
187
188 return ContigDefinition{std::move(id), std::move(attributes)};
189 }
190
ParsedFilterDefinition(std::string line)191 FilterDefinition VcfFormat::ParsedFilterDefinition(std::string line)
192 {
193 // should already be checked by "normal" code path
194 assert(line.find(Tokens::filter_lead) == 0);
195
196 // substring between brackets
197 const auto lastBracketPos = line.find_last_of('>');
198 if (lastBracketPos == std::string::npos)
199 throw std::runtime_error{"VCF format error: malformed FILTER line: " + line};
200 line = std::string(line.cbegin() + 10, line.cbegin() + lastBracketPos);
201
202 std::string id;
203 std::string description;
204
205 const auto fields = PacBio::BAM::Split(line, ',');
206 for (const auto& field : fields) {
207 const auto tokens = PacBio::BAM::Split(field, '=');
208 if (tokens.size() != 2) {
209 throw std::runtime_error{"VCF format error: malformed FILTER line: " + line};
210 }
211 if (tokens[0] == Tokens::id)
212 id = tokens[1];
213 else if (tokens[0] == Tokens::description) {
214 description = UnquotedText(tokens[1]);
215 } else
216 throw std::runtime_error{"VCF format error: unrecognized FILTER field: " + tokens[0]};
217 }
218
219 return FilterDefinition{std::move(id), std::move(description)};
220 }
221
ParsedFormatDefinition(std::string line)222 FormatDefinition VcfFormat::ParsedFormatDefinition(std::string line)
223 {
224 // should already be checked by "normal" code path
225 assert(line.find(Tokens::format_lead) == 0);
226
227 // substring between brackets
228 const auto lastBracketPos = line.find_last_of('>');
229 if (lastBracketPos == std::string::npos)
230 throw std::runtime_error{"VCF format error: malformed FORMAT line: " + line};
231 line = std::string(line.cbegin() + 10, line.cbegin() + lastBracketPos);
232
233 std::string id;
234 std::string number;
235 std::string type;
236 std::string description;
237
238 const auto fields = PacBio::BAM::Split(line, ',');
239 for (const auto& field : fields) {
240 const auto tokens = PacBio::BAM::Split(field, '=');
241 if (tokens.size() != 2) {
242 throw std::runtime_error{"VCF format error: malformed FORMAT line: " + line};
243 }
244 if (tokens[0] == Tokens::id)
245 id = tokens[1];
246 else if (tokens[0] == Tokens::number)
247 number = tokens[1];
248 else if (tokens[0] == Tokens::type)
249 type = tokens[1];
250 else if (tokens[0] == Tokens::description) {
251 description = UnquotedText(tokens[1]);
252 } else
253 throw std::runtime_error{"VCF format error: unrecognized FORMAT field: " + tokens[0]};
254 }
255
256 return FormatDefinition{std::move(id), std::move(number), std::move(type),
257 std::move(description)};
258 }
259
ParsedGeneralDefinition(const std::string & line)260 GeneralDefinition VcfFormat::ParsedGeneralDefinition(const std::string& line)
261 {
262 const auto tokens = PacBio::BAM::Split(line, '=');
263 if (tokens.size() != 2 || tokens[0].find(Tokens::double_hash) != 0) {
264 throw std::runtime_error{"VCF format error: malformed header line: " + line};
265 }
266
267 return GeneralDefinition{tokens[0].substr(2), tokens[1]};
268 }
269
ParsedInfoDefinition(std::string line)270 InfoDefinition VcfFormat::ParsedInfoDefinition(std::string line)
271 {
272 // should already be checked by "normal" code path
273 assert(line.find(Tokens::info_lead) == 0);
274
275 // substring between brackets
276 const auto lastBracketPos = line.find_last_of('>');
277 if (lastBracketPos == std::string::npos)
278 throw std::runtime_error{"VCF format error: malformed INFO line: " + line};
279 line = std::string(line.cbegin() + 8, line.cbegin() + lastBracketPos);
280
281 std::string id;
282 std::string number;
283 std::string type;
284 std::string description;
285 std::string source;
286 std::string version;
287
288 const auto fields = PacBio::BAM::Split(line, ',');
289 for (const auto& field : fields) {
290 const auto tokens = PacBio::BAM::Split(field, '=');
291 if (tokens.size() != 2) {
292 throw std::runtime_error{"VCF format error: malformed INFO line: " + line};
293 }
294 if (tokens[0] == Tokens::id)
295 id = tokens[1];
296 else if (tokens[0] == Tokens::number)
297 number = tokens[1];
298 else if (tokens[0] == Tokens::type)
299 type = tokens[1];
300 else if (tokens[0] == Tokens::description) {
301 description = UnquotedText(tokens[1]);
302 } else if (tokens[0] == Tokens::source) {
303 source = UnquotedText(tokens[1]);
304 } else if (tokens[0] == Tokens::version) {
305 version = UnquotedText(tokens[1]);
306 } else
307 throw std::runtime_error{"VCF format error: unrecognized INFO field: " + tokens[0]};
308 }
309
310 return InfoDefinition{std::move(id), std::move(number), std::move(type),
311 std::move(description), std::move(source), std::move(version)};
312 }
313
ParsedHeader(const std::string & hdrText)314 VcfHeader VcfFormat::ParsedHeader(const std::string& hdrText)
315 {
316 VcfHeader hdr;
317
318 std::istringstream text{hdrText};
319 std::string line;
320
321 // quick check for fileformat - should be the first line
322 std::getline(text, line);
323 {
324 auto genDef = ParsedGeneralDefinition(line);
325 if (genDef.Id() != Tokens::file_format)
326 throw std::runtime_error{"VCF format error: file must begin with #fileformat line"};
327 hdr.AddGeneralDefinition(std::move(genDef));
328 }
329
330 // read through rest of header
331 bool chromLineFound = false;
332 for (; std::getline(text, line);) {
333 if (line.empty()) continue;
334
335 // info line
336 if (line.find(Tokens::info_lead) == 0) hdr.AddInfoDefinition(ParsedInfoDefinition(line));
337
338 // filter line
339 else if (line.find(Tokens::filter_lead) == 0)
340 hdr.AddFilterDefinition(ParsedFilterDefinition(line));
341
342 // format line
343 else if (line.find(Tokens::format_lead) == 0)
344 hdr.AddFormatDefinition(ParsedFormatDefinition(line));
345
346 // contig line
347 else if (line.find(Tokens::contig_lead) == 0)
348 hdr.AddContigDefinition(ParsedContigDefinition(line));
349
350 // general comment line
351 //
352 // NOTE: Check this after all other specific header line types. This
353 // catches all remaining lines starting with "##"
354 //
355 else if (line.find(Tokens::double_hash) == 0)
356 hdr.AddGeneralDefinition(ParsedGeneralDefinition(line));
357
358 // CHROM line (maybe w/ samples)
359 else if (line.find(Tokens::chrom_lead) == 0) {
360 std::vector<Sample> samples;
361
362 // If samples are present, skip the fixed colums & FORMAT column (9)
363 // and read the remaining column labels as sample names.
364 //
365 auto columns = PacBio::BAM::Split(line, '\t');
366 for (size_t i = 9; i < columns.size(); ++i)
367 samples.push_back(std::move(columns[i]));
368 hdr.Samples(std::move(samples));
369
370 // quit header parsing after CHROM line
371 chromLineFound = true;
372 break;
373 } else
374 throw std::runtime_error{"VCF format error: unexpected line found in header:\n" + line};
375 }
376
377 if (!chromLineFound) throw std::runtime_error{"VCF format error: CHROM column line is missing"};
378
379 return hdr;
380 }
381
HeaderFromFile(const std::string & fn)382 VcfHeader VcfFormat::HeaderFromFile(const std::string& fn)
383 {
384 std::ifstream in(fn);
385 return HeaderFromStream(in);
386 }
387
HeaderFromStream(std::istream & in)388 VcfHeader VcfFormat::HeaderFromStream(std::istream& in)
389 {
390 std::stringstream text;
391
392 std::string line;
393 while (std::getline(in, line)) {
394 if (line.empty()) continue;
395 if (line.front() == '#')
396 text << line << '\n';
397 else
398 break;
399 }
400
401 return ParsedHeader(text.str());
402 }
403
ParsedInfoField(const std::string & text)404 InfoField VcfFormat::ParsedInfoField(const std::string& text)
405 {
406 const auto& tokens = PacBio::BAM::Split(text, '=');
407 if (tokens.empty()) throw std::runtime_error{"VCF format error: malformed INFO field: " + text};
408
409 // required ID
410 InfoField result;
411 result.id = tokens.at(0);
412 if (tokens.size() == 1) return result;
413
414 // optional value or values
415 const auto& valueStr = tokens.at(1);
416 const auto commaFound = valueStr.find(',');
417 if (commaFound != std::string::npos) {
418 std::vector<std::string> values;
419 for (auto&& value : PacBio::BAM::Split(valueStr, ','))
420 values.push_back(std::move(value));
421 result.values = std::move(values);
422 } else
423 result.value = valueStr;
424
425 return result;
426 }
427
ParsedInfoFields(const std::string & text)428 std::vector<InfoField> VcfFormat::ParsedInfoFields(const std::string& text)
429 {
430 std::vector<InfoField> result;
431 const auto& fields = PacBio::BAM::Split(text, ';');
432 for (const auto& field : fields)
433 result.push_back(ParsedInfoField(field));
434 return result;
435 }
436
ParsedVariant(const std::string & line)437 VcfVariant VcfFormat::ParsedVariant(const std::string& line)
438 {
439 const auto fields = PacBio::BAM::Split(line, '\t');
440 if (fields.size() < 7)
441 throw std::runtime_error{"VCF format error: record is missing required fields: " + line};
442
443 // CHROM POS ID REF ALT REF
444 auto chrom = fields.at(0);
445 auto pos = std::stoi(fields.at(1));
446 auto id = fields.at(2);
447 auto ref = fields.at(3);
448 auto alt = fields.at(4);
449
450 VcfVariant var{std::move(id), std::move(chrom), std::move(pos), std::move(ref), std::move(alt)};
451
452 // QUAL
453 const auto& qualStr = fields.at(5);
454 const float qual = (qualStr == "." ? NAN : stof(qualStr));
455 var.Quality(qual);
456
457 // FILTER
458 auto filter = fields.at(6);
459 var.Filter(std::move(filter));
460
461 // INFO (allow empty)
462 if (fields.size() >= 8) var.InfoFields(ParsedInfoFields(fields.at(7)));
463
464 // GENOTYPE (samples)
465 if (fields.size() > 9) {
466 std::vector<std::string> genotypeIds;
467 const auto& formatField = fields.at(8);
468 const auto formatFields = PacBio::BAM::Split(formatField, ':');
469 for (auto&& genotypeId : formatFields)
470 genotypeIds.push_back(genotypeId);
471 var.GenotypeIds(std::move(genotypeIds));
472
473 // per-sample
474 std::vector<GenotypeField> sampleGenotypes;
475 for (size_t i = 9; i < fields.size(); ++i) {
476
477 GenotypeField g;
478
479 const auto& sampleField = fields.at(i);
480 const auto fieldValues = PacBio::BAM::Split(sampleField, ':');
481 for (const auto fieldValue : fieldValues) {
482 GenotypeData data;
483 const auto genotypeDataValues = PacBio::BAM::Split(fieldValue, ',');
484 if (genotypeDataValues.size() == 1)
485 data.value = genotypeDataValues.at(0);
486 else
487 data.values = genotypeDataValues;
488 g.data.push_back(std::move(data));
489 }
490
491 sampleGenotypes.push_back(std::move(g));
492 }
493 var.Genotypes(std::move(sampleGenotypes));
494 }
495
496 return var;
497 }
498
FormattedInfoField(const InfoField & field)499 std::string VcfFormat::FormattedInfoField(const InfoField& field)
500 {
501 std::ostringstream out;
502 out << field.id;
503 if (field.value.is_initialized()) {
504 out << '=' << field.value.get();
505 } else if (field.values.is_initialized()) {
506 out << '=';
507 bool first = true;
508 for (const auto& value : field.values.get()) {
509 if (!first) out << ',';
510 out << value;
511 first = false;
512 }
513 }
514 return out.str();
515 }
516
FormattedInfoFields(const std::vector<InfoField> & fields)517 std::string VcfFormat::FormattedInfoFields(const std::vector<InfoField>& fields)
518 {
519 std::ostringstream out;
520 bool first = true;
521 for (const auto field : fields) {
522 if (!first) out << ';';
523 out << FormattedInfoField(field);
524 first = false;
525 }
526 return out.str();
527 }
528
FormattedVariant(const VcfVariant & var)529 std::string VcfFormat::FormattedVariant(const VcfVariant& var)
530 {
531 std::ostringstream out;
532 out << var.Chrom() << '\t' << var.Position() << '\t' << var.Id() << '\t' << var.RefAllele()
533 << '\t' << var.AltAllele() << '\t'
534 << (var.IsQualityMissing() ? "." : std::to_string(var.Quality())) << '\t' << var.Filter()
535 << '\t' << FormattedInfoFields(var.InfoFields());
536
537 const auto& genotypeIds = var.GenotypeIds();
538 if (!genotypeIds.empty()) {
539 out << '\t';
540 bool firstId = true;
541 for (const auto id : genotypeIds) {
542 if (!firstId) out << ':';
543 out << id;
544 firstId = false;
545 }
546
547 const auto& sampleGenotypes = var.Genotypes();
548 for (const auto sampleGenotype : sampleGenotypes) {
549 out << '\t';
550 bool firstDataEntry = true;
551 for (const auto& d : sampleGenotype.data) {
552 if (!firstDataEntry) out << ':';
553 if (d.value.is_initialized())
554 out << d.value.get();
555 else {
556 assert(d.values.is_initialized());
557 bool firstDatapoint = true;
558 for (const auto& datapoint : d.values.get()) {
559 if (!firstDatapoint) out << ',';
560 out << datapoint;
561 firstDatapoint = false;
562 }
563 }
564 firstDataEntry = false;
565 }
566 }
567 }
568 return out.str();
569 }
570
571 } // namespace VCF
572 } // namespace PacBio
573