1 #include "Variant.h"
2 #include <utility>
3
4 namespace vcflib {
5
6 static char rev_arr [26] = {84, 66, 71, 68, 69, 70, 67, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 65,
7 85, 86, 87, 88, 89, 90};
8
reverse_complement(const std::string & seq)9 std::string reverse_complement(const std::string& seq) {
10 // The old implementation of this function forgot to null-terminate its
11 // returned string. This implementation uses heavier-weight C++ stuff that
12 // may be slower but should ensure that that doesn't happen again.
13
14 if (seq.size() == 0) {
15 return seq;
16 }
17
18 string ret;
19 ret.reserve(seq.size());
20
21 std::transform(seq.rbegin(), seq.rend(), std::back_inserter(ret), [](char in) -> char {
22 bool lower_case = (in >= 'a' && in <= 'z');
23 if (lower_case) {
24 // Convert to upper case
25 in -= 32;
26 }
27 if (in < 'A' || in > 'Z') {
28 throw std::runtime_error("Out of range character " + std::to_string((uint8_t)in) + " in inverted sequence");
29 }
30 // Compute RC in terms of letter identity, and then lower-case if necessary.
31 return rev_arr[((int) in) - 'A'] + (lower_case ? 32 : 0);
32 });
33
34 return ret;
35 }
36
toUpper(const std::string & seq)37 std::string toUpper(const std::string& seq) {
38 if (seq.size() == 0) {
39 return seq;
40 }
41
42 string ret;
43 ret.reserve(seq.size());
44
45 std::transform(seq.begin(), seq.end(), std::back_inserter(ret), [](char in) -> char {
46 // If it's lower-case, bring it down in value to upper-case.
47 return (in >= 'a' && in <= 'z') ? (in - 32) : in;
48 });
49
50 return ret;
51 }
52
53
allATGCN(const string & s,bool allowLowerCase)54 bool allATGCN(const string& s, bool allowLowerCase){
55 if (allowLowerCase){
56 for (string::const_iterator i = s.begin(); i != s.end(); ++i){
57 char c = *i;
58 if (c != 'A' && c != 'a' &&
59 c != 'C' && c != 'c' &&
60 c != 'T' && c != 't' &&
61 c != 'G' && c != 'g' &&
62 c != 'N' && c != 'n'){
63 return false;
64 }
65 }
66 }
67 else{
68 for (string::const_iterator i = s.begin(); i != s.end(); ++i){
69 char c = *i;
70 if (c != 'A' && c != 'C' && c != 'T' && c != 'G' && c != 'N'){
71 return false;
72 }
73 }
74
75 }
76 return true;
77 }
78
79
80
parse(string & line,bool parseSamples)81 void Variant::parse(string& line, bool parseSamples) {
82
83 // clean up potentially variable data structures
84 info.clear();
85 infoFlags.clear();
86 format.clear();
87 alt.clear();
88 alleles.clear();
89 canonical = false;
90
91 // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT [SAMPLE1 .. SAMPLEN]
92 vector<string> fields = split(line, '\t');
93 if (fields.size() < 7) {
94 cerr << "broken VCF record (less than 7 fields)" << endl
95 << line << endl;
96 exit(1);
97 }
98
99 sequenceName = fields.at(0);
100 char* end; // dummy variable for strtoll
101 position = strtoll(fields.at(1).c_str(), &end, 10);
102 id = fields.at(2);
103 ref = fields.at(3);
104 alt = split(fields.at(4), ","); // a comma-separated list of alternate alleles
105
106 // make a list of all (ref + alts) alleles, allele[0] = ref, alleles[1:] = alts
107 // add the ref allele ([0]), resize for the alt alleles, and then add the alt alleles
108 alleles.push_back(ref);
109 alleles.resize(alt.size()+1);
110 std::copy(alt.begin(), alt.end(), alleles.begin()+1);
111
112 // set up reverse lookup of allele index
113 altAlleleIndexes.clear();
114 int n = 0;
115 for (vector<string>::iterator a = alt.begin();
116 a != alt.end(); ++a, ++n) {
117 altAlleleIndexes[*a] = n;
118 }
119
120 convert(fields.at(5), quality);
121 filter = fields.at(6);
122 if (fields.size() > 7) {
123 vector<string> infofields = split(fields.at(7), ';');
124 for (vector<string>::iterator f = infofields.begin(); f != infofields.end(); ++f) {
125 if (*f == ".") {
126 continue;
127 }
128 vector<string> kv = split(*f, '=');
129 if (kv.size() == 2) {
130 split(kv.at(1), ',', info[kv.at(0)]);
131 } else if (kv.size() == 1) {
132 infoFlags[kv.at(0)] = true;
133 }
134 }
135 }
136 // check if we have samples specified
137 // and that we are supposed to parse them
138 if (parseSamples && fields.size() > 8) {
139 format = split(fields.at(8), ':');
140 // if the format changed, we have to rebuild the samples
141 if (fields.at(8) != lastFormat) {
142 samples.clear();
143 lastFormat = fields.at(8);
144 }
145 vector<string>::iterator sampleName = sampleNames.begin();
146 vector<string>::iterator sample = fields.begin() + 9;
147 for (; sample != fields.end() && sampleName != sampleNames.end();
148 ++sample, ++sampleName) {
149 string& name = *sampleName;
150
151 vector<string> samplefields = split(*sample, ':');
152 vector<string>::iterator i = samplefields.begin();
153
154 for (vector<string>::iterator f = format.begin();
155 f != format.end(); ++f) {
156
157 if(i != samplefields.end()){
158 samples[name][*f] = split(*i, ','); ++i;
159 }
160 else{
161 std::vector<string> missing;
162 missing.push_back(".");
163 samples[name][*f] = missing;
164 }
165 }
166 }
167
168 if (sampleName != sampleNames.end()) {
169 cerr << "error: more sample names in header than sample fields" << endl;
170 cerr << "samples: " << join(sampleNames, " ") << endl;
171 cerr << "line: " << line << endl;
172 exit(1);
173 }
174 if (sample != fields.end()) {
175 cerr << "error: more sample fields than samples listed in header" << endl;
176 cerr << "samples: " << join(sampleNames, " ") << endl;
177 cerr << "line: " << line << endl;
178 cerr << *sample << endl;
179 exit(1);
180 }
181 } else if (!parseSamples) {
182 originalLine = line;
183 }
184
185 //return true; // we should be catching exceptions...
186 }
187
hasSVTags() const188 bool Variant::hasSVTags() const{
189 bool found_svtype = !getSVTYPE().empty();
190 bool found_len = this->info.find("SVLEN") != this->info.end() || this->info.find("END") != this->info.end() || this->info.find("SPAN") != this->info.end();
191
192 return found_svtype && found_len;
193 }
194
195
isSymbolicSV() const196 bool Variant::isSymbolicSV() const{
197
198 bool found_svtype = !getSVTYPE().empty();
199
200 bool ref_valid = allATGCN(this->ref);
201 bool alts_valid = true;
202 for (auto a : this->alt){
203 if (!allATGCN(a)){
204 alts_valid = false;
205 }
206 }
207
208 return (!ref_valid || !alts_valid) && (found_svtype);
209 }
210
getSVTYPE(int altpos) const211 string Variant::getSVTYPE(int altpos) const{
212
213 if (altpos > 0){
214 // TODO: Implement multi-alt SVs
215 return "";
216 }
217
218
219 if (this->info.find("SVTYPE") != this->info.end()){
220 if (altpos >= this->info.at("SVTYPE").size()) {
221 return "";
222 }
223 return this->info.at("SVTYPE")[altpos];
224 }
225
226 return "";
227 };
228
229
230
getMaxReferencePos()231 int Variant::getMaxReferencePos(){
232 if (this->canonical && this->info.find("END") != this->info.end()) {
233 // We are cannonicalized and must have a correct END
234
235 int end = 0;
236 for (auto s : this->info.at("END")){
237 // Get the latest one defined.
238 end = max(abs(stoi(s)), end);
239 }
240 // Convert to 0-based.
241 return end - 1;
242
243 }
244
245 if (!this->isSymbolicSV()){
246 // We don't necessarily have an END, but we don't need one
247 return this->zeroBasedPosition() + this->ref.length() - 1;
248 }
249
250 if (this->canonicalizable()){
251 // We aren't canonical, but we could be.
252 if (this->info.find("END") != this->info.end()){
253 // We have an END; blindly trust it
254 int end = 0;
255 for (auto s : this->info.at("END")){
256 // Get the latest one defined.
257 end = max(abs(stoi(s)), end);
258 }
259 // Convert to 0-based.
260 return end - 1;
261
262 }
263 else if (this->info.find("SVLEN") != this->info.end()){
264 // There's no endpoint, but we know an SVLEN.
265 // A negative SVLEN means a deletion, so if we find one we can say we delete that much.
266 int deleted = 0;
267 for (auto s : this->info.at("SVLEN")){
268 int alt_len = stoi(s);
269 if (alt_len > 0){
270 // Not a deletion, so doesn't affect any ref bases
271 continue;
272 }
273 deleted = max(-alt_len, deleted);
274 }
275
276 // The anchoring base at POS gets added in (because it isn't
277 // deleted) but then subtracted out (because we have to do that to
278 // match non-SV deletions). For insertions, deleted is 0 and we
279 // return 0-based POS. Inversions must have an END.
280 return this->zeroBasedPosition() + deleted;
281 }
282 else{
283 cerr << "Warning: insufficient length information for " << *this << endl;
284 return -1;
285 }
286 }
287 else {
288 cerr << "Warning: can't get end of non-canonicalizeable variant " << *this << endl;
289 }
290 return -1;
291 }
292
293
294
295
296 // To canonicalize a variant, we need either both REF and ALT seqs filled in
297 // or SVTYPE and SVLEN or END or SPAN or SEQ sufficient to define the variant.
canonicalizable()298 bool Variant::canonicalizable(){
299 bool pre_canon = allATGCN(this->ref);
300
301 for (auto& a : this->alt){
302 if (!allATGCN(a)){
303 pre_canon = false;
304 }
305 }
306
307 if (pre_canon){
308 // It came in in a fully specified way.
309 // TODO: ideally, we'd check to make sure ref/alt lengths, svtypes, and ends line up right here.
310 return true;
311 }
312
313 string svtype = getSVTYPE();
314
315 if (svtype.empty()){
316 // We have no SV type, so we can't interpret things.
317 return false;
318 }
319
320 // Check the tags
321 bool has_len = this->info.count("SVLEN") && !this->info.at("SVLEN").empty();
322 bool has_seq = this->info.count("SEQ") && !this->info.at("SEQ").empty();
323 bool has_span = this->info.count("SPAN") && !this->info.at("SPAN").empty();
324 bool has_end = this->info.count("END") && !this->info.at("END").empty();
325
326
327 if (svtype == "INS"){
328 // Insertions need a SEQ, SVLEN, or SPAN
329 return has_seq || has_len || has_span;
330 }
331 else if (svtype == "DEL"){
332 // Deletions need an SVLEN, SPAN, or END
333 return has_len || has_span || has_end;
334 }
335 else if (svtype == "INV"){
336 // Inversions need a SPAN or END
337 return has_span || has_end;
338 }
339 else{
340 // Other SV types are unsupported
341 // TODO: DUP
342 return false;
343 }
344 }
345
canonicalize(FastaReference & fasta_reference,vector<FastaReference * > insertions,bool place_seq,int min_size)346 bool Variant::canonicalize(FastaReference& fasta_reference, vector<FastaReference*> insertions, bool place_seq, int min_size){
347
348 // Nobody should call this without checking
349 assert(canonicalizable());
350
351 // Nobody should call this twice
352 assert(!this->canonical);
353
354 // Find where the inserted sequence can come from for insertions
355 bool do_external_insertions = !insertions.empty();
356 FastaReference* insertion_fasta;
357 if (do_external_insertions){
358 insertion_fasta = insertions[0];
359 }
360
361 bool ref_valid = allATGCN(ref);
362
363 if (!ref_valid && !place_seq){
364 // If the reference is invalid, and we aren't allowed to change the ref sequence,
365 // we can't canonicalize the variant.
366 return false;
367 }
368
369 // Check the alts to see if they are not symbolic
370 vector<bool> alt_i_atgcn (alt.size());
371 for (int i = 0; i < alt.size(); ++i){
372 alt_i_atgcn[i] = allATGCN(alt[i]);
373 }
374
375 // Only allow single-alt variants
376 bool single_alt = alt.size() == 1;
377 if (!single_alt){
378 // TODO: this will need to be remove before supporting multiple alleles
379 cerr << "Warning: multiple ALT alleles not yet allowed for SVs" << endl;
380 return false;
381 }
382
383 // Fill in the SV tags
384 string svtype = getSVTYPE();
385 bool has_len = this->info.count("SVLEN") && !this->info.at("SVLEN").empty();
386 bool has_seq = this->info.count("SEQ") && !this->info.at("SEQ").empty();
387 bool has_span = this->info.count("SPAN") && !this->info.at("SPAN").empty();
388 bool has_end = this->info.count("END") && !this->info.at("END").empty();
389
390 // Where is the end, or where should it be?
391 long info_end = 0;
392 if (has_end) {
393 // Get the END from the tag
394 info_end = stol(this->info.at("END")[0]);
395 }
396 else if(ref_valid && !place_seq) {
397 // Get the END from the reference sequence, which is ready.
398 info_end = this->position + this->ref.length() - 1;
399 }
400 else if ((svtype == "DEL" || svtype == "INV") && has_span) {
401 // For deletions and inversions, we can get the END from the SPAN
402 info_end = this->position + abs(stol(this->info.at("SPAN")[0]));
403 }
404 else if (svtype == "DEL" && has_len) {
405 // For deletions, we can get the END from the SVLEN
406 info_end = this->position + abs(stol(this->info.at("SVLEN")[0]));
407 }
408 else if (svtype == "INS"){
409 // For insertions, END is just POS if not specified
410 info_end = this->position;
411 }
412 else{
413 cerr << "Warning: could not set END info " << *this << endl;
414 return false;
415 }
416
417 // Commit back the END
418 this->info["END"].resize(1);
419 this->info["END"][0] = to_string(info_end);
420 has_end = true;
421
422 // What is the variant length change?
423 // We store it as absolute value
424 long info_len = 0;
425 if (has_len){
426 // Get the SVLEN from the tag
427 info_len = abs(stol(this->info.at("SVLEN")[0]));
428 }
429 else if ((svtype == "INS" || svtype == "DEL") && has_span){
430 info_len = abs(stol(this->info.at("SPAN")[0]));
431 }
432 else if (svtype == "DEL"){
433 // We always have the end by now
434 // Deletion ends give you length change
435 info_len = info_end - this->position;
436 }
437 else if (svtype == "INV"){
438 // Inversions have 0 length change unless otherwise specified.
439 info_len = 0;
440 }
441 else if (svtype == "INS" && has_seq) {
442 // Insertions can let us pick it up from the SEQ tag
443 info_len = this->info.at("SEQ").at(0).size();
444 }
445 else{
446 cerr << "Warning: could not set SVLEN info " << *this << endl;
447 return false;
448 }
449
450 // Commit the SVLEN back
451 if (svtype == "DEL"){
452 // Should be saved as negative
453 this->info["SVLEN"].resize(1);
454 this->info["SVLEN"][0] = to_string(-info_len);
455 }
456 else{
457 // Should be saved as positive
458 this->info["SVLEN"].resize(1);
459 this->info["SVLEN"][0] = to_string(info_len);
460 }
461 // Now the length change is known
462 has_len = true;
463
464 // We also compute a span
465 long info_span = 0;
466 if (has_span){
467 // Use the specified span
468 info_span = abs(stol(this->info.at("SVLEN")[0]));
469 }
470 else if (svtype == "INS" || svtype == "DEL"){
471 // has_len is always true here
472 // Insertions and deletions let us determine the span from the length change, unless they are complex.
473 info_span = info_len;
474 }
475 else if (svtype == "INV"){
476 // has_end is always true here
477 // Inversion span is start to end
478 info_span = info_end - this->position;
479 }
480 else{
481 cerr << "Warning: could not set SPAN info " << *this << endl;
482 return false;
483 }
484
485 // Commit the SPAN back
486 this->info["SPAN"].resize(1);
487 this->info["SPAN"][0] = to_string(info_span);
488 // Now the span change is known
489 has_span = true;
490
491 if (info_end < this->position) {
492 cerr << "Warning: SV END is before POS [canonicalize] " <<
493 *this << endl << "END: " << info_end << " " << "POS: " << this->position << endl;
494 return false;
495 }
496
497 if (has_seq) {
498 // Force the SEQ to upper case, if already present
499 this->info["SEQ"].resize(1);
500 this->info["SEQ"][0] = toUpper(this->info["SEQ"][0]);
501 }
502
503 // Set the other necessary SV Tags (SVTYPE, SEQ (if insertion))
504 // Also check for agreement in the position tags
505 if (svtype == "INS"){
506 if (info_end != this->position){
507 cerr << "Warning: insertion END and POS do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
508 *this << endl << "END: " << info_end << " " << "POS: " << this->position << endl;
509
510 if (info_end == this->position + info_len) {
511 // We can probably guess what they meant here.
512 cerr << "Warning: VCF writer incorrecty produced END = POS + SVLEN for an insertion. Fixing END to POS." << endl;
513 info_end = this->position;
514 this->info["END"][0] = to_string(info_end);
515 } else {
516 return false;
517 }
518 }
519
520 if (info_len != info_span){
521 cerr << "Warning: insertion SVLEN and SPAN do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
522 *this << endl << "SVLEN: " << info_len << " " << "SPAN: " << info_span << endl;
523 return false;
524 }
525
526 if (has_seq && allATGCN(this->info.at("SEQ")[0]) && this->info.at("SEQ")[0].size() != info_len){
527 cerr << "Warning: insertion SVLEN and SEQ do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
528 *this << endl << "SVLEN: " << info_len << " " << "SEQ length: " << this->info.at("SEQ")[0].size() << endl;
529 return false;
530 }
531
532 // Set REF
533 string ref_base = toUpper(fasta_reference.getSubSequence(this->sequenceName, this->zeroBasedPosition(), 1));
534 if (place_seq){
535 this->ref.assign(ref_base);
536 }
537
538 if (has_seq &&
539 alt[0] != this->info.at("SEQ")[0] &&
540 allATGCN(this->info.at("SEQ")[0])){
541 // Try to remove prepended ref sequence, assuming it's left-aligned
542 string s = this->alt[0];
543 s = toUpper(s.substr(this->ref.length()));
544 if (s != this->info.at("SEQ")[0] && !place_seq){
545 cerr << "Warning: INS sequence in alt field does not match SEQ tag" << endl <<
546 this->alt[0] << " " << this->info.at("SEQ")[0] << endl;
547 return false;
548 }
549 if (place_seq){
550 this->alt[0].assign( ref_base + this->info.at("SEQ")[0] );
551 }
552
553 }
554 else if (alt_i_atgcn[0] && !has_seq){
555 string s = this->alt[0];
556 s = toUpper(s.substr(this->ref.length()));
557 this->info["SEQ"].resize(1);
558 this->info.at("SEQ")[0].assign(s);
559
560 if (s.size() != info_len){
561 cerr << "Warning: insertion SVLEN and added bases do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
562 *this << endl << "SVLEN: " << info_len << " " << "added bases: " << s.size() << endl;
563 return false;
564 }
565
566 }
567 else if (alt[0][0] == '<' && do_external_insertions){
568
569 string ins_seq;
570 string seq_id = alt[0].substr(1, alt[0].size() - 2);
571
572 if (insertion_fasta->index->find(seq_id) != insertion_fasta->index->end()){
573 ins_seq = toUpper(insertion_fasta->getSequence(seq_id));
574 if (allATGCN(ins_seq)){
575 this->info["SEQ"].resize(1);
576 this->info["SEQ"][0].assign(ins_seq);
577 if (place_seq){
578 this->alt[0].assign(ref_base + ins_seq);
579 }
580 }
581 else {
582 cerr << "Warning: Loaded invalid alt sequence for: " << *this << endl;
583 return false;
584 }
585
586 if (ins_seq.size() != info_len){
587 cerr << "Warning: insertion SVLEN and FASTA do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
588 *this << endl << "SVLEN: " << info_len << " " << "FASTA bases: " << ins_seq.size() << endl;
589 return false;
590 }
591 }
592 else{
593 cerr << "Warning: could not locate alt sequence for: " << *this << endl;
594 return false;
595 }
596
597 }
598 else{
599 cerr << "Warning: could not set SEQ [canonicalize]. " << *this << endl;
600 return false;
601 }
602 }
603 else if (svtype == "DEL"){
604 if (this->position + info_span != info_end){
605 cerr << "Warning: deletion END and SVLEN do not agree [canonicalize] " << *this << endl <<
606 "END: " << info_end << " " << "SVLEN: " << -info_len << endl;
607 return false;
608 }
609
610 if (this->position + info_span != info_end){
611 cerr << "Warning: deletion END and SPAN do not agree [canonicalize] " << *this << endl <<
612 "END: " << info_end << " " << "SPAN: " << info_span << endl;
613 return false;
614 }
615
616 // Set REF
617 if (place_seq){
618 string del_seq = toUpper(fasta_reference.getSubSequence(this->sequenceName, this->zeroBasedPosition(), info_len + 1));
619 string ref_base = toUpper(fasta_reference.getSubSequence(this->sequenceName, this->zeroBasedPosition(), 1));
620 this->ref.assign( del_seq );
621 this->alt[0].assign( ref_base );
622 }
623 }
624 else if (svtype == "INV"){
625 if (this->position + info_span != info_end){
626 cerr << "Warning: inversion END and SPAN do not agree [canonicalize] " << *this << endl <<
627 "END: " << info_end << " " << "SPAN: " << info_span << endl;
628 return false;
629 }
630
631 if (info_len != 0){
632 cerr << "Warning: inversion SVLEN specifies nonzero length change (complex inversions not canonicalizeable) [canonicalize] " <<
633 *this << endl << "SVLEN: " << info_len << endl;
634
635 if (info_end == this->position + info_len) {
636 // We can probably guess what they meant here.
637 cerr << "Warning: VCF writer incorrecty produced END = POS + SVLEN for an inversion. Fixing SVLEN to 0." << endl;
638 info_len = 0;
639 this->info["SVLEN"][0] = to_string(info_len);
640 } else {
641 return false;
642 }
643 }
644
645 if (place_seq){
646 string ref_seq = toUpper(fasta_reference.getSubSequence(this->sequenceName, this->zeroBasedPosition(), info_span + 1));
647 // Note that inversions still need an anchoring left base at POS
648 string inv_seq = ref_seq.substr(0, 1) + reverse_complement(ref_seq.substr(1));
649 this->ref.assign(ref_seq);
650 this->alt[0].assign(inv_seq);
651 }
652
653 }
654 else{
655 cerr << "Warning: invalid SV type [canonicalize]:" << *this << endl;
656 return false;
657 }
658
659
660 this->updateAlleleIndexes();
661
662 // Check for harmony between ref / alt / tags
663 if (this->position > stol(this->info.at("END").at(0))){
664 cerr << "Warning: position > END. Possible reference genome mismatch." << endl;
665 return false;
666 }
667
668 if (svtype == "INS"){
669 assert(!this->info.at("SEQ")[0].empty());
670 }
671
672 this->canonical = true;
673 return true;
674 }
675
setVariantCallFile(VariantCallFile & v)676 void Variant::setVariantCallFile(VariantCallFile& v) {
677 sampleNames = v.sampleNames;
678 outputSampleNames = v.sampleNames;
679 vcf = &v;
680 }
681
setVariantCallFile(VariantCallFile * v)682 void Variant::setVariantCallFile(VariantCallFile* v) {
683 sampleNames = v->sampleNames;
684 outputSampleNames = v->sampleNames;
685 vcf = v;
686 }
687
operator <<(ostream & out,VariantFieldType type)688 ostream& operator<<(ostream& out, VariantFieldType type) {
689 switch (type) {
690 case FIELD_INTEGER:
691 out << "integer";
692 break;
693 case FIELD_FLOAT:
694 out << "float";
695 break;
696 case FIELD_BOOL:
697 out << "bool";
698 break;
699 case FIELD_STRING:
700 out << "string";
701 break;
702 default:
703 out << "unknown";
704 break;
705 }
706 return out;
707 }
708
typeStrToVariantFieldType(string & typeStr)709 VariantFieldType typeStrToVariantFieldType(string& typeStr) {
710 if (typeStr == "Integer") {
711 return FIELD_INTEGER;
712 } else if (typeStr == "Float") {
713 return FIELD_FLOAT;
714 } else if (typeStr == "Flag") {
715 return FIELD_BOOL;
716 } else if (typeStr == "String") {
717 return FIELD_STRING;
718 } else {
719 return FIELD_UNKNOWN;
720 }
721 }
722
infoType(const string & key)723 VariantFieldType Variant::infoType(const string& key) {
724 map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
725 if (s == vcf->infoTypes.end()) {
726 if (key == "FILTER") { // hack to use FILTER as an "info" field (why the hack?)
727 return FIELD_STRING;
728 }
729 if (key == "QUAL") { // hack to use QUAL as an "info" field
730 return FIELD_INTEGER;
731 }
732 cerr << "no info field " << key << endl;
733 exit(1);
734 } else {
735 return s->second;
736 }
737 }
738
formatType(const string & key)739 VariantFieldType Variant::formatType(const string& key) {
740 map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
741 if (s == vcf->formatTypes.end()) {
742 cerr << "no format field " << key << endl;
743 exit(1);
744 } else {
745 return s->second;
746 }
747 }
748
getInfoValueBool(const string & key,int index)749 bool Variant::getInfoValueBool(const string& key, int index) {
750 map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
751 if (s == vcf->infoTypes.end()) {
752 cerr << "no info field " << key << endl;
753 exit(1);
754 } else {
755 int count = vcf->infoCounts[key];
756 // XXX TODO, fix for Genotype variants...
757 if (count != ALLELE_NUMBER) {
758 index = 0;
759 }
760 if (index == INDEX_NONE) {
761 if (count != 1) {
762 cerr << "no field index supplied and field count != 1" << endl;
763 exit(1);
764 } else {
765 index = 0;
766 }
767 }
768 VariantFieldType type = s->second;
769 if (type == FIELD_BOOL) {
770 map<string, bool>::iterator b = infoFlags.find(key);
771 if (b == infoFlags.end())
772 return false;
773 else
774 return true;
775 } else {
776 cerr << "not flag type " << key << endl;
777 exit(1);
778 }
779 }
780 }
781
getInfoValueString(const string & key,int index)782 string Variant::getInfoValueString(const string& key, int index) {
783 map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
784 if (s == vcf->infoTypes.end()) {
785 if (key == "FILTER") {
786 return filter;
787 }
788 cerr << "no info field " << key << endl;
789 exit(1);
790 } else {
791 int count = vcf->infoCounts[key];
792 // XXX TODO, fix for Genotype variants...
793 if (count != ALLELE_NUMBER) {
794 index = 0;
795 }
796 if (index == INDEX_NONE) {
797 if (count != 1) {
798 cerr << "no field index supplied and field count != 1" << endl;
799 exit(1);
800 } else {
801 index = 0;
802 }
803 }
804 VariantFieldType type = s->second;
805 if (type == FIELD_STRING) {
806 map<string, vector<string> >::iterator b = info.find(key);
807 if (b == info.end())
808 return "";
809 return b->second.at(index);
810 } else {
811 cerr << "not string type " << key << endl;
812 return "";
813 }
814 }
815 }
816
getInfoValueFloat(const string & key,int index)817 double Variant::getInfoValueFloat(const string& key, int index) {
818 map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
819 if (s == vcf->infoTypes.end()) {
820 if (key == "QUAL") {
821 return quality;
822 }
823 cerr << "no info field " << key << endl;
824 exit(1);
825 } else {
826 int count = vcf->infoCounts[key];
827 // XXX TODO, fix for Genotype variants...
828 if (count != ALLELE_NUMBER) {
829 index = 0;
830 }
831 if (index == INDEX_NONE) {
832 if (count != 1) {
833 cerr << "no field index supplied and field count != 1" << endl;
834 exit(1);
835 } else {
836 index = 0;
837 }
838 }
839 VariantFieldType type = s->second;
840 if (type == FIELD_FLOAT || type == FIELD_INTEGER) {
841 map<string, vector<string> >::iterator b = info.find(key);
842 if (b == info.end())
843 return false;
844 double r;
845 if (!convert(b->second.at(index), r)) {
846 cerr << "could not convert field " << key << "=" << b->second.at(index) << " to " << type << endl;
847 exit(1);
848 }
849 return r;
850 } else {
851 cerr << "unsupported type for variant record " << type << endl;
852 exit(1);
853 }
854 }
855 }
856
getNumSamples(void)857 int Variant::getNumSamples(void) {
858 return sampleNames.size();
859 }
860
getNumValidGenotypes(void)861 int Variant::getNumValidGenotypes(void) {
862 int valid_genotypes = 0;
863 map<string, map<string, vector<string> > >::const_iterator s = samples.begin();
864 map<string, map<string, vector<string> > >::const_iterator sEnd = samples.end();
865 for (; s != sEnd; ++s) {
866 map<string, vector<string> > sample_info = s->second;
867 if (sample_info["GT"].front() != "./.") {
868 valid_genotypes++;
869 }
870 }
871 return valid_genotypes;
872 }
873
getSampleValueBool(const string & key,string & sample,int index)874 bool Variant::getSampleValueBool(const string& key, string& sample, int index) {
875 map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
876 if (s == vcf->infoTypes.end()) {
877 cerr << "no info field " << key << endl;
878 exit(1);
879 } else {
880 int count = vcf->formatCounts[key];
881 // XXX TODO, fix for Genotype variants...
882 if (count != ALLELE_NUMBER) {
883 index = 0;
884 }
885 if (index == INDEX_NONE) {
886 if (count != 1) {
887 cerr << "no field index supplied and field count != 1" << endl;
888 exit(1);
889 } else {
890 index = 0;
891 }
892 }
893 VariantFieldType type = s->second;
894 map<string, vector<string> >& sampleData = samples[sample];
895 if (type == FIELD_BOOL) {
896 map<string, vector<string> >::iterator b = sampleData.find(key);
897 if (b == sampleData.end())
898 return false;
899 else
900 return true;
901 } else {
902 cerr << "not bool type " << key << endl;
903 exit(1);
904 }
905 }
906 }
907
getSampleValueString(const string & key,string & sample,int index)908 string Variant::getSampleValueString(const string& key, string& sample, int index) {
909 map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
910 if (s == vcf->infoTypes.end()) {
911 cerr << "no info field " << key << endl;
912 exit(1);
913 } else {
914 int count = vcf->formatCounts[key];
915 // XXX TODO, fix for Genotype variants...
916 if (count != ALLELE_NUMBER) {
917 index = 0;
918 }
919 if (index == INDEX_NONE) {
920 if (count != 1) {
921 cerr << "no field index supplied and field count != 1" << endl;
922 exit(1);
923 } else {
924 index = 0;
925 }
926 }
927 VariantFieldType type = s->second;
928 map<string, vector<string> >& sampleData = samples[sample];
929 if (type == FIELD_STRING) {
930 map<string, vector<string> >::iterator b = sampleData.find(key);
931 if (b == sampleData.end()) {
932 return "";
933 } else {
934 return b->second.at(index);
935 }
936 } else {
937 cerr << "not string type " << key << endl;
938 exit(1);
939 }
940 }
941 }
942
getSampleValueFloat(const string & key,string & sample,int index)943 double Variant::getSampleValueFloat(const string& key, string& sample, int index) {
944 map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
945 if (s == vcf->infoTypes.end()) {
946 cerr << "no info field " << key << endl;
947 exit(1);
948 } else {
949 // XXX TODO wrap this with a function call
950 int count = vcf->formatCounts[key];
951 // XXX TODO, fix for Genotype variants...
952 if (count != ALLELE_NUMBER) {
953 index = 0;
954 }
955 if (index == INDEX_NONE) {
956 if (count != 1) {
957 cerr << "no field index supplied and field count != 1" << endl;
958 exit(1);
959 } else {
960 index = 0;
961 }
962 }
963 VariantFieldType type = s->second;
964 map<string, vector<string> >& sampleData = samples[sample];
965 if (type == FIELD_FLOAT || type == FIELD_INTEGER) {
966 map<string, vector<string> >::iterator b = sampleData.find(key);
967 if (b == sampleData.end())
968 return false;
969 double r;
970 if (!convert(b->second.at(index), r)) {
971 cerr << "could not convert field " << key << "=" << b->second.at(index) << " to " << type << endl;
972 exit(1);
973 }
974 return r;
975 } else {
976 cerr << "unsupported type for sample " << type << endl;
977 exit(1);
978 }
979 }
980 }
981
getValueBool(const string & key,string & sample,int index)982 bool Variant::getValueBool(const string& key, string& sample, int index) {
983 if (sample.empty()) { // an empty sample name means
984 return getInfoValueBool(key, index);
985 } else {
986 return getSampleValueBool(key, sample, index);
987 }
988 }
989
getValueFloat(const string & key,string & sample,int index)990 double Variant::getValueFloat(const string& key, string& sample, int index) {
991 if (sample.empty()) { // an empty sample name means
992 return getInfoValueFloat(key, index);
993 } else {
994 return getSampleValueFloat(key, sample, index);
995 }
996 }
997
getValueString(const string & key,string & sample,int index)998 string Variant::getValueString(const string& key, string& sample, int index) {
999 if (sample.empty()) { // an empty sample name means
1000 return getInfoValueString(key, index);
1001 } else {
1002 return getSampleValueString(key, sample, index);
1003 }
1004 }
1005
getAltAlleleIndex(const string & allele)1006 int Variant::getAltAlleleIndex(const string& allele) {
1007 map<string, int>::iterator f = altAlleleIndexes.find(allele);
1008 if (f == altAlleleIndexes.end()) {
1009 cerr << "no such allele \'" << allele << "\' in record " << sequenceName << ":" << position << endl;
1010 exit(1);
1011 } else {
1012 return f->second;
1013 }
1014 }
1015
addFilter(const string & tag)1016 void Variant::addFilter(const string& tag) {
1017 if (filter == "" || filter == ".")
1018 filter = tag;
1019 else
1020 filter += "," + tag;
1021 }
1022
addFormatField(const string & key)1023 void Variant::addFormatField(const string& key) {
1024 bool hasTag = false;
1025 for (vector<string>::iterator t = format.begin(); t != format.end(); ++t) {
1026 if (*t == key) {
1027 hasTag = true;
1028 break;
1029 }
1030 }
1031 if (!hasTag) {
1032 format.push_back(key);
1033 }
1034 }
1035
printAlt(ostream & out)1036 void Variant::printAlt(ostream& out) {
1037 for (vector<string>::iterator i = alt.begin(); i != alt.end(); ++i) {
1038 out << *i;
1039 // add a comma for all but the last alternate allele
1040 if (i != (alt.end() - 1)) out << ",";
1041 }
1042 }
1043
printAlleles(ostream & out)1044 void Variant::printAlleles(ostream& out) {
1045 for (vector<string>::iterator i = alleles.begin(); i != alleles.end(); ++i) {
1046 out << *i;
1047 // add a comma for all but the last alternate allele
1048 if (i != (alleles.end() - 1)) out << ",";
1049 }
1050 }
1051
operator <<(ostream & out,Variant & var)1052 ostream& operator<<(ostream& out, Variant& var) {
1053 // ensure there are no empty fields
1054 if (var.sequenceName.empty()) var.sequenceName = ".";
1055 if (var.id.empty()) var.id = ".";
1056 if (var.ref.empty()) var.ref = ".";
1057 if (var.alt.empty()) var.alt.push_back(".");
1058 if (var.filter.empty()) var.filter = ".";
1059
1060 out << var.sequenceName << "\t"
1061 << var.position << "\t"
1062 << var.id << "\t"
1063 << var.ref << "\t";
1064 // report the list of alternate alleles.
1065 var.printAlt(out);
1066 out << "\t"
1067 << var.quality << "\t"
1068 << var.filter << "\t";
1069 if (var.info.empty() && var.infoFlags.empty()) {
1070 out << ".";
1071 } else {
1072 for (map<string, vector<string> >::iterator i = var.info.begin(); i != var.info.end(); ++i) {
1073 if (!i->second.empty()) {
1074 out << ((i == var.info.begin()) ? "" : ";") << i->first << "=" << join(i->second, ",");
1075 }
1076 }
1077 for (map<string, bool>::iterator i = var.infoFlags.begin(); i != var.infoFlags.end(); ++i) {
1078 if (i == var.infoFlags.end()) {
1079 out << "";
1080 } else if (i == var.infoFlags.begin() && var.info.empty()) {
1081 out << "";
1082 } else {
1083 out << ";";
1084 }
1085 out << i->first;
1086 }
1087 }
1088 if (!var.format.empty()) {
1089 out << "\t";
1090 for (vector<string>::iterator f = var.format.begin(); f != var.format.end(); ++f) {
1091 out << ((f == var.format.begin()) ? "" : ":") << *f;
1092 }
1093 for (vector<string>::iterator s = var.outputSampleNames.begin(); s != var.outputSampleNames.end(); ++s) {
1094 out << "\t";
1095 map<string, map<string, vector<string> > >::iterator sampleItr = var.samples.find(*s);
1096 if (sampleItr == var.samples.end()) {
1097 out << ".";
1098 } else {
1099 map<string, vector<string> >& sample = sampleItr->second;
1100 if (sample.size() == 0) {
1101 out << ".";
1102 } else {
1103 for (vector<string>::iterator f = var.format.begin(); f != var.format.end(); ++f) {
1104 map<string, vector<string> >::iterator g = sample.find(*f);
1105 out << ((f == var.format.begin()) ? "" : ":");
1106 if (g != sample.end() && !g->second.empty()) {
1107 out << join(g->second, ",");
1108 } else {
1109 out << ".";
1110 }
1111 }
1112 }
1113 }
1114 }
1115 }
1116 return out;
1117 }
1118
setOutputSampleNames(vector<string> & samplesToOutput)1119 void Variant::setOutputSampleNames(vector<string>& samplesToOutput) {
1120 outputSampleNames = samplesToOutput;
1121 }
1122
1123
1124 // shunting yard algorithm
infixToPrefix(queue<RuleToken> tokens,queue<RuleToken> & prefixtokens)1125 void infixToPrefix(queue<RuleToken> tokens, queue<RuleToken>& prefixtokens) {
1126 stack<RuleToken> ops;
1127 while (!tokens.empty()) {
1128 RuleToken& token = tokens.front();
1129 if (isOperator(token)) {
1130 //cerr << "found operator " << token.value << endl;
1131 while (ops.size() > 0 && isOperator(ops.top())
1132 && ( (isLeftAssociative(token) && priority(token) <= priority(ops.top()))
1133 || (isRightAssociative(token) && priority(token) < priority(ops.top())))) {
1134 prefixtokens.push(ops.top());
1135 ops.pop();
1136 }
1137 ops.push(token);
1138 } else if (isLeftParenthesis(token)) {
1139 //cerr << "found paran " << token.value << endl;
1140 ops.push(token);
1141 } else if (isRightParenthesis(token)) {
1142 //cerr << "found paran " << token.value << endl;
1143 while (ops.size() > 0 && !isLeftParenthesis(ops.top())) {
1144 prefixtokens.push(ops.top());
1145 ops.pop();
1146 }
1147 if (ops.size() == 0) {
1148 cerr << "error: mismatched parentheses" << endl;
1149 exit(1);
1150 }
1151 if (isLeftParenthesis(ops.top())) {
1152 ops.pop();
1153 }
1154 } else {
1155 //cerr << "found operand " << token.value << endl;
1156 prefixtokens.push(token);
1157 }
1158 tokens.pop();
1159 }
1160 while (ops.size() > 0) {
1161 if (isRightParenthesis(ops.top()) || isLeftParenthesis(ops.top())) {
1162 cerr << "error: mismatched parentheses" << endl;
1163 exit(1);
1164 }
1165 prefixtokens.push(ops.top());
1166 ops.pop();
1167 }
1168 }
1169
RuleToken(string tokenstr,map<string,VariantFieldType> & variables)1170 RuleToken::RuleToken(string tokenstr, map<string, VariantFieldType>& variables) {
1171 isVariable = false;
1172 if (tokenstr == "!") {
1173 type = RuleToken::NOT_OPERATOR;
1174 } else if (tokenstr == "&") {
1175 type = RuleToken::AND_OPERATOR;
1176 } else if (tokenstr == "|") {
1177 type = RuleToken::OR_OPERATOR;
1178 } else if (tokenstr == "+") {
1179 type = RuleToken::ADD_OPERATOR;
1180 } else if (tokenstr == "-") {
1181 type = RuleToken::SUBTRACT_OPERATOR;
1182 } else if (tokenstr == "*") {
1183 type = RuleToken::MULTIPLY_OPERATOR;
1184 } else if (tokenstr == "/") {
1185 type = RuleToken::DIVIDE_OPERATOR;
1186 } else if (tokenstr == "=") {
1187 type = RuleToken::EQUAL_OPERATOR;
1188 } else if (tokenstr == ">") {
1189 type = RuleToken::GREATER_THAN_OPERATOR;
1190 } else if (tokenstr == "<") {
1191 type = RuleToken::LESS_THAN_OPERATOR;
1192 } else if (tokenstr == "(") {
1193 type = RuleToken::LEFT_PARENTHESIS;
1194 } else if (tokenstr == ")") {
1195 type = RuleToken::RIGHT_PARENTHESIS;
1196 } else { // operand
1197 type = RuleToken::OPERAND;
1198 if (variables.find(tokenstr) == variables.end()) {
1199 if (convert(tokenstr, number)) {
1200 type = RuleToken::NUMBER;
1201 } else if (tokenstr == "QUAL") {
1202 isVariable = true;
1203 } else if (tokenstr == "FILTER") {
1204 isVariable = true;
1205 } else {
1206 type = RuleToken::STRING_VARIABLE;
1207 }
1208 } else {
1209 isVariable = true;
1210 }
1211 }
1212 value = tokenstr;
1213 }
1214
1215
tokenizeFilterSpec(string & filterspec,queue<RuleToken> & tokens,map<string,VariantFieldType> & variables)1216 void tokenizeFilterSpec(string& filterspec, queue<RuleToken>& tokens, map<string, VariantFieldType>& variables) {
1217 string lastToken = "";
1218 bool inToken = false;
1219 for (unsigned int i = 0; i < filterspec.size(); ++i) {
1220 char c = filterspec.at(i);
1221 if (c == ' ' || c == '\n') {
1222 inToken = false;
1223 if (!inToken && lastToken.size() > 0) {
1224 tokens.push(RuleToken(lastToken, variables));
1225 lastToken = "";
1226 }
1227 } else if (!inToken && (isOperatorChar(c) || isParanChar(c))) {
1228 inToken = false;
1229 if (lastToken.size() > 0) {
1230 tokens.push(RuleToken(lastToken, variables));
1231 lastToken = "";
1232 }
1233 tokens.push(RuleToken(filterspec.substr(i,1), variables));
1234 } else {
1235 inToken = true;
1236 lastToken += c;
1237 }
1238 }
1239 // get the last token
1240 if (inToken) {
1241 tokens.push(RuleToken(lastToken, variables));
1242 }
1243 }
1244
1245 // class which evaluates filter expressions
1246 // allow filters to be defined using boolean infix expressions e.g.:
1247 //
1248 // "GQ > 10 & (DP < 3 | DP > 5) & SAMPLE = NA12878"
1249 // or
1250 // "GT = 1/1 | GT = 0/0"
1251 //
1252 // on initialization, tokenizes the input sequence, and converts it from infix to postfix
1253 // on call to
1254 //
1255
1256
VariantFilter(string filterspec,VariantFilterType filtertype,map<string,VariantFieldType> & variables)1257 VariantFilter::VariantFilter(string filterspec, VariantFilterType filtertype, map<string, VariantFieldType>& variables) {
1258 type = filtertype;
1259 spec = filterspec;
1260 tokenizeFilterSpec(filterspec, tokens, variables);
1261 infixToPrefix(tokens, rules);
1262 /*while (!rules.empty()) {
1263 cerr << " " << rules.front().value << ((isNumeric(rules.front())) ? "f" : "");
1264 rules.pop();
1265 }
1266 */
1267 //cerr << endl;
1268 //cerr << join(" ", tokens) << endl;
1269 }
1270
1271 // all alts pass
passes(Variant & var,string & sample)1272 bool VariantFilter::passes(Variant& var, string& sample) {
1273 for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
1274 string& allele = *a;
1275 if (!passes(var, sample, allele)) {
1276 return false;
1277 }
1278 }
1279 return true;
1280 }
1281
passes(Variant & var,string & sample,string & allele)1282 bool VariantFilter::passes(Variant& var, string& sample, string& allele) {
1283 // to evaluate a rpn boolean queue with embedded numbers and variables
1284 // make a result stack, use float to allow comparison of floating point
1285 // numbers, booleans, and integers
1286 stack<RuleToken> results;
1287 queue<RuleToken> rulesCopy = rules; // copy
1288
1289 int index;
1290 if (allele.empty()) {
1291 index = 0; // apply to the whole record
1292 } else {
1293 // apply to a specific allele
1294 index = var.getAltAlleleIndex(allele);
1295 }
1296
1297 while (!rulesCopy.empty()) {
1298 RuleToken token = rulesCopy.front();
1299 rulesCopy.pop();
1300 // pop operands from the front of the queue and push them onto the stack
1301 if (isOperand(token)) {
1302 //cout << "is operand: " << token.value << endl;
1303 // if the token is variable, i.e. not evaluated in this context, we
1304 // must evaluate it before pushing it onto the stack
1305 if (token.isVariable) {
1306 //cout << "is variable" << endl;
1307 // look up the variable using the Variant, depending on our filter type
1308 //cout << "token.value " << token.value << endl;
1309 VariantFieldType vtype;
1310 if (sample.empty()) { // means we are record-specific
1311 vtype = var.infoType(token.value);
1312 } else {
1313 vtype = var.formatType(token.value);
1314 //cout << "type = " << type << endl;
1315 }
1316 //cout << "type: " << type << endl;
1317
1318 if (vtype == FIELD_INTEGER || vtype == FIELD_FLOAT) {
1319 token.type = RuleToken::NUMERIC_VARIABLE;
1320 token.number = var.getValueFloat(token.value, sample, index);
1321 //cerr << "number: " << token.number << endl;
1322 } else if (vtype == FIELD_BOOL) {
1323 token.type = RuleToken::BOOLEAN_VARIABLE;
1324 token.state = var.getValueBool(token.value, sample, index);
1325 //cerr << "state: " << token.state << endl;
1326 } else if (vtype == FIELD_STRING) {
1327 //cout << "token.value = " << token.value << endl;
1328 token.type = RuleToken::STRING_VARIABLE;
1329 token.str = var.getValueString(token.value, sample, index);
1330 } else if (isString(token)) {
1331 token.type = RuleToken::STRING_VARIABLE;
1332 token.str = var.getValueString(token.value, sample, index);
1333 //cerr << "string: " << token.str << endl;
1334 }
1335 } else {
1336 double f;
1337 string s;
1338 //cerr << "parsing operand" << endl;
1339 if (convert(token.value, f)) {
1340 token.type = RuleToken::NUMERIC_VARIABLE;
1341 token.number = f;
1342 //cerr << "number: " << token.number << endl;
1343 } else if (convert(token.value, s)) {
1344 token.type = RuleToken::STRING_VARIABLE;
1345 token.str = s;
1346 //cerr << "string: " << token.str << endl;
1347 } else {
1348 cerr << "could not parse non-variable operand " << token.value << endl;
1349 exit(1);
1350 }
1351 }
1352 results.push(token);
1353 }
1354 // apply operators to the first n elements on the stack and push the result back onto the stack
1355 else if (isOperator(token)) {
1356 //cerr << "is operator: " << token.value << endl;
1357 RuleToken a, b, r;
1358 // is it a not-operator?
1359 switch (token.type) {
1360 case ( RuleToken::NOT_OPERATOR ):
1361 a = results.top();
1362 results.pop();
1363 if (!isBoolean(a)) {
1364 cerr << "cannot negate a non-boolean" << endl;
1365 } else {
1366 a.state = !a.state;
1367 results.push(a);
1368 }
1369 break;
1370
1371 case ( RuleToken::EQUAL_OPERATOR ):
1372 a = results.top(); results.pop();
1373 b = results.top(); results.pop();
1374 if (a.type == b.type) {
1375 switch (a.type) {
1376 case (RuleToken::STRING_VARIABLE):
1377 r.state = (a.str == b.str);
1378 break;
1379 case (RuleToken::NUMERIC_VARIABLE):
1380 r.state = (a.number == b.number);
1381 break;
1382 case (RuleToken::BOOLEAN_VARIABLE):
1383 r.state = (a.state == b.state);
1384 break;
1385 default:
1386 cerr << "should not get here" << endl; exit(1);
1387 break;
1388 }
1389 } else if (a.type == RuleToken::STRING_VARIABLE && b.type == RuleToken::NUMERIC_VARIABLE) {
1390 r.state = (convert(b.number) == a.str);
1391 } else if (b.type == RuleToken::STRING_VARIABLE && a.type == RuleToken::NUMERIC_VARIABLE) {
1392 r.state = (convert(a.number) == b.str);
1393 }
1394 results.push(r);
1395 break;
1396
1397 case ( RuleToken::GREATER_THAN_OPERATOR ):
1398 a = results.top(); results.pop();
1399 b = results.top(); results.pop();
1400 if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
1401 r.state = (b.number > a.number);
1402 } else {
1403 cerr << "cannot compare (>) objects of dissimilar types" << endl;
1404 cerr << a.type << " " << b.type << endl;
1405 exit(1);
1406 }
1407 results.push(r);
1408 break;
1409
1410 case ( RuleToken::LESS_THAN_OPERATOR ):
1411 a = results.top(); results.pop();
1412 b = results.top(); results.pop();
1413 if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
1414 r.state = (b.number < a.number);
1415 } else {
1416 cerr << "cannot compare (<) objects of dissimilar types" << endl;
1417 cerr << a.type << " " << b.type << endl;
1418 exit(1);
1419 }
1420 results.push(r);
1421 break;
1422
1423 case ( RuleToken::ADD_OPERATOR ):
1424 a = results.top(); results.pop();
1425 b = results.top(); results.pop();
1426 if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
1427 r.number = (b.number + a.number);
1428 r.type = RuleToken::NUMERIC_VARIABLE;
1429 } else {
1430 cerr << "cannot add objects of dissimilar types" << endl;
1431 cerr << a.type << " " << b.type << endl;
1432 exit(1);
1433 }
1434 results.push(r);
1435 break;
1436
1437 case ( RuleToken::SUBTRACT_OPERATOR ):
1438 a = results.top(); results.pop();
1439 b = results.top(); results.pop();
1440 if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
1441 r.number = (b.number - a.number);
1442 r.type = RuleToken::NUMERIC_VARIABLE;
1443 } else {
1444 cerr << "cannot subtract objects of dissimilar types" << endl;
1445 cerr << a.type << " " << b.type << endl;
1446 exit(1);
1447 }
1448 results.push(r);
1449 break;
1450
1451 case ( RuleToken::MULTIPLY_OPERATOR ):
1452 a = results.top(); results.pop();
1453 b = results.top(); results.pop();
1454 if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
1455 r.number = (b.number * a.number);
1456 r.type = RuleToken::NUMERIC_VARIABLE;
1457 } else {
1458 cerr << "cannot multiply objects of dissimilar types" << endl;
1459 cerr << a.type << " " << b.type << endl;
1460 exit(1);
1461 }
1462 results.push(r);
1463 break;
1464
1465 case ( RuleToken::DIVIDE_OPERATOR):
1466 a = results.top(); results.pop();
1467 b = results.top(); results.pop();
1468 if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
1469 r.number = (b.number / a.number);
1470 r.type = RuleToken::NUMERIC_VARIABLE;
1471 } else {
1472 cerr << "cannot divide objects of dissimilar types" << endl;
1473 cerr << a.type << " " << b.type << endl;
1474 exit(1);
1475 }
1476 results.push(r);
1477 break;
1478
1479 case ( RuleToken::AND_OPERATOR ):
1480 case ( RuleToken::OR_OPERATOR ):
1481 a = results.top(); results.pop();
1482 b = results.top(); results.pop();
1483 if (a.type == b.type && a.type == RuleToken::BOOLEAN_VARIABLE) {
1484 if (token.type == RuleToken::AND_OPERATOR) {
1485 r.state = (a.state && b.state);
1486 } else {
1487 r.state = (a.state || b.state);
1488 }
1489 } else {
1490 cerr << "cannot compare (& or |) objects of dissimilar types" << endl;
1491 exit(1);
1492 }
1493 results.push(r);
1494 break;
1495 default:
1496 cerr << "should not get here!" << endl; exit(1);
1497 break;
1498 }
1499 }
1500 }
1501 // at the end you should have only one value on the stack, return it as a boolean
1502 if (results.size() == 1) {
1503 if (isBoolean(results.top())) {
1504 return results.top().state;
1505 } else {
1506 cerr << "error, non-boolean value left on stack" << endl;
1507 //cerr << results.top().value << endl;
1508 exit(1);
1509 }
1510 } else if (results.size() > 1) {
1511 cerr << "more than one value left on results stack!" << endl;
1512 while (!results.empty()) {
1513 cerr << results.top().value << endl;
1514 results.pop();
1515 }
1516 exit(1);
1517 } else {
1518 cerr << "results stack empty" << endl;
1519 exit(1);
1520 }
1521 }
1522
removeFilteredGenotypes(Variant & var,bool keepInfo)1523 void VariantFilter::removeFilteredGenotypes(Variant& var, bool keepInfo) {
1524
1525 for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
1526 string& name = *s;
1527 if (!passes(var, name)) {
1528 if (keepInfo) {
1529 var.samples[name]["GT"].clear();
1530 var.samples[name]["GT"].push_back("./.");
1531 }
1532 else {
1533 var.samples.erase(name);
1534 }
1535 }
1536 }
1537 }
1538
1539 /*
1540 bool VariantCallFile::openVCF(string& filename) {
1541 file.open(filename.c_str(), ifstream::in);
1542 if (!file.is_open()) {
1543 cerr << "could not open " << filename << endl;
1544 return false;
1545 } else {
1546 return parseHeader();
1547 }
1548 }
1549
1550 bool VariantCallFile::openVCF(ifstream& stream) {
1551 file = stream;
1552 if (!file.is_open()) {
1553 cerr << "provided file is not open" << endl;
1554 return false;
1555 } else {
1556 return parseHeader();
1557 }
1558 }
1559 */
1560
updateSamples(vector<string> & newSamples)1561 void VariantCallFile::updateSamples(vector<string>& newSamples) {
1562 sampleNames = newSamples;
1563 // regenerate the last line of the header
1564 vector<string> headerLines = split(header, '\n');
1565 vector<string> colnames = split(headerLines.at(headerLines.size() - 1), '\t'); // get the last, update the samples
1566 vector<string> newcolnames;
1567 newcolnames.resize(9 + sampleNames.size());
1568 copy(colnames.begin(), colnames.begin() + 9, newcolnames.begin());
1569 copy(sampleNames.begin(), sampleNames.end(), newcolnames.begin() + 9);
1570 headerLines.at(headerLines.size() - 1) = join(newcolnames, "\t");
1571 header = join(headerLines, "\n");
1572 }
1573
1574 // non-destructive version of above
headerWithSampleNames(vector<string> & newSamples)1575 string VariantCallFile::headerWithSampleNames(vector<string>& newSamples) {
1576 // regenerate the last line of the header
1577 if (newSamples.empty()) return header;
1578 vector<string> headerLines = split(header, '\n');
1579 vector<string> colnames = split(headerLines.at(headerLines.size() - 1), '\t'); // get the last, update the samples
1580 vector<string> newcolnames;
1581 unsigned int colCount = colnames.size(); // used to be hard-coded 9, hopefully the dynamic colCount isn't an issue
1582 if (colCount < 8)
1583 {
1584 cout << "VCF file is not suitable for use because it does not have a format field." << endl;
1585 exit(0);
1586 }
1587 newcolnames.resize(colCount + newSamples.size());
1588 copy(colnames.begin(), colnames.begin() + colCount, newcolnames.begin());
1589 copy(newSamples.begin(), newSamples.end(), newcolnames.begin() + colCount);
1590 headerLines.at(headerLines.size() - 1) = join(newcolnames, "\t");
1591 return join(headerLines, "\n");
1592 }
1593
1594 // TODO cleanup, store header lines instead of bulk header
addHeaderLine(string line)1595 void VariantCallFile::addHeaderLine(string line) {
1596 vector<string> headerLines = split(header, '\n');
1597 headerLines.insert(headerLines.end() - 1, line);
1598 header = join(unique(headerLines), "\n");
1599 }
1600
1601 // helper to addHeaderLine
unique(vector<string> & strings)1602 vector<string>& unique(vector<string>& strings) {
1603 set<string> uniq;
1604 vector<string> res;
1605 for (vector<string>::const_iterator s = strings.begin(); s != strings.end(); ++s) {
1606 if (uniq.find(*s) == uniq.end()) {
1607 res.push_back(*s);
1608 uniq.insert(*s);
1609 }
1610 }
1611 strings = res;
1612 return strings;
1613 }
1614
infoIds(void)1615 vector<string> VariantCallFile::infoIds(void) {
1616 vector<string> tags;
1617 vector<string> headerLines = split(header, '\n');
1618 for (vector<string>::iterator s = headerLines.begin(); s != headerLines.end(); ++s) {
1619 string& line = *s;
1620 if (line.find("##INFO") == 0) {
1621 size_t pos = line.find("ID=");
1622 if (pos != string::npos) {
1623 pos += 3;
1624 size_t tagend = line.find(",", pos);
1625 if (tagend != string::npos) {
1626 tags.push_back(line.substr(pos, tagend - pos));
1627 }
1628 }
1629 }
1630 }
1631 return tags;
1632 }
1633
formatIds(void)1634 vector<string> VariantCallFile::formatIds(void) {
1635 vector<string> tags;
1636 vector<string> headerLines = split(header, '\n');
1637 for (vector<string>::iterator s = headerLines.begin(); s != headerLines.end(); ++s) {
1638 string& line = *s;
1639 if (line.find("##FORMAT") == 0) {
1640 size_t pos = line.find("ID=");
1641 if (pos != string::npos) {
1642 pos += 3;
1643 size_t tagend = line.find(",", pos);
1644 if (tagend != string::npos) {
1645 tags.push_back(line.substr(pos, tagend - pos));
1646 }
1647 }
1648 }
1649 }
1650 return tags;
1651 }
1652
removeInfoHeaderLine(string tag)1653 void VariantCallFile::removeInfoHeaderLine(string tag) {
1654 vector<string> headerLines = split(header, '\n');
1655 vector<string> newHeader;
1656 string id = "ID=" + tag + ",";
1657 for (vector<string>::iterator s = headerLines.begin(); s != headerLines.end(); ++s) {
1658 string& line = *s;
1659 if (line.find("##INFO") == 0) {
1660 if (line.find(id) == string::npos) {
1661 newHeader.push_back(line);
1662 }
1663 } else {
1664 newHeader.push_back(line);
1665 }
1666 }
1667 header = join(newHeader, "\n");
1668 }
1669
removeGenoHeaderLine(string tag)1670 void VariantCallFile::removeGenoHeaderLine(string tag) {
1671 vector<string> headerLines = split(header, '\n');
1672 vector<string> newHeader;
1673 string id = "ID=" + tag + ",";
1674 for (vector<string>::iterator s = headerLines.begin(); s != headerLines.end(); ++s) {
1675 string& headerLine = *s;
1676 if (headerLine.find("##FORMAT") == 0) {
1677 if (headerLine.find(id) == string::npos) {
1678 newHeader.push_back(headerLine);
1679 }
1680 } else {
1681 newHeader.push_back(headerLine);
1682 }
1683 }
1684 header = join(newHeader, "\n");
1685 }
1686
getHeaderLinesFromFile()1687 vector<string> VariantCallFile::getHeaderLinesFromFile()
1688 {
1689 string headerStr = "";
1690
1691 if (usingTabix) {
1692 tabixFile->getHeader(headerStr);
1693 if (headerStr.empty()) {
1694 cerr << "error: no VCF header" << endl;
1695 exit(1);
1696 }
1697 tabixFile->getNextLine(line);
1698 firstRecord = true;
1699 } else {
1700 while (std::getline(*file, line)) {
1701 if (line.substr(0,1) == "#") {
1702 headerStr += line + '\n';
1703 } else {
1704 // done with header
1705 if (headerStr.empty()) {
1706 cerr << "error: no VCF header" << endl;
1707 return vector<string>();
1708 }
1709 firstRecord = true;
1710 break;
1711 }
1712 }
1713 }
1714 return split(headerStr, "\n");
1715 }
1716
parseHeader(void)1717 bool VariantCallFile::parseHeader(void) {
1718
1719 string headerStr = "";
1720
1721 if (usingTabix) {
1722 tabixFile->getHeader(headerStr);
1723 if (headerStr.empty()) {
1724 cerr << "error: no VCF header" << endl;
1725 exit(1);
1726 }
1727 tabixFile->getNextLine(line);
1728 firstRecord = true;
1729 } else {
1730 while (std::getline(*file, line)) {
1731 if (line.substr(0,1) == "#") {
1732 headerStr += line + '\n';
1733 } else {
1734 // done with header
1735 if (headerStr.empty()) {
1736 cerr << "error: no VCF header" << endl;
1737 return false;
1738 }
1739 firstRecord = true;
1740 break;
1741 }
1742 }
1743 }
1744 this->vcf_header = headerStr;
1745
1746 return parseHeader(headerStr);
1747
1748 }
1749
parseHeader(string & hs)1750 bool VariantCallFile::parseHeader(string& hs) {
1751
1752 if (hs.empty()) return false;
1753 if (hs.substr(hs.size() - 1, 1) == "\n") {
1754 hs.erase(hs.size() - 1, 1); // remove trailing newline
1755 }
1756 header = hs; // stores the header in the object instance
1757
1758 vector<string> headerLines = split(header, "\n");
1759 for (vector<string>::iterator h = headerLines.begin(); h != headerLines.end(); ++h) {
1760 string headerLine = *h;
1761 if (headerLine.substr(0,2) == "##") {
1762 // meta-information headerLines
1763 // TODO parse into map from info/format key to type
1764 // ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
1765 // ##FORMAT=<ID=CB,Number=1,Type=String,Description="Called by S(Sanger), M(UMich), B(BI)">
1766 size_t found = headerLine.find_first_of("=");
1767 string entryType = headerLine.substr(2, found - 2);
1768 // handle reference here, no "<" and ">" given
1769 //} else if (entryType == "reference") {
1770 size_t dataStart = headerLine.find_first_of("<");
1771 size_t dataEnd = headerLine.find_first_of(">");
1772 if (dataStart != string::npos && dataEnd != string::npos) {
1773 string entryData = headerLine.substr(dataStart + 1, dataEnd - dataStart - 1);
1774 // XXX bad; this will break if anyone ever moves the order
1775 // of the fields around to include a "long form" string
1776 // including either a = or , in the first or second field
1777 if (entryType == "INFO" || entryType == "FORMAT") {
1778 vector<string> fields = split(entryData, "=,");
1779 if (fields[0] != "ID") {
1780 cerr << "header parse error at:" << endl
1781 << "fields[0] != \"ID\"" << endl
1782 << headerLine << endl;
1783 exit(1);
1784 }
1785 string id = fields[1];
1786 if (fields[2] != "Number") {
1787 cerr << "header parse error at:" << endl
1788 << "fields[2] != \"Number\"" << endl
1789 << headerLine << endl;
1790 exit(1);
1791 }
1792 int number;
1793 string numberstr = fields[3].c_str();
1794 // XXX TODO VCF has variable numbers of fields...
1795 if (numberstr == "A") {
1796 number = ALLELE_NUMBER;
1797 } else if (numberstr == "G") {
1798 number = GENOTYPE_NUMBER;
1799 } else if (numberstr == ".") {
1800 number = 1;
1801 } else {
1802 convert(numberstr, number);
1803 }
1804 if (fields[4] != "Type") {
1805 cerr << "header parse error at:" << endl
1806 << "fields[4] != \"Type\"" << endl
1807 << headerLine << endl;
1808 exit(1);
1809 }
1810 VariantFieldType type = typeStrToVariantFieldType(fields[5]);
1811 if (entryType == "INFO") {
1812 infoCounts[id] = number;
1813 infoTypes[id] = type;
1814 //cerr << id << " == " << type << endl;
1815 } else if (entryType == "FORMAT") {
1816 //cout << "found format field " << id << " with type " << type << endl;
1817 formatCounts[id] = number;
1818 formatTypes[id] = type;
1819 }
1820 }
1821 }
1822 } else if (headerLine.substr(0,1) == "#") {
1823 // field name headerLine
1824 vector<string> fields = split(headerLine, '\t');
1825 if (fields.size() > 8) {
1826 sampleNames.resize(fields.size() - 9);
1827 copy(fields.begin() + 9, fields.end(), sampleNames.begin());
1828 }
1829 }
1830 }
1831
1832 return true;
1833 }
1834
getNextVariant(Variant & var)1835 bool VariantCallFile::getNextVariant(Variant& var) {
1836 if (firstRecord && !justSetRegion) {
1837 if (!line.empty() && line.substr(0,1) != "#") {
1838 var.parse(line, parseSamples);
1839 firstRecord = false;
1840 _done = false;
1841 return true;
1842 } else {
1843 return false;
1844 }
1845 }
1846 if (usingTabix) {
1847 if (justSetRegion && !line.empty() && line.substr(0,1) != "#") {
1848 if (firstRecord) {
1849 firstRecord = false;
1850 }
1851 var.parse(line, parseSamples);
1852 line.clear();
1853 justSetRegion = false;
1854 _done = false;
1855 return true;
1856 } else if (tabixFile->getNextLine(line)) {
1857 var.parse(line, parseSamples);
1858 _done = false;
1859 return true;
1860 } else {
1861 _done = true;
1862 return false;
1863 }
1864 } else {
1865 if (std::getline(*file, line)) {
1866 var.parse(line, parseSamples);
1867 _done = false;
1868 return true;
1869 } else {
1870 _done = true;
1871 return false;
1872 }
1873 }
1874 }
1875
setRegion(string seq,long int start,long int end)1876 bool VariantCallFile::setRegion(string seq, long int start, long int end) {
1877 stringstream regionstr;
1878 if (end) {
1879 regionstr << seq << ":" << start << "-" << end;
1880 } else {
1881 regionstr << seq << ":" << start;
1882 }
1883 return setRegion(regionstr.str());
1884 }
1885
setRegion(string region)1886 bool VariantCallFile::setRegion(string region) {
1887 if (!usingTabix) {
1888 cerr << "cannot setRegion on a non-tabix indexed file" << endl;
1889 exit(1);
1890 }
1891 size_t dots = region.find("..");
1892 // convert between bamtools/freebayes style region string and tabix/samtools style
1893 if (dots != string::npos) {
1894 region.replace(dots, 2, "-");
1895 }
1896 if (tabixFile->setRegion(region)) {
1897 if (tabixFile->getNextLine(line)) {
1898 justSetRegion = true;
1899 return true;
1900 } else {
1901 return false;
1902 }
1903 } else {
1904 return false;
1905 }
1906 }
1907
1908
1909 // genotype manipulation
1910 /*
1911 map<string, int> decomposeGenotype(string& genotype) {
1912 string splitter = "/";
1913 if (genotype.find("|") != string::npos) {
1914 splitter = "|";
1915 }
1916 vector<string> haps = split(genotype, splitter);
1917 map<string, int> decomposed;
1918 for (vector<string>::iterator h = haps.begin(); h != haps.end(); ++h) {
1919 ++decomposed[*h];
1920 }
1921 return decomposed;
1922 }
1923 */
1924
decomposeGenotype(const string & genotype)1925 map<int, int> decomposeGenotype(const string& genotype) {
1926 string splitter = "/";
1927 if (genotype.find("|") != string::npos) {
1928 splitter = "|";
1929 }
1930 vector<string> haps = split(genotype, splitter);
1931 map<int, int> decomposed;
1932 for (vector<string>::iterator h = haps.begin(); h != haps.end(); ++h) {
1933 int alt;
1934 if (*h == ".") {
1935 ++decomposed[NULL_ALLELE];
1936 } else {
1937 convert(*h, alt);
1938 ++decomposed[alt];
1939 }
1940 }
1941 return decomposed;
1942 }
1943
decomposePhasedGenotype(const string & genotype)1944 vector<int> decomposePhasedGenotype(const string& genotype) {
1945 string splitter = "/";
1946 if (genotype.find("|") != string::npos) {
1947 splitter = "|";
1948 }
1949 vector<string> haps = split(genotype, splitter);
1950 if (haps.size() > 1 && splitter == "/") {
1951 cerr << "could not find '|' in genotype, cannot decomposePhasedGenotype on unphased genotypes" << endl;
1952 exit(1);
1953 }
1954 vector<int> decomposed;
1955 for (vector<string>::iterator h = haps.begin(); h != haps.end(); ++h) {
1956 int alt;
1957 if (*h == ".") {
1958 decomposed.push_back(NULL_ALLELE);
1959 } else {
1960 convert(*h, alt);
1961 decomposed.push_back(alt);
1962 }
1963 }
1964 return decomposed;
1965 }
1966
genotypeToString(const map<int,int> & genotype)1967 string genotypeToString(const map<int, int>& genotype) {
1968 vector<int> s;
1969 for (map<int, int>::const_iterator g = genotype.begin(); g != genotype.end(); ++g) {
1970 int a = g->first;
1971 int c = g->second;
1972 for (int i = 0; i < c; ++i) s.push_back(a);
1973 }
1974 sort(s.begin(), s.end());
1975 vector<string> r;
1976 for (vector<int>::iterator i = s.begin(); i != s.end(); ++i) {
1977 if (*i == NULL_ALLELE) r.push_back(".");
1978 else r.push_back(convert(*i));
1979 }
1980 return join(r, "/"); // TODO adjust for phased/unphased
1981 }
1982
phasedGenotypeToString(const vector<int> & genotype)1983 string phasedGenotypeToString(const vector<int>& genotype) {
1984 vector<string> r;
1985 for (vector<int>::const_iterator i = genotype.begin(); i != genotype.end(); ++i) {
1986 if (*i == NULL_ALLELE) r.push_back(".");
1987 else r.push_back(convert(*i));
1988 }
1989 return join(r, "|");
1990 }
1991
isHet(const map<int,int> & genotype)1992 bool isHet(const map<int, int>& genotype) {
1993 return genotype.size() > 1;
1994 }
1995
isHom(const map<int,int> & genotype)1996 bool isHom(const map<int, int>& genotype) {
1997 return genotype.size() == 1;
1998 }
1999
hasNonRef(const map<int,int> & genotype)2000 bool hasNonRef(const map<int, int>& genotype) {
2001 for (map<int, int>::const_iterator g = genotype.begin(); g != genotype.end(); ++g) {
2002 if (g->first != 0) {
2003 return true;
2004 }
2005 }
2006 return false;
2007 }
2008
isHomRef(const map<int,int> & genotype)2009 bool isHomRef(const map<int, int>& genotype) {
2010 return isHom(genotype) && !hasNonRef(genotype);
2011 }
2012
isHomNonRef(const map<int,int> & genotype)2013 bool isHomNonRef(const map<int, int>& genotype) {
2014 return isHom(genotype) && hasNonRef(genotype);
2015 }
2016
isNull(const map<int,int> & genotype)2017 bool isNull(const map<int, int>& genotype) {
2018 return genotype.find(NULL_ALLELE) != genotype.end();
2019 }
2020
ploidy(const map<int,int> & genotype)2021 int ploidy(const map<int, int>& genotype) {
2022 int i = 0;
2023 for (map<int, int>::const_iterator g = genotype.begin(); g != genotype.end(); ++g) {
2024 i += g->second;
2025 }
2026 return i;
2027 }
2028
2029 // generates cigar from allele parsed by parsedAlternates
varCigar(vector<VariantAllele> & vav,bool xForMismatch)2030 string varCigar(vector<VariantAllele>& vav, bool xForMismatch) {
2031 string cigar;
2032 pair<int, string> element;
2033 for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
2034 VariantAllele& va = *v;
2035 if (va.ref != va.alt) {
2036 if (element.second == "M") {
2037 cigar += convert(element.first) + element.second;
2038 element.second = ""; element.first = 0;
2039 }
2040 if (va.ref.size() == va.alt.size()) {
2041 cigar += convert(va.ref.size()) + (xForMismatch ? "X" : "M");
2042 } else if (va.ref.size() > va.alt.size()) {
2043 cigar += convert(va.ref.size() - va.alt.size()) + "D";
2044 } else {
2045 cigar += convert(va.alt.size() - va.ref.size()) + "I";
2046 }
2047 } else {
2048 if (element.second == "M") {
2049 element.first += va.ref.size();
2050 } else {
2051 element = make_pair(va.ref.size(), "M");
2052 }
2053 }
2054 }
2055 if (element.second == "M") {
2056 cigar += convert(element.first) + element.second;
2057 }
2058 element.second = ""; element.first = 0;
2059 return cigar;
2060 }
2061
parsedAlternates(bool includePreviousBaseForIndels,bool useMNPs,bool useEntropy,float matchScore,float mismatchScore,float gapOpenPenalty,float gapExtendPenalty,float repeatGapExtendPenalty,string flankingRefLeft,string flankingRefRight)2062 map<string, vector<VariantAllele> > Variant::parsedAlternates(bool includePreviousBaseForIndels,
2063 bool useMNPs,
2064 bool useEntropy,
2065 float matchScore,
2066 float mismatchScore,
2067 float gapOpenPenalty,
2068 float gapExtendPenalty,
2069 float repeatGapExtendPenalty,
2070 string flankingRefLeft,
2071 string flankingRefRight) {
2072
2073 map<string, vector<VariantAllele> > variantAlleles;
2074
2075 if (isSymbolicSV()){
2076 // Don't ever align SVs. It just wrecks things.
2077 return this->flatAlternates();
2078 }
2079 // add the reference allele
2080 variantAlleles[ref].push_back(VariantAllele(ref, ref, position));
2081
2082 // single SNP case, no ambiguity possible, no need to spend a lot of
2083 // compute aligning ref and alt fields
2084 if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
2085 variantAlleles[alt.front()].push_back(VariantAllele(ref, alt.front(), position));
2086 return variantAlleles;
2087 }
2088
2089 // padding is used to ensure a stable alignment of the alternates to the reference
2090 // without having to go back and look at the full reference sequence
2091 int paddingLen = max(10, (int) (ref.size())); // dynamically determine optimum padding length
2092 for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
2093 string& alternate = *a;
2094 paddingLen = max(paddingLen, (int) (alternate.size()));
2095 }
2096 char padChar = 'Z';
2097 char anchorChar = 'Q';
2098 string padding(paddingLen, padChar);
2099
2100 // this 'anchored' string is done for stability
2101 // the assumption is that there should be a positional match in the first base
2102 // this is true for VCF 4.1, and standard best practices
2103 // using the anchor char ensures this without other kinds of realignment
2104 string reference_M;
2105 if (flankingRefLeft.empty() && flankingRefRight.empty()) {
2106 reference_M = padding + ref + padding;
2107 reference_M[paddingLen] = anchorChar;
2108 } else {
2109 reference_M = flankingRefLeft + ref + flankingRefRight;
2110 paddingLen = flankingRefLeft.size();
2111 }
2112
2113 // passed to sw.Align
2114 unsigned int referencePos;
2115
2116 string cigar;
2117
2118 for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
2119
2120 string& alternate = *a;
2121 vector<VariantAllele>& variants = variantAlleles[alternate];
2122 string alternateQuery_M;
2123 if (flankingRefLeft.empty() && flankingRefRight.empty()) {
2124 alternateQuery_M = padding + alternate + padding;
2125 alternateQuery_M[paddingLen] = anchorChar;
2126 } else {
2127 alternateQuery_M = flankingRefLeft + alternate + flankingRefRight;
2128 }
2129 //const unsigned int alternateLen = alternate.size();
2130
2131 if (true) {
2132 CSmithWatermanGotoh sw(matchScore,
2133 mismatchScore,
2134 gapOpenPenalty,
2135 gapExtendPenalty);
2136 if (useEntropy) sw.EnableEntropyGapPenalty(1);
2137 if (repeatGapExtendPenalty != 0){
2138 sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
2139 }
2140 sw.Align(referencePos, cigar, reference_M, alternateQuery_M);
2141 } else { // disabled for now
2142 StripedSmithWaterman::Aligner aligner;
2143 StripedSmithWaterman::Filter sswFilter;
2144 StripedSmithWaterman::Alignment alignment;
2145 aligner.Align(alternateQuery_M.c_str(),
2146 reference_M.c_str(),
2147 reference_M.size(), sswFilter, &alignment);
2148 cigar = alignment.cigar_string;
2149 }
2150
2151 // left-realign the alignment...
2152
2153 vector<pair<int, string> > cigarData = splitCigar(cigar);
2154
2155 if (cigarData.front().second != "M"
2156 || cigarData.back().second != "M"
2157 || cigarData.front().first < paddingLen
2158 || cigarData.back().first < paddingLen) {
2159 cerr << "parsedAlternates: alignment does not start with match over padded sequence" << endl;
2160 cerr << cigar << endl;
2161 cerr << reference_M << endl;
2162 cerr << alternateQuery_M << endl;
2163 exit(1);
2164 } else {
2165 cigarData.front().first -= paddingLen;
2166 cigarData.back().first -= paddingLen;;
2167 }
2168 //cigarData = cleanCigar(cigarData);
2169 cigar = joinCigar(cigarData);
2170
2171 int altpos = 0;
2172 int refpos = 0;
2173
2174 for (vector<pair<int, string> >::iterator e = cigarData.begin();
2175 e != cigarData.end(); ++e) {
2176
2177 int len = e->first;
2178 string type = e->second;
2179
2180 switch (type.at(0)) {
2181 case 'I':
2182 if (includePreviousBaseForIndels) {
2183 if (!variants.empty() &&
2184 variants.back().ref != variants.back().alt) {
2185 VariantAllele a =
2186 VariantAllele("",
2187 alternate.substr(altpos, len),
2188 refpos + position);
2189 variants.back() = variants.back() + a;
2190 } else {
2191 VariantAllele a =
2192 VariantAllele(ref.substr(refpos - 1, 1),
2193 alternate.substr(altpos - 1, len + 1),
2194 refpos + position - 1);
2195 variants.push_back(a);
2196 }
2197 } else {
2198 variants.push_back(VariantAllele("",
2199 alternate.substr(altpos, len),
2200 refpos + position));
2201 }
2202 altpos += len;
2203 break;
2204 case 'D':
2205 if (includePreviousBaseForIndels) {
2206 if (!variants.empty() &&
2207 variants.back().ref != variants.back().alt) {
2208 VariantAllele a
2209 = VariantAllele(ref.substr(refpos, len)
2210 , "", refpos + position);
2211 variants.back() = variants.back() + a;
2212 } else {
2213 VariantAllele a
2214 = VariantAllele(ref.substr(refpos - 1, len + 1),
2215 alternate.substr(altpos - 1, 1),
2216 refpos + position - 1);
2217 variants.push_back(a);
2218 }
2219 } else {
2220 variants.push_back(VariantAllele(ref.substr(refpos, len),
2221 "", refpos + position));
2222 }
2223 refpos += len;
2224 break;
2225
2226 // zk has added (!variants.empty()) solves the seg fault in
2227 // vcfstats, but need to test
2228 case 'M':
2229 {
2230 for (int i = 0; i < len; ++i) {
2231 VariantAllele a
2232 = VariantAllele(ref.substr(refpos + i, 1),
2233 alternate.substr(altpos + i, 1),
2234 (refpos + i + position));
2235 if (useMNPs && (!variants.empty()) &&
2236 variants.back().ref.size() == variants.back().alt.size()
2237 && variants.back().ref != variants.back().alt) {
2238 variants.back() = variants.back() + a;
2239 } else {
2240 variants.push_back(a);
2241 }
2242 }
2243 }
2244 refpos += len;
2245 altpos += len;
2246 break;
2247 case 'S':
2248 {
2249 refpos += len;
2250 altpos += len;
2251 break;
2252 }
2253 default:
2254 {
2255 break;
2256 }
2257 }
2258 }
2259 }
2260 return variantAlleles;
2261 }
2262
flatAlternates(void)2263 map<string, vector<VariantAllele> > Variant::flatAlternates(void) {
2264 map<string, vector<VariantAllele> > variantAlleles;
2265 for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
2266 string& alternate = *a;
2267 vector<VariantAllele>& variants = variantAlleles[alternate];
2268 variants.push_back(VariantAllele(ref, alternate, position));
2269 }
2270 return variantAlleles;
2271 }
2272
altSet(void)2273 set<string> Variant::altSet(void) {
2274 set<string> altset(alt.begin(), alt.end());
2275 return altset;
2276 }
2277
operator <<(ostream & out,VariantAllele & var)2278 ostream& operator<<(ostream& out, VariantAllele& var) {
2279 out << var.position << " " << var.ref << " -> " << var.alt;
2280 return out;
2281 }
2282
operator +(const VariantAllele & a,const VariantAllele & b)2283 VariantAllele operator+(const VariantAllele& a, const VariantAllele& b) {
2284 return VariantAllele(a.ref + b.ref, a.alt + b.alt, a.position);
2285 }
2286
operator <(const VariantAllele & a,const VariantAllele & b)2287 bool operator<(const VariantAllele& a, const VariantAllele& b) {
2288 return a.repr < b.repr;
2289 }
2290
getGenotypeIndexesDiploid(void)2291 map<pair<int, int>, int> Variant::getGenotypeIndexesDiploid(void) {
2292
2293 map<pair<int, int>, int> genotypeIndexes;
2294 //map<int, map<Genotype*, int> > vcfGenotypeOrder;
2295 vector<int> indexes;
2296 for (int i = 0; i < alleles.size(); ++i) {
2297 indexes.push_back(i);
2298 }
2299 int ploidy = 2; // ONLY diploid
2300 vector<vector<int> > genotypes = multichoose(ploidy, indexes);
2301 for (vector<vector<int> >::iterator g = genotypes.begin(); g != genotypes.end(); ++g) {
2302 sort(g->begin(), g->end()); // enforce e.g. 0/1, 0/2, 1/2 ordering over reverse
2303 // XXX this does not handle non-diploid!!!!
2304 int j = g->front();
2305 int k = g->back();
2306 genotypeIndexes[make_pair(j, k)] = (k * (k + 1) / 2) + j;
2307 }
2308 return genotypeIndexes;
2309
2310 }
2311
updateAlleleIndexes(void)2312 void Variant::updateAlleleIndexes(void) {
2313 // adjust the allele index
2314 altAlleleIndexes.clear();
2315 int m = 0;
2316 for (vector<string>::iterator a = alt.begin();
2317 a != alt.end(); ++a, ++m) {
2318 altAlleleIndexes[*a] = m;
2319 }
2320 }
2321
2322 // TODO only works on "A"llele variant fields
removeAlt(const string & altAllele)2323 void Variant::removeAlt(const string& altAllele) {
2324
2325 int altIndex = getAltAlleleIndex(altAllele); // this is the alt-relative index, 0-based
2326
2327 for (map<string, int>::iterator c = vcf->infoCounts.begin();
2328 c != vcf->infoCounts.end(); ++c) {
2329 int count = c->second;
2330 if (count == ALLELE_NUMBER) {
2331 string key = c->first;
2332 map<string, vector<string> >::iterator v = info.find(key);
2333 if (v != info.end()) {
2334 vector<string>& vals = v->second;
2335 vector<string> tokeep;
2336 int i = 0;
2337 for (vector<string>::iterator a = vals.begin();
2338 a != vals.end(); ++a, ++i) {
2339 if (i != altIndex) {
2340 tokeep.push_back(*a);
2341 }
2342 }
2343 vals = tokeep;
2344 }
2345 }
2346 }
2347
2348 for (map<string, int>::iterator c = vcf->formatCounts.begin();
2349 c != vcf->formatCounts.end(); ++c) {
2350 int count = c->second;
2351 if (count == ALLELE_NUMBER) {
2352 string key = c->first;
2353 for (map<string, map<string, vector<string> > >::iterator
2354 s = samples.begin(); s != samples.end(); ++s) {
2355 map<string, vector<string> >& sample = s->second;
2356 map<string, vector<string> >::iterator v = sample.find(key);
2357 if (v != sample.end()) {
2358 vector<string>& vals = v->second;
2359 vector<string> tokeep;
2360 int i = 0;
2361 for (vector<string>::iterator a = vals.begin();
2362 a != vals.end(); ++a, ++i) {
2363 if (i != altIndex) {
2364 tokeep.push_back(*a);
2365 }
2366 }
2367 vals = tokeep;
2368 }
2369 }
2370 }
2371 }
2372
2373 int altSpecIndex = altIndex + 1; // this is the genotype-spec index, ref=0, 1-based for alts
2374
2375 vector<string> newalt;
2376 map<int, int> alleleIndexMapping;
2377 // setup the new alt string
2378 alleleIndexMapping[0] = 0; // reference allele remains the same
2379 alleleIndexMapping[NULL_ALLELE] = NULL_ALLELE; // null allele remains the same
2380 int i = 1; // current index
2381 int j = 1; // new index
2382 for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a, ++i) {
2383 if (i != altSpecIndex) {
2384 newalt.push_back(*a);
2385 // get the mapping between new and old allele indexes
2386 alleleIndexMapping[i] = j;
2387 ++j;
2388 } else {
2389 alleleIndexMapping[i] = NULL_ALLELE;
2390 }
2391 }
2392
2393 // fix the sample genotypes, removing reference to the old allele
2394 map<string, int> samplePloidy;
2395 for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
2396 map<string, vector<string> >& sample = s->second;
2397 if (sample.find("GT") != sample.end()) {
2398 string& gt = sample["GT"].front();
2399 string splitter = "/";
2400 if (gt.find("|") != string::npos) {
2401 splitter = "|";
2402 }
2403
2404 if (splitter == "/") {
2405 samplePloidy[s->first] = split(gt, splitter).size();
2406 map<int, int> genotype = decomposeGenotype(sample["GT"].front());
2407 map<int, int> newGenotype;
2408 for (map<int, int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
2409 newGenotype[alleleIndexMapping[g->first]] += g->second;
2410 }
2411 sample["GT"].clear();
2412 sample["GT"].push_back(genotypeToString(newGenotype));
2413 } else {
2414 samplePloidy[s->first] = split(gt, splitter).size();
2415 vector<int> genotype = decomposePhasedGenotype(sample["GT"].front());
2416 vector<int> newGenotype;
2417 for (vector<int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
2418 newGenotype.push_back(alleleIndexMapping[*g]);
2419 }
2420 sample["GT"].clear();
2421 sample["GT"].push_back(phasedGenotypeToString(newGenotype));
2422 }
2423 }
2424 }
2425
2426 set<int> ploidies;
2427 for (map<string, int>::iterator p = samplePloidy.begin(); p != samplePloidy.end(); ++p) {
2428 ploidies.insert(p->second);
2429 }
2430
2431 // fix the sample genotype likelihoods, removing reference to the old allele
2432 // which GL fields should we remove?
2433 vector<int> toRemove;
2434 toRemove.push_back(altSpecIndex);
2435 map<int, map<int, int> > glMappingByPloidy;
2436 for (set<int>::iterator p = ploidies.begin(); p != ploidies.end(); ++p) {
2437 glMappingByPloidy[*p] = glReorder(*p, alt.size() + 1, alleleIndexMapping, toRemove);
2438 }
2439
2440 for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
2441 map<string, vector<string> >& sample = s->second;
2442 map<string, vector<string> >::iterator glsit = sample.find("GL");
2443 if (glsit != sample.end()) {
2444 vector<string>& gls = glsit->second; // should be split already
2445 map<int, string> newgls;
2446 map<int, int>& newOrder = glMappingByPloidy[samplePloidy[s->first]];
2447 int i = 0;
2448 for (vector<string>::iterator g = gls.begin(); g != gls.end(); ++g, ++i) {
2449 int j = newOrder[i];
2450 if (j != -1) {
2451 newgls[i] = *g;
2452 }
2453 }
2454 // update the gls
2455 gls.clear();
2456 for (map<int, string>::iterator g = newgls.begin(); g != newgls.end(); ++g) {
2457 gls.push_back(g->second);
2458 }
2459 }
2460 }
2461
2462 // reset the alt
2463 alt = newalt;
2464
2465 // and the alleles
2466 alleles.clear();
2467 alleles.push_back(ref);
2468 alleles.insert(alleles.end(), alt.begin(), alt.end());
2469
2470 updateAlleleIndexes();
2471
2472 }
2473
2474 // union of lines in headers of input files
unionInfoHeaderLines(string & s1,string & s2)2475 string unionInfoHeaderLines(string& s1, string& s2) {
2476 vector<string> lines1 = split(s1, "\n");
2477 vector<string> lines2 = split(s2, "\n");
2478 vector<string> result;
2479 set<string> l2;
2480 string lastHeaderLine; // this one needs to be at the end
2481 for (vector<string>::iterator s = lines2.begin(); s != lines2.end(); ++s) {
2482 if (s->substr(0,6) == "##INFO") {
2483 l2.insert(*s);
2484 }
2485 }
2486 for (vector<string>::iterator s = lines1.begin(); s != lines1.end(); ++s) {
2487 if (l2.count(*s)) {
2488 l2.erase(*s);
2489 }
2490 if (s->substr(0,6) == "#CHROM") {
2491 lastHeaderLine = *s;
2492 } else {
2493 result.push_back(*s);
2494 }
2495 }
2496 for (set<string>::iterator s = l2.begin(); s != l2.end(); ++s) {
2497 result.push_back(*s);
2498 }
2499 if (lastHeaderLine.empty()) {
2500 cerr << "could not find CHROM POS ... header line" << endl;
2501 exit(1);
2502 }
2503 result.push_back(lastHeaderLine);
2504 return join(result, "\n");
2505 }
2506
mergeCigar(const string & c1,const string & c2)2507 string mergeCigar(const string& c1, const string& c2) {
2508 vector<pair<int, string> > cigar1 = splitCigar(c1);
2509 vector<pair<int, string> > cigar2 = splitCigar(c2);
2510 // check if the middle elements are the same
2511 if (cigar1.back().second == cigar2.front().second) {
2512 cigar1.back().first += cigar2.front().first;
2513 cigar2.erase(cigar2.begin());
2514 }
2515 for (vector<pair<int, string> >::iterator c = cigar2.begin(); c != cigar2.end(); ++c) {
2516 cigar1.push_back(*c);
2517 }
2518 return joinCigar(cigar1);
2519 }
2520
splitCigar(const string & cigarStr)2521 vector<pair<int, string> > splitCigar(const string& cigarStr) {
2522 vector<pair<int, string> > cigar;
2523 string number;
2524 string type;
2525 // strings go [Number][Type] ...
2526 for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
2527 char c = *s;
2528 if (isdigit(c)) {
2529 if (type.empty()) {
2530 number += c;
2531 } else {
2532 // signal for next token, push back the last pair, clean up
2533 cigar.push_back(make_pair(atoi(number.c_str()), type));
2534 number.clear();
2535 type.clear();
2536 number += c;
2537 }
2538 } else {
2539 type += c;
2540 }
2541 }
2542 if (!number.empty() && !type.empty()) {
2543 cigar.push_back(make_pair(atoi(number.c_str()), type));
2544 }
2545 return cigar;
2546 }
2547
splitCigarList(const string & cigarStr)2548 list<pair<int, string> > splitCigarList(const string& cigarStr) {
2549 list<pair<int, string> > cigar;
2550 string number;
2551 string type;
2552 // strings go [Number][Type] ...
2553 for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
2554 char c = *s;
2555 if (isdigit(c)) {
2556 if (type.empty()) {
2557 number += c;
2558 } else {
2559 // signal for next token, push back the last pair, clean up
2560 cigar.push_back(make_pair(atoi(number.c_str()), type));
2561 number.clear();
2562 type.clear();
2563 number += c;
2564 }
2565 } else {
2566 type += c;
2567 }
2568 }
2569 if (!number.empty() && !type.empty()) {
2570 cigar.push_back(make_pair(atoi(number.c_str()), type));
2571 }
2572 return cigar;
2573 }
2574
cleanCigar(const vector<pair<int,string>> & cigar)2575 vector<pair<int, string> > cleanCigar(const vector<pair<int, string> >& cigar) {
2576 vector<pair<int, string> > cigarClean;
2577 for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
2578 if (c->first > 0) {
2579 cigarClean.push_back(*c);
2580 }
2581 }
2582 return cigarClean;
2583 }
2584
joinCigar(const vector<pair<int,string>> & cigar)2585 string joinCigar(const vector<pair<int, string> >& cigar) {
2586 string cigarStr;
2587 for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
2588 if (c->first) {
2589 cigarStr += convert(c->first) + c->second;
2590 }
2591 }
2592 return cigarStr;
2593 }
2594
joinCigar(const vector<pair<int,char>> & cigar)2595 string joinCigar(const vector<pair<int, char> >& cigar) {
2596 string cigarStr;
2597 for (vector<pair<int, char> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
2598 if (c->first) {
2599 cigarStr += convert(c->first) + string(1, c->second);
2600 }
2601 }
2602 return cigarStr;
2603 }
2604
joinCigarList(const list<pair<int,string>> & cigar)2605 string joinCigarList(const list<pair<int, string> >& cigar) {
2606 string cigarStr;
2607 for (list<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
2608 cigarStr += convert(c->first) + c->second;
2609 }
2610 return cigarStr;
2611 }
2612
cigarRefLen(const vector<pair<int,char>> & cigar)2613 int cigarRefLen(const vector<pair<int, char> >& cigar) {
2614 int len = 0;
2615 for (vector<pair<int, char> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
2616 if (c->second == 'M' || c->second == 'D' || c->second == 'X') {
2617 len += c->first;
2618 }
2619 }
2620 return len;
2621 }
2622
cigarRefLen(const vector<pair<int,string>> & cigar)2623 int cigarRefLen(const vector<pair<int, string> >& cigar) {
2624 int len = 0;
2625 for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
2626 if (c->second == "M" || c->second == "D" || c->second == "X") {
2627 len += c->first;
2628 }
2629 }
2630 return len;
2631 }
2632
isEmptyCigarElement(const pair<int,string> & elem)2633 bool isEmptyCigarElement(const pair<int, string>& elem) {
2634 return elem.first == 0;
2635 }
2636
_glorder(int ploidy,int alts)2637 list<list<int> > _glorder(int ploidy, int alts) {
2638 if (ploidy == 1) {
2639 list<list<int> > results;
2640 for (int n = 0; n < alts; ++n) {
2641 list<int> v;
2642 v.push_back(n);
2643 results.push_back(v);
2644 }
2645 return results;
2646 } else {
2647 list<list<int> > results;
2648 for (int n = 0; n < alts; ++n) {
2649 list<list<int> > x = _glorder(ploidy - 1, alts);
2650 for (list<list<int> >::iterator v = x.begin(); v != x.end(); ++v) {
2651 if (v->front() <= n) {
2652 v->push_front(n);
2653 results.push_back(*v);
2654 }
2655 }
2656 }
2657 return results;
2658 }
2659 }
2660
2661 // genotype likelihood-ordering of genotypes, where each genotype is a
2662 // list of integers (as written in the GT field)
glorder(int ploidy,int alts)2663 list<list<int> > glorder(int ploidy, int alts) {
2664 list<list<int> > results = _glorder(ploidy, alts);
2665 for (list<list<int> >::iterator v = results.begin(); v != results.end(); ++v) {
2666 v->reverse();
2667 }
2668 return results;
2669 }
2670
2671 // which genotype likelihoods would include this alternate allele
glsWithAlt(int alt,int ploidy,int numalts)2672 list<int> glsWithAlt(int alt, int ploidy, int numalts) {
2673 list<int> gls;
2674 list<list<int> > orderedGenotypes = glorder(ploidy, numalts);
2675 int i = 0;
2676 for (list<list<int> >::iterator v = orderedGenotypes.begin(); v != orderedGenotypes.end(); ++v, ++i) {
2677 for (list<int>::iterator q = v->begin(); q != v->end(); ++q) {
2678 if (*q == alt) {
2679 gls.push_back(i);
2680 break;
2681 }
2682 }
2683 }
2684 return gls;
2685 }
2686
2687 // describes the mapping between the old gl ordering and and a new
2688 // one in which the GLs including the old alt have been removed
2689 // a map to -1 means "remove"
glReorder(int ploidy,int numalts,map<int,int> & alleleIndexMapping,vector<int> & altsToRemove)2690 map<int, int> glReorder(int ploidy, int numalts, map<int, int>& alleleIndexMapping, vector<int>& altsToRemove) {
2691 map<int, int> mapping;
2692 list<list<int> > orderedGenotypes = glorder(ploidy, numalts);
2693 for (list<list<int> >::iterator v = orderedGenotypes.begin(); v != orderedGenotypes.end(); ++v) {
2694 for (list<int>::iterator n = v->begin(); n != v->end(); ++n) {
2695 *n = alleleIndexMapping[*n];
2696 }
2697 }
2698 list<list<int> > newOrderedGenotypes = glorder(ploidy, numalts - altsToRemove.size());
2699 map<list<int>, int> newOrderedGenotypesMapping;
2700 int i = 0;
2701 // mapping is wrong...
2702 for (list<list<int> >::iterator v = newOrderedGenotypes.begin(); v != newOrderedGenotypes.end(); ++v, ++i) {
2703 newOrderedGenotypesMapping[*v] = i;
2704 }
2705 i = 0;
2706 for (list<list<int> >::iterator v = orderedGenotypes.begin(); v != orderedGenotypes.end(); ++v, ++i) {
2707 map<list<int>, int>::iterator m = newOrderedGenotypesMapping.find(*v);
2708 if (m != newOrderedGenotypesMapping.end()) {
2709 //cout << "new gl order of " << i << " is " << m->second << endl;
2710 mapping[i] = m->second;
2711 } else {
2712 //cout << i << " will be removed" << endl;
2713 mapping[i] = -1;
2714 }
2715 }
2716 return mapping;
2717 }
2718
getGenotype(const string & sample)2719 string Variant::getGenotype(const string& sample) {
2720 map<string, map<string, vector<string> > >::iterator s = samples.find(sample);
2721 if (s != samples.end()) {
2722 map<string, vector<string> >::iterator f = s->second.find("GT");
2723 if (f != s->second.end()) {
2724 return f->second.front();
2725 }
2726 }
2727 return "";
2728 }
2729
isPhased(void)2730 bool Variant::isPhased(void) {
2731 for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
2732 map<string, vector<string> >& sample = s->second;
2733 map<string, vector<string> >::iterator g = sample.find("GT");
2734 if (g != sample.end()) {
2735 string gt = g->second.front();
2736 if (gt.size() > 1 && gt.find("|") == string::npos) {
2737 return false;
2738 }
2739 }
2740 }
2741 return true;
2742 }
2743
zeroBasedPosition(void) const2744 long Variant::zeroBasedPosition(void) const {
2745 return position - 1;
2746 }
2747
vrepr(void)2748 string Variant::vrepr(void) {
2749 return sequenceName + "\t" + convert(position) + "\t" + join(alleles, ",");
2750 }
2751
2752 // TODO
2753 /*
2754 vector<Variant*> Variant::matchingHaplotypes() {
2755
2756 int haplotypeStart = var.position;
2757 int haplotypeEnd = var.position + var.ref.size();
2758
2759 for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
2760 haplotypeStart = min((*v)->position, (long int) haplotypeStart);
2761 haplotypeEnd = max((*v)->position + (*v)->ref.size(), (long unsigned int) haplotypeEnd);
2762 }
2763
2764 // for everything overlapping and the current variant, construct the local haplotype within the bounds
2765 // if there is an exact match, the allele in the current VCF does intersect
2766
2767 string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart);
2768 map<string, vector<pair<Variant*, int> > > haplotypes; // map to variant and alt index
2769
2770 for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
2771 Variant& variant = **v;
2772 int altindex = 0;
2773 for (vector<string>::iterator a = variant.alt.begin(); a != variant.alt.end(); ++a, ++altindex) {
2774 string haplotype = referenceHaplotype;
2775 // get the relative start and end coordinates for the variant alternate allele
2776 int relativeStart = variant.position - haplotypeStart;
2777 haplotype.replace(relativeStart, variant.ref.size(), *a);
2778 haplotypes[haplotype].push_back(make_pair(*v, altindex));
2779 }
2780 }
2781
2782 Variant originalVar = var;
2783
2784 // determine the non-intersecting alts
2785 vector<string> altsToRemove;
2786 vector<int> altIndexesToRemove;
2787 for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
2788 string haplotype = referenceHaplotype;
2789 int relativeStart = var.position - haplotypeStart;
2790 haplotype.replace(relativeStart, var.ref.size(), *a);
2791 map<string, vector<pair<Variant*, int> > >::iterator h = haplotypes.find(haplotype);
2792 if ((intersecting && !invert && h == haplotypes.end())
2793 || (intersecting && invert && h != haplotypes.end())
2794 || (unioning && h != haplotypes.end())) {
2795 if (tag.empty() && mergeToTag.empty()) {
2796 altsToRemove.push_back(*a);
2797 } else {
2798 if (!tag.empty()) {
2799 var.info[tag].push_back(".");
2800 }
2801 if (!mergeToTag.empty()) {
2802 var.info[mergeToTag].push_back(".");
2803 }
2804 }
2805 } else {
2806 if (!tag.empty()) {
2807 var.info[tag].push_back(tagValue);
2808 }
2809 // NB: just take the first value for the mergeFromTag
2810 if (!mergeToTag.empty()) {
2811 Variant* v = h->second.front().first;
2812 int index = h->second.front().second;
2813 if (v->info.find(mergeFromTag) != v->info.end()) {
2814 // now you have to find the exact allele...
2815 string& otherValue = v->info[mergeFromTag].at(index);
2816 var.info[mergeToTag].push_back(otherValue);
2817 } else if (mergeFromTag == "QUAL") {
2818 var.info[mergeToTag].push_back(convert(v->quality));
2819 } else {
2820 var.info[mergeToTag].push_back(".");
2821 }
2822 }
2823 }
2824 }
2825
2826 // remove the non-overlapping (intersecting) or overlapping (unioning) alts
2827 if (intersecting && loci && altsToRemove.size() != var.alt.size()) {
2828 // we have a match in loci mode, so we should output the whole loci, not just the matching sequence
2829 } else {
2830 for (vector<string>::iterator a = altsToRemove.begin(); a != altsToRemove.end(); ++a) {
2831 var.removeAlt(*a);
2832 }
2833 }
2834
2835 if (unioning) {
2836
2837 // somehow sort the records and combine them?
2838 map<long int, vector<Variant*> > variants;
2839 for (vector<Variant*>::iterator o = overlapping.begin(); o != overlapping.end(); ++o) {
2840 if ((*o)->position <= var.position && // check ensures proper ordering of variants on output
2841 outputVariants.find(*o) == outputVariants.end()) {
2842 outputVariants.insert(*o);
2843 variants[(*o)->position].push_back(*o);
2844 }
2845 }
2846 // add in the current variant, if it has alts left
2847 if (!var.alt.empty()) {
2848 vector<Variant*>& vars = variants[var.position];
2849 int numalts = 0;
2850 for (vector<Variant*>::iterator v = vars.begin(); v != vars.end(); ++v) {
2851 numalts += (*v)->alt.size();
2852 }
2853 if (numalts + var.alt.size() == originalVar.alt.size()) {
2854 variants[var.position].clear();
2855 variants[var.position].push_back(&originalVar);
2856 } else {
2857 variants[var.position].push_back(&var);
2858 }
2859 }
2860
2861 for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) {
2862 for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) {
2863 cout << **o << endl;
2864 lastOutputPosition = max(lastOutputPosition, (*o)->position);
2865 }
2866 }
2867 } else {
2868 // if any alts remain, output the variant record
2869 if (!var.alt.empty()) {
2870 cout << var << endl;
2871 lastOutputPosition = max(lastOutputPosition, var.position);
2872 }
2873 }
2874
2875 }
2876 */
2877
2878
VCFHeader()2879 VCFHeader::VCFHeader()
2880 {
2881
2882 // add manditory fields
2883 this->header_columns.push_back("#CHROM");
2884 this->header_columns.push_back("POS");
2885 this->header_columns.push_back("ID");
2886 this->header_columns.push_back("REF");
2887 this->header_columns.push_back("ALT");
2888 this->header_columns.push_back("QUAL");
2889 this->header_columns.push_back("FILTER");
2890 this->header_columns.push_back("INFO");
2891
2892 // add the line names in order
2893 // the order is used when outputting as a string
2894 this->header_line_names_ordered.push_back("##fileFormat");
2895 this->header_line_names_ordered.push_back("##fileDate");
2896 this->header_line_names_ordered.push_back("##source");
2897 this->header_line_names_ordered.push_back("##reference");
2898 this->header_line_names_ordered.push_back( "##contig");
2899 this->header_line_names_ordered.push_back("##phasing");
2900 this->header_line_names_ordered.push_back( "##assembly");
2901
2902 // add the list names in order
2903 // the order is used when outputting as a string (getHeaderString)
2904 this->header_list_names_ordered.push_back("##info");
2905 this->header_list_names_ordered.push_back("##filter");
2906 this->header_list_names_ordered.push_back("##format");
2907 this->header_list_names_ordered.push_back("##alt");
2908 this->header_list_names_ordered.push_back("##sample");
2909 this->header_list_names_ordered.push_back("##pedigree");
2910 this->header_list_names_ordered.push_back("##pedigreedb");
2911
2912 // initialize the header_lines with the above vector.
2913 // Set the key as the ##_type_ and the value as an empty string
2914 // Empty strings are ignored when outputting as string (getHeaderString)
2915 for (vector<string>::const_iterator header_lines_iter = this->header_line_names_ordered.begin(); header_lines_iter != this->header_line_names_ordered.end(); ++header_lines_iter)
2916 {
2917 this->header_lines[(*header_lines_iter)] = "";
2918 }
2919
2920 // initialize the header_lines with the above vector.
2921 // Set the key as the ##_type_ and the value as an empty vector<string>
2922 // Empty vectors are ignored when outputting as string (getHeaderString)
2923 for (vector<string>::const_iterator header_lists_iter = this->header_list_names_ordered.begin(); header_lists_iter != this->header_list_names_ordered.end(); ++header_lists_iter)
2924 {
2925 this->header_lists[(*header_lists_iter)] = vector<string>(0);
2926 }
2927
2928 }
2929
addMetaInformationLine(const string & meta_line)2930 void VCFHeader::addMetaInformationLine(const string& meta_line)
2931 {
2932 // get the meta_line unique key (first chars before the =)
2933 unsigned int meta_line_index = meta_line.find("=", 0);
2934 string meta_line_prefix = meta_line.substr(0, meta_line_index);
2935
2936 // check if the meta_line_prefix is in the header_lines, if so add it to the appropirate list
2937 if (this->header_lines.find(meta_line_prefix) != header_lines.end()) // the meta_line is a header line so replace what was there
2938 {
2939 this->header_lines[meta_line_prefix] = meta_line;
2940 }
2941 else if (header_lists.find(meta_line_prefix) != header_lists.end() &&
2942 !metaInfoIdExistsInVector(meta_line, this->header_lists[meta_line_prefix])) // check if the metalineprefix is in the headerLists, if so add it to the appropirate list
2943 {
2944 this->header_lists[meta_line_prefix].push_back(meta_line);
2945 }
2946 }
2947
getHeaderString()2948 string VCFHeader::getHeaderString()
2949 {
2950 // getHeaderString generates the string each time it is called
2951 string header_string;
2952
2953 // start by adding the header_lines
2954 for (vector<string>::const_iterator header_lines_iter = this->header_line_names_ordered.begin(); header_lines_iter != this->header_line_names_ordered.end(); ++header_lines_iter)
2955 {
2956 if (this->header_lines[(*header_lines_iter)] != "")
2957 {
2958 header_string += this->header_lines[(*header_lines_iter)] + "\n";
2959 }
2960 }
2961
2962 // next add header_lists
2963 for (vector<string>::const_iterator header_lists_iter = this->header_list_names_ordered.begin(); header_lists_iter != this->header_list_names_ordered.end(); ++header_lists_iter)
2964 {
2965 vector<string> tmp_header_lists = this->header_lists[(*header_lists_iter)];
2966 for (vector<string>::const_iterator header_list = tmp_header_lists.begin(); header_list != tmp_header_lists.end(); ++header_list)
2967 {
2968 header_string += (*header_list) + "\n";
2969 }
2970 }
2971
2972 // last add header columns
2973 vector<string>::const_iterator last_element = this->header_columns.end() - 1;
2974 for (vector<string>::const_iterator header_column_iter = this->header_columns.begin(); header_column_iter != this->header_columns.end(); ++header_column_iter)
2975 {
2976 string delimiter = (header_column_iter == last_element) ? "\n" : "\t";
2977 header_string += (*header_column_iter) + delimiter;
2978 }
2979 return header_string;
2980 }
2981
metaInfoIdExistsInVector(const string & meta_line,vector<string> & meta_lines)2982 bool VCFHeader::metaInfoIdExistsInVector(const string& meta_line, vector<string>& meta_lines)
2983 {
2984 // extract the id from meta_line
2985 size_t meta_line_id_start_idx = meta_line.find("ID=", 0); // used for the start of the substring index
2986 size_t meta_line_id_end_idx = meta_line.find(",", meta_line_id_start_idx); // used for end of the substring index
2987 string meta_line_id = (meta_line_id_start_idx < meta_line_id_end_idx) ? meta_line.substr(meta_line_id_start_idx, meta_line_id_end_idx - meta_line_id_start_idx) : "";
2988
2989 for (vector<string>::const_iterator iter = meta_lines.begin(); iter != meta_lines.end(); ++iter)
2990 {
2991 // extract the id from iter's meta_line string
2992 size_t iter_meta_line_id_start_idx = (*iter).find("ID=", 0);
2993 size_t iter_meta_line_id_end_idx = (*iter).find(",", iter_meta_line_id_start_idx);
2994 string iter_meta_line_id = (iter_meta_line_id_start_idx < iter_meta_line_id_end_idx) ? (*iter).substr(iter_meta_line_id_start_idx, iter_meta_line_id_end_idx - iter_meta_line_id_start_idx) : "";
2995 // compare the meta_line_id with the iter_meta_line_id
2996 if (strcasecmp(meta_line_id.c_str(), iter_meta_line_id.c_str()) == 0)
2997 {
2998 return true;
2999 }
3000 }
3001 return false;
3002 }
3003
addHeaderColumn(const string & header_column)3004 void VCFHeader::addHeaderColumn(const string& header_column)
3005 {
3006 // don't add duplicates
3007 // vector<string>::iterator test = find(this->header_columns.begin(), this->header_columns.end(), header_column);
3008 if (find(this->header_columns.begin(), this->header_columns.end(), header_column) == this->header_columns.end())
3009 {
3010 this->header_columns.push_back(header_column);
3011 }
3012 }
3013
3014 } // end namespace vcf
3015