1 #include "Variant.h"
2 #include <utility>
3 
4 namespace vcflib {
5 
6 static char rev_arr [26] = {84, 66, 71, 68, 69, 70, 67, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 65,
7                            85, 86, 87, 88, 89, 90};
8 
reverse_complement(const std::string & seq)9 std::string reverse_complement(const std::string& seq) {
10     // The old implementation of this function forgot to null-terminate its
11     // returned string. This implementation uses heavier-weight C++ stuff that
12     // may be slower but should ensure that that doesn't happen again.
13 
14     if (seq.size() == 0) {
15         return seq;
16     }
17 
18     string ret;
19     ret.reserve(seq.size());
20 
21     std::transform(seq.rbegin(), seq.rend(), std::back_inserter(ret), [](char in) -> char {
22         bool lower_case = (in >= 'a' && in <= 'z');
23         if (lower_case) {
24             // Convert to upper case
25             in -= 32;
26         }
27         if (in < 'A' || in > 'Z') {
28             throw std::runtime_error("Out of range character " + std::to_string((uint8_t)in) + " in inverted sequence");
29         }
30         // Compute RC in terms of letter identity, and then lower-case if necessary.
31         return rev_arr[((int) in) - 'A'] + (lower_case ? 32 : 0);
32     });
33 
34     return ret;
35 }
36 
toUpper(const std::string & seq)37 std::string toUpper(const std::string& seq) {
38     if (seq.size() == 0) {
39         return seq;
40     }
41 
42     string ret;
43     ret.reserve(seq.size());
44 
45     std::transform(seq.begin(), seq.end(), std::back_inserter(ret), [](char in) -> char {
46         // If it's lower-case, bring it down in value to upper-case.
47         return (in >= 'a' && in <= 'z') ? (in - 32) : in;
48     });
49 
50     return ret;
51 }
52 
53 
allATGCN(const string & s,bool allowLowerCase)54 bool allATGCN(const string& s, bool allowLowerCase){
55     if (allowLowerCase){
56        for (string::const_iterator i = s.begin(); i != s.end(); ++i){
57             char c = *i;
58             if (c != 'A' && c != 'a' &&
59                 c != 'C' && c != 'c' &&
60                 c != 'T' && c != 't' &&
61                 c != 'G' && c != 'g' &&
62                 c != 'N' && c != 'n'){
63                     return false;
64             }
65         }
66     }
67     else{
68         for (string::const_iterator i = s.begin(); i != s.end(); ++i){
69             char c = *i;
70             if (c != 'A' && c != 'C' && c != 'T' && c != 'G' && c != 'N'){
71                 return false;
72             }
73         }
74 
75     }
76     return true;
77 }
78 
79 
80 
parse(string & line,bool parseSamples)81 void Variant::parse(string& line, bool parseSamples) {
82 
83     // clean up potentially variable data structures
84     info.clear();
85     infoFlags.clear();
86     format.clear();
87     alt.clear();
88     alleles.clear();
89     canonical = false;
90 
91     // #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT [SAMPLE1 .. SAMPLEN]
92     vector<string> fields = split(line, '\t');
93     if (fields.size() < 7) {
94         cerr << "broken VCF record (less than 7 fields)" << endl
95              << line << endl;
96         exit(1);
97     }
98 
99     sequenceName = fields.at(0);
100     char* end; // dummy variable for strtoll
101     position = strtoll(fields.at(1).c_str(), &end, 10);
102     id = fields.at(2);
103     ref = fields.at(3);
104     alt = split(fields.at(4), ","); // a comma-separated list of alternate alleles
105 
106     // make a list of all (ref + alts) alleles, allele[0] = ref, alleles[1:] = alts
107     // add the ref allele ([0]), resize for the alt alleles, and then add the alt alleles
108     alleles.push_back(ref);
109     alleles.resize(alt.size()+1);
110     std::copy(alt.begin(), alt.end(), alleles.begin()+1);
111 
112     // set up reverse lookup of allele index
113     altAlleleIndexes.clear();
114     int n = 0;
115     for (vector<string>::iterator a = alt.begin();
116             a != alt.end(); ++a, ++n) {
117         altAlleleIndexes[*a] = n;
118     }
119 
120     convert(fields.at(5), quality);
121     filter = fields.at(6);
122     if (fields.size() > 7) {
123         vector<string> infofields = split(fields.at(7), ';');
124         for (vector<string>::iterator f = infofields.begin(); f != infofields.end(); ++f) {
125             if (*f == ".") {
126                 continue;
127             }
128             vector<string> kv = split(*f, '=');
129             if (kv.size() == 2) {
130                 split(kv.at(1), ',', info[kv.at(0)]);
131             } else if (kv.size() == 1) {
132                 infoFlags[kv.at(0)] = true;
133             }
134         }
135     }
136     // check if we have samples specified
137     // and that we are supposed to parse them
138     if (parseSamples && fields.size() > 8) {
139         format = split(fields.at(8), ':');
140         // if the format changed, we have to rebuild the samples
141         if (fields.at(8) != lastFormat) {
142             samples.clear();
143             lastFormat = fields.at(8);
144         }
145         vector<string>::iterator sampleName = sampleNames.begin();
146         vector<string>::iterator sample = fields.begin() + 9;
147         for (; sample != fields.end() && sampleName != sampleNames.end();
148 	     ++sample, ++sampleName) {
149 	  string& name = *sampleName;
150 
151 	  vector<string> samplefields = split(*sample, ':');
152 	  vector<string>::iterator i = samplefields.begin();
153 
154 	  for (vector<string>::iterator f = format.begin();
155 	       f != format.end(); ++f) {
156 
157 	    if(i != samplefields.end()){
158 	      samples[name][*f] = split(*i, ','); ++i;
159 	    }
160 	    else{
161 	      std::vector<string> missing;
162 	      missing.push_back(".");
163 	      samples[name][*f] = missing;
164 	    }
165 	  }
166 	}
167 
168         if (sampleName != sampleNames.end()) {
169             cerr << "error: more sample names in header than sample fields" << endl;
170             cerr << "samples: " << join(sampleNames, " ") << endl;
171             cerr << "line: " << line << endl;
172             exit(1);
173         }
174         if (sample != fields.end()) {
175             cerr << "error: more sample fields than samples listed in header" << endl;
176             cerr << "samples: " << join(sampleNames, " ") << endl;
177             cerr << "line: " << line << endl;
178             cerr << *sample << endl;
179             exit(1);
180         }
181     } else if (!parseSamples) {
182         originalLine = line;
183     }
184 
185     //return true; // we should be catching exceptions...
186 }
187 
hasSVTags() const188 bool Variant::hasSVTags() const{
189     bool found_svtype = !getSVTYPE().empty();
190     bool found_len = this->info.find("SVLEN") != this->info.end() || this->info.find("END") != this->info.end() || this->info.find("SPAN") != this->info.end();
191 
192    return found_svtype && found_len;
193 }
194 
195 
isSymbolicSV() const196 bool Variant::isSymbolicSV() const{
197 
198     bool found_svtype = !getSVTYPE().empty();
199 
200     bool ref_valid = allATGCN(this->ref);
201     bool alts_valid = true;
202     for (auto a : this->alt){
203         if (!allATGCN(a)){
204             alts_valid = false;
205         }
206     }
207 
208     return (!ref_valid || !alts_valid) && (found_svtype);
209 }
210 
getSVTYPE(int altpos) const211 string Variant::getSVTYPE(int altpos) const{
212 
213     if (altpos > 0){
214         // TODO: Implement multi-alt SVs
215         return "";
216     }
217 
218 
219     if (this->info.find("SVTYPE") != this->info.end()){
220         if (altpos >= this->info.at("SVTYPE").size()) {
221             return "";
222         }
223         return this->info.at("SVTYPE")[altpos];
224     }
225 
226     return "";
227 };
228 
229 
230 
getMaxReferencePos()231 int Variant::getMaxReferencePos(){
232     if (this->canonical && this->info.find("END") != this->info.end()) {
233         // We are cannonicalized and must have a correct END
234 
235         int end = 0;
236         for (auto s : this->info.at("END")){
237             // Get the latest one defined.
238             end = max(abs(stoi(s)), end);
239         }
240         // Convert to 0-based.
241         return end - 1;
242 
243     }
244 
245     if (!this->isSymbolicSV()){
246         // We don't necessarily have an END, but we don't need one
247         return this->zeroBasedPosition() + this->ref.length() - 1;
248     }
249 
250     if (this->canonicalizable()){
251         // We aren't canonical, but we could be.
252         if (this->info.find("END") != this->info.end()){
253             // We have an END; blindly trust it
254             int end = 0;
255             for (auto s : this->info.at("END")){
256                 // Get the latest one defined.
257                 end = max(abs(stoi(s)), end);
258             }
259             // Convert to 0-based.
260             return end - 1;
261 
262         }
263         else if (this->info.find("SVLEN") != this->info.end()){
264             // There's no endpoint, but we know an SVLEN.
265             // A negative SVLEN means a deletion, so if we find one we can say we delete that much.
266             int deleted = 0;
267             for (auto s : this->info.at("SVLEN")){
268                 int alt_len = stoi(s);
269                 if (alt_len > 0){
270                     // Not a deletion, so doesn't affect any ref bases
271                     continue;
272                 }
273                 deleted = max(-alt_len, deleted);
274             }
275 
276             // The anchoring base at POS gets added in (because it isn't
277             // deleted) but then subtracted out (because we have to do that to
278             // match non-SV deletions). For insertions, deleted is 0 and we
279             // return 0-based POS. Inversions must have an END.
280             return this->zeroBasedPosition() + deleted;
281         }
282         else{
283             cerr << "Warning: insufficient length information for " << *this << endl;
284             return -1;
285         }
286     }
287     else {
288         cerr << "Warning: can't get end of non-canonicalizeable variant " << *this << endl;
289     }
290     return -1;
291 }
292 
293 
294 
295 
296 // To canonicalize a variant, we need either both REF and ALT seqs filled in
297 // or SVTYPE and SVLEN or END or SPAN or SEQ sufficient to define the variant.
canonicalizable()298 bool Variant::canonicalizable(){
299     bool pre_canon = allATGCN(this->ref);
300 
301     for (auto& a : this->alt){
302         if (!allATGCN(a)){
303             pre_canon = false;
304         }
305     }
306 
307     if (pre_canon){
308         // It came in in a fully specified way.
309         // TODO: ideally, we'd check to make sure ref/alt lengths, svtypes, and ends line up right here.
310         return true;
311     }
312 
313     string svtype = getSVTYPE();
314 
315     if (svtype.empty()){
316         // We have no SV type, so we can't interpret things.
317         return false;
318     }
319 
320     // Check the tags
321     bool has_len = this->info.count("SVLEN") && !this->info.at("SVLEN").empty();
322     bool has_seq = this->info.count("SEQ") && !this->info.at("SEQ").empty();
323     bool has_span = this->info.count("SPAN") && !this->info.at("SPAN").empty();
324     bool has_end = this->info.count("END") && !this->info.at("END").empty();
325 
326 
327     if (svtype == "INS"){
328         // Insertions need a SEQ, SVLEN, or SPAN
329         return has_seq || has_len || has_span;
330     }
331     else if (svtype == "DEL"){
332         // Deletions need an SVLEN, SPAN, or END
333         return has_len || has_span || has_end;
334     }
335     else if (svtype == "INV"){
336         // Inversions need a SPAN or END
337         return has_span || has_end;
338     }
339     else{
340         // Other SV types are unsupported
341         // TODO: DUP
342         return false;
343     }
344 }
345 
canonicalize(FastaReference & fasta_reference,vector<FastaReference * > insertions,bool place_seq,int min_size)346 bool Variant::canonicalize(FastaReference& fasta_reference, vector<FastaReference*> insertions, bool place_seq, int min_size){
347 
348     // Nobody should call this without checking
349     assert(canonicalizable());
350 
351     // Nobody should call this twice
352     assert(!this->canonical);
353 
354     // Find where the inserted sequence can come from for insertions
355     bool do_external_insertions = !insertions.empty();
356     FastaReference* insertion_fasta;
357     if (do_external_insertions){
358         insertion_fasta = insertions[0];
359     }
360 
361     bool ref_valid = allATGCN(ref);
362 
363     if (!ref_valid && !place_seq){
364         // If the reference is invalid, and we aren't allowed to change the ref sequence,
365         // we can't canonicalize the variant.
366         return false;
367     }
368 
369     // Check the alts to see if they are not symbolic
370     vector<bool> alt_i_atgcn (alt.size());
371     for (int i = 0; i < alt.size(); ++i){
372         alt_i_atgcn[i] = allATGCN(alt[i]);
373     }
374 
375     // Only allow single-alt variants
376     bool single_alt = alt.size() == 1;
377     if (!single_alt){
378         // TODO: this will need to be remove before supporting multiple alleles
379         cerr << "Warning: multiple ALT alleles not yet allowed for SVs" << endl;
380         return false;
381     }
382 
383     // Fill in the SV tags
384     string svtype = getSVTYPE();
385     bool has_len = this->info.count("SVLEN") && !this->info.at("SVLEN").empty();
386     bool has_seq = this->info.count("SEQ") && !this->info.at("SEQ").empty();
387     bool has_span = this->info.count("SPAN") && !this->info.at("SPAN").empty();
388     bool has_end = this->info.count("END") && !this->info.at("END").empty();
389 
390     // Where is the end, or where should it be?
391     long info_end = 0;
392     if (has_end) {
393         // Get the END from the tag
394         info_end = stol(this->info.at("END")[0]);
395     }
396     else if(ref_valid && !place_seq) {
397         // Get the END from the reference sequence, which is ready.
398         info_end = this->position + this->ref.length() - 1;
399     }
400     else if ((svtype == "DEL" || svtype == "INV") && has_span) {
401         // For deletions and inversions, we can get the END from the SPAN
402         info_end = this->position + abs(stol(this->info.at("SPAN")[0]));
403     }
404     else if (svtype == "DEL" && has_len) {
405         // For deletions, we can get the END from the SVLEN
406         info_end = this->position + abs(stol(this->info.at("SVLEN")[0]));
407     }
408     else if (svtype == "INS"){
409         // For insertions, END is just POS if not specified
410         info_end = this->position;
411     }
412     else{
413         cerr << "Warning: could not set END info " << *this << endl;
414         return false;
415     }
416 
417     // Commit back the END
418     this->info["END"].resize(1);
419     this->info["END"][0] = to_string(info_end);
420     has_end = true;
421 
422     // What is the variant length change?
423     // We store it as absolute value
424     long info_len = 0;
425     if (has_len){
426         // Get the SVLEN from the tag
427         info_len = abs(stol(this->info.at("SVLEN")[0]));
428     }
429     else if ((svtype == "INS" || svtype == "DEL") && has_span){
430         info_len = abs(stol(this->info.at("SPAN")[0]));
431     }
432     else if (svtype == "DEL"){
433         // We always have the end by now
434         // Deletion ends give you length change
435         info_len = info_end - this->position;
436     }
437     else if (svtype == "INV"){
438         // Inversions have 0 length change unless otherwise specified.
439         info_len = 0;
440     }
441     else if (svtype == "INS" && has_seq) {
442         // Insertions can let us pick it up from the SEQ tag
443         info_len = this->info.at("SEQ").at(0).size();
444     }
445     else{
446         cerr << "Warning: could not set SVLEN info " << *this << endl;
447         return false;
448     }
449 
450     // Commit the SVLEN back
451     if (svtype == "DEL"){
452         // Should be saved as negative
453         this->info["SVLEN"].resize(1);
454         this->info["SVLEN"][0] = to_string(-info_len);
455     }
456     else{
457         // Should be saved as positive
458         this->info["SVLEN"].resize(1);
459         this->info["SVLEN"][0] = to_string(info_len);
460     }
461     // Now the length change is known
462     has_len = true;
463 
464     // We also compute a span
465     long info_span = 0;
466     if (has_span){
467         // Use the specified span
468         info_span = abs(stol(this->info.at("SVLEN")[0]));
469     }
470     else if (svtype == "INS" || svtype == "DEL"){
471         // has_len is always true here
472         // Insertions and deletions let us determine the span from the length change, unless they are complex.
473         info_span = info_len;
474     }
475     else if (svtype == "INV"){
476         // has_end is always true here
477         // Inversion span is start to end
478         info_span = info_end - this->position;
479     }
480     else{
481         cerr << "Warning: could not set SPAN info " << *this << endl;
482         return false;
483     }
484 
485     // Commit the SPAN back
486     this->info["SPAN"].resize(1);
487     this->info["SPAN"][0] = to_string(info_span);
488     // Now the span change is known
489     has_span = true;
490 
491     if (info_end < this->position) {
492         cerr << "Warning: SV END is before POS [canonicalize] " <<
493         *this << endl << "END: " << info_end << "  " << "POS: " << this->position << endl;
494         return false;
495     }
496 
497     if (has_seq) {
498         // Force the SEQ to upper case, if already present
499         this->info["SEQ"].resize(1);
500         this->info["SEQ"][0] = toUpper(this->info["SEQ"][0]);
501     }
502 
503     // Set the other necessary SV Tags (SVTYPE, SEQ (if insertion))
504     // Also check for agreement in the position tags
505     if (svtype == "INS"){
506         if (info_end != this->position){
507             cerr << "Warning: insertion END and POS do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
508             *this << endl << "END: " << info_end << "  " << "POS: " << this->position << endl;
509 
510             if (info_end == this->position + info_len) {
511                 // We can probably guess what they meant here.
512                 cerr << "Warning: VCF writer incorrecty produced END = POS + SVLEN for an insertion. Fixing END to POS." << endl;
513                 info_end = this->position;
514                 this->info["END"][0] = to_string(info_end);
515             } else {
516                 return false;
517             }
518         }
519 
520         if (info_len != info_span){
521             cerr << "Warning: insertion SVLEN and SPAN do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
522             *this << endl << "SVLEN: " << info_len << "  " << "SPAN: " << info_span << endl;
523             return false;
524         }
525 
526         if (has_seq && allATGCN(this->info.at("SEQ")[0]) && this->info.at("SEQ")[0].size() != info_len){
527             cerr << "Warning: insertion SVLEN and SEQ do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
528             *this << endl << "SVLEN: " << info_len << "  " << "SEQ length: " << this->info.at("SEQ")[0].size() << endl;
529             return false;
530         }
531 
532         // Set REF
533         string ref_base = toUpper(fasta_reference.getSubSequence(this->sequenceName, this->zeroBasedPosition(), 1));
534         if (place_seq){
535             this->ref.assign(ref_base);
536         }
537 
538         if (has_seq &&
539                  alt[0] != this->info.at("SEQ")[0] &&
540                  allATGCN(this->info.at("SEQ")[0])){
541             // Try to remove prepended ref sequence, assuming it's left-aligned
542             string s = this->alt[0];
543             s = toUpper(s.substr(this->ref.length()));
544             if (s != this->info.at("SEQ")[0] && !place_seq){
545                 cerr << "Warning: INS sequence in alt field does not match SEQ tag" << endl <<
546                 this->alt[0] << " " << this->info.at("SEQ")[0] << endl;
547                 return false;
548             }
549             if (place_seq){
550                 this->alt[0].assign( ref_base + this->info.at("SEQ")[0] );
551             }
552 
553         }
554         else if (alt_i_atgcn[0] && !has_seq){
555             string s = this->alt[0];
556             s = toUpper(s.substr(this->ref.length()));
557             this->info["SEQ"].resize(1);
558             this->info.at("SEQ")[0].assign(s);
559 
560             if (s.size() != info_len){
561                 cerr << "Warning: insertion SVLEN and added bases do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
562                 *this << endl << "SVLEN: " << info_len << "  " << "added bases: " << s.size() << endl;
563                 return false;
564             }
565 
566         }
567         else if (alt[0][0] == '<' && do_external_insertions){
568 
569             string ins_seq;
570             string seq_id = alt[0].substr(1, alt[0].size() - 2);
571 
572             if (insertion_fasta->index->find(seq_id) != insertion_fasta->index->end()){
573                 ins_seq = toUpper(insertion_fasta->getSequence(seq_id));
574                 if (allATGCN(ins_seq)){
575                     this->info["SEQ"].resize(1);
576                     this->info["SEQ"][0].assign(ins_seq);
577                     if (place_seq){
578                         this->alt[0].assign(ref_base + ins_seq);
579                     }
580                 }
581                 else {
582                     cerr << "Warning: Loaded invalid alt sequence for: " << *this << endl;
583                     return false;
584                 }
585 
586                 if (ins_seq.size() != info_len){
587                     cerr << "Warning: insertion SVLEN and FASTA do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
588                     *this << endl << "SVLEN: " << info_len << "  " << "FASTA bases: " << ins_seq.size() << endl;
589                     return false;
590                 }
591             }
592             else{
593                 cerr << "Warning: could not locate alt sequence for: " << *this << endl;
594                 return false;
595             }
596 
597         }
598         else{
599             cerr << "Warning: could not set SEQ [canonicalize]. " << *this << endl;
600             return false;
601         }
602     }
603     else if (svtype == "DEL"){
604         if (this->position + info_span != info_end){
605             cerr << "Warning: deletion END and SVLEN do not agree [canonicalize] " << *this << endl <<
606             "END: " << info_end << "  " << "SVLEN: " << -info_len << endl;
607             return false;
608         }
609 
610         if (this->position + info_span != info_end){
611             cerr << "Warning: deletion END and SPAN do not agree [canonicalize] " << *this << endl <<
612             "END: " << info_end << "  " << "SPAN: " << info_span << endl;
613             return false;
614         }
615 
616         // Set REF
617         if (place_seq){
618             string del_seq = toUpper(fasta_reference.getSubSequence(this->sequenceName, this->zeroBasedPosition(), info_len + 1));
619             string ref_base = toUpper(fasta_reference.getSubSequence(this->sequenceName, this->zeroBasedPosition(), 1));
620             this->ref.assign( del_seq );
621             this->alt[0].assign( ref_base );
622         }
623     }
624     else if (svtype == "INV"){
625         if (this->position + info_span != info_end){
626             cerr << "Warning: inversion END and SPAN do not agree [canonicalize] " << *this << endl <<
627             "END: " << info_end << "  " << "SPAN: " << info_span << endl;
628             return false;
629         }
630 
631         if (info_len != 0){
632             cerr << "Warning: inversion SVLEN specifies nonzero length change (complex inversions not canonicalizeable) [canonicalize] " <<
633             *this << endl << "SVLEN: " << info_len << endl;
634 
635             if (info_end == this->position + info_len) {
636                 // We can probably guess what they meant here.
637                 cerr << "Warning: VCF writer incorrecty produced END = POS + SVLEN for an inversion. Fixing SVLEN to 0." << endl;
638                 info_len = 0;
639                 this->info["SVLEN"][0] = to_string(info_len);
640             } else {
641                 return false;
642             }
643         }
644 
645         if (place_seq){
646             string ref_seq = toUpper(fasta_reference.getSubSequence(this->sequenceName, this->zeroBasedPosition(), info_span + 1));
647             // Note that inversions still need an anchoring left base at POS
648             string inv_seq = ref_seq.substr(0, 1) + reverse_complement(ref_seq.substr(1));
649             this->ref.assign(ref_seq);
650             this->alt[0].assign(inv_seq);
651         }
652 
653     }
654     else{
655         cerr << "Warning: invalid SV type [canonicalize]:" << *this << endl;
656         return false;
657     }
658 
659 
660     this->updateAlleleIndexes();
661 
662     // Check for harmony between ref / alt / tags
663     if (this->position > stol(this->info.at("END").at(0))){
664         cerr << "Warning: position > END. Possible reference genome mismatch." << endl;
665         return false;
666     }
667 
668     if (svtype == "INS"){
669         assert(!this->info.at("SEQ")[0].empty());
670     }
671 
672     this->canonical = true;
673     return true;
674 }
675 
setVariantCallFile(VariantCallFile & v)676 void Variant::setVariantCallFile(VariantCallFile& v) {
677     sampleNames = v.sampleNames;
678     outputSampleNames = v.sampleNames;
679     vcf = &v;
680 }
681 
setVariantCallFile(VariantCallFile * v)682 void Variant::setVariantCallFile(VariantCallFile* v) {
683     sampleNames = v->sampleNames;
684     outputSampleNames = v->sampleNames;
685     vcf = v;
686 }
687 
operator <<(ostream & out,VariantFieldType type)688 ostream& operator<<(ostream& out, VariantFieldType type) {
689     switch (type) {
690         case FIELD_INTEGER:
691             out << "integer";
692             break;
693         case FIELD_FLOAT:
694             out << "float";
695             break;
696         case FIELD_BOOL:
697             out << "bool";
698             break;
699         case FIELD_STRING:
700             out << "string";
701             break;
702         default:
703             out << "unknown";
704             break;
705     }
706     return out;
707 }
708 
typeStrToVariantFieldType(string & typeStr)709 VariantFieldType typeStrToVariantFieldType(string& typeStr) {
710     if (typeStr == "Integer") {
711         return FIELD_INTEGER;
712     } else if (typeStr == "Float") {
713         return FIELD_FLOAT;
714     } else if (typeStr == "Flag") {
715         return FIELD_BOOL;
716     } else if (typeStr == "String") {
717         return FIELD_STRING;
718     } else {
719         return FIELD_UNKNOWN;
720     }
721 }
722 
infoType(const string & key)723 VariantFieldType Variant::infoType(const string& key) {
724     map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
725     if (s == vcf->infoTypes.end()) {
726         if (key == "FILTER") { // hack to use FILTER as an "info" field (why the hack?)
727             return FIELD_STRING;
728         }
729         if (key == "QUAL") { // hack to use QUAL as an "info" field
730             return FIELD_INTEGER;
731         }
732         cerr << "no info field " << key << endl;
733         exit(1);
734     } else {
735         return s->second;
736     }
737 }
738 
formatType(const string & key)739     VariantFieldType Variant::formatType(const string& key) {
740         map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
741         if (s == vcf->formatTypes.end()) {
742             cerr << "no format field " << key << endl;
743             exit(1);
744         } else {
745             return s->second;
746         }
747     }
748 
getInfoValueBool(const string & key,int index)749     bool Variant::getInfoValueBool(const string& key, int index) {
750         map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
751         if (s == vcf->infoTypes.end()) {
752             cerr << "no info field " << key << endl;
753             exit(1);
754         } else {
755             int count = vcf->infoCounts[key];
756             // XXX TODO, fix for Genotype variants...
757             if (count != ALLELE_NUMBER) {
758                 index = 0;
759             }
760             if (index == INDEX_NONE) {
761                 if (count != 1) {
762                     cerr << "no field index supplied and field count != 1" << endl;
763                     exit(1);
764                 } else {
765                     index = 0;
766                 }
767             }
768             VariantFieldType type = s->second;
769             if (type == FIELD_BOOL) {
770                 map<string, bool>::iterator b = infoFlags.find(key);
771                 if (b == infoFlags.end())
772                     return false;
773                 else
774                     return true;
775             } else {
776                 cerr << "not flag type " << key << endl;
777                 exit(1);
778             }
779         }
780     }
781 
getInfoValueString(const string & key,int index)782     string Variant::getInfoValueString(const string& key, int index) {
783         map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
784         if (s == vcf->infoTypes.end()) {
785             if (key == "FILTER") {
786               return filter;
787             }
788             cerr << "no info field " << key << endl;
789             exit(1);
790         } else {
791             int count = vcf->infoCounts[key];
792             // XXX TODO, fix for Genotype variants...
793             if (count != ALLELE_NUMBER) {
794                 index = 0;
795             }
796             if (index == INDEX_NONE) {
797                 if (count != 1) {
798                     cerr << "no field index supplied and field count != 1" << endl;
799                     exit(1);
800                 } else {
801                     index = 0;
802                 }
803             }
804             VariantFieldType type = s->second;
805             if (type == FIELD_STRING) {
806                 map<string, vector<string> >::iterator b = info.find(key);
807                 if (b == info.end())
808                     return "";
809                 return b->second.at(index);
810             } else {
811                 cerr << "not string type " << key << endl;
812                 return "";
813             }
814         }
815     }
816 
getInfoValueFloat(const string & key,int index)817     double Variant::getInfoValueFloat(const string& key, int index) {
818         map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
819         if (s == vcf->infoTypes.end()) {
820             if (key == "QUAL") {
821                 return quality;
822             }
823             cerr << "no info field " << key << endl;
824             exit(1);
825         } else {
826             int count = vcf->infoCounts[key];
827             // XXX TODO, fix for Genotype variants...
828             if (count != ALLELE_NUMBER) {
829                 index = 0;
830             }
831             if (index == INDEX_NONE) {
832                 if (count != 1) {
833                     cerr << "no field index supplied and field count != 1" << endl;
834                     exit(1);
835                 } else {
836                     index = 0;
837                 }
838             }
839             VariantFieldType type = s->second;
840             if (type == FIELD_FLOAT || type == FIELD_INTEGER) {
841                 map<string, vector<string> >::iterator b = info.find(key);
842                 if (b == info.end())
843                     return false;
844                 double r;
845                 if (!convert(b->second.at(index), r)) {
846                     cerr << "could not convert field " << key << "=" << b->second.at(index) << " to " << type << endl;
847                     exit(1);
848                 }
849                 return r;
850             } else {
851                 cerr << "unsupported type for variant record " << type << endl;
852                 exit(1);
853             }
854         }
855     }
856 
getNumSamples(void)857     int Variant::getNumSamples(void) {
858         return sampleNames.size();
859     }
860 
getNumValidGenotypes(void)861     int Variant::getNumValidGenotypes(void) {
862         int valid_genotypes = 0;
863         map<string, map<string, vector<string> > >::const_iterator s     = samples.begin();
864         map<string, map<string, vector<string> > >::const_iterator sEnd  = samples.end();
865         for (; s != sEnd; ++s) {
866             map<string, vector<string> > sample_info = s->second;
867             if (sample_info["GT"].front() != "./.") {
868                 valid_genotypes++;
869             }
870         }
871         return valid_genotypes;
872     }
873 
getSampleValueBool(const string & key,string & sample,int index)874     bool Variant::getSampleValueBool(const string& key, string& sample, int index) {
875         map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
876         if (s == vcf->infoTypes.end()) {
877             cerr << "no info field " << key << endl;
878             exit(1);
879         } else {
880             int count = vcf->formatCounts[key];
881             // XXX TODO, fix for Genotype variants...
882             if (count != ALLELE_NUMBER) {
883                 index = 0;
884             }
885             if (index == INDEX_NONE) {
886                 if (count != 1) {
887                     cerr << "no field index supplied and field count != 1" << endl;
888                     exit(1);
889                 } else {
890                     index = 0;
891                 }
892             }
893             VariantFieldType type = s->second;
894             map<string, vector<string> >& sampleData = samples[sample];
895             if (type == FIELD_BOOL) {
896                 map<string, vector<string> >::iterator b = sampleData.find(key);
897                 if (b == sampleData.end())
898                     return false;
899                 else
900                     return true;
901             } else {
902                 cerr << "not bool type " << key << endl;
903                 exit(1);
904             }
905         }
906     }
907 
getSampleValueString(const string & key,string & sample,int index)908     string Variant::getSampleValueString(const string& key, string& sample, int index) {
909         map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
910         if (s == vcf->infoTypes.end()) {
911             cerr << "no info field " << key << endl;
912             exit(1);
913         } else {
914             int count = vcf->formatCounts[key];
915             // XXX TODO, fix for Genotype variants...
916             if (count != ALLELE_NUMBER) {
917                 index = 0;
918             }
919             if (index == INDEX_NONE) {
920                 if (count != 1) {
921                     cerr << "no field index supplied and field count != 1" << endl;
922                     exit(1);
923                 } else {
924                     index = 0;
925                 }
926             }
927             VariantFieldType type = s->second;
928             map<string, vector<string> >& sampleData = samples[sample];
929             if (type == FIELD_STRING) {
930                 map<string, vector<string> >::iterator b = sampleData.find(key);
931                 if (b == sampleData.end()) {
932                     return "";
933                 } else {
934                     return b->second.at(index);
935                 }
936             } else {
937                 cerr << "not string type " << key << endl;
938                 exit(1);
939             }
940         }
941     }
942 
getSampleValueFloat(const string & key,string & sample,int index)943     double Variant::getSampleValueFloat(const string& key, string& sample, int index) {
944         map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
945         if (s == vcf->infoTypes.end()) {
946             cerr << "no info field " << key << endl;
947             exit(1);
948         } else {
949             // XXX TODO wrap this with a function call
950             int count = vcf->formatCounts[key];
951             // XXX TODO, fix for Genotype variants...
952             if (count != ALLELE_NUMBER) {
953                 index = 0;
954             }
955             if (index == INDEX_NONE) {
956                 if (count != 1) {
957                     cerr << "no field index supplied and field count != 1" << endl;
958                     exit(1);
959                 } else {
960                     index = 0;
961                 }
962             }
963             VariantFieldType type = s->second;
964             map<string, vector<string> >& sampleData = samples[sample];
965             if (type == FIELD_FLOAT || type == FIELD_INTEGER) {
966                 map<string, vector<string> >::iterator b = sampleData.find(key);
967                 if (b == sampleData.end())
968                     return false;
969                 double r;
970                 if (!convert(b->second.at(index), r)) {
971                     cerr << "could not convert field " << key << "=" << b->second.at(index) << " to " << type << endl;
972                     exit(1);
973                 }
974                 return r;
975             } else {
976                 cerr << "unsupported type for sample " << type << endl;
977                 exit(1);
978             }
979         }
980     }
981 
getValueBool(const string & key,string & sample,int index)982     bool Variant::getValueBool(const string& key, string& sample, int index) {
983         if (sample.empty()) { // an empty sample name means
984             return getInfoValueBool(key, index);
985         } else {
986             return getSampleValueBool(key, sample, index);
987         }
988     }
989 
getValueFloat(const string & key,string & sample,int index)990     double Variant::getValueFloat(const string& key, string& sample, int index) {
991         if (sample.empty()) { // an empty sample name means
992             return getInfoValueFloat(key, index);
993         } else {
994             return getSampleValueFloat(key, sample, index);
995         }
996     }
997 
getValueString(const string & key,string & sample,int index)998     string Variant::getValueString(const string& key, string& sample, int index) {
999         if (sample.empty()) { // an empty sample name means
1000             return getInfoValueString(key, index);
1001         } else {
1002             return getSampleValueString(key, sample, index);
1003         }
1004     }
1005 
getAltAlleleIndex(const string & allele)1006     int Variant::getAltAlleleIndex(const string& allele) {
1007         map<string, int>::iterator f = altAlleleIndexes.find(allele);
1008         if (f == altAlleleIndexes.end()) {
1009             cerr << "no such allele \'" << allele << "\' in record " << sequenceName << ":" << position << endl;
1010             exit(1);
1011         } else {
1012             return f->second;
1013         }
1014     }
1015 
addFilter(const string & tag)1016     void Variant::addFilter(const string& tag) {
1017         if (filter == "" || filter == ".")
1018             filter = tag;
1019         else
1020             filter += "," + tag;
1021     }
1022 
addFormatField(const string & key)1023     void Variant::addFormatField(const string& key) {
1024         bool hasTag = false;
1025         for (vector<string>::iterator t = format.begin(); t != format.end(); ++t) {
1026             if (*t == key) {
1027                 hasTag = true;
1028                 break;
1029             }
1030         }
1031         if (!hasTag) {
1032             format.push_back(key);
1033         }
1034     }
1035 
printAlt(ostream & out)1036     void Variant::printAlt(ostream& out) {
1037         for (vector<string>::iterator i = alt.begin(); i != alt.end(); ++i) {
1038             out << *i;
1039             // add a comma for all but the last alternate allele
1040             if (i != (alt.end() - 1)) out << ",";
1041         }
1042     }
1043 
printAlleles(ostream & out)1044     void Variant::printAlleles(ostream& out) {
1045         for (vector<string>::iterator i = alleles.begin(); i != alleles.end(); ++i) {
1046             out << *i;
1047             // add a comma for all but the last alternate allele
1048             if (i != (alleles.end() - 1)) out << ",";
1049         }
1050     }
1051 
operator <<(ostream & out,Variant & var)1052     ostream& operator<<(ostream& out, Variant& var) {
1053         // ensure there are no empty fields
1054         if (var.sequenceName.empty()) var.sequenceName = ".";
1055         if (var.id.empty()) var.id = ".";
1056         if (var.ref.empty()) var.ref = ".";
1057         if (var.alt.empty()) var.alt.push_back(".");
1058         if (var.filter.empty()) var.filter = ".";
1059 
1060         out << var.sequenceName << "\t"
1061             << var.position << "\t"
1062             << var.id << "\t"
1063             << var.ref << "\t";
1064         // report the list of alternate alleles.
1065         var.printAlt(out);
1066         out << "\t"
1067             << var.quality << "\t"
1068             << var.filter << "\t";
1069         if (var.info.empty() && var.infoFlags.empty()) {
1070             out << ".";
1071         } else {
1072             for (map<string, vector<string> >::iterator i = var.info.begin(); i != var.info.end(); ++i) {
1073                 if (!i->second.empty()) {
1074                     out << ((i == var.info.begin()) ? "" : ";") << i->first << "=" << join(i->second, ",");
1075                 }
1076             }
1077             for (map<string, bool>::iterator i = var.infoFlags.begin(); i != var.infoFlags.end(); ++i) {
1078                 if (i == var.infoFlags.end()) {
1079                     out << "";
1080                 } else if (i == var.infoFlags.begin() && var.info.empty()) {
1081                     out << "";
1082                 } else {
1083                     out << ";";
1084                 }
1085                 out << i->first;
1086             }
1087         }
1088         if (!var.format.empty()) {
1089             out << "\t";
1090             for (vector<string>::iterator f = var.format.begin(); f != var.format.end(); ++f) {
1091                 out << ((f == var.format.begin()) ? "" : ":") << *f;
1092             }
1093             for (vector<string>::iterator s = var.outputSampleNames.begin(); s != var.outputSampleNames.end(); ++s) {
1094                 out << "\t";
1095                 map<string, map<string, vector<string> > >::iterator sampleItr = var.samples.find(*s);
1096                 if (sampleItr == var.samples.end()) {
1097                     out << ".";
1098                 } else {
1099                     map<string, vector<string> >& sample = sampleItr->second;
1100                     if (sample.size() == 0) {
1101                         out << ".";
1102                     } else {
1103                         for (vector<string>::iterator f = var.format.begin(); f != var.format.end(); ++f) {
1104                             map<string, vector<string> >::iterator g = sample.find(*f);
1105                             out << ((f == var.format.begin()) ? "" : ":");
1106                             if (g != sample.end() && !g->second.empty()) {
1107                                 out << join(g->second, ",");
1108                             } else {
1109                                 out << ".";
1110                             }
1111                         }
1112                     }
1113                 }
1114             }
1115         }
1116         return out;
1117     }
1118 
setOutputSampleNames(vector<string> & samplesToOutput)1119     void Variant::setOutputSampleNames(vector<string>& samplesToOutput) {
1120         outputSampleNames = samplesToOutput;
1121     }
1122 
1123 
1124 // shunting yard algorithm
infixToPrefix(queue<RuleToken> tokens,queue<RuleToken> & prefixtokens)1125     void infixToPrefix(queue<RuleToken> tokens, queue<RuleToken>& prefixtokens) {
1126         stack<RuleToken> ops;
1127         while (!tokens.empty()) {
1128             RuleToken& token = tokens.front();
1129             if (isOperator(token)) {
1130                 //cerr << "found operator " << token.value << endl;
1131                 while (ops.size() > 0 && isOperator(ops.top())
1132                        && (   (isLeftAssociative(token)  && priority(token) <= priority(ops.top()))
1133                               || (isRightAssociative(token) && priority(token) <  priority(ops.top())))) {
1134                     prefixtokens.push(ops.top());
1135                     ops.pop();
1136                 }
1137                 ops.push(token);
1138             } else if (isLeftParenthesis(token)) {
1139                 //cerr << "found paran " << token.value << endl;
1140                 ops.push(token);
1141             } else if (isRightParenthesis(token)) {
1142                 //cerr << "found paran " << token.value << endl;
1143                 while (ops.size() > 0 && !isLeftParenthesis(ops.top())) {
1144                     prefixtokens.push(ops.top());
1145                     ops.pop();
1146                 }
1147                 if (ops.size() == 0) {
1148                     cerr << "error: mismatched parentheses" << endl;
1149                     exit(1);
1150                 }
1151                 if (isLeftParenthesis(ops.top())) {
1152                     ops.pop();
1153                 }
1154             } else {
1155                 //cerr << "found operand " << token.value << endl;
1156                 prefixtokens.push(token);
1157             }
1158             tokens.pop();
1159         }
1160         while (ops.size() > 0) {
1161             if (isRightParenthesis(ops.top()) || isLeftParenthesis(ops.top())) {
1162                 cerr << "error: mismatched parentheses" << endl;
1163                 exit(1);
1164             }
1165             prefixtokens.push(ops.top());
1166             ops.pop();
1167         }
1168     }
1169 
RuleToken(string tokenstr,map<string,VariantFieldType> & variables)1170     RuleToken::RuleToken(string tokenstr, map<string, VariantFieldType>& variables) {
1171         isVariable = false;
1172         if (tokenstr == "!") {
1173             type = RuleToken::NOT_OPERATOR;
1174         } else if (tokenstr == "&") {
1175             type = RuleToken::AND_OPERATOR;
1176         } else if (tokenstr == "|") {
1177             type = RuleToken::OR_OPERATOR;
1178         } else if (tokenstr == "+") {
1179             type = RuleToken::ADD_OPERATOR;
1180         } else if (tokenstr == "-") {
1181             type = RuleToken::SUBTRACT_OPERATOR;
1182         } else if (tokenstr == "*") {
1183             type = RuleToken::MULTIPLY_OPERATOR;
1184         } else if (tokenstr == "/") {
1185             type = RuleToken::DIVIDE_OPERATOR;
1186         } else if (tokenstr == "=") {
1187             type = RuleToken::EQUAL_OPERATOR;
1188         } else if (tokenstr == ">") {
1189             type = RuleToken::GREATER_THAN_OPERATOR;
1190         } else if (tokenstr == "<") {
1191             type = RuleToken::LESS_THAN_OPERATOR;
1192         } else if (tokenstr == "(") {
1193             type = RuleToken::LEFT_PARENTHESIS;
1194         } else if (tokenstr == ")") {
1195             type = RuleToken::RIGHT_PARENTHESIS;
1196         } else { // operand
1197             type = RuleToken::OPERAND;
1198             if (variables.find(tokenstr) == variables.end()) {
1199                 if (convert(tokenstr, number)) {
1200                     type = RuleToken::NUMBER;
1201                 } else if (tokenstr == "QUAL") {
1202                     isVariable = true;
1203                 } else if (tokenstr == "FILTER") {
1204                     isVariable = true;
1205                 } else {
1206                     type = RuleToken::STRING_VARIABLE;
1207                 }
1208             } else {
1209                 isVariable = true;
1210             }
1211         }
1212         value = tokenstr;
1213     }
1214 
1215 
tokenizeFilterSpec(string & filterspec,queue<RuleToken> & tokens,map<string,VariantFieldType> & variables)1216     void tokenizeFilterSpec(string& filterspec, queue<RuleToken>& tokens, map<string, VariantFieldType>& variables) {
1217         string lastToken = "";
1218         bool inToken = false;
1219         for (unsigned int i = 0; i <  filterspec.size(); ++i) {
1220             char c = filterspec.at(i);
1221             if (c == ' ' || c == '\n') {
1222                 inToken = false;
1223                 if (!inToken && lastToken.size() > 0) {
1224                     tokens.push(RuleToken(lastToken, variables));
1225                     lastToken = "";
1226                 }
1227             } else if (!inToken && (isOperatorChar(c) || isParanChar(c))) {
1228                 inToken = false;
1229                 if (lastToken.size() > 0) {
1230                     tokens.push(RuleToken(lastToken, variables));
1231                     lastToken = "";
1232                 }
1233                 tokens.push(RuleToken(filterspec.substr(i,1), variables));
1234             } else {
1235                 inToken = true;
1236                 lastToken += c;
1237             }
1238         }
1239         // get the last token
1240         if (inToken) {
1241             tokens.push(RuleToken(lastToken, variables));
1242         }
1243     }
1244 
1245 // class which evaluates filter expressions
1246 // allow filters to be defined using boolean infix expressions e.g.:
1247 //
1248 // "GQ > 10 & (DP < 3 | DP > 5) & SAMPLE = NA12878"
1249 // or
1250 // "GT = 1/1 | GT = 0/0"
1251 //
1252 // on initialization, tokenizes the input sequence, and converts it from infix to postfix
1253 // on call to
1254 //
1255 
1256 
VariantFilter(string filterspec,VariantFilterType filtertype,map<string,VariantFieldType> & variables)1257     VariantFilter::VariantFilter(string filterspec, VariantFilterType filtertype, map<string, VariantFieldType>& variables) {
1258         type = filtertype;
1259         spec = filterspec;
1260         tokenizeFilterSpec(filterspec, tokens, variables);
1261         infixToPrefix(tokens, rules);
1262         /*while (!rules.empty()) {
1263           cerr << " " << rules.front().value << ((isNumeric(rules.front())) ? "f" : "");
1264           rules.pop();
1265           }
1266         */
1267         //cerr << endl;
1268         //cerr << join(" ", tokens) << endl;
1269     }
1270 
1271 // all alts pass
passes(Variant & var,string & sample)1272     bool VariantFilter::passes(Variant& var, string& sample) {
1273         for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
1274             string& allele = *a;
1275             if (!passes(var, sample, allele)) {
1276                 return false;
1277             }
1278         }
1279         return true;
1280     }
1281 
passes(Variant & var,string & sample,string & allele)1282     bool VariantFilter::passes(Variant& var, string& sample, string& allele) {
1283         // to evaluate a rpn boolean queue with embedded numbers and variables
1284         // make a result stack, use float to allow comparison of floating point
1285         // numbers, booleans, and integers
1286         stack<RuleToken> results;
1287         queue<RuleToken> rulesCopy = rules; // copy
1288 
1289         int index;
1290         if (allele.empty()) {
1291             index = 0; // apply to the whole record
1292         } else {
1293             // apply to a specific allele
1294             index = var.getAltAlleleIndex(allele);
1295         }
1296 
1297         while (!rulesCopy.empty()) {
1298             RuleToken token = rulesCopy.front();
1299             rulesCopy.pop();
1300         // pop operands from the front of the queue and push them onto the stack
1301         if (isOperand(token)) {
1302             //cout << "is operand: " << token.value << endl;
1303             // if the token is variable, i.e. not evaluated in this context, we
1304             // must evaluate it before pushing it onto the stack
1305             if (token.isVariable) {
1306                 //cout << "is variable" << endl;
1307                 // look up the variable using the Variant, depending on our filter type
1308                 //cout << "token.value " << token.value << endl;
1309                 VariantFieldType vtype;
1310                 if (sample.empty()) { // means we are record-specific
1311                     vtype = var.infoType(token.value);
1312                 } else {
1313                     vtype = var.formatType(token.value);
1314                     //cout << "type = " << type << endl;
1315                 }
1316                 //cout << "type: " << type << endl;
1317 
1318                 if (vtype == FIELD_INTEGER || vtype == FIELD_FLOAT) {
1319                     token.type = RuleToken::NUMERIC_VARIABLE;
1320                     token.number = var.getValueFloat(token.value, sample, index);
1321                     //cerr << "number: " << token.number << endl;
1322                 } else if (vtype == FIELD_BOOL) {
1323                     token.type = RuleToken::BOOLEAN_VARIABLE;
1324                     token.state = var.getValueBool(token.value, sample, index);
1325                     //cerr << "state: " << token.state << endl;
1326                 } else if (vtype == FIELD_STRING) {
1327                     //cout << "token.value = " << token.value << endl;
1328                     token.type = RuleToken::STRING_VARIABLE;
1329                     token.str = var.getValueString(token.value, sample, index);
1330                 } else if (isString(token)) {
1331                     token.type = RuleToken::STRING_VARIABLE;
1332                     token.str = var.getValueString(token.value, sample, index);
1333                     //cerr << "string: " << token.str << endl;
1334                 }
1335             } else {
1336                 double f;
1337                 string s;
1338                 //cerr << "parsing operand" << endl;
1339                 if (convert(token.value, f)) {
1340                     token.type = RuleToken::NUMERIC_VARIABLE;
1341                     token.number = f;
1342                     //cerr << "number: " << token.number << endl;
1343                 } else if (convert(token.value, s)) {
1344                     token.type = RuleToken::STRING_VARIABLE;
1345                     token.str = s;
1346                     //cerr << "string: " << token.str << endl;
1347                 } else {
1348                     cerr << "could not parse non-variable operand " << token.value << endl;
1349                     exit(1);
1350                 }
1351             }
1352             results.push(token);
1353         }
1354         // apply operators to the first n elements on the stack and push the result back onto the stack
1355         else if (isOperator(token)) {
1356             //cerr << "is operator: " << token.value << endl;
1357             RuleToken a, b, r;
1358             // is it a not-operator?
1359             switch (token.type) {
1360                 case ( RuleToken::NOT_OPERATOR ):
1361                     a = results.top();
1362                     results.pop();
1363                     if (!isBoolean(a)) {
1364                         cerr << "cannot negate a non-boolean" << endl;
1365                     } else {
1366                         a.state = !a.state;
1367                         results.push(a);
1368                     }
1369                     break;
1370 
1371                 case ( RuleToken::EQUAL_OPERATOR ):
1372                     a = results.top(); results.pop();
1373                     b = results.top(); results.pop();
1374                     if (a.type == b.type) {
1375                         switch (a.type) {
1376                             case (RuleToken::STRING_VARIABLE):
1377                                 r.state = (a.str == b.str);
1378                                 break;
1379                             case (RuleToken::NUMERIC_VARIABLE):
1380                                 r.state = (a.number == b.number);
1381                                 break;
1382                             case (RuleToken::BOOLEAN_VARIABLE):
1383                                 r.state = (a.state == b.state);
1384                                 break;
1385                             default:
1386                                 cerr << "should not get here" << endl; exit(1);
1387                                 break;
1388                         }
1389                     } else if (a.type == RuleToken::STRING_VARIABLE && b.type == RuleToken::NUMERIC_VARIABLE) {
1390                         r.state = (convert(b.number) == a.str);
1391                     } else if (b.type == RuleToken::STRING_VARIABLE && a.type == RuleToken::NUMERIC_VARIABLE) {
1392                         r.state = (convert(a.number) == b.str);
1393                     }
1394                     results.push(r);
1395                     break;
1396 
1397                 case ( RuleToken::GREATER_THAN_OPERATOR ):
1398                     a = results.top(); results.pop();
1399                     b = results.top(); results.pop();
1400                     if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
1401                         r.state = (b.number > a.number);
1402                     } else {
1403                         cerr << "cannot compare (>) objects of dissimilar types" << endl;
1404                         cerr << a.type << " " << b.type << endl;
1405                         exit(1);
1406                     }
1407                     results.push(r);
1408                     break;
1409 
1410                 case ( RuleToken::LESS_THAN_OPERATOR ):
1411                     a = results.top(); results.pop();
1412                     b = results.top(); results.pop();
1413                     if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
1414                         r.state = (b.number < a.number);
1415                     } else {
1416                         cerr << "cannot compare (<) objects of dissimilar types" << endl;
1417                         cerr << a.type << " " << b.type << endl;
1418                         exit(1);
1419                     }
1420                     results.push(r);
1421                     break;
1422 
1423                 case ( RuleToken::ADD_OPERATOR ):
1424                     a = results.top(); results.pop();
1425                     b = results.top(); results.pop();
1426                     if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
1427                         r.number = (b.number + a.number);
1428                         r.type = RuleToken::NUMERIC_VARIABLE;
1429                     } else {
1430                         cerr << "cannot add objects of dissimilar types" << endl;
1431                         cerr << a.type << " " << b.type << endl;
1432                         exit(1);
1433                     }
1434                     results.push(r);
1435                     break;
1436 
1437                 case ( RuleToken::SUBTRACT_OPERATOR ):
1438                     a = results.top(); results.pop();
1439                     b = results.top(); results.pop();
1440                     if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
1441                         r.number = (b.number - a.number);
1442                         r.type = RuleToken::NUMERIC_VARIABLE;
1443                     } else {
1444                         cerr << "cannot subtract objects of dissimilar types" << endl;
1445                         cerr << a.type << " " << b.type << endl;
1446                         exit(1);
1447                     }
1448                     results.push(r);
1449                     break;
1450 
1451                 case ( RuleToken::MULTIPLY_OPERATOR ):
1452                     a = results.top(); results.pop();
1453                     b = results.top(); results.pop();
1454                     if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
1455                         r.number = (b.number * a.number);
1456                         r.type = RuleToken::NUMERIC_VARIABLE;
1457                     } else {
1458                         cerr << "cannot multiply objects of dissimilar types" << endl;
1459                         cerr << a.type << " " << b.type << endl;
1460                         exit(1);
1461                     }
1462                     results.push(r);
1463                     break;
1464 
1465                 case ( RuleToken::DIVIDE_OPERATOR):
1466                     a = results.top(); results.pop();
1467                     b = results.top(); results.pop();
1468                     if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
1469                         r.number = (b.number / a.number);
1470                         r.type = RuleToken::NUMERIC_VARIABLE;
1471                     } else {
1472                         cerr << "cannot divide objects of dissimilar types" << endl;
1473                         cerr << a.type << " " << b.type << endl;
1474                         exit(1);
1475                     }
1476                     results.push(r);
1477                     break;
1478 
1479                 case ( RuleToken::AND_OPERATOR ):
1480                 case ( RuleToken::OR_OPERATOR ):
1481                     a = results.top(); results.pop();
1482                     b = results.top(); results.pop();
1483                     if (a.type == b.type && a.type == RuleToken::BOOLEAN_VARIABLE) {
1484                         if (token.type == RuleToken::AND_OPERATOR) {
1485                             r.state = (a.state && b.state);
1486                         } else {
1487                             r.state = (a.state || b.state);
1488                         }
1489                     } else {
1490                         cerr << "cannot compare (& or |) objects of dissimilar types" << endl;
1491                         exit(1);
1492                     }
1493                     results.push(r);
1494                     break;
1495                 default:
1496                     cerr << "should not get here!" << endl; exit(1);
1497                     break;
1498             }
1499         }
1500     }
1501     // at the end you should have only one value on the stack, return it as a boolean
1502     if (results.size() == 1) {
1503         if (isBoolean(results.top())) {
1504             return results.top().state;
1505         } else {
1506             cerr << "error, non-boolean value left on stack" << endl;
1507             //cerr << results.top().value << endl;
1508             exit(1);
1509         }
1510     } else if (results.size() > 1) {
1511         cerr << "more than one value left on results stack!" << endl;
1512         while (!results.empty()) {
1513             cerr << results.top().value << endl;
1514             results.pop();
1515         }
1516         exit(1);
1517     } else {
1518         cerr << "results stack empty" << endl;
1519         exit(1);
1520     }
1521 }
1522 
removeFilteredGenotypes(Variant & var,bool keepInfo)1523 void VariantFilter::removeFilteredGenotypes(Variant& var, bool keepInfo) {
1524 
1525     for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
1526         string& name = *s;
1527         if (!passes(var, name)) {
1528         	if (keepInfo) {
1529 				var.samples[name]["GT"].clear();
1530 				var.samples[name]["GT"].push_back("./.");
1531         	}
1532         	else {
1533 			    var.samples.erase(name);
1534         	}
1535         }
1536     }
1537 }
1538 
1539 /*
1540 bool VariantCallFile::openVCF(string& filename) {
1541     file.open(filename.c_str(), ifstream::in);
1542     if (!file.is_open()) {
1543         cerr << "could not open " << filename << endl;
1544         return false;
1545     } else {
1546         return parseHeader();
1547     }
1548 }
1549 
1550 bool VariantCallFile::openVCF(ifstream& stream) {
1551     file = stream;
1552     if (!file.is_open()) {
1553         cerr << "provided file is not open" << endl;
1554         return false;
1555     } else {
1556         return parseHeader();
1557     }
1558 }
1559 */
1560 
updateSamples(vector<string> & newSamples)1561 void VariantCallFile::updateSamples(vector<string>& newSamples) {
1562     sampleNames = newSamples;
1563     // regenerate the last line of the header
1564     vector<string> headerLines = split(header, '\n');
1565     vector<string> colnames = split(headerLines.at(headerLines.size() - 1), '\t'); // get the last, update the samples
1566     vector<string> newcolnames;
1567     newcolnames.resize(9 + sampleNames.size());
1568     copy(colnames.begin(), colnames.begin() + 9, newcolnames.begin());
1569     copy(sampleNames.begin(), sampleNames.end(), newcolnames.begin() + 9);
1570     headerLines.at(headerLines.size() - 1) = join(newcolnames, "\t");
1571     header = join(headerLines, "\n");
1572 }
1573 
1574 // non-destructive version of above
headerWithSampleNames(vector<string> & newSamples)1575 string VariantCallFile::headerWithSampleNames(vector<string>& newSamples) {
1576     // regenerate the last line of the header
1577     if (newSamples.empty()) return header;
1578     vector<string> headerLines = split(header, '\n');
1579     vector<string> colnames = split(headerLines.at(headerLines.size() - 1), '\t'); // get the last, update the samples
1580     vector<string> newcolnames;
1581     unsigned int colCount = colnames.size(); // used to be hard-coded 9, hopefully the dynamic colCount isn't an issue
1582     if (colCount < 8)
1583     {
1584         cout << "VCF file is not suitable for use because it does not have a format field." << endl;
1585         exit(0);
1586     }
1587     newcolnames.resize(colCount + newSamples.size());
1588     copy(colnames.begin(), colnames.begin() + colCount, newcolnames.begin());
1589     copy(newSamples.begin(), newSamples.end(), newcolnames.begin() + colCount);
1590     headerLines.at(headerLines.size() - 1) = join(newcolnames, "\t");
1591     return join(headerLines, "\n");
1592 }
1593 
1594 // TODO cleanup, store header lines instead of bulk header
addHeaderLine(string line)1595 void VariantCallFile::addHeaderLine(string line) {
1596     vector<string> headerLines = split(header, '\n');
1597     headerLines.insert(headerLines.end() - 1, line);
1598     header = join(unique(headerLines), "\n");
1599 }
1600 
1601 // helper to addHeaderLine
unique(vector<string> & strings)1602 vector<string>& unique(vector<string>& strings) {
1603     set<string> uniq;
1604     vector<string> res;
1605     for (vector<string>::const_iterator s = strings.begin(); s != strings.end(); ++s) {
1606         if (uniq.find(*s) == uniq.end()) {
1607             res.push_back(*s);
1608             uniq.insert(*s);
1609         }
1610     }
1611     strings = res;
1612     return strings;
1613 }
1614 
infoIds(void)1615 vector<string> VariantCallFile::infoIds(void) {
1616     vector<string> tags;
1617     vector<string> headerLines = split(header, '\n');
1618     for (vector<string>::iterator s = headerLines.begin(); s != headerLines.end(); ++s) {
1619         string& line = *s;
1620         if (line.find("##INFO") == 0) {
1621             size_t pos = line.find("ID=");
1622             if (pos != string::npos) {
1623                 pos += 3;
1624                 size_t tagend = line.find(",", pos);
1625                 if (tagend != string::npos) {
1626                     tags.push_back(line.substr(pos, tagend - pos));
1627                 }
1628             }
1629         }
1630     }
1631     return tags;
1632 }
1633 
formatIds(void)1634 vector<string> VariantCallFile::formatIds(void) {
1635     vector<string> tags;
1636     vector<string> headerLines = split(header, '\n');
1637     for (vector<string>::iterator s = headerLines.begin(); s != headerLines.end(); ++s) {
1638         string& line = *s;
1639         if (line.find("##FORMAT") == 0) {
1640             size_t pos = line.find("ID=");
1641             if (pos != string::npos) {
1642                 pos += 3;
1643                 size_t tagend = line.find(",", pos);
1644                 if (tagend != string::npos) {
1645                     tags.push_back(line.substr(pos, tagend - pos));
1646                 }
1647             }
1648         }
1649     }
1650     return tags;
1651 }
1652 
removeInfoHeaderLine(string tag)1653 void VariantCallFile::removeInfoHeaderLine(string tag) {
1654     vector<string> headerLines = split(header, '\n');
1655     vector<string> newHeader;
1656     string id = "ID=" + tag + ",";
1657     for (vector<string>::iterator s = headerLines.begin(); s != headerLines.end(); ++s) {
1658         string& line = *s;
1659         if (line.find("##INFO") == 0) {
1660             if (line.find(id) == string::npos) {
1661                 newHeader.push_back(line);
1662             }
1663         } else {
1664             newHeader.push_back(line);
1665         }
1666     }
1667     header = join(newHeader, "\n");
1668 }
1669 
removeGenoHeaderLine(string tag)1670 void VariantCallFile::removeGenoHeaderLine(string tag) {
1671     vector<string> headerLines = split(header, '\n');
1672     vector<string> newHeader;
1673     string id = "ID=" + tag + ",";
1674     for (vector<string>::iterator s = headerLines.begin(); s != headerLines.end(); ++s) {
1675         string& headerLine = *s;
1676         if (headerLine.find("##FORMAT") == 0) {
1677             if (headerLine.find(id) == string::npos) {
1678                 newHeader.push_back(headerLine);
1679             }
1680         } else {
1681             newHeader.push_back(headerLine);
1682         }
1683     }
1684     header = join(newHeader, "\n");
1685 }
1686 
getHeaderLinesFromFile()1687 vector<string> VariantCallFile::getHeaderLinesFromFile()
1688 {
1689     string headerStr = "";
1690 
1691     if (usingTabix) {
1692         tabixFile->getHeader(headerStr);
1693         if (headerStr.empty()) {
1694             cerr << "error: no VCF header" << endl;
1695             exit(1);
1696         }
1697         tabixFile->getNextLine(line);
1698         firstRecord = true;
1699     } else {
1700         while (std::getline(*file, line)) {
1701             if (line.substr(0,1) == "#") {
1702                 headerStr += line + '\n';
1703             } else {
1704                 // done with header
1705                 if (headerStr.empty()) {
1706                     cerr << "error: no VCF header" << endl;
1707                     return vector<string>();
1708                 }
1709                 firstRecord = true;
1710                 break;
1711             }
1712         }
1713     }
1714     return split(headerStr, "\n");
1715 }
1716 
parseHeader(void)1717 bool VariantCallFile::parseHeader(void) {
1718 
1719     string headerStr = "";
1720 
1721     if (usingTabix) {
1722         tabixFile->getHeader(headerStr);
1723         if (headerStr.empty()) {
1724             cerr << "error: no VCF header" << endl;
1725             exit(1);
1726         }
1727         tabixFile->getNextLine(line);
1728         firstRecord = true;
1729     } else {
1730         while (std::getline(*file, line)) {
1731             if (line.substr(0,1) == "#") {
1732                 headerStr += line + '\n';
1733             } else {
1734                 // done with header
1735                 if (headerStr.empty()) {
1736                     cerr << "error: no VCF header" << endl;
1737                     return false;
1738                 }
1739                 firstRecord = true;
1740                 break;
1741             }
1742         }
1743     }
1744     this->vcf_header = headerStr;
1745 
1746     return parseHeader(headerStr);
1747 
1748 }
1749 
parseHeader(string & hs)1750 bool VariantCallFile::parseHeader(string& hs) {
1751 
1752     if (hs.empty()) return false;
1753     if (hs.substr(hs.size() - 1, 1) == "\n") {
1754 	hs.erase(hs.size() - 1, 1); // remove trailing newline
1755     }
1756     header = hs; // stores the header in the object instance
1757 
1758     vector<string> headerLines = split(header, "\n");
1759     for (vector<string>::iterator h = headerLines.begin(); h != headerLines.end(); ++h) {
1760         string headerLine = *h;
1761         if (headerLine.substr(0,2) == "##") {
1762             // meta-information headerLines
1763             // TODO parse into map from info/format key to type
1764             // ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
1765             // ##FORMAT=<ID=CB,Number=1,Type=String,Description="Called by S(Sanger), M(UMich), B(BI)">
1766             size_t found = headerLine.find_first_of("=");
1767             string entryType = headerLine.substr(2, found - 2);
1768             // handle reference here, no "<" and ">" given
1769                 //} else if (entryType == "reference") {
1770             size_t dataStart = headerLine.find_first_of("<");
1771             size_t dataEnd = headerLine.find_first_of(">");
1772             if (dataStart != string::npos && dataEnd != string::npos) {
1773                 string entryData = headerLine.substr(dataStart + 1, dataEnd - dataStart - 1);
1774                 // XXX bad; this will break if anyone ever moves the order
1775                 // of the fields around to include a "long form" string
1776                 // including either a = or , in the first or second field
1777                 if (entryType == "INFO" || entryType == "FORMAT") {
1778                     vector<string> fields = split(entryData, "=,");
1779                     if (fields[0] != "ID") {
1780                         cerr << "header parse error at:" << endl
1781                              << "fields[0] != \"ID\"" << endl
1782                              << headerLine << endl;
1783                         exit(1);
1784                     }
1785                     string id = fields[1];
1786                     if (fields[2] != "Number") {
1787                         cerr << "header parse error at:" << endl
1788                              << "fields[2] != \"Number\"" << endl
1789                              << headerLine << endl;
1790                         exit(1);
1791                     }
1792                     int number;
1793                     string numberstr = fields[3].c_str();
1794                     // XXX TODO VCF has variable numbers of fields...
1795                     if (numberstr == "A") {
1796                         number = ALLELE_NUMBER;
1797                     } else if (numberstr == "G") {
1798                         number = GENOTYPE_NUMBER;
1799                     } else if (numberstr == ".") {
1800                         number = 1;
1801                     } else {
1802                         convert(numberstr, number);
1803                     }
1804                     if (fields[4] != "Type") {
1805                         cerr << "header parse error at:" << endl
1806                              << "fields[4] != \"Type\"" << endl
1807                              << headerLine << endl;
1808                         exit(1);
1809                     }
1810                     VariantFieldType type = typeStrToVariantFieldType(fields[5]);
1811                     if (entryType == "INFO") {
1812                         infoCounts[id] = number;
1813                         infoTypes[id] = type;
1814                         //cerr << id << " == " << type << endl;
1815                     } else if (entryType == "FORMAT") {
1816                         //cout << "found format field " << id << " with type " << type << endl;
1817                         formatCounts[id] = number;
1818                         formatTypes[id] = type;
1819                     }
1820                 }
1821             }
1822         } else if (headerLine.substr(0,1) == "#") {
1823             // field name headerLine
1824             vector<string> fields = split(headerLine, '\t');
1825             if (fields.size() > 8) {
1826                 sampleNames.resize(fields.size() - 9);
1827                 copy(fields.begin() + 9, fields.end(), sampleNames.begin());
1828             }
1829         }
1830     }
1831 
1832     return true;
1833 }
1834 
getNextVariant(Variant & var)1835 bool VariantCallFile::getNextVariant(Variant& var) {
1836         if (firstRecord && !justSetRegion) {
1837             if (!line.empty() && line.substr(0,1) != "#") {
1838                 var.parse(line, parseSamples);
1839                 firstRecord = false;
1840                 _done = false;
1841                 return true;
1842             } else {
1843                 return false;
1844             }
1845         }
1846         if (usingTabix) {
1847             if (justSetRegion && !line.empty() && line.substr(0,1) != "#") {
1848                 if (firstRecord) {
1849                     firstRecord = false;
1850                 }
1851                 var.parse(line, parseSamples);
1852                 line.clear();
1853                 justSetRegion = false;
1854                 _done = false;
1855                 return true;
1856             } else if (tabixFile->getNextLine(line)) {
1857                 var.parse(line, parseSamples);
1858                 _done = false;
1859                 return true;
1860             } else {
1861                 _done = true;
1862                 return false;
1863             }
1864         } else {
1865             if (std::getline(*file, line)) {
1866                 var.parse(line, parseSamples);
1867                 _done = false;
1868                 return true;
1869             } else {
1870                 _done = true;
1871                 return false;
1872             }
1873         }
1874 }
1875 
setRegion(string seq,long int start,long int end)1876 bool VariantCallFile::setRegion(string seq, long int start, long int end) {
1877     stringstream regionstr;
1878     if (end) {
1879         regionstr << seq << ":" << start << "-" << end;
1880     } else {
1881         regionstr << seq << ":" << start;
1882     }
1883     return setRegion(regionstr.str());
1884 }
1885 
setRegion(string region)1886 bool VariantCallFile::setRegion(string region) {
1887     if (!usingTabix) {
1888         cerr << "cannot setRegion on a non-tabix indexed file" << endl;
1889         exit(1);
1890     }
1891     size_t dots = region.find("..");
1892     // convert between bamtools/freebayes style region string and tabix/samtools style
1893     if (dots != string::npos) {
1894         region.replace(dots, 2, "-");
1895     }
1896     if (tabixFile->setRegion(region)) {
1897         if (tabixFile->getNextLine(line)) {
1898 	    justSetRegion = true;
1899             return true;
1900         } else {
1901             return false;
1902         }
1903     } else {
1904         return false;
1905     }
1906 }
1907 
1908 
1909 // genotype manipulation
1910 /*
1911 map<string, int> decomposeGenotype(string& genotype) {
1912     string splitter = "/";
1913     if (genotype.find("|") != string::npos) {
1914         splitter = "|";
1915     }
1916     vector<string> haps = split(genotype, splitter);
1917     map<string, int> decomposed;
1918     for (vector<string>::iterator h = haps.begin(); h != haps.end(); ++h) {
1919         ++decomposed[*h];
1920     }
1921     return decomposed;
1922 }
1923 */
1924 
decomposeGenotype(const string & genotype)1925 map<int, int> decomposeGenotype(const string& genotype) {
1926     string splitter = "/";
1927     if (genotype.find("|") != string::npos) {
1928         splitter = "|";
1929     }
1930     vector<string> haps = split(genotype, splitter);
1931     map<int, int> decomposed;
1932     for (vector<string>::iterator h = haps.begin(); h != haps.end(); ++h) {
1933         int alt;
1934         if (*h == ".") {
1935             ++decomposed[NULL_ALLELE];
1936         } else {
1937             convert(*h, alt);
1938             ++decomposed[alt];
1939         }
1940     }
1941     return decomposed;
1942 }
1943 
decomposePhasedGenotype(const string & genotype)1944 vector<int> decomposePhasedGenotype(const string& genotype) {
1945     string splitter = "/";
1946     if (genotype.find("|") != string::npos) {
1947         splitter = "|";
1948     }
1949     vector<string> haps = split(genotype, splitter);
1950     if (haps.size() > 1 && splitter == "/") {
1951         cerr << "could not find '|' in genotype, cannot decomposePhasedGenotype on unphased genotypes" << endl;
1952         exit(1);
1953     }
1954     vector<int> decomposed;
1955     for (vector<string>::iterator h = haps.begin(); h != haps.end(); ++h) {
1956         int alt;
1957         if (*h == ".") {
1958             decomposed.push_back(NULL_ALLELE);
1959         } else {
1960             convert(*h, alt);
1961             decomposed.push_back(alt);
1962         }
1963     }
1964     return decomposed;
1965 }
1966 
genotypeToString(const map<int,int> & genotype)1967 string genotypeToString(const map<int, int>& genotype) {
1968     vector<int> s;
1969     for (map<int, int>::const_iterator g = genotype.begin(); g != genotype.end(); ++g) {
1970         int a = g->first;
1971         int c = g->second;
1972         for (int i = 0; i < c; ++i) s.push_back(a);
1973     }
1974     sort(s.begin(), s.end());
1975     vector<string> r;
1976     for (vector<int>::iterator i = s.begin(); i != s.end(); ++i) {
1977         if (*i == NULL_ALLELE) r.push_back(".");
1978         else r.push_back(convert(*i));
1979     }
1980     return join(r, "/"); // TODO adjust for phased/unphased
1981 }
1982 
phasedGenotypeToString(const vector<int> & genotype)1983 string phasedGenotypeToString(const vector<int>& genotype) {
1984     vector<string> r;
1985     for (vector<int>::const_iterator i = genotype.begin(); i != genotype.end(); ++i) {
1986         if (*i == NULL_ALLELE) r.push_back(".");
1987         else r.push_back(convert(*i));
1988     }
1989     return join(r, "|");
1990 }
1991 
isHet(const map<int,int> & genotype)1992 bool isHet(const map<int, int>& genotype) {
1993     return genotype.size() > 1;
1994 }
1995 
isHom(const map<int,int> & genotype)1996 bool isHom(const map<int, int>& genotype) {
1997     return genotype.size() == 1;
1998 }
1999 
hasNonRef(const map<int,int> & genotype)2000 bool hasNonRef(const map<int, int>& genotype) {
2001     for (map<int, int>::const_iterator g = genotype.begin(); g != genotype.end(); ++g) {
2002         if (g->first != 0) {
2003             return true;
2004         }
2005     }
2006     return false;
2007 }
2008 
isHomRef(const map<int,int> & genotype)2009 bool isHomRef(const map<int, int>& genotype) {
2010     return isHom(genotype) && !hasNonRef(genotype);
2011 }
2012 
isHomNonRef(const map<int,int> & genotype)2013 bool isHomNonRef(const map<int, int>& genotype) {
2014     return isHom(genotype) && hasNonRef(genotype);
2015 }
2016 
isNull(const map<int,int> & genotype)2017 bool isNull(const map<int, int>& genotype) {
2018     return genotype.find(NULL_ALLELE) != genotype.end();
2019 }
2020 
ploidy(const map<int,int> & genotype)2021 int ploidy(const map<int, int>& genotype) {
2022     int i = 0;
2023     for (map<int, int>::const_iterator g = genotype.begin(); g != genotype.end(); ++g) {
2024         i += g->second;
2025     }
2026     return i;
2027 }
2028 
2029 // generates cigar from allele parsed by parsedAlternates
varCigar(vector<VariantAllele> & vav,bool xForMismatch)2030 string varCigar(vector<VariantAllele>& vav, bool xForMismatch) {
2031     string cigar;
2032     pair<int, string> element;
2033     for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
2034         VariantAllele& va = *v;
2035         if (va.ref != va.alt) {
2036             if (element.second == "M") {
2037                 cigar += convert(element.first) + element.second;
2038                 element.second = ""; element.first = 0;
2039             }
2040             if (va.ref.size() == va.alt.size()) {
2041                 cigar += convert(va.ref.size()) + (xForMismatch ? "X" : "M");
2042             } else if (va.ref.size() > va.alt.size()) {
2043                 cigar += convert(va.ref.size() - va.alt.size()) + "D";
2044             } else {
2045                 cigar += convert(va.alt.size() - va.ref.size()) + "I";
2046             }
2047         } else {
2048             if (element.second == "M") {
2049                 element.first += va.ref.size();
2050             } else {
2051                 element = make_pair(va.ref.size(), "M");
2052             }
2053         }
2054     }
2055     if (element.second == "M") {
2056         cigar += convert(element.first) + element.second;
2057     }
2058     element.second = ""; element.first = 0;
2059     return cigar;
2060 }
2061 
parsedAlternates(bool includePreviousBaseForIndels,bool useMNPs,bool useEntropy,float matchScore,float mismatchScore,float gapOpenPenalty,float gapExtendPenalty,float repeatGapExtendPenalty,string flankingRefLeft,string flankingRefRight)2062 map<string, vector<VariantAllele> > Variant::parsedAlternates(bool includePreviousBaseForIndels,
2063                                                               bool useMNPs,
2064                                                               bool useEntropy,
2065                                                               float matchScore,
2066                                                               float mismatchScore,
2067                                                               float gapOpenPenalty,
2068                                                               float gapExtendPenalty,
2069                                                               float repeatGapExtendPenalty,
2070                                                               string flankingRefLeft,
2071                                                               string flankingRefRight) {
2072 
2073     map<string, vector<VariantAllele> > variantAlleles;
2074 
2075     if (isSymbolicSV()){
2076         // Don't ever align SVs. It just wrecks things.
2077         return this->flatAlternates();
2078     }
2079     // add the reference allele
2080     variantAlleles[ref].push_back(VariantAllele(ref, ref, position));
2081 
2082     // single SNP case, no ambiguity possible, no need to spend a lot of
2083     // compute aligning ref and alt fields
2084     if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
2085         variantAlleles[alt.front()].push_back(VariantAllele(ref, alt.front(), position));
2086         return variantAlleles;
2087     }
2088 
2089     // padding is used to ensure a stable alignment of the alternates to the reference
2090     // without having to go back and look at the full reference sequence
2091     int paddingLen = max(10, (int) (ref.size()));  // dynamically determine optimum padding length
2092     for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
2093         string& alternate = *a;
2094         paddingLen = max(paddingLen, (int) (alternate.size()));
2095     }
2096     char padChar = 'Z';
2097     char anchorChar = 'Q';
2098     string padding(paddingLen, padChar);
2099 
2100     // this 'anchored' string is done for stability
2101     // the assumption is that there should be a positional match in the first base
2102     // this is true for VCF 4.1, and standard best practices
2103     // using the anchor char ensures this without other kinds of realignment
2104     string reference_M;
2105     if (flankingRefLeft.empty() && flankingRefRight.empty()) {
2106         reference_M = padding + ref + padding;
2107         reference_M[paddingLen] = anchorChar;
2108     } else {
2109         reference_M = flankingRefLeft + ref + flankingRefRight;
2110         paddingLen = flankingRefLeft.size();
2111     }
2112 
2113     // passed to sw.Align
2114     unsigned int referencePos;
2115 
2116     string cigar;
2117 
2118     for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
2119 
2120       string& alternate = *a;
2121       vector<VariantAllele>& variants = variantAlleles[alternate];
2122       string alternateQuery_M;
2123       if (flankingRefLeft.empty() && flankingRefRight.empty()) {
2124 	alternateQuery_M = padding + alternate + padding;
2125 	alternateQuery_M[paddingLen] = anchorChar;
2126       } else {
2127 	alternateQuery_M = flankingRefLeft + alternate + flankingRefRight;
2128       }
2129       //const unsigned int alternateLen = alternate.size();
2130 
2131       if (true) {
2132 	CSmithWatermanGotoh sw(matchScore,
2133 			       mismatchScore,
2134 			       gapOpenPenalty,
2135 			       gapExtendPenalty);
2136 	if (useEntropy) sw.EnableEntropyGapPenalty(1);
2137 	if (repeatGapExtendPenalty != 0){
2138 	  sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
2139 	}
2140 	sw.Align(referencePos, cigar, reference_M, alternateQuery_M);
2141       } else {  // disabled for now
2142 	StripedSmithWaterman::Aligner aligner;
2143 	StripedSmithWaterman::Filter sswFilter;
2144 	StripedSmithWaterman::Alignment alignment;
2145 	aligner.Align(alternateQuery_M.c_str(),
2146 		      reference_M.c_str(),
2147 		      reference_M.size(), sswFilter, &alignment);
2148 	cigar = alignment.cigar_string;
2149       }
2150 
2151       // left-realign the alignment...
2152 
2153       vector<pair<int, string> > cigarData = splitCigar(cigar);
2154 
2155       if (cigarData.front().second != "M"
2156 	  || cigarData.back().second != "M"
2157 	  || cigarData.front().first < paddingLen
2158 	  || cigarData.back().first < paddingLen) {
2159 	cerr << "parsedAlternates: alignment does not start with match over padded sequence" << endl;
2160 	cerr << cigar << endl;
2161 	cerr << reference_M << endl;
2162 	cerr << alternateQuery_M << endl;
2163 	exit(1);
2164       } else {
2165 	cigarData.front().first -= paddingLen;
2166 	cigarData.back().first -= paddingLen;;
2167       }
2168       //cigarData = cleanCigar(cigarData);
2169       cigar = joinCigar(cigarData);
2170 
2171       int altpos = 0;
2172       int refpos = 0;
2173 
2174       for (vector<pair<int, string> >::iterator e = cigarData.begin();
2175 	   e != cigarData.end(); ++e) {
2176 
2177 	int len = e->first;
2178 	string type = e->second;
2179 
2180 	switch (type.at(0)) {
2181 	case 'I':
2182 	  if (includePreviousBaseForIndels) {
2183 	    if (!variants.empty() &&
2184 		variants.back().ref != variants.back().alt) {
2185 	      VariantAllele a =
2186 		VariantAllele("",
2187 			      alternate.substr(altpos, len),
2188 			      refpos + position);
2189 	      variants.back() = variants.back() + a;
2190 	    } else {
2191 	      VariantAllele a =
2192 		VariantAllele(ref.substr(refpos - 1, 1),
2193 			      alternate.substr(altpos - 1, len + 1),
2194 			      refpos + position - 1);
2195 	      variants.push_back(a);
2196 	    }
2197 	  } else {
2198 	    variants.push_back(VariantAllele("",
2199 					     alternate.substr(altpos, len),
2200 					     refpos + position));
2201 	  }
2202 	  altpos += len;
2203 	  break;
2204 	case 'D':
2205 	  if (includePreviousBaseForIndels) {
2206 	    if (!variants.empty() &&
2207 		variants.back().ref != variants.back().alt) {
2208 	      VariantAllele a
2209 		= VariantAllele(ref.substr(refpos, len)
2210 				, "", refpos + position);
2211 	      variants.back() = variants.back() + a;
2212 	      } else {
2213 	      VariantAllele a
2214 		= VariantAllele(ref.substr(refpos - 1, len + 1),
2215 				alternate.substr(altpos - 1, 1),
2216 				refpos + position - 1);
2217 	      variants.push_back(a);
2218 	    }
2219 	  } else {
2220 	    variants.push_back(VariantAllele(ref.substr(refpos, len),
2221 					     "", refpos + position));
2222 	  }
2223 	  refpos += len;
2224 	  break;
2225 
2226 	  // zk has added (!variants.empty()) solves the seg fault in
2227           // vcfstats, but need to test
2228 	case 'M':
2229 	  {
2230 	    for (int i = 0; i < len; ++i) {
2231 	      VariantAllele a
2232 		= VariantAllele(ref.substr(refpos + i, 1),
2233 				alternate.substr(altpos + i, 1),
2234 				(refpos + i + position));
2235 	      if (useMNPs && (!variants.empty()) &&
2236 		  variants.back().ref.size() == variants.back().alt.size()
2237 		  && variants.back().ref != variants.back().alt) {
2238 		  variants.back() = variants.back() + a;
2239 	      } else {
2240 		variants.push_back(a);
2241 	      }
2242 	    }
2243 	  }
2244 	  refpos += len;
2245 	  altpos += len;
2246 	  break;
2247 	case 'S':
2248 	  {
2249 	    refpos += len;
2250 	    altpos += len;
2251 	    break;
2252 	  }
2253 	default:
2254 	  {
2255 	    break;
2256 	  }
2257 	}
2258       }
2259     }
2260     return variantAlleles;
2261 }
2262 
flatAlternates(void)2263 map<string, vector<VariantAllele> > Variant::flatAlternates(void) {
2264     map<string, vector<VariantAllele> > variantAlleles;
2265     for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
2266         string& alternate = *a;
2267         vector<VariantAllele>& variants = variantAlleles[alternate];
2268         variants.push_back(VariantAllele(ref, alternate, position));
2269     }
2270     return variantAlleles;
2271 }
2272 
altSet(void)2273 set<string> Variant::altSet(void) {
2274     set<string> altset(alt.begin(), alt.end());
2275     return altset;
2276 }
2277 
operator <<(ostream & out,VariantAllele & var)2278 ostream& operator<<(ostream& out, VariantAllele& var) {
2279     out << var.position << " " << var.ref << " -> " << var.alt;
2280     return out;
2281 }
2282 
operator +(const VariantAllele & a,const VariantAllele & b)2283 VariantAllele operator+(const VariantAllele& a, const VariantAllele& b) {
2284     return VariantAllele(a.ref + b.ref, a.alt + b.alt, a.position);
2285 }
2286 
operator <(const VariantAllele & a,const VariantAllele & b)2287 bool operator<(const VariantAllele& a, const VariantAllele& b) {
2288     return a.repr < b.repr;
2289 }
2290 
getGenotypeIndexesDiploid(void)2291 map<pair<int, int>, int> Variant::getGenotypeIndexesDiploid(void) {
2292 
2293     map<pair<int, int>, int> genotypeIndexes;
2294     //map<int, map<Genotype*, int> > vcfGenotypeOrder;
2295     vector<int> indexes;
2296     for (int i = 0; i < alleles.size(); ++i) {
2297         indexes.push_back(i);
2298     }
2299     int ploidy = 2; // ONLY diploid
2300     vector<vector<int> > genotypes = multichoose(ploidy, indexes);
2301     for (vector<vector<int> >::iterator g = genotypes.begin(); g != genotypes.end(); ++g) {
2302         sort(g->begin(), g->end());  // enforce e.g. 0/1, 0/2, 1/2 ordering over reverse
2303         // XXX this does not handle non-diploid!!!!
2304         int j = g->front();
2305         int k = g->back();
2306         genotypeIndexes[make_pair(j, k)] = (k * (k + 1) / 2) + j;
2307     }
2308     return genotypeIndexes;
2309 
2310 }
2311 
updateAlleleIndexes(void)2312 void Variant::updateAlleleIndexes(void) {
2313     // adjust the allele index
2314     altAlleleIndexes.clear();
2315     int m = 0;
2316     for (vector<string>::iterator a = alt.begin();
2317             a != alt.end(); ++a, ++m) {
2318         altAlleleIndexes[*a] = m;
2319     }
2320 }
2321 
2322 // TODO only works on "A"llele variant fields
removeAlt(const string & altAllele)2323   void Variant::removeAlt(const string& altAllele) {
2324 
2325     int altIndex = getAltAlleleIndex(altAllele);  // this is the alt-relative index, 0-based
2326 
2327     for (map<string, int>::iterator c = vcf->infoCounts.begin();
2328 	 c != vcf->infoCounts.end(); ++c) {
2329       int count = c->second;
2330       if (count == ALLELE_NUMBER) {
2331 	string key = c->first;
2332 	map<string, vector<string> >::iterator v = info.find(key);
2333 	if (v != info.end()) {
2334 	  vector<string>& vals = v->second;
2335 	  vector<string> tokeep;
2336 	  int i = 0;
2337 	  for (vector<string>::iterator a = vals.begin();
2338 	       a != vals.end(); ++a, ++i) {
2339 	    if (i != altIndex) {
2340 	      tokeep.push_back(*a);
2341 	    }
2342 	  }
2343 	  vals = tokeep;
2344 	}
2345       }
2346     }
2347 
2348     for (map<string, int>::iterator c = vcf->formatCounts.begin();
2349 	 c != vcf->formatCounts.end(); ++c) {
2350       int count = c->second;
2351       if (count == ALLELE_NUMBER) {
2352             string key = c->first;
2353             for (map<string, map<string, vector<string> > >::iterator
2354 		   s = samples.begin(); s != samples.end(); ++s) {
2355 	      map<string, vector<string> >& sample = s->second;
2356 	      map<string, vector<string> >::iterator v = sample.find(key);
2357 	      if (v != sample.end()) {
2358 		vector<string>& vals = v->second;
2359 		vector<string> tokeep;
2360 		int i = 0;
2361 		for (vector<string>::iterator a = vals.begin();
2362 		     a != vals.end(); ++a, ++i) {
2363 		  if (i != altIndex) {
2364 		    tokeep.push_back(*a);
2365 		  }
2366 		}
2367 		vals = tokeep;
2368 	      }
2369             }
2370       }
2371     }
2372 
2373     int altSpecIndex = altIndex + 1; // this is the genotype-spec index, ref=0, 1-based for alts
2374 
2375     vector<string> newalt;
2376     map<int, int> alleleIndexMapping;
2377     // setup the new alt string
2378     alleleIndexMapping[0] = 0; // reference allele remains the same
2379     alleleIndexMapping[NULL_ALLELE] = NULL_ALLELE; // null allele remains the same
2380     int i = 1; // current index
2381     int j = 1; // new index
2382     for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a, ++i) {
2383         if (i != altSpecIndex) {
2384             newalt.push_back(*a);
2385             // get the mapping between new and old allele indexes
2386             alleleIndexMapping[i] = j;
2387             ++j;
2388         } else {
2389             alleleIndexMapping[i] = NULL_ALLELE;
2390         }
2391     }
2392 
2393     // fix the sample genotypes, removing reference to the old allele
2394     map<string, int> samplePloidy;
2395     for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
2396         map<string, vector<string> >& sample = s->second;
2397         if (sample.find("GT") != sample.end()) {
2398             string& gt = sample["GT"].front();
2399             string splitter = "/";
2400             if (gt.find("|") != string::npos) {
2401                 splitter = "|";
2402             }
2403 
2404             if (splitter == "/") {
2405                 samplePloidy[s->first] = split(gt, splitter).size();
2406                 map<int, int> genotype = decomposeGenotype(sample["GT"].front());
2407                 map<int, int> newGenotype;
2408                 for (map<int, int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
2409                     newGenotype[alleleIndexMapping[g->first]] += g->second;
2410                 }
2411                 sample["GT"].clear();
2412                 sample["GT"].push_back(genotypeToString(newGenotype));
2413             } else {
2414                 samplePloidy[s->first] = split(gt, splitter).size();
2415                 vector<int> genotype = decomposePhasedGenotype(sample["GT"].front());
2416                 vector<int> newGenotype;
2417                 for (vector<int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
2418                     newGenotype.push_back(alleleIndexMapping[*g]);
2419                 }
2420                 sample["GT"].clear();
2421                 sample["GT"].push_back(phasedGenotypeToString(newGenotype));
2422             }
2423         }
2424     }
2425 
2426     set<int> ploidies;
2427     for (map<string, int>::iterator p = samplePloidy.begin(); p != samplePloidy.end(); ++p) {
2428         ploidies.insert(p->second);
2429     }
2430 
2431     // fix the sample genotype likelihoods, removing reference to the old allele
2432     // which GL fields should we remove?
2433     vector<int> toRemove;
2434     toRemove.push_back(altSpecIndex);
2435     map<int, map<int, int> > glMappingByPloidy;
2436     for (set<int>::iterator p = ploidies.begin(); p != ploidies.end(); ++p) {
2437         glMappingByPloidy[*p] = glReorder(*p, alt.size() + 1, alleleIndexMapping, toRemove);
2438     }
2439 
2440     for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
2441         map<string, vector<string> >& sample = s->second;
2442         map<string, vector<string> >::iterator glsit = sample.find("GL");
2443         if (glsit != sample.end()) {
2444             vector<string>& gls = glsit->second; // should be split already
2445             map<int, string> newgls;
2446             map<int, int>& newOrder = glMappingByPloidy[samplePloidy[s->first]];
2447             int i = 0;
2448             for (vector<string>::iterator g = gls.begin(); g != gls.end(); ++g, ++i) {
2449                 int j = newOrder[i];
2450                 if (j != -1) {
2451                     newgls[i] = *g;
2452                 }
2453             }
2454             // update the gls
2455             gls.clear();
2456             for (map<int, string>::iterator g = newgls.begin(); g != newgls.end(); ++g) {
2457                 gls.push_back(g->second);
2458             }
2459         }
2460     }
2461 
2462     // reset the alt
2463     alt = newalt;
2464 
2465     // and the alleles
2466     alleles.clear();
2467     alleles.push_back(ref);
2468     alleles.insert(alleles.end(), alt.begin(), alt.end());
2469 
2470     updateAlleleIndexes();
2471 
2472 }
2473 
2474 // union of lines in headers of input files
unionInfoHeaderLines(string & s1,string & s2)2475 string unionInfoHeaderLines(string& s1, string& s2) {
2476     vector<string> lines1 = split(s1, "\n");
2477     vector<string> lines2 = split(s2, "\n");
2478     vector<string> result;
2479     set<string> l2;
2480     string lastHeaderLine; // this one needs to be at the end
2481     for (vector<string>::iterator s = lines2.begin(); s != lines2.end(); ++s) {
2482         if (s->substr(0,6) == "##INFO") {
2483             l2.insert(*s);
2484         }
2485     }
2486     for (vector<string>::iterator s = lines1.begin(); s != lines1.end(); ++s) {
2487         if (l2.count(*s)) {
2488             l2.erase(*s);
2489         }
2490         if (s->substr(0,6) == "#CHROM") {
2491             lastHeaderLine = *s;
2492         } else {
2493             result.push_back(*s);
2494         }
2495     }
2496     for (set<string>::iterator s = l2.begin(); s != l2.end(); ++s) {
2497         result.push_back(*s);
2498     }
2499     if (lastHeaderLine.empty()) {
2500         cerr << "could not find CHROM POS ... header line" << endl;
2501         exit(1);
2502     }
2503     result.push_back(lastHeaderLine);
2504     return join(result, "\n");
2505 }
2506 
mergeCigar(const string & c1,const string & c2)2507 string mergeCigar(const string& c1, const string& c2) {
2508     vector<pair<int, string> > cigar1 = splitCigar(c1);
2509     vector<pair<int, string> > cigar2 = splitCigar(c2);
2510     // check if the middle elements are the same
2511     if (cigar1.back().second == cigar2.front().second) {
2512         cigar1.back().first += cigar2.front().first;
2513         cigar2.erase(cigar2.begin());
2514     }
2515     for (vector<pair<int, string> >::iterator c = cigar2.begin(); c != cigar2.end(); ++c) {
2516         cigar1.push_back(*c);
2517     }
2518     return joinCigar(cigar1);
2519 }
2520 
splitCigar(const string & cigarStr)2521 vector<pair<int, string> > splitCigar(const string& cigarStr) {
2522     vector<pair<int, string> > cigar;
2523     string number;
2524     string type;
2525     // strings go [Number][Type] ...
2526     for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
2527         char c = *s;
2528         if (isdigit(c)) {
2529             if (type.empty()) {
2530                 number += c;
2531             } else {
2532                 // signal for next token, push back the last pair, clean up
2533                 cigar.push_back(make_pair(atoi(number.c_str()), type));
2534                 number.clear();
2535                 type.clear();
2536                 number += c;
2537             }
2538         } else {
2539             type += c;
2540         }
2541     }
2542     if (!number.empty() && !type.empty()) {
2543         cigar.push_back(make_pair(atoi(number.c_str()), type));
2544     }
2545     return cigar;
2546 }
2547 
splitCigarList(const string & cigarStr)2548 list<pair<int, string> > splitCigarList(const string& cigarStr) {
2549     list<pair<int, string> > cigar;
2550     string number;
2551     string type;
2552     // strings go [Number][Type] ...
2553     for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
2554         char c = *s;
2555         if (isdigit(c)) {
2556             if (type.empty()) {
2557                 number += c;
2558             } else {
2559                 // signal for next token, push back the last pair, clean up
2560                 cigar.push_back(make_pair(atoi(number.c_str()), type));
2561                 number.clear();
2562                 type.clear();
2563                 number += c;
2564             }
2565         } else {
2566             type += c;
2567         }
2568     }
2569     if (!number.empty() && !type.empty()) {
2570         cigar.push_back(make_pair(atoi(number.c_str()), type));
2571     }
2572     return cigar;
2573 }
2574 
cleanCigar(const vector<pair<int,string>> & cigar)2575 vector<pair<int, string> > cleanCigar(const vector<pair<int, string> >& cigar) {
2576     vector<pair<int, string> > cigarClean;
2577     for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
2578         if (c->first > 0) {
2579             cigarClean.push_back(*c);
2580         }
2581     }
2582     return cigarClean;
2583 }
2584 
joinCigar(const vector<pair<int,string>> & cigar)2585 string joinCigar(const vector<pair<int, string> >& cigar) {
2586     string cigarStr;
2587     for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
2588         if (c->first) {
2589             cigarStr += convert(c->first) + c->second;
2590         }
2591     }
2592     return cigarStr;
2593 }
2594 
joinCigar(const vector<pair<int,char>> & cigar)2595 string joinCigar(const vector<pair<int, char> >& cigar) {
2596     string cigarStr;
2597     for (vector<pair<int, char> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
2598         if (c->first) {
2599             cigarStr += convert(c->first) + string(1, c->second);
2600         }
2601     }
2602     return cigarStr;
2603 }
2604 
joinCigarList(const list<pair<int,string>> & cigar)2605 string joinCigarList(const list<pair<int, string> >& cigar) {
2606     string cigarStr;
2607     for (list<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
2608         cigarStr += convert(c->first) + c->second;
2609     }
2610     return cigarStr;
2611 }
2612 
cigarRefLen(const vector<pair<int,char>> & cigar)2613 int cigarRefLen(const vector<pair<int, char> >& cigar) {
2614     int len = 0;
2615     for (vector<pair<int, char> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
2616         if (c->second == 'M' || c->second == 'D' || c->second == 'X') {
2617             len += c->first;
2618         }
2619     }
2620     return len;
2621 }
2622 
cigarRefLen(const vector<pair<int,string>> & cigar)2623 int cigarRefLen(const vector<pair<int, string> >& cigar) {
2624     int len = 0;
2625     for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
2626         if (c->second == "M" || c->second == "D" || c->second == "X") {
2627             len += c->first;
2628         }
2629     }
2630     return len;
2631 }
2632 
isEmptyCigarElement(const pair<int,string> & elem)2633 bool isEmptyCigarElement(const pair<int, string>& elem) {
2634     return elem.first == 0;
2635 }
2636 
_glorder(int ploidy,int alts)2637 list<list<int> > _glorder(int ploidy, int alts) {
2638     if (ploidy == 1) {
2639         list<list<int> > results;
2640         for (int n = 0; n < alts; ++n) {
2641             list<int> v;
2642             v.push_back(n);
2643             results.push_back(v);
2644         }
2645         return results;
2646     } else {
2647         list<list<int> > results;
2648         for (int n = 0; n < alts; ++n) {
2649             list<list<int> > x = _glorder(ploidy - 1, alts);
2650             for (list<list<int> >::iterator v = x.begin(); v != x.end(); ++v) {
2651                 if (v->front() <= n) {
2652                     v->push_front(n);
2653                     results.push_back(*v);
2654                 }
2655             }
2656         }
2657         return results;
2658     }
2659 }
2660 
2661 // genotype likelihood-ordering of genotypes, where each genotype is a
2662 // list of integers (as written in the GT field)
glorder(int ploidy,int alts)2663 list<list<int> > glorder(int ploidy, int alts) {
2664     list<list<int> > results = _glorder(ploidy, alts);
2665     for (list<list<int> >::iterator v = results.begin(); v != results.end(); ++v) {
2666         v->reverse();
2667     }
2668     return results;
2669 }
2670 
2671 // which genotype likelihoods would include this alternate allele
glsWithAlt(int alt,int ploidy,int numalts)2672 list<int> glsWithAlt(int alt, int ploidy, int numalts) {
2673     list<int> gls;
2674     list<list<int> > orderedGenotypes = glorder(ploidy, numalts);
2675     int i = 0;
2676     for (list<list<int> >::iterator v = orderedGenotypes.begin(); v != orderedGenotypes.end(); ++v, ++i) {
2677         for (list<int>::iterator q = v->begin(); q != v->end(); ++q) {
2678             if (*q == alt) {
2679                 gls.push_back(i);
2680                 break;
2681             }
2682         }
2683     }
2684     return gls;
2685 }
2686 
2687 // describes the mapping between the old gl ordering and and a new
2688 // one in which the GLs including the old alt have been removed
2689 // a map to -1 means "remove"
glReorder(int ploidy,int numalts,map<int,int> & alleleIndexMapping,vector<int> & altsToRemove)2690 map<int, int> glReorder(int ploidy, int numalts, map<int, int>& alleleIndexMapping, vector<int>& altsToRemove) {
2691     map<int, int> mapping;
2692     list<list<int> > orderedGenotypes = glorder(ploidy, numalts);
2693     for (list<list<int> >::iterator v = orderedGenotypes.begin(); v != orderedGenotypes.end(); ++v) {
2694         for (list<int>::iterator n = v->begin(); n != v->end(); ++n) {
2695             *n = alleleIndexMapping[*n];
2696         }
2697     }
2698     list<list<int> > newOrderedGenotypes = glorder(ploidy, numalts - altsToRemove.size());
2699     map<list<int>, int> newOrderedGenotypesMapping;
2700     int i = 0;
2701     // mapping is wrong...
2702     for (list<list<int> >::iterator v = newOrderedGenotypes.begin(); v != newOrderedGenotypes.end(); ++v, ++i) {
2703         newOrderedGenotypesMapping[*v] = i;
2704     }
2705     i = 0;
2706     for (list<list<int> >::iterator v = orderedGenotypes.begin(); v != orderedGenotypes.end(); ++v, ++i) {
2707         map<list<int>, int>::iterator m = newOrderedGenotypesMapping.find(*v);
2708         if (m != newOrderedGenotypesMapping.end()) {
2709             //cout << "new gl order of " << i << " is " << m->second << endl;
2710             mapping[i] = m->second;
2711         } else {
2712             //cout << i << " will be removed" << endl;
2713             mapping[i] = -1;
2714         }
2715     }
2716     return mapping;
2717 }
2718 
getGenotype(const string & sample)2719 string Variant::getGenotype(const string& sample) {
2720     map<string, map<string, vector<string> > >::iterator s = samples.find(sample);
2721     if (s != samples.end()) {
2722         map<string, vector<string> >::iterator f = s->second.find("GT");
2723         if (f != s->second.end()) {
2724             return f->second.front();
2725         }
2726     }
2727     return "";
2728 }
2729 
isPhased(void)2730 bool Variant::isPhased(void) {
2731     for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
2732         map<string, vector<string> >& sample = s->second;
2733         map<string, vector<string> >::iterator g = sample.find("GT");
2734         if (g != sample.end()) {
2735             string gt = g->second.front();
2736             if (gt.size() > 1 && gt.find("|") == string::npos) {
2737                 return false;
2738             }
2739         }
2740     }
2741     return true;
2742 }
2743 
zeroBasedPosition(void) const2744 long Variant::zeroBasedPosition(void) const {
2745     return position - 1;
2746 }
2747 
vrepr(void)2748 string Variant::vrepr(void) {
2749     return sequenceName + "\t" + convert(position) + "\t" + join(alleles, ",");
2750 }
2751 
2752 // TODO
2753 /*
2754 vector<Variant*> Variant::matchingHaplotypes() {
2755 
2756     int haplotypeStart = var.position;
2757     int haplotypeEnd = var.position + var.ref.size();
2758 
2759     for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
2760         haplotypeStart = min((*v)->position, (long int) haplotypeStart);
2761         haplotypeEnd = max((*v)->position + (*v)->ref.size(), (long unsigned int) haplotypeEnd);
2762     }
2763 
2764     // for everything overlapping and the current variant, construct the local haplotype within the bounds
2765     // if there is an exact match, the allele in the current VCF does intersect
2766 
2767     string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart);
2768     map<string, vector<pair<Variant*, int> > > haplotypes; // map to variant and alt index
2769 
2770     for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
2771         Variant& variant = **v;
2772         int altindex = 0;
2773         for (vector<string>::iterator a = variant.alt.begin(); a != variant.alt.end(); ++a, ++altindex) {
2774             string haplotype = referenceHaplotype;
2775             // get the relative start and end coordinates for the variant alternate allele
2776             int relativeStart = variant.position - haplotypeStart;
2777             haplotype.replace(relativeStart, variant.ref.size(), *a);
2778             haplotypes[haplotype].push_back(make_pair(*v, altindex));
2779         }
2780     }
2781 
2782     Variant originalVar = var;
2783 
2784     // determine the non-intersecting alts
2785     vector<string> altsToRemove;
2786     vector<int> altIndexesToRemove;
2787     for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
2788         string haplotype = referenceHaplotype;
2789         int relativeStart = var.position - haplotypeStart;
2790         haplotype.replace(relativeStart, var.ref.size(), *a);
2791         map<string, vector<pair<Variant*, int> > >::iterator h = haplotypes.find(haplotype);
2792         if ((intersecting && !invert && h == haplotypes.end())
2793             || (intersecting && invert && h != haplotypes.end())
2794             || (unioning && h != haplotypes.end())) {
2795             if (tag.empty() && mergeToTag.empty()) {
2796                 altsToRemove.push_back(*a);
2797             } else {
2798                 if (!tag.empty()) {
2799                     var.info[tag].push_back(".");
2800                 }
2801                 if (!mergeToTag.empty()) {
2802                     var.info[mergeToTag].push_back(".");
2803                 }
2804             }
2805         } else {
2806             if (!tag.empty()) {
2807                 var.info[tag].push_back(tagValue);
2808             }
2809             // NB: just take the first value for the mergeFromTag
2810             if (!mergeToTag.empty()) {
2811                 Variant* v = h->second.front().first;
2812                 int index = h->second.front().second;
2813                 if (v->info.find(mergeFromTag) != v->info.end()) {
2814                     // now you have to find the exact allele...
2815                     string& otherValue = v->info[mergeFromTag].at(index);
2816                     var.info[mergeToTag].push_back(otherValue);
2817                 } else if (mergeFromTag == "QUAL") {
2818                     var.info[mergeToTag].push_back(convert(v->quality));
2819                 } else {
2820                     var.info[mergeToTag].push_back(".");
2821                 }
2822             }
2823         }
2824     }
2825 
2826     // remove the non-overlapping (intersecting) or overlapping (unioning) alts
2827     if (intersecting && loci && altsToRemove.size() != var.alt.size()) {
2828         // we have a match in loci mode, so we should output the whole loci, not just the matching sequence
2829     } else {
2830         for (vector<string>::iterator a = altsToRemove.begin(); a != altsToRemove.end(); ++a) {
2831             var.removeAlt(*a);
2832         }
2833     }
2834 
2835     if (unioning) {
2836 
2837         // somehow sort the records and combine them?
2838         map<long int, vector<Variant*> > variants;
2839         for (vector<Variant*>::iterator o = overlapping.begin(); o != overlapping.end(); ++o) {
2840             if ((*o)->position <= var.position && // check ensures proper ordering of variants on output
2841                 outputVariants.find(*o) == outputVariants.end()) {
2842                 outputVariants.insert(*o);
2843                 variants[(*o)->position].push_back(*o);
2844             }
2845         }
2846         // add in the current variant, if it has alts left
2847         if (!var.alt.empty()) {
2848             vector<Variant*>& vars = variants[var.position];
2849             int numalts = 0;
2850             for (vector<Variant*>::iterator v = vars.begin(); v != vars.end(); ++v) {
2851                 numalts += (*v)->alt.size();
2852             }
2853             if (numalts + var.alt.size() == originalVar.alt.size()) {
2854                 variants[var.position].clear();
2855                 variants[var.position].push_back(&originalVar);
2856             } else {
2857                 variants[var.position].push_back(&var);
2858             }
2859         }
2860 
2861         for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) {
2862             for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) {
2863                 cout << **o << endl;
2864                 lastOutputPosition = max(lastOutputPosition, (*o)->position);
2865             }
2866         }
2867     } else {
2868         // if any alts remain, output the variant record
2869         if (!var.alt.empty()) {
2870             cout << var << endl;
2871             lastOutputPosition = max(lastOutputPosition, var.position);
2872         }
2873     }
2874 
2875 }
2876 */
2877 
2878 
VCFHeader()2879     VCFHeader::VCFHeader()
2880     {
2881 
2882         // add manditory fields
2883         this->header_columns.push_back("#CHROM");
2884         this->header_columns.push_back("POS");
2885         this->header_columns.push_back("ID");
2886         this->header_columns.push_back("REF");
2887         this->header_columns.push_back("ALT");
2888         this->header_columns.push_back("QUAL");
2889         this->header_columns.push_back("FILTER");
2890         this->header_columns.push_back("INFO");
2891 
2892         // add the line names in order
2893         // the order is used when outputting as a string
2894         this->header_line_names_ordered.push_back("##fileFormat");
2895         this->header_line_names_ordered.push_back("##fileDate");
2896         this->header_line_names_ordered.push_back("##source");
2897         this->header_line_names_ordered.push_back("##reference");
2898         this->header_line_names_ordered.push_back( "##contig");
2899         this->header_line_names_ordered.push_back("##phasing");
2900         this->header_line_names_ordered.push_back( "##assembly");
2901 
2902         // add the list names in order
2903         // the order is used when outputting as a string (getHeaderString)
2904         this->header_list_names_ordered.push_back("##info");
2905         this->header_list_names_ordered.push_back("##filter");
2906         this->header_list_names_ordered.push_back("##format");
2907         this->header_list_names_ordered.push_back("##alt");
2908         this->header_list_names_ordered.push_back("##sample");
2909         this->header_list_names_ordered.push_back("##pedigree");
2910         this->header_list_names_ordered.push_back("##pedigreedb");
2911 
2912         // initialize the header_lines with the above vector.
2913         // Set the key as the ##_type_ and the value as an empty string
2914         // Empty strings are ignored when outputting as string (getHeaderString)
2915         for (vector<string>::const_iterator header_lines_iter = this->header_line_names_ordered.begin(); header_lines_iter != this->header_line_names_ordered.end(); ++header_lines_iter)
2916         {
2917             this->header_lines[(*header_lines_iter)] = "";
2918         }
2919 
2920         // initialize the header_lines with the above vector.
2921         // Set the key as the ##_type_ and the value as an empty vector<string>
2922         // Empty vectors are ignored when outputting as string (getHeaderString)
2923         for (vector<string>::const_iterator header_lists_iter = this->header_list_names_ordered.begin(); header_lists_iter != this->header_list_names_ordered.end(); ++header_lists_iter)
2924         {
2925             this->header_lists[(*header_lists_iter)] = vector<string>(0);
2926         }
2927 
2928     }
2929 
addMetaInformationLine(const string & meta_line)2930     void VCFHeader::addMetaInformationLine(const string& meta_line)
2931     {
2932         // get the meta_line unique key (first chars before the =)
2933         unsigned int meta_line_index = meta_line.find("=", 0);
2934         string meta_line_prefix = meta_line.substr(0, meta_line_index);
2935 
2936         // check if the meta_line_prefix is in the header_lines, if so add it to the appropirate list
2937         if (this->header_lines.find(meta_line_prefix) != header_lines.end()) // the meta_line is a header line so replace what was there
2938         {
2939             this->header_lines[meta_line_prefix] = meta_line;
2940         }
2941         else if (header_lists.find(meta_line_prefix) != header_lists.end() &&
2942             !metaInfoIdExistsInVector(meta_line, this->header_lists[meta_line_prefix])) // check if the metalineprefix is in the headerLists, if so add it to the appropirate list
2943         {
2944             this->header_lists[meta_line_prefix].push_back(meta_line);
2945         }
2946     }
2947 
getHeaderString()2948     string VCFHeader::getHeaderString()
2949     {
2950         // getHeaderString generates the string each time it is called
2951         string header_string;
2952 
2953         // start by adding the header_lines
2954         for (vector<string>::const_iterator header_lines_iter = this->header_line_names_ordered.begin(); header_lines_iter != this->header_line_names_ordered.end(); ++header_lines_iter)
2955         {
2956             if (this->header_lines[(*header_lines_iter)] != "")
2957             {
2958                 header_string += this->header_lines[(*header_lines_iter)] + "\n";
2959             }
2960         }
2961 
2962         // next add header_lists
2963         for (vector<string>::const_iterator header_lists_iter = this->header_list_names_ordered.begin(); header_lists_iter != this->header_list_names_ordered.end(); ++header_lists_iter)
2964         {
2965             vector<string> tmp_header_lists = this->header_lists[(*header_lists_iter)];
2966             for (vector<string>::const_iterator header_list = tmp_header_lists.begin(); header_list != tmp_header_lists.end(); ++header_list)
2967             {
2968                 header_string += (*header_list) + "\n";
2969             }
2970         }
2971 
2972         // last add header columns
2973         vector<string>::const_iterator last_element = this->header_columns.end() - 1;
2974         for (vector<string>::const_iterator header_column_iter = this->header_columns.begin(); header_column_iter != this->header_columns.end(); ++header_column_iter)
2975         {
2976             string delimiter = (header_column_iter == last_element) ? "\n" : "\t";
2977             header_string += (*header_column_iter) + delimiter;
2978         }
2979         return header_string;
2980     }
2981 
metaInfoIdExistsInVector(const string & meta_line,vector<string> & meta_lines)2982     bool VCFHeader::metaInfoIdExistsInVector(const string& meta_line, vector<string>& meta_lines)
2983     {
2984         // extract the id from meta_line
2985         size_t meta_line_id_start_idx = meta_line.find("ID=", 0); // used for the start of the substring index
2986         size_t meta_line_id_end_idx = meta_line.find(",", meta_line_id_start_idx); // used for end of the substring index
2987         string meta_line_id = (meta_line_id_start_idx < meta_line_id_end_idx) ? meta_line.substr(meta_line_id_start_idx, meta_line_id_end_idx - meta_line_id_start_idx) : "";
2988 
2989         for (vector<string>::const_iterator iter = meta_lines.begin(); iter != meta_lines.end(); ++iter)
2990         {
2991             // extract the id from iter's meta_line string
2992             size_t iter_meta_line_id_start_idx = (*iter).find("ID=", 0);
2993             size_t iter_meta_line_id_end_idx = (*iter).find(",", iter_meta_line_id_start_idx);
2994             string iter_meta_line_id = (iter_meta_line_id_start_idx < iter_meta_line_id_end_idx) ? (*iter).substr(iter_meta_line_id_start_idx, iter_meta_line_id_end_idx - iter_meta_line_id_start_idx) : "";
2995             // compare the meta_line_id with the iter_meta_line_id
2996             if (strcasecmp(meta_line_id.c_str(), iter_meta_line_id.c_str()) == 0)
2997             {
2998                 return true;
2999             }
3000         }
3001         return false;
3002     }
3003 
addHeaderColumn(const string & header_column)3004     void VCFHeader::addHeaderColumn(const string& header_column)
3005     {
3006         // don't add duplicates
3007         //  vector<string>::iterator test = find(this->header_columns.begin(), this->header_columns.end(), header_column);
3008         if (find(this->header_columns.begin(), this->header_columns.end(), header_column) == this->header_columns.end())
3009         {
3010             this->header_columns.push_back(header_column);
3011         }
3012     }
3013 
3014 } // end namespace vcf
3015