1 #include "Allele.h"
2 #include "multichoose.h"
3 #include "TryCatch.h"
4 
5 
referenceOffset(void) const6 int Allele::referenceOffset(void) const {
7     /*cout << readID << " offset checked " << referencePosition - position << " against position " << position
8         << " allele length " << length << " str length " << referenceSequence.size() << " qstr size " << qualityString.size() << endl;
9         */
10     return *currentReferencePosition - position;
11 }
12 
setQuality(void)13 void Allele::setQuality(void) {
14     quality = currentQuality();
15     lnquality = phred2ln(quality);
16 }
17 
bpLeft(void)18 int Allele::bpLeft(void) {
19     return position - alignmentStart;
20 }
21 
bpRight(void)22 int Allele::bpRight(void) {
23     return alignmentEnd - (position + referenceLength);
24 }
25 
26 // called prior to using the allele in analysis
27 // called again when haplotype alleles are built, in which case the "currentBase" is set to the alternate sequence of the allele
update(int haplotypeLength)28 void Allele::update(int haplotypeLength) {
29     if (haplotypeLength == 1) {
30         if (type == ALLELE_REFERENCE) {
31             currentBase = string(1, *currentReferenceBase);
32         } else {
33             currentBase = base();
34         }
35     } else {
36         currentBase = base();
37     }
38     // should be done after setting currentBase to haplotypeLength
39     if (isReference()) setQuality();
40     basesLeft = bpLeft();
41     basesRight = bpRight();
42 }
43 
44 // quality of subsequence of allele
subquality(int startpos,int len) const45 const int Allele::subquality(int startpos, int len) const {
46     int start = startpos - position;
47     int sum = 0;
48     for (int i = start; i < len; ++i) {
49         sum += baseQualities.at(i);
50     }
51     return sum;
52 }
53 
54 // quality of subsequence of allele
lnsubquality(int startpos,int len) const55 const long double Allele::lnsubquality(int startpos, int len) const {
56     return phred2ln(subquality(startpos, len));
57 }
58 
subquality(const Allele & a) const59 const int Allele::subquality(const Allele &a) const {
60     int sum = 0;
61     int rp = a.position - position;
62     int l = a.length;
63     int L = l;
64     int spanstart = 0;
65     int spanend = 1;
66     //int l = min((int) a.length, (int) baseQualities.size() - start);
67     if (a.type == ALLELE_INSERTION) {
68         L = l + 2;
69         if (L > baseQualities.size()) {
70             L = baseQualities.size();
71             spanstart = 0;
72         } else {
73             // set lower bound to 0
74             if (rp < (L / 2)) {
75                 spanstart = 0;
76             } else {
77                 spanstart = rp - (L / 2);
78             }
79             // set upper bound to the string length
80             if (spanstart + L > baseQualities.size()) {
81                 spanstart = baseQualities.size() - L;
82             }
83         }
84         //string qualstr = baseQualities.substr(spanstart, L);
85         spanend = spanstart + L;
86     } else if (a.type == ALLELE_DELETION) {
87         L = l + 2;
88         if (L > baseQualities.size()) {
89             L = baseQualities.size();
90             spanstart = 0;
91         } else {
92             // set lower bound to 0
93             if (rp < 1) {
94                 spanstart = 0;
95             } else {
96                 spanstart = rp - 1;
97             }
98             // set upper bound to the string length
99             if (spanstart + L > baseQualities.size()) {
100                 spanstart = baseQualities.size() - L;
101             }
102         }
103         spanend = spanstart + L;
104     } else if (a.type == ALLELE_MNP) {
105         L = l;
106         if (L > baseQualities.size()) {
107             L = baseQualities.size();
108             spanstart = 0;
109         } else {
110             if (rp < 1) {
111                 spanstart = 0;
112             } else {
113                 spanstart = rp;
114             }
115             // impossible
116             if (spanstart + L > baseQualities.size()) {
117                 spanstart = baseQualities.size() - L;
118             }
119         }
120         spanend = spanstart + L;
121     }
122     // TODO ALLELE_COMPLEX
123     for (int i = spanstart; i < spanend; ++i) {
124         sum += baseQualities.at(i);
125     }
126     return sum * (l / L);
127 }
128 
lnsubquality(const Allele & a) const129 const long double Allele::lnsubquality(const Allele& a) const {
130     return phred2ln(subquality(a));
131 }
132 
updateAllelesCachedData(vector<Allele * > & alleles)133 void updateAllelesCachedData(vector<Allele*>& alleles) {
134     for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
135         (*a)->update();
136     }
137 }
138 
139 /*
140   const int Allele::basesLeft(void) const {
141   if (type == ALLELE_REFERENCE) {
142   return bpLeft + referenceOffset();
143   } else {
144   return bpLeft;
145   }
146   }
147 
148   const int Allele::basesRight(void) const {
149   if (type == ALLELE_REFERENCE) {
150   return bpRight - referenceOffset();
151   } else {
152   return bpRight;
153   }
154   }
155 */
156 
157 // quality at a given reference position
currentQuality(void) const158 const short Allele::currentQuality(void) const {
159     //cerr << readID << " " << position << "-" << position + length << " " << alternateSequence.size() << " vs " << baseQualities.size() << endl;
160     switch (this->type) {
161         case ALLELE_REFERENCE:
162             // should check a different way... this is wrong
163             // it will catch it all the time,
164             if (currentBase.size() > 1) {
165                 return averageQuality(baseQualities);
166             } else {
167                 int off = referenceOffset();
168                 if (off < 0 || off > baseQualities.size()) {
169                     return 0;
170                 } else {
171                     return baseQualities.at(off);
172                 }
173             }
174             break;
175         case ALLELE_INSERTION:
176         case ALLELE_DELETION:
177         case ALLELE_SNP:
178         case ALLELE_MNP:
179         case ALLELE_COMPLEX:
180             return quality;
181             break;
182     }
183     return 0;
184 }
185 
lncurrentQuality(void) const186 const long double Allele::lncurrentQuality(void) const {
187     return phred2ln(currentQuality());
188 }
189 
typeStr(void) const190 string Allele::typeStr(void) const {
191 
192     string t;
193 
194     switch (this->type) {
195         case ALLELE_GENOTYPE:
196             t = "genotype";
197             break;
198         case ALLELE_REFERENCE:
199             t = "reference";
200             break;
201         case ALLELE_MNP:
202             t = "mnp";
203             break;
204         case ALLELE_SNP:
205             t = "snp";
206             break;
207         case ALLELE_INSERTION:
208             t = "insertion";
209             break;
210         case ALLELE_DELETION:
211             t = "deletion";
212             break;
213         case ALLELE_COMPLEX:
214             t = "complex";
215             break;
216         case ALLELE_NULL:
217             t = "null";
218             break;
219         default:
220             t = "unknown";
221             break;
222     }
223 
224     return t;
225 
226 }
227 
isReference(void) const228 bool Allele::isReference(void) const {
229     return type == ALLELE_REFERENCE;
230 }
231 
isSNP(void) const232 bool Allele::isSNP(void) const {
233     return type == ALLELE_SNP;
234 }
235 
isInsertion(void) const236 bool Allele::isInsertion(void) const {
237     return type == ALLELE_INSERTION;
238 }
239 
isDeletion(void) const240 bool Allele::isDeletion(void) const {
241     return type == ALLELE_DELETION;
242 }
243 
isMNP(void) const244 bool Allele::isMNP(void) const {
245     return type == ALLELE_MNP;
246 }
247 
isComplex(void) const248 bool Allele::isComplex(void) const {
249     return type == ALLELE_COMPLEX;
250 }
251 
isNull(void) const252 bool Allele::isNull(void) const {
253     return type == ALLELE_NULL;
254 }
255 
base(void) const256 const string Allele::base(void) const { // the base of this allele
257 
258     switch (this->type) {
259     case ALLELE_REFERENCE:
260         if (genotypeAllele)
261             return alternateSequence;
262         else
263             return currentBase;
264         break;
265     case ALLELE_GENOTYPE:
266         return alternateSequence;
267         break;
268         /*
269     case ALLELE_GENOTYPE:
270         return alternateSequence; // todo fix
271         break;
272     case ALLELE_REFERENCE:
273         return "R:" + convert(position) + ":" + cigar + ":" + alternateSequence;
274         break;
275         */
276     case ALLELE_SNP:
277         return "S:" + convert(position) + ":" + cigar + ":" + alternateSequence;
278         break;
279     case ALLELE_MNP:
280         return "M:" + convert(position) + ":" + cigar + ":" + alternateSequence;
281         break;
282     case ALLELE_INSERTION:
283         return "I:" + convert(position) + ":" + cigar + ":" + alternateSequence;
284         break;
285     case ALLELE_DELETION:
286         return "D:" + convert(position) + ":" + cigar;
287         break;
288     case ALLELE_COMPLEX:
289         return "C:" + convert(position) + ":" + cigar + ":" + alternateSequence;
290         break;
291     case ALLELE_NULL:
292         return "N:" + convert(position) + ":" + alternateSequence;
293     default:
294         return "";
295         break;
296     }
297 }
298 
stringForAllele(const Allele & allele)299 string stringForAllele(const Allele &allele) {
300 
301     stringstream out;
302     if (!allele.genotypeAllele) {
303         out.precision(1);
304         out
305             << allele.sampleID << ":"
306             << allele.readID << ":"
307             << allele.typeStr() << ":"
308             << allele.cigar << ":"
309             << scientific << fixed << allele.position << ":"
310             << allele.length << ":"
311             << (allele.strand == STRAND_FORWARD ? "+" : "-") << ":"
312             << allele.referenceSequence << ":"
313             << allele.alternateSequence << ":"
314             << allele.quality << ":"
315             << allele.basesLeft << ":"
316             << allele.basesRight;
317     } else {
318         out << allele.typeStr() << ":"
319             << allele.cigar << ":"
320             << scientific << fixed << allele.position << ":"
321             << allele.length << ":"
322             << allele.alternateSequence;
323     }
324 
325     return out.str();
326 }
327 
stringForAlleles(vector<Allele> & alleles)328 string stringForAlleles(vector<Allele> &alleles) {
329     stringstream out;
330     for (vector<Allele>::iterator allele = alleles.begin(); allele != alleles.end(); allele++) {
331         out << stringForAllele(*allele) << endl;
332     }
333     return out.str();
334 }
335 
tojson(vector<Allele * > & alleles)336 string tojson(vector<Allele*> &alleles) {
337     stringstream out;
338     vector<Allele*>::iterator a = alleles.begin();
339     out << "[" << (*a)->tojson(); ++a;
340     for (; a != alleles.end(); ++a)
341         out << "," << (*a)->tojson();
342     out << "]";
343     return out.str();
344 }
345 
tojson(Allele * & allele)346 string tojson(Allele*& allele) { return allele->tojson(); }
347 
tojson(void)348 string Allele::tojson(void) {
349     stringstream out;
350     if (!genotypeAllele) {
351         out << "{\"id\":\"" << readID << "\""
352             << ",\"type\":\"" << typeStr() << "\""
353             << ",\"length\":" << ((type == ALLELE_REFERENCE) ? 1 : length)
354             << ",\"position\":" << position
355             << ",\"strand\":\"" << (strand == STRAND_FORWARD ? "+" : "-") << "\"";
356         if (type == ALLELE_REFERENCE ) {
357             out << ",\"base\":\"" << alternateSequence.at(referenceOffset()) << "\""
358                 //<< ",\"reference\":\"" << allele.referenceSequence.at(referenceOffset) << "\""
359                 << ",\"quality\":" << currentQuality();
360         } else {
361             out << ",\"base\":\"" << alternateSequence << "\""
362                 //<< ",\"reference\":\"" << allele.referenceSequence << "\""
363                 << ",\"quality\":" << quality;
364         }
365         out << "}";
366 
367     } else {
368         out << "{\"type\":\"" << typeStr() << "\"";
369         switch (type) {
370             case ALLELE_REFERENCE:
371                 out << "}";
372                 break;
373             default:
374                 out << "\",\"length\":" << length
375                     << ",\"alt\":\"" << alternateSequence << "\"}";
376                 break;
377         }
378     }
379     return out.str();
380 }
381 
operator <<(ostream & out,vector<Allele * > & alleles)382 ostream &operator<<(ostream &out, vector<Allele*> &alleles) {
383     vector<Allele*>::iterator a = alleles.begin();
384     out << **a++;
385     while (a != alleles.end())
386         out << "|" << **a++;
387     return out;
388 }
389 
operator <<(ostream & out,vector<Allele> & alleles)390 ostream &operator<<(ostream &out, vector<Allele> &alleles) {
391     int i = 0;
392     for (auto& allele : alleles) {
393         out << (i++ ? "|" : "") << allele;
394     }
395     return out;
396 }
397 
operator <<(ostream & out,list<Allele * > & alleles)398 ostream &operator<<(ostream &out, list<Allele*> &alleles) {
399     int i = 0;
400     for (auto& allele : alleles) {
401         out << (i++ ? "|" : "") << allele;
402     }
403     return out;
404 }
405 
operator <<(ostream & out,Allele * & allele)406 ostream &operator<<(ostream &out, Allele* &allele) {
407     out << *allele;
408     return out;
409 }
410 
operator <<(ostream & out,Allele & allele)411 ostream &operator<<(ostream &out, Allele &allele) {
412 
413     if (!allele.genotypeAllele) {
414         int prec = out.precision();
415         // << &allele << ":"
416         out.precision(1);
417         out << allele.sampleID
418             << ":" << allele.readID
419             << ":" << allele.typeStr()
420             << ":" << allele.length
421             << ":" << allele.referenceLength
422             << ":" << scientific << fixed << allele.position
423             << ":" << (allele.strand == STRAND_FORWARD ? "+" : "-")
424             << ":" << allele.alternateSequence
425             //<< ":" << allele.referenceSequence
426             << ":" << allele.repeatRightBoundary
427             << ":" << allele.cigar
428             << ":" << allele.lnmapQuality
429             << ":" << allele.lnquality;
430         out.precision(prec);
431     } else {
432         out << allele.typeStr()
433             << ":" << allele.cigar
434             << ":" << scientific << fixed << allele.position
435             << ":" << allele.length
436             << ":" << (string) allele.alternateSequence;
437     }
438     out.precision(5);
439     return out;
440 }
441 
operator <(const Allele & a,const Allele & b)442 bool operator<(const Allele &a, const Allele &b) {
443     //cerr << "allele<" << endl;
444     return a.currentBase < b.currentBase;
445 }
446 
447 // TODO fixme??
448 // alleles are equal if they represent the same reference-relative variation or
449 // sequence, which we encode as a string and compare here
operator ==(const Allele & a,const Allele & b)450 bool operator==(const Allele &a, const Allele &b) {
451     //cerr << "allele==" << endl;
452     return a.currentBase == b.currentBase;
453 }
454 
operator !=(const Allele & a,const Allele & b)455 bool operator!=(const Allele& a, const Allele& b) {
456     return ! (a == b);
457 }
458 
equivalent(Allele & b)459 bool Allele::equivalent(Allele &b) {
460 
461     if (type != b.type) {
462         return false;
463     } else {
464         switch (type) {
465             case ALLELE_REFERENCE:
466                 // reference alleles are, by definition, always equivalent
467                 return true;
468                 break;
469             case ALLELE_SNP:
470             case ALLELE_MNP:
471                 if (alternateSequence == b.alternateSequence)
472                     return true;
473                 break;
474             case ALLELE_DELETION:
475                 if (length == b.length)
476                     return true;
477                 break;
478             case ALLELE_INSERTION:
479                 if (length == b.length
480                     && alternateSequence == b.alternateSequence)
481                     return true;
482                 break;
483             case ALLELE_COMPLEX:
484                 if (length == b.length
485                     && alternateSequence == b.alternateSequence
486                     && cigar == b.cigar)
487                     return true;
488                 break;
489             case ALLELE_NULL:
490                 return alternateSequence == b.alternateSequence;
491             default:
492                 break;
493         }
494     }
495 
496     return false;
497 }
498 
areHomozygous(vector<Allele * > & alleles)499 bool areHomozygous(vector<Allele*>& alleles) {
500     Allele* prev = alleles.front();
501     for (vector<Allele*>::iterator allele = alleles.begin() + 1; allele != alleles.end(); ++allele) {
502         if (**allele != *prev) {
503             return false;
504         }
505     }
506     return true;
507 }
508 
509 // counts alleles which satisfy operator==
countAlleles(vector<Allele * > & alleles)510 map<Allele, int> countAlleles(vector<Allele*>& alleles) {
511     map<Allele, int> counts;
512     for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
513         Allele& allele = **a;
514         map<Allele, int>::iterator f = counts.find(allele);
515         if (f == counts.end()) {
516             counts[allele] = 1;
517         } else {
518             counts[allele] += 1;
519         }
520     }
521     return counts;
522 }
523 
524 // counts alleles which satisfy operator==
countAllelesString(vector<Allele * > & alleles)525 map<string, int> countAllelesString(vector<Allele*>& alleles) {
526     map<string, int> counts;
527     for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
528         Allele& thisAllele = **a;
529         const string& allele = thisAllele.currentBase;
530         map<string, int>::iterator f = counts.find(allele);
531         if (f == counts.end()) {
532             counts[allele] = 1;
533         } else {
534             counts[allele] += 1;
535         }
536     }
537     return counts;
538 }
539 
countAllelesString(vector<Allele> & alleles)540 map<string, int> countAllelesString(vector<Allele>& alleles) {
541     map<string, int> counts;
542     for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
543         Allele& thisAllele = *a;
544         const string& allele = thisAllele.currentBase;
545         map<string, int>::iterator f = counts.find(allele);
546         if (f == counts.end()) {
547             counts[allele] = 1;
548         } else {
549             counts[allele] += 1;
550         }
551     }
552     return counts;
553 }
554 
555 
countAlleles(vector<Allele> & alleles)556 map<Allele, int> countAlleles(vector<Allele>& alleles) {
557     map<Allele, int> counts;
558     for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
559         Allele& allele = *a;
560         map<Allele, int>::iterator f = counts.find(allele);
561         if (f == counts.end()) {
562             counts[allele] = 1;
563         } else {
564             counts[allele] += 1;
565         }
566     }
567     return counts;
568 }
569 
countAlleles(list<Allele * > & alleles)570 map<Allele, int> countAlleles(list<Allele*>& alleles) {
571     map<Allele, int> counts;
572     for (list<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
573         Allele& allele = **a;
574         map<Allele, int>::iterator f = counts.find(allele);
575         if (f == counts.end()) {
576             counts[allele] = 1;
577         } else {
578             counts[allele] += 1;
579         }
580     }
581     return counts;
582 }
583 
584 
groupAllelesBySample(list<Allele * > & alleles)585 map<string, vector<Allele*> > groupAllelesBySample(list<Allele*>& alleles) {
586     map<string, vector<Allele*> > groups;
587     for (list<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
588         Allele*& allele = *a;
589         groups[allele->sampleID].push_back(allele);
590     }
591     return groups;
592 }
593 
uniqueAlleles(list<Allele * > & alleles)594 vector<Allele> uniqueAlleles(list<Allele*>& alleles) {
595     vector<Allele> uniques;
596     map<Allele, int> counts = countAlleles(alleles);
597     for (map<Allele, int>::iterator c = counts.begin(); c != counts.end(); ++c) {
598         uniques.push_back(c->first);
599     }
600     return uniques;
601 }
602 
groupAllelesBySample(list<Allele * > & alleles,map<string,vector<Allele * >> & groups)603 void groupAllelesBySample(list<Allele*>& alleles, map<string, vector<Allele*> >& groups) {
604     for (list<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
605         Allele*& allele = *a;
606         groups[allele->sampleID].push_back(allele);
607     }
608 }
609 
610 // in haplotype calling, if alleles have the same alternate sequence, the
611 // should have the same cigar and position.  this function picks the most
612 // common allele observation per alternate sequence and homogenizes the rest to
613 // the same if they are not reference alleles
homogenizeAlleles(map<string,vector<Allele * >> & alleleGroups,string & refseq,Allele & refallele)614 void homogenizeAlleles(map<string, vector<Allele*> >& alleleGroups, string& refseq, Allele& refallele) {
615     map<string, map<string, int> > equivs;
616     map<string, Allele*> homogenizeTo;
617     // find equivalencies between alleles
618     // base equivalency is self
619     for (map<string, vector<Allele*> >::iterator g = alleleGroups.begin(); g != alleleGroups.end(); ++g) {
620         Allele& allele = *g->second.front();
621         if (allele.isReference()) {
622             continue;
623         }
624         equivs[allele.alternateSequence][g->first]++;
625     }
626     //
627     for (map<string, map<string, int> >::iterator e = equivs.begin(); e != equivs.end(); ++e) {
628         string altseq = e->first;
629         map<string, int>& group = e->second;
630         map<int, string> ordered;
631         for (map<string, int>::iterator f = group.begin(); f != group.end(); ++f) {
632             // pick the best by count
633             ordered[f->second] = f->first;
634         }
635 
636         // choose the most common group
637         string& altbase = ordered.rbegin()->second;
638         if (altseq == refseq) {
639             homogenizeTo[altseq] = &refallele;
640         } else {
641             homogenizeTo[altseq] = alleleGroups[altbase].front();
642         }
643     }
644     for (map<string, vector<Allele*> >::iterator g = alleleGroups.begin(); g != alleleGroups.end(); ++g) {
645         vector<Allele*>& alleles = g->second;
646         if (alleles.front()->isReference()) {
647             continue;
648         }
649         string& altseq = alleles.front()->alternateSequence;
650         Allele* toallele = homogenizeTo[altseq];
651         string& cigar = toallele->cigar;
652         AlleleType type = toallele->type;
653         long int position = toallele->position;
654         for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
655             (*a)->cigar = cigar;
656             (*a)->type = type;
657             (*a)->position = position;
658             (*a)->update();
659         }
660     }
661 }
662 
resetProcessedFlag(map<string,vector<Allele * >> & alleleGroups)663 void resetProcessedFlag(map<string, vector<Allele*> >& alleleGroups) {
664     for (map<string, vector<Allele*> >::iterator g = alleleGroups.begin(); g != alleleGroups.end(); ++g) {
665         for (vector<Allele*>::iterator a = g->second.begin(); a != g->second.end(); ++a) {
666             (*a)->processed = false;
667         }
668     }
669 }
670 
groupAlleles(map<string,vector<Allele * >> & sampleGroups,map<string,vector<Allele * >> & alleleGroups)671 void groupAlleles(map<string, vector<Allele*> >& sampleGroups, map<string, vector<Allele*> >& alleleGroups) {
672     for (map<string, vector<Allele*> >::iterator sample = sampleGroups.begin(); sample != sampleGroups.end(); ++sample) {
673         for (vector<Allele*>::iterator allele = sample->second.begin(); allele != sample->second.end(); ++allele) {
674             alleleGroups[(*allele)->currentBase].push_back(*allele);
675         }
676     }
677 }
678 
groupAlleles(map<string,vector<Allele * >> & sampleGroups,bool (* fncompare)(Allele & a,Allele & b))679 vector<vector<Allele*> > groupAlleles(map<string, vector<Allele*> > &sampleGroups, bool (*fncompare)(Allele &a, Allele &b)) {
680     vector<vector<Allele*> > groups;
681     for (map<string, vector<Allele*> >::iterator sg = sampleGroups.begin(); sg != sampleGroups.end(); ++sg) {
682         vector<Allele*>& alleles = sg->second;
683         for (vector<Allele*>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
684             bool unique = true;
685             for (vector<vector<Allele*> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
686                 if ((*fncompare)(**oa, *ag->front())) {
687                     ag->push_back(*oa);
688                     unique = false;
689                     break;
690                 }
691             }
692             if (unique) {
693                 vector<Allele*> trueAlleleGroup;
694                 trueAlleleGroup.push_back(*oa);
695                 groups.push_back(trueAlleleGroup);
696             }
697         }
698     }
699     return groups;
700 }
701 
groupAlleles(list<Allele * > & alleles,bool (* fncompare)(Allele * & a,Allele * & b))702 vector<vector<Allele*> >  groupAlleles(list<Allele*> &alleles, bool (*fncompare)(Allele* &a, Allele* &b)) {
703     vector<vector<Allele*> > groups;
704     for (list<Allele*>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
705         bool unique = true;
706         for (vector<vector<Allele*> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
707             if ((*fncompare)(*oa, ag->front())) {
708                 ag->push_back(*oa);
709                 unique = false;
710                 break;
711             }
712         }
713         if (unique) {
714             vector<Allele*> trueAlleleGroup;
715             trueAlleleGroup.push_back(*oa);
716             groups.push_back(trueAlleleGroup);
717         }
718     }
719     return groups;
720 }
721 
groupAlleles(list<Allele> & alleles,bool (* fncompare)(Allele & a,Allele & b))722 vector<vector<Allele*> >  groupAlleles(list<Allele> &alleles, bool (*fncompare)(Allele &a, Allele &b)) {
723     vector<vector<Allele*> > groups;
724     for (list<Allele>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
725         bool unique = true;
726         for (vector<vector<Allele*> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
727             if ((*fncompare)(*oa, *ag->front())) {
728                 ag->push_back(&*oa);
729                 unique = false;
730                 break;
731             }
732         }
733         if (unique) {
734             vector<Allele*> trueAlleleGroup;
735             trueAlleleGroup.push_back(&*oa);
736             groups.push_back(trueAlleleGroup);
737         }
738     }
739     return groups;
740 }
741 
groupAlleles(vector<Allele * > & alleles,bool (* fncompare)(Allele & a,Allele & b))742 vector<vector<Allele*> >  groupAlleles(vector<Allele*> &alleles, bool (*fncompare)(Allele &a, Allele &b)) {
743     vector<vector<Allele*> > groups;
744     for (vector<Allele*>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
745         bool unique = true;
746         for (vector<vector<Allele*> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
747             if ((*fncompare)(**oa, *ag->front())) {
748                 ag->push_back(*oa);
749                 unique = false;
750                 break;
751             }
752         }
753         if (unique) {
754             vector<Allele*> trueAlleleGroup;
755             trueAlleleGroup.push_back(*oa);
756             groups.push_back(trueAlleleGroup);
757         }
758     }
759     return groups;
760 }
761 
groupAlleles(vector<Allele> & alleles,bool (* fncompare)(Allele & a,Allele & b))762 vector<vector<Allele*> >  groupAlleles(vector<Allele> &alleles, bool (*fncompare)(Allele &a, Allele &b)) {
763     vector<vector<Allele*> > groups;
764     for (vector<Allele>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
765         bool unique = true;
766         for (vector<vector<Allele*> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
767             if ((*fncompare)(*oa, *ag->front())) {
768                 ag->push_back(&*oa);
769                 unique = false;
770                 break;
771             }
772         }
773         if (unique) {
774             vector<Allele*> trueAlleleGroup;
775             trueAlleleGroup.push_back(&*oa);
776             groups.push_back(trueAlleleGroup);
777         }
778     }
779     return groups;
780 }
781 
groupAlleles_copy(list<Allele> & alleles,bool (* fncompare)(Allele & a,Allele & b))782 vector<vector<Allele> >  groupAlleles_copy(list<Allele> &alleles, bool (*fncompare)(Allele &a, Allele &b)) {
783     vector<vector<Allele> > groups;
784     for (list<Allele>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
785         bool unique = true;
786         for (vector<vector<Allele> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
787             if ((*fncompare)(*oa, ag->front())) {
788                 ag->push_back(*oa);
789                 unique = false;
790                 break;
791             }
792         }
793         if (unique) {
794             vector<Allele> trueAlleleGroup;
795             trueAlleleGroup.push_back(*oa);
796             groups.push_back(trueAlleleGroup);
797         }
798     }
799     return groups;
800 }
801 
groupAlleles_copy(vector<Allele> & alleles,bool (* fncompare)(Allele & a,Allele & b))802 vector<vector<Allele> >  groupAlleles_copy(vector<Allele> &alleles, bool (*fncompare)(Allele &a, Allele &b)) {
803     vector<vector<Allele> > groups;
804     for (vector<Allele>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
805         bool unique = true;
806         for (vector<vector<Allele> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
807             if ((*fncompare)(*oa, ag->front())) {
808                 ag->push_back(*oa);
809                 unique = false;
810                 break;
811             }
812         }
813         if (unique) {
814             vector<Allele> trueAlleleGroup;
815             trueAlleleGroup.push_back(*oa);
816             groups.push_back(trueAlleleGroup);
817         }
818     }
819     return groups;
820 }
821 
groupAlleles_copy(vector<Allele> & alleles)822 vector<vector<Allele> >  groupAlleles_copy(vector<Allele> &alleles) {
823     vector<vector<Allele> > groups;
824     for (vector<Allele>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
825         bool unique = true;
826         for (vector<vector<Allele> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
827             if (*oa == ag->front()) {
828                 ag->push_back(*oa);
829                 unique = false;
830                 break;
831             }
832         }
833         if (unique) {
834             vector<Allele> trueAlleleGroup;
835             trueAlleleGroup.push_back(*oa);
836             groups.push_back(trueAlleleGroup);
837         }
838     }
839     return groups;
840 }
841 
sameSample(Allele & other)842 bool Allele::sameSample(Allele &other) { return this->sampleID == other.sampleID; }
843 
allelesSameType(Allele * & a,Allele * & b)844 bool allelesSameType(Allele* &a, Allele* &b) { return a->type == b->type; }
845 
allelesEquivalent(Allele * & a,Allele * & b)846 bool allelesEquivalent(Allele* &a, Allele* &b) { return a->equivalent(*b); }
847 
allelesSameSample(Allele * & a,Allele * & b)848 bool allelesSameSample(Allele* &a, Allele* &b) { return a->sampleID == b->sampleID; }
849 
allelesSameType(Allele & a,Allele & b)850 bool allelesSameType(Allele &a, Allele &b) { return a.type == b.type; }
851 
allelesEquivalent(Allele & a,Allele & b)852 bool allelesEquivalent(Allele &a, Allele &b) { return a.equivalent(b); }
853 
allelesSameSample(Allele & a,Allele & b)854 bool allelesSameSample(Allele &a, Allele &b) { return a.sampleID == b.sampleID; }
855 
allelesEqual(Allele & a,Allele & b)856 bool allelesEqual(Allele &a, Allele &b) { return a == b; }
857 
genotypeAllelesFromAlleleGroups(vector<vector<Allele * >> & groups)858 vector<Allele> genotypeAllelesFromAlleleGroups(vector<vector<Allele*> > &groups) {
859 
860     vector<Allele> results;
861     for (vector<vector<Allele*> >::iterator g = groups.begin(); g != groups.end(); ++g)
862         results.push_back(genotypeAllele(*g->front()));
863     return results;
864 
865 }
866 
genotypeAllelesFromAlleles(vector<Allele * > & alleles)867 vector<Allele> genotypeAllelesFromAlleles(vector<Allele*> &alleles) {
868 
869     vector<Allele> results;
870     for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a)
871         results.push_back(genotypeAllele(**a));
872     return results;
873 
874 }
875 
genotypeAllelesFromAlleleGroups(vector<vector<Allele>> & groups)876 vector<Allele> genotypeAllelesFromAlleleGroups(vector<vector<Allele> > &groups) {
877 
878     vector<Allele> results;
879     for (vector<vector<Allele> >::iterator g = groups.begin(); g != groups.end(); ++g)
880         results.push_back(genotypeAllele(g->front()));
881     return results;
882 
883 }
884 
genotypeAllelesFromAlleles(vector<Allele> & alleles)885 vector<Allele> genotypeAllelesFromAlleles(vector<Allele> &alleles) {
886 
887     vector<Allele> results;
888     for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a)
889         results.push_back(genotypeAllele(*a));
890     return results;
891 
892 }
893 
genotypeAllele(Allele & a)894 Allele genotypeAllele(Allele &a) {
895     return Allele(a.type, a.alternateSequence, a.length, a.referenceLength, a.cigar, a.position, a.repeatRightBoundary);
896 }
897 
genotypeAllele(AlleleType type,string alt,unsigned int len,string cigar,unsigned int reflen,long int pos,long int rrbound)898 Allele genotypeAllele(AlleleType type, string alt, unsigned int len, string cigar, unsigned int reflen, long int pos, long int rrbound) {
899     return Allele(type, alt, len, reflen, cigar, pos, rrbound);
900 }
901 
allowedAlleleTypes(vector<AlleleType> & allowedEnumeratedTypes)902 int allowedAlleleTypes(vector<AlleleType>& allowedEnumeratedTypes) {
903     int allowedTypes = 0;// (numberOfPossibleAlleleTypes, false);
904     for (vector<AlleleType>::iterator t = allowedEnumeratedTypes.begin(); t != allowedEnumeratedTypes.end(); ++t) {
905         allowedTypes |= *t;
906     }
907     return allowedTypes;
908 }
909 
filterAlleles(list<Allele * > & alleles,int allowedTypes)910 void filterAlleles(list<Allele*>& alleles, int allowedTypes) {
911 
912     for (list<Allele*>::iterator allele = alleles.begin(); allele != alleles.end(); ++allele) {
913         bool allowed = false;
914         if (!(allowedTypes & (*allele)->type))
915             *allele = NULL;
916     }
917     alleles.erase(remove(alleles.begin(), alleles.end(), (Allele*)NULL), alleles.end());
918 
919 }
920 
countAlleles(map<string,vector<Allele * >> & sampleGroups)921 int countAlleles(map<string, vector<Allele*> >& sampleGroups) {
922 
923     int count = 0;
924     for (map<string, vector<Allele*> >::iterator sg = sampleGroups.begin(); sg != sampleGroups.end(); ++sg) {
925         count += sg->second.size();
926     }
927     return count;
928 
929 }
930 
countAllelesWithBase(vector<Allele * > & alleles,string base)931 int countAllelesWithBase(vector<Allele*>& alleles, string base) {
932 
933     int count = 0;
934     for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
935         if ((*a)->currentBase == base)
936             ++count;
937     }
938     return count;
939 
940 }
941 
baseCount(vector<Allele * > & alleles,string base,AlleleStrand strand)942 int baseCount(vector<Allele*>& alleles, string base, AlleleStrand strand) {
943 
944     int count = 0;
945     for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
946         if ((*a)->currentBase == base && (*a)->strand == strand)
947             ++count;
948     }
949     return count;
950 
951 }
952 
953 pair<pair<int, int>, pair<int, int> >
baseCount(vector<Allele * > & alleles,string refbase,string altbase)954 baseCount(vector<Allele*>& alleles, string refbase, string altbase) {
955 
956     int forwardRef = 0;
957     int reverseRef = 0;
958     int forwardAlt = 0;
959     int reverseAlt = 0;
960 
961     for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
962         string base = (*a)->currentBase;
963         AlleleStrand strand = (*a)->strand;
964         if (base == refbase) {
965             if (strand == STRAND_FORWARD)
966                 ++forwardRef;
967             else if (strand == STRAND_REVERSE)
968                 ++reverseRef;
969         } else if (base == altbase) {
970             if (strand == STRAND_FORWARD)
971                 ++forwardAlt;
972             else if (strand == STRAND_REVERSE)
973                 ++reverseAlt;
974         }
975     }
976 
977     return make_pair(make_pair(forwardRef, forwardAlt), make_pair(reverseRef, reverseAlt));
978 
979 }
980 
readSeq(void)981 string Allele::readSeq(void) {
982     string r;
983     for (vector<Allele>::iterator a = alignmentAlleles->begin(); a != alignmentAlleles->end(); ++a) {
984         r.append(a->alternateSequence);
985     }
986     return r;
987 }
988 
read5p(void)989 string Allele::read5p(void) {
990     string r;
991     vector<Allele>::const_reverse_iterator a = alignmentAlleles->rbegin();
992     while (&*a != this) {
993         ++a;
994     }
995     if ((a+1) != alignmentAlleles->rend()) ++a;
996     while (a != alignmentAlleles->rend()) {
997         r = a->alternateSequence + r;
998         ++a;
999     }
1000     r.append(alternateSequence);
1001     return r;
1002 }
1003 
read3p(void)1004 string Allele::read3p(void) {
1005     string r = alternateSequence;
1006     vector<Allele>::const_iterator a = alignmentAlleles->begin();
1007     while (&*a != this) {
1008         ++a;
1009     }
1010     if ((a+1) != alignmentAlleles->end()) ++a;
1011     while (a != alignmentAlleles->end()) {
1012         r.append(a->alternateSequence);
1013         ++a;
1014     }
1015     return r;
1016 }
1017 
read5pNonNull(void)1018 string Allele::read5pNonNull(void) {
1019     string r = alternateSequence;
1020     vector<Allele>::const_reverse_iterator a = alignmentAlleles->rbegin();
1021     while (&*a != this) {
1022         ++a;
1023     }
1024     while (a != alignmentAlleles->rend() && !a->isNull()) {
1025         if (&*a != this) {
1026             r = a->alternateSequence + r;
1027         }
1028         ++a;
1029     }
1030     return r;
1031 }
1032 
read3pNonNull(void)1033 string Allele::read3pNonNull(void) {
1034     string r = alternateSequence;
1035     vector<Allele>::const_iterator a = alignmentAlleles->begin();
1036     while (&*a != this) {
1037         ++a;
1038     }
1039     while (a != alignmentAlleles->end() && !a->isNull()) {
1040         if (&*a != this) {
1041             r.append(a->alternateSequence);
1042         }
1043         ++a;
1044     }
1045     return r;
1046 }
1047 
read5pNonNullBases(void)1048 int Allele::read5pNonNullBases(void) {
1049     int bp = 0;
1050     vector<Allele>::const_reverse_iterator a = alignmentAlleles->rbegin();
1051     while (&*a != this) {
1052         ++a;
1053     }
1054     while (a != alignmentAlleles->rend() && !a->isNull()) {
1055         if (&*a != this) {
1056             //cerr << "5p bp = " << bp << " adding " << stringForAllele(*a) << " to " << stringForAllele(*this) << endl;
1057             bp += a->alternateSequence.size();
1058         }
1059         ++a;
1060     }
1061     return bp;
1062 }
1063 
read3pNonNullBases(void)1064 int Allele::read3pNonNullBases(void) {
1065     int bp = 0;
1066     vector<Allele>::const_iterator a = alignmentAlleles->begin();
1067     while (&*a != this) {
1068         ++a;
1069     }
1070     while (a != alignmentAlleles->end() && !a->isNull()) {
1071         if (&*a != this) {
1072             //cerr << "3p bp = " << bp << " adding " << stringForAllele(*a) << " to " << stringForAllele(*this) << endl;
1073             bp += a->alternateSequence.size();
1074         }
1075         ++a;
1076     }
1077     return bp;
1078 }
1079 
1080 // adjusts the allele to have a new start
1081 // returns the ref/alt sequence obtained by subtracting length from the left end of the allele
subtract(int subtractFromRefStart,int subtractFromRefEnd,string & substart,string & subend,vector<pair<int,string>> & cigarStart,vector<pair<int,string>> & cigarEnd,vector<short> & qsubstart,vector<short> & qsubend)1082 void Allele::subtract(
1083         int subtractFromRefStart,
1084         int subtractFromRefEnd,
1085         string& substart,
1086         string& subend,
1087         vector<pair<int, string> >& cigarStart,
1088         vector<pair<int, string> >& cigarEnd,
1089         vector<short>& qsubstart,
1090         vector<short>& qsubend
1091     ) {
1092 
1093     substart.clear();
1094     subend.clear();
1095     cigarStart.clear();
1096     cigarEnd.clear();
1097     qsubstart.clear();
1098     qsubend.clear();
1099 
1100     // prepare to adjust cigar
1101     list<pair<int, string> > cigarL = splitCigarList(cigar);
1102 
1103     // walk the cigar string to determine where to make the left cut in the alternate sequence
1104     int subtractFromAltStart = 0;
1105     if (subtractFromRefStart) {
1106         int refbpstart = subtractFromRefStart;
1107         pair<int, string> c;
1108         while (!cigarL.empty()) {
1109             c = cigarL.front();
1110             cigarL.pop_front();
1111             char op = c.second[0];
1112             switch (op) {
1113                 case 'M':
1114                 case 'X':
1115                 case 'N':
1116                     refbpstart -= c.first;
1117                     subtractFromAltStart += c.first;
1118                     break;
1119                 case 'I':
1120                     subtractFromAltStart += c.first;
1121                     break;
1122                 case 'D':
1123                     refbpstart -= c.first;
1124                     break;
1125                 default:
1126                     break;
1127             }
1128 
1129             cigarStart.push_back(c);
1130 
1131             if (refbpstart < 0) {
1132                 // split/adjust the last cigar element
1133                 cigarL.push_front(c);
1134                 cigarL.front().first = -refbpstart;
1135                 cigarStart.back().first += refbpstart;
1136                 switch (op) {
1137                     case 'M':
1138                     case 'X':
1139                     case 'N':
1140                     case 'I':
1141                         subtractFromAltStart += refbpstart;
1142                         break;
1143                     case 'D':
1144                     default:
1145                         break;
1146                 }
1147                 break; // we're done
1148             }
1149         }
1150     }
1151 
1152 
1153     int subtractFromAltEnd = 0;
1154     // walk the cigar string to determine where to make the right cut in the alternate sequence
1155     if (subtractFromRefEnd) {
1156         int refbpend = subtractFromRefEnd;
1157         pair<int, string> c;
1158         while (!cigarL.empty() && refbpend > 0) {
1159             c = cigarL.back();
1160             cigarL.pop_back();
1161             char op = c.second[0];
1162             switch (op) {
1163                 case 'M':
1164                 case 'X':
1165                 case 'N':
1166                     subtractFromAltEnd += c.first;
1167                     refbpend -= c.first;
1168                     break;
1169                 case 'I':
1170                     subtractFromAltEnd += c.first;
1171                     break;
1172                 case 'D':
1173                     refbpend -= c.first;
1174                     break;
1175                 default:
1176                     break;
1177             }
1178 
1179             cigarEnd.insert(cigarEnd.begin(), c);
1180 
1181             if (refbpend < 0) {
1182                 // split/adjust the last cigar element
1183                 cigarL.push_back(c);
1184                 cigarL.back().first = -refbpend;
1185                 cigarEnd.front().first += refbpend;
1186                 switch (op) {
1187                     case 'M':
1188                     case 'X':
1189                     case 'I':
1190                     case 'N':
1191                         subtractFromAltEnd += refbpend;
1192                         break;
1193                     case 'D':
1194                     default:
1195                         break;
1196                 }
1197                 break; // drop out of loop, we're done
1198             }
1199         }
1200     }
1201 
1202     // adjust the alternateSequence
1203     substart = alternateSequence.substr(0, subtractFromAltStart);
1204     subend = alternateSequence.substr(alternateSequence.size() - subtractFromAltEnd, subtractFromAltEnd);
1205     alternateSequence.erase(0, subtractFromAltStart);
1206     alternateSequence.erase(alternateSequence.size() - subtractFromAltEnd, subtractFromAltEnd);
1207 
1208     // adjust the quality string
1209     qsubstart.insert(qsubstart.begin(), baseQualities.begin(), baseQualities.begin() + subtractFromAltStart);
1210     qsubend.insert(qsubend.begin(), baseQualities.begin() + baseQualities.size() - subtractFromAltEnd, baseQualities.end());
1211     baseQualities.erase(baseQualities.begin(), baseQualities.begin() + subtractFromAltStart);
1212     baseQualities.erase(baseQualities.begin() + baseQualities.size() - subtractFromAltEnd, baseQualities.end());
1213 
1214     // reset the cigar
1215     cigarL.erase(remove_if(cigarL.begin(), cigarL.end(), isEmptyCigarElement), cigarL.end());
1216     cigar = joinCigarList(cigarL);
1217 
1218     // reset the length
1219     length = alternateSequence.size();
1220 
1221     // update the type specification
1222     updateTypeAndLengthFromCigar();
1223 
1224     // adjust the position
1225     position += subtractFromRefStart; // assumes the first-base of the alleles is reference==, not ins
1226 
1227     //referenceLength -= subtractFromRefStart;
1228     //referenceLength -= subtractFromRefEnd;
1229 
1230     referenceLength = referenceLengthFromCigar();
1231 
1232 }
1233 
subtractFromStart(int bp,string & seq,vector<pair<int,string>> & cig,vector<short> & quals)1234 void Allele::subtractFromStart(int bp, string& seq, vector<pair<int, string> >& cig, vector<short>& quals) {
1235     string emptystr;
1236     vector<pair<int, string> > emptycigar;
1237     vector<short> emptyquals;
1238     subtract(bp, 0, seq, emptystr, cig, emptycigar, quals, emptyquals);
1239 }
1240 
subtractFromEnd(int bp,string & seq,vector<pair<int,string>> & cig,vector<short> & quals)1241 void Allele::subtractFromEnd(int bp, string& seq, vector<pair<int, string> >& cig, vector<short>& quals) {
1242     string emptystr;
1243     vector<pair<int, string> > emptycigar;
1244     vector<short> emptyquals;
1245     subtract(0, bp, emptystr, seq, emptycigar, cig, emptyquals, quals);
1246 }
1247 
addToStart(string & seq,vector<pair<int,string>> & cig,vector<short> & quals)1248 void Allele::addToStart(string& seq, vector<pair<int, string> >& cig, vector<short>& quals) {
1249     string emptystr;
1250     vector<pair<int, string> > emptycigar;
1251     vector<short> emptyquals;
1252     add(seq, emptystr, cig, emptycigar, quals, emptyquals);
1253 }
1254 
addToEnd(string & seq,vector<pair<int,string>> & cig,vector<short> & quals)1255 void Allele::addToEnd(string& seq, vector<pair<int, string> >& cig, vector<short>& quals) {
1256     string emptystr;
1257     vector<pair<int, string> > emptycigar;
1258     vector<short> emptyquals;
1259     add(emptystr, seq, emptycigar, cig, emptyquals, quals);
1260 }
1261 
add(string & addToStart,string & addToEnd,vector<pair<int,string>> & cigarStart,vector<pair<int,string>> & cigarEnd,vector<short> & qaddToStart,vector<short> & qaddToEnd)1262 void Allele::add(
1263         string& addToStart,
1264         string& addToEnd,
1265         vector<pair<int, string> >& cigarStart,
1266         vector<pair<int, string> >& cigarEnd,
1267         vector<short>& qaddToStart,
1268         vector<short>& qaddToEnd
1269     ) {
1270 
1271     // adjust the position
1272     for (vector<pair<int, string> >::iterator c = cigarStart.begin(); c != cigarStart.end(); ++c) {
1273         switch (c->second[0]) {
1274             case 'M':
1275             case 'X':
1276             case 'D':
1277             case 'N':
1278                 position -= c->first;
1279                 break;
1280             case 'I':
1281             default:
1282                 break;
1283         }
1284     }
1285 
1286     // prepare to adjust cigar
1287     vector<pair<int, string> > cigarV = splitCigar(cigar);
1288 
1289     // adjust the cigar
1290     if (!cigarStart.empty()) {
1291         if (cigarStart.back().second == cigarV.front().second) {
1292             // merge
1293             cigarV.front().first += cigarStart.back().first;
1294             cigarStart.pop_back();
1295         }
1296     }
1297     cigarV.insert(cigarV.begin(), cigarStart.begin(), cigarStart.end());
1298 
1299     if (!cigarEnd.empty()) {
1300         if (cigarEnd.front().second == cigarV.back().second) {
1301             // merge
1302             cigarV.back().first += cigarEnd.front().first;
1303             cigarEnd.pop_back();
1304         } else {
1305             cigarV.insert(cigarV.end(), cigarEnd.begin(), cigarEnd.end());
1306         }
1307     }
1308 
1309     // adjust the alternateSequence
1310     alternateSequence.insert(0, addToStart);
1311     alternateSequence.append(addToEnd);
1312 
1313     // adjust the quality string
1314     baseQualities.insert(baseQualities.begin(), qaddToStart.begin(), qaddToStart.end());
1315     baseQualities.insert(baseQualities.end(), qaddToEnd.begin(), qaddToEnd.end());
1316 
1317     // reset the cigar
1318     cigarV.erase(remove_if(cigarV.begin(), cigarV.end(), isEmptyCigarElement), cigarV.end());
1319     cigar = joinCigar(cigarV);
1320 
1321     updateTypeAndLengthFromCigar();
1322 
1323     // reset referenceLength
1324     referenceLength = referenceLengthFromCigar();
1325 
1326 }
1327 
updateTypeAndLengthFromCigar(void)1328 void Allele::updateTypeAndLengthFromCigar(void) {
1329 
1330     vector<pair<int, string> > cigarV = splitCigar(cigar);
1331 
1332     map<char, int> cigarTypes;
1333     map<char, int> cigarLengths;
1334     for (vector<pair<int, string> >::iterator c = cigarV.begin(); c != cigarV.end(); ++c) {
1335         ++cigarTypes[c->second[0]];
1336         cigarLengths[c->second[0]] += c->first;
1337     }
1338     if (cigarTypes.size() == 1) {
1339         switch (cigarTypes.begin()->first) {
1340             case 'M':
1341                 type = ALLELE_REFERENCE;
1342                 break;
1343             case 'I':
1344                 type = ALLELE_INSERTION;
1345                 break;
1346             case 'D':
1347                 type = ALLELE_DELETION;
1348                 break;
1349             case 'X':
1350                 if (cigarLengths['X'] > 1) {
1351                     type = ALLELE_MNP;
1352                 } else {
1353                     type = ALLELE_SNP;
1354                 }
1355                 break;
1356             case 'N':
1357                 type = ALLELE_NULL;
1358                 break;
1359             default:
1360                 break;
1361         }
1362     } else if (cigarTypes.size() == 2) {
1363         if (cigarTypes['M'] > 0) {
1364             if (cigarTypes['I'] == 1) {
1365                 type = ALLELE_INSERTION;
1366             } else if (cigarTypes['D'] == 1) {
1367                 type = ALLELE_DELETION;
1368             } else if (cigarTypes['X'] == 1) {
1369                 if (cigarLengths['X'] > 1) {
1370                     type = ALLELE_MNP;
1371                 } else {
1372                     type = ALLELE_SNP;
1373                 }
1374             } else {
1375                 type = ALLELE_COMPLEX;
1376             }
1377         } else {
1378             type = ALLELE_COMPLEX;
1379         }
1380     } else {
1381         type = ALLELE_COMPLEX;
1382     }
1383 
1384     // recalculate allele length and quality, based on type
1385     switch (type) {
1386         case ALLELE_REFERENCE:
1387             length = alternateSequence.size();
1388             break;
1389         case ALLELE_SNP:
1390         case ALLELE_MNP:
1391             length = cigarLengths['X'];
1392             break;
1393         case ALLELE_INSERTION:
1394             length = cigarLengths['I'];
1395             break;
1396         case ALLELE_DELETION:
1397             length = cigarLengths['D'];
1398             break;
1399         case ALLELE_COMPLEX:
1400             length = alternateSequence.size();
1401             break;
1402         case ALLELE_NULL:
1403             length = alternateSequence.size();
1404             break;
1405         default:
1406             break;
1407     }
1408 
1409 }
1410 
referenceLengthFromCigar(string & cigar)1411 int referenceLengthFromCigar(string& cigar) {
1412     int r = 0;
1413     vector<pair<int, string> > cigarV = splitCigar(cigar);
1414     for (vector<pair<int, string> >::iterator c = cigarV.begin(); c != cigarV.end(); ++c) {
1415         switch (c->second[0]) {
1416         case 'M':
1417         case 'X':
1418         case 'D':
1419             r += c->first;
1420             break;
1421         case 'N':
1422         case 'I':
1423         default:
1424             break;
1425         }
1426     }
1427     return r;
1428 }
1429 
referenceLengthFromCigar(void)1430 int Allele::referenceLengthFromCigar(void) {
1431     int r = 0;
1432     vector<pair<int, string> > cigarV = splitCigar(cigar);
1433     for (vector<pair<int, string> >::iterator c = cigarV.begin(); c != cigarV.end(); ++c) {
1434         switch (c->second[0]) {
1435         case 'M':
1436         case 'X':
1437         case 'D':
1438             r += c->first;
1439             break;
1440         case 'N':
1441         case 'I':
1442         default:
1443             break;
1444         }
1445     }
1446     return r;
1447 }
1448 
1449 
1450 // combines the two alleles into a complex variant, updates important data
mergeAllele(const Allele & newAllele,AlleleType newType)1451 void Allele::mergeAllele(const Allele& newAllele, AlleleType newType) {
1452     //cerr << stringForAllele(*this) << endl << stringForAllele(newAllele) << endl;
1453     type = newType;
1454     alternateSequence += newAllele.alternateSequence;
1455     length += newAllele.length; // hmmm
1456     basesRight = newAllele.basesRight;
1457     baseQualities.insert(baseQualities.end(), newAllele.baseQualities.begin(), newAllele.baseQualities.end());
1458     currentBase = base();
1459     quality = averageQuality(baseQualities);
1460     lnquality = phred2ln(quality);
1461     basesRight += newAllele.referenceLength;
1462     if (newAllele.type != ALLELE_REFERENCE) {
1463         repeatRightBoundary = newAllele.repeatRightBoundary;
1464     }
1465     cigar = mergeCigar(cigar, newAllele.cigar);
1466     referenceLength = referenceLengthFromCigar();
1467 }
1468 
squash(void)1469 void Allele::squash(void) {
1470     // will trigger destruction of this allele in the AlleleParser
1471     length = 0;
1472     position = 0;
1473 }
1474 
getLengthOnReference(void)1475 unsigned int Allele::getLengthOnReference(void) {
1476     return referenceLengthFromCigar();
1477 }
1478 
alleleUnion(vector<Allele> & a1,vector<Allele> & a2)1479 vector<Allele> alleleUnion(vector<Allele>& a1, vector<Allele>& a2) {
1480     map<string, Allele> alleleSet;
1481     vector<Allele> results;
1482     for (vector<Allele>::iterator a = a1.begin(); a != a1.end(); ++a) {
1483         alleleSet.insert(make_pair(a->base(), *a));
1484     }
1485     for (vector<Allele>::iterator a = a2.begin(); a != a2.end(); ++a) {
1486         alleleSet.insert(make_pair(a->base(), *a));
1487     }
1488     for (map<string, Allele>::iterator a = alleleSet.begin(); a != alleleSet.end(); ++a) {
1489         results.push_back(a->second);
1490     }
1491     return results;
1492 }
1493 
isEmptyAllele(const Allele & allele)1494 bool isEmptyAllele(const Allele& allele) {
1495     return allele.length == 0;
1496 }
1497 
isDividedIndel(const Allele & allele)1498 bool isDividedIndel(const Allele& allele) {
1499     vector<pair<int, string> > cigarV = splitCigar(allele.cigar);
1500     if (cigarV.front().second == "D" || cigarV.front().second == "I") {
1501         return true;
1502     } else {
1503         return false;
1504     }
1505 }
1506 
1507 // returns true if this indel is not properly flanked by reference-matching sequence
isUnflankedIndel(const Allele & allele)1508 bool isUnflankedIndel(const Allele& allele) {
1509     if (allele.isReference() || allele.isSNP() || allele.isMNP()) {
1510         return false;
1511     } else {
1512         vector<pair<int, string> > cigarV = splitCigar(allele.cigar);
1513         if (cigarV.back().second == "D"
1514             || cigarV.back().second == "I"
1515             || cigarV.front().second == "D"
1516             || cigarV.front().second == "I") {
1517             return true;
1518         } else {
1519             return false;
1520         }
1521     }
1522 }
1523 
isEmptyAlleleOrIsDividedIndel(const Allele & allele)1524 bool isEmptyAlleleOrIsDividedIndel(const Allele& allele) {
1525     return isEmptyAllele(allele) || isDividedIndel(allele);
1526 }
1527