1 #include "Allele.h"
2 #include "multichoose.h"
3 #include "TryCatch.h"
4
5
referenceOffset(void) const6 int Allele::referenceOffset(void) const {
7 /*cout << readID << " offset checked " << referencePosition - position << " against position " << position
8 << " allele length " << length << " str length " << referenceSequence.size() << " qstr size " << qualityString.size() << endl;
9 */
10 return *currentReferencePosition - position;
11 }
12
setQuality(void)13 void Allele::setQuality(void) {
14 quality = currentQuality();
15 lnquality = phred2ln(quality);
16 }
17
bpLeft(void)18 int Allele::bpLeft(void) {
19 return position - alignmentStart;
20 }
21
bpRight(void)22 int Allele::bpRight(void) {
23 return alignmentEnd - (position + referenceLength);
24 }
25
26 // called prior to using the allele in analysis
27 // called again when haplotype alleles are built, in which case the "currentBase" is set to the alternate sequence of the allele
update(int haplotypeLength)28 void Allele::update(int haplotypeLength) {
29 if (haplotypeLength == 1) {
30 if (type == ALLELE_REFERENCE) {
31 currentBase = string(1, *currentReferenceBase);
32 } else {
33 currentBase = base();
34 }
35 } else {
36 currentBase = base();
37 }
38 // should be done after setting currentBase to haplotypeLength
39 if (isReference()) setQuality();
40 basesLeft = bpLeft();
41 basesRight = bpRight();
42 }
43
44 // quality of subsequence of allele
subquality(int startpos,int len) const45 const int Allele::subquality(int startpos, int len) const {
46 int start = startpos - position;
47 int sum = 0;
48 for (int i = start; i < len; ++i) {
49 sum += baseQualities.at(i);
50 }
51 return sum;
52 }
53
54 // quality of subsequence of allele
lnsubquality(int startpos,int len) const55 const long double Allele::lnsubquality(int startpos, int len) const {
56 return phred2ln(subquality(startpos, len));
57 }
58
subquality(const Allele & a) const59 const int Allele::subquality(const Allele &a) const {
60 int sum = 0;
61 int rp = a.position - position;
62 int l = a.length;
63 int L = l;
64 int spanstart = 0;
65 int spanend = 1;
66 //int l = min((int) a.length, (int) baseQualities.size() - start);
67 if (a.type == ALLELE_INSERTION) {
68 L = l + 2;
69 if (L > baseQualities.size()) {
70 L = baseQualities.size();
71 spanstart = 0;
72 } else {
73 // set lower bound to 0
74 if (rp < (L / 2)) {
75 spanstart = 0;
76 } else {
77 spanstart = rp - (L / 2);
78 }
79 // set upper bound to the string length
80 if (spanstart + L > baseQualities.size()) {
81 spanstart = baseQualities.size() - L;
82 }
83 }
84 //string qualstr = baseQualities.substr(spanstart, L);
85 spanend = spanstart + L;
86 } else if (a.type == ALLELE_DELETION) {
87 L = l + 2;
88 if (L > baseQualities.size()) {
89 L = baseQualities.size();
90 spanstart = 0;
91 } else {
92 // set lower bound to 0
93 if (rp < 1) {
94 spanstart = 0;
95 } else {
96 spanstart = rp - 1;
97 }
98 // set upper bound to the string length
99 if (spanstart + L > baseQualities.size()) {
100 spanstart = baseQualities.size() - L;
101 }
102 }
103 spanend = spanstart + L;
104 } else if (a.type == ALLELE_MNP) {
105 L = l;
106 if (L > baseQualities.size()) {
107 L = baseQualities.size();
108 spanstart = 0;
109 } else {
110 if (rp < 1) {
111 spanstart = 0;
112 } else {
113 spanstart = rp;
114 }
115 // impossible
116 if (spanstart + L > baseQualities.size()) {
117 spanstart = baseQualities.size() - L;
118 }
119 }
120 spanend = spanstart + L;
121 }
122 // TODO ALLELE_COMPLEX
123 for (int i = spanstart; i < spanend; ++i) {
124 sum += baseQualities.at(i);
125 }
126 return sum * (l / L);
127 }
128
lnsubquality(const Allele & a) const129 const long double Allele::lnsubquality(const Allele& a) const {
130 return phred2ln(subquality(a));
131 }
132
updateAllelesCachedData(vector<Allele * > & alleles)133 void updateAllelesCachedData(vector<Allele*>& alleles) {
134 for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
135 (*a)->update();
136 }
137 }
138
139 /*
140 const int Allele::basesLeft(void) const {
141 if (type == ALLELE_REFERENCE) {
142 return bpLeft + referenceOffset();
143 } else {
144 return bpLeft;
145 }
146 }
147
148 const int Allele::basesRight(void) const {
149 if (type == ALLELE_REFERENCE) {
150 return bpRight - referenceOffset();
151 } else {
152 return bpRight;
153 }
154 }
155 */
156
157 // quality at a given reference position
currentQuality(void) const158 const short Allele::currentQuality(void) const {
159 //cerr << readID << " " << position << "-" << position + length << " " << alternateSequence.size() << " vs " << baseQualities.size() << endl;
160 switch (this->type) {
161 case ALLELE_REFERENCE:
162 // should check a different way... this is wrong
163 // it will catch it all the time,
164 if (currentBase.size() > 1) {
165 return averageQuality(baseQualities);
166 } else {
167 int off = referenceOffset();
168 if (off < 0 || off > baseQualities.size()) {
169 return 0;
170 } else {
171 return baseQualities.at(off);
172 }
173 }
174 break;
175 case ALLELE_INSERTION:
176 case ALLELE_DELETION:
177 case ALLELE_SNP:
178 case ALLELE_MNP:
179 case ALLELE_COMPLEX:
180 return quality;
181 break;
182 }
183 return 0;
184 }
185
lncurrentQuality(void) const186 const long double Allele::lncurrentQuality(void) const {
187 return phred2ln(currentQuality());
188 }
189
typeStr(void) const190 string Allele::typeStr(void) const {
191
192 string t;
193
194 switch (this->type) {
195 case ALLELE_GENOTYPE:
196 t = "genotype";
197 break;
198 case ALLELE_REFERENCE:
199 t = "reference";
200 break;
201 case ALLELE_MNP:
202 t = "mnp";
203 break;
204 case ALLELE_SNP:
205 t = "snp";
206 break;
207 case ALLELE_INSERTION:
208 t = "insertion";
209 break;
210 case ALLELE_DELETION:
211 t = "deletion";
212 break;
213 case ALLELE_COMPLEX:
214 t = "complex";
215 break;
216 case ALLELE_NULL:
217 t = "null";
218 break;
219 default:
220 t = "unknown";
221 break;
222 }
223
224 return t;
225
226 }
227
isReference(void) const228 bool Allele::isReference(void) const {
229 return type == ALLELE_REFERENCE;
230 }
231
isSNP(void) const232 bool Allele::isSNP(void) const {
233 return type == ALLELE_SNP;
234 }
235
isInsertion(void) const236 bool Allele::isInsertion(void) const {
237 return type == ALLELE_INSERTION;
238 }
239
isDeletion(void) const240 bool Allele::isDeletion(void) const {
241 return type == ALLELE_DELETION;
242 }
243
isMNP(void) const244 bool Allele::isMNP(void) const {
245 return type == ALLELE_MNP;
246 }
247
isComplex(void) const248 bool Allele::isComplex(void) const {
249 return type == ALLELE_COMPLEX;
250 }
251
isNull(void) const252 bool Allele::isNull(void) const {
253 return type == ALLELE_NULL;
254 }
255
base(void) const256 const string Allele::base(void) const { // the base of this allele
257
258 switch (this->type) {
259 case ALLELE_REFERENCE:
260 if (genotypeAllele)
261 return alternateSequence;
262 else
263 return currentBase;
264 break;
265 case ALLELE_GENOTYPE:
266 return alternateSequence;
267 break;
268 /*
269 case ALLELE_GENOTYPE:
270 return alternateSequence; // todo fix
271 break;
272 case ALLELE_REFERENCE:
273 return "R:" + convert(position) + ":" + cigar + ":" + alternateSequence;
274 break;
275 */
276 case ALLELE_SNP:
277 return "S:" + convert(position) + ":" + cigar + ":" + alternateSequence;
278 break;
279 case ALLELE_MNP:
280 return "M:" + convert(position) + ":" + cigar + ":" + alternateSequence;
281 break;
282 case ALLELE_INSERTION:
283 return "I:" + convert(position) + ":" + cigar + ":" + alternateSequence;
284 break;
285 case ALLELE_DELETION:
286 return "D:" + convert(position) + ":" + cigar;
287 break;
288 case ALLELE_COMPLEX:
289 return "C:" + convert(position) + ":" + cigar + ":" + alternateSequence;
290 break;
291 case ALLELE_NULL:
292 return "N:" + convert(position) + ":" + alternateSequence;
293 default:
294 return "";
295 break;
296 }
297 }
298
stringForAllele(const Allele & allele)299 string stringForAllele(const Allele &allele) {
300
301 stringstream out;
302 if (!allele.genotypeAllele) {
303 out.precision(1);
304 out
305 << allele.sampleID << ":"
306 << allele.readID << ":"
307 << allele.typeStr() << ":"
308 << allele.cigar << ":"
309 << scientific << fixed << allele.position << ":"
310 << allele.length << ":"
311 << (allele.strand == STRAND_FORWARD ? "+" : "-") << ":"
312 << allele.referenceSequence << ":"
313 << allele.alternateSequence << ":"
314 << allele.quality << ":"
315 << allele.basesLeft << ":"
316 << allele.basesRight;
317 } else {
318 out << allele.typeStr() << ":"
319 << allele.cigar << ":"
320 << scientific << fixed << allele.position << ":"
321 << allele.length << ":"
322 << allele.alternateSequence;
323 }
324
325 return out.str();
326 }
327
stringForAlleles(vector<Allele> & alleles)328 string stringForAlleles(vector<Allele> &alleles) {
329 stringstream out;
330 for (vector<Allele>::iterator allele = alleles.begin(); allele != alleles.end(); allele++) {
331 out << stringForAllele(*allele) << endl;
332 }
333 return out.str();
334 }
335
tojson(vector<Allele * > & alleles)336 string tojson(vector<Allele*> &alleles) {
337 stringstream out;
338 vector<Allele*>::iterator a = alleles.begin();
339 out << "[" << (*a)->tojson(); ++a;
340 for (; a != alleles.end(); ++a)
341 out << "," << (*a)->tojson();
342 out << "]";
343 return out.str();
344 }
345
tojson(Allele * & allele)346 string tojson(Allele*& allele) { return allele->tojson(); }
347
tojson(void)348 string Allele::tojson(void) {
349 stringstream out;
350 if (!genotypeAllele) {
351 out << "{\"id\":\"" << readID << "\""
352 << ",\"type\":\"" << typeStr() << "\""
353 << ",\"length\":" << ((type == ALLELE_REFERENCE) ? 1 : length)
354 << ",\"position\":" << position
355 << ",\"strand\":\"" << (strand == STRAND_FORWARD ? "+" : "-") << "\"";
356 if (type == ALLELE_REFERENCE ) {
357 out << ",\"base\":\"" << alternateSequence.at(referenceOffset()) << "\""
358 //<< ",\"reference\":\"" << allele.referenceSequence.at(referenceOffset) << "\""
359 << ",\"quality\":" << currentQuality();
360 } else {
361 out << ",\"base\":\"" << alternateSequence << "\""
362 //<< ",\"reference\":\"" << allele.referenceSequence << "\""
363 << ",\"quality\":" << quality;
364 }
365 out << "}";
366
367 } else {
368 out << "{\"type\":\"" << typeStr() << "\"";
369 switch (type) {
370 case ALLELE_REFERENCE:
371 out << "}";
372 break;
373 default:
374 out << "\",\"length\":" << length
375 << ",\"alt\":\"" << alternateSequence << "\"}";
376 break;
377 }
378 }
379 return out.str();
380 }
381
operator <<(ostream & out,vector<Allele * > & alleles)382 ostream &operator<<(ostream &out, vector<Allele*> &alleles) {
383 vector<Allele*>::iterator a = alleles.begin();
384 out << **a++;
385 while (a != alleles.end())
386 out << "|" << **a++;
387 return out;
388 }
389
operator <<(ostream & out,vector<Allele> & alleles)390 ostream &operator<<(ostream &out, vector<Allele> &alleles) {
391 int i = 0;
392 for (auto& allele : alleles) {
393 out << (i++ ? "|" : "") << allele;
394 }
395 return out;
396 }
397
operator <<(ostream & out,list<Allele * > & alleles)398 ostream &operator<<(ostream &out, list<Allele*> &alleles) {
399 int i = 0;
400 for (auto& allele : alleles) {
401 out << (i++ ? "|" : "") << allele;
402 }
403 return out;
404 }
405
operator <<(ostream & out,Allele * & allele)406 ostream &operator<<(ostream &out, Allele* &allele) {
407 out << *allele;
408 return out;
409 }
410
operator <<(ostream & out,Allele & allele)411 ostream &operator<<(ostream &out, Allele &allele) {
412
413 if (!allele.genotypeAllele) {
414 int prec = out.precision();
415 // << &allele << ":"
416 out.precision(1);
417 out << allele.sampleID
418 << ":" << allele.readID
419 << ":" << allele.typeStr()
420 << ":" << allele.length
421 << ":" << allele.referenceLength
422 << ":" << scientific << fixed << allele.position
423 << ":" << (allele.strand == STRAND_FORWARD ? "+" : "-")
424 << ":" << allele.alternateSequence
425 //<< ":" << allele.referenceSequence
426 << ":" << allele.repeatRightBoundary
427 << ":" << allele.cigar
428 << ":" << allele.lnmapQuality
429 << ":" << allele.lnquality;
430 out.precision(prec);
431 } else {
432 out << allele.typeStr()
433 << ":" << allele.cigar
434 << ":" << scientific << fixed << allele.position
435 << ":" << allele.length
436 << ":" << (string) allele.alternateSequence;
437 }
438 out.precision(5);
439 return out;
440 }
441
operator <(const Allele & a,const Allele & b)442 bool operator<(const Allele &a, const Allele &b) {
443 //cerr << "allele<" << endl;
444 return a.currentBase < b.currentBase;
445 }
446
447 // TODO fixme??
448 // alleles are equal if they represent the same reference-relative variation or
449 // sequence, which we encode as a string and compare here
operator ==(const Allele & a,const Allele & b)450 bool operator==(const Allele &a, const Allele &b) {
451 //cerr << "allele==" << endl;
452 return a.currentBase == b.currentBase;
453 }
454
operator !=(const Allele & a,const Allele & b)455 bool operator!=(const Allele& a, const Allele& b) {
456 return ! (a == b);
457 }
458
equivalent(Allele & b)459 bool Allele::equivalent(Allele &b) {
460
461 if (type != b.type) {
462 return false;
463 } else {
464 switch (type) {
465 case ALLELE_REFERENCE:
466 // reference alleles are, by definition, always equivalent
467 return true;
468 break;
469 case ALLELE_SNP:
470 case ALLELE_MNP:
471 if (alternateSequence == b.alternateSequence)
472 return true;
473 break;
474 case ALLELE_DELETION:
475 if (length == b.length)
476 return true;
477 break;
478 case ALLELE_INSERTION:
479 if (length == b.length
480 && alternateSequence == b.alternateSequence)
481 return true;
482 break;
483 case ALLELE_COMPLEX:
484 if (length == b.length
485 && alternateSequence == b.alternateSequence
486 && cigar == b.cigar)
487 return true;
488 break;
489 case ALLELE_NULL:
490 return alternateSequence == b.alternateSequence;
491 default:
492 break;
493 }
494 }
495
496 return false;
497 }
498
areHomozygous(vector<Allele * > & alleles)499 bool areHomozygous(vector<Allele*>& alleles) {
500 Allele* prev = alleles.front();
501 for (vector<Allele*>::iterator allele = alleles.begin() + 1; allele != alleles.end(); ++allele) {
502 if (**allele != *prev) {
503 return false;
504 }
505 }
506 return true;
507 }
508
509 // counts alleles which satisfy operator==
countAlleles(vector<Allele * > & alleles)510 map<Allele, int> countAlleles(vector<Allele*>& alleles) {
511 map<Allele, int> counts;
512 for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
513 Allele& allele = **a;
514 map<Allele, int>::iterator f = counts.find(allele);
515 if (f == counts.end()) {
516 counts[allele] = 1;
517 } else {
518 counts[allele] += 1;
519 }
520 }
521 return counts;
522 }
523
524 // counts alleles which satisfy operator==
countAllelesString(vector<Allele * > & alleles)525 map<string, int> countAllelesString(vector<Allele*>& alleles) {
526 map<string, int> counts;
527 for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
528 Allele& thisAllele = **a;
529 const string& allele = thisAllele.currentBase;
530 map<string, int>::iterator f = counts.find(allele);
531 if (f == counts.end()) {
532 counts[allele] = 1;
533 } else {
534 counts[allele] += 1;
535 }
536 }
537 return counts;
538 }
539
countAllelesString(vector<Allele> & alleles)540 map<string, int> countAllelesString(vector<Allele>& alleles) {
541 map<string, int> counts;
542 for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
543 Allele& thisAllele = *a;
544 const string& allele = thisAllele.currentBase;
545 map<string, int>::iterator f = counts.find(allele);
546 if (f == counts.end()) {
547 counts[allele] = 1;
548 } else {
549 counts[allele] += 1;
550 }
551 }
552 return counts;
553 }
554
555
countAlleles(vector<Allele> & alleles)556 map<Allele, int> countAlleles(vector<Allele>& alleles) {
557 map<Allele, int> counts;
558 for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
559 Allele& allele = *a;
560 map<Allele, int>::iterator f = counts.find(allele);
561 if (f == counts.end()) {
562 counts[allele] = 1;
563 } else {
564 counts[allele] += 1;
565 }
566 }
567 return counts;
568 }
569
countAlleles(list<Allele * > & alleles)570 map<Allele, int> countAlleles(list<Allele*>& alleles) {
571 map<Allele, int> counts;
572 for (list<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
573 Allele& allele = **a;
574 map<Allele, int>::iterator f = counts.find(allele);
575 if (f == counts.end()) {
576 counts[allele] = 1;
577 } else {
578 counts[allele] += 1;
579 }
580 }
581 return counts;
582 }
583
584
groupAllelesBySample(list<Allele * > & alleles)585 map<string, vector<Allele*> > groupAllelesBySample(list<Allele*>& alleles) {
586 map<string, vector<Allele*> > groups;
587 for (list<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
588 Allele*& allele = *a;
589 groups[allele->sampleID].push_back(allele);
590 }
591 return groups;
592 }
593
uniqueAlleles(list<Allele * > & alleles)594 vector<Allele> uniqueAlleles(list<Allele*>& alleles) {
595 vector<Allele> uniques;
596 map<Allele, int> counts = countAlleles(alleles);
597 for (map<Allele, int>::iterator c = counts.begin(); c != counts.end(); ++c) {
598 uniques.push_back(c->first);
599 }
600 return uniques;
601 }
602
groupAllelesBySample(list<Allele * > & alleles,map<string,vector<Allele * >> & groups)603 void groupAllelesBySample(list<Allele*>& alleles, map<string, vector<Allele*> >& groups) {
604 for (list<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
605 Allele*& allele = *a;
606 groups[allele->sampleID].push_back(allele);
607 }
608 }
609
610 // in haplotype calling, if alleles have the same alternate sequence, the
611 // should have the same cigar and position. this function picks the most
612 // common allele observation per alternate sequence and homogenizes the rest to
613 // the same if they are not reference alleles
homogenizeAlleles(map<string,vector<Allele * >> & alleleGroups,string & refseq,Allele & refallele)614 void homogenizeAlleles(map<string, vector<Allele*> >& alleleGroups, string& refseq, Allele& refallele) {
615 map<string, map<string, int> > equivs;
616 map<string, Allele*> homogenizeTo;
617 // find equivalencies between alleles
618 // base equivalency is self
619 for (map<string, vector<Allele*> >::iterator g = alleleGroups.begin(); g != alleleGroups.end(); ++g) {
620 Allele& allele = *g->second.front();
621 if (allele.isReference()) {
622 continue;
623 }
624 equivs[allele.alternateSequence][g->first]++;
625 }
626 //
627 for (map<string, map<string, int> >::iterator e = equivs.begin(); e != equivs.end(); ++e) {
628 string altseq = e->first;
629 map<string, int>& group = e->second;
630 map<int, string> ordered;
631 for (map<string, int>::iterator f = group.begin(); f != group.end(); ++f) {
632 // pick the best by count
633 ordered[f->second] = f->first;
634 }
635
636 // choose the most common group
637 string& altbase = ordered.rbegin()->second;
638 if (altseq == refseq) {
639 homogenizeTo[altseq] = &refallele;
640 } else {
641 homogenizeTo[altseq] = alleleGroups[altbase].front();
642 }
643 }
644 for (map<string, vector<Allele*> >::iterator g = alleleGroups.begin(); g != alleleGroups.end(); ++g) {
645 vector<Allele*>& alleles = g->second;
646 if (alleles.front()->isReference()) {
647 continue;
648 }
649 string& altseq = alleles.front()->alternateSequence;
650 Allele* toallele = homogenizeTo[altseq];
651 string& cigar = toallele->cigar;
652 AlleleType type = toallele->type;
653 long int position = toallele->position;
654 for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
655 (*a)->cigar = cigar;
656 (*a)->type = type;
657 (*a)->position = position;
658 (*a)->update();
659 }
660 }
661 }
662
resetProcessedFlag(map<string,vector<Allele * >> & alleleGroups)663 void resetProcessedFlag(map<string, vector<Allele*> >& alleleGroups) {
664 for (map<string, vector<Allele*> >::iterator g = alleleGroups.begin(); g != alleleGroups.end(); ++g) {
665 for (vector<Allele*>::iterator a = g->second.begin(); a != g->second.end(); ++a) {
666 (*a)->processed = false;
667 }
668 }
669 }
670
groupAlleles(map<string,vector<Allele * >> & sampleGroups,map<string,vector<Allele * >> & alleleGroups)671 void groupAlleles(map<string, vector<Allele*> >& sampleGroups, map<string, vector<Allele*> >& alleleGroups) {
672 for (map<string, vector<Allele*> >::iterator sample = sampleGroups.begin(); sample != sampleGroups.end(); ++sample) {
673 for (vector<Allele*>::iterator allele = sample->second.begin(); allele != sample->second.end(); ++allele) {
674 alleleGroups[(*allele)->currentBase].push_back(*allele);
675 }
676 }
677 }
678
groupAlleles(map<string,vector<Allele * >> & sampleGroups,bool (* fncompare)(Allele & a,Allele & b))679 vector<vector<Allele*> > groupAlleles(map<string, vector<Allele*> > &sampleGroups, bool (*fncompare)(Allele &a, Allele &b)) {
680 vector<vector<Allele*> > groups;
681 for (map<string, vector<Allele*> >::iterator sg = sampleGroups.begin(); sg != sampleGroups.end(); ++sg) {
682 vector<Allele*>& alleles = sg->second;
683 for (vector<Allele*>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
684 bool unique = true;
685 for (vector<vector<Allele*> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
686 if ((*fncompare)(**oa, *ag->front())) {
687 ag->push_back(*oa);
688 unique = false;
689 break;
690 }
691 }
692 if (unique) {
693 vector<Allele*> trueAlleleGroup;
694 trueAlleleGroup.push_back(*oa);
695 groups.push_back(trueAlleleGroup);
696 }
697 }
698 }
699 return groups;
700 }
701
groupAlleles(list<Allele * > & alleles,bool (* fncompare)(Allele * & a,Allele * & b))702 vector<vector<Allele*> > groupAlleles(list<Allele*> &alleles, bool (*fncompare)(Allele* &a, Allele* &b)) {
703 vector<vector<Allele*> > groups;
704 for (list<Allele*>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
705 bool unique = true;
706 for (vector<vector<Allele*> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
707 if ((*fncompare)(*oa, ag->front())) {
708 ag->push_back(*oa);
709 unique = false;
710 break;
711 }
712 }
713 if (unique) {
714 vector<Allele*> trueAlleleGroup;
715 trueAlleleGroup.push_back(*oa);
716 groups.push_back(trueAlleleGroup);
717 }
718 }
719 return groups;
720 }
721
groupAlleles(list<Allele> & alleles,bool (* fncompare)(Allele & a,Allele & b))722 vector<vector<Allele*> > groupAlleles(list<Allele> &alleles, bool (*fncompare)(Allele &a, Allele &b)) {
723 vector<vector<Allele*> > groups;
724 for (list<Allele>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
725 bool unique = true;
726 for (vector<vector<Allele*> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
727 if ((*fncompare)(*oa, *ag->front())) {
728 ag->push_back(&*oa);
729 unique = false;
730 break;
731 }
732 }
733 if (unique) {
734 vector<Allele*> trueAlleleGroup;
735 trueAlleleGroup.push_back(&*oa);
736 groups.push_back(trueAlleleGroup);
737 }
738 }
739 return groups;
740 }
741
groupAlleles(vector<Allele * > & alleles,bool (* fncompare)(Allele & a,Allele & b))742 vector<vector<Allele*> > groupAlleles(vector<Allele*> &alleles, bool (*fncompare)(Allele &a, Allele &b)) {
743 vector<vector<Allele*> > groups;
744 for (vector<Allele*>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
745 bool unique = true;
746 for (vector<vector<Allele*> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
747 if ((*fncompare)(**oa, *ag->front())) {
748 ag->push_back(*oa);
749 unique = false;
750 break;
751 }
752 }
753 if (unique) {
754 vector<Allele*> trueAlleleGroup;
755 trueAlleleGroup.push_back(*oa);
756 groups.push_back(trueAlleleGroup);
757 }
758 }
759 return groups;
760 }
761
groupAlleles(vector<Allele> & alleles,bool (* fncompare)(Allele & a,Allele & b))762 vector<vector<Allele*> > groupAlleles(vector<Allele> &alleles, bool (*fncompare)(Allele &a, Allele &b)) {
763 vector<vector<Allele*> > groups;
764 for (vector<Allele>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
765 bool unique = true;
766 for (vector<vector<Allele*> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
767 if ((*fncompare)(*oa, *ag->front())) {
768 ag->push_back(&*oa);
769 unique = false;
770 break;
771 }
772 }
773 if (unique) {
774 vector<Allele*> trueAlleleGroup;
775 trueAlleleGroup.push_back(&*oa);
776 groups.push_back(trueAlleleGroup);
777 }
778 }
779 return groups;
780 }
781
groupAlleles_copy(list<Allele> & alleles,bool (* fncompare)(Allele & a,Allele & b))782 vector<vector<Allele> > groupAlleles_copy(list<Allele> &alleles, bool (*fncompare)(Allele &a, Allele &b)) {
783 vector<vector<Allele> > groups;
784 for (list<Allele>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
785 bool unique = true;
786 for (vector<vector<Allele> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
787 if ((*fncompare)(*oa, ag->front())) {
788 ag->push_back(*oa);
789 unique = false;
790 break;
791 }
792 }
793 if (unique) {
794 vector<Allele> trueAlleleGroup;
795 trueAlleleGroup.push_back(*oa);
796 groups.push_back(trueAlleleGroup);
797 }
798 }
799 return groups;
800 }
801
groupAlleles_copy(vector<Allele> & alleles,bool (* fncompare)(Allele & a,Allele & b))802 vector<vector<Allele> > groupAlleles_copy(vector<Allele> &alleles, bool (*fncompare)(Allele &a, Allele &b)) {
803 vector<vector<Allele> > groups;
804 for (vector<Allele>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
805 bool unique = true;
806 for (vector<vector<Allele> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
807 if ((*fncompare)(*oa, ag->front())) {
808 ag->push_back(*oa);
809 unique = false;
810 break;
811 }
812 }
813 if (unique) {
814 vector<Allele> trueAlleleGroup;
815 trueAlleleGroup.push_back(*oa);
816 groups.push_back(trueAlleleGroup);
817 }
818 }
819 return groups;
820 }
821
groupAlleles_copy(vector<Allele> & alleles)822 vector<vector<Allele> > groupAlleles_copy(vector<Allele> &alleles) {
823 vector<vector<Allele> > groups;
824 for (vector<Allele>::iterator oa = alleles.begin(); oa != alleles.end(); ++oa) {
825 bool unique = true;
826 for (vector<vector<Allele> >::iterator ag = groups.begin(); ag != groups.end(); ++ag) {
827 if (*oa == ag->front()) {
828 ag->push_back(*oa);
829 unique = false;
830 break;
831 }
832 }
833 if (unique) {
834 vector<Allele> trueAlleleGroup;
835 trueAlleleGroup.push_back(*oa);
836 groups.push_back(trueAlleleGroup);
837 }
838 }
839 return groups;
840 }
841
sameSample(Allele & other)842 bool Allele::sameSample(Allele &other) { return this->sampleID == other.sampleID; }
843
allelesSameType(Allele * & a,Allele * & b)844 bool allelesSameType(Allele* &a, Allele* &b) { return a->type == b->type; }
845
allelesEquivalent(Allele * & a,Allele * & b)846 bool allelesEquivalent(Allele* &a, Allele* &b) { return a->equivalent(*b); }
847
allelesSameSample(Allele * & a,Allele * & b)848 bool allelesSameSample(Allele* &a, Allele* &b) { return a->sampleID == b->sampleID; }
849
allelesSameType(Allele & a,Allele & b)850 bool allelesSameType(Allele &a, Allele &b) { return a.type == b.type; }
851
allelesEquivalent(Allele & a,Allele & b)852 bool allelesEquivalent(Allele &a, Allele &b) { return a.equivalent(b); }
853
allelesSameSample(Allele & a,Allele & b)854 bool allelesSameSample(Allele &a, Allele &b) { return a.sampleID == b.sampleID; }
855
allelesEqual(Allele & a,Allele & b)856 bool allelesEqual(Allele &a, Allele &b) { return a == b; }
857
genotypeAllelesFromAlleleGroups(vector<vector<Allele * >> & groups)858 vector<Allele> genotypeAllelesFromAlleleGroups(vector<vector<Allele*> > &groups) {
859
860 vector<Allele> results;
861 for (vector<vector<Allele*> >::iterator g = groups.begin(); g != groups.end(); ++g)
862 results.push_back(genotypeAllele(*g->front()));
863 return results;
864
865 }
866
genotypeAllelesFromAlleles(vector<Allele * > & alleles)867 vector<Allele> genotypeAllelesFromAlleles(vector<Allele*> &alleles) {
868
869 vector<Allele> results;
870 for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a)
871 results.push_back(genotypeAllele(**a));
872 return results;
873
874 }
875
genotypeAllelesFromAlleleGroups(vector<vector<Allele>> & groups)876 vector<Allele> genotypeAllelesFromAlleleGroups(vector<vector<Allele> > &groups) {
877
878 vector<Allele> results;
879 for (vector<vector<Allele> >::iterator g = groups.begin(); g != groups.end(); ++g)
880 results.push_back(genotypeAllele(g->front()));
881 return results;
882
883 }
884
genotypeAllelesFromAlleles(vector<Allele> & alleles)885 vector<Allele> genotypeAllelesFromAlleles(vector<Allele> &alleles) {
886
887 vector<Allele> results;
888 for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a)
889 results.push_back(genotypeAllele(*a));
890 return results;
891
892 }
893
genotypeAllele(Allele & a)894 Allele genotypeAllele(Allele &a) {
895 return Allele(a.type, a.alternateSequence, a.length, a.referenceLength, a.cigar, a.position, a.repeatRightBoundary);
896 }
897
genotypeAllele(AlleleType type,string alt,unsigned int len,string cigar,unsigned int reflen,long int pos,long int rrbound)898 Allele genotypeAllele(AlleleType type, string alt, unsigned int len, string cigar, unsigned int reflen, long int pos, long int rrbound) {
899 return Allele(type, alt, len, reflen, cigar, pos, rrbound);
900 }
901
allowedAlleleTypes(vector<AlleleType> & allowedEnumeratedTypes)902 int allowedAlleleTypes(vector<AlleleType>& allowedEnumeratedTypes) {
903 int allowedTypes = 0;// (numberOfPossibleAlleleTypes, false);
904 for (vector<AlleleType>::iterator t = allowedEnumeratedTypes.begin(); t != allowedEnumeratedTypes.end(); ++t) {
905 allowedTypes |= *t;
906 }
907 return allowedTypes;
908 }
909
filterAlleles(list<Allele * > & alleles,int allowedTypes)910 void filterAlleles(list<Allele*>& alleles, int allowedTypes) {
911
912 for (list<Allele*>::iterator allele = alleles.begin(); allele != alleles.end(); ++allele) {
913 bool allowed = false;
914 if (!(allowedTypes & (*allele)->type))
915 *allele = NULL;
916 }
917 alleles.erase(remove(alleles.begin(), alleles.end(), (Allele*)NULL), alleles.end());
918
919 }
920
countAlleles(map<string,vector<Allele * >> & sampleGroups)921 int countAlleles(map<string, vector<Allele*> >& sampleGroups) {
922
923 int count = 0;
924 for (map<string, vector<Allele*> >::iterator sg = sampleGroups.begin(); sg != sampleGroups.end(); ++sg) {
925 count += sg->second.size();
926 }
927 return count;
928
929 }
930
countAllelesWithBase(vector<Allele * > & alleles,string base)931 int countAllelesWithBase(vector<Allele*>& alleles, string base) {
932
933 int count = 0;
934 for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
935 if ((*a)->currentBase == base)
936 ++count;
937 }
938 return count;
939
940 }
941
baseCount(vector<Allele * > & alleles,string base,AlleleStrand strand)942 int baseCount(vector<Allele*>& alleles, string base, AlleleStrand strand) {
943
944 int count = 0;
945 for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
946 if ((*a)->currentBase == base && (*a)->strand == strand)
947 ++count;
948 }
949 return count;
950
951 }
952
953 pair<pair<int, int>, pair<int, int> >
baseCount(vector<Allele * > & alleles,string refbase,string altbase)954 baseCount(vector<Allele*>& alleles, string refbase, string altbase) {
955
956 int forwardRef = 0;
957 int reverseRef = 0;
958 int forwardAlt = 0;
959 int reverseAlt = 0;
960
961 for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
962 string base = (*a)->currentBase;
963 AlleleStrand strand = (*a)->strand;
964 if (base == refbase) {
965 if (strand == STRAND_FORWARD)
966 ++forwardRef;
967 else if (strand == STRAND_REVERSE)
968 ++reverseRef;
969 } else if (base == altbase) {
970 if (strand == STRAND_FORWARD)
971 ++forwardAlt;
972 else if (strand == STRAND_REVERSE)
973 ++reverseAlt;
974 }
975 }
976
977 return make_pair(make_pair(forwardRef, forwardAlt), make_pair(reverseRef, reverseAlt));
978
979 }
980
readSeq(void)981 string Allele::readSeq(void) {
982 string r;
983 for (vector<Allele>::iterator a = alignmentAlleles->begin(); a != alignmentAlleles->end(); ++a) {
984 r.append(a->alternateSequence);
985 }
986 return r;
987 }
988
read5p(void)989 string Allele::read5p(void) {
990 string r;
991 vector<Allele>::const_reverse_iterator a = alignmentAlleles->rbegin();
992 while (&*a != this) {
993 ++a;
994 }
995 if ((a+1) != alignmentAlleles->rend()) ++a;
996 while (a != alignmentAlleles->rend()) {
997 r = a->alternateSequence + r;
998 ++a;
999 }
1000 r.append(alternateSequence);
1001 return r;
1002 }
1003
read3p(void)1004 string Allele::read3p(void) {
1005 string r = alternateSequence;
1006 vector<Allele>::const_iterator a = alignmentAlleles->begin();
1007 while (&*a != this) {
1008 ++a;
1009 }
1010 if ((a+1) != alignmentAlleles->end()) ++a;
1011 while (a != alignmentAlleles->end()) {
1012 r.append(a->alternateSequence);
1013 ++a;
1014 }
1015 return r;
1016 }
1017
read5pNonNull(void)1018 string Allele::read5pNonNull(void) {
1019 string r = alternateSequence;
1020 vector<Allele>::const_reverse_iterator a = alignmentAlleles->rbegin();
1021 while (&*a != this) {
1022 ++a;
1023 }
1024 while (a != alignmentAlleles->rend() && !a->isNull()) {
1025 if (&*a != this) {
1026 r = a->alternateSequence + r;
1027 }
1028 ++a;
1029 }
1030 return r;
1031 }
1032
read3pNonNull(void)1033 string Allele::read3pNonNull(void) {
1034 string r = alternateSequence;
1035 vector<Allele>::const_iterator a = alignmentAlleles->begin();
1036 while (&*a != this) {
1037 ++a;
1038 }
1039 while (a != alignmentAlleles->end() && !a->isNull()) {
1040 if (&*a != this) {
1041 r.append(a->alternateSequence);
1042 }
1043 ++a;
1044 }
1045 return r;
1046 }
1047
read5pNonNullBases(void)1048 int Allele::read5pNonNullBases(void) {
1049 int bp = 0;
1050 vector<Allele>::const_reverse_iterator a = alignmentAlleles->rbegin();
1051 while (&*a != this) {
1052 ++a;
1053 }
1054 while (a != alignmentAlleles->rend() && !a->isNull()) {
1055 if (&*a != this) {
1056 //cerr << "5p bp = " << bp << " adding " << stringForAllele(*a) << " to " << stringForAllele(*this) << endl;
1057 bp += a->alternateSequence.size();
1058 }
1059 ++a;
1060 }
1061 return bp;
1062 }
1063
read3pNonNullBases(void)1064 int Allele::read3pNonNullBases(void) {
1065 int bp = 0;
1066 vector<Allele>::const_iterator a = alignmentAlleles->begin();
1067 while (&*a != this) {
1068 ++a;
1069 }
1070 while (a != alignmentAlleles->end() && !a->isNull()) {
1071 if (&*a != this) {
1072 //cerr << "3p bp = " << bp << " adding " << stringForAllele(*a) << " to " << stringForAllele(*this) << endl;
1073 bp += a->alternateSequence.size();
1074 }
1075 ++a;
1076 }
1077 return bp;
1078 }
1079
1080 // adjusts the allele to have a new start
1081 // returns the ref/alt sequence obtained by subtracting length from the left end of the allele
subtract(int subtractFromRefStart,int subtractFromRefEnd,string & substart,string & subend,vector<pair<int,string>> & cigarStart,vector<pair<int,string>> & cigarEnd,vector<short> & qsubstart,vector<short> & qsubend)1082 void Allele::subtract(
1083 int subtractFromRefStart,
1084 int subtractFromRefEnd,
1085 string& substart,
1086 string& subend,
1087 vector<pair<int, string> >& cigarStart,
1088 vector<pair<int, string> >& cigarEnd,
1089 vector<short>& qsubstart,
1090 vector<short>& qsubend
1091 ) {
1092
1093 substart.clear();
1094 subend.clear();
1095 cigarStart.clear();
1096 cigarEnd.clear();
1097 qsubstart.clear();
1098 qsubend.clear();
1099
1100 // prepare to adjust cigar
1101 list<pair<int, string> > cigarL = splitCigarList(cigar);
1102
1103 // walk the cigar string to determine where to make the left cut in the alternate sequence
1104 int subtractFromAltStart = 0;
1105 if (subtractFromRefStart) {
1106 int refbpstart = subtractFromRefStart;
1107 pair<int, string> c;
1108 while (!cigarL.empty()) {
1109 c = cigarL.front();
1110 cigarL.pop_front();
1111 char op = c.second[0];
1112 switch (op) {
1113 case 'M':
1114 case 'X':
1115 case 'N':
1116 refbpstart -= c.first;
1117 subtractFromAltStart += c.first;
1118 break;
1119 case 'I':
1120 subtractFromAltStart += c.first;
1121 break;
1122 case 'D':
1123 refbpstart -= c.first;
1124 break;
1125 default:
1126 break;
1127 }
1128
1129 cigarStart.push_back(c);
1130
1131 if (refbpstart < 0) {
1132 // split/adjust the last cigar element
1133 cigarL.push_front(c);
1134 cigarL.front().first = -refbpstart;
1135 cigarStart.back().first += refbpstart;
1136 switch (op) {
1137 case 'M':
1138 case 'X':
1139 case 'N':
1140 case 'I':
1141 subtractFromAltStart += refbpstart;
1142 break;
1143 case 'D':
1144 default:
1145 break;
1146 }
1147 break; // we're done
1148 }
1149 }
1150 }
1151
1152
1153 int subtractFromAltEnd = 0;
1154 // walk the cigar string to determine where to make the right cut in the alternate sequence
1155 if (subtractFromRefEnd) {
1156 int refbpend = subtractFromRefEnd;
1157 pair<int, string> c;
1158 while (!cigarL.empty() && refbpend > 0) {
1159 c = cigarL.back();
1160 cigarL.pop_back();
1161 char op = c.second[0];
1162 switch (op) {
1163 case 'M':
1164 case 'X':
1165 case 'N':
1166 subtractFromAltEnd += c.first;
1167 refbpend -= c.first;
1168 break;
1169 case 'I':
1170 subtractFromAltEnd += c.first;
1171 break;
1172 case 'D':
1173 refbpend -= c.first;
1174 break;
1175 default:
1176 break;
1177 }
1178
1179 cigarEnd.insert(cigarEnd.begin(), c);
1180
1181 if (refbpend < 0) {
1182 // split/adjust the last cigar element
1183 cigarL.push_back(c);
1184 cigarL.back().first = -refbpend;
1185 cigarEnd.front().first += refbpend;
1186 switch (op) {
1187 case 'M':
1188 case 'X':
1189 case 'I':
1190 case 'N':
1191 subtractFromAltEnd += refbpend;
1192 break;
1193 case 'D':
1194 default:
1195 break;
1196 }
1197 break; // drop out of loop, we're done
1198 }
1199 }
1200 }
1201
1202 // adjust the alternateSequence
1203 substart = alternateSequence.substr(0, subtractFromAltStart);
1204 subend = alternateSequence.substr(alternateSequence.size() - subtractFromAltEnd, subtractFromAltEnd);
1205 alternateSequence.erase(0, subtractFromAltStart);
1206 alternateSequence.erase(alternateSequence.size() - subtractFromAltEnd, subtractFromAltEnd);
1207
1208 // adjust the quality string
1209 qsubstart.insert(qsubstart.begin(), baseQualities.begin(), baseQualities.begin() + subtractFromAltStart);
1210 qsubend.insert(qsubend.begin(), baseQualities.begin() + baseQualities.size() - subtractFromAltEnd, baseQualities.end());
1211 baseQualities.erase(baseQualities.begin(), baseQualities.begin() + subtractFromAltStart);
1212 baseQualities.erase(baseQualities.begin() + baseQualities.size() - subtractFromAltEnd, baseQualities.end());
1213
1214 // reset the cigar
1215 cigarL.erase(remove_if(cigarL.begin(), cigarL.end(), isEmptyCigarElement), cigarL.end());
1216 cigar = joinCigarList(cigarL);
1217
1218 // reset the length
1219 length = alternateSequence.size();
1220
1221 // update the type specification
1222 updateTypeAndLengthFromCigar();
1223
1224 // adjust the position
1225 position += subtractFromRefStart; // assumes the first-base of the alleles is reference==, not ins
1226
1227 //referenceLength -= subtractFromRefStart;
1228 //referenceLength -= subtractFromRefEnd;
1229
1230 referenceLength = referenceLengthFromCigar();
1231
1232 }
1233
subtractFromStart(int bp,string & seq,vector<pair<int,string>> & cig,vector<short> & quals)1234 void Allele::subtractFromStart(int bp, string& seq, vector<pair<int, string> >& cig, vector<short>& quals) {
1235 string emptystr;
1236 vector<pair<int, string> > emptycigar;
1237 vector<short> emptyquals;
1238 subtract(bp, 0, seq, emptystr, cig, emptycigar, quals, emptyquals);
1239 }
1240
subtractFromEnd(int bp,string & seq,vector<pair<int,string>> & cig,vector<short> & quals)1241 void Allele::subtractFromEnd(int bp, string& seq, vector<pair<int, string> >& cig, vector<short>& quals) {
1242 string emptystr;
1243 vector<pair<int, string> > emptycigar;
1244 vector<short> emptyquals;
1245 subtract(0, bp, emptystr, seq, emptycigar, cig, emptyquals, quals);
1246 }
1247
addToStart(string & seq,vector<pair<int,string>> & cig,vector<short> & quals)1248 void Allele::addToStart(string& seq, vector<pair<int, string> >& cig, vector<short>& quals) {
1249 string emptystr;
1250 vector<pair<int, string> > emptycigar;
1251 vector<short> emptyquals;
1252 add(seq, emptystr, cig, emptycigar, quals, emptyquals);
1253 }
1254
addToEnd(string & seq,vector<pair<int,string>> & cig,vector<short> & quals)1255 void Allele::addToEnd(string& seq, vector<pair<int, string> >& cig, vector<short>& quals) {
1256 string emptystr;
1257 vector<pair<int, string> > emptycigar;
1258 vector<short> emptyquals;
1259 add(emptystr, seq, emptycigar, cig, emptyquals, quals);
1260 }
1261
add(string & addToStart,string & addToEnd,vector<pair<int,string>> & cigarStart,vector<pair<int,string>> & cigarEnd,vector<short> & qaddToStart,vector<short> & qaddToEnd)1262 void Allele::add(
1263 string& addToStart,
1264 string& addToEnd,
1265 vector<pair<int, string> >& cigarStart,
1266 vector<pair<int, string> >& cigarEnd,
1267 vector<short>& qaddToStart,
1268 vector<short>& qaddToEnd
1269 ) {
1270
1271 // adjust the position
1272 for (vector<pair<int, string> >::iterator c = cigarStart.begin(); c != cigarStart.end(); ++c) {
1273 switch (c->second[0]) {
1274 case 'M':
1275 case 'X':
1276 case 'D':
1277 case 'N':
1278 position -= c->first;
1279 break;
1280 case 'I':
1281 default:
1282 break;
1283 }
1284 }
1285
1286 // prepare to adjust cigar
1287 vector<pair<int, string> > cigarV = splitCigar(cigar);
1288
1289 // adjust the cigar
1290 if (!cigarStart.empty()) {
1291 if (cigarStart.back().second == cigarV.front().second) {
1292 // merge
1293 cigarV.front().first += cigarStart.back().first;
1294 cigarStart.pop_back();
1295 }
1296 }
1297 cigarV.insert(cigarV.begin(), cigarStart.begin(), cigarStart.end());
1298
1299 if (!cigarEnd.empty()) {
1300 if (cigarEnd.front().second == cigarV.back().second) {
1301 // merge
1302 cigarV.back().first += cigarEnd.front().first;
1303 cigarEnd.pop_back();
1304 } else {
1305 cigarV.insert(cigarV.end(), cigarEnd.begin(), cigarEnd.end());
1306 }
1307 }
1308
1309 // adjust the alternateSequence
1310 alternateSequence.insert(0, addToStart);
1311 alternateSequence.append(addToEnd);
1312
1313 // adjust the quality string
1314 baseQualities.insert(baseQualities.begin(), qaddToStart.begin(), qaddToStart.end());
1315 baseQualities.insert(baseQualities.end(), qaddToEnd.begin(), qaddToEnd.end());
1316
1317 // reset the cigar
1318 cigarV.erase(remove_if(cigarV.begin(), cigarV.end(), isEmptyCigarElement), cigarV.end());
1319 cigar = joinCigar(cigarV);
1320
1321 updateTypeAndLengthFromCigar();
1322
1323 // reset referenceLength
1324 referenceLength = referenceLengthFromCigar();
1325
1326 }
1327
updateTypeAndLengthFromCigar(void)1328 void Allele::updateTypeAndLengthFromCigar(void) {
1329
1330 vector<pair<int, string> > cigarV = splitCigar(cigar);
1331
1332 map<char, int> cigarTypes;
1333 map<char, int> cigarLengths;
1334 for (vector<pair<int, string> >::iterator c = cigarV.begin(); c != cigarV.end(); ++c) {
1335 ++cigarTypes[c->second[0]];
1336 cigarLengths[c->second[0]] += c->first;
1337 }
1338 if (cigarTypes.size() == 1) {
1339 switch (cigarTypes.begin()->first) {
1340 case 'M':
1341 type = ALLELE_REFERENCE;
1342 break;
1343 case 'I':
1344 type = ALLELE_INSERTION;
1345 break;
1346 case 'D':
1347 type = ALLELE_DELETION;
1348 break;
1349 case 'X':
1350 if (cigarLengths['X'] > 1) {
1351 type = ALLELE_MNP;
1352 } else {
1353 type = ALLELE_SNP;
1354 }
1355 break;
1356 case 'N':
1357 type = ALLELE_NULL;
1358 break;
1359 default:
1360 break;
1361 }
1362 } else if (cigarTypes.size() == 2) {
1363 if (cigarTypes['M'] > 0) {
1364 if (cigarTypes['I'] == 1) {
1365 type = ALLELE_INSERTION;
1366 } else if (cigarTypes['D'] == 1) {
1367 type = ALLELE_DELETION;
1368 } else if (cigarTypes['X'] == 1) {
1369 if (cigarLengths['X'] > 1) {
1370 type = ALLELE_MNP;
1371 } else {
1372 type = ALLELE_SNP;
1373 }
1374 } else {
1375 type = ALLELE_COMPLEX;
1376 }
1377 } else {
1378 type = ALLELE_COMPLEX;
1379 }
1380 } else {
1381 type = ALLELE_COMPLEX;
1382 }
1383
1384 // recalculate allele length and quality, based on type
1385 switch (type) {
1386 case ALLELE_REFERENCE:
1387 length = alternateSequence.size();
1388 break;
1389 case ALLELE_SNP:
1390 case ALLELE_MNP:
1391 length = cigarLengths['X'];
1392 break;
1393 case ALLELE_INSERTION:
1394 length = cigarLengths['I'];
1395 break;
1396 case ALLELE_DELETION:
1397 length = cigarLengths['D'];
1398 break;
1399 case ALLELE_COMPLEX:
1400 length = alternateSequence.size();
1401 break;
1402 case ALLELE_NULL:
1403 length = alternateSequence.size();
1404 break;
1405 default:
1406 break;
1407 }
1408
1409 }
1410
referenceLengthFromCigar(string & cigar)1411 int referenceLengthFromCigar(string& cigar) {
1412 int r = 0;
1413 vector<pair<int, string> > cigarV = splitCigar(cigar);
1414 for (vector<pair<int, string> >::iterator c = cigarV.begin(); c != cigarV.end(); ++c) {
1415 switch (c->second[0]) {
1416 case 'M':
1417 case 'X':
1418 case 'D':
1419 r += c->first;
1420 break;
1421 case 'N':
1422 case 'I':
1423 default:
1424 break;
1425 }
1426 }
1427 return r;
1428 }
1429
referenceLengthFromCigar(void)1430 int Allele::referenceLengthFromCigar(void) {
1431 int r = 0;
1432 vector<pair<int, string> > cigarV = splitCigar(cigar);
1433 for (vector<pair<int, string> >::iterator c = cigarV.begin(); c != cigarV.end(); ++c) {
1434 switch (c->second[0]) {
1435 case 'M':
1436 case 'X':
1437 case 'D':
1438 r += c->first;
1439 break;
1440 case 'N':
1441 case 'I':
1442 default:
1443 break;
1444 }
1445 }
1446 return r;
1447 }
1448
1449
1450 // combines the two alleles into a complex variant, updates important data
mergeAllele(const Allele & newAllele,AlleleType newType)1451 void Allele::mergeAllele(const Allele& newAllele, AlleleType newType) {
1452 //cerr << stringForAllele(*this) << endl << stringForAllele(newAllele) << endl;
1453 type = newType;
1454 alternateSequence += newAllele.alternateSequence;
1455 length += newAllele.length; // hmmm
1456 basesRight = newAllele.basesRight;
1457 baseQualities.insert(baseQualities.end(), newAllele.baseQualities.begin(), newAllele.baseQualities.end());
1458 currentBase = base();
1459 quality = averageQuality(baseQualities);
1460 lnquality = phred2ln(quality);
1461 basesRight += newAllele.referenceLength;
1462 if (newAllele.type != ALLELE_REFERENCE) {
1463 repeatRightBoundary = newAllele.repeatRightBoundary;
1464 }
1465 cigar = mergeCigar(cigar, newAllele.cigar);
1466 referenceLength = referenceLengthFromCigar();
1467 }
1468
squash(void)1469 void Allele::squash(void) {
1470 // will trigger destruction of this allele in the AlleleParser
1471 length = 0;
1472 position = 0;
1473 }
1474
getLengthOnReference(void)1475 unsigned int Allele::getLengthOnReference(void) {
1476 return referenceLengthFromCigar();
1477 }
1478
alleleUnion(vector<Allele> & a1,vector<Allele> & a2)1479 vector<Allele> alleleUnion(vector<Allele>& a1, vector<Allele>& a2) {
1480 map<string, Allele> alleleSet;
1481 vector<Allele> results;
1482 for (vector<Allele>::iterator a = a1.begin(); a != a1.end(); ++a) {
1483 alleleSet.insert(make_pair(a->base(), *a));
1484 }
1485 for (vector<Allele>::iterator a = a2.begin(); a != a2.end(); ++a) {
1486 alleleSet.insert(make_pair(a->base(), *a));
1487 }
1488 for (map<string, Allele>::iterator a = alleleSet.begin(); a != alleleSet.end(); ++a) {
1489 results.push_back(a->second);
1490 }
1491 return results;
1492 }
1493
isEmptyAllele(const Allele & allele)1494 bool isEmptyAllele(const Allele& allele) {
1495 return allele.length == 0;
1496 }
1497
isDividedIndel(const Allele & allele)1498 bool isDividedIndel(const Allele& allele) {
1499 vector<pair<int, string> > cigarV = splitCigar(allele.cigar);
1500 if (cigarV.front().second == "D" || cigarV.front().second == "I") {
1501 return true;
1502 } else {
1503 return false;
1504 }
1505 }
1506
1507 // returns true if this indel is not properly flanked by reference-matching sequence
isUnflankedIndel(const Allele & allele)1508 bool isUnflankedIndel(const Allele& allele) {
1509 if (allele.isReference() || allele.isSNP() || allele.isMNP()) {
1510 return false;
1511 } else {
1512 vector<pair<int, string> > cigarV = splitCigar(allele.cigar);
1513 if (cigarV.back().second == "D"
1514 || cigarV.back().second == "I"
1515 || cigarV.front().second == "D"
1516 || cigarV.front().second == "I") {
1517 return true;
1518 } else {
1519 return false;
1520 }
1521 }
1522 }
1523
isEmptyAlleleOrIsDividedIndel(const Allele & allele)1524 bool isEmptyAlleleOrIsDividedIndel(const Allele& allele) {
1525 return isEmptyAllele(allele) || isDividedIndel(allele);
1526 }
1527