1 #include "Variant.h"
2 #include "convert.h"
3 #include "join.h"
4 #include "split.h"
5 #include "Fasta.h"
6 #include <set>
7 #include <vector>
8 #include <getopt.h>
9 #include <cmath>
10
11 using namespace std;
12 using namespace vcflib;
13
14
15 // Attempts to left-realign all the indels represented by the alignment cigar.
16 //
17 // This is done by shifting all indels as far left as they can go without
18 // mismatch, then merging neighboring indels of the same class. leftAlign
19 // updates the alignment cigar with changes, and returns true if realignment
20 // changed the alignment cigar.
21 //
22 // To left-align, we move multi-base indels left by their own length as long as
23 // the preceding bases match the inserted or deleted sequence. After this
24 // step, we handle multi-base homopolymer indels by shifting them one base to
25 // the left until they mismatch the reference.
26 //
27 // To merge neighboring indels, we iterate through the set of left-stabilized
28 // indels. For each indel we add a new cigar element to the new cigar. If a
29 // deletion follows a deletion, or an insertion occurs at the same place as
30 // another insertion, we merge the events by extending the previous cigar
31 // element.
32 //
33 // In practice, we must call this function until the alignment is stabilized.
34
35 #define VCFLEFTALIGN_DEBUG(msg) \
36 if (false) { cerr << msg; }
37
38 class VCFIndelAllele {
39 friend ostream& operator<<(ostream&, const VCFIndelAllele&);
40 friend bool operator==(const VCFIndelAllele&, const VCFIndelAllele&);
41 friend bool operator!=(const VCFIndelAllele&, const VCFIndelAllele&);
42 friend bool operator<(const VCFIndelAllele&, const VCFIndelAllele&);
43 public:
44 bool insertion;
45 int length;
46 int position;
47 int readPosition;
48 string sequence;
49
50 bool homopolymer(void);
51
VCFIndelAllele(bool i,int l,int p,int rp,string s)52 VCFIndelAllele(bool i, int l, int p, int rp, string s)
53 : insertion(i), length(l), position(p), readPosition(rp), sequence(s)
54 { }
55 };
56
57 bool FBhomopolymer(string sequence);
58 ostream& operator<<(ostream& out, const VCFIndelAllele& indel);
59 bool operator==(const VCFIndelAllele& a, const VCFIndelAllele& b);
60 bool operator!=(const VCFIndelAllele& a, const VCFIndelAllele& b);
61 bool operator<(const VCFIndelAllele& a, const VCFIndelAllele& b);
62
homopolymer(void)63 bool VCFIndelAllele::homopolymer(void) {
64 string::iterator s = sequence.begin();
65 char c = *s++;
66 while (s != sequence.end()) {
67 if (c != *s++) return false;
68 }
69 return true;
70 }
71
FBhomopolymer(string sequence)72 bool FBhomopolymer(string sequence) {
73 string::iterator s = sequence.begin();
74 char c = *s++;
75 while (s != sequence.end()) {
76 if (c != *s++) return false;
77 }
78 return true;
79 }
80
operator <<(ostream & out,const VCFIndelAllele & indel)81 ostream& operator<<(ostream& out, const VCFIndelAllele& indel) {
82 string t = indel.insertion ? "i" : "d";
83 out << t << ":" << indel.position << ":" << indel.readPosition << ":" << indel.sequence;
84 return out;
85 }
86
operator ==(const VCFIndelAllele & a,const VCFIndelAllele & b)87 bool operator==(const VCFIndelAllele& a, const VCFIndelAllele& b) {
88 return (a.insertion == b.insertion
89 && a.length == b.length
90 && a.position == b.position
91 && a.sequence == b.sequence);
92 }
93
operator !=(const VCFIndelAllele & a,const VCFIndelAllele & b)94 bool operator!=(const VCFIndelAllele& a, const VCFIndelAllele& b) {
95 return !(a==b);
96 }
97
operator <(const VCFIndelAllele & a,const VCFIndelAllele & b)98 bool operator<(const VCFIndelAllele& a, const VCFIndelAllele& b) {
99 ostringstream as, bs;
100 as << a;
101 bs << b;
102 return as.str() < bs.str();
103 }
104
105
106 class AltAlignment {
107 public:
108 unsigned int pos;
109 string seq;
110 vector<pair<int, string> > cigar;
AltAlignment(unsigned int & p,string & s,string & c)111 AltAlignment(unsigned int& p,
112 string& s,
113 string& c) {
114 pos = p;
115 seq = s;
116 cigar = splitCigar(c);
117 }
118 };
119
entropy(const string & st)120 double entropy(const string& st) {
121 vector<char> stvec(st.begin(), st.end());
122 set<char> alphabet(stvec.begin(), stvec.end());
123 vector<double> freqs;
124 for (set<char>::iterator c = alphabet.begin(); c != alphabet.end(); ++c) {
125 int ctr = 0;
126 for (vector<char>::iterator s = stvec.begin(); s != stvec.end(); ++s) {
127 if (*s == *c) {
128 ++ctr;
129 }
130 }
131 freqs.push_back((double)ctr / (double)stvec.size());
132 }
133 double ent = 0;
134 double ln2 = log(2);
135 for (vector<double>::iterator f = freqs.begin(); f != freqs.end(); ++f) {
136 ent += *f * log(*f)/ln2;
137 }
138 ent = -ent;
139 return ent;
140 }
141
getAlignment(Variant & var,FastaReference & reference,string & ref,vector<AltAlignment> & alignments,int window)142 void getAlignment(Variant& var, FastaReference& reference, string& ref, vector<AltAlignment>& alignments, int window) {
143
144 // default alignment params
145 float matchScore = 10.0f;
146 float mismatchScore = -9.0f;
147 float gapOpenPenalty = 25.0f;
148 float gapExtendPenalty = 3.33f;
149
150 // establish reference sequence
151 string pad = string(window/2, 'Z');
152 string leftFlank = reference.getSubSequence(var.sequenceName, var.zeroBasedPosition() - window/2, window/2);
153 string rightFlank = reference.getSubSequence(var.sequenceName, var.zeroBasedPosition() + var.ref.size(), window/2);
154 ref = pad + leftFlank + var.ref + rightFlank + pad;
155
156 // and iterate through the alternates, generating alignments
157 for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
158 string alt = pad + leftFlank + *a + rightFlank + pad;
159 CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
160 unsigned int referencePos;
161 string cigar;
162 sw.Align(referencePos, cigar, ref, alt);
163 alignments.push_back(AltAlignment(referencePos, alt, cigar));
164 }
165 }
166
167
168 bool stablyLeftAlign(string& alternateSequence, string referenceSequence, int maxiterations = 50, bool debug = false);
169 int countMismatches(string& alternateSequence, string referenceSequence);
170
leftAlign(string & alternateSequence,Cigar & cigar,string & referenceSequence,bool debug=false)171 bool leftAlign(string& alternateSequence, Cigar& cigar, string& referenceSequence, bool debug = false) {
172
173 int arsOffset = 0; // pointer to insertion point in aligned reference sequence
174 string alignedReferenceSequence = referenceSequence;
175 int aabOffset = 0;
176 string alignmentAlignedBases = alternateSequence;
177
178 // store information about the indels
179 vector<VCFIndelAllele> indels;
180
181 int rp = 0; // read position, 0-based relative to read
182 int sp = 0; // sequence position
183
184 string softBegin;
185 string softEnd;
186
187 stringstream cigar_before, cigar_after;
188 for (vector<pair<int, string> >::const_iterator c = cigar.begin();
189 c != cigar.end(); ++c) {
190 unsigned int l = c->first;
191 char t = c->second.at(0);
192
193 cigar_before << l << t;
194 if (t == 'M') { // match or mismatch
195 sp += l;
196 rp += l;
197 } else if (t == 'D') { // deletion
198 indels.push_back(VCFIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l)));
199 alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
200 aabOffset += l;
201 sp += l; // update reference sequence position
202 } else if (t == 'I') { // insertion
203 indels.push_back(VCFIndelAllele(true, l, sp, rp, alternateSequence.substr(rp, l)));
204 alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-'));
205 arsOffset += l;
206 rp += l;
207 } else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
208 // remove these bases from the refseq and read seq, but don't modify the alignment sequence
209 if (rp == 0) {
210 alignedReferenceSequence = string(l, '*') + alignedReferenceSequence;
211 softBegin = alignmentAlignedBases.substr(0, l);
212 } else {
213 alignedReferenceSequence = alignedReferenceSequence + string(l, '*');
214 softEnd = alignmentAlignedBases.substr(alignmentAlignedBases.size() - l, l);
215 }
216 rp += l;
217 } else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
218 } else if (t == 'N') { // skipped region in the reference not present in read, aka splice
219 sp += l;
220 }
221 }
222
223
224 int alignedLength = sp;
225
226 VCFLEFTALIGN_DEBUG("| " << cigar_before.str() << endl
227 << "| " << alignedReferenceSequence << endl
228 << "| " << alignmentAlignedBases << endl);
229
230 // if no indels, return the alignment
231 if (indels.empty()) { return false; }
232
233 // for each indel, from left to right
234 // while the indel sequence repeated to the left and we're not matched up with the left-previous indel
235 // move the indel left
236
237 vector<VCFIndelAllele>::iterator previous = indels.begin();
238 for (vector<VCFIndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {
239
240 // left shift by repeats
241 //
242 // from 1 base to the length of the indel, attempt to shift left
243 // if the move would cause no change in alignment optimality (no
244 // introduction of mismatches, and by definition no change in gap
245 // length), move to the new position.
246 // in practice this moves the indel left when we reach the size of
247 // the repeat unit.
248 //
249 int steppos, readsteppos;
250 VCFIndelAllele& indel = *id;
251 int i = 1;
252 while (i <= indel.length) {
253
254 int steppos = indel.position - i;
255 int readsteppos = indel.readPosition - i;
256
257 #ifdef VERBOSE_DEBUG
258 if (debug) {
259 if (steppos >= 0 && readsteppos >= 0) {
260 cerr << referenceSequence.substr(steppos, indel.length) << endl;
261 cerr << alternateSequence.substr(readsteppos, indel.length) << endl;
262 cerr << indel.sequence << endl;
263 }
264 }
265 #endif
266 while (steppos >= 0 && readsteppos >= 0
267 && indel.sequence == referenceSequence.substr(steppos, indel.length)
268 && indel.sequence == alternateSequence.substr(readsteppos, indel.length)
269 && (id == indels.begin()
270 || (previous->insertion && steppos >= previous->position)
271 || (!previous->insertion && steppos >= previous->position + previous->length))) {
272 VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl);
273 indel.position -= i;
274 indel.readPosition -= i;
275 steppos = indel.position - i;
276 readsteppos = indel.readPosition - i;
277 }
278 do {
279 ++i;
280 } while (i <= indel.length && indel.length % i != 0);
281 }
282
283 // left shift indels with exchangeable flanking sequence
284 //
285 // for example:
286 //
287 // GTTACGTT GTTACGTT
288 // GT-----T ----> G-----TT
289 //
290 // GTGTGACGTGT GTGTGACGTGT
291 // GTGTG-----T ----> GTG-----TGT
292 //
293 // GTGTG-----T GTG-----TGT
294 // GTGTGACGTGT ----> GTGTGACGTGT
295 //
296 //
297 steppos = indel.position - 1;
298 readsteppos = indel.readPosition - 1;
299 while (steppos >= 0 && readsteppos >= 0
300 && alternateSequence.at(readsteppos) == referenceSequence.at(steppos)
301 && alternateSequence.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
302 && (id == indels.begin()
303 || (previous->insertion && indel.position - 1 >= previous->position)
304 || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
305 VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl);
306 indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
307 indel.position -= 1;
308 indel.readPosition -= 1;
309 steppos = indel.position - 1;
310 readsteppos = indel.readPosition - 1;
311 }
312 // tracks previous indel, so we don't run into it with the next shift
313 previous = id;
314 }
315
316 // bring together floating indels
317 // from left to right
318 // check if we could merge with the next indel
319 // if so, adjust so that we will merge in the next step
320 if (indels.size() > 1) {
321 previous = indels.begin();
322 for (vector<VCFIndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
323 VCFIndelAllele& indel = *id;
324 // parsimony: could we shift right and merge with the previous indel?
325 // if so, do it
326 int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
327 int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
328 if (previous->insertion == indel.insertion
329 && ((previous->insertion
330 && (previous->position < indel.position
331 && previous->readPosition + previous->readPosition < indel.readPosition))
332 ||
333 (!previous->insertion
334 && (previous->position + previous->length < indel.position)
335 && (previous->readPosition < indel.readPosition)
336 ))) {
337 if (previous->homopolymer()) {
338 string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
339 string readseq = alternateSequence.substr(prev_end_read, indel.position - prev_end_ref);
340 VCFLEFTALIGN_DEBUG("seq: " << seq << endl << "readseq: " << readseq << endl);
341 if (previous->sequence.at(0) == seq.at(0)
342 && FBhomopolymer(seq)
343 && FBhomopolymer(readseq)) {
344 VCFLEFTALIGN_DEBUG("moving " << *previous << " right to "
345 << (indel.insertion ? indel.position : indel.position - previous->length) << endl);
346 previous->position = indel.insertion ? indel.position : indel.position - previous->length;
347 }
348 }
349 else {
350 int pos = previous->position;
351 while (pos < (int) referenceSequence.length() &&
352 ((previous->insertion && pos + previous->length <= indel.position)
353 ||
354 (!previous->insertion && pos + previous->length < indel.position))
355 && previous->sequence
356 == referenceSequence.substr(pos + previous->length, previous->length)) {
357 pos += previous->length;
358 }
359 if (pos < previous->position &&
360 ((previous->insertion && pos + previous->length == indel.position)
361 ||
362 (!previous->insertion && pos == indel.position - previous->length))
363 ) {
364 VCFLEFTALIGN_DEBUG("right-merging tandem repeat: moving " << *previous << " right to " << pos << endl);
365 previous->position = pos;
366 }
367 }
368 }
369 previous = id;
370 }
371 }
372
373 // for each indel
374 // if ( we're matched up to the previous insertion (or deletion)
375 // and it's also an insertion or deletion )
376 // merge the indels
377 //
378 // and simultaneously reconstruct the cigar
379
380 Cigar newCigar;
381
382 if (!softBegin.empty()) {
383 newCigar.push_back(make_pair(softBegin.size(), "S"));
384 }
385
386 vector<VCFIndelAllele>::iterator id = indels.begin();
387 VCFIndelAllele last = *id++;
388 if (last.position > 0) {
389 newCigar.push_back(make_pair(last.position, "M"));
390 newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
391 } else {
392 newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
393 }
394 int lastend = last.insertion ? last.position : (last.position + last.length);
395 VCFLEFTALIGN_DEBUG(last << ",");
396
397 for (; id != indels.end(); ++id) {
398 VCFIndelAllele& indel = *id;
399 VCFLEFTALIGN_DEBUG(indel << ",");
400 if (indel.position < lastend) {
401 cerr << "impossibility?: indel realigned left of another indel" << endl
402 << referenceSequence << endl << alternateSequence << endl;
403 exit(1);
404 } else if (indel.position == lastend && indel.insertion == last.insertion) {
405 pair<int, string>& op = newCigar.back();
406 op.first += indel.length;
407 } else if (indel.position >= lastend) { // also catches differential indels, but with the same position
408 newCigar.push_back(make_pair(indel.position - lastend, "M"));
409 newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
410 }
411 last = *id;
412 lastend = last.insertion ? last.position : (last.position + last.length);
413 }
414
415 if (lastend < alignedLength) {
416 newCigar.push_back(make_pair(alignedLength - lastend, "M"));
417 }
418
419 if (!softEnd.empty()) {
420 newCigar.push_back(make_pair(softEnd.size(), "S"));
421 }
422
423 VCFLEFTALIGN_DEBUG(endl);
424
425 cigar = newCigar;
426
427 for (vector<pair<int, string> >::const_iterator c = cigar.begin();
428 c != cigar.end(); ++c) {
429 unsigned int l = c->first;
430 char t = c->second.at(0);
431 cigar_after << l << t;
432 }
433
434 //cerr << cigar_before.str() << " changes to " << cigar_after.str() << endl;
435 VCFLEFTALIGN_DEBUG(cigar_after.str() << endl);
436
437 // check if we're realigned
438 if (cigar_after.str() == cigar_before.str()) {
439 return false;
440 } else {
441 return true;
442 }
443
444 }
445
446 // Iteratively left-aligns the indels in the alignment until we have a stable
447 // realignment. Returns true on realignment success or non-realignment.
448 // Returns false if we exceed the maximum number of realignment iterations.
449 //
stablyLeftAlign(string & alternateSequence,string referenceSequence,Cigar & cigar,int maxiterations,bool debug)450 bool stablyLeftAlign(string& alternateSequence, string referenceSequence, Cigar& cigar, int maxiterations, bool debug) {
451
452 if (!leftAlign(alternateSequence, cigar, referenceSequence, debug)) {
453
454 return true;
455
456 } else {
457
458 bool result = true;
459 while ((result = leftAlign(alternateSequence, cigar, referenceSequence, debug)) && --maxiterations > 0) {
460 }
461
462 if (maxiterations <= 0) {
463 return false;
464 } else {
465 return true;
466 }
467
468 }
469
470 }
471
472
printSummary(char ** argv)473 void printSummary(char** argv) {
474 cerr << "usage: " << argv[0] << " [options] [file]" << endl
475 << endl
476 << "options:" << endl
477 << " -r, --reference FILE Use this reference as a basis for realignment." << endl
478 << " -w, --window N Use a window of this many bp when left aligning (150)." << endl
479 << endl
480 << "Left-aligns variants in the specified input file or stdin. Window size is determined" << endl
481 << "dynamically according to the entropy of the regions flanking the indel. These must have" << endl
482 << "entropy > 1 bit/bp, or be shorter than ~5kb." << endl;
483 exit(0);
484 }
485
main(int argc,char ** argv)486 int main(int argc, char** argv) {
487
488 int window = 150;
489 VariantCallFile variantFile;
490 string fastaFileName;
491
492 int c;
493 while (true) {
494 static struct option long_options[] =
495 {
496 /* These options set a flag. */
497 //{"verbose", no_argument, &verbose_flag, 1},
498 {"help", no_argument, 0, 'h'},
499 {"reference", required_argument, 0, 'r'},
500 {"window", required_argument, 0, 'w'},
501 {0, 0, 0, 0}
502 };
503 /* getopt_long stores the option index here. */
504 int option_index = 0;
505
506 c = getopt_long (argc, argv, "hw:r:",
507 long_options, &option_index);
508
509 if (c == -1)
510 break;
511
512 switch (c) {
513
514 case 'r':
515 fastaFileName = optarg;
516 break;
517
518 case 'w':
519 window = atoi(optarg);
520 break;
521
522 case '?':
523 printSummary(argv);
524 exit(1);
525 break;
526
527 case 'h':
528 printSummary(argv);
529 break;
530
531 default:
532 abort ();
533 }
534 }
535
536 if (optind < argc) {
537 string filename = argv[optind];
538 variantFile.open(filename);
539 } else {
540 variantFile.open(std::cin);
541 }
542
543 if (!variantFile.is_open()) {
544 cerr << "could not open VCF file" << endl;
545 exit(1);
546 }
547
548 FastaReference fastaReference;
549 if (fastaFileName.empty()) {
550 cerr << "a reference is required" << endl;
551 exit(1);
552 } else {
553 fastaReference.open(fastaFileName);
554 }
555
556 /*
557 variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">");
558 variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">");
559 if (!parseFlag.empty()) {
560 variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">");
561 }
562 */
563 cout << variantFile.header << endl;
564
565 Variant var(variantFile);
566 while (variantFile.getNextVariant(var)) {
567
568 // if there is no indel, there is nothing to realign
569 bool hasIndel = false;
570 for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
571 if (a->size() != var.ref.size()) {
572 hasIndel = true;
573 break;
574 }
575 }
576 if (!hasIndel) {
577 cout << var << endl;
578 continue;
579 }
580
581 vector<AltAlignment> alignments;
582 string ref;
583
584 // determine window size to prevent mismapping with SW algorithm
585 int currentWindow = window;
586 int scale = 2;
587 if (var.ref.size()*scale > currentWindow) currentWindow = var.ref.size()*scale;
588 for (vector<string>::iterator a = var.alleles.begin(); a != var.alleles.end(); ++a) {
589 if (a->size()*scale > currentWindow) {
590 currentWindow = a->size()*scale;
591 }
592 }
593
594 // while the entropy of either flank is < some target entropy (~1 is fine), increase the flank sizes
595 while (currentWindow < 2000) { // limit to one step > than this
596 string refTarget = fastaReference.getSubSequence(var.sequenceName, var.position - 1 - currentWindow/2, currentWindow);
597 if (entropy(refTarget.substr(0, refTarget.size()/2)) < 1 ||
598 entropy(refTarget.substr(refTarget.size()/2)) < 1) {
599 currentWindow *= scale;
600 } else {
601 break;
602 }
603 }
604
605 // do the alignments
606 getAlignment(var, fastaReference, ref, alignments, currentWindow);
607
608 // stably left align the alignments
609 for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a) {
610 Cigar cigarBefore = a->cigar;
611 //cerr << a->seq << endl;
612 //cerr << "before : " << a->pos << " " << joinCigar(a->cigar) << endl;
613 long int prev = a->pos;
614 stablyLeftAlign(a->seq, ref, a->cigar, 20, false);
615 //cerr << "after : " << a->pos << " " << joinCigar(a->cigar) << endl;
616 if (a->pos != prev) cerr << "modified alignment @ " << var << endl;
617 }
618 //cout << var << endl;
619
620 // transform the mappings
621 // chop off leading matching bases
622 // find the range of bp in the alleles
623 // make the new ref allele
624 // make the new alt alleles
625 // emit the var
626
627 long int newPosition = var.position+currentWindow/2;
628 long int newEndPosition = var.position-currentWindow/2;
629 // check for no-indel case
630 int newLength = var.ref.size();
631 bool giveUp = false;
632 for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end() && !giveUp; ++a) {
633 // get the first mismatching position
634 Cigar::iterator c = a->cigar.begin();
635
636 int rp = 0;
637 int sp = 0;
638 bool hitMismatch = false;
639
640 int matchingBpAtStart = 0;
641 int matchingBpAtEnd = 0;
642 // will be set to true if the first reference position match is broken by a SNP, not an indel
643 bool leadingSNP = false;
644
645 while (c != a->cigar.end()) {
646 char op = c->second[0];
647 if (c == a->cigar.begin()) {
648 if (op != 'M') {
649 cerr << "alignment does not start on matched sequence" << endl;
650 cerr << var << endl;
651 exit(1);
652 }
653 int i = 0;
654 for ( ; i < c->first; ++i) {
655 if (ref[i] != a->seq[i]) {
656 leadingSNP = true;
657 break;
658 }
659 }
660 matchingBpAtStart = i;
661 }
662 if (!leadingSNP && c == (a->cigar.begin()+1)) {
663 // if the first thing we run into is an indel, step back, per VCF spec
664 if (op == 'D' || op == 'I') {
665 --matchingBpAtStart;
666 }
667 }
668 if (c == (a->cigar.end()-1)) {
669 if (op != 'M') {
670 // soft clip at end
671 // it'll be hard to interpret this
672 // the alignments sometimes generate this
673 // best thing to do is to move on
674 //cerr << "alignment does not end on matched sequence" << endl;
675 //cout << var << endl;
676 //exit(1);
677 giveUp = true;
678 break;
679 }
680 int i = 0;
681 for ( ; i < c->first; ++i) {
682 if (ref[ref.size()-1-i] != a->seq[a->seq.size()-1-i]) {
683 break;
684 }
685 }
686 matchingBpAtEnd = i;
687 }
688 ++c;
689 }
690
691 int altMismatchLength = a->seq.size() - matchingBpAtEnd - matchingBpAtStart;
692 int refMismatchLength = (var.ref.size() + currentWindow) - matchingBpAtEnd - matchingBpAtStart;
693 //cerr << "alt mismatch length " << altMismatchLength << endl
694 // << "ref mismatch length " << refMismatchLength << endl;
695 long int newStart = var.position - currentWindow/2 + matchingBpAtStart;
696 long int newEnd = newStart + refMismatchLength;
697 //cerr << "ref should run from " << newStart << " to " << newStart + refMismatchLength << endl;
698 newPosition = min(newStart, newPosition);
699 newEndPosition = max(newEnd, newEndPosition);
700 //cerr << newPosition << " " << newEndPosition << endl;
701 //if (newRefSize < refMismatchLength) newRefSize = refMismatchLength;
702 }
703
704 // the alignment failed for some reason, continue
705 if (giveUp) {
706 cout << var << endl;
707 continue;
708 }
709
710 //cerr << "new ref start " << newPosition << " and end " << newEndPosition << " was " << var.position << "," << var.position + var.ref.size() << endl;
711 int newRefSize = newEndPosition - newPosition;
712 string newRef = fastaReference.getSubSequence(var.sequenceName, newPosition-1, newRefSize);
713 // get the number of bp to strip from the alts
714 int stripFromStart = currentWindow/2 - (var.position - newPosition);
715 int stripFromEnd = (currentWindow + newRefSize) - (stripFromStart + newRefSize) + (var.ref.size() - newRefSize);
716
717 //cerr << "strip from start " << stripFromStart << endl;
718 //cerr << "strip from end " << stripFromEnd << endl;
719
720 vector<string> newAlt;
721 vector<string>::iterator l = var.alt.begin();
722 bool failedAlt = false;
723 for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end();
724 ++a, ++l) {
725 int diff = newRef.size() - l->size();
726 string alt = a->seq.substr(stripFromStart, a->seq.size() - (stripFromEnd + stripFromStart));
727 newAlt.push_back(alt);
728 if (alt.empty()) failedAlt = true;
729 }
730
731 // check the before/after haplotypes
732 bool brokenRealignment = false;
733 if (!newRef.empty() && !failedAlt) {
734 int slop = 50; // 50 extra bp!
735 int haplotypeStart = min(var.position, newPosition) - slop;
736 int haplotypeEnd = max(var.position + var.ref.size(), newPosition + newRef.size()) + slop;
737 string referenceHaplotype = fastaReference.getSubSequence(var.sequenceName, haplotypeStart - 1,
738 haplotypeEnd - haplotypeStart);
739 vector<string>::iterator o = var.alt.begin();
740 vector<string>::iterator n = newAlt.begin();
741 for ( ; o != var.alt.end() ; ++o, ++n) {
742 // map the haplotypes
743 string oldHaplotype = referenceHaplotype;
744 string newHaplotype = referenceHaplotype;
745 oldHaplotype.replace(var.position - haplotypeStart, var.ref.size(), *o);
746 newHaplotype.replace(newPosition - haplotypeStart, newRef.size(), *n);
747 if (oldHaplotype != newHaplotype) {
748 cerr << "broken left alignment!" << endl
749 << "old " << oldHaplotype << endl
750 << "new " << newHaplotype << endl;
751 cerr << "was: " << var << endl;
752 brokenRealignment = true;
753 }
754 }
755 }
756
757 // *if* everything is OK, update the variant
758 if (!brokenRealignment && !newRef.empty() && !failedAlt) {
759 var.ref = newRef;
760 var.alt = newAlt;
761 var.position = newPosition;
762 }
763
764 cout << var << endl;
765
766 // for each parsedalternate, get the position
767 // build a new vcf record for that position
768 // unless we are already at the position !
769 // take everything which is unique to that allele (records) and append it to the new record
770 // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes
771 // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it
772
773
774
775 //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
776 }
777
778 return 0;
779
780 }
781
782