1 /*
2     vcflib C++ library for parsing and manipulating VCF files
3 
4     Copyright © 2010-2020 Erik Garrison
5     Copyright © 2020      Pjotr Prins
6 
7     This software is published under the MIT License. See the LICENSE file.
8 */
9 
10 #include "Variant.h"
11 #include "convert.h"
12 #include "join.h"
13 #include "split.h"
14 #include "Fasta.h"
15 #include <set>
16 #include <vector>
17 #include <getopt.h>
18 #include <cmath>
19 
20 using namespace std;
21 using namespace vcflib;
22 
23 
24 // Attempts to left-realign all the indels represented by the alignment cigar.
25 //
26 // This is done by shifting all indels as far left as they can go without
27 // mismatch, then merging neighboring indels of the same class.  leftAlign
28 // updates the alignment cigar with changes, and returns true if realignment
29 // changed the alignment cigar.
30 //
31 // To left-align, we move multi-base indels left by their own length as long as
32 // the preceding bases match the inserted or deleted sequence.  After this
33 // step, we handle multi-base homopolymer indels by shifting them one base to
34 // the left until they mismatch the reference.
35 //
36 // To merge neighboring indels, we iterate through the set of left-stabilized
37 // indels.  For each indel we add a new cigar element to the new cigar.  If a
38 // deletion follows a deletion, or an insertion occurs at the same place as
39 // another insertion, we merge the events by extending the previous cigar
40 // element.
41 //
42 // In practice, we must call this function until the alignment is stabilized.
43 
44 #define VCFLEFTALIGN_DEBUG(msg) \
45     if (false) { cerr << msg; }
46 
47 class VCFIndelAllele {
48     friend ostream& operator<<(ostream&, const VCFIndelAllele&);
49     friend bool operator==(const VCFIndelAllele&, const VCFIndelAllele&);
50     friend bool operator!=(const VCFIndelAllele&, const VCFIndelAllele&);
51     friend bool operator<(const VCFIndelAllele&, const VCFIndelAllele&);
52 public:
53     bool insertion;
54     int length;
55     int position;
56     int readPosition;
57     string sequence;
58 
59     bool homopolymer(void);
60 
VCFIndelAllele(bool i,int l,int p,int rp,string s)61     VCFIndelAllele(bool i, int l, int p, int rp, string s)
62         : insertion(i), length(l), position(p), readPosition(rp), sequence(s)
63         { }
64 };
65 
66 bool FBhomopolymer(string sequence);
67 ostream& operator<<(ostream& out, const VCFIndelAllele& indel);
68 bool operator==(const VCFIndelAllele& a, const VCFIndelAllele& b);
69 bool operator!=(const VCFIndelAllele& a, const VCFIndelAllele& b);
70 bool operator<(const VCFIndelAllele& a, const VCFIndelAllele& b);
71 
homopolymer(void)72 bool VCFIndelAllele::homopolymer(void) {
73     string::iterator s = sequence.begin();
74     char c = *s++;
75     while (s != sequence.end()) {
76         if (c != *s++) return false;
77     }
78     return true;
79 }
80 
FBhomopolymer(string sequence)81 bool FBhomopolymer(string sequence) {
82     string::iterator s = sequence.begin();
83     char c = *s++;
84     while (s != sequence.end()) {
85         if (c != *s++) return false;
86     }
87     return true;
88 }
89 
operator <<(ostream & out,const VCFIndelAllele & indel)90 ostream& operator<<(ostream& out, const VCFIndelAllele& indel) {
91     string t = indel.insertion ? "i" : "d";
92     out << t <<  ":" << indel.position << ":" << indel.readPosition << ":" << indel.sequence;
93     return out;
94 }
95 
operator ==(const VCFIndelAllele & a,const VCFIndelAllele & b)96 bool operator==(const VCFIndelAllele& a, const VCFIndelAllele& b) {
97     return (a.insertion == b.insertion
98             && a.length == b.length
99             && a.position == b.position
100             && a.sequence == b.sequence);
101 }
102 
operator !=(const VCFIndelAllele & a,const VCFIndelAllele & b)103 bool operator!=(const VCFIndelAllele& a, const VCFIndelAllele& b) {
104     return !(a==b);
105 }
106 
operator <(const VCFIndelAllele & a,const VCFIndelAllele & b)107 bool operator<(const VCFIndelAllele& a, const VCFIndelAllele& b) {
108     ostringstream as, bs;
109     as << a;
110     bs << b;
111     return as.str() < bs.str();
112 }
113 
114 
115 class AltAlignment {
116 public:
117     unsigned int pos;
118     string seq;
119     vector<pair<int, string> > cigar;
AltAlignment(unsigned int & p,string & s,string & c)120     AltAlignment(unsigned int& p,
121                  string& s,
122                  string& c) {
123         pos = p;
124         seq = s;
125         cigar = splitCigar(c);
126     }
127 };
128 
entropy(const string & st)129 double entropy(const string& st) {
130     vector<char> stvec(st.begin(), st.end());
131     set<char> alphabet(stvec.begin(), stvec.end());
132     vector<double> freqs;
133     for (set<char>::iterator c = alphabet.begin(); c != alphabet.end(); ++c) {
134         int ctr = 0;
135         for (vector<char>::iterator s = stvec.begin(); s != stvec.end(); ++s) {
136             if (*s == *c) {
137                 ++ctr;
138             }
139         }
140         freqs.push_back((double)ctr / (double)stvec.size());
141     }
142     double ent = 0;
143     double ln2 = log(2);
144     for (vector<double>::iterator f = freqs.begin(); f != freqs.end(); ++f) {
145         ent += *f * log(*f)/ln2;
146     }
147     ent = -ent;
148     return ent;
149 }
150 
getAlignment(Variant & var,FastaReference & reference,string & ref,vector<AltAlignment> & alignments,int window)151 void getAlignment(Variant& var, FastaReference& reference, string& ref, vector<AltAlignment>& alignments, int window) {
152 
153     // default alignment params
154     float matchScore = 10.0f;
155     float mismatchScore = -9.0f;
156     float gapOpenPenalty = 25.0f;
157     float gapExtendPenalty = 3.33f;
158 
159     // establish reference sequence
160     string pad = string(window/2, 'Z');
161     string leftFlank = reference.getSubSequence(var.sequenceName, var.zeroBasedPosition() - window/2, window/2);
162     string rightFlank = reference.getSubSequence(var.sequenceName, var.zeroBasedPosition() + var.ref.size(), window/2);
163     ref = pad + leftFlank + var.ref + rightFlank + pad;
164 
165     // and iterate through the alternates, generating alignments
166     for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
167         string alt = pad + leftFlank + *a + rightFlank + pad;
168         CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
169         unsigned int referencePos;
170         string cigar;
171         sw.Align(referencePos, cigar, ref, alt);
172         alignments.push_back(AltAlignment(referencePos, alt, cigar));
173     }
174 }
175 
176 
177 bool stablyLeftAlign(string& alternateSequence, string referenceSequence, int maxiterations = 50, bool debug = false);
178 int countMismatches(string& alternateSequence, string referenceSequence);
179 
leftAlign(string & alternateSequence,Cigar & cigar,string & referenceSequence,bool debug=false)180 bool leftAlign(string& alternateSequence, Cigar& cigar, string& referenceSequence, bool debug = false) {
181 
182     int arsOffset = 0; // pointer to insertion point in aligned reference sequence
183     string alignedReferenceSequence = referenceSequence;
184     int aabOffset = 0;
185     string alignmentAlignedBases = alternateSequence;
186 
187     // store information about the indels
188     vector<VCFIndelAllele> indels;
189 
190     int rp = 0;  // read position, 0-based relative to read
191     int sp = 0;  // sequence position
192 
193     string softBegin;
194     string softEnd;
195 
196     stringstream cigar_before, cigar_after;
197     for (vector<pair<int, string> >::const_iterator c = cigar.begin();
198         c != cigar.end(); ++c) {
199         unsigned int l = c->first;
200         char t = c->second.at(0);
201 
202         cigar_before << l << t;
203         if (t == 'M') { // match or mismatch
204             sp += l;
205             rp += l;
206         } else if (t == 'D') { // deletion
207             indels.push_back(VCFIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l)));
208             alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
209             aabOffset += l;
210             sp += l;  // update reference sequence position
211         } else if (t == 'I') { // insertion
212             indels.push_back(VCFIndelAllele(true, l, sp, rp, alternateSequence.substr(rp, l)));
213             alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-'));
214             arsOffset += l;
215             rp += l;
216         } else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
217             // remove these bases from the refseq and read seq, but don't modify the alignment sequence
218             if (rp == 0) {
219                 alignedReferenceSequence = string(l, '*') + alignedReferenceSequence;
220                 softBegin = alignmentAlignedBases.substr(0, l);
221             } else {
222                 alignedReferenceSequence = alignedReferenceSequence + string(l, '*');
223                 softEnd = alignmentAlignedBases.substr(alignmentAlignedBases.size() - l, l);
224             }
225             rp += l;
226         } else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
227         } else if (t == 'N') { // skipped region in the reference not present in read, aka splice
228             sp += l;
229         }
230     }
231 
232 
233     int alignedLength = sp;
234 
235     VCFLEFTALIGN_DEBUG("| " << cigar_before.str() << endl
236        << "| " << alignedReferenceSequence << endl
237        << "| " << alignmentAlignedBases << endl);
238 
239     // if no indels, return the alignment
240     if (indels.empty()) { return false; }
241 
242     // for each indel, from left to right
243     //     while the indel sequence repeated to the left and we're not matched up with the left-previous indel
244     //         move the indel left
245 
246     vector<VCFIndelAllele>::iterator previous = indels.begin();
247     for (vector<VCFIndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {
248 
249         // left shift by repeats
250         //
251         // from 1 base to the length of the indel, attempt to shift left
252         // if the move would cause no change in alignment optimality (no
253         // introduction of mismatches, and by definition no change in gap
254         // length), move to the new position.
255         // in practice this moves the indel left when we reach the size of
256         // the repeat unit.
257         //
258         int steppos, readsteppos;
259         VCFIndelAllele& indel = *id;
260         int i = 1;
261         while (i <= indel.length) {
262 
263             int steppos = indel.position - i;
264             int readsteppos = indel.readPosition - i;
265 
266 #ifdef VERBOSE_DEBUG
267             if (debug) {
268                 if (steppos >= 0 && readsteppos >= 0) {
269                     cerr << referenceSequence.substr(steppos, indel.length) << endl;
270                     cerr << alternateSequence.substr(readsteppos, indel.length) << endl;
271                     cerr << indel.sequence << endl;
272                 }
273             }
274 #endif
275             while (steppos >= 0 && readsteppos >= 0
276                    && indel.sequence == referenceSequence.substr(steppos, indel.length)
277                    && indel.sequence == alternateSequence.substr(readsteppos, indel.length)
278                    && (id == indels.begin()
279                        || (previous->insertion && steppos >= previous->position)
280                        || (!previous->insertion && steppos >= previous->position + previous->length))) {
281                 VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl);
282                 indel.position -= i;
283                 indel.readPosition -= i;
284                 steppos = indel.position - i;
285                 readsteppos = indel.readPosition - i;
286             }
287             do {
288                 ++i;
289             } while (i <= indel.length && indel.length % i != 0);
290         }
291 
292         // left shift indels with exchangeable flanking sequence
293         //
294         // for example:
295         //
296         //    GTTACGTT           GTTACGTT
297         //    GT-----T   ---->   G-----TT
298         //
299         // GTGTGACGTGT           GTGTGACGTGT
300         // GTGTG-----T   ---->   GTG-----TGT
301         //
302         // GTGTG-----T           GTG-----TGT
303         // GTGTGACGTGT   ---->   GTGTGACGTGT
304         //
305         //
306         steppos = indel.position - 1;
307         readsteppos = indel.readPosition - 1;
308         while (steppos >= 0 && readsteppos >= 0
309                && alternateSequence.at(readsteppos) == referenceSequence.at(steppos)
310                && alternateSequence.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
311                && (id == indels.begin()
312                    || (previous->insertion && indel.position - 1 >= previous->position)
313                    || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
314             VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl);
315             indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
316             indel.position -= 1;
317             indel.readPosition -= 1;
318             steppos = indel.position - 1;
319             readsteppos = indel.readPosition - 1;
320         }
321         // tracks previous indel, so we don't run into it with the next shift
322         previous = id;
323     }
324 
325     // bring together floating indels
326     // from left to right
327     // check if we could merge with the next indel
328     // if so, adjust so that we will merge in the next step
329     if (indels.size() > 1) {
330         previous = indels.begin();
331         for (vector<VCFIndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
332             VCFIndelAllele& indel = *id;
333             // parsimony: could we shift right and merge with the previous indel?
334             // if so, do it
335             int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
336             int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
337             if (previous->insertion == indel.insertion
338                     && ((previous->insertion
339                         && (previous->position < indel.position
340                         && previous->readPosition + previous->readPosition < indel.readPosition))
341                         ||
342                         (!previous->insertion
343                         && (previous->position + previous->length < indel.position)
344                         && (previous->readPosition < indel.readPosition)
345                         ))) {
346                 if (previous->homopolymer()) {
347                     string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
348                     string readseq = alternateSequence.substr(prev_end_read, indel.position - prev_end_ref);
349                     VCFLEFTALIGN_DEBUG("seq: " << seq << endl << "readseq: " << readseq << endl);
350                     if (previous->sequence.at(0) == seq.at(0)
351                             && FBhomopolymer(seq)
352                             && FBhomopolymer(readseq)) {
353                         VCFLEFTALIGN_DEBUG("moving " << *previous << " right to "
354                                 << (indel.insertion ? indel.position : indel.position - previous->length) << endl);
355                         previous->position = indel.insertion ? indel.position : indel.position - previous->length;
356                     }
357                 }
358                 else {
359                     int pos = previous->position;
360                     while (pos < (int) referenceSequence.length() &&
361                             ((previous->insertion && pos + previous->length <= indel.position)
362                             ||
363                             (!previous->insertion && pos + previous->length < indel.position))
364                             && previous->sequence
365                                 == referenceSequence.substr(pos + previous->length, previous->length)) {
366                         pos += previous->length;
367                     }
368                     if (pos < previous->position &&
369                         ((previous->insertion && pos + previous->length == indel.position)
370                         ||
371                         (!previous->insertion && pos == indel.position - previous->length))
372                        ) {
373                         VCFLEFTALIGN_DEBUG("right-merging tandem repeat: moving " << *previous << " right to " << pos << endl);
374                         previous->position = pos;
375                     }
376                 }
377             }
378             previous = id;
379         }
380     }
381 
382     // for each indel
383     //     if ( we're matched up to the previous insertion (or deletion)
384     //          and it's also an insertion or deletion )
385     //         merge the indels
386     //
387     // and simultaneously reconstruct the cigar
388 
389     Cigar newCigar;
390 
391     if (!softBegin.empty()) {
392         newCigar.push_back(make_pair(softBegin.size(), "S"));
393     }
394 
395     vector<VCFIndelAllele>::iterator id = indels.begin();
396     VCFIndelAllele last = *id++;
397     if (last.position > 0) {
398         newCigar.push_back(make_pair(last.position, "M"));
399         newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
400     } else {
401         newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
402     }
403     int lastend = last.insertion ? last.position : (last.position + last.length);
404     VCFLEFTALIGN_DEBUG(last << ",");
405 
406     for (; id != indels.end(); ++id) {
407         VCFIndelAllele& indel = *id;
408         VCFLEFTALIGN_DEBUG(indel << ",");
409         if (indel.position < lastend) {
410             cerr << "impossibility?: indel realigned left of another indel" << endl
411                  << referenceSequence << endl << alternateSequence << endl;
412             exit(1);
413         } else if (indel.position == lastend && indel.insertion == last.insertion) {
414             pair<int, string>& op = newCigar.back();
415             op.first += indel.length;
416         } else if (indel.position >= lastend) {  // also catches differential indels, but with the same position
417             newCigar.push_back(make_pair(indel.position - lastend, "M"));
418             newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
419         }
420         last = *id;
421         lastend = last.insertion ? last.position : (last.position + last.length);
422     }
423 
424     if (lastend < alignedLength) {
425         newCigar.push_back(make_pair(alignedLength - lastend, "M"));
426     }
427 
428     if (!softEnd.empty()) {
429         newCigar.push_back(make_pair(softEnd.size(), "S"));
430     }
431 
432     VCFLEFTALIGN_DEBUG(endl);
433 
434     cigar = newCigar;
435 
436     for (vector<pair<int, string> >::const_iterator c = cigar.begin();
437         c != cigar.end(); ++c) {
438         unsigned int l = c->first;
439         char t = c->second.at(0);
440         cigar_after << l << t;
441     }
442 
443     //cerr << cigar_before.str() << " changes to " << cigar_after.str() << endl;
444     VCFLEFTALIGN_DEBUG(cigar_after.str() << endl);
445 
446     // check if we're realigned
447     if (cigar_after.str() == cigar_before.str()) {
448         return false;
449     } else {
450         return true;
451     }
452 
453 }
454 
455 // Iteratively left-aligns the indels in the alignment until we have a stable
456 // realignment.  Returns true on realignment success or non-realignment.
457 // Returns false if we exceed the maximum number of realignment iterations.
458 //
stablyLeftAlign(string & alternateSequence,string referenceSequence,Cigar & cigar,int maxiterations,bool debug)459 bool stablyLeftAlign(string& alternateSequence, string referenceSequence, Cigar& cigar, int maxiterations, bool debug) {
460 
461     if (!leftAlign(alternateSequence, cigar, referenceSequence, debug)) {
462 
463         return true;
464 
465     } else {
466 
467         bool result = true;
468         while ((result = leftAlign(alternateSequence, cigar, referenceSequence, debug)) && --maxiterations > 0) {
469         }
470 
471         if (maxiterations <= 0) {
472             return false;
473         } else {
474             return true;
475         }
476 
477     }
478 
479 }
480 
481 
printSummary(char ** argv)482 void printSummary(char** argv) {
483       cerr << R"(
484 
485 Left-align indels and complex variants in the input using a pairwise
486 ref/alt alignment followed by a heuristic, iterative left realignment
487 process that shifts indel representations to their absolute leftmost
488 (5') extent.
489 
490 This is the same procedure used in the internal left alignment in
491 freebayes, and can be used when preparing VCF files for input to
492 freebayes to decrease positional representation differences between
493 the input alleles and left-realigned alignments.
494 
495 usage: vcfleftalign [options] [file]
496 
497 options:
498 
499         -r, --reference FILE  Use this reference as a basis for realignment.
500         -w, --window N        Use a window of this many bp when left aligning (150).
501 
502 Left-aligns variants in the specified input file or stdin.  Window
503 size is determined dynamically according to the entropy of the regions
504 flanking the indel.  These must have entropy > 1 bit/bp, or be shorter
505 than ~5kb.
506 
507 )";
508     cerr << endl << "Type: transformation" << endl << endl;
509     exit(0);
510 }
511 
main(int argc,char ** argv)512 int main(int argc, char** argv) {
513 
514     int window = 150;
515     VariantCallFile variantFile;
516     string fastaFileName;
517 
518     int c;
519     while (true) {
520         static struct option long_options[] =
521             {
522                 /* These options set a flag. */
523                 //{"verbose", no_argument,       &verbose_flag, 1},
524                 {"help", no_argument, 0, 'h'},
525                 {"reference", required_argument, 0, 'r'},
526                 {"window", required_argument, 0, 'w'},
527                 {0, 0, 0, 0}
528             };
529         /* getopt_long stores the option index here. */
530         int option_index = 0;
531 
532         c = getopt_long (argc, argv, "hw:r:",
533                          long_options, &option_index);
534 
535         if (c == -1)
536             break;
537 
538         switch (c) {
539 
540 	    case 'r':
541             fastaFileName = optarg;
542             break;
543 
544 	    case 'w':
545             window = atoi(optarg);
546             break;
547 
548         case '?':
549             printSummary(argv);
550             exit(1);
551             break;
552 
553         case 'h':
554             printSummary(argv);
555             break;
556 
557         default:
558             abort ();
559         }
560     }
561 
562     if (optind < argc) {
563         string filename = argv[optind];
564         variantFile.open(filename);
565     } else {
566         variantFile.open(std::cin);
567     }
568 
569     if (!variantFile.is_open()) {
570         cerr << "could not open VCF file" << endl;
571         exit(1);
572     }
573 
574     FastaReference fastaReference;
575     if (fastaFileName.empty()) {
576         cerr << "a reference is required" << endl;
577         exit(1);
578     } else {
579         fastaReference.open(fastaFileName);
580     }
581 
582     /*
583     variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">");
584     variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">");
585     if (!parseFlag.empty()) {
586         variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">");
587     }
588     */
589     cout << variantFile.header << endl;
590 
591     Variant var(variantFile);
592     while (variantFile.getNextVariant(var)) {
593 
594         // if there is no indel, there is nothing to realign
595         bool hasIndel = false;
596         for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
597             if (a->size() != var.ref.size()) {
598                 hasIndel = true;
599                 break;
600             }
601         }
602         if (!hasIndel) {
603             cout << var << endl;
604             continue;
605         }
606 
607         vector<AltAlignment> alignments;
608         string ref;
609 
610         // determine window size to prevent mismapping with SW algorithm
611         int currentWindow = window;
612         int scale = 2;
613         if (var.ref.size()*scale > currentWindow) currentWindow = var.ref.size()*scale;
614         for (vector<string>::iterator a = var.alleles.begin(); a != var.alleles.end(); ++a) {
615             if (a->size()*scale > currentWindow) {
616                 currentWindow = a->size()*scale;
617             }
618         }
619 
620         // while the entropy of either flank is < some target entropy (~1 is fine), increase the flank sizes
621         while (currentWindow < 2000) { // limit to one step > than this
622             string refTarget = fastaReference.getSubSequence(var.sequenceName, var.position - 1 - currentWindow/2, currentWindow);
623             if (entropy(refTarget.substr(0, refTarget.size()/2)) < 1 ||
624                 entropy(refTarget.substr(refTarget.size()/2)) < 1) {
625                 currentWindow *= scale;
626             } else {
627                 break;
628             }
629         }
630 
631         // do the alignments
632         getAlignment(var, fastaReference, ref, alignments, currentWindow);
633 
634         // stably left align the alignments
635         for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a) {
636             Cigar cigarBefore = a->cigar;
637             //cerr << a->seq << endl;
638             //cerr << "before : " << a->pos << " " << joinCigar(a->cigar) << endl;
639             long int prev = a->pos;
640             stablyLeftAlign(a->seq, ref, a->cigar, 20, false);
641             //cerr << "after  : " << a->pos << " " << joinCigar(a->cigar) << endl;
642             if (a->pos != prev) cerr << "modified alignment @ " << var << endl;
643         }
644         //cout << var << endl;
645 
646         // transform the mappings
647         // chop off leading matching bases
648         // find the range of bp in the alleles
649         // make the new ref allele
650         // make the new alt alleles
651         // emit the var
652 
653         long int newPosition = var.position+currentWindow/2;
654         long int newEndPosition = var.position-currentWindow/2;
655         // check for no-indel case
656         int newLength = var.ref.size();
657         bool giveUp = false;
658         for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end() && !giveUp; ++a) {
659             // get the first mismatching position
660             Cigar::iterator c = a->cigar.begin();
661 
662             int rp = 0;
663             int sp = 0;
664             bool hitMismatch = false;
665 
666             int matchingBpAtStart = 0;
667             int matchingBpAtEnd = 0;
668             // will be set to true if the first reference position match is broken by a SNP, not an indel
669             bool leadingSNP = false;
670 
671             while (c != a->cigar.end()) {
672                 char op = c->second[0];
673                 if (c == a->cigar.begin()) {
674                     if (op != 'M') {
675                         cerr << "alignment does not start on matched sequence" << endl;
676                         cerr << var << endl;
677                         exit(1);
678                     }
679                     int i = 0;
680                     for ( ; i < c->first; ++i) {
681                         if (ref[i] != a->seq[i]) {
682                             leadingSNP = true;
683                             break;
684                         }
685                     }
686                     matchingBpAtStart = i;
687                 }
688                 if (!leadingSNP && c == (a->cigar.begin()+1)) {
689                     // if the first thing we run into is an indel, step back, per VCF spec
690                     if (op == 'D' || op == 'I') {
691                         --matchingBpAtStart;
692                     }
693                 }
694                 if (c == (a->cigar.end()-1)) {
695                     if (op != 'M') {
696                         // soft clip at end
697                         // it'll be hard to interpret this
698                         // the alignments sometimes generate this
699                         // best thing to do is to move on
700                         //cerr << "alignment does not end on matched sequence" << endl;
701                         //cout << var << endl;
702                         //exit(1);
703                         giveUp = true;
704                         break;
705                     }
706                     int i = 0;
707                     for ( ; i < c->first; ++i) {
708                         if (ref[ref.size()-1-i] != a->seq[a->seq.size()-1-i]) {
709                             break;
710                         }
711                     }
712                     matchingBpAtEnd = i;
713                 }
714                 ++c;
715             }
716 
717             int altMismatchLength = a->seq.size() - matchingBpAtEnd - matchingBpAtStart;
718             int refMismatchLength = (var.ref.size() + currentWindow) - matchingBpAtEnd - matchingBpAtStart;
719             //cerr << "alt mismatch length " << altMismatchLength << endl
720             //     << "ref mismatch length " << refMismatchLength << endl;
721             long int newStart = var.position - currentWindow/2 + matchingBpAtStart;
722             long int newEnd = newStart + refMismatchLength;
723             //cerr << "ref should run from " << newStart << " to " << newStart + refMismatchLength << endl;
724             newPosition = min(newStart, newPosition);
725             newEndPosition = max(newEnd, newEndPosition);
726             //cerr << newPosition << " " << newEndPosition << endl;
727             //if (newRefSize < refMismatchLength) newRefSize = refMismatchLength;
728         }
729 
730         // the alignment failed for some reason, continue
731         if (giveUp) {
732             cout << var << endl;
733             continue;
734         }
735 
736         //cerr << "new ref start " << newPosition << " and end " << newEndPosition << " was " << var.position << "," << var.position + var.ref.size() << endl;
737         int newRefSize = newEndPosition - newPosition;
738         string newRef = fastaReference.getSubSequence(var.sequenceName, newPosition-1, newRefSize);
739         // get the number of bp to strip from the alts
740         int stripFromStart = currentWindow/2 - (var.position - newPosition);
741         int stripFromEnd = (currentWindow + newRefSize) - (stripFromStart + newRefSize) + (var.ref.size() - newRefSize);
742 
743         //cerr << "strip from start " << stripFromStart << endl;
744         //cerr << "strip from end " << stripFromEnd << endl;
745 
746         vector<string> newAlt;
747         vector<string>::iterator l = var.alt.begin();
748         bool failedAlt = false;
749         for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end();
750              ++a, ++l) {
751             int diff = newRef.size() - l->size();
752             string alt = a->seq.substr(stripFromStart, a->seq.size() - (stripFromEnd + stripFromStart));
753             newAlt.push_back(alt);
754             if (alt.empty()) failedAlt = true;
755         }
756 
757         // check the before/after haplotypes
758         bool brokenRealignment = false;
759         if (!newRef.empty() && !failedAlt) {
760             int slop = 50; // 50 extra bp!
761             int haplotypeStart = min(var.position, newPosition) - slop;
762             int haplotypeEnd = max(var.position + var.ref.size(), newPosition + newRef.size()) + slop;
763             string referenceHaplotype = fastaReference.getSubSequence(var.sequenceName, haplotypeStart - 1,
764                                                                       haplotypeEnd - haplotypeStart);
765             vector<string>::iterator o = var.alt.begin();
766             vector<string>::iterator n = newAlt.begin();
767             for ( ; o != var.alt.end() ; ++o, ++n) {
768                 // map the haplotypes
769                 string oldHaplotype = referenceHaplotype;
770                 string newHaplotype = referenceHaplotype;
771                 oldHaplotype.replace(var.position - haplotypeStart, var.ref.size(), *o);
772                 newHaplotype.replace(newPosition - haplotypeStart, newRef.size(), *n);
773                 if (oldHaplotype != newHaplotype) {
774                     cerr << "broken left alignment!" << endl
775                          << "old " << oldHaplotype << endl
776                          << "new " << newHaplotype << endl;
777                     cerr << "was: " << var << endl;
778                     brokenRealignment = true;
779                 }
780             }
781         }
782 
783         // *if* everything is OK, update the variant
784         if (!brokenRealignment && !newRef.empty() && !failedAlt) {
785             var.ref = newRef;
786             var.alt = newAlt;
787             var.position = newPosition;
788         }
789 
790         cout << var << endl;
791 
792         // for each parsedalternate, get the position
793         // build a new vcf record for that position
794         // unless we are already at the position !
795         // take everything which is unique to that allele (records) and append it to the new record
796         // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes
797         // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it
798 
799 
800 
801         //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
802     }
803 
804     return 0;
805 
806 }
807