1 /*
2 vcflib C++ library for parsing and manipulating VCF files
3
4 Copyright © 2010-2020 Erik Garrison
5 Copyright © 2020 Pjotr Prins
6
7 This software is published under the MIT License. See the LICENSE file.
8 */
9
10 #include "Variant.h"
11 #include "convert.h"
12 #include "join.h"
13 #include "split.h"
14 #include "Fasta.h"
15 #include <set>
16 #include <vector>
17 #include <getopt.h>
18 #include <cmath>
19
20 using namespace std;
21 using namespace vcflib;
22
23
24 // Attempts to left-realign all the indels represented by the alignment cigar.
25 //
26 // This is done by shifting all indels as far left as they can go without
27 // mismatch, then merging neighboring indels of the same class. leftAlign
28 // updates the alignment cigar with changes, and returns true if realignment
29 // changed the alignment cigar.
30 //
31 // To left-align, we move multi-base indels left by their own length as long as
32 // the preceding bases match the inserted or deleted sequence. After this
33 // step, we handle multi-base homopolymer indels by shifting them one base to
34 // the left until they mismatch the reference.
35 //
36 // To merge neighboring indels, we iterate through the set of left-stabilized
37 // indels. For each indel we add a new cigar element to the new cigar. If a
38 // deletion follows a deletion, or an insertion occurs at the same place as
39 // another insertion, we merge the events by extending the previous cigar
40 // element.
41 //
42 // In practice, we must call this function until the alignment is stabilized.
43
44 #define VCFLEFTALIGN_DEBUG(msg) \
45 if (false) { cerr << msg; }
46
47 class VCFIndelAllele {
48 friend ostream& operator<<(ostream&, const VCFIndelAllele&);
49 friend bool operator==(const VCFIndelAllele&, const VCFIndelAllele&);
50 friend bool operator!=(const VCFIndelAllele&, const VCFIndelAllele&);
51 friend bool operator<(const VCFIndelAllele&, const VCFIndelAllele&);
52 public:
53 bool insertion;
54 int length;
55 int position;
56 int readPosition;
57 string sequence;
58
59 bool homopolymer(void);
60
VCFIndelAllele(bool i,int l,int p,int rp,string s)61 VCFIndelAllele(bool i, int l, int p, int rp, string s)
62 : insertion(i), length(l), position(p), readPosition(rp), sequence(s)
63 { }
64 };
65
66 bool FBhomopolymer(string sequence);
67 ostream& operator<<(ostream& out, const VCFIndelAllele& indel);
68 bool operator==(const VCFIndelAllele& a, const VCFIndelAllele& b);
69 bool operator!=(const VCFIndelAllele& a, const VCFIndelAllele& b);
70 bool operator<(const VCFIndelAllele& a, const VCFIndelAllele& b);
71
homopolymer(void)72 bool VCFIndelAllele::homopolymer(void) {
73 string::iterator s = sequence.begin();
74 char c = *s++;
75 while (s != sequence.end()) {
76 if (c != *s++) return false;
77 }
78 return true;
79 }
80
FBhomopolymer(string sequence)81 bool FBhomopolymer(string sequence) {
82 string::iterator s = sequence.begin();
83 char c = *s++;
84 while (s != sequence.end()) {
85 if (c != *s++) return false;
86 }
87 return true;
88 }
89
operator <<(ostream & out,const VCFIndelAllele & indel)90 ostream& operator<<(ostream& out, const VCFIndelAllele& indel) {
91 string t = indel.insertion ? "i" : "d";
92 out << t << ":" << indel.position << ":" << indel.readPosition << ":" << indel.sequence;
93 return out;
94 }
95
operator ==(const VCFIndelAllele & a,const VCFIndelAllele & b)96 bool operator==(const VCFIndelAllele& a, const VCFIndelAllele& b) {
97 return (a.insertion == b.insertion
98 && a.length == b.length
99 && a.position == b.position
100 && a.sequence == b.sequence);
101 }
102
operator !=(const VCFIndelAllele & a,const VCFIndelAllele & b)103 bool operator!=(const VCFIndelAllele& a, const VCFIndelAllele& b) {
104 return !(a==b);
105 }
106
operator <(const VCFIndelAllele & a,const VCFIndelAllele & b)107 bool operator<(const VCFIndelAllele& a, const VCFIndelAllele& b) {
108 ostringstream as, bs;
109 as << a;
110 bs << b;
111 return as.str() < bs.str();
112 }
113
114
115 class AltAlignment {
116 public:
117 unsigned int pos;
118 string seq;
119 vector<pair<int, string> > cigar;
AltAlignment(unsigned int & p,string & s,string & c)120 AltAlignment(unsigned int& p,
121 string& s,
122 string& c) {
123 pos = p;
124 seq = s;
125 cigar = splitCigar(c);
126 }
127 };
128
entropy(const string & st)129 double entropy(const string& st) {
130 vector<char> stvec(st.begin(), st.end());
131 set<char> alphabet(stvec.begin(), stvec.end());
132 vector<double> freqs;
133 for (set<char>::iterator c = alphabet.begin(); c != alphabet.end(); ++c) {
134 int ctr = 0;
135 for (vector<char>::iterator s = stvec.begin(); s != stvec.end(); ++s) {
136 if (*s == *c) {
137 ++ctr;
138 }
139 }
140 freqs.push_back((double)ctr / (double)stvec.size());
141 }
142 double ent = 0;
143 double ln2 = log(2);
144 for (vector<double>::iterator f = freqs.begin(); f != freqs.end(); ++f) {
145 ent += *f * log(*f)/ln2;
146 }
147 ent = -ent;
148 return ent;
149 }
150
getAlignment(Variant & var,FastaReference & reference,string & ref,vector<AltAlignment> & alignments,int window)151 void getAlignment(Variant& var, FastaReference& reference, string& ref, vector<AltAlignment>& alignments, int window) {
152
153 // default alignment params
154 float matchScore = 10.0f;
155 float mismatchScore = -9.0f;
156 float gapOpenPenalty = 25.0f;
157 float gapExtendPenalty = 3.33f;
158
159 // establish reference sequence
160 string pad = string(window/2, 'Z');
161 string leftFlank = reference.getSubSequence(var.sequenceName, var.zeroBasedPosition() - window/2, window/2);
162 string rightFlank = reference.getSubSequence(var.sequenceName, var.zeroBasedPosition() + var.ref.size(), window/2);
163 ref = pad + leftFlank + var.ref + rightFlank + pad;
164
165 // and iterate through the alternates, generating alignments
166 for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
167 string alt = pad + leftFlank + *a + rightFlank + pad;
168 CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
169 unsigned int referencePos;
170 string cigar;
171 sw.Align(referencePos, cigar, ref, alt);
172 alignments.push_back(AltAlignment(referencePos, alt, cigar));
173 }
174 }
175
176
177 bool stablyLeftAlign(string& alternateSequence, string referenceSequence, int maxiterations = 50, bool debug = false);
178 int countMismatches(string& alternateSequence, string referenceSequence);
179
leftAlign(string & alternateSequence,Cigar & cigar,string & referenceSequence,bool debug=false)180 bool leftAlign(string& alternateSequence, Cigar& cigar, string& referenceSequence, bool debug = false) {
181
182 int arsOffset = 0; // pointer to insertion point in aligned reference sequence
183 string alignedReferenceSequence = referenceSequence;
184 int aabOffset = 0;
185 string alignmentAlignedBases = alternateSequence;
186
187 // store information about the indels
188 vector<VCFIndelAllele> indels;
189
190 int rp = 0; // read position, 0-based relative to read
191 int sp = 0; // sequence position
192
193 string softBegin;
194 string softEnd;
195
196 stringstream cigar_before, cigar_after;
197 for (vector<pair<int, string> >::const_iterator c = cigar.begin();
198 c != cigar.end(); ++c) {
199 unsigned int l = c->first;
200 char t = c->second.at(0);
201
202 cigar_before << l << t;
203 if (t == 'M') { // match or mismatch
204 sp += l;
205 rp += l;
206 } else if (t == 'D') { // deletion
207 indels.push_back(VCFIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l)));
208 alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
209 aabOffset += l;
210 sp += l; // update reference sequence position
211 } else if (t == 'I') { // insertion
212 indels.push_back(VCFIndelAllele(true, l, sp, rp, alternateSequence.substr(rp, l)));
213 alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-'));
214 arsOffset += l;
215 rp += l;
216 } else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
217 // remove these bases from the refseq and read seq, but don't modify the alignment sequence
218 if (rp == 0) {
219 alignedReferenceSequence = string(l, '*') + alignedReferenceSequence;
220 softBegin = alignmentAlignedBases.substr(0, l);
221 } else {
222 alignedReferenceSequence = alignedReferenceSequence + string(l, '*');
223 softEnd = alignmentAlignedBases.substr(alignmentAlignedBases.size() - l, l);
224 }
225 rp += l;
226 } else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
227 } else if (t == 'N') { // skipped region in the reference not present in read, aka splice
228 sp += l;
229 }
230 }
231
232
233 int alignedLength = sp;
234
235 VCFLEFTALIGN_DEBUG("| " << cigar_before.str() << endl
236 << "| " << alignedReferenceSequence << endl
237 << "| " << alignmentAlignedBases << endl);
238
239 // if no indels, return the alignment
240 if (indels.empty()) { return false; }
241
242 // for each indel, from left to right
243 // while the indel sequence repeated to the left and we're not matched up with the left-previous indel
244 // move the indel left
245
246 vector<VCFIndelAllele>::iterator previous = indels.begin();
247 for (vector<VCFIndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {
248
249 // left shift by repeats
250 //
251 // from 1 base to the length of the indel, attempt to shift left
252 // if the move would cause no change in alignment optimality (no
253 // introduction of mismatches, and by definition no change in gap
254 // length), move to the new position.
255 // in practice this moves the indel left when we reach the size of
256 // the repeat unit.
257 //
258 int steppos, readsteppos;
259 VCFIndelAllele& indel = *id;
260 int i = 1;
261 while (i <= indel.length) {
262
263 int steppos = indel.position - i;
264 int readsteppos = indel.readPosition - i;
265
266 #ifdef VERBOSE_DEBUG
267 if (debug) {
268 if (steppos >= 0 && readsteppos >= 0) {
269 cerr << referenceSequence.substr(steppos, indel.length) << endl;
270 cerr << alternateSequence.substr(readsteppos, indel.length) << endl;
271 cerr << indel.sequence << endl;
272 }
273 }
274 #endif
275 while (steppos >= 0 && readsteppos >= 0
276 && indel.sequence == referenceSequence.substr(steppos, indel.length)
277 && indel.sequence == alternateSequence.substr(readsteppos, indel.length)
278 && (id == indels.begin()
279 || (previous->insertion && steppos >= previous->position)
280 || (!previous->insertion && steppos >= previous->position + previous->length))) {
281 VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl);
282 indel.position -= i;
283 indel.readPosition -= i;
284 steppos = indel.position - i;
285 readsteppos = indel.readPosition - i;
286 }
287 do {
288 ++i;
289 } while (i <= indel.length && indel.length % i != 0);
290 }
291
292 // left shift indels with exchangeable flanking sequence
293 //
294 // for example:
295 //
296 // GTTACGTT GTTACGTT
297 // GT-----T ----> G-----TT
298 //
299 // GTGTGACGTGT GTGTGACGTGT
300 // GTGTG-----T ----> GTG-----TGT
301 //
302 // GTGTG-----T GTG-----TGT
303 // GTGTGACGTGT ----> GTGTGACGTGT
304 //
305 //
306 steppos = indel.position - 1;
307 readsteppos = indel.readPosition - 1;
308 while (steppos >= 0 && readsteppos >= 0
309 && alternateSequence.at(readsteppos) == referenceSequence.at(steppos)
310 && alternateSequence.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
311 && (id == indels.begin()
312 || (previous->insertion && indel.position - 1 >= previous->position)
313 || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
314 VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl);
315 indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
316 indel.position -= 1;
317 indel.readPosition -= 1;
318 steppos = indel.position - 1;
319 readsteppos = indel.readPosition - 1;
320 }
321 // tracks previous indel, so we don't run into it with the next shift
322 previous = id;
323 }
324
325 // bring together floating indels
326 // from left to right
327 // check if we could merge with the next indel
328 // if so, adjust so that we will merge in the next step
329 if (indels.size() > 1) {
330 previous = indels.begin();
331 for (vector<VCFIndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
332 VCFIndelAllele& indel = *id;
333 // parsimony: could we shift right and merge with the previous indel?
334 // if so, do it
335 int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
336 int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
337 if (previous->insertion == indel.insertion
338 && ((previous->insertion
339 && (previous->position < indel.position
340 && previous->readPosition + previous->readPosition < indel.readPosition))
341 ||
342 (!previous->insertion
343 && (previous->position + previous->length < indel.position)
344 && (previous->readPosition < indel.readPosition)
345 ))) {
346 if (previous->homopolymer()) {
347 string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
348 string readseq = alternateSequence.substr(prev_end_read, indel.position - prev_end_ref);
349 VCFLEFTALIGN_DEBUG("seq: " << seq << endl << "readseq: " << readseq << endl);
350 if (previous->sequence.at(0) == seq.at(0)
351 && FBhomopolymer(seq)
352 && FBhomopolymer(readseq)) {
353 VCFLEFTALIGN_DEBUG("moving " << *previous << " right to "
354 << (indel.insertion ? indel.position : indel.position - previous->length) << endl);
355 previous->position = indel.insertion ? indel.position : indel.position - previous->length;
356 }
357 }
358 else {
359 int pos = previous->position;
360 while (pos < (int) referenceSequence.length() &&
361 ((previous->insertion && pos + previous->length <= indel.position)
362 ||
363 (!previous->insertion && pos + previous->length < indel.position))
364 && previous->sequence
365 == referenceSequence.substr(pos + previous->length, previous->length)) {
366 pos += previous->length;
367 }
368 if (pos < previous->position &&
369 ((previous->insertion && pos + previous->length == indel.position)
370 ||
371 (!previous->insertion && pos == indel.position - previous->length))
372 ) {
373 VCFLEFTALIGN_DEBUG("right-merging tandem repeat: moving " << *previous << " right to " << pos << endl);
374 previous->position = pos;
375 }
376 }
377 }
378 previous = id;
379 }
380 }
381
382 // for each indel
383 // if ( we're matched up to the previous insertion (or deletion)
384 // and it's also an insertion or deletion )
385 // merge the indels
386 //
387 // and simultaneously reconstruct the cigar
388
389 Cigar newCigar;
390
391 if (!softBegin.empty()) {
392 newCigar.push_back(make_pair(softBegin.size(), "S"));
393 }
394
395 vector<VCFIndelAllele>::iterator id = indels.begin();
396 VCFIndelAllele last = *id++;
397 if (last.position > 0) {
398 newCigar.push_back(make_pair(last.position, "M"));
399 newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
400 } else {
401 newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
402 }
403 int lastend = last.insertion ? last.position : (last.position + last.length);
404 VCFLEFTALIGN_DEBUG(last << ",");
405
406 for (; id != indels.end(); ++id) {
407 VCFIndelAllele& indel = *id;
408 VCFLEFTALIGN_DEBUG(indel << ",");
409 if (indel.position < lastend) {
410 cerr << "impossibility?: indel realigned left of another indel" << endl
411 << referenceSequence << endl << alternateSequence << endl;
412 exit(1);
413 } else if (indel.position == lastend && indel.insertion == last.insertion) {
414 pair<int, string>& op = newCigar.back();
415 op.first += indel.length;
416 } else if (indel.position >= lastend) { // also catches differential indels, but with the same position
417 newCigar.push_back(make_pair(indel.position - lastend, "M"));
418 newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
419 }
420 last = *id;
421 lastend = last.insertion ? last.position : (last.position + last.length);
422 }
423
424 if (lastend < alignedLength) {
425 newCigar.push_back(make_pair(alignedLength - lastend, "M"));
426 }
427
428 if (!softEnd.empty()) {
429 newCigar.push_back(make_pair(softEnd.size(), "S"));
430 }
431
432 VCFLEFTALIGN_DEBUG(endl);
433
434 cigar = newCigar;
435
436 for (vector<pair<int, string> >::const_iterator c = cigar.begin();
437 c != cigar.end(); ++c) {
438 unsigned int l = c->first;
439 char t = c->second.at(0);
440 cigar_after << l << t;
441 }
442
443 //cerr << cigar_before.str() << " changes to " << cigar_after.str() << endl;
444 VCFLEFTALIGN_DEBUG(cigar_after.str() << endl);
445
446 // check if we're realigned
447 if (cigar_after.str() == cigar_before.str()) {
448 return false;
449 } else {
450 return true;
451 }
452
453 }
454
455 // Iteratively left-aligns the indels in the alignment until we have a stable
456 // realignment. Returns true on realignment success or non-realignment.
457 // Returns false if we exceed the maximum number of realignment iterations.
458 //
stablyLeftAlign(string & alternateSequence,string referenceSequence,Cigar & cigar,int maxiterations,bool debug)459 bool stablyLeftAlign(string& alternateSequence, string referenceSequence, Cigar& cigar, int maxiterations, bool debug) {
460
461 if (!leftAlign(alternateSequence, cigar, referenceSequence, debug)) {
462
463 return true;
464
465 } else {
466
467 bool result = true;
468 while ((result = leftAlign(alternateSequence, cigar, referenceSequence, debug)) && --maxiterations > 0) {
469 }
470
471 if (maxiterations <= 0) {
472 return false;
473 } else {
474 return true;
475 }
476
477 }
478
479 }
480
481
printSummary(char ** argv)482 void printSummary(char** argv) {
483 cerr << R"(
484
485 Left-align indels and complex variants in the input using a pairwise
486 ref/alt alignment followed by a heuristic, iterative left realignment
487 process that shifts indel representations to their absolute leftmost
488 (5') extent.
489
490 This is the same procedure used in the internal left alignment in
491 freebayes, and can be used when preparing VCF files for input to
492 freebayes to decrease positional representation differences between
493 the input alleles and left-realigned alignments.
494
495 usage: vcfleftalign [options] [file]
496
497 options:
498
499 -r, --reference FILE Use this reference as a basis for realignment.
500 -w, --window N Use a window of this many bp when left aligning (150).
501
502 Left-aligns variants in the specified input file or stdin. Window
503 size is determined dynamically according to the entropy of the regions
504 flanking the indel. These must have entropy > 1 bit/bp, or be shorter
505 than ~5kb.
506
507 )";
508 cerr << endl << "Type: transformation" << endl << endl;
509 exit(0);
510 }
511
main(int argc,char ** argv)512 int main(int argc, char** argv) {
513
514 int window = 150;
515 VariantCallFile variantFile;
516 string fastaFileName;
517
518 int c;
519 while (true) {
520 static struct option long_options[] =
521 {
522 /* These options set a flag. */
523 //{"verbose", no_argument, &verbose_flag, 1},
524 {"help", no_argument, 0, 'h'},
525 {"reference", required_argument, 0, 'r'},
526 {"window", required_argument, 0, 'w'},
527 {0, 0, 0, 0}
528 };
529 /* getopt_long stores the option index here. */
530 int option_index = 0;
531
532 c = getopt_long (argc, argv, "hw:r:",
533 long_options, &option_index);
534
535 if (c == -1)
536 break;
537
538 switch (c) {
539
540 case 'r':
541 fastaFileName = optarg;
542 break;
543
544 case 'w':
545 window = atoi(optarg);
546 break;
547
548 case '?':
549 printSummary(argv);
550 exit(1);
551 break;
552
553 case 'h':
554 printSummary(argv);
555 break;
556
557 default:
558 abort ();
559 }
560 }
561
562 if (optind < argc) {
563 string filename = argv[optind];
564 variantFile.open(filename);
565 } else {
566 variantFile.open(std::cin);
567 }
568
569 if (!variantFile.is_open()) {
570 cerr << "could not open VCF file" << endl;
571 exit(1);
572 }
573
574 FastaReference fastaReference;
575 if (fastaFileName.empty()) {
576 cerr << "a reference is required" << endl;
577 exit(1);
578 } else {
579 fastaReference.open(fastaFileName);
580 }
581
582 /*
583 variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">");
584 variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">");
585 if (!parseFlag.empty()) {
586 variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">");
587 }
588 */
589 cout << variantFile.header << endl;
590
591 Variant var(variantFile);
592 while (variantFile.getNextVariant(var)) {
593
594 // if there is no indel, there is nothing to realign
595 bool hasIndel = false;
596 for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
597 if (a->size() != var.ref.size()) {
598 hasIndel = true;
599 break;
600 }
601 }
602 if (!hasIndel) {
603 cout << var << endl;
604 continue;
605 }
606
607 vector<AltAlignment> alignments;
608 string ref;
609
610 // determine window size to prevent mismapping with SW algorithm
611 int currentWindow = window;
612 int scale = 2;
613 if (var.ref.size()*scale > currentWindow) currentWindow = var.ref.size()*scale;
614 for (vector<string>::iterator a = var.alleles.begin(); a != var.alleles.end(); ++a) {
615 if (a->size()*scale > currentWindow) {
616 currentWindow = a->size()*scale;
617 }
618 }
619
620 // while the entropy of either flank is < some target entropy (~1 is fine), increase the flank sizes
621 while (currentWindow < 2000) { // limit to one step > than this
622 string refTarget = fastaReference.getSubSequence(var.sequenceName, var.position - 1 - currentWindow/2, currentWindow);
623 if (entropy(refTarget.substr(0, refTarget.size()/2)) < 1 ||
624 entropy(refTarget.substr(refTarget.size()/2)) < 1) {
625 currentWindow *= scale;
626 } else {
627 break;
628 }
629 }
630
631 // do the alignments
632 getAlignment(var, fastaReference, ref, alignments, currentWindow);
633
634 // stably left align the alignments
635 for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a) {
636 Cigar cigarBefore = a->cigar;
637 //cerr << a->seq << endl;
638 //cerr << "before : " << a->pos << " " << joinCigar(a->cigar) << endl;
639 long int prev = a->pos;
640 stablyLeftAlign(a->seq, ref, a->cigar, 20, false);
641 //cerr << "after : " << a->pos << " " << joinCigar(a->cigar) << endl;
642 if (a->pos != prev) cerr << "modified alignment @ " << var << endl;
643 }
644 //cout << var << endl;
645
646 // transform the mappings
647 // chop off leading matching bases
648 // find the range of bp in the alleles
649 // make the new ref allele
650 // make the new alt alleles
651 // emit the var
652
653 long int newPosition = var.position+currentWindow/2;
654 long int newEndPosition = var.position-currentWindow/2;
655 // check for no-indel case
656 int newLength = var.ref.size();
657 bool giveUp = false;
658 for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end() && !giveUp; ++a) {
659 // get the first mismatching position
660 Cigar::iterator c = a->cigar.begin();
661
662 int rp = 0;
663 int sp = 0;
664 bool hitMismatch = false;
665
666 int matchingBpAtStart = 0;
667 int matchingBpAtEnd = 0;
668 // will be set to true if the first reference position match is broken by a SNP, not an indel
669 bool leadingSNP = false;
670
671 while (c != a->cigar.end()) {
672 char op = c->second[0];
673 if (c == a->cigar.begin()) {
674 if (op != 'M') {
675 cerr << "alignment does not start on matched sequence" << endl;
676 cerr << var << endl;
677 exit(1);
678 }
679 int i = 0;
680 for ( ; i < c->first; ++i) {
681 if (ref[i] != a->seq[i]) {
682 leadingSNP = true;
683 break;
684 }
685 }
686 matchingBpAtStart = i;
687 }
688 if (!leadingSNP && c == (a->cigar.begin()+1)) {
689 // if the first thing we run into is an indel, step back, per VCF spec
690 if (op == 'D' || op == 'I') {
691 --matchingBpAtStart;
692 }
693 }
694 if (c == (a->cigar.end()-1)) {
695 if (op != 'M') {
696 // soft clip at end
697 // it'll be hard to interpret this
698 // the alignments sometimes generate this
699 // best thing to do is to move on
700 //cerr << "alignment does not end on matched sequence" << endl;
701 //cout << var << endl;
702 //exit(1);
703 giveUp = true;
704 break;
705 }
706 int i = 0;
707 for ( ; i < c->first; ++i) {
708 if (ref[ref.size()-1-i] != a->seq[a->seq.size()-1-i]) {
709 break;
710 }
711 }
712 matchingBpAtEnd = i;
713 }
714 ++c;
715 }
716
717 int altMismatchLength = a->seq.size() - matchingBpAtEnd - matchingBpAtStart;
718 int refMismatchLength = (var.ref.size() + currentWindow) - matchingBpAtEnd - matchingBpAtStart;
719 //cerr << "alt mismatch length " << altMismatchLength << endl
720 // << "ref mismatch length " << refMismatchLength << endl;
721 long int newStart = var.position - currentWindow/2 + matchingBpAtStart;
722 long int newEnd = newStart + refMismatchLength;
723 //cerr << "ref should run from " << newStart << " to " << newStart + refMismatchLength << endl;
724 newPosition = min(newStart, newPosition);
725 newEndPosition = max(newEnd, newEndPosition);
726 //cerr << newPosition << " " << newEndPosition << endl;
727 //if (newRefSize < refMismatchLength) newRefSize = refMismatchLength;
728 }
729
730 // the alignment failed for some reason, continue
731 if (giveUp) {
732 cout << var << endl;
733 continue;
734 }
735
736 //cerr << "new ref start " << newPosition << " and end " << newEndPosition << " was " << var.position << "," << var.position + var.ref.size() << endl;
737 int newRefSize = newEndPosition - newPosition;
738 string newRef = fastaReference.getSubSequence(var.sequenceName, newPosition-1, newRefSize);
739 // get the number of bp to strip from the alts
740 int stripFromStart = currentWindow/2 - (var.position - newPosition);
741 int stripFromEnd = (currentWindow + newRefSize) - (stripFromStart + newRefSize) + (var.ref.size() - newRefSize);
742
743 //cerr << "strip from start " << stripFromStart << endl;
744 //cerr << "strip from end " << stripFromEnd << endl;
745
746 vector<string> newAlt;
747 vector<string>::iterator l = var.alt.begin();
748 bool failedAlt = false;
749 for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end();
750 ++a, ++l) {
751 int diff = newRef.size() - l->size();
752 string alt = a->seq.substr(stripFromStart, a->seq.size() - (stripFromEnd + stripFromStart));
753 newAlt.push_back(alt);
754 if (alt.empty()) failedAlt = true;
755 }
756
757 // check the before/after haplotypes
758 bool brokenRealignment = false;
759 if (!newRef.empty() && !failedAlt) {
760 int slop = 50; // 50 extra bp!
761 int haplotypeStart = min(var.position, newPosition) - slop;
762 int haplotypeEnd = max(var.position + var.ref.size(), newPosition + newRef.size()) + slop;
763 string referenceHaplotype = fastaReference.getSubSequence(var.sequenceName, haplotypeStart - 1,
764 haplotypeEnd - haplotypeStart);
765 vector<string>::iterator o = var.alt.begin();
766 vector<string>::iterator n = newAlt.begin();
767 for ( ; o != var.alt.end() ; ++o, ++n) {
768 // map the haplotypes
769 string oldHaplotype = referenceHaplotype;
770 string newHaplotype = referenceHaplotype;
771 oldHaplotype.replace(var.position - haplotypeStart, var.ref.size(), *o);
772 newHaplotype.replace(newPosition - haplotypeStart, newRef.size(), *n);
773 if (oldHaplotype != newHaplotype) {
774 cerr << "broken left alignment!" << endl
775 << "old " << oldHaplotype << endl
776 << "new " << newHaplotype << endl;
777 cerr << "was: " << var << endl;
778 brokenRealignment = true;
779 }
780 }
781 }
782
783 // *if* everything is OK, update the variant
784 if (!brokenRealignment && !newRef.empty() && !failedAlt) {
785 var.ref = newRef;
786 var.alt = newAlt;
787 var.position = newPosition;
788 }
789
790 cout << var << endl;
791
792 // for each parsedalternate, get the position
793 // build a new vcf record for that position
794 // unless we are already at the position !
795 // take everything which is unique to that allele (records) and append it to the new record
796 // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes
797 // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it
798
799
800
801 //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
802 }
803
804 return 0;
805
806 }
807