1 #include "LeftAlign.h"
2
3 //bool debug;
4
5 // Attempts to left-realign all the indels represented by the alignment cigar.
6 //
7 // This is done by shifting all indels as far left as they can go without
8 // mismatch, then merging neighboring indels of the same class. leftAlign
9 // updates the alignment cigar with changes, and returns true if realignment
10 // changed the alignment cigar.
11 //
12 // To left-align, we move multi-base indels left by their own length as long as
13 // the preceding bases match the inserted or deleted sequence. After this
14 // step, we handle multi-base homopolymer indels by shifting them one base to
15 // the left until they mismatch the reference.
16 //
17 // To merge neighboring indels, we iterate through the set of left-stabilized
18 // indels. For each indel we add a new cigar element to the new cigar. If a
19 // deletion follows a deletion, or an insertion occurs at the same place as
20 // another insertion, we merge the events by extending the previous cigar
21 // element.
22 //
23 // In practice, we must call this function until the alignment is stabilized.
24 //
leftAlign(BAMALIGN & alignment,string & referenceSequence,bool debug)25 bool leftAlign(BAMALIGN& alignment, string& referenceSequence, bool debug) {
26 string alignmentAlignedBases = alignment.QUERYBASES;
27 const string alignmentSequence = alignment.QUERYBASES;
28
29 int arsOffset = 0; // pointer to insertion point in aligned reference sequence
30 string alignedReferenceSequence = referenceSequence;
31 int aabOffset = 0;
32
33 // store information about the indels
34 vector<FBIndelAllele> indels;
35
36 int rp = 0; // read position, 0-based relative to read
37 int sp = 0; // sequence position
38
39 string softBegin;
40 string softEnd;
41
42 stringstream cigar_before, cigar_after;
43 CIGAR cigar = alignment.GETCIGAR;
44 for (CIGAR::const_iterator c = cigar.begin();
45 c != cigar.end(); ++c) {
46 unsigned int l = c->CIGLEN;
47 char t = c->CIGTYPE;
48 cigar_before << l << t;
49 if (t == 'M' || t == 'X' || t == '=') { // match or mismatch
50 sp += l;
51 rp += l;
52 } else if (t == 'D') { // deletion
53 indels.push_back(FBIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l), false));
54 alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
55 aabOffset += l;
56 sp += l; // update reference sequence position
57 } else if (t == 'N') {
58 indels.push_back(FBIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l), true));
59 alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
60 aabOffset += l;
61 sp += l; // update reference sequence position
62 } else if (t == 'I') { // insertion
63 indels.push_back(FBIndelAllele(true, l, sp, rp, alignmentSequence.substr(rp, l), false));
64 alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-'));
65 arsOffset += l;
66 rp += l;
67 } else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
68 // remove these bases from the refseq and read seq, but don't modify the alignment sequence
69 if (rp == 0) {
70 alignedReferenceSequence = string(l, '*') + alignedReferenceSequence;
71 softBegin = alignmentAlignedBases.substr(0, l);
72 } else {
73 alignedReferenceSequence = alignedReferenceSequence + string(l, '*');
74 softEnd = alignmentAlignedBases.substr(alignmentAlignedBases.size() - l, l);
75 }
76 rp += l;
77 } else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
78 //} else if (t == 'N') { // skipped region in the reference not present in read, aka splice
79 //sp += l;
80 }
81 }
82
83
84 int alignedLength = sp;
85
86 LEFTALIGN_DEBUG("| " << cigar_before.str() << endl
87 << "| " << alignedReferenceSequence << endl
88 << "| " << alignmentAlignedBases << endl);
89
90 // if no indels, return the alignment
91 if (indels.empty()) { return false; }
92
93 // for each indel, from left to right
94 // while the indel sequence repeated to the left and we're not matched up with the left-previous indel
95 // move the indel left
96
97 vector<FBIndelAllele>::iterator previous = indels.begin();
98 for (vector<FBIndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {
99
100 // left shift by repeats
101 //
102 // from 1 base to the length of the indel, attempt to shift left
103 // if the move would cause no change in alignment optimality (no
104 // introduction of mismatches, and by definition no change in gap
105 // length), move to the new position.
106 // in practice this moves the indel left when we reach the size of
107 // the repeat unit.
108 //
109 int steppos, readsteppos;
110 FBIndelAllele& indel = *id;
111 int i = 1;
112 while (i <= indel.length) {
113
114 int steppos = indel.position - i;
115 int readsteppos = indel.readPosition - i;
116
117 #ifdef VERBOSE_DEBUG
118 if (debug) {
119 if (steppos >= 0 && readsteppos >= 0) {
120 cerr << referenceSequence.substr(steppos, indel.length) << endl;
121 cerr << alignmentSequence.substr(readsteppos, indel.length) << endl;
122 cerr << indel.sequence << endl;
123 }
124 }
125 #endif
126 while (steppos >= 0 && readsteppos >= 0
127 && !indel.splice
128 && indel.sequence == referenceSequence.substr(steppos, indel.length)
129 && indel.sequence == alignmentSequence.substr(readsteppos, indel.length)
130 //&& indel.sequence == alignment.QueryBases.substr(readsteppos, indel.length)
131 && (id == indels.begin()
132 || (previous->insertion && steppos >= previous->position)
133 || (!previous->insertion && steppos >= previous->position + previous->length))) {
134 LEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl);
135 indel.position -= i;
136 indel.readPosition -= i;
137 steppos = indel.position - i;
138 readsteppos = indel.readPosition - i;
139 }
140 do {
141 ++i;
142 } while (i <= indel.length && indel.length % i != 0);
143 }
144
145 // left shift indels with exchangeable flanking sequence
146 //
147 // for example:
148 //
149 // GTTACGTT GTTACGTT
150 // GT-----T ----> G-----TT
151 //
152 // GTGTGACGTGT GTGTGACGTGT
153 // GTGTG-----T ----> GTG-----TGT
154 //
155 // GTGTG-----T GTG-----TGT
156 // GTGTGACGTGT ----> GTGTGACGTGT
157 //
158 //
159 steppos = indel.position - 1;
160 readsteppos = indel.readPosition - 1;
161 while (steppos >= 0 && readsteppos >= 0
162 //&& alignment.QueryBases.at(readsteppos) == referenceSequence.at(steppos)
163 //&& alignment.QueryBases.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
164 && alignmentSequence.at(readsteppos) == referenceSequence.at(steppos)
165 && alignmentSequence.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
166 && (id == indels.begin()
167 || (previous->insertion && indel.position - 1 >= previous->position)
168 || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
169 LEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl);
170 indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
171 indel.position -= 1;
172 indel.readPosition -= 1;
173 steppos = indel.position - 1;
174 readsteppos = indel.readPosition - 1;
175 }
176 // tracks previous indel, so we don't run into it with the next shift
177 previous = id;
178 }
179
180 // bring together floating indels
181 // from left to right
182 // check if we could merge with the next indel
183 // if so, adjust so that we will merge in the next step
184 if (indels.size() > 1) {
185 previous = indels.begin();
186 for (vector<FBIndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
187 FBIndelAllele& indel = *id;
188 // parsimony: could we shift right and merge with the previous indel?
189 // if so, do it
190 int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
191 int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
192 if (!previous->splice && !indel.splice &&
193 previous->insertion == indel.insertion
194 && ((previous->insertion
195 && (previous->position < indel.position
196 && previous->readPosition + previous->readPosition < indel.readPosition))
197 ||
198 (!previous->insertion
199 && (previous->position + previous->length < indel.position)
200 && (previous->readPosition < indel.readPosition)
201 ))) {
202 if (previous->homopolymer()) {
203 string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
204 //string readseq = alignment.QueryBases.substr(prev_end_read, indel.position - prev_end_ref);
205 string readseq = alignmentSequence.substr(prev_end_read, indel.position - prev_end_ref);
206 LEFTALIGN_DEBUG("seq: " << seq << endl << "readseq: " << readseq << endl);
207 if (previous->sequence.at(0) == seq.at(0)
208 && FBhomopolymer(seq)
209 && FBhomopolymer(readseq)) {
210 LEFTALIGN_DEBUG("moving " << *previous << " right to "
211 << (indel.insertion ? indel.position : indel.position - previous->length) << endl);
212 previous->position = indel.insertion ? indel.position : indel.position - previous->length;
213 }
214 }
215 else {
216 int pos = previous->position;
217 while (pos < (int) referenceSequence.length() &&
218 ((previous->insertion && pos + previous->length <= indel.position)
219 ||
220 (!previous->insertion && pos + previous->length < indel.position))
221 && previous->sequence
222 == referenceSequence.substr(pos + previous->length, previous->length)) {
223 pos += previous->length;
224 }
225 if (pos < previous->position &&
226 ((previous->insertion && pos + previous->length == indel.position)
227 ||
228 (!previous->insertion && pos == indel.position - previous->length))
229 ) {
230 LEFTALIGN_DEBUG("right-merging tandem repeat: moving " << *previous << " right to " << pos << endl);
231 previous->position = pos;
232 }
233 }
234 }
235 previous = id;
236 }
237 }
238
239 // for each indel
240 // if ( we're matched up to the previous insertion (or deletion)
241 // and it's also an insertion or deletion )
242 // merge the indels
243 //
244 // and simultaneously reconstruct the cigar
245
246 CIGAR newCigar;
247
248 if (!softBegin.empty()) {
249 newCigar.ADDCIGAR(CIGOP('S', softBegin.size()));
250 }
251
252 vector<FBIndelAllele>::iterator id = indels.begin();
253 FBIndelAllele last = *id++;
254 if (last.position > 0) {
255 newCigar.ADDCIGAR(CIGOP('M', last.position));
256 }
257 if (last.insertion) {
258 newCigar.ADDCIGAR(CIGOP('I', last.length));
259 } else if (last.splice) {
260 newCigar.ADDCIGAR(CIGOP('N', last.length));
261 } else {
262 newCigar.ADDCIGAR(CIGOP('D', last.length));
263 }
264 int lastend = last.insertion ? last.position : (last.position + last.length);
265 LEFTALIGN_DEBUG(last << ",");
266
267 for (; id != indels.end(); ++id) {
268 FBIndelAllele& indel = *id;
269 LEFTALIGN_DEBUG(indel << ",");
270 if (indel.position < lastend) {
271 cerr << "impossibility?: indel realigned left of another indel" << endl << alignment.QNAME
272 << " " << alignment.POSITION << endl << alignment.QUERYBASES << endl;
273 exit(1);
274
275 } else if (indel.position == lastend && indel.insertion == last.insertion) {
276 CIGOP& op = newCigar.back();
277 #ifdef HAVE_BAMTOOLS
278 op.Length += indel.length;
279 #else
280 op = SeqLib::CigarField(op.Type(), op.Length() + indel.length);
281 #endif
282 } else if (indel.position >= lastend) { // also catches differential indels, but with the same position
283 newCigar.ADDCIGAR(CIGOP('M', indel.position - lastend));
284 if (indel.insertion) {
285 newCigar.ADDCIGAR(CIGOP('I', indel.length));
286 } else if (indel.splice) {
287 newCigar.ADDCIGAR(CIGOP('N', indel.length));
288 } else { // deletion
289 newCigar.ADDCIGAR(CIGOP('D', indel.length));
290 }
291
292 }
293 last = *id;
294 lastend = last.insertion ? last.position : (last.position + last.length);
295 }
296
297 if (lastend < alignedLength) {
298 newCigar.ADDCIGAR(CIGOP('M', alignedLength - lastend));
299 }
300
301 if (!softEnd.empty()) {
302 newCigar.ADDCIGAR(CIGOP('S', softEnd.size()));
303 }
304
305 LEFTALIGN_DEBUG(endl);
306
307 #ifdef VERBOSE_DEBUG
308 if (debug) {
309 CIGAR cigar = alignment.GETCIGAR;
310 for (CIGAR::const_iterator c = cigar.begin();
311 c != cigar.end(); ++c) {
312 unsigned int l = c->CIGLEN;
313 char t = c->CIGTYPE;
314 cerr << l << t;
315 }
316 cerr << endl;
317 }
318 #endif
319
320 #ifdef HAVE_BAMTOOLS
321 alignment.CigarData = newCigar;
322 #else
323 alignment.SetCigar(newCigar);
324 #endif
325
326 cigar = alignment.GETCIGAR;
327 for (CIGAR::const_iterator c = cigar.begin();
328 c != cigar.end(); ++c) {
329 unsigned int l = c->CIGLEN;
330 char t = c->CIGTYPE;
331 cigar_after << l << t;
332 }
333 LEFTALIGN_DEBUG(cigar_after.str() << endl);
334
335 // check if we're realigned
336 if (cigar_after.str() == cigar_before.str()) {
337 return false;
338 } else {
339 return true;
340 }
341
342 }
343
countMismatches(BAMALIGN & alignment,string referenceSequence)344 int countMismatches(BAMALIGN& alignment, string referenceSequence) {
345 const string alignmentSequence = alignment.QUERYBASES;
346
347 int mismatches = 0;
348 int sp = 0;
349 int rp = 0;
350 CIGAR cigar = alignment.GETCIGAR;
351 for (CIGAR::const_iterator c = cigar.begin();
352 c != cigar.end(); ++c) {
353 unsigned int l = c->CIGLEN;
354 char t = c->CIGTYPE;
355
356 if (t == 'M' || t == 'X' || t == '=') { // match or mismatch
357 for (int i = 0; i < l; ++i) {
358 //if (alignment.QueryBases.at(rp) != referenceSequence.at(sp))
359 if (alignmentSequence.at(rp) != referenceSequence.at(sp))
360 ++mismatches;
361 ++sp;
362 ++rp;
363 }
364 } else if (t == 'D') { // deletion
365 sp += l; // update reference sequence position
366 } else if (t == 'I') { // insertion
367 rp += l; // update read position
368 } else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
369 rp += l;
370 } else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
371 } else if (t == 'N') { // skipped region in the reference not present in read, aka splice
372 sp += l;
373 }
374 }
375
376 return mismatches;
377
378 }
379
380 // Iteratively left-aligns the indels in the alignment until we have a stable
381 // realignment. Returns true on realignment success or non-realignment.
382 // Returns false if we exceed the maximum number of realignment iterations.
383 //
stablyLeftAlign(BAMALIGN & alignment,string referenceSequence,int maxiterations,bool debug)384 bool stablyLeftAlign(BAMALIGN& alignment, string referenceSequence, int maxiterations, bool debug) {
385
386 #ifdef VERBOSE_DEBUG
387 int mismatchesBefore = countMismatches(alignment, referenceSequence);
388 #endif
389
390 if (!leftAlign(alignment, referenceSequence, debug)) {
391 return true;
392
393 } else {
394
395 while (leftAlign(alignment, referenceSequence, debug) && --maxiterations > 0) {
396 LEFTALIGN_DEBUG("realigning ..." << endl);
397 }
398
399 #ifdef VERBOSE_DEBUG
400 int mismatchesAfter = countMismatches(alignment, referenceSequence);
401
402 if (mismatchesBefore != mismatchesAfter) {
403 cerr << alignment.QNAME << endl;
404 cerr << "ERROR: found " << mismatchesBefore << " mismatches before, but " << mismatchesAfter << " after left realignment!" << endl;
405 exit(1);
406 }
407 #endif
408
409 if (maxiterations <= 0) {
410 return false;
411 } else {
412 return true;
413 }
414
415 }
416
417 }
418