1 #include "LeftAlign.h"
2 
3 //bool debug;
4 
5 // Attempts to left-realign all the indels represented by the alignment cigar.
6 //
7 // This is done by shifting all indels as far left as they can go without
8 // mismatch, then merging neighboring indels of the same class.  leftAlign
9 // updates the alignment cigar with changes, and returns true if realignment
10 // changed the alignment cigar.
11 //
12 // To left-align, we move multi-base indels left by their own length as long as
13 // the preceding bases match the inserted or deleted sequence.  After this
14 // step, we handle multi-base homopolymer indels by shifting them one base to
15 // the left until they mismatch the reference.
16 //
17 // To merge neighboring indels, we iterate through the set of left-stabilized
18 // indels.  For each indel we add a new cigar element to the new cigar.  If a
19 // deletion follows a deletion, or an insertion occurs at the same place as
20 // another insertion, we merge the events by extending the previous cigar
21 // element.
22 //
23 // In practice, we must call this function until the alignment is stabilized.
24 //
leftAlign(BAMALIGN & alignment,string & referenceSequence,bool debug)25 bool leftAlign(BAMALIGN& alignment, string& referenceSequence, bool debug) {
26     string alignmentAlignedBases = alignment.QUERYBASES;
27   const string alignmentSequence = alignment.QUERYBASES;
28 
29     int arsOffset = 0; // pointer to insertion point in aligned reference sequence
30     string alignedReferenceSequence = referenceSequence;
31     int aabOffset = 0;
32 
33     // store information about the indels
34     vector<FBIndelAllele> indels;
35 
36     int rp = 0;  // read position, 0-based relative to read
37     int sp = 0;  // sequence position
38 
39     string softBegin;
40     string softEnd;
41 
42     stringstream cigar_before, cigar_after;
43     CIGAR cigar = alignment.GETCIGAR;
44     for (CIGAR::const_iterator c = cigar.begin();
45         c != cigar.end(); ++c) {
46         unsigned int l = c->CIGLEN;
47         char t = c->CIGTYPE;
48         cigar_before << l << t;
49         if (t == 'M' || t == 'X' || t == '=') { // match or mismatch
50             sp += l;
51             rp += l;
52         } else if (t == 'D') { // deletion
53             indels.push_back(FBIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l), false));
54             alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
55             aabOffset += l;
56             sp += l;  // update reference sequence position
57         } else if (t == 'N') {
58             indels.push_back(FBIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l), true));
59             alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
60             aabOffset += l;
61             sp += l;  // update reference sequence position
62         } else if (t == 'I') { // insertion
63             indels.push_back(FBIndelAllele(true, l, sp, rp, alignmentSequence.substr(rp, l), false));
64             alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-'));
65             arsOffset += l;
66             rp += l;
67         } else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
68             // remove these bases from the refseq and read seq, but don't modify the alignment sequence
69             if (rp == 0) {
70                 alignedReferenceSequence = string(l, '*') + alignedReferenceSequence;
71                 softBegin = alignmentAlignedBases.substr(0, l);
72             } else {
73                 alignedReferenceSequence = alignedReferenceSequence + string(l, '*');
74                 softEnd = alignmentAlignedBases.substr(alignmentAlignedBases.size() - l, l);
75             }
76             rp += l;
77         } else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
78             //} else if (t == 'N') { // skipped region in the reference not present in read, aka splice
79             //sp += l;
80         }
81     }
82 
83 
84     int alignedLength = sp;
85 
86     LEFTALIGN_DEBUG("| " << cigar_before.str() << endl
87        << "| " << alignedReferenceSequence << endl
88        << "| " << alignmentAlignedBases << endl);
89 
90     // if no indels, return the alignment
91     if (indels.empty()) { return false; }
92 
93     // for each indel, from left to right
94     //     while the indel sequence repeated to the left and we're not matched up with the left-previous indel
95     //         move the indel left
96 
97     vector<FBIndelAllele>::iterator previous = indels.begin();
98     for (vector<FBIndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {
99 
100         // left shift by repeats
101         //
102         // from 1 base to the length of the indel, attempt to shift left
103         // if the move would cause no change in alignment optimality (no
104         // introduction of mismatches, and by definition no change in gap
105         // length), move to the new position.
106         // in practice this moves the indel left when we reach the size of
107         // the repeat unit.
108         //
109         int steppos, readsteppos;
110         FBIndelAllele& indel = *id;
111         int i = 1;
112         while (i <= indel.length) {
113 
114             int steppos = indel.position - i;
115             int readsteppos = indel.readPosition - i;
116 
117 #ifdef VERBOSE_DEBUG
118             if (debug) {
119                 if (steppos >= 0 && readsteppos >= 0) {
120                     cerr << referenceSequence.substr(steppos, indel.length) << endl;
121                     cerr << alignmentSequence.substr(readsteppos, indel.length) << endl;
122                     cerr << indel.sequence << endl;
123                 }
124             }
125 #endif
126             while (steppos >= 0 && readsteppos >= 0
127                    && !indel.splice
128                    && indel.sequence == referenceSequence.substr(steppos, indel.length)
129                    && indel.sequence == alignmentSequence.substr(readsteppos, indel.length)
130                    //&& indel.sequence == alignment.QueryBases.substr(readsteppos, indel.length)
131                    && (id == indels.begin()
132                        || (previous->insertion && steppos >= previous->position)
133                        || (!previous->insertion && steppos >= previous->position + previous->length))) {
134                 LEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl);
135                 indel.position -= i;
136                 indel.readPosition -= i;
137                 steppos = indel.position - i;
138                 readsteppos = indel.readPosition - i;
139             }
140             do {
141                 ++i;
142             } while (i <= indel.length && indel.length % i != 0);
143         }
144 
145         // left shift indels with exchangeable flanking sequence
146         //
147         // for example:
148         //
149         //    GTTACGTT           GTTACGTT
150         //    GT-----T   ---->   G-----TT
151         //
152         // GTGTGACGTGT           GTGTGACGTGT
153         // GTGTG-----T   ---->   GTG-----TGT
154         //
155         // GTGTG-----T           GTG-----TGT
156         // GTGTGACGTGT   ---->   GTGTGACGTGT
157         //
158         //
159         steppos = indel.position - 1;
160         readsteppos = indel.readPosition - 1;
161         while (steppos >= 0 && readsteppos >= 0
162                //&& alignment.QueryBases.at(readsteppos) == referenceSequence.at(steppos)
163                //&& alignment.QueryBases.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
164                && alignmentSequence.at(readsteppos) == referenceSequence.at(steppos)
165                && alignmentSequence.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
166                && (id == indels.begin()
167                    || (previous->insertion && indel.position - 1 >= previous->position)
168                    || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
169             LEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl);
170             indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
171             indel.position -= 1;
172             indel.readPosition -= 1;
173             steppos = indel.position - 1;
174             readsteppos = indel.readPosition - 1;
175         }
176         // tracks previous indel, so we don't run into it with the next shift
177         previous = id;
178     }
179 
180     // bring together floating indels
181     // from left to right
182     // check if we could merge with the next indel
183     // if so, adjust so that we will merge in the next step
184     if (indels.size() > 1) {
185         previous = indels.begin();
186         for (vector<FBIndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
187             FBIndelAllele& indel = *id;
188             // parsimony: could we shift right and merge with the previous indel?
189             // if so, do it
190             int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
191             int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
192             if (!previous->splice && !indel.splice &&
193                 previous->insertion == indel.insertion
194                     && ((previous->insertion
195                         && (previous->position < indel.position
196                         && previous->readPosition + previous->readPosition < indel.readPosition))
197                         ||
198                         (!previous->insertion
199                         && (previous->position + previous->length < indel.position)
200                         && (previous->readPosition < indel.readPosition)
201                         ))) {
202                 if (previous->homopolymer()) {
203                     string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
204                     //string readseq = alignment.QueryBases.substr(prev_end_read, indel.position - prev_end_ref);
205                     string readseq = alignmentSequence.substr(prev_end_read, indel.position - prev_end_ref);
206                     LEFTALIGN_DEBUG("seq: " << seq << endl << "readseq: " << readseq << endl);
207                     if (previous->sequence.at(0) == seq.at(0)
208                             && FBhomopolymer(seq)
209                             && FBhomopolymer(readseq)) {
210                         LEFTALIGN_DEBUG("moving " << *previous << " right to "
211                                 << (indel.insertion ? indel.position : indel.position - previous->length) << endl);
212                         previous->position = indel.insertion ? indel.position : indel.position - previous->length;
213                     }
214                 }
215                 else {
216                     int pos = previous->position;
217                     while (pos < (int) referenceSequence.length() &&
218                             ((previous->insertion && pos + previous->length <= indel.position)
219                             ||
220                             (!previous->insertion && pos + previous->length < indel.position))
221                             && previous->sequence
222                                 == referenceSequence.substr(pos + previous->length, previous->length)) {
223                         pos += previous->length;
224                     }
225                     if (pos < previous->position &&
226                         ((previous->insertion && pos + previous->length == indel.position)
227                         ||
228                         (!previous->insertion && pos == indel.position - previous->length))
229                        ) {
230                         LEFTALIGN_DEBUG("right-merging tandem repeat: moving " << *previous << " right to " << pos << endl);
231                         previous->position = pos;
232                     }
233                 }
234             }
235             previous = id;
236         }
237     }
238 
239     // for each indel
240     //     if ( we're matched up to the previous insertion (or deletion)
241     //          and it's also an insertion or deletion )
242     //         merge the indels
243     //
244     // and simultaneously reconstruct the cigar
245 
246     CIGAR newCigar;
247 
248     if (!softBegin.empty()) {
249       newCigar.ADDCIGAR(CIGOP('S', softBegin.size()));
250     }
251 
252     vector<FBIndelAllele>::iterator id = indels.begin();
253     FBIndelAllele last = *id++;
254     if (last.position > 0) {
255         newCigar.ADDCIGAR(CIGOP('M', last.position));
256     }
257     if (last.insertion) {
258         newCigar.ADDCIGAR(CIGOP('I', last.length));
259     } else if (last.splice) {
260         newCigar.ADDCIGAR(CIGOP('N', last.length));
261     } else {
262         newCigar.ADDCIGAR(CIGOP('D', last.length));
263     }
264     int lastend = last.insertion ? last.position : (last.position + last.length);
265     LEFTALIGN_DEBUG(last << ",");
266 
267     for (; id != indels.end(); ++id) {
268         FBIndelAllele& indel = *id;
269         LEFTALIGN_DEBUG(indel << ",");
270         if (indel.position < lastend) {
271             cerr << "impossibility?: indel realigned left of another indel" << endl << alignment.QNAME
272                 << " " << alignment.POSITION << endl << alignment.QUERYBASES << endl;
273             exit(1);
274 
275         } else if (indel.position == lastend && indel.insertion == last.insertion) {
276             CIGOP& op = newCigar.back();
277 #ifdef HAVE_BAMTOOLS
278             op.Length += indel.length;
279 #else
280 	    op = SeqLib::CigarField(op.Type(), op.Length() + indel.length);
281 #endif
282         } else if (indel.position >= lastend) {  // also catches differential indels, but with the same position
283 	    newCigar.ADDCIGAR(CIGOP('M', indel.position - lastend));
284             if (indel.insertion) {
285                 newCigar.ADDCIGAR(CIGOP('I', indel.length));
286             } else if (indel.splice) {
287                 newCigar.ADDCIGAR(CIGOP('N', indel.length));
288             } else { // deletion
289                 newCigar.ADDCIGAR(CIGOP('D', indel.length));
290             }
291 
292         }
293         last = *id;
294         lastend = last.insertion ? last.position : (last.position + last.length);
295     }
296 
297     if (lastend < alignedLength) {
298         newCigar.ADDCIGAR(CIGOP('M', alignedLength - lastend));
299     }
300 
301     if (!softEnd.empty()) {
302         newCigar.ADDCIGAR(CIGOP('S', softEnd.size()));
303     }
304 
305     LEFTALIGN_DEBUG(endl);
306 
307 #ifdef VERBOSE_DEBUG
308     if (debug) {
309       CIGAR cigar = alignment.GETCIGAR;
310         for (CIGAR::const_iterator c = cigar.begin();
311             c != cigar.end(); ++c) {
312             unsigned int l = c->CIGLEN;
313             char t = c->CIGTYPE;
314             cerr << l << t;
315         }
316         cerr << endl;
317     }
318 #endif
319 
320 #ifdef HAVE_BAMTOOLS
321     alignment.CigarData = newCigar;
322 #else
323     alignment.SetCigar(newCigar);
324 #endif
325 
326     cigar = alignment.GETCIGAR;
327     for (CIGAR::const_iterator c = cigar.begin();
328 	 c != cigar.end(); ++c) {
329       unsigned int l = c->CIGLEN;
330       char t = c->CIGTYPE;
331       cigar_after << l << t;
332     }
333     LEFTALIGN_DEBUG(cigar_after.str() << endl);
334 
335     // check if we're realigned
336     if (cigar_after.str() == cigar_before.str()) {
337         return false;
338     } else {
339         return true;
340     }
341 
342 }
343 
countMismatches(BAMALIGN & alignment,string referenceSequence)344 int countMismatches(BAMALIGN& alignment, string referenceSequence) {
345   const string alignmentSequence = alignment.QUERYBASES;
346 
347     int mismatches = 0;
348     int sp = 0;
349     int rp = 0;
350       CIGAR cigar = alignment.GETCIGAR;
351         for (CIGAR::const_iterator c = cigar.begin();
352             c != cigar.end(); ++c) {
353         unsigned int l = c->CIGLEN;
354         char t = c->CIGTYPE;
355 
356         if (t == 'M' || t == 'X' || t == '=') { // match or mismatch
357             for (int i = 0; i < l; ++i) {
358 	      //if (alignment.QueryBases.at(rp) != referenceSequence.at(sp))
359                 if (alignmentSequence.at(rp) != referenceSequence.at(sp))
360                     ++mismatches;
361                 ++sp;
362                 ++rp;
363             }
364         } else if (t == 'D') { // deletion
365             sp += l;  // update reference sequence position
366         } else if (t == 'I') { // insertion
367             rp += l;  // update read position
368         } else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
369             rp += l;
370         } else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
371         } else if (t == 'N') { // skipped region in the reference not present in read, aka splice
372             sp += l;
373         }
374     }
375 
376     return mismatches;
377 
378 }
379 
380 // Iteratively left-aligns the indels in the alignment until we have a stable
381 // realignment.  Returns true on realignment success or non-realignment.
382 // Returns false if we exceed the maximum number of realignment iterations.
383 //
stablyLeftAlign(BAMALIGN & alignment,string referenceSequence,int maxiterations,bool debug)384     bool stablyLeftAlign(BAMALIGN& alignment, string referenceSequence, int maxiterations, bool debug) {
385 
386 #ifdef VERBOSE_DEBUG
387     int mismatchesBefore = countMismatches(alignment, referenceSequence);
388 #endif
389 
390     if (!leftAlign(alignment, referenceSequence, debug)) {
391         return true;
392 
393     } else {
394 
395         while (leftAlign(alignment, referenceSequence, debug) && --maxiterations > 0) {
396             LEFTALIGN_DEBUG("realigning ..." << endl);
397         }
398 
399 #ifdef VERBOSE_DEBUG
400         int mismatchesAfter = countMismatches(alignment, referenceSequence);
401 
402         if (mismatchesBefore != mismatchesAfter) {
403             cerr << alignment.QNAME << endl;
404             cerr << "ERROR: found " << mismatchesBefore << " mismatches before, but " << mismatchesAfter << " after left realignment!" << endl;
405             exit(1);
406         }
407 #endif
408 
409         if (maxiterations <= 0) {
410             return false;
411         } else {
412             return true;
413         }
414 
415     }
416 
417 }
418