1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Rene Rahn <rene.rahn@fu-berlin.de>
33 // ==========================================================================
34 // Implements the tracback algorithm for the banded chain alignment.
35 // First the section where both dp matrices intersect is computed to fing
36 // the glue poit between the traces of the current matrix and the next one.
37 // Afterwards the final traceback beginnging in this glue point is computed
38 // fir the current dp matrix.
39 // ==========================================================================
40 
41 #ifndef INCLUDE_SEQAN_SEEDS_BANDED_CHAIN_ALIGNMENT_TRACEBACK_H_
42 #define INCLUDE_SEQAN_SEEDS_BANDED_CHAIN_ALIGNMENT_TRACEBACK_H_
43 
44 namespace seqan {
45 
46 // ============================================================================
47 // Forwards
48 // ============================================================================
49 
50 // ============================================================================
51 // Tags, Classes, Enums
52 // ============================================================================
53 
54 // ============================================================================
55 // Metafunctions
56 // ============================================================================
57 
58 // ----------------------------------------------------------------------------
59 // Metafunction PreferGapsAtEnd_
60 // ----------------------------------------------------------------------------
61 
62 template <typename TFreeEndGaps, typename TMatrixSpec, typename TTracebackSpec>
63 struct PreferGapsAtEnd_<DPProfile_<BandedChainAlignment_<TFreeEndGaps, TMatrixSpec>,
64                                    AffineGaps, TTracebackSpec> > : False{};
65 
66 template <typename TFreeEndGaps, typename TTracebackSpec>
67 struct PreferGapsAtEnd_<DPProfile_<BandedChainAlignment_<TFreeEndGaps, BandedChainFinalDPMatrix>,
68                                    AffineGaps, TTracebackSpec> > : True{};
69 
70 // ============================================================================
71 // Functions
72 // ============================================================================
73 
74 // ----------------------------------------------------------------------------
75 // Function _adaptLocalTracesToGlobalGrid()
76 // ----------------------------------------------------------------------------
77 
78 template <typename TTraceSet, typename TGridPoint>
79 inline void
80 _adaptLocalTracesToGlobalGrid(TTraceSet & traceSet, TGridPoint const & gridBegin)
81 {
82     typedef typename Value<TTraceSet>::Type TTraceString;
83     typedef typename Iterator<TTraceString>::Type TTraceStringIterator;
84 
85     for (unsigned i = 0; i < length(traceSet); ++i)
86     {
87         for (TTraceStringIterator it = begin(traceSet[i]); it != end(traceSet[i]); ++it)
88         {
89             value(it)._horizontalBeginPos += gridBegin.i1;
90             value(it)._verticalBeginPos += gridBegin.i2;
91         }
92     }
93 }
94 
95 // ----------------------------------------------------------------------------
96 // Function _smoothGluePoint()
97 // ----------------------------------------------------------------------------
98 
99 template <typename TTraceSegment, typename TStringSpec, typename TSize>
100 inline void
101 _smoothGluePoint(String<TTraceSegment, TStringSpec> & tracePath, TSize referenceSize)
102 {
103     typedef typename Iterator<String<TTraceSegment, TStringSpec> >::Type TTraceSegmentsIterator;
104 
105     TTraceSegmentsIterator itEndOldTrace = end(tracePath) - referenceSize;
106     TTraceSegmentsIterator itBeginNewTrace = itEndOldTrace - 1;
107     if (_getTraceValue(value(itEndOldTrace)) == _getTraceValue(value(itBeginNewTrace)))
108     {
109         _setLength(value(itEndOldTrace), length(value(itEndOldTrace)) + length(value(itBeginNewTrace)));
110         erase(tracePath, position(itBeginNewTrace, tracePath));
111     }
112 }
113 
114 // ----------------------------------------------------------------------------
115 // Function _glueTracebacks()
116 // ----------------------------------------------------------------------------
117 
118 // TODO(rmaerker): What about using a set to find matching parts more efficiently. Optimize this code here!
119 template <typename TTraceSet>
120 inline void _glueTracebacks(TTraceSet & globalTraces, TTraceSet & localTraces)
121 {
122     typedef typename Value<TTraceSet>::Type TTraceSegments;
123     typedef typename Value<TTraceSegments>::Type TTraceSegment;
124     typedef typename Size<TTraceSegments>::Type TSize;
125 
126     bool isGlued = false;
127 
128     if (empty(globalTraces))
129     {
130         globalTraces = localTraces;
131         return;
132     }
133 
134     TSize lengthGlobalTraces = length(globalTraces);
135     TSize oldNumOfGlobalTraces = lengthGlobalTraces;
136     String<unsigned> elementsToErase;
137 
138     for (unsigned j = 0; j < lengthGlobalTraces; ++j)
139     {
140         // traceback from back to front -> first trace segment is at end of sequences
141         TTraceSegment globalTraceEndPoint = value(begin(globalTraces[j]));
142         TSize numOfCurrElements = length(globalTraces[j]);
143         TSize numOfAddedTraces = 0;
144         // check for all existing paths if traces can be glued together.
145         bool isConnected = false;
146         for (unsigned i = 0; i < length(localTraces); ++i)
147         {
148             // traceback from back to front -> last trace segment is at beginning of sequences.
149             TTraceSegment localTraceBeginPoint = value(end(localTraces[i]) - 1);
150             // trace segments match in horizontal position
151             if (_getEndHorizontal(globalTraceEndPoint) == _getBeginHorizontal(localTraceBeginPoint))
152             {
153                 // trace segments match in vertical position
154                 if (_getEndVertical(globalTraceEndPoint) == _getBeginVertical(localTraceBeginPoint))
155                 { // found a glue point between local and global trace
156                     TSize numOfElementsToAdd = length(localTraces[i]);
157 
158                     if (isConnected)
159                     {
160                         // create a new traceback track.
161                         TTraceSegments newTraceTrack;
162                         resize(newTraceTrack, numOfCurrElements + numOfElementsToAdd, Generous());
163                         arrayMoveForward(begin(localTraces[i]), end(localTraces[i]), begin(newTraceTrack));
164                         arrayCopyForward(end(globalTraces[j]) - numOfCurrElements, end(globalTraces[j]), begin(newTraceTrack) + numOfElementsToAdd);
165                         appendValue(globalTraces, newTraceTrack);
166                         ++numOfAddedTraces;
167                         continue;
168                     }
169                     // resize global traces such that new elements fit into it
170                     resize(globalTraces[j], numOfCurrElements + numOfElementsToAdd, Generous());
171                     // shift old values to correct position in array after new elements will be added.
172                     // use backward move in order to avoid overwriting when ranges overlap
173                     arrayMoveBackward(begin(globalTraces[j]), begin(globalTraces[j]) + numOfCurrElements,
174                                       begin(globalTraces[j]) + numOfElementsToAdd);
175                     // actually move new elements to global traces at it's begin
176                     arrayMoveForward(begin(localTraces[i]), end(localTraces[i]), begin(globalTraces[j]));
177                     isGlued = true;
178                     isConnected = true;
179                 }
180             }
181         }
182         if (!isConnected)
183         {
184             appendValue(elementsToErase, j);
185         }
186         else
187         {
188             // First, smooth the actual path we are currently gluing.
189             _smoothGluePoint(globalTraces[j], numOfCurrElements);
190             // Second, smooth all paths that have been added to the global traceback.
191             for (unsigned traceId = oldNumOfGlobalTraces; traceId < oldNumOfGlobalTraces + numOfAddedTraces; ++traceId)
192                 _smoothGluePoint(globalTraces[traceId], numOfCurrElements);
193             oldNumOfGlobalTraces += numOfAddedTraces;
194         }
195     }
196 
197     for (unsigned i = length(elementsToErase); i > 0; --i)
198     {
199         erase(globalTraces, elementsToErase[i-1]); // erase from behind to avoid accessing an element beyond the scope
200     }
201     SEQAN_ASSERT_EQ_MSG(isGlued, true, "Fatal error while trying to connect trace backs: No glue point available!");
202     (void)isGlued;
203 }
204 
205 // ----------------------------------------------------------------------------
206 // Function _correctDPCellForAffineGaps()
207 // ----------------------------------------------------------------------------
208 
209 template <typename TScoreValue, typename TTraceValue>
210 inline void
211 _correctDPCellForAffineGaps(DPCell_<TScoreValue, LinearGaps> const &, TTraceValue /*lastTraceValue*/)
212 {
213     //no-op
214 }
215 
216 template <typename TScoreValue, typename TTraceValue>
217 inline void
218 _correctDPCellForAffineGaps(DPCell_<TScoreValue, AffineGaps> & dpCell, TTraceValue lastTraceValue)
219 {
220     typedef DPCell_<TScoreValue, AffineGaps> TDPCell;
221     if (lastTraceValue & TraceBitMap_::DIAGONAL)
222     {
223         dpCell._verticalScore = DPCellDefaultInfinity<TDPCell>::VALUE;
224         dpCell._horizontalScore = DPCellDefaultInfinity<TDPCell>::VALUE;
225     }
226     else if (lastTraceValue & TraceBitMap_::VERTICAL)
227         dpCell._horizontalScore = DPCellDefaultInfinity<TDPCell>::VALUE;
228     else
229         dpCell._verticalScore = DPCellDefaultInfinity<TDPCell>::VALUE;
230 }
231 
232 // ----------------------------------------------------------------------------
233 // Function _computeTraceback()                          [BandedChainAlignment]
234 // ----------------------------------------------------------------------------
235 
236 template<typename TTarget, typename TDPTraceMatrixNavigator, typename TDPCell, typename TScoutSpec,
237          typename TSequenceH, typename TSequenceV, typename TBandFlag, typename TFreeEndGaps, typename TDPMatrixLocation,
238          typename TGapCosts, typename TTracebackSpec>
239 void _computeTraceback(TTarget & target,
240                        TDPTraceMatrixNavigator & matrixNavigator,
241                        unsigned maxHostPosition,
242                        DPScout_<TDPCell, TScoutSpec> & dpScout,
243                        TSequenceH const & seqH,
244                        TSequenceV const & seqV,
245                        DPBandConfig<TBandFlag> const & band,
246                        DPProfile_<BandedChainAlignment_<TFreeEndGaps, TDPMatrixLocation>, TGapCosts, TTracebackSpec> const & dpProfile)
247 {
248     typedef DPScout_<TDPCell, TScoutSpec> TDPScout_;
249     typedef typename TDPScout_::TScoutState TScoutState_;
250     typedef typename TScoutState_::TInitCell TInitCell;
251     typedef typename Container<TDPTraceMatrixNavigator>::Type TContainer;
252     typedef typename Size<TContainer>::Type TSize;
253     //typedef typename MakeSigned<TSize>::Type TSignedSize;
254     typedef typename Position<TContainer>::Type TPosition;
255     typedef typename MakeSigned<TPosition>::Type TSignedPosition;
256     typedef typename Size<TSequenceH>::Type TSizeSeqH;
257     typedef typename Size<TSequenceV>::Type TSizeSeqV;
258     typedef typename TraceBitMap_::TTraceValue TTraceValue;
259 
260     if (IsSameType<TTracebackSpec, TracebackOff>::VALUE)
261         return;
262 
263     TSizeSeqH seqHSize = length(seqH);
264     TSizeSeqV seqVSize = length(seqV);
265 
266     // Determine whether or not we place gaps to the left.
267     typedef typename IsGapsLeft_<TTracebackSpec>::Type TIsGapsLeft;
268 
269     // TODO(rmaerker): Define separate function for this.
270     // Set to the correct position within the trace matrix.
271     _setToPosition(matrixNavigator, maxHostPosition);
272 
273     SEQAN_ASSERT_LEQ(coordinate(matrixNavigator, +DPMatrixDimension_::HORIZONTAL), seqHSize);
274     SEQAN_ASSERT_LEQ(coordinate(matrixNavigator, +DPMatrixDimension_::VERTICAL), seqVSize);
275 
276     TTraceValue traceValue = value(matrixNavigator);
277     TTraceValue lastTraceValue = _retrieveInitialTraceDirection(traceValue, dpProfile);
278 
279     TracebackCoordinator_<TPosition> tracebackCoordinator(coordinate(matrixNavigator, +DPMatrixDimension_::HORIZONTAL),
280                                                           coordinate(matrixNavigator, +DPMatrixDimension_::VERTICAL),
281                                                           dpScout._dpScoutStatePtr->_horizontalNextGridOrigin,
282                                                           dpScout._dpScoutStatePtr->_verticalNextGridOrigin,
283                                                           band, seqHSize, seqVSize);
284 
285     // Record trailing gaps if any.
286     if (IsSameType<TDPMatrixLocation, BandedChainFinalDPMatrix>::VALUE)
287     {
288         if (tracebackCoordinator._currRow != seqVSize)
289             _recordSegment(target, seqHSize, tracebackCoordinator._currRow, seqVSize - tracebackCoordinator._currRow,
290                            +TraceBitMap_::VERTICAL);
291         if (tracebackCoordinator._currColumn != seqHSize)
292             _recordSegment(target, tracebackCoordinator._currColumn, tracebackCoordinator._currRow, seqHSize -
293                            tracebackCoordinator._currColumn, +TraceBitMap_::HORIZONTAL);
294 
295         _computeTraceback(target, matrixNavigator, position(matrixNavigator), seqH, seqV, band, dpProfile);
296         return;
297     }
298 
299     TSize fragmentLength = 0;
300     TTarget tmp;
301     while(!_hasReachedEnd(tracebackCoordinator) && traceValue != +TraceBitMap_::NONE)
302         _doTraceback(tmp, matrixNavigator, traceValue, lastTraceValue, fragmentLength, tracebackCoordinator, TGapCosts(), TIsGapsLeft());
303 
304     TSignedPosition horizontalInitPos = static_cast<TSignedPosition>(tracebackCoordinator._currColumn) -
305                                         static_cast<TSignedPosition>(tracebackCoordinator._endColumn);
306     TSignedPosition verticalInitPos = static_cast<TSignedPosition>(tracebackCoordinator._currRow) -
307                                       static_cast<TSignedPosition>(tracebackCoordinator._endRow);
308 
309     bool insertResult;
310     if (verticalInitPos <= 0)
311     {
312         _correctDPCellForAffineGaps(dpScout._dpScoutStatePtr->_horizontalInitNextMatrix[horizontalInitPos], lastTraceValue);
313         insertResult = dpScout._dpScoutStatePtr->_nextInitializationCells.insert(TInitCell(horizontalInitPos, 0,
314                                                 dpScout._dpScoutStatePtr->_horizontalInitNextMatrix[horizontalInitPos])).second;
315     }
316     else
317     {
318         _correctDPCellForAffineGaps(dpScout._dpScoutStatePtr->_verticalInitNextMatrix[verticalInitPos], lastTraceValue);
319         insertResult = dpScout._dpScoutStatePtr->_nextInitializationCells.insert(TInitCell(0, verticalInitPos,
320                                                 dpScout._dpScoutStatePtr->_verticalInitNextMatrix[verticalInitPos])).second;
321     }
322 
323     // Now before we can continue at the current position, we might want to track a vertical/horizontal gap up to this position.
324     if (insertResult)
325     {
326         if (verticalInitPos < 0)  // Here we are in a vertical gap.
327             _recordSegment(target, tracebackCoordinator._currColumn, tracebackCoordinator._currRow, -verticalInitPos,
328                            lastTraceValue);
329         else if (horizontalInitPos < 0)  // Here we are in a horizontal gap.
330             _recordSegment(target, tracebackCoordinator._currColumn, tracebackCoordinator._currRow, -horizontalInitPos,
331                            lastTraceValue);
332         _computeTraceback(target, matrixNavigator, position(matrixNavigator), seqH, seqV, band, dpProfile);
333     }
334 
335     if (IsSameType<TDPMatrixLocation, BandedChainInitialDPMatrix>::VALUE)
336     {
337         TPosition currCol = coordinate(matrixNavigator, +DPMatrixDimension_::HORIZONTAL);
338         TPosition currRow = coordinate(matrixNavigator, +DPMatrixDimension_::VERTICAL);
339 
340         // Correct the row position.
341         if (IsSameType<TBandFlag, BandOn>::VALUE)
342             if (upperDiagonal(band) > 0)
343                 if (currCol < tracebackCoordinator._breakpoint1)
344                     if (currCol < tracebackCoordinator._breakpoint2)
345                         currRow -= length(container(matrixNavigator), +DPMatrixDimension_::VERTICAL) - 1 + lowerDiagonal(band) - currCol;
346         // Record leading gaps if any.
347         if (currRow != 0u)
348             _recordSegment(target, 0, 0, currRow, +TraceBitMap_::VERTICAL);
349         if (currCol != 0u)
350             _recordSegment(target, 0, 0, currCol, +TraceBitMap_::HORIZONTAL);
351     }
352 }
353 
354 // ----------------------------------------------------------------------------
355 // Function _computeTraceback()        [BandedChainAlignment, global interface]
356 // ----------------------------------------------------------------------------
357 
358 template <typename TTarget, typename TDPTraceMatrixNavigator, typename TDPScout, typename TSequenceH, typename TSequenceV,
359 typename TBandFlag, typename TFreeEndGaps, typename TDPMatrixLocation, typename TGapCosts, typename TTracebackSpec>
360 void _computeTraceback(StringSet<TTarget> & targetSet,
361                        TDPTraceMatrixNavigator & matrixNavigator,
362                        TDPScout & dpScout,
363                        TSequenceH const & seqH,
364                        TSequenceV const & seqV,
365                        DPBandConfig<TBandFlag> const & band,
366                        DPProfile_<BandedChainAlignment_<TFreeEndGaps, TDPMatrixLocation>, TGapCosts, TTracebackSpec> const & dpProfile)
367 {
368     typedef typename TDPScout::TMaxHostPositionString TMaxHostPositions;
369     typedef typename Iterator<TMaxHostPositions, Standard>::Type TMaxHostPositionsIterator;
370 
371     // We have to clear the nextInitalization cells here.
372     dpScout._dpScoutStatePtr->_nextInitializationCells.clear();
373     TMaxHostPositions & tracebackCandidates = maxHostPositions(dpScout);
374 
375     TMaxHostPositionsIterator itTraceCandidates = begin(tracebackCandidates, Standard());
376 
377     for (;itTraceCandidates != end(tracebackCandidates, Standard()); ++itTraceCandidates)
378     {
379         TTarget tmpTarget;
380         _computeTraceback(tmpTarget, matrixNavigator, *itTraceCandidates, dpScout, seqH, seqV, band, dpProfile);
381         if (!empty(tmpTarget))
382             appendValue(targetSet, tmpTarget);  // We only need to add the tmpTarget if it gives us a new alignment.
383     }
384 }
385 
386 }  // namespace seqan
387 
388 #endif  // #ifndef INCLUDE_SEQAN_SEEDS_BANDED_CHAIN_ALIGNMENT_TRACEBACK_H_
389