1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32
33 #ifndef SEQAN_HEADER_GRAPH_SMITH_WATERMAN_H
34 #define SEQAN_HEADER_GRAPH_SMITH_WATERMAN_H
35
36 namespace SEQAN_NAMESPACE_MAIN
37 {
38
39
40 //////////////////////////////////////////////////////////////////////////////
41 // Alignment: Smith Waterman Alignment, affine gap cost
42 //////////////////////////////////////////////////////////////////////////////
43
44
45 //////////////////////////////////////////////////////////////////////////////
46
47 template<typename TSpec, typename TSize>
48 inline void
_setForbiddenCell(String<bool,TSpec> & forbidden,TSize len1,TSize len2,TSize numRows)49 _setForbiddenCell(String<bool, TSpec>& forbidden,
50 TSize len1,
51 TSize len2,
52 TSize numRows)
53 {
54 forbidden[(len1 - 1)*numRows + (len2 - 1)] = true;
55 }
56
57 //////////////////////////////////////////////////////////////////////////////
58
59 template<typename TSize>
60 inline void
_setForbiddenCell(Nothing &,TSize,TSize,TSize)61 _setForbiddenCell(Nothing&,
62 TSize,
63 TSize,
64 TSize)
65 {
66 }
67
68 //////////////////////////////////////////////////////////////////////////////
69
70 template<typename TSpec, typename TSize>
71 inline bool
_isClumping(String<bool,TSpec> const & forbidden,TSize row,TSize col,TSize len2)72 _isClumping(String<bool, TSpec> const& forbidden,
73 TSize row,
74 TSize col,
75 TSize len2)
76 {
77 return forbidden[(col-1) * len2 + (row-1)];
78 }
79
80 //////////////////////////////////////////////////////////////////////////////
81
82 template<typename TSize>
83 inline bool
_isClumping(Nothing &,TSize,TSize,TSize)84 _isClumping(Nothing&,
85 TSize,
86 TSize,
87 TSize)
88 {
89 return false;
90 }
91
92 ////////////////////////////////////////////////////////////////////////////
93
94 template <typename TAlign, typename TStringSet, typename TTrace, typename TVal, typename TIndexPair, typename TForbidden>
95 inline void
_alignSmithWatermanTrace(TAlign & align,TStringSet const & str,TTrace const & trace,TVal const initialDir,TIndexPair const & indexPair,TForbidden & forbidden)96 _alignSmithWatermanTrace(TAlign& align,
97 TStringSet const& str,
98 TTrace const& trace,
99 TVal const initialDir,
100 TIndexPair const& indexPair,
101 TForbidden& forbidden)
102 {
103 SEQAN_CHECKPOINT
104 typedef typename Size<TTrace>::Type TSize;
105 typedef typename Value<TTrace>::Type TTraceValue;
106 typedef typename Id<TStringSet>::Type TId;
107
108 // TraceBack values for Gotoh
109 TTraceValue Diagonal = 0; TTraceValue Horizontal = 1; TTraceValue Vertical = 2; TTraceValue Stop = 3;
110
111 TId id1 = positionToId(const_cast<TStringSet&>(str), 0);
112 TId id2 = positionToId(const_cast<TStringSet&>(str), 1);
113 TSize len1 = indexPair[1];
114 TSize len2 = indexPair[0];
115 if ((indexPair[0] == 0) || (indexPair[1] == 0)) return;
116 TSize numCols = length(str[0]);
117 TSize numRowsOrig = length(str[1]);
118 if (len1 < numCols) _alignTracePrint(align, str, id1, len1, id2, len2, numCols - len1, Horizontal);
119 if (len2 < numRowsOrig) _alignTracePrint(align, str, id1, len1, id2, len2, numRowsOrig - len2, Vertical);
120 TSize numRows = (numRowsOrig >> 1) + (numRowsOrig & 1);
121
122
123
124 // Initialize everything
125 TTraceValue nextTraceValue = (len2 & 1) ? trace[(len1 - 1)*numRows + ((len2 - 1) >> 1)] >> 4 : trace[(len1 - 1)*numRows + ((len2 - 1) >> 1)];
126 TTraceValue tv = Diagonal;
127 if (initialDir == Diagonal) tv = (nextTraceValue & 3);
128 else if (initialDir == Horizontal) {
129 if ((nextTraceValue >> 2) & 1) _alignTracePrint(align, str, id1, --len1, id2, len2, (TSize) 1, Horizontal);
130 else tv = Horizontal;
131 } else if (initialDir == Vertical) {
132 if ((nextTraceValue >> 3) & 1) _alignTracePrint(align, str, id1, len1, id2, --len2, (TSize) 1, Vertical);
133 else tv = Vertical;
134 }
135 TSize segLen = 0;
136 TTraceValue tvOld = tv;
137
138 // Now follow the trace
139 do {
140 nextTraceValue = (len2 & 1) ? trace[(len1 - 1)*numRows + ((len2 - 1) >> 1)] >> 4 : trace[(len1 - 1)*numRows + ((len2 - 1) >> 1)];
141 if ((nextTraceValue & 3) == Stop) break;
142 _setForbiddenCell(forbidden, len1, len2, numRowsOrig);
143 if (tv == Diagonal) tv = (nextTraceValue & 3);
144 else if (tv == Horizontal) {
145 if ((nextTraceValue >> 2) & 1) tv = Diagonal;
146 else tv = Horizontal;
147 } else if (tv == Vertical) {
148 if ((nextTraceValue >> 3) & 1) tv = Diagonal;
149 else tv = Vertical;
150 }
151 if (tv == Diagonal) {
152 if (tv != tvOld) {
153 if (tvOld == Vertical) --len2;
154 else --len1;
155 _alignTracePrint(align, str, id1, len1, id2, len2, ++segLen, tvOld);
156 tvOld = tv; segLen = 0;
157 } else {
158 ++segLen;
159 --len1; --len2;
160 }
161 } else if (tv == Horizontal) {
162 if (tv != tvOld) {
163 _alignTracePrint(align, str, id1, len1, id2, len2, segLen, tvOld);
164 if ((nextTraceValue >> 2) & 1) {
165 _alignTracePrint(align, str, id1, --len1, id2, len2, (TSize) 1, Horizontal);
166 tv = Diagonal; segLen = 0;
167 } else {
168 tvOld = tv; segLen = 1;
169 --len1;
170 }
171 } else {
172 ++segLen;
173 --len1;
174 }
175 } else if (tv == Vertical) {
176 if (tv != tvOld) {
177 _alignTracePrint(align, str, id1, len1, id2, len2, segLen, tvOld);
178 if ((nextTraceValue >> 3) & 1) {
179 _alignTracePrint(align, str, id1, len1, id2, --len2, (TSize) 1, Vertical);
180 tv = Diagonal; segLen = 0;
181 } else {
182 tvOld = tv; segLen = 1;
183 --len2;
184 }
185 } else {
186 ++segLen;
187 --len2;
188 }
189 }
190 } while ((len1 != 0) && (len2 !=0));
191 // Process left-overs
192 if (segLen) _alignTracePrint(align, str, id1, len1, id2, len2, segLen, tvOld);
193
194 // Handle the remaining sequence
195 if (len1 != 0) _alignTracePrint(align, str, (TId) id1, (TSize) 0, (TId) 0, (TSize) 0, (TSize) len1, Horizontal);
196 if (len2 != 0) _alignTracePrint(align, str, (TId) 0, (TSize) 0, (TId) id2, (TSize) 0, (TSize) len2, Vertical);
197 }
198
199
200 //////////////////////////////////////////////////////////////////////////////
201
202 //////////////////////////////////////////////////////////////////////////////
203
204 template <typename TTrace, typename TStringSet, typename TScore, typename TIndexPair, typename TForbidden>
205 inline typename Value<TScore>::Type
_alignSmithWaterman(TTrace & trace,TStringSet const & str,TScore const & sc,typename Value<TTrace>::Type & initialDir,TIndexPair & indexPair,TForbidden & forbidden)206 _alignSmithWaterman(TTrace& trace,
207 TStringSet const& str,
208 TScore const & sc,
209 typename Value<TTrace>::Type& initialDir,
210 TIndexPair& indexPair,
211 TForbidden& forbidden)
212 {
213 SEQAN_CHECKPOINT
214 typedef typename Size<TTrace>::Type TSize;
215 typedef typename Value<TTrace>::Type TTraceValue;
216
217 // TraceBack values for Smith Waterman
218 TTraceValue Diagonal = 0; TTraceValue Horizontal = 1; TTraceValue Vertical = 2; TTraceValue Stop = 3;
219
220 // The DP Matrix for diagonal walks
221 typedef typename Value<TScore>::Type TScoreValue;
222 typedef String<TScoreValue> TColumn;
223 TColumn mat;
224 // The DP Matrix for gaps from the left
225 TColumn horizontal;
226 // The DP Matrix for gaps from the top
227 TScoreValue vert = 0;
228
229 typedef typename Iterator<TColumn, Standard>::Type TMatIter;
230
231 // Initialization
232 typedef typename Value<TStringSet>::Type TString;
233 TString const& str1 = str[0];
234 TString const& str2 = str[1];
235 TSize len1 = length(str1);
236 TSize len2 = length(str2);
237 resize(mat, (len2+1)); // One column for the diagonal matrix
238 resize(horizontal, (len2+1)); // One column for the horizontal matrix
239 resize(trace, len1 * ((len2 >> 1) + (len2 & 1)), 0);
240 TTraceValue tvMat= 0;
241
242 // Record the max score
243 TScoreValue score_max = 0;
244 indexPair[0] = 0; indexPair[1] = 0;
245 initialDir = Stop;
246
247 // Classical DP
248 TScoreValue max_val = 0;
249 TScoreValue a = 0;
250 TScoreValue b = 0;
251 typedef typename Iterator<TTrace, Standard>::Type TTraceIter;
252 TTraceIter it = begin(trace, Standard() );
253 TMatIter matIt = begin(mat, Standard() );
254 TMatIter horiIt = begin(horizontal, Standard() );
255 *matIt = 0;
256 for(TSize row = 1; row <= len2; ++row) {
257 *(++matIt) = 0;
258 *(++horiIt) = scoreGapOpenHorizontal(sc, 0, row-1, str1, str2) - scoreGapExtendHorizontal(sc, 0, row-1, str1, str2);
259 }
260 for(TSize col = 1; col <= len1; ++col) {
261 matIt = begin(mat, Standard() );
262 horiIt = begin(horizontal, Standard() );
263 TScoreValue diagValMat = *matIt;
264 *matIt = 0;
265 vert = scoreGapOpenVertical(sc, col-1, 0, str1, str2) - scoreGapExtendVertical(sc, col-1, 0, str1, str2);
266 TSize row = 1;
267 while(row <= len2) {
268 if (_isClumping(forbidden, row, col, len2)) {
269 *it <<= 3;
270 *it |= Stop;
271 max_val = 0;
272 vert = 0;
273 *(++horiIt) = 0;
274 ++matIt;
275 } else {
276 // Get the new maximum for vertical
277 a = *matIt + scoreGapOpenVertical(sc, col-1, row-1, str1, str2);
278 b = vert + scoreGapExtendVertical(sc, col-1, row-1, str1, str2);
279 if (a > b) { vert = a; *it |= 1;}
280 else vert = b;
281
282 // Get the new maximum for horizontal
283 *it <<= 1;
284 a = *(++matIt) + scoreGapOpenHorizontal(sc, col-1, row-1, str1, str2);
285 b = *(++horiIt) + scoreGapExtendHorizontal(sc, col-1, row-1, str1, str2);
286 if (a > b) {*horiIt = a; *it |= 1; }
287 else *horiIt = b;
288
289 // Get the new maximum for mat
290 *it <<= 2;
291 max_val = diagValMat + score(const_cast<TScore&>(sc), col-1, row-1, str1, str2);
292 tvMat = Diagonal;
293 if (vert > max_val) {
294 max_val = vert;
295 tvMat = Vertical;
296 }
297 if (*horiIt > max_val) {
298 max_val = *horiIt;
299 tvMat = Horizontal;
300 }
301 if (0 >= max_val) {
302 max_val = 0;
303 tvMat = Stop;
304 }
305 *it |= tvMat;
306 }
307
308 // Assign the new diagonal values
309 diagValMat = *matIt;
310 *matIt = max_val;
311
312 // Record the new best score
313 if (max_val > score_max) {
314 indexPair[0] = row; indexPair[1] = col;
315 score_max = max_val;
316 initialDir = tvMat;
317 }
318
319 if (row & 1) *it <<= 1; else ++it;
320 ++row;
321 }
322 if (!(row & 1)) {*it <<= 3; ++it; }
323 }
324
325 //// Debug code
326 //for(TSize i= 0; i<len2;++i) {
327 // for(TSize j= 0; j<len1;++j) {
328 // std::cout << (TSize) getValue(trace, j*len2 + i) << ',';
329 // }
330 // std::cout << std::endl;
331 //}
332 //std::cout << "Max score: " << best_row << ',' << best_col << ':' << score_max << " (" << (TSize) initialDir << ")" << std::endl;
333
334 return score_max;
335 }
336
337 //////////////////////////////////////////////////////////////////////////////
338
339 template<typename TAlign, typename TStringSet, typename TScore>
340 inline typename Value<TScore>::Type
_localAlignment(TAlign & align,TStringSet const & str,TScore const & sc,SmithWaterman)341 _localAlignment(TAlign& align,
342 TStringSet const& str,
343 TScore const& sc,
344 SmithWaterman)
345 {
346 SEQAN_CHECKPOINT
347 typedef typename Value<TScore>::Type TScoreValue;
348 typedef typename Size<TStringSet>::Type TSize;
349
350 TScoreValue maxScore;
351 TSize indexPair[2];
352 Nothing forbidden;
353
354 // Trace
355 String<unsigned char> trace;
356 unsigned char initialDir;
357
358 // Create the trace
359 maxScore = _alignSmithWaterman(trace, str, sc, initialDir, indexPair, forbidden);
360 // Follow the trace and create the graph
361 _alignSmithWatermanTrace(align, str, trace, initialDir, indexPair, forbidden);
362
363 return maxScore;
364 }
365
366 //////////////////////////////////////////////////////////////////////////////
367
368 template<typename TStringSet, typename TScore>
369 inline typename Value<TScore>::Type
_localAlignment(TStringSet const & str,TScore const & sc,SmithWaterman)370 _localAlignment(TStringSet const& str,
371 TScore const& sc,
372 SmithWaterman)
373 {
374 SEQAN_CHECKPOINT
375 typedef typename Size<TStringSet>::Type TSize;
376 TSize indexPair[2];
377 Nothing forbidden;
378 // Trace
379 String<unsigned char> trace;
380 unsigned char initialDir;
381 return _alignSmithWaterman(trace, str, sc, initialDir, indexPair, forbidden);
382 }
383
384
385
386 }// namespace SEQAN_NAMESPACE_MAIN
387
388 #endif //#ifndef SEQAN_HEADER_...
389