1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 
33 #ifndef SEQAN_HEADER_GRAPH_SMITH_WATERMAN_H
34 #define SEQAN_HEADER_GRAPH_SMITH_WATERMAN_H
35 
36 namespace SEQAN_NAMESPACE_MAIN
37 {
38 
39 
40 //////////////////////////////////////////////////////////////////////////////
41 // Alignment: Smith Waterman Alignment, affine gap cost
42 //////////////////////////////////////////////////////////////////////////////
43 
44 
45 //////////////////////////////////////////////////////////////////////////////
46 
47 template<typename TSpec, typename TSize>
48 inline void
_setForbiddenCell(String<bool,TSpec> & forbidden,TSize len1,TSize len2,TSize numRows)49 _setForbiddenCell(String<bool, TSpec>& forbidden,
50 				  TSize len1,
51 				  TSize len2,
52 				  TSize numRows)
53 {
54 	forbidden[(len1 - 1)*numRows + (len2 - 1)] = true;
55 }
56 
57 //////////////////////////////////////////////////////////////////////////////
58 
59 template<typename TSize>
60 inline void
_setForbiddenCell(Nothing &,TSize,TSize,TSize)61 _setForbiddenCell(Nothing&,
62 				  TSize,
63 				  TSize,
64 				  TSize)
65 {
66 }
67 
68 //////////////////////////////////////////////////////////////////////////////
69 
70 template<typename TSpec, typename TSize>
71 inline bool
_isClumping(String<bool,TSpec> const & forbidden,TSize row,TSize col,TSize len2)72 _isClumping(String<bool, TSpec> const& forbidden,
73 			TSize row,
74 			TSize col,
75 			TSize len2)
76 {
77 	return forbidden[(col-1) * len2 + (row-1)];
78 }
79 
80 //////////////////////////////////////////////////////////////////////////////
81 
82 template<typename TSize>
83 inline bool
_isClumping(Nothing &,TSize,TSize,TSize)84 _isClumping(Nothing&,
85 			TSize,
86 			TSize,
87 			TSize)
88 {
89 	return false;
90 }
91 
92 ////////////////////////////////////////////////////////////////////////////
93 
94 template <typename TAlign, typename TStringSet, typename TTrace, typename TVal, typename TIndexPair, typename TForbidden>
95 inline void
_alignSmithWatermanTrace(TAlign & align,TStringSet const & str,TTrace const & trace,TVal const initialDir,TIndexPair const & indexPair,TForbidden & forbidden)96 _alignSmithWatermanTrace(TAlign& align,
97 							TStringSet const& str,
98 							TTrace const& trace,
99 							TVal const initialDir,
100 							TIndexPair const& indexPair,
101 							TForbidden& forbidden)
102 {
103 	SEQAN_CHECKPOINT
104 	typedef typename Size<TTrace>::Type TSize;
105 	typedef typename Value<TTrace>::Type TTraceValue;
106 	typedef typename Id<TStringSet>::Type TId;
107 
108 	// TraceBack values for Gotoh
109 	TTraceValue Diagonal = 0; TTraceValue Horizontal = 1; TTraceValue Vertical = 2; TTraceValue Stop = 3;
110 
111 	TId id1 = positionToId(const_cast<TStringSet&>(str), 0);
112 	TId id2 = positionToId(const_cast<TStringSet&>(str), 1);
113 	TSize len1 = indexPair[1];
114 	TSize len2 = indexPair[0];
115 	if ((indexPair[0] == 0) || (indexPair[1] == 0)) return;
116 	TSize numCols = length(str[0]);
117 	TSize numRowsOrig = length(str[1]);
118 	if (len1 < numCols) _alignTracePrint(align, str, id1, len1, id2, len2, numCols - len1, Horizontal);
119 	if (len2 < numRowsOrig) _alignTracePrint(align, str, id1, len1, id2, len2, numRowsOrig - len2, Vertical);
120 	TSize numRows = (numRowsOrig >> 1) + (numRowsOrig & 1);
121 
122 
123 
124 	// Initialize everything
125 	TTraceValue nextTraceValue = (len2 & 1) ? trace[(len1 - 1)*numRows + ((len2 - 1) >> 1)] >> 4 : trace[(len1 - 1)*numRows + ((len2 - 1) >> 1)];
126 	TTraceValue tv = Diagonal;
127 	if (initialDir == Diagonal) tv = (nextTraceValue & 3);
128 	else if (initialDir == Horizontal) {
129 		if ((nextTraceValue >> 2) & 1) _alignTracePrint(align, str, id1, --len1, id2, len2, (TSize) 1, Horizontal);
130 		else tv = Horizontal;
131 	} else if (initialDir == Vertical) {
132 		if ((nextTraceValue >> 3) & 1) _alignTracePrint(align, str, id1, len1, id2, --len2, (TSize) 1, Vertical);
133 		else tv = Vertical;
134 	}
135 	TSize segLen = 0;
136 	TTraceValue tvOld = tv;
137 
138 	// Now follow the trace
139 	do {
140 		nextTraceValue = (len2 & 1) ? trace[(len1 - 1)*numRows + ((len2 - 1) >> 1)] >> 4 : trace[(len1 - 1)*numRows + ((len2 - 1) >> 1)];
141 		if ((nextTraceValue & 3) == Stop) break;
142 		_setForbiddenCell(forbidden, len1, len2, numRowsOrig);
143 		if (tv == Diagonal) tv = (nextTraceValue & 3);
144 		else if (tv == Horizontal) {
145 			if ((nextTraceValue >> 2) & 1) tv = Diagonal;
146 			else tv =  Horizontal;
147 		} else if (tv == Vertical) {
148 			if ((nextTraceValue >> 3) & 1) tv =  Diagonal;
149 			else tv =  Vertical;
150 		}
151 		if (tv == Diagonal) {
152 			if (tv != tvOld) {
153 				if (tvOld == Vertical) --len2;
154 				else --len1;
155 				_alignTracePrint(align, str, id1, len1, id2, len2, ++segLen, tvOld);
156 				tvOld = tv; segLen = 0;
157 			} else {
158 				++segLen;
159 				--len1; --len2;
160 			}
161 		} else if (tv == Horizontal) {
162 			if (tv != tvOld) {
163 				_alignTracePrint(align, str, id1, len1, id2, len2, segLen, tvOld);
164 				if ((nextTraceValue >> 2) & 1) {
165 					_alignTracePrint(align, str, id1, --len1, id2, len2, (TSize) 1, Horizontal);
166 					tv = Diagonal; segLen = 0;
167 				} else {
168 					tvOld = tv; segLen = 1;
169 					--len1;
170 				}
171 			} else {
172 				++segLen;
173 				--len1;
174 			}
175 		} else if (tv == Vertical) {
176 			if (tv != tvOld) {
177 				_alignTracePrint(align, str, id1, len1, id2, len2, segLen, tvOld);
178 				if ((nextTraceValue >> 3) & 1) {
179 					_alignTracePrint(align, str, id1, len1, id2, --len2, (TSize) 1, Vertical);
180 					tv = Diagonal; segLen = 0;
181 				} else {
182 					tvOld = tv; segLen = 1;
183 					--len2;
184 				}
185 			} else {
186 				++segLen;
187 				--len2;
188 			}
189 		}
190 	} while ((len1 != 0) && (len2 !=0));
191 	// Process left-overs
192 	if (segLen) _alignTracePrint(align, str, id1, len1, id2, len2, segLen, tvOld);
193 
194 	// Handle the remaining sequence
195 	if (len1 != 0) _alignTracePrint(align, str, (TId) id1, (TSize) 0, (TId) 0, (TSize) 0, (TSize) len1, Horizontal);
196 	if (len2 != 0) _alignTracePrint(align, str, (TId) 0, (TSize) 0, (TId) id2, (TSize) 0, (TSize) len2, Vertical);
197 }
198 
199 
200 //////////////////////////////////////////////////////////////////////////////
201 
202 //////////////////////////////////////////////////////////////////////////////
203 
204 template <typename TTrace, typename TStringSet, typename TScore, typename TIndexPair, typename TForbidden>
205 inline typename Value<TScore>::Type
_alignSmithWaterman(TTrace & trace,TStringSet const & str,TScore const & sc,typename Value<TTrace>::Type & initialDir,TIndexPair & indexPair,TForbidden & forbidden)206 _alignSmithWaterman(TTrace& trace,
207 					  TStringSet const& str,
208 					  TScore const & sc,
209 					  typename Value<TTrace>::Type& initialDir,
210 					  TIndexPair& indexPair,
211 					  TForbidden& forbidden)
212 {
213 	SEQAN_CHECKPOINT
214 	typedef typename Size<TTrace>::Type TSize;
215 	typedef typename Value<TTrace>::Type TTraceValue;
216 
217 	// TraceBack values for Smith Waterman
218 	TTraceValue Diagonal = 0; TTraceValue Horizontal = 1; TTraceValue Vertical = 2; TTraceValue Stop = 3;
219 
220 	// The DP Matrix for diagonal walks
221 	typedef typename Value<TScore>::Type TScoreValue;
222 	typedef String<TScoreValue> TColumn;
223 	TColumn mat;
224 	// The DP Matrix for gaps from the left
225 	TColumn horizontal;
226 	// The DP Matrix for gaps from the top
227 	TScoreValue vert = 0;
228 
229 	typedef typename Iterator<TColumn, Standard>::Type TMatIter;
230 
231 	// Initialization
232 	typedef typename Value<TStringSet>::Type TString;
233 	TString const& str1 = str[0];
234 	TString const& str2 = str[1];
235 	TSize len1 = length(str1);
236 	TSize len2 = length(str2);
237 	resize(mat, (len2+1));   // One column for the diagonal matrix
238 	resize(horizontal, (len2+1));   // One column for the horizontal matrix
239 	resize(trace, len1 * ((len2 >> 1) + (len2 & 1)), 0);
240 	TTraceValue tvMat= 0;
241 
242 	// Record the max score
243 	TScoreValue score_max = 0;
244 	indexPair[0] = 0; indexPair[1] = 0;
245 	initialDir = Stop;
246 
247 	// Classical DP
248 	TScoreValue max_val = 0;
249 	TScoreValue a = 0;
250 	TScoreValue b = 0;
251 	typedef typename Iterator<TTrace, Standard>::Type TTraceIter;
252 	TTraceIter it = begin(trace, Standard() );
253 	TMatIter matIt = begin(mat, Standard() );
254 	TMatIter horiIt = begin(horizontal, Standard() );
255 	*matIt = 0;
256 	for(TSize row = 1; row <= len2; ++row) {
257 		*(++matIt) = 0;
258 		*(++horiIt) = scoreGapOpenHorizontal(sc, 0, row-1, str1, str2) - scoreGapExtendHorizontal(sc, 0, row-1, str1, str2);
259 	}
260 	for(TSize col = 1; col <= len1; ++col) {
261 		matIt = begin(mat, Standard() );
262 		horiIt = begin(horizontal, Standard() );
263 		TScoreValue diagValMat = *matIt;
264 		*matIt = 0;
265 		vert = scoreGapOpenVertical(sc, col-1, 0, str1, str2) - scoreGapExtendVertical(sc, col-1, 0, str1, str2);
266 		TSize row = 1;
267 		while(row <= len2) {
268 			if (_isClumping(forbidden, row, col, len2)) {
269 				*it <<= 3;
270 				*it |= Stop;
271 				max_val = 0;
272 				vert = 0;
273 				*(++horiIt) = 0;
274 				++matIt;
275 			} else {
276 				// Get the new maximum for vertical
277 				a = *matIt + scoreGapOpenVertical(sc, col-1, row-1, str1, str2);
278 				b = vert + scoreGapExtendVertical(sc, col-1, row-1, str1, str2);
279 				if (a > b) { vert = a; *it |= 1;}
280 				else vert = b;
281 
282 				// Get the new maximum for horizontal
283 				*it <<= 1;
284 				a = *(++matIt) + scoreGapOpenHorizontal(sc, col-1, row-1, str1, str2);
285 				b = *(++horiIt) + scoreGapExtendHorizontal(sc, col-1, row-1, str1, str2);
286 				if (a > b) {*horiIt = a; *it |= 1; }
287 				else *horiIt =  b;
288 
289 				// Get the new maximum for mat
290 				*it <<= 2;
291 				max_val = diagValMat + score(const_cast<TScore&>(sc), col-1, row-1, str1, str2);
292 				tvMat =  Diagonal;
293 				if (vert > max_val) {
294 					max_val = vert;
295 					tvMat =  Vertical;
296 				}
297 				if (*horiIt > max_val) {
298 					max_val = *horiIt;
299 					tvMat =  Horizontal;
300 				}
301 				if (0 >= max_val) {
302 					max_val = 0;
303 					tvMat =  Stop;
304 				}
305 				*it |= tvMat;
306 			}
307 
308 			// Assign the new diagonal values
309 			diagValMat = *matIt;
310 			*matIt = max_val;
311 
312 			// Record the new best score
313 			if (max_val > score_max) {
314 				indexPair[0] = row; indexPair[1] = col;
315 				score_max = max_val;
316 				initialDir = tvMat;
317 			}
318 
319 			if (row & 1) *it <<= 1; else ++it;
320 			++row;
321 		}
322 		if (!(row & 1)) {*it <<= 3; ++it; }
323 	}
324 
325 	//// Debug code
326 	//for(TSize i= 0; i<len2;++i) {
327 	//	for(TSize j= 0; j<len1;++j) {
328 	//		std::cout << (TSize) getValue(trace, j*len2 + i) << ',';
329 	//	}
330 	//	std::cout << std::endl;
331 	//}
332 	//std::cout << "Max score: " << best_row << ',' << best_col << ':' << score_max << " (" << (TSize) initialDir << ")" << std::endl;
333 
334 	return score_max;
335 }
336 
337 //////////////////////////////////////////////////////////////////////////////
338 
339 template<typename TAlign, typename TStringSet, typename TScore>
340 inline typename Value<TScore>::Type
_localAlignment(TAlign & align,TStringSet const & str,TScore const & sc,SmithWaterman)341 _localAlignment(TAlign& align,
342 				TStringSet const& str,
343 				TScore const& sc,
344 				SmithWaterman)
345 {
346 	SEQAN_CHECKPOINT
347 	typedef typename Value<TScore>::Type TScoreValue;
348 	typedef typename Size<TStringSet>::Type TSize;
349 
350 	TScoreValue maxScore;
351 	TSize indexPair[2];
352 	Nothing forbidden;
353 
354 	// Trace
355 	String<unsigned char> trace;
356 	unsigned char initialDir;
357 
358 	// Create the trace
359 	maxScore = _alignSmithWaterman(trace, str, sc, initialDir, indexPair, forbidden);
360 	// Follow the trace and create the graph
361 	_alignSmithWatermanTrace(align, str, trace, initialDir, indexPair, forbidden);
362 
363 	return maxScore;
364 }
365 
366 //////////////////////////////////////////////////////////////////////////////
367 
368 template<typename TStringSet, typename TScore>
369 inline typename Value<TScore>::Type
_localAlignment(TStringSet const & str,TScore const & sc,SmithWaterman)370 _localAlignment(TStringSet const& str,
371 				TScore const& sc,
372 				SmithWaterman)
373 {
374 	SEQAN_CHECKPOINT
375 	typedef typename Size<TStringSet>::Type TSize;
376 	TSize indexPair[2];
377 	Nothing forbidden;
378 	// Trace
379 	String<unsigned char> trace;
380 	unsigned char initialDir;
381 	return _alignSmithWaterman(trace, str, sc, initialDir, indexPair, forbidden);
382 }
383 
384 
385 
386 }// namespace SEQAN_NAMESPACE_MAIN
387 
388 #endif //#ifndef SEQAN_HEADER_...
389