1 #include "SmithWatermanGotoh.h"
2
3 const float CSmithWatermanGotoh::FLOAT_NEGATIVE_INFINITY = (float)-1e+30;
4
5 const char CSmithWatermanGotoh::Directions_STOP = 0;
6 const char CSmithWatermanGotoh::Directions_LEFT = 1;
7 const char CSmithWatermanGotoh::Directions_DIAGONAL = 2;
8 const char CSmithWatermanGotoh::Directions_UP = 3;
9
10 const int CSmithWatermanGotoh::repeat_size_max = 12;
11
CSmithWatermanGotoh(float matchScore,float mismatchScore,float gapOpenPenalty,float gapExtendPenalty)12 CSmithWatermanGotoh::CSmithWatermanGotoh(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty)
13 : mCurrentMatrixSize(0)
14 , mCurrentAnchorSize(0)
15 , mCurrentQuerySize(0)
16 , mCurrentAQSumSize(0)
17 , mMatchScore(matchScore)
18 , mMismatchScore(mismatchScore)
19 , mGapOpenPenalty(gapOpenPenalty)
20 , mGapExtendPenalty(gapExtendPenalty)
21 , mPointers(NULL)
22 , mSizesOfVerticalGaps(NULL)
23 , mSizesOfHorizontalGaps(NULL)
24 , mQueryGapScores(NULL)
25 , mBestScores(NULL)
26 , mReversedAnchor(NULL)
27 , mReversedQuery(NULL)
28 , mUseHomoPolymerGapOpenPenalty(false)
29 , mUseEntropyGapOpenPenalty(false)
30 , mUseRepeatGapExtensionPenalty(false)
31 {
32 CreateScoringMatrix();
33 }
34
~CSmithWatermanGotoh(void)35 CSmithWatermanGotoh::~CSmithWatermanGotoh(void) {
36 if(mPointers) delete [] mPointers;
37 if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps;
38 if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;
39 if(mQueryGapScores) delete [] mQueryGapScores;
40 if(mBestScores) delete [] mBestScores;
41 if(mReversedAnchor) delete [] mReversedAnchor;
42 if(mReversedQuery) delete [] mReversedQuery;
43 }
44
45 // aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm
Align(unsigned int & referenceAl,string & cigarAl,const string & s1,const string & s2)46 void CSmithWatermanGotoh::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2) {
47
48 if((s1.length() == 0) || (s2.length() == 0)) {
49 cout << "ERROR: Found a read with a zero length." << endl;
50 exit(1);
51 }
52
53 unsigned int referenceLen = s1.length() + 1;
54 unsigned int queryLen = s2.length() + 1;
55 unsigned int sequenceSumLength = s1.length() + s2.length();
56
57 // reinitialize our matrices
58
59 if((referenceLen * queryLen) > mCurrentMatrixSize) {
60
61 // calculate the new matrix size
62 mCurrentMatrixSize = referenceLen * queryLen;
63
64 // delete the old arrays
65 if(mPointers) delete [] mPointers;
66 if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps;
67 if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;
68
69 try {
70
71 // initialize the arrays
72 mPointers = new char[mCurrentMatrixSize];
73 mSizesOfVerticalGaps = new short[mCurrentMatrixSize];
74 mSizesOfHorizontalGaps = new short[mCurrentMatrixSize];
75
76 } catch(bad_alloc) {
77 cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
78 exit(1);
79 }
80 }
81
82 // initialize the traceback matrix to STOP
83 memset((char*)mPointers, 0, SIZEOF_CHAR * queryLen);
84 for(unsigned int i = 1; i < referenceLen; i++) mPointers[i * queryLen] = 0;
85
86 // initialize the gap matrices to 1
87 uninitialized_fill(mSizesOfVerticalGaps, mSizesOfVerticalGaps + mCurrentMatrixSize, 1);
88 uninitialized_fill(mSizesOfHorizontalGaps, mSizesOfHorizontalGaps + mCurrentMatrixSize, 1);
89
90
91 // initialize our repeat counts if they are needed
92 vector<map<string, int> > referenceRepeats;
93 vector<map<string, int> > queryRepeats;
94 int queryBeginRepeatBases = 0;
95 int queryEndRepeatBases = 0;
96 if (mUseRepeatGapExtensionPenalty) {
97 for (unsigned int i = 0; i < queryLen; ++i)
98 queryRepeats.push_back(repeatCounts(i, s2, repeat_size_max));
99 for (unsigned int i = 0; i < referenceLen; ++i)
100 referenceRepeats.push_back(repeatCounts(i, s1, repeat_size_max));
101
102 // keep only the biggest repeat
103 vector<map<string, int> >::iterator q = queryRepeats.begin();
104 for (; q != queryRepeats.end(); ++q) {
105 map<string, int>::iterator biggest = q->begin();
106 map<string, int>::iterator z = q->begin();
107 for (; z != q->end(); ++z)
108 if (z->first.size() > biggest->first.size()) biggest = z;
109 z = q->begin();
110 while (z != q->end()) {
111 if (z != biggest)
112 q->erase(z++);
113 else ++z;
114 }
115 }
116
117 q = referenceRepeats.begin();
118 for (; q != referenceRepeats.end(); ++q) {
119 map<string, int>::iterator biggest = q->begin();
120 map<string, int>::iterator z = q->begin();
121 for (; z != q->end(); ++z)
122 if (z->first.size() > biggest->first.size()) biggest = z;
123 z = q->begin();
124 while (z != q->end()) {
125 if (z != biggest)
126 q->erase(z++);
127 else ++z;
128 }
129 }
130
131 // remove repeat information from ends of queries
132 // this results in the addition of spurious flanking deletions in repeats
133 map<string, int>& qrend = queryRepeats.at(queryRepeats.size() - 2);
134 if (!qrend.empty()) {
135 int queryEndRepeatBases = qrend.begin()->first.size() * qrend.begin()->second;
136 for (int i = 0; i < queryEndRepeatBases; ++i)
137 queryRepeats.at(queryRepeats.size() - 2 - i).clear();
138 }
139
140 map<string, int>& qrbegin = queryRepeats.front();
141 if (!qrbegin.empty()) {
142 int queryBeginRepeatBases = qrbegin.begin()->first.size() * qrbegin.begin()->second;
143 for (int i = 0; i < queryBeginRepeatBases; ++i)
144 queryRepeats.at(i).clear();
145 }
146
147 }
148
149 int entropyWindowSize = 8;
150 vector<float> referenceEntropies;
151 vector<float> queryEntropies;
152 if (mUseEntropyGapOpenPenalty) {
153 for (unsigned int i = 0; i < queryLen; ++i)
154 queryEntropies.push_back(
155 shannon_H((char*) &s2[max(0, min((int) i - entropyWindowSize / 2, (int) queryLen - entropyWindowSize - 1))],
156 entropyWindowSize));
157 for (unsigned int i = 0; i < referenceLen; ++i)
158 referenceEntropies.push_back(
159 shannon_H((char*) &s1[max(0, min((int) i - entropyWindowSize / 2, (int) referenceLen - entropyWindowSize - 1))],
160 entropyWindowSize));
161 }
162
163 // normalize entropies
164 /*
165 float qsum = 0;
166 float qnorm = 0;
167 float qmax = 0;
168 for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) {
169 qsum += *q;
170 if (*q > qmax) qmax = *q;
171 }
172 qnorm = qsum / queryEntropies.size();
173 for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q)
174 *q = *q / qsum + qmax;
175
176 float rsum = 0;
177 float rnorm = 0;
178 float rmax = 0;
179 for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) {
180 rsum += *r;
181 if (*r > rmax) rmax = *r;
182 }
183 rnorm = rsum / referenceEntropies.size();
184 for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r)
185 *r = *r / rsum + rmax;
186 */
187
188 //
189 // construct
190 //
191
192 // reinitialize our query-dependent arrays
193 if(s2.length() > mCurrentQuerySize) {
194
195 // calculate the new query array size
196 mCurrentQuerySize = s2.length();
197
198 // delete the old arrays
199 if(mQueryGapScores) delete [] mQueryGapScores;
200 if(mBestScores) delete [] mBestScores;
201
202 // initialize the arrays
203 try {
204
205 mQueryGapScores = new float[mCurrentQuerySize + 1];
206 mBestScores = new float[mCurrentQuerySize + 1];
207
208 } catch(bad_alloc) {
209 cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
210 exit(1);
211 }
212 }
213
214 // reinitialize our reference+query-dependent arrays
215 if(sequenceSumLength > mCurrentAQSumSize) {
216
217 // calculate the new reference array size
218 mCurrentAQSumSize = sequenceSumLength;
219
220 // delete the old arrays
221 if(mReversedAnchor) delete [] mReversedAnchor;
222 if(mReversedQuery) delete [] mReversedQuery;
223
224 // initialize the arrays
225 try {
226
227 mReversedAnchor = new char[mCurrentAQSumSize + 1]; // reversed sequence #1
228 mReversedQuery = new char[mCurrentAQSumSize + 1]; // reversed sequence #2
229
230 } catch(bad_alloc) {
231 cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
232 exit(1);
233 }
234 }
235
236 // initialize the gap score and score vectors
237 uninitialized_fill(mQueryGapScores, mQueryGapScores + queryLen, FLOAT_NEGATIVE_INFINITY);
238 memset((char*)mBestScores, 0, SIZEOF_FLOAT * queryLen);
239
240 float similarityScore, totalSimilarityScore, bestScoreDiagonal;
241 float queryGapExtendScore, queryGapOpenScore;
242 float referenceGapExtendScore, referenceGapOpenScore, currentAnchorGapScore;
243
244 unsigned int BestColumn = 0;
245 unsigned int BestRow = 0;
246 BestScore = FLOAT_NEGATIVE_INFINITY;
247
248 for(unsigned int i = 1, k = queryLen; i < referenceLen; i++, k += queryLen) {
249
250 currentAnchorGapScore = FLOAT_NEGATIVE_INFINITY;
251 bestScoreDiagonal = mBestScores[0];
252
253 for(unsigned int j = 1, l = k + 1; j < queryLen; j++, l++) {
254
255 // calculate our similarity score
256 similarityScore = mScoringMatrix[s1[i - 1] - 'A'][s2[j - 1] - 'A'];
257
258 // fill the matrices
259 totalSimilarityScore = bestScoreDiagonal + similarityScore;
260
261 //cerr << "i: " << i << ", j: " << j << ", totalSimilarityScore: " << totalSimilarityScore << endl;
262
263 queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
264 queryGapOpenScore = mBestScores[j] - mGapOpenPenalty;
265
266 // compute the homo-polymer gap score if enabled
267 if(mUseHomoPolymerGapOpenPenalty)
268 if((j > 1) && (s2[j - 1] == s2[j - 2]))
269 queryGapOpenScore = mBestScores[j] - mHomoPolymerGapOpenPenalty;
270
271 // compute the entropy gap score if enabled
272 if (mUseEntropyGapOpenPenalty) {
273 queryGapOpenScore =
274 mBestScores[j] - mGapOpenPenalty
275 * max(queryEntropies.at(j), referenceEntropies.at(i))
276 * mEntropyGapOpenPenalty;
277 }
278
279 int gaplen = mSizesOfVerticalGaps[l - queryLen] + 1;
280
281 if (mUseRepeatGapExtensionPenalty) {
282 map<string, int>& repeats = queryRepeats[j];
283 // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
284 if (!repeats.empty()) {
285
286 const pair<string, int>& repeat = *repeats.begin();
287 int repeatsize = repeat.first.size();
288 if (gaplen != repeatsize && gaplen % repeatsize != 0) {
289 gaplen = gaplen / repeatsize + repeatsize;
290 }
291
292 if ((repeat.first.size() * repeat.second) > 3 && gaplen + i < s1.length()) {
293 string gapseq = string(&s1[i], gaplen);
294 if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
295 queryGapExtendScore = mQueryGapScores[j]
296 + mRepeatGapExtensionPenalty / (float) gaplen;
297 // mMaxRepeatGapExtensionPenalty)
298 } else {
299 queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
300 }
301 }
302 } else {
303 queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
304 }
305 }
306
307 if(queryGapExtendScore > queryGapOpenScore) {
308 mQueryGapScores[j] = queryGapExtendScore;
309 mSizesOfVerticalGaps[l] = gaplen;
310 } else mQueryGapScores[j] = queryGapOpenScore;
311
312 referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
313 referenceGapOpenScore = mBestScores[j - 1] - mGapOpenPenalty;
314
315 // compute the homo-polymer gap score if enabled
316 if(mUseHomoPolymerGapOpenPenalty)
317 if((i > 1) && (s1[i - 1] == s1[i - 2]))
318 referenceGapOpenScore = mBestScores[j - 1] - mHomoPolymerGapOpenPenalty;
319
320 // compute the entropy gap score if enabled
321 if (mUseEntropyGapOpenPenalty) {
322 referenceGapOpenScore =
323 mBestScores[j - 1] - mGapOpenPenalty
324 * max(queryEntropies.at(j), referenceEntropies.at(i))
325 * mEntropyGapOpenPenalty;
326 }
327
328 gaplen = mSizesOfHorizontalGaps[l - 1] + 1;
329
330 if (mUseRepeatGapExtensionPenalty) {
331 map<string, int>& repeats = referenceRepeats[i];
332 // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
333 if (!repeats.empty()) {
334
335 const pair<string, int>& repeat = *repeats.begin();
336 int repeatsize = repeat.first.size();
337 if (gaplen != repeatsize && gaplen % repeatsize != 0) {
338 gaplen = gaplen / repeatsize + repeatsize;
339 }
340
341 if ((repeat.first.size() * repeat.second) > 3 && gaplen + j < s2.length()) {
342 string gapseq = string(&s2[j], gaplen);
343 if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
344 referenceGapExtendScore = currentAnchorGapScore
345 + mRepeatGapExtensionPenalty / (float) gaplen;
346 //mMaxRepeatGapExtensionPenalty)
347 } else {
348 referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
349 }
350 }
351 } else {
352 referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
353 }
354 }
355
356 if(referenceGapExtendScore > referenceGapOpenScore) {
357 currentAnchorGapScore = referenceGapExtendScore;
358 mSizesOfHorizontalGaps[l] = gaplen;
359 } else currentAnchorGapScore = referenceGapOpenScore;
360
361 bestScoreDiagonal = mBestScores[j];
362 mBestScores[j] = MaxFloats(totalSimilarityScore, mQueryGapScores[j], currentAnchorGapScore);
363
364
365 // determine the traceback direction
366 // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
367 if(mBestScores[j] == 0) mPointers[l] = Directions_STOP;
368 else if(mBestScores[j] == totalSimilarityScore) mPointers[l] = Directions_DIAGONAL;
369 else if(mBestScores[j] == mQueryGapScores[j]) mPointers[l] = Directions_UP;
370 else mPointers[l] = Directions_LEFT;
371
372 // set the traceback start at the current cell i, j and score
373 if(mBestScores[j] > BestScore) {
374 BestRow = i;
375 BestColumn = j;
376 BestScore = mBestScores[j];
377 }
378 }
379 }
380
381 //
382 // traceback
383 //
384
385 // aligned sequences
386 int gappedAnchorLen = 0; // length of sequence #1 after alignment
387 int gappedQueryLen = 0; // length of sequence #2 after alignment
388 int numMismatches = 0; // the mismatched nucleotide count
389
390 char c1, c2;
391
392 int ci = BestRow;
393 int cj = BestColumn;
394 int ck = ci * queryLen;
395
396 // traceback flag
397 bool keepProcessing = true;
398
399 while(keepProcessing) {
400 //cerr << ci << " " << cj << " " << ck << " ... " << gappedAnchorLen << " " << gappedQueryLen << endl;
401
402 // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
403 switch(mPointers[ck + cj]) {
404
405 case Directions_DIAGONAL:
406 c1 = s1[--ci];
407 c2 = s2[--cj];
408 ck -= queryLen;
409
410 mReversedAnchor[gappedAnchorLen++] = c1;
411 mReversedQuery[gappedQueryLen++] = c2;
412
413 // increment our mismatch counter
414 if(mScoringMatrix[c1 - 'A'][c2 - 'A'] == mMismatchScore) numMismatches++;
415 break;
416
417 case Directions_STOP:
418 keepProcessing = false;
419 break;
420
421 case Directions_UP:
422 for(unsigned int l = 0, len = mSizesOfVerticalGaps[ck + cj]; l < len; l++) {
423 if (ci <= 0) {
424 keepProcessing = false;
425 break;
426 }
427 mReversedAnchor[gappedAnchorLen++] = s1[--ci];
428 mReversedQuery[gappedQueryLen++] = GAP;
429 ck -= queryLen;
430 numMismatches++;
431 }
432 break;
433
434 case Directions_LEFT:
435 for(unsigned int l = 0, len = mSizesOfHorizontalGaps[ck + cj]; l < len; l++) {
436 if (cj <= 0) {
437 keepProcessing = false;
438 break;
439 }
440 mReversedAnchor[gappedAnchorLen++] = GAP;
441 mReversedQuery[gappedQueryLen++] = s2[--cj];
442 numMismatches++;
443 }
444 break;
445 }
446 }
447
448 // define the reference and query sequences
449 mReversedAnchor[gappedAnchorLen] = 0;
450 mReversedQuery[gappedQueryLen] = 0;
451
452 // catch sequences with different lengths
453 if(gappedAnchorLen != gappedQueryLen) {
454 cout << "ERROR: The aligned sequences have different lengths after Smith-Waterman-Gotoh algorithm." << endl;
455 exit(1);
456 }
457
458 // reverse the strings and assign them to our alignment structure
459 reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen);
460 reverse(mReversedQuery, mReversedQuery + gappedQueryLen);
461
462 //alignment.Reference = mReversedAnchor;
463 //alignment.Query = mReversedQuery;
464
465 // set the reference endpoints
466 //alignment.ReferenceBegin = ci;
467 //alignment.ReferenceEnd = BestRow - 1;
468 referenceAl = ci;
469
470 // set the query endpoints
471 /*
472 if(alignment.IsReverseComplement) {
473 alignment.QueryBegin = s2Length - BestColumn;
474 alignment.QueryEnd = s2Length - cj - 1;
475 // alignment.QueryLength= alignment.QueryBegin - alignment.QueryEnd + 1;
476 } else {
477 alignment.QueryBegin = cj;
478 alignment.QueryEnd = BestColumn - 1;
479 // alignment.QueryLength= alignment.QueryEnd - alignment.QueryBegin + 1;
480 }
481 */
482
483 // set the query length and number of mismatches
484 //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1;
485 //alignment.NumMismatches = numMismatches;
486
487 unsigned int alLength = strlen(mReversedAnchor);
488 unsigned int m = 0, d = 0, i = 0;
489 bool dashRegion = false;
490 ostringstream oCigar (ostringstream::out);
491 int insertedBases = 0;
492
493 if ( cj != 0 ) {
494 if ( cj > 0 ) {
495 oCigar << cj << 'S';
496 } else { // how do we get negative cj's?
497 referenceAl -= cj;
498 alLength += cj;
499 }
500 }
501
502 for ( unsigned int j = 0; j < alLength; j++ ) {
503 // m
504 if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) {
505 if ( dashRegion ) {
506 if ( d != 0 ) oCigar << d << 'D';
507 else { oCigar << i << 'I'; insertedBases += i; }
508 }
509 dashRegion = false;
510 m++;
511 d = 0;
512 i = 0;
513 }
514 else {
515 if ( !dashRegion && m )
516 oCigar << m << 'M';
517 dashRegion = true;
518 m = 0;
519 if ( mReversedAnchor[j] == GAP ) {
520 if ( d != 0 ) oCigar << d << 'D';
521 i++;
522 d = 0;
523 }
524 else {
525 if ( i != 0) { oCigar << i << 'I'; insertedBases += i; }
526 d++;
527 i = 0;
528 }
529 }
530 }
531 if ( m != 0 ) oCigar << m << 'M';
532 else if ( d != 0 ) oCigar << d << 'D';
533 else if ( i != 0 ) oCigar << i << 'I';
534
535 if ( BestColumn != s2.length() )
536 oCigar << s2.length() - BestColumn << 'S';
537
538 cigarAl = oCigar.str();
539
540 // fix the gap order
541 CorrectHomopolymerGapOrder(alLength, numMismatches);
542
543 if (mUseEntropyGapOpenPenalty || mUseRepeatGapExtensionPenalty) {
544 int offset = 0;
545 string oldCigar;
546 try {
547 oldCigar = cigarAl;
548 stablyLeftAlign(s2, cigarAl, s1.substr(referenceAl, alLength - insertedBases), offset);
549 } catch (...) {
550 cerr << "an exception occurred when left-aligning " << s1 << " " << s2 << endl;
551 cigarAl = oldCigar; // undo the failed left-realignment attempt
552 offset = 0;
553 }
554 referenceAl += offset;
555 }
556
557 }
558
559 // creates a simple scoring matrix to align the nucleotides and the ambiguity code N
CreateScoringMatrix(void)560 void CSmithWatermanGotoh::CreateScoringMatrix(void) {
561
562 unsigned int nIndex = 13;
563 unsigned int xIndex = 23;
564
565 // define the N score to be 1/4 of the span between mismatch and match
566 //const short nScore = mMismatchScore + (short)(((mMatchScore - mMismatchScore) / 4.0) + 0.5);
567
568 // calculate the scoring matrix
569 for(unsigned char i = 0; i < MOSAIK_NUM_NUCLEOTIDES; i++) {
570 for(unsigned char j = 0; j < MOSAIK_NUM_NUCLEOTIDES; j++) {
571
572 // N.B. matching N to everything (while conceptually correct) leads to some
573 // bad alignments, lets make N be a mismatch instead.
574
575 // add the matches or mismatches to the hashtable (N is a mismatch)
576 if((i == nIndex) || (j == nIndex)) mScoringMatrix[i][j] = mMismatchScore;
577 else if((i == xIndex) || (j == xIndex)) mScoringMatrix[i][j] = mMismatchScore;
578 else if(i == j) mScoringMatrix[i][j] = mMatchScore;
579 else mScoringMatrix[i][j] = mMismatchScore;
580 }
581 }
582
583 // add ambiguity codes
584 mScoringMatrix['M' - 'A']['A' - 'A'] = mMatchScore; // M - A
585 mScoringMatrix['A' - 'A']['M' - 'A'] = mMatchScore;
586 mScoringMatrix['M' - 'A']['C' - 'A'] = mMatchScore; // M - C
587 mScoringMatrix['C' - 'A']['M' - 'A'] = mMatchScore;
588
589 mScoringMatrix['R' - 'A']['A' - 'A'] = mMatchScore; // R - A
590 mScoringMatrix['A' - 'A']['R' - 'A'] = mMatchScore;
591 mScoringMatrix['R' - 'A']['G' - 'A'] = mMatchScore; // R - G
592 mScoringMatrix['G' - 'A']['R' - 'A'] = mMatchScore;
593
594 mScoringMatrix['W' - 'A']['A' - 'A'] = mMatchScore; // W - A
595 mScoringMatrix['A' - 'A']['W' - 'A'] = mMatchScore;
596 mScoringMatrix['W' - 'A']['T' - 'A'] = mMatchScore; // W - T
597 mScoringMatrix['T' - 'A']['W' - 'A'] = mMatchScore;
598
599 mScoringMatrix['S' - 'A']['C' - 'A'] = mMatchScore; // S - C
600 mScoringMatrix['C' - 'A']['S' - 'A'] = mMatchScore;
601 mScoringMatrix['S' - 'A']['G' - 'A'] = mMatchScore; // S - G
602 mScoringMatrix['G' - 'A']['S' - 'A'] = mMatchScore;
603
604 mScoringMatrix['Y' - 'A']['C' - 'A'] = mMatchScore; // Y - C
605 mScoringMatrix['C' - 'A']['Y' - 'A'] = mMatchScore;
606 mScoringMatrix['Y' - 'A']['T' - 'A'] = mMatchScore; // Y - T
607 mScoringMatrix['T' - 'A']['Y' - 'A'] = mMatchScore;
608
609 mScoringMatrix['K' - 'A']['G' - 'A'] = mMatchScore; // K - G
610 mScoringMatrix['G' - 'A']['K' - 'A'] = mMatchScore;
611 mScoringMatrix['K' - 'A']['T' - 'A'] = mMatchScore; // K - T
612 mScoringMatrix['T' - 'A']['K' - 'A'] = mMatchScore;
613
614 mScoringMatrix['V' - 'A']['A' - 'A'] = mMatchScore; // V - A
615 mScoringMatrix['A' - 'A']['V' - 'A'] = mMatchScore;
616 mScoringMatrix['V' - 'A']['C' - 'A'] = mMatchScore; // V - C
617 mScoringMatrix['C' - 'A']['V' - 'A'] = mMatchScore;
618 mScoringMatrix['V' - 'A']['G' - 'A'] = mMatchScore; // V - G
619 mScoringMatrix['G' - 'A']['V' - 'A'] = mMatchScore;
620
621 mScoringMatrix['H' - 'A']['A' - 'A'] = mMatchScore; // H - A
622 mScoringMatrix['A' - 'A']['H' - 'A'] = mMatchScore;
623 mScoringMatrix['H' - 'A']['C' - 'A'] = mMatchScore; // H - C
624 mScoringMatrix['C' - 'A']['H' - 'A'] = mMatchScore;
625 mScoringMatrix['H' - 'A']['T' - 'A'] = mMatchScore; // H - T
626 mScoringMatrix['T' - 'A']['H' - 'A'] = mMatchScore;
627
628 mScoringMatrix['D' - 'A']['A' - 'A'] = mMatchScore; // D - A
629 mScoringMatrix['A' - 'A']['D' - 'A'] = mMatchScore;
630 mScoringMatrix['D' - 'A']['G' - 'A'] = mMatchScore; // D - G
631 mScoringMatrix['G' - 'A']['D' - 'A'] = mMatchScore;
632 mScoringMatrix['D' - 'A']['T' - 'A'] = mMatchScore; // D - T
633 mScoringMatrix['T' - 'A']['D' - 'A'] = mMatchScore;
634
635 mScoringMatrix['B' - 'A']['C' - 'A'] = mMatchScore; // B - C
636 mScoringMatrix['C' - 'A']['B' - 'A'] = mMatchScore;
637 mScoringMatrix['B' - 'A']['G' - 'A'] = mMatchScore; // B - G
638 mScoringMatrix['G' - 'A']['B' - 'A'] = mMatchScore;
639 mScoringMatrix['B' - 'A']['T' - 'A'] = mMatchScore; // B - T
640 mScoringMatrix['T' - 'A']['B' - 'A'] = mMatchScore;
641 }
642
643 // enables homo-polymer scoring
EnableHomoPolymerGapPenalty(float hpGapOpenPenalty)644 void CSmithWatermanGotoh::EnableHomoPolymerGapPenalty(float hpGapOpenPenalty) {
645 mUseHomoPolymerGapOpenPenalty = true;
646 mHomoPolymerGapOpenPenalty = hpGapOpenPenalty;
647 }
648
649 // enables entropy-based gap open penalty
EnableEntropyGapPenalty(float enGapOpenPenalty)650 void CSmithWatermanGotoh::EnableEntropyGapPenalty(float enGapOpenPenalty) {
651 mUseEntropyGapOpenPenalty = true;
652 mEntropyGapOpenPenalty = enGapOpenPenalty;
653 }
654
655 // enables repeat-aware gap extension penalty
EnableRepeatGapExtensionPenalty(float rGapExtensionPenalty,float rMaxGapRepeatExtensionPenaltyFactor)656 void CSmithWatermanGotoh::EnableRepeatGapExtensionPenalty(float rGapExtensionPenalty, float rMaxGapRepeatExtensionPenaltyFactor) {
657 mUseRepeatGapExtensionPenalty = true;
658 mRepeatGapExtensionPenalty = rGapExtensionPenalty;
659 mMaxRepeatGapExtensionPenalty = rMaxGapRepeatExtensionPenaltyFactor * rGapExtensionPenalty;
660 }
661
662 // corrects the homopolymer gap order for forward alignments
CorrectHomopolymerGapOrder(const unsigned int numBases,const unsigned int numMismatches)663 void CSmithWatermanGotoh::CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches) {
664
665 // this is only required for alignments with mismatches
666 //if(al.NumMismatches == 0) return;
667 if ( numMismatches == 0 ) return;
668
669 // localize the alignment data
670 //char* pReference = al.Reference.Data();
671 //char* pQuery = al.Query.Data();
672 //const unsigned int numBases = al.Reference.Length();
673 char* pReference = mReversedAnchor;
674 char* pQuery = mReversedQuery;
675
676 // initialize
677 bool hasReferenceGap = false, hasQueryGap = false;
678 char* pNonGapSeq = NULL;
679 char* pGapSeq = NULL;
680 char nonGapBase = 'J';
681
682 // identify gapped regions
683 for(unsigned int i = 0; i < numBases; i++) {
684
685 // check if the current position is gapped
686 hasReferenceGap = false;
687 hasQueryGap = false;
688
689 if(pReference[i] == GAP) {
690 hasReferenceGap = true;
691 pNonGapSeq = pQuery;
692 pGapSeq = pReference;
693 nonGapBase = pQuery[i];
694 }
695
696 if(pQuery[i] == GAP) {
697 hasQueryGap = true;
698 pNonGapSeq = pReference;
699 pGapSeq = pQuery;
700 nonGapBase = pReference[i];
701 }
702
703 // continue if we don't have any gaps
704 if(!hasReferenceGap && !hasQueryGap) continue;
705
706 // sanity check
707 if(hasReferenceGap && hasQueryGap) {
708 printf("ERROR: Found a gap in both the reference sequence and query sequence.\n");
709 exit(1);
710 }
711
712 // find the non-gapped length (forward)
713 unsigned short numGappedBases = 0;
714 unsigned short nonGapLength = 0;
715 unsigned short testPos = i;
716 while(testPos < numBases) {
717
718 const char gs = pGapSeq[testPos];
719 const char ngs = pNonGapSeq[testPos];
720
721 bool isPartofHomopolymer = false;
722 if(((gs == nonGapBase) || (gs == GAP)) && (ngs == nonGapBase)) isPartofHomopolymer = true;
723 if(!isPartofHomopolymer) break;
724
725 if(gs == GAP) numGappedBases++;
726 else nonGapLength++;
727 testPos++;
728 }
729
730 // fix the gap order
731 if(numGappedBases != 0) {
732 char* pCurrentSequence = pGapSeq + i;
733 memset(pCurrentSequence, nonGapBase, nonGapLength);
734 pCurrentSequence += nonGapLength;
735 memset(pCurrentSequence, GAP, numGappedBases);
736 }
737
738 // increment
739 i += numGappedBases + nonGapLength - 1;
740 }
741 }
742