1 #include "SmithWatermanGotoh.h"
2 
3 const float CSmithWatermanGotoh::FLOAT_NEGATIVE_INFINITY = (float)-1e+30;
4 
5 const char CSmithWatermanGotoh::Directions_STOP     = 0;
6 const char CSmithWatermanGotoh::Directions_LEFT     = 1;
7 const char CSmithWatermanGotoh::Directions_DIAGONAL = 2;
8 const char CSmithWatermanGotoh::Directions_UP       = 3;
9 
10 const int CSmithWatermanGotoh::repeat_size_max      = 12;
11 
CSmithWatermanGotoh(float matchScore,float mismatchScore,float gapOpenPenalty,float gapExtendPenalty)12 CSmithWatermanGotoh::CSmithWatermanGotoh(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty)
13     : mCurrentMatrixSize(0)
14     , mCurrentAnchorSize(0)
15     , mCurrentQuerySize(0)
16     , mCurrentAQSumSize(0)
17     , mMatchScore(matchScore)
18     , mMismatchScore(mismatchScore)
19     , mGapOpenPenalty(gapOpenPenalty)
20     , mGapExtendPenalty(gapExtendPenalty)
21     , mPointers(NULL)
22     , mSizesOfVerticalGaps(NULL)
23     , mSizesOfHorizontalGaps(NULL)
24     , mQueryGapScores(NULL)
25     , mBestScores(NULL)
26     , mReversedAnchor(NULL)
27     , mReversedQuery(NULL)
28     , mUseHomoPolymerGapOpenPenalty(false)
29     , mUseEntropyGapOpenPenalty(false)
30     , mUseRepeatGapExtensionPenalty(false)
31 {
32     CreateScoringMatrix();
33 }
34 
~CSmithWatermanGotoh(void)35 CSmithWatermanGotoh::~CSmithWatermanGotoh(void) {
36     if(mPointers)              delete [] mPointers;
37     if(mSizesOfVerticalGaps)   delete [] mSizesOfVerticalGaps;
38     if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;
39     if(mQueryGapScores)        delete [] mQueryGapScores;
40     if(mBestScores)            delete [] mBestScores;
41     if(mReversedAnchor)        delete [] mReversedAnchor;
42     if(mReversedQuery)         delete [] mReversedQuery;
43 }
44 
45 // aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm
Align(unsigned int & referenceAl,string & cigarAl,const string & s1,const string & s2)46 void CSmithWatermanGotoh::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2) {
47 
48     if((s1.length() == 0) || (s2.length() == 0)) {
49 	cout << "ERROR: Found a read with a zero length." << endl;
50 	exit(1);
51     }
52 
53     unsigned int referenceLen      = s1.length() + 1;
54     unsigned int queryLen          = s2.length() + 1;
55     unsigned int sequenceSumLength = s1.length() + s2.length();
56 
57     // reinitialize our matrices
58 
59     if((referenceLen * queryLen) > mCurrentMatrixSize) {
60 
61 	// calculate the new matrix size
62 	mCurrentMatrixSize = referenceLen * queryLen;
63 
64 	// delete the old arrays
65 	if(mPointers)              delete [] mPointers;
66 	if(mSizesOfVerticalGaps)   delete [] mSizesOfVerticalGaps;
67 	if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;
68 
69 	try {
70 
71 	    // initialize the arrays
72 	    mPointers              = new char[mCurrentMatrixSize];
73 	    mSizesOfVerticalGaps   = new short[mCurrentMatrixSize];
74 	    mSizesOfHorizontalGaps = new short[mCurrentMatrixSize];
75 
76 	} catch(bad_alloc) {
77 	    cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
78 	    exit(1);
79 	}
80     }
81 
82     // initialize the traceback matrix to STOP
83     memset((char*)mPointers, 0, SIZEOF_CHAR * queryLen);
84     for(unsigned int i = 1; i < referenceLen; i++) mPointers[i * queryLen] = 0;
85 
86     // initialize the gap matrices to 1
87     uninitialized_fill(mSizesOfVerticalGaps, mSizesOfVerticalGaps + mCurrentMatrixSize, 1);
88     uninitialized_fill(mSizesOfHorizontalGaps, mSizesOfHorizontalGaps + mCurrentMatrixSize, 1);
89 
90 
91     // initialize our repeat counts if they are needed
92     vector<map<string, int> > referenceRepeats;
93     vector<map<string, int> > queryRepeats;
94     int queryBeginRepeatBases = 0;
95     int queryEndRepeatBases = 0;
96     if (mUseRepeatGapExtensionPenalty) {
97 	for (unsigned int i = 0; i < queryLen; ++i)
98 	    queryRepeats.push_back(repeatCounts(i, s2, repeat_size_max));
99 	for (unsigned int i = 0; i < referenceLen; ++i)
100 	    referenceRepeats.push_back(repeatCounts(i, s1, repeat_size_max));
101 
102 	// keep only the biggest repeat
103 	vector<map<string, int> >::iterator q = queryRepeats.begin();
104 	for (; q != queryRepeats.end(); ++q) {
105 	    map<string, int>::iterator biggest = q->begin();
106 	    map<string, int>::iterator z = q->begin();
107 	    for (; z != q->end(); ++z)
108 		if (z->first.size() > biggest->first.size()) biggest = z;
109 	    z = q->begin();
110 	    while (z != q->end()) {
111 		if (z != biggest)
112 		    q->erase(z++);
113 		else ++z;
114 	    }
115 	}
116 
117 	q = referenceRepeats.begin();
118 	for (; q != referenceRepeats.end(); ++q) {
119 	    map<string, int>::iterator biggest = q->begin();
120 	    map<string, int>::iterator z = q->begin();
121 	    for (; z != q->end(); ++z)
122 		if (z->first.size() > biggest->first.size()) biggest = z;
123 	    z = q->begin();
124 	    while (z != q->end()) {
125 		if (z != biggest)
126 		    q->erase(z++);
127 		else ++z;
128 	    }
129 	}
130 
131 	// remove repeat information from ends of queries
132 	// this results in the addition of spurious flanking deletions in repeats
133 	map<string, int>& qrend = queryRepeats.at(queryRepeats.size() - 2);
134 	if (!qrend.empty()) {
135 	    int queryEndRepeatBases = qrend.begin()->first.size() * qrend.begin()->second;
136 	    for (int i = 0; i < queryEndRepeatBases; ++i)
137 		queryRepeats.at(queryRepeats.size() - 2 - i).clear();
138 	}
139 
140 	map<string, int>& qrbegin = queryRepeats.front();
141 	if (!qrbegin.empty()) {
142 	    int queryBeginRepeatBases = qrbegin.begin()->first.size() * qrbegin.begin()->second;
143 	    for (int i = 0; i < queryBeginRepeatBases; ++i)
144 		queryRepeats.at(i).clear();
145 	}
146 
147     }
148 
149     int entropyWindowSize = 8;
150     vector<float> referenceEntropies;
151     vector<float> queryEntropies;
152     if (mUseEntropyGapOpenPenalty) {
153 	for (unsigned int i = 0; i < queryLen; ++i)
154 	    queryEntropies.push_back(
155 		shannon_H((char*) &s2[max(0, min((int) i - entropyWindowSize / 2, (int) queryLen - entropyWindowSize - 1))],
156 			  entropyWindowSize));
157 	for (unsigned int i = 0; i < referenceLen; ++i)
158 	    referenceEntropies.push_back(
159 		shannon_H((char*) &s1[max(0, min((int) i - entropyWindowSize / 2, (int) referenceLen - entropyWindowSize - 1))],
160 			  entropyWindowSize));
161     }
162 
163     // normalize entropies
164     /*
165     float qsum = 0;
166     float qnorm = 0;
167     float qmax = 0;
168     for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) {
169 	qsum += *q;
170 	if (*q > qmax) qmax = *q;
171     }
172     qnorm = qsum / queryEntropies.size();
173     for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q)
174 	*q = *q / qsum + qmax;
175 
176     float rsum = 0;
177     float rnorm = 0;
178     float rmax = 0;
179     for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) {
180 	rsum += *r;
181 	if (*r > rmax) rmax = *r;
182     }
183     rnorm = rsum / referenceEntropies.size();
184     for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r)
185 	*r = *r / rsum + rmax;
186     */
187 
188     //
189     // construct
190     //
191 
192     // reinitialize our query-dependent arrays
193     if(s2.length() > mCurrentQuerySize) {
194 
195 	// calculate the new query array size
196 	mCurrentQuerySize = s2.length();
197 
198 	// delete the old arrays
199 	if(mQueryGapScores) delete [] mQueryGapScores;
200 	if(mBestScores)     delete [] mBestScores;
201 
202 	// initialize the arrays
203 	try {
204 
205 	    mQueryGapScores = new float[mCurrentQuerySize + 1];
206 	    mBestScores     = new float[mCurrentQuerySize + 1];
207 
208 	} catch(bad_alloc) {
209 	    cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
210 	    exit(1);
211 	}
212     }
213 
214     // reinitialize our reference+query-dependent arrays
215     if(sequenceSumLength > mCurrentAQSumSize) {
216 
217 	// calculate the new reference array size
218 	mCurrentAQSumSize = sequenceSumLength;
219 
220 	// delete the old arrays
221 	if(mReversedAnchor) delete [] mReversedAnchor;
222 	if(mReversedQuery)  delete [] mReversedQuery;
223 
224 	// initialize the arrays
225 	try {
226 
227 	    mReversedAnchor = new char[mCurrentAQSumSize + 1];	// reversed sequence #1
228 	    mReversedQuery  = new char[mCurrentAQSumSize + 1];	// reversed sequence #2
229 
230 	} catch(bad_alloc) {
231 	    cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
232 	    exit(1);
233 	}
234     }
235 
236     // initialize the gap score and score vectors
237     uninitialized_fill(mQueryGapScores, mQueryGapScores + queryLen, FLOAT_NEGATIVE_INFINITY);
238     memset((char*)mBestScores, 0, SIZEOF_FLOAT * queryLen);
239 
240     float similarityScore, totalSimilarityScore, bestScoreDiagonal;
241     float queryGapExtendScore, queryGapOpenScore;
242     float referenceGapExtendScore, referenceGapOpenScore, currentAnchorGapScore;
243 
244     unsigned int BestColumn = 0;
245     unsigned int BestRow    = 0;
246     BestScore               = FLOAT_NEGATIVE_INFINITY;
247 
248     for(unsigned int i = 1, k = queryLen; i < referenceLen; i++, k += queryLen) {
249 
250 	currentAnchorGapScore = FLOAT_NEGATIVE_INFINITY;
251 	bestScoreDiagonal = mBestScores[0];
252 
253 	for(unsigned int j = 1, l = k + 1; j < queryLen; j++, l++) {
254 
255 	    // calculate our similarity score
256 	    similarityScore = mScoringMatrix[s1[i - 1] - 'A'][s2[j - 1] - 'A'];
257 
258 	    // fill the matrices
259 	    totalSimilarityScore = bestScoreDiagonal + similarityScore;
260 
261 	    //cerr << "i: " << i << ", j: " << j << ", totalSimilarityScore: " << totalSimilarityScore << endl;
262 
263 	    queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
264 	    queryGapOpenScore   = mBestScores[j] - mGapOpenPenalty;
265 
266 	    // compute the homo-polymer gap score if enabled
267 	    if(mUseHomoPolymerGapOpenPenalty)
268 		if((j > 1) && (s2[j - 1] == s2[j - 2]))
269 		    queryGapOpenScore = mBestScores[j] - mHomoPolymerGapOpenPenalty;
270 
271 	    // compute the entropy gap score if enabled
272 	    if (mUseEntropyGapOpenPenalty) {
273 		queryGapOpenScore =
274 		    mBestScores[j] - mGapOpenPenalty
275 		    * max(queryEntropies.at(j), referenceEntropies.at(i))
276 		    * mEntropyGapOpenPenalty;
277 	    }
278 
279 	    int gaplen = mSizesOfVerticalGaps[l - queryLen] + 1;
280 
281 	    if (mUseRepeatGapExtensionPenalty) {
282 		map<string, int>& repeats = queryRepeats[j];
283 		// does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
284 		if (!repeats.empty()) {
285 
286 		    const pair<string, int>& repeat = *repeats.begin();
287 		    int repeatsize = repeat.first.size();
288 		    if (gaplen != repeatsize && gaplen % repeatsize != 0) {
289 			gaplen = gaplen / repeatsize + repeatsize;
290 		    }
291 
292 		    if ((repeat.first.size() * repeat.second) > 3 && gaplen + i < s1.length()) {
293 			string gapseq = string(&s1[i], gaplen);
294 			if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
295 			    queryGapExtendScore = mQueryGapScores[j]
296 				+ mRepeatGapExtensionPenalty / (float) gaplen;
297 				//    mMaxRepeatGapExtensionPenalty)
298 			} else {
299 			    queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
300 			}
301 		    }
302 		} else {
303 		    queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
304 		}
305 	    }
306 
307 	    if(queryGapExtendScore > queryGapOpenScore) {
308 		mQueryGapScores[j] = queryGapExtendScore;
309 		mSizesOfVerticalGaps[l] = gaplen;
310 	    } else mQueryGapScores[j] = queryGapOpenScore;
311 
312 	    referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
313 	    referenceGapOpenScore   = mBestScores[j - 1] - mGapOpenPenalty;
314 
315 	    // compute the homo-polymer gap score if enabled
316 	    if(mUseHomoPolymerGapOpenPenalty)
317 		if((i > 1) && (s1[i - 1] == s1[i - 2]))
318 		    referenceGapOpenScore = mBestScores[j - 1] - mHomoPolymerGapOpenPenalty;
319 
320 	    // compute the entropy gap score if enabled
321 	    if (mUseEntropyGapOpenPenalty) {
322 		referenceGapOpenScore =
323 		    mBestScores[j - 1] - mGapOpenPenalty
324 		    * max(queryEntropies.at(j), referenceEntropies.at(i))
325 		    * mEntropyGapOpenPenalty;
326 	    }
327 
328 	    gaplen = mSizesOfHorizontalGaps[l - 1] + 1;
329 
330 	    if (mUseRepeatGapExtensionPenalty) {
331 		map<string, int>& repeats = referenceRepeats[i];
332 		// does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
333 		if (!repeats.empty()) {
334 
335 		    const pair<string, int>& repeat = *repeats.begin();
336 		    int repeatsize = repeat.first.size();
337 		    if (gaplen != repeatsize && gaplen % repeatsize != 0) {
338 			gaplen = gaplen / repeatsize + repeatsize;
339 		    }
340 
341 		    if ((repeat.first.size() * repeat.second) > 3 && gaplen + j < s2.length()) {
342 			string gapseq = string(&s2[j], gaplen);
343 			if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
344 			    referenceGapExtendScore = currentAnchorGapScore
345 				+ mRepeatGapExtensionPenalty / (float) gaplen;
346 				//mMaxRepeatGapExtensionPenalty)
347 			} else {
348 			    referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
349 			}
350 		    }
351 		} else {
352 		    referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
353 		}
354 	    }
355 
356 	    if(referenceGapExtendScore > referenceGapOpenScore) {
357 		currentAnchorGapScore = referenceGapExtendScore;
358 		mSizesOfHorizontalGaps[l] = gaplen;
359 	    } else currentAnchorGapScore = referenceGapOpenScore;
360 
361 	    bestScoreDiagonal = mBestScores[j];
362 	    mBestScores[j] = MaxFloats(totalSimilarityScore, mQueryGapScores[j], currentAnchorGapScore);
363 
364 
365 	    // determine the traceback direction
366 	    // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
367 	    if(mBestScores[j] == 0)                         mPointers[l] = Directions_STOP;
368 	    else if(mBestScores[j] == totalSimilarityScore) mPointers[l] = Directions_DIAGONAL;
369 	    else if(mBestScores[j] == mQueryGapScores[j])   mPointers[l] = Directions_UP;
370 	    else                                            mPointers[l] = Directions_LEFT;
371 
372 	    // set the traceback start at the current cell i, j and score
373 	    if(mBestScores[j] > BestScore) {
374 		BestRow    = i;
375 		BestColumn = j;
376 		BestScore  = mBestScores[j];
377 	    }
378 	}
379     }
380 
381     //
382     // traceback
383     //
384 
385     // aligned sequences
386     int gappedAnchorLen  = 0;   // length of sequence #1 after alignment
387     int gappedQueryLen   = 0;   // length of sequence #2 after alignment
388     int numMismatches    = 0;   // the mismatched nucleotide count
389 
390     char c1, c2;
391 
392     int ci = BestRow;
393     int cj = BestColumn;
394     int ck = ci * queryLen;
395 
396     // traceback flag
397     bool keepProcessing = true;
398 
399     while(keepProcessing) {
400 	//cerr << ci << " " << cj << " " << ck << "  ... " << gappedAnchorLen << " " << gappedQueryLen <<  endl;
401 
402 	// diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
403 	switch(mPointers[ck + cj]) {
404 
405 	case Directions_DIAGONAL:
406 	    c1 = s1[--ci];
407 	    c2 = s2[--cj];
408 	    ck -= queryLen;
409 
410 	    mReversedAnchor[gappedAnchorLen++] = c1;
411 	    mReversedQuery[gappedQueryLen++]   = c2;
412 
413 	    // increment our mismatch counter
414 	    if(mScoringMatrix[c1 - 'A'][c2 - 'A'] == mMismatchScore) numMismatches++;
415 	    break;
416 
417 	case Directions_STOP:
418 	    keepProcessing = false;
419 	    break;
420 
421 	case Directions_UP:
422 	    for(unsigned int l = 0, len = mSizesOfVerticalGaps[ck + cj]; l < len; l++) {
423 		if (ci <= 0) {
424 		    keepProcessing = false;
425 		    break;
426 		}
427 		mReversedAnchor[gappedAnchorLen++] = s1[--ci];
428 		mReversedQuery[gappedQueryLen++]   = GAP;
429 		ck -= queryLen;
430 		numMismatches++;
431 	    }
432 	    break;
433 
434 	case Directions_LEFT:
435 	    for(unsigned int l = 0, len = mSizesOfHorizontalGaps[ck + cj]; l < len; l++) {
436 		if (cj <= 0) {
437 		    keepProcessing = false;
438 		    break;
439 		}
440 		mReversedAnchor[gappedAnchorLen++] = GAP;
441 		mReversedQuery[gappedQueryLen++]   = s2[--cj];
442 		numMismatches++;
443 	    }
444 	    break;
445 	}
446     }
447 
448     // define the reference and query sequences
449     mReversedAnchor[gappedAnchorLen] = 0;
450     mReversedQuery[gappedQueryLen]   = 0;
451 
452     // catch sequences with different lengths
453     if(gappedAnchorLen != gappedQueryLen) {
454 	cout << "ERROR: The aligned sequences have different lengths after Smith-Waterman-Gotoh algorithm." << endl;
455 	exit(1);
456     }
457 
458     // reverse the strings and assign them to our alignment structure
459     reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen);
460     reverse(mReversedQuery,  mReversedQuery  + gappedQueryLen);
461 
462     //alignment.Reference = mReversedAnchor;
463     //alignment.Query     = mReversedQuery;
464 
465     // set the reference endpoints
466     //alignment.ReferenceBegin = ci;
467     //alignment.ReferenceEnd   = BestRow - 1;
468     referenceAl = ci;
469 
470     // set the query endpoints
471     /*
472 	if(alignment.IsReverseComplement) {
473 	alignment.QueryBegin = s2Length - BestColumn;
474 	alignment.QueryEnd   = s2Length - cj - 1;
475 	// alignment.QueryLength= alignment.QueryBegin - alignment.QueryEnd + 1;
476 	} else {
477 	alignment.QueryBegin = cj;
478 	alignment.QueryEnd   = BestColumn - 1;
479 	// alignment.QueryLength= alignment.QueryEnd - alignment.QueryBegin + 1;
480 	}
481     */
482 
483     // set the query length and number of mismatches
484     //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1;
485     //alignment.NumMismatches  = numMismatches;
486 
487     unsigned int alLength = strlen(mReversedAnchor);
488     unsigned int m = 0, d = 0, i = 0;
489     bool dashRegion = false;
490     ostringstream oCigar (ostringstream::out);
491     int insertedBases = 0;
492 
493     if ( cj != 0 ) {
494 	if ( cj > 0 ) {
495 	    oCigar << cj << 'S';
496 	} else { // how do we get negative cj's?
497 	    referenceAl -= cj;
498 	    alLength += cj;
499 	}
500     }
501 
502     for ( unsigned int j = 0; j < alLength; j++ ) {
503 	// m
504 	if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) {
505 	    if ( dashRegion ) {
506 		if ( d != 0 ) oCigar << d << 'D';
507 		else          { oCigar << i << 'I'; insertedBases += i; }
508 	    }
509 	    dashRegion = false;
510 	    m++;
511 	    d = 0;
512 	    i = 0;
513 	}
514 	else {
515 	    if ( !dashRegion && m )
516 		oCigar << m << 'M';
517 	    dashRegion = true;
518 	    m = 0;
519 	    if ( mReversedAnchor[j] == GAP ) {
520 		if ( d != 0 ) oCigar << d << 'D';
521 		i++;
522 		d = 0;
523 	    }
524 	    else {
525 		if ( i != 0) { oCigar << i << 'I'; insertedBases += i; }
526 		d++;
527 		i = 0;
528 	    }
529 	}
530     }
531     if      ( m != 0 ) oCigar << m << 'M';
532     else if ( d != 0 ) oCigar << d << 'D';
533     else if ( i != 0 ) oCigar << i << 'I';
534 
535     if ( BestColumn != s2.length() )
536 	oCigar << s2.length() - BestColumn << 'S';
537 
538     cigarAl = oCigar.str();
539 
540     // fix the gap order
541     CorrectHomopolymerGapOrder(alLength, numMismatches);
542 
543     if (mUseEntropyGapOpenPenalty || mUseRepeatGapExtensionPenalty) {
544 	int offset = 0;
545 	string oldCigar;
546 	try {
547 	    oldCigar = cigarAl;
548 	    stablyLeftAlign(s2, cigarAl, s1.substr(referenceAl, alLength - insertedBases), offset);
549 	} catch (...) {
550 	    cerr << "an exception occurred when left-aligning " << s1 << " " << s2 << endl;
551 	    cigarAl = oldCigar; // undo the failed left-realignment attempt
552 	    offset = 0;
553 	}
554 	referenceAl += offset;
555     }
556 
557 }
558 
559 // creates a simple scoring matrix to align the nucleotides and the ambiguity code N
CreateScoringMatrix(void)560 void CSmithWatermanGotoh::CreateScoringMatrix(void) {
561 
562     unsigned int nIndex = 13;
563     unsigned int xIndex = 23;
564 
565     // define the N score to be 1/4 of the span between mismatch and match
566     //const short nScore = mMismatchScore + (short)(((mMatchScore - mMismatchScore) / 4.0) + 0.5);
567 
568     // calculate the scoring matrix
569     for(unsigned char i = 0; i < MOSAIK_NUM_NUCLEOTIDES; i++) {
570 	for(unsigned char j = 0; j < MOSAIK_NUM_NUCLEOTIDES; j++) {
571 
572 	    // N.B. matching N to everything (while conceptually correct) leads to some
573 	    // bad alignments, lets make N be a mismatch instead.
574 
575 	    // add the matches or mismatches to the hashtable (N is a mismatch)
576 	    if((i == nIndex) || (j == nIndex)) mScoringMatrix[i][j] = mMismatchScore;
577 	    else if((i == xIndex) || (j == xIndex)) mScoringMatrix[i][j] = mMismatchScore;
578 	    else if(i == j) mScoringMatrix[i][j] = mMatchScore;
579 	    else mScoringMatrix[i][j] = mMismatchScore;
580 	}
581     }
582 
583     // add ambiguity codes
584     mScoringMatrix['M' - 'A']['A' - 'A'] = mMatchScore;	// M - A
585     mScoringMatrix['A' - 'A']['M' - 'A'] = mMatchScore;
586     mScoringMatrix['M' - 'A']['C' - 'A'] = mMatchScore; // M - C
587     mScoringMatrix['C' - 'A']['M' - 'A'] = mMatchScore;
588 
589     mScoringMatrix['R' - 'A']['A' - 'A'] = mMatchScore;	// R - A
590     mScoringMatrix['A' - 'A']['R' - 'A'] = mMatchScore;
591     mScoringMatrix['R' - 'A']['G' - 'A'] = mMatchScore; // R - G
592     mScoringMatrix['G' - 'A']['R' - 'A'] = mMatchScore;
593 
594     mScoringMatrix['W' - 'A']['A' - 'A'] = mMatchScore;	// W - A
595     mScoringMatrix['A' - 'A']['W' - 'A'] = mMatchScore;
596     mScoringMatrix['W' - 'A']['T' - 'A'] = mMatchScore; // W - T
597     mScoringMatrix['T' - 'A']['W' - 'A'] = mMatchScore;
598 
599     mScoringMatrix['S' - 'A']['C' - 'A'] = mMatchScore;	// S - C
600     mScoringMatrix['C' - 'A']['S' - 'A'] = mMatchScore;
601     mScoringMatrix['S' - 'A']['G' - 'A'] = mMatchScore; // S - G
602     mScoringMatrix['G' - 'A']['S' - 'A'] = mMatchScore;
603 
604     mScoringMatrix['Y' - 'A']['C' - 'A'] = mMatchScore;	// Y - C
605     mScoringMatrix['C' - 'A']['Y' - 'A'] = mMatchScore;
606     mScoringMatrix['Y' - 'A']['T' - 'A'] = mMatchScore; // Y - T
607     mScoringMatrix['T' - 'A']['Y' - 'A'] = mMatchScore;
608 
609     mScoringMatrix['K' - 'A']['G' - 'A'] = mMatchScore;	// K - G
610     mScoringMatrix['G' - 'A']['K' - 'A'] = mMatchScore;
611     mScoringMatrix['K' - 'A']['T' - 'A'] = mMatchScore; // K - T
612     mScoringMatrix['T' - 'A']['K' - 'A'] = mMatchScore;
613 
614     mScoringMatrix['V' - 'A']['A' - 'A'] = mMatchScore;	// V - A
615     mScoringMatrix['A' - 'A']['V' - 'A'] = mMatchScore;
616     mScoringMatrix['V' - 'A']['C' - 'A'] = mMatchScore; // V - C
617     mScoringMatrix['C' - 'A']['V' - 'A'] = mMatchScore;
618     mScoringMatrix['V' - 'A']['G' - 'A'] = mMatchScore; // V - G
619     mScoringMatrix['G' - 'A']['V' - 'A'] = mMatchScore;
620 
621     mScoringMatrix['H' - 'A']['A' - 'A'] = mMatchScore;	// H - A
622     mScoringMatrix['A' - 'A']['H' - 'A'] = mMatchScore;
623     mScoringMatrix['H' - 'A']['C' - 'A'] = mMatchScore; // H - C
624     mScoringMatrix['C' - 'A']['H' - 'A'] = mMatchScore;
625     mScoringMatrix['H' - 'A']['T' - 'A'] = mMatchScore; // H - T
626     mScoringMatrix['T' - 'A']['H' - 'A'] = mMatchScore;
627 
628     mScoringMatrix['D' - 'A']['A' - 'A'] = mMatchScore;	// D - A
629     mScoringMatrix['A' - 'A']['D' - 'A'] = mMatchScore;
630     mScoringMatrix['D' - 'A']['G' - 'A'] = mMatchScore; // D - G
631     mScoringMatrix['G' - 'A']['D' - 'A'] = mMatchScore;
632     mScoringMatrix['D' - 'A']['T' - 'A'] = mMatchScore; // D - T
633     mScoringMatrix['T' - 'A']['D' - 'A'] = mMatchScore;
634 
635     mScoringMatrix['B' - 'A']['C' - 'A'] = mMatchScore;	// B - C
636     mScoringMatrix['C' - 'A']['B' - 'A'] = mMatchScore;
637     mScoringMatrix['B' - 'A']['G' - 'A'] = mMatchScore; // B - G
638     mScoringMatrix['G' - 'A']['B' - 'A'] = mMatchScore;
639     mScoringMatrix['B' - 'A']['T' - 'A'] = mMatchScore; // B - T
640     mScoringMatrix['T' - 'A']['B' - 'A'] = mMatchScore;
641 }
642 
643 // enables homo-polymer scoring
EnableHomoPolymerGapPenalty(float hpGapOpenPenalty)644 void CSmithWatermanGotoh::EnableHomoPolymerGapPenalty(float hpGapOpenPenalty) {
645     mUseHomoPolymerGapOpenPenalty = true;
646     mHomoPolymerGapOpenPenalty    = hpGapOpenPenalty;
647 }
648 
649 // enables entropy-based gap open penalty
EnableEntropyGapPenalty(float enGapOpenPenalty)650 void CSmithWatermanGotoh::EnableEntropyGapPenalty(float enGapOpenPenalty) {
651     mUseEntropyGapOpenPenalty = true;
652     mEntropyGapOpenPenalty    = enGapOpenPenalty;
653 }
654 
655 // enables repeat-aware gap extension penalty
EnableRepeatGapExtensionPenalty(float rGapExtensionPenalty,float rMaxGapRepeatExtensionPenaltyFactor)656 void CSmithWatermanGotoh::EnableRepeatGapExtensionPenalty(float rGapExtensionPenalty, float rMaxGapRepeatExtensionPenaltyFactor) {
657     mUseRepeatGapExtensionPenalty = true;
658     mRepeatGapExtensionPenalty    = rGapExtensionPenalty;
659     mMaxRepeatGapExtensionPenalty = rMaxGapRepeatExtensionPenaltyFactor * rGapExtensionPenalty;
660 }
661 
662 // corrects the homopolymer gap order for forward alignments
CorrectHomopolymerGapOrder(const unsigned int numBases,const unsigned int numMismatches)663 void CSmithWatermanGotoh::CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches) {
664 
665     // this is only required for alignments with mismatches
666     //if(al.NumMismatches == 0) return;
667     if ( numMismatches == 0 ) return;
668 
669     // localize the alignment data
670     //char* pReference = al.Reference.Data();
671     //char* pQuery     = al.Query.Data();
672     //const unsigned int numBases = al.Reference.Length();
673     char* pReference = mReversedAnchor;
674     char* pQuery     = mReversedQuery;
675 
676     // initialize
677     bool hasReferenceGap = false, hasQueryGap = false;
678     char* pNonGapSeq = NULL;
679     char* pGapSeq    = NULL;
680     char nonGapBase  = 'J';
681 
682     // identify gapped regions
683     for(unsigned int i = 0; i < numBases; i++) {
684 
685 	// check if the current position is gapped
686 	hasReferenceGap = false;
687 	hasQueryGap     = false;
688 
689 	if(pReference[i] == GAP) {
690 	    hasReferenceGap = true;
691 	    pNonGapSeq      = pQuery;
692 	    pGapSeq         = pReference;
693 	    nonGapBase      = pQuery[i];
694 	}
695 
696 	if(pQuery[i] == GAP) {
697 	    hasQueryGap = true;
698 	    pNonGapSeq  = pReference;
699 	    pGapSeq     = pQuery;
700 	    nonGapBase  = pReference[i];
701 	}
702 
703 	// continue if we don't have any gaps
704 	if(!hasReferenceGap && !hasQueryGap) continue;
705 
706 	// sanity check
707 	if(hasReferenceGap && hasQueryGap) {
708 	    printf("ERROR: Found a gap in both the reference sequence and query sequence.\n");
709 	    exit(1);
710 	}
711 
712 	// find the non-gapped length (forward)
713 	unsigned short numGappedBases = 0;
714 	unsigned short nonGapLength   = 0;
715 	unsigned short testPos = i;
716 	while(testPos < numBases) {
717 
718 	    const char gs  = pGapSeq[testPos];
719 	    const char ngs = pNonGapSeq[testPos];
720 
721 	    bool isPartofHomopolymer = false;
722 	    if(((gs == nonGapBase) || (gs == GAP)) && (ngs == nonGapBase)) isPartofHomopolymer = true;
723 	    if(!isPartofHomopolymer) break;
724 
725 	    if(gs == GAP) numGappedBases++;
726 	    else nonGapLength++;
727 	    testPos++;
728 	}
729 
730 	// fix the gap order
731 	if(numGappedBases != 0) {
732 	    char* pCurrentSequence = pGapSeq + i;
733 	    memset(pCurrentSequence, nonGapBase, nonGapLength);
734 	    pCurrentSequence += nonGapLength;
735 	    memset(pCurrentSequence, GAP, numGappedBases);
736 	}
737 
738 	// increment
739 	i += numGappedBases + nonGapLength - 1;
740     }
741 }
742