1 #include <iomanip>
2 #include <itoa.h>
3 #include "Matcher.h"
4 #include "Util.h"
5 #include "Parameters.h"
6 #include "StripedSmithWaterman.h"
7
8
Matcher(int querySeqType,int maxSeqLen,BaseMatrix * m,EvalueComputation * evaluer,bool aaBiasCorrection,int gapOpen,int gapExtend,int zdrop)9 Matcher::Matcher(int querySeqType, int maxSeqLen, BaseMatrix *m, EvalueComputation * evaluer,
10 bool aaBiasCorrection, int gapOpen, int gapExtend, int zdrop)
11 : gapOpen(gapOpen), gapExtend(gapExtend), m(m), evaluer(evaluer), tinySubMat(NULL) {
12 if(Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE) == false ) {
13 setSubstitutionMatrix(m);
14 }
15
16 if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
17 nuclaligner = new BandedNucleotideAligner(m, maxSeqLen, gapOpen, gapExtend, zdrop);
18 aligner = NULL;
19 } else {
20 nuclaligner = NULL;
21 aligner = new SmithWaterman(maxSeqLen, m->alphabetSize, aaBiasCorrection);
22 }
23 //std::cout << "lambda=" << lambdaLog2 << " logKLog2=" << logKLog2 << std::endl;
24 }
25
26
setSubstitutionMatrix(BaseMatrix * m)27 void Matcher::setSubstitutionMatrix(BaseMatrix *m){
28 tinySubMat = new int8_t[m->alphabetSize*m->alphabetSize];
29 for (int i = 0; i < m->alphabetSize; i++) {
30 for (int j = 0; j < m->alphabetSize; j++) {
31 tinySubMat[i*m->alphabetSize + j] = m->subMatrix[i][j];
32 }
33 }
34 }
35
~Matcher()36 Matcher::~Matcher(){
37 if(aligner != NULL){
38 delete aligner;
39 }
40 if(nuclaligner != NULL){
41 delete nuclaligner;
42 }
43 if(tinySubMat != NULL){
44 delete [] tinySubMat;
45 tinySubMat = NULL;
46 }
47 }
48
initQuery(Sequence * query)49 void Matcher::initQuery(Sequence* query){
50 currentQuery = query;
51 if(Parameters::isEqualDbtype(query->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES)){
52 nuclaligner->initQuery(query);
53 }else if(Parameters::isEqualDbtype(query->getSeqType(), Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(query->getSeqType(), Parameters::DBTYPE_PROFILE_STATE_PROFILE)){
54 aligner->ssw_init(query, query->getAlignmentProfile(), this->m, 2);
55 }else{
56 aligner->ssw_init(query, this->tinySubMat, this->m, 2);
57 }
58 }
59
60
getSWResult(Sequence * dbSeq,const int diagonal,bool isReverse,const int covMode,const float covThr,const double evalThr,unsigned int alignmentMode,unsigned int seqIdMode,bool isIdentity,bool wrappedScoring)61 Matcher::result_t Matcher::getSWResult(Sequence* dbSeq, const int diagonal, bool isReverse, const int covMode, const float covThr,
62 const double evalThr, unsigned int alignmentMode, unsigned int seqIdMode, bool isIdentity,
63 bool wrappedScoring){
64 // calculation of the score and traceback of the alignment
65 int32_t maskLen = currentQuery->L / 2;
66 int origQueryLen = wrappedScoring? currentQuery->L / 2 : currentQuery->L ;
67
68 // calcuate stop score
69 // const double qL = static_cast<double>(currentQuery->L);
70 // const double dbL = static_cast<double>(dbSeq->L);
71
72 // avoid nummerical issues -log(evalThr/(qL*dbL*seqDbSize))
73 // double datapoints = -log(static_cast<double>(seqDbSize)) - log(qL) - log(dbL) + log(evalThr);
74 //std::cout << seqDbSize << " " << 100 << " " << scoreThr << std::endl;
75 //std::cout <<datapoints << " " << m->getBitFactor() <<" "<< evalThr << " " << seqDbSize << " " << currentQuery->L << " " << dbSeq->L<< " " << scoreThr << " " << std::endl;
76 s_align alignment;
77 // compute sequence identity
78 std::string backtrace;
79 int aaIds = 0;
80
81 if(Parameters::isEqualDbtype(dbSeq->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES)){
82 if(diagonal==INT_MAX){
83 Debug(Debug::ERROR) << "Query sequence " << currentQuery->getDbKey() << " has a result with no diagonal information. Please check your database.\n";
84 EXIT(EXIT_FAILURE);
85 }
86 alignment = nuclaligner->align(dbSeq, diagonal, isReverse, backtrace, aaIds, evaluer, wrappedScoring);
87 alignmentMode = Matcher::SCORE_COV_SEQID;
88 }else{ if(isIdentity==false){
89 alignment = aligner->ssw_align(dbSeq->numSequence, dbSeq->L, gapOpen, gapExtend, alignmentMode, evalThr, evaluer, covMode, covThr, maskLen);
90 }else{
91 alignment = aligner->scoreIdentical(dbSeq->numSequence, dbSeq->L, evaluer, alignmentMode);
92 }
93 if(alignmentMode == Matcher::SCORE_COV_SEQID){
94 if(isIdentity==false){
95 if(alignment.cigar){
96 int32_t targetPos = alignment.dbStartPos1, queryPos = alignment.qStartPos1;
97 for (int32_t c = 0; c < alignment.cigarLen; ++c) {
98 char letter = SmithWaterman::cigar_int_to_op(alignment.cigar[c]);
99 uint32_t length = SmithWaterman::cigar_int_to_len(alignment.cigar[c]);
100 backtrace.reserve(length);
101
102 for (uint32_t i = 0; i < length; ++i){
103 if (letter == 'M') {
104 if (dbSeq->numSequence[targetPos] == currentQuery->numSequence[queryPos]){
105 aaIds++;
106 }
107 ++queryPos;
108 ++targetPos;
109 backtrace.append("M");
110 } else {
111 if (letter == 'I') {
112 ++queryPos;
113 backtrace.append("I");
114 }
115 else{
116 ++targetPos;
117 backtrace.append("D");
118 }
119 }
120 }
121 }
122 }
123 } else {
124 for (int32_t c = 0; c < origQueryLen; ++c) {
125 aaIds++;
126 backtrace.append("M");
127 }
128 }
129 }
130
131 }
132
133 // calculation of the coverage and e-value
134 float qcov = 0.0;
135 float dbcov = 0.0;
136 float seqId = 0.0;
137
138 const unsigned int qStartPos = alignment.qStartPos1;
139 const unsigned int dbStartPos = alignment.dbStartPos1;
140 const unsigned int qEndPos = alignment.qEndPos1;
141 const unsigned int dbEndPos = alignment.dbEndPos1;
142 // normalize score
143 // alignment->score1 = alignment->score1 - log2(dbSeq->L);
144 if(alignmentMode == Matcher::SCORE_COV || alignmentMode == Matcher::SCORE_COV_SEQID) {
145 qcov = alignment.qCov;
146 dbcov = alignment.tCov;
147 }
148
149 unsigned int alnLength = Matcher::computeAlnLength(qStartPos, qEndPos, dbStartPos, dbEndPos);
150 // try to estimate sequence id
151 if(alignmentMode == Matcher::SCORE_COV_SEQID){
152 // compute sequence id
153 if(alignment.cigar){
154 // OVERWRITE alnLength with gapped value
155 alnLength = backtrace.size();
156 }
157 seqId = Util::computeSeqId(seqIdMode, aaIds, origQueryLen, dbSeq->L, alnLength);
158
159 }else if( alignmentMode == Matcher::SCORE_COV){
160 // "20% 30% 40% 50% 60% 70% 80% 90% 99%"
161 // "0.52 1.12 1.73 2.33 2.93 3.53 4.14 4.74 5.28"
162 unsigned int qAlnLen = std::max(qEndPos - qStartPos, static_cast<unsigned int>(1));
163 unsigned int dbAlnLen = std::max(dbEndPos - dbStartPos, static_cast<unsigned int>(1));
164 //seqId = (alignment.score1 / static_cast<float>(std::max(qAlnLength, dbAlnLength))) * 0.1656 + 0.1141;
165 seqId = estimateSeqIdByScorePerCol(alignment.score1, qAlnLen, dbAlnLen);
166 }else if ( alignmentMode == Matcher::SCORE_ONLY){
167 unsigned int qAlnLen = std::max(qEndPos, static_cast<unsigned int>(1));
168 unsigned int dbAlnLen = std::max(dbEndPos, static_cast<unsigned int>(1));
169 //seqId = (alignment.score1 / static_cast<float>(std::max(dbAlnLen, qAlnLen))) * 0.1656 + 0.1141;
170 seqId = estimateSeqIdByScorePerCol(alignment.score1, qAlnLen, dbAlnLen);
171 }
172
173 // E = qL dL * exp^(-S/lambda)
174 double evalue = alignment.evalue;
175 int bitScore = static_cast<int>(evaluer->computeBitScore(alignment.score1)+0.5);
176
177 result_t result;
178 if(isReverse){
179 result = result_t(dbSeq->getDbKey(), bitScore, qcov, dbcov, seqId, evalue, alnLength, qStartPos, qEndPos, origQueryLen, dbEndPos, dbStartPos, dbSeq->L, backtrace);
180 }else{
181 result = result_t(dbSeq->getDbKey(), bitScore, qcov, dbcov, seqId, evalue, alnLength, qStartPos, qEndPos, origQueryLen, dbStartPos, dbEndPos, dbSeq->L, backtrace);
182 }
183
184
185 delete [] alignment.cigar;
186 return result;
187 }
188
189
readAlignmentResults(std::vector<result_t> & result,char * data,bool readCompressed)190 void Matcher::readAlignmentResults(std::vector<result_t> &result, char *data, bool readCompressed) {
191 if(data == NULL) {
192 return;
193 }
194
195 while(*data != '\0'){
196 result.emplace_back(parseAlignmentRecord(data, readCompressed));
197 data = Util::skipLine(data);
198 }
199 }
200
computeAlnLength(int qStart,int qEnd,int dbStart,int dbEnd)201 int Matcher::computeAlnLength(int qStart, int qEnd, int dbStart, int dbEnd) {
202 return std::max(abs(qEnd - qStart), abs(dbEnd - dbStart)) + 1;
203 }
204
estimateSeqIdByScorePerCol(uint16_t score,unsigned int qLen,unsigned int tLen)205 float Matcher::estimateSeqIdByScorePerCol(uint16_t score, unsigned int qLen, unsigned int tLen) {
206 float estimatedSeqId = (score / static_cast<float>(std::max(qLen, tLen))) * 0.1656 + 0.1141;
207 estimatedSeqId = std::min(estimatedSeqId, 1.0f);
208 return std::max(0.0f, estimatedSeqId);
209 }
210
211
compressAlignment(const std::string & bt)212 std::string Matcher::compressAlignment(const std::string& bt) {
213 std::string ret;
214 char state = 'M';
215 size_t counter = 0;
216 for(size_t i = 0; i < bt.size(); i++){
217 if(bt[i] != state){
218 ret.append(std::to_string(counter));
219 ret.push_back(state);
220 state = bt[i];
221 counter = 1;
222 }else{
223 counter++;
224 }
225 }
226 ret.append(std::to_string(counter));
227 ret.push_back(state);
228 return ret;
229 }
230
uncompressAlignment(const std::string & cbt)231 std::string Matcher::uncompressAlignment(const std::string &cbt) {
232 std::string bt;
233 size_t count = 0;
234 for(size_t i = 0; i < cbt.size(); i++) {
235 sscanf(cbt.c_str() + i, "%zu", &count);
236 for(size_t j = i; j < cbt.size(); j++ ){
237 if(isdigit(cbt[j]) == false){
238 char state = cbt[j];
239 bt.append(count, state);
240 i = j;
241 break;
242 }
243 }
244 }
245 return bt;
246 }
247
parseAlignmentRecord(const char * data,bool readCompressed)248 Matcher::result_t Matcher::parseAlignmentRecord(const char *data, bool readCompressed) {
249 const char *entry[255];
250 size_t columns = Util::getWordsOfLine(data, entry, 255);
251 if (columns < ALN_RES_WITHOUT_BT_COL_CNT) {
252 Debug(Debug::ERROR) << "Invalid alignment result record.\n";
253 EXIT(EXIT_FAILURE);
254 }
255
256 char key[255];
257 ptrdiff_t keySize = (entry[1] - data);
258 strncpy(key, data, keySize);
259 key[keySize] = '\0';
260
261 unsigned int targetId = Util::fast_atoi<unsigned int>(key);
262 int score = Util::fast_atoi<int>(entry[1]);
263 double seqId = strtod(entry[2],NULL);
264 double eval = strtod(entry[3],NULL);
265
266 int qStart = Util::fast_atoi<int>(entry[4]);
267 int qEnd = Util::fast_atoi<int>(entry[5]);
268 int qLen = Util::fast_atoi<int>(entry[6]);
269 int dbStart = Util::fast_atoi<int>(entry[7]);
270 int dbEnd = Util::fast_atoi<int>(entry[8]);
271 int dbLen = Util::fast_atoi<int>(entry[9]);
272 int adjustQstart = (qStart==-1)? 0 : qStart;
273 int adjustDBstart = (dbStart==-1)? 0 : dbStart;
274 double qCov = SmithWaterman::computeCov(adjustQstart, qEnd, qLen);
275 double dbCov = SmithWaterman::computeCov(adjustDBstart, dbEnd, dbLen);
276 size_t alnLength = Matcher::computeAlnLength(adjustQstart, qEnd, adjustDBstart, dbEnd);
277
278 switch(columns) {
279 // 10 no backtrace
280 case ALN_RES_WITHOUT_BT_COL_CNT:
281 return Matcher::result_t(targetId, score, qCov, dbCov, seqId, eval,
282 alnLength, qStart, qEnd, qLen, dbStart, dbEnd, dbLen, -1, -1, -1, -1, "");
283 // 11 with backtrace
284 case ALN_RES_WITH_BT_COL_CNT:
285 if (readCompressed) {
286 return Matcher::result_t(targetId, score, qCov, dbCov, seqId, eval,
287 alnLength, qStart, qEnd, qLen, dbStart, dbEnd,
288 dbLen, -1, -1, -1, -1, std::string(entry[10], entry[11] - entry[10]));
289 } else {
290 return Matcher::result_t(targetId, score, qCov, dbCov, seqId, eval,
291 alnLength, qStart, qEnd, qLen, dbStart, dbEnd,
292 dbLen, -1, -1, -1, -1,
293 uncompressAlignment(std::string(entry[10], entry[11] - entry[10])));
294 }
295 // 12 without backtrace but qOrfStart dbOrfStart
296 case ALN_RES_WITH_ORF_POS_WITHOUT_BT_COL_CNT:
297 return Matcher::result_t(targetId, score, qCov, dbCov, seqId, eval,
298 alnLength, qStart, qEnd, qLen, dbStart, dbEnd,
299 dbLen, Util::fast_atoi<int>(entry[10]), Util::fast_atoi<int>(entry[11]),
300 Util::fast_atoi<int>(entry[12]), Util::fast_atoi<int>(entry[13]), "");
301 // 13 without backtrace but qOrfStart dbOrfStart
302 case ALN_RES_WITH_ORF_AND_BT_COL_CNT:
303 if (readCompressed) {
304 return Matcher::result_t(targetId, score, qCov, dbCov, seqId, eval,
305 alnLength, qStart, qEnd, qLen, dbStart, dbEnd,
306 dbLen, Util::fast_atoi<int>(entry[10]), Util::fast_atoi<int>(entry[11]),
307 Util::fast_atoi<int>(entry[12]), Util::fast_atoi<int>(entry[13]),
308 std::string(entry[14], entry[15] - entry[14]));
309 } else {
310 return Matcher::result_t(targetId, score, qCov, dbCov, seqId, eval,
311 alnLength, qStart, qEnd, qLen, dbStart, dbEnd,
312 dbLen, Util::fast_atoi<int>(entry[10]), Util::fast_atoi<int>(entry[11]),
313 Util::fast_atoi<int>(entry[12]), Util::fast_atoi<int>(entry[13]),
314 uncompressAlignment(std::string(entry[14], entry[15] - entry[14])));
315 }
316 default:
317 Debug(Debug::ERROR) << "Invalid column count in alignment.\n";
318 EXIT(EXIT_FAILURE);
319 }
320 }
321
322
resultToBuffer(char * buff1,const result_t & result,bool addBacktrace,bool compress,bool addOrfPosition)323 size_t Matcher::resultToBuffer(char * buff1, const result_t &result, bool addBacktrace, bool compress, bool addOrfPosition) {
324 char * basePos = buff1;
325 char * tmpBuff = Itoa::u32toa_sse2((uint32_t) result.dbKey, buff1);
326 *(tmpBuff-1) = '\t';
327 tmpBuff = Itoa::i32toa_sse2(result.score, tmpBuff);
328 *(tmpBuff-1) = '\t';
329 tmpBuff = Util::fastSeqIdToBuffer(result.seqId, tmpBuff);
330 *(tmpBuff-1) = '\t';
331 tmpBuff += sprintf(tmpBuff,"%.3E",result.eval);
332 tmpBuff++;
333 *(tmpBuff-1) = '\t';
334 tmpBuff = Itoa::i32toa_sse2(result.qStartPos, tmpBuff);
335 *(tmpBuff-1) = '\t';
336 tmpBuff = Itoa::i32toa_sse2(result.qEndPos, tmpBuff);
337 *(tmpBuff-1) = '\t';
338 tmpBuff = Itoa::i32toa_sse2(result.qLen, tmpBuff);
339 *(tmpBuff-1) = '\t';
340 tmpBuff = Itoa::i32toa_sse2(result.dbStartPos, tmpBuff);
341 *(tmpBuff-1) = '\t';
342 tmpBuff = Itoa::i32toa_sse2(result.dbEndPos, tmpBuff);
343 *(tmpBuff-1) = '\t';
344 tmpBuff = Itoa::i32toa_sse2(result.dbLen, tmpBuff);
345 if(addOrfPosition){
346 *(tmpBuff-1) = '\t';
347 tmpBuff = Itoa::i32toa_sse2(result.queryOrfStartPos, tmpBuff);
348 *(tmpBuff-1) = '\t';
349 tmpBuff = Itoa::i32toa_sse2(result.queryOrfEndPos, tmpBuff);
350 *(tmpBuff-1) = '\t';
351 tmpBuff = Itoa::i32toa_sse2(result.dbOrfStartPos, tmpBuff);
352 *(tmpBuff-1) = '\t';
353 tmpBuff = Itoa::i32toa_sse2(result.dbOrfEndPos, tmpBuff);
354 }
355 if(addBacktrace == true){
356 if(compress){
357 *(tmpBuff-1) = '\t';
358 std::string compressedCigar = Matcher::compressAlignment(result.backtrace);
359 tmpBuff = strncpy(tmpBuff, compressedCigar.c_str(), compressedCigar.length());
360 tmpBuff += compressedCigar.length()+1;
361 }else{
362 *(tmpBuff-1) = '\t';
363 tmpBuff = strncpy(tmpBuff, result.backtrace.c_str(), result.backtrace.length());
364 tmpBuff += result.backtrace.length()+1;
365 }
366 }
367 *(tmpBuff-1) = '\n';
368 *(tmpBuff) = '\0';
369 return tmpBuff - basePos;
370 }
371
updateResultByRescoringBacktrace(const char * querySeq,const char * targetSeq,const char ** subMat,EvalueComputation & evaluer,int gapOpen,int gapExtend,result_t & result)372 void Matcher::updateResultByRescoringBacktrace(const char *querySeq, const char *targetSeq, const char **subMat, EvalueComputation &evaluer,
373 int gapOpen, int gapExtend, result_t &result) {
374 int maxScore = 0;
375 int maxBtEndPos = 0;
376 int maxBtStartPos = 0;
377 int maxQueryEndPos = 0;
378 int maxQueryStartPos = 0;
379 int maxTargetStartPos = 0;
380 int maxTargetEndPos = 0;
381 int minPos = -1;
382 int minQueryPos = result.qStartPos-1;
383 int minTargetPos = result.dbStartPos-1;
384 int score = 0;
385 int identicalAAs = 0;
386 int maxIdAaCnt = 0;
387 int queryPos = result.qStartPos;
388 int targetPos = result.dbStartPos;
389 bool isGapOpen = false;
390
391 for(unsigned int pos = 0; pos < result.backtrace.size(); pos++){
392 char letter = result.backtrace[pos];
393 int curr;
394 if (letter == 'M') {
395 curr = subMat[static_cast<int>(querySeq[queryPos])][static_cast<int>(targetSeq[targetPos])];
396 identicalAAs += (querySeq[queryPos] == targetSeq[targetPos]);
397 isGapOpen = false;
398 } else {
399 curr = (isGapOpen) ? -gapExtend : -gapOpen;
400 isGapOpen = (isGapOpen == false) ? true : isGapOpen;
401 }
402 score = curr + score;
403 // minimum
404 const bool isMinScore = (score <= 0);
405 score = (isMinScore) ? 0 : score;
406 identicalAAs = (isMinScore) ? 0 : identicalAAs;
407 if(isMinScore){
408 minPos = pos;
409 minQueryPos = (letter == 'D') ? queryPos - 1 : queryPos;
410 minTargetPos = (letter == 'I') ? targetPos - 1 : targetPos;
411 }
412 // new max
413 const bool isNewMaxScore = (score > maxScore);
414 if(isNewMaxScore){
415 maxBtEndPos = pos;
416 maxQueryEndPos = queryPos;
417 maxTargetEndPos = targetPos;
418 maxBtStartPos = minPos + 1;
419 maxQueryStartPos = minQueryPos + 1;
420 maxTargetStartPos = minTargetPos + 1;
421 maxScore = score;
422 maxIdAaCnt = identicalAAs;
423 }
424 queryPos += (letter == 'M' || letter == 'I') ? 1 : 0;
425 targetPos += (letter == 'M' || letter == 'D') ? 1 : 0;
426 }
427
428 result.qStartPos = maxQueryStartPos;
429 result.qEndPos = maxQueryEndPos;
430 result.dbStartPos = maxTargetStartPos;
431 result.dbEndPos = maxTargetEndPos;
432 double bitScore = evaluer.computeBitScore(maxScore);
433 double evalue = evaluer.computeEvalue(maxScore, result.qLen);
434 result.score = bitScore;
435 result.eval = evalue;
436 result.alnLength = (maxBtEndPos - maxBtStartPos) + 1;
437 result.seqId = static_cast<float>(maxIdAaCnt) / static_cast<float>(result.alnLength);
438 result.backtrace = result.backtrace.substr(maxBtStartPos, result.alnLength);
439
440 }
441
442
443