1 #include <iomanip>
2 #include <itoa.h>
3 #include "Matcher.h"
4 #include "Util.h"
5 #include "Parameters.h"
6 #include "StripedSmithWaterman.h"
7 
8 
Matcher(int querySeqType,int maxSeqLen,BaseMatrix * m,EvalueComputation * evaluer,bool aaBiasCorrection,int gapOpen,int gapExtend,int zdrop)9 Matcher::Matcher(int querySeqType, int maxSeqLen, BaseMatrix *m, EvalueComputation * evaluer,
10                  bool aaBiasCorrection, int gapOpen, int gapExtend, int zdrop)
11                  : gapOpen(gapOpen), gapExtend(gapExtend), m(m), evaluer(evaluer), tinySubMat(NULL) {
12     if(Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE) == false ) {
13         setSubstitutionMatrix(m);
14     }
15 
16     if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
17         nuclaligner = new BandedNucleotideAligner(m, maxSeqLen, gapOpen, gapExtend, zdrop);
18         aligner = NULL;
19     } else {
20         nuclaligner = NULL;
21         aligner = new SmithWaterman(maxSeqLen, m->alphabetSize, aaBiasCorrection);
22     }
23     //std::cout << "lambda=" << lambdaLog2 << " logKLog2=" << logKLog2 << std::endl;
24 }
25 
26 
setSubstitutionMatrix(BaseMatrix * m)27 void Matcher::setSubstitutionMatrix(BaseMatrix *m){
28     tinySubMat = new int8_t[m->alphabetSize*m->alphabetSize];
29     for (int i = 0; i < m->alphabetSize; i++) {
30         for (int j = 0; j < m->alphabetSize; j++) {
31             tinySubMat[i*m->alphabetSize + j] = m->subMatrix[i][j];
32         }
33     }
34 }
35 
~Matcher()36 Matcher::~Matcher(){
37     if(aligner != NULL){
38         delete aligner;
39     }
40     if(nuclaligner != NULL){
41         delete nuclaligner;
42     }
43     if(tinySubMat != NULL){
44         delete [] tinySubMat;
45         tinySubMat = NULL;
46     }
47 }
48 
initQuery(Sequence * query)49 void Matcher::initQuery(Sequence* query){
50     currentQuery = query;
51     if(Parameters::isEqualDbtype(query->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES)){
52         nuclaligner->initQuery(query);
53     }else if(Parameters::isEqualDbtype(query->getSeqType(), Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(query->getSeqType(), Parameters::DBTYPE_PROFILE_STATE_PROFILE)){
54         aligner->ssw_init(query, query->getAlignmentProfile(), this->m, 2);
55     }else{
56         aligner->ssw_init(query, this->tinySubMat, this->m, 2);
57     }
58 }
59 
60 
getSWResult(Sequence * dbSeq,const int diagonal,bool isReverse,const int covMode,const float covThr,const double evalThr,unsigned int alignmentMode,unsigned int seqIdMode,bool isIdentity,bool wrappedScoring)61 Matcher::result_t Matcher::getSWResult(Sequence* dbSeq, const int diagonal, bool isReverse, const int covMode, const float covThr,
62                                        const double evalThr, unsigned int alignmentMode, unsigned int seqIdMode, bool isIdentity,
63                                        bool wrappedScoring){
64     // calculation of the score and traceback of the alignment
65     int32_t maskLen = currentQuery->L / 2;
66     int origQueryLen = wrappedScoring? currentQuery->L / 2 : currentQuery->L ;
67 
68     // calcuate stop score
69 //    const double qL = static_cast<double>(currentQuery->L);
70 //    const double dbL = static_cast<double>(dbSeq->L);
71 
72     // avoid nummerical issues -log(evalThr/(qL*dbL*seqDbSize))
73 //    double datapoints = -log(static_cast<double>(seqDbSize)) - log(qL) - log(dbL) + log(evalThr);
74     //std::cout << seqDbSize << " " << 100 << " " << scoreThr << std::endl;
75     //std::cout <<datapoints << " " << m->getBitFactor() <<" "<< evalThr << " " << seqDbSize << " " << currentQuery->L << " " << dbSeq->L<< " " << scoreThr << " " << std::endl;
76     s_align alignment;
77     // compute sequence identity
78     std::string backtrace;
79     int aaIds = 0;
80 
81     if(Parameters::isEqualDbtype(dbSeq->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES)){
82         if(diagonal==INT_MAX){
83             Debug(Debug::ERROR) << "Query sequence " << currentQuery->getDbKey() << " has a result with no diagonal information. Please check your database.\n";
84             EXIT(EXIT_FAILURE);
85         }
86         alignment = nuclaligner->align(dbSeq, diagonal, isReverse, backtrace, aaIds, evaluer, wrappedScoring);
87         alignmentMode = Matcher::SCORE_COV_SEQID;
88     }else{ if(isIdentity==false){
89             alignment = aligner->ssw_align(dbSeq->numSequence, dbSeq->L, gapOpen, gapExtend, alignmentMode, evalThr, evaluer, covMode, covThr, maskLen);
90         }else{
91             alignment = aligner->scoreIdentical(dbSeq->numSequence, dbSeq->L, evaluer, alignmentMode);
92         }
93         if(alignmentMode == Matcher::SCORE_COV_SEQID){
94             if(isIdentity==false){
95                 if(alignment.cigar){
96                     int32_t targetPos = alignment.dbStartPos1, queryPos = alignment.qStartPos1;
97                     for (int32_t c = 0; c < alignment.cigarLen; ++c) {
98                         char letter = SmithWaterman::cigar_int_to_op(alignment.cigar[c]);
99                         uint32_t length = SmithWaterman::cigar_int_to_len(alignment.cigar[c]);
100                         backtrace.reserve(length);
101 
102                         for (uint32_t i = 0; i < length; ++i){
103                             if (letter == 'M') {
104                                 if (dbSeq->numSequence[targetPos] == currentQuery->numSequence[queryPos]){
105                                     aaIds++;
106                                 }
107                                 ++queryPos;
108                                 ++targetPos;
109                                 backtrace.append("M");
110                             } else {
111                                 if (letter == 'I') {
112                                     ++queryPos;
113                                     backtrace.append("I");
114                                 }
115                                 else{
116                                     ++targetPos;
117                                     backtrace.append("D");
118                                 }
119                             }
120                         }
121                     }
122                 }
123             } else {
124                 for (int32_t c = 0; c < origQueryLen; ++c) {
125                     aaIds++;
126                     backtrace.append("M");
127                 }
128             }
129         }
130 
131     }
132 
133     // calculation of the coverage and e-value
134     float qcov = 0.0;
135     float dbcov = 0.0;
136     float seqId = 0.0;
137 
138     const unsigned int qStartPos = alignment.qStartPos1;
139     const unsigned int dbStartPos = alignment.dbStartPos1;
140     const unsigned int qEndPos = alignment.qEndPos1;
141     const unsigned int dbEndPos = alignment.dbEndPos1;
142     // normalize score
143 //    alignment->score1 = alignment->score1 - log2(dbSeq->L);
144     if(alignmentMode == Matcher::SCORE_COV || alignmentMode == Matcher::SCORE_COV_SEQID) {
145         qcov  = alignment.qCov;
146         dbcov = alignment.tCov;
147     }
148 
149     unsigned int alnLength = Matcher::computeAlnLength(qStartPos, qEndPos, dbStartPos, dbEndPos);
150     // try to estimate sequence id
151     if(alignmentMode == Matcher::SCORE_COV_SEQID){
152         // compute sequence id
153         if(alignment.cigar){
154             // OVERWRITE alnLength with gapped value
155             alnLength = backtrace.size();
156         }
157         seqId = Util::computeSeqId(seqIdMode, aaIds, origQueryLen, dbSeq->L, alnLength);
158 
159     }else if( alignmentMode == Matcher::SCORE_COV){
160         // "20%   30%   40%   50%   60%   70%   80%   90%   99%"
161         // "0.52  1.12  1.73  2.33  2.93  3.53  4.14  4.74  5.28"
162         unsigned int qAlnLen = std::max(qEndPos - qStartPos, static_cast<unsigned int>(1));
163         unsigned int dbAlnLen = std::max(dbEndPos - dbStartPos, static_cast<unsigned int>(1));
164         //seqId = (alignment.score1 / static_cast<float>(std::max(qAlnLength, dbAlnLength)))  * 0.1656 + 0.1141;
165         seqId = estimateSeqIdByScorePerCol(alignment.score1, qAlnLen, dbAlnLen);
166     }else if ( alignmentMode == Matcher::SCORE_ONLY){
167         unsigned int qAlnLen = std::max(qEndPos, static_cast<unsigned int>(1));
168         unsigned int dbAlnLen = std::max(dbEndPos, static_cast<unsigned int>(1));
169         //seqId = (alignment.score1 / static_cast<float>(std::max(dbAlnLen, qAlnLen)))  * 0.1656 + 0.1141;
170         seqId = estimateSeqIdByScorePerCol(alignment.score1, qAlnLen, dbAlnLen);
171     }
172 
173     //  E =  qL dL * exp^(-S/lambda)
174     double evalue = alignment.evalue;
175     int bitScore = static_cast<int>(evaluer->computeBitScore(alignment.score1)+0.5);
176 
177     result_t result;
178     if(isReverse){
179         result = result_t(dbSeq->getDbKey(), bitScore, qcov, dbcov, seqId, evalue, alnLength, qStartPos, qEndPos, origQueryLen, dbEndPos, dbStartPos, dbSeq->L, backtrace);
180     }else{
181         result = result_t(dbSeq->getDbKey(), bitScore, qcov, dbcov, seqId, evalue, alnLength, qStartPos, qEndPos, origQueryLen, dbStartPos, dbEndPos, dbSeq->L, backtrace);
182     }
183 
184 
185     delete [] alignment.cigar;
186     return result;
187 }
188 
189 
readAlignmentResults(std::vector<result_t> & result,char * data,bool readCompressed)190 void Matcher::readAlignmentResults(std::vector<result_t> &result, char *data, bool readCompressed) {
191     if(data == NULL) {
192         return;
193     }
194 
195     while(*data != '\0'){
196         result.emplace_back(parseAlignmentRecord(data, readCompressed));
197         data = Util::skipLine(data);
198     }
199 }
200 
computeAlnLength(int qStart,int qEnd,int dbStart,int dbEnd)201 int Matcher::computeAlnLength(int qStart, int qEnd, int dbStart, int dbEnd) {
202     return std::max(abs(qEnd - qStart), abs(dbEnd - dbStart)) + 1;
203 }
204 
estimateSeqIdByScorePerCol(uint16_t score,unsigned int qLen,unsigned int tLen)205 float Matcher::estimateSeqIdByScorePerCol(uint16_t score, unsigned int qLen, unsigned int tLen) {
206     float estimatedSeqId = (score / static_cast<float>(std::max(qLen, tLen))) * 0.1656 + 0.1141;
207     estimatedSeqId = std::min(estimatedSeqId, 1.0f);
208     return std::max(0.0f, estimatedSeqId);
209 }
210 
211 
compressAlignment(const std::string & bt)212 std::string Matcher::compressAlignment(const std::string& bt) {
213     std::string ret;
214     char state = 'M';
215     size_t counter = 0;
216     for(size_t i = 0; i < bt.size(); i++){
217         if(bt[i] != state){
218             ret.append(std::to_string(counter));
219             ret.push_back(state);
220             state = bt[i];
221             counter = 1;
222         }else{
223             counter++;
224         }
225     }
226     ret.append(std::to_string(counter));
227     ret.push_back(state);
228     return ret;
229 }
230 
uncompressAlignment(const std::string & cbt)231 std::string Matcher::uncompressAlignment(const std::string &cbt) {
232     std::string bt;
233     size_t count = 0;
234     for(size_t i = 0; i < cbt.size(); i++) {
235         sscanf(cbt.c_str() + i, "%zu", &count);
236         for(size_t j = i; j < cbt.size(); j++ ){
237             if(isdigit(cbt[j]) == false){
238                 char state = cbt[j];
239                 bt.append(count, state);
240                 i = j;
241                 break;
242             }
243         }
244     }
245     return bt;
246 }
247 
parseAlignmentRecord(const char * data,bool readCompressed)248 Matcher::result_t Matcher::parseAlignmentRecord(const char *data, bool readCompressed) {
249     const char *entry[255];
250     size_t columns = Util::getWordsOfLine(data, entry, 255);
251     if (columns < ALN_RES_WITHOUT_BT_COL_CNT) {
252         Debug(Debug::ERROR) << "Invalid alignment result record.\n";
253         EXIT(EXIT_FAILURE);
254     }
255 
256     char key[255];
257     ptrdiff_t keySize =  (entry[1] - data);
258     strncpy(key, data, keySize);
259     key[keySize] = '\0';
260 
261     unsigned int targetId = Util::fast_atoi<unsigned int>(key);
262     int score = Util::fast_atoi<int>(entry[1]);
263     double seqId = strtod(entry[2],NULL);
264     double eval = strtod(entry[3],NULL);
265 
266     int qStart =  Util::fast_atoi<int>(entry[4]);
267     int qEnd = Util::fast_atoi<int>(entry[5]);
268     int qLen = Util::fast_atoi<int>(entry[6]);
269     int dbStart = Util::fast_atoi<int>(entry[7]);
270     int dbEnd = Util::fast_atoi<int>(entry[8]);
271     int dbLen = Util::fast_atoi<int>(entry[9]);
272     int adjustQstart = (qStart==-1)? 0 : qStart;
273     int adjustDBstart = (dbStart==-1)? 0 : dbStart;
274     double qCov = SmithWaterman::computeCov(adjustQstart, qEnd, qLen);
275     double dbCov = SmithWaterman::computeCov(adjustDBstart, dbEnd, dbLen);
276     size_t alnLength = Matcher::computeAlnLength(adjustQstart, qEnd, adjustDBstart, dbEnd);
277 
278     switch(columns) {
279         // 10 no backtrace
280         case ALN_RES_WITHOUT_BT_COL_CNT:
281             return Matcher::result_t(targetId, score, qCov, dbCov, seqId, eval,
282                                  alnLength, qStart, qEnd, qLen, dbStart, dbEnd, dbLen, -1, -1, -1, -1, "");
283         // 11 with backtrace
284         case ALN_RES_WITH_BT_COL_CNT:
285             if (readCompressed) {
286                 return Matcher::result_t(targetId, score, qCov, dbCov, seqId, eval,
287                                          alnLength, qStart, qEnd, qLen, dbStart, dbEnd,
288                                          dbLen, -1, -1, -1, -1, std::string(entry[10], entry[11] - entry[10]));
289             } else {
290                 return Matcher::result_t(targetId, score, qCov, dbCov, seqId, eval,
291                                          alnLength, qStart, qEnd, qLen, dbStart, dbEnd,
292                                          dbLen, -1, -1, -1, -1,
293                                          uncompressAlignment(std::string(entry[10], entry[11] - entry[10])));
294             }
295         // 12 without backtrace but qOrfStart dbOrfStart
296         case ALN_RES_WITH_ORF_POS_WITHOUT_BT_COL_CNT:
297             return Matcher::result_t(targetId, score, qCov, dbCov, seqId, eval,
298                                      alnLength, qStart, qEnd, qLen, dbStart, dbEnd,
299                                      dbLen, Util::fast_atoi<int>(entry[10]), Util::fast_atoi<int>(entry[11]),
300                                      Util::fast_atoi<int>(entry[12]), Util::fast_atoi<int>(entry[13]), "");
301         // 13 without backtrace but qOrfStart dbOrfStart
302         case ALN_RES_WITH_ORF_AND_BT_COL_CNT:
303             if (readCompressed) {
304                 return Matcher::result_t(targetId, score, qCov, dbCov, seqId, eval,
305                                          alnLength, qStart, qEnd, qLen, dbStart, dbEnd,
306                                          dbLen, Util::fast_atoi<int>(entry[10]), Util::fast_atoi<int>(entry[11]),
307                                          Util::fast_atoi<int>(entry[12]), Util::fast_atoi<int>(entry[13]),
308                                          std::string(entry[14], entry[15] - entry[14]));
309             } else {
310                 return Matcher::result_t(targetId, score, qCov, dbCov, seqId, eval,
311                                          alnLength, qStart, qEnd, qLen, dbStart, dbEnd,
312                                          dbLen, Util::fast_atoi<int>(entry[10]), Util::fast_atoi<int>(entry[11]),
313                                          Util::fast_atoi<int>(entry[12]), Util::fast_atoi<int>(entry[13]),
314                                          uncompressAlignment(std::string(entry[14], entry[15] - entry[14])));
315             }
316         default:
317             Debug(Debug::ERROR) << "Invalid column count in alignment.\n";
318             EXIT(EXIT_FAILURE);
319     }
320 }
321 
322 
resultToBuffer(char * buff1,const result_t & result,bool addBacktrace,bool compress,bool addOrfPosition)323 size_t Matcher::resultToBuffer(char * buff1, const result_t &result, bool addBacktrace, bool compress, bool addOrfPosition) {
324     char * basePos = buff1;
325     char * tmpBuff = Itoa::u32toa_sse2((uint32_t) result.dbKey, buff1);
326     *(tmpBuff-1) = '\t';
327     tmpBuff = Itoa::i32toa_sse2(result.score, tmpBuff);
328     *(tmpBuff-1) = '\t';
329     tmpBuff = Util::fastSeqIdToBuffer(result.seqId, tmpBuff);
330     *(tmpBuff-1) = '\t';
331     tmpBuff += sprintf(tmpBuff,"%.3E",result.eval);
332     tmpBuff++;
333     *(tmpBuff-1) = '\t';
334     tmpBuff = Itoa::i32toa_sse2(result.qStartPos, tmpBuff);
335     *(tmpBuff-1) = '\t';
336     tmpBuff = Itoa::i32toa_sse2(result.qEndPos, tmpBuff);
337     *(tmpBuff-1) = '\t';
338     tmpBuff = Itoa::i32toa_sse2(result.qLen, tmpBuff);
339     *(tmpBuff-1) = '\t';
340     tmpBuff = Itoa::i32toa_sse2(result.dbStartPos, tmpBuff);
341     *(tmpBuff-1) = '\t';
342     tmpBuff = Itoa::i32toa_sse2(result.dbEndPos, tmpBuff);
343     *(tmpBuff-1) = '\t';
344     tmpBuff = Itoa::i32toa_sse2(result.dbLen, tmpBuff);
345     if(addOrfPosition){
346         *(tmpBuff-1) = '\t';
347         tmpBuff = Itoa::i32toa_sse2(result.queryOrfStartPos, tmpBuff);
348         *(tmpBuff-1) = '\t';
349         tmpBuff = Itoa::i32toa_sse2(result.queryOrfEndPos, tmpBuff);
350         *(tmpBuff-1) = '\t';
351         tmpBuff = Itoa::i32toa_sse2(result.dbOrfStartPos, tmpBuff);
352         *(tmpBuff-1) = '\t';
353         tmpBuff = Itoa::i32toa_sse2(result.dbOrfEndPos, tmpBuff);
354     }
355     if(addBacktrace == true){
356         if(compress){
357             *(tmpBuff-1) = '\t';
358             std::string compressedCigar = Matcher::compressAlignment(result.backtrace);
359             tmpBuff = strncpy(tmpBuff, compressedCigar.c_str(), compressedCigar.length());
360             tmpBuff += compressedCigar.length()+1;
361         }else{
362             *(tmpBuff-1) = '\t';
363             tmpBuff = strncpy(tmpBuff, result.backtrace.c_str(), result.backtrace.length());
364             tmpBuff += result.backtrace.length()+1;
365         }
366     }
367     *(tmpBuff-1) = '\n';
368     *(tmpBuff) = '\0';
369     return tmpBuff - basePos;
370 }
371 
updateResultByRescoringBacktrace(const char * querySeq,const char * targetSeq,const char ** subMat,EvalueComputation & evaluer,int gapOpen,int gapExtend,result_t & result)372 void Matcher::updateResultByRescoringBacktrace(const char *querySeq, const char *targetSeq, const char **subMat, EvalueComputation &evaluer,
373                                                 int gapOpen, int gapExtend, result_t &result) {
374     int maxScore = 0;
375     int maxBtEndPos = 0;
376     int maxBtStartPos = 0;
377     int maxQueryEndPos = 0;
378     int maxQueryStartPos = 0;
379     int maxTargetStartPos = 0;
380     int maxTargetEndPos = 0;
381     int minPos = -1;
382     int minQueryPos = result.qStartPos-1;
383     int minTargetPos = result.dbStartPos-1;
384     int score = 0;
385     int identicalAAs = 0;
386     int maxIdAaCnt = 0;
387     int queryPos = result.qStartPos;
388     int targetPos = result.dbStartPos;
389     bool isGapOpen = false;
390 
391     for(unsigned int pos = 0; pos < result.backtrace.size(); pos++){
392         char letter = result.backtrace[pos];
393         int curr;
394         if (letter == 'M') {
395             curr = subMat[static_cast<int>(querySeq[queryPos])][static_cast<int>(targetSeq[targetPos])];
396             identicalAAs += (querySeq[queryPos] == targetSeq[targetPos]);
397             isGapOpen = false;
398         } else {
399             curr = (isGapOpen) ? -gapExtend : -gapOpen;
400             isGapOpen = (isGapOpen == false) ? true : isGapOpen;
401         }
402         score = curr + score;
403         // minimum
404         const bool isMinScore = (score <= 0);
405         score = (isMinScore) ? 0 : score;
406         identicalAAs = (isMinScore) ? 0 : identicalAAs;
407         if(isMinScore){
408             minPos = pos;
409             minQueryPos = (letter == 'D') ? queryPos - 1 : queryPos;
410             minTargetPos = (letter == 'I') ? targetPos - 1 : targetPos;
411         }
412         // new max
413         const bool isNewMaxScore = (score > maxScore);
414         if(isNewMaxScore){
415             maxBtEndPos = pos;
416             maxQueryEndPos = queryPos;
417             maxTargetEndPos = targetPos;
418             maxBtStartPos = minPos + 1;
419             maxQueryStartPos = minQueryPos + 1;
420             maxTargetStartPos = minTargetPos + 1;
421             maxScore = score;
422             maxIdAaCnt = identicalAAs;
423         }
424         queryPos += (letter == 'M' || letter == 'I') ? 1 : 0;
425         targetPos += (letter == 'M' || letter == 'D') ? 1 : 0;
426     }
427 
428     result.qStartPos = maxQueryStartPos;
429     result.qEndPos = maxQueryEndPos;
430     result.dbStartPos = maxTargetStartPos;
431     result.dbEndPos = maxTargetEndPos;
432     double bitScore = evaluer.computeBitScore(maxScore);
433     double evalue = evaluer.computeEvalue(maxScore, result.qLen);
434     result.score = bitScore;
435     result.eval = evalue;
436     result.alnLength = (maxBtEndPos - maxBtStartPos) + 1;
437     result.seqId = static_cast<float>(maxIdAaCnt) / static_cast<float>(result.alnLength);
438     result.backtrace = result.backtrace.substr(maxBtStartPos, result.alnLength);
439 
440 }
441 
442 
443