1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "FindAlgorithm.h"
23 
24 #include <QRegExp>
25 
26 #include <U2Algorithm/DynTable.h>
27 #include <U2Algorithm/RollingArray.h>
28 
29 #include <U2Core/AppContext.h>
30 #include <U2Core/DNAAlphabet.h>
31 #include <U2Core/DNATranslation.h>
32 #include <U2Core/Log.h>
33 #include <U2Core/TextUtils.h>
34 #include <U2Core/U2AlphabetUtils.h>
35 #include <U2Core/U2SafePoints.h>
36 
37 namespace U2 {
38 
clear()39 void FindAlgorithmResult::clear() {
40     region.startPos = 0;
41     region.length = 0;
42     translation = false;
43     strand = U2Strand::Direct;
44     err = 0;
45 }
46 
toAnnotation(const QString & name,bool splitCircular,int seqLen) const47 SharedAnnotationData FindAlgorithmResult::toAnnotation(const QString &name, bool splitCircular, int seqLen) const {
48     SAFE_POINT(!splitCircular || (seqLen != -1), "Sequence length is not set!", SharedAnnotationData());
49     SharedAnnotationData data(new AnnotationData);
50     data->name = name;
51     if (splitCircular && (region.endPos() > seqLen)) {
52         if (region.startPos >= seqLen) {
53             data->location->regions << U2Region(region.startPos - seqLen, region.length);
54         } else {
55             SAFE_POINT(region.startPos < seqLen, "Region is not correct", SharedAnnotationData());
56             data->location->regions << U2Region(region.startPos, seqLen - region.startPos);
57             data->location->regions << U2Region(0, region.length - (seqLen - region.startPos));
58         }
59     } else {
60         data->location->regions << region;
61     }
62     data->setStrand(strand);
63     data->qualifiers.append(U2Qualifier("mismatches", QString::number(err)));
64     return data;
65 }
66 
toTable(const QList<FindAlgorithmResult> & res,const QString & name,bool splitCircular,int seqLen)67 QList<SharedAnnotationData> FindAlgorithmResult::toTable(const QList<FindAlgorithmResult> &res, const QString &name, bool splitCircular, int seqLen) {
68     QList<SharedAnnotationData> list;
69     foreach (const FindAlgorithmResult &f, res) {
70         list.append(f.toAnnotation(name, splitCircular, seqLen));
71     }
72     return list;
73 }
74 
lessByRegionStartPos(const U2::FindAlgorithmResult & r1,const U2::FindAlgorithmResult & r2)75 bool FindAlgorithmResult::lessByRegionStartPos(const U2::FindAlgorithmResult &r1, const U2::FindAlgorithmResult &r2) {
76     return r1.region.startPos < r2.region.startPos;
77 }
78 
79 class StrandContext {
80 public:
StrandContext(int width,int height,bool insDel,const char * p)81     StrandContext(int width, int height, bool insDel, const char *p)
82         : dynTable(width, height, insDel), pattern(p) {
83     }
84 
StrandContext(const char * data,int arr_size,const char * p)85     StrandContext(const char *data, int arr_size, const char *p)  // using rolling array only in subst mode
86         : rollArr(data, arr_size), pattern(p) {
87     }
88 
StrandContext()89     StrandContext()
90         : pattern(nullptr) {
91     }
92 
estimateRamUsageForOneContext(int width,int height)93     static quint64 estimateRamUsageForOneContext(int width, int height) {
94         return DynTable::estimateTableSizeInBytes(width, height);
95     }
96 
97     DynTable dynTable;
98     RollingArray<char> rollArr;
99     const char *pattern;
100     FindAlgorithmResult res;
101 };
102 
isDirect(FindAlgorithmStrand s)103 static bool isDirect(FindAlgorithmStrand s) {
104     return s == FindAlgorithmStrand_Both || s == FindAlgorithmStrand_Direct;
105 }
106 
isComplement(FindAlgorithmStrand s)107 static bool isComplement(FindAlgorithmStrand s) {
108     return s == FindAlgorithmStrand_Both || s == FindAlgorithmStrand_Complement;
109 }
110 
FindAlgorithmSettings(const QByteArray & pattern,FindAlgorithmStrand strand,DNATranslation * complementTT,DNATranslation * proteinTT,const DNAAlphabet * sequenceAlphabet,const U2Region & searchRegion,int maxErr,FindAlgorithmPatternSettings _patternSettings,bool ambBases,int _maxRegExpResult,int _maxResult2Find)111 FindAlgorithmSettings::FindAlgorithmSettings(const QByteArray &pattern, FindAlgorithmStrand strand, DNATranslation *complementTT, DNATranslation *proteinTT, const DNAAlphabet *sequenceAlphabet, const U2Region &searchRegion, int maxErr, FindAlgorithmPatternSettings _patternSettings, bool ambBases, int _maxRegExpResult, int _maxResult2Find)
112     : pattern(pattern), strand(strand), complementTT(complementTT), proteinTT(proteinTT), sequenceAlphabet(sequenceAlphabet),
113       searchRegion(searchRegion), maxErr(maxErr), patternSettings(_patternSettings),
114       useAmbiguousBases(ambBases), maxRegExpResultLength(_maxRegExpResult),
115       maxResult2Find(_maxResult2Find) {
116 }
117 
cycleIndex(qint64 segmentLen,qint64 index)118 static qint64 cycleIndex(qint64 segmentLen, qint64 index) {
119     return (index >= segmentLen ? index - segmentLen : index);
120 }
121 
getCircularOverlap(const char * seq,const U2Region & searchRange,int defaultCircularOverlap)122 static qint64 getCircularOverlap(const char *seq, const U2Region &searchRange, int defaultCircularOverlap) {
123     int seqLen = QByteArray(seq).size();
124     if (searchRange.length == seqLen && searchRange.startPos == 0) {
125         return defaultCircularOverlap;
126     }
127     return searchRange.endPos() - seqLen;
128 }
129 
getSearchEndPos(const char * seq,const U2Region & searchRange,int defaultCircularOverlap,bool searchIsCircular)130 static qint64 getSearchEndPos(const char *seq, const U2Region &searchRange, int defaultCircularOverlap, bool searchIsCircular) {
131     int seqLen = QByteArray(seq).size();
132     if (searchIsCircular && searchRange.length == seqLen && searchRange.startPos == 0) {
133         return searchRange.endPos() + defaultCircularOverlap;
134     }
135     return searchRange.endPos();
136 }
137 
138 // TODO: in BothStrands&SingleShot mode it's impossible to find result on complement strand if there also a result on direct strand from the same pos!
139 
findInAmino(FindAlgorithmResultsListener * rl,DNATranslation * aminoTT,DNATranslation * complTT,FindAlgorithmStrand strand,bool insDel,const char * seq,const U2Region & range,bool searchIsCircular,const char * pattern,int patternLen,int maxErr,int & stopFlag,int & percentsCompleted)140 static void findInAmino(FindAlgorithmResultsListener *rl,
141                         DNATranslation *aminoTT,
142                         DNATranslation *complTT,
143                         FindAlgorithmStrand strand,
144                         bool insDel,
145                         const char *seq,
146                         const U2Region &range,
147                         bool searchIsCircular,
148                         const char *pattern,
149                         int patternLen,
150                         int maxErr,
151                         int &stopFlag,
152                         int &percentsCompleted) {
153     SAFE_POINT(aminoTT->getSrcAlphabet()->isNucleic() && aminoTT->getDstAlphabet()->isAmino(),
154                "Invalid alphabet detected!", );
155 
156     int seqLen = QByteArray(seq).size();
157     int width = patternLen + maxErr;
158     int height = patternLen;
159 
160     QByteArray revPattern(pattern);
161     TextUtils::reverse(revPattern.data(), patternLen);
162 
163     StrandContext context[] = {
164         StrandContext(width, height, insDel, pattern),
165         StrandContext(width, height, insDel, pattern),
166         StrandContext(width, height, insDel, pattern),
167         StrandContext(width, height, insDel, revPattern.data()),
168         StrandContext(width, height, insDel, revPattern.data()),
169         StrandContext(width, height, insDel, revPattern.data())};
170 
171     int onePercentLen = range.length / 100;
172     int leftTillPercent = onePercentLen;
173 
174     percentsCompleted = 0;
175     int conStart = isDirect(strand) ? 0 : 1;
176     int conEnd = isComplement(strand) ? 2 : 1;
177     QByteArray complMap = complTT == nullptr ? QByteArray() : complTT->getOne2OneMapper();
178     int patternLenInNucl = 3 * patternLen;
179 
180     int end = getSearchEndPos(seq, range, patternLenInNucl - 1, searchIsCircular);
181     for (int i = range.startPos, translStrand = 0;
182          i < end - 2 && !stopFlag;
183          i++, leftTillPercent--, translStrand = translStrand == 2 ? 0 : translStrand + 1) {
184         for (int ci = conStart; ci < conEnd && !stopFlag; ci++) {
185             StrandContext &ctx = context[3 * ci + translStrand];
186             DynTable &dt = ctx.dynTable;
187             const char *p = ctx.pattern;
188             FindAlgorithmResult &res = ctx.res;
189 
190             for (int j = 0; j < patternLen && !stopFlag; j++) {  // TODO: optimize -> specialize loops
191                 int k = cycleIndex(seqLen, i);
192                 char amino = ci == 0 ? aminoTT->translate3to1(seq[k],
193                                                               seq[cycleIndex(seqLen, k + 1)],
194                                                               seq[cycleIndex(seqLen, k + 2)])
195                                      :  // direct amino
196                                  aminoTT->translate3to1(complMap.at((quint8)seq[cycleIndex(seqLen, k + 2)]),
197                                                         complMap.at((quint8)seq[cycleIndex(seqLen, k + 1)]),
198                                                         complMap.at((quint8)seq[k]));  // compl amino
199 
200                 bool matched = (amino == p[j]);
201                 dt.match(j, matched);
202             }
203             int err = dt.getLast();
204             if (!res.isEmpty() && (err > maxErr || (i - res.region.startPos) >= patternLenInNucl)) {
205                 rl->onResult(res);
206                 res.clear();
207             }
208             if (err <= maxErr) {
209                 int newLen = dt.getLastLen();
210                 newLen *= 3;
211                 if (res.isEmpty() || res.err > err || (res.err == err && newLen < res.region.length)) {
212                     SAFE_POINT(newLen + 3 * maxErr >= patternLenInNucl, "Internal algorithm error!", );
213                     int newStart = i - newLen + 3;
214                     bool boundaryCheck = range.contains(newStart) && range.contains(newStart + newLen - 1);
215                     bool circularBoundaryCheck = (!range.contains(newStart + newLen - 1) && searchIsCircular);
216                     if (insDel || boundaryCheck || circularBoundaryCheck) {  // boundary check for mismatch mode
217                         SAFE_POINT(insDel || newLen == patternLenInNucl, "Internal algorithm error!", );
218                         SAFE_POINT(newStart >= range.startPos, "Internal algorithm error!", );
219                         SAFE_POINT(searchIsCircular || newStart + newLen <= range.endPos(), "Internal algorithm error!", );
220 
221                         res.region.startPos = newStart;
222                         res.region.length = newLen;
223                         res.err = err;
224                         res.strand = (ci == 1) ? U2Strand::Complementary : U2Strand::Direct;
225                         res.translation = true;
226                     }
227                 }
228             }
229             dt.shiftColumn();
230             if (leftTillPercent == 0) {
231                 percentsCompleted = qMin(percentsCompleted + 1, 100);
232                 leftTillPercent = onePercentLen;
233             }
234         }  // ci
235     }
236     for (int i = 0; i < 6; i++) {
237         if (!context[i].res.isEmpty()) {  // todo: order by startpos?
238             SAFE_POINT(insDel || context[i].res.region.length == patternLenInNucl,
239                        "Internal algorithm error: found region has invalid length!", );
240             rl->onResult(context[i].res);
241         }
242     }
243 }
findInAmino_subst(FindAlgorithmResultsListener * rl,DNATranslation * aminoTT,DNATranslation * complTT,FindAlgorithmStrand strand,const char * seq,const U2Region & range,bool searchIsCircular,const char * pattern,int patternLen,int maxErr,int & stopFlag,int & percentsCompleted)244 static void findInAmino_subst(FindAlgorithmResultsListener *rl,
245                               DNATranslation *aminoTT,
246                               DNATranslation *complTT,
247                               FindAlgorithmStrand strand,
248                               const char *seq,
249                               const U2Region &range,
250                               bool searchIsCircular,
251                               const char *pattern,
252                               int patternLen,
253                               int maxErr,
254                               int &stopFlag,
255                               int &percentsCompleted) {
256     SAFE_POINT(nullptr != complTT && nullptr != aminoTT && aminoTT->getSrcAlphabet()->isNucleic() && aminoTT->getDstAlphabet()->isAmino(), "Invalid alphabet supplied!", );
257 
258     int seqLen = QByteArray(seq).size();
259     int patternLenInNucl = 3 * patternLen;
260     if (range.length < patternLenInNucl) {
261         return;
262     }
263 
264     QByteArray revPattern(pattern);
265 
266     QByteArray translatedPiece1(patternLen, 0);
267     QByteArray translatedPiece2(patternLen, 0);
268     QByteArray translatedPiece3(patternLen, 0);
269 
270     QByteArray seqExpanded;
271     seqExpanded.append(QByteArray(seq));
272     if (searchIsCircular) {
273         int bufferSize = getCircularOverlap(seq, range, patternLenInNucl - 1);
274         seqExpanded.append(QByteArray(seq, bufferSize));
275     }
276 
277     int tail = range.length - patternLenInNucl - 2 + (range.length == seqLen && range.startPos == 0 && searchIsCircular) * (patternLenInNucl - 1);
278     tail = tail > 0 ? 0 : tail;
279     aminoTT->translate(seqExpanded.data() + range.startPos, patternLenInNucl + tail + 2, translatedPiece1.data(), patternLen);
280     aminoTT->translate(seqExpanded.data() + range.startPos + 1, patternLenInNucl + tail + 1, translatedPiece2.data(), patternLen);
281     aminoTT->translate(seqExpanded.data() + range.startPos + 2, patternLenInNucl + tail, translatedPiece3.data(), patternLen);
282 
283     QByteArray translatedPiece1c(patternLen, 0);
284     QByteArray translatedPiece2c(patternLen, 0);
285     QByteArray translatedPiece3c(patternLen, 0);
286     QByteArray compl_seq(patternLenInNucl + tail + 2, 0);
287     complTT->translate(seq + range.startPos, patternLenInNucl + tail + 2, compl_seq.data(), patternLenInNucl + tail + 2);
288     TextUtils::reverse(compl_seq.data(), compl_seq.size());
289 
290     int compl_tail = qBound(0, int(range.length - patternLenInNucl), 2);
291     int if0slot = 0 == compl_tail;
292     int if1slot = 1 == compl_tail;
293     int if2slot = 2 == compl_tail;
294 
295     aminoTT->translate(compl_seq.data() + compl_tail, patternLenInNucl, translatedPiece1c.data(), patternLen);
296     aminoTT->translate(compl_seq.data() + 2 * if0slot + if2slot, patternLenInNucl - 2 * if0slot, translatedPiece2c.data(), patternLen);
297     aminoTT->translate(compl_seq.data() + 2 * if1slot + if0slot, patternLenInNucl - if0slot - if1slot, translatedPiece3c.data(), patternLen);
298 
299     StrandContext context[] = {
300         StrandContext(translatedPiece1, patternLen, pattern),
301         StrandContext(translatedPiece2, patternLen, pattern),
302         StrandContext(translatedPiece3, patternLen, pattern),
303         StrandContext(translatedPiece1c, patternLen, revPattern.data()),
304         StrandContext(translatedPiece2c, patternLen, revPattern.data()),
305         StrandContext(translatedPiece3c, patternLen, revPattern.data()),
306     };
307 
308     int onePercentLen = (range.length + searchIsCircular * (patternLenInNucl - 1)) / 100;
309     int leftTillPercent = onePercentLen;
310 
311     percentsCompleted = 0;
312     int conStart = isDirect(strand) ? 0 : 1;
313     int conEnd = isComplement(strand) ? 2 : 1;
314     QByteArray complMap = complTT ? complTT->getOne2OneMapper() : QByteArray();
315 
316     int end = getSearchEndPos(seq, range, patternLenInNucl - 1, searchIsCircular);
317     for (int i = range.startPos, translStrand = 0;
318          i < end - patternLenInNucl + 1 && !stopFlag;
319          i++, leftTillPercent--, translStrand = translStrand == 2 ? 0 : translStrand + 1) {
320         for (int ci = conStart; ci < conEnd && !stopFlag; ci++) {
321             StrandContext &ctx = context[ci * 3 + translStrand];
322             const char *p = ctx.pattern;
323             FindAlgorithmResult &res = ctx.res;
324             bool match = true;
325             int curErr = 0;
326             for (int j = 0; j < patternLen && !stopFlag; j++) {
327                 char rollchar = ctx.rollArr.get(j);
328                 if (rollchar != p[j] && ++curErr > maxErr) {
329                     match = false;
330                     break;
331                 }
332             }
333 
334             if (i + patternLenInNucl + 2 < end) {
335                 int wheree = i + patternLenInNucl;
336                 char c1 = ci ? complMap.at((quint8)seqExpanded[wheree + 2]) : seqExpanded[wheree];
337                 char c2 = ci ? complMap.at((quint8)seqExpanded[wheree + 1]) : seqExpanded[wheree + 1];
338                 char c3 = ci ? complMap.at((quint8)seqExpanded[wheree]) : seqExpanded[wheree + 2];
339                 char newchar = aminoTT->translate3to1(c1, c2, c3);
340                 ci ? ctx.rollArr.push_front_pop_back(newchar) : ctx.rollArr.push_back_pop_front(newchar);
341             }
342 
343             if (match) {
344                 res.region.startPos = i;
345                 res.region.length = patternLenInNucl;
346                 res.err = curErr;
347                 res.strand = (ci == 1) ? U2Strand::Complementary : U2Strand::Direct;
348                 res.translation = true;
349 
350                 rl->onResult(res);
351             }
352             if (leftTillPercent == 0) {
353                 percentsCompleted = qMin(percentsCompleted + 1, 100);
354                 leftTillPercent = onePercentLen;
355             }
356         }  // strand
357     }  // base pos
358 }
359 
createAmbiguousBaseMap()360 static char *createAmbiguousBaseMap() {
361     // Source: http://www.ncbi.nlm.nih.gov/blast/fasta.shtml
362     // Unknown symbol is zero: no match
363 
364     const int SIZE = 128;
365     static char map[SIZE];
366 
367     for (int i = 0; i < SIZE; i++) {
368         map[i] = 0x00;
369     }
370 
371     map['A'] = 0x01;  // Bitmask: 00000001
372     map['C'] = 0x02;  // Bitmask: 00000010
373     map['G'] = 0x04;  // Bitmask: 00000100
374     map['T'] = 0x08;  // Bitmask: 00001000
375     map['U'] = 0x08;  // Bitmask: 00001000
376     map['M'] = 0x03;  // Bitmask: 00000011
377     map['R'] = 0x05;  // Bitmask: 00000101
378     map['W'] = 0x09;  // Bitmask: 00001001
379     map['S'] = 0x06;  // Bitmask: 00000110
380     map['Y'] = 0x0A;  // Bitmask: 00001010
381     map['K'] = 0x0C;  // Bitmask: 00001100
382     map['V'] = 0x07;  // Bitmask: 00000111
383     map['H'] = 0x0B;  // Bitmask: 00001011
384     map['D'] = 0x0D;  // Bitmask: 00001101
385     map['B'] = 0x0E;  // Bitmask: 00001110
386     map['N'] = 0x0F;  // Bitmask: 00001111
387     map['X'] = 0x0F;  // Bitmask: 00001111
388 
389     return &map[0];
390 }
391 
cmpAmbiguous(char a,char b)392 bool FindAlgorithm::cmpAmbiguous(char a, char b) {
393     static char *charMap = createAmbiguousBaseMap();
394 
395     SAFE_POINT(a >= 0 && b >= 0, "Invalid characters supplied!", false);
396 
397     char c1 = charMap[uchar(a)];
398     char c2 = charMap[uchar(b)];
399 
400     return c1 & c2;
401 }
402 
match_pattern(const char * seq,const char * p,int start,int patternLen,int maxErr,int & curErr)403 inline bool match_pattern(const char *seq, const char *p, int start, int patternLen, int maxErr, int &curErr) {
404     bool match = true;
405     curErr = 0;
406     for (int j = 0; j < patternLen; j++) {
407         if (seq[start + j] != p[j] && ++curErr > maxErr) {
408             match = false;
409             break;
410         }
411     }
412     return match;
413 }
414 
match_pattern_ambiguous(const char * seq,const char * p,int start,int patternLen,int maxErr,int & curErr)415 inline bool match_pattern_ambiguous(const char *seq, const char *p, int start, int patternLen, int maxErr, int &curErr) {
416     bool match = true;
417     curErr = 0;
418     for (int j = 0; j < patternLen; j++) {
419         if (!FindAlgorithm::cmpAmbiguous(seq[start + j], p[j]) && ++curErr > maxErr) {
420             match = false;
421             break;
422         }
423     }
424     return match;
425 }
426 
427 // reverses found result if it's located on the reverse complement strand
prepareResultPosition(int regionStart,int regionLength,int & foundStart,int foundLength,U2Strand resultStrand)428 inline void prepareResultPosition(int regionStart, int regionLength, int &foundStart, int foundLength, U2Strand resultStrand) {
429     foundStart = resultStrand.isCompementary() ? regionStart + regionLength - foundStart - foundLength : regionStart + foundStart;
430 }
431 
sendResultToListener(int resultStartPos,int resultLength,U2Strand resultStrand,FindAlgorithmResultsListener * rl)432 static void sendResultToListener(int resultStartPos, int resultLength, U2Strand resultStrand, FindAlgorithmResultsListener *rl) {
433     SAFE_POINT(resultLength >= 0 && resultStartPos >= 0, "Invalid find algorithm results", );
434     CHECK(resultLength > 0, );  // zero-length regions may satisfy some regular expressions though they don't make sense
435 
436     FindAlgorithmResult res;
437     res.region.startPos = resultStartPos;
438     res.region.length = resultLength;
439     res.strand = resultStrand;
440 
441     rl->onResult(res);
442 }
443 
regExpSearch(const QString & refSequence,const QRegExp & regExp,const U2Strand & searchStrand,const U2Region & sequenceRange,int maxResultLen,int currentStrand,int tailCut,int totalStrandCount,bool refSeqIsAminoTranslation,int aminoFrameNumber,int & percentsCompleted,int & stopFlag,FindAlgorithmResultsListener * rl,int cyclePoint=-1)444 static void regExpSearch(const QString &refSequence,
445                          const QRegExp &regExp,
446                          const U2Strand &searchStrand,
447                          const U2Region &sequenceRange,
448                          int maxResultLen,
449                          int currentStrand,
450                          int tailCut,  // only for translation on complementary strand, it is size of the tail that can not form amino acid
451                          int totalStrandCount,
452                          bool refSeqIsAminoTranslation,
453                          int aminoFrameNumber,
454                          int &percentsCompleted,
455                          int &stopFlag,
456                          FindAlgorithmResultsListener *rl,
457                          int cyclePoint = -1)  // in cyclePoint position copy of the sequence beginning starts (for circular search)
458 {
459     if (cyclePoint == -1) {
460         cyclePoint = sequenceRange.endPos();
461     }
462     int percentsCompletedOnStart = percentsCompleted;
463     int foundStartPos = 0;
464     while (stopFlag == 0 && (foundStartPos = regExp.indexIn(refSequence, foundStartPos)) != -1) {
465         // remember that there are a few iterations, so a single one yields (1 / @conEnd) of total progress
466         int percentsCompletedInner = (100 * foundStartPos * (currentStrand + 1)) / (sequenceRange.length * totalStrandCount);
467         percentsCompleted = qMin(percentsCompletedOnStart + percentsCompletedInner, 100);
468 
469         int foundLength = regExp.matchedLength();
470         if (maxResultLen >= foundLength) {
471             int resultStartPos = refSeqIsAminoTranslation ? foundStartPos * 3 : foundStartPos;
472             if (resultStartPos < cyclePoint || sequenceRange.startPos != 0) {
473                 const int resultLen = refSeqIsAminoTranslation ? foundLength * 3 : foundLength;
474                 prepareResultPosition(sequenceRange.startPos + (refSeqIsAminoTranslation * aminoFrameNumber),
475                                       sequenceRange.length - (refSeqIsAminoTranslation * aminoFrameNumber),
476                                       resultStartPos,
477                                       resultLen,
478                                       searchStrand);
479                 resultStartPos -= (searchStrand.isCompementary() && refSeqIsAminoTranslation ? tailCut : 0);
480 
481                 sendResultToListener(resultStartPos, resultLen, searchStrand, rl);
482             }
483         }
484 
485         // try to find smaller substrings starting from the same position
486         int substrLength = qMin(foundLength - 1, maxResultLen);
487         while (0 == stopFlag && 0 < substrLength && foundStartPos == (regExp.indexIn(refSequence.left(foundStartPos + substrLength), foundStartPos))) {
488             const int foundSubstrLength = regExp.matchedLength();
489             if (maxResultLen >= foundSubstrLength) {
490                 int resultStartPos = refSeqIsAminoTranslation ? foundStartPos * 3 : foundStartPos;
491                 if (resultStartPos < cyclePoint || sequenceRange.startPos != 0) {
492                     int resultLen = refSeqIsAminoTranslation ? foundSubstrLength * 3 : foundSubstrLength;
493                     prepareResultPosition(sequenceRange.startPos + (refSeqIsAminoTranslation * aminoFrameNumber),
494                                           sequenceRange.length - (refSeqIsAminoTranslation * aminoFrameNumber),
495                                           resultStartPos,
496                                           resultLen,
497                                           searchStrand);
498                     resultStartPos -= (searchStrand.isCompementary() && refSeqIsAminoTranslation ? tailCut : 0);
499 
500                     sendResultToListener(resultStartPos, resultLen, searchStrand, rl);
501                 }
502             }
503             substrLength = foundSubstrLength - 1;
504         }
505 
506         ++foundStartPos;
507     }
508 }
509 
findInAmino_regExp(FindAlgorithmResultsListener * rl,DNATranslation * aminoTT,DNATranslation * complTT,FindAlgorithmStrand strand,const char * seq,const U2Region & range,bool searchIsCircular,const char * pattern,int maxRegExpResult,int & stopFlag,int & percentsCompleted)510 static void findInAmino_regExp(FindAlgorithmResultsListener *rl,
511                                DNATranslation *aminoTT,
512                                DNATranslation *complTT,
513                                FindAlgorithmStrand strand,
514                                const char *seq,
515                                const U2Region &range,
516                                bool searchIsCircular,
517                                const char *pattern,
518                                int maxRegExpResult,
519                                int &stopFlag,
520                                int &percentsCompleted) {
521     QRegExp regExp(pattern, Qt::CaseInsensitive);
522     CHECK(regExp.isValid(), );
523 
524     percentsCompleted = 0;
525 
526     int conStart = isDirect(strand) ? 0 : 1;
527     int conEnd = isComplement(strand) ? 2 : 1;
528 
529     int maxAminoResult = maxRegExpResult * 3;
530     int seqLen = QByteArray(seq).size();
531 
532     int bufferSize = 0;
533     const char *sequence = nullptr;
534     QByteArray temp;
535     if (searchIsCircular) {
536         bufferSize = getCircularOverlap(seq, range, (seqLen > maxRegExpResult) ? maxRegExpResult - 1 : seqLen - 1);
537         temp = QByteArray(seq) + QByteArray(seq, bufferSize);
538         sequence = temp.data();
539     } else {
540         sequence = seq;
541     }
542 
543     for (int ci = conStart; ci < conEnd && !stopFlag; ++ci) {
544         for (int aminoFrameNumber = 0; aminoFrameNumber < 3 && !stopFlag; aminoFrameNumber++) {
545             int len = seqLen - aminoFrameNumber - range.startPos;
546             if (range.startPos < seqLen && range.endPos() < seqLen) {
547                 len = range.length - aminoFrameNumber;
548             }
549 
550             QString translation;
551             QByteArray rawTranslation(len + bufferSize + 1, 0);
552             U2Strand resultStrand;
553             const int translationLen = (len + bufferSize) / 3;
554 
555             if (ci == 1) {  // complementary
556                 TextUtils::translate(complTT->getOne2OneMapper(), sequence + range.startPos + aminoFrameNumber, len + bufferSize, rawTranslation.data());
557                 TextUtils::reverse(rawTranslation.data(), len + bufferSize - (len + bufferSize) % 3);
558                 aminoTT->translate(rawTranslation.data(), len + bufferSize);
559 
560                 resultStrand = U2Strand::Complementary;
561             } else {  // direct
562                 qstrcpy(rawTranslation.data(), QByteArray(sequence + range.startPos + aminoFrameNumber, len + bufferSize).data());
563                 aminoTT->translate(rawTranslation.data(), len + bufferSize);
564 
565                 resultStrand = U2Strand::Direct;
566             }
567             translation = QString(QByteArray(rawTranslation.data(), translationLen));
568 
569             if (searchIsCircular) {
570                 U2Region cirRange = range;
571                 cirRange.length += (seqLen == range.length && range.startPos == 0) ? bufferSize : 0;
572 
573                 regExpSearch(translation, regExp, resultStrand, cirRange, maxRegExpResult, ci, (len + bufferSize) % 3, conEnd, true, aminoFrameNumber, percentsCompleted, stopFlag, rl, len);
574             } else {
575                 regExpSearch(translation, regExp, resultStrand, range, maxAminoResult, ci, (len + bufferSize) % 3, conEnd, true, aminoFrameNumber, percentsCompleted, stopFlag, rl);
576             }
577         }
578     }
579 }
580 
findRegExp(FindAlgorithmResultsListener * rl,DNATranslation * aminoTT,DNATranslation * complTT,FindAlgorithmStrand strand,const char * seq,bool searchIsCircular,const U2Region & range,const char * pattern,int maxRegExpResult,int & stopFlag,int & percentsCompleted)581 static void findRegExp(FindAlgorithmResultsListener *rl,
582                        DNATranslation *aminoTT,
583                        DNATranslation *complTT,
584                        FindAlgorithmStrand strand,
585                        const char *seq,
586                        bool searchIsCircular,
587                        const U2Region &range,
588                        const char *pattern,
589                        int maxRegExpResult,
590                        int &stopFlag,
591                        int &percentsCompleted) {
592     QRegExp regExp(pattern, Qt::CaseInsensitive);
593     CHECK(regExp.isValid(), );
594 
595     if (aminoTT != nullptr) {
596         findInAmino_regExp(rl, aminoTT, complTT, strand, seq, range, searchIsCircular, pattern, maxRegExpResult, stopFlag, percentsCompleted);
597         return;
598     }
599 
600     percentsCompleted = 0;
601 
602     int seqLen = QByteArray(seq).size();
603     const int conStart = isDirect(strand) ? 0 : 1;
604     const int conEnd = isComplement(strand) ? 2 : 1;
605 
606     for (int ci = conStart; ci < conEnd && !stopFlag; ++ci) {
607         QString substr;
608         QByteArray tmp(range.length + 1, 0);
609         U2Strand resultStrand;
610 
611         if (ci == 1) {  // complementary
612             char *complSeq = tmp.data();
613             TextUtils::translate(complTT->getOne2OneMapper(), seq + range.startPos, qMin(range.length, seqLen - range.startPos), complSeq);
614             TextUtils::reverse(complSeq, qMin(range.length, seqLen - range.startPos));
615             substr = QString(QByteArray(complSeq, qMin(range.length, seqLen - range.startPos)));
616             if (searchIsCircular) {
617                 int bufferSize = getCircularOverlap(seq, range, (range.length > maxRegExpResult) ? maxRegExpResult - 1 : range.length);
618                 QByteArray buffer(bufferSize, 0);
619                 char *complBuffer = buffer.data();
620                 TextUtils::translate(complTT->getOne2OneMapper(), seq, bufferSize, complBuffer);
621                 TextUtils::reverse(complBuffer, bufferSize);
622                 substr.prepend(buffer);
623             }
624 
625             resultStrand = U2Strand::Complementary;
626         } else {  // direct
627             substr = QString(QByteArray(seq + range.startPos,
628                                         qMin(range.length, seqLen - range.startPos)));
629             if (searchIsCircular) {
630                 int bufferSize = getCircularOverlap(seq, range, (range.length > maxRegExpResult) ? maxRegExpResult - 1 : range.length);
631                 substr += QString(QByteArray(seq, bufferSize));
632             }
633             resultStrand = U2Strand::Direct;
634         }
635 
636         if (searchIsCircular) {
637             U2Region cirRange = range;
638             if (range.length == seqLen && range.startPos == 0) {
639                 cirRange.length += (range.length > maxRegExpResult) ? maxRegExpResult - 1 : range.length;
640             }
641             regExpSearch(substr, regExp, resultStrand, cirRange, maxRegExpResult, ci, 0, conEnd, false, 0, percentsCompleted, stopFlag, rl, seqLen);
642         } else {
643             regExpSearch(substr, regExp, resultStrand, range, maxRegExpResult, ci, 0, conEnd, false, 0, percentsCompleted, stopFlag, rl);
644         }
645     }
646 }
647 
find_subst(FindAlgorithmResultsListener * rl,DNATranslation * aminoTT,DNATranslation * complTT,FindAlgorithmStrand strand,const char * seq,bool searchIsCircular,const U2Region & range,const char * pattern,int patternLen,bool useAmbiguousBases,int maxErr,int & stopFlag,int & percentsCompleted)648 static void find_subst(FindAlgorithmResultsListener *rl,
649                        DNATranslation *aminoTT,
650                        DNATranslation *complTT,
651                        FindAlgorithmStrand strand,
652                        const char *seq,
653                        bool searchIsCircular,
654                        const U2Region &range,
655                        const char *pattern,
656                        int patternLen,
657                        bool useAmbiguousBases,
658                        int maxErr,
659                        int &stopFlag,
660                        int &percentsCompleted) {
661     SAFE_POINT(nullptr == complTT || complTT->isOne2One(), "Invalid translation supplied!", );
662 
663     if (aminoTT != nullptr) {
664         findInAmino_subst(rl, aminoTT, complTT, strand, seq, range, searchIsCircular, pattern, patternLen, maxErr, stopFlag, percentsCompleted);
665         return;
666     }
667     if (range.length - patternLen < 0) {
668         return;
669     }
670     char *complPattern = nullptr;
671     QByteArray tmp;
672     if (isComplement(strand)) {
673         SAFE_POINT(nullptr != complTT, "Invalid translation supplied!", );
674         tmp.resize(patternLen);
675         complPattern = tmp.data();
676         TextUtils::translate(complTT->getOne2OneMapper(), pattern, patternLen, complPattern);
677         TextUtils::reverse(complPattern, patternLen);
678     }
679 
680     StrandContext context[] = {
681         StrandContext(0, 0, false, pattern),
682         StrandContext(0, 0, false, complPattern)};
683 
684     int onePercentLen = range.length / 100;
685     int leftTillPercent = onePercentLen;
686     percentsCompleted = 0;
687 
688     int conStart = isDirect(strand) ? 0 : 1;
689     int conEnd = isComplement(strand) ? 2 : 1;
690     SAFE_POINT(conStart < conEnd, "Internal algorithm error: incorrect strand order!", );
691 
692     const char *sequence = nullptr;
693     QByteArray temp;
694     int end = range.endPos();
695     if (searchIsCircular) {
696         int beginningSize = getCircularOverlap(seq, range, patternLen - 1);
697         end = getSearchEndPos(seq, range, getCircularOverlap(seq, range, patternLen - 1), searchIsCircular);
698         temp = QByteArray(seq) + QByteArray(seq, beginningSize);
699         sequence = temp.data();
700     } else {
701         sequence = seq;
702     }
703     for (int i = range.startPos;
704          i < end - patternLen + 1 && !stopFlag;
705          i++, leftTillPercent--) {
706         for (int ci = conStart; ci < conEnd && !stopFlag; ci++) {
707             StrandContext &ctx = context[ci];
708             const char *p = ctx.pattern;
709             FindAlgorithmResult &res = ctx.res;
710 
711             bool match = true;
712             int curErr = 0;
713             if (useAmbiguousBases) {
714                 match = match_pattern_ambiguous(sequence, p, i, patternLen, maxErr, curErr);
715             } else {
716                 match = match_pattern(sequence, p, i, patternLen, maxErr, curErr);
717             }
718             if (match) {
719                 res.region.startPos = i;
720                 res.region.length = patternLen;
721                 res.err = curErr;
722                 res.strand = (ci == 1) ? U2Strand::Complementary : U2Strand::Direct;
723 
724                 rl->onResult(res);
725             }
726 
727             if (leftTillPercent == 0) {
728                 percentsCompleted = qMin(percentsCompleted + 1, 100);
729                 leftTillPercent = onePercentLen;
730             }
731         }  // strand
732     }  // base pos
733 }
734 
735 // value 12 - standart "out of memory" error code in linux
736 // error in FindAlgorithmResult also means number of mismatches, therefore positive values can cause confusion
737 const int FindAlgorithmResult::NOT_ENOUGH_MEMORY_ERROR = -12;
738 
find(FindAlgorithmResultsListener * rl,DNATranslation * aminoTT,DNATranslation * complTT,FindAlgorithmStrand strand,FindAlgorithmPatternSettings patternSettings,bool useAmbiguousBases,const char * seq,int seqLen,const DNAAlphabet * sequenceAlphabet,bool searchIsCircular,const U2Region & range,const char * pattern,int patternLen,int maxErr,int maxRegExpResult,int & stopFlag,int & percentsCompleted)739 void FindAlgorithm::find(
740     FindAlgorithmResultsListener *rl,
741     DNATranslation *aminoTT,  // if aminoTT!=NULL -> pattern must contain amino data and sequence must contain DNA data
742     DNATranslation *complTT,  // if complTT!=NULL -> sequence is complemented before comparison with pattern
743     FindAlgorithmStrand strand,  // if not direct there complTT must not be NULL
744     FindAlgorithmPatternSettings patternSettings,
745     bool useAmbiguousBases,
746     const char *seq,
747     int seqLen,
748     const DNAAlphabet *sequenceAlphabet,
749     bool searchIsCircular,
750     const U2Region &range,
751     const char *pattern,
752     int patternLen,
753     int maxErr,
754     int maxRegExpResult,
755     int &stopFlag,
756     int &percentsCompleted) {
757     Q_UNUSED(seqLen);
758     SAFE_POINT(nullptr == complTT || complTT->isOne2One(), "Invalid translation supplied!", );
759     SAFE_POINT(patternLen > maxErr, "Invalid maximum error count supplied!", );
760 
761     if (range.endPos() > seqLen) {
762         searchIsCircular = true;
763     }
764     // no need to search circular on non-circular region
765     if (range.startPos < seqLen && range.endPos() < seqLen) {
766         searchIsCircular = false;
767     }
768 
769     DNATranslation *newComplTT = complTT;
770     if (complTT != nullptr) {
771         const DNAAlphabet *destinationAlphabet = complTT->getDstAlphabet();
772         if (destinationAlphabet->isExtended()) {
773             newComplTT = complTT;
774         } else {
775             newComplTT = AppContext::getDNATranslationRegistry()->lookupComplementTranslation(U2AlphabetUtils::getExtendedAlphabet(destinationAlphabet));
776         }
777     }
778 
779     if (patternSettings == FindAlgorithmPatternSettings_RegExp) {
780         findRegExp(rl, aminoTT, newComplTT, strand, seq, searchIsCircular, range, pattern, maxRegExpResult, stopFlag, percentsCompleted);
781         return;
782     }
783 
784     if (patternSettings == FindAlgorithmPatternSettings_Subst) {
785         find_subst(rl, aminoTT, newComplTT, strand, seq, searchIsCircular, range, pattern, patternLen, useAmbiguousBases, maxErr, stopFlag, percentsCompleted);
786         return;
787     }
788 
789     bool insDel = (patternSettings == FindAlgorithmPatternSettings_InsDel);
790 
791     if (patternSettings == FindAlgorithmPatternSettings_Exact && sequenceAlphabet != nullptr && aminoTT == nullptr) {
792         // exact search -> do not run any search if pattern has illegal symbols
793         if (!U2AlphabetUtils::matches(sequenceAlphabet, pattern, patternLen)) {
794             return;
795         }
796     }
797 
798     if (aminoTT != nullptr) {
799         findInAmino(rl, aminoTT, newComplTT, strand, insDel, seq, range, searchIsCircular, pattern, patternLen, maxErr, stopFlag, percentsCompleted);
800         return;
801     }
802     char *complPattern = nullptr;
803     QByteArray tmp;
804     if (isComplement(strand)) {
805         SAFE_POINT(nullptr != newComplTT, "Invalid translation supplied!", );
806         tmp.resize(patternLen);
807         complPattern = tmp.data();
808         TextUtils::translate(newComplTT->getOne2OneMapper(), pattern, patternLen, complPattern);
809         TextUtils::reverse(complPattern, patternLen);
810     }
811 
812     int width = patternLen + maxErr;
813     int height = patternLen;
814 
815     if (width > INT_MAX / height) {
816         const FindAlgorithmResult result(FindAlgorithmResult::NOT_ENOUGH_MEMORY_ERROR);
817         rl->onResult(result);
818         return;
819     }
820 
821     try {
822         StrandContext context[] = {
823             StrandContext(width, height, insDel, pattern),
824             StrandContext(width, height, insDel, complPattern)};
825 
826         int onePercentLen = range.length / 100;
827         int leftTillPercent = onePercentLen;
828         percentsCompleted = 0;
829 
830         int conStart = isDirect(strand) ? 0 : 1;
831         int conEnd = isComplement(strand) ? 2 : 1;
832         SAFE_POINT(conStart < conEnd, "Internal algorithm error: incorrect strand order!", );
833 
834         int end = getSearchEndPos(seq, range, patternLen - 1, searchIsCircular);
835         for (int i = range.startPos; i < end && !stopFlag; i++, leftTillPercent--) {
836             for (int ci = conStart; ci < conEnd && !stopFlag; ci++) {
837                 StrandContext &ctx = context[ci];
838                 DynTable &dt = ctx.dynTable;
839                 const char *p = ctx.pattern;
840                 FindAlgorithmResult &res = ctx.res;
841 
842                 for (int j = 0; j < patternLen && !stopFlag; j++) {
843                     bool matched = seq[cycleIndex(seqLen, i)] == p[j];
844                     dt.match(j, matched);
845                 }
846 
847                 int err = dt.getLast();
848 
849                 if (!res.isEmpty() && (err > maxErr || (i - res.region.startPos) >= patternLen)) {
850                     rl->onResult(res);
851                     res.clear();
852                 }
853 
854                 if (err <= maxErr) {
855                     int newLen = dt.getLastLen();
856                     if (res.isEmpty() || res.err > err || (res.err == err && newLen < res.region.length)) {
857                         int newStart = i - newLen + 1;
858                         bool boundaryCheck = (range.contains(newStart) && range.contains(newStart + newLen - 1));
859                         bool circularBoundaryCheck = (!range.contains(newStart + newLen - 1) && searchIsCircular);
860                         if (insDel || boundaryCheck || circularBoundaryCheck) {  // boundary check for mismatch mode
861                             SAFE_POINT(insDel || newLen == patternLen, "Internal algorithm error!", );
862                             SAFE_POINT(newStart >= range.startPos, "Internal algorithm error!", );
863                             SAFE_POINT(searchIsCircular || newStart + newLen <= range.endPos(), "Internal algorithm error!", );
864 
865                             res.region.startPos = newStart;
866                             res.region.length = newLen;
867                             res.err = err;
868                             res.strand = (ci == 1) ? U2Strand::Complementary : U2Strand::Direct;
869                             res.translation = aminoTT != nullptr;
870                         }
871                     }
872                 }
873 
874                 dt.shiftColumn();
875                 if (leftTillPercent == 0) {
876                     percentsCompleted = qMin(percentsCompleted + 1, 100);
877                     leftTillPercent = onePercentLen;
878                 }
879             }  // strand
880         }  // base pos
881 
882         for (int i = 0; i < 2; i++) {
883             if (!context[i].res.isEmpty()) {  // todo: order by startpos?
884                 SAFE_POINT(insDel || context[i].res.region.length == patternLen,
885                            "Internal algorithm error: found region has invalid length!", );
886                 rl->onResult(context[i].res);
887             }
888         }
889 
890     } catch (std::bad_alloc &) {
891         const FindAlgorithmResult result(FindAlgorithmResult::NOT_ENOUGH_MEMORY_ERROR);
892         rl->onResult(result);
893         return;
894     }
895 }
896 
estimateRamUsageInMbytes(const FindAlgorithmPatternSettings patternSettings,const bool searchInAminoTT,const int patternLength,const int maxError)897 int FindAlgorithm::estimateRamUsageInMbytes(const FindAlgorithmPatternSettings patternSettings,
898                                             const bool searchInAminoTT,
899                                             const int patternLength,
900                                             const int maxError) {
901     const int bytesToMbytesFactor = 1048576;
902     quint64 ramUsage = 0;
903 
904     if (FindAlgorithmPatternSettings_InsDel == patternSettings) {
905         ramUsage = 2 * StrandContext::estimateRamUsageForOneContext(patternLength + maxError,
906                                                                     patternLength);
907         if (searchInAminoTT) {
908             ramUsage *= 4;
909         }
910     } else if (FindAlgorithmPatternSettings_Subst == patternSettings && searchInAminoTT)
911         ramUsage = 7 * patternLength * sizeof(char);
912 
913     return ramUsage / bytesToMbytesFactor;
914 }
915 
916 }  // namespace U2
917