1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "FindAlgorithm.h"
23
24 #include <QRegExp>
25
26 #include <U2Algorithm/DynTable.h>
27 #include <U2Algorithm/RollingArray.h>
28
29 #include <U2Core/AppContext.h>
30 #include <U2Core/DNAAlphabet.h>
31 #include <U2Core/DNATranslation.h>
32 #include <U2Core/Log.h>
33 #include <U2Core/TextUtils.h>
34 #include <U2Core/U2AlphabetUtils.h>
35 #include <U2Core/U2SafePoints.h>
36
37 namespace U2 {
38
clear()39 void FindAlgorithmResult::clear() {
40 region.startPos = 0;
41 region.length = 0;
42 translation = false;
43 strand = U2Strand::Direct;
44 err = 0;
45 }
46
toAnnotation(const QString & name,bool splitCircular,int seqLen) const47 SharedAnnotationData FindAlgorithmResult::toAnnotation(const QString &name, bool splitCircular, int seqLen) const {
48 SAFE_POINT(!splitCircular || (seqLen != -1), "Sequence length is not set!", SharedAnnotationData());
49 SharedAnnotationData data(new AnnotationData);
50 data->name = name;
51 if (splitCircular && (region.endPos() > seqLen)) {
52 if (region.startPos >= seqLen) {
53 data->location->regions << U2Region(region.startPos - seqLen, region.length);
54 } else {
55 SAFE_POINT(region.startPos < seqLen, "Region is not correct", SharedAnnotationData());
56 data->location->regions << U2Region(region.startPos, seqLen - region.startPos);
57 data->location->regions << U2Region(0, region.length - (seqLen - region.startPos));
58 }
59 } else {
60 data->location->regions << region;
61 }
62 data->setStrand(strand);
63 data->qualifiers.append(U2Qualifier("mismatches", QString::number(err)));
64 return data;
65 }
66
toTable(const QList<FindAlgorithmResult> & res,const QString & name,bool splitCircular,int seqLen)67 QList<SharedAnnotationData> FindAlgorithmResult::toTable(const QList<FindAlgorithmResult> &res, const QString &name, bool splitCircular, int seqLen) {
68 QList<SharedAnnotationData> list;
69 foreach (const FindAlgorithmResult &f, res) {
70 list.append(f.toAnnotation(name, splitCircular, seqLen));
71 }
72 return list;
73 }
74
lessByRegionStartPos(const U2::FindAlgorithmResult & r1,const U2::FindAlgorithmResult & r2)75 bool FindAlgorithmResult::lessByRegionStartPos(const U2::FindAlgorithmResult &r1, const U2::FindAlgorithmResult &r2) {
76 return r1.region.startPos < r2.region.startPos;
77 }
78
79 class StrandContext {
80 public:
StrandContext(int width,int height,bool insDel,const char * p)81 StrandContext(int width, int height, bool insDel, const char *p)
82 : dynTable(width, height, insDel), pattern(p) {
83 }
84
StrandContext(const char * data,int arr_size,const char * p)85 StrandContext(const char *data, int arr_size, const char *p) // using rolling array only in subst mode
86 : rollArr(data, arr_size), pattern(p) {
87 }
88
StrandContext()89 StrandContext()
90 : pattern(nullptr) {
91 }
92
estimateRamUsageForOneContext(int width,int height)93 static quint64 estimateRamUsageForOneContext(int width, int height) {
94 return DynTable::estimateTableSizeInBytes(width, height);
95 }
96
97 DynTable dynTable;
98 RollingArray<char> rollArr;
99 const char *pattern;
100 FindAlgorithmResult res;
101 };
102
isDirect(FindAlgorithmStrand s)103 static bool isDirect(FindAlgorithmStrand s) {
104 return s == FindAlgorithmStrand_Both || s == FindAlgorithmStrand_Direct;
105 }
106
isComplement(FindAlgorithmStrand s)107 static bool isComplement(FindAlgorithmStrand s) {
108 return s == FindAlgorithmStrand_Both || s == FindAlgorithmStrand_Complement;
109 }
110
FindAlgorithmSettings(const QByteArray & pattern,FindAlgorithmStrand strand,DNATranslation * complementTT,DNATranslation * proteinTT,const DNAAlphabet * sequenceAlphabet,const U2Region & searchRegion,int maxErr,FindAlgorithmPatternSettings _patternSettings,bool ambBases,int _maxRegExpResult,int _maxResult2Find)111 FindAlgorithmSettings::FindAlgorithmSettings(const QByteArray &pattern, FindAlgorithmStrand strand, DNATranslation *complementTT, DNATranslation *proteinTT, const DNAAlphabet *sequenceAlphabet, const U2Region &searchRegion, int maxErr, FindAlgorithmPatternSettings _patternSettings, bool ambBases, int _maxRegExpResult, int _maxResult2Find)
112 : pattern(pattern), strand(strand), complementTT(complementTT), proteinTT(proteinTT), sequenceAlphabet(sequenceAlphabet),
113 searchRegion(searchRegion), maxErr(maxErr), patternSettings(_patternSettings),
114 useAmbiguousBases(ambBases), maxRegExpResultLength(_maxRegExpResult),
115 maxResult2Find(_maxResult2Find) {
116 }
117
cycleIndex(qint64 segmentLen,qint64 index)118 static qint64 cycleIndex(qint64 segmentLen, qint64 index) {
119 return (index >= segmentLen ? index - segmentLen : index);
120 }
121
getCircularOverlap(const char * seq,const U2Region & searchRange,int defaultCircularOverlap)122 static qint64 getCircularOverlap(const char *seq, const U2Region &searchRange, int defaultCircularOverlap) {
123 int seqLen = QByteArray(seq).size();
124 if (searchRange.length == seqLen && searchRange.startPos == 0) {
125 return defaultCircularOverlap;
126 }
127 return searchRange.endPos() - seqLen;
128 }
129
getSearchEndPos(const char * seq,const U2Region & searchRange,int defaultCircularOverlap,bool searchIsCircular)130 static qint64 getSearchEndPos(const char *seq, const U2Region &searchRange, int defaultCircularOverlap, bool searchIsCircular) {
131 int seqLen = QByteArray(seq).size();
132 if (searchIsCircular && searchRange.length == seqLen && searchRange.startPos == 0) {
133 return searchRange.endPos() + defaultCircularOverlap;
134 }
135 return searchRange.endPos();
136 }
137
138 // TODO: in BothStrands&SingleShot mode it's impossible to find result on complement strand if there also a result on direct strand from the same pos!
139
findInAmino(FindAlgorithmResultsListener * rl,DNATranslation * aminoTT,DNATranslation * complTT,FindAlgorithmStrand strand,bool insDel,const char * seq,const U2Region & range,bool searchIsCircular,const char * pattern,int patternLen,int maxErr,int & stopFlag,int & percentsCompleted)140 static void findInAmino(FindAlgorithmResultsListener *rl,
141 DNATranslation *aminoTT,
142 DNATranslation *complTT,
143 FindAlgorithmStrand strand,
144 bool insDel,
145 const char *seq,
146 const U2Region &range,
147 bool searchIsCircular,
148 const char *pattern,
149 int patternLen,
150 int maxErr,
151 int &stopFlag,
152 int &percentsCompleted) {
153 SAFE_POINT(aminoTT->getSrcAlphabet()->isNucleic() && aminoTT->getDstAlphabet()->isAmino(),
154 "Invalid alphabet detected!", );
155
156 int seqLen = QByteArray(seq).size();
157 int width = patternLen + maxErr;
158 int height = patternLen;
159
160 QByteArray revPattern(pattern);
161 TextUtils::reverse(revPattern.data(), patternLen);
162
163 StrandContext context[] = {
164 StrandContext(width, height, insDel, pattern),
165 StrandContext(width, height, insDel, pattern),
166 StrandContext(width, height, insDel, pattern),
167 StrandContext(width, height, insDel, revPattern.data()),
168 StrandContext(width, height, insDel, revPattern.data()),
169 StrandContext(width, height, insDel, revPattern.data())};
170
171 int onePercentLen = range.length / 100;
172 int leftTillPercent = onePercentLen;
173
174 percentsCompleted = 0;
175 int conStart = isDirect(strand) ? 0 : 1;
176 int conEnd = isComplement(strand) ? 2 : 1;
177 QByteArray complMap = complTT == nullptr ? QByteArray() : complTT->getOne2OneMapper();
178 int patternLenInNucl = 3 * patternLen;
179
180 int end = getSearchEndPos(seq, range, patternLenInNucl - 1, searchIsCircular);
181 for (int i = range.startPos, translStrand = 0;
182 i < end - 2 && !stopFlag;
183 i++, leftTillPercent--, translStrand = translStrand == 2 ? 0 : translStrand + 1) {
184 for (int ci = conStart; ci < conEnd && !stopFlag; ci++) {
185 StrandContext &ctx = context[3 * ci + translStrand];
186 DynTable &dt = ctx.dynTable;
187 const char *p = ctx.pattern;
188 FindAlgorithmResult &res = ctx.res;
189
190 for (int j = 0; j < patternLen && !stopFlag; j++) { // TODO: optimize -> specialize loops
191 int k = cycleIndex(seqLen, i);
192 char amino = ci == 0 ? aminoTT->translate3to1(seq[k],
193 seq[cycleIndex(seqLen, k + 1)],
194 seq[cycleIndex(seqLen, k + 2)])
195 : // direct amino
196 aminoTT->translate3to1(complMap.at((quint8)seq[cycleIndex(seqLen, k + 2)]),
197 complMap.at((quint8)seq[cycleIndex(seqLen, k + 1)]),
198 complMap.at((quint8)seq[k])); // compl amino
199
200 bool matched = (amino == p[j]);
201 dt.match(j, matched);
202 }
203 int err = dt.getLast();
204 if (!res.isEmpty() && (err > maxErr || (i - res.region.startPos) >= patternLenInNucl)) {
205 rl->onResult(res);
206 res.clear();
207 }
208 if (err <= maxErr) {
209 int newLen = dt.getLastLen();
210 newLen *= 3;
211 if (res.isEmpty() || res.err > err || (res.err == err && newLen < res.region.length)) {
212 SAFE_POINT(newLen + 3 * maxErr >= patternLenInNucl, "Internal algorithm error!", );
213 int newStart = i - newLen + 3;
214 bool boundaryCheck = range.contains(newStart) && range.contains(newStart + newLen - 1);
215 bool circularBoundaryCheck = (!range.contains(newStart + newLen - 1) && searchIsCircular);
216 if (insDel || boundaryCheck || circularBoundaryCheck) { // boundary check for mismatch mode
217 SAFE_POINT(insDel || newLen == patternLenInNucl, "Internal algorithm error!", );
218 SAFE_POINT(newStart >= range.startPos, "Internal algorithm error!", );
219 SAFE_POINT(searchIsCircular || newStart + newLen <= range.endPos(), "Internal algorithm error!", );
220
221 res.region.startPos = newStart;
222 res.region.length = newLen;
223 res.err = err;
224 res.strand = (ci == 1) ? U2Strand::Complementary : U2Strand::Direct;
225 res.translation = true;
226 }
227 }
228 }
229 dt.shiftColumn();
230 if (leftTillPercent == 0) {
231 percentsCompleted = qMin(percentsCompleted + 1, 100);
232 leftTillPercent = onePercentLen;
233 }
234 } // ci
235 }
236 for (int i = 0; i < 6; i++) {
237 if (!context[i].res.isEmpty()) { // todo: order by startpos?
238 SAFE_POINT(insDel || context[i].res.region.length == patternLenInNucl,
239 "Internal algorithm error: found region has invalid length!", );
240 rl->onResult(context[i].res);
241 }
242 }
243 }
findInAmino_subst(FindAlgorithmResultsListener * rl,DNATranslation * aminoTT,DNATranslation * complTT,FindAlgorithmStrand strand,const char * seq,const U2Region & range,bool searchIsCircular,const char * pattern,int patternLen,int maxErr,int & stopFlag,int & percentsCompleted)244 static void findInAmino_subst(FindAlgorithmResultsListener *rl,
245 DNATranslation *aminoTT,
246 DNATranslation *complTT,
247 FindAlgorithmStrand strand,
248 const char *seq,
249 const U2Region &range,
250 bool searchIsCircular,
251 const char *pattern,
252 int patternLen,
253 int maxErr,
254 int &stopFlag,
255 int &percentsCompleted) {
256 SAFE_POINT(nullptr != complTT && nullptr != aminoTT && aminoTT->getSrcAlphabet()->isNucleic() && aminoTT->getDstAlphabet()->isAmino(), "Invalid alphabet supplied!", );
257
258 int seqLen = QByteArray(seq).size();
259 int patternLenInNucl = 3 * patternLen;
260 if (range.length < patternLenInNucl) {
261 return;
262 }
263
264 QByteArray revPattern(pattern);
265
266 QByteArray translatedPiece1(patternLen, 0);
267 QByteArray translatedPiece2(patternLen, 0);
268 QByteArray translatedPiece3(patternLen, 0);
269
270 QByteArray seqExpanded;
271 seqExpanded.append(QByteArray(seq));
272 if (searchIsCircular) {
273 int bufferSize = getCircularOverlap(seq, range, patternLenInNucl - 1);
274 seqExpanded.append(QByteArray(seq, bufferSize));
275 }
276
277 int tail = range.length - patternLenInNucl - 2 + (range.length == seqLen && range.startPos == 0 && searchIsCircular) * (patternLenInNucl - 1);
278 tail = tail > 0 ? 0 : tail;
279 aminoTT->translate(seqExpanded.data() + range.startPos, patternLenInNucl + tail + 2, translatedPiece1.data(), patternLen);
280 aminoTT->translate(seqExpanded.data() + range.startPos + 1, patternLenInNucl + tail + 1, translatedPiece2.data(), patternLen);
281 aminoTT->translate(seqExpanded.data() + range.startPos + 2, patternLenInNucl + tail, translatedPiece3.data(), patternLen);
282
283 QByteArray translatedPiece1c(patternLen, 0);
284 QByteArray translatedPiece2c(patternLen, 0);
285 QByteArray translatedPiece3c(patternLen, 0);
286 QByteArray compl_seq(patternLenInNucl + tail + 2, 0);
287 complTT->translate(seq + range.startPos, patternLenInNucl + tail + 2, compl_seq.data(), patternLenInNucl + tail + 2);
288 TextUtils::reverse(compl_seq.data(), compl_seq.size());
289
290 int compl_tail = qBound(0, int(range.length - patternLenInNucl), 2);
291 int if0slot = 0 == compl_tail;
292 int if1slot = 1 == compl_tail;
293 int if2slot = 2 == compl_tail;
294
295 aminoTT->translate(compl_seq.data() + compl_tail, patternLenInNucl, translatedPiece1c.data(), patternLen);
296 aminoTT->translate(compl_seq.data() + 2 * if0slot + if2slot, patternLenInNucl - 2 * if0slot, translatedPiece2c.data(), patternLen);
297 aminoTT->translate(compl_seq.data() + 2 * if1slot + if0slot, patternLenInNucl - if0slot - if1slot, translatedPiece3c.data(), patternLen);
298
299 StrandContext context[] = {
300 StrandContext(translatedPiece1, patternLen, pattern),
301 StrandContext(translatedPiece2, patternLen, pattern),
302 StrandContext(translatedPiece3, patternLen, pattern),
303 StrandContext(translatedPiece1c, patternLen, revPattern.data()),
304 StrandContext(translatedPiece2c, patternLen, revPattern.data()),
305 StrandContext(translatedPiece3c, patternLen, revPattern.data()),
306 };
307
308 int onePercentLen = (range.length + searchIsCircular * (patternLenInNucl - 1)) / 100;
309 int leftTillPercent = onePercentLen;
310
311 percentsCompleted = 0;
312 int conStart = isDirect(strand) ? 0 : 1;
313 int conEnd = isComplement(strand) ? 2 : 1;
314 QByteArray complMap = complTT ? complTT->getOne2OneMapper() : QByteArray();
315
316 int end = getSearchEndPos(seq, range, patternLenInNucl - 1, searchIsCircular);
317 for (int i = range.startPos, translStrand = 0;
318 i < end - patternLenInNucl + 1 && !stopFlag;
319 i++, leftTillPercent--, translStrand = translStrand == 2 ? 0 : translStrand + 1) {
320 for (int ci = conStart; ci < conEnd && !stopFlag; ci++) {
321 StrandContext &ctx = context[ci * 3 + translStrand];
322 const char *p = ctx.pattern;
323 FindAlgorithmResult &res = ctx.res;
324 bool match = true;
325 int curErr = 0;
326 for (int j = 0; j < patternLen && !stopFlag; j++) {
327 char rollchar = ctx.rollArr.get(j);
328 if (rollchar != p[j] && ++curErr > maxErr) {
329 match = false;
330 break;
331 }
332 }
333
334 if (i + patternLenInNucl + 2 < end) {
335 int wheree = i + patternLenInNucl;
336 char c1 = ci ? complMap.at((quint8)seqExpanded[wheree + 2]) : seqExpanded[wheree];
337 char c2 = ci ? complMap.at((quint8)seqExpanded[wheree + 1]) : seqExpanded[wheree + 1];
338 char c3 = ci ? complMap.at((quint8)seqExpanded[wheree]) : seqExpanded[wheree + 2];
339 char newchar = aminoTT->translate3to1(c1, c2, c3);
340 ci ? ctx.rollArr.push_front_pop_back(newchar) : ctx.rollArr.push_back_pop_front(newchar);
341 }
342
343 if (match) {
344 res.region.startPos = i;
345 res.region.length = patternLenInNucl;
346 res.err = curErr;
347 res.strand = (ci == 1) ? U2Strand::Complementary : U2Strand::Direct;
348 res.translation = true;
349
350 rl->onResult(res);
351 }
352 if (leftTillPercent == 0) {
353 percentsCompleted = qMin(percentsCompleted + 1, 100);
354 leftTillPercent = onePercentLen;
355 }
356 } // strand
357 } // base pos
358 }
359
createAmbiguousBaseMap()360 static char *createAmbiguousBaseMap() {
361 // Source: http://www.ncbi.nlm.nih.gov/blast/fasta.shtml
362 // Unknown symbol is zero: no match
363
364 const int SIZE = 128;
365 static char map[SIZE];
366
367 for (int i = 0; i < SIZE; i++) {
368 map[i] = 0x00;
369 }
370
371 map['A'] = 0x01; // Bitmask: 00000001
372 map['C'] = 0x02; // Bitmask: 00000010
373 map['G'] = 0x04; // Bitmask: 00000100
374 map['T'] = 0x08; // Bitmask: 00001000
375 map['U'] = 0x08; // Bitmask: 00001000
376 map['M'] = 0x03; // Bitmask: 00000011
377 map['R'] = 0x05; // Bitmask: 00000101
378 map['W'] = 0x09; // Bitmask: 00001001
379 map['S'] = 0x06; // Bitmask: 00000110
380 map['Y'] = 0x0A; // Bitmask: 00001010
381 map['K'] = 0x0C; // Bitmask: 00001100
382 map['V'] = 0x07; // Bitmask: 00000111
383 map['H'] = 0x0B; // Bitmask: 00001011
384 map['D'] = 0x0D; // Bitmask: 00001101
385 map['B'] = 0x0E; // Bitmask: 00001110
386 map['N'] = 0x0F; // Bitmask: 00001111
387 map['X'] = 0x0F; // Bitmask: 00001111
388
389 return &map[0];
390 }
391
cmpAmbiguous(char a,char b)392 bool FindAlgorithm::cmpAmbiguous(char a, char b) {
393 static char *charMap = createAmbiguousBaseMap();
394
395 SAFE_POINT(a >= 0 && b >= 0, "Invalid characters supplied!", false);
396
397 char c1 = charMap[uchar(a)];
398 char c2 = charMap[uchar(b)];
399
400 return c1 & c2;
401 }
402
match_pattern(const char * seq,const char * p,int start,int patternLen,int maxErr,int & curErr)403 inline bool match_pattern(const char *seq, const char *p, int start, int patternLen, int maxErr, int &curErr) {
404 bool match = true;
405 curErr = 0;
406 for (int j = 0; j < patternLen; j++) {
407 if (seq[start + j] != p[j] && ++curErr > maxErr) {
408 match = false;
409 break;
410 }
411 }
412 return match;
413 }
414
match_pattern_ambiguous(const char * seq,const char * p,int start,int patternLen,int maxErr,int & curErr)415 inline bool match_pattern_ambiguous(const char *seq, const char *p, int start, int patternLen, int maxErr, int &curErr) {
416 bool match = true;
417 curErr = 0;
418 for (int j = 0; j < patternLen; j++) {
419 if (!FindAlgorithm::cmpAmbiguous(seq[start + j], p[j]) && ++curErr > maxErr) {
420 match = false;
421 break;
422 }
423 }
424 return match;
425 }
426
427 // reverses found result if it's located on the reverse complement strand
prepareResultPosition(int regionStart,int regionLength,int & foundStart,int foundLength,U2Strand resultStrand)428 inline void prepareResultPosition(int regionStart, int regionLength, int &foundStart, int foundLength, U2Strand resultStrand) {
429 foundStart = resultStrand.isCompementary() ? regionStart + regionLength - foundStart - foundLength : regionStart + foundStart;
430 }
431
sendResultToListener(int resultStartPos,int resultLength,U2Strand resultStrand,FindAlgorithmResultsListener * rl)432 static void sendResultToListener(int resultStartPos, int resultLength, U2Strand resultStrand, FindAlgorithmResultsListener *rl) {
433 SAFE_POINT(resultLength >= 0 && resultStartPos >= 0, "Invalid find algorithm results", );
434 CHECK(resultLength > 0, ); // zero-length regions may satisfy some regular expressions though they don't make sense
435
436 FindAlgorithmResult res;
437 res.region.startPos = resultStartPos;
438 res.region.length = resultLength;
439 res.strand = resultStrand;
440
441 rl->onResult(res);
442 }
443
regExpSearch(const QString & refSequence,const QRegExp & regExp,const U2Strand & searchStrand,const U2Region & sequenceRange,int maxResultLen,int currentStrand,int tailCut,int totalStrandCount,bool refSeqIsAminoTranslation,int aminoFrameNumber,int & percentsCompleted,int & stopFlag,FindAlgorithmResultsListener * rl,int cyclePoint=-1)444 static void regExpSearch(const QString &refSequence,
445 const QRegExp ®Exp,
446 const U2Strand &searchStrand,
447 const U2Region &sequenceRange,
448 int maxResultLen,
449 int currentStrand,
450 int tailCut, // only for translation on complementary strand, it is size of the tail that can not form amino acid
451 int totalStrandCount,
452 bool refSeqIsAminoTranslation,
453 int aminoFrameNumber,
454 int &percentsCompleted,
455 int &stopFlag,
456 FindAlgorithmResultsListener *rl,
457 int cyclePoint = -1) // in cyclePoint position copy of the sequence beginning starts (for circular search)
458 {
459 if (cyclePoint == -1) {
460 cyclePoint = sequenceRange.endPos();
461 }
462 int percentsCompletedOnStart = percentsCompleted;
463 int foundStartPos = 0;
464 while (stopFlag == 0 && (foundStartPos = regExp.indexIn(refSequence, foundStartPos)) != -1) {
465 // remember that there are a few iterations, so a single one yields (1 / @conEnd) of total progress
466 int percentsCompletedInner = (100 * foundStartPos * (currentStrand + 1)) / (sequenceRange.length * totalStrandCount);
467 percentsCompleted = qMin(percentsCompletedOnStart + percentsCompletedInner, 100);
468
469 int foundLength = regExp.matchedLength();
470 if (maxResultLen >= foundLength) {
471 int resultStartPos = refSeqIsAminoTranslation ? foundStartPos * 3 : foundStartPos;
472 if (resultStartPos < cyclePoint || sequenceRange.startPos != 0) {
473 const int resultLen = refSeqIsAminoTranslation ? foundLength * 3 : foundLength;
474 prepareResultPosition(sequenceRange.startPos + (refSeqIsAminoTranslation * aminoFrameNumber),
475 sequenceRange.length - (refSeqIsAminoTranslation * aminoFrameNumber),
476 resultStartPos,
477 resultLen,
478 searchStrand);
479 resultStartPos -= (searchStrand.isCompementary() && refSeqIsAminoTranslation ? tailCut : 0);
480
481 sendResultToListener(resultStartPos, resultLen, searchStrand, rl);
482 }
483 }
484
485 // try to find smaller substrings starting from the same position
486 int substrLength = qMin(foundLength - 1, maxResultLen);
487 while (0 == stopFlag && 0 < substrLength && foundStartPos == (regExp.indexIn(refSequence.left(foundStartPos + substrLength), foundStartPos))) {
488 const int foundSubstrLength = regExp.matchedLength();
489 if (maxResultLen >= foundSubstrLength) {
490 int resultStartPos = refSeqIsAminoTranslation ? foundStartPos * 3 : foundStartPos;
491 if (resultStartPos < cyclePoint || sequenceRange.startPos != 0) {
492 int resultLen = refSeqIsAminoTranslation ? foundSubstrLength * 3 : foundSubstrLength;
493 prepareResultPosition(sequenceRange.startPos + (refSeqIsAminoTranslation * aminoFrameNumber),
494 sequenceRange.length - (refSeqIsAminoTranslation * aminoFrameNumber),
495 resultStartPos,
496 resultLen,
497 searchStrand);
498 resultStartPos -= (searchStrand.isCompementary() && refSeqIsAminoTranslation ? tailCut : 0);
499
500 sendResultToListener(resultStartPos, resultLen, searchStrand, rl);
501 }
502 }
503 substrLength = foundSubstrLength - 1;
504 }
505
506 ++foundStartPos;
507 }
508 }
509
findInAmino_regExp(FindAlgorithmResultsListener * rl,DNATranslation * aminoTT,DNATranslation * complTT,FindAlgorithmStrand strand,const char * seq,const U2Region & range,bool searchIsCircular,const char * pattern,int maxRegExpResult,int & stopFlag,int & percentsCompleted)510 static void findInAmino_regExp(FindAlgorithmResultsListener *rl,
511 DNATranslation *aminoTT,
512 DNATranslation *complTT,
513 FindAlgorithmStrand strand,
514 const char *seq,
515 const U2Region &range,
516 bool searchIsCircular,
517 const char *pattern,
518 int maxRegExpResult,
519 int &stopFlag,
520 int &percentsCompleted) {
521 QRegExp regExp(pattern, Qt::CaseInsensitive);
522 CHECK(regExp.isValid(), );
523
524 percentsCompleted = 0;
525
526 int conStart = isDirect(strand) ? 0 : 1;
527 int conEnd = isComplement(strand) ? 2 : 1;
528
529 int maxAminoResult = maxRegExpResult * 3;
530 int seqLen = QByteArray(seq).size();
531
532 int bufferSize = 0;
533 const char *sequence = nullptr;
534 QByteArray temp;
535 if (searchIsCircular) {
536 bufferSize = getCircularOverlap(seq, range, (seqLen > maxRegExpResult) ? maxRegExpResult - 1 : seqLen - 1);
537 temp = QByteArray(seq) + QByteArray(seq, bufferSize);
538 sequence = temp.data();
539 } else {
540 sequence = seq;
541 }
542
543 for (int ci = conStart; ci < conEnd && !stopFlag; ++ci) {
544 for (int aminoFrameNumber = 0; aminoFrameNumber < 3 && !stopFlag; aminoFrameNumber++) {
545 int len = seqLen - aminoFrameNumber - range.startPos;
546 if (range.startPos < seqLen && range.endPos() < seqLen) {
547 len = range.length - aminoFrameNumber;
548 }
549
550 QString translation;
551 QByteArray rawTranslation(len + bufferSize + 1, 0);
552 U2Strand resultStrand;
553 const int translationLen = (len + bufferSize) / 3;
554
555 if (ci == 1) { // complementary
556 TextUtils::translate(complTT->getOne2OneMapper(), sequence + range.startPos + aminoFrameNumber, len + bufferSize, rawTranslation.data());
557 TextUtils::reverse(rawTranslation.data(), len + bufferSize - (len + bufferSize) % 3);
558 aminoTT->translate(rawTranslation.data(), len + bufferSize);
559
560 resultStrand = U2Strand::Complementary;
561 } else { // direct
562 qstrcpy(rawTranslation.data(), QByteArray(sequence + range.startPos + aminoFrameNumber, len + bufferSize).data());
563 aminoTT->translate(rawTranslation.data(), len + bufferSize);
564
565 resultStrand = U2Strand::Direct;
566 }
567 translation = QString(QByteArray(rawTranslation.data(), translationLen));
568
569 if (searchIsCircular) {
570 U2Region cirRange = range;
571 cirRange.length += (seqLen == range.length && range.startPos == 0) ? bufferSize : 0;
572
573 regExpSearch(translation, regExp, resultStrand, cirRange, maxRegExpResult, ci, (len + bufferSize) % 3, conEnd, true, aminoFrameNumber, percentsCompleted, stopFlag, rl, len);
574 } else {
575 regExpSearch(translation, regExp, resultStrand, range, maxAminoResult, ci, (len + bufferSize) % 3, conEnd, true, aminoFrameNumber, percentsCompleted, stopFlag, rl);
576 }
577 }
578 }
579 }
580
findRegExp(FindAlgorithmResultsListener * rl,DNATranslation * aminoTT,DNATranslation * complTT,FindAlgorithmStrand strand,const char * seq,bool searchIsCircular,const U2Region & range,const char * pattern,int maxRegExpResult,int & stopFlag,int & percentsCompleted)581 static void findRegExp(FindAlgorithmResultsListener *rl,
582 DNATranslation *aminoTT,
583 DNATranslation *complTT,
584 FindAlgorithmStrand strand,
585 const char *seq,
586 bool searchIsCircular,
587 const U2Region &range,
588 const char *pattern,
589 int maxRegExpResult,
590 int &stopFlag,
591 int &percentsCompleted) {
592 QRegExp regExp(pattern, Qt::CaseInsensitive);
593 CHECK(regExp.isValid(), );
594
595 if (aminoTT != nullptr) {
596 findInAmino_regExp(rl, aminoTT, complTT, strand, seq, range, searchIsCircular, pattern, maxRegExpResult, stopFlag, percentsCompleted);
597 return;
598 }
599
600 percentsCompleted = 0;
601
602 int seqLen = QByteArray(seq).size();
603 const int conStart = isDirect(strand) ? 0 : 1;
604 const int conEnd = isComplement(strand) ? 2 : 1;
605
606 for (int ci = conStart; ci < conEnd && !stopFlag; ++ci) {
607 QString substr;
608 QByteArray tmp(range.length + 1, 0);
609 U2Strand resultStrand;
610
611 if (ci == 1) { // complementary
612 char *complSeq = tmp.data();
613 TextUtils::translate(complTT->getOne2OneMapper(), seq + range.startPos, qMin(range.length, seqLen - range.startPos), complSeq);
614 TextUtils::reverse(complSeq, qMin(range.length, seqLen - range.startPos));
615 substr = QString(QByteArray(complSeq, qMin(range.length, seqLen - range.startPos)));
616 if (searchIsCircular) {
617 int bufferSize = getCircularOverlap(seq, range, (range.length > maxRegExpResult) ? maxRegExpResult - 1 : range.length);
618 QByteArray buffer(bufferSize, 0);
619 char *complBuffer = buffer.data();
620 TextUtils::translate(complTT->getOne2OneMapper(), seq, bufferSize, complBuffer);
621 TextUtils::reverse(complBuffer, bufferSize);
622 substr.prepend(buffer);
623 }
624
625 resultStrand = U2Strand::Complementary;
626 } else { // direct
627 substr = QString(QByteArray(seq + range.startPos,
628 qMin(range.length, seqLen - range.startPos)));
629 if (searchIsCircular) {
630 int bufferSize = getCircularOverlap(seq, range, (range.length > maxRegExpResult) ? maxRegExpResult - 1 : range.length);
631 substr += QString(QByteArray(seq, bufferSize));
632 }
633 resultStrand = U2Strand::Direct;
634 }
635
636 if (searchIsCircular) {
637 U2Region cirRange = range;
638 if (range.length == seqLen && range.startPos == 0) {
639 cirRange.length += (range.length > maxRegExpResult) ? maxRegExpResult - 1 : range.length;
640 }
641 regExpSearch(substr, regExp, resultStrand, cirRange, maxRegExpResult, ci, 0, conEnd, false, 0, percentsCompleted, stopFlag, rl, seqLen);
642 } else {
643 regExpSearch(substr, regExp, resultStrand, range, maxRegExpResult, ci, 0, conEnd, false, 0, percentsCompleted, stopFlag, rl);
644 }
645 }
646 }
647
find_subst(FindAlgorithmResultsListener * rl,DNATranslation * aminoTT,DNATranslation * complTT,FindAlgorithmStrand strand,const char * seq,bool searchIsCircular,const U2Region & range,const char * pattern,int patternLen,bool useAmbiguousBases,int maxErr,int & stopFlag,int & percentsCompleted)648 static void find_subst(FindAlgorithmResultsListener *rl,
649 DNATranslation *aminoTT,
650 DNATranslation *complTT,
651 FindAlgorithmStrand strand,
652 const char *seq,
653 bool searchIsCircular,
654 const U2Region &range,
655 const char *pattern,
656 int patternLen,
657 bool useAmbiguousBases,
658 int maxErr,
659 int &stopFlag,
660 int &percentsCompleted) {
661 SAFE_POINT(nullptr == complTT || complTT->isOne2One(), "Invalid translation supplied!", );
662
663 if (aminoTT != nullptr) {
664 findInAmino_subst(rl, aminoTT, complTT, strand, seq, range, searchIsCircular, pattern, patternLen, maxErr, stopFlag, percentsCompleted);
665 return;
666 }
667 if (range.length - patternLen < 0) {
668 return;
669 }
670 char *complPattern = nullptr;
671 QByteArray tmp;
672 if (isComplement(strand)) {
673 SAFE_POINT(nullptr != complTT, "Invalid translation supplied!", );
674 tmp.resize(patternLen);
675 complPattern = tmp.data();
676 TextUtils::translate(complTT->getOne2OneMapper(), pattern, patternLen, complPattern);
677 TextUtils::reverse(complPattern, patternLen);
678 }
679
680 StrandContext context[] = {
681 StrandContext(0, 0, false, pattern),
682 StrandContext(0, 0, false, complPattern)};
683
684 int onePercentLen = range.length / 100;
685 int leftTillPercent = onePercentLen;
686 percentsCompleted = 0;
687
688 int conStart = isDirect(strand) ? 0 : 1;
689 int conEnd = isComplement(strand) ? 2 : 1;
690 SAFE_POINT(conStart < conEnd, "Internal algorithm error: incorrect strand order!", );
691
692 const char *sequence = nullptr;
693 QByteArray temp;
694 int end = range.endPos();
695 if (searchIsCircular) {
696 int beginningSize = getCircularOverlap(seq, range, patternLen - 1);
697 end = getSearchEndPos(seq, range, getCircularOverlap(seq, range, patternLen - 1), searchIsCircular);
698 temp = QByteArray(seq) + QByteArray(seq, beginningSize);
699 sequence = temp.data();
700 } else {
701 sequence = seq;
702 }
703 for (int i = range.startPos;
704 i < end - patternLen + 1 && !stopFlag;
705 i++, leftTillPercent--) {
706 for (int ci = conStart; ci < conEnd && !stopFlag; ci++) {
707 StrandContext &ctx = context[ci];
708 const char *p = ctx.pattern;
709 FindAlgorithmResult &res = ctx.res;
710
711 bool match = true;
712 int curErr = 0;
713 if (useAmbiguousBases) {
714 match = match_pattern_ambiguous(sequence, p, i, patternLen, maxErr, curErr);
715 } else {
716 match = match_pattern(sequence, p, i, patternLen, maxErr, curErr);
717 }
718 if (match) {
719 res.region.startPos = i;
720 res.region.length = patternLen;
721 res.err = curErr;
722 res.strand = (ci == 1) ? U2Strand::Complementary : U2Strand::Direct;
723
724 rl->onResult(res);
725 }
726
727 if (leftTillPercent == 0) {
728 percentsCompleted = qMin(percentsCompleted + 1, 100);
729 leftTillPercent = onePercentLen;
730 }
731 } // strand
732 } // base pos
733 }
734
735 // value 12 - standart "out of memory" error code in linux
736 // error in FindAlgorithmResult also means number of mismatches, therefore positive values can cause confusion
737 const int FindAlgorithmResult::NOT_ENOUGH_MEMORY_ERROR = -12;
738
find(FindAlgorithmResultsListener * rl,DNATranslation * aminoTT,DNATranslation * complTT,FindAlgorithmStrand strand,FindAlgorithmPatternSettings patternSettings,bool useAmbiguousBases,const char * seq,int seqLen,const DNAAlphabet * sequenceAlphabet,bool searchIsCircular,const U2Region & range,const char * pattern,int patternLen,int maxErr,int maxRegExpResult,int & stopFlag,int & percentsCompleted)739 void FindAlgorithm::find(
740 FindAlgorithmResultsListener *rl,
741 DNATranslation *aminoTT, // if aminoTT!=NULL -> pattern must contain amino data and sequence must contain DNA data
742 DNATranslation *complTT, // if complTT!=NULL -> sequence is complemented before comparison with pattern
743 FindAlgorithmStrand strand, // if not direct there complTT must not be NULL
744 FindAlgorithmPatternSettings patternSettings,
745 bool useAmbiguousBases,
746 const char *seq,
747 int seqLen,
748 const DNAAlphabet *sequenceAlphabet,
749 bool searchIsCircular,
750 const U2Region &range,
751 const char *pattern,
752 int patternLen,
753 int maxErr,
754 int maxRegExpResult,
755 int &stopFlag,
756 int &percentsCompleted) {
757 Q_UNUSED(seqLen);
758 SAFE_POINT(nullptr == complTT || complTT->isOne2One(), "Invalid translation supplied!", );
759 SAFE_POINT(patternLen > maxErr, "Invalid maximum error count supplied!", );
760
761 if (range.endPos() > seqLen) {
762 searchIsCircular = true;
763 }
764 // no need to search circular on non-circular region
765 if (range.startPos < seqLen && range.endPos() < seqLen) {
766 searchIsCircular = false;
767 }
768
769 DNATranslation *newComplTT = complTT;
770 if (complTT != nullptr) {
771 const DNAAlphabet *destinationAlphabet = complTT->getDstAlphabet();
772 if (destinationAlphabet->isExtended()) {
773 newComplTT = complTT;
774 } else {
775 newComplTT = AppContext::getDNATranslationRegistry()->lookupComplementTranslation(U2AlphabetUtils::getExtendedAlphabet(destinationAlphabet));
776 }
777 }
778
779 if (patternSettings == FindAlgorithmPatternSettings_RegExp) {
780 findRegExp(rl, aminoTT, newComplTT, strand, seq, searchIsCircular, range, pattern, maxRegExpResult, stopFlag, percentsCompleted);
781 return;
782 }
783
784 if (patternSettings == FindAlgorithmPatternSettings_Subst) {
785 find_subst(rl, aminoTT, newComplTT, strand, seq, searchIsCircular, range, pattern, patternLen, useAmbiguousBases, maxErr, stopFlag, percentsCompleted);
786 return;
787 }
788
789 bool insDel = (patternSettings == FindAlgorithmPatternSettings_InsDel);
790
791 if (patternSettings == FindAlgorithmPatternSettings_Exact && sequenceAlphabet != nullptr && aminoTT == nullptr) {
792 // exact search -> do not run any search if pattern has illegal symbols
793 if (!U2AlphabetUtils::matches(sequenceAlphabet, pattern, patternLen)) {
794 return;
795 }
796 }
797
798 if (aminoTT != nullptr) {
799 findInAmino(rl, aminoTT, newComplTT, strand, insDel, seq, range, searchIsCircular, pattern, patternLen, maxErr, stopFlag, percentsCompleted);
800 return;
801 }
802 char *complPattern = nullptr;
803 QByteArray tmp;
804 if (isComplement(strand)) {
805 SAFE_POINT(nullptr != newComplTT, "Invalid translation supplied!", );
806 tmp.resize(patternLen);
807 complPattern = tmp.data();
808 TextUtils::translate(newComplTT->getOne2OneMapper(), pattern, patternLen, complPattern);
809 TextUtils::reverse(complPattern, patternLen);
810 }
811
812 int width = patternLen + maxErr;
813 int height = patternLen;
814
815 if (width > INT_MAX / height) {
816 const FindAlgorithmResult result(FindAlgorithmResult::NOT_ENOUGH_MEMORY_ERROR);
817 rl->onResult(result);
818 return;
819 }
820
821 try {
822 StrandContext context[] = {
823 StrandContext(width, height, insDel, pattern),
824 StrandContext(width, height, insDel, complPattern)};
825
826 int onePercentLen = range.length / 100;
827 int leftTillPercent = onePercentLen;
828 percentsCompleted = 0;
829
830 int conStart = isDirect(strand) ? 0 : 1;
831 int conEnd = isComplement(strand) ? 2 : 1;
832 SAFE_POINT(conStart < conEnd, "Internal algorithm error: incorrect strand order!", );
833
834 int end = getSearchEndPos(seq, range, patternLen - 1, searchIsCircular);
835 for (int i = range.startPos; i < end && !stopFlag; i++, leftTillPercent--) {
836 for (int ci = conStart; ci < conEnd && !stopFlag; ci++) {
837 StrandContext &ctx = context[ci];
838 DynTable &dt = ctx.dynTable;
839 const char *p = ctx.pattern;
840 FindAlgorithmResult &res = ctx.res;
841
842 for (int j = 0; j < patternLen && !stopFlag; j++) {
843 bool matched = seq[cycleIndex(seqLen, i)] == p[j];
844 dt.match(j, matched);
845 }
846
847 int err = dt.getLast();
848
849 if (!res.isEmpty() && (err > maxErr || (i - res.region.startPos) >= patternLen)) {
850 rl->onResult(res);
851 res.clear();
852 }
853
854 if (err <= maxErr) {
855 int newLen = dt.getLastLen();
856 if (res.isEmpty() || res.err > err || (res.err == err && newLen < res.region.length)) {
857 int newStart = i - newLen + 1;
858 bool boundaryCheck = (range.contains(newStart) && range.contains(newStart + newLen - 1));
859 bool circularBoundaryCheck = (!range.contains(newStart + newLen - 1) && searchIsCircular);
860 if (insDel || boundaryCheck || circularBoundaryCheck) { // boundary check for mismatch mode
861 SAFE_POINT(insDel || newLen == patternLen, "Internal algorithm error!", );
862 SAFE_POINT(newStart >= range.startPos, "Internal algorithm error!", );
863 SAFE_POINT(searchIsCircular || newStart + newLen <= range.endPos(), "Internal algorithm error!", );
864
865 res.region.startPos = newStart;
866 res.region.length = newLen;
867 res.err = err;
868 res.strand = (ci == 1) ? U2Strand::Complementary : U2Strand::Direct;
869 res.translation = aminoTT != nullptr;
870 }
871 }
872 }
873
874 dt.shiftColumn();
875 if (leftTillPercent == 0) {
876 percentsCompleted = qMin(percentsCompleted + 1, 100);
877 leftTillPercent = onePercentLen;
878 }
879 } // strand
880 } // base pos
881
882 for (int i = 0; i < 2; i++) {
883 if (!context[i].res.isEmpty()) { // todo: order by startpos?
884 SAFE_POINT(insDel || context[i].res.region.length == patternLen,
885 "Internal algorithm error: found region has invalid length!", );
886 rl->onResult(context[i].res);
887 }
888 }
889
890 } catch (std::bad_alloc &) {
891 const FindAlgorithmResult result(FindAlgorithmResult::NOT_ENOUGH_MEMORY_ERROR);
892 rl->onResult(result);
893 return;
894 }
895 }
896
estimateRamUsageInMbytes(const FindAlgorithmPatternSettings patternSettings,const bool searchInAminoTT,const int patternLength,const int maxError)897 int FindAlgorithm::estimateRamUsageInMbytes(const FindAlgorithmPatternSettings patternSettings,
898 const bool searchInAminoTT,
899 const int patternLength,
900 const int maxError) {
901 const int bytesToMbytesFactor = 1048576;
902 quint64 ramUsage = 0;
903
904 if (FindAlgorithmPatternSettings_InsDel == patternSettings) {
905 ramUsage = 2 * StrandContext::estimateRamUsageForOneContext(patternLength + maxError,
906 patternLength);
907 if (searchInAminoTT) {
908 ramUsage *= 4;
909 }
910 } else if (FindAlgorithmPatternSettings_Subst == patternSettings && searchInAminoTT)
911 ramUsage = 7 * patternLength * sizeof(char);
912
913 return ramUsage / bytesToMbytesFactor;
914 }
915
916 } // namespace U2
917