1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "RF_SArray_TandemFinder.h"
23 
24 #include <QMutexLocker>
25 
26 #include <U2Core/AppContext.h>
27 #include <U2Core/AppSettings.h>
28 #include <U2Core/CreateAnnotationTask.h>
29 #include <U2Core/LoadDocumentTask.h>
30 #include <U2Core/Log.h>
31 #include <U2Core/Timer.h>
32 #include <U2Core/U1AnnotationUtils.h>
33 
34 #include "RFConstants.h"
35 
36 namespace U2 {
37 
38 const int FindTandemsTaskSettings::DEFAULT_MIN_REPEAT_COUNT = 0;
39 const int FindTandemsTaskSettings::DEFAULT_MIN_TANDEM_SIZE = 9;
40 
FindTandemsToAnnotationsTask(const FindTandemsTaskSettings & s,const DNASequence & seq,const QString & _an,const QString & _gn,const QString & annDescription,const GObjectReference & _aor)41 FindTandemsToAnnotationsTask::FindTandemsToAnnotationsTask(const FindTandemsTaskSettings &s, const DNASequence &seq, const QString &_an, const QString &_gn, const QString &annDescription, const GObjectReference &_aor)
42     : Task(tr("Find repeats to annotations"), TaskFlags_NR_FOSCOE), saveAnns(true), mainSeq(seq), annName(_an), annGroup(_gn), annDescription(annDescription), annObjRef(_aor), s(s) {
43     GCOUNTER(cvar, "FindTandemsToAnnotationsTask");
44     setVerboseLogMode(true);
45     if (annObjRef.isValid()) {
46         LoadUnloadedDocumentTask::addLoadingSubtask(this,
47                                                     LoadDocumentTaskConfig(true, annObjRef, new LDTObjectFactory(this)));
48     }
49     addSubTask(new TandemFinder(s, mainSeq));
50 }
51 
FindTandemsToAnnotationsTask(const FindTandemsTaskSettings & s,const DNASequence & seq)52 FindTandemsToAnnotationsTask::FindTandemsToAnnotationsTask(const FindTandemsTaskSettings &s, const DNASequence &seq)
53     : Task(tr("Find repeats to annotations"), TaskFlags_NR_FOSCOE), saveAnns(false), mainSeq(seq), s(s) {
54     GCOUNTER(cvar, "FindTandemsToAnnotationsTask");
55     setVerboseLogMode(true);
56     addSubTask(new TandemFinder(s, mainSeq));
57 }
58 
onSubTaskFinished(Task * subTask)59 QList<Task *> FindTandemsToAnnotationsTask::onSubTaskFinished(Task *subTask) {
60     QList<Task *> res;
61     if (hasError() || isCanceled()) {
62         return res;
63     }
64 
65     if (qobject_cast<TandemFinder *>(subTask) != nullptr) {
66         TandemFinder *tandemFinderTask = qobject_cast<TandemFinder *>(subTask);
67         QList<SharedAnnotationData> annotations = importTandemAnnotations(tandemFinderTask->getResults(),
68                                                                           tandemFinderTask->getSettings().seqRegion.startPos,
69                                                                           tandemFinderTask->getSettings().showOverlappedTandems);
70         if (saveAnns) {
71             if (!annotations.isEmpty()) {
72                 algoLog.info(tr("Found %1 repeat regions").arg(annotations.size()));
73                 Task *createTask = new CreateAnnotationsTask(annObjRef, annotations, annGroup);
74                 createTask->setSubtaskProgressWeight(0);
75                 res.append(createTask);
76             }
77         } else {
78             result << annotations;
79         }
80     }
81     return res;
82 }
83 
importTandemAnnotations(const QList<Tandem> & tandems,qint64 seqStart,const bool showOverlapped)84 QList<SharedAnnotationData> FindTandemsToAnnotationsTask::importTandemAnnotations(const QList<Tandem> &tandems, qint64 seqStart, const bool showOverlapped) {
85     seqStart += s.reportSeqShift;
86 
87     QList<SharedAnnotationData> res;
88     foreach (const Tandem &tan, tandems) {
89         unsigned offset = 0;
90         const unsigned maxOffset = tan.size % tan.repeatLen;
91         do {
92             SharedAnnotationData ad(new AnnotationData());
93             ad->type = U2FeatureTypes::RepeatRegion;
94             ad->name = annName;
95             const quint32 tandemEnd = tan.offset + tan.size + seqStart;
96             quint32 pos = tan.offset + seqStart + offset;
97             for (; pos <= tandemEnd - tan.repeatLen; pos += tan.repeatLen) {
98                 ad->location->regions << U2Region(pos, tan.repeatLen);
99             }
100             if (ad->location->isEmpty()) {
101                 continue;
102             }
103             ad->qualifiers.append(U2Qualifier("num_of_repeats", QString::number(tan.size / tan.repeatLen)));
104             ad->qualifiers.append(U2Qualifier("repeat_length", QString::number(tan.repeatLen)));
105             ad->qualifiers.append(U2Qualifier("whole_length", QString::number(tan.size)));
106             U1AnnotationUtils::addDescriptionQualifier(ad, annDescription);
107             res.append(ad);
108             offset++;
109         } while (showOverlapped && offset <= maxOffset);
110     }
111     return res;
112 }
113 
TandemFinder(const FindTandemsTaskSettings & _settings,const DNASequence & directSequence)114 TandemFinder::TandemFinder(const FindTandemsTaskSettings &_settings, const DNASequence &directSequence)
115     : Task(tr("Find tandems"), TaskFlags_FOSCOE),
116       settings(_settings), regionCount(0) {
117     if (settings.seqRegion.length == 0) {
118         settings.seqRegion = U2Region(0, directSequence.length());
119     }
120     startTime = GTimer::currentTimeMicros();
121     sequence = (char *)directSequence.constData() + settings.seqRegion.startPos;
122 }
123 
124 class TF_WalkerConfig : public SequenceWalkerConfig {
125 public:
126     // TODO: check seqSize type compatibility!
TF_WalkerConfig(const char * _sequence,quint32 _seqSize,quint32 _chunkSize)127     TF_WalkerConfig(const char *_sequence, quint32 _seqSize, quint32 _chunkSize) {
128         seq = _sequence;
129         seqSize = _seqSize;
130         chunkSize = _chunkSize;
131         lastChunkExtraLen = _chunkSize / 2;
132         overlapSize = TandemFinder::maxCheckPeriod;
133         walkCircular = false;
134     }
135 };
136 
prepare()137 void TandemFinder::prepare() {
138     Q_ASSERT(settings.minRepeatCount >= 2);
139     int chunkSize = 32 * 1024 * 1024;  // optimized for 32Mb mem-chunk
140     addSubTask(new SequenceWalkerTask(TF_WalkerConfig(sequence, settings.seqRegion.length, chunkSize), this, tr("Find tandems"), TaskFlags_NR_FOSCOE));
141 }
142 
run()143 void TandemFinder::run() {
144     algoLog.trace(tr("Find tandems finished %1").arg(GTimer::currentTimeMicros() - startTime));
145 }
146 
onSubTaskFinished(Task * subTask)147 QList<Task *> TandemFinder::onSubTaskFinished(Task *subTask) {
148     if (qobject_cast<SequenceWalkerTask *>(subTask) != nullptr) {
149         setMaxParallelSubtasks(AppContext::getAppSettings()->getAppResourcePool()->getIdealThreadCount());
150         return regionTasks;
151     }
152     if (qobject_cast<TandemFinder_Region *>(subTask) != nullptr) {
153         TandemFinder_Region *regionTask = qobject_cast<TandemFinder_Region *>(subTask);
154         const quint64 offs = regionTask->getRegionOffset();
155         QMutexLocker foundTandemsLocker(&tandemsAccessMutex);
156         QList<Tandem> regionTandems = regionTask->getResult();
157         QMutableListIterator<Tandem> comTandIt(foundTandems);
158         foreach (Tandem t, regionTandems) {
159             t.offset += offs;
160             t.rightSide += offs;
161             while (comTandIt.hasNext() && comTandIt.peekNext() < t) {
162                 comTandIt.next();
163             }
164             if (!comTandIt.hasNext() || t < comTandIt.peekNext()) {
165                 comTandIt.insert(t);
166             } else {
167                 comTandIt.next().extend(t);
168             }
169         }
170     }
171     return QList<Task *>();
172 }
173 
onRegion(SequenceWalkerSubtask * t,TaskStateInfo & ti)174 void TandemFinder::onRegion(SequenceWalkerSubtask *t, TaskStateInfo &ti) {
175     if (ti.hasError()) {
176         return;
177     }
178     quint64 offs = t->getRegionSequence() - t->getGlobalConfig().seq;
179     QMutexLocker lock(&subtasksQueue);  // TODO: fix me
180     regionTasks.append(new TandemFinder_Region(regionCount++, t->getRegionSequence(), t->getRegionSequenceLen(), offs, settings));
181 }
182 
~TandemFinder_Region()183 TandemFinder_Region::~TandemFinder_Region() {
184     QMutexLocker tandemsAccessLocker(&tandemsAccessMutex);
185 }
186 
onSubTaskFinished(Task * subTask)187 QList<Task *> TandemFinder_Region::onSubTaskFinished(Task *subTask) {
188     if (qobject_cast<ConcreteTandemFinder *>(subTask) != nullptr) {
189         subTask->cleanup();
190     }
191     return QList<Task *>();
192 }
193 
prepare()194 void TandemFinder_Region::prepare() {
195     Q_ASSERT(settings.minPeriod <= settings.maxPeriod);
196     int period = 1;
197     for (; period <= 16; period = period * 2 + 1) {
198         if (period * 2 < settings.minPeriod || period > settings.maxPeriod || period >= seqSize)
199             continue;
200         addSubTask(new ExactSizedTandemFinder(sequence, seqSize, settings, period));
201     }
202     {
203         if (period > settings.maxPeriod)
204             return;
205         addSubTask(new LargeSizedTandemFinder(sequence, seqSize, settings, period));
206     }
207 }
208 
addResult(const Tandem & tandem)209 void TandemFinder_Region::addResult(const Tandem &tandem) {
210     QMutexLocker tandemsAccessLocker(&tandemsAccessMutex);
211     foundTandems.append(tandem);
212 }
addResults(const QMap<Tandem,Tandem> & tandems)213 void TandemFinder_Region::addResults(const QMap<Tandem, Tandem> &tandems) {
214     QMutexLocker tandemsAccessLocker(&tandemsAccessMutex);
215     foundTandems.append(tandems.values());
216 }
217 
ConcreteTandemFinder(QString taskName,const char * _sequence,const long _seqSize,const FindTandemsTaskSettings & _settings,const int _prefixLength)218 ConcreteTandemFinder::ConcreteTandemFinder(QString taskName, const char *_sequence, const long _seqSize, const FindTandemsTaskSettings &_settings, const int _prefixLength)
219     : Task(taskName, TaskFlags_FOSCOE), sequence(_sequence), seqSize(_seqSize), index(nullptr), suffixArray(nullptr),
220       settings(_settings), prefixLength(_prefixLength), suffArrSize(_seqSize - _prefixLength + 1) {
221     Q_ASSERT(settings.minRepeatCount > 1);
222     int suffArrMemory;
223     if (settings.algo == TSConstants::AlgoSuffixBinary) {
224         suffArrMemory = seqSize / 4 + seqSize * sizeof(quint32) + ((size_t)1 << qMin(prefixLength * 2, 24)) * sizeof(quint64) * 7 / 6;
225     } else {
226         suffArrMemory = seqSize * sizeof(quint32) * 2;
227     }
228     suffArrMemory = qMax(suffArrMemory / (1024 * 1024), 1);  // in Mb
229     addTaskResource(TaskResourceUsage(RESOURCE_MEMORY, suffArrMemory, true));
230 }
231 
prepare()232 void ConcreteTandemFinder::prepare() {
233     const quint32 *nuclTable = bitsTable.getBitMaskCharBits(DNAAlphabet_NUCL);
234     const quint32 bitMaskCharBitsNum = bitsTable.getBitMaskCharBitsNum(DNAAlphabet_NUCL);
235     // single thread approximation (re-estimated in some algorithms)
236     int arrayPercent = int((seqSize / double(seqSize + prefixLength)) * 100 / 5);  // array creation time ~5 times faster than search
237 
238     if (settings.algo == TSConstants::AlgoSuffix) {
239         // index with double sized match
240         CreateSArrayIndexTask *indexTask = new CreateSArrayIndexTask(sequence, seqSize, prefixLength, 'N', nuclTable, bitMaskCharBitsNum);
241         indexTask->setSubtaskProgressWeight(arrayPercent / 100.0F);
242         // TODO fix algorithm selection
243         if (qobject_cast<ExactSizedTandemFinder *>(this) != nullptr) {
244             addSubTask(indexTask);
245         }
246     }
247 }
248 
onSubTaskFinished(Task * subTask)249 QList<Task *> ConcreteTandemFinder::onSubTaskFinished(Task *subTask) {
250     if (qobject_cast<CreateSArrayIndexTask *>(subTask) != nullptr) {
251         index = qobject_cast<const CreateSArrayIndexTask *>(subTask)->index;
252     }
253     return QList<Task *>();
254 }
255 
cleanup()256 void ConcreteTandemFinder::cleanup() {
257     if (getSubtasks().size() == 0) {
258         return;
259     }
260     getSubtasks().first()->cleanup();
261 }
262 
ExactSizedTandemFinder(const char * _sequence,const long _seqSize,const FindTandemsTaskSettings & _settings,const int _analysisSize)263 ExactSizedTandemFinder::ExactSizedTandemFinder(const char *_sequence, const long _seqSize, const FindTandemsTaskSettings &_settings, const int _analysisSize)
264     : ConcreteTandemFinder(tr("Find %1-period tandems").arg(_analysisSize), _sequence, _seqSize, _settings, _analysisSize * 2) {
265 }
266 
~ExactSizedTandemFinder()267 ExactSizedTandemFinder::~ExactSizedTandemFinder() {
268 
269 };
270 
seqCast(const quint32 * suffArr,int ind)271 inline char *seqCast(const quint32 *suffArr, int ind) {
272     return reinterpret_cast<char *>(
273         suffArr[ind]);
274 }
275 
run()276 void ExactSizedTandemFinder::run() {
277     if (seqSize < settings.minPeriod * settings.minRepeatCount) {
278         return;
279     }
280     if (seqSize < prefixLength) {
281         return;
282     }
283     int minPeriod = qMax(settings.minPeriod, prefixLength / 2);
284     int maxPeriod = qMin(settings.maxPeriod, prefixLength);
285     if (index == nullptr) {
286         try {
287             suffixArray = new SuffixArray(sequence, seqSize, prefixLength);
288         } catch (...) {
289             setError("Not enough memory");
290             return;
291         }
292         const BitMask &bitMask = suffixArray->getBitMask();
293         const quint32 *sArray = suffixArray->getArray();
294         quint32 *currentDiffPos = (quint32 *)sArray;
295         const quint32 *sArrayLast = sArray + suffArrSize - 1;
296         while (currentDiffPos < sArrayLast) {
297             int diff = currentDiffPos[1] - currentDiffPos[0];
298             if (diff >= minPeriod && diff <= maxPeriod) {  // only our exact size
299                 quint32 suffixOffset = qMax(1, (settings.minTandemSize - prefixLength) / diff);
300                 if (currentDiffPos + suffixOffset <= sArrayLast && currentDiffPos[suffixOffset] - currentDiffPos[0] == suffixOffset * diff) {
301                     if (bitMask[currentDiffPos[0]] == bitMask[currentDiffPos[suffixOffset]]) {
302                         currentDiffPos = checkAndSpreadTandem_bv(currentDiffPos, currentDiffPos + suffixOffset, diff);
303                         continue;
304                     }
305                 }
306             }
307             currentDiffPos++;
308         }
309         delete suffixArray;  // TODO: remove all deletes
310     } else {
311         const quint32 *sArray = index->getSArray();
312         const quint32 *sArrayLast = const_cast<quint32 *>(sArray + index->getSArraySize() - 1);
313         quint32 *currentDiffPos = (quint32 *)sArray;
314 
315         while (currentDiffPos < sArrayLast) {
316             const qint32 diff = currentDiffPos[1] - currentDiffPos[0];
317             if (diff >= minPeriod && diff <= maxPeriod) {  // only our exact size
318                 unsigned suffixOffset = qMax(1, signed(settings.minTandemSize - prefixLength) / diff);
319                 if (currentDiffPos + suffixOffset > sArrayLast || currentDiffPos[suffixOffset] - currentDiffPos[0] != suffixOffset * diff) {
320                 } else {
321                     const char *seq0 = index->sarr2seq(currentDiffPos);  // seqCast(currentDiffPos,0);
322                     const char *seq1 = index->sarr2seq(currentDiffPos + suffixOffset);  // seqCast(currentDiffPos,suffixOffset);
323                     if (seq0 == nullptr || seq1 == nullptr) {
324                         continue;
325                     }
326                     // in the other words  prefix currentDiffPos[0]==currentDiffPos[suffixOffset]
327                     if (comparePrefixChars(seq0, seq1)) {
328                         currentDiffPos = checkAndSpreadTandem(currentDiffPos, currentDiffPos + suffixOffset, diff);
329                         continue;
330                     }
331                 }
332             }
333             currentDiffPos++;
334         }
335     }
336 
337     qobject_cast<TandemFinder_Region *>(getParentTask())->addResults(rawTandems);
338 }
339 
comparePrefixChars(const char * first,const char * second)340 bool ExactSizedTandemFinder::comparePrefixChars(const char *first, const char *second) {
341     return strncmp(first, second, prefixLength) == 0;
342 }
checkAndSpreadTandem(const quint32 * tandemStartIndex,const quint32 * tandemLastIndex,quint32 repeatLen)343 quint32 *ExactSizedTandemFinder::checkAndSpreadTandem(const quint32 *tandemStartIndex, const quint32 *tandemLastIndex, quint32 repeatLen) {
344     const char *tandemStart = index->sarr2seq(tandemStartIndex);  // seqCast(tandemStartIndex,0);
345     quint32 *arrRunner = (quint32 *)tandemLastIndex - 1;
346     {
347         const quint32 *sArrayLast = const_cast<quint32 *>(index->getSArray() + index->getSArraySize() - 1);
348         // run until distance become incorrect, it is incredible if we run far with changing prefix
349         do {
350             ++arrRunner;
351         } while (arrRunner < sArrayLast && arrRunner[1] - arrRunner[0] == repeatLen);
352         while (!comparePrefixChars(tandemStart, index->sarr2seq(arrRunner) /*seqCast(arrRunner,0)*/)) {
353             --arrRunner;
354         }
355     }
356 
357     //    if (index->getMaskedSequence()!=NULL){
358     if (false) {
359     } else {
360         // in this case seqCast(tandemStartIndex,1)==seqCast(tandemStartIndex,0)+repeatLen
361         const char *seqRunner = index->sarr2seq(arrRunner);  // seqCast(arrRunner,0);
362         // move forward by 0.5-1.0 repeatlen to find exact lower bound
363         const char *seqEnd = sequence + seqSize;
364         while (seqRunner <= seqEnd - repeatLen && strncmp(tandemStart, seqRunner, repeatLen) == 0) {
365             seqRunner += repeatLen;
366         }
367 
368         const quint32 size = seqRunner - tandemStart;
369         const Tandem tandem(tandemStart - sequence, repeatLen, size);
370         // check if there is already marked area
371         QMap<Tandem, Tandem>::iterator it = rawTandems.find(tandem);
372         if (it == rawTandems.end()) {  // there are no close tandems
373             int tandemMinSize = qMax(settings.minTandemSize, settings.minRepeatCount * tandem.repeatLen);
374             if (tandem.size >= tandemMinSize) {
375                 rawTandems.insert(tandem, tandem);
376             }
377         } else {
378             Tandem t = *it;
379             rawTandems.erase(it);
380             t.extend(tandem);
381             rawTandems.insert(t, t);
382         }
383     }
384     return arrRunner;
385 }
386 
checkAndSpreadTandem_bv(const quint32 * tandemStartIndex,const quint32 * tandemLastIndex,quint32 repeatLen)387 quint32 *ExactSizedTandemFinder::checkAndSpreadTandem_bv(const quint32 *tandemStartIndex, const quint32 *tandemLastIndex, quint32 repeatLen) {
388     const BitMask &bitMask = suffixArray->getBitMask();
389     const quint32 tandemStart = tandemStartIndex[0];
390 
391     const quint64 matchRepeat = bitMask[tandemStart];
392     quint32 *arrRunner = (quint32 *)tandemLastIndex - 1;
393     {
394         const quint32 *sArrayLast = suffixArray->getArray() + suffArrSize - 1;
395         // run until distance become incorrect, it is incredible if we would run far with changing prefix
396         do {
397             ++arrRunner;
398         } while (arrRunner < sArrayLast && arrRunner[1] - arrRunner[0] == repeatLen);
399         while (matchRepeat != bitMask[arrRunner[0]]) {
400             --arrRunner;
401         }
402     }
403 
404     //    if (index->getMaskedSequence()!=NULL){
405     if (false) {
406     } else {
407         // in this case seqCast(tandemStartIndex,1)==seqCast(tandemStartIndex,0)+repeatLen
408         quint32 seqRunner = arrRunner[0];
409         // move forward by 0.5-1.0 repeatlen to find exact lower bound
410         const quint32 &seqEnd = seqSize;
411         const quint64 periodMask = ~(~0ull >> (2 * repeatLen));
412         while (seqRunner <= seqEnd - repeatLen && ((matchRepeat ^ bitMask[seqRunner]) & periodMask) == 0) {
413             seqRunner += repeatLen;
414         }
415 
416         const quint32 size = seqRunner - tandemStart;
417         const Tandem tandem(tandemStart, repeatLen, size);
418         // check if there is already marked area
419         QMap<Tandem, Tandem>::iterator it = rawTandems.find(tandem);
420         if (it == rawTandems.end()) {  // there are no close tandems
421             int tandemMinSize = qMax(settings.minTandemSize, settings.minRepeatCount * tandem.repeatLen);
422             if (tandem.size >= tandemMinSize) {
423                 rawTandems.insert(tandem, tandem);
424             }
425         } else {
426             Tandem t = *it;
427             rawTandems.erase(it);
428             t.extend(tandem);
429             rawTandems.insert(t, t);
430         }
431     }
432     return arrRunner;
433 }
434 
LargeSizedTandemFinder(const char * _sequence,const long _seqSize,const FindTandemsTaskSettings & _settings,const int _analysisSize)435 LargeSizedTandemFinder::LargeSizedTandemFinder(const char *_sequence, const long _seqSize, const FindTandemsTaskSettings &_settings, const int _analysisSize)
436     : ConcreteTandemFinder(tr("Find big-period tandems"), _sequence, _seqSize, _settings, _analysisSize) {
437 }
~LargeSizedTandemFinder()438 LargeSizedTandemFinder::~LargeSizedTandemFinder() {
439 }
440 
run()441 void LargeSizedTandemFinder::run() {
442     if (seqSize < settings.minPeriod * settings.minRepeatCount) {
443         return;
444     }
445     if (seqSize < prefixLength) {
446         return;
447     }
448     int minPeriod = qMax(settings.minPeriod, prefixLength);
449     int maxPeriod = settings.maxPeriod;
450     if (index == nullptr) {
451         try {
452             suffixArray = new SuffixArray(sequence, seqSize, prefixLength);
453         } catch (...) {
454             setError("Not enough memory");
455             return;
456         }
457         const BitMask &bitMask = suffixArray->getBitMask();
458         const quint32 *sArray = suffixArray->getArray();
459         quint32 *currentDiffPos = (quint32 *)sArray;
460         const quint32 *sArrayLast = sArray + suffArrSize - 1;
461         while (currentDiffPos < sArrayLast) {
462             int diff = currentDiffPos[1] - currentDiffPos[0];
463             if (diff >= minPeriod && diff <= maxPeriod) {
464                 quint32 firstSuffixPos = currentDiffPos[0];
465                 quint32 secondSuffixPos = currentDiffPos[1];
466                 const quint32 endSuffixPos = secondSuffixPos;
467                 bool correctRepeat = false;
468                 do {
469                     correctRepeat = bitMask[firstSuffixPos] == bitMask[secondSuffixPos];
470                     firstSuffixPos += prefixLength;
471                     secondSuffixPos += prefixLength;
472                 } while (correctRepeat && firstSuffixPos < endSuffixPos);
473                 if (correctRepeat) {
474                     currentDiffPos = checkAndSpreadTandem_bv(currentDiffPos, currentDiffPos + 1, diff);
475                     continue;
476                 }
477             }
478             currentDiffPos++;
479         }
480         delete suffixArray;
481     } else {
482         const quint32 *sArray = index->getSArray();
483         const quint32 *sArrayLast = const_cast<quint32 *>(sArray + index->getSArraySize() - 1);
484         quint32 *currentDiffPos = (quint32 *)sArray;
485 
486         // TODO: rewrite this code
487         while (currentDiffPos < sArrayLast) {
488             const qint32 diff = currentDiffPos[1] - currentDiffPos[0];
489             if (diff >= minPeriod && diff <= maxPeriod) {  // skip already found tandems
490                 unsigned suffixOffset = static_cast<unsigned>(qMax(1, signed(settings.minTandemSize - prefixLength) / diff));
491                 if (currentDiffPos + suffixOffset > sArrayLast || currentDiffPos[suffixOffset] - currentDiffPos[0] != suffixOffset * diff) {
492                     //                    currentDiffPos += suffixOffset;continue;
493                 } else {
494                     const char *seq0 = index->sarr2seq(currentDiffPos);  // seqCast(currentDiffPos,0);
495                     const char *seq1 = index->sarr2seq(currentDiffPos + suffixOffset);  // seqCast(currentDiffPos,suffixOffset);
496                     // in the other words  prefix currentDiffPos[0]==currentDiffPos[1]
497                     if (comparePrefixChars(seq0, seq1)) {
498                         currentDiffPos = checkAndSpreadTandem(currentDiffPos, currentDiffPos + suffixOffset, diff);
499                         continue;
500                     }
501                 }
502             }
503             currentDiffPos++;
504         }
505     }
506 
507     qobject_cast<TandemFinder_Region *>(getParentTask())->addResults(rawTandems);
508 }
comparePrefixChars(const char * first,const char * second)509 bool LargeSizedTandemFinder::comparePrefixChars(const char *first, const char *second) {
510     return strncmp(first, second, prefixLength) == 0;
511 }
checkAndSpreadTandem(const quint32 * tandemStartIndex,const quint32 * tandemLastIndex,const unsigned repeatLen)512 quint32 *LargeSizedTandemFinder::checkAndSpreadTandem(const quint32 *tandemStartIndex, const quint32 *tandemLastIndex, const unsigned repeatLen) {
513     const char *tandemStart = index->sarr2seq(tandemStartIndex);  // seqCast(tandemStartIndex,0);
514 
515     quint32 *arrRunner = (quint32 *)tandemLastIndex - 1;
516     {
517         const quint32 *sArrayLast = const_cast<quint32 *>(index->getSArray() + index->getSArraySize() - 1);
518         // run until distance become incorrect, it is incredible if we run far with changing prefix
519         do {
520             ++arrRunner;
521         } while (arrRunner < sArrayLast && arrRunner[1] - arrRunner[0] == repeatLen);
522         while (!comparePrefixChars(tandemStart, index->sarr2seq(arrRunner) /*seqCast(arrRunner,0)*/)) {
523             --arrRunner;
524         }
525     }
526 
527     // in this case seqCast(tandemStartIndex,1)==seqCast(tandemStartIndex,0)+repeatLen
528     const char *seqRunner = index->sarr2seq(arrRunner);  // seqCast(arrRunner,0);
529     // move forward by 0.5-1.0 of repeat len to find exact lower bound
530     const char *seqEnd = sequence + seqSize;
531     while (seqRunner <= seqEnd - repeatLen && strncmp(tandemStart, seqRunner, repeatLen) == 0) {
532         seqRunner += repeatLen;
533     }
534 
535     const quint32 size = seqRunner - tandemStart;
536     const Tandem tandem(tandemStart - sequence, repeatLen, size);
537     // check if there is already marked area
538     QMap<Tandem, Tandem>::iterator it = rawTandems.find(tandem);
539     if (it == rawTandems.end()) {  // there are no close tandems
540         rawTandems.insert(tandem, tandem);
541     } else {
542         Tandem t = *it;
543         rawTandems.erase(it);
544         t.extend(tandem);
545         rawTandems.insert(t, t);
546     }
547 
548     return arrRunner;
549 }
550 
checkAndSpreadTandem_bv(const quint32 * tandemStartIndex,const quint32 * tandemLastIndex,const unsigned repeatLen)551 quint32 *LargeSizedTandemFinder::checkAndSpreadTandem_bv(const quint32 *tandemStartIndex, const quint32 *tandemLastIndex, const unsigned repeatLen) {
552     const BitMask &bitMask = suffixArray->getBitMask();
553     const quint32 tandemStart = tandemStartIndex[0];
554     quint32 tandemLast = tandemLastIndex[0];
555     const quint32 &seqEnd = seqSize;
556     // repeatLen > prefixLength
557     while (tandemLast < seqEnd - prefixLength && bitMask[tandemLast] == bitMask[tandemLast - repeatLen]) {
558         tandemLast += prefixLength;
559     }
560 
561     const quint32 size = tandemLast - tandemStart;
562     const Tandem tandem(tandemStart, repeatLen, size);
563     // check if there is already marked area
564     QMap<Tandem, Tandem>::iterator it = rawTandems.find(tandem);
565     if (it == rawTandems.end()) {  // there are no close tandems
566         int tandemMinSize = qMax(settings.minTandemSize, 2 * tandem.repeatLen);
567         if (tandem.size >= tandemMinSize) {
568             rawTandems.insert(tandem, tandem);
569         }
570     } else {
571         Tandem t = *it;
572         rawTandems.erase(it);
573         t.extend(tandem);
574         rawTandems.insert(t, t);
575     }
576     return const_cast<quint32 *>(tandemStartIndex + size / repeatLen);
577 }
578 
579 }  // namespace U2
580