1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "RF_SArray_TandemFinder.h"
23
24 #include <QMutexLocker>
25
26 #include <U2Core/AppContext.h>
27 #include <U2Core/AppSettings.h>
28 #include <U2Core/CreateAnnotationTask.h>
29 #include <U2Core/LoadDocumentTask.h>
30 #include <U2Core/Log.h>
31 #include <U2Core/Timer.h>
32 #include <U2Core/U1AnnotationUtils.h>
33
34 #include "RFConstants.h"
35
36 namespace U2 {
37
38 const int FindTandemsTaskSettings::DEFAULT_MIN_REPEAT_COUNT = 0;
39 const int FindTandemsTaskSettings::DEFAULT_MIN_TANDEM_SIZE = 9;
40
FindTandemsToAnnotationsTask(const FindTandemsTaskSettings & s,const DNASequence & seq,const QString & _an,const QString & _gn,const QString & annDescription,const GObjectReference & _aor)41 FindTandemsToAnnotationsTask::FindTandemsToAnnotationsTask(const FindTandemsTaskSettings &s, const DNASequence &seq, const QString &_an, const QString &_gn, const QString &annDescription, const GObjectReference &_aor)
42 : Task(tr("Find repeats to annotations"), TaskFlags_NR_FOSCOE), saveAnns(true), mainSeq(seq), annName(_an), annGroup(_gn), annDescription(annDescription), annObjRef(_aor), s(s) {
43 GCOUNTER(cvar, "FindTandemsToAnnotationsTask");
44 setVerboseLogMode(true);
45 if (annObjRef.isValid()) {
46 LoadUnloadedDocumentTask::addLoadingSubtask(this,
47 LoadDocumentTaskConfig(true, annObjRef, new LDTObjectFactory(this)));
48 }
49 addSubTask(new TandemFinder(s, mainSeq));
50 }
51
FindTandemsToAnnotationsTask(const FindTandemsTaskSettings & s,const DNASequence & seq)52 FindTandemsToAnnotationsTask::FindTandemsToAnnotationsTask(const FindTandemsTaskSettings &s, const DNASequence &seq)
53 : Task(tr("Find repeats to annotations"), TaskFlags_NR_FOSCOE), saveAnns(false), mainSeq(seq), s(s) {
54 GCOUNTER(cvar, "FindTandemsToAnnotationsTask");
55 setVerboseLogMode(true);
56 addSubTask(new TandemFinder(s, mainSeq));
57 }
58
onSubTaskFinished(Task * subTask)59 QList<Task *> FindTandemsToAnnotationsTask::onSubTaskFinished(Task *subTask) {
60 QList<Task *> res;
61 if (hasError() || isCanceled()) {
62 return res;
63 }
64
65 if (qobject_cast<TandemFinder *>(subTask) != nullptr) {
66 TandemFinder *tandemFinderTask = qobject_cast<TandemFinder *>(subTask);
67 QList<SharedAnnotationData> annotations = importTandemAnnotations(tandemFinderTask->getResults(),
68 tandemFinderTask->getSettings().seqRegion.startPos,
69 tandemFinderTask->getSettings().showOverlappedTandems);
70 if (saveAnns) {
71 if (!annotations.isEmpty()) {
72 algoLog.info(tr("Found %1 repeat regions").arg(annotations.size()));
73 Task *createTask = new CreateAnnotationsTask(annObjRef, annotations, annGroup);
74 createTask->setSubtaskProgressWeight(0);
75 res.append(createTask);
76 }
77 } else {
78 result << annotations;
79 }
80 }
81 return res;
82 }
83
importTandemAnnotations(const QList<Tandem> & tandems,qint64 seqStart,const bool showOverlapped)84 QList<SharedAnnotationData> FindTandemsToAnnotationsTask::importTandemAnnotations(const QList<Tandem> &tandems, qint64 seqStart, const bool showOverlapped) {
85 seqStart += s.reportSeqShift;
86
87 QList<SharedAnnotationData> res;
88 foreach (const Tandem &tan, tandems) {
89 unsigned offset = 0;
90 const unsigned maxOffset = tan.size % tan.repeatLen;
91 do {
92 SharedAnnotationData ad(new AnnotationData());
93 ad->type = U2FeatureTypes::RepeatRegion;
94 ad->name = annName;
95 const quint32 tandemEnd = tan.offset + tan.size + seqStart;
96 quint32 pos = tan.offset + seqStart + offset;
97 for (; pos <= tandemEnd - tan.repeatLen; pos += tan.repeatLen) {
98 ad->location->regions << U2Region(pos, tan.repeatLen);
99 }
100 if (ad->location->isEmpty()) {
101 continue;
102 }
103 ad->qualifiers.append(U2Qualifier("num_of_repeats", QString::number(tan.size / tan.repeatLen)));
104 ad->qualifiers.append(U2Qualifier("repeat_length", QString::number(tan.repeatLen)));
105 ad->qualifiers.append(U2Qualifier("whole_length", QString::number(tan.size)));
106 U1AnnotationUtils::addDescriptionQualifier(ad, annDescription);
107 res.append(ad);
108 offset++;
109 } while (showOverlapped && offset <= maxOffset);
110 }
111 return res;
112 }
113
TandemFinder(const FindTandemsTaskSettings & _settings,const DNASequence & directSequence)114 TandemFinder::TandemFinder(const FindTandemsTaskSettings &_settings, const DNASequence &directSequence)
115 : Task(tr("Find tandems"), TaskFlags_FOSCOE),
116 settings(_settings), regionCount(0) {
117 if (settings.seqRegion.length == 0) {
118 settings.seqRegion = U2Region(0, directSequence.length());
119 }
120 startTime = GTimer::currentTimeMicros();
121 sequence = (char *)directSequence.constData() + settings.seqRegion.startPos;
122 }
123
124 class TF_WalkerConfig : public SequenceWalkerConfig {
125 public:
126 // TODO: check seqSize type compatibility!
TF_WalkerConfig(const char * _sequence,quint32 _seqSize,quint32 _chunkSize)127 TF_WalkerConfig(const char *_sequence, quint32 _seqSize, quint32 _chunkSize) {
128 seq = _sequence;
129 seqSize = _seqSize;
130 chunkSize = _chunkSize;
131 lastChunkExtraLen = _chunkSize / 2;
132 overlapSize = TandemFinder::maxCheckPeriod;
133 walkCircular = false;
134 }
135 };
136
prepare()137 void TandemFinder::prepare() {
138 Q_ASSERT(settings.minRepeatCount >= 2);
139 int chunkSize = 32 * 1024 * 1024; // optimized for 32Mb mem-chunk
140 addSubTask(new SequenceWalkerTask(TF_WalkerConfig(sequence, settings.seqRegion.length, chunkSize), this, tr("Find tandems"), TaskFlags_NR_FOSCOE));
141 }
142
run()143 void TandemFinder::run() {
144 algoLog.trace(tr("Find tandems finished %1").arg(GTimer::currentTimeMicros() - startTime));
145 }
146
onSubTaskFinished(Task * subTask)147 QList<Task *> TandemFinder::onSubTaskFinished(Task *subTask) {
148 if (qobject_cast<SequenceWalkerTask *>(subTask) != nullptr) {
149 setMaxParallelSubtasks(AppContext::getAppSettings()->getAppResourcePool()->getIdealThreadCount());
150 return regionTasks;
151 }
152 if (qobject_cast<TandemFinder_Region *>(subTask) != nullptr) {
153 TandemFinder_Region *regionTask = qobject_cast<TandemFinder_Region *>(subTask);
154 const quint64 offs = regionTask->getRegionOffset();
155 QMutexLocker foundTandemsLocker(&tandemsAccessMutex);
156 QList<Tandem> regionTandems = regionTask->getResult();
157 QMutableListIterator<Tandem> comTandIt(foundTandems);
158 foreach (Tandem t, regionTandems) {
159 t.offset += offs;
160 t.rightSide += offs;
161 while (comTandIt.hasNext() && comTandIt.peekNext() < t) {
162 comTandIt.next();
163 }
164 if (!comTandIt.hasNext() || t < comTandIt.peekNext()) {
165 comTandIt.insert(t);
166 } else {
167 comTandIt.next().extend(t);
168 }
169 }
170 }
171 return QList<Task *>();
172 }
173
onRegion(SequenceWalkerSubtask * t,TaskStateInfo & ti)174 void TandemFinder::onRegion(SequenceWalkerSubtask *t, TaskStateInfo &ti) {
175 if (ti.hasError()) {
176 return;
177 }
178 quint64 offs = t->getRegionSequence() - t->getGlobalConfig().seq;
179 QMutexLocker lock(&subtasksQueue); // TODO: fix me
180 regionTasks.append(new TandemFinder_Region(regionCount++, t->getRegionSequence(), t->getRegionSequenceLen(), offs, settings));
181 }
182
~TandemFinder_Region()183 TandemFinder_Region::~TandemFinder_Region() {
184 QMutexLocker tandemsAccessLocker(&tandemsAccessMutex);
185 }
186
onSubTaskFinished(Task * subTask)187 QList<Task *> TandemFinder_Region::onSubTaskFinished(Task *subTask) {
188 if (qobject_cast<ConcreteTandemFinder *>(subTask) != nullptr) {
189 subTask->cleanup();
190 }
191 return QList<Task *>();
192 }
193
prepare()194 void TandemFinder_Region::prepare() {
195 Q_ASSERT(settings.minPeriod <= settings.maxPeriod);
196 int period = 1;
197 for (; period <= 16; period = period * 2 + 1) {
198 if (period * 2 < settings.minPeriod || period > settings.maxPeriod || period >= seqSize)
199 continue;
200 addSubTask(new ExactSizedTandemFinder(sequence, seqSize, settings, period));
201 }
202 {
203 if (period > settings.maxPeriod)
204 return;
205 addSubTask(new LargeSizedTandemFinder(sequence, seqSize, settings, period));
206 }
207 }
208
addResult(const Tandem & tandem)209 void TandemFinder_Region::addResult(const Tandem &tandem) {
210 QMutexLocker tandemsAccessLocker(&tandemsAccessMutex);
211 foundTandems.append(tandem);
212 }
addResults(const QMap<Tandem,Tandem> & tandems)213 void TandemFinder_Region::addResults(const QMap<Tandem, Tandem> &tandems) {
214 QMutexLocker tandemsAccessLocker(&tandemsAccessMutex);
215 foundTandems.append(tandems.values());
216 }
217
ConcreteTandemFinder(QString taskName,const char * _sequence,const long _seqSize,const FindTandemsTaskSettings & _settings,const int _prefixLength)218 ConcreteTandemFinder::ConcreteTandemFinder(QString taskName, const char *_sequence, const long _seqSize, const FindTandemsTaskSettings &_settings, const int _prefixLength)
219 : Task(taskName, TaskFlags_FOSCOE), sequence(_sequence), seqSize(_seqSize), index(nullptr), suffixArray(nullptr),
220 settings(_settings), prefixLength(_prefixLength), suffArrSize(_seqSize - _prefixLength + 1) {
221 Q_ASSERT(settings.minRepeatCount > 1);
222 int suffArrMemory;
223 if (settings.algo == TSConstants::AlgoSuffixBinary) {
224 suffArrMemory = seqSize / 4 + seqSize * sizeof(quint32) + ((size_t)1 << qMin(prefixLength * 2, 24)) * sizeof(quint64) * 7 / 6;
225 } else {
226 suffArrMemory = seqSize * sizeof(quint32) * 2;
227 }
228 suffArrMemory = qMax(suffArrMemory / (1024 * 1024), 1); // in Mb
229 addTaskResource(TaskResourceUsage(RESOURCE_MEMORY, suffArrMemory, true));
230 }
231
prepare()232 void ConcreteTandemFinder::prepare() {
233 const quint32 *nuclTable = bitsTable.getBitMaskCharBits(DNAAlphabet_NUCL);
234 const quint32 bitMaskCharBitsNum = bitsTable.getBitMaskCharBitsNum(DNAAlphabet_NUCL);
235 // single thread approximation (re-estimated in some algorithms)
236 int arrayPercent = int((seqSize / double(seqSize + prefixLength)) * 100 / 5); // array creation time ~5 times faster than search
237
238 if (settings.algo == TSConstants::AlgoSuffix) {
239 // index with double sized match
240 CreateSArrayIndexTask *indexTask = new CreateSArrayIndexTask(sequence, seqSize, prefixLength, 'N', nuclTable, bitMaskCharBitsNum);
241 indexTask->setSubtaskProgressWeight(arrayPercent / 100.0F);
242 // TODO fix algorithm selection
243 if (qobject_cast<ExactSizedTandemFinder *>(this) != nullptr) {
244 addSubTask(indexTask);
245 }
246 }
247 }
248
onSubTaskFinished(Task * subTask)249 QList<Task *> ConcreteTandemFinder::onSubTaskFinished(Task *subTask) {
250 if (qobject_cast<CreateSArrayIndexTask *>(subTask) != nullptr) {
251 index = qobject_cast<const CreateSArrayIndexTask *>(subTask)->index;
252 }
253 return QList<Task *>();
254 }
255
cleanup()256 void ConcreteTandemFinder::cleanup() {
257 if (getSubtasks().size() == 0) {
258 return;
259 }
260 getSubtasks().first()->cleanup();
261 }
262
ExactSizedTandemFinder(const char * _sequence,const long _seqSize,const FindTandemsTaskSettings & _settings,const int _analysisSize)263 ExactSizedTandemFinder::ExactSizedTandemFinder(const char *_sequence, const long _seqSize, const FindTandemsTaskSettings &_settings, const int _analysisSize)
264 : ConcreteTandemFinder(tr("Find %1-period tandems").arg(_analysisSize), _sequence, _seqSize, _settings, _analysisSize * 2) {
265 }
266
~ExactSizedTandemFinder()267 ExactSizedTandemFinder::~ExactSizedTandemFinder() {
268
269 };
270
seqCast(const quint32 * suffArr,int ind)271 inline char *seqCast(const quint32 *suffArr, int ind) {
272 return reinterpret_cast<char *>(
273 suffArr[ind]);
274 }
275
run()276 void ExactSizedTandemFinder::run() {
277 if (seqSize < settings.minPeriod * settings.minRepeatCount) {
278 return;
279 }
280 if (seqSize < prefixLength) {
281 return;
282 }
283 int minPeriod = qMax(settings.minPeriod, prefixLength / 2);
284 int maxPeriod = qMin(settings.maxPeriod, prefixLength);
285 if (index == nullptr) {
286 try {
287 suffixArray = new SuffixArray(sequence, seqSize, prefixLength);
288 } catch (...) {
289 setError("Not enough memory");
290 return;
291 }
292 const BitMask &bitMask = suffixArray->getBitMask();
293 const quint32 *sArray = suffixArray->getArray();
294 quint32 *currentDiffPos = (quint32 *)sArray;
295 const quint32 *sArrayLast = sArray + suffArrSize - 1;
296 while (currentDiffPos < sArrayLast) {
297 int diff = currentDiffPos[1] - currentDiffPos[0];
298 if (diff >= minPeriod && diff <= maxPeriod) { // only our exact size
299 quint32 suffixOffset = qMax(1, (settings.minTandemSize - prefixLength) / diff);
300 if (currentDiffPos + suffixOffset <= sArrayLast && currentDiffPos[suffixOffset] - currentDiffPos[0] == suffixOffset * diff) {
301 if (bitMask[currentDiffPos[0]] == bitMask[currentDiffPos[suffixOffset]]) {
302 currentDiffPos = checkAndSpreadTandem_bv(currentDiffPos, currentDiffPos + suffixOffset, diff);
303 continue;
304 }
305 }
306 }
307 currentDiffPos++;
308 }
309 delete suffixArray; // TODO: remove all deletes
310 } else {
311 const quint32 *sArray = index->getSArray();
312 const quint32 *sArrayLast = const_cast<quint32 *>(sArray + index->getSArraySize() - 1);
313 quint32 *currentDiffPos = (quint32 *)sArray;
314
315 while (currentDiffPos < sArrayLast) {
316 const qint32 diff = currentDiffPos[1] - currentDiffPos[0];
317 if (diff >= minPeriod && diff <= maxPeriod) { // only our exact size
318 unsigned suffixOffset = qMax(1, signed(settings.minTandemSize - prefixLength) / diff);
319 if (currentDiffPos + suffixOffset > sArrayLast || currentDiffPos[suffixOffset] - currentDiffPos[0] != suffixOffset * diff) {
320 } else {
321 const char *seq0 = index->sarr2seq(currentDiffPos); // seqCast(currentDiffPos,0);
322 const char *seq1 = index->sarr2seq(currentDiffPos + suffixOffset); // seqCast(currentDiffPos,suffixOffset);
323 if (seq0 == nullptr || seq1 == nullptr) {
324 continue;
325 }
326 // in the other words prefix currentDiffPos[0]==currentDiffPos[suffixOffset]
327 if (comparePrefixChars(seq0, seq1)) {
328 currentDiffPos = checkAndSpreadTandem(currentDiffPos, currentDiffPos + suffixOffset, diff);
329 continue;
330 }
331 }
332 }
333 currentDiffPos++;
334 }
335 }
336
337 qobject_cast<TandemFinder_Region *>(getParentTask())->addResults(rawTandems);
338 }
339
comparePrefixChars(const char * first,const char * second)340 bool ExactSizedTandemFinder::comparePrefixChars(const char *first, const char *second) {
341 return strncmp(first, second, prefixLength) == 0;
342 }
checkAndSpreadTandem(const quint32 * tandemStartIndex,const quint32 * tandemLastIndex,quint32 repeatLen)343 quint32 *ExactSizedTandemFinder::checkAndSpreadTandem(const quint32 *tandemStartIndex, const quint32 *tandemLastIndex, quint32 repeatLen) {
344 const char *tandemStart = index->sarr2seq(tandemStartIndex); // seqCast(tandemStartIndex,0);
345 quint32 *arrRunner = (quint32 *)tandemLastIndex - 1;
346 {
347 const quint32 *sArrayLast = const_cast<quint32 *>(index->getSArray() + index->getSArraySize() - 1);
348 // run until distance become incorrect, it is incredible if we run far with changing prefix
349 do {
350 ++arrRunner;
351 } while (arrRunner < sArrayLast && arrRunner[1] - arrRunner[0] == repeatLen);
352 while (!comparePrefixChars(tandemStart, index->sarr2seq(arrRunner) /*seqCast(arrRunner,0)*/)) {
353 --arrRunner;
354 }
355 }
356
357 // if (index->getMaskedSequence()!=NULL){
358 if (false) {
359 } else {
360 // in this case seqCast(tandemStartIndex,1)==seqCast(tandemStartIndex,0)+repeatLen
361 const char *seqRunner = index->sarr2seq(arrRunner); // seqCast(arrRunner,0);
362 // move forward by 0.5-1.0 repeatlen to find exact lower bound
363 const char *seqEnd = sequence + seqSize;
364 while (seqRunner <= seqEnd - repeatLen && strncmp(tandemStart, seqRunner, repeatLen) == 0) {
365 seqRunner += repeatLen;
366 }
367
368 const quint32 size = seqRunner - tandemStart;
369 const Tandem tandem(tandemStart - sequence, repeatLen, size);
370 // check if there is already marked area
371 QMap<Tandem, Tandem>::iterator it = rawTandems.find(tandem);
372 if (it == rawTandems.end()) { // there are no close tandems
373 int tandemMinSize = qMax(settings.minTandemSize, settings.minRepeatCount * tandem.repeatLen);
374 if (tandem.size >= tandemMinSize) {
375 rawTandems.insert(tandem, tandem);
376 }
377 } else {
378 Tandem t = *it;
379 rawTandems.erase(it);
380 t.extend(tandem);
381 rawTandems.insert(t, t);
382 }
383 }
384 return arrRunner;
385 }
386
checkAndSpreadTandem_bv(const quint32 * tandemStartIndex,const quint32 * tandemLastIndex,quint32 repeatLen)387 quint32 *ExactSizedTandemFinder::checkAndSpreadTandem_bv(const quint32 *tandemStartIndex, const quint32 *tandemLastIndex, quint32 repeatLen) {
388 const BitMask &bitMask = suffixArray->getBitMask();
389 const quint32 tandemStart = tandemStartIndex[0];
390
391 const quint64 matchRepeat = bitMask[tandemStart];
392 quint32 *arrRunner = (quint32 *)tandemLastIndex - 1;
393 {
394 const quint32 *sArrayLast = suffixArray->getArray() + suffArrSize - 1;
395 // run until distance become incorrect, it is incredible if we would run far with changing prefix
396 do {
397 ++arrRunner;
398 } while (arrRunner < sArrayLast && arrRunner[1] - arrRunner[0] == repeatLen);
399 while (matchRepeat != bitMask[arrRunner[0]]) {
400 --arrRunner;
401 }
402 }
403
404 // if (index->getMaskedSequence()!=NULL){
405 if (false) {
406 } else {
407 // in this case seqCast(tandemStartIndex,1)==seqCast(tandemStartIndex,0)+repeatLen
408 quint32 seqRunner = arrRunner[0];
409 // move forward by 0.5-1.0 repeatlen to find exact lower bound
410 const quint32 &seqEnd = seqSize;
411 const quint64 periodMask = ~(~0ull >> (2 * repeatLen));
412 while (seqRunner <= seqEnd - repeatLen && ((matchRepeat ^ bitMask[seqRunner]) & periodMask) == 0) {
413 seqRunner += repeatLen;
414 }
415
416 const quint32 size = seqRunner - tandemStart;
417 const Tandem tandem(tandemStart, repeatLen, size);
418 // check if there is already marked area
419 QMap<Tandem, Tandem>::iterator it = rawTandems.find(tandem);
420 if (it == rawTandems.end()) { // there are no close tandems
421 int tandemMinSize = qMax(settings.minTandemSize, settings.minRepeatCount * tandem.repeatLen);
422 if (tandem.size >= tandemMinSize) {
423 rawTandems.insert(tandem, tandem);
424 }
425 } else {
426 Tandem t = *it;
427 rawTandems.erase(it);
428 t.extend(tandem);
429 rawTandems.insert(t, t);
430 }
431 }
432 return arrRunner;
433 }
434
LargeSizedTandemFinder(const char * _sequence,const long _seqSize,const FindTandemsTaskSettings & _settings,const int _analysisSize)435 LargeSizedTandemFinder::LargeSizedTandemFinder(const char *_sequence, const long _seqSize, const FindTandemsTaskSettings &_settings, const int _analysisSize)
436 : ConcreteTandemFinder(tr("Find big-period tandems"), _sequence, _seqSize, _settings, _analysisSize) {
437 }
~LargeSizedTandemFinder()438 LargeSizedTandemFinder::~LargeSizedTandemFinder() {
439 }
440
run()441 void LargeSizedTandemFinder::run() {
442 if (seqSize < settings.minPeriod * settings.minRepeatCount) {
443 return;
444 }
445 if (seqSize < prefixLength) {
446 return;
447 }
448 int minPeriod = qMax(settings.minPeriod, prefixLength);
449 int maxPeriod = settings.maxPeriod;
450 if (index == nullptr) {
451 try {
452 suffixArray = new SuffixArray(sequence, seqSize, prefixLength);
453 } catch (...) {
454 setError("Not enough memory");
455 return;
456 }
457 const BitMask &bitMask = suffixArray->getBitMask();
458 const quint32 *sArray = suffixArray->getArray();
459 quint32 *currentDiffPos = (quint32 *)sArray;
460 const quint32 *sArrayLast = sArray + suffArrSize - 1;
461 while (currentDiffPos < sArrayLast) {
462 int diff = currentDiffPos[1] - currentDiffPos[0];
463 if (diff >= minPeriod && diff <= maxPeriod) {
464 quint32 firstSuffixPos = currentDiffPos[0];
465 quint32 secondSuffixPos = currentDiffPos[1];
466 const quint32 endSuffixPos = secondSuffixPos;
467 bool correctRepeat = false;
468 do {
469 correctRepeat = bitMask[firstSuffixPos] == bitMask[secondSuffixPos];
470 firstSuffixPos += prefixLength;
471 secondSuffixPos += prefixLength;
472 } while (correctRepeat && firstSuffixPos < endSuffixPos);
473 if (correctRepeat) {
474 currentDiffPos = checkAndSpreadTandem_bv(currentDiffPos, currentDiffPos + 1, diff);
475 continue;
476 }
477 }
478 currentDiffPos++;
479 }
480 delete suffixArray;
481 } else {
482 const quint32 *sArray = index->getSArray();
483 const quint32 *sArrayLast = const_cast<quint32 *>(sArray + index->getSArraySize() - 1);
484 quint32 *currentDiffPos = (quint32 *)sArray;
485
486 // TODO: rewrite this code
487 while (currentDiffPos < sArrayLast) {
488 const qint32 diff = currentDiffPos[1] - currentDiffPos[0];
489 if (diff >= minPeriod && diff <= maxPeriod) { // skip already found tandems
490 unsigned suffixOffset = static_cast<unsigned>(qMax(1, signed(settings.minTandemSize - prefixLength) / diff));
491 if (currentDiffPos + suffixOffset > sArrayLast || currentDiffPos[suffixOffset] - currentDiffPos[0] != suffixOffset * diff) {
492 // currentDiffPos += suffixOffset;continue;
493 } else {
494 const char *seq0 = index->sarr2seq(currentDiffPos); // seqCast(currentDiffPos,0);
495 const char *seq1 = index->sarr2seq(currentDiffPos + suffixOffset); // seqCast(currentDiffPos,suffixOffset);
496 // in the other words prefix currentDiffPos[0]==currentDiffPos[1]
497 if (comparePrefixChars(seq0, seq1)) {
498 currentDiffPos = checkAndSpreadTandem(currentDiffPos, currentDiffPos + suffixOffset, diff);
499 continue;
500 }
501 }
502 }
503 currentDiffPos++;
504 }
505 }
506
507 qobject_cast<TandemFinder_Region *>(getParentTask())->addResults(rawTandems);
508 }
comparePrefixChars(const char * first,const char * second)509 bool LargeSizedTandemFinder::comparePrefixChars(const char *first, const char *second) {
510 return strncmp(first, second, prefixLength) == 0;
511 }
checkAndSpreadTandem(const quint32 * tandemStartIndex,const quint32 * tandemLastIndex,const unsigned repeatLen)512 quint32 *LargeSizedTandemFinder::checkAndSpreadTandem(const quint32 *tandemStartIndex, const quint32 *tandemLastIndex, const unsigned repeatLen) {
513 const char *tandemStart = index->sarr2seq(tandemStartIndex); // seqCast(tandemStartIndex,0);
514
515 quint32 *arrRunner = (quint32 *)tandemLastIndex - 1;
516 {
517 const quint32 *sArrayLast = const_cast<quint32 *>(index->getSArray() + index->getSArraySize() - 1);
518 // run until distance become incorrect, it is incredible if we run far with changing prefix
519 do {
520 ++arrRunner;
521 } while (arrRunner < sArrayLast && arrRunner[1] - arrRunner[0] == repeatLen);
522 while (!comparePrefixChars(tandemStart, index->sarr2seq(arrRunner) /*seqCast(arrRunner,0)*/)) {
523 --arrRunner;
524 }
525 }
526
527 // in this case seqCast(tandemStartIndex,1)==seqCast(tandemStartIndex,0)+repeatLen
528 const char *seqRunner = index->sarr2seq(arrRunner); // seqCast(arrRunner,0);
529 // move forward by 0.5-1.0 of repeat len to find exact lower bound
530 const char *seqEnd = sequence + seqSize;
531 while (seqRunner <= seqEnd - repeatLen && strncmp(tandemStart, seqRunner, repeatLen) == 0) {
532 seqRunner += repeatLen;
533 }
534
535 const quint32 size = seqRunner - tandemStart;
536 const Tandem tandem(tandemStart - sequence, repeatLen, size);
537 // check if there is already marked area
538 QMap<Tandem, Tandem>::iterator it = rawTandems.find(tandem);
539 if (it == rawTandems.end()) { // there are no close tandems
540 rawTandems.insert(tandem, tandem);
541 } else {
542 Tandem t = *it;
543 rawTandems.erase(it);
544 t.extend(tandem);
545 rawTandems.insert(t, t);
546 }
547
548 return arrRunner;
549 }
550
checkAndSpreadTandem_bv(const quint32 * tandemStartIndex,const quint32 * tandemLastIndex,const unsigned repeatLen)551 quint32 *LargeSizedTandemFinder::checkAndSpreadTandem_bv(const quint32 *tandemStartIndex, const quint32 *tandemLastIndex, const unsigned repeatLen) {
552 const BitMask &bitMask = suffixArray->getBitMask();
553 const quint32 tandemStart = tandemStartIndex[0];
554 quint32 tandemLast = tandemLastIndex[0];
555 const quint32 &seqEnd = seqSize;
556 // repeatLen > prefixLength
557 while (tandemLast < seqEnd - prefixLength && bitMask[tandemLast] == bitMask[tandemLast - repeatLen]) {
558 tandemLast += prefixLength;
559 }
560
561 const quint32 size = tandemLast - tandemStart;
562 const Tandem tandem(tandemStart, repeatLen, size);
563 // check if there is already marked area
564 QMap<Tandem, Tandem>::iterator it = rawTandems.find(tandem);
565 if (it == rawTandems.end()) { // there are no close tandems
566 int tandemMinSize = qMax(settings.minTandemSize, 2 * tandem.repeatLen);
567 if (tandem.size >= tandemMinSize) {
568 rawTandems.insert(tandem, tandem);
569 }
570 } else {
571 Tandem t = *it;
572 rawTandems.erase(it);
573 t.extend(tandem);
574 rawTandems.insert(t, t);
575 }
576 return const_cast<quint32 *>(tandemStartIndex + size / repeatLen);
577 }
578
579 } // namespace U2
580