1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #ifndef RF_SARRAYTANDEMFINDER_H
23 #define RF_SARRAYTANDEMFINDER_H
24 
25 #include <QMap>
26 #include <QMutex>
27 
28 #include <U2Algorithm/BitsTable.h>
29 #include <U2Algorithm/SArrayIndex.h>
30 
31 #include <U2Core/AnnotationData.h>
32 #include <U2Core/AppResources.h>
33 #include <U2Core/DNASequence.h>
34 #include <U2Core/GObjectReference.h>
35 #include <U2Core/SequenceWalkerTask.h>
36 #include <U2Core/Task.h>
37 
38 #include "RFBase.h"
39 #include "RF_SuffixArray.h"
40 
41 namespace U2 {
42 
43 namespace TSConstants {
44 enum TSAlgo { AlgoSuffix = 0,
45               AlgoSuffixBinary = 1 };
46 enum TSPreset { PresetAll = 0,
47                 PresetMicro = 1,
48                 PresetMini = 2,
49                 PresetBigPeriod = 3,
50                 PresetCustom = 4 };
51 }  // namespace TSConstants
52 
53 class FindTandemsTaskSettings {
54 public:
55     static const int DEFAULT_MIN_TANDEM_SIZE;
56     static const int DEFAULT_MIN_REPEAT_COUNT;
57     static const TSConstants::TSAlgo DEFAULT_ALGO = TSConstants::AlgoSuffixBinary;
58 
59 public:
FindTandemsTaskSettings()60     FindTandemsTaskSettings()
61         : minPeriod(1), maxPeriod(INT_MAX), minTandemSize(DEFAULT_MIN_TANDEM_SIZE), minRepeatCount(DEFAULT_MIN_REPEAT_COUNT),
62           accuracy(0), maxResults(10 * 1000 * 100), reportSeqShift(0), showOverlappedTandems(false), algo(DEFAULT_ALGO), nThreads(MAX_PARALLEL_SUBTASKS_AUTO) {
63     }
64 
65     int minPeriod;
66     int maxPeriod;
67     int minTandemSize;
68     int minRepeatCount;
69     int accuracy;
70     int maxResults;
71     qint64 reportSeqShift;
72     U2Region seqRegion;
73     bool showOverlappedTandems;
74 
75     TSConstants::TSAlgo algo;
76     int nThreads;
77 };
78 
79 class FindTandemsToAnnotationsTask : public Task {
80     Q_OBJECT
81 public:
82     FindTandemsToAnnotationsTask(const FindTandemsTaskSettings &s, const DNASequence &seq, const QString &annName, const QString &groupName, const QString &annDescription, const GObjectReference &annObjRef);
83     FindTandemsToAnnotationsTask(const FindTandemsTaskSettings &s, const DNASequence &seq);
84 
85     QList<Task *> onSubTaskFinished(Task *subTask);
86     QList<SharedAnnotationData> importTandemAnnotations(const QList<Tandem> &tandems, qint64 seqStart, const bool showOverlapped);
87 
getResult()88     QList<SharedAnnotationData> getResult() const {
89         return result;
90     }
91 
92 private:
93     bool saveAnns;
94     DNASequence mainSeq;
95     QString annName;
96     QString annGroup;
97     const QString annDescription;
98     GObjectReference annObjRef;
99 
100     QList<SharedAnnotationData> result;
101     const FindTandemsTaskSettings s;
102 };
103 
104 class TandemFinder : public Task, public SequenceWalkerCallback {
105     Q_OBJECT
106 public:
107     TandemFinder(const FindTandemsTaskSettings &s, const DNASequence &seq);
108     void prepare();
109     void run();
getResults()110     const QList<Tandem> &getResults() {
111         return foundTandems;
112     }
getSettings()113     const FindTandemsTaskSettings &getSettings() const {
114         return settings;
115     }
116     const static quint32 maxCheckPeriod = 1024;  // max period is 1k
117 protected:
118     // main sequence
119     char *sequence;
120     FindTandemsTaskSettings settings;
121     QMutex tandemsAccessMutex;
122     QList<Tandem> foundTandems;
123     QList<Task *> onSubTaskFinished(Task *subTask);
124     void onRegion(SequenceWalkerSubtask *t, TaskStateInfo &ti);
125 
126 private:
127     QMutex subtasksQueue;
128     quint32 regionCount;
129     quint64 startTime;
130     QList<Task *> regionTasks;
131 };
132 
133 class TandemFinder_Region : public Task {
134     Q_OBJECT
135 public:
TandemFinder_Region(const int regionId,const char * _sequence,const quint32 _seqSize,const quint64 _regionOffset,const FindTandemsTaskSettings & _settings)136     TandemFinder_Region(const int regionId, const char *_sequence, const quint32 _seqSize, const quint64 _regionOffset, const FindTandemsTaskSettings &_settings)
137         : Task(tr("Find tandems in %1 region").arg(regionId), TaskFlags_NR_FOSCOE),
138           sequence(_sequence), seqSize(_seqSize), id(regionId), regionOffset(_regionOffset), settings(_settings) {
139     }
140 
141     ~TandemFinder_Region();
142 
getResult()143     const QList<Tandem> getResult() {
144         QMutexLocker tandemsAccessLocker(&tandemsAccessMutex);
145         return foundTandems;
146     };
getRegionId()147     quint64 getRegionId() const {
148         return id;
149     }
getRegionOffset()150     quint64 getRegionOffset() const {
151         return regionOffset;
152     }
153     void prepare();
154 
155 protected:
156     // main sequence
157     const char *sequence;
158     const long seqSize;
159 
160     QList<Task *> onSubTaskFinished(Task *subTask);
161 
162 private:
163     const int id;
164     const quint64 regionOffset;
165 
166     const FindTandemsTaskSettings &settings;
167     QList<Tandem> foundTandems;
168     QMutex tandemsAccessMutex;
169     friend class ExactSizedTandemFinder;
170     friend class LargeSizedTandemFinder;
171     void addResult(const Tandem &tandem);
172     void addResults(const QMap<Tandem, Tandem> &tandems);
173 };
174 
175 class ConcreteTandemFinder : public Task {
176     Q_OBJECT
177 public:
178     ConcreteTandemFinder(QString taskName, const char *_sequence, const long _seqSize, const FindTandemsTaskSettings &_settings, const int _analysisSize);
~ConcreteTandemFinder()179     ~ConcreteTandemFinder() {};
180 
181     void prepare();
182     void cleanup();
183 
184 protected:
185     // main sequence
186     const char *sequence;
187     const int seqSize;
188 
189     SArrayIndex *index;
190     SuffixArray *suffixArray;
191     const FindTandemsTaskSettings &settings;
192     const int prefixLength;
193 
194     const quint32 suffArrSize;
195     QList<Task *> onSubTaskFinished(Task *subTask);
196 
197 private:
198     const BitsTable bitsTable;
199 
200 protected:
201     QMap<Tandem, Tandem> rawTandems;
202 };
203 
204 class ExactSizedTandemFinder : public ConcreteTandemFinder {
205     Q_OBJECT
206 public:
207     ExactSizedTandemFinder(const char *_sequence, const long _seqSize, const FindTandemsTaskSettings &_settings, const int _analysisSize);
208     ~ExactSizedTandemFinder();
209 
210     void run();
211 
212 private:
213     quint32 *checkAndSpreadTandem(const quint32 *tandemStart, const quint32 *tandemLast, quint32 repeatLen);
214     quint32 *checkAndSpreadTandem_bv(const quint32 *tandemStart, const quint32 *tandemLast, quint32 repeatLen);
215     bool comparePrefixChars(const char *, const char *);
216 };
217 
218 class LargeSizedTandemFinder : public ConcreteTandemFinder {
219     Q_OBJECT
220 public:
221     LargeSizedTandemFinder(const char *_sequence, const long _seqSize, const FindTandemsTaskSettings &_settings, const int _analysisSize);
222     ~LargeSizedTandemFinder();
223 
224     void run();
225 
226 private:
227     quint32 *checkAndSpreadTandem(const quint32 *tandemStart, const quint32 *tandemLast, const unsigned repeatLen);
228     quint32 *checkAndSpreadTandem_bv(const quint32 *tandemStart, const quint32 *tandemLast, const unsigned repeatLen);
229     bool comparePrefixChars(const char *, const char *);
230 };
231 
232 }  // namespace U2
233 
234 #endif  // RF_SARRAYTANDEMFINDER_H
235