1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include <hmmer2/funcs.h>
23
24 #include <U2Core/AppContext.h>
25 #include <U2Core/Counter.h>
26 #include <U2Core/DNAAlphabet.h>
27 #include <U2Core/DNATranslation.h>
28 #include <U2Core/SequenceWalkerTask.h>
29
30 #include "HMMIO.h"
31 #include "HMMSearchTask.h"
32 #include "TaskLocalStorage.h"
33
34 namespace U2 {
35
HMMSearchTask(plan7_s * _hmm,const DNASequence & _seq,const UHMMSearchSettings & s)36 HMMSearchTask::HMMSearchTask(plan7_s *_hmm, const DNASequence &_seq, const UHMMSearchSettings &s)
37 : Task("", TaskFlag_NoRun),
38 hmm(_hmm), seq(_seq), settings(s), complTrans(nullptr), aminoTrans(nullptr), fName(""), readHMMTask(nullptr), swTask(nullptr) {
39 setTaskName(tr("HMM search with '%1'").arg(hmm->name));
40 GCOUNTER(cvar, "HMM2 Search");
41 }
42
HMMSearchTask(const QString & hFile,const DNASequence & _seq,const UHMMSearchSettings & s)43 HMMSearchTask::HMMSearchTask(const QString &hFile, const DNASequence &_seq, const UHMMSearchSettings &s)
44 : Task("", TaskFlag_NoRun),
45 hmm(nullptr), seq(_seq), settings(s), complTrans(nullptr), aminoTrans(nullptr), fName(hFile), readHMMTask(nullptr), swTask(nullptr) {
46 setTaskName(tr("HMM Search"));
47 GCOUNTER(cvar, "HMM2 Search");
48 }
49
prepare()50 void HMMSearchTask::prepare() {
51 if (hasError()) {
52 return;
53 }
54
55 if (hmm != nullptr) {
56 swTask = getSWSubtask();
57 if (swTask == nullptr) {
58 assert(hasError());
59 return;
60 }
61 addSubTask(swTask);
62 } else {
63 readHMMTask = new HMMReadTask(fName);
64 addSubTask(readHMMTask);
65 }
66 // if (!checkAlphabets(hmm->atype, seq.alphabet, complTrans, aminoTrans)) {
67 // return;
68 // }
69 // SequenceWalkerConfig config;
70 // config.seq = seq.seq.data();
71 // config.seqSize = seq.seq.size();
72 // config.complTrans = complTrans;
73 // config.strandToWalk = complTrans == NULL ? StrandOption_DirectOnly : StrandOption_Both;
74 // config.aminoTrans = aminoTrans;
75 // config.overlapSize = 2 * hmm->M;
76 // config.chunkSize = qMax(6 * hmm->M, settings.searchChunkSize);
77 // if (settings.extraLen == -1) {
78 // config.lastChunkExtraLen = config.chunkSize / 2;
79 // } else {
80 // config.lastChunkExtraLen = settings.extraLen;
81 // }
82 //
83 // config.nThreads = MAX_PARALLEL_SUBTASKS_AUTO;
84 //
85 // addSubTask(new SequenceWalkerTask(config, this, tr("parallel_hmm_search_task")));
86 }
87
onRegion(SequenceWalkerSubtask * t,TaskStateInfo & si)88 void HMMSearchTask::onRegion(SequenceWalkerSubtask *t, TaskStateInfo &si) {
89 const char *localSeq = t->getRegionSequence();
90 int localSeqSize = t->getRegionSequenceLen();
91 bool wasCompl = t->isDNAComplemented();
92 bool wasAmino = t->isAminoTranslated();
93 U2Region globalReg = t->getGlobalRegion();
94
95 // set TLS data
96 TaskLocalData::createHMMContext(t->getTaskId(), true);
97
98 QList<UHMMSearchResult> sresults;
99 try {
100 sresults = UHMMSearch::search(hmm, localSeq, localSeqSize, settings, si);
101 } catch (const HMMException &e) {
102 stateInfo.setError(e.error);
103 }
104 if (si.hasError()) {
105 stateInfo.setError(si.getError());
106 }
107 if (sresults.isEmpty() || stateInfo.cancelFlag || stateInfo.hasError()) {
108 TaskLocalData::freeHMMContext(t->getTaskId());
109 return;
110 }
111
112 // convert all UHMMSearchResults into HMMSearchTaskResult
113 QMutexLocker locker(&lock);
114 int halfOverlap = hmm->M;
115 foreach (const UHMMSearchResult &sr, sresults) {
116 HMMSearchTaskResult r;
117 r.evalue = sr.evalue;
118 r.score = sr.score;
119 r.onCompl = wasCompl;
120 r.onAmino = wasAmino;
121 int resLen = wasAmino ? sr.r.length * 3 : sr.r.length;
122 int resStart = wasAmino ? sr.r.startPos * 3 : sr.r.startPos;
123 if (wasCompl) {
124 resStart = globalReg.length - resStart - resLen;
125 }
126 r.r.startPos = globalReg.startPos + resStart;
127 r.r.length = resLen;
128 if (t->intersectsWithOverlaps(r.r)) {
129 // don't add to overlaps if it must be found in 2 regions
130 bool add = true;
131 if (!r.onCompl && t->hasRightOverlap()) { // check if will be found in a next chunk
132 U2Region nextChunkRegion(globalReg.endPos() - halfOverlap, halfOverlap);
133 add = !nextChunkRegion.contains(r.r);
134 } else if (r.onCompl && t->hasLeftOverlap()) { // check if will found in a prev chunk
135 U2Region prevChunkRegion(globalReg.startPos, halfOverlap);
136 add = !prevChunkRegion.contains(r.r);
137 }
138 if (add) {
139 r.borderResult = (t->hasLeftOverlap() && r.r.startPos == globalReg.startPos) || (t->hasRightOverlap() && r.r.endPos() == globalReg.endPos());
140 overlaps.append(r);
141 }
142 } else {
143 results.append(r);
144 }
145 }
146
147 TaskLocalData::freeHMMContext(t->getTaskId());
148 }
149
HMMSearchResult_LessThan(const HMMSearchTaskResult & r1,const HMMSearchTaskResult & r2)150 static bool HMMSearchResult_LessThan(const HMMSearchTaskResult &r1, const HMMSearchTaskResult &r2) {
151 if (r1.evalue == r2.evalue) {
152 if (r1.r == r2.r) {
153 if (r1.onCompl == r2.onCompl) {
154 return &r1 < &r2;
155 }
156 return r2.onCompl;
157 }
158 return r1.r < r2.r;
159 }
160 return r1.evalue < r2.evalue;
161 }
162
report()163 Task::ReportResult HMMSearchTask::report() {
164 if (hasError()) {
165 return ReportResult_Finished;
166 }
167
168 // postprocess overlaps
169 int maxCommonLen = hmm->M / 2; // if 2 results have common part of 'maxCommonLen' or greater -> select best one
170 for (int i = 0; i < overlaps.count(); i++) {
171 HMMSearchTaskResult &r1 = overlaps[i];
172 if (r1.filtered) {
173 continue;
174 }
175 for (int j = i + 1; j < overlaps.count(); j++) {
176 HMMSearchTaskResult &r2 = overlaps[j];
177 if (r2.filtered) {
178 continue;
179 }
180 if (r1.onCompl != r2.onCompl) { // check both regions are on the same strand
181 continue;
182 }
183 if (r1.onAmino) { // check both regions have the same amino frame
184 int s1 = r1.onCompl ? r1.r.endPos() % 3 : r1.r.startPos % 3;
185 int s2 = r2.onCompl ? r2.r.endPos() % 3 : r2.r.startPos % 3;
186 if (s1 != s2) {
187 continue;
188 }
189 }
190 if (r1.r.contains(r2.r) && r1.r != r2.r) {
191 r2.filtered = true;
192 } else if (r2.r.contains(r1.r) && r1.r != r2.r) {
193 r1.filtered = true;
194 break;
195 } else if (r1.r.intersect(r2.r).length >= maxCommonLen) {
196 bool useR1 = r2.score <= r1.score;
197 if (r1.score == r2.score && r1.evalue == r2.evalue && r1.borderResult && !r2.borderResult) {
198 useR1 = false;
199 }
200 if (useR1) {
201 r2.filtered = true;
202 } else {
203 r1.filtered = true;
204 break;
205 }
206 }
207 }
208 }
209
210 foreach (const HMMSearchTaskResult &r, overlaps) {
211 if (!r.filtered) {
212 results.append(r);
213 }
214 }
215
216 // sort results by E-value
217 std::sort(results.begin(), results.end(), HMMSearchResult_LessThan);
218 return ReportResult_Finished;
219 }
220
getResultsAsAnnotations(U2FeatureType type,const QString & name) const221 QList<SharedAnnotationData> HMMSearchTask::getResultsAsAnnotations(U2FeatureType type, const QString &name) const {
222 QList<SharedAnnotationData> annotations;
223 foreach (const HMMSearchTaskResult &hmmRes, results) {
224 SharedAnnotationData a(new AnnotationData);
225 a->type = type;
226 a->name = name;
227 a->setStrand(hmmRes.onCompl ? U2Strand::Complementary : U2Strand::Direct);
228 a->location->regions << hmmRes.r;
229
230 QString str; /*add zeros at begin of evalue exponent part, so exponent part must contains 3 numbers*/
231 str.sprintf("%.2g", ((double)hmmRes.evalue));
232 QRegExp rx("\\+|\\-.+");
233 int pos = rx.indexIn(str, 0);
234 if (pos != -1) {
235 str.insert(pos + 1, "0");
236 }
237 QString info = hmm->name;
238 if (hmm->flags & PLAN7_ACC) {
239 info += QString().sprintf("\nAccession number in PFAM : %s", hmm->acc);
240 }
241 if (hmm->flags & PLAN7_DESC) {
242 info += QString().sprintf("\n%s", hmm->desc);
243 }
244 if (!info.isEmpty()) {
245 a->qualifiers.append(U2Qualifier("HMM-model", info));
246 }
247 // a->qualifiers.append(U2Qualifier("E-value", QString().sprintf("%.2lg", ((double) hmmRes.evalue))));
248 a->qualifiers.append(U2Qualifier("E-value", str));
249 a->qualifiers.append(U2Qualifier("Score", QString().sprintf("%.1f", hmmRes.score)));
250 annotations.append(a);
251 }
252 return annotations;
253 }
254
checkAlphabets(int hmmAlType,const DNAAlphabet * seqAl,DNATranslation * & complTrans,DNATranslation * & aminoTrans)255 bool HMMSearchTask::checkAlphabets(int hmmAlType, const DNAAlphabet *seqAl, DNATranslation *&complTrans, DNATranslation *&aminoTrans) {
256 assert(stateInfo.getError().isEmpty());
257 DNAAlphabetType hmmAl = HMMIO::convertHMMAlphabet(hmmAlType);
258 if (hmmAl == DNAAlphabet_RAW) {
259 stateInfo.setError(tr("Invalid HMM alphabet!"));
260 return false;
261 }
262 if (seqAl->isRaw()) {
263 stateInfo.setError(tr("Invalid sequence alphabet!"));
264 return false;
265 }
266
267 complTrans = NULL;
268 aminoTrans = NULL;
269 if (seqAl->isNucleic()) {
270 DNATranslationRegistry *tr = AppContext::getDNATranslationRegistry();
271 DNATranslation *complT = tr->lookupComplementTranslation(seqAl);
272 if (complT != NULL) {
273 complTrans = complT;
274 }
275 if (hmmAl == DNAAlphabet_AMINO) {
276 QList<DNATranslation *> aminoTs = tr->lookupTranslation(seqAl, DNATranslationType_NUCL_2_AMINO);
277 if (!aminoTs.empty()) {
278 aminoTrans = tr->getStandardGeneticCodeTranslation(seqAl);
279 }
280 }
281 } else {
282 assert(seqAl->isAmino());
283 }
284
285 // check the result;
286 if (hmmAl == DNAAlphabet_AMINO) {
287 if (seqAl->isAmino()) {
288 assert(complTrans == NULL && aminoTrans == NULL);
289 } else {
290 if (aminoTrans == NULL) {
291 stateInfo.setError(tr("Amino translation is not available for the sequence alphabet!"));
292 return false;
293 }
294 }
295 }
296
297 return true;
298 }
299
getSWSubtask()300 SequenceWalkerTask *HMMSearchTask::getSWSubtask() {
301 assert(!hasError());
302 assert(NULL != hmm);
303
304 if (!checkAlphabets(hmm->atype, seq.alphabet, complTrans, aminoTrans)) {
305 return NULL;
306 }
307 SequenceWalkerConfig config;
308 config.seq = seq.seq.data();
309 config.seqSize = seq.seq.size();
310 config.complTrans = complTrans;
311 config.strandToWalk = complTrans == NULL ? StrandOption_DirectOnly : StrandOption_Both;
312 config.aminoTrans = aminoTrans;
313 config.overlapSize = 2 * hmm->M;
314 config.chunkSize = qMax(6 * hmm->M, settings.searchChunkSize);
315 if (settings.extraLen == -1) {
316 config.lastChunkExtraLen = config.chunkSize / 2;
317 } else {
318 config.lastChunkExtraLen = settings.extraLen;
319 }
320 config.walkCircular = false;
321
322 config.nThreads = MAX_PARALLEL_SUBTASKS_AUTO;
323
324 return new SequenceWalkerTask(config, this, tr("Parallel HMM search"));
325 }
326
onSubTaskFinished(Task * subTask)327 QList<Task *> HMMSearchTask::onSubTaskFinished(Task *subTask) {
328 assert(NULL != subTask);
329 QList<Task *> res;
330 if (subTask->hasError()) {
331 stateInfo.setError(subTask->getError());
332 return res;
333 }
334
335 if (readHMMTask == subTask) {
336 hmm = readHMMTask->getHMM();
337 swTask = getSWSubtask();
338 if (NULL == swTask) {
339 assert(hasError());
340 return res;
341 }
342 res << swTask;
343 } else {
344 if (swTask != subTask) {
345 assert(0 && "undefined_subtask_finished");
346 }
347 }
348
349 return res;
350 }
351
352 } // namespace U2
353