1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include <hmmer2/funcs.h>
23 
24 #include <U2Core/AppContext.h>
25 #include <U2Core/Counter.h>
26 #include <U2Core/DNAAlphabet.h>
27 #include <U2Core/DNATranslation.h>
28 #include <U2Core/SequenceWalkerTask.h>
29 
30 #include "HMMIO.h"
31 #include "HMMSearchTask.h"
32 #include "TaskLocalStorage.h"
33 
34 namespace U2 {
35 
HMMSearchTask(plan7_s * _hmm,const DNASequence & _seq,const UHMMSearchSettings & s)36 HMMSearchTask::HMMSearchTask(plan7_s *_hmm, const DNASequence &_seq, const UHMMSearchSettings &s)
37     : Task("", TaskFlag_NoRun),
38       hmm(_hmm), seq(_seq), settings(s), complTrans(nullptr), aminoTrans(nullptr), fName(""), readHMMTask(nullptr), swTask(nullptr) {
39     setTaskName(tr("HMM search with '%1'").arg(hmm->name));
40     GCOUNTER(cvar, "HMM2 Search");
41 }
42 
HMMSearchTask(const QString & hFile,const DNASequence & _seq,const UHMMSearchSettings & s)43 HMMSearchTask::HMMSearchTask(const QString &hFile, const DNASequence &_seq, const UHMMSearchSettings &s)
44     : Task("", TaskFlag_NoRun),
45       hmm(nullptr), seq(_seq), settings(s), complTrans(nullptr), aminoTrans(nullptr), fName(hFile), readHMMTask(nullptr), swTask(nullptr) {
46     setTaskName(tr("HMM Search"));
47     GCOUNTER(cvar, "HMM2 Search");
48 }
49 
prepare()50 void HMMSearchTask::prepare() {
51     if (hasError()) {
52         return;
53     }
54 
55     if (hmm != nullptr) {
56         swTask = getSWSubtask();
57         if (swTask == nullptr) {
58             assert(hasError());
59             return;
60         }
61         addSubTask(swTask);
62     } else {
63         readHMMTask = new HMMReadTask(fName);
64         addSubTask(readHMMTask);
65     }
66     //     if (!checkAlphabets(hmm->atype, seq.alphabet, complTrans, aminoTrans)) {
67     //         return;
68     //     }
69     //     SequenceWalkerConfig config;
70     //     config.seq = seq.seq.data();
71     //     config.seqSize = seq.seq.size();
72     //     config.complTrans = complTrans;
73     //     config.strandToWalk = complTrans == NULL ? StrandOption_DirectOnly : StrandOption_Both;
74     //     config.aminoTrans = aminoTrans;
75     //     config.overlapSize = 2 * hmm->M;
76     //     config.chunkSize = qMax(6 * hmm->M, settings.searchChunkSize);
77     //     if (settings.extraLen == -1) {
78     //         config.lastChunkExtraLen = config.chunkSize / 2;
79     //     } else {
80     //         config.lastChunkExtraLen = settings.extraLen;
81     //     }
82     //
83     //     config.nThreads = MAX_PARALLEL_SUBTASKS_AUTO;
84     //
85     //     addSubTask(new SequenceWalkerTask(config, this, tr("parallel_hmm_search_task")));
86 }
87 
onRegion(SequenceWalkerSubtask * t,TaskStateInfo & si)88 void HMMSearchTask::onRegion(SequenceWalkerSubtask *t, TaskStateInfo &si) {
89     const char *localSeq = t->getRegionSequence();
90     int localSeqSize = t->getRegionSequenceLen();
91     bool wasCompl = t->isDNAComplemented();
92     bool wasAmino = t->isAminoTranslated();
93     U2Region globalReg = t->getGlobalRegion();
94 
95     // set TLS data
96     TaskLocalData::createHMMContext(t->getTaskId(), true);
97 
98     QList<UHMMSearchResult> sresults;
99     try {
100         sresults = UHMMSearch::search(hmm, localSeq, localSeqSize, settings, si);
101     } catch (const HMMException &e) {
102         stateInfo.setError(e.error);
103     }
104     if (si.hasError()) {
105         stateInfo.setError(si.getError());
106     }
107     if (sresults.isEmpty() || stateInfo.cancelFlag || stateInfo.hasError()) {
108         TaskLocalData::freeHMMContext(t->getTaskId());
109         return;
110     }
111 
112     // convert all UHMMSearchResults into HMMSearchTaskResult
113     QMutexLocker locker(&lock);
114     int halfOverlap = hmm->M;
115     foreach (const UHMMSearchResult &sr, sresults) {
116         HMMSearchTaskResult r;
117         r.evalue = sr.evalue;
118         r.score = sr.score;
119         r.onCompl = wasCompl;
120         r.onAmino = wasAmino;
121         int resLen = wasAmino ? sr.r.length * 3 : sr.r.length;
122         int resStart = wasAmino ? sr.r.startPos * 3 : sr.r.startPos;
123         if (wasCompl) {
124             resStart = globalReg.length - resStart - resLen;
125         }
126         r.r.startPos = globalReg.startPos + resStart;
127         r.r.length = resLen;
128         if (t->intersectsWithOverlaps(r.r)) {
129             // don't add to overlaps if it must be found in 2 regions
130             bool add = true;
131             if (!r.onCompl && t->hasRightOverlap()) {  // check if will be found in a next chunk
132                 U2Region nextChunkRegion(globalReg.endPos() - halfOverlap, halfOverlap);
133                 add = !nextChunkRegion.contains(r.r);
134             } else if (r.onCompl && t->hasLeftOverlap()) {  // check if will found in a prev chunk
135                 U2Region prevChunkRegion(globalReg.startPos, halfOverlap);
136                 add = !prevChunkRegion.contains(r.r);
137             }
138             if (add) {
139                 r.borderResult = (t->hasLeftOverlap() && r.r.startPos == globalReg.startPos) || (t->hasRightOverlap() && r.r.endPos() == globalReg.endPos());
140                 overlaps.append(r);
141             }
142         } else {
143             results.append(r);
144         }
145     }
146 
147     TaskLocalData::freeHMMContext(t->getTaskId());
148 }
149 
HMMSearchResult_LessThan(const HMMSearchTaskResult & r1,const HMMSearchTaskResult & r2)150 static bool HMMSearchResult_LessThan(const HMMSearchTaskResult &r1, const HMMSearchTaskResult &r2) {
151     if (r1.evalue == r2.evalue) {
152         if (r1.r == r2.r) {
153             if (r1.onCompl == r2.onCompl) {
154                 return &r1 < &r2;
155             }
156             return r2.onCompl;
157         }
158         return r1.r < r2.r;
159     }
160     return r1.evalue < r2.evalue;
161 }
162 
report()163 Task::ReportResult HMMSearchTask::report() {
164     if (hasError()) {
165         return ReportResult_Finished;
166     }
167 
168     // postprocess overlaps
169     int maxCommonLen = hmm->M / 2;  // if 2 results have common part of 'maxCommonLen' or greater -> select best one
170     for (int i = 0; i < overlaps.count(); i++) {
171         HMMSearchTaskResult &r1 = overlaps[i];
172         if (r1.filtered) {
173             continue;
174         }
175         for (int j = i + 1; j < overlaps.count(); j++) {
176             HMMSearchTaskResult &r2 = overlaps[j];
177             if (r2.filtered) {
178                 continue;
179             }
180             if (r1.onCompl != r2.onCompl) {  // check both regions are on the same strand
181                 continue;
182             }
183             if (r1.onAmino) {  // check both regions have the same amino frame
184                 int s1 = r1.onCompl ? r1.r.endPos() % 3 : r1.r.startPos % 3;
185                 int s2 = r2.onCompl ? r2.r.endPos() % 3 : r2.r.startPos % 3;
186                 if (s1 != s2) {
187                     continue;
188                 }
189             }
190             if (r1.r.contains(r2.r) && r1.r != r2.r) {
191                 r2.filtered = true;
192             } else if (r2.r.contains(r1.r) && r1.r != r2.r) {
193                 r1.filtered = true;
194                 break;
195             } else if (r1.r.intersect(r2.r).length >= maxCommonLen) {
196                 bool useR1 = r2.score <= r1.score;
197                 if (r1.score == r2.score && r1.evalue == r2.evalue && r1.borderResult && !r2.borderResult) {
198                     useR1 = false;
199                 }
200                 if (useR1) {
201                     r2.filtered = true;
202                 } else {
203                     r1.filtered = true;
204                     break;
205                 }
206             }
207         }
208     }
209 
210     foreach (const HMMSearchTaskResult &r, overlaps) {
211         if (!r.filtered) {
212             results.append(r);
213         }
214     }
215 
216     // sort results by E-value
217     std::sort(results.begin(), results.end(), HMMSearchResult_LessThan);
218     return ReportResult_Finished;
219 }
220 
getResultsAsAnnotations(U2FeatureType type,const QString & name) const221 QList<SharedAnnotationData> HMMSearchTask::getResultsAsAnnotations(U2FeatureType type, const QString &name) const {
222     QList<SharedAnnotationData> annotations;
223     foreach (const HMMSearchTaskResult &hmmRes, results) {
224         SharedAnnotationData a(new AnnotationData);
225         a->type = type;
226         a->name = name;
227         a->setStrand(hmmRes.onCompl ? U2Strand::Complementary : U2Strand::Direct);
228         a->location->regions << hmmRes.r;
229 
230         QString str; /*add zeros at begin of evalue exponent part, so exponent part must contains 3 numbers*/
231         str.sprintf("%.2g", ((double)hmmRes.evalue));
232         QRegExp rx("\\+|\\-.+");
233         int pos = rx.indexIn(str, 0);
234         if (pos != -1) {
235             str.insert(pos + 1, "0");
236         }
237         QString info = hmm->name;
238         if (hmm->flags & PLAN7_ACC) {
239             info += QString().sprintf("\nAccession number in PFAM : %s", hmm->acc);
240         }
241         if (hmm->flags & PLAN7_DESC) {
242             info += QString().sprintf("\n%s", hmm->desc);
243         }
244         if (!info.isEmpty()) {
245             a->qualifiers.append(U2Qualifier("HMM-model", info));
246         }
247         // a->qualifiers.append(U2Qualifier("E-value", QString().sprintf("%.2lg", ((double) hmmRes.evalue))));
248         a->qualifiers.append(U2Qualifier("E-value", str));
249         a->qualifiers.append(U2Qualifier("Score", QString().sprintf("%.1f", hmmRes.score)));
250         annotations.append(a);
251     }
252     return annotations;
253 }
254 
checkAlphabets(int hmmAlType,const DNAAlphabet * seqAl,DNATranslation * & complTrans,DNATranslation * & aminoTrans)255 bool HMMSearchTask::checkAlphabets(int hmmAlType, const DNAAlphabet *seqAl, DNATranslation *&complTrans, DNATranslation *&aminoTrans) {
256     assert(stateInfo.getError().isEmpty());
257     DNAAlphabetType hmmAl = HMMIO::convertHMMAlphabet(hmmAlType);
258     if (hmmAl == DNAAlphabet_RAW) {
259         stateInfo.setError(tr("Invalid HMM alphabet!"));
260         return false;
261     }
262     if (seqAl->isRaw()) {
263         stateInfo.setError(tr("Invalid sequence alphabet!"));
264         return false;
265     }
266 
267     complTrans = NULL;
268     aminoTrans = NULL;
269     if (seqAl->isNucleic()) {
270         DNATranslationRegistry *tr = AppContext::getDNATranslationRegistry();
271         DNATranslation *complT = tr->lookupComplementTranslation(seqAl);
272         if (complT != NULL) {
273             complTrans = complT;
274         }
275         if (hmmAl == DNAAlphabet_AMINO) {
276             QList<DNATranslation *> aminoTs = tr->lookupTranslation(seqAl, DNATranslationType_NUCL_2_AMINO);
277             if (!aminoTs.empty()) {
278                 aminoTrans = tr->getStandardGeneticCodeTranslation(seqAl);
279             }
280         }
281     } else {
282         assert(seqAl->isAmino());
283     }
284 
285     // check the result;
286     if (hmmAl == DNAAlphabet_AMINO) {
287         if (seqAl->isAmino()) {
288             assert(complTrans == NULL && aminoTrans == NULL);
289         } else {
290             if (aminoTrans == NULL) {
291                 stateInfo.setError(tr("Amino translation is not available for the sequence alphabet!"));
292                 return false;
293             }
294         }
295     }
296 
297     return true;
298 }
299 
getSWSubtask()300 SequenceWalkerTask *HMMSearchTask::getSWSubtask() {
301     assert(!hasError());
302     assert(NULL != hmm);
303 
304     if (!checkAlphabets(hmm->atype, seq.alphabet, complTrans, aminoTrans)) {
305         return NULL;
306     }
307     SequenceWalkerConfig config;
308     config.seq = seq.seq.data();
309     config.seqSize = seq.seq.size();
310     config.complTrans = complTrans;
311     config.strandToWalk = complTrans == NULL ? StrandOption_DirectOnly : StrandOption_Both;
312     config.aminoTrans = aminoTrans;
313     config.overlapSize = 2 * hmm->M;
314     config.chunkSize = qMax(6 * hmm->M, settings.searchChunkSize);
315     if (settings.extraLen == -1) {
316         config.lastChunkExtraLen = config.chunkSize / 2;
317     } else {
318         config.lastChunkExtraLen = settings.extraLen;
319     }
320     config.walkCircular = false;
321 
322     config.nThreads = MAX_PARALLEL_SUBTASKS_AUTO;
323 
324     return new SequenceWalkerTask(config, this, tr("Parallel HMM search"));
325 }
326 
onSubTaskFinished(Task * subTask)327 QList<Task *> HMMSearchTask::onSubTaskFinished(Task *subTask) {
328     assert(NULL != subTask);
329     QList<Task *> res;
330     if (subTask->hasError()) {
331         stateInfo.setError(subTask->getError());
332         return res;
333     }
334 
335     if (readHMMTask == subTask) {
336         hmm = readHMMTask->getHMM();
337         swTask = getSWSubtask();
338         if (NULL == swTask) {
339             assert(hasError());
340             return res;
341         }
342         res << swTask;
343     } else {
344         if (swTask != subTask) {
345             assert(0 && "undefined_subtask_finished");
346         }
347     }
348 
349     return res;
350 }
351 
352 }  // namespace U2
353