1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "CDSearchWorker.h"
23 
24 #include <U2Algorithm/CDSearchTaskFactoryRegistry.h>
25 
26 #include <U2Core/AppContext.h>
27 #include <U2Core/DNAAlphabet.h>
28 #include <U2Core/DNASequence.h>
29 #include <U2Core/FailTask.h>
30 #include <U2Core/TaskSignalMapper.h>
31 #include <U2Core/U2OpStatusUtils.h>
32 #include <U2Core/U2SafePoints.h>
33 
34 #include <U2Designer/DelegateEditors.h>
35 
36 #include <U2Lang/ActorPrototypeRegistry.h>
37 #include <U2Lang/BaseActorCategories.h>
38 #include <U2Lang/BasePorts.h>
39 #include <U2Lang/BaseSlots.h>
40 #include <U2Lang/BaseTypes.h>
41 #include <U2Lang/WorkflowEnv.h>
42 
43 namespace U2 {
44 namespace LocalWorkflow {
45 
// Identifier of the CD Search element; used as the id of the prototype descriptor
// built in CDSearchWorkerFactory::init().
const QString CDSearchWorkerFactory::ACTOR_ID("cd-search");

// Identifiers of the element's attributes. Each one is used as a Descriptor id in
// CDSearchWorkerFactory::init() and as the key for parameter lookups in this file.
static const QString DATABASE_ATTR("db-name");  // "Database" attribute
static const QString EVALUE_ATTR("e-val");  // "Expect value" attribute
static const QString ANNOTATION_ATTR("result-name");  // "Annotate as" attribute
static const QString LOCAL_ATTR("local-search");  // "Local search" attribute
static const QString DB_PATH_ATTR("db-path");  // "Database folder" attribute
53 
// Registers the CD Search workflow element.
//
// Builds the port descriptors, the attributes with their default values and the
// property delegates, creates the actor prototype, registers it in the prototype
// registry under BaseActorCategories::CATEGORY_BASIC() and finally registers a
// CDSearchWorkerFactory instance in the local domain.
void CDSearchWorkerFactory::init() {
    // Ports: one port carrying a DNA sequence slot and one carrying an annotation table slot.
    QList<PortDescriptor *> p;
    Descriptor ind(BasePorts::IN_SEQ_PORT_ID(), CDSearchWorker::tr("Input sequence"), CDSearchWorker::tr("The sequence to search the annotations for"));
    Descriptor outd(BasePorts::OUT_ANNOTATIONS_PORT_ID(), CDSearchWorker::tr("Annotations"), CDSearchWorker::tr("Found annotations"));

    QMap<Descriptor, DataTypePtr> inM;
    inM[BaseSlots::DNA_SEQUENCE_SLOT()] = BaseTypes::DNA_SEQUENCE_TYPE();
    p << new PortDescriptor(ind, DataTypePtr(new MapDataType("cds.sequence", inM)), true);
    QMap<Descriptor, DataTypePtr> outM;
    outM[BaseSlots::ANNOTATION_TABLE_SLOT()] = BaseTypes::ANNOTATION_TABLE_TYPE();
    p << new PortDescriptor(outd, DataTypePtr(new MapDataType("cds.annotations", outM)), false, true);

    // Attributes: descriptors (id, display name, help text) followed by the attribute objects.
    QList<Attribute *> a;
    {
        // NOTE(review): the help text of the "Database" descriptor below appears to contain a raw
        // line-breaking character in the middle of the literal (right before "An HMM-based search
        // engine"); confirm against the repository copy before touching this string, since it is
        // also a translation key.
        Descriptor dd(DATABASE_ATTR, CDSearchWorker::tr("Database"), CDSearchWorker::tr("Currently, CD Search is offered with the following search databases:<br><ul><li><b>CDD</b> - this is a superset including <a href=\"http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml#CDSource_NCBI_curated\">NCBI-curated domains</a> and <a href=\"http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml#CDSource_external\">data imported</a> from Pfam, SMART, COG, PRK, and TIGRFAM.</li><li><b>Pfam</b> - a mirror of a recent Pfam-A database of curated seed alignments. Pfam version numbers do change with incremental updates. As with SMART, families describing very short motifs or peptides may be missing from the  mirror. An HMM-based search engine is offered on the <a href=\"http://pfam.sanger.ac.uk/\">Pfam</a> site.</li><li><b>SMART</b> - a mirror of a recent SMART set of domain alignments. Note that some SMART families may be missing from the mirror due to update delays or because they describe very short conserved peptides and/or motifs, which would be difficult to detect using the CD-Search service. You may want to try the HMM-based search service offered on the <a href=\"http://smart.embl-heidelberg.de\">SMART</a> site. Note also that some SMART domains are not mirrored in CD because they represent \"superfamilies\" encompassing several individual, but related, domains; the corresponding seed alignments may not be available from the source database in these cases. Note also that SMART version numbers do not change with incremental updates of the source database (and the mirrored CD-Search database).</li> <li><b>TIGRFAM</b> - a mirror of a recent TIGRFAM set of domain alignments. 
An HMM-based search engine is offered on the <a href=\"http://www.jcvi.org/cms/research/projects/tigrfams/overview/\"><!-- a href=\"http://blast.jcvi.org/web-hmm/\" -->TIGRFAM</a> site.</li><li><b>COG</b> - a mirror of the current COG database of orthologous protein families focusing on prokaryotes. Seed alignments have been generated by an automated process. An alternative search engine, \"Cognitor\", which runs protein-BLAST against a database of COG-assigned sequences, is offered on the <a href=\"http://www.ncbi.nlm.nih.gov/COG\">COG</a> site.</li><li><b>KOG</b> - a eukaryotic counterpart to the COG database.  KOGs are not included in the CDD superset, but are searchable as a separate data set.</li></ul><br>More information about each database is provided in the section on <a href=\"http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml#CDSource\">Where does CDD content come from?</a>."));
        Descriptor nd(ANNOTATION_ATTR, CDSearchWorker::tr("Annotate as"), CDSearchWorker::tr("Name of the result annotations marking found conserved domains."));
        Descriptor ld(LOCAL_ATTR, CDSearchWorker::tr("Local search"), CDSearchWorker::tr("Perform the search on local machine or submit the search to NCBI for remote execution."));
        Descriptor ed(EVALUE_ATTR, CDSearchWorker::tr("Expect value"), CDSearchWorker::tr("Modifies the <a href=\"http://www.ncbi.nlm.nih.gov/BLAST/blastcgihelp.shtml#expect\">E-value</a> threshold used for filtering results. False positive results should be very rare with the default setting of 0.01 (use a more conservative, i.e. lower setting for more reliable results), results with E-values in the range of 1 and above should be considered putative false positives."));
        Descriptor pd(DB_PATH_ATTR, CDSearchWorker::tr("Database folder"), CDSearchWorker::tr("Specifies database folder for local search."));

        // The order of insertion is the order of the attributes in the list handed to the prototype.
        // The last argument is the default value; the database folder attribute has none.
        // The third argument presumably marks the attribute as required - confirm against Attribute.
        a << new Attribute(nd, BaseTypes::STRING_TYPE(), true, "CDD result");
        a << new Attribute(dd, BaseTypes::STRING_TYPE(), false, CDDNames::CDD_DB());
        a << new Attribute(pd, BaseTypes::STRING_TYPE(), false);
        a << new Attribute(ld, BaseTypes::BOOL_TYPE(), false, true);
        a << new Attribute(ed, BaseTypes::NUM_TYPE(), false, 0.01);
    }

    Descriptor desc(ACTOR_ID, CDSearchWorker::tr("CD Search"), CDSearchWorker::tr("Finds conserved domains in protein sequences. In case conserved domains database is downloaded the search can be executed on local machine. The search also can be submitted to the NCBI for remote execution."));
    ActorPrototype *proto = new IntegralBusActorPrototype(desc, p, a);
    // Property delegates, keyed by attribute id.
    QMap<QString, PropertyDelegate *> delegates;

    {
        // Database selector: every entry maps a database name to itself.
        QVariantMap m;
        m[CDDNames::CDD_DB()] = CDDNames::CDD_DB();
        m[CDDNames::PFAM_DB()] = CDDNames::PFAM_DB();
        m[CDDNames::SMART_DB()] = CDDNames::SMART_DB();
        m[CDDNames::COG_DB()] = CDDNames::COG_DB();
        m[CDDNames::KOG_DB()] = CDDNames::KOG_DB();
        m[CDDNames::PRK_DB()] = CDDNames::PRK_DB();
        m[CDDNames::TIGR_DB()] = CDDNames::TIGR_DB();
        delegates[DATABASE_ATTR] = new ComboBoxDelegate(m);
    }

    {
        // E-value presets (label -> numeric value).
        // NOTE(review): the attribute default 0.01 is not among these presets - confirm this is intended.
        QVariantMap m;
        m["1e-100"] = 1e-100;
        m["1e-10"] = 1e-10;
        m["1"] = 1;
        m["10"] = 10;
        m["100"] = 100;
        m["1000"] = 1000;
        delegates[EVALUE_ATTR] = new ComboBoxDelegate(m);
    }

    {
        // Delegate for the local database folder attribute.
        delegates[DB_PATH_ATTR] = new URLDelegate("", "Database Folder", false, true, false);
    }

    proto->setPrompter(new CDSearchPrompter());
    proto->setEditor(new DelegateEditor(delegates));
    WorkflowEnv::getProtoRegistry()->registerProto(BaseActorCategories::CATEGORY_BASIC(), proto);

    // Register the factory that creates workers for this element in the local domain.
    DomainFactory *localDomain = WorkflowEnv::getDomainRegistry()->getById(LocalDomainFactory::ID);
    localDomain->registerEntry(new CDSearchWorkerFactory());
}
119 
composeRichDoc()120 QString CDSearchPrompter::composeRichDoc() {
121     IntegralBusPort *input = qobject_cast<IntegralBusPort *>(target->getPort(BasePorts::IN_SEQ_PORT_ID()));
122     Actor *producer = input->getProducer(BaseSlots::DNA_SEQUENCE_SLOT().getId());
123     QString unsetStr = "<font color='red'>" + tr("unset") + "</font>";
124     QString producerName = tr(" from <u>%1</u>").arg(producer ? producer->getLabel() : unsetStr);
125 
126     QString dbStr = target->getParameter(DATABASE_ATTR)->getAttributeValueWithoutScript<QString>();
127 
128     QString doc = tr("For sequence %1 find conserved domains in database <u>%2</u>.")
129                       .arg(producerName)
130                       .arg(getHyperlink(DATABASE_ATTR, dbStr));
131     return doc;
132 }
133 
init()134 void CDSearchWorker::init() {
135     input = ports.value(BasePorts::IN_SEQ_PORT_ID());
136     output = ports.value(BasePorts::OUT_ANNOTATIONS_PORT_ID());
137 }
138 
tick()139 Task *CDSearchWorker::tick() {
140     if (input->hasMessage()) {
141         Message inputMessage = getMessageAndSetupScriptValues(input);
142         if (inputMessage.isEmpty()) {
143             output->transit();
144             return nullptr;
145         }
146         SharedDbiDataHandler seqId = inputMessage.getData().toMap().value(BaseSlots::DNA_SEQUENCE_SLOT().getId()).value<SharedDbiDataHandler>();
147         QScopedPointer<U2SequenceObject> seqObj(StorageUtils::getSequenceObject(context->getDataStorage(), seqId));
148         if (seqObj.isNull()) {
149             return nullptr;
150         }
151         U2OpStatusImpl os;
152         DNASequence seq = seqObj->getWholeSequence(os);
153         CHECK_OP(os, new FailTask(os.getError()));
154 
155         settings.query = seq.seq;
156         settings.alp = seq.alphabet;
157         if (!settings.alp->isAmino()) {
158             QString err = "Required amino acid input sequence";
159             return new FailTask(err);
160         }
161         settings.ev = actor->getParameter(EVALUE_ATTR)->getAttributeValue<float>(context);
162 
163         settings.dbName = actor->getParameter(DATABASE_ATTR)->getAttributeValue<QString>(context);
164 
165         bool local = actor->getParameter(LOCAL_ATTR)->getAttributePureValue().toBool();
166         CDSearchFactory *factory = nullptr;
167         if (local) {
168             factory = AppContext::getCDSFactoryRegistry()->getFactory(CDSearchFactoryRegistry::LocalSearch);
169             if (!factory) {
170                 QString err = tr("'External tools' plugin has to be loaded.");
171                 return new FailTask(err);
172             }
173             settings.localDbFolder = actor->getParameter(DB_PATH_ATTR)->getAttributeValue<QString>(context);
174         } else {  // remote
175             factory = AppContext::getCDSFactoryRegistry()->getFactory(CDSearchFactoryRegistry::RemoteSearch);
176             if (!factory) {
177                 QString err = tr("'Remote blast' plugin has to be loaded.");
178                 return new FailTask(err);
179             }
180         }
181         cds = factory->createCDSearch(settings);
182         Task *t = cds->getTask();
183         connect(new TaskSignalMapper(t), SIGNAL(si_taskFinished(Task *)), SLOT(sl_taskFinished(Task *)));
184         return t;
185     } else if (input->isEnded()) {
186         setDone();
187         output->setEnded();
188     }
189     return nullptr;
190 }
191 
sl_taskFinished(Task * t)192 void CDSearchWorker::sl_taskFinished(Task *t) {
193     SAFE_POINT(nullptr != t, "Invalid task is encountered", );
194     if (t->isCanceled()) {
195         return;
196     }
197     if (nullptr != output) {
198         QList<SharedAnnotationData> res = cds->getCDSResults();
199         QString annName = actor->getParameter(ANNOTATION_ATTR)->getAttributeValue<QString>(context);
200         if (!annName.isEmpty()) {
201             for (int i = 0; i < res.count(); i++) {
202                 res[i]->name = annName;
203             }
204         }
205         const SharedDbiDataHandler tableId = context->getDataStorage()->putAnnotationTable(res);
206         output->put(Message(BaseTypes::ANNOTATION_TABLE_TYPE(), qVariantFromValue<SharedDbiDataHandler>(tableId)));
207     }
208     delete cds;
209     cds = nullptr;
210 }
211 
212 }  // namespace LocalWorkflow
213 }  // namespace U2
214