1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "CDSearchWorker.h"
23
24 #include <U2Algorithm/CDSearchTaskFactoryRegistry.h>
25
26 #include <U2Core/AppContext.h>
27 #include <U2Core/DNAAlphabet.h>
28 #include <U2Core/DNASequence.h>
29 #include <U2Core/FailTask.h>
30 #include <U2Core/TaskSignalMapper.h>
31 #include <U2Core/U2OpStatusUtils.h>
32 #include <U2Core/U2SafePoints.h>
33
34 #include <U2Designer/DelegateEditors.h>
35
36 #include <U2Lang/ActorPrototypeRegistry.h>
37 #include <U2Lang/BaseActorCategories.h>
38 #include <U2Lang/BasePorts.h>
39 #include <U2Lang/BaseSlots.h>
40 #include <U2Lang/BaseTypes.h>
41 #include <U2Lang/WorkflowEnv.h>
42
43 namespace U2 {
44 namespace LocalWorkflow {
45
46 const QString CDSearchWorkerFactory::ACTOR_ID("cd-search");
47
48 static const QString DATABASE_ATTR("db-name");
49 static const QString EVALUE_ATTR("e-val");
50 static const QString ANNOTATION_ATTR("result-name");
51 static const QString LOCAL_ATTR("local-search");
52 static const QString DB_PATH_ATTR("db-path");
53
init()54 void CDSearchWorkerFactory::init() {
55 QList<PortDescriptor *> p;
56 Descriptor ind(BasePorts::IN_SEQ_PORT_ID(), CDSearchWorker::tr("Input sequence"), CDSearchWorker::tr("The sequence to search the annotations for"));
57 Descriptor outd(BasePorts::OUT_ANNOTATIONS_PORT_ID(), CDSearchWorker::tr("Annotations"), CDSearchWorker::tr("Found annotations"));
58
59 QMap<Descriptor, DataTypePtr> inM;
60 inM[BaseSlots::DNA_SEQUENCE_SLOT()] = BaseTypes::DNA_SEQUENCE_TYPE();
61 p << new PortDescriptor(ind, DataTypePtr(new MapDataType("cds.sequence", inM)), true);
62 QMap<Descriptor, DataTypePtr> outM;
63 outM[BaseSlots::ANNOTATION_TABLE_SLOT()] = BaseTypes::ANNOTATION_TABLE_TYPE();
64 p << new PortDescriptor(outd, DataTypePtr(new MapDataType("cds.annotations", outM)), false, true);
65
66 QList<Attribute *> a;
67 {
68 Descriptor dd(DATABASE_ATTR, CDSearchWorker::tr("Database"), CDSearchWorker::tr("Currently, CD Search is offered with the following search databases:<br><ul><li><b>CDD</b> - this is a superset including <a href=\"http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml#CDSource_NCBI_curated\">NCBI-curated domains</a> and <a href=\"http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml#CDSource_external\">data imported</a> from Pfam, SMART, COG, PRK, and TIGRFAM.</li><li><b>Pfam</b> - a mirror of a recent Pfam-A database of curated seed alignments. Pfam version numbers do change with incremental updates. As with SMART, families describing very short motifs or peptides may be missing from the mirror. An HMM-based search engine is offered on the <a href=\"http://pfam.sanger.ac.uk/\">Pfam</a> site.</li><li><b>SMART</b> - a mirror of a recent SMART set of domain alignments. Note that some SMART families may be missing from the mirror due to update delays or because they describe very short conserved peptides and/or motifs, which would be difficult to detect using the CD-Search service. You may want to try the HMM-based search service offered on the <a href=\"http://smart.embl-heidelberg.de\">SMART</a> site. Note also that some SMART domains are not mirrored in CD because they represent \"superfamilies\" encompassing several individual, but related, domains; the corresponding seed alignments may not be available from the source database in these cases. Note also that SMART version numbers do not change with incremental updates of the source database (and the mirrored CD-Search database).</li> <li><b>TIGRFAM</b> - a mirror of a recent TIGRFAM set of domain alignments. An HMM-based search engine is offered on the <a href=\"http://www.jcvi.org/cms/research/projects/tigrfams/overview/\"><!-- a href=\"http://blast.jcvi.org/web-hmm/\" -->TIGRFAM</a> site.</li><li><b>COG</b> - a mirror of the current COG database of orthologous protein families focusing on prokaryotes. Seed alignments have been generated by an automated process. An alternative search engine, \"Cognitor\", which runs protein-BLAST against a database of COG-assigned sequences, is offered on the <a href=\"http://www.ncbi.nlm.nih.gov/COG\">COG</a> site.</li><li><b>KOG</b> - a eukaryotic counterpart to the COG database. KOGs are not included in the CDD superset, but are searchable as a separate data set.</li></ul><br>More information about each database is provided in the section on <a href=\"http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml#CDSource\">Where does CDD content come from?</a>."));
69 Descriptor nd(ANNOTATION_ATTR, CDSearchWorker::tr("Annotate as"), CDSearchWorker::tr("Name of the result annotations marking found conserved domains."));
70 Descriptor ld(LOCAL_ATTR, CDSearchWorker::tr("Local search"), CDSearchWorker::tr("Perform the search on local machine or submit the search to NCBI for remote execution."));
71 Descriptor ed(EVALUE_ATTR, CDSearchWorker::tr("Expect value"), CDSearchWorker::tr("Modifies the <a href=\"http://www.ncbi.nlm.nih.gov/BLAST/blastcgihelp.shtml#expect\">E-value</a> threshold used for filtering results. False positive results should be very rare with the default setting of 0.01 (use a more conservative, i.e. lower setting for more reliable results), results with E-values in the range of 1 and above should be considered putative false positives."));
72 Descriptor pd(DB_PATH_ATTR, CDSearchWorker::tr("Database folder"), CDSearchWorker::tr("Specifies database folder for local search."));
73
74 a << new Attribute(nd, BaseTypes::STRING_TYPE(), true, "CDD result");
75 a << new Attribute(dd, BaseTypes::STRING_TYPE(), false, CDDNames::CDD_DB());
76 a << new Attribute(pd, BaseTypes::STRING_TYPE(), false);
77 a << new Attribute(ld, BaseTypes::BOOL_TYPE(), false, true);
78 a << new Attribute(ed, BaseTypes::NUM_TYPE(), false, 0.01);
79 }
80
81 Descriptor desc(ACTOR_ID, CDSearchWorker::tr("CD Search"), CDSearchWorker::tr("Finds conserved domains in protein sequences. In case conserved domains database is downloaded the search can be executed on local machine. The search also can be submitted to the NCBI for remote execution."));
82 ActorPrototype *proto = new IntegralBusActorPrototype(desc, p, a);
83 QMap<QString, PropertyDelegate *> delegates;
84
85 {
86 QVariantMap m;
87 m[CDDNames::CDD_DB()] = CDDNames::CDD_DB();
88 m[CDDNames::PFAM_DB()] = CDDNames::PFAM_DB();
89 m[CDDNames::SMART_DB()] = CDDNames::SMART_DB();
90 m[CDDNames::COG_DB()] = CDDNames::COG_DB();
91 m[CDDNames::KOG_DB()] = CDDNames::KOG_DB();
92 m[CDDNames::PRK_DB()] = CDDNames::PRK_DB();
93 m[CDDNames::TIGR_DB()] = CDDNames::TIGR_DB();
94 delegates[DATABASE_ATTR] = new ComboBoxDelegate(m);
95 }
96
97 {
98 QVariantMap m;
99 m["1e-100"] = 1e-100;
100 m["1e-10"] = 1e-10;
101 m["1"] = 1;
102 m["10"] = 10;
103 m["100"] = 100;
104 m["1000"] = 1000;
105 delegates[EVALUE_ATTR] = new ComboBoxDelegate(m);
106 }
107
108 {
109 delegates[DB_PATH_ATTR] = new URLDelegate("", "Database Folder", false, true, false);
110 }
111
112 proto->setPrompter(new CDSearchPrompter());
113 proto->setEditor(new DelegateEditor(delegates));
114 WorkflowEnv::getProtoRegistry()->registerProto(BaseActorCategories::CATEGORY_BASIC(), proto);
115
116 DomainFactory *localDomain = WorkflowEnv::getDomainRegistry()->getById(LocalDomainFactory::ID);
117 localDomain->registerEntry(new CDSearchWorkerFactory());
118 }
119
composeRichDoc()120 QString CDSearchPrompter::composeRichDoc() {
121 IntegralBusPort *input = qobject_cast<IntegralBusPort *>(target->getPort(BasePorts::IN_SEQ_PORT_ID()));
122 Actor *producer = input->getProducer(BaseSlots::DNA_SEQUENCE_SLOT().getId());
123 QString unsetStr = "<font color='red'>" + tr("unset") + "</font>";
124 QString producerName = tr(" from <u>%1</u>").arg(producer ? producer->getLabel() : unsetStr);
125
126 QString dbStr = target->getParameter(DATABASE_ATTR)->getAttributeValueWithoutScript<QString>();
127
128 QString doc = tr("For sequence %1 find conserved domains in database <u>%2</u>.")
129 .arg(producerName)
130 .arg(getHyperlink(DATABASE_ATTR, dbStr));
131 return doc;
132 }
133
init()134 void CDSearchWorker::init() {
135 input = ports.value(BasePorts::IN_SEQ_PORT_ID());
136 output = ports.value(BasePorts::OUT_ANNOTATIONS_PORT_ID());
137 }
138
tick()139 Task *CDSearchWorker::tick() {
140 if (input->hasMessage()) {
141 Message inputMessage = getMessageAndSetupScriptValues(input);
142 if (inputMessage.isEmpty()) {
143 output->transit();
144 return nullptr;
145 }
146 SharedDbiDataHandler seqId = inputMessage.getData().toMap().value(BaseSlots::DNA_SEQUENCE_SLOT().getId()).value<SharedDbiDataHandler>();
147 QScopedPointer<U2SequenceObject> seqObj(StorageUtils::getSequenceObject(context->getDataStorage(), seqId));
148 if (seqObj.isNull()) {
149 return nullptr;
150 }
151 U2OpStatusImpl os;
152 DNASequence seq = seqObj->getWholeSequence(os);
153 CHECK_OP(os, new FailTask(os.getError()));
154
155 settings.query = seq.seq;
156 settings.alp = seq.alphabet;
157 if (!settings.alp->isAmino()) {
158 QString err = "Required amino acid input sequence";
159 return new FailTask(err);
160 }
161 settings.ev = actor->getParameter(EVALUE_ATTR)->getAttributeValue<float>(context);
162
163 settings.dbName = actor->getParameter(DATABASE_ATTR)->getAttributeValue<QString>(context);
164
165 bool local = actor->getParameter(LOCAL_ATTR)->getAttributePureValue().toBool();
166 CDSearchFactory *factory = nullptr;
167 if (local) {
168 factory = AppContext::getCDSFactoryRegistry()->getFactory(CDSearchFactoryRegistry::LocalSearch);
169 if (!factory) {
170 QString err = tr("'External tools' plugin has to be loaded.");
171 return new FailTask(err);
172 }
173 settings.localDbFolder = actor->getParameter(DB_PATH_ATTR)->getAttributeValue<QString>(context);
174 } else { // remote
175 factory = AppContext::getCDSFactoryRegistry()->getFactory(CDSearchFactoryRegistry::RemoteSearch);
176 if (!factory) {
177 QString err = tr("'Remote blast' plugin has to be loaded.");
178 return new FailTask(err);
179 }
180 }
181 cds = factory->createCDSearch(settings);
182 Task *t = cds->getTask();
183 connect(new TaskSignalMapper(t), SIGNAL(si_taskFinished(Task *)), SLOT(sl_taskFinished(Task *)));
184 return t;
185 } else if (input->isEnded()) {
186 setDone();
187 output->setEnded();
188 }
189 return nullptr;
190 }
191
sl_taskFinished(Task * t)192 void CDSearchWorker::sl_taskFinished(Task *t) {
193 SAFE_POINT(nullptr != t, "Invalid task is encountered", );
194 if (t->isCanceled()) {
195 return;
196 }
197 if (nullptr != output) {
198 QList<SharedAnnotationData> res = cds->getCDSResults();
199 QString annName = actor->getParameter(ANNOTATION_ATTR)->getAttributeValue<QString>(context);
200 if (!annName.isEmpty()) {
201 for (int i = 0; i < res.count(); i++) {
202 res[i]->name = annName;
203 }
204 }
205 const SharedDbiDataHandler tableId = context->getDataStorage()->putAnnotationTable(res);
206 output->put(Message(BaseTypes::ANNOTATION_TABLE_TYPE(), qVariantFromValue<SharedDbiDataHandler>(tableId)));
207 }
208 delete cds;
209 cds = nullptr;
210 }
211
212 } // namespace LocalWorkflow
213 } // namespace U2
214