1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "DiamondClassifyWorkerFactory.h"
23
24 #include <QThread>
25
26 #include <U2Core/AppContext.h>
27 #include <U2Core/AppResources.h>
28 #include <U2Core/AppSettings.h>
29 #include <U2Core/BaseDocumentFormats.h>
30 #include <U2Core/DNAAlphabet.h>
31 #include <U2Core/DNATranslation.h>
32 #include <U2Core/DataPathRegistry.h>
33
34 #include <U2Designer/DelegateEditors.h>
35
36 #include <U2Gui/DialogUtils.h>
37
38 #include <U2Lang/ActorPrototypeRegistry.h>
39 #include <U2Lang/BaseSlots.h>
40 #include <U2Lang/BaseTypes.h>
41 #include <U2Lang/WorkflowEnv.h>
42
43 #include "../ngs_reads_classification/src/DatabaseDelegate.h"
44 #include "../ngs_reads_classification/src/NgsReadsClassificationPlugin.h"
45 #include "DiamondClassifyPrompter.h"
46 #include "DiamondClassifyTask.h"
47 #include "DiamondClassifyWorker.h"
48 #include "DiamondSupport.h"
49
50 namespace U2 {
51 namespace LocalWorkflow {
52
53 const QString DiamondClassifyWorkerFactory::ACTOR_ID = "diamond-classify";
54
55 const QString DiamondClassifyWorkerFactory::INPUT_PORT_ID = "in";
56 const QString DiamondClassifyWorkerFactory::OUTPUT_PORT_ID = "out";
57
58 // Slot should be the same as in GetReadsListWorkerFactory
59 const QString DiamondClassifyWorkerFactory::INPUT_SLOT = "reads-url1";
60
61 const QString DiamondClassifyWorkerFactory::INPUT_DATA_ATTR_ID("input-data");
62 const QString DiamondClassifyWorkerFactory::DATABASE_ATTR_ID("database");
63 const QString DiamondClassifyWorkerFactory::GENCODE_ATTR_ID("genetic-code");
64 const QString DiamondClassifyWorkerFactory::SENSITIVE_ATTR_ID("sensitive-mode");
65 const QString DiamondClassifyWorkerFactory::TOP_ALIGNMENTS_PERCENTAGE_ATTR_ID("top-alignments-percentage");
66 const QString DiamondClassifyWorkerFactory::FSHIFT_ATTR_ID("frame-shift");
67 const QString DiamondClassifyWorkerFactory::EVALUE_ATTR_ID("e-value");
68 const QString DiamondClassifyWorkerFactory::MATRIX_ATTR_ID("matrix");
69 const QString DiamondClassifyWorkerFactory::GO_PEN_ATTR_ID("gap-open");
70 const QString DiamondClassifyWorkerFactory::GE_PEN_ATTR_ID("gap-extend");
71 const QString DiamondClassifyWorkerFactory::THREADS_ATTR_ID("threads");
72 const QString DiamondClassifyWorkerFactory::BSIZE_ATTR_ID("block-size");
73 const QString DiamondClassifyWorkerFactory::CHUNKS_ATTR_ID("index-chunks");
74 const QString DiamondClassifyWorkerFactory::OUTPUT_URL_ATTR_ID("output-url");
75
76 const QString DiamondClassifyWorkerFactory::WORKFLOW_CLASSIFY_TOOL_DIAMOND = "DIAMOND";
77
DiamondClassifyWorkerFactory()78 DiamondClassifyWorkerFactory::DiamondClassifyWorkerFactory()
79 : DomainFactory(ACTOR_ID) {
80 }
81
createWorker(Actor * actor)82 Worker *DiamondClassifyWorkerFactory::createWorker(Actor *actor) {
83 return new DiamondClassifyWorker(actor);
84 }
85
init()86 void DiamondClassifyWorkerFactory::init() {
87 QList<PortDescriptor *> ports;
88 {
89 const Descriptor inSlotDesc(INPUT_SLOT,
90 DiamondClassifyPrompter::tr("Input URL"),
91 DiamondClassifyPrompter::tr("Input URL."));
92
93 QMap<Descriptor, DataTypePtr> inType;
94 inType[inSlotDesc] = BaseTypes::STRING_TYPE();
95
96 QMap<Descriptor, DataTypePtr> outType;
97 outType[TaxonomySupport::TAXONOMY_CLASSIFICATION_SLOT()] = TaxonomySupport::TAXONOMY_CLASSIFICATION_TYPE();
98
99 const Descriptor inPortDesc(INPUT_PORT_ID,
100 DiamondClassifyPrompter::tr("Input sequences"),
101 DiamondClassifyPrompter::tr("URL(s) to FASTQ or FASTA file(s) should be provided.\n\n"
102 "The input files may contain single-end reads, contigs, or \"left\" reads in case of the paired-end sequencing (see \"Input data\" parameter of the element)."));
103
104 const Descriptor outPortDesc(OUTPUT_PORT_ID,
105 DiamondClassifyPrompter::tr("DIAMOND Classification"),
106 DiamondClassifyPrompter::tr("A list of sequence names with the associated taxonomy IDs, classified by DIAMOND."));
107
108 ports << new PortDescriptor(inPortDesc, DataTypePtr(new MapDataType(ACTOR_ID + "-in", inType)), true /*input*/);
109 ports << new PortDescriptor(outPortDesc, DataTypePtr(new MapDataType(ACTOR_ID + "-out", outType)), false /*input*/, true /*multi*/);
110 }
111
112 QList<Attribute *> attributes;
113 {
114 Descriptor databaseDesc(DATABASE_ATTR_ID, DiamondClassifyPrompter::tr("Database"), DiamondClassifyPrompter::tr("Input a binary DIAMOND database file."));
115
116 Descriptor code(GENCODE_ATTR_ID, DiamondClassifyPrompter::tr("Genetic code"), DiamondClassifyPrompter::tr("Genetic code used for translation of query sequences (--query-gencode)."));
117 Descriptor sense(SENSITIVE_ATTR_ID, DiamondClassifyPrompter::tr("Sensitive mode"), DiamondClassifyPrompter::tr("The sensitive modes (--sensitive, --more-sensitive) are generally recommended for aligning longer sequences. The default mode is mainly designed for short read alignment, i.e. finding significant matches of >50 bits on 30-40aa fragments."));
118 Descriptor topAlignmentsPercentage(TOP_ALIGNMENTS_PERCENTAGE_ATTR_ID,
119 DiamondClassifyPrompter::tr("Top alignments percentage"),
120 DiamondClassifyPrompter::tr("DIAMOND uses the lowest common ancestor (LCA) algorithm for taxonomy classification of the input sequences. This parameter specifies what alignments should be taken into account during the calculations (--top)."
121 "<br><br>"
122 "For example, the default value \"10\" means to take top 10% of the best hits (i.e. sort all query/subject-alignments by score, take top 10% of the alignments with the best score, calculate the lowest common ancestor for them)."));
123 Descriptor fshift(FSHIFT_ATTR_ID, DiamondClassifyPrompter::tr("Frameshift"), DiamondClassifyPrompter::tr("Penalty for frameshift in DNA-vs-protein alignments. Values around 15 are reasonable for this parameter. Enabling this feature will have the aligner tolerate missing bases in DNA sequences and is most recommended for long, error-prone sequences like MinION reads."));
124 Descriptor evalue(EVALUE_ATTR_ID, DiamondClassifyPrompter::tr("Expected value"), DiamondClassifyPrompter::tr("Maximum expected value to report an alignment (--evalue/-e)."));
125 Descriptor matrix(MATRIX_ATTR_ID, DiamondClassifyPrompter::tr("Matrix"), DiamondClassifyPrompter::tr("Scoring matrix (--matrix)."));
126 Descriptor gapopen(GO_PEN_ATTR_ID, DiamondClassifyPrompter::tr("Gap open penalty"), DiamondClassifyPrompter::tr("Gap open penalty (--gapopen)."));
127 Descriptor gapextend(GE_PEN_ATTR_ID, DiamondClassifyPrompter::tr("Gap extension penalty"), DiamondClassifyPrompter::tr("Gap extension penalty (--gapextend)."));
128 Descriptor threads(THREADS_ATTR_ID, DiamondClassifyPrompter::tr("Number of threads"), DiamondClassifyPrompter::tr("Number of CPU threads (--treads)."));
129 Descriptor bsize(BSIZE_ATTR_ID, DiamondClassifyPrompter::tr("Block size"), DiamondClassifyPrompter::tr("Block size in billions of sequence letters to be processed at a time (--block-size). This is the main parameter for controlling the program’s memory usage. Bigger numbers will increase the use of memory and temporary disk space, but also improve performance. The program can be expected to use roughly six times this number of memory (in GB)."));
130 Descriptor chunks(CHUNKS_ATTR_ID, DiamondClassifyPrompter::tr("Index chunks"), DiamondClassifyPrompter::tr("The number of chunks for processing the seed index (--index-chunks). This option can be additionally used to tune the performance. It is recommended to set this to 1 on a high memory server, which will increase performance and memory usage, but not the usage of temporary disk space."));
131 Descriptor outputUrlDesc(OUTPUT_URL_ATTR_ID, DiamondClassifyPrompter::tr("Output file"), DiamondClassifyPrompter::tr("Specify the output file name."
132 "<br><br>"
133 "The output file is a tab-delimited file with the following fields:"
134 "<ul>"
135 "<li>Query ID</li>"
136 "<li>NCBI taxonomy ID (0 if unclassified)</li>"
137 "<li>E-value of the best alignment with a known taxonomy ID found for the query (0 if unclassified)</li>"
138 "</ul>"));
139
140 Descriptor classifyToolDesc(NgsReadsClassificationPlugin::WORKFLOW_CLASSIFY_TOOL_ID,
141 WORKFLOW_CLASSIFY_TOOL_DIAMOND,
142 "Classify tool. Hidden attribute");
143
144 QString diamondDatabasePath;
145 U2DataPath *uniref50DataPath = AppContext::getDataPathRegistry()->getDataPathByName(NgsReadsClassificationPlugin::DIAMOND_UNIPROT_50_DATABASE_DATA_ID);
146 if (nullptr != uniref50DataPath && uniref50DataPath->isValid()) {
147 diamondDatabasePath = uniref50DataPath->getPathByName(NgsReadsClassificationPlugin::DIAMOND_UNIPROT_50_DATABASE_ITEM_ID);
148 } else {
149 U2DataPath *clarkViralDataPath = AppContext::getDataPathRegistry()->getDataPathByName(NgsReadsClassificationPlugin::DIAMOND_UNIPROT_90_DATABASE_DATA_ID);
150 if (nullptr != clarkViralDataPath && clarkViralDataPath->isValid()) {
151 diamondDatabasePath = clarkViralDataPath->getPathByName(NgsReadsClassificationPlugin::DIAMOND_UNIPROT_90_DATABASE_ITEM_ID);
152 }
153 }
154
155 attributes << new Attribute(databaseDesc, BaseTypes::STRING_TYPE(), Attribute::Required | Attribute::NeedValidateEncoding, diamondDatabasePath);
156 attributes << new Attribute(code, BaseTypes::NUM_TYPE(), Attribute::None, 1);
157 attributes << new Attribute(sense, BaseTypes::STRING_TYPE(), Attribute::None, DiamondClassifyTaskSettings::SENSITIVE_DEFAULT);
158 attributes << new Attribute(topAlignmentsPercentage, BaseTypes::NUM_TYPE(), Attribute::None, 10);
159 attributes << new Attribute(fshift, BaseTypes::NUM_TYPE(), Attribute::None, 0);
160 attributes << new Attribute(evalue, BaseTypes::NUM_TYPE(), Attribute::None, 0.001);
161 attributes << new Attribute(matrix, BaseTypes::STRING_TYPE(), Attribute::None, DiamondClassifyTaskSettings::BLOSUM62);
162 attributes << new Attribute(gapopen, BaseTypes::NUM_TYPE(), Attribute::None, -1);
163 attributes << new Attribute(gapextend, BaseTypes::NUM_TYPE(), Attribute::None, -1);
164 attributes << new Attribute(bsize, BaseTypes::NUM_TYPE(), Attribute::None, 0.5); // NB: unless --very-sensitive supported
165 attributes << new Attribute(chunks, BaseTypes::NUM_TYPE(), Attribute::None, 4); // NB: unless --very-sensitive supported
166 attributes << new Attribute(threads, BaseTypes::NUM_TYPE(), Attribute::None, AppContext::getAppSettings()->getAppResourcePool()->getIdealThreadCount());
167 attributes << new Attribute(outputUrlDesc, BaseTypes::STRING_TYPE(), Attribute::Required | Attribute::NeedValidateEncoding | Attribute::CanBeEmpty);
168
169 attributes << new Attribute(classifyToolDesc, BaseTypes::STRING_TYPE(), static_cast<Attribute::Flags>(Attribute::Hidden), WORKFLOW_CLASSIFY_TOOL_DIAMOND);
170 }
171
172 QMap<QString, PropertyDelegate *> delegates;
173 {
174 {
175 QList<StrStrPair> dataPathItems;
176 dataPathItems << StrStrPair(NgsReadsClassificationPlugin::DIAMOND_UNIPROT_50_DATABASE_DATA_ID, NgsReadsClassificationPlugin::DIAMOND_UNIPROT_50_DATABASE_ITEM_ID);
177 dataPathItems << StrStrPair(NgsReadsClassificationPlugin::DIAMOND_UNIPROT_90_DATABASE_DATA_ID, NgsReadsClassificationPlugin::DIAMOND_UNIPROT_90_DATABASE_ITEM_ID);
178 delegates[DATABASE_ATTR_ID] = new DatabaseDelegate(ACTOR_ID, DATABASE_ATTR_ID, dataPathItems, "diamond/database", false);
179 }
180 {
181 QList<ComboItem> idMap;
182 QList<DNATranslation *> TTs = AppContext::getDNATranslationRegistry()->lookupTranslation(AppContext::getDNAAlphabetRegistry()->findById(BaseDNAAlphabetIds::NUCL_DNA_DEFAULT()),
183 DNATranslationType_NUCL_2_AMINO);
184 int prefixLen = QString(DNATranslationID(1)).size() - 1;
185 foreach (DNATranslation *tt, TTs) {
186 QString id = tt->getTranslationId();
187 idMap.append(qMakePair(tt->getTranslationName(), id.mid(prefixLen).toInt()));
188 }
189 delegates[GENCODE_ATTR_ID] = new ComboBoxDelegate(idMap);
190 }
191 {
192 QList<ComboItem> items;
193 items.append(qMakePair(DiamondClassifyPrompter::tr("Default"), DiamondClassifyTaskSettings::SENSITIVE_DEFAULT));
194 items.append(qMakePair(DiamondClassifyPrompter::tr("Sensitive"), DiamondClassifyTaskSettings::SENSITIVE_HIGH));
195 items.append(qMakePair(DiamondClassifyPrompter::tr("More sensitive"), DiamondClassifyTaskSettings::SENSITIVE_ULTRA));
196 delegates[SENSITIVE_ATTR_ID] = new ComboBoxDelegate(items);
197 }
198 {
199 QVariantMap map;
200 map["minimum"] = 0;
201 map["maximum"] = 100;
202 map["suffix"] = "%";
203 delegates[TOP_ALIGNMENTS_PERCENTAGE_ATTR_ID] = new SpinBoxDelegate(map);
204 }
205 {
206 QVariantMap map;
207 map[DiamondClassifyTaskSettings::BLOSUM45] = DiamondClassifyTaskSettings::BLOSUM45;
208 map[DiamondClassifyTaskSettings::BLOSUM50] = DiamondClassifyTaskSettings::BLOSUM50;
209 map[DiamondClassifyTaskSettings::BLOSUM62] = DiamondClassifyTaskSettings::BLOSUM62;
210 map[DiamondClassifyTaskSettings::BLOSUM80] = DiamondClassifyTaskSettings::BLOSUM80;
211 map[DiamondClassifyTaskSettings::BLOSUM90] = DiamondClassifyTaskSettings::BLOSUM90;
212 map[DiamondClassifyTaskSettings::PAM30] = DiamondClassifyTaskSettings::PAM30;
213 map[DiamondClassifyTaskSettings::PAM70] = DiamondClassifyTaskSettings::PAM70;
214 map[DiamondClassifyTaskSettings::PAM250] = DiamondClassifyTaskSettings::PAM250;
215 delegates[MATRIX_ATTR_ID] = new ComboBoxDelegate(map);
216 }
217
218 {
219 QVariantMap map;
220 map["minimum"] = -1;
221 map["maximum"] = std::numeric_limits<int>::max();
222 map["specialValueText"] = DiamondClassifyPrompter::tr("Default");
223 delegates[GO_PEN_ATTR_ID] = new SpinBoxDelegate(map);
224 }
225 {
226 QVariantMap map;
227 map["minimum"] = -1;
228 map["maximum"] = std::numeric_limits<int>::max();
229 map["specialValueText"] = DiamondClassifyPrompter::tr("Default");
230 delegates[GE_PEN_ATTR_ID] = new SpinBoxDelegate(map);
231 }
232
233 {
234 QVariantMap map;
235 map["minimum"] = 0;
236 map["maximum"] = std::numeric_limits<int>::max();
237 map["specialValueText"] = DiamondClassifyPrompter::tr("Skipped");
238 delegates[FSHIFT_ATTR_ID] = new SpinBoxDelegate(map);
239 }
240 {
241 QVariantMap map;
242 map["minimum"] = 0;
243 map["maximum"] = std::numeric_limits<int>::max();
244 map["specialValueText"] = DiamondClassifyPrompter::tr("Default");
245 delegates[CHUNKS_ATTR_ID] = new SpinBoxDelegate(map);
246 }
247
248 {
249 QVariantMap map;
250 map["minimum"] = 0;
251 map["singleStep"] = 0.001;
252 map["decimals"] = 4;
253 delegates[EVALUE_ATTR_ID] = new DoubleSpinBoxDelegate(map);
254 }
255
256 {
257 QVariantMap map;
258 map["minimum"] = 0;
259 map["singleStep"] = 0.1;
260 map["decimals"] = 2;
261 map["specialValueText"] = DiamondClassifyPrompter::tr("Default");
262 delegates[BSIZE_ATTR_ID] = new DoubleSpinBoxDelegate(map);
263 }
264
265 QVariantMap threadsNumberProperties;
266 threadsNumberProperties["minimum"] = 1;
267 threadsNumberProperties["maximum"] = QThread::idealThreadCount();
268 delegates[THREADS_ATTR_ID] = new SpinBoxDelegate(threadsNumberProperties);
269
270 DelegateTags outputUrlTags;
271 outputUrlTags.set(DelegateTags::PLACEHOLDER_TEXT, "Auto");
272 outputUrlTags.set(DelegateTags::FILTER, DialogUtils::prepareDocumentsFileFilter(BaseDocumentFormats::PLAIN_TEXT, true, QStringList()));
273 outputUrlTags.set(DelegateTags::FORMAT, BaseDocumentFormats::PLAIN_TEXT);
274 delegates[OUTPUT_URL_ATTR_ID] = new URLDelegate(outputUrlTags, "diamond/output");
275 }
276
277 const Descriptor desc(ACTOR_ID,
278 DiamondClassifyPrompter::tr("Classify Sequences with DIAMOND"),
279 DiamondClassifyPrompter::tr("In general, DIAMOND is a sequence aligner for protein and translated DNA "
280 "searches similar to the NCBI BLAST software tools. However, it provides a "
281 "speedup of BLAST ranging up to x20,000."
282 "<br><br>"
283 "Using this workflow element one can use DIAMOND for taxonomic classification of "
284 "short DNA reads and longer sequences such as contigs. The lowest common "
285 "ancestor (LCA) algorithm is used for the classification."));
286
287 ActorPrototype *proto = new IntegralBusActorPrototype(desc, ports, attributes);
288 proto->setEditor(new DelegateEditor(delegates));
289 proto->setPrompter(new DiamondClassifyPrompter(nullptr));
290 proto->addExternalTool(DiamondSupport::TOOL_ID);
291 WorkflowEnv::getProtoRegistry()->registerProto(NgsReadsClassificationPlugin::WORKFLOW_ELEMENTS_GROUP, proto);
292
293 DomainFactory *localDomain = WorkflowEnv::getDomainRegistry()->getById(LocalDomainFactory::ID);
294 localDomain->registerEntry(new DiamondClassifyWorkerFactory());
295 }
296
cleanup()297 void DiamondClassifyWorkerFactory::cleanup() {
298 delete WorkflowEnv::getProtoRegistry()->unregisterProto(ACTOR_ID);
299
300 DomainFactory *localDomain = WorkflowEnv::getDomainRegistry()->getById(LocalDomainFactory::ID);
301 delete localDomain->unregisterEntry(ACTOR_ID);
302 }
303
304 } // namespace LocalWorkflow
305 } // namespace U2
306