1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "DiamondClassifyWorkerFactory.h"
23 
24 #include <QThread>
25 
26 #include <U2Core/AppContext.h>
27 #include <U2Core/AppResources.h>
28 #include <U2Core/AppSettings.h>
29 #include <U2Core/BaseDocumentFormats.h>
30 #include <U2Core/DNAAlphabet.h>
31 #include <U2Core/DNATranslation.h>
32 #include <U2Core/DataPathRegistry.h>
33 
34 #include <U2Designer/DelegateEditors.h>
35 
36 #include <U2Gui/DialogUtils.h>
37 
38 #include <U2Lang/ActorPrototypeRegistry.h>
39 #include <U2Lang/BaseSlots.h>
40 #include <U2Lang/BaseTypes.h>
41 #include <U2Lang/WorkflowEnv.h>
42 
43 #include "../ngs_reads_classification/src/DatabaseDelegate.h"
44 #include "../ngs_reads_classification/src/NgsReadsClassificationPlugin.h"
45 #include "DiamondClassifyPrompter.h"
46 #include "DiamondClassifyTask.h"
47 #include "DiamondClassifyWorker.h"
48 #include "DiamondSupport.h"
49 
50 namespace U2 {
51 namespace LocalWorkflow {
52 
53 const QString DiamondClassifyWorkerFactory::ACTOR_ID = "diamond-classify";
54 
55 const QString DiamondClassifyWorkerFactory::INPUT_PORT_ID = "in";
56 const QString DiamondClassifyWorkerFactory::OUTPUT_PORT_ID = "out";
57 
58 // Slot should be the same as in GetReadsListWorkerFactory
59 const QString DiamondClassifyWorkerFactory::INPUT_SLOT = "reads-url1";
60 
61 const QString DiamondClassifyWorkerFactory::INPUT_DATA_ATTR_ID("input-data");
62 const QString DiamondClassifyWorkerFactory::DATABASE_ATTR_ID("database");
63 const QString DiamondClassifyWorkerFactory::GENCODE_ATTR_ID("genetic-code");
64 const QString DiamondClassifyWorkerFactory::SENSITIVE_ATTR_ID("sensitive-mode");
65 const QString DiamondClassifyWorkerFactory::TOP_ALIGNMENTS_PERCENTAGE_ATTR_ID("top-alignments-percentage");
66 const QString DiamondClassifyWorkerFactory::FSHIFT_ATTR_ID("frame-shift");
67 const QString DiamondClassifyWorkerFactory::EVALUE_ATTR_ID("e-value");
68 const QString DiamondClassifyWorkerFactory::MATRIX_ATTR_ID("matrix");
69 const QString DiamondClassifyWorkerFactory::GO_PEN_ATTR_ID("gap-open");
70 const QString DiamondClassifyWorkerFactory::GE_PEN_ATTR_ID("gap-extend");
71 const QString DiamondClassifyWorkerFactory::THREADS_ATTR_ID("threads");
72 const QString DiamondClassifyWorkerFactory::BSIZE_ATTR_ID("block-size");
73 const QString DiamondClassifyWorkerFactory::CHUNKS_ATTR_ID("index-chunks");
74 const QString DiamondClassifyWorkerFactory::OUTPUT_URL_ATTR_ID("output-url");
75 
76 const QString DiamondClassifyWorkerFactory::WORKFLOW_CLASSIFY_TOOL_DIAMOND = "DIAMOND";
77 
DiamondClassifyWorkerFactory()78 DiamondClassifyWorkerFactory::DiamondClassifyWorkerFactory()
79     : DomainFactory(ACTOR_ID) {
80 }
81 
createWorker(Actor * actor)82 Worker *DiamondClassifyWorkerFactory::createWorker(Actor *actor) {
83     return new DiamondClassifyWorker(actor);
84 }
85 
init()86 void DiamondClassifyWorkerFactory::init() {
87     QList<PortDescriptor *> ports;
88     {
89         const Descriptor inSlotDesc(INPUT_SLOT,
90                                     DiamondClassifyPrompter::tr("Input URL"),
91                                     DiamondClassifyPrompter::tr("Input URL."));
92 
93         QMap<Descriptor, DataTypePtr> inType;
94         inType[inSlotDesc] = BaseTypes::STRING_TYPE();
95 
96         QMap<Descriptor, DataTypePtr> outType;
97         outType[TaxonomySupport::TAXONOMY_CLASSIFICATION_SLOT()] = TaxonomySupport::TAXONOMY_CLASSIFICATION_TYPE();
98 
99         const Descriptor inPortDesc(INPUT_PORT_ID,
100                                     DiamondClassifyPrompter::tr("Input sequences"),
101                                     DiamondClassifyPrompter::tr("URL(s) to FASTQ or FASTA file(s) should be provided.\n\n"
102                                                                 "The input files may contain single-end reads, contigs, or \"left\" reads in case of the paired-end sequencing (see \"Input data\" parameter of the element)."));
103 
104         const Descriptor outPortDesc(OUTPUT_PORT_ID,
105                                      DiamondClassifyPrompter::tr("DIAMOND Classification"),
106                                      DiamondClassifyPrompter::tr("A list of sequence names with the associated taxonomy IDs, classified by DIAMOND."));
107 
108         ports << new PortDescriptor(inPortDesc, DataTypePtr(new MapDataType(ACTOR_ID + "-in", inType)), true /*input*/);
109         ports << new PortDescriptor(outPortDesc, DataTypePtr(new MapDataType(ACTOR_ID + "-out", outType)), false /*input*/, true /*multi*/);
110     }
111 
112     QList<Attribute *> attributes;
113     {
114         Descriptor databaseDesc(DATABASE_ATTR_ID, DiamondClassifyPrompter::tr("Database"), DiamondClassifyPrompter::tr("Input a binary DIAMOND database file."));
115 
116         Descriptor code(GENCODE_ATTR_ID, DiamondClassifyPrompter::tr("Genetic code"), DiamondClassifyPrompter::tr("Genetic code used for translation of query sequences (--query-gencode)."));
117         Descriptor sense(SENSITIVE_ATTR_ID, DiamondClassifyPrompter::tr("Sensitive mode"), DiamondClassifyPrompter::tr("The sensitive modes (--sensitive, --more-sensitive) are generally recommended for aligning longer sequences. The default mode is mainly designed for short read alignment, i.e. finding significant matches of >50 bits on 30-40aa fragments."));
118         Descriptor topAlignmentsPercentage(TOP_ALIGNMENTS_PERCENTAGE_ATTR_ID,
119                                            DiamondClassifyPrompter::tr("Top alignments percentage"),
120                                            DiamondClassifyPrompter::tr("DIAMOND uses the lowest common ancestor (LCA) algorithm for taxonomy classification of the input sequences. This parameter specifies what alignments should be taken into account during the calculations (--top)."
121                                                                        "<br><br>"
122                                                                        "For example, the default value \"10\" means to take top 10% of the best hits (i.e. sort all query/subject-alignments by score, take top 10% of the alignments with the best score, calculate the lowest common ancestor for them)."));
123         Descriptor fshift(FSHIFT_ATTR_ID, DiamondClassifyPrompter::tr("Frameshift"), DiamondClassifyPrompter::tr("Penalty for frameshift in DNA-vs-protein alignments. Values around 15 are reasonable for this parameter. Enabling this feature will have the aligner tolerate missing bases in DNA sequences and is most recommended for long, error-prone sequences like MinION reads."));
124         Descriptor evalue(EVALUE_ATTR_ID, DiamondClassifyPrompter::tr("Expected value"), DiamondClassifyPrompter::tr("Maximum expected value to report an alignment (--evalue/-e)."));
125         Descriptor matrix(MATRIX_ATTR_ID, DiamondClassifyPrompter::tr("Matrix"), DiamondClassifyPrompter::tr("Scoring matrix (--matrix)."));
126         Descriptor gapopen(GO_PEN_ATTR_ID, DiamondClassifyPrompter::tr("Gap open penalty"), DiamondClassifyPrompter::tr("Gap open penalty (--gapopen)."));
127         Descriptor gapextend(GE_PEN_ATTR_ID, DiamondClassifyPrompter::tr("Gap extension penalty"), DiamondClassifyPrompter::tr("Gap extension penalty (--gapextend)."));
128         Descriptor threads(THREADS_ATTR_ID, DiamondClassifyPrompter::tr("Number of threads"), DiamondClassifyPrompter::tr("Number of CPU threads (--treads)."));
129         Descriptor bsize(BSIZE_ATTR_ID, DiamondClassifyPrompter::tr("Block size"), DiamondClassifyPrompter::tr("Block size in billions of sequence letters to be processed at a time (--block-size). This is the main parameter for controlling the program’s memory usage. Bigger numbers will increase the use of memory and temporary disk space, but also improve performance. The program can be expected to use roughly six times this number of memory (in GB)."));
130         Descriptor chunks(CHUNKS_ATTR_ID, DiamondClassifyPrompter::tr("Index chunks"), DiamondClassifyPrompter::tr("The number of chunks for processing the seed index (--index-chunks). This option can be additionally used to tune the performance. It is recommended to set this to 1 on a high memory server, which will increase performance and memory usage, but not the usage of temporary disk space."));
131         Descriptor outputUrlDesc(OUTPUT_URL_ATTR_ID, DiamondClassifyPrompter::tr("Output file"), DiamondClassifyPrompter::tr("Specify the output file name."
132                                                                                                                              "<br><br>"
133                                                                                                                              "The output file is a tab-delimited file with the following fields:"
134                                                                                                                              "<ul>"
135                                                                                                                              "<li>Query ID</li>"
136                                                                                                                              "<li>NCBI taxonomy ID (0 if unclassified)</li>"
137                                                                                                                              "<li>E-value of the best alignment with a known taxonomy ID found for the query (0 if unclassified)</li>"
138                                                                                                                              "</ul>"));
139 
140         Descriptor classifyToolDesc(NgsReadsClassificationPlugin::WORKFLOW_CLASSIFY_TOOL_ID,
141                                     WORKFLOW_CLASSIFY_TOOL_DIAMOND,
142                                     "Classify tool. Hidden attribute");
143 
144         QString diamondDatabasePath;
145         U2DataPath *uniref50DataPath = AppContext::getDataPathRegistry()->getDataPathByName(NgsReadsClassificationPlugin::DIAMOND_UNIPROT_50_DATABASE_DATA_ID);
146         if (nullptr != uniref50DataPath && uniref50DataPath->isValid()) {
147             diamondDatabasePath = uniref50DataPath->getPathByName(NgsReadsClassificationPlugin::DIAMOND_UNIPROT_50_DATABASE_ITEM_ID);
148         } else {
149             U2DataPath *clarkViralDataPath = AppContext::getDataPathRegistry()->getDataPathByName(NgsReadsClassificationPlugin::DIAMOND_UNIPROT_90_DATABASE_DATA_ID);
150             if (nullptr != clarkViralDataPath && clarkViralDataPath->isValid()) {
151                 diamondDatabasePath = clarkViralDataPath->getPathByName(NgsReadsClassificationPlugin::DIAMOND_UNIPROT_90_DATABASE_ITEM_ID);
152             }
153         }
154 
155         attributes << new Attribute(databaseDesc, BaseTypes::STRING_TYPE(), Attribute::Required | Attribute::NeedValidateEncoding, diamondDatabasePath);
156         attributes << new Attribute(code, BaseTypes::NUM_TYPE(), Attribute::None, 1);
157         attributes << new Attribute(sense, BaseTypes::STRING_TYPE(), Attribute::None, DiamondClassifyTaskSettings::SENSITIVE_DEFAULT);
158         attributes << new Attribute(topAlignmentsPercentage, BaseTypes::NUM_TYPE(), Attribute::None, 10);
159         attributes << new Attribute(fshift, BaseTypes::NUM_TYPE(), Attribute::None, 0);
160         attributes << new Attribute(evalue, BaseTypes::NUM_TYPE(), Attribute::None, 0.001);
161         attributes << new Attribute(matrix, BaseTypes::STRING_TYPE(), Attribute::None, DiamondClassifyTaskSettings::BLOSUM62);
162         attributes << new Attribute(gapopen, BaseTypes::NUM_TYPE(), Attribute::None, -1);
163         attributes << new Attribute(gapextend, BaseTypes::NUM_TYPE(), Attribute::None, -1);
164         attributes << new Attribute(bsize, BaseTypes::NUM_TYPE(), Attribute::None, 0.5);  // NB: unless --very-sensitive supported
165         attributes << new Attribute(chunks, BaseTypes::NUM_TYPE(), Attribute::None, 4);  // NB: unless --very-sensitive supported
166         attributes << new Attribute(threads, BaseTypes::NUM_TYPE(), Attribute::None, AppContext::getAppSettings()->getAppResourcePool()->getIdealThreadCount());
167         attributes << new Attribute(outputUrlDesc, BaseTypes::STRING_TYPE(), Attribute::Required | Attribute::NeedValidateEncoding | Attribute::CanBeEmpty);
168 
169         attributes << new Attribute(classifyToolDesc, BaseTypes::STRING_TYPE(), static_cast<Attribute::Flags>(Attribute::Hidden), WORKFLOW_CLASSIFY_TOOL_DIAMOND);
170     }
171 
172     QMap<QString, PropertyDelegate *> delegates;
173     {
174         {
175             QList<StrStrPair> dataPathItems;
176             dataPathItems << StrStrPair(NgsReadsClassificationPlugin::DIAMOND_UNIPROT_50_DATABASE_DATA_ID, NgsReadsClassificationPlugin::DIAMOND_UNIPROT_50_DATABASE_ITEM_ID);
177             dataPathItems << StrStrPair(NgsReadsClassificationPlugin::DIAMOND_UNIPROT_90_DATABASE_DATA_ID, NgsReadsClassificationPlugin::DIAMOND_UNIPROT_90_DATABASE_ITEM_ID);
178             delegates[DATABASE_ATTR_ID] = new DatabaseDelegate(ACTOR_ID, DATABASE_ATTR_ID, dataPathItems, "diamond/database", false);
179         }
180         {
181             QList<ComboItem> idMap;
182             QList<DNATranslation *> TTs = AppContext::getDNATranslationRegistry()->lookupTranslation(AppContext::getDNAAlphabetRegistry()->findById(BaseDNAAlphabetIds::NUCL_DNA_DEFAULT()),
183                                                                                                      DNATranslationType_NUCL_2_AMINO);
184             int prefixLen = QString(DNATranslationID(1)).size() - 1;
185             foreach (DNATranslation *tt, TTs) {
186                 QString id = tt->getTranslationId();
187                 idMap.append(qMakePair(tt->getTranslationName(), id.mid(prefixLen).toInt()));
188             }
189             delegates[GENCODE_ATTR_ID] = new ComboBoxDelegate(idMap);
190         }
191         {
192             QList<ComboItem> items;
193             items.append(qMakePair(DiamondClassifyPrompter::tr("Default"), DiamondClassifyTaskSettings::SENSITIVE_DEFAULT));
194             items.append(qMakePair(DiamondClassifyPrompter::tr("Sensitive"), DiamondClassifyTaskSettings::SENSITIVE_HIGH));
195             items.append(qMakePair(DiamondClassifyPrompter::tr("More sensitive"), DiamondClassifyTaskSettings::SENSITIVE_ULTRA));
196             delegates[SENSITIVE_ATTR_ID] = new ComboBoxDelegate(items);
197         }
198         {
199             QVariantMap map;
200             map["minimum"] = 0;
201             map["maximum"] = 100;
202             map["suffix"] = "%";
203             delegates[TOP_ALIGNMENTS_PERCENTAGE_ATTR_ID] = new SpinBoxDelegate(map);
204         }
205         {
206             QVariantMap map;
207             map[DiamondClassifyTaskSettings::BLOSUM45] = DiamondClassifyTaskSettings::BLOSUM45;
208             map[DiamondClassifyTaskSettings::BLOSUM50] = DiamondClassifyTaskSettings::BLOSUM50;
209             map[DiamondClassifyTaskSettings::BLOSUM62] = DiamondClassifyTaskSettings::BLOSUM62;
210             map[DiamondClassifyTaskSettings::BLOSUM80] = DiamondClassifyTaskSettings::BLOSUM80;
211             map[DiamondClassifyTaskSettings::BLOSUM90] = DiamondClassifyTaskSettings::BLOSUM90;
212             map[DiamondClassifyTaskSettings::PAM30] = DiamondClassifyTaskSettings::PAM30;
213             map[DiamondClassifyTaskSettings::PAM70] = DiamondClassifyTaskSettings::PAM70;
214             map[DiamondClassifyTaskSettings::PAM250] = DiamondClassifyTaskSettings::PAM250;
215             delegates[MATRIX_ATTR_ID] = new ComboBoxDelegate(map);
216         }
217 
218         {
219             QVariantMap map;
220             map["minimum"] = -1;
221             map["maximum"] = std::numeric_limits<int>::max();
222             map["specialValueText"] = DiamondClassifyPrompter::tr("Default");
223             delegates[GO_PEN_ATTR_ID] = new SpinBoxDelegate(map);
224         }
225         {
226             QVariantMap map;
227             map["minimum"] = -1;
228             map["maximum"] = std::numeric_limits<int>::max();
229             map["specialValueText"] = DiamondClassifyPrompter::tr("Default");
230             delegates[GE_PEN_ATTR_ID] = new SpinBoxDelegate(map);
231         }
232 
233         {
234             QVariantMap map;
235             map["minimum"] = 0;
236             map["maximum"] = std::numeric_limits<int>::max();
237             map["specialValueText"] = DiamondClassifyPrompter::tr("Skipped");
238             delegates[FSHIFT_ATTR_ID] = new SpinBoxDelegate(map);
239         }
240         {
241             QVariantMap map;
242             map["minimum"] = 0;
243             map["maximum"] = std::numeric_limits<int>::max();
244             map["specialValueText"] = DiamondClassifyPrompter::tr("Default");
245             delegates[CHUNKS_ATTR_ID] = new SpinBoxDelegate(map);
246         }
247 
248         {
249             QVariantMap map;
250             map["minimum"] = 0;
251             map["singleStep"] = 0.001;
252             map["decimals"] = 4;
253             delegates[EVALUE_ATTR_ID] = new DoubleSpinBoxDelegate(map);
254         }
255 
256         {
257             QVariantMap map;
258             map["minimum"] = 0;
259             map["singleStep"] = 0.1;
260             map["decimals"] = 2;
261             map["specialValueText"] = DiamondClassifyPrompter::tr("Default");
262             delegates[BSIZE_ATTR_ID] = new DoubleSpinBoxDelegate(map);
263         }
264 
265         QVariantMap threadsNumberProperties;
266         threadsNumberProperties["minimum"] = 1;
267         threadsNumberProperties["maximum"] = QThread::idealThreadCount();
268         delegates[THREADS_ATTR_ID] = new SpinBoxDelegate(threadsNumberProperties);
269 
270         DelegateTags outputUrlTags;
271         outputUrlTags.set(DelegateTags::PLACEHOLDER_TEXT, "Auto");
272         outputUrlTags.set(DelegateTags::FILTER, DialogUtils::prepareDocumentsFileFilter(BaseDocumentFormats::PLAIN_TEXT, true, QStringList()));
273         outputUrlTags.set(DelegateTags::FORMAT, BaseDocumentFormats::PLAIN_TEXT);
274         delegates[OUTPUT_URL_ATTR_ID] = new URLDelegate(outputUrlTags, "diamond/output");
275     }
276 
277     const Descriptor desc(ACTOR_ID,
278                           DiamondClassifyPrompter::tr("Classify Sequences with DIAMOND"),
279                           DiamondClassifyPrompter::tr("In general, DIAMOND is a sequence aligner for protein and translated DNA "
280                                                       "searches similar to the NCBI BLAST software tools. However, it provides a "
281                                                       "speedup of BLAST ranging up to x20,000."
282                                                       "<br><br>"
283                                                       "Using this workflow element one can use DIAMOND for taxonomic classification of "
284                                                       "short DNA reads and longer sequences such as contigs. The lowest common "
285                                                       "ancestor (LCA) algorithm is used for the classification."));
286 
287     ActorPrototype *proto = new IntegralBusActorPrototype(desc, ports, attributes);
288     proto->setEditor(new DelegateEditor(delegates));
289     proto->setPrompter(new DiamondClassifyPrompter(nullptr));
290     proto->addExternalTool(DiamondSupport::TOOL_ID);
291     WorkflowEnv::getProtoRegistry()->registerProto(NgsReadsClassificationPlugin::WORKFLOW_ELEMENTS_GROUP, proto);
292 
293     DomainFactory *localDomain = WorkflowEnv::getDomainRegistry()->getById(LocalDomainFactory::ID);
294     localDomain->registerEntry(new DiamondClassifyWorkerFactory());
295 }
296 
cleanup()297 void DiamondClassifyWorkerFactory::cleanup() {
298     delete WorkflowEnv::getProtoRegistry()->unregisterProto(ACTOR_ID);
299 
300     DomainFactory *localDomain = WorkflowEnv::getDomainRegistry()->getById(LocalDomainFactory::ID);
301     delete localDomain->unregisterEntry(ACTOR_ID);
302 }
303 
304 }  // namespace LocalWorkflow
305 }  // namespace U2
306