1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "DnaAssemblyUtils.h"
23
24 #include <QAction>
25 #include <QDir>
26 #include <QMessageBox>
27
28 #include <U2Algorithm/DnaAssemblyAlgRegistry.h>
29 #include <U2Algorithm/DnaAssemblyMultiTask.h>
30 #include <U2Algorithm/GenomeAssemblyMultiTask.h>
31 #include <U2Algorithm/GenomeAssemblyRegistry.h>
32
33 #include <U2Core/AddDocumentTask.h>
34 #include <U2Core/AppContext.h>
35 #include <U2Core/AppSettings.h>
36 #include <U2Core/BaseDocumentFormats.h>
37 #include <U2Core/DNASequenceObject.h>
38 #include <U2Core/DocumentModel.h>
39 #include <U2Core/DocumentUtils.h>
40 #include <U2Core/GObjectSelection.h>
41 #include <U2Core/GUrlUtils.h>
42 #include <U2Core/L10n.h>
43 #include <U2Core/MultiTask.h>
44 #include <U2Core/ProjectModel.h>
45 #include <U2Core/QObjectScopedPointer.h>
46 #include <U2Core/U2OpStatusUtils.h>
47 #include <U2Core/U2SafePoints.h>
48 #include <U2Core/UserApplicationsSettings.h>
49
50 #include <U2Formats/ConvertAssemblyToSamTask.h>
51 #include <U2Formats/ConvertFileTask.h>
52 #include <U2Formats/FastqFormat.h>
53 #include <U2Formats/PairedFastqComparator.h>
54
55 #include <U2Gui/OpenViewTask.h>
56 #include <U2Gui/ToolsMenu.h>
57
58 #include "BuildIndexDialog.h"
59 #include "ConvertAssemblyToSamDialog.h"
60 #include "DnaAssemblyDialog.h"
61 #include "GenomeAssemblyDialog.h"
62
63 namespace U2 {
64
DnaAssemblySupport()65 DnaAssemblySupport::DnaAssemblySupport() {
66 QAction *convertAssemblyToSamAction = new QAction(tr("Convert UGENE assembly database to SAM..."), this);
67 convertAssemblyToSamAction->setObjectName(ToolsMenu::NGS_CONVERT_SAM);
68 convertAssemblyToSamAction->setIcon(QIcon(":core/images/align.png"));
69 connect(convertAssemblyToSamAction, SIGNAL(triggered()), SLOT(sl_showConvertToSamDialog()));
70 ToolsMenu::addAction(ToolsMenu::NGS_MENU, convertAssemblyToSamAction);
71
72 QAction *dnaAssemblyAction = new QAction(tr("Map reads to reference..."), this);
73 dnaAssemblyAction->setObjectName(ToolsMenu::NGS_MAP);
74 dnaAssemblyAction->setIcon(QIcon(":core/images/align.png"));
75 connect(dnaAssemblyAction, SIGNAL(triggered()), SLOT(sl_showDnaAssemblyDialog()));
76 ToolsMenu::addAction(ToolsMenu::NGS_MENU, dnaAssemblyAction);
77
78 QAction *buildIndexAction = new QAction(tr("Build index for reads mapping..."), this);
79 buildIndexAction->setObjectName(ToolsMenu::NGS_INDEX);
80 buildIndexAction->setIcon(QIcon(":core/images/align.png"));
81 connect(buildIndexAction, SIGNAL(triggered()), SLOT(sl_showBuildIndexDialog()));
82 ToolsMenu::addAction(ToolsMenu::NGS_MENU, buildIndexAction);
83 }
84
sl_showDnaAssemblyDialog()85 void DnaAssemblySupport::sl_showDnaAssemblyDialog() {
86 DnaAssemblyAlgRegistry *registry = AppContext::getDnaAssemblyAlgRegistry();
87 if (registry->getRegisteredAlgorithmIds().isEmpty()) {
88 QMessageBox::information(QApplication::activeWindow(), tr("DNA Assembly"), tr("There are no algorithms for DNA assembly available.\nPlease, check your plugin list."));
89 return;
90 }
91
92 QObjectScopedPointer<DnaAssemblyDialog> dlg = new DnaAssemblyDialog(QApplication::activeWindow());
93 dlg->exec();
94 CHECK(!dlg.isNull(), );
95
96 if (QDialog::Accepted == dlg->result()) {
97 DnaAssemblyToRefTaskSettings s;
98 s.samOutput = dlg->isSamOutput();
99 s.refSeqUrl = dlg->getRefSeqUrl();
100 s.algName = dlg->getAlgorithmName();
101 s.resultFileName = dlg->getResultFileName();
102 s.setCustomSettings(dlg->getCustomSettings());
103 s.shortReadSets = dlg->getShortReadSets();
104 s.pairedReads = dlg->isPaired();
105 s.openView = true;
106 s.prebuiltIndex = dlg->isPrebuiltIndex();
107 Task *assemblyTask = new DnaAssemblyTaskWithConversions(s, true);
108 AppContext::getTaskScheduler()->registerTopLevelTask(assemblyTask);
109 }
110 }
111
sl_showGenomeAssemblyDialog()112 void DnaAssemblySupport::sl_showGenomeAssemblyDialog() {
113 GenomeAssemblyAlgRegistry *registry = AppContext::getGenomeAssemblyAlgRegistry();
114 if (registry->getRegisteredAlgorithmIds().isEmpty()) {
115 QMessageBox::information(QApplication::activeWindow(), tr("Genome Assembly"), tr("There are no algorithms for genome assembly available.\nPlease, check external tools in the settings."));
116 return;
117 }
118
119 QObjectScopedPointer<GenomeAssemblyDialog> dlg = new GenomeAssemblyDialog(QApplication::activeWindow());
120 dlg->exec();
121 CHECK(!dlg.isNull(), );
122
123 if (QDialog::Accepted == dlg->result()) {
124 GenomeAssemblyTaskSettings s;
125 s.algName = dlg->getAlgorithmName();
126 s.outDir = dlg->getOutDir();
127 s.setCustomSettings(dlg->getCustomSettings());
128 s.reads = dlg->getReads();
129 s.openView = true;
130 Task *assemblyTask = new GenomeAssemblyMultiTask(s);
131 AppContext::getTaskScheduler()->registerTopLevelTask(assemblyTask);
132 }
133 }
134
sl_showBuildIndexDialog()135 void DnaAssemblySupport::sl_showBuildIndexDialog() {
136 DnaAssemblyAlgRegistry *registry = AppContext::getDnaAssemblyAlgRegistry();
137 if (registry->getRegisteredAlgorithmIds().isEmpty()) {
138 QMessageBox::information(QApplication::activeWindow(), tr("DNA Assembly"), tr("There are no algorithms for DNA assembly available.\nPlease, check your plugin list."));
139 return;
140 }
141
142 QObjectScopedPointer<BuildIndexDialog> dlg = new BuildIndexDialog(registry, QApplication::activeWindow());
143 dlg->exec();
144 CHECK(!dlg.isNull(), );
145
146 if (QDialog::Accepted == dlg->result()) {
147 DnaAssemblyToRefTaskSettings s;
148 s.refSeqUrl = dlg->getRefSeqUrl();
149 s.algName = dlg->getAlgorithmName();
150 s.resultFileName = dlg->getIndexFileName();
151 s.indexFileName = dlg->getIndexFileName();
152 s.setCustomSettings(dlg->getCustomSettings());
153 s.openView = false;
154 s.prebuiltIndex = false;
155 s.pairedReads = false;
156 Task *assemblyTask = new DnaAssemblyTaskWithConversions(s, false, true);
157 AppContext::getTaskScheduler()->registerTopLevelTask(assemblyTask);
158 }
159 }
160
sl_showConvertToSamDialog()161 void DnaAssemblySupport::sl_showConvertToSamDialog() {
162 QObjectScopedPointer<ConvertAssemblyToSamDialog> dlg = new ConvertAssemblyToSamDialog(QApplication::activeWindow());
163 dlg->exec();
164 CHECK(!dlg.isNull(), );
165
166 if (QDialog::Accepted == dlg->result()) {
167 Task *convertTask = new ConvertAssemblyToSamTask(dlg->getDbFileUrl(), dlg->getSamFileUrl());
168 AppContext::getTaskScheduler()->registerTopLevelTask(convertTask);
169 }
170 }
171
172 namespace {
173 enum Result {
174 UNKNOWN,
175 CORRECT,
176 INCORRECT
177 };
178
isCorrectFormat(const GUrl & url,const QStringList & targetFormats,QString & detectedFormat)179 static Result isCorrectFormat(const GUrl &url, const QStringList &targetFormats, QString &detectedFormat) {
180 DocumentUtils::Detection r = DocumentUtils::detectFormat(url, detectedFormat);
181 CHECK(DocumentUtils::UNKNOWN != r, UNKNOWN);
182
183 bool correct = targetFormats.contains(detectedFormat);
184 if (correct) {
185 return CORRECT;
186 }
187 return INCORRECT;
188 }
189
getConvertTask(const GUrl & url,const QStringList & targetFormats)190 ConvertFileTask *getConvertTask(const GUrl &url, const QStringList &targetFormats) {
191 QString detectedFormat;
192 Result r = isCorrectFormat(url, targetFormats, detectedFormat);
193 if (UNKNOWN == r) {
194 coreLog.info("Unknown file format: " + url.getURLString());
195 return nullptr;
196 }
197
198 if (INCORRECT == r) {
199 QDir dir = QFileInfo(url.getURLString()).absoluteDir();
200 return new DefaultConvertFileTask(url, detectedFormat, targetFormats.first(), dir.absolutePath());
201 }
202 return nullptr;
203 }
204 } // namespace
205
/**
 * Classifies the file `url` against `targetFormats` using isCorrectFormat().
 * Files of unknown format are appended to `unknownFormatFiles`; files whose
 * detected format is not accepted are stored in `result` as
 * URL string -> detected format. Both names must be visible at the expansion
 * site. `url` is evaluated more than once, so pass a side-effect-free expression.
 *
 * The body is wrapped in do { } while (false) so that the macro is a single
 * statement, its locals do not leak into the caller's scope, and it can be
 * expanded several times in one scope or used as an unbraced if/else branch.
 */
#define CHECK_FILE(url, targetFormats) \
    do { \
        QString checkedFileFormat; \
        const Result checkedFileResult = isCorrectFormat((url), (targetFormats), checkedFileFormat); \
        if (UNKNOWN == checkedFileResult) { \
            unknownFormatFiles << (url); \
        } else if (INCORRECT == checkedFileResult) { \
            result[(url).getURLString()] = checkedFileFormat; \
        } \
    } while (false)
214
/**
 * Schedules a format conversion for the file `url` unless its URL string is
 * already present in `toConvert`. If getConvertTask() returns a task, it is
 * added as a sub-task, `conversionTasksCount` is incremented and the URL string
 * is recorded in `toConvert`. `toConvert`, `conversionTasksCount` and
 * `addSubTask` must be visible at the expansion site. `url` is evaluated more
 * than once, so pass a side-effect-free expression.
 *
 * The body is wrapped in do { } while (false) so that the macro is a single
 * statement: a bare `if` would break a following `else` at the call site and
 * would leak the local task pointer into the caller's scope.
 */
#define PREPARE_FILE(url, targetFormats) \
    do { \
        if (!toConvert.contains((url).getURLString())) { \
            ConvertFileTask *preparedFileTask = getConvertTask((url), (targetFormats)); \
            if (nullptr != preparedFileTask) { \
                addSubTask(preparedFileTask); \
                conversionTasksCount++; \
                toConvert << (url).getURLString(); \
            } \
        } \
    } while (false)
224
toConvert(const DnaAssemblyToRefTaskSettings & settings,QList<GUrl> & unknownFormatFiles)225 QMap<QString, QString> DnaAssemblySupport::toConvert(const DnaAssemblyToRefTaskSettings &settings, QList<GUrl> &unknownFormatFiles) {
226 QMap<QString, QString> result;
227 DnaAssemblyAlgorithmEnv *env = AppContext::getDnaAssemblyAlgRegistry()->getAlgorithm(settings.algName);
228 SAFE_POINT(nullptr != env, "Unknown algorithm: " + settings.algName, result);
229
230 foreach (const GUrl &url, settings.getShortReadUrls()) {
231 CHECK_FILE(url, env->getReadsFormats());
232 }
233
234 if (!settings.prebuiltIndex) {
235 CHECK_FILE(settings.refSeqUrl, env->getRefrerenceFormats());
236 }
237 return result;
238 }
239
toConvertText(const QMap<QString,QString> & files)240 QString DnaAssemblySupport::toConvertText(const QMap<QString, QString> &files) {
241 QStringList strings;
242 foreach (const QString &url, files.keys()) {
243 QString format = files[url];
244 strings << url + " [" + format + "]";
245 }
246 return strings.join("\n");
247 }
248
unknownText(const QList<GUrl> & unknownFormatFiles)249 QString DnaAssemblySupport::unknownText(const QList<GUrl> &unknownFormatFiles) {
250 QStringList strings;
251 foreach (const GUrl &url, unknownFormatFiles) {
252 strings << url.getURLString();
253 }
254 return strings.join("\n");
255 }
256
257 /************************************************************************/
258 /* FilterUnpairedReads */
259 /************************************************************************/
FilterUnpairedReadsTask(const DnaAssemblyToRefTaskSettings & settings)260 FilterUnpairedReadsTask::FilterUnpairedReadsTask(const DnaAssemblyToRefTaskSettings &settings)
261 : Task(tr("Filter unpaired reads task"), TaskFlags_FOSE_COSC),
262 settings(settings) {
263 tmpDirPath = settings.tmpDirectoryForFilteredFiles.isEmpty() ? AppContext::getAppSettings()->getUserAppsSettings()->getCurrentProcessTemporaryDirPath() : settings.tmpDirectoryForFilteredFiles;
264 }
265
run()266 void FilterUnpairedReadsTask::run() {
267 SAFE_POINT_EXT(settings.pairedReads,
268 setError(tr("Filtering unpaired reads is launched on not-paired data")), );
269
270 QList<ShortReadSet> upstream;
271 QList<ShortReadSet> downstream;
272 foreach (const ShortReadSet &set, settings.shortReadSets) {
273 if (set.order == ShortReadSet::UpstreamMate) {
274 upstream << set;
275 } else {
276 downstream << set;
277 }
278 }
279 SAFE_POINT_EXT(upstream.size() == downstream.size(), setError(tr("The count of upstream files is not equal to the count of downstream files")), );
280
281 for (int i = 0; i < upstream.size(); i++) {
282 QString tmpFileUpstream = getTmpFilePath(upstream[i].url);
283 CHECK_OP(stateInfo, );
284 QString tmpFileDownstream = getTmpFilePath(downstream[i].url);
285 CHECK_OP(stateInfo, );
286
287 filteredReads << ShortReadSet(GUrl(tmpFileUpstream), ShortReadSet::PairedEndReads, ShortReadSet::UpstreamMate);
288 filteredReads << ShortReadSet(GUrl(tmpFileDownstream), ShortReadSet::PairedEndReads, ShortReadSet::DownstreamMate);
289
290 compareFiles(upstream[i].url.getURLString(), downstream[i].url.getURLString(), tmpFileUpstream, tmpFileDownstream);
291 CHECK_OP(stateInfo, );
292 }
293 }
294
getTmpFilePath(const GUrl & initialFile)295 QString FilterUnpairedReadsTask::getTmpFilePath(const GUrl &initialFile) {
296 QString result = GUrlUtils::prepareTmpFileLocation(tmpDirPath, initialFile.baseFileName(), "fastq", stateInfo);
297 CHECK_OP(stateInfo, QString());
298 return result;
299 }
300
compareFiles(const GUrl & upstream,const GUrl & downstream,const GUrl & upstreamFiltered,const GUrl & downstreamFiltered)301 void FilterUnpairedReadsTask::compareFiles(const GUrl &upstream, const GUrl &downstream, const GUrl &upstreamFiltered, const GUrl &downstreamFiltered) {
302 PairedFastqComparator comparator(upstream.getURLString(), downstream.getURLString(), upstreamFiltered.getURLString(), downstreamFiltered.getURLString(), stateInfo);
303 CHECK_OP(stateInfo, );
304 comparator.compare(stateInfo);
305 CHECK_OP(stateInfo, );
306
307 if (comparator.getUnpairedCount() != 0) {
308 stateInfo.addWarning(tr("%1 read pairs were mapped, %2 reads without a pair from files \"%3\" and \"%4\" were skipped.")
309 .arg(comparator.getPairsCount())
310 .arg(comparator.getUnpairedCount())
311 .arg(QFileInfo(upstream.getURLString()).fileName())
312 .arg(QFileInfo(downstream.getURLString()).fileName()));
313 }
314 }
315
316 /************************************************************************/
317 /* DnaAssemblyTaskWithConversions */
318 /************************************************************************/
DnaAssemblyTaskWithConversions(const DnaAssemblyToRefTaskSettings & settings,bool viewResult,bool justBuildIndex)319 DnaAssemblyTaskWithConversions::DnaAssemblyTaskWithConversions(const DnaAssemblyToRefTaskSettings &settings, bool viewResult, bool justBuildIndex)
320 : ExternalToolSupportTask("Dna assembly task", TaskFlags(TaskFlags_NR_FOSCOE | TaskFlag_CollectChildrenWarnings)), settings(settings), viewResult(viewResult),
321 justBuildIndex(justBuildIndex), conversionTasksCount(0), assemblyTask(nullptr) {
322 }
323
getSettings() const324 const DnaAssemblyToRefTaskSettings &DnaAssemblyTaskWithConversions::getSettings() const {
325 return settings;
326 }
prepare()327 void DnaAssemblyTaskWithConversions::prepare() {
328 DnaAssemblyAlgorithmEnv *env = AppContext::getDnaAssemblyAlgRegistry()->getAlgorithm(settings.algName);
329 if (env == nullptr) {
330 setError(QString("Algorithm %1 is not found").arg(settings.algName));
331 return;
332 }
333
334 QSet<QString> toConvert;
335 Q_UNUSED(toConvert);
336 foreach (const GUrl &url, settings.getShortReadUrls()) {
337 PREPARE_FILE(url, env->getReadsFormats());
338 }
339
340 if (!settings.prebuiltIndex) {
341 PREPARE_FILE(settings.refSeqUrl, env->getRefrerenceFormats());
342 }
343
344 if (0 == conversionTasksCount) {
345 if (settings.filterUnpaired && settings.pairedReads) {
346 addSubTask(new FilterUnpairedReadsTask(settings));
347 return;
348 }
349 assemblyTask = new DnaAssemblyMultiTask(settings, viewResult, justBuildIndex);
350 assemblyTask->addListeners(getListeners());
351 addSubTask(assemblyTask);
352 }
353 }
354
onSubTaskFinished(Task * subTask)355 QList<Task *> DnaAssemblyTaskWithConversions::onSubTaskFinished(Task *subTask) {
356 QList<Task *> result;
357 FilterUnpairedReadsTask *filterTask = qobject_cast<FilterUnpairedReadsTask *>(subTask);
358 if (filterTask != nullptr) {
359 settings.shortReadSets = filterTask->getFilteredReadList();
360 }
361 CHECK(!subTask->hasError(), result);
362 CHECK(!hasError(), result);
363
364 ConvertFileTask *convertTask = qobject_cast<ConvertFileTask *>(subTask);
365 if (nullptr != convertTask) {
366 SAFE_POINT_EXT(conversionTasksCount > 0, setError("Conversions task count error"), result);
367 if (convertTask->getSourceURL() == settings.refSeqUrl) {
368 settings.refSeqUrl = convertTask->getResult();
369 }
370
371 for (QList<ShortReadSet>::Iterator i = settings.shortReadSets.begin(); i != settings.shortReadSets.end(); i++) {
372 if (convertTask->getSourceURL() == i->url) {
373 i->url = convertTask->getResult();
374 }
375 }
376 conversionTasksCount--;
377
378 if (0 == conversionTasksCount) {
379 if (settings.filterUnpaired && settings.pairedReads) {
380 result << new FilterUnpairedReadsTask(settings);
381 return result;
382 }
383 assemblyTask = new DnaAssemblyMultiTask(settings, viewResult, justBuildIndex);
384 result << assemblyTask;
385 }
386 }
387 if (settings.filterUnpaired && filterTask != nullptr) {
388 assemblyTask = new DnaAssemblyMultiTask(settings, viewResult, justBuildIndex);
389 result << assemblyTask;
390 }
391
392 return result;
393 }
394
report()395 Task::ReportResult DnaAssemblyTaskWithConversions::report() {
396 if (settings.filterUnpaired && settings.pairedReads) {
397 foreach (const ShortReadSet &set, settings.shortReadSets) {
398 if (!QFile::remove(set.url.getURLString())) {
399 stateInfo.addWarning(tr("Cannot remove temporary file %1").arg(set.url.getURLString()));
400 }
401 }
402 }
403 return ReportResult_Finished;
404 }
405
406 } // namespace U2
407