1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21 #include <limits.h>
22
23 #include <U2Core/AppContext.h>
24 #include <U2Core/AppResources.h>
25 #include <U2Core/AppSettings.h>
26 #include <U2Core/Counter.h>
27 #include <U2Core/DNATranslation.h>
28 #include <U2Core/IOAdapter.h>
29 #include <U2Core/LoadDocumentTask.h>
30 #include <U2Core/Log.h>
31 #include <U2Core/TextUtils.h>
32 #include <U2Core/Timer.h>
33 #include <U2Core/U2AssemblyDbi.h>
34 #include <U2Core/U2SafePoints.h>
35 #include <U2Core/UserApplicationsSettings.h>
36
37 #include <U2Formats/BgzipTask.h>
38
39 #include "GenomeAlignerFindTask.h"
40 #include "GenomeAlignerIndex.h"
41 #include "GenomeAlignerIndexTask.h"
42 #include "GenomeAlignerTask.h"
43 #include "ReadShortReadsSubTask.h"
44 #include "WriteAlignedReadsSubTask.h"
45
46 namespace U2 {
47
48 // TODO: calling tr() int static context! Translator is not initialized yet!
49 const QString GenomeAlignerTask::taskName(QObject::tr("UGENE Genome Aligner"));
50 const QString GenomeAlignerTask::OPTION_ALIGN_REVERSED("align_reversed");
51 const QString GenomeAlignerTask::OPTION_OPENCL("use_gpu_optimization");
52 const QString GenomeAlignerTask::OPTION_IF_ABS_MISMATCHES("if_absolute_mismatches_value");
53 const QString GenomeAlignerTask::OPTION_MISMATCHES("mismatches_allowed");
54 const QString GenomeAlignerTask::OPTION_PERCENTAGE_MISMATCHES("mismatches_percentage_allowed");
55 const QString GenomeAlignerTask::OPTION_INDEX_DIR("dir_of_the_index_file");
56 const QString GenomeAlignerTask::OPTION_BEST("best_mode");
57 const QString GenomeAlignerTask::OPTION_QUAL_THRESHOLD("quality_threshold");
58 const QString GenomeAlignerTask::OPTION_READS_MEMORY_SIZE("reads_mem_size");
59 const QString GenomeAlignerTask::OPTION_SEQ_PART_SIZE("seq_part_size");
60
GenomeAlignerTask(const DnaAssemblyToRefTaskSettings & _settings,bool _justBuildIndex)61 GenomeAlignerTask::GenomeAlignerTask(const DnaAssemblyToRefTaskSettings &_settings, bool _justBuildIndex)
62 : DnaAssemblyToReferenceTask(_settings, TaskFlags_NR_FOSE_COSC, _justBuildIndex),
63 loadDbiTask(nullptr),
64 createIndexTask(nullptr),
65 readTask(nullptr),
66 findTask(nullptr),
67 writeTask(nullptr),
68 pWriteTask(nullptr),
69 unzipTask(nullptr),
70 seqReader(nullptr),
71 seqWriter(nullptr),
72 justBuildIndex(_justBuildIndex),
73 index(nullptr),
74 lastQuery(nullptr) {
75 GCOUNTER(cvar, "GenomeAlignerTask");
76 setMaxParallelSubtasks(4);
77 hasResults = false;
78 readsCount = 0;
79 readsAligned = 0;
80 shortreadLoadTime = 0;
81 resultWriteTime = 0;
82 indexLoadTime = 0;
83 shortreadIOTime = 0;
84 currentProgress = 0.0f;
85 noDataToAlign = false;
86
87 alignReversed = settings.getCustomValue(OPTION_ALIGN_REVERSED, true).toBool();
88 alignContext.openCL = settings.getCustomValue(OPTION_OPENCL, false).toBool();
89 alignContext.absMismatches = settings.getCustomValue(OPTION_IF_ABS_MISMATCHES, true).toBool();
90 alignContext.nMismatches = settings.getCustomValue(OPTION_MISMATCHES, 0).toInt();
91 alignContext.ptMismatches = settings.getCustomValue(OPTION_PERCENTAGE_MISMATCHES, 0).toInt();
92 qualityThreshold = settings.getCustomValue(OPTION_QUAL_THRESHOLD, 0).toInt();
93 alignContext.bestMode = settings.getCustomValue(OPTION_BEST, false).toBool();
94 seqPartSize = settings.getCustomValue(OPTION_SEQ_PART_SIZE, 10).toInt();
95 readMemSize = settings.getCustomValue(OPTION_READS_MEMORY_SIZE, 10).toInt();
96 prebuiltIndex = settings.prebuiltIndex;
97
98 QStringList indexExtensions;
99 indexExtensions << ".idx"
100 << ".0.sarr"
101 << ".ref";
102
103 if (!justBuildIndex) {
104 setUpIndexBuilding(indexExtensions);
105 prebuiltIndex = settings.prebuiltIndex;
106 }
107
108 if (settings.indexFileName.isEmpty()) {
109 if (prebuiltIndex) {
110 indexFileName = settings.refSeqUrl.dirPath() + "/" + settings.refSeqUrl.baseFileName();
111 } else {
112 QString tempDir = AppContext::getAppSettings()->getUserAppsSettings()->getCurrentProcessTemporaryDirPath("aligner");
113 QString indexDir = settings.getCustomValue(OPTION_INDEX_DIR, tempDir).toString();
114 QDir indexQDir(indexDir);
115 indexQDir.mkpath(indexDir);
116
117 indexFileName = indexDir + "/" + settings.refSeqUrl.baseFileName() + "." + GenomeAlignerIndex::HEADER_EXTENSION;
118 }
119 } else {
120 indexFileName = settings.indexFileName;
121 }
122
123 taskLog.details(tr("Genome Aligner settings"));
124 taskLog.details(tr("Index file name: %1").arg(indexFileName));
125 taskLog.details(tr("Use prebuilt index: %2").arg(prebuiltIndex));
126
127 qint64 memUseMB = seqPartSize * 13;
128 if (!justBuildIndex) {
129 memUseMB += readMemSize;
130 }
131 addTaskResource(TaskResourceUsage(RESOURCE_MEMORY, memUseMB, true));
132 if (alignContext.openCL) {
133 addTaskResource(TaskResourceUsage(RESOURCE_OPENCL_GPU, 1, true));
134 }
135 }
136
~GenomeAlignerTask()137 GenomeAlignerTask::~GenomeAlignerTask() {
138 delete seqReader;
139 delete seqWriter;
140 qDeleteAll(alignContext.data);
141 alignContext.data.clear();
142 delete index;
143 }
144
prepare()145 void GenomeAlignerTask::prepare() {
146 if (GzipDecompressTask::checkZipped(settings.refSeqUrl)) {
147 temp.open(); // opening creates new temporary file
148 temp.close();
149 unzipTask = new GzipDecompressTask(settings.refSeqUrl, GUrl(QFileInfo(temp).absoluteFilePath()));
150 settings.refSeqUrl = GUrl(QFileInfo(temp).absoluteFilePath());
151 }
152
153 setupCreateIndexTask();
154
155 if (unzipTask != nullptr) {
156 addSubTask(unzipTask);
157 } else {
158 addSubTask(createIndexTask);
159 if (!justBuildIndex && !alignContext.bestMode) {
160 createGenomeAlignerWriteTask();
161 addSubTask(pWriteTask);
162 }
163 }
164 }
165
onSubTaskFinished(Task * subTask)166 QList<Task *> GenomeAlignerTask::onSubTaskFinished(Task *subTask) {
167 QList<Task *> subTasks;
168
169 if (subTask == unzipTask) {
170 subTasks.append(createIndexTask);
171 if (!justBuildIndex && !alignContext.bestMode) {
172 createGenomeAlignerWriteTask();
173 subTasks.append(pWriteTask);
174 }
175 return subTasks;
176 }
177
178 if (subTask == createIndexTask) {
179 SAFE_POINT(createIndexTask != nullptr, "Create index task error", subTasks);
180 delete index;
181 index = nullptr;
182 index = createIndexTask->index;
183 }
184
185 if (justBuildIndex || hasError() || isCanceled()) {
186 if (!justBuildIndex && !alignContext.bestMode) {
187 pWriteTask->setFinished();
188 }
189 return subTasks;
190 }
191
192 assert(createIndexTask != nullptr);
193 qint64 time = (subTask->getTimeInfo().finishTime - subTask->getTimeInfo().startTime);
194 if (subTask == createIndexTask) {
195 seqReader = new GenomeAlignerUrlReader(settings.getShortReadUrls());
196
197 if (seqReader->isEnd()) {
198 if (!hasError()) {
199 QString error = seqReader->getMemberError();
200 if (error.isEmpty()) {
201 setError(tr("Can not init short reads loader."));
202 } else {
203 setError(error);
204 }
205 if (nullptr != pWriteTask) {
206 pWriteTask->setFinished();
207 }
208 }
209 return subTasks;
210 }
211
212 QString assemblyObjectName = settings.resultFileName.baseFileName();
213 bool nonMergedReference = index->getNumberOfSequencesInIndex();
214 QString referenceSequenceName = nonMergedReference ? index->getFirstSequenceObjectName() : QString();
215 if (settings.samOutput) {
216 seqWriter = new GenomeAlignerUrlWriter(settings.resultFileName, assemblyObjectName, index->getSeqLength());
217 } else {
218 QString referenceSequenceUrl = nonMergedReference ? settings.refSeqUrl.getURLString() : QString();
219 try {
220 seqWriter = new GenomeAlignerDbiWriter(settings.resultFileName.getURLString(),
221 assemblyObjectName,
222 index->getSeqLength(),
223 referenceSequenceName,
224 referenceSequenceUrl);
225 } catch (const QString &exeptionMessage) {
226 setError(exeptionMessage);
227 if (nullptr != pWriteTask) {
228 pWriteTask->setFinished();
229 }
230 return subTasks;
231 }
232 }
233 if (!referenceSequenceName.isEmpty()) {
234 seqWriter->setReferenceName(index->getFirstSequenceObjectName());
235 }
236 if (!alignContext.bestMode) {
237 pWriteTask->setSeqWriter(seqWriter);
238 }
239 taskLog.details(QString("Genome aligner index creation time: %1 sec.").arg((double)time / (1000 * 1000)));
240 }
241
242 if (subTask == findTask) {
243 taskLog.details(QString("Aligned in %3 sec.").arg((double)time / (1000 * 1000)));
244 indexLoadTime += findTask->getIndexLoadTime();
245
246 if (alignContext.bestMode) {
247 // ReadShortReadsSubTask can add new data what can lead to realloc. Noone can touch these vectors without sync
248 writeTask = new WriteAlignedReadsSubTask(alignContext.listM, writeLock, seqWriter, alignContext.data, readsAligned);
249 writeTask->setSubtaskProgressWeight(0.0f);
250 subTasks.append(writeTask);
251 return subTasks;
252 }
253 }
254
255 if (subTask == readTask) {
256 shortreadLoadTime += time;
257 shortreadIOTime += time;
258 if (alignContext.data.count() == 0) {
259 // no more reads to align
260 if (!alignContext.bestMode) {
261 pWriteTask->setFinished();
262 }
263 seqWriter->close();
264 noDataToAlign = true;
265 return subTasks;
266 }
267
268 readsCount += readTask->bunchSize;
269 taskLog.details(QString("GenomeAlignerTask: %1 short reads loaded and complemented in %2 sec, file progress %3%.")
270 .arg(readTask->bunchSize)
271 .arg(time / (double)1000000, 0, 'f', 2)
272 .arg(seqReader->getProgress()));
273 }
274
275 if (subTask == createIndexTask || subTask == findTask || subTask == writeTask) {
276 if (subTask == writeTask) {
277 resultWriteTime += time;
278 shortreadIOTime += time;
279 }
280 if (!noDataToAlign) {
281 alignContext.listM.lockForWrite();
282 alignContext.isReadingStarted = false;
283 alignContext.isReadingFinished = false;
284 alignContext.listM.unlock();
285
286 alignContext.minReadLength = INT_MAX;
287 alignContext.maxReadLength = 0;
288 readTask = new ReadShortReadsSubTask(seqReader, settings, alignContext, readMemSize * 1000 * 1000);
289 readTask->setSubtaskProgressWeight(0.0f);
290 subTasks.append(readTask);
291 findTask = new GenomeAlignerFindTask(index, &alignContext, pWriteTask);
292 findTask->setSubtaskProgressWeight(0.0f);
293 subTasks.append(findTask);
294
295 Task *loadIndexTask = new LoadIndexTask(index, &alignContext);
296 loadIndexTask->setSubtaskProgressWeight(0.0f);
297 subTasks.append(loadIndexTask);
298 }
299 return subTasks;
300 }
301
302 return subTasks;
303 }
304
setupCreateIndexTask()305 void GenomeAlignerTask::setupCreateIndexTask() {
306 GenomeAlignerIndexSettings s;
307 s.refFileName = settings.refSeqUrl.getURLString();
308 s.indexFileName = indexFileName;
309 s.justBuildIndex = justBuildIndex;
310 s.seqPartSize = seqPartSize;
311 s.prebuiltIndex = prebuiltIndex;
312 createIndexTask = new GenomeAlignerIndexTask(s);
313 if (justBuildIndex) {
314 createIndexTask->setSubtaskProgressWeight(1.0f);
315 } else {
316 createIndexTask->setSubtaskProgressWeight(0.0f);
317 }
318 }
319
report()320 Task::ReportResult GenomeAlignerTask::report() {
321 TaskTimeInfo inf = getTimeInfo();
322 if (hasError() || isCanceled()) {
323 return ReportResult_Finished;
324 }
325
326 if (justBuildIndex) {
327 return ReportResult_Finished;
328 }
329
330 if (seqWriter->getWrittenReadsCount() == 0) {
331 hasResults = false;
332 return ReportResult_Finished;
333 }
334
335 qint64 nReadsProcessed = readsAligned;
336 if (!alignContext.bestMode) {
337 SAFE_POINT_EXT(nullptr != pWriteTask,
338 stateInfo.setError("No parallel write task in non best mode"),
339 ReportResult_Finished);
340 nReadsProcessed = pWriteTask->getWrittenReadsCount();
341 }
342
343 if (readsCount > 0) {
344 taskLog.info(tr("The aligning is finished."));
345 taskLog.info(tr("Whole working time = %1.").arg((GTimer::currentTimeMicros() - inf.startTime) / (1000 * 1000)));
346 taskLog.info(tr("%1% reads aligned.").arg(100 * (double)nReadsProcessed / readsCount));
347 if (alignContext.bestMode) { // not parallel writing could be measured
348 taskLog.info(tr("Short-reads loading time = %1").arg(shortreadLoadTime / (1000 * 1000)));
349 taskLog.info(tr("Results writing time = %1").arg(resultWriteTime / (1000 * 1000)));
350 }
351 taskLog.info(tr("Index loading time = %1").arg(indexLoadTime));
352 taskLog.info(tr("Short-reads IO time = %1").arg(shortreadIOTime / (1000 * 1000)));
353 }
354
355 hasResults = nReadsProcessed > 0;
356 taskLog.details(tr("Number of reads processed: ") + QString::number(nReadsProcessed));
357
358 return ReportResult_Finished;
359 }
360
calculateWindowSize(bool absMismatches,int nMismatches,int ptMismatches,int minReadLength,int maxReadLength)361 int GenomeAlignerTask::calculateWindowSize(bool absMismatches, int nMismatches, int ptMismatches, int minReadLength, int maxReadLength) {
362 int CMAX = nMismatches;
363 int windowSize = MAX_BIT_MASK_LENGTH;
364 for (int len = minReadLength; len <= maxReadLength; len++) {
365 if (!absMismatches) {
366 CMAX = len * ptMismatches / MAX_PERCENTAGE;
367 }
368 int q = len / (CMAX + 1);
369 if (windowSize > q) {
370 windowSize = q;
371 }
372 }
373 return windowSize;
374 }
375
getIndexPath()376 QString GenomeAlignerTask::getIndexPath() {
377 return indexFileName;
378 }
379
createGenomeAlignerWriteTask()380 void GenomeAlignerTask::createGenomeAlignerWriteTask() {
381 pWriteTask = new GenomeAlignerWriteTask(seqWriter);
382 pWriteTask->setSubtaskProgressWeight(0.0f);
383 }
384
385 } // namespace U2
386