1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 #include <limits.h>
22 
23 #include <U2Core/AppContext.h>
24 #include <U2Core/AppResources.h>
25 #include <U2Core/AppSettings.h>
26 #include <U2Core/Counter.h>
27 #include <U2Core/DNATranslation.h>
28 #include <U2Core/IOAdapter.h>
29 #include <U2Core/LoadDocumentTask.h>
30 #include <U2Core/Log.h>
31 #include <U2Core/TextUtils.h>
32 #include <U2Core/Timer.h>
33 #include <U2Core/U2AssemblyDbi.h>
34 #include <U2Core/U2SafePoints.h>
35 #include <U2Core/UserApplicationsSettings.h>
36 
37 #include <U2Formats/BgzipTask.h>
38 
39 #include "GenomeAlignerFindTask.h"
40 #include "GenomeAlignerIndex.h"
41 #include "GenomeAlignerIndexTask.h"
42 #include "GenomeAlignerTask.h"
43 #include "ReadShortReadsSubTask.h"
44 #include "WriteAlignedReadsSubTask.h"
45 
46 namespace U2 {
47 
48 // TODO: calling tr() int static context! Translator is not initialized yet!
49 const QString GenomeAlignerTask::taskName(QObject::tr("UGENE Genome Aligner"));
50 const QString GenomeAlignerTask::OPTION_ALIGN_REVERSED("align_reversed");
51 const QString GenomeAlignerTask::OPTION_OPENCL("use_gpu_optimization");
52 const QString GenomeAlignerTask::OPTION_IF_ABS_MISMATCHES("if_absolute_mismatches_value");
53 const QString GenomeAlignerTask::OPTION_MISMATCHES("mismatches_allowed");
54 const QString GenomeAlignerTask::OPTION_PERCENTAGE_MISMATCHES("mismatches_percentage_allowed");
55 const QString GenomeAlignerTask::OPTION_INDEX_DIR("dir_of_the_index_file");
56 const QString GenomeAlignerTask::OPTION_BEST("best_mode");
57 const QString GenomeAlignerTask::OPTION_QUAL_THRESHOLD("quality_threshold");
58 const QString GenomeAlignerTask::OPTION_READS_MEMORY_SIZE("reads_mem_size");
59 const QString GenomeAlignerTask::OPTION_SEQ_PART_SIZE("seq_part_size");
60 
GenomeAlignerTask(const DnaAssemblyToRefTaskSettings & _settings,bool _justBuildIndex)61 GenomeAlignerTask::GenomeAlignerTask(const DnaAssemblyToRefTaskSettings &_settings, bool _justBuildIndex)
62     : DnaAssemblyToReferenceTask(_settings, TaskFlags_NR_FOSE_COSC, _justBuildIndex),
63       loadDbiTask(nullptr),
64       createIndexTask(nullptr),
65       readTask(nullptr),
66       findTask(nullptr),
67       writeTask(nullptr),
68       pWriteTask(nullptr),
69       unzipTask(nullptr),
70       seqReader(nullptr),
71       seqWriter(nullptr),
72       justBuildIndex(_justBuildIndex),
73       index(nullptr),
74       lastQuery(nullptr) {
75     GCOUNTER(cvar, "GenomeAlignerTask");
76     setMaxParallelSubtasks(4);
77     hasResults = false;
78     readsCount = 0;
79     readsAligned = 0;
80     shortreadLoadTime = 0;
81     resultWriteTime = 0;
82     indexLoadTime = 0;
83     shortreadIOTime = 0;
84     currentProgress = 0.0f;
85     noDataToAlign = false;
86 
87     alignReversed = settings.getCustomValue(OPTION_ALIGN_REVERSED, true).toBool();
88     alignContext.openCL = settings.getCustomValue(OPTION_OPENCL, false).toBool();
89     alignContext.absMismatches = settings.getCustomValue(OPTION_IF_ABS_MISMATCHES, true).toBool();
90     alignContext.nMismatches = settings.getCustomValue(OPTION_MISMATCHES, 0).toInt();
91     alignContext.ptMismatches = settings.getCustomValue(OPTION_PERCENTAGE_MISMATCHES, 0).toInt();
92     qualityThreshold = settings.getCustomValue(OPTION_QUAL_THRESHOLD, 0).toInt();
93     alignContext.bestMode = settings.getCustomValue(OPTION_BEST, false).toBool();
94     seqPartSize = settings.getCustomValue(OPTION_SEQ_PART_SIZE, 10).toInt();
95     readMemSize = settings.getCustomValue(OPTION_READS_MEMORY_SIZE, 10).toInt();
96     prebuiltIndex = settings.prebuiltIndex;
97 
98     QStringList indexExtensions;
99     indexExtensions << ".idx"
100                     << ".0.sarr"
101                     << ".ref";
102 
103     if (!justBuildIndex) {
104         setUpIndexBuilding(indexExtensions);
105         prebuiltIndex = settings.prebuiltIndex;
106     }
107 
108     if (settings.indexFileName.isEmpty()) {
109         if (prebuiltIndex) {
110             indexFileName = settings.refSeqUrl.dirPath() + "/" + settings.refSeqUrl.baseFileName();
111         } else {
112             QString tempDir = AppContext::getAppSettings()->getUserAppsSettings()->getCurrentProcessTemporaryDirPath("aligner");
113             QString indexDir = settings.getCustomValue(OPTION_INDEX_DIR, tempDir).toString();
114             QDir indexQDir(indexDir);
115             indexQDir.mkpath(indexDir);
116 
117             indexFileName = indexDir + "/" + settings.refSeqUrl.baseFileName() + "." + GenomeAlignerIndex::HEADER_EXTENSION;
118         }
119     } else {
120         indexFileName = settings.indexFileName;
121     }
122 
123     taskLog.details(tr("Genome Aligner settings"));
124     taskLog.details(tr("Index file name: %1").arg(indexFileName));
125     taskLog.details(tr("Use prebuilt index: %2").arg(prebuiltIndex));
126 
127     qint64 memUseMB = seqPartSize * 13;
128     if (!justBuildIndex) {
129         memUseMB += readMemSize;
130     }
131     addTaskResource(TaskResourceUsage(RESOURCE_MEMORY, memUseMB, true));
132     if (alignContext.openCL) {
133         addTaskResource(TaskResourceUsage(RESOURCE_OPENCL_GPU, 1, true));
134     }
135 }
136 
~GenomeAlignerTask()137 GenomeAlignerTask::~GenomeAlignerTask() {
138     delete seqReader;
139     delete seqWriter;
140     qDeleteAll(alignContext.data);
141     alignContext.data.clear();
142     delete index;
143 }
144 
prepare()145 void GenomeAlignerTask::prepare() {
146     if (GzipDecompressTask::checkZipped(settings.refSeqUrl)) {
147         temp.open();  // opening creates new temporary file
148         temp.close();
149         unzipTask = new GzipDecompressTask(settings.refSeqUrl, GUrl(QFileInfo(temp).absoluteFilePath()));
150         settings.refSeqUrl = GUrl(QFileInfo(temp).absoluteFilePath());
151     }
152 
153     setupCreateIndexTask();
154 
155     if (unzipTask != nullptr) {
156         addSubTask(unzipTask);
157     } else {
158         addSubTask(createIndexTask);
159         if (!justBuildIndex && !alignContext.bestMode) {
160             createGenomeAlignerWriteTask();
161             addSubTask(pWriteTask);
162         }
163     }
164 }
165 
onSubTaskFinished(Task * subTask)166 QList<Task *> GenomeAlignerTask::onSubTaskFinished(Task *subTask) {
167     QList<Task *> subTasks;
168 
169     if (subTask == unzipTask) {
170         subTasks.append(createIndexTask);
171         if (!justBuildIndex && !alignContext.bestMode) {
172             createGenomeAlignerWriteTask();
173             subTasks.append(pWriteTask);
174         }
175         return subTasks;
176     }
177 
178     if (subTask == createIndexTask) {
179         SAFE_POINT(createIndexTask != nullptr, "Create index task error", subTasks);
180         delete index;
181         index = nullptr;
182         index = createIndexTask->index;
183     }
184 
185     if (justBuildIndex || hasError() || isCanceled()) {
186         if (!justBuildIndex && !alignContext.bestMode) {
187             pWriteTask->setFinished();
188         }
189         return subTasks;
190     }
191 
192     assert(createIndexTask != nullptr);
193     qint64 time = (subTask->getTimeInfo().finishTime - subTask->getTimeInfo().startTime);
194     if (subTask == createIndexTask) {
195         seqReader = new GenomeAlignerUrlReader(settings.getShortReadUrls());
196 
197         if (seqReader->isEnd()) {
198             if (!hasError()) {
199                 QString error = seqReader->getMemberError();
200                 if (error.isEmpty()) {
201                     setError(tr("Can not init short reads loader."));
202                 } else {
203                     setError(error);
204                 }
205                 if (nullptr != pWriteTask) {
206                     pWriteTask->setFinished();
207                 }
208             }
209             return subTasks;
210         }
211 
212         QString assemblyObjectName = settings.resultFileName.baseFileName();
213         bool nonMergedReference = index->getNumberOfSequencesInIndex();
214         QString referenceSequenceName = nonMergedReference ? index->getFirstSequenceObjectName() : QString();
215         if (settings.samOutput) {
216             seqWriter = new GenomeAlignerUrlWriter(settings.resultFileName, assemblyObjectName, index->getSeqLength());
217         } else {
218             QString referenceSequenceUrl = nonMergedReference ? settings.refSeqUrl.getURLString() : QString();
219             try {
220                 seqWriter = new GenomeAlignerDbiWriter(settings.resultFileName.getURLString(),
221                                                        assemblyObjectName,
222                                                        index->getSeqLength(),
223                                                        referenceSequenceName,
224                                                        referenceSequenceUrl);
225             } catch (const QString &exeptionMessage) {
226                 setError(exeptionMessage);
227                 if (nullptr != pWriteTask) {
228                     pWriteTask->setFinished();
229                 }
230                 return subTasks;
231             }
232         }
233         if (!referenceSequenceName.isEmpty()) {
234             seqWriter->setReferenceName(index->getFirstSequenceObjectName());
235         }
236         if (!alignContext.bestMode) {
237             pWriteTask->setSeqWriter(seqWriter);
238         }
239         taskLog.details(QString("Genome aligner index creation time: %1 sec.").arg((double)time / (1000 * 1000)));
240     }
241 
242     if (subTask == findTask) {
243         taskLog.details(QString("Aligned in %3 sec.").arg((double)time / (1000 * 1000)));
244         indexLoadTime += findTask->getIndexLoadTime();
245 
246         if (alignContext.bestMode) {
247             // ReadShortReadsSubTask can add new data what can lead to realloc. Noone can touch these vectors without sync
248             writeTask = new WriteAlignedReadsSubTask(alignContext.listM, writeLock, seqWriter, alignContext.data, readsAligned);
249             writeTask->setSubtaskProgressWeight(0.0f);
250             subTasks.append(writeTask);
251             return subTasks;
252         }
253     }
254 
255     if (subTask == readTask) {
256         shortreadLoadTime += time;
257         shortreadIOTime += time;
258         if (alignContext.data.count() == 0) {
259             // no more reads to align
260             if (!alignContext.bestMode) {
261                 pWriteTask->setFinished();
262             }
263             seqWriter->close();
264             noDataToAlign = true;
265             return subTasks;
266         }
267 
268         readsCount += readTask->bunchSize;
269         taskLog.details(QString("GenomeAlignerTask: %1 short reads loaded and complemented in %2 sec, file progress %3%.")
270                             .arg(readTask->bunchSize)
271                             .arg(time / (double)1000000, 0, 'f', 2)
272                             .arg(seqReader->getProgress()));
273     }
274 
275     if (subTask == createIndexTask || subTask == findTask || subTask == writeTask) {
276         if (subTask == writeTask) {
277             resultWriteTime += time;
278             shortreadIOTime += time;
279         }
280         if (!noDataToAlign) {
281             alignContext.listM.lockForWrite();
282             alignContext.isReadingStarted = false;
283             alignContext.isReadingFinished = false;
284             alignContext.listM.unlock();
285 
286             alignContext.minReadLength = INT_MAX;
287             alignContext.maxReadLength = 0;
288             readTask = new ReadShortReadsSubTask(seqReader, settings, alignContext, readMemSize * 1000 * 1000);
289             readTask->setSubtaskProgressWeight(0.0f);
290             subTasks.append(readTask);
291             findTask = new GenomeAlignerFindTask(index, &alignContext, pWriteTask);
292             findTask->setSubtaskProgressWeight(0.0f);
293             subTasks.append(findTask);
294 
295             Task *loadIndexTask = new LoadIndexTask(index, &alignContext);
296             loadIndexTask->setSubtaskProgressWeight(0.0f);
297             subTasks.append(loadIndexTask);
298         }
299         return subTasks;
300     }
301 
302     return subTasks;
303 }
304 
setupCreateIndexTask()305 void GenomeAlignerTask::setupCreateIndexTask() {
306     GenomeAlignerIndexSettings s;
307     s.refFileName = settings.refSeqUrl.getURLString();
308     s.indexFileName = indexFileName;
309     s.justBuildIndex = justBuildIndex;
310     s.seqPartSize = seqPartSize;
311     s.prebuiltIndex = prebuiltIndex;
312     createIndexTask = new GenomeAlignerIndexTask(s);
313     if (justBuildIndex) {
314         createIndexTask->setSubtaskProgressWeight(1.0f);
315     } else {
316         createIndexTask->setSubtaskProgressWeight(0.0f);
317     }
318 }
319 
report()320 Task::ReportResult GenomeAlignerTask::report() {
321     TaskTimeInfo inf = getTimeInfo();
322     if (hasError() || isCanceled()) {
323         return ReportResult_Finished;
324     }
325 
326     if (justBuildIndex) {
327         return ReportResult_Finished;
328     }
329 
330     if (seqWriter->getWrittenReadsCount() == 0) {
331         hasResults = false;
332         return ReportResult_Finished;
333     }
334 
335     qint64 nReadsProcessed = readsAligned;
336     if (!alignContext.bestMode) {
337         SAFE_POINT_EXT(nullptr != pWriteTask,
338                        stateInfo.setError("No parallel write task in non best mode"),
339                        ReportResult_Finished);
340         nReadsProcessed = pWriteTask->getWrittenReadsCount();
341     }
342 
343     if (readsCount > 0) {
344         taskLog.info(tr("The aligning is finished."));
345         taskLog.info(tr("Whole working time = %1.").arg((GTimer::currentTimeMicros() - inf.startTime) / (1000 * 1000)));
346         taskLog.info(tr("%1% reads aligned.").arg(100 * (double)nReadsProcessed / readsCount));
347         if (alignContext.bestMode) {  // not parallel writing could be measured
348             taskLog.info(tr("Short-reads loading time = %1").arg(shortreadLoadTime / (1000 * 1000)));
349             taskLog.info(tr("Results writing time = %1").arg(resultWriteTime / (1000 * 1000)));
350         }
351         taskLog.info(tr("Index loading time = %1").arg(indexLoadTime));
352         taskLog.info(tr("Short-reads IO time = %1").arg(shortreadIOTime / (1000 * 1000)));
353     }
354 
355     hasResults = nReadsProcessed > 0;
356     taskLog.details(tr("Number of reads processed: ") + QString::number(nReadsProcessed));
357 
358     return ReportResult_Finished;
359 }
360 
calculateWindowSize(bool absMismatches,int nMismatches,int ptMismatches,int minReadLength,int maxReadLength)361 int GenomeAlignerTask::calculateWindowSize(bool absMismatches, int nMismatches, int ptMismatches, int minReadLength, int maxReadLength) {
362     int CMAX = nMismatches;
363     int windowSize = MAX_BIT_MASK_LENGTH;
364     for (int len = minReadLength; len <= maxReadLength; len++) {
365         if (!absMismatches) {
366             CMAX = len * ptMismatches / MAX_PERCENTAGE;
367         }
368         int q = len / (CMAX + 1);
369         if (windowSize > q) {
370             windowSize = q;
371         }
372     }
373     return windowSize;
374 }
375 
getIndexPath()376 QString GenomeAlignerTask::getIndexPath() {
377     return indexFileName;
378 }
379 
createGenomeAlignerWriteTask()380 void GenomeAlignerTask::createGenomeAlignerWriteTask() {
381     pWriteTask = new GenomeAlignerWriteTask(seqWriter);
382     pWriteTask->setSubtaskProgressWeight(0.0f);
383 }
384 
385 }  // namespace U2
386