1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "MafftAddToAlignmentTask.h"
23 
24 #include <QCoreApplication>
25 #include <QDir>
26 #include <QTemporaryFile>
27 
28 #include <U2Algorithm/AlignmentAlgorithmsRegistry.h>
29 #include <U2Algorithm/BaseAlignmentAlgorithmsIds.h>
30 
31 #include <U2Core/AddDocumentTask.h>
32 #include <U2Core/AppContext.h>
33 #include <U2Core/AppResources.h>
34 #include <U2Core/AppSettings.h>
35 #include <U2Core/Counter.h>
36 #include <U2Core/DNAAlphabet.h>
37 #include <U2Core/DNASequenceObject.h>
38 #include <U2Core/DocumentModel.h>
39 #include <U2Core/ExternalToolRegistry.h>
40 #include <U2Core/GObjectUtils.h>
41 #include <U2Core/IOAdapterUtils.h>
42 #include <U2Core/LoadDocumentTask.h>
43 #include <U2Core/Log.h>
44 #include <U2Core/MSAUtils.h>
45 #include <U2Core/MsaDbiUtils.h>
46 #include <U2Core/MultipleSequenceAlignmentExporter.h>
47 #include <U2Core/MultipleSequenceAlignmentObject.h>
48 #include <U2Core/ProjectModel.h>
49 #include <U2Core/U2AlphabetUtils.h>
50 #include <U2Core/U2Mod.h>
51 #include <U2Core/U2OpStatusUtils.h>
52 #include <U2Core/U2SafePoints.h>
53 #include <U2Core/UserApplicationsSettings.h>
54 
55 #include <U2Gui/OpenViewTask.h>
56 
57 #include "MAFFTSupport.h"
58 #include "MAFFTSupportTask.h"
59 
60 namespace U2 {
61 
// Added rows whose number of non-gap characters (row length minus total gap length) does not
// exceed this limit get their gap model reduced to the leading gap only, i.e. such short
// sequences are not split by inner gaps when they are inserted into the alignment.
static const int UNBREAKABLE_SEQUENCE_LENGTH_LIMIT = 50;
63 
64 /************************************************************************/
65 /* MafftAddToAlignmentTask */
66 /************************************************************************/
MafftAddToAlignmentTask(const AlignSequencesToAlignmentTaskSettings & settings)67 MafftAddToAlignmentTask::MafftAddToAlignmentTask(const AlignSequencesToAlignmentTaskSettings &settings)
68     : AbstractAlignmentTask(tr("Align sequences to alignment task"), TaskFlag_None),
69       settings(settings),
70       logParser(nullptr),
71       saveSequencesDocumentTask(nullptr),
72       saveAlignmentDocumentTask(nullptr),
73       mafftTask(nullptr),
74       loadTmpDocumentTask(nullptr),
75       modStep(nullptr) {
76     GCOUNTER(cvar, "MafftAddToAlignmentTask");
77 
78     SAFE_POINT_EXT(settings.isValid(), setError("Incorrect settings were passed into MafftAddToAlignmentTask"), );
79 
80     MultipleSequenceAlignmentExporter alnExporter;
81     inputMsa = alnExporter.getAlignment(settings.msaRef.dbiRef, settings.msaRef.entityId, stateInfo);
82     int rowNumber = inputMsa->getNumRows();
83     for (int i = 0; i < rowNumber; i++) {
84         inputMsa->renameRow(i, QString::number(i));
85     }
86 }
87 
/**
 * Produces a unique file path from a QTemporaryFile name template.
 *
 * The temporary file is created only to reserve a unique name: it is removed again when the
 * QTemporaryFile object goes out of scope, so the returned path points to a file that does
 * not exist yet.
 *
 * @param filePathAndPattern Path with a QTemporaryFile "XXXXXX" placeholder in the file name.
 * @return The generated path, or an empty string if no suitable file could be created
 *         (for example, when the folder is not writable).
 */
static QString generateTmpFileUrl(const QString &filePathAndPattern) {
    // The number of attempts is bounded: an endless retry loop would hang the caller
    // when the file can never be created.
    static const int MAX_ATTEMPTS = 10;

    for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
        QTemporaryFile generatedFile(filePathAndPattern);
        if (!generatedFile.open()) {
            continue;
        }
        // The real file name is known only after a successful open().
        const QString result = generatedFile.fileName();
        generatedFile.close();

        // Names with spaces in the base name are rejected and generated anew.
        if (!QFileInfo(result).baseName().contains(" ")) {
            return result;
        }
    }
    return QString();
}
100 
prepare()101 void MafftAddToAlignmentTask::prepare() {
102     algoLog.info(tr("Align sequences to alignment with MAFFT started"));
103 
104     MSAUtils::removeColumnsWithGaps(inputMsa, inputMsa->getNumRows());
105 
106     tmpDirUrl = ExternalToolSupportUtils::createTmpDir("add_to_alignment", stateInfo);
107 
108     QString tmpAddedUrl = generateTmpFileUrl(tmpDirUrl + QDir::separator() + "XXXXXXXXXXXXXXXX_add.fa");
109     ;
110 
111     DocumentFormatRegistry *dfr = AppContext::getDocumentFormatRegistry();
112     DocumentFormat *dfd = dfr->getFormatById(BaseDocumentFormats::FASTA);
113     Document *tempDocument = dfd->createNewLoadedDocument(IOAdapterUtils::get(BaseIOAdapters::LOCAL_FILE), GUrl(tmpAddedUrl), stateInfo);
114 
115     QListIterator<QString> namesIterator(settings.addedSequencesNames);
116     int currentRowNumber = inputMsa->getNumRows();
117     foreach (const U2EntityRef &sequenceRef, settings.addedSequencesRefs) {
118         uniqueIdsToNames[QString::number(currentRowNumber)] = namesIterator.next();
119         U2SequenceObject seqObject(QString::number(currentRowNumber), sequenceRef);
120         GObject *cloned = seqObject.clone(tempDocument->getDbiRef(), stateInfo);
121         CHECK_OP(stateInfo, );
122         cloned->setGObjectName(QString::number(currentRowNumber));
123         tempDocument->addObject(cloned);
124         currentRowNumber++;
125     }
126 
127     saveSequencesDocumentTask = new SaveDocumentTask(tempDocument, tempDocument->getIOAdapterFactory(), tmpAddedUrl, SaveDocFlags(SaveDoc_Roll) | SaveDoc_DestroyAfter | SaveDoc_ReduceLoggingLevel);
128     addSubTask(saveSequencesDocumentTask);
129 
130     QString tmpExistingAlignmentUrl = generateTmpFileUrl(tmpDirUrl + QDir::separator() + "XXXXXXXXXXXXXXXX.fa");
131 
132     saveAlignmentDocumentTask = new SaveMSA2SequencesTask(inputMsa, tmpExistingAlignmentUrl, false, BaseDocumentFormats::FASTA);
133     addSubTask(saveAlignmentDocumentTask);
134 }
135 
onSubTaskFinished(Task * subTask)136 QList<Task *> MafftAddToAlignmentTask::onSubTaskFinished(Task *subTask) {
137     QList<Task *> subTasks;
138 
139     propagateSubtaskError();
140     if (subTask->isCanceled() || isCanceled() || hasError()) {
141         return subTasks;
142     }
143 
144     if ((subTask == saveAlignmentDocumentTask || subTask == saveSequencesDocumentTask) && saveAlignmentDocumentTask->isFinished() && saveSequencesDocumentTask->isFinished()) {
145         resultFilePath = settings.resultFileName.isEmpty() ? tmpDirUrl + QDir::separator() + "result_aln.fa" : settings.resultFileName.getURLString();
146         QStringList arguments;
147         if (settings.addAsFragments) {
148             arguments << "--addfragments";
149         } else {
150             arguments << "--add";
151         }
152         arguments << saveSequencesDocumentTask->getURL().getURLString();
153         const DNAAlphabet *alphabet = U2AlphabetUtils::getById(settings.alphabet);
154         SAFE_POINT_EXT(alphabet != nullptr, setError("Albhabet is invalid."), subTasks);
155         if (alphabet->isRaw()) {
156             arguments << "--anysymbol";
157         }
158         if (useMemsaveOption()) {
159             arguments << "--memsave";
160         }
161         if (settings.reorderSequences) {
162             arguments << "--reorder";
163         }
164         arguments << saveAlignmentDocumentTask->getDocument()->getURLString();
165         QString outputUrl = resultFilePath + ".out.fa";
166 
167         logParser = new MAFFTLogParser(inputMsa->getNumRows(), 1, outputUrl);
168         mafftTask = new ExternalToolRunTask(MAFFTSupport::ET_MAFFT_ID, arguments, logParser);
169         mafftTask->setStandartOutputFile(resultFilePath);
170         mafftTask->setSubtaskProgressWeight(65);
171         subTasks.append(mafftTask);
172     } else if (subTask == mafftTask) {
173         SAFE_POINT(logParser != nullptr, "logParser is null", subTasks);
174         logParser->cleanup();
175         if (!QFileInfo(resultFilePath).exists()) {
176             if (AppContext::getExternalToolRegistry()->getById(MAFFTSupport::ET_MAFFT_ID)->isValid()) {
177                 stateInfo.setError(tr("Output file '%1' not found").arg(resultFilePath));
178             } else {
179                 stateInfo.setError(tr("Output file '%3' not found. May be %1 tool path '%2' not valid?")
180                                        .arg(AppContext::getExternalToolRegistry()->getById(MAFFTSupport::ET_MAFFT_ID)->getName())
181                                        .arg(AppContext::getExternalToolRegistry()->getById(MAFFTSupport::ET_MAFFT_ID)->getPath())
182                                        .arg(resultFilePath));
183             }
184             return subTasks;
185         }
186         ioLog.details(tr("Loading output file '%1'").arg(resultFilePath));
187         IOAdapterFactory *iof = AppContext::getIOAdapterRegistry()->getIOAdapterFactoryById(BaseIOAdapters::LOCAL_FILE);
188         loadTmpDocumentTask = new LoadDocumentTask(BaseDocumentFormats::FASTA, resultFilePath, iof);
189         loadTmpDocumentTask->setSubtaskProgressWeight(5);
190         subTasks.append(loadTmpDocumentTask);
191     } else if (subTask == loadTmpDocumentTask) {
192         modStep = new U2UseCommonUserModStep(settings.msaRef, stateInfo);
193     }
194 
195     return subTasks;
196 }
197 
run()198 void MafftAddToAlignmentTask::run() {
199     CHECK_OP(stateInfo, );
200     tpm = Progress_Manual;
201     SAFE_POINT(loadTmpDocumentTask != nullptr, QString("Load task is NULL"), );
202     tmpDoc = QSharedPointer<Document>(loadTmpDocumentTask->takeDocument(false));
203     SAFE_POINT(tmpDoc != nullptr, QString("output document '%1' not loaded").arg(tmpDoc->getURLString()), );
204     SAFE_POINT(tmpDoc->getObjects().length() != 0, QString("no objects in output document '%1'").arg(tmpDoc->getURLString()), );
205 
206     U2MsaDbi *dbi = modStep->getDbi()->getMsaDbi();
207 
208     QStringList rowNames = inputMsa->getRowNames();
209 
210     int posInMsa = 0;
211     int objectsCount = tmpDoc->getObjects().count();
212     bool hasDbiUpdates = false;
213 
214     U2AlphabetId currentAlphabet = dbi->getMsaAlphabet(settings.msaRef.entityId, stateInfo);
215     CHECK_OP(stateInfo, );
216 
217     if (currentAlphabet != settings.alphabet) {
218         hasDbiUpdates = true;
219         dbi->updateMsaAlphabet(settings.msaRef.entityId, settings.alphabet, stateInfo);
220         CHECK_OP(stateInfo, );
221     }
222     QMap<QString, qint64> uniqueNamesToIds;
223     foreach (const MultipleSequenceAlignmentRow &refRow, inputMsa->getMsaRows()) {
224         uniqueNamesToIds[refRow->getName()] = refRow->getRowId();
225     }
226 
227     bool additionalModificationPerformed = false;
228     QStringList unalignedSequences;
229     foreach (GObject *object, tmpDoc->getObjects()) {
230         if (hasError() || isCanceled()) {
231             return;
232         }
233         stateInfo.setProgress(70 + 30 * posInMsa / objectsCount);
234         U2SequenceObject *sequenceObject = qobject_cast<U2SequenceObject *>(object);
235         bool rowWasAdded = true;
236         if (!rowNames.contains(sequenceObject->getSequenceName())) {
237             //inserting new rows
238             sequenceObject->setGObjectName(uniqueIdsToNames[sequenceObject->getGObjectName()]);
239             SAFE_POINT(sequenceObject != nullptr, "U2SequenceObject is null", );
240 
241             U2MsaRow row = MSAUtils::copyRowFromSequence(sequenceObject, settings.msaRef.dbiRef, stateInfo);
242 
243             rowWasAdded = row.length != 0;
244             if (row.length - MsaRowUtils::getGapsLength(row.gaps) <= UNBREAKABLE_SEQUENCE_LENGTH_LIMIT) {
245                 if (MsaRowUtils::hasLeadingGaps(row.gaps)) {
246                     row.gaps = row.gaps.mid(0, 1);
247                 } else {
248                     row.gaps.clear();
249                 }
250                 additionalModificationPerformed = true;
251             }
252 
253             if (rowWasAdded) {
254                 hasDbiUpdates = true;
255                 dbi->addRow(settings.msaRef.entityId, posInMsa, row, stateInfo);
256                 CHECK_OP(stateInfo, );
257             } else {
258                 unalignedSequences << object->getGObjectName();
259             }
260         } else {
261             //maybe need add leading gaps to original rows
262             U2MsaRow row = MSAUtils::copyRowFromSequence(sequenceObject, settings.msaRef.dbiRef, stateInfo);
263             qint64 rowId = uniqueNamesToIds.value(sequenceObject->getSequenceName(), -1);
264             if (rowId == -1) {
265                 stateInfo.setError(tr("Row for updating doesn't found"));
266                 CHECK_OP(stateInfo, );
267             }
268 
269             U2MsaRow currentRow = dbi->getRow(settings.msaRef.entityId, rowId, stateInfo);
270             CHECK_OP(stateInfo, );
271             U2MsaRowGapModel modelToChop(currentRow.gaps);
272             MsaRowUtils::chopGapModel(modelToChop, row.length);
273 
274             if (modelToChop != row.gaps) {
275                 hasDbiUpdates = true;
276                 dbi->updateGapModel(settings.msaRef.entityId, rowId, row.gaps, stateInfo);
277                 CHECK_OP(stateInfo, );
278             }
279         }
280 
281         if (additionalModificationPerformed) {
282             algoLog.info(tr("Additional enhancement of short sequences alignment performed"));
283         }
284 
285         if (rowWasAdded) {
286             posInMsa++;
287         }
288     }
289 
290     if (!unalignedSequences.isEmpty()) {
291         stateInfo.addWarning(tr("The following sequence(s) were not aligned as they do not contain meaningful characters: \"%1\".")
292                                  .arg(unalignedSequences.join("\", \"")));
293     }
294 
295     if (hasDbiUpdates) {
296         MsaDbiUtils::trim(settings.msaRef, stateInfo);
297         CHECK_OP(stateInfo, );
298     }
299 
300     if (hasError()) {
301         return;
302     }
303     algoLog.info(tr("MAFFT alignment successfully finished"));
304 }
305 
report()306 Task::ReportResult MafftAddToAlignmentTask::report() {
307     ExternalToolSupportUtils::removeTmpDir(tmpDirUrl, stateInfo);
308     delete modStep;
309 
310     return ReportResult_Finished;
311 }
312 
useMemsaveOption() const313 bool MafftAddToAlignmentTask::useMemsaveOption() const {
314     qint64 maxLength = qMax(qint64(inputMsa->getLength()), settings.maxSequenceLength);
315     qint64 memoryInMB = 10 * maxLength * maxLength / 1024 / 1024;
316     AppResourcePool *pool = AppContext::getAppSettings()->getAppResourcePool();
317     return memoryInMB > qMin(pool->getMaxMemorySizeInMB(), pool->getTotalPhysicalMemory() / 2);
318 }
319 
getTaskInstance(AbstractAlignmentTaskSettings * _settings) const320 AbstractAlignmentTask *MafftAddToAlignmentTaskFactory::getTaskInstance(AbstractAlignmentTaskSettings *_settings) const {
321     AlignSequencesToAlignmentTaskSettings *addSettings = dynamic_cast<AlignSequencesToAlignmentTaskSettings *>(_settings);
322     SAFE_POINT(addSettings != nullptr,
323                "Add sequences to alignment: incorrect settings",
324                nullptr);
325     return new MafftAddToAlignmentTask(*addSettings);
326 }
327 
/**
 * Describes the MAFFT based "add to alignment" algorithm: passes the algorithm type, its id,
 * the translated action name and a newly created task factory to the base class.
 */
MafftAddToAlignmentAlgorithm::MafftAddToAlignmentAlgorithm()
    : AlignmentAlgorithm(AddToAlignment,
                         BaseAlignmentAlgorithmsIds::ALIGN_SEQUENCES_TO_ALIGNMENT_BY_MAFFT,
                         AlignmentAlgorithmsRegistry::tr("Align sequences to alignment with MAFFT…"),
                         new MafftAddToAlignmentTaskFactory()) {
}
334 
isAlgorithmAvailable() const335 bool MafftAddToAlignmentAlgorithm::isAlgorithmAvailable() const {
336     return AppContext::getExternalToolRegistry()->getById(MAFFTSupport::ET_MAFFT_ID)->isValid();
337 }
338 
339 }    // namespace U2
340