1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "MafftAddToAlignmentTask.h"
23
24 #include <QCoreApplication>
25 #include <QDir>
26 #include <QTemporaryFile>
27
28 #include <U2Algorithm/AlignmentAlgorithmsRegistry.h>
29 #include <U2Algorithm/BaseAlignmentAlgorithmsIds.h>
30
31 #include <U2Core/AddDocumentTask.h>
32 #include <U2Core/AppContext.h>
33 #include <U2Core/AppResources.h>
34 #include <U2Core/AppSettings.h>
35 #include <U2Core/Counter.h>
36 #include <U2Core/DNAAlphabet.h>
37 #include <U2Core/DNASequenceObject.h>
38 #include <U2Core/DocumentModel.h>
39 #include <U2Core/ExternalToolRegistry.h>
40 #include <U2Core/GObjectUtils.h>
41 #include <U2Core/IOAdapterUtils.h>
42 #include <U2Core/LoadDocumentTask.h>
43 #include <U2Core/Log.h>
44 #include <U2Core/MSAUtils.h>
45 #include <U2Core/MsaDbiUtils.h>
46 #include <U2Core/MultipleSequenceAlignmentExporter.h>
47 #include <U2Core/MultipleSequenceAlignmentObject.h>
48 #include <U2Core/ProjectModel.h>
49 #include <U2Core/U2AlphabetUtils.h>
50 #include <U2Core/U2Mod.h>
51 #include <U2Core/U2OpStatusUtils.h>
52 #include <U2Core/U2SafePoints.h>
53 #include <U2Core/UserApplicationsSettings.h>
54
55 #include <U2Gui/OpenViewTask.h>
56
57 #include "MAFFTSupport.h"
58 #include "MAFFTSupportTask.h"
59
60 namespace U2 {
61
// Added rows whose length minus total gap length does not exceed this limit
// get their gap model collapsed in MafftAddToAlignmentTask::run().
static constexpr int UNBREAKABLE_SEQUENCE_LENGTH_LIMIT = 50;
63
64 /************************************************************************/
65 /* MafftAddToAlignmentTask */
66 /************************************************************************/
MafftAddToAlignmentTask(const AlignSequencesToAlignmentTaskSettings & settings)67 MafftAddToAlignmentTask::MafftAddToAlignmentTask(const AlignSequencesToAlignmentTaskSettings &settings)
68 : AbstractAlignmentTask(tr("Align sequences to alignment task"), TaskFlag_None),
69 settings(settings),
70 logParser(nullptr),
71 saveSequencesDocumentTask(nullptr),
72 saveAlignmentDocumentTask(nullptr),
73 mafftTask(nullptr),
74 loadTmpDocumentTask(nullptr),
75 modStep(nullptr) {
76 GCOUNTER(cvar, "MafftAddToAlignmentTask");
77
78 SAFE_POINT_EXT(settings.isValid(), setError("Incorrect settings were passed into MafftAddToAlignmentTask"), );
79
80 MultipleSequenceAlignmentExporter alnExporter;
81 inputMsa = alnExporter.getAlignment(settings.msaRef.dbiRef, settings.msaRef.entityId, stateInfo);
82 int rowNumber = inputMsa->getNumRows();
83 for (int i = 0; i < rowNumber; i++) {
84 inputMsa->renameRow(i, QString::number(i));
85 }
86 }
87
/**
 * Produces a unique file path from a QTemporaryFile name template.
 *
 * A temporary file is created from the template and opened; its path is then
 * returned. The QTemporaryFile object goes out of scope before the function
 * returns, so only the path is handed back to the caller.
 *
 * Names whose base name contains a space are rejected and regenerated. The file
 * name is inspected only after a successful open(), because QTemporaryFile has
 * no file name before that.
 *
 * @param filePathAndPattern folder path plus a file name template containing an "XXXXXX" placeholder.
 * @return the generated path, or an empty string when no suitable file could be
 *         created within the attempt limit (for example, the folder is not writable).
 */
static QString generateTmpFileUrl(const QString &filePathAndPattern) {
    // Bounded so that an unusable target folder cannot make the task spin forever.
    static const int MAX_ATTEMPTS = 100;

    for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
        QTemporaryFile generatedFile(filePathAndPattern);
        if (!generatedFile.open()) {
            continue;
        }
        const QString generatedFileName = generatedFile.fileName();
        generatedFile.close();
        if (!QFileInfo(generatedFileName).baseName().contains(" ")) {
            return generatedFileName;
        }
    }
    return QString();
}
100
prepare()101 void MafftAddToAlignmentTask::prepare() {
102 algoLog.info(tr("Align sequences to alignment with MAFFT started"));
103
104 MSAUtils::removeColumnsWithGaps(inputMsa, inputMsa->getNumRows());
105
106 tmpDirUrl = ExternalToolSupportUtils::createTmpDir("add_to_alignment", stateInfo);
107
108 QString tmpAddedUrl = generateTmpFileUrl(tmpDirUrl + QDir::separator() + "XXXXXXXXXXXXXXXX_add.fa");
109 ;
110
111 DocumentFormatRegistry *dfr = AppContext::getDocumentFormatRegistry();
112 DocumentFormat *dfd = dfr->getFormatById(BaseDocumentFormats::FASTA);
113 Document *tempDocument = dfd->createNewLoadedDocument(IOAdapterUtils::get(BaseIOAdapters::LOCAL_FILE), GUrl(tmpAddedUrl), stateInfo);
114
115 QListIterator<QString> namesIterator(settings.addedSequencesNames);
116 int currentRowNumber = inputMsa->getNumRows();
117 foreach (const U2EntityRef &sequenceRef, settings.addedSequencesRefs) {
118 uniqueIdsToNames[QString::number(currentRowNumber)] = namesIterator.next();
119 U2SequenceObject seqObject(QString::number(currentRowNumber), sequenceRef);
120 GObject *cloned = seqObject.clone(tempDocument->getDbiRef(), stateInfo);
121 CHECK_OP(stateInfo, );
122 cloned->setGObjectName(QString::number(currentRowNumber));
123 tempDocument->addObject(cloned);
124 currentRowNumber++;
125 }
126
127 saveSequencesDocumentTask = new SaveDocumentTask(tempDocument, tempDocument->getIOAdapterFactory(), tmpAddedUrl, SaveDocFlags(SaveDoc_Roll) | SaveDoc_DestroyAfter | SaveDoc_ReduceLoggingLevel);
128 addSubTask(saveSequencesDocumentTask);
129
130 QString tmpExistingAlignmentUrl = generateTmpFileUrl(tmpDirUrl + QDir::separator() + "XXXXXXXXXXXXXXXX.fa");
131
132 saveAlignmentDocumentTask = new SaveMSA2SequencesTask(inputMsa, tmpExistingAlignmentUrl, false, BaseDocumentFormats::FASTA);
133 addSubTask(saveAlignmentDocumentTask);
134 }
135
onSubTaskFinished(Task * subTask)136 QList<Task *> MafftAddToAlignmentTask::onSubTaskFinished(Task *subTask) {
137 QList<Task *> subTasks;
138
139 propagateSubtaskError();
140 if (subTask->isCanceled() || isCanceled() || hasError()) {
141 return subTasks;
142 }
143
144 if ((subTask == saveAlignmentDocumentTask || subTask == saveSequencesDocumentTask) && saveAlignmentDocumentTask->isFinished() && saveSequencesDocumentTask->isFinished()) {
145 resultFilePath = settings.resultFileName.isEmpty() ? tmpDirUrl + QDir::separator() + "result_aln.fa" : settings.resultFileName.getURLString();
146 QStringList arguments;
147 if (settings.addAsFragments) {
148 arguments << "--addfragments";
149 } else {
150 arguments << "--add";
151 }
152 arguments << saveSequencesDocumentTask->getURL().getURLString();
153 const DNAAlphabet *alphabet = U2AlphabetUtils::getById(settings.alphabet);
154 SAFE_POINT_EXT(alphabet != nullptr, setError("Albhabet is invalid."), subTasks);
155 if (alphabet->isRaw()) {
156 arguments << "--anysymbol";
157 }
158 if (useMemsaveOption()) {
159 arguments << "--memsave";
160 }
161 if (settings.reorderSequences) {
162 arguments << "--reorder";
163 }
164 arguments << saveAlignmentDocumentTask->getDocument()->getURLString();
165 QString outputUrl = resultFilePath + ".out.fa";
166
167 logParser = new MAFFTLogParser(inputMsa->getNumRows(), 1, outputUrl);
168 mafftTask = new ExternalToolRunTask(MAFFTSupport::ET_MAFFT_ID, arguments, logParser);
169 mafftTask->setStandartOutputFile(resultFilePath);
170 mafftTask->setSubtaskProgressWeight(65);
171 subTasks.append(mafftTask);
172 } else if (subTask == mafftTask) {
173 SAFE_POINT(logParser != nullptr, "logParser is null", subTasks);
174 logParser->cleanup();
175 if (!QFileInfo(resultFilePath).exists()) {
176 if (AppContext::getExternalToolRegistry()->getById(MAFFTSupport::ET_MAFFT_ID)->isValid()) {
177 stateInfo.setError(tr("Output file '%1' not found").arg(resultFilePath));
178 } else {
179 stateInfo.setError(tr("Output file '%3' not found. May be %1 tool path '%2' not valid?")
180 .arg(AppContext::getExternalToolRegistry()->getById(MAFFTSupport::ET_MAFFT_ID)->getName())
181 .arg(AppContext::getExternalToolRegistry()->getById(MAFFTSupport::ET_MAFFT_ID)->getPath())
182 .arg(resultFilePath));
183 }
184 return subTasks;
185 }
186 ioLog.details(tr("Loading output file '%1'").arg(resultFilePath));
187 IOAdapterFactory *iof = AppContext::getIOAdapterRegistry()->getIOAdapterFactoryById(BaseIOAdapters::LOCAL_FILE);
188 loadTmpDocumentTask = new LoadDocumentTask(BaseDocumentFormats::FASTA, resultFilePath, iof);
189 loadTmpDocumentTask->setSubtaskProgressWeight(5);
190 subTasks.append(loadTmpDocumentTask);
191 } else if (subTask == loadTmpDocumentTask) {
192 modStep = new U2UseCommonUserModStep(settings.msaRef, stateInfo);
193 }
194
195 return subTasks;
196 }
197
run()198 void MafftAddToAlignmentTask::run() {
199 CHECK_OP(stateInfo, );
200 tpm = Progress_Manual;
201 SAFE_POINT(loadTmpDocumentTask != nullptr, QString("Load task is NULL"), );
202 tmpDoc = QSharedPointer<Document>(loadTmpDocumentTask->takeDocument(false));
203 SAFE_POINT(tmpDoc != nullptr, QString("output document '%1' not loaded").arg(tmpDoc->getURLString()), );
204 SAFE_POINT(tmpDoc->getObjects().length() != 0, QString("no objects in output document '%1'").arg(tmpDoc->getURLString()), );
205
206 U2MsaDbi *dbi = modStep->getDbi()->getMsaDbi();
207
208 QStringList rowNames = inputMsa->getRowNames();
209
210 int posInMsa = 0;
211 int objectsCount = tmpDoc->getObjects().count();
212 bool hasDbiUpdates = false;
213
214 U2AlphabetId currentAlphabet = dbi->getMsaAlphabet(settings.msaRef.entityId, stateInfo);
215 CHECK_OP(stateInfo, );
216
217 if (currentAlphabet != settings.alphabet) {
218 hasDbiUpdates = true;
219 dbi->updateMsaAlphabet(settings.msaRef.entityId, settings.alphabet, stateInfo);
220 CHECK_OP(stateInfo, );
221 }
222 QMap<QString, qint64> uniqueNamesToIds;
223 foreach (const MultipleSequenceAlignmentRow &refRow, inputMsa->getMsaRows()) {
224 uniqueNamesToIds[refRow->getName()] = refRow->getRowId();
225 }
226
227 bool additionalModificationPerformed = false;
228 QStringList unalignedSequences;
229 foreach (GObject *object, tmpDoc->getObjects()) {
230 if (hasError() || isCanceled()) {
231 return;
232 }
233 stateInfo.setProgress(70 + 30 * posInMsa / objectsCount);
234 U2SequenceObject *sequenceObject = qobject_cast<U2SequenceObject *>(object);
235 bool rowWasAdded = true;
236 if (!rowNames.contains(sequenceObject->getSequenceName())) {
237 //inserting new rows
238 sequenceObject->setGObjectName(uniqueIdsToNames[sequenceObject->getGObjectName()]);
239 SAFE_POINT(sequenceObject != nullptr, "U2SequenceObject is null", );
240
241 U2MsaRow row = MSAUtils::copyRowFromSequence(sequenceObject, settings.msaRef.dbiRef, stateInfo);
242
243 rowWasAdded = row.length != 0;
244 if (row.length - MsaRowUtils::getGapsLength(row.gaps) <= UNBREAKABLE_SEQUENCE_LENGTH_LIMIT) {
245 if (MsaRowUtils::hasLeadingGaps(row.gaps)) {
246 row.gaps = row.gaps.mid(0, 1);
247 } else {
248 row.gaps.clear();
249 }
250 additionalModificationPerformed = true;
251 }
252
253 if (rowWasAdded) {
254 hasDbiUpdates = true;
255 dbi->addRow(settings.msaRef.entityId, posInMsa, row, stateInfo);
256 CHECK_OP(stateInfo, );
257 } else {
258 unalignedSequences << object->getGObjectName();
259 }
260 } else {
261 //maybe need add leading gaps to original rows
262 U2MsaRow row = MSAUtils::copyRowFromSequence(sequenceObject, settings.msaRef.dbiRef, stateInfo);
263 qint64 rowId = uniqueNamesToIds.value(sequenceObject->getSequenceName(), -1);
264 if (rowId == -1) {
265 stateInfo.setError(tr("Row for updating doesn't found"));
266 CHECK_OP(stateInfo, );
267 }
268
269 U2MsaRow currentRow = dbi->getRow(settings.msaRef.entityId, rowId, stateInfo);
270 CHECK_OP(stateInfo, );
271 U2MsaRowGapModel modelToChop(currentRow.gaps);
272 MsaRowUtils::chopGapModel(modelToChop, row.length);
273
274 if (modelToChop != row.gaps) {
275 hasDbiUpdates = true;
276 dbi->updateGapModel(settings.msaRef.entityId, rowId, row.gaps, stateInfo);
277 CHECK_OP(stateInfo, );
278 }
279 }
280
281 if (additionalModificationPerformed) {
282 algoLog.info(tr("Additional enhancement of short sequences alignment performed"));
283 }
284
285 if (rowWasAdded) {
286 posInMsa++;
287 }
288 }
289
290 if (!unalignedSequences.isEmpty()) {
291 stateInfo.addWarning(tr("The following sequence(s) were not aligned as they do not contain meaningful characters: \"%1\".")
292 .arg(unalignedSequences.join("\", \"")));
293 }
294
295 if (hasDbiUpdates) {
296 MsaDbiUtils::trim(settings.msaRef, stateInfo);
297 CHECK_OP(stateInfo, );
298 }
299
300 if (hasError()) {
301 return;
302 }
303 algoLog.info(tr("MAFFT alignment successfully finished"));
304 }
305
report()306 Task::ReportResult MafftAddToAlignmentTask::report() {
307 ExternalToolSupportUtils::removeTmpDir(tmpDirUrl, stateInfo);
308 delete modStep;
309
310 return ReportResult_Finished;
311 }
312
useMemsaveOption() const313 bool MafftAddToAlignmentTask::useMemsaveOption() const {
314 qint64 maxLength = qMax(qint64(inputMsa->getLength()), settings.maxSequenceLength);
315 qint64 memoryInMB = 10 * maxLength * maxLength / 1024 / 1024;
316 AppResourcePool *pool = AppContext::getAppSettings()->getAppResourcePool();
317 return memoryInMB > qMin(pool->getMaxMemorySizeInMB(), pool->getTotalPhysicalMemory() / 2);
318 }
319
getTaskInstance(AbstractAlignmentTaskSettings * _settings) const320 AbstractAlignmentTask *MafftAddToAlignmentTaskFactory::getTaskInstance(AbstractAlignmentTaskSettings *_settings) const {
321 AlignSequencesToAlignmentTaskSettings *addSettings = dynamic_cast<AlignSequencesToAlignmentTaskSettings *>(_settings);
322 SAFE_POINT(addSettings != nullptr,
323 "Add sequences to alignment: incorrect settings",
324 nullptr);
325 return new MafftAddToAlignmentTask(*addSettings);
326 }
327
MafftAddToAlignmentAlgorithm()328 MafftAddToAlignmentAlgorithm::MafftAddToAlignmentAlgorithm()
329 : AlignmentAlgorithm(AddToAlignment,
330 BaseAlignmentAlgorithmsIds::ALIGN_SEQUENCES_TO_ALIGNMENT_BY_MAFFT,
331 AlignmentAlgorithmsRegistry::tr("Align sequences to alignment with MAFFT…"),
332 new MafftAddToAlignmentTaskFactory()) {
333 }
334
isAlgorithmAvailable() const335 bool MafftAddToAlignmentAlgorithm::isAlgorithmAvailable() const {
336 return AppContext::getExternalToolRegistry()->getById(MAFFTSupport::ET_MAFFT_ID)->isValid();
337 }
338
339 } // namespace U2
340