1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "ExportTasks.h"
23 
24 #include <QFileInfo>
25 
26 #include <U2Core/AddDocumentTask.h>
27 #include <U2Core/AppContext.h>
28 #include <U2Core/Counter.h>
29 #include <U2Core/DNAChromatogramObject.h>
30 #include <U2Core/DNASequenceObject.h>
31 #include <U2Core/DNASequenceUtils.h>
32 #include <U2Core/DNATranslation.h>
33 #include <U2Core/DNATranslationImpl.h>
34 #include <U2Core/DocumentModel.h>
35 #include <U2Core/GObjectRelationRoles.h>
36 #include <U2Core/IOAdapter.h>
37 #include <U2Core/IOAdapterUtils.h>
38 #include <U2Core/LoadDocumentTask.h>
39 #include <U2Core/MSAUtils.h>
40 #include <U2Core/MultipleSequenceAlignmentImporter.h>
41 #include <U2Core/MultipleSequenceAlignmentObject.h>
42 #include <U2Core/ProjectModel.h>
43 #include <U2Core/TextUtils.h>
44 #include <U2Core/U2SafePoints.h>
45 #include <U2Core/U2SequenceUtils.h>
46 
47 #include <U2Formats/SCFFormat.h>
48 
49 namespace U2 {
50 
51 //////////////////////////////////////////////////////////////////////////
// ExportAlignmentTask
ExportAlignmentTask(const MultipleSequenceAlignment & _ma,const QString & _url,const DocumentFormatId & _documentFormatId)53 ExportAlignmentTask::ExportAlignmentTask(const MultipleSequenceAlignment &_ma, const QString &_url, const DocumentFormatId &_documentFormatId)
54     : DocumentProviderTask(tr("Export alignment to %1").arg(_url), TaskFlag_None), ma(_ma->getCopy()), url(_url), documentFormatId(_documentFormatId) {
55     GCOUNTER(cvar, "ExportAlignmentTask");
56     documentDescription = QFileInfo(url).fileName();
57     setVerboseLogMode(true);
58     CHECK_EXT(!ma->isEmpty(), setError(tr("Nothing to export: multiple alignment is empty")), );
59 }
60 
run()61 void ExportAlignmentTask::run() {
62     DocumentFormat *format = AppContext::getDocumentFormatRegistry()->getFormatById(documentFormatId);
63     SAFE_POINT(format != nullptr, L10N::nullPointerError("sequence document format"), );
64     IOAdapterFactory *iof = AppContext::getIOAdapterRegistry()->getIOAdapterFactoryById(IOAdapterUtils::url2io(url));
65     SAFE_POINT(iof != nullptr, L10N::nullPointerError("I/O adapter factory"), );
66     QScopedPointer<Document> exportedDocument(format->createNewLoadedDocument(iof, url, stateInfo));
67     CHECK_OP(stateInfo, );
68 
69     MultipleSequenceAlignmentObject *obj = MultipleSequenceAlignmentImporter::createAlignment(exportedDocument->getDbiRef(), ma, stateInfo);
70     CHECK_OP(stateInfo, );
71 
72     exportedDocument->addObject(obj);
73     format->storeDocument(exportedDocument.get(), stateInfo);
74     CHECK_OP(stateInfo, );
75     exportedDocument.reset();  // Release resources.
76 
77     // Now reload the document.
78     // Reason: document format may have some limits and change the original data: trim sequence names or replace spaces with underscores.
79     resultDocument = format->loadDocument(iof, url, {}, stateInfo);
80 }
81 
82 //////////////////////////////////////////////////////////////////////////
83 // export alignment  2 sequence format
84 
ExportMSA2SequencesTask(const MultipleSequenceAlignment & _ma,const QString & _url,bool _trimLeadingAndTrailingGaps,const DocumentFormatId & _documentFormatId)85 ExportMSA2SequencesTask::ExportMSA2SequencesTask(const MultipleSequenceAlignment &_ma,
86                                                  const QString &_url,
87                                                  bool _trimLeadingAndTrailingGaps,
88                                                  const DocumentFormatId &_documentFormatId)
89     : DocumentProviderTask(tr("Export alignment as sequence to %1").arg(_url), TaskFlag_None), ma(_ma->getCopy()), url(_url),
90       trimLeadingAndTrailingGaps(_trimLeadingAndTrailingGaps), documentFormatId(_documentFormatId) {
91     documentDescription = QFileInfo(url).fileName();
92     GCOUNTER(cvar, "ExportMSA2SequencesTask");
93     setVerboseLogMode(true);
94 }
95 
run()96 void ExportMSA2SequencesTask::run() {
97     DocumentFormat *format = AppContext::getDocumentFormatRegistry()->getFormatById(documentFormatId);
98     SAFE_POINT(format != nullptr, L10N::nullPointerError("sequence document format"), );
99     IOAdapterFactory *iof = AppContext::getIOAdapterRegistry()->getIOAdapterFactoryById(IOAdapterUtils::url2io(url));
100     SAFE_POINT(iof != nullptr, L10N::nullPointerError("I/O adapter factory"), );
101     QScopedPointer<Document> exportedDocument(format->createNewLoadedDocument(iof, url, stateInfo));
102     CHECK_OP(stateInfo, );
103     QList<DNASequence> sequenceList = MSAUtils::convertMsaToSequenceList(ma, stateInfo, trimLeadingAndTrailingGaps);
104     CHECK_OP(stateInfo, );
105     QSet<QString> usedNames;
106     for (DNASequence &sequence : sequenceList) {
107         QString name = sequence.getName();
108         if (usedNames.contains(name)) {
109             name = TextUtils::variate(name, " ", usedNames, false, 1);
110             sequence.setName(name);
111         }
112         U2EntityRef seqRef = U2SequenceUtils::import(stateInfo, exportedDocument->getDbiRef(), sequence);
113         CHECK_OP(stateInfo, );
114         exportedDocument->addObject(new U2SequenceObject(name, seqRef));
115         usedNames.insert(name);
116     }
117     format->storeDocument(exportedDocument.get(), stateInfo);
118     CHECK_OP(stateInfo, );
119     exportedDocument.reset();  // Release resources.
120 
121     // Now reload the document.
122     // Reason: document format may have some limits and change the original data: trim sequence names or replace spaces with underscores.
123     resultDocument = format->loadDocument(iof, url, {}, stateInfo);
124 }
125 
126 //////////////////////////////////////////////////////////////////////////
127 // export nucleic alignment 2 amino alignment
128 
ExportMSA2MSATask(const MultipleSequenceAlignment & msa,const QList<qint64> & rowIds,const U2Region & columnRegion,const QString & _url,const DNATranslation * _aminoTranslation,const DocumentFormatId & _documentFormatId,bool _trimGaps,bool _convertUnknownToGap,bool _reverseComplement,int _translationFrame)129 ExportMSA2MSATask::ExportMSA2MSATask(const MultipleSequenceAlignment &msa,
130                                      const QList<qint64> &rowIds,
131                                      const U2Region &columnRegion,
132                                      const QString &_url,
133                                      const DNATranslation *_aminoTranslation,
134                                      const DocumentFormatId &_documentFormatId,
135                                      bool _trimGaps,
136                                      bool _convertUnknownToGap,
137                                      bool _reverseComplement,
138                                      int _translationFrame)
139     : DocumentProviderTask(tr("Export alignment as alignment to %1").arg(_url), TaskFlag_None),
140       url(_url), documentFormatId(_documentFormatId), aminoTranslation(_aminoTranslation), trimLeadingAndTrailingGaps(_trimGaps),
141       convertUnknownToGap(_convertUnknownToGap), reverseComplement(_reverseComplement), translationFrame(_translationFrame) {
142     GCOUNTER(cvar, "ExportMSA2MSATask");
143     documentDescription = QFileInfo(url).fileName();
144 
145     CHECK_EXT(!msa->isEmpty(), setError(tr("Nothing to export: multiple alignment is empty")), );
146 
147     SAFE_POINT_EXT(translationFrame >= 0 && translationFrame <= 2, setError(tr("Illegal translation frame offset: %1").arg(translationFrame)), );
148     SAFE_POINT_EXT(aminoTranslation == nullptr || aminoTranslation->isThree2One(), setError(tr("Invalid amino translation: %1").arg(aminoTranslation->getTranslationName())), );
149     setVerboseLogMode(true);
150 
151     sequenceList = MSAUtils::convertMsaToSequenceList(msa, stateInfo, trimLeadingAndTrailingGaps, rowIds.toSet(), columnRegion);
152     CHECK_OP(stateInfo, )
153 }
154 
run()155 void ExportMSA2MSATask::run() {
156     DocumentFormat *format = AppContext::getDocumentFormatRegistry()->getFormatById(documentFormatId);
157     SAFE_POINT(format != nullptr, L10N::nullPointerError("sequence document format"), );
158     IOAdapterFactory *iof = AppContext::getIOAdapterRegistry()->getIOAdapterFactoryById(IOAdapterUtils::url2io(url));
159     SAFE_POINT(iof != nullptr, L10N::nullPointerError("I/O adapter factory"), );
160     QScopedPointer<Document> exportedDocument(format->createNewLoadedDocument(iof, url, stateInfo));
161     CHECK_OP(stateInfo, );
162 
163     QList<DNASequence> resultSequenceList;
164     for (const DNASequence &originalSequence : sequenceList) {
165         DNASequence sequence = reverseComplement ? DNASequenceUtils::reverseComplement(originalSequence) : originalSequence;
166         sequence.seq = sequence.seq.right(sequence.seq.length() - translationFrame);
167         QString name = sequence.getName();
168         if (aminoTranslation != nullptr) {
169             name += "(translated)";
170 
171             const QByteArray &seq = sequence.seq;
172             int aminoSequenceLength = seq.length() / 3;
173             QByteArray resultData(aminoSequenceLength, '\0');
174             CHECK_EXT(resultData.size() == aminoSequenceLength, L10N::outOfMemory(), );
175             aminoTranslation->translate(seq.constData(), seq.length(), resultData.data(), resultData.length());
176 
177             if (!trimLeadingAndTrailingGaps && convertUnknownToGap) {
178                 resultData.replace("X", "-");
179             }
180             resultData.replace("*", "X");
181             DNASequence resultSequence(name, resultData, aminoTranslation->getDstAlphabet());
182             resultSequenceList << resultSequence;
183         } else {
184             resultSequenceList << sequence;
185         }
186     }
187     MultipleSequenceAlignment aminoMa = MSAUtils::seq2ma(resultSequenceList, stateInfo);
188     CHECK_OP(stateInfo, );
189 
190     MultipleSequenceAlignmentObject *obj = MultipleSequenceAlignmentImporter::createAlignment(exportedDocument->getDbiRef(), aminoMa, stateInfo);
191     CHECK_OP(stateInfo, );
192 
193     exportedDocument->addObject(obj);
194     format->storeDocument(exportedDocument.get(), stateInfo);
195     CHECK_OP(stateInfo, );
196 
197     // Now reload the document.
198     // Reason: document format may have some limits and change the original data: trim sequence names or replace spaces with underscores.
199     resultDocument = format->loadDocument(iof, url, {}, stateInfo);
200 }
201 
202 //////////////////////////////////////////////////////////////////////////
203 // export chromatogram to SCF
204 
ExportDNAChromatogramTask(DNAChromatogramObject * _obj,const ExportChromatogramTaskSettings & _settings)205 ExportDNAChromatogramTask::ExportDNAChromatogramTask(DNAChromatogramObject *_obj, const ExportChromatogramTaskSettings &_settings)
206     : DocumentProviderTask(tr("Export chromatogram to SCF"), TaskFlags_NR_FOSCOE), chromaObject(_obj), settings(_settings), loadTask(nullptr) {
207     GCOUNTER(cvar, "ExportDNAChromatogramTask");
208     setVerboseLogMode(true);
209 }
210 
prepare()211 void ExportDNAChromatogramTask::prepare() {
212     Document *d = chromaObject->getDocument();
213     SAFE_POINT_EXT(d != nullptr, setError(L10N::internalError("Chromatogram object has no associated document")), );
214 
215     QList<GObjectRelation> relatedObjs = chromaObject->findRelatedObjectsByRole(ObjectRole_Sequence);
216     SAFE_POINT_EXT(relatedObjs.count() == 1, setError("Sequence related to chromatogram is not found!"), );
217 
218     QString seqObjName = relatedObjs.first().ref.objName;
219 
220     GObject *resObj = d->findGObjectByName(seqObjName);
221     auto sObj = qobject_cast<U2SequenceObject *>(resObj);
222     SAFE_POINT_EXT(sObj != nullptr, setError(L10N::nullPointerError("sequence object is null")), );
223 
224     DNAChromatogram cd = chromaObject->getChromatogram();
225     QByteArray seq = sObj->getWholeSequenceData(stateInfo);
226     CHECK_OP(stateInfo, );
227 
228     if (settings.reverse) {
229         TextUtils::reverse(seq.data(), seq.length());
230         reverseVector(cd.A);
231         reverseVector(cd.C);
232         reverseVector(cd.G);
233         reverseVector(cd.T);
234         int offset = 0;
235         if (chromaObject->getDocument()->getDocumentFormatId() == BaseDocumentFormats::ABIF) {
236             int baseNum = cd.baseCalls.count();
237             int seqLen = cd.seqLength;
238             // this is required for base <-> peak correspondence
239             if (baseNum > seqLen) {
240                 cd.baseCalls.remove(baseNum - 1);
241                 cd.prob_A.remove(baseNum - 1);
242                 cd.prob_C.remove(baseNum - 1);
243                 cd.prob_G.remove(baseNum - 1);
244                 cd.prob_T.remove(baseNum - 1);
245             }
246         } else if (chromaObject->getDocument()->getDocumentFormatId() == BaseDocumentFormats::SCF) {
247             // SCF format particularities
248             offset = -1;
249         }
250 
251         for (int i = 0; i < cd.seqLength; ++i) {
252             cd.baseCalls[i] = cd.traceLength - cd.baseCalls[i] + offset;
253         }
254         reverseVector(cd.baseCalls);
255         reverseVector(cd.prob_A);
256         reverseVector(cd.prob_C);
257         reverseVector(cd.prob_G);
258         reverseVector(cd.prob_T);
259     }
260 
261     if (settings.complement) {
262         DNATranslation *tr = AppContext::getDNATranslationRegistry()->lookupTranslation(BaseDNATranslationIds::NUCL_DNA_DEFAULT_COMPLEMENT);
263         tr->translate(seq.data(), seq.length());
264         qSwap(cd.A, cd.T);
265         qSwap(cd.C, cd.G);
266         qSwap(cd.prob_A, cd.prob_T);
267         qSwap(cd.prob_C, cd.prob_G);
268     }
269 
270     SCFFormat::exportDocumentToSCF(settings.url, cd, seq, stateInfo);
271     CHECK_OP(stateInfo, );
272 
273     if (settings.loadDocument) {
274         IOAdapterFactory *iof = AppContext::getIOAdapterRegistry()->getIOAdapterFactoryById(BaseIOAdapters::LOCAL_FILE);
275         loadTask = new LoadDocumentTask(BaseDocumentFormats::SCF, settings.url, iof);
276         addSubTask(loadTask);
277     }
278 }
onSubTaskFinished(Task * subTask)279 QList<Task *> ExportDNAChromatogramTask::onSubTaskFinished(Task *subTask) {
280     if (subTask == loadTask) {
281         resultDocument = loadTask->takeDocument();
282     }
283     return {};
284 }
285 
286 }  // namespace U2
287