1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "U2SequenceUtils.h"
23 
24 #include <QApplication>
25 
26 #include <U2Core/AppContext.h>
27 #include <U2Core/DNASequenceObject.h>
28 #include <U2Core/DNATranslation.h>
29 #include <U2Core/GObject.h>
30 #include <U2Core/SequenceUtils.h>
31 #include <U2Core/U1AnnotationUtils.h>
32 #include <U2Core/U2AlphabetUtils.h>
33 #include <U2Core/U2AttributeDbi.h>
34 #include <U2Core/U2AttributeUtils.h>
35 #include <U2Core/U2ObjectDbi.h>
36 #include <U2Core/U2OpStatus.h>
37 #include <U2Core/U2OpStatusUtils.h>
38 #include <U2Core/U2SafePoints.h>
39 #include <U2Core/U2SequenceDbi.h>
40 
41 namespace U2 {
42 
43 const QString U2SequenceDbiHints::UPDATE_SEQUENCE_LENGTH = "update-length";
44 const QString U2SequenceDbiHints::EMPTY_SEQUENCE = "empty-sequence";
45 
alphabetType(const U2EntityRef & ref,U2OpStatus & os)46 DNAAlphabetType U2SequenceUtils::alphabetType(const U2EntityRef &ref, U2OpStatus &os) {
47     DNAAlphabetType res = DNAAlphabet_RAW;
48     DbiConnection con(ref.dbiRef, os);
49 
50     U2Sequence seq = con.dbi->getSequenceDbi()->getSequenceObject(ref.entityId, os);
51     CHECK_OP(os, res);
52 
53     const DNAAlphabet *al = AppContext::getDNAAlphabetRegistry()->findById(seq.alphabet.id);
54     CHECK_EXT(al != nullptr, os.setError(tr("Alphabet is not found!")), res);
55 
56     return al->getType();
57 }
58 
length(const U2EntityRef & ref,U2OpStatus & os)59 qint64 U2SequenceUtils::length(const U2EntityRef &ref, U2OpStatus &os) {
60     DbiConnection con(ref.dbiRef, os);
61 
62     U2Sequence seq = con.dbi->getSequenceDbi()->getSequenceObject(ref.entityId, os);
63     CHECK_OP(os, -1);
64 
65     return seq.length;
66 }
67 
copySequence(const DNASequence & srcSeq,const U2DbiRef & dstDbi,const QString & dstFolder,U2OpStatus & os)68 U2Sequence U2SequenceUtils::copySequence(const DNASequence &srcSeq, const U2DbiRef &dstDbi, const QString &dstFolder, U2OpStatus &os) {
69     U2Sequence res;
70     // TODO: ClustalW format does not assign sequence alphabets!
71     res.alphabet = srcSeq.alphabet == nullptr ? nullptr : srcSeq.alphabet->getId();
72     res.circular = srcSeq.circular;
73     res.length = srcSeq.length();
74     res.visualName = srcSeq.getName();
75 
76     TmpDbiObjects tmpObjects(dstDbi, os);
77 
78     if (os.isCoR()) {
79         return res;
80     }
81 
82     DbiConnection dstCon(dstDbi, os);
83     CHECK_OP(os, res);
84     dstCon.dbi->getSequenceDbi()->createSequenceObject(res, dstFolder, os);
85     CHECK_OP(os, res);
86 
87     tmpObjects.objects.append(res.id);
88 
89     dstCon.dbi->getSequenceDbi()->updateSequenceData(res.id, U2Region(0, 0), srcSeq.seq, QVariantMap(), os);
90     CHECK_OP(os, res);
91 
92     return res;
93 }
94 
copySequence(const U2EntityRef & srcSeq,const U2DbiRef & dstDbi,const QString & dstFolder,U2OpStatus & os)95 U2Sequence U2SequenceUtils::copySequence(const U2EntityRef &srcSeq, const U2DbiRef &dstDbi, const QString &dstFolder, U2OpStatus &os) {
96     U2Sequence res;
97     DbiConnection srcCon(srcSeq.dbiRef, os);
98     CHECK_OP(os, res);
99 
100     U2SequenceDbi *srcSeqDbi = srcCon.dbi->getSequenceDbi();
101     SAFE_POINT_EXT(nullptr != srcSeqDbi, os.setError(tr("Invalid sequence DBI")), res);
102     U2Sequence seq = srcSeqDbi->getSequenceObject(srcSeq.entityId, os);
103     CHECK_OP(os, res);
104 
105     res = seq;
106     U2TrackModType modType = res.trackModType;
107     res.trackModType = NoTrack;
108 
109     res.id.clear();
110     res.length = 0;
111 
112     TmpDbiObjects tmpObjects(dstDbi, os);
113 
114     DbiConnection dstCon(dstDbi, os);
115     CHECK_OP(os, res);
116     U2SequenceDbi *dstSeqDbi = dstCon.dbi->getSequenceDbi();
117     SAFE_POINT_EXT(nullptr != dstSeqDbi, os.setError(tr("Invalid sequence DBI")), res);
118     dstSeqDbi->createSequenceObject(res, dstFolder, os);
119     CHECK_OP(os, res);
120 
121     tmpObjects.objects.append(res.id);
122 
123     const qint64 MAX_CHUNK_LENGTH = 4194304;  // 4 MiB chunk
124     for (qint64 pos = 0; pos < seq.length; pos += MAX_CHUNK_LENGTH) {
125         const qint64 currentChunkSize = qMin(MAX_CHUNK_LENGTH, seq.length - pos);
126         const U2Region chunkRegion(pos, currentChunkSize);
127         const QByteArray chunkContent = srcSeqDbi->getSequenceData(srcSeq.entityId, chunkRegion, os);
128         CHECK_OP(os, res);
129         dstSeqDbi->updateSequenceData(res.id, chunkRegion, chunkContent, QVariantMap(), os);
130         CHECK_OP(os, res);
131         res.length += currentChunkSize;
132     }
133 
134     U2DbiObjectRank rank = srcCon.dbi->getObjectDbi()->getObjectRank(seq.id, os);
135     CHECK_OP(os, res);
136 
137     dstCon.dbi->getObjectDbi()->setObjectRank(res.id, rank, os);
138     CHECK_OP(os, res);
139 
140     res.trackModType = modType;
141     return res;
142 }
143 
updateSequenceName(const U2EntityRef & entityRef,const QString & newName,U2OpStatus & os)144 void U2SequenceUtils::updateSequenceName(const U2EntityRef &entityRef, const QString &newName, U2OpStatus &os) {
145     DbiConnection con(entityRef.dbiRef, os);
146     CHECK_OP(os, );
147 
148     U2SequenceDbi *sequenceDbi = con.dbi->getSequenceDbi();
149     U2Sequence sequenceObject = sequenceDbi->getSequenceObject(entityRef.entityId, os);
150     CHECK_OP(os, );
151     if (sequenceObject.visualName != newName) {
152         sequenceObject.visualName = newName;
153         sequenceDbi->updateSequenceObject(sequenceObject, os);
154     }
155 }
156 
_extractRegions(const U2EntityRef & seqRef,const QVector<U2Region> & regions,const DNATranslation * complTT,U2OpStatus & os)157 static QList<QByteArray> _extractRegions(const U2EntityRef &seqRef, const QVector<U2Region> &regions, const DNATranslation *complTT, U2OpStatus &os) {
158     QList<QByteArray> res;
159 
160     DbiConnection con(seqRef.dbiRef, os);
161     CHECK_OP(os, res);
162     U2SequenceDbi *seqDbi = con.dbi->getSequenceDbi();
163     U2Sequence seq = seqDbi->getSequenceObject(seqRef.entityId, os);
164     CHECK_OP(os, res);
165 
166     QVector<U2Region> safeLocation = regions;
167     U2Region::bound(0, seq.length, safeLocation);
168 
169     for (int i = 0, n = safeLocation.size(); i < n; i++) {
170         const U2Region &oReg = safeLocation.at(i);
171         if (complTT == nullptr) {
172             QByteArray part = seqDbi->getSequenceData(seq.id, U2Region(oReg.startPos, oReg.length), os);
173             CHECK_OP(os, QList<QByteArray>());
174             res.append(part);
175         } else {
176             QByteArray arr = seqDbi->getSequenceData(seq.id, U2Region(oReg.startPos, oReg.length), os);
177             CHECK_OP(os, QList<QByteArray>());
178             TextUtils::reverse(arr.data(), arr.length());
179             complTT->translate(arr.data(), arr.length());
180             res.prepend(arr);
181         }
182     }
183     return res;
184 }
185 
extractRegions(const U2EntityRef & seqRef,const QVector<U2Region> & origLocation,const DNATranslation * complTT,const DNATranslation * aminoTT,bool join,U2OpStatus & os)186 QList<QByteArray> U2SequenceUtils::extractRegions(const U2EntityRef &seqRef, const QVector<U2Region> &origLocation, const DNATranslation *complTT, const DNATranslation *aminoTT, bool join, U2OpStatus &os) {
187     QList<QByteArray> res = _extractRegions(seqRef, origLocation, complTT, os);
188     CHECK_OP(os, res)
189 
190     DbiConnection con(seqRef.dbiRef, os);
191     CHECK_OP(os, res);
192 
193     U2SequenceDbi *seqDbi = con.dbi->getSequenceDbi();
194     U2Sequence seq = seqDbi->getSequenceObject(seqRef.entityId, os);
195     CHECK_OP(os, res);
196 
197     if (seq.circular && res.size() > 1) {
198         const U2Region &firstL = origLocation.first();
199         const U2Region &lastL = origLocation.last();
200         if (firstL.startPos == 0 && lastL.endPos() == seq.length) {
201             QByteArray lastS = res.last();
202             QByteArray firstS = res.first();
203             res.removeLast();
204             res[0] = lastS.append(firstS);
205         }
206     }
207     if (aminoTT != nullptr) {
208         res = U1SequenceUtils::translateRegions(res, aminoTT, join);
209     }
210 
211     if (join && res.size() > 1) {
212         QByteArray joined = U1SequenceUtils::joinRegions(res);
213         res.clear();
214         res.append(joined);
215     }
216 
217     return res;
218 }
219 
import(U2OpStatus & os,const U2DbiRef & dbiRef,const DNASequence & seq,const U2AlphabetId & alphabetId)220 U2EntityRef U2SequenceUtils::import(U2OpStatus &os, const U2DbiRef &dbiRef, const DNASequence &seq, const U2AlphabetId &alphabetId) {
221     return import(os, dbiRef, U2ObjectDbi::ROOT_FOLDER, seq, alphabetId);
222 }
223 
import(U2OpStatus & os,const U2DbiRef & dbiRef,const QString & folder,const DNASequence & seq,const U2AlphabetId & alphabetId)224 U2EntityRef U2SequenceUtils::import(U2OpStatus &os, const U2DbiRef &dbiRef, const QString &folder, const DNASequence &seq, const U2AlphabetId &alphabetId) {
225     U2EntityRef res;
226     U2SequenceImporter i;
227 
228     i.startSequence(os, dbiRef, folder, seq.getName(), seq.circular, alphabetId);
229     CHECK_OP(os, res);
230 
231     i.addBlock(seq.constData(), seq.length(), os);
232     CHECK_OP(os, res);
233 
234     U2Sequence u2seq = i.finalizeSequenceAndValidate(os);
235     CHECK_OP(os, res);
236 
237     res.dbiRef = dbiRef;
238     res.entityId = u2seq.id;
239 
240     setSequenceInfo(os, res, seq.info);
241     CHECK_OP(os, res);
242 
243     setQuality(res, seq.quality);
244 
245     return res;
246 }
247 
setQuality(const U2EntityRef & entityRef,const DNAQuality & q)248 void U2SequenceUtils::setQuality(const U2EntityRef &entityRef, const DNAQuality &q) {
249     U2OpStatus2Log os;
250     DbiConnection con(entityRef.dbiRef, os);
251     CHECK_OP(os, );
252     QList<U2DataId> idQualList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::FASTQ_QUAL_CODES, os);
253     CHECK_OP(os, );
254     if (!idQualList.isEmpty()) {
255         con.dbi->getAttributeDbi()->removeAttributes(idQualList, os);
256         CHECK_OP(os, );
257     }
258     QList<U2DataId> idQualTypeList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::FASTQ_QUAL_TYPE, os);
259     CHECK_OP(os, );
260     if (!idQualTypeList.isEmpty()) {
261         con.dbi->getAttributeDbi()->removeAttributes(idQualTypeList, os);
262         CHECK_OP(os, );
263     }
264 
265     U2ByteArrayAttribute qualityCodes(entityRef.entityId, DNAInfo::FASTQ_QUAL_CODES, q.qualCodes);
266     U2IntegerAttribute qualityType(entityRef.entityId, DNAInfo::FASTQ_QUAL_TYPE, q.type);
267     con.dbi->getAttributeDbi()->createByteArrayAttribute(qualityCodes, os);
268     CHECK_OP(os, );
269     con.dbi->getAttributeDbi()->createIntegerAttribute(qualityType, os);
270     CHECK_OP(os, );
271 }
272 
setSequenceInfo(U2OpStatus & os,const U2EntityRef & entityRef,const QVariantMap & info)273 void U2SequenceUtils::setSequenceInfo(U2OpStatus &os, const U2EntityRef &entityRef, const QVariantMap &info) {
274     DbiConnection con(entityRef.dbiRef, os);
275     CHECK_OP(os, );
276     QList<U2DataId> chainIdList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::CHAIN_ID, os);
277     CHECK_OP(os, );
278     if (!chainIdList.isEmpty()) {
279         con.dbi->getAttributeDbi()->removeObjectAttributes(chainIdList.first(), os);
280         CHECK_OP(os, );
281     }
282     QList<U2DataId> commentList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::COMMENT, os);
283     CHECK_OP(os, );
284     if (!commentList.isEmpty()) {
285         con.dbi->getAttributeDbi()->removeObjectAttributes(commentList.first(), os);
286         CHECK_OP(os, );
287     }
288     QList<U2DataId> definitionList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::DEFINITION, os);
289     CHECK_OP(os, );
290     if (!definitionList.isEmpty()) {
291         con.dbi->getAttributeDbi()->removeObjectAttributes(definitionList.first(), os);
292         CHECK_OP(os, );
293     }
294     U2StringAttribute chainID(entityRef.entityId, DNAInfo::CHAIN_ID, info.value(DNAInfo::CHAIN_ID).toString());
295     U2StringAttribute comment(entityRef.entityId, DNAInfo::COMMENT, info.value(DNAInfo::COMMENT).toString());
296     U2StringAttribute definition(entityRef.entityId, DNAInfo::DEFINITION, info.value(DNAInfo::DEFINITION).toString());
297     con.dbi->getAttributeDbi()->createStringAttribute(chainID, os);
298     CHECK_OP(os, );
299     con.dbi->getAttributeDbi()->createStringAttribute(comment, os);
300     CHECK_OP(os, );
301     con.dbi->getAttributeDbi()->createStringAttribute(definition, os);
302     CHECK_OP(os, );
303 }
304 
getSequenceInfo(U2OpStatus & os,const U2EntityRef & entityRef,const QString & name)305 QVariantMap U2SequenceUtils::getSequenceInfo(U2OpStatus &os, const U2EntityRef &entityRef, const QString &name) {
306     QVariantMap resultingInfo;
307     DbiConnection con(entityRef.dbiRef, os);
308     QList<U2DataId> chainIdList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::CHAIN_ID, os);
309     CHECK_OP(os, resultingInfo);
310     QList<U2DataId> commentList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::COMMENT, os);
311     CHECK_OP(os, resultingInfo);
312     QList<U2DataId> definitionList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::DEFINITION, os);
313     CHECK_OP(os, resultingInfo);
314     if (!chainIdList.isEmpty() && !commentList.isEmpty() && !definitionList.isEmpty()) {
315         resultingInfo.insert(DNAInfo::CHAIN_ID, con.dbi->getAttributeDbi()->getStringAttribute(chainIdList.first(), os).value);
316         CHECK_OP(os, QVariantMap());
317         resultingInfo.insert(DNAInfo::COMMENT, con.dbi->getAttributeDbi()->getStringAttribute(commentList.first(), os).value);
318         CHECK_OP(os, QVariantMap());
319         resultingInfo.insert(DNAInfo::DEFINITION, con.dbi->getAttributeDbi()->getStringAttribute(definitionList.first(), os).value);
320         CHECK_OP(os, QVariantMap());
321     }
322     U2StringAttribute attr = U2AttributeUtils::findStringAttribute(con.dbi->getAttributeDbi(), entityRef.entityId, DNAInfo::GENBANK_HEADER, os);
323     if (attr.hasValidId()) {
324         resultingInfo.insert(DNAInfo::GENBANK_HEADER, attr.value);
325         CHECK_OP(os, QVariantMap());
326     }
327 
328     attr = U2AttributeUtils::findStringAttribute(con.dbi->getAttributeDbi(), entityRef.entityId, DNAInfo::SOURCE, os);
329     if (attr.hasValidId()) {
330         resultingInfo.insert(DNAInfo::SOURCE, attr.value);
331         CHECK_OP(os, QVariantMap());
332     }
333 
334     attr = U2AttributeUtils::findStringAttribute(con.dbi->getAttributeDbi(), entityRef.entityId, DNAInfo::ACCESSION, os);
335     if (attr.hasValidId()) {
336         resultingInfo.insert(DNAInfo::ACCESSION, attr.value);
337         CHECK_OP(os, QVariantMap());
338     }
339     attr = U2AttributeUtils::findStringAttribute(con.dbi->getAttributeDbi(), entityRef.entityId, Translation_Table_Id_Attribute, os);
340     if (attr.hasValidId()) {
341         resultingInfo.insert(Translation_Table_Id_Attribute, attr.value);
342         CHECK_OP(os, QVariantMap());
343     }
344 
345     if (!name.isEmpty()) {
346         resultingInfo.insert(DNAInfo::ID, name);
347     }
348     return resultingInfo;
349 }
350 
getSequenceDbInfo(U2SequenceObject * seqObj)351 U2Sequence U2SequenceUtils::getSequenceDbInfo(U2SequenceObject *seqObj) {
352     U2Sequence seq;
353 
354     seq.id = seqObj->getEntityRef().entityId;
355     seq.dbiId = seqObj->getEntityRef().dbiRef.dbiId;
356     if (nullptr != seqObj->getAlphabet()) {
357         seq.alphabet.id = seqObj->getAlphabet()->getId();
358     }
359     seq.circular = seqObj->isCircular();
360     seq.length = seqObj->getSequenceLength();
361     seq.visualName = seqObj->getSequenceName();
362 
363     return seq;
364 }
365 
//////////////////////////////////////////////////////////////////////////
// U2SequenceImporter
// Value of insertBlockSize used by the U2SequenceImporter constructor that takes no explicit
// block size: 4 * 1024 * 1024 = 4194304.
#define DEFAULT_SEQUENCE_INSERT_BLOCK_SIZE (4 * 1024 * 1024)
369 
370 /**
371  * If GObjectHint_CaseAnns is present in the QVariantMap (it should be stored as int),
372  * then the method verifies the value and returns it (if it is correct).
373  * Otherwise, returns NO_CASE_ANNS.
374  */
getCaseAnnotationsModeHint(const QVariantMap & fs)375 static CaseAnnotationsMode getCaseAnnotationsModeHint(const QVariantMap &fs) {
376     if (fs.keys().contains(GObjectHint_CaseAnns)) {
377         QVariant caseAnnsVariant = fs.value(GObjectHint_CaseAnns);
378         SAFE_POINT(caseAnnsVariant.canConvert<int>(), "Can't convert a case annotations hint!", NO_CASE_ANNS);
379 
380         bool conversionIsOK = 0;
381         int caseAnnsInt = caseAnnsVariant.toInt(&conversionIsOK);
382         SAFE_POINT(conversionIsOK, "Can't convert a case annotations hint to int!", NO_CASE_ANNS);
383 
384         SAFE_POINT((caseAnnsInt == LOWER_CASE) || (caseAnnsInt == UPPER_CASE) || (caseAnnsInt == NO_CASE_ANNS),
385                    "Incorrect value of a case annotation hint!",
386                    NO_CASE_ANNS);
387 
388         return (CaseAnnotationsMode)caseAnnsInt;
389     }
390 
391     return NO_CASE_ANNS;
392 }
393 
394 const QString U2SequenceImporter::EMPTY_SEQUENCE_ERROR = QApplication::translate("U2SequenceImporter",
395                                                                                  "Sequence was not imported. Probably, this is because the sequence is empty.");
396 
U2SequenceImporter(const QVariantMap & fs,bool lazyMode,bool singleThread)397 U2SequenceImporter::U2SequenceImporter(const QVariantMap &fs, bool lazyMode, bool singleThread)
398     : lazyMode(lazyMode), singleThread(singleThread), sequenceCreated(false) {
399     insertBlockSize = DEFAULT_SEQUENCE_INSERT_BLOCK_SIZE;
400     currentLength = 0;
401     isUnfinishedRegion = false;
402     caseAnnsMode = getCaseAnnotationsModeHint(fs);
403     sequenceCreated = false;
404     committedLength = 0;
405 }
406 
U2SequenceImporter(qint64 _insertBlockSize,const QVariantMap & fs,bool lazyMode,bool singleThread)407 U2SequenceImporter::U2SequenceImporter(qint64 _insertBlockSize, const QVariantMap &fs, bool lazyMode, bool singleThread)
408     : insertBlockSize(_insertBlockSize), lazyMode(lazyMode), singleThread(singleThread) {
409     insertBlockSize = qMin((qint64)10, insertBlockSize);
410     currentLength = 0;
411     isUnfinishedRegion = false;
412     caseAnnsMode = getCaseAnnotationsModeHint(fs);
413     sequenceCreated = false;
414     committedLength = 0;
415 }
416 
~U2SequenceImporter()417 U2SequenceImporter::~U2SequenceImporter() {
418     if (con.isOpen() && sequenceCreated) {
419         coreLog.trace(QString("Removing sequence from unfinished import: %1").arg(sequence.visualName));
420         U2OpStatus2Log os;
421         con.dbi->getObjectDbi()->removeObject(sequence.id, os);
422     }
423 }
424 
startSequence(U2OpStatus & os,const U2DbiRef & dbiRef,const QString & dstFolder,const QString & visualName,bool circular,const U2AlphabetId & alphabetId)425 void U2SequenceImporter::startSequence(U2OpStatus &os,
426                                        const U2DbiRef &dbiRef,
427                                        const QString &dstFolder,
428                                        const QString &visualName,
429                                        bool circular,
430                                        const U2AlphabetId &alphabetId) {
431     SAFE_POINT(!con.isOpen(), "Connection is already opened!", );
432     con.open(dbiRef, true, os);
433     CHECK_OP(os, );
434 
435     folder = dstFolder;
436 
437     sequence = U2Sequence();
438     sequence.visualName = visualName;
439     sequence.circular = circular;
440     sequence.alphabet.id = alphabetId.id;
441 
442     currentLength = 0;
443     isUnfinishedRegion = false;
444     annList.clear();
445 
446     if (!lazyMode) {
447         con.dbi->getSequenceDbi()->createSequenceObject(sequence, folder, os);
448         CHECK_OP(os, );
449         sequenceCreated = true;
450     }
451 }
452 
addBlock(const char * data,qint64 len,U2OpStatus & os)453 void U2SequenceImporter::addBlock(const char *data, qint64 len, U2OpStatus &os) {
454     // derive common alphabet
455     const DNAAlphabet *blockAl = U2AlphabetUtils::findBestAlphabet(data, len);
456     CHECK_EXT(blockAl != nullptr, os.setError("Failed to match sequence alphabet!"), );
457 
458     const DNAAlphabet *oldAl = U2AlphabetUtils::getById(sequence.alphabet);
459     const DNAAlphabet *resAl = blockAl;
460     if (oldAl != nullptr) {
461         if (oldAl->getType() == DNAAlphabet_AMINO && resAl->getType() == DNAAlphabet_NUCL) {
462             resAl = oldAl;
463         } else if (resAl->getType() == DNAAlphabet_AMINO && oldAl->getType() == DNAAlphabet_NUCL) {
464             oldAl = resAl;
465         } else {
466             resAl = U2AlphabetUtils::deriveCommonAlphabet(blockAl, oldAl);
467         }
468         CHECK_EXT(resAl != nullptr, os.setError(U2SequenceUtils::tr("Failed to derive sequence alphabet!")), );
469     }
470 
471     if (resAl != U2AlphabetUtils::getById(sequence.alphabet)) {
472         sequence.alphabet.id = resAl->getId();
473         if (sequenceCreated) {
474             con.dbi->getSequenceDbi()->updateSequenceObject(sequence, os);
475             CHECK_OP(os, );
476         }
477     }
478 
479     _addBlock2Buffer(data, len, os);
480 
481     if (caseAnnsMode != NO_CASE_ANNS) {
482         annList << U1AnnotationUtils::getCaseAnnotations(data, len, currentLength, isUnfinishedRegion, unfinishedRegion, LOWER_CASE == caseAnnsMode);
483     }
484     currentLength += len;
485 }
486 
addSequenceBlock(const U2EntityRef & sequenceRef,const U2Region & r,U2OpStatus & os)487 void U2SequenceImporter::addSequenceBlock(const U2EntityRef &sequenceRef, const U2Region &r, U2OpStatus &os) {
488     _addBuffer2Db(os);
489     CHECK_OP(os, );
490     DbiConnection con(sequenceRef.dbiRef, os);
491     CHECK_OP(os, );
492 
493     // TODO: optimize -> create utility that uses small to copy sequence!
494     QByteArray arr = con.dbi->getSequenceDbi()->getSequenceData(sequenceRef.entityId, r, os);
495     CHECK_OP(os, );
496     addBlock(arr.constData(), arr.size(), os);
497 }
498 
addDefaultSymbolsBlock(int n,U2OpStatus & os)499 void U2SequenceImporter::addDefaultSymbolsBlock(int n, U2OpStatus &os) {
500     SAFE_POINT(n >= 0, QString("Invalid number of symbols: %1").arg(n), );
501     const DNAAlphabet *al = AppContext::getDNAAlphabetRegistry()->findById(sequence.alphabet.id);
502     if (nullptr == al) {
503         os.setError(QObject::tr("Unable to detect sequence alphabet. Probably, this is because some of merged sequences are empty."));
504         return;
505     }
506     char defaultChar = U2AlphabetUtils::getDefaultSymbol(sequence.alphabet);
507     QByteArray a(n, defaultChar);
508     _addBlock2Buffer(a.data(), a.size(), os);
509     currentLength += n;
510 }
511 
_addBlock2Buffer(const char * data,qint64 len,U2OpStatus & os)512 void U2SequenceImporter::_addBlock2Buffer(const char *data, qint64 len, U2OpStatus &os) {
513     if (len + sequenceBuffer.length() < insertBlockSize) {
514         sequenceBuffer.append(data, len);
515         return;
516     }
517     _addBlock2Db(sequenceBuffer.data(), sequenceBuffer.length(), os);
518     CHECK_OP(os, );
519     sequenceBuffer.clear();
520     _addBlock2Db(data, len, os);
521 }
522 
_addBlock2Db(const char * data,qint64 len,U2OpStatus & os)523 void U2SequenceImporter::_addBlock2Db(const char *data, qint64 len, U2OpStatus &os) {
524     SAFE_POINT(len >= 0, "Illegal block length!", );
525     if (len == 0) {
526         return;
527     }
528     QByteArray arr(data, len);
529     TextUtils::translate(TextUtils::UPPER_CASE_MAP, arr.data(), arr.length());
530 
531     bool updateLength = true;
532     bool emptySequence = false;
533     bool justCreated = false;
534     if (!sequenceCreated) {
535         emptySequence = true;
536         if (singleThread) {
537             SAFE_POINT(0 == committedLength, "Sequence object is not created, but sequence data already exists", );
538             sequence.length = len;
539             updateLength = false;
540         }
541         con.dbi->getSequenceDbi()->createSequenceObject(sequence, folder, os);
542         CHECK_OP(os, );
543         sequenceCreated = true;
544         justCreated = true;
545     }
546 
547     QVariantMap hints;
548     hints[U2SequenceDbiHints::UPDATE_SEQUENCE_LENGTH] = updateLength;
549     hints[U2SequenceDbiHints::EMPTY_SEQUENCE] = emptySequence;
550     U2Region reg(sequence.length, 0);
551     if (justCreated) {
552         reg.startPos = 0;
553         reg.length = 0;
554     }
555     con.dbi->getSequenceDbi()->updateSequenceData(sequence.id, reg, arr, hints, os);
556     CHECK_OP(os, );
557     if (committedLength == sequence.length) {
558         sequence.length += len;
559     } else {  // because of lazyMode and delayed sequence creation
560         sequence.length = committedLength + len;
561     }
562     committedLength += len;
563 }
564 
_addBuffer2Db(U2OpStatus & os)565 void U2SequenceImporter::_addBuffer2Db(U2OpStatus &os) {
566     CHECK(!sequenceBuffer.isEmpty(), );
567     _addBlock2Db(sequenceBuffer.data(), sequenceBuffer.length(), os);
568     sequenceBuffer.clear();
569 }
570 
finalizeSequence(U2OpStatus & os)571 U2Sequence U2SequenceImporter::finalizeSequence(U2OpStatus &os) {
572     _addBuffer2Db(os);
573     LOG_OP(os);
574     // If sequence is empty, addBlock is never called and alphabet is not set. So set it here to some default value
575     if (!sequence.alphabet.isValid() && sequence.version != 0) {
576         sequence.alphabet.id = BaseDNAAlphabetIds::RAW();
577         con.dbi->getSequenceDbi()->updateSequenceObject(sequence, os);
578         LOG_OP(os);
579     }
580     con.close(os);
581     if (caseAnnsMode != NO_CASE_ANNS) {
582         annList << U1AnnotationUtils::finalizeUnfinishedRegion(isUnfinishedRegion, unfinishedRegion, LOWER_CASE == caseAnnsMode);
583 
584         if (1 == annList.size()) {
585             const QVector<U2Region> &regs = annList.first()->getRegions();
586             if (1 == regs.size()) {
587                 U2Region reg = regs.first();
588                 if (0 == reg.startPos && sequence.length == reg.length) {
589                     annList.clear();
590                 }
591             }
592         }
593     }
594     sequenceCreated = false;
595     committedLength = 0;
596     return sequence;
597 }
598 
finalizeSequenceAndValidate(U2OpStatus & os)599 U2Sequence U2SequenceImporter::finalizeSequenceAndValidate(U2OpStatus &os) {
600     U2Sequence result = finalizeSequence(os);
601     CHECK_OP(os, result);
602     if (!result.hasValidId()) {
603         os.setError(EMPTY_SEQUENCE_ERROR);
604     }
605     return result;
606 }
607 
setCaseAnnotationsMode(CaseAnnotationsMode mode)608 void U2SequenceImporter::setCaseAnnotationsMode(CaseAnnotationsMode mode) {
609     caseAnnsMode = mode;
610 }
611 
isCaseAnnotationsModeOn() const612 bool U2SequenceImporter::isCaseAnnotationsModeOn() const {
613     return caseAnnsMode != NO_CASE_ANNS;
614 }
615 
getCaseAnnotations()616 QList<SharedAnnotationData> &U2SequenceImporter::getCaseAnnotations() {
617     return annList;
618 }
619 
getCurrentLength() const620 qint64 U2SequenceImporter::getCurrentLength() const {
621     return currentLength;
622 }
623 
addBlock(const char * data,qint64 len,U2OpStatus & os)624 void U2MemorySequenceImporter::addBlock(const char *data, qint64 len, U2OpStatus &os) {
625     if (qstrlen(data) < len) {
626         os.setError("Wrong data length in addBlock");
627         return;
628     }
629 
630     // derive common alphabet
631     const DNAAlphabet *blockAl = U2AlphabetUtils::findBestAlphabet(data, len);
632     CHECK_EXT(blockAl != nullptr, os.setError("Failed to match sequence alphabet!"), );
633 
634     const DNAAlphabet *oldAl = U2AlphabetUtils::getById(sequence.alphabet);
635     const DNAAlphabet *resAl = blockAl;
636     if (oldAl != nullptr) {
637         if (oldAl->getType() == DNAAlphabet_AMINO && resAl->getType() == DNAAlphabet_NUCL) {
638             resAl = oldAl;
639         } else if (resAl->getType() == DNAAlphabet_AMINO && oldAl->getType() == DNAAlphabet_NUCL) {
640             oldAl = resAl;
641         } else {
642             resAl = U2AlphabetUtils::deriveCommonAlphabet(blockAl, oldAl);
643         }
644         CHECK_EXT(resAl != nullptr, os.setError(U2SequenceUtils::tr("Failed to derive sequence alphabet!")), );
645     }
646 
647     if (resAl != U2AlphabetUtils::getById(sequence.alphabet)) {
648         sequence.alphabet.id = resAl->getId();
649     }
650 
651     sequenceData.append(data, len);
652 }
653 
addDefaultSymbolsBlock(int n,U2OpStatus & os)654 void U2MemorySequenceImporter::addDefaultSymbolsBlock(int n, U2OpStatus &os) {
655     SAFE_POINT_EXT(n >= 0, os.setError(QObject::tr("Invalid number of symbols: %1").arg(n)), );
656     char defaultChar = U2AlphabetUtils::getDefaultSymbol(sequence.alphabet);
657     QByteArray a(n, defaultChar);
658 
659     sequenceData.append(a);
660 }
661 
getCurrentLength() const662 qint64 U2MemorySequenceImporter::getCurrentLength() const {
663     return sequenceData.length();
664 }
665 
U2PseudoCircularization(QObject * parent,bool isCircular,QByteArray & seq,qint64 circOverlap)666 U2PseudoCircularization::U2PseudoCircularization(QObject *parent, bool isCircular, QByteArray &seq, qint64 circOverlap)
667     : QObject(parent) {
668     seqLen = seq.size();
669     if (isCircular) {
670         circOverlap = (circOverlap == -1 ? seqLen - 1 : circOverlap);
671         seq.append(QByteArray(seq).left(circOverlap));
672     }
673 }
674 
uncircularizeRegion(const U2Region & region,bool & uncircularized) const675 QVector<U2Region> U2PseudoCircularization::uncircularizeRegion(const U2Region &region, bool &uncircularized) const {
676     uncircularized = false;
677     if ((region.startPos >= seqLen && region.endPos() >= seqLen) || (region.length > seqLen)) {  // dublicate
678         return QVector<U2Region>();
679     }
680     if (region.endPos() > seqLen) {
681         uncircularized = true;
682         return QVector<U2Region>() << U2Region(region.startPos, seqLen - region.startPos)
683                                    << U2Region(0, region.endPos() - seqLen);
684     }
685     return QVector<U2Region>() << region;
686 }
687 
uncircularizeLocation(U2Location & location) const688 void U2PseudoCircularization::uncircularizeLocation(U2Location &location) const {
689     QVector<U2Region> res;
690     foreach (const U2Region &r, location->regions) {
691         bool regionWasSplitted = false;
692         res << uncircularizeRegion(r, regionWasSplitted);
693         if (regionWasSplitted) {
694             location->op = U2LocationOperator_Join;
695         }
696     }
697     location->regions = res;
698 }
699 
700 }  // namespace U2
701