1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "U2SequenceUtils.h"
23
24 #include <QApplication>
25
26 #include <U2Core/AppContext.h>
27 #include <U2Core/DNASequenceObject.h>
28 #include <U2Core/DNATranslation.h>
29 #include <U2Core/GObject.h>
30 #include <U2Core/SequenceUtils.h>
31 #include <U2Core/U1AnnotationUtils.h>
32 #include <U2Core/U2AlphabetUtils.h>
33 #include <U2Core/U2AttributeDbi.h>
34 #include <U2Core/U2AttributeUtils.h>
35 #include <U2Core/U2ObjectDbi.h>
36 #include <U2Core/U2OpStatus.h>
37 #include <U2Core/U2OpStatusUtils.h>
38 #include <U2Core/U2SafePoints.h>
39 #include <U2Core/U2SequenceDbi.h>
40
41 namespace U2 {
42
// Keys of the hints map that U2SequenceImporter::_addBlock2Db() passes to U2SequenceDbi::updateSequenceData().
const QString U2SequenceDbiHints::UPDATE_SEQUENCE_LENGTH = "update-length";
const QString U2SequenceDbiHints::EMPTY_SEQUENCE = "empty-sequence";
45
alphabetType(const U2EntityRef & ref,U2OpStatus & os)46 DNAAlphabetType U2SequenceUtils::alphabetType(const U2EntityRef &ref, U2OpStatus &os) {
47 DNAAlphabetType res = DNAAlphabet_RAW;
48 DbiConnection con(ref.dbiRef, os);
49
50 U2Sequence seq = con.dbi->getSequenceDbi()->getSequenceObject(ref.entityId, os);
51 CHECK_OP(os, res);
52
53 const DNAAlphabet *al = AppContext::getDNAAlphabetRegistry()->findById(seq.alphabet.id);
54 CHECK_EXT(al != nullptr, os.setError(tr("Alphabet is not found!")), res);
55
56 return al->getType();
57 }
58
length(const U2EntityRef & ref,U2OpStatus & os)59 qint64 U2SequenceUtils::length(const U2EntityRef &ref, U2OpStatus &os) {
60 DbiConnection con(ref.dbiRef, os);
61
62 U2Sequence seq = con.dbi->getSequenceDbi()->getSequenceObject(ref.entityId, os);
63 CHECK_OP(os, -1);
64
65 return seq.length;
66 }
67
copySequence(const DNASequence & srcSeq,const U2DbiRef & dstDbi,const QString & dstFolder,U2OpStatus & os)68 U2Sequence U2SequenceUtils::copySequence(const DNASequence &srcSeq, const U2DbiRef &dstDbi, const QString &dstFolder, U2OpStatus &os) {
69 U2Sequence res;
70 // TODO: ClustalW format does not assign sequence alphabets!
71 res.alphabet = srcSeq.alphabet == nullptr ? nullptr : srcSeq.alphabet->getId();
72 res.circular = srcSeq.circular;
73 res.length = srcSeq.length();
74 res.visualName = srcSeq.getName();
75
76 TmpDbiObjects tmpObjects(dstDbi, os);
77
78 if (os.isCoR()) {
79 return res;
80 }
81
82 DbiConnection dstCon(dstDbi, os);
83 CHECK_OP(os, res);
84 dstCon.dbi->getSequenceDbi()->createSequenceObject(res, dstFolder, os);
85 CHECK_OP(os, res);
86
87 tmpObjects.objects.append(res.id);
88
89 dstCon.dbi->getSequenceDbi()->updateSequenceData(res.id, U2Region(0, 0), srcSeq.seq, QVariantMap(), os);
90 CHECK_OP(os, res);
91
92 return res;
93 }
94
copySequence(const U2EntityRef & srcSeq,const U2DbiRef & dstDbi,const QString & dstFolder,U2OpStatus & os)95 U2Sequence U2SequenceUtils::copySequence(const U2EntityRef &srcSeq, const U2DbiRef &dstDbi, const QString &dstFolder, U2OpStatus &os) {
96 U2Sequence res;
97 DbiConnection srcCon(srcSeq.dbiRef, os);
98 CHECK_OP(os, res);
99
100 U2SequenceDbi *srcSeqDbi = srcCon.dbi->getSequenceDbi();
101 SAFE_POINT_EXT(nullptr != srcSeqDbi, os.setError(tr("Invalid sequence DBI")), res);
102 U2Sequence seq = srcSeqDbi->getSequenceObject(srcSeq.entityId, os);
103 CHECK_OP(os, res);
104
105 res = seq;
106 U2TrackModType modType = res.trackModType;
107 res.trackModType = NoTrack;
108
109 res.id.clear();
110 res.length = 0;
111
112 TmpDbiObjects tmpObjects(dstDbi, os);
113
114 DbiConnection dstCon(dstDbi, os);
115 CHECK_OP(os, res);
116 U2SequenceDbi *dstSeqDbi = dstCon.dbi->getSequenceDbi();
117 SAFE_POINT_EXT(nullptr != dstSeqDbi, os.setError(tr("Invalid sequence DBI")), res);
118 dstSeqDbi->createSequenceObject(res, dstFolder, os);
119 CHECK_OP(os, res);
120
121 tmpObjects.objects.append(res.id);
122
123 const qint64 MAX_CHUNK_LENGTH = 4194304; // 4 MiB chunk
124 for (qint64 pos = 0; pos < seq.length; pos += MAX_CHUNK_LENGTH) {
125 const qint64 currentChunkSize = qMin(MAX_CHUNK_LENGTH, seq.length - pos);
126 const U2Region chunkRegion(pos, currentChunkSize);
127 const QByteArray chunkContent = srcSeqDbi->getSequenceData(srcSeq.entityId, chunkRegion, os);
128 CHECK_OP(os, res);
129 dstSeqDbi->updateSequenceData(res.id, chunkRegion, chunkContent, QVariantMap(), os);
130 CHECK_OP(os, res);
131 res.length += currentChunkSize;
132 }
133
134 U2DbiObjectRank rank = srcCon.dbi->getObjectDbi()->getObjectRank(seq.id, os);
135 CHECK_OP(os, res);
136
137 dstCon.dbi->getObjectDbi()->setObjectRank(res.id, rank, os);
138 CHECK_OP(os, res);
139
140 res.trackModType = modType;
141 return res;
142 }
143
updateSequenceName(const U2EntityRef & entityRef,const QString & newName,U2OpStatus & os)144 void U2SequenceUtils::updateSequenceName(const U2EntityRef &entityRef, const QString &newName, U2OpStatus &os) {
145 DbiConnection con(entityRef.dbiRef, os);
146 CHECK_OP(os, );
147
148 U2SequenceDbi *sequenceDbi = con.dbi->getSequenceDbi();
149 U2Sequence sequenceObject = sequenceDbi->getSequenceObject(entityRef.entityId, os);
150 CHECK_OP(os, );
151 if (sequenceObject.visualName != newName) {
152 sequenceObject.visualName = newName;
153 sequenceDbi->updateSequenceObject(sequenceObject, os);
154 }
155 }
156
_extractRegions(const U2EntityRef & seqRef,const QVector<U2Region> & regions,const DNATranslation * complTT,U2OpStatus & os)157 static QList<QByteArray> _extractRegions(const U2EntityRef &seqRef, const QVector<U2Region> ®ions, const DNATranslation *complTT, U2OpStatus &os) {
158 QList<QByteArray> res;
159
160 DbiConnection con(seqRef.dbiRef, os);
161 CHECK_OP(os, res);
162 U2SequenceDbi *seqDbi = con.dbi->getSequenceDbi();
163 U2Sequence seq = seqDbi->getSequenceObject(seqRef.entityId, os);
164 CHECK_OP(os, res);
165
166 QVector<U2Region> safeLocation = regions;
167 U2Region::bound(0, seq.length, safeLocation);
168
169 for (int i = 0, n = safeLocation.size(); i < n; i++) {
170 const U2Region &oReg = safeLocation.at(i);
171 if (complTT == nullptr) {
172 QByteArray part = seqDbi->getSequenceData(seq.id, U2Region(oReg.startPos, oReg.length), os);
173 CHECK_OP(os, QList<QByteArray>());
174 res.append(part);
175 } else {
176 QByteArray arr = seqDbi->getSequenceData(seq.id, U2Region(oReg.startPos, oReg.length), os);
177 CHECK_OP(os, QList<QByteArray>());
178 TextUtils::reverse(arr.data(), arr.length());
179 complTT->translate(arr.data(), arr.length());
180 res.prepend(arr);
181 }
182 }
183 return res;
184 }
185
extractRegions(const U2EntityRef & seqRef,const QVector<U2Region> & origLocation,const DNATranslation * complTT,const DNATranslation * aminoTT,bool join,U2OpStatus & os)186 QList<QByteArray> U2SequenceUtils::extractRegions(const U2EntityRef &seqRef, const QVector<U2Region> &origLocation, const DNATranslation *complTT, const DNATranslation *aminoTT, bool join, U2OpStatus &os) {
187 QList<QByteArray> res = _extractRegions(seqRef, origLocation, complTT, os);
188 CHECK_OP(os, res)
189
190 DbiConnection con(seqRef.dbiRef, os);
191 CHECK_OP(os, res);
192
193 U2SequenceDbi *seqDbi = con.dbi->getSequenceDbi();
194 U2Sequence seq = seqDbi->getSequenceObject(seqRef.entityId, os);
195 CHECK_OP(os, res);
196
197 if (seq.circular && res.size() > 1) {
198 const U2Region &firstL = origLocation.first();
199 const U2Region &lastL = origLocation.last();
200 if (firstL.startPos == 0 && lastL.endPos() == seq.length) {
201 QByteArray lastS = res.last();
202 QByteArray firstS = res.first();
203 res.removeLast();
204 res[0] = lastS.append(firstS);
205 }
206 }
207 if (aminoTT != nullptr) {
208 res = U1SequenceUtils::translateRegions(res, aminoTT, join);
209 }
210
211 if (join && res.size() > 1) {
212 QByteArray joined = U1SequenceUtils::joinRegions(res);
213 res.clear();
214 res.append(joined);
215 }
216
217 return res;
218 }
219
/** Convenience overload: imports 'seq' into the U2ObjectDbi::ROOT_FOLDER folder of 'dbiRef'. */
U2EntityRef U2SequenceUtils::import(U2OpStatus &os, const U2DbiRef &dbiRef, const DNASequence &seq, const U2AlphabetId &alphabetId) {
    return import(os, dbiRef, U2ObjectDbi::ROOT_FOLDER, seq, alphabetId);
}
223
import(U2OpStatus & os,const U2DbiRef & dbiRef,const QString & folder,const DNASequence & seq,const U2AlphabetId & alphabetId)224 U2EntityRef U2SequenceUtils::import(U2OpStatus &os, const U2DbiRef &dbiRef, const QString &folder, const DNASequence &seq, const U2AlphabetId &alphabetId) {
225 U2EntityRef res;
226 U2SequenceImporter i;
227
228 i.startSequence(os, dbiRef, folder, seq.getName(), seq.circular, alphabetId);
229 CHECK_OP(os, res);
230
231 i.addBlock(seq.constData(), seq.length(), os);
232 CHECK_OP(os, res);
233
234 U2Sequence u2seq = i.finalizeSequenceAndValidate(os);
235 CHECK_OP(os, res);
236
237 res.dbiRef = dbiRef;
238 res.entityId = u2seq.id;
239
240 setSequenceInfo(os, res, seq.info);
241 CHECK_OP(os, res);
242
243 setQuality(res, seq.quality);
244
245 return res;
246 }
247
setQuality(const U2EntityRef & entityRef,const DNAQuality & q)248 void U2SequenceUtils::setQuality(const U2EntityRef &entityRef, const DNAQuality &q) {
249 U2OpStatus2Log os;
250 DbiConnection con(entityRef.dbiRef, os);
251 CHECK_OP(os, );
252 QList<U2DataId> idQualList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::FASTQ_QUAL_CODES, os);
253 CHECK_OP(os, );
254 if (!idQualList.isEmpty()) {
255 con.dbi->getAttributeDbi()->removeAttributes(idQualList, os);
256 CHECK_OP(os, );
257 }
258 QList<U2DataId> idQualTypeList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::FASTQ_QUAL_TYPE, os);
259 CHECK_OP(os, );
260 if (!idQualTypeList.isEmpty()) {
261 con.dbi->getAttributeDbi()->removeAttributes(idQualTypeList, os);
262 CHECK_OP(os, );
263 }
264
265 U2ByteArrayAttribute qualityCodes(entityRef.entityId, DNAInfo::FASTQ_QUAL_CODES, q.qualCodes);
266 U2IntegerAttribute qualityType(entityRef.entityId, DNAInfo::FASTQ_QUAL_TYPE, q.type);
267 con.dbi->getAttributeDbi()->createByteArrayAttribute(qualityCodes, os);
268 CHECK_OP(os, );
269 con.dbi->getAttributeDbi()->createIntegerAttribute(qualityType, os);
270 CHECK_OP(os, );
271 }
272
setSequenceInfo(U2OpStatus & os,const U2EntityRef & entityRef,const QVariantMap & info)273 void U2SequenceUtils::setSequenceInfo(U2OpStatus &os, const U2EntityRef &entityRef, const QVariantMap &info) {
274 DbiConnection con(entityRef.dbiRef, os);
275 CHECK_OP(os, );
276 QList<U2DataId> chainIdList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::CHAIN_ID, os);
277 CHECK_OP(os, );
278 if (!chainIdList.isEmpty()) {
279 con.dbi->getAttributeDbi()->removeObjectAttributes(chainIdList.first(), os);
280 CHECK_OP(os, );
281 }
282 QList<U2DataId> commentList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::COMMENT, os);
283 CHECK_OP(os, );
284 if (!commentList.isEmpty()) {
285 con.dbi->getAttributeDbi()->removeObjectAttributes(commentList.first(), os);
286 CHECK_OP(os, );
287 }
288 QList<U2DataId> definitionList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::DEFINITION, os);
289 CHECK_OP(os, );
290 if (!definitionList.isEmpty()) {
291 con.dbi->getAttributeDbi()->removeObjectAttributes(definitionList.first(), os);
292 CHECK_OP(os, );
293 }
294 U2StringAttribute chainID(entityRef.entityId, DNAInfo::CHAIN_ID, info.value(DNAInfo::CHAIN_ID).toString());
295 U2StringAttribute comment(entityRef.entityId, DNAInfo::COMMENT, info.value(DNAInfo::COMMENT).toString());
296 U2StringAttribute definition(entityRef.entityId, DNAInfo::DEFINITION, info.value(DNAInfo::DEFINITION).toString());
297 con.dbi->getAttributeDbi()->createStringAttribute(chainID, os);
298 CHECK_OP(os, );
299 con.dbi->getAttributeDbi()->createStringAttribute(comment, os);
300 CHECK_OP(os, );
301 con.dbi->getAttributeDbi()->createStringAttribute(definition, os);
302 CHECK_OP(os, );
303 }
304
getSequenceInfo(U2OpStatus & os,const U2EntityRef & entityRef,const QString & name)305 QVariantMap U2SequenceUtils::getSequenceInfo(U2OpStatus &os, const U2EntityRef &entityRef, const QString &name) {
306 QVariantMap resultingInfo;
307 DbiConnection con(entityRef.dbiRef, os);
308 QList<U2DataId> chainIdList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::CHAIN_ID, os);
309 CHECK_OP(os, resultingInfo);
310 QList<U2DataId> commentList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::COMMENT, os);
311 CHECK_OP(os, resultingInfo);
312 QList<U2DataId> definitionList = con.dbi->getAttributeDbi()->getObjectAttributes(entityRef.entityId, DNAInfo::DEFINITION, os);
313 CHECK_OP(os, resultingInfo);
314 if (!chainIdList.isEmpty() && !commentList.isEmpty() && !definitionList.isEmpty()) {
315 resultingInfo.insert(DNAInfo::CHAIN_ID, con.dbi->getAttributeDbi()->getStringAttribute(chainIdList.first(), os).value);
316 CHECK_OP(os, QVariantMap());
317 resultingInfo.insert(DNAInfo::COMMENT, con.dbi->getAttributeDbi()->getStringAttribute(commentList.first(), os).value);
318 CHECK_OP(os, QVariantMap());
319 resultingInfo.insert(DNAInfo::DEFINITION, con.dbi->getAttributeDbi()->getStringAttribute(definitionList.first(), os).value);
320 CHECK_OP(os, QVariantMap());
321 }
322 U2StringAttribute attr = U2AttributeUtils::findStringAttribute(con.dbi->getAttributeDbi(), entityRef.entityId, DNAInfo::GENBANK_HEADER, os);
323 if (attr.hasValidId()) {
324 resultingInfo.insert(DNAInfo::GENBANK_HEADER, attr.value);
325 CHECK_OP(os, QVariantMap());
326 }
327
328 attr = U2AttributeUtils::findStringAttribute(con.dbi->getAttributeDbi(), entityRef.entityId, DNAInfo::SOURCE, os);
329 if (attr.hasValidId()) {
330 resultingInfo.insert(DNAInfo::SOURCE, attr.value);
331 CHECK_OP(os, QVariantMap());
332 }
333
334 attr = U2AttributeUtils::findStringAttribute(con.dbi->getAttributeDbi(), entityRef.entityId, DNAInfo::ACCESSION, os);
335 if (attr.hasValidId()) {
336 resultingInfo.insert(DNAInfo::ACCESSION, attr.value);
337 CHECK_OP(os, QVariantMap());
338 }
339 attr = U2AttributeUtils::findStringAttribute(con.dbi->getAttributeDbi(), entityRef.entityId, Translation_Table_Id_Attribute, os);
340 if (attr.hasValidId()) {
341 resultingInfo.insert(Translation_Table_Id_Attribute, attr.value);
342 CHECK_OP(os, QVariantMap());
343 }
344
345 if (!name.isEmpty()) {
346 resultingInfo.insert(DNAInfo::ID, name);
347 }
348 return resultingInfo;
349 }
350
getSequenceDbInfo(U2SequenceObject * seqObj)351 U2Sequence U2SequenceUtils::getSequenceDbInfo(U2SequenceObject *seqObj) {
352 U2Sequence seq;
353
354 seq.id = seqObj->getEntityRef().entityId;
355 seq.dbiId = seqObj->getEntityRef().dbiRef.dbiId;
356 if (nullptr != seqObj->getAlphabet()) {
357 seq.alphabet.id = seqObj->getAlphabet()->getId();
358 }
359 seq.circular = seqObj->isCircular();
360 seq.length = seqObj->getSequenceLength();
361 seq.visualName = seqObj->getSequenceName();
362
363 return seq;
364 }
365
//////////////////////////////////////////////////////////////////////////
// U2SequenceImporter

// Insert block size in bytes (4 MiB), used by the U2SequenceImporter constructor that takes no explicit block size.
#define DEFAULT_SEQUENCE_INSERT_BLOCK_SIZE (4 * 1024 * 1024)
369
370 /**
371 * If GObjectHint_CaseAnns is present in the QVariantMap (it should be stored as int),
372 * then the method verifies the value and returns it (if it is correct).
373 * Otherwise, returns NO_CASE_ANNS.
374 */
getCaseAnnotationsModeHint(const QVariantMap & fs)375 static CaseAnnotationsMode getCaseAnnotationsModeHint(const QVariantMap &fs) {
376 if (fs.keys().contains(GObjectHint_CaseAnns)) {
377 QVariant caseAnnsVariant = fs.value(GObjectHint_CaseAnns);
378 SAFE_POINT(caseAnnsVariant.canConvert<int>(), "Can't convert a case annotations hint!", NO_CASE_ANNS);
379
380 bool conversionIsOK = 0;
381 int caseAnnsInt = caseAnnsVariant.toInt(&conversionIsOK);
382 SAFE_POINT(conversionIsOK, "Can't convert a case annotations hint to int!", NO_CASE_ANNS);
383
384 SAFE_POINT((caseAnnsInt == LOWER_CASE) || (caseAnnsInt == UPPER_CASE) || (caseAnnsInt == NO_CASE_ANNS),
385 "Incorrect value of a case annotation hint!",
386 NO_CASE_ANNS);
387
388 return (CaseAnnotationsMode)caseAnnsInt;
389 }
390
391 return NO_CASE_ANNS;
392 }
393
// Error text set by finalizeSequenceAndValidate() when the finalized sequence has no valid id.
// NOTE(review): the text is translated while this static is initialized - presumably before any
// translator is installed; confirm that the message is actually localized at runtime.
const QString U2SequenceImporter::EMPTY_SEQUENCE_ERROR = QApplication::translate("U2SequenceImporter",
                                                                                 "Sequence was not imported. Probably, this is because the sequence is empty.");
396
U2SequenceImporter(const QVariantMap & fs,bool lazyMode,bool singleThread)397 U2SequenceImporter::U2SequenceImporter(const QVariantMap &fs, bool lazyMode, bool singleThread)
398 : lazyMode(lazyMode), singleThread(singleThread), sequenceCreated(false) {
399 insertBlockSize = DEFAULT_SEQUENCE_INSERT_BLOCK_SIZE;
400 currentLength = 0;
401 isUnfinishedRegion = false;
402 caseAnnsMode = getCaseAnnotationsModeHint(fs);
403 sequenceCreated = false;
404 committedLength = 0;
405 }
406
U2SequenceImporter(qint64 _insertBlockSize,const QVariantMap & fs,bool lazyMode,bool singleThread)407 U2SequenceImporter::U2SequenceImporter(qint64 _insertBlockSize, const QVariantMap &fs, bool lazyMode, bool singleThread)
408 : insertBlockSize(_insertBlockSize), lazyMode(lazyMode), singleThread(singleThread) {
409 insertBlockSize = qMin((qint64)10, insertBlockSize);
410 currentLength = 0;
411 isUnfinishedRegion = false;
412 caseAnnsMode = getCaseAnnotationsModeHint(fs);
413 sequenceCreated = false;
414 committedLength = 0;
415 }
416
~U2SequenceImporter()417 U2SequenceImporter::~U2SequenceImporter() {
418 if (con.isOpen() && sequenceCreated) {
419 coreLog.trace(QString("Removing sequence from unfinished import: %1").arg(sequence.visualName));
420 U2OpStatus2Log os;
421 con.dbi->getObjectDbi()->removeObject(sequence.id, os);
422 }
423 }
424
startSequence(U2OpStatus & os,const U2DbiRef & dbiRef,const QString & dstFolder,const QString & visualName,bool circular,const U2AlphabetId & alphabetId)425 void U2SequenceImporter::startSequence(U2OpStatus &os,
426 const U2DbiRef &dbiRef,
427 const QString &dstFolder,
428 const QString &visualName,
429 bool circular,
430 const U2AlphabetId &alphabetId) {
431 SAFE_POINT(!con.isOpen(), "Connection is already opened!", );
432 con.open(dbiRef, true, os);
433 CHECK_OP(os, );
434
435 folder = dstFolder;
436
437 sequence = U2Sequence();
438 sequence.visualName = visualName;
439 sequence.circular = circular;
440 sequence.alphabet.id = alphabetId.id;
441
442 currentLength = 0;
443 isUnfinishedRegion = false;
444 annList.clear();
445
446 if (!lazyMode) {
447 con.dbi->getSequenceDbi()->createSequenceObject(sequence, folder, os);
448 CHECK_OP(os, );
449 sequenceCreated = true;
450 }
451 }
452
addBlock(const char * data,qint64 len,U2OpStatus & os)453 void U2SequenceImporter::addBlock(const char *data, qint64 len, U2OpStatus &os) {
454 // derive common alphabet
455 const DNAAlphabet *blockAl = U2AlphabetUtils::findBestAlphabet(data, len);
456 CHECK_EXT(blockAl != nullptr, os.setError("Failed to match sequence alphabet!"), );
457
458 const DNAAlphabet *oldAl = U2AlphabetUtils::getById(sequence.alphabet);
459 const DNAAlphabet *resAl = blockAl;
460 if (oldAl != nullptr) {
461 if (oldAl->getType() == DNAAlphabet_AMINO && resAl->getType() == DNAAlphabet_NUCL) {
462 resAl = oldAl;
463 } else if (resAl->getType() == DNAAlphabet_AMINO && oldAl->getType() == DNAAlphabet_NUCL) {
464 oldAl = resAl;
465 } else {
466 resAl = U2AlphabetUtils::deriveCommonAlphabet(blockAl, oldAl);
467 }
468 CHECK_EXT(resAl != nullptr, os.setError(U2SequenceUtils::tr("Failed to derive sequence alphabet!")), );
469 }
470
471 if (resAl != U2AlphabetUtils::getById(sequence.alphabet)) {
472 sequence.alphabet.id = resAl->getId();
473 if (sequenceCreated) {
474 con.dbi->getSequenceDbi()->updateSequenceObject(sequence, os);
475 CHECK_OP(os, );
476 }
477 }
478
479 _addBlock2Buffer(data, len, os);
480
481 if (caseAnnsMode != NO_CASE_ANNS) {
482 annList << U1AnnotationUtils::getCaseAnnotations(data, len, currentLength, isUnfinishedRegion, unfinishedRegion, LOWER_CASE == caseAnnsMode);
483 }
484 currentLength += len;
485 }
486
addSequenceBlock(const U2EntityRef & sequenceRef,const U2Region & r,U2OpStatus & os)487 void U2SequenceImporter::addSequenceBlock(const U2EntityRef &sequenceRef, const U2Region &r, U2OpStatus &os) {
488 _addBuffer2Db(os);
489 CHECK_OP(os, );
490 DbiConnection con(sequenceRef.dbiRef, os);
491 CHECK_OP(os, );
492
493 // TODO: optimize -> create utility that uses small to copy sequence!
494 QByteArray arr = con.dbi->getSequenceDbi()->getSequenceData(sequenceRef.entityId, r, os);
495 CHECK_OP(os, );
496 addBlock(arr.constData(), arr.size(), os);
497 }
498
addDefaultSymbolsBlock(int n,U2OpStatus & os)499 void U2SequenceImporter::addDefaultSymbolsBlock(int n, U2OpStatus &os) {
500 SAFE_POINT(n >= 0, QString("Invalid number of symbols: %1").arg(n), );
501 const DNAAlphabet *al = AppContext::getDNAAlphabetRegistry()->findById(sequence.alphabet.id);
502 if (nullptr == al) {
503 os.setError(QObject::tr("Unable to detect sequence alphabet. Probably, this is because some of merged sequences are empty."));
504 return;
505 }
506 char defaultChar = U2AlphabetUtils::getDefaultSymbol(sequence.alphabet);
507 QByteArray a(n, defaultChar);
508 _addBlock2Buffer(a.data(), a.size(), os);
509 currentLength += n;
510 }
511
_addBlock2Buffer(const char * data,qint64 len,U2OpStatus & os)512 void U2SequenceImporter::_addBlock2Buffer(const char *data, qint64 len, U2OpStatus &os) {
513 if (len + sequenceBuffer.length() < insertBlockSize) {
514 sequenceBuffer.append(data, len);
515 return;
516 }
517 _addBlock2Db(sequenceBuffer.data(), sequenceBuffer.length(), os);
518 CHECK_OP(os, );
519 sequenceBuffer.clear();
520 _addBlock2Db(data, len, os);
521 }
522
_addBlock2Db(const char * data,qint64 len,U2OpStatus & os)523 void U2SequenceImporter::_addBlock2Db(const char *data, qint64 len, U2OpStatus &os) {
524 SAFE_POINT(len >= 0, "Illegal block length!", );
525 if (len == 0) {
526 return;
527 }
528 QByteArray arr(data, len);
529 TextUtils::translate(TextUtils::UPPER_CASE_MAP, arr.data(), arr.length());
530
531 bool updateLength = true;
532 bool emptySequence = false;
533 bool justCreated = false;
534 if (!sequenceCreated) {
535 emptySequence = true;
536 if (singleThread) {
537 SAFE_POINT(0 == committedLength, "Sequence object is not created, but sequence data already exists", );
538 sequence.length = len;
539 updateLength = false;
540 }
541 con.dbi->getSequenceDbi()->createSequenceObject(sequence, folder, os);
542 CHECK_OP(os, );
543 sequenceCreated = true;
544 justCreated = true;
545 }
546
547 QVariantMap hints;
548 hints[U2SequenceDbiHints::UPDATE_SEQUENCE_LENGTH] = updateLength;
549 hints[U2SequenceDbiHints::EMPTY_SEQUENCE] = emptySequence;
550 U2Region reg(sequence.length, 0);
551 if (justCreated) {
552 reg.startPos = 0;
553 reg.length = 0;
554 }
555 con.dbi->getSequenceDbi()->updateSequenceData(sequence.id, reg, arr, hints, os);
556 CHECK_OP(os, );
557 if (committedLength == sequence.length) {
558 sequence.length += len;
559 } else { // because of lazyMode and delayed sequence creation
560 sequence.length = committedLength + len;
561 }
562 committedLength += len;
563 }
564
_addBuffer2Db(U2OpStatus & os)565 void U2SequenceImporter::_addBuffer2Db(U2OpStatus &os) {
566 CHECK(!sequenceBuffer.isEmpty(), );
567 _addBlock2Db(sequenceBuffer.data(), sequenceBuffer.length(), os);
568 sequenceBuffer.clear();
569 }
570
finalizeSequence(U2OpStatus & os)571 U2Sequence U2SequenceImporter::finalizeSequence(U2OpStatus &os) {
572 _addBuffer2Db(os);
573 LOG_OP(os);
574 // If sequence is empty, addBlock is never called and alphabet is not set. So set it here to some default value
575 if (!sequence.alphabet.isValid() && sequence.version != 0) {
576 sequence.alphabet.id = BaseDNAAlphabetIds::RAW();
577 con.dbi->getSequenceDbi()->updateSequenceObject(sequence, os);
578 LOG_OP(os);
579 }
580 con.close(os);
581 if (caseAnnsMode != NO_CASE_ANNS) {
582 annList << U1AnnotationUtils::finalizeUnfinishedRegion(isUnfinishedRegion, unfinishedRegion, LOWER_CASE == caseAnnsMode);
583
584 if (1 == annList.size()) {
585 const QVector<U2Region> ®s = annList.first()->getRegions();
586 if (1 == regs.size()) {
587 U2Region reg = regs.first();
588 if (0 == reg.startPos && sequence.length == reg.length) {
589 annList.clear();
590 }
591 }
592 }
593 }
594 sequenceCreated = false;
595 committedLength = 0;
596 return sequence;
597 }
598
finalizeSequenceAndValidate(U2OpStatus & os)599 U2Sequence U2SequenceImporter::finalizeSequenceAndValidate(U2OpStatus &os) {
600 U2Sequence result = finalizeSequence(os);
601 CHECK_OP(os, result);
602 if (!result.hasValidId()) {
603 os.setError(EMPTY_SEQUENCE_ERROR);
604 }
605 return result;
606 }
607
/** Sets the case annotations mode, replacing the one derived from the constructor hints. */
void U2SequenceImporter::setCaseAnnotationsMode(CaseAnnotationsMode mode) {
    caseAnnsMode = mode;
}
611
/** Returns true if the current case annotations mode is anything but NO_CASE_ANNS. */
bool U2SequenceImporter::isCaseAnnotationsModeOn() const {
    return caseAnnsMode != NO_CASE_ANNS;
}
615
/** Returns a mutable reference to the list of case annotations collected so far. */
QList<SharedAnnotationData> &U2SequenceImporter::getCaseAnnotations() {
    return annList;
}
619
/** Returns the number of symbols added since the last startSequence() call (buffered symbols included). */
qint64 U2SequenceImporter::getCurrentLength() const {
    return currentLength;
}
623
addBlock(const char * data,qint64 len,U2OpStatus & os)624 void U2MemorySequenceImporter::addBlock(const char *data, qint64 len, U2OpStatus &os) {
625 if (qstrlen(data) < len) {
626 os.setError("Wrong data length in addBlock");
627 return;
628 }
629
630 // derive common alphabet
631 const DNAAlphabet *blockAl = U2AlphabetUtils::findBestAlphabet(data, len);
632 CHECK_EXT(blockAl != nullptr, os.setError("Failed to match sequence alphabet!"), );
633
634 const DNAAlphabet *oldAl = U2AlphabetUtils::getById(sequence.alphabet);
635 const DNAAlphabet *resAl = blockAl;
636 if (oldAl != nullptr) {
637 if (oldAl->getType() == DNAAlphabet_AMINO && resAl->getType() == DNAAlphabet_NUCL) {
638 resAl = oldAl;
639 } else if (resAl->getType() == DNAAlphabet_AMINO && oldAl->getType() == DNAAlphabet_NUCL) {
640 oldAl = resAl;
641 } else {
642 resAl = U2AlphabetUtils::deriveCommonAlphabet(blockAl, oldAl);
643 }
644 CHECK_EXT(resAl != nullptr, os.setError(U2SequenceUtils::tr("Failed to derive sequence alphabet!")), );
645 }
646
647 if (resAl != U2AlphabetUtils::getById(sequence.alphabet)) {
648 sequence.alphabet.id = resAl->getId();
649 }
650
651 sequenceData.append(data, len);
652 }
653
addDefaultSymbolsBlock(int n,U2OpStatus & os)654 void U2MemorySequenceImporter::addDefaultSymbolsBlock(int n, U2OpStatus &os) {
655 SAFE_POINT_EXT(n >= 0, os.setError(QObject::tr("Invalid number of symbols: %1").arg(n)), );
656 char defaultChar = U2AlphabetUtils::getDefaultSymbol(sequence.alphabet);
657 QByteArray a(n, defaultChar);
658
659 sequenceData.append(a);
660 }
661
/** Returns the length of the sequence data accumulated in memory so far. */
qint64 U2MemorySequenceImporter::getCurrentLength() const {
    return sequenceData.length();
}
665
U2PseudoCircularization(QObject * parent,bool isCircular,QByteArray & seq,qint64 circOverlap)666 U2PseudoCircularization::U2PseudoCircularization(QObject *parent, bool isCircular, QByteArray &seq, qint64 circOverlap)
667 : QObject(parent) {
668 seqLen = seq.size();
669 if (isCircular) {
670 circOverlap = (circOverlap == -1 ? seqLen - 1 : circOverlap);
671 seq.append(QByteArray(seq).left(circOverlap));
672 }
673 }
674
uncircularizeRegion(const U2Region & region,bool & uncircularized) const675 QVector<U2Region> U2PseudoCircularization::uncircularizeRegion(const U2Region ®ion, bool &uncircularized) const {
676 uncircularized = false;
677 if ((region.startPos >= seqLen && region.endPos() >= seqLen) || (region.length > seqLen)) { // dublicate
678 return QVector<U2Region>();
679 }
680 if (region.endPos() > seqLen) {
681 uncircularized = true;
682 return QVector<U2Region>() << U2Region(region.startPos, seqLen - region.startPos)
683 << U2Region(0, region.endPos() - seqLen);
684 }
685 return QVector<U2Region>() << region;
686 }
687
uncircularizeLocation(U2Location & location) const688 void U2PseudoCircularization::uncircularizeLocation(U2Location &location) const {
689 QVector<U2Region> res;
690 foreach (const U2Region &r, location->regions) {
691 bool regionWasSplitted = false;
692 res << uncircularizeRegion(r, regionWasSplitted);
693 if (regionWasSplitted) {
694 location->op = U2LocationOperator_Join;
695 }
696 }
697 location->regions = res;
698 }
699
700 } // namespace U2
701