1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "AceFormat.h"
23
24 #include <U2Core/AppContext.h>
25 #include <U2Core/GObjectRelationRoles.h>
26 #include <U2Core/GObjectTypes.h>
27 #include <U2Core/IOAdapter.h>
28 #include <U2Core/L10n.h>
29 #include <U2Core/MSAUtils.h>
30 #include <U2Core/MultipleSequenceAlignmentImporter.h>
31 #include <U2Core/MultipleSequenceAlignmentObject.h>
32 #include <U2Core/TextUtils.h>
33 #include <U2Core/U2AlphabetUtils.h>
34 #include <U2Core/U2ObjectDbi.h>
35 #include <U2Core/U2OpStatus.h>
36 #include <U2Core/U2SafePoints.h>
37
38 #include <U2Formats/DocumentFormatUtils.h>
39
40 namespace U2 {
41
42 const QString ACEFormat::CO = "CO";
43 const QString ACEFormat::RD = "RD";
44 const QString ACEFormat::QA = "QA";
45 const QString ACEFormat::AS = "AS";
46 const QString ACEFormat::AF = "AF";
47 const QString ACEFormat::BQ = "BQ";
48
ACEFormat(QObject * p)49 ACEFormat::ACEFormat(QObject *p)
50 : TextDocumentFormatDeprecated(p, BaseDocumentFormats::ACE, DocumentFormatFlags(0), QStringList("ace")) {
51 formatName = tr("ACE");
52 formatDescription = tr("ACE is a format used for storing information about genomic confgurations");
53 supportedObjectTypes += GObjectTypes::MULTIPLE_SEQUENCE_ALIGNMENT;
54 }
55
modifyLine(QString & line,int pos)56 static int modifyLine(QString &line, int pos) {
57 int curIdx = 0;
58 char space = ' ';
59
60 line = line.simplified();
61
62 for (int i = 0; i < pos; i++) {
63 curIdx = line.indexOf(space);
64 if (-1 == curIdx) {
65 return 0;
66 }
67
68 line = line.mid(curIdx + 1);
69 }
70 curIdx = line.indexOf(space);
71 if (-1 == curIdx) {
72 return 0;
73 }
74
75 line = line.mid(0, curIdx);
76
77 bool ok = false;
78 int result = line.toInt(&ok);
79 if (ok == false) {
80 return -1;
81 } else {
82 return result;
83 }
84 }
85
prepareLine(QString & line,int pos)86 static int prepareLine(QString &line, int pos) {
87 int curIdx = 0;
88 char space = ' ';
89
90 line = line.simplified();
91
92 for (int i = 0; i < pos; i++) {
93 curIdx = line.indexOf(space);
94 if (-1 == curIdx) {
95 return -1;
96 }
97
98 line = line.mid(curIdx + 1);
99 }
100
101 return curIdx;
102 }
103
104 #define READS_COUNT_POS 3
readsCount(const QString & cur_line)105 static int readsCount(const QString &cur_line) {
106 QString line = cur_line;
107 return modifyLine(line, READS_COUNT_POS);
108 }
109
110 #define CONTIG_COUNT_POS 1
contigCount(const QString & cur_line)111 static int contigCount(const QString &cur_line) {
112 QString line = cur_line;
113 return modifyLine(line, CONTIG_COUNT_POS);
114 }
115
116 #define LAST_QA_POS 4
clearRange(const QString & cur_line)117 static int clearRange(const QString &cur_line) {
118 QString line = cur_line;
119 modifyLine(line, LAST_QA_POS);
120
121 bool ok = true;
122 int result = line.toInt(&ok);
123 if (!ok) {
124 return INT_MAX;
125 } else {
126 return result;
127 }
128 }
129 #define PADDED_START_POS 3
paddedStartCons(const QString & cur_line)130 static int paddedStartCons(const QString &cur_line) {
131 QString line = cur_line;
132 modifyLine(line, PADDED_START_POS);
133
134 bool ok = true;
135 int result = line.toInt(&ok);
136 if (!ok) {
137 return INT_MAX;
138 } else {
139 return result;
140 }
141 }
142
143 #define READS_POS 3
readsPos(const QString & cur_line)144 static int readsPos(const QString &cur_line) {
145 QString line = cur_line;
146 prepareLine(line, READS_POS);
147
148 if (-1 != line.indexOf(' ')) {
149 return INT_MAX;
150 }
151
152 line = line.mid(0, line.length());
153
154 bool ok = true;
155 int result = line.toInt(&ok);
156 if (!ok) {
157 return INT_MAX;
158 } else {
159 return result;
160 }
161 }
162 #define COMPLEMENT_POS 2
readsComplement(const QString & cur_line)163 static int readsComplement(const QString &cur_line) {
164 QString line = cur_line;
165 prepareLine(line, COMPLEMENT_POS);
166
167 if (line.startsWith("U")) {
168 return 0;
169 } else if (line.startsWith("C")) {
170 return 1;
171 } else {
172 return -1;
173 }
174 }
175
getName(const QString & line)176 static QString getName(const QString &line) {
177 int curIdx = 0;
178 char space = ' ';
179
180 QString name = line.simplified();
181
182 curIdx = name.indexOf(space);
183 if (-1 == curIdx) {
184 return "";
185 }
186
187 name = name.mid(curIdx + 1);
188
189 curIdx = name.indexOf(space);
190 if (-1 == curIdx) {
191 return "";
192 }
193
194 name = name.mid(0, curIdx);
195
196 return name;
197 }
198
checkSeq(const QByteArray & seq)199 static bool checkSeq(const QByteArray &seq) {
200 const DNAAlphabet *alphabet = AppContext::getDNAAlphabetRegistry()->findById(BaseDNAAlphabetIds::NUCL_DNA_EXTENDED());
201 for (int i = 0; i < seq.length(); i++) {
202 if (!(alphabet->contains(seq[i]) || seq[i] == '*')) {
203 return false;
204 }
205 }
206 return true;
207 }
208
skipBreaks(U2::IOAdapter * io,U2OpStatus & ti,char * buff,qint64 * len)209 static inline void skipBreaks(U2::IOAdapter *io, U2OpStatus &ti, char *buff, qint64 *len) {
210 bool lineOk = true;
211 *len = io->readUntil(buff, DocumentFormat::READ_BUFF_SIZE, TextUtils::LINE_BREAKS, IOAdapter::Term_Include, &lineOk);
212 CHECK_EXT(!io->hasError(), ti.setError(io->errorString()), );
213 CHECK_EXT(*len != 0, ti.setError(ACEFormat::tr("Unexpected end of file")), ); // end if stream
214 CHECK_EXT(lineOk, ti.setError(ACEFormat::tr("Line is too long")), );
215 }
216
parseConsensus(U2::IOAdapter * io,U2OpStatus & ti,char * buff,QString & consName,QSet<QString> & names,QString & headerLine,QByteArray & consensus)217 static inline void parseConsensus(U2::IOAdapter *io, U2OpStatus &ti, char *buff, QString &consName, QSet<QString> &names, QString &headerLine, QByteArray &consensus) {
218 char aceBStartChar = 'B';
219 QBitArray aceBStart = TextUtils::createBitMap(aceBStartChar);
220 qint64 len = 0;
221 bool ok = true;
222 QString line;
223 consName = getName(headerLine);
224 CHECK_EXT(!consName.isEmpty(), ti.setError(ACEFormat::tr("There is no AF note")), );
225 CHECK_EXT(!names.contains(consName), ti.setError(ACEFormat::tr("A name is duplicated")), );
226
227 names.insert(consName);
228 consensus.clear();
229 do {
230 len = io->readUntil(buff, DocumentFormat::READ_BUFF_SIZE, aceBStart, IOAdapter::Term_Exclude, &ok);
231 CHECK_EXT(!io->hasError(), ti.setError(io->errorString()), );
232 CHECK_EXT(len > 0, ti.setError(ACEFormat::tr("No consensus")), );
233
234 len = TextUtils::remove(buff, len, TextUtils::WHITES);
235 buff[len] = 0;
236 consensus.append(buff);
237 ti.setProgress(io->getProgress());
238 } while (!ti.isCoR() && !ok);
239
240 len = io->readUntil(buff, DocumentFormat::READ_BUFF_SIZE, TextUtils::LINE_BREAKS, IOAdapter::Term_Include, &ok);
241 CHECK_EXT(!io->hasError(), ti.setError(io->errorString()), );
242
243 line = QString(QByteArray(buff, len)).trimmed();
244 CHECK_EXT(line.startsWith("BQ"), ti.setError(ACEFormat::tr("BQ keyword hasn't been found")), );
245
246 consensus = consensus.toUpper();
247 CHECK_EXT(checkSeq(consensus), ti.setError(ACEFormat::tr("Bad consensus data")), );
248
249 consensus.replace('*', U2Msa::GAP_CHAR);
250 }
251
parseAFTag(U2::IOAdapter * io,U2OpStatus & ti,char * buff,int count,QMap<QString,int> & posMap,QMap<QString,bool> & complMap,QSet<QString> & names)252 static inline void parseAFTag(U2::IOAdapter *io, U2OpStatus &ti, char *buff, int count, QMap<QString, int> &posMap, QMap<QString, bool> &complMap, QSet<QString> &names) {
253 int count1 = count;
254 QString readLine;
255 QString name;
256 qint64 len = 0;
257 while (!ti.isCoR() && count1 > 0) {
258 do {
259 skipBreaks(io, ti, buff, &len);
260 CHECK_OP(ti, );
261
262 readLine = QString(QByteArray(buff, len)).trimmed();
263 } while (!readLine.startsWith("AF"));
264
265 name = getName(readLine);
266 if (!readLine.startsWith("AF") || "" == name) {
267 ti.setError(ACEFormat::tr("There is no AF note"));
268 return;
269 }
270
271 int readPos = readsPos(readLine);
272 int complStrand = readsComplement(readLine);
273 if ((INT_MAX == readPos) || (-1 == complStrand)) {
274 ti.setError(ACEFormat::tr("Bad AF note"));
275 return;
276 }
277
278 int paddedStart = paddedStartCons(readLine);
279 CHECK_EXT(paddedStart != INT_MAX, ti.setError(ACEFormat::tr("Bad AF note")), );
280
281 posMap.insert(name, paddedStart);
282 CHECK_EXT(!names.contains(name), ti.setError(ACEFormat::tr("A name is duplicated")), );
283
284 bool cur_compl = (complStrand == 1);
285 complMap.insert(name, cur_compl);
286
287 names.insert(name);
288
289 count1--;
290 ti.setProgress(io->getProgress());
291 }
292 }
293
parseRDandQATag(U2::IOAdapter * io,U2OpStatus & ti,char * buff,QSet<QString> & names,QString & name,QByteArray & sequence)294 static inline void parseRDandQATag(U2::IOAdapter *io, U2OpStatus &ti, char *buff, QSet<QString> &names, QString &name, QByteArray &sequence) {
295 QString line;
296 qint64 len = 0;
297 bool ok = true;
298 char aceQStartChar = 'Q';
299 QBitArray aceQStart = TextUtils::createBitMap(aceQStartChar);
300 do {
301 skipBreaks(io, ti, buff, &len);
302 CHECK_OP(ti, );
303
304 line = QString(QByteArray(buff, len)).trimmed();
305 } while (!line.startsWith("RD"));
306
307 name = getName(line);
308 if (!line.startsWith("RD") || "" == name) {
309 ti.setError(ACEFormat::tr("There is no read note"));
310 return;
311 }
312
313 sequence.clear();
314 do {
315 len = io->readUntil(buff, DocumentFormat::READ_BUFF_SIZE, aceQStart, IOAdapter::Term_Exclude, &ok);
316 CHECK_EXT(!io->hasError(), ti.setError(io->errorString()), );
317 CHECK_EXT(len > 0, ti.setError(ACEFormat::tr("No sequence")), );
318
319 len = TextUtils::remove(buff, len, TextUtils::WHITES);
320 buff[len] = 0;
321 sequence.append(buff);
322 ti.setProgress(io->getProgress());
323 } while (!ti.isCoR() && !ok);
324
325 len = io->readUntil(buff, DocumentFormat::READ_BUFF_SIZE, TextUtils::LINE_BREAKS, IOAdapter::Term_Include, &ok);
326 CHECK_EXT(!io->hasError(), ti.setError(io->errorString()), );
327
328 line = QString(QByteArray(buff, len)).trimmed();
329 CHECK_EXT(line.startsWith("QA"), ti.setError(ACEFormat::tr("QA keyword hasn't been found")), );
330
331 int clearRangeStart = 0;
332 int clearRangeEnd = 0;
333
334 clearRangeStart = readsCount(line);
335 CHECK_EXT(clearRangeStart != -1, ti.setError(ACEFormat::tr("QA error no clear range")), );
336
337 clearRangeEnd = clearRange(line);
338 CHECK_EXT(clearRangeEnd != 0, ti.setError(ACEFormat::tr("QA error no clear range")), );
339
340 len = sequence.length();
341 if (clearRangeStart > clearRangeEnd || clearRangeEnd > len) {
342 ti.setError(ACEFormat::tr("QA error bad range"));
343 return;
344 }
345
346 sequence = sequence.toUpper();
347 CHECK_EXT(checkSeq(sequence), ti.setError(ACEFormat::tr("Bad sequence data")), );
348
349 if (!names.contains(name)) {
350 ti.setError(ACEFormat::tr("A name is not match with AF names"));
351 return;
352 } else {
353 names.remove(name);
354 }
355
356 sequence.replace('*', U2Msa::GAP_CHAR);
357 sequence.replace('N', U2Msa::GAP_CHAR);
358 sequence.replace('X', U2Msa::GAP_CHAR);
359 }
360
361 /**
362 * Offsets in an ACE file are specified relatively to the reference sequence,
363 * so "pos" can be negative.
364 */
getSmallestOffset(const QMap<QString,int> & posMap)365 static inline int getSmallestOffset(const QMap<QString, int> &posMap) {
366 int smallestOffset = 0;
367 foreach (int value, posMap) {
368 smallestOffset = qMin(smallestOffset, value - 1);
369 }
370
371 return smallestOffset;
372 }
373
load(IOAdapter * io,const U2DbiRef & dbiRef,QList<GObject * > & objects,const QVariantMap & hints,U2OpStatus & os)374 void ACEFormat::load(IOAdapter *io, const U2DbiRef &dbiRef, QList<GObject *> &objects, const QVariantMap &hints, U2OpStatus &os) {
375 QByteArray readBuff(READ_BUFF_SIZE + 1, 0);
376 char *buff = readBuff.data();
377 qint64 len = 0;
378
379 QByteArray sequence;
380 QSet<QString> names;
381 QMap<QString, bool> complMap;
382
383 // skip leading whites if present
384 bool lineOk = true;
385 skipBreaks(io, os, buff, &len);
386 CHECK_OP(os, );
387
388 QString headerLine = QString(QByteArray(buff, len)).trimmed();
389 CHECK_EXT(headerLine.startsWith(AS), os.setError(ACEFormat::tr("First line is not an ace header")), );
390
391 int contigC = contigCount(headerLine);
392 CHECK_EXT(contigC != -1, os.setError(ACEFormat::tr("No contig count tag in the header line")), );
393
394 for (int i = 0; i < contigC; i++) {
395 if (i == 0) {
396 QBitArray nonWhites = ~TextUtils::WHITES;
397 io->readUntil(buff, READ_BUFF_SIZE, nonWhites, IOAdapter::Term_Exclude, &lineOk);
398 CHECK_EXT(!io->hasError(), os.setError(io->errorString()), );
399
400 // read header
401 skipBreaks(io, os, buff, &len);
402 CHECK_OP(os, );
403
404 headerLine = QString(QByteArray(buff, len)).trimmed();
405 CHECK_EXT(headerLine.startsWith(CO), os.setError(ACEFormat::tr("Must be CO keyword")), );
406 } else {
407 do {
408 skipBreaks(io, os, buff, &len);
409 CHECK_OP(os, );
410
411 headerLine = QString(QByteArray(buff, len)).trimmed();
412 } while (!headerLine.startsWith(CO));
413 }
414 int count = readsCount(headerLine);
415 CHECK_EXT(count != -1, os.setError(ACEFormat::tr("There is no note about reads count")), );
416
417 // consensus
418 QString name;
419 QByteArray consensus;
420 QString consName;
421
422 parseConsensus(io, os, buff, consName, names, headerLine, consensus);
423 CHECK_OP(os, );
424
425 MultipleSequenceAlignment al(consName);
426 al->addRow(consName, consensus);
427
428 // AF
429 QMap<QString, int> posMap;
430 parseAFTag(io, os, buff, count, posMap, complMap, names);
431 CHECK_OP(os, );
432
433 int smallestOffset = getSmallestOffset(posMap);
434 if (smallestOffset < 0) {
435 al->insertGaps(0, 0, qAbs(smallestOffset), os);
436 CHECK_OP(os, );
437 }
438
439 // RD and QA
440 while (!os.isCoR() && count > 0) {
441 parseRDandQATag(io, os, buff, names, name, sequence);
442 CHECK_OP(os, );
443
444 bool isComplement = complMap.take(name);
445 int pos = posMap.value(name) - 1;
446 if (smallestOffset < 0) {
447 pos += qAbs(smallestOffset);
448 }
449 QString rowName(name);
450 if (isComplement) {
451 rowName.append("(rev-compl)");
452 }
453
454 QByteArray offsetGaps;
455 offsetGaps.fill(U2Msa::GAP_CHAR, pos);
456 sequence.prepend(offsetGaps);
457 al->addRow(rowName, sequence);
458
459 count--;
460 os.setProgress(io->getProgress());
461 }
462 U2AlphabetUtils::assignAlphabet(al);
463 CHECK_EXT(al->getAlphabet() != nullptr, ACEFormat::tr("Alphabet unknown"), );
464
465 const QString folder = hints.value(DBI_FOLDER_HINT, U2ObjectDbi::ROOT_FOLDER).toString();
466
467 MultipleSequenceAlignmentObject *obj = MultipleSequenceAlignmentImporter::createAlignment(dbiRef, folder, al, os);
468 CHECK_OP(os, );
469 objects.append(obj);
470 }
471 }
472
checkRawTextData(const QByteArray & rawData,const GUrl &) const473 FormatCheckResult ACEFormat::checkRawTextData(const QByteArray &rawData, const GUrl &) const {
474 static const char *formatTag = "AS";
475
476 if (!rawData.startsWith(formatTag)) {
477 return FormatDetection_NotMatched;
478 }
479 return FormatDetection_AverageSimilarity;
480 }
481
loadTextDocument(IOAdapter * io,const U2DbiRef & dbiRef,const QVariantMap & fs,U2OpStatus & os)482 Document *ACEFormat::loadTextDocument(IOAdapter *io, const U2DbiRef &dbiRef, const QVariantMap &fs, U2OpStatus &os) {
483 QList<GObject *> objs;
484 load(io, dbiRef, objs, fs, os);
485
486 CHECK_OP_EXT(os, qDeleteAll(objs), nullptr);
487
488 if (objs.isEmpty()) {
489 os.setError(ACEFormat::tr("File doesn't contain any msa objects"));
490 return nullptr;
491 }
492 Document *doc = new Document(this, io->getFactory(), io->getURL(), dbiRef, objs, fs);
493
494 return doc;
495 }
496
497 } // namespace U2
498