1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "AceFormat.h"
23 
24 #include <U2Core/AppContext.h>
25 #include <U2Core/GObjectRelationRoles.h>
26 #include <U2Core/GObjectTypes.h>
27 #include <U2Core/IOAdapter.h>
28 #include <U2Core/L10n.h>
29 #include <U2Core/MSAUtils.h>
30 #include <U2Core/MultipleSequenceAlignmentImporter.h>
31 #include <U2Core/MultipleSequenceAlignmentObject.h>
32 #include <U2Core/TextUtils.h>
33 #include <U2Core/U2AlphabetUtils.h>
34 #include <U2Core/U2ObjectDbi.h>
35 #include <U2Core/U2OpStatus.h>
36 #include <U2Core/U2SafePoints.h>
37 
38 #include <U2Formats/DocumentFormatUtils.h>
39 
40 namespace U2 {
41 
42 const QString ACEFormat::CO = "CO";
43 const QString ACEFormat::RD = "RD";
44 const QString ACEFormat::QA = "QA";
45 const QString ACEFormat::AS = "AS";
46 const QString ACEFormat::AF = "AF";
47 const QString ACEFormat::BQ = "BQ";
48 
ACEFormat(QObject * p)49 ACEFormat::ACEFormat(QObject *p)
50     : TextDocumentFormatDeprecated(p, BaseDocumentFormats::ACE, DocumentFormatFlags(0), QStringList("ace")) {
51     formatName = tr("ACE");
52     formatDescription = tr("ACE is a format used for storing information about genomic confgurations");
53     supportedObjectTypes += GObjectTypes::MULTIPLE_SEQUENCE_ALIGNMENT;
54 }
55 
modifyLine(QString & line,int pos)56 static int modifyLine(QString &line, int pos) {
57     int curIdx = 0;
58     char space = ' ';
59 
60     line = line.simplified();
61 
62     for (int i = 0; i < pos; i++) {
63         curIdx = line.indexOf(space);
64         if (-1 == curIdx) {
65             return 0;
66         }
67 
68         line = line.mid(curIdx + 1);
69     }
70     curIdx = line.indexOf(space);
71     if (-1 == curIdx) {
72         return 0;
73     }
74 
75     line = line.mid(0, curIdx);
76 
77     bool ok = false;
78     int result = line.toInt(&ok);
79     if (ok == false) {
80         return -1;
81     } else {
82         return result;
83     }
84 }
85 
prepareLine(QString & line,int pos)86 static int prepareLine(QString &line, int pos) {
87     int curIdx = 0;
88     char space = ' ';
89 
90     line = line.simplified();
91 
92     for (int i = 0; i < pos; i++) {
93         curIdx = line.indexOf(space);
94         if (-1 == curIdx) {
95             return -1;
96         }
97 
98         line = line.mid(curIdx + 1);
99     }
100 
101     return curIdx;
102 }
103 
104 #define READS_COUNT_POS 3
readsCount(const QString & cur_line)105 static int readsCount(const QString &cur_line) {
106     QString line = cur_line;
107     return modifyLine(line, READS_COUNT_POS);
108 }
109 
110 #define CONTIG_COUNT_POS 1
contigCount(const QString & cur_line)111 static int contigCount(const QString &cur_line) {
112     QString line = cur_line;
113     return modifyLine(line, CONTIG_COUNT_POS);
114 }
115 
116 #define LAST_QA_POS 4
clearRange(const QString & cur_line)117 static int clearRange(const QString &cur_line) {
118     QString line = cur_line;
119     modifyLine(line, LAST_QA_POS);
120 
121     bool ok = true;
122     int result = line.toInt(&ok);
123     if (!ok) {
124         return INT_MAX;
125     } else {
126         return result;
127     }
128 }
129 #define PADDED_START_POS 3
paddedStartCons(const QString & cur_line)130 static int paddedStartCons(const QString &cur_line) {
131     QString line = cur_line;
132     modifyLine(line, PADDED_START_POS);
133 
134     bool ok = true;
135     int result = line.toInt(&ok);
136     if (!ok) {
137         return INT_MAX;
138     } else {
139         return result;
140     }
141 }
142 
143 #define READS_POS 3
readsPos(const QString & cur_line)144 static int readsPos(const QString &cur_line) {
145     QString line = cur_line;
146     prepareLine(line, READS_POS);
147 
148     if (-1 != line.indexOf(' ')) {
149         return INT_MAX;
150     }
151 
152     line = line.mid(0, line.length());
153 
154     bool ok = true;
155     int result = line.toInt(&ok);
156     if (!ok) {
157         return INT_MAX;
158     } else {
159         return result;
160     }
161 }
162 #define COMPLEMENT_POS 2
readsComplement(const QString & cur_line)163 static int readsComplement(const QString &cur_line) {
164     QString line = cur_line;
165     prepareLine(line, COMPLEMENT_POS);
166 
167     if (line.startsWith("U")) {
168         return 0;
169     } else if (line.startsWith("C")) {
170         return 1;
171     } else {
172         return -1;
173     }
174 }
175 
getName(const QString & line)176 static QString getName(const QString &line) {
177     int curIdx = 0;
178     char space = ' ';
179 
180     QString name = line.simplified();
181 
182     curIdx = name.indexOf(space);
183     if (-1 == curIdx) {
184         return "";
185     }
186 
187     name = name.mid(curIdx + 1);
188 
189     curIdx = name.indexOf(space);
190     if (-1 == curIdx) {
191         return "";
192     }
193 
194     name = name.mid(0, curIdx);
195 
196     return name;
197 }
198 
checkSeq(const QByteArray & seq)199 static bool checkSeq(const QByteArray &seq) {
200     const DNAAlphabet *alphabet = AppContext::getDNAAlphabetRegistry()->findById(BaseDNAAlphabetIds::NUCL_DNA_EXTENDED());
201     for (int i = 0; i < seq.length(); i++) {
202         if (!(alphabet->contains(seq[i]) || seq[i] == '*')) {
203             return false;
204         }
205     }
206     return true;
207 }
208 
skipBreaks(U2::IOAdapter * io,U2OpStatus & ti,char * buff,qint64 * len)209 static inline void skipBreaks(U2::IOAdapter *io, U2OpStatus &ti, char *buff, qint64 *len) {
210     bool lineOk = true;
211     *len = io->readUntil(buff, DocumentFormat::READ_BUFF_SIZE, TextUtils::LINE_BREAKS, IOAdapter::Term_Include, &lineOk);
212     CHECK_EXT(!io->hasError(), ti.setError(io->errorString()), );
213     CHECK_EXT(*len != 0, ti.setError(ACEFormat::tr("Unexpected end of file")), );  // end if stream
214     CHECK_EXT(lineOk, ti.setError(ACEFormat::tr("Line is too long")), );
215 }
216 
parseConsensus(U2::IOAdapter * io,U2OpStatus & ti,char * buff,QString & consName,QSet<QString> & names,QString & headerLine,QByteArray & consensus)217 static inline void parseConsensus(U2::IOAdapter *io, U2OpStatus &ti, char *buff, QString &consName, QSet<QString> &names, QString &headerLine, QByteArray &consensus) {
218     char aceBStartChar = 'B';
219     QBitArray aceBStart = TextUtils::createBitMap(aceBStartChar);
220     qint64 len = 0;
221     bool ok = true;
222     QString line;
223     consName = getName(headerLine);
224     CHECK_EXT(!consName.isEmpty(), ti.setError(ACEFormat::tr("There is no AF note")), );
225     CHECK_EXT(!names.contains(consName), ti.setError(ACEFormat::tr("A name is duplicated")), );
226 
227     names.insert(consName);
228     consensus.clear();
229     do {
230         len = io->readUntil(buff, DocumentFormat::READ_BUFF_SIZE, aceBStart, IOAdapter::Term_Exclude, &ok);
231         CHECK_EXT(!io->hasError(), ti.setError(io->errorString()), );
232         CHECK_EXT(len > 0, ti.setError(ACEFormat::tr("No consensus")), );
233 
234         len = TextUtils::remove(buff, len, TextUtils::WHITES);
235         buff[len] = 0;
236         consensus.append(buff);
237         ti.setProgress(io->getProgress());
238     } while (!ti.isCoR() && !ok);
239 
240     len = io->readUntil(buff, DocumentFormat::READ_BUFF_SIZE, TextUtils::LINE_BREAKS, IOAdapter::Term_Include, &ok);
241     CHECK_EXT(!io->hasError(), ti.setError(io->errorString()), );
242 
243     line = QString(QByteArray(buff, len)).trimmed();
244     CHECK_EXT(line.startsWith("BQ"), ti.setError(ACEFormat::tr("BQ keyword hasn't been found")), );
245 
246     consensus = consensus.toUpper();
247     CHECK_EXT(checkSeq(consensus), ti.setError(ACEFormat::tr("Bad consensus data")), );
248 
249     consensus.replace('*', U2Msa::GAP_CHAR);
250 }
251 
parseAFTag(U2::IOAdapter * io,U2OpStatus & ti,char * buff,int count,QMap<QString,int> & posMap,QMap<QString,bool> & complMap,QSet<QString> & names)252 static inline void parseAFTag(U2::IOAdapter *io, U2OpStatus &ti, char *buff, int count, QMap<QString, int> &posMap, QMap<QString, bool> &complMap, QSet<QString> &names) {
253     int count1 = count;
254     QString readLine;
255     QString name;
256     qint64 len = 0;
257     while (!ti.isCoR() && count1 > 0) {
258         do {
259             skipBreaks(io, ti, buff, &len);
260             CHECK_OP(ti, );
261 
262             readLine = QString(QByteArray(buff, len)).trimmed();
263         } while (!readLine.startsWith("AF"));
264 
265         name = getName(readLine);
266         if (!readLine.startsWith("AF") || "" == name) {
267             ti.setError(ACEFormat::tr("There is no AF note"));
268             return;
269         }
270 
271         int readPos = readsPos(readLine);
272         int complStrand = readsComplement(readLine);
273         if ((INT_MAX == readPos) || (-1 == complStrand)) {
274             ti.setError(ACEFormat::tr("Bad AF note"));
275             return;
276         }
277 
278         int paddedStart = paddedStartCons(readLine);
279         CHECK_EXT(paddedStart != INT_MAX, ti.setError(ACEFormat::tr("Bad AF note")), );
280 
281         posMap.insert(name, paddedStart);
282         CHECK_EXT(!names.contains(name), ti.setError(ACEFormat::tr("A name is duplicated")), );
283 
284         bool cur_compl = (complStrand == 1);
285         complMap.insert(name, cur_compl);
286 
287         names.insert(name);
288 
289         count1--;
290         ti.setProgress(io->getProgress());
291     }
292 }
293 
parseRDandQATag(U2::IOAdapter * io,U2OpStatus & ti,char * buff,QSet<QString> & names,QString & name,QByteArray & sequence)294 static inline void parseRDandQATag(U2::IOAdapter *io, U2OpStatus &ti, char *buff, QSet<QString> &names, QString &name, QByteArray &sequence) {
295     QString line;
296     qint64 len = 0;
297     bool ok = true;
298     char aceQStartChar = 'Q';
299     QBitArray aceQStart = TextUtils::createBitMap(aceQStartChar);
300     do {
301         skipBreaks(io, ti, buff, &len);
302         CHECK_OP(ti, );
303 
304         line = QString(QByteArray(buff, len)).trimmed();
305     } while (!line.startsWith("RD"));
306 
307     name = getName(line);
308     if (!line.startsWith("RD") || "" == name) {
309         ti.setError(ACEFormat::tr("There is no read note"));
310         return;
311     }
312 
313     sequence.clear();
314     do {
315         len = io->readUntil(buff, DocumentFormat::READ_BUFF_SIZE, aceQStart, IOAdapter::Term_Exclude, &ok);
316         CHECK_EXT(!io->hasError(), ti.setError(io->errorString()), );
317         CHECK_EXT(len > 0, ti.setError(ACEFormat::tr("No sequence")), );
318 
319         len = TextUtils::remove(buff, len, TextUtils::WHITES);
320         buff[len] = 0;
321         sequence.append(buff);
322         ti.setProgress(io->getProgress());
323     } while (!ti.isCoR() && !ok);
324 
325     len = io->readUntil(buff, DocumentFormat::READ_BUFF_SIZE, TextUtils::LINE_BREAKS, IOAdapter::Term_Include, &ok);
326     CHECK_EXT(!io->hasError(), ti.setError(io->errorString()), );
327 
328     line = QString(QByteArray(buff, len)).trimmed();
329     CHECK_EXT(line.startsWith("QA"), ti.setError(ACEFormat::tr("QA keyword hasn't been found")), );
330 
331     int clearRangeStart = 0;
332     int clearRangeEnd = 0;
333 
334     clearRangeStart = readsCount(line);
335     CHECK_EXT(clearRangeStart != -1, ti.setError(ACEFormat::tr("QA error no clear range")), );
336 
337     clearRangeEnd = clearRange(line);
338     CHECK_EXT(clearRangeEnd != 0, ti.setError(ACEFormat::tr("QA error no clear range")), );
339 
340     len = sequence.length();
341     if (clearRangeStart > clearRangeEnd || clearRangeEnd > len) {
342         ti.setError(ACEFormat::tr("QA error bad range"));
343         return;
344     }
345 
346     sequence = sequence.toUpper();
347     CHECK_EXT(checkSeq(sequence), ti.setError(ACEFormat::tr("Bad sequence data")), );
348 
349     if (!names.contains(name)) {
350         ti.setError(ACEFormat::tr("A name is not match with AF names"));
351         return;
352     } else {
353         names.remove(name);
354     }
355 
356     sequence.replace('*', U2Msa::GAP_CHAR);
357     sequence.replace('N', U2Msa::GAP_CHAR);
358     sequence.replace('X', U2Msa::GAP_CHAR);
359 }
360 
361 /**
362  * Offsets in an ACE file are specified relatively to the reference sequence,
363  * so "pos" can be negative.
364  */
getSmallestOffset(const QMap<QString,int> & posMap)365 static inline int getSmallestOffset(const QMap<QString, int> &posMap) {
366     int smallestOffset = 0;
367     foreach (int value, posMap) {
368         smallestOffset = qMin(smallestOffset, value - 1);
369     }
370 
371     return smallestOffset;
372 }
373 
load(IOAdapter * io,const U2DbiRef & dbiRef,QList<GObject * > & objects,const QVariantMap & hints,U2OpStatus & os)374 void ACEFormat::load(IOAdapter *io, const U2DbiRef &dbiRef, QList<GObject *> &objects, const QVariantMap &hints, U2OpStatus &os) {
375     QByteArray readBuff(READ_BUFF_SIZE + 1, 0);
376     char *buff = readBuff.data();
377     qint64 len = 0;
378 
379     QByteArray sequence;
380     QSet<QString> names;
381     QMap<QString, bool> complMap;
382 
383     // skip leading whites if present
384     bool lineOk = true;
385     skipBreaks(io, os, buff, &len);
386     CHECK_OP(os, );
387 
388     QString headerLine = QString(QByteArray(buff, len)).trimmed();
389     CHECK_EXT(headerLine.startsWith(AS), os.setError(ACEFormat::tr("First line is not an ace header")), );
390 
391     int contigC = contigCount(headerLine);
392     CHECK_EXT(contigC != -1, os.setError(ACEFormat::tr("No contig count tag in the header line")), );
393 
394     for (int i = 0; i < contigC; i++) {
395         if (i == 0) {
396             QBitArray nonWhites = ~TextUtils::WHITES;
397             io->readUntil(buff, READ_BUFF_SIZE, nonWhites, IOAdapter::Term_Exclude, &lineOk);
398             CHECK_EXT(!io->hasError(), os.setError(io->errorString()), );
399 
400             // read header
401             skipBreaks(io, os, buff, &len);
402             CHECK_OP(os, );
403 
404             headerLine = QString(QByteArray(buff, len)).trimmed();
405             CHECK_EXT(headerLine.startsWith(CO), os.setError(ACEFormat::tr("Must be CO keyword")), );
406         } else {
407             do {
408                 skipBreaks(io, os, buff, &len);
409                 CHECK_OP(os, );
410 
411                 headerLine = QString(QByteArray(buff, len)).trimmed();
412             } while (!headerLine.startsWith(CO));
413         }
414         int count = readsCount(headerLine);
415         CHECK_EXT(count != -1, os.setError(ACEFormat::tr("There is no note about reads count")), );
416 
417         // consensus
418         QString name;
419         QByteArray consensus;
420         QString consName;
421 
422         parseConsensus(io, os, buff, consName, names, headerLine, consensus);
423         CHECK_OP(os, );
424 
425         MultipleSequenceAlignment al(consName);
426         al->addRow(consName, consensus);
427 
428         // AF
429         QMap<QString, int> posMap;
430         parseAFTag(io, os, buff, count, posMap, complMap, names);
431         CHECK_OP(os, );
432 
433         int smallestOffset = getSmallestOffset(posMap);
434         if (smallestOffset < 0) {
435             al->insertGaps(0, 0, qAbs(smallestOffset), os);
436             CHECK_OP(os, );
437         }
438 
439         // RD and QA
440         while (!os.isCoR() && count > 0) {
441             parseRDandQATag(io, os, buff, names, name, sequence);
442             CHECK_OP(os, );
443 
444             bool isComplement = complMap.take(name);
445             int pos = posMap.value(name) - 1;
446             if (smallestOffset < 0) {
447                 pos += qAbs(smallestOffset);
448             }
449             QString rowName(name);
450             if (isComplement) {
451                 rowName.append("(rev-compl)");
452             }
453 
454             QByteArray offsetGaps;
455             offsetGaps.fill(U2Msa::GAP_CHAR, pos);
456             sequence.prepend(offsetGaps);
457             al->addRow(rowName, sequence);
458 
459             count--;
460             os.setProgress(io->getProgress());
461         }
462         U2AlphabetUtils::assignAlphabet(al);
463         CHECK_EXT(al->getAlphabet() != nullptr, ACEFormat::tr("Alphabet unknown"), );
464 
465         const QString folder = hints.value(DBI_FOLDER_HINT, U2ObjectDbi::ROOT_FOLDER).toString();
466 
467         MultipleSequenceAlignmentObject *obj = MultipleSequenceAlignmentImporter::createAlignment(dbiRef, folder, al, os);
468         CHECK_OP(os, );
469         objects.append(obj);
470     }
471 }
472 
checkRawTextData(const QByteArray & rawData,const GUrl &) const473 FormatCheckResult ACEFormat::checkRawTextData(const QByteArray &rawData, const GUrl &) const {
474     static const char *formatTag = "AS";
475 
476     if (!rawData.startsWith(formatTag)) {
477         return FormatDetection_NotMatched;
478     }
479     return FormatDetection_AverageSimilarity;
480 }
481 
loadTextDocument(IOAdapter * io,const U2DbiRef & dbiRef,const QVariantMap & fs,U2OpStatus & os)482 Document *ACEFormat::loadTextDocument(IOAdapter *io, const U2DbiRef &dbiRef, const QVariantMap &fs, U2OpStatus &os) {
483     QList<GObject *> objs;
484     load(io, dbiRef, objs, fs, os);
485 
486     CHECK_OP_EXT(os, qDeleteAll(objs), nullptr);
487 
488     if (objs.isEmpty()) {
489         os.setError(ACEFormat::tr("File doesn't contain any msa objects"));
490         return nullptr;
491     }
492     Document *doc = new Document(this, io->getFactory(), io->getURL(), dbiRef, objs, fs);
493 
494     return doc;
495 }
496 
497 }  // namespace U2
498