1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "MSFFormat.h"
23 
24 #include <U2Core/IOAdapter.h>
25 #include <U2Core/IOAdapterTextStream.h>
26 #include <U2Core/MultipleSequenceAlignmentImporter.h>
27 #include <U2Core/MultipleSequenceAlignmentObject.h>
28 #include <U2Core/MultipleSequenceAlignmentWalker.h>
29 #include <U2Core/TextUtils.h>
30 #include <U2Core/U2AlphabetUtils.h>
31 #include <U2Core/U2ObjectDbi.h>
32 #include <U2Core/U2OpStatus.h>
33 #include <U2Core/U2OpStatusUtils.h>
34 #include <U2Core/U2SafePoints.h>
35 
36 namespace U2 {
37 
38 const int MSFFormat::CHECK_SUM_MOD = 10000;
39 const QByteArray MSFFormat::MSF_FIELD = "MSF:";
40 const QByteArray MSFFormat::CHECK_FIELD = "Check:";
41 const QByteArray MSFFormat::LEN_FIELD = "Len:";
42 const QByteArray MSFFormat::NAME_FIELD = "Name:";
43 const QByteArray MSFFormat::TYPE_FIELD = "Type:";
44 const QByteArray MSFFormat::WEIGHT_FIELD = "Weight:";
45 const QByteArray MSFFormat::TYPE_VALUE_PROTEIN = "P";
46 const QByteArray MSFFormat::TYPE_VALUE_NUCLEIC = "N";
47 const double MSFFormat::WEIGHT_VALUE = 1.0;
48 const QByteArray MSFFormat::END_OF_HEADER_LINE = "..";
49 const QByteArray MSFFormat::SECTION_SEPARATOR = "//";
50 const int MSFFormat::CHARS_IN_ROW = 50;
51 const int MSFFormat::CHARS_IN_WORD = 10;
52 
53 // TODO: recheck if it does support streaming! Fix isObjectOpSupported if not!
54 
MSFFormat(QObject * parent)55 MSFFormat::MSFFormat(QObject *parent)
56     : TextDocumentFormat(parent, BaseDocumentFormats::MSF, DocumentFormatFlags(DocumentFormatFlag_SupportWriting) | DocumentFormatFlag_OnlyOneObject, QStringList("msf")) {
57     formatName = tr("MSF");
58     supportedObjectTypes += GObjectTypes::MULTIPLE_SEQUENCE_ALIGNMENT;
59     formatDescription = tr("MSF format is used to store multiple aligned sequences. Files include the sequence name and the sequence itself, which is usually aligned with other sequences in the file.");
60 }
61 
62 /** Parses MSF field value from the line. Returns empty line if the field was not found. */
parseField(const QString & line,const QString & name)63 static QString parseField(const QString &line, const QString &name) {
64     int p = line.indexOf(name);
65     if (p >= 0) {
66         p += name.length();
67         if (line[p] == ' ') {
68             ++p;
69         }
70         int q = line.indexOf(' ', p);
71         return q >= 0 ? line.mid(p, q - p) : line.mid(p);
72     }
73     return "";
74 }
75 
getCheckSum(const QByteArray & seq)76 int MSFFormat::getCheckSum(const QByteArray &seq) {
77     int sum = 0;
78     static int CHECK_SUM_COUNTER_MOD = 57;
79     for (int i = 0; i < seq.length(); ++i) {
80         char ch = seq[i];
81         if (ch >= 'a' && ch <= 'z') {
82             ch = (char)(ch + 'A' - 'a');
83         }
84         sum = (sum + ((i % CHECK_SUM_COUNTER_MOD) + 1) * ch) % MSFFormat::CHECK_SUM_MOD;
85     }
86     return sum;
87 }
88 
89 struct MsfRow {
MsfRowU2::MsfRow90     MsfRow()
91         : checksum(0), length(0) {
92     }
93 
94     QString name;
95     int checksum;
96     int length;
97 };
98 
load(IOAdapterReader & reader,const U2DbiRef & dbiRef,QList<GObject * > & objects,const QVariantMap & hints,U2OpStatus & os)99 void MSFFormat::load(IOAdapterReader &reader, const U2DbiRef &dbiRef, QList<GObject *> &objects, const QVariantMap &hints, U2OpStatus &os) {
100     QString objName = reader.getURL().baseFileName();
101     MultipleSequenceAlignment al(objName);
102     int lineNumber = 0;  // Current line number from the object start. Used for error reporing.
103 
104     // Skip comments.
105     int checkSum = -1;
106     while (!os.isCoR() && checkSum < 0 && !reader.atEnd()) {
107         QString line = reader.readLine(os, DocumentFormat::READ_BUFF_SIZE).simplified();
108         lineNumber++;
109         CHECK_OP(os, );
110         if (line.endsWith(END_OF_HEADER_LINE)) {
111             bool ok;
112             checkSum = parseField(line, CHECK_FIELD).toInt(&ok);
113             if (!ok || checkSum < 0) {
114                 checkSum = CHECK_SUM_MOD;
115             }
116         }
117         os.setProgress(reader.getProgress());
118     }
119 
120     // Read MSF structure.
121     int sum = 0;
122     QList<MsfRow> msfRows;
123 
124     while (!os.isCoR() && !reader.atEnd()) {
125         QString line = reader.readLine(os, DocumentFormat::READ_BUFF_SIZE).simplified();
126         lineNumber++;
127         CHECK_OP(os, );
128         if (line.startsWith(SECTION_SEPARATOR)) {
129             break;
130         }
131 
132         bool ok = false;
133         QString name = parseField(line, NAME_FIELD);
134         if (name.isEmpty()) {
135             continue;
136         }
137         int check = parseField(line, CHECK_FIELD).toInt(&ok);
138         if (!ok || check < 0) {
139             sum = check = CHECK_SUM_MOD;
140         }
141 
142         MsfRow row;
143         row.name = name;
144         row.checksum = check;
145         msfRows << row;
146         al->addRow(name, QByteArray());
147         if (sum < CHECK_SUM_MOD) {
148             sum = (sum + check) % CHECK_SUM_MOD;
149         }
150 
151         os.setProgress(reader.getProgress());
152     }
153     if (checkSum < CHECK_SUM_MOD && sum < CHECK_SUM_MOD && sum != checkSum) {
154         coreLog.info(tr("File check sum is incorrect: expected value: %1, current value %2").arg(checkSum).arg(sum));
155     }
156 
157     // Read sequences.
158     QRegExp coordsRegexp("^\\d+(\\s+\\d+)?$");
159     int maRowIndex = 0;
160     bool prevLineIsEmpty = false;
161     while (!os.isCoR() && !reader.atEnd()) {
162         QString line = reader.readLine(os, DocumentFormat::READ_BUFF_SIZE).trimmed();
163         CHECK_OP(os, )
164         lineNumber++;
165 
166         // Skip empty lines and lines with coordinates.
167         // 2 empty lines in a row or a line with coordinates make a new block: this way we support both
168         // MSFs with block coordinates and without (blocks separated by 2-empty lines only).
169         bool isCoordsRegexMatched = coordsRegexp.indexIn(line) != -1;
170         if (line.isEmpty() || isCoordsRegexMatched) {
171             if (isCoordsRegexMatched || prevLineIsEmpty) {
172                 maRowIndex = 0;
173             }
174             prevLineIsEmpty = line.isEmpty();
175             continue;
176         }
177         CHECK_EXT(maRowIndex < msfRows.length(), os.setError(tr("MSF: too many rows in the block, line: %1").arg(QString::number(lineNumber))), );
178 
179         int nameAndValueSeparatorIndex = line.indexOf(" ");
180         CHECK_EXT(nameAndValueSeparatorIndex >= 0, os.setError(tr("MSF: can't find name and value separator spacing, line: %1").arg(QString::number(lineNumber))), );
181 
182         QString name = line.mid(0, nameAndValueSeparatorIndex);
183         CHECK_EXT(name == msfRows[maRowIndex].name,
184                   os.setError(tr("MSF: row names do not match: %1 vs %2, line: %3").arg(msfRows[maRowIndex].name, name, QString::number(lineNumber))), );
185 
186         QByteArray value = line.mid(nameAndValueSeparatorIndex + 1).simplified().replace(" ", "").toLatin1();
187         al->appendChars(maRowIndex, msfRows[maRowIndex].length, value.constData(), value.length());
188         msfRows[maRowIndex].length += value.length();
189         maRowIndex++;
190         os.setProgress(reader.getProgress());
191     }
192 
193     // checksum
194     U2OpStatus2Log seqCheckOs;
195     const int numRows = al->getNumRows();
196     for (int i = 0; i < numRows; i++) {
197         const MultipleSequenceAlignmentRow row = al->getMsaRow(i);
198         const int expectedCheckSum = msfRows[i].checksum;
199         const int sequenceCheckSum = getCheckSum(row->toByteArray(seqCheckOs, al->getLength()));
200         if (expectedCheckSum < CHECK_SUM_MOD && sequenceCheckSum != expectedCheckSum) {
201             coreLog.info(tr("Unexpected check sum in the row number %1, name: %2; expected value: %3, current value %4").arg(i + 1).arg(row->getName()).arg(expectedCheckSum).arg(sequenceCheckSum));
202         }
203         al->replaceChars(i, '.', U2Msa::GAP_CHAR);
204         al->replaceChars(i, '~', U2Msa::GAP_CHAR);
205     }
206 
207     U2AlphabetUtils::assignAlphabet(al);
208     CHECK_EXT(al->getAlphabet() != nullptr, os.setError(MSFFormat::tr("Alphabet unknown")), );
209 
210     QString folder = hints.value(DBI_FOLDER_HINT, U2ObjectDbi::ROOT_FOLDER).toString();
211     MultipleSequenceAlignmentObject *obj = MultipleSequenceAlignmentImporter::createAlignment(dbiRef, folder, al, os);
212     CHECK_OP(os, );
213     objects.append(obj);
214 }
215 
loadTextDocument(IOAdapterReader & reader,const U2DbiRef & dbiRef,const QVariantMap & fs,U2OpStatus & os)216 Document *MSFFormat::loadTextDocument(IOAdapterReader &reader, const U2DbiRef &dbiRef, const QVariantMap &fs, U2OpStatus &os) {
217     QList<GObject *> objs;
218     load(reader, dbiRef, objs, fs, os);
219 
220     CHECK_OP_EXT(os, qDeleteAll(objs), nullptr);
221     return new Document(this, reader.getFactory(), reader.getURL(), dbiRef, objs, fs);
222 }
223 
storeTextDocument(IOAdapterWriter & writer,Document * document,U2OpStatus & os)224 void MSFFormat::storeTextDocument(IOAdapterWriter &writer, Document *document, U2OpStatus &os) {
225     CHECK_OP(os, );
226     QMap<GObjectType, QList<GObject *>> objectsMap;
227     objectsMap[GObjectTypes::MULTIPLE_SEQUENCE_ALIGNMENT] = document->getObjects();
228     storeTextEntry(writer, objectsMap, os);
229 }
230 
231 static const QString SUFFIX_SEPARATOR = "_";
232 
splitCompleteName(const QString & completeName,QString & baseName,QString & suffix)233 static void splitCompleteName(const QString &completeName, QString &baseName, QString &suffix) {
234     const int separatorIndex = completeName.lastIndexOf(SUFFIX_SEPARATOR);
235     if (separatorIndex == -1) {
236         baseName = completeName;
237         suffix = QString();
238         return;
239     }
240 
241     suffix = completeName.mid(separatorIndex + 1);
242     bool ok = false;
243     suffix.toInt(&ok);
244     if (!ok) {
245         baseName = completeName;
246         suffix = QString();
247     } else {
248         baseName = completeName.left(separatorIndex);
249     }
250 }
251 
increaseSuffix(const QString & completeName)252 static QString increaseSuffix(const QString &completeName) {
253     QString baseName;
254     QString suffix;
255     splitCompleteName(completeName, baseName, suffix);
256     if (suffix.isEmpty()) {
257         return completeName + SUFFIX_SEPARATOR + QString::number(1);
258     }
259     return baseName + SUFFIX_SEPARATOR + QString("%1").arg(suffix.toInt() + 1, suffix.length(), 10, QChar('0'));
260 }
261 
rollRowName(const QString & rowName,const QList<QString> & nonUniqueNames)262 static QString rollRowName(const QString &rowName, const QList<QString> &nonUniqueNames) {
263     QString resultName = rowName;
264     while (nonUniqueNames.contains(resultName)) {
265         resultName = increaseSuffix(resultName);
266     }
267     return resultName;
268 }
269 
storeTextEntry(IOAdapterWriter & writer,const QMap<GObjectType,QList<GObject * >> & objectsMap,U2OpStatus & os)270 void MSFFormat::storeTextEntry(IOAdapterWriter &writer, const QMap<GObjectType, QList<GObject *>> &objectsMap, U2OpStatus &os) {
271     SAFE_POINT(objectsMap.contains(GObjectTypes::MULTIPLE_SEQUENCE_ALIGNMENT), "MSF entry storing: no alignment", );
272     const QList<GObject *> &objectList = objectsMap[GObjectTypes::MULTIPLE_SEQUENCE_ALIGNMENT];
273     SAFE_POINT(objectList.size() == 1, "MSFFormat::storeTextEntry can store only 1 object per file. Got: " + QString::number(objectList.size()), );
274 
275     auto *obj = dynamic_cast<MultipleSequenceAlignmentObject *>(objectList.first());
276     SAFE_POINT(obj != nullptr, "MSF entry storing: the object is not an alignment", );
277 
278     const MultipleSequenceAlignment &msa = obj->getMultipleAlignment();
279 
280     // Make row names unique
281     QMap<qint64, QString> uniqueRowNames;
282     int maxNameLen = 0;
283     foreach (const MultipleSequenceAlignmentRow &row, msa->getMsaRows()) {
284         QString rolledRowName = rollRowName(row->getName().replace(' ', '_'), uniqueRowNames.values());
285         uniqueRowNames.insert(row->getRowId(), rolledRowName);
286         maxNameLen = qMax(maxNameLen, uniqueRowNames.last().length());
287     }
288 
289     // Precalculate sequence writing params.
290     int maLen = msa->getLength();
291     int checkSum = 0;
292     QMap<qint64, int> checkSums;
293     foreach (const MultipleSequenceAlignmentRow &row, msa->getMsaRows()) {
294         QByteArray sequence = row->toByteArray(os, maLen).replace(U2Msa::GAP_CHAR, '.');
295         int seqCheckSum = getCheckSum(sequence);
296         checkSums.insert(row->getRowId(), seqCheckSum);
297         checkSum = (checkSum + seqCheckSum) % CHECK_SUM_MOD;
298     }
299     int maxLengthLen = QString::number(maLen).length();
300 
301     // Write the first line.
302     QString line = "  " + MSF_FIELD;
303     line += " " + QString::number(maLen);
304     line += "  " + TYPE_FIELD;
305     line += " " + (obj->getAlphabet()->isAmino() ? TYPE_VALUE_PROTEIN : TYPE_VALUE_NUCLEIC);
306     line += "  " + QDateTime::currentDateTime().toString("dd.MM.yyyy hh:mm");
307     line += "  " + CHECK_FIELD;
308     line += " " + QString::number(checkSum);
309     line += "  " + END_OF_HEADER_LINE + "\n\n";
310     writer.write(os, line);
311     CHECK_OP(os, );
312 
313     // Write per-row info.
314     int maxCheckSumLen = 4;
315     foreach (const MultipleSequenceAlignmentRow &row, msa->getMsaRows()) {
316         line = " " + NAME_FIELD;
317         line += " " + uniqueRowNames[row->getRowId()].leftJustified(maxNameLen + 1);
318         line += "  " + LEN_FIELD;
319         line += " " + QString("%1").arg(maLen, -maxLengthLen);
320         line += "  " + CHECK_FIELD;
321         line += " " + QString("%1").arg(checkSums[row->getRowId()], -maxCheckSumLen);
322         line += "  " + WEIGHT_FIELD;
323         line += " " + QByteArray::number(WEIGHT_VALUE) + "\n";
324         writer.write(os, line);
325         CHECK_OP(os, );
326     }
327 
328     writer.write(os, "\n" + SECTION_SEPARATOR + "\n\n");
329     CHECK_OP(os, );
330 
331     MultipleSequenceAlignmentWalker walker(msa, '.');
332     for (int i = 0; !os.isCoR() && i < maLen; i += CHARS_IN_ROW) {
333         /* write numbers */ {
334             line = QString(maxNameLen + 2, ' ');
335             QString t = QString("%1").arg(i + 1);
336             QString s = QString("%1").arg(i + CHARS_IN_ROW < maLen ? i + CHARS_IN_ROW : maLen);
337             int r = maLen - i < CHARS_IN_ROW ? maLen % CHARS_IN_ROW : CHARS_IN_ROW;
338             r += (r - 1) / CHARS_IN_WORD - (t.length() + s.length());
339             line += t;
340             if (r > 0) {
341                 line += QString(r, ' ');
342                 line += s;
343             }
344             line += '\n';
345             writer.write(os, line);
346             CHECK_OP(os, );
347         }
348 
349         // write sequence
350         QList<QByteArray> sequenceList = walker.nextData(CHARS_IN_ROW, os);
351         CHECK_OP(os, );
352         QList<QByteArray>::ConstIterator si = sequenceList.constBegin();
353         QList<MultipleSequenceAlignmentRow> msaRowList = msa->getMsaRows();
354         QList<MultipleSequenceAlignmentRow>::ConstIterator ri = msaRowList.constBegin();
355         for (; si != sequenceList.constEnd(); si++, ri++) {
356             const MultipleSequenceAlignmentRow &row = *ri;
357             QString rowName = uniqueRowNames[row->getRowId()].leftJustified(maxNameLen + 1);
358             for (int j = 0; j < CHARS_IN_ROW && i + j < maLen; j += CHARS_IN_WORD) {
359                 rowName += ' ';
360                 int nChars = qMin(CHARS_IN_WORD, maLen - (i + j));
361                 QString sequencePart = si->mid(j, nChars);
362                 rowName += sequencePart;
363             }
364             rowName += '\n';
365             writer.write(os, rowName);
366             CHECK_OP(os, );
367         }
368         writer.write(os, "\n");
369         CHECK_OP(os, );
370     }
371 }
372 
checkRawTextData(const QString & dataPrefix,const GUrl &) const373 FormatCheckResult MSFFormat::checkRawTextData(const QString &dataPrefix, const GUrl &) const {
374     if (dataPrefix.contains("MSF:") ||
375         dataPrefix.contains("!!AA_MULTIPLE_ALIGNMENT 1.0") ||
376         dataPrefix.contains("!!NA_MULTIPLE_ALIGNMENT 1.0") ||
377         (dataPrefix.contains("Name:") && dataPrefix.contains("Len:") &&
378          dataPrefix.contains("Check:") && dataPrefix.contains("Weight:"))) {
379         return FormatDetection_VeryHighSimilarity;
380     }
381 
382     if (dataPrefix.contains("GDC ")) {
383         return FormatDetection_AverageSimilarity;
384     }
385 
386     // MSF documents may contain unlimited number of comment lines in header ->
387     // it is impossible to determine if file has MSF format by some predefined
388     // amount of raw data read from it.
389     if (dataPrefix.contains("GCG ") || dataPrefix.contains("MSF ")) {
390         return FormatDetection_LowSimilarity;
391     }
392     return FormatDetection_NotMatched;
393 }
394 
395 }  // namespace U2
396