1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "MSFFormat.h"
23
24 #include <U2Core/IOAdapter.h>
25 #include <U2Core/IOAdapterTextStream.h>
26 #include <U2Core/MultipleSequenceAlignmentImporter.h>
27 #include <U2Core/MultipleSequenceAlignmentObject.h>
28 #include <U2Core/MultipleSequenceAlignmentWalker.h>
29 #include <U2Core/TextUtils.h>
30 #include <U2Core/U2AlphabetUtils.h>
31 #include <U2Core/U2ObjectDbi.h>
32 #include <U2Core/U2OpStatus.h>
33 #include <U2Core/U2OpStatusUtils.h>
34 #include <U2Core/U2SafePoints.h>
35
36 namespace U2 {
37
38 const int MSFFormat::CHECK_SUM_MOD = 10000;
39 const QByteArray MSFFormat::MSF_FIELD = "MSF:";
40 const QByteArray MSFFormat::CHECK_FIELD = "Check:";
41 const QByteArray MSFFormat::LEN_FIELD = "Len:";
42 const QByteArray MSFFormat::NAME_FIELD = "Name:";
43 const QByteArray MSFFormat::TYPE_FIELD = "Type:";
44 const QByteArray MSFFormat::WEIGHT_FIELD = "Weight:";
45 const QByteArray MSFFormat::TYPE_VALUE_PROTEIN = "P";
46 const QByteArray MSFFormat::TYPE_VALUE_NUCLEIC = "N";
47 const double MSFFormat::WEIGHT_VALUE = 1.0;
48 const QByteArray MSFFormat::END_OF_HEADER_LINE = "..";
49 const QByteArray MSFFormat::SECTION_SEPARATOR = "//";
50 const int MSFFormat::CHARS_IN_ROW = 50;
51 const int MSFFormat::CHARS_IN_WORD = 10;
52
53 // TODO: recheck if it does support streaming! Fix isObjectOpSupported if not!
54
MSFFormat(QObject * parent)55 MSFFormat::MSFFormat(QObject *parent)
56 : TextDocumentFormat(parent, BaseDocumentFormats::MSF, DocumentFormatFlags(DocumentFormatFlag_SupportWriting) | DocumentFormatFlag_OnlyOneObject, QStringList("msf")) {
57 formatName = tr("MSF");
58 supportedObjectTypes += GObjectTypes::MULTIPLE_SEQUENCE_ALIGNMENT;
59 formatDescription = tr("MSF format is used to store multiple aligned sequences. Files include the sequence name and the sequence itself, which is usually aligned with other sequences in the file.");
60 }
61
62 /** Parses MSF field value from the line. Returns empty line if the field was not found. */
parseField(const QString & line,const QString & name)63 static QString parseField(const QString &line, const QString &name) {
64 int p = line.indexOf(name);
65 if (p >= 0) {
66 p += name.length();
67 if (line[p] == ' ') {
68 ++p;
69 }
70 int q = line.indexOf(' ', p);
71 return q >= 0 ? line.mid(p, q - p) : line.mid(p);
72 }
73 return "";
74 }
75
getCheckSum(const QByteArray & seq)76 int MSFFormat::getCheckSum(const QByteArray &seq) {
77 int sum = 0;
78 static int CHECK_SUM_COUNTER_MOD = 57;
79 for (int i = 0; i < seq.length(); ++i) {
80 char ch = seq[i];
81 if (ch >= 'a' && ch <= 'z') {
82 ch = (char)(ch + 'A' - 'a');
83 }
84 sum = (sum + ((i % CHECK_SUM_COUNTER_MOD) + 1) * ch) % MSFFormat::CHECK_SUM_MOD;
85 }
86 return sum;
87 }
88
89 struct MsfRow {
MsfRowU2::MsfRow90 MsfRow()
91 : checksum(0), length(0) {
92 }
93
94 QString name;
95 int checksum;
96 int length;
97 };
98
load(IOAdapterReader & reader,const U2DbiRef & dbiRef,QList<GObject * > & objects,const QVariantMap & hints,U2OpStatus & os)99 void MSFFormat::load(IOAdapterReader &reader, const U2DbiRef &dbiRef, QList<GObject *> &objects, const QVariantMap &hints, U2OpStatus &os) {
100 QString objName = reader.getURL().baseFileName();
101 MultipleSequenceAlignment al(objName);
102 int lineNumber = 0; // Current line number from the object start. Used for error reporing.
103
104 // Skip comments.
105 int checkSum = -1;
106 while (!os.isCoR() && checkSum < 0 && !reader.atEnd()) {
107 QString line = reader.readLine(os, DocumentFormat::READ_BUFF_SIZE).simplified();
108 lineNumber++;
109 CHECK_OP(os, );
110 if (line.endsWith(END_OF_HEADER_LINE)) {
111 bool ok;
112 checkSum = parseField(line, CHECK_FIELD).toInt(&ok);
113 if (!ok || checkSum < 0) {
114 checkSum = CHECK_SUM_MOD;
115 }
116 }
117 os.setProgress(reader.getProgress());
118 }
119
120 // Read MSF structure.
121 int sum = 0;
122 QList<MsfRow> msfRows;
123
124 while (!os.isCoR() && !reader.atEnd()) {
125 QString line = reader.readLine(os, DocumentFormat::READ_BUFF_SIZE).simplified();
126 lineNumber++;
127 CHECK_OP(os, );
128 if (line.startsWith(SECTION_SEPARATOR)) {
129 break;
130 }
131
132 bool ok = false;
133 QString name = parseField(line, NAME_FIELD);
134 if (name.isEmpty()) {
135 continue;
136 }
137 int check = parseField(line, CHECK_FIELD).toInt(&ok);
138 if (!ok || check < 0) {
139 sum = check = CHECK_SUM_MOD;
140 }
141
142 MsfRow row;
143 row.name = name;
144 row.checksum = check;
145 msfRows << row;
146 al->addRow(name, QByteArray());
147 if (sum < CHECK_SUM_MOD) {
148 sum = (sum + check) % CHECK_SUM_MOD;
149 }
150
151 os.setProgress(reader.getProgress());
152 }
153 if (checkSum < CHECK_SUM_MOD && sum < CHECK_SUM_MOD && sum != checkSum) {
154 coreLog.info(tr("File check sum is incorrect: expected value: %1, current value %2").arg(checkSum).arg(sum));
155 }
156
157 // Read sequences.
158 QRegExp coordsRegexp("^\\d+(\\s+\\d+)?$");
159 int maRowIndex = 0;
160 bool prevLineIsEmpty = false;
161 while (!os.isCoR() && !reader.atEnd()) {
162 QString line = reader.readLine(os, DocumentFormat::READ_BUFF_SIZE).trimmed();
163 CHECK_OP(os, )
164 lineNumber++;
165
166 // Skip empty lines and lines with coordinates.
167 // 2 empty lines in a row or a line with coordinates make a new block: this way we support both
168 // MSFs with block coordinates and without (blocks separated by 2-empty lines only).
169 bool isCoordsRegexMatched = coordsRegexp.indexIn(line) != -1;
170 if (line.isEmpty() || isCoordsRegexMatched) {
171 if (isCoordsRegexMatched || prevLineIsEmpty) {
172 maRowIndex = 0;
173 }
174 prevLineIsEmpty = line.isEmpty();
175 continue;
176 }
177 CHECK_EXT(maRowIndex < msfRows.length(), os.setError(tr("MSF: too many rows in the block, line: %1").arg(QString::number(lineNumber))), );
178
179 int nameAndValueSeparatorIndex = line.indexOf(" ");
180 CHECK_EXT(nameAndValueSeparatorIndex >= 0, os.setError(tr("MSF: can't find name and value separator spacing, line: %1").arg(QString::number(lineNumber))), );
181
182 QString name = line.mid(0, nameAndValueSeparatorIndex);
183 CHECK_EXT(name == msfRows[maRowIndex].name,
184 os.setError(tr("MSF: row names do not match: %1 vs %2, line: %3").arg(msfRows[maRowIndex].name, name, QString::number(lineNumber))), );
185
186 QByteArray value = line.mid(nameAndValueSeparatorIndex + 1).simplified().replace(" ", "").toLatin1();
187 al->appendChars(maRowIndex, msfRows[maRowIndex].length, value.constData(), value.length());
188 msfRows[maRowIndex].length += value.length();
189 maRowIndex++;
190 os.setProgress(reader.getProgress());
191 }
192
193 // checksum
194 U2OpStatus2Log seqCheckOs;
195 const int numRows = al->getNumRows();
196 for (int i = 0; i < numRows; i++) {
197 const MultipleSequenceAlignmentRow row = al->getMsaRow(i);
198 const int expectedCheckSum = msfRows[i].checksum;
199 const int sequenceCheckSum = getCheckSum(row->toByteArray(seqCheckOs, al->getLength()));
200 if (expectedCheckSum < CHECK_SUM_MOD && sequenceCheckSum != expectedCheckSum) {
201 coreLog.info(tr("Unexpected check sum in the row number %1, name: %2; expected value: %3, current value %4").arg(i + 1).arg(row->getName()).arg(expectedCheckSum).arg(sequenceCheckSum));
202 }
203 al->replaceChars(i, '.', U2Msa::GAP_CHAR);
204 al->replaceChars(i, '~', U2Msa::GAP_CHAR);
205 }
206
207 U2AlphabetUtils::assignAlphabet(al);
208 CHECK_EXT(al->getAlphabet() != nullptr, os.setError(MSFFormat::tr("Alphabet unknown")), );
209
210 QString folder = hints.value(DBI_FOLDER_HINT, U2ObjectDbi::ROOT_FOLDER).toString();
211 MultipleSequenceAlignmentObject *obj = MultipleSequenceAlignmentImporter::createAlignment(dbiRef, folder, al, os);
212 CHECK_OP(os, );
213 objects.append(obj);
214 }
215
loadTextDocument(IOAdapterReader & reader,const U2DbiRef & dbiRef,const QVariantMap & fs,U2OpStatus & os)216 Document *MSFFormat::loadTextDocument(IOAdapterReader &reader, const U2DbiRef &dbiRef, const QVariantMap &fs, U2OpStatus &os) {
217 QList<GObject *> objs;
218 load(reader, dbiRef, objs, fs, os);
219
220 CHECK_OP_EXT(os, qDeleteAll(objs), nullptr);
221 return new Document(this, reader.getFactory(), reader.getURL(), dbiRef, objs, fs);
222 }
223
storeTextDocument(IOAdapterWriter & writer,Document * document,U2OpStatus & os)224 void MSFFormat::storeTextDocument(IOAdapterWriter &writer, Document *document, U2OpStatus &os) {
225 CHECK_OP(os, );
226 QMap<GObjectType, QList<GObject *>> objectsMap;
227 objectsMap[GObjectTypes::MULTIPLE_SEQUENCE_ALIGNMENT] = document->getObjects();
228 storeTextEntry(writer, objectsMap, os);
229 }
230
231 static const QString SUFFIX_SEPARATOR = "_";
232
splitCompleteName(const QString & completeName,QString & baseName,QString & suffix)233 static void splitCompleteName(const QString &completeName, QString &baseName, QString &suffix) {
234 const int separatorIndex = completeName.lastIndexOf(SUFFIX_SEPARATOR);
235 if (separatorIndex == -1) {
236 baseName = completeName;
237 suffix = QString();
238 return;
239 }
240
241 suffix = completeName.mid(separatorIndex + 1);
242 bool ok = false;
243 suffix.toInt(&ok);
244 if (!ok) {
245 baseName = completeName;
246 suffix = QString();
247 } else {
248 baseName = completeName.left(separatorIndex);
249 }
250 }
251
increaseSuffix(const QString & completeName)252 static QString increaseSuffix(const QString &completeName) {
253 QString baseName;
254 QString suffix;
255 splitCompleteName(completeName, baseName, suffix);
256 if (suffix.isEmpty()) {
257 return completeName + SUFFIX_SEPARATOR + QString::number(1);
258 }
259 return baseName + SUFFIX_SEPARATOR + QString("%1").arg(suffix.toInt() + 1, suffix.length(), 10, QChar('0'));
260 }
261
rollRowName(const QString & rowName,const QList<QString> & nonUniqueNames)262 static QString rollRowName(const QString &rowName, const QList<QString> &nonUniqueNames) {
263 QString resultName = rowName;
264 while (nonUniqueNames.contains(resultName)) {
265 resultName = increaseSuffix(resultName);
266 }
267 return resultName;
268 }
269
storeTextEntry(IOAdapterWriter & writer,const QMap<GObjectType,QList<GObject * >> & objectsMap,U2OpStatus & os)270 void MSFFormat::storeTextEntry(IOAdapterWriter &writer, const QMap<GObjectType, QList<GObject *>> &objectsMap, U2OpStatus &os) {
271 SAFE_POINT(objectsMap.contains(GObjectTypes::MULTIPLE_SEQUENCE_ALIGNMENT), "MSF entry storing: no alignment", );
272 const QList<GObject *> &objectList = objectsMap[GObjectTypes::MULTIPLE_SEQUENCE_ALIGNMENT];
273 SAFE_POINT(objectList.size() == 1, "MSFFormat::storeTextEntry can store only 1 object per file. Got: " + QString::number(objectList.size()), );
274
275 auto *obj = dynamic_cast<MultipleSequenceAlignmentObject *>(objectList.first());
276 SAFE_POINT(obj != nullptr, "MSF entry storing: the object is not an alignment", );
277
278 const MultipleSequenceAlignment &msa = obj->getMultipleAlignment();
279
280 // Make row names unique
281 QMap<qint64, QString> uniqueRowNames;
282 int maxNameLen = 0;
283 foreach (const MultipleSequenceAlignmentRow &row, msa->getMsaRows()) {
284 QString rolledRowName = rollRowName(row->getName().replace(' ', '_'), uniqueRowNames.values());
285 uniqueRowNames.insert(row->getRowId(), rolledRowName);
286 maxNameLen = qMax(maxNameLen, uniqueRowNames.last().length());
287 }
288
289 // Precalculate sequence writing params.
290 int maLen = msa->getLength();
291 int checkSum = 0;
292 QMap<qint64, int> checkSums;
293 foreach (const MultipleSequenceAlignmentRow &row, msa->getMsaRows()) {
294 QByteArray sequence = row->toByteArray(os, maLen).replace(U2Msa::GAP_CHAR, '.');
295 int seqCheckSum = getCheckSum(sequence);
296 checkSums.insert(row->getRowId(), seqCheckSum);
297 checkSum = (checkSum + seqCheckSum) % CHECK_SUM_MOD;
298 }
299 int maxLengthLen = QString::number(maLen).length();
300
301 // Write the first line.
302 QString line = " " + MSF_FIELD;
303 line += " " + QString::number(maLen);
304 line += " " + TYPE_FIELD;
305 line += " " + (obj->getAlphabet()->isAmino() ? TYPE_VALUE_PROTEIN : TYPE_VALUE_NUCLEIC);
306 line += " " + QDateTime::currentDateTime().toString("dd.MM.yyyy hh:mm");
307 line += " " + CHECK_FIELD;
308 line += " " + QString::number(checkSum);
309 line += " " + END_OF_HEADER_LINE + "\n\n";
310 writer.write(os, line);
311 CHECK_OP(os, );
312
313 // Write per-row info.
314 int maxCheckSumLen = 4;
315 foreach (const MultipleSequenceAlignmentRow &row, msa->getMsaRows()) {
316 line = " " + NAME_FIELD;
317 line += " " + uniqueRowNames[row->getRowId()].leftJustified(maxNameLen + 1);
318 line += " " + LEN_FIELD;
319 line += " " + QString("%1").arg(maLen, -maxLengthLen);
320 line += " " + CHECK_FIELD;
321 line += " " + QString("%1").arg(checkSums[row->getRowId()], -maxCheckSumLen);
322 line += " " + WEIGHT_FIELD;
323 line += " " + QByteArray::number(WEIGHT_VALUE) + "\n";
324 writer.write(os, line);
325 CHECK_OP(os, );
326 }
327
328 writer.write(os, "\n" + SECTION_SEPARATOR + "\n\n");
329 CHECK_OP(os, );
330
331 MultipleSequenceAlignmentWalker walker(msa, '.');
332 for (int i = 0; !os.isCoR() && i < maLen; i += CHARS_IN_ROW) {
333 /* write numbers */ {
334 line = QString(maxNameLen + 2, ' ');
335 QString t = QString("%1").arg(i + 1);
336 QString s = QString("%1").arg(i + CHARS_IN_ROW < maLen ? i + CHARS_IN_ROW : maLen);
337 int r = maLen - i < CHARS_IN_ROW ? maLen % CHARS_IN_ROW : CHARS_IN_ROW;
338 r += (r - 1) / CHARS_IN_WORD - (t.length() + s.length());
339 line += t;
340 if (r > 0) {
341 line += QString(r, ' ');
342 line += s;
343 }
344 line += '\n';
345 writer.write(os, line);
346 CHECK_OP(os, );
347 }
348
349 // write sequence
350 QList<QByteArray> sequenceList = walker.nextData(CHARS_IN_ROW, os);
351 CHECK_OP(os, );
352 QList<QByteArray>::ConstIterator si = sequenceList.constBegin();
353 QList<MultipleSequenceAlignmentRow> msaRowList = msa->getMsaRows();
354 QList<MultipleSequenceAlignmentRow>::ConstIterator ri = msaRowList.constBegin();
355 for (; si != sequenceList.constEnd(); si++, ri++) {
356 const MultipleSequenceAlignmentRow &row = *ri;
357 QString rowName = uniqueRowNames[row->getRowId()].leftJustified(maxNameLen + 1);
358 for (int j = 0; j < CHARS_IN_ROW && i + j < maLen; j += CHARS_IN_WORD) {
359 rowName += ' ';
360 int nChars = qMin(CHARS_IN_WORD, maLen - (i + j));
361 QString sequencePart = si->mid(j, nChars);
362 rowName += sequencePart;
363 }
364 rowName += '\n';
365 writer.write(os, rowName);
366 CHECK_OP(os, );
367 }
368 writer.write(os, "\n");
369 CHECK_OP(os, );
370 }
371 }
372
checkRawTextData(const QString & dataPrefix,const GUrl &) const373 FormatCheckResult MSFFormat::checkRawTextData(const QString &dataPrefix, const GUrl &) const {
374 if (dataPrefix.contains("MSF:") ||
375 dataPrefix.contains("!!AA_MULTIPLE_ALIGNMENT 1.0") ||
376 dataPrefix.contains("!!NA_MULTIPLE_ALIGNMENT 1.0") ||
377 (dataPrefix.contains("Name:") && dataPrefix.contains("Len:") &&
378 dataPrefix.contains("Check:") && dataPrefix.contains("Weight:"))) {
379 return FormatDetection_VeryHighSimilarity;
380 }
381
382 if (dataPrefix.contains("GDC ")) {
383 return FormatDetection_AverageSimilarity;
384 }
385
386 // MSF documents may contain unlimited number of comment lines in header ->
387 // it is impossible to determine if file has MSF format by some predefined
388 // amount of raw data read from it.
389 if (dataPrefix.contains("GCG ") || dataPrefix.contains("MSF ")) {
390 return FormatDetection_LowSimilarity;
391 }
392 return FormatDetection_NotMatched;
393 }
394
395 } // namespace U2
396