1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "SamReader.h"
23 #include <SamtoolsAdapter.h>
24 
25 #include <U2Core/Log.h>
26 #include <U2Core/U2AssemblyUtils.h>
27 
28 #include "BAMDbiPlugin.h"
29 #include "CigarValidator.h"
30 #include "InvalidFormatException.h"
31 
32 namespace U2 {
33 namespace BAM {
34 
35 const int SamReader::LOCAL_READ_BUFFER_SIZE = 100000;
36 
parseAlignmentString(QByteArray line)37 Alignment SamReader::parseAlignmentString(QByteArray line) {
38     Alignment alignment;
39 
40     if (line.isEmpty()) {
41         assert(0);
42         throw InvalidFormatException(BAMDbiPlugin::tr("Unexpected empty string"));
43     }
44     QByteArray recordTag;
45     QHash<QByteArray, QByteArray> fields;
46     QList<QByteArray> tokens = line.split('\t');
47 
48     if (tokens.length() < 11) {
49         throw InvalidFormatException(BAMDbiPlugin::tr("Not enough fields in one of alignments"));
50     }
51 
52     {
53         QByteArray &qname = tokens[0];
54         // workaround for malformed SAMs
55         if (!QRegExp("[ -~]{1,255}").exactMatch(qname)) {
56             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid query template name: %1").arg(QString(qname)));
57         }
58         alignment.setName(qname);
59     }
60     {
61         bool ok = false;
62         int flagsValue = tokens[1].toInt(&ok);
63         if (!ok) {
64             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid FLAG value: %1").arg(QString(tokens[1])));
65         }
66         qint64 flags = 0;
67         if (flagsValue & 0x1) {
68             flags |= Fragmented;
69         }
70         if (flagsValue & 0x2) {
71             flags |= FragmentsAligned;
72         }
73         if (flagsValue & 0x4) {
74             flags |= Unmapped;
75         }
76         if (flagsValue & 0x8) {
77             flags |= NextUnmapped;
78         }
79         if (flagsValue & 0x10) {
80             flags |= Reverse;
81         }
82         if (flagsValue & 0x20) {
83             flags |= NextReverse;
84         }
85         if (flagsValue & 0x40) {
86             flags |= FirstInTemplate;
87         }
88         if (flagsValue & 0x80) {
89             flags |= LastInTemplate;
90         }
91         if (flagsValue & 0x100) {
92             flags |= SecondaryAlignment;
93         }
94         if (flagsValue & 0x200) {
95             flags |= FailsChecks;
96         }
97         if (flagsValue & 0x400) {
98             flags |= Duplicate;
99         }
100         alignment.setFlags(flags);
101     }
102     {
103         QByteArray &rname = tokens[2];
104         // workaround for malformed SAMs
105         if (!QRegExp("[*]|[!-()+-<>-~][ -~]*").exactMatch(rname)) {
106             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid reference name: %1").arg(QString(rname)));
107         }
108         if ("*" == rname) {
109             alignment.setReferenceId(-1);
110         } else {
111             if (!referencesMap.contains(rname)) {
112                 throw InvalidFormatException(BAMDbiPlugin::tr("Undefined reference name: %1").arg(QString(rname)));
113             }
114             alignment.setReferenceId(referencesMap[rname]);
115         }
116     }
117     {
118         bool ok = false;
119         int position = tokens[3].toInt(&ok);
120         if (!ok) {
121             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid read position value: %1").arg(QString(tokens[3])));
122         }
123         if (position < 0) {
124             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid read position: %1").arg(position));
125         }
126         alignment.setPosition(position - 1);  // to 0-based mapping
127     }
128     {
129         bool ok = false;
130         int mapQuality = tokens[4].toInt(&ok);
131         if (!ok) {
132             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid mapping quality value: %1").arg(QString(tokens[4])));
133         }
134         if (mapQuality < 0 || mapQuality > 255) {
135             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid mapping quality: %1").arg(mapQuality));
136         }
137         alignment.setMapQuality(mapQuality);
138     }
139     {
140         QByteArray &cigarString = tokens[5];
141         if (!QRegExp("[*]|([0-9]+[MIDNSHPX=])+").exactMatch(cigarString)) {
142             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid cigar value: %1").arg(QString(cigarString)));
143         }
144         if ("*" != cigarString) {
145             QString err;
146             QList<U2CigarToken> res = U2AssemblyUtils::parseCigar(cigarString, err);
147             if (!err.isEmpty()) {
148                 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid cigar value: %1").arg(QString(cigarString)));
149             }
150 
151             QList<Alignment::CigarOperation> cigar;
152             for (int i = 0; i < res.length(); i++) {
153                 Alignment::CigarOperation::Operation operation;
154                 switch (res[i].op) {
155                     case U2CigarOp_M:
156                         operation = Alignment::CigarOperation::AlignmentMatch;
157                         break;
158                     case U2CigarOp_I:
159                         operation = Alignment::CigarOperation::Insertion;
160                         break;
161                     case U2CigarOp_D:
162                         operation = Alignment::CigarOperation::Deletion;
163                         break;
164                     case U2CigarOp_N:
165                         operation = Alignment::CigarOperation::Skipped;
166                         break;
167                     case U2CigarOp_S:
168                         operation = Alignment::CigarOperation::SoftClip;
169                         break;
170                     case U2CigarOp_H:
171                         operation = Alignment::CigarOperation::HardClip;
172                         break;
173                     case U2CigarOp_P:
174                         operation = Alignment::CigarOperation::Padding;
175                         break;
176                     case U2CigarOp_EQ:
177                         operation = Alignment::CigarOperation::SequenceMatch;
178                         break;
179                     case U2CigarOp_X:
180                         operation = Alignment::CigarOperation::SequenceMismatch;
181                         break;
182                     default:
183                         throw InvalidFormatException(BAMDbiPlugin::tr("Invalid cigar operation code: %1").arg(res[i].op));
184                 }
185                 int operatonLength = res[i].count;
186                 cigar.append(Alignment::CigarOperation(operatonLength, operation));
187             }
188             alignment.setCigar(cigar);
189         }
190     }
191     {
192         QByteArray nextReference = tokens[6];
193         // workaround for malformed SAMs
194         if (!QRegExp("[*]|[=]|[!-()+-<>-~][ -~]*").exactMatch(nextReference)) {
195             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid mate reference name: %1").arg(QString(nextReference)));
196         }
197         if ("*" == nextReference) {
198             alignment.setNextReferenceId(-1);
199         } else if ("=" == nextReference) {
200             alignment.setNextReferenceId(alignment.getReferenceId());
201         } else {
202             if (!referencesMap.contains(nextReference)) {
203                 throw InvalidFormatException(BAMDbiPlugin::tr("Undefined mate reference name: %1").arg(QString(nextReference)));
204             }
205             alignment.setNextReferenceId(referencesMap[nextReference]);
206         }
207         alignment.setNextReferenceName(nextReference);
208     }
209     {
210         bool ok = false;
211         int nextPosition = tokens[7].toInt(&ok);
212         if (!ok) {
213             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid mate position value: %1").arg(QString(tokens[7])));
214         }
215         if (nextPosition < 0) {
216             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid mate position: %1").arg(nextPosition));
217         }
218         alignment.setNextPosition(nextPosition - 1);  // to 0-based mapping
219     }
220     {
221         bool ok = false;
222         int templateLength = tokens[8].toInt(&ok);
223         if (!ok) {
224             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid template length value: %1").arg(QString(tokens[8])));
225         }
226         if (!(alignment.getFlags() & Fragmented)) {
227             if (0 != templateLength) {
228                 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid template length of a single-fragment template: %1").arg(templateLength));
229             }
230         }
231         alignment.setTemplateLength(templateLength);
232     }
233     {
234         QByteArray sequence = tokens[9];
235         if (!QRegExp("[*]|[A-Za-z=.]+").exactMatch(sequence)) {
236             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid sequence: %1").arg(QString(sequence)));
237         }
238         alignment.setSequence(sequence);
239     }
240     {
241         QByteArray quality = tokens[10];
242         if ("*" != quality) {
243             if (QRegExp("[!-~]+").exactMatch(quality)) {
244                 alignment.setQuality(quality);
245             }
246         }
247     }
248     {
249         QByteArray samAuxString;
250         bool first = true;
251         QMap<QByteArray, QVariant> optionalFields;
252         for (int i = 11; i < tokens.length(); i++) {
253             if (!first) {
254                 samAuxString += '\t';
255             }
256             samAuxString += tokens[i];
257             first = false;
258         }
259         alignment.setAuxData(SamtoolsAdapter::samString2aux(samAuxString));
260     }
261     {
262         // Validation of the CIGAR string.
263         int totalLength = 0;
264         int length = alignment.getSequence().length();
265         const QList<Alignment::CigarOperation> &cigar = alignment.getCigar();
266         CigarValidator validator(cigar);
267         validator.validate(&totalLength);
268         if (!cigar.isEmpty() && length != totalLength) {
269             const_cast<QList<Alignment::CigarOperation> &>(cigar).clear();  // Ignore invalid cigar
270         }
271     }
272     return alignment;
273 }
274 
SamReader(IOAdapter & ioAdapter)275 SamReader::SamReader(IOAdapter &ioAdapter)
276     : Reader(ioAdapter),
277       readBuffer(LOCAL_READ_BUFFER_SIZE, '\0') {
278     readHeader();
279 }
280 
getHeader() const281 const Header &SamReader::getHeader() const {
282     return header;
283 }
284 
readAlignment(bool & eof)285 Alignment SamReader::readAlignment(bool &eof) {
286     QByteArray alignmentString = readString(eof);
287 
288     return parseAlignmentString(alignmentString);
289 }
290 
isEof() const291 bool SamReader::isEof() const {
292     return ioAdapter.isEof();
293 }
294 
readString(bool & eof)295 QByteArray SamReader::readString(bool &eof) {
296     char *buff = readBuffer.data();
297     bool lineOk = false;
298     int len = 0;
299     QByteArray result;
300     while ((len = ioAdapter.readLine(buff, LOCAL_READ_BUFFER_SIZE, &lineOk)) == 0) {
301     }
302     if (len == -1) {
303         eof = true;
304     } else {
305         result = QByteArray(buff, len);
306     }
307 
308     return result;
309 }
310 
readHeader()311 void SamReader::readHeader() {
312     char *buff = readBuffer.data();
313     bool lineOk = false;
314     int len = 0;
315     qint64 bRead = ioAdapter.bytesRead();
316 
317     QList<Header::Reference> references;
318     {
319         QList<Header::ReadGroup> readGroups;
320         QList<Header::Program> programs;
321         QList<QByteArray> previousProgramIds;
322         while ((len = ioAdapter.readLine(buff, LOCAL_READ_BUFFER_SIZE, &lineOk)) >= 0) {
323             if (isEof()) {
324                 break;
325             }
326 
327             QByteArray line = QByteArray(buff, len);
328             if (line.isEmpty()) {
329                 continue;
330             }
331             if (line.startsWith("@CO")) {
332                 continue;
333             }
334             if (!line.startsWith('@')) {
335                 ioAdapter.skip(bRead - ioAdapter.bytesRead());
336                 break;
337             }
338             bRead = ioAdapter.bytesRead();
339             QByteArray recordTag;
340             QHash<QByteArray, QByteArray> fields;
341             {
342                 QList<QByteArray> tokens = line.split('\t');
343                 recordTag = tokens[0].mid(1);
344                 if (!QRegExp("[A-Za-z][A-Za-z]").exactMatch(recordTag)) {
345                     throw InvalidFormatException(BAMDbiPlugin::tr("Invalid header record tag: %1").arg(QString(recordTag)));
346                 }
347                 for (int index = 1; index < tokens.size(); index++) {
348                     QByteArray fieldTag;
349                     QByteArray fieldValue;
350                     {
351                         int colonIndex = tokens[index].indexOf(':');
352                         if (-1 != colonIndex) {
353                             fieldTag = tokens[index].mid(0, colonIndex);
354                             fieldValue = tokens[index].mid(colonIndex + 1);
355                         } else if ("PG" == recordTag) {  // workaround for invalid headers produced by some programs
356                             continue;
357                         } else {
358                             throw InvalidFormatException(BAMDbiPlugin::tr("Invalid header field: %1").arg(QString(tokens[index])));
359                         }
360                     }
361                     if (!QRegExp("[A-Za-z][A-Za-z]").exactMatch(fieldTag) && "M5" != fieldTag) {  // workaround for bug in the spec
362                         throw InvalidFormatException(BAMDbiPlugin::tr("Invalid header field tag: %1").arg(QString(fieldTag)));
363                     }
364                     // CL and PN tags of can contain any string
365                     if (fieldTag != "CL" && fieldTag != "PN" && !QRegExp("[ -~]+").exactMatch(fieldValue)) {
366                         throw InvalidFormatException(BAMDbiPlugin::tr("Invalid %1-%2 value: %3").arg(QString(recordTag)).arg(QString(fieldTag)).arg(QString(fieldValue)));
367                     }
368                     if (!fields.contains(fieldTag)) {
369                         fields.insert(fieldTag, fieldValue);
370                     } else {
371                         throw InvalidFormatException(BAMDbiPlugin::tr("Duplicate header field: %1").arg(QString(fieldTag)));
372                     }
373                 }
374             }
375             if ("HD" == recordTag) {
376                 if (fields.contains("VN")) {
377                     QByteArray value = fields["VN"];
378                     if (!QRegExp("[0-9]+\\.[0-9]+").exactMatch(value)) {
379                         // Do nothing to support malformed BAMs
380                         // throw InvalidFormatException(BAMDbiPlugin::tr("Invalid HD-VN value: %1").arg(QString(value)));
381                     }
382                     header.setFormatVersion(Version::parseVersion(value));
383                 } else {
384                     throw InvalidFormatException(BAMDbiPlugin::tr("HD record without VN field"));
385                 }
386                 if (fields.contains("SO")) {
387                     QByteArray value = fields["SO"];
388                     if ("unknown" == value) {
389                         header.setSortingOrder(Header::Unknown);
390                     } else if ("unsorted" == value) {
391                         header.setSortingOrder(Header::Unsorted);
392                     } else if ("queryname" == value) {
393                         header.setSortingOrder(Header::QueryName);
394                     } else if ("coordinate" == value) {
395                         header.setSortingOrder(Header::Coordinate);
396                     } else if ("sorted" == value) {  // workaround for invalid headers produced by some programs
397                         header.setSortingOrder(Header::Coordinate);
398                     } else {
399                         throw InvalidFormatException(BAMDbiPlugin::tr("Invalid HD-SO value: %1").arg(QString(value)));
400                     }
401                 } else {
402                     header.setSortingOrder(Header::Unknown);
403                 }
404             } else if ("SQ" == recordTag) {
405                 Header::Reference *reference = nullptr;
406                 QByteArray name;
407                 if (fields.contains("SN")) {
408                     name = fields["SN"];
409                 } else {
410                     throw InvalidFormatException(BAMDbiPlugin::tr("SQ record without SN field"));
411                 }
412                 if (fields.contains("LN")) {
413                     QByteArray value = fields["LN"];
414                     bool ok = false;
415                     int length = value.toInt(&ok);
416                     if (ok) {
417                         Header::Reference newRef(name, length);
418                         referencesMap.insert(name, references.size());
419                         references.append(newRef);
420                         reference = &(references.last());
421                     } else {
422                         throw InvalidFormatException(BAMDbiPlugin::tr("Invalid SQ-LN value: %1").arg(QString(value)));
423                     }
424                 } else {
425                     throw InvalidFormatException(BAMDbiPlugin::tr("SQ record without LN field"));
426                 }
427                 assert(nullptr != reference);
428                 if (fields.contains("AS")) {
429                     reference->setAssemblyId(fields["AS"]);
430                 }
431                 if (fields.contains("M5")) {
432                     QByteArray value = fields["M5"];
433                     //[a-f] is a workaround (not matching to SAM-1.3 spec) to open 1000 Genomes project BAMs
434                     if (!QRegExp("[0-9A-Fa-f]+").exactMatch(value)) {
435                         throw InvalidFormatException(BAMDbiPlugin::tr("Invalid SQ-M5 value: %1").arg(QString(value)));
436                     }
437                     reference->setMd5(fields["M5"]);
438                 }
439                 if (fields.contains("SP")) {
440                     reference->setSpecies(fields["SP"]);
441                 }
442                 if (fields.contains("UR")) {
443                     reference->setUri(fields["UR"]);
444                 }
445             } else if ("RG" == recordTag) {
446                 Header::ReadGroup readGroup;
447                 if (fields.contains("ID")) {
448                     QByteArray value = fields["SN"];
449                     readGroupsMap.insert(value, readGroups.size());
450                 } else {
451                     fields.insert("ID", "-1");
452                 }
453                 if (fields.contains("CN")) {
454                     readGroup.setSequencingCenter(fields["CN"]);
455                 }
456                 if (fields.contains("DS")) {
457                     readGroup.setDescription(fields["DS"]);
458                 }
459                 if (fields.contains("DT")) {
460                     QByteArray value = fields["DT"];
461                     QDateTime dateTime = QDateTime::fromString(value, Qt::ISODate);
462                     if (dateTime.isValid()) {
463                         readGroup.setDate(dateTime);
464                     } else {
465                         QDate date = QDate::fromString(value, Qt::ISODate);
466                         if (date.isValid()) {
467                             readGroup.setDate(date);
468                         } else {
469                             // Allow anything.
470                             // throw InvalidFormatException(BAMDbiPlugin::tr("Invalid RG-DT field value: %1").arg(QString(value)));
471                         }
472                     }
473                 }
474                 if (fields.contains("LB")) {
475                     readGroup.setLibrary(fields["LB"]);
476                 }
477                 if (fields.contains("PG")) {
478                     readGroup.setPrograms(fields["PG"]);
479                 }
480                 if (fields.contains("PI")) {
481                     QByteArray value = fields["PI"];
482                     bool ok = false;
483                     int predictedInsertSize = value.toInt(&ok);
484                     if (ok) {
485                         readGroup.setPredictedInsertSize(predictedInsertSize);
486                     } else {
487                         throw InvalidFormatException(BAMDbiPlugin::tr("Invalid RG-PI field value: %1").arg(QString(value)));
488                     }
489                 }
490                 if (fields.contains("PL")) {
491                     readGroup.setPlatform(fields["PL"]);
492                 }
493                 if (fields.contains("PU")) {
494                     readGroup.setPlatformUnit(fields["PU"]);
495                 }
496                 if (fields.contains("SM")) {
497                     readGroup.setSample(fields["SM"]);
498                 }
499                 readGroups.append(readGroup);
500             } else if ("PG" == recordTag) {
501                 Header::Program program;
502                 if (!fields.contains("ID")) {
503                     fields.insert("ID", QByteArray::number(programs.size()));
504                 }
505                 programsMap.insert(fields["ID"], programs.size());
506                 if (fields.contains("PN")) {
507                     program.setName(fields["PN"]);
508                 }
509                 if (fields.contains("CL")) {
510                     program.setCommandLine(fields["CL"]);
511                 }
512                 if (fields.contains("PP")) {
513                     previousProgramIds.append(fields["PP"]);
514                 } else {
515                     previousProgramIds.append(QByteArray());
516                 }
517                 if (fields.contains("VN")) {
518                     program.setVersion(fields["VN"]);
519                 }
520                 programs.append(program);
521             }
522         }
523         for (int index = 0; index < programs.size(); index++) {
524             const QByteArray &previousProgramId = previousProgramIds[index];
525             if (!previousProgramId.isEmpty()) {
526                 if (programsMap.contains(previousProgramId)) {
527                     programs[index].setPreviousId(programsMap[previousProgramId]);
528                 } else {
529                     throw InvalidFormatException(BAMDbiPlugin::tr("Invalid PG-PP field value: %1").arg(QString(previousProgramId)));
530                 }
531             }
532         }
533         header.setReferences(references);
534         header.setReadGroups(readGroups);
535         header.setPrograms(programs);
536     }
537 }
538 
539 }  // namespace BAM
540 }  // namespace U2
541