1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "SamReader.h"
23 #include <SamtoolsAdapter.h>
24
25 #include <U2Core/Log.h>
26 #include <U2Core/U2AssemblyUtils.h>
27
28 #include "BAMDbiPlugin.h"
29 #include "CigarValidator.h"
30 #include "InvalidFormatException.h"
31
32 namespace U2 {
33 namespace BAM {
34
35 const int SamReader::LOCAL_READ_BUFFER_SIZE = 100000;
36
parseAlignmentString(QByteArray line)37 Alignment SamReader::parseAlignmentString(QByteArray line) {
38 Alignment alignment;
39
40 if (line.isEmpty()) {
41 assert(0);
42 throw InvalidFormatException(BAMDbiPlugin::tr("Unexpected empty string"));
43 }
44 QByteArray recordTag;
45 QHash<QByteArray, QByteArray> fields;
46 QList<QByteArray> tokens = line.split('\t');
47
48 if (tokens.length() < 11) {
49 throw InvalidFormatException(BAMDbiPlugin::tr("Not enough fields in one of alignments"));
50 }
51
52 {
53 QByteArray &qname = tokens[0];
54 // workaround for malformed SAMs
55 if (!QRegExp("[ -~]{1,255}").exactMatch(qname)) {
56 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid query template name: %1").arg(QString(qname)));
57 }
58 alignment.setName(qname);
59 }
60 {
61 bool ok = false;
62 int flagsValue = tokens[1].toInt(&ok);
63 if (!ok) {
64 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid FLAG value: %1").arg(QString(tokens[1])));
65 }
66 qint64 flags = 0;
67 if (flagsValue & 0x1) {
68 flags |= Fragmented;
69 }
70 if (flagsValue & 0x2) {
71 flags |= FragmentsAligned;
72 }
73 if (flagsValue & 0x4) {
74 flags |= Unmapped;
75 }
76 if (flagsValue & 0x8) {
77 flags |= NextUnmapped;
78 }
79 if (flagsValue & 0x10) {
80 flags |= Reverse;
81 }
82 if (flagsValue & 0x20) {
83 flags |= NextReverse;
84 }
85 if (flagsValue & 0x40) {
86 flags |= FirstInTemplate;
87 }
88 if (flagsValue & 0x80) {
89 flags |= LastInTemplate;
90 }
91 if (flagsValue & 0x100) {
92 flags |= SecondaryAlignment;
93 }
94 if (flagsValue & 0x200) {
95 flags |= FailsChecks;
96 }
97 if (flagsValue & 0x400) {
98 flags |= Duplicate;
99 }
100 alignment.setFlags(flags);
101 }
102 {
103 QByteArray &rname = tokens[2];
104 // workaround for malformed SAMs
105 if (!QRegExp("[*]|[!-()+-<>-~][ -~]*").exactMatch(rname)) {
106 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid reference name: %1").arg(QString(rname)));
107 }
108 if ("*" == rname) {
109 alignment.setReferenceId(-1);
110 } else {
111 if (!referencesMap.contains(rname)) {
112 throw InvalidFormatException(BAMDbiPlugin::tr("Undefined reference name: %1").arg(QString(rname)));
113 }
114 alignment.setReferenceId(referencesMap[rname]);
115 }
116 }
117 {
118 bool ok = false;
119 int position = tokens[3].toInt(&ok);
120 if (!ok) {
121 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid read position value: %1").arg(QString(tokens[3])));
122 }
123 if (position < 0) {
124 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid read position: %1").arg(position));
125 }
126 alignment.setPosition(position - 1); // to 0-based mapping
127 }
128 {
129 bool ok = false;
130 int mapQuality = tokens[4].toInt(&ok);
131 if (!ok) {
132 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid mapping quality value: %1").arg(QString(tokens[4])));
133 }
134 if (mapQuality < 0 || mapQuality > 255) {
135 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid mapping quality: %1").arg(mapQuality));
136 }
137 alignment.setMapQuality(mapQuality);
138 }
139 {
140 QByteArray &cigarString = tokens[5];
141 if (!QRegExp("[*]|([0-9]+[MIDNSHPX=])+").exactMatch(cigarString)) {
142 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid cigar value: %1").arg(QString(cigarString)));
143 }
144 if ("*" != cigarString) {
145 QString err;
146 QList<U2CigarToken> res = U2AssemblyUtils::parseCigar(cigarString, err);
147 if (!err.isEmpty()) {
148 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid cigar value: %1").arg(QString(cigarString)));
149 }
150
151 QList<Alignment::CigarOperation> cigar;
152 for (int i = 0; i < res.length(); i++) {
153 Alignment::CigarOperation::Operation operation;
154 switch (res[i].op) {
155 case U2CigarOp_M:
156 operation = Alignment::CigarOperation::AlignmentMatch;
157 break;
158 case U2CigarOp_I:
159 operation = Alignment::CigarOperation::Insertion;
160 break;
161 case U2CigarOp_D:
162 operation = Alignment::CigarOperation::Deletion;
163 break;
164 case U2CigarOp_N:
165 operation = Alignment::CigarOperation::Skipped;
166 break;
167 case U2CigarOp_S:
168 operation = Alignment::CigarOperation::SoftClip;
169 break;
170 case U2CigarOp_H:
171 operation = Alignment::CigarOperation::HardClip;
172 break;
173 case U2CigarOp_P:
174 operation = Alignment::CigarOperation::Padding;
175 break;
176 case U2CigarOp_EQ:
177 operation = Alignment::CigarOperation::SequenceMatch;
178 break;
179 case U2CigarOp_X:
180 operation = Alignment::CigarOperation::SequenceMismatch;
181 break;
182 default:
183 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid cigar operation code: %1").arg(res[i].op));
184 }
185 int operatonLength = res[i].count;
186 cigar.append(Alignment::CigarOperation(operatonLength, operation));
187 }
188 alignment.setCigar(cigar);
189 }
190 }
191 {
192 QByteArray nextReference = tokens[6];
193 // workaround for malformed SAMs
194 if (!QRegExp("[*]|[=]|[!-()+-<>-~][ -~]*").exactMatch(nextReference)) {
195 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid mate reference name: %1").arg(QString(nextReference)));
196 }
197 if ("*" == nextReference) {
198 alignment.setNextReferenceId(-1);
199 } else if ("=" == nextReference) {
200 alignment.setNextReferenceId(alignment.getReferenceId());
201 } else {
202 if (!referencesMap.contains(nextReference)) {
203 throw InvalidFormatException(BAMDbiPlugin::tr("Undefined mate reference name: %1").arg(QString(nextReference)));
204 }
205 alignment.setNextReferenceId(referencesMap[nextReference]);
206 }
207 alignment.setNextReferenceName(nextReference);
208 }
209 {
210 bool ok = false;
211 int nextPosition = tokens[7].toInt(&ok);
212 if (!ok) {
213 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid mate position value: %1").arg(QString(tokens[7])));
214 }
215 if (nextPosition < 0) {
216 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid mate position: %1").arg(nextPosition));
217 }
218 alignment.setNextPosition(nextPosition - 1); // to 0-based mapping
219 }
220 {
221 bool ok = false;
222 int templateLength = tokens[8].toInt(&ok);
223 if (!ok) {
224 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid template length value: %1").arg(QString(tokens[8])));
225 }
226 if (!(alignment.getFlags() & Fragmented)) {
227 if (0 != templateLength) {
228 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid template length of a single-fragment template: %1").arg(templateLength));
229 }
230 }
231 alignment.setTemplateLength(templateLength);
232 }
233 {
234 QByteArray sequence = tokens[9];
235 if (!QRegExp("[*]|[A-Za-z=.]+").exactMatch(sequence)) {
236 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid sequence: %1").arg(QString(sequence)));
237 }
238 alignment.setSequence(sequence);
239 }
240 {
241 QByteArray quality = tokens[10];
242 if ("*" != quality) {
243 if (QRegExp("[!-~]+").exactMatch(quality)) {
244 alignment.setQuality(quality);
245 }
246 }
247 }
248 {
249 QByteArray samAuxString;
250 bool first = true;
251 QMap<QByteArray, QVariant> optionalFields;
252 for (int i = 11; i < tokens.length(); i++) {
253 if (!first) {
254 samAuxString += '\t';
255 }
256 samAuxString += tokens[i];
257 first = false;
258 }
259 alignment.setAuxData(SamtoolsAdapter::samString2aux(samAuxString));
260 }
261 {
262 // Validation of the CIGAR string.
263 int totalLength = 0;
264 int length = alignment.getSequence().length();
265 const QList<Alignment::CigarOperation> &cigar = alignment.getCigar();
266 CigarValidator validator(cigar);
267 validator.validate(&totalLength);
268 if (!cigar.isEmpty() && length != totalLength) {
269 const_cast<QList<Alignment::CigarOperation> &>(cigar).clear(); // Ignore invalid cigar
270 }
271 }
272 return alignment;
273 }
274
SamReader(IOAdapter & ioAdapter)275 SamReader::SamReader(IOAdapter &ioAdapter)
276 : Reader(ioAdapter),
277 readBuffer(LOCAL_READ_BUFFER_SIZE, '\0') {
278 readHeader();
279 }
280
getHeader() const281 const Header &SamReader::getHeader() const {
282 return header;
283 }
284
readAlignment(bool & eof)285 Alignment SamReader::readAlignment(bool &eof) {
286 QByteArray alignmentString = readString(eof);
287
288 return parseAlignmentString(alignmentString);
289 }
290
isEof() const291 bool SamReader::isEof() const {
292 return ioAdapter.isEof();
293 }
294
readString(bool & eof)295 QByteArray SamReader::readString(bool &eof) {
296 char *buff = readBuffer.data();
297 bool lineOk = false;
298 int len = 0;
299 QByteArray result;
300 while ((len = ioAdapter.readLine(buff, LOCAL_READ_BUFFER_SIZE, &lineOk)) == 0) {
301 }
302 if (len == -1) {
303 eof = true;
304 } else {
305 result = QByteArray(buff, len);
306 }
307
308 return result;
309 }
310
readHeader()311 void SamReader::readHeader() {
312 char *buff = readBuffer.data();
313 bool lineOk = false;
314 int len = 0;
315 qint64 bRead = ioAdapter.bytesRead();
316
317 QList<Header::Reference> references;
318 {
319 QList<Header::ReadGroup> readGroups;
320 QList<Header::Program> programs;
321 QList<QByteArray> previousProgramIds;
322 while ((len = ioAdapter.readLine(buff, LOCAL_READ_BUFFER_SIZE, &lineOk)) >= 0) {
323 if (isEof()) {
324 break;
325 }
326
327 QByteArray line = QByteArray(buff, len);
328 if (line.isEmpty()) {
329 continue;
330 }
331 if (line.startsWith("@CO")) {
332 continue;
333 }
334 if (!line.startsWith('@')) {
335 ioAdapter.skip(bRead - ioAdapter.bytesRead());
336 break;
337 }
338 bRead = ioAdapter.bytesRead();
339 QByteArray recordTag;
340 QHash<QByteArray, QByteArray> fields;
341 {
342 QList<QByteArray> tokens = line.split('\t');
343 recordTag = tokens[0].mid(1);
344 if (!QRegExp("[A-Za-z][A-Za-z]").exactMatch(recordTag)) {
345 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid header record tag: %1").arg(QString(recordTag)));
346 }
347 for (int index = 1; index < tokens.size(); index++) {
348 QByteArray fieldTag;
349 QByteArray fieldValue;
350 {
351 int colonIndex = tokens[index].indexOf(':');
352 if (-1 != colonIndex) {
353 fieldTag = tokens[index].mid(0, colonIndex);
354 fieldValue = tokens[index].mid(colonIndex + 1);
355 } else if ("PG" == recordTag) { // workaround for invalid headers produced by some programs
356 continue;
357 } else {
358 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid header field: %1").arg(QString(tokens[index])));
359 }
360 }
361 if (!QRegExp("[A-Za-z][A-Za-z]").exactMatch(fieldTag) && "M5" != fieldTag) { // workaround for bug in the spec
362 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid header field tag: %1").arg(QString(fieldTag)));
363 }
364 // CL and PN tags of can contain any string
365 if (fieldTag != "CL" && fieldTag != "PN" && !QRegExp("[ -~]+").exactMatch(fieldValue)) {
366 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid %1-%2 value: %3").arg(QString(recordTag)).arg(QString(fieldTag)).arg(QString(fieldValue)));
367 }
368 if (!fields.contains(fieldTag)) {
369 fields.insert(fieldTag, fieldValue);
370 } else {
371 throw InvalidFormatException(BAMDbiPlugin::tr("Duplicate header field: %1").arg(QString(fieldTag)));
372 }
373 }
374 }
375 if ("HD" == recordTag) {
376 if (fields.contains("VN")) {
377 QByteArray value = fields["VN"];
378 if (!QRegExp("[0-9]+\\.[0-9]+").exactMatch(value)) {
379 // Do nothing to support malformed BAMs
380 // throw InvalidFormatException(BAMDbiPlugin::tr("Invalid HD-VN value: %1").arg(QString(value)));
381 }
382 header.setFormatVersion(Version::parseVersion(value));
383 } else {
384 throw InvalidFormatException(BAMDbiPlugin::tr("HD record without VN field"));
385 }
386 if (fields.contains("SO")) {
387 QByteArray value = fields["SO"];
388 if ("unknown" == value) {
389 header.setSortingOrder(Header::Unknown);
390 } else if ("unsorted" == value) {
391 header.setSortingOrder(Header::Unsorted);
392 } else if ("queryname" == value) {
393 header.setSortingOrder(Header::QueryName);
394 } else if ("coordinate" == value) {
395 header.setSortingOrder(Header::Coordinate);
396 } else if ("sorted" == value) { // workaround for invalid headers produced by some programs
397 header.setSortingOrder(Header::Coordinate);
398 } else {
399 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid HD-SO value: %1").arg(QString(value)));
400 }
401 } else {
402 header.setSortingOrder(Header::Unknown);
403 }
404 } else if ("SQ" == recordTag) {
405 Header::Reference *reference = nullptr;
406 QByteArray name;
407 if (fields.contains("SN")) {
408 name = fields["SN"];
409 } else {
410 throw InvalidFormatException(BAMDbiPlugin::tr("SQ record without SN field"));
411 }
412 if (fields.contains("LN")) {
413 QByteArray value = fields["LN"];
414 bool ok = false;
415 int length = value.toInt(&ok);
416 if (ok) {
417 Header::Reference newRef(name, length);
418 referencesMap.insert(name, references.size());
419 references.append(newRef);
420 reference = &(references.last());
421 } else {
422 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid SQ-LN value: %1").arg(QString(value)));
423 }
424 } else {
425 throw InvalidFormatException(BAMDbiPlugin::tr("SQ record without LN field"));
426 }
427 assert(nullptr != reference);
428 if (fields.contains("AS")) {
429 reference->setAssemblyId(fields["AS"]);
430 }
431 if (fields.contains("M5")) {
432 QByteArray value = fields["M5"];
433 //[a-f] is a workaround (not matching to SAM-1.3 spec) to open 1000 Genomes project BAMs
434 if (!QRegExp("[0-9A-Fa-f]+").exactMatch(value)) {
435 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid SQ-M5 value: %1").arg(QString(value)));
436 }
437 reference->setMd5(fields["M5"]);
438 }
439 if (fields.contains("SP")) {
440 reference->setSpecies(fields["SP"]);
441 }
442 if (fields.contains("UR")) {
443 reference->setUri(fields["UR"]);
444 }
445 } else if ("RG" == recordTag) {
446 Header::ReadGroup readGroup;
447 if (fields.contains("ID")) {
448 QByteArray value = fields["SN"];
449 readGroupsMap.insert(value, readGroups.size());
450 } else {
451 fields.insert("ID", "-1");
452 }
453 if (fields.contains("CN")) {
454 readGroup.setSequencingCenter(fields["CN"]);
455 }
456 if (fields.contains("DS")) {
457 readGroup.setDescription(fields["DS"]);
458 }
459 if (fields.contains("DT")) {
460 QByteArray value = fields["DT"];
461 QDateTime dateTime = QDateTime::fromString(value, Qt::ISODate);
462 if (dateTime.isValid()) {
463 readGroup.setDate(dateTime);
464 } else {
465 QDate date = QDate::fromString(value, Qt::ISODate);
466 if (date.isValid()) {
467 readGroup.setDate(date);
468 } else {
469 // Allow anything.
470 // throw InvalidFormatException(BAMDbiPlugin::tr("Invalid RG-DT field value: %1").arg(QString(value)));
471 }
472 }
473 }
474 if (fields.contains("LB")) {
475 readGroup.setLibrary(fields["LB"]);
476 }
477 if (fields.contains("PG")) {
478 readGroup.setPrograms(fields["PG"]);
479 }
480 if (fields.contains("PI")) {
481 QByteArray value = fields["PI"];
482 bool ok = false;
483 int predictedInsertSize = value.toInt(&ok);
484 if (ok) {
485 readGroup.setPredictedInsertSize(predictedInsertSize);
486 } else {
487 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid RG-PI field value: %1").arg(QString(value)));
488 }
489 }
490 if (fields.contains("PL")) {
491 readGroup.setPlatform(fields["PL"]);
492 }
493 if (fields.contains("PU")) {
494 readGroup.setPlatformUnit(fields["PU"]);
495 }
496 if (fields.contains("SM")) {
497 readGroup.setSample(fields["SM"]);
498 }
499 readGroups.append(readGroup);
500 } else if ("PG" == recordTag) {
501 Header::Program program;
502 if (!fields.contains("ID")) {
503 fields.insert("ID", QByteArray::number(programs.size()));
504 }
505 programsMap.insert(fields["ID"], programs.size());
506 if (fields.contains("PN")) {
507 program.setName(fields["PN"]);
508 }
509 if (fields.contains("CL")) {
510 program.setCommandLine(fields["CL"]);
511 }
512 if (fields.contains("PP")) {
513 previousProgramIds.append(fields["PP"]);
514 } else {
515 previousProgramIds.append(QByteArray());
516 }
517 if (fields.contains("VN")) {
518 program.setVersion(fields["VN"]);
519 }
520 programs.append(program);
521 }
522 }
523 for (int index = 0; index < programs.size(); index++) {
524 const QByteArray &previousProgramId = previousProgramIds[index];
525 if (!previousProgramId.isEmpty()) {
526 if (programsMap.contains(previousProgramId)) {
527 programs[index].setPreviousId(programsMap[previousProgramId]);
528 } else {
529 throw InvalidFormatException(BAMDbiPlugin::tr("Invalid PG-PP field value: %1").arg(QString(previousProgramId)));
530 }
531 }
532 }
533 header.setReferences(references);
534 header.setReadGroups(readGroups);
535 header.setPrograms(programs);
536 }
537 }
538
539 } // namespace BAM
540 } // namespace U2
541