1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 */
25
26 #include <iostream>
27 #include <fstream>
28 #include "bam.hpp"
29
30 #define MAX_INDEX_SEQ_LEN ((1u << 29) - 1)
31 #define MAX_BIN (37449u)
32 #define NUMINTV ((MAX_INDEX_SEQ_LEN + 1) >> 14)
33
34 class RefIndex {
35 public:
36 BAMFilePosType off_beg, off_end;
37 uint64_t n_mapped, n_unmapped;
38 BAMFilePosTypeList interval;
39 std::vector<BAMFilePosTypeList> bins;
40
41 private:
CopyWhereLess(BAMFilePosTypeList & dst,BAMFilePosTypeList const & src,BAMFilePosType const maxpos)42 static void CopyWhereLess(BAMFilePosTypeList &dst,
43 BAMFilePosTypeList const &src,
44 BAMFilePosType const maxpos)
45 {
46 if (maxpos.hasValue()) {
47 for (unsigned i = 0; i < src.size(); ++i) {
48 BAMFilePosType const pos = src[i];
49
50 if (pos < maxpos)
51 dst.push_back(pos);
52 }
53 }
54 else {
55 for (unsigned i = 0; i < src.size(); ++i)
56 dst.push_back(src[i]);
57 }
58 }
59 public:
LoadIndexIntervals(char const * data,char const * const endp)60 char const *LoadIndexIntervals(char const *data, char const *const endp)
61 {
62 if (data + 4 > endp)
63 throw std::runtime_error("insufficient data to load index interval count");
64 int32_t const n = LE2Host<int32_t>(data); data += 4;
65
66 char const *const next = data + 8 * n;
67 if (next > endp)
68 throw std::runtime_error("insufficient data to load index intervals");
69
70 BAMFilePosType last(0);
71
72 interval.resize(n);
73 for (unsigned i = 0; i < n; ++i) {
74 BAMFilePosType const intvl = LE2Host<BAMFilePosType>(data); data += 8;
75
76 interval[i] = (intvl == last) ? BAMFilePosType(0) : intvl;
77 last = intvl;
78 }
79 while (interval.size() > 0 && !interval.back().hasValue())
80 interval.pop_back();
81
82 return next;
83 }
LoadIndexBins(char const * data,char const * const endp)84 char const *LoadIndexBins(char const *data, char const *const endp)
85 {
86 if (data + 4 > endp)
87 throw std::runtime_error("insufficient data to load index bin count");
88 int32_t const n_bin = LE2Host<int32_t>(data); data += 4;
89
90 bins.resize(MAX_BIN);
91 for (int i = 0; i < n_bin; ++i) {
92 if (data + 8 > endp)
93 throw std::runtime_error("insufficient data to load index bin size");
94
95 uint32_t const bin = LE2Host<uint32_t>(data + 0);
96 int32_t const n_chunk = LE2Host< int32_t>(data + 4);
97
98 data += 8;
99 if (data + n_chunk * 16 > endp)
100 throw std::runtime_error("insufficient data to load index bin chunks");
101
102 if (bin == MAX_BIN && n_chunk == 2) {
103 // special doodad
104 off_beg = LE2Host<BAMFilePosType>(data); data += 8;
105 off_end = LE2Host<BAMFilePosType>(data); data += 8;
106 n_mapped = LE2Host<uint64_t>(data); data += 8;
107 n_unmapped = LE2Host<uint64_t>(data); data += 8;
108 }
109 else if (bin < MAX_BIN) {
110 BAMFilePosTypeList &this_bin = bins[bin];
111
112 this_bin.resize(n_chunk);
113
114 for (unsigned k = 0; k < n_chunk; ++k) {
115 BAMFilePosType const beg = LE2Host<BAMFilePosType>(data); data += 8;
116 BAMFilePosType const end = LE2Host<BAMFilePosType>(data); data += 8;
117
118 (void)end;
119 this_bin[k] = beg;
120 }
121 }
122 else
123 data += 16 * n_chunk;
124 }
125 return data;
126 }
RefIndex()127 RefIndex()
128 {}
slice(unsigned const beg,unsigned const end) const129 BAMFilePosTypeList slice(unsigned const beg, unsigned const end) const
130 {
131 unsigned const first[] = { 1, 9, 73, 585, 4681 };
132 unsigned const cnt = end - 1 - beg;
133 unsigned const maxintvl = (end >> 14) + 1;
134 BAMFilePosType const maxpos = maxintvl < interval.size() ? interval[maxintvl] : off_end;
135 unsigned int_beg[5], int_cnt[5];
136 BAMFilePosTypeList rslt;
137
138 for (unsigned i = 0; i < 5; ++i) {
139 unsigned const shift = 14 + 3 * (4 - i);
140
141 int_beg[i] = (beg >> shift);
142 }
143 for (unsigned i = 0; i < 5; ++i) {
144 unsigned const shift = 14 + 3 * (4 - i);
145
146 int_cnt[i] = (cnt >> shift) + 1;
147 }
148 CopyWhereLess(rslt, bins[0], maxpos);
149 for (unsigned i = 0; i < 5; ++i) {
150 unsigned const beg = int_beg[i] + first[i];
151 unsigned const N = int_cnt[i];
152
153 for (unsigned j = 0; j < N; ++j) {
154 CopyWhereLess(rslt, bins[beg + j], maxpos);
155 }
156 }
157 std::sort(rslt.begin(), rslt.end());
158 return rslt;
159 }
160 };
161
LoadIndex(char const data[],char const * const endp)162 size_t HeaderRefInfo::LoadIndex(char const data[], char const *const endp)
163 {
164 RefIndex *i = new RefIndex();
165 try {
166 char const *const next1 = i->LoadIndexBins(data, endp);
167 char const *const next2 = i->LoadIndexIntervals(next1, endp);
168
169 index = i;
170
171 return next2 - data;
172 }
173 catch (...) {
174 delete i;
175 throw;
176 }
177 }
178
DropIndex()179 void HeaderRefInfo::DropIndex()
180 {
181 if (index) {
182 delete index;
183 index = 0;
184 }
185 }
186
slice(unsigned const beg,unsigned const end) const187 BAMFilePosTypeList HeaderRefInfo::slice(unsigned const beg, unsigned const end) const {
188 return index ? index->slice(beg, end) : BAMFilePosTypeList();
189 }
190
DumpSAM(std::ostream & oss,BAMRecord const & rec) const191 void BAMFile::DumpSAM(std::ostream &oss, BAMRecord const &rec) const
192 {
193 unsigned const seqlen = rec.l_seq();
194 std::string const RNAME = rec.isSelfMapped() ? getRefInfo(rec.refID()).getName() : "*";
195 std::string const RNEXT = rec.isMateMapped() ? getRefInfo(rec.next_refID()).getName() : "*";
196 int const POS = rec.isSelfMapped() ? (rec.pos() + 1) : 0;
197 int const PNEXT = rec.isMateMapped() ? (rec.next_pos() + 1) : 0;
198
199 oss << rec.readname()
200 << '\t' << rec.flag()
201 << '\t' << RNAME
202 << '\t' << POS
203 << '\t' << int(rec.mq())
204 << '\t';
205
206 if (rec.isSelfMapped()) {
207 unsigned const N = rec.nc();
208
209 for (unsigned i = 0; i < N; ++i) {
210 uint32_t const cv = rec.cigar(i);
211 int const op = cv & 0x0F;
212 int len = cv >> 4;
213 char buf[16];
214 char *cur = buf + sizeof(buf);
215
216 *--cur = '\0';
217 *--cur = "MIDNSHP=X???????"[op];
218 do {
219 int const digit = len % 10;
220
221 *--cur = digit + '0';
222 len /= 10;
223 } while (len > 0);
224 oss << cur;
225 }
226 }
227 else
228 oss << '*';
229
230 oss << '\t' << RNEXT
231 << '\t' << PNEXT
232 << '\t' << rec.tlen()
233 << '\t';
234
235 if (seqlen > 0) {
236 for (int i = 0; i < seqlen; ++i)
237 oss << rec.seq(i);
238 oss << '\t';
239
240 bool allFF = true;
241 uint8_t const *const q = rec.qual();
242 for (unsigned i = 0; i < seqlen; ++i) {
243 uint8_t const qv = q[i];
244
245 if (qv != 0xFF) {
246 allFF = false;
247 break;
248 }
249 }
250 if (allFF)
251 oss << '*';
252 else {
253 for (unsigned i = 0; i < seqlen; ++i) {
254 uint8_t const qv = q[i];
255 int const offset = ((int)qv) + '!';
256 int const out = offset < '~' ? offset : '~';
257
258 oss << char(out);
259 }
260 }
261 }
262 else
263 oss << "*\t*";
264
265 for (BAMRecord::OptionalField::const_iterator i = rec.begin(); i != rec.end(); ++i) {
266 std::string const tag = std::string(i->getTag(), 2);
267 int const elems = i->getElementCount();
268 char const type = i->getValueType();
269 char const *raw = i->getRawValue();
270
271 oss << '\t' << tag << ':' << (elems == 1 ? type : 'B') << ':';
272 if (type == 'Z' || type == 'H')
273 oss << raw;
274 else {
275 if (elems > 1)
276 oss << type;
277 for (int j = 0; j < elems; ++j) {
278 if (elems > 1)
279 oss << ',';
280 switch (type) {
281 case 'A':
282 oss << *raw;
283 ++raw;
284 break;
285 case 'C':
286 oss << unsigned(*((uint8_t const *)raw));
287 ++raw;
288 break;
289 case 'c':
290 oss << int(*((int8_t const *)raw));
291 ++raw;
292 break;
293 case 'S':
294 oss << unsigned(LE2Host<uint16_t>(raw));
295 raw += 2;
296 break;
297 case 's':
298 oss << int(LE2Host<int16_t>(raw));
299 raw += 2;
300 break;
301 case 'F':
302 oss << float(LE2Host<float>(raw));
303 raw += 4;
304 break;
305 case 'I':
306 oss << unsigned(LE2Host<uint32_t>(raw));
307 raw += 4;
308 break;
309 case 'i':
310 oss << int(LE2Host<int32_t>(raw));
311 raw += 4;
312 break;
313 }
314 }
315 }
316 }
317 }
318
FillBuffer(int const n)319 unsigned BAMFile::FillBuffer(int const n) {
320 size_t const nwant = n * IO_BLK_SIZE;
321 char *const dst = (char *)(iobuffer + 2 * IO_BLK_SIZE - nwant);
322 #if USE_STDIO
323 size_t const nread = feof(file) ? 0 : fread(dst, 1, nwant, file);
324 #else
325 size_t const nread = file.eof() ? 0 : file.read(dst, nwant).gcount();
326
327 if (nread == 0 && !file.eof())
328 throw std::runtime_error("read failed");
329 #endif
330 return (unsigned)nread;
331 }
332
ReadZlib(void)333 void BAMFile::ReadZlib(void) {
334 zs.next_out = bambuffer;
335 zs.avail_out = sizeof(bambuffer);
336 zs.total_out = 0;
337 bam_cur = 0;
338
339 if (zs.avail_in == 0) {
340 FILL_BUFFER:
341 #if USE_STDIO
342 cpos = ftell(file);
343 #else
344 cpos = file.tellg();
345 #endif
346 zs.avail_in = FillBuffer(2);
347 zs.next_in = iobuffer;
348 if (zs.avail_in == 0) /* EOF */
349 return;
350 }
351
352 bpos = cpos + (zs.next_in - iobuffer);
353 int const zrc = inflate(&zs, Z_FINISH);
354
355 if (zrc == Z_STREAM_END) {
356 /* inflateReset clobbers these value but we want them */
357 uLong const total_out = zs.total_out;
358 uLong const total_in = zs.total_in;
359
360 int const zrc = inflateReset(&zs);
361 if (zrc != Z_OK)
362 throw std::logic_error("inflateReset didn't return Z_OK");
363
364 zs.total_out = total_out;
365 zs.total_in = total_in;
366
367 #if 0
368 std::cerr << std::dec << "total_in: " << total_in << "; avail_in: " << zs.avail_in << "; IO_BLK_SIZE: " << IO_BLK_SIZE << std::endl;
369 #endif
370
371 if (total_in >= IO_BLK_SIZE && total_in + zs.avail_in == 2 * IO_BLK_SIZE)
372 {
373 memmove(iobuffer, &iobuffer[sizeof(iobuffer)/2], sizeof(iobuffer)/2);
374 cpos += sizeof(iobuffer)/2;
375 zs.next_in -= sizeof(iobuffer)/2;
376 zs.avail_in += FillBuffer(1);
377 }
378
379 return;
380 }
381 if (zrc != Z_OK && zrc != Z_BUF_ERROR)
382 throw std::runtime_error("decompression failed");
383
384 if (zs.avail_in == 0)
385 goto FILL_BUFFER;
386
387 throw std::runtime_error("zs.avail_in != 0");
388 }
389
ReadN(size_t N,void * Dst)390 size_t BAMFile::ReadN(size_t N, void *Dst) {
391 uint8_t *const dst = reinterpret_cast<uint8_t *>(Dst);
392 size_t n = 0;
393
394 while (n < N) {
395 size_t const avail_out = N - n;
396 size_t const avail_in = zs.total_out - bam_cur;
397
398 if (avail_in) {
399 size_t const copy = avail_out < avail_in ? avail_out : avail_in;
400
401 memmove(dst + n, bambuffer + bam_cur, copy);
402 bam_cur += copy;
403 if (bam_cur == zs.total_out)
404 bam_cur = zs.total_out = 0;
405
406 n += copy;
407 if (n == N)
408 break;
409 }
410 ReadZlib();
411 if (zs.total_out == 0)
412 break;
413 }
414 return n;
415 }
416
SkipN(size_t N)417 size_t BAMFile::SkipN(size_t N) {
418 size_t n = 0;
419
420 while (n < N) {
421 size_t const avail_out = N - n;
422 size_t const avail_in = zs.total_out - bam_cur;
423
424 if (avail_in) {
425 size_t const copy = avail_out < avail_in ? avail_out : avail_in;
426
427 bam_cur += copy;
428 n += copy;
429 if (n == N)
430 break;
431 }
432 ReadZlib();
433 if (zs.total_out == 0)
434 break;
435 }
436 return n;
437 }
438
Seek(size_t const new_bpos,unsigned const new_bam_cur)439 void BAMFile::Seek(size_t const new_bpos, unsigned const new_bam_cur) {
440 unsigned const c_offset = new_bpos % IO_BLK_SIZE;
441 size_t const new_cpos = new_bpos - c_offset;
442
443 #if 0
444 std::cerr << "seek to " << std::hex << new_bpos << "|" << new_bam_cur << std::endl;
445 #endif
446
447 #if USE_STDIO
448 if (fseek(file, new_cpos, SEEK_SET))
449 throw std::runtime_error("position is invalid");
450 cpos = ftell(file);
451 #else
452 file.seekg(new_cpos);
453 cpos = file.tellg();
454 #endif
455 zs.avail_in = FillBuffer(2);
456 if (zs.avail_in > c_offset) {
457 zs.avail_in -= c_offset;
458 zs.next_in = iobuffer + c_offset;
459 ReadZlib();
460 if (zs.total_out > new_bam_cur) {
461 bam_cur = new_bam_cur;
462 return;
463 }
464 }
465 throw std::runtime_error("position is invalid");
466 }
467
468 template <typename T>
Read(size_t count,T * dst)469 bool BAMFile::Read(size_t count, T *dst) {
470 size_t const nwant = count * sizeof(T);
471 size_t const nread = ReadN(nwant, reinterpret_cast<void *>(dst));
472
473 return nwant == nread;
474 }
475
ReadI32()476 int32_t BAMFile::ReadI32() {
477 int32_t value;
478
479 if (Read(1, &value))
480 return LE2Host<int32_t>(&value);
481 throw std::runtime_error("insufficient data while reading bam file");
482 }
483
ReadI32(int32_t & rslt)484 bool BAMFile::ReadI32(int32_t &rslt) {
485 int32_t value;
486
487 if (Read(1, &value)) {
488 rslt = LE2Host<int32_t>(&value);
489 return true;
490 }
491 return false;
492 }
493
InflateInit(void)494 void BAMFile::InflateInit(void) {
495 memset(&zs, 0, sizeof(zs));
496
497 int const zrc = inflateInit2(&zs, MAX_WBITS + 16);
498 switch (zrc) {
499 case Z_OK:
500 break;
501 case Z_MEM_ERROR:
502 throw std::bad_alloc();
503 break;
504 case Z_VERSION_ERROR:
505 throw std::runtime_error(std::string("zlib version is not compatible; need version " ZLIB_VERSION " but have ") + zlibVersion());
506 break;
507 case Z_STREAM_ERROR:
508 default:
509 throw std::invalid_argument(zs.msg ? zs.msg : "unknown");
510 break;
511 }
512 }
513
CheckHeaderSignature(void)514 void BAMFile::CheckHeaderSignature(void) {
515 static char const sig[] = "BAM\1";
516 char actual[4];
517
518 if (!Read(4, actual) || memcmp(actual, sig, 4) != 0)
519 throw std::runtime_error("Not a BAM file");
520 }
521
ReadHeader(void)522 void BAMFile::ReadHeader(void) {
523 CheckHeaderSignature();
524
525 {
526 int32_t const l_text = ReadI32();
527 if (l_text < 0)
528 throw std::runtime_error("header text length < 0");
529
530 char *const text = new char[l_text];
531 if (!Read(l_text, text))
532 throw std::runtime_error("file is truncated");
533
534 headerText = text;
535 delete [] text;
536 }
537 int32_t const n_ref = ReadI32();
538 if (n_ref < 0)
539 throw std::runtime_error("header reference count < 0");
540
541 references.reserve(n_ref);
542
543 for (int i = 0; i < n_ref; ++i) {
544 int32_t const l_name = ReadI32();
545
546 if (l_name < 0)
547 throw std::runtime_error("header reference name length < 0");
548
549 char *const name = new char[l_name];
550 if (!Read(l_name, name))
551 throw std::runtime_error("file is truncated");
552
553 int32_t const l_ref = ReadI32();
554 if (l_ref < 0)
555 throw std::runtime_error("header reference length < 0");
556
557 references.push_back(HeaderRefInfo(name, l_ref));
558 referencesByName[name] = i;
559
560 delete [] name;
561 }
562 }
563
LoadIndexData(size_t const fsize,char const data[])564 void BAMFile::LoadIndexData(size_t const fsize, char const data[]) {
565 char const *const endp = data + fsize;
566
567 if (memcmp(data, "BAI\1", 4) != 0)
568 return;
569
570 int32_t const n_ref = LE2Host<int32_t>(data + 4);
571 if (n_ref != references.size())
572 return;
573
574 size_t offset = 8;
575
576 for (int i = 0; i < n_ref; ++i) {
577 size_t const size = references[i].LoadIndex(data + offset, endp);
578
579 offset += size;
580 if (size == 0) {
581 std::cerr << "failed to load index #" << (i + 1) << std::endl;
582 for (int j = 0; j < i; ++j)
583 references[j].DropIndex();
584 return;
585 }
586 }
587 }
588
LoadIndex(std::string const & filepath)589 void BAMFile::LoadIndex(std::string const &filepath) {
590 char *data;
591 size_t fsize;
592 {
593 std::string const idxpath(filepath+".bai");
594 std::ifstream ifile;
595
596 ifile.open(idxpath.c_str(), std::ifstream::in | std::ifstream::binary);
597 if (!ifile.is_open())
598 return;
599
600 std::filebuf *const buf = ifile.rdbuf();
601 fsize = buf->pubseekoff(0, ifile.end, ifile.in);
602
603 if (fsize < 8)
604 return;
605
606 buf->pubseekpos(0, ifile.in);
607
608 data = new char[fsize];
609 buf->sgetn(data, fsize);
610 }
611 LoadIndexData(fsize, data);
612 delete [] data;
613 }
614
BAMFile(std::string const & filepath)615 BAMFile::BAMFile(std::string const &filepath)
616 {
617 InflateInit();
618
619 #if USE_STDIO
620 file = fopen(filepath.c_str(), "rb");
621 if (file == NULL)
622 throw std::runtime_error(std::string("The file '")+filepath+"' could not be opened");
623 #else
624 file.open(filepath.c_str(), std::ifstream::in | std::ifstream::binary);
625 if (!file.is_open())
626 throw std::runtime_error(std::string("The file '")+filepath+"' could not be opened");
627 #endif
628
629 ReadHeader();
630 first_bpos = cpos + (zs.next_in - iobuffer);
631 first_bam_cur = bam_cur;
632 LoadIndex(filepath);
633 }
634
~BAMFile()635 BAMFile::~BAMFile()
636 {
637 inflateEnd(&zs);
638 }
639
Read()640 BAMRecord const *BAMFile::Read()
641 {
642 union aligned_BAMRecord {
643 SizedRawData raw;
644 BAMRecord record;
645 struct {
646 uint8_t align[16];
647 } align;
648 };
649 int32_t datasize;
650
651 if (!ReadI32(datasize)) // assumes cause is EOF
652 return 0;
653
654 if (datasize < 0)
655 throw std::runtime_error("file is corrupt: record size < 0");
656
657 uint32_t const size = (uint32_t)datasize;
658
659 union aligned_BAMRecord *data = new aligned_BAMRecord[(size + sizeof(uint32_t) + sizeof(aligned_BAMRecord) - 1)/sizeof(aligned_BAMRecord)];
660 data->raw.size = size;
661 if (Read(size, data->raw.data))
662 return &data->record;
663
664 delete [] data;
665 throw std::runtime_error("file is truncated");
666 }
667
isGoodRecord(BAMRecord const & rec)668 bool BAMFile::isGoodRecord(BAMRecord const &rec)
669 {
670 if (rec.isTooSmall())
671 return false;
672
673 unsigned const refs = (unsigned)references.size();
674 unsigned const flag = rec.flag();
675 int const self_refID = rec.refID();
676 if ((flag & 0x40) != 0 && self_refID > 0 && self_refID >= refs)
677 return false;
678
679 int const mate_refID = rec.next_refID();
680 if ((flag & 0x80) != 0 && mate_refID > 0 && mate_refID >= refs)
681 return false;
682
683 return true;
684 }
685
Slice(const std::string & rname,unsigned start,unsigned last)686 BAMRecordSource *BAMFile::Slice(const std::string &rname, unsigned start, unsigned last)
687 {
688 int const refID = getReferenceIndexByName(rname);
689
690 if (refID < 0)
691 return new BAMRecordSource();
692
693 HeaderRefInfo const &ri = getRefInfo(refID);
694
695 if (start > 0)
696 --start;
697 if (last == 0)
698 last = ri.length;
699
700 if (!ri.index || start >= ri.length)
701 return new BAMRecordSource();
702
703 if (last > start + ri.length)
704 last = start + ri.length;
705
706 BAMFilePosTypeList const &index = ri.slice(start, last);
707
708 if (index.size() == 0)
709 return new BAMRecordSource();
710
711 return new BAMFileSlice(*this, refID, start, last, index);
712 }
713