1 /* ===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  */
25 
26 #include <iostream>
27 #include <fstream>
28 #include "bam.hpp"
29 
30 #define MAX_INDEX_SEQ_LEN ((1u << 29) - 1)
31 #define MAX_BIN  (37449u)
32 #define NUMINTV ((MAX_INDEX_SEQ_LEN + 1) >> 14)
33 
34 class RefIndex {
35 public:
36     BAMFilePosType off_beg, off_end;
37     uint64_t n_mapped, n_unmapped;
38     BAMFilePosTypeList interval;
39     std::vector<BAMFilePosTypeList> bins;
40 
41 private:
CopyWhereLess(BAMFilePosTypeList & dst,BAMFilePosTypeList const & src,BAMFilePosType const maxpos)42     static void CopyWhereLess(BAMFilePosTypeList &dst,
43                               BAMFilePosTypeList const &src,
44                               BAMFilePosType const maxpos)
45     {
46         if (maxpos.hasValue()) {
47             for (unsigned i = 0; i < src.size(); ++i) {
48                 BAMFilePosType const pos = src[i];
49 
50                 if (pos < maxpos)
51                     dst.push_back(pos);
52             }
53         }
54         else {
55             for (unsigned i = 0; i < src.size(); ++i)
56                 dst.push_back(src[i]);
57         }
58     }
59 public:
LoadIndexIntervals(char const * data,char const * const endp)60     char const *LoadIndexIntervals(char const *data, char const *const endp)
61     {
62         if (data + 4 > endp)
63             throw std::runtime_error("insufficient data to load index interval count");
64         int32_t const n = LE2Host<int32_t>(data); data += 4;
65 
66         char const *const next = data + 8 * n;
67         if (next > endp)
68             throw std::runtime_error("insufficient data to load index intervals");
69 
70         BAMFilePosType last(0);
71 
72         interval.resize(n);
73         for (unsigned i = 0; i < n; ++i) {
74             BAMFilePosType const intvl = LE2Host<BAMFilePosType>(data); data += 8;
75 
76             interval[i] = (intvl == last) ? BAMFilePosType(0) : intvl;
77             last = intvl;
78         }
79         while (interval.size() > 0 && !interval.back().hasValue())
80             interval.pop_back();
81 
82         return next;
83     }
LoadIndexBins(char const * data,char const * const endp)84     char const *LoadIndexBins(char const *data, char const *const endp)
85     {
86         if (data + 4 > endp)
87             throw std::runtime_error("insufficient data to load index bin count");
88         int32_t const n_bin = LE2Host<int32_t>(data); data += 4;
89 
90         bins.resize(MAX_BIN);
91         for (int i = 0; i < n_bin; ++i) {
92             if (data + 8 > endp)
93                 throw std::runtime_error("insufficient data to load index bin size");
94 
95             uint32_t const bin     = LE2Host<uint32_t>(data + 0);
96              int32_t const n_chunk = LE2Host< int32_t>(data + 4);
97 
98             data += 8;
99             if (data + n_chunk * 16 > endp)
100                 throw std::runtime_error("insufficient data to load index bin chunks");
101 
102             if (bin == MAX_BIN && n_chunk == 2) {
103                 // special doodad
104                 off_beg    = LE2Host<BAMFilePosType>(data); data += 8;
105                 off_end    = LE2Host<BAMFilePosType>(data); data += 8;
106                 n_mapped   = LE2Host<uint64_t>(data); data += 8;
107                 n_unmapped = LE2Host<uint64_t>(data); data += 8;
108             }
109             else if (bin < MAX_BIN) {
110                 BAMFilePosTypeList &this_bin = bins[bin];
111 
112                 this_bin.resize(n_chunk);
113 
114                 for (unsigned k = 0; k < n_chunk; ++k) {
115                     BAMFilePosType const beg = LE2Host<BAMFilePosType>(data); data += 8;
116                     BAMFilePosType const end = LE2Host<BAMFilePosType>(data); data += 8;
117 
118                     (void)end;
119                     this_bin[k] = beg;
120                 }
121             }
122             else
123                 data += 16 * n_chunk;
124         }
125         return data;
126     }
RefIndex()127     RefIndex()
128     {}
slice(unsigned const beg,unsigned const end) const129     BAMFilePosTypeList slice(unsigned const beg, unsigned const end) const
130     {
131         unsigned const first[] = { 1, 9, 73, 585, 4681 };
132         unsigned const cnt = end - 1 - beg;
133         unsigned const maxintvl = (end >> 14) + 1;
134         BAMFilePosType const maxpos = maxintvl < interval.size() ? interval[maxintvl] : off_end;
135         unsigned int_beg[5], int_cnt[5];
136         BAMFilePosTypeList rslt;
137 
138         for (unsigned i = 0; i < 5; ++i) {
139             unsigned const shift = 14 + 3 * (4 - i);
140 
141             int_beg[i] = (beg >> shift);
142         }
143         for (unsigned i = 0; i < 5; ++i) {
144             unsigned const shift = 14 + 3 * (4 - i);
145 
146             int_cnt[i] = (cnt >> shift) + 1;
147         }
148         CopyWhereLess(rslt, bins[0], maxpos);
149         for (unsigned i = 0; i < 5; ++i) {
150             unsigned const beg = int_beg[i] + first[i];
151             unsigned const N = int_cnt[i];
152 
153             for (unsigned j = 0; j < N; ++j) {
154                 CopyWhereLess(rslt, bins[beg + j], maxpos);
155             }
156         }
157         std::sort(rslt.begin(), rslt.end());
158         return rslt;
159     }
160 };
161 
LoadIndex(char const data[],char const * const endp)162 size_t HeaderRefInfo::LoadIndex(char const data[], char const *const endp)
163 {
164     RefIndex *i = new RefIndex();
165     try {
166         char const *const next1 = i->LoadIndexBins(data, endp);
167         char const *const next2 = i->LoadIndexIntervals(next1, endp);
168 
169         index = i;
170 
171         return next2 - data;
172     }
173     catch (...) {
174         delete i;
175         throw;
176     }
177 }
178 
DropIndex()179 void HeaderRefInfo::DropIndex()
180 {
181     if (index) {
182         delete index;
183         index = 0;
184     }
185 }
186 
slice(unsigned const beg,unsigned const end) const187 BAMFilePosTypeList HeaderRefInfo::slice(unsigned const beg, unsigned const end) const {
188     return index ? index->slice(beg, end) : BAMFilePosTypeList();
189 }
190 
DumpSAM(std::ostream & oss,BAMRecord const & rec) const191 void BAMFile::DumpSAM(std::ostream &oss, BAMRecord const &rec) const
192 {
193     unsigned const seqlen   = rec.l_seq();
194     std::string const RNAME = rec.isSelfMapped() ? getRefInfo(rec.refID()).getName() : "*";
195     std::string const RNEXT = rec.isMateMapped() ? getRefInfo(rec.next_refID()).getName() : "*";
196     int const POS   		= rec.isSelfMapped() ? (rec.pos() + 1) : 0;
197     int const PNEXT 		= rec.isMateMapped() ? (rec.next_pos() + 1) : 0;
198 
199     oss	        << rec.readname()
200  		<< '\t' << rec.flag()
201     	<< '\t' << RNAME
202     	<< '\t' << POS
203         << '\t' << int(rec.mq())
204     	<< '\t';
205 
206     if (rec.isSelfMapped()) {
207         unsigned const N = rec.nc();
208 
209         for (unsigned i = 0; i < N; ++i) {
210             uint32_t const cv = rec.cigar(i);
211             int const op = cv & 0x0F;
212             int len = cv >> 4;
213             char buf[16];
214             char *cur = buf + sizeof(buf);
215 
216             *--cur = '\0';
217             *--cur = "MIDNSHP=X???????"[op];
218             do {
219                 int const digit = len % 10;
220 
221                 *--cur = digit + '0';
222                 len /= 10;
223             } while (len > 0);
224             oss << cur;
225         }
226     }
227     else
228         oss << '*';
229 
230 	oss	<< '\t' << RNEXT
231     	<< '\t' << PNEXT
232         << '\t' << rec.tlen()
233     	<< '\t';
234 
235     if (seqlen > 0) {
236         for (int i = 0; i < seqlen; ++i)
237             oss << rec.seq(i);
238         oss << '\t';
239 
240         bool allFF = true;
241         uint8_t const *const q = rec.qual();
242         for (unsigned i = 0; i < seqlen; ++i) {
243             uint8_t const qv = q[i];
244 
245             if (qv != 0xFF) {
246                 allFF = false;
247                 break;
248             }
249         }
250         if (allFF)
251             oss << '*';
252         else {
253             for (unsigned i = 0; i < seqlen; ++i) {
254                 uint8_t const qv = q[i];
255                 int const offset = ((int)qv) + '!';
256                 int const out = offset < '~' ? offset : '~';
257 
258                 oss << char(out);
259             }
260         }
261     }
262     else
263         oss << "*\t*";
264 
265     for (BAMRecord::OptionalField::const_iterator i = rec.begin(); i != rec.end(); ++i) {
266         std::string const tag = std::string(i->getTag(), 2);
267         int const elems = i->getElementCount();
268         char const type = i->getValueType();
269         char const *raw = i->getRawValue();
270 
271         oss << '\t' << tag << ':' << (elems == 1 ? type : 'B') << ':';
272         if (type == 'Z' || type == 'H')
273             oss << raw;
274         else {
275             if (elems > 1)
276                 oss << type;
277             for (int j = 0; j < elems; ++j) {
278                 if (elems > 1)
279                     oss << ',';
280                 switch (type) {
281                     case 'A':
282                         oss << *raw;
283                         ++raw;
284                         break;
285                     case 'C':
286                         oss << unsigned(*((uint8_t const *)raw));
287                         ++raw;
288                         break;
289                     case 'c':
290                         oss << int(*((int8_t const *)raw));
291                         ++raw;
292                         break;
293                     case 'S':
294                         oss << unsigned(LE2Host<uint16_t>(raw));
295                         raw += 2;
296                         break;
297                     case 's':
298                         oss << int(LE2Host<int16_t>(raw));
299                         raw += 2;
300                         break;
301                     case 'F':
302                         oss << float(LE2Host<float>(raw));
303                         raw += 4;
304                         break;
305                     case 'I':
306                         oss << unsigned(LE2Host<uint32_t>(raw));
307                         raw += 4;
308                         break;
309                     case 'i':
310                         oss << int(LE2Host<int32_t>(raw));
311                         raw += 4;
312                         break;
313                 }
314             }
315         }
316     }
317 }
318 
FillBuffer(int const n)319 unsigned BAMFile::FillBuffer(int const n) {
320     size_t const nwant = n * IO_BLK_SIZE;
321     char *const dst = (char *)(iobuffer + 2 * IO_BLK_SIZE - nwant);
322 #if USE_STDIO
323     size_t const nread = feof(file) ? 0 : fread(dst, 1, nwant, file);
324 #else
325     size_t const nread = file.eof() ? 0 : file.read(dst, nwant).gcount();
326 
327     if (nread == 0 && !file.eof())
328         throw std::runtime_error("read failed");
329 #endif
330     return (unsigned)nread;
331 }
332 
ReadZlib(void)333 void BAMFile::ReadZlib(void) {
334     zs.next_out  = bambuffer;
335     zs.avail_out = sizeof(bambuffer);
336     zs.total_out = 0;
337     bam_cur      = 0;
338 
339     if (zs.avail_in == 0) {
340     FILL_BUFFER:
341 #if USE_STDIO
342         cpos = ftell(file);
343 #else
344         cpos = file.tellg();
345 #endif
346         zs.avail_in = FillBuffer(2);
347         zs.next_in  = iobuffer;
348         if (zs.avail_in == 0) /* EOF */
349             return;
350     }
351 
352     bpos = cpos + (zs.next_in - iobuffer);
353     int const zrc = inflate(&zs, Z_FINISH);
354 
355     if (zrc == Z_STREAM_END) {
356         /* inflateReset clobbers these value but we want them */
357         uLong const total_out = zs.total_out;
358         uLong const total_in  = zs.total_in;
359 
360         int const zrc = inflateReset(&zs);
361         if (zrc != Z_OK)
362             throw std::logic_error("inflateReset didn't return Z_OK");
363 
364         zs.total_out = total_out;
365         zs.total_in  = total_in;
366 
367 #if 0
368         std::cerr << std::dec << "total_in: " << total_in << "; avail_in: " << zs.avail_in << "; IO_BLK_SIZE: " << IO_BLK_SIZE << std::endl;
369 #endif
370 
371         if (total_in >= IO_BLK_SIZE && total_in + zs.avail_in == 2 * IO_BLK_SIZE)
372         {
373             memmove(iobuffer, &iobuffer[sizeof(iobuffer)/2], sizeof(iobuffer)/2);
374             cpos += sizeof(iobuffer)/2;
375             zs.next_in  -= sizeof(iobuffer)/2;
376             zs.avail_in += FillBuffer(1);
377         }
378 
379         return;
380     }
381     if (zrc != Z_OK && zrc != Z_BUF_ERROR)
382         throw std::runtime_error("decompression failed");
383 
384     if (zs.avail_in == 0)
385         goto FILL_BUFFER;
386 
387     throw std::runtime_error("zs.avail_in != 0");
388 }
389 
ReadN(size_t N,void * Dst)390 size_t BAMFile::ReadN(size_t N, void *Dst) {
391     uint8_t *const dst = reinterpret_cast<uint8_t *>(Dst);
392     size_t n = 0;
393 
394     while (n < N) {
395         size_t const avail_out = N - n;
396         size_t const avail_in = zs.total_out - bam_cur;
397 
398         if (avail_in) {
399             size_t const copy = avail_out < avail_in ? avail_out : avail_in;
400 
401             memmove(dst + n, bambuffer + bam_cur, copy);
402             bam_cur += copy;
403             if (bam_cur == zs.total_out)
404                 bam_cur = zs.total_out = 0;
405 
406             n += copy;
407             if (n == N)
408                 break;
409         }
410         ReadZlib();
411         if (zs.total_out == 0)
412             break;
413     }
414     return n;
415 }
416 
SkipN(size_t N)417 size_t BAMFile::SkipN(size_t N) {
418     size_t n = 0;
419 
420     while (n < N) {
421         size_t const avail_out = N - n;
422         size_t const avail_in = zs.total_out - bam_cur;
423 
424         if (avail_in) {
425             size_t const copy = avail_out < avail_in ? avail_out : avail_in;
426 
427             bam_cur += copy;
428             n += copy;
429             if (n == N)
430                 break;
431         }
432         ReadZlib();
433         if (zs.total_out == 0)
434             break;
435     }
436     return n;
437 }
438 
Seek(size_t const new_bpos,unsigned const new_bam_cur)439 void BAMFile::Seek(size_t const new_bpos, unsigned const new_bam_cur) {
440     unsigned const c_offset = new_bpos % IO_BLK_SIZE;
441     size_t const new_cpos = new_bpos - c_offset;
442 
443 #if 0
444     std::cerr << "seek to " << std::hex << new_bpos << "|" << new_bam_cur << std::endl;
445 #endif
446 
447 #if USE_STDIO
448     if (fseek(file, new_cpos, SEEK_SET))
449         throw std::runtime_error("position is invalid");
450     cpos = ftell(file);
451 #else
452     file.seekg(new_cpos);
453     cpos = file.tellg();
454 #endif
455     zs.avail_in = FillBuffer(2);
456     if (zs.avail_in > c_offset) {
457         zs.avail_in -= c_offset;
458         zs.next_in = iobuffer + c_offset;
459         ReadZlib();
460         if (zs.total_out > new_bam_cur) {
461             bam_cur = new_bam_cur;
462             return;
463         }
464     }
465     throw std::runtime_error("position is invalid");
466 }
467 
468 template <typename T>
Read(size_t count,T * dst)469 bool BAMFile::Read(size_t count, T *dst) {
470     size_t const nwant = count * sizeof(T);
471     size_t const nread = ReadN(nwant, reinterpret_cast<void *>(dst));
472 
473     return nwant == nread;
474 }
475 
ReadI32()476 int32_t BAMFile::ReadI32() {
477     int32_t value;
478 
479     if (Read(1, &value))
480         return LE2Host<int32_t>(&value);
481     throw std::runtime_error("insufficient data while reading bam file");
482 }
483 
ReadI32(int32_t & rslt)484 bool BAMFile::ReadI32(int32_t &rslt) {
485     int32_t value;
486 
487     if (Read(1, &value)) {
488         rslt = LE2Host<int32_t>(&value);
489         return true;
490     }
491     return false;
492 }
493 
InflateInit(void)494 void BAMFile::InflateInit(void) {
495     memset(&zs, 0, sizeof(zs));
496 
497     int const zrc = inflateInit2(&zs, MAX_WBITS + 16);
498     switch (zrc) {
499         case Z_OK:
500             break;
501         case Z_MEM_ERROR:
502             throw std::bad_alloc();
503             break;
504         case Z_VERSION_ERROR:
505             throw std::runtime_error(std::string("zlib version is not compatible; need version " ZLIB_VERSION " but have ") + zlibVersion());
506             break;
507         case Z_STREAM_ERROR:
508         default:
509             throw std::invalid_argument(zs.msg ? zs.msg : "unknown");
510             break;
511     }
512 }
513 
CheckHeaderSignature(void)514 void BAMFile::CheckHeaderSignature(void) {
515     static char const sig[] = "BAM\1";
516     char actual[4];
517 
518     if (!Read(4, actual) || memcmp(actual, sig, 4) != 0)
519         throw std::runtime_error("Not a BAM file");
520 }
521 
ReadHeader(void)522 void BAMFile::ReadHeader(void) {
523     CheckHeaderSignature();
524 
525     {
526         int32_t const l_text = ReadI32();
527         if (l_text < 0)
528             throw std::runtime_error("header text length < 0");
529 
530         char *const text = new char[l_text];
531         if (!Read(l_text, text))
532             throw std::runtime_error("file is truncated");
533 
534         headerText = text;
535         delete [] text;
536     }
537     int32_t const n_ref = ReadI32();
538     if (n_ref < 0)
539         throw std::runtime_error("header reference count < 0");
540 
541     references.reserve(n_ref);
542 
543     for (int i = 0; i < n_ref; ++i) {
544         int32_t const l_name = ReadI32();
545 
546         if (l_name < 0)
547             throw std::runtime_error("header reference name length < 0");
548 
549         char *const name = new char[l_name];
550         if (!Read(l_name, name))
551             throw std::runtime_error("file is truncated");
552 
553         int32_t const l_ref = ReadI32();
554         if (l_ref < 0)
555             throw std::runtime_error("header reference length < 0");
556 
557         references.push_back(HeaderRefInfo(name, l_ref));
558         referencesByName[name] = i;
559 
560         delete [] name;
561     }
562 }
563 
LoadIndexData(size_t const fsize,char const data[])564 void BAMFile::LoadIndexData(size_t const fsize, char const data[]) {
565     char const *const endp = data + fsize;
566 
567     if (memcmp(data, "BAI\1", 4) != 0)
568         return;
569 
570     int32_t const n_ref = LE2Host<int32_t>(data + 4);
571     if (n_ref != references.size())
572         return;
573 
574     size_t offset = 8;
575 
576     for (int i = 0; i < n_ref; ++i) {
577         size_t const size = references[i].LoadIndex(data + offset, endp);
578 
579         offset += size;
580         if (size == 0) {
581             std::cerr << "failed to load index #" << (i + 1) << std::endl;
582             for (int j = 0; j < i; ++j)
583                 references[j].DropIndex();
584             return;
585         }
586     }
587 }
588 
LoadIndex(std::string const & filepath)589 void BAMFile::LoadIndex(std::string const &filepath) {
590     char *data;
591     size_t fsize;
592     {
593         std::string const idxpath(filepath+".bai");
594         std::ifstream ifile;
595 
596         ifile.open(idxpath.c_str(), std::ifstream::in | std::ifstream::binary);
597         if (!ifile.is_open())
598             return;
599 
600         std::filebuf *const buf = ifile.rdbuf();
601         fsize = buf->pubseekoff(0, ifile.end, ifile.in);
602 
603         if (fsize < 8)
604             return;
605 
606         buf->pubseekpos(0, ifile.in);
607 
608         data = new char[fsize];
609         buf->sgetn(data, fsize);
610     }
611     LoadIndexData(fsize, data);
612     delete [] data;
613 }
614 
BAMFile(std::string const & filepath)615 BAMFile::BAMFile(std::string const &filepath)
616 {
617     InflateInit();
618 
619 #if USE_STDIO
620     file = fopen(filepath.c_str(), "rb");
621     if (file == NULL)
622         throw std::runtime_error(std::string("The file '")+filepath+"' could not be opened");
623 #else
624     file.open(filepath.c_str(), std::ifstream::in | std::ifstream::binary);
625     if (!file.is_open())
626         throw std::runtime_error(std::string("The file '")+filepath+"' could not be opened");
627 #endif
628 
629     ReadHeader();
630     first_bpos = cpos + (zs.next_in - iobuffer);
631     first_bam_cur = bam_cur;
632     LoadIndex(filepath);
633 }
634 
~BAMFile()635 BAMFile::~BAMFile()
636 {
637     inflateEnd(&zs);
638 }
639 
Read()640 BAMRecord const *BAMFile::Read()
641 {
642     union aligned_BAMRecord {
643         SizedRawData raw;
644         BAMRecord record;
645         struct {
646             uint8_t align[16];
647         } align;
648     };
649     int32_t datasize;
650 
651     if (!ReadI32(datasize)) // assumes cause is EOF
652         return 0;
653 
654     if (datasize < 0)
655         throw std::runtime_error("file is corrupt: record size < 0");
656 
657     uint32_t const size = (uint32_t)datasize;
658 
659     union aligned_BAMRecord *data = new aligned_BAMRecord[(size + sizeof(uint32_t) + sizeof(aligned_BAMRecord) - 1)/sizeof(aligned_BAMRecord)];
660     data->raw.size = size;
661     if (Read(size, data->raw.data))
662         return &data->record;
663 
664     delete [] data;
665     throw std::runtime_error("file is truncated");
666 }
667 
isGoodRecord(BAMRecord const & rec)668 bool BAMFile::isGoodRecord(BAMRecord const &rec)
669 {
670     if (rec.isTooSmall())
671         return false;
672 
673     unsigned const refs = (unsigned)references.size();
674     unsigned const flag = rec.flag();
675     int const self_refID = rec.refID();
676     if ((flag & 0x40) != 0 && self_refID > 0 && self_refID >= refs)
677         return false;
678 
679     int const mate_refID = rec.next_refID();
680     if ((flag & 0x80) != 0 && mate_refID > 0 && mate_refID >= refs)
681         return false;
682 
683     return true;
684 }
685 
Slice(const std::string & rname,unsigned start,unsigned last)686 BAMRecordSource *BAMFile::Slice(const std::string &rname, unsigned start, unsigned last)
687 {
688     int const refID = getReferenceIndexByName(rname);
689 
690     if (refID < 0)
691         return new BAMRecordSource();
692 
693     HeaderRefInfo const &ri = getRefInfo(refID);
694 
695     if (start > 0)
696         --start;
697     if (last == 0)
698         last = ri.length;
699 
700     if (!ri.index || start >= ri.length)
701         return new BAMRecordSource();
702 
703     if (last > start + ri.length)
704         last = start + ri.length;
705 
706     BAMFilePosTypeList const &index = ri.slice(start, last);
707 
708     if (index.size() == 0)
709         return new BAMRecordSource();
710 
711     return new BAMFileSlice(*this, refID, start, last, index);
712 }
713