1 /* ===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  */
25 
26 #define USE_STDIO 1
27 
28 #include <stdint.h>
29 #include <string.h>
30 
31 #include <stdexcept>
32 #include <vector>
33 #include <map>
34 #include <algorithm>
35 #include <iterator>
36 
37 #include <zlib.h>
38 
39 #if USE_STDIO
40 #include <cstdio>
41 #else
42 #include <fstream>
43 #endif
44 
45 #define BAM_BLK_MAX (64u * 1024u)
46 #define IO_BLK_SIZE (1024u * 1024u)
47 
48 template<typename T>
LE2Host(void const * const src)49 static T LE2Host(void const *const src)
50 {
51     union {
52         uint8_t ch[sizeof(T)];
53         T v; // ensures alignment if necessary
54     } u;
55     memmove(reinterpret_cast<void *>(&u), src, sizeof(T));
56 #if BYTE_ORDER == LITTLE_ENDIAN
57     return u.v;
58 #else
59     T y = 0;
60     size_t j = sizeof(T);
61 
62     for (size_t i = 0; i < sizeof(T); ++i)
63         y = (y << 8) | u.ch[--j];
64 
65     return y;
66 #endif
67 }
68 
69 class BAMFilePosType {
70     uint64_t value;
71 public:
BAMFilePosType(uint64_t const x=0)72     BAMFilePosType(uint64_t const x = 0) : value(x) {}
hasValue() const73     bool hasValue() const { return value != 0; }
fpos() const74     uint64_t fpos() const {
75         return value >> 16;
76     }
bpos() const77     uint16_t bpos() const {
78         return (uint16_t)value;
79     }
operator <(BAMFilePosType const lhs,BAMFilePosType const rhs)80     friend bool operator <(BAMFilePosType const lhs, BAMFilePosType const rhs) {
81         return rhs.value < rhs.value;
82     }
operator ==(BAMFilePosType const lhs,BAMFilePosType const rhs)83     friend bool operator ==(BAMFilePosType const lhs, BAMFilePosType const rhs) {
84         return rhs.value == rhs.value;
85     }
86 };
87 
88 template <>
LE2Host(void const * const src)89 BAMFilePosType LE2Host<BAMFilePosType>(void const *const src)
90 {
91     return BAMFilePosType(LE2Host<uint64_t>(src));
92 }
93 
94 
95 typedef std::vector<BAMFilePosType> BAMFilePosTypeList;
96 
97 class BAMFile;
98 class RefIndex;
99 
100 class HeaderRefInfo
101 {
102     friend class BAMFile;
103 
104     RefIndex const *index;
105     std::string name;
106     unsigned length;
107 
HeaderRefInfo(std::string const & Name,int32_t const Length)108     HeaderRefInfo(std::string const &Name, int32_t const Length)
109     : name(Name), length(Length), index(0)
110     {}
111     size_t LoadIndex(char const data[], char const *const endp);
112     void DropIndex();
113 public:
~HeaderRefInfo()114     ~HeaderRefInfo() {
115         DropIndex();
116     }
117     BAMFilePosTypeList slice(unsigned const beg, unsigned const end) const;
getName() const118     std::string const &getName() const {
119         return name;
120     }
getLength() const121     unsigned getLength() const {
122         return length;
123     }
124 };
125 
126 struct SizedRawData
127 {
128     uint32_t size;
129     uint8_t data[1];
130 };
131 
132 struct BAMLayout : public SizedRawData {
133 /* layout looks like:
134     uint8_t
135         m_refID[4],
136         m_pos[4],
137         m_bin_mq_nl[4],
138         m_flag_nc[4],
139         m_l_seq[4],
140         m_next_refID[4],
141         m_next_pos[4],
142         m_tlen[4],
143         m_readname[1];
144  */
145     enum layout {
146         start_refID = 0,
147         start_pos = 4,
148         start_bin_mq_nl = start_pos + 4,
149         start_flag_nc = start_bin_mq_nl + 4,
150         start_l_seq = start_flag_nc + 4,
151         start_next_refID = start_l_seq + 4,
152         start_next_pos = start_next_refID + 4,
153         start_tlen = start_next_pos + 4,
154         start_readname = start_tlen + 4,
155 
156         length_fixed_part = start_readname,
157 
158         start_nl = start_bin_mq_nl,
159         start_mq = start_nl + 1,
160         start_bin = start_mq + 1,
161         start_nc = start_flag_nc,
162         start_flag = start_nc + 2
163     };
p_refIDBAMLayout164     uint8_t const *p_refID     () const { return data + start_refID     ; }
p_posBAMLayout165     uint8_t const *p_pos       () const { return data + start_pos       ; }
p_nlBAMLayout166     uint8_t const *p_nl        () const { return data + start_nl        ; }
p_mqBAMLayout167     uint8_t const *p_mq        () const { return data + start_mq        ; }
p_binBAMLayout168     uint8_t const *p_bin       () const { return data + start_bin       ; }
p_ncBAMLayout169     uint8_t const *p_nc        () const { return data + start_nc        ; }
p_flagBAMLayout170     uint8_t const *p_flag      () const { return data + start_flag      ; }
p_l_seqBAMLayout171     uint8_t const *p_l_seq     () const { return data + start_l_seq     ; }
p_next_refIDBAMLayout172     uint8_t const *p_next_refID() const { return data + start_next_refID; }
p_next_posBAMLayout173     uint8_t const *p_next_pos  () const { return data + start_next_pos  ; }
p_tlenBAMLayout174     uint8_t const *p_tlen      () const { return data + start_tlen      ; }
p_readnameBAMLayout175     uint8_t const *p_readname  () const { return data + start_readname  ; }
176 
min_sizeBAMLayout177     size_t min_size() const {
178         if (size < length_fixed_part)
179             return length_fixed_part;
180 
181         uint16_t const nc = LE2Host<uint16_t>(p_nc());
182         uint32_t const lseq = LE2Host<int32_t>(p_l_seq());
183         return length_fixed_part + 4u * nc + ((lseq + 1) >> 1) + lseq;
184     }
endpBAMLayout185     void const *endp() const {
186         return (void const *)(data + size);
187     }
188 };
189 
190 class BAMRecord : private BAMLayout {
191 public:
isTooSmall() const192     bool isTooSmall() const {
193         return (size_t)size < min_size();
194     }
195 
refID() const196     int32_t refID() const { return LE2Host<int32_t>(p_refID()); }
pos() const197     int32_t pos() const { return LE2Host<int32_t>(p_pos()); }
mq() const198     uint8_t mq() const { return *p_mq(); }
l_read_name() const199     uint8_t l_read_name() const { return *p_nl(); }
flag() const200     uint16_t flag() const { return LE2Host<uint16_t>(p_flag()); }
nc() const201     uint16_t nc() const { return LE2Host<uint16_t>(p_nc()); }
l_seq() const202     int32_t l_seq() const { return LE2Host<int32_t>(p_l_seq()); }
next_refID() const203     int32_t next_refID() const { return LE2Host<int32_t>(p_next_refID()); }
next_pos() const204     int32_t next_pos() const { return LE2Host<int32_t>(p_next_pos()); }
tlen() const205     int32_t tlen() const { return LE2Host<int32_t>(p_tlen()); }
readname() const206     char const *readname() const { return (char const *)p_readname(); }
cigar(unsigned const i) const207     uint32_t cigar(unsigned const i) const {
208         uint8_t const *const p_cigar = p_readname() + l_read_name();
209         return LE2Host<uint32_t>(p_cigar + 4u * i);
210     }
seq() const211     uint8_t const *seq() const { return p_readname() + l_read_name() + 4u * nc(); }
seq(unsigned const i) const212     char seq(unsigned const i) const {
213         static char const tr[] = "=ACMGRSVTWYHKDBN";
214         uint8_t const b4na2 = seq()[i >> 1];
215         uint8_t const lo = b4na2 & 15;
216         uint8_t const hi = b4na2 >> 4;
217         return tr[(i & 1) ? lo : hi];
218     }
qual() const219     uint8_t const *qual() const { return seq() + ((l_seq() + 1) >> 1); }
extra() const220     void const *extra() const { return (void const *)(qual() + l_seq()); }
221 
refLen() const222     unsigned refLen() const {
223         unsigned const n = nc();
224         unsigned rslt = 0;
225 
226         for (unsigned i = 0; i < n; ++i) {
227             uint32_t const op = cigar(i);
228             int const code = op & 0x0F;
229             int const len = op >> 4;
230             switch (code) {
231                 case 0: /* M */
232                 case 2: /* D */
233                 case 3: /* N */
234                 case 7: /* = */
235                 case 8: /* X */
236                     rslt += len;
237             }
238         }
239         return rslt;
240     }
isSelfMapped() const241     bool isSelfMapped() const {
242         return ((flag() & 0x0004) != 0 || refID() < 0 || pos() < 0 || nc() == 0) ? false : true;
243     }
isMateMapped() const244     bool isMateMapped() const {
245         int const FLAG = flag();
246 
247         return ((FLAG & 0x0001) == 0 || (FLAG & 0x0008) != 0 || next_refID() < 0 || next_pos() < 0) ? false : true;
248     }
249 
cigarString(std::string & rslt,bool const clipped,char const OPCODE[]) const250     void cigarString(std::string &rslt, bool const clipped, char const OPCODE[]) const {
251         unsigned const n = nc();
252         int last_len = 0;
253         char last_code = 0;
254         unsigned last_size = 0;
255 
256         rslt.resize(0);
257         rslt.reserve(11*n);
258 
259         for (unsigned i = 0; i < n; ++i) {
260             char buf[12];
261             char *cur = buf + sizeof(buf);
262             uint32_t const op = cigar(i);
263             char const code = OPCODE[op & 0x0F];
264             int len = op >> 4;
265 
266             if (last_code == code) {
267                 len += last_len;
268                 rslt.resize(last_size);
269             }
270             last_size = (unsigned)rslt.size();
271             last_len = len;
272             last_code = code;
273 
274             *--cur = '\0';
275             *--cur = code;
276             for ( ; ; ) {
277                 *--cur = len % 10 + '0';
278                 if ((len /= 10) == 0)
279                     break;
280             }
281             if (!clipped || last_size != 0 || !(last_code == 'S' || last_code == 'H'))
282 	            rslt.append(cur);
283         }
284         if (clipped && (last_code == 'S' || last_code == 'H'))
285             rslt.resize(last_size);
286     }
287     class OptionalField {
288         char tag[2];
289         char val_type;
290         union {
291             char scalar[1];
292             struct {
293                 char type;
294                 char count[4];
295                 char value[1];
296             } array;
297         } value;
298 
type_size(int const type)299         static int type_size(int const type) {
300             switch (type) {
301                 case 'A':
302                 case 'C':
303                 case 'c':
304                     return 1;
305                 case 'S':
306                 case 's':
307                     return 2;
308                 case 'F':
309                 case 'I':
310                 case 'i':
311                     return 4;
312                 default:
313                     return -1;
314             }
315         }
316 
size(void const * const max) const317         int size(void const *const max) const {
318             if (val_type == 'B') {
319                 int const elem_size = type_size(value.array.type);
320                 if (elem_size < 0)
321                     return -1;
322                 int const elem_count = LE2Host<int32_t>(value.array.count);
323                 char const *end = &value.array.value[elem_size * elem_count];
324 
325                 if (end > max)
326                     return -1;
327                 return (int)(end - tag);
328             }
329             else if (val_type == 'Z' || val_type == 'H') {
330                 for (char const *cur = value.scalar; cur != max; ++cur) {
331                     if (*cur == '\0')
332                         return (int)((cur + 1) - tag);
333                 }
334                 return -1;
335             }
336             else {
337                 int const ts = type_size(val_type);
338                 return ts < 0 ? -1 : (int)(&value.scalar[type_size(val_type)] - tag);
339             }
340         }
next(void const * const max) const341         void const *next(void const *const max) const {
342             int const bytes = size(max);
343             if (bytes <= 0)
344                 return 0;
345             return (void const *)(((char const *)this) + bytes);
346         }
OptionalField()347         OptionalField() {}
348     public:
getTag() const349         char const *getTag() const {
350             return tag;
351         }
getValueType() const352         char getValueType() const {
353             if (val_type != 'B')
354                 return val_type;
355             else
356                 return value.array.type;
357         }
getElementCount() const358         int getElementCount() const {
359             if (val_type != 'B')
360                 return 1;
361             else
362                 return LE2Host<int32_t>(value.array.count);
363         }
getElementSize() const364         int getElementSize() const {
365             if (val_type == 'B')
366                 return type_size(value.array.type);
367             else if (val_type != 'Z' && val_type != 'H')
368                 return type_size(val_type);
369             else
370                 return (int)strlen(value.scalar);
371         }
getRawValue() const372         char const *getRawValue() const {
373             if (val_type == 'B')
374                 return value.array.value;
375             else
376                 return value.scalar;
377         }
378 
379         typedef OptionalField const constOptionalField;
380         class const_iterator : public std::iterator<std::forward_iterator_tag, constOptionalField>
381         {
382             friend class BAMRecord;
383             void const *cur;
384             void const *const endp;
385 
const_iterator(void const * const init,void const * const last)386             const_iterator(void const *const init, void const *const last) : cur(init), endp(last) {}
387         public:
operator ++()388             const_iterator &operator ++() {
389                 if (cur < endp)
390                     cur = ((OptionalField const *)cur)->next(endp);
391                 if (!cur)
392                     cur = endp;
393                 return *this;
394             }
operator *()395             OptionalField const &operator *() {
396                 return *((OptionalField const *)cur);
397             }
operator ->()398             OptionalField const *operator ->() {
399                 return (OptionalField const *)cur;
400             }
operator ==(const_iterator const & a,const_iterator const & b)401             friend bool operator ==(const_iterator const &a, const_iterator const &b) {
402                 return a.endp == b.endp && a.cur == b.cur;
403             }
operator !=(const_iterator const & a,const_iterator const & b)404             friend bool operator !=(const_iterator const &a, const_iterator const &b) {
405                 return !(a == b);
406             }
407         };
408     };
begin() const409     OptionalField::const_iterator begin() const {
410         return OptionalField::const_iterator(extra(), endp());
411     }
end() const412     OptionalField::const_iterator end() const {
413         return OptionalField::const_iterator(endp(), endp());
414     }
415 };
416 
417 class BAMRecordSource
418 {
419 public:
isGoodRecord(BAMRecord const & rec)420     virtual bool isGoodRecord(BAMRecord const &rec) {
421         return false;
422     }
Read()423     virtual BAMRecord const *Read() {
424         return 0;
425     }
DumpSAM(std::ostream & oss,BAMRecord const & rec) const426     virtual void DumpSAM(std::ostream &oss, BAMRecord const &rec) const {
427 
428     }
429 };
430 
431 class BAMFile : public BAMRecordSource {
432 #if USE_STDIO
433     FILE *file;
434 #else
435     std::ifstream file;
436 #endif
437     std::vector<HeaderRefInfo> references;
438     std::map<std::string, unsigned> referencesByName;
439     std::string headerText;
440 
441     size_t first_bpos;
442     size_t bpos;                    /* file position of bambuffer */
443     size_t cpos;                    /* file position of iobuffer  */
444     z_stream zs;
445 
446     unsigned first_bam_cur;
447     unsigned bam_cur;               /* current offset in bambuffer */
448 
449     Bytef iobuffer[2*IO_BLK_SIZE];
450     Bytef bambuffer[BAM_BLK_MAX];
451 
452     unsigned FillBuffer(int const n);
453     void ReadZlib(void);
454     size_t ReadN(size_t N, void *Dst);
455     size_t SkipN(size_t N);
456     template <typename T> bool Read(size_t count, T *dst);
457     int32_t ReadI32();
458     bool ReadI32(int32_t &rslt);
459     void InflateInit(void);
460     void CheckHeaderSignature(void);
461     void ReadHeader(void);
462     void LoadIndexData(size_t const fsize, char const data[]);
463     void LoadIndex(std::string const &filepath);
464 
465 public:
466     BAMFile(std::string const &filepath);
467     ~BAMFile();
468     void Seek(size_t const new_bpos, unsigned new_bam_cur);
Rewind()469     void Rewind() {
470         Seek(first_bpos, first_bam_cur);
471     }
472     virtual bool isGoodRecord(BAMRecord const &rec);
473     virtual BAMRecord const *Read();
474 
countOfReferences() const475     unsigned countOfReferences() const {
476         return (unsigned)references.size();
477     }
478 
getReferenceIndexByName(std::string const & name) const479     int getReferenceIndexByName(std::string const &name) const {
480         std::map<std::string, unsigned>::const_iterator i = referencesByName.find(name);
481         if (i != referencesByName.end())
482             return i->second;
483         else
484             return -1;
485     }
486 
getRefInfo(unsigned const i) const487     HeaderRefInfo const &getRefInfo(unsigned const i) const {
488         return references[i];
489     }
490 
491     BAMRecordSource *Slice(std::string const &rname, unsigned start, unsigned last);
492 
493     void DumpSAM(std::ostream &oss, BAMRecord const &rec) const;
494 };
495 
496 class BAMFileSlice : public BAMRecordSource {
497     friend class BAMFile;
498 
499     BAMFile *const parent;
500     BAMFilePosTypeList const index;
501     unsigned const refID;
502     unsigned const start;
503     unsigned const end;
504     BAMFilePosTypeList::const_iterator cur;
505 
Seek(void)506     void Seek(void)
507     {
508         BAMFilePosType const pos = *cur++;
509         size_t const fpos = pos.fpos();
510         uint16_t const bpos = pos.bpos();
511 
512         parent->Seek(fpos, bpos);
513     }
BAMFileSlice(BAMFile & p,unsigned const r,unsigned const s,unsigned const e,BAMFilePosTypeList const & i)514     BAMFileSlice(BAMFile &p, unsigned const r, unsigned const s, unsigned const e, BAMFilePosTypeList const &i)
515     : parent(&p)
516     , refID(r)
517     , start(s)
518     , end(e)
519     , index(i)
520     {
521         cur = index.begin();
522         Seek();
523     }
524 public:
isGoodRecord(BAMRecord const & rec)525     virtual bool isGoodRecord(BAMRecord const &rec) {
526         return parent->isGoodRecord(rec);
527     }
Read()528     virtual BAMRecord const *Read() {
529         for ( ; ; ) {
530             BAMRecord const *const current = parent->Read();
531 
532             if (!current)
533                 return 0;
534 
535             if (!current->isSelfMapped()) {
536                 delete current;
537                 continue;
538             }
539             unsigned const REF = current->refID();
540             unsigned const POS = current->pos();
541 
542             if (REF != refID || POS >= end) {
543                 delete current;
544                 return 0;
545             }
546             unsigned const LEN = current->refLen();
547 
548             if (POS + LEN <= start) {
549                 delete current;
550                 continue;
551             }
552             return current;
553         }
554     }
DumpSAM(std::ostream & oss,BAMRecord const & rec) const555     void DumpSAM(std::ostream &oss, BAMRecord const &rec) const {
556         parent->DumpSAM(oss, rec);
557     }
558 };
559