1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 */
25
26 #define USE_STDIO 1
27
28 #include <stdint.h>
29 #include <string.h>
30
31 #include <stdexcept>
32 #include <vector>
33 #include <map>
34 #include <algorithm>
35 #include <iterator>
36
37 #include <zlib.h>
38
39 #if USE_STDIO
40 #include <cstdio>
41 #else
42 #include <fstream>
43 #endif
44
45 #define BAM_BLK_MAX (64u * 1024u)
46 #define IO_BLK_SIZE (1024u * 1024u)
47
48 template<typename T>
LE2Host(void const * const src)49 static T LE2Host(void const *const src)
50 {
51 union {
52 uint8_t ch[sizeof(T)];
53 T v; // ensures alignment if necessary
54 } u;
55 memmove(reinterpret_cast<void *>(&u), src, sizeof(T));
56 #if BYTE_ORDER == LITTLE_ENDIAN
57 return u.v;
58 #else
59 T y = 0;
60 size_t j = sizeof(T);
61
62 for (size_t i = 0; i < sizeof(T); ++i)
63 y = (y << 8) | u.ch[--j];
64
65 return y;
66 #endif
67 }
68
69 class BAMFilePosType {
70 uint64_t value;
71 public:
BAMFilePosType(uint64_t const x=0)72 BAMFilePosType(uint64_t const x = 0) : value(x) {}
hasValue() const73 bool hasValue() const { return value != 0; }
fpos() const74 uint64_t fpos() const {
75 return value >> 16;
76 }
bpos() const77 uint16_t bpos() const {
78 return (uint16_t)value;
79 }
operator <(BAMFilePosType const lhs,BAMFilePosType const rhs)80 friend bool operator <(BAMFilePosType const lhs, BAMFilePosType const rhs) {
81 return rhs.value < rhs.value;
82 }
operator ==(BAMFilePosType const lhs,BAMFilePosType const rhs)83 friend bool operator ==(BAMFilePosType const lhs, BAMFilePosType const rhs) {
84 return rhs.value == rhs.value;
85 }
86 };
87
88 template <>
LE2Host(void const * const src)89 BAMFilePosType LE2Host<BAMFilePosType>(void const *const src)
90 {
91 return BAMFilePosType(LE2Host<uint64_t>(src));
92 }
93
94
95 typedef std::vector<BAMFilePosType> BAMFilePosTypeList;
96
97 class BAMFile;
98 class RefIndex;
99
100 class HeaderRefInfo
101 {
102 friend class BAMFile;
103
104 RefIndex const *index;
105 std::string name;
106 unsigned length;
107
HeaderRefInfo(std::string const & Name,int32_t const Length)108 HeaderRefInfo(std::string const &Name, int32_t const Length)
109 : name(Name), length(Length), index(0)
110 {}
111 size_t LoadIndex(char const data[], char const *const endp);
112 void DropIndex();
113 public:
~HeaderRefInfo()114 ~HeaderRefInfo() {
115 DropIndex();
116 }
117 BAMFilePosTypeList slice(unsigned const beg, unsigned const end) const;
getName() const118 std::string const &getName() const {
119 return name;
120 }
getLength() const121 unsigned getLength() const {
122 return length;
123 }
124 };
125
126 struct SizedRawData
127 {
128 uint32_t size;
129 uint8_t data[1];
130 };
131
132 struct BAMLayout : public SizedRawData {
133 /* layout looks like:
134 uint8_t
135 m_refID[4],
136 m_pos[4],
137 m_bin_mq_nl[4],
138 m_flag_nc[4],
139 m_l_seq[4],
140 m_next_refID[4],
141 m_next_pos[4],
142 m_tlen[4],
143 m_readname[1];
144 */
145 enum layout {
146 start_refID = 0,
147 start_pos = 4,
148 start_bin_mq_nl = start_pos + 4,
149 start_flag_nc = start_bin_mq_nl + 4,
150 start_l_seq = start_flag_nc + 4,
151 start_next_refID = start_l_seq + 4,
152 start_next_pos = start_next_refID + 4,
153 start_tlen = start_next_pos + 4,
154 start_readname = start_tlen + 4,
155
156 length_fixed_part = start_readname,
157
158 start_nl = start_bin_mq_nl,
159 start_mq = start_nl + 1,
160 start_bin = start_mq + 1,
161 start_nc = start_flag_nc,
162 start_flag = start_nc + 2
163 };
p_refIDBAMLayout164 uint8_t const *p_refID () const { return data + start_refID ; }
p_posBAMLayout165 uint8_t const *p_pos () const { return data + start_pos ; }
p_nlBAMLayout166 uint8_t const *p_nl () const { return data + start_nl ; }
p_mqBAMLayout167 uint8_t const *p_mq () const { return data + start_mq ; }
p_binBAMLayout168 uint8_t const *p_bin () const { return data + start_bin ; }
p_ncBAMLayout169 uint8_t const *p_nc () const { return data + start_nc ; }
p_flagBAMLayout170 uint8_t const *p_flag () const { return data + start_flag ; }
p_l_seqBAMLayout171 uint8_t const *p_l_seq () const { return data + start_l_seq ; }
p_next_refIDBAMLayout172 uint8_t const *p_next_refID() const { return data + start_next_refID; }
p_next_posBAMLayout173 uint8_t const *p_next_pos () const { return data + start_next_pos ; }
p_tlenBAMLayout174 uint8_t const *p_tlen () const { return data + start_tlen ; }
p_readnameBAMLayout175 uint8_t const *p_readname () const { return data + start_readname ; }
176
min_sizeBAMLayout177 size_t min_size() const {
178 if (size < length_fixed_part)
179 return length_fixed_part;
180
181 uint16_t const nc = LE2Host<uint16_t>(p_nc());
182 uint32_t const lseq = LE2Host<int32_t>(p_l_seq());
183 return length_fixed_part + 4u * nc + ((lseq + 1) >> 1) + lseq;
184 }
endpBAMLayout185 void const *endp() const {
186 return (void const *)(data + size);
187 }
188 };
189
190 class BAMRecord : private BAMLayout {
191 public:
isTooSmall() const192 bool isTooSmall() const {
193 return (size_t)size < min_size();
194 }
195
refID() const196 int32_t refID() const { return LE2Host<int32_t>(p_refID()); }
pos() const197 int32_t pos() const { return LE2Host<int32_t>(p_pos()); }
mq() const198 uint8_t mq() const { return *p_mq(); }
l_read_name() const199 uint8_t l_read_name() const { return *p_nl(); }
flag() const200 uint16_t flag() const { return LE2Host<uint16_t>(p_flag()); }
nc() const201 uint16_t nc() const { return LE2Host<uint16_t>(p_nc()); }
l_seq() const202 int32_t l_seq() const { return LE2Host<int32_t>(p_l_seq()); }
next_refID() const203 int32_t next_refID() const { return LE2Host<int32_t>(p_next_refID()); }
next_pos() const204 int32_t next_pos() const { return LE2Host<int32_t>(p_next_pos()); }
tlen() const205 int32_t tlen() const { return LE2Host<int32_t>(p_tlen()); }
readname() const206 char const *readname() const { return (char const *)p_readname(); }
cigar(unsigned const i) const207 uint32_t cigar(unsigned const i) const {
208 uint8_t const *const p_cigar = p_readname() + l_read_name();
209 return LE2Host<uint32_t>(p_cigar + 4u * i);
210 }
seq() const211 uint8_t const *seq() const { return p_readname() + l_read_name() + 4u * nc(); }
seq(unsigned const i) const212 char seq(unsigned const i) const {
213 static char const tr[] = "=ACMGRSVTWYHKDBN";
214 uint8_t const b4na2 = seq()[i >> 1];
215 uint8_t const lo = b4na2 & 15;
216 uint8_t const hi = b4na2 >> 4;
217 return tr[(i & 1) ? lo : hi];
218 }
qual() const219 uint8_t const *qual() const { return seq() + ((l_seq() + 1) >> 1); }
extra() const220 void const *extra() const { return (void const *)(qual() + l_seq()); }
221
refLen() const222 unsigned refLen() const {
223 unsigned const n = nc();
224 unsigned rslt = 0;
225
226 for (unsigned i = 0; i < n; ++i) {
227 uint32_t const op = cigar(i);
228 int const code = op & 0x0F;
229 int const len = op >> 4;
230 switch (code) {
231 case 0: /* M */
232 case 2: /* D */
233 case 3: /* N */
234 case 7: /* = */
235 case 8: /* X */
236 rslt += len;
237 }
238 }
239 return rslt;
240 }
isSelfMapped() const241 bool isSelfMapped() const {
242 return ((flag() & 0x0004) != 0 || refID() < 0 || pos() < 0 || nc() == 0) ? false : true;
243 }
isMateMapped() const244 bool isMateMapped() const {
245 int const FLAG = flag();
246
247 return ((FLAG & 0x0001) == 0 || (FLAG & 0x0008) != 0 || next_refID() < 0 || next_pos() < 0) ? false : true;
248 }
249
cigarString(std::string & rslt,bool const clipped,char const OPCODE[]) const250 void cigarString(std::string &rslt, bool const clipped, char const OPCODE[]) const {
251 unsigned const n = nc();
252 int last_len = 0;
253 char last_code = 0;
254 unsigned last_size = 0;
255
256 rslt.resize(0);
257 rslt.reserve(11*n);
258
259 for (unsigned i = 0; i < n; ++i) {
260 char buf[12];
261 char *cur = buf + sizeof(buf);
262 uint32_t const op = cigar(i);
263 char const code = OPCODE[op & 0x0F];
264 int len = op >> 4;
265
266 if (last_code == code) {
267 len += last_len;
268 rslt.resize(last_size);
269 }
270 last_size = (unsigned)rslt.size();
271 last_len = len;
272 last_code = code;
273
274 *--cur = '\0';
275 *--cur = code;
276 for ( ; ; ) {
277 *--cur = len % 10 + '0';
278 if ((len /= 10) == 0)
279 break;
280 }
281 if (!clipped || last_size != 0 || !(last_code == 'S' || last_code == 'H'))
282 rslt.append(cur);
283 }
284 if (clipped && (last_code == 'S' || last_code == 'H'))
285 rslt.resize(last_size);
286 }
287 class OptionalField {
288 char tag[2];
289 char val_type;
290 union {
291 char scalar[1];
292 struct {
293 char type;
294 char count[4];
295 char value[1];
296 } array;
297 } value;
298
type_size(int const type)299 static int type_size(int const type) {
300 switch (type) {
301 case 'A':
302 case 'C':
303 case 'c':
304 return 1;
305 case 'S':
306 case 's':
307 return 2;
308 case 'F':
309 case 'I':
310 case 'i':
311 return 4;
312 default:
313 return -1;
314 }
315 }
316
size(void const * const max) const317 int size(void const *const max) const {
318 if (val_type == 'B') {
319 int const elem_size = type_size(value.array.type);
320 if (elem_size < 0)
321 return -1;
322 int const elem_count = LE2Host<int32_t>(value.array.count);
323 char const *end = &value.array.value[elem_size * elem_count];
324
325 if (end > max)
326 return -1;
327 return (int)(end - tag);
328 }
329 else if (val_type == 'Z' || val_type == 'H') {
330 for (char const *cur = value.scalar; cur != max; ++cur) {
331 if (*cur == '\0')
332 return (int)((cur + 1) - tag);
333 }
334 return -1;
335 }
336 else {
337 int const ts = type_size(val_type);
338 return ts < 0 ? -1 : (int)(&value.scalar[type_size(val_type)] - tag);
339 }
340 }
next(void const * const max) const341 void const *next(void const *const max) const {
342 int const bytes = size(max);
343 if (bytes <= 0)
344 return 0;
345 return (void const *)(((char const *)this) + bytes);
346 }
OptionalField()347 OptionalField() {}
348 public:
getTag() const349 char const *getTag() const {
350 return tag;
351 }
getValueType() const352 char getValueType() const {
353 if (val_type != 'B')
354 return val_type;
355 else
356 return value.array.type;
357 }
getElementCount() const358 int getElementCount() const {
359 if (val_type != 'B')
360 return 1;
361 else
362 return LE2Host<int32_t>(value.array.count);
363 }
getElementSize() const364 int getElementSize() const {
365 if (val_type == 'B')
366 return type_size(value.array.type);
367 else if (val_type != 'Z' && val_type != 'H')
368 return type_size(val_type);
369 else
370 return (int)strlen(value.scalar);
371 }
getRawValue() const372 char const *getRawValue() const {
373 if (val_type == 'B')
374 return value.array.value;
375 else
376 return value.scalar;
377 }
378
379 typedef OptionalField const constOptionalField;
380 class const_iterator : public std::iterator<std::forward_iterator_tag, constOptionalField>
381 {
382 friend class BAMRecord;
383 void const *cur;
384 void const *const endp;
385
const_iterator(void const * const init,void const * const last)386 const_iterator(void const *const init, void const *const last) : cur(init), endp(last) {}
387 public:
operator ++()388 const_iterator &operator ++() {
389 if (cur < endp)
390 cur = ((OptionalField const *)cur)->next(endp);
391 if (!cur)
392 cur = endp;
393 return *this;
394 }
operator *()395 OptionalField const &operator *() {
396 return *((OptionalField const *)cur);
397 }
operator ->()398 OptionalField const *operator ->() {
399 return (OptionalField const *)cur;
400 }
operator ==(const_iterator const & a,const_iterator const & b)401 friend bool operator ==(const_iterator const &a, const_iterator const &b) {
402 return a.endp == b.endp && a.cur == b.cur;
403 }
operator !=(const_iterator const & a,const_iterator const & b)404 friend bool operator !=(const_iterator const &a, const_iterator const &b) {
405 return !(a == b);
406 }
407 };
408 };
begin() const409 OptionalField::const_iterator begin() const {
410 return OptionalField::const_iterator(extra(), endp());
411 }
end() const412 OptionalField::const_iterator end() const {
413 return OptionalField::const_iterator(endp(), endp());
414 }
415 };
416
417 class BAMRecordSource
418 {
419 public:
isGoodRecord(BAMRecord const & rec)420 virtual bool isGoodRecord(BAMRecord const &rec) {
421 return false;
422 }
Read()423 virtual BAMRecord const *Read() {
424 return 0;
425 }
DumpSAM(std::ostream & oss,BAMRecord const & rec) const426 virtual void DumpSAM(std::ostream &oss, BAMRecord const &rec) const {
427
428 }
429 };
430
431 class BAMFile : public BAMRecordSource {
432 #if USE_STDIO
433 FILE *file;
434 #else
435 std::ifstream file;
436 #endif
437 std::vector<HeaderRefInfo> references;
438 std::map<std::string, unsigned> referencesByName;
439 std::string headerText;
440
441 size_t first_bpos;
442 size_t bpos; /* file position of bambuffer */
443 size_t cpos; /* file position of iobuffer */
444 z_stream zs;
445
446 unsigned first_bam_cur;
447 unsigned bam_cur; /* current offset in bambuffer */
448
449 Bytef iobuffer[2*IO_BLK_SIZE];
450 Bytef bambuffer[BAM_BLK_MAX];
451
452 unsigned FillBuffer(int const n);
453 void ReadZlib(void);
454 size_t ReadN(size_t N, void *Dst);
455 size_t SkipN(size_t N);
456 template <typename T> bool Read(size_t count, T *dst);
457 int32_t ReadI32();
458 bool ReadI32(int32_t &rslt);
459 void InflateInit(void);
460 void CheckHeaderSignature(void);
461 void ReadHeader(void);
462 void LoadIndexData(size_t const fsize, char const data[]);
463 void LoadIndex(std::string const &filepath);
464
465 public:
466 BAMFile(std::string const &filepath);
467 ~BAMFile();
468 void Seek(size_t const new_bpos, unsigned new_bam_cur);
Rewind()469 void Rewind() {
470 Seek(first_bpos, first_bam_cur);
471 }
472 virtual bool isGoodRecord(BAMRecord const &rec);
473 virtual BAMRecord const *Read();
474
countOfReferences() const475 unsigned countOfReferences() const {
476 return (unsigned)references.size();
477 }
478
getReferenceIndexByName(std::string const & name) const479 int getReferenceIndexByName(std::string const &name) const {
480 std::map<std::string, unsigned>::const_iterator i = referencesByName.find(name);
481 if (i != referencesByName.end())
482 return i->second;
483 else
484 return -1;
485 }
486
getRefInfo(unsigned const i) const487 HeaderRefInfo const &getRefInfo(unsigned const i) const {
488 return references[i];
489 }
490
491 BAMRecordSource *Slice(std::string const &rname, unsigned start, unsigned last);
492
493 void DumpSAM(std::ostream &oss, BAMRecord const &rec) const;
494 };
495
496 class BAMFileSlice : public BAMRecordSource {
497 friend class BAMFile;
498
499 BAMFile *const parent;
500 BAMFilePosTypeList const index;
501 unsigned const refID;
502 unsigned const start;
503 unsigned const end;
504 BAMFilePosTypeList::const_iterator cur;
505
Seek(void)506 void Seek(void)
507 {
508 BAMFilePosType const pos = *cur++;
509 size_t const fpos = pos.fpos();
510 uint16_t const bpos = pos.bpos();
511
512 parent->Seek(fpos, bpos);
513 }
BAMFileSlice(BAMFile & p,unsigned const r,unsigned const s,unsigned const e,BAMFilePosTypeList const & i)514 BAMFileSlice(BAMFile &p, unsigned const r, unsigned const s, unsigned const e, BAMFilePosTypeList const &i)
515 : parent(&p)
516 , refID(r)
517 , start(s)
518 , end(e)
519 , index(i)
520 {
521 cur = index.begin();
522 Seek();
523 }
524 public:
isGoodRecord(BAMRecord const & rec)525 virtual bool isGoodRecord(BAMRecord const &rec) {
526 return parent->isGoodRecord(rec);
527 }
Read()528 virtual BAMRecord const *Read() {
529 for ( ; ; ) {
530 BAMRecord const *const current = parent->Read();
531
532 if (!current)
533 return 0;
534
535 if (!current->isSelfMapped()) {
536 delete current;
537 continue;
538 }
539 unsigned const REF = current->refID();
540 unsigned const POS = current->pos();
541
542 if (REF != refID || POS >= end) {
543 delete current;
544 return 0;
545 }
546 unsigned const LEN = current->refLen();
547
548 if (POS + LEN <= start) {
549 delete current;
550 continue;
551 }
552 return current;
553 }
554 }
DumpSAM(std::ostream & oss,BAMRecord const & rec) const555 void DumpSAM(std::ostream &oss, BAMRecord const &rec) const {
556 parent->DumpSAM(oss, rec);
557 }
558 };
559