1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include <align/extern.h>
28 #include <klib/defs.h>
29 #include <klib/debug.h>
30 #include <klib/sort.h>
31 #include <klib/rc.h>
32 #include <kfs/file.h>
33 #include <kfs/directory.h>
34 #include <kfs/mmap.h>
35 #include <klib/printf.h>
36 #include <klib/log.h>
37 #include <klib/text.h>
38 #include <klib/refcount.h>
39 #include <sysalloc.h>
40
41 #include <atomic32.h>
42 #include <strtol.h>
43
44 #include <align/bam.h>
45 #include "bam-priv.h"
46
47 #include <vfs/manager.h>
48 #include <vfs/path.h>
49
50 #include <limits.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <ctype.h>
54 #include <assert.h>
55 #if 1
56 /*_DEBUGGING*/
57 #include <stdio.h>
58 #endif
59
60 #include <endian.h>
61 #include <byteswap.h>
62
63 #include <zlib.h>
64
65 #include <os-native.h>
66
67
68 #if __BYTE_ORDER == __LITTLE_ENDIAN
LE2HUI16(void const * X)69 static uint16_t LE2HUI16(void const *X) { uint16_t y; memmove(&y, X, sizeof(y)); return y; }
LE2HUI32(void const * X)70 static uint32_t LE2HUI32(void const *X) { uint32_t y; memmove(&y, X, sizeof(y)); return y; }
LE2HUI64(void const * X)71 static uint64_t LE2HUI64(void const *X) { uint64_t y; memmove(&y, X, sizeof(y)); return y; }
LE2HI16(void const * X)72 static int16_t LE2HI16(void const *X) { int16_t y; memmove(&y, X, sizeof(y)); return y; }
LE2HI32(void const * X)73 static int32_t LE2HI32(void const *X) { int32_t y; memmove(&y, X, sizeof(y)); return y; }
74 /* static int64_t LE2HI64(void const *X) { int64_t y; memmove(&y, X, sizeof(y)); return y; } */
75 #endif
76 #if __BYTE_ORDER == __BIG_ENDIAN
LE2HUI16(void const * X)77 static uint16_t LE2HUI16(void const *X) { uint16_t y; memmove(&y, X, sizeof(y)); return (uint16_t)bswap_16(y); }
LE2HUI32(void const * X)78 static uint32_t LE2HUI32(void const *X) { uint32_t y; memmove(&y, X, sizeof(y)); return (uint32_t)bswap_32(y); }
LE2HUI64(void const * X)79 static uint64_t LE2HUI64(void const *X) { uint64_t y; memmove(&y, X, sizeof(y)); return (uint64_t)bswap_64(y); }
LE2HI16(void const * X)80 static int16_t LE2HI16(void const *X) { int16_t y; memmove(&y, X, sizeof(y)); return ( int16_t)bswap_16(y); }
LE2HI32(void const * X)81 static int32_t LE2HI32(void const *X) { int32_t y; memmove(&y, X, sizeof(y)); return ( int32_t)bswap_32(y); }
LE2HI64(void const * X)82 static int64_t LE2HI64(void const *X) { int64_t y; memmove(&y, X, sizeof(y)); return ( int64_t)bswap_64(y); }
83 #endif
84
85 typedef struct BAMIndex BAMIndex;
86 typedef struct BGZFile BGZFile;
87
88 /* MARK: BGZFile *** Start *** */
89
90 #define VALIDATE_BGZF_HEADER 1
91 #if (ZLIB_VERNUM < 0x1230)
92 #undef VALIDATE_BGZF_HEADER
93 #warning "zlib too old, inflateGetHeader not available, not validating BGZF headers"
94 #else
95 #endif
96
97 #define ZLIB_BLOCK_SIZE ( 64 * 1024 )
98 typedef uint8_t zlib_block_t[ZLIB_BLOCK_SIZE];
99
100 #define MEM_ALIGN_SIZE ( 64 * 1024 )
101 /* MEM_CHUNK_SIZE must be an integer multiple of ZLIB_BLOCK_SIZE.
102 * The multiple must be >= 2 shouldn't be < 3.
103 */
104 #define MEM_CHUNK_SIZE ( 256 * ZLIB_BLOCK_SIZE )
105 #define CG_NUM_SEGS 4
106
107 typedef struct BGZFile_vt_s {
108 rc_t (*FileRead)(void *, zlib_block_t, unsigned *);
109 uint64_t (*FileGetPos)(void const *);
110 float (*FileProPos)(void const *);
111 uint64_t (*FileGetSize)(void const *);
112 rc_t (*FileSetPos)(void *, uint64_t);
113 void (*FileWhack)(void *);
114 } BGZFile_vt;
115
116 struct BGZFile {
117 uint64_t fsize;
118 uint64_t fpos; /* position in file of first byte in buffer */
119 const uint8_t *buf; /* page aligned or memmapped */
120 const KFile *kfp;
121 uint8_t *_buf; /* allocated */
122 unsigned malign;
123 size_t bcount; /* number of valid bytes in buffer */
124 uint32_t bpos; /* position in buffer of read head */
125 z_stream zs;
126 };
127
128 static
BGZFileGetMoreBytes(BGZFile * self)129 rc_t BGZFileGetMoreBytes(BGZFile *self)
130 {
131 rc_t rc;
132
133 self->fpos += self->bpos;
134 self->bpos &= (MEM_ALIGN_SIZE - 1);
135 self->fpos -= self->bpos;
136
137 rc = KFileRead(self->kfp, self->fpos, self->_buf + self->malign,
138 MEM_CHUNK_SIZE, &self->bcount);
139 if (rc) {
140 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("Error reading BAM file: %R\n", rc));
141 return rc;
142 }
143 if (self->bcount == 0 || self->bcount == self->bpos)
144 return RC(rcAlign, rcFile, rcReading, rcData, rcInsufficient);
145
146 self->zs.avail_in = (uInt)(self->bcount - self->bpos);
147 self->zs.next_in = (Bytef *)&self->buf[self->bpos];
148 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("Read %u bytes from BAM file at position %lu\n", self->zs.avail_in, self->fpos));
149
150 return 0;
151 }
152
153 static
BGZFileRead(BGZFile * self,zlib_block_t dst,unsigned * pNumRead)154 rc_t BGZFileRead(BGZFile *self, zlib_block_t dst, unsigned *pNumRead)
155 {
156 #if VALIDATE_BGZF_HEADER
157 uint8_t extra[256];
158 gz_header head;
159 #endif
160 rc_t rc = 0;
161 unsigned loops;
162 int zr;
163
164 *pNumRead = 0;
165 if (self->bcount == 0 || self->zs.avail_in == 0) {
166 rc = BGZFileGetMoreBytes(self);
167 if (rc)
168 return rc;
169 }
170
171 #if VALIDATE_BGZF_HEADER
172 memset(&head, 0, sizeof(head));
173 head.extra = extra;
174 head.extra_max = sizeof(extra);
175
176 zr = inflateGetHeader(&self->zs, &head);
177 assert(zr == Z_OK);
178 #endif
179
180 self->zs.next_out = (Bytef *)dst;
181 self->zs.avail_out = sizeof(zlib_block_t);
182
183 for (loops = 0; loops != 2; ++loops) {
184 {
185 uLong const initial = self->zs.total_in;
186
187 zr = inflate(&self->zs, Z_FINISH);
188 {
189 uLong const final = self->zs.total_in;
190 uLong const len = final - initial;
191
192 self->bpos += len;
193 }
194 }
195 assert(self->zs.avail_in == self->bcount - self->bpos);
196
197 switch (zr) {
198 case Z_OK:
199 case Z_BUF_ERROR:
200 rc = BGZFileGetMoreBytes(self);
201 if ( rc != 0 )
202 {
203 if ( GetRCObject( rc ) == (enum RCObject)rcData && GetRCState( rc ) == rcInsufficient )
204 {
205 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("EOF in Zlib block after %lu bytes\n", self->fpos + self->bpos));
206 rc = RC( rcAlign, rcFile, rcReading, rcFile, rcTooShort );
207 }
208 return rc;
209 }
210 break;
211 case Z_STREAM_END:
212 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("Zlib block size (before/after): %u/%u\n", self->zs.total_in, self->zs.total_out));
213 #if VALIDATE_BGZF_HEADER
214 if (head.done) {
215 unsigned const extra_len = head.extra_len;
216 unsigned i;
217 unsigned bsize = 0;
218
219 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("GZIP Header extra length: %u\n", extra_len));
220 for (i = 0; i < extra_len; ) {
221 uint8_t const si1 = extra[i + 0];
222 uint8_t const si2 = extra[i + 1];
223 unsigned const slen = LE2HUI16(&extra[i + 2]);
224
225 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("GZIP Header extra: %c%c(%u)\n", si1, si2, slen));
226 if (si1 == 'B' && si2 == 'C') {
227 bsize = 1 + LE2HUI16(&extra[i + 4]);
228 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("BGZF Header extra field BC: bsize %u\n", bsize));
229 break;
230 }
231 i += slen + 4;
232 }
233 if (bsize == 0 || bsize != self->zs.total_in) {
234 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("BGZF Header extra field BC not found\n"));
235 rc = RC(rcAlign, rcFile, rcReading, rcFormat, rcInvalid); /* not BGZF */
236 }
237 }
238 else {
239 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("GZIP Header not found\n"));
240 rc = RC(rcAlign, rcFile, rcReading, rcFile, rcCorrupt);
241 }
242 #endif
243 *pNumRead = (unsigned)self->zs.total_out; /* <= 64k */
244 zr = inflateReset(&self->zs);
245 assert(zr == Z_OK);
246 return rc;
247 default:
248 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("Unexpected Zlib result %i\n", zr));
249 return RC(rcAlign, rcFile, rcReading, rcFile, rcCorrupt);
250 }
251 }
252 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("Failed reading BAM file after %lu bytes\n", self->fpos + self->bpos));
253 return RC(rcAlign, rcFile, rcReading, rcFile, rcTooShort);
254 }
255
BGZFileGetPos(const BGZFile * self)256 static uint64_t BGZFileGetPos(const BGZFile *self)
257 {
258 return self->fpos + self->bpos;
259 }
260
261 /* returns the position as proportion of the whole file */
BGZFileProPos(BGZFile const * const self)262 static float BGZFileProPos(BGZFile const *const self)
263 {
264 return BGZFileGetPos(self) / (double)self->fsize;
265 }
266
BGZFileSetPos(BGZFile * const self,uint64_t const pos)267 static rc_t BGZFileSetPos(BGZFile *const self, uint64_t const pos)
268 {
269 if (self->fpos > pos || pos >= self->fpos + self->bcount) {
270 /* desired position is outside of current buffer */
271 self->fpos = pos ^ (pos & ((uint64_t)(MEM_ALIGN_SIZE - 1)));
272 self->bpos = (unsigned)(pos - self->fpos);
273 self->bcount = 0; /* force re-read */
274 }
275 else {
276 /* desired position is inside of current buffer */
277 unsigned const bpos = (unsigned)(pos - self->fpos); /* < 64k */
278
279 self->bpos = bpos; /* pos - self->fpos; */
280 self->zs.avail_in = (uInt)(self->bcount - bpos);
281 self->zs.next_in = (Bytef *)&self->buf[bpos];
282 }
283 return 0;
284 }
285
286 typedef rc_t (*BGZFileWalkBlocks_cb)(void *ctx, const BGZFile *file,
287 rc_t rc, uint64_t fpos,
288 const zlib_block_t data, unsigned dsize);
289
290 /* Without Decompression */
BGZFileWalkBlocksND(BGZFile * const self,BGZFileWalkBlocks_cb const cb,void * const ctx)291 static rc_t BGZFileWalkBlocksND(BGZFile *const self, BGZFileWalkBlocks_cb const cb, void *const ctx)
292 {
293 rc_t rc = 0;
294 #if VALIDATE_BGZF_HEADER
295 uint8_t extra[256];
296 char dummy[64];
297 gz_header head;
298 int zr;
299
300 memset(&head, 0, sizeof(head));
301 head.extra = extra;
302 head.extra_max = sizeof(extra);
303
304 do {
305 unsigned loops;
306 unsigned hsize = 0;
307 unsigned bsize = 0;
308 unsigned bsize2;
309 uint64_t const fpos = self->fpos + self->bpos;
310
311 self->zs.next_out = (Bytef *)dummy;
312 self->zs.avail_out = sizeof(dummy);
313
314 zr = inflateGetHeader(&self->zs, &head);
315 assert(zr == Z_OK);
316
317 for (loops = 0; loops != 2; ++loops) {
318 {
319 uLong const orig = self->zs.total_in;
320
321 zr = inflate(&self->zs, Z_BLOCK); /* Z_BLOCK stops at end of header */
322 {
323 uLong const final = self->zs.total_in;
324 uLong const bytes = final - orig;
325
326 self->bpos += bytes;
327 hsize += bytes;
328 }
329 }
330 if (head.done) {
331 unsigned i;
332
333 for (i = 0; i < head.extra_len; ) {
334 if (extra[i] == 'B' && extra[i + 1] == 'C') {
335 bsize = 1 + LE2HUI16(&extra[i + 4]);
336 break;
337 }
338 i += LE2HUI16(&extra[i + 2]);
339 }
340 break;
341 }
342 else if (self->zs.avail_in == 0) {
343 rc = BGZFileGetMoreBytes(self);
344 if (rc) {
345 rc = RC(rcAlign, rcFile, rcReading, rcFile, rcTooShort);
346 goto DONE;
347 }
348 }
349 else {
350 rc = RC(rcAlign, rcFile, rcReading, rcFile, rcCorrupt);
351 goto DONE;
352 }
353 }
354 if (bsize == 0) {
355 rc = RC(rcAlign, rcFile, rcReading, rcFormat, rcInvalid); /* not BGZF */
356 break;
357 }
358 bsize2 = bsize;
359 bsize -= hsize;
360 for ( ; ; ) {
361 unsigned const max = (unsigned)(self->bcount - self->bpos); /* <= 64k */
362 unsigned const len = bsize > max ? max : bsize;
363
364 self->bpos += len;
365 bsize -= len;
366 if (self->bpos == self->bcount) {
367 rc = BGZFileGetMoreBytes(self);
368 if (rc) {
369 if (bsize)
370 rc = RC(rcAlign, rcFile, rcReading, rcFile, rcTooShort);
371 goto DONE;
372 }
373 }
374 else {
375 zr = inflateReset(&self->zs);
376 assert(zr == Z_OK);
377 self->zs.avail_in = (uInt)(self->bcount - self->bpos);
378 self->zs.next_in = (Bytef *)&self->buf[self->bpos];
379 rc = cb(ctx, self, 0, fpos, NULL, bsize2);
380 break;
381 }
382 }
383 } while (rc == 0);
384 DONE:
385 if ( GetRCState( rc ) == rcInsufficient && GetRCObject( rc ) == (enum RCObject)rcData )
386 rc = 0;
387 rc = cb( ctx, self, rc, self->fpos + self->bpos, NULL, 0 );
388 #endif
389 return rc;
390 }
391
BGZFileWalkBlocksUnzip(BGZFile * const self,zlib_block_t * const bufp,BGZFileWalkBlocks_cb const cb,void * const ctx)392 static rc_t BGZFileWalkBlocksUnzip(BGZFile *const self, zlib_block_t *const bufp, BGZFileWalkBlocks_cb const cb, void *const ctx)
393 {
394 rc_t rc;
395 rc_t rc2;
396
397 do {
398 uint64_t const fpos = self->fpos + self->bpos;
399 unsigned dsize;
400
401 rc2 = BGZFileRead(self, *bufp, &dsize);
402 rc = cb(ctx, self, rc2, fpos, *bufp, dsize);
403 } while (rc == 0 && rc2 == 0);
404 if ( GetRCState( rc2 ) == rcInsufficient && GetRCObject( rc2 ) == (enum RCObject)rcData )
405 rc2 = 0;
406 rc = cb( ctx, self, rc2, self->fpos + self->bpos, NULL, 0 );
407 return rc ? rc : rc2;
408 }
409
BGZFileWalkBlocks(BGZFile * self,bool decompress,zlib_block_t * bufp,BGZFileWalkBlocks_cb cb,void * ctx)410 static rc_t BGZFileWalkBlocks(BGZFile *self, bool decompress, zlib_block_t *bufp,
411 BGZFileWalkBlocks_cb cb, void *ctx)
412 {
413 rc_t rc;
414
415 #if VALIDATE_BGZF_HEADER
416 #else
417 decompress = true;
418 #endif
419 self->fpos = 0;
420 self->bpos = 0;
421
422 rc = BGZFileGetMoreBytes(self);
423 if (rc)
424 return rc;
425
426 if (decompress)
427 return BGZFileWalkBlocksUnzip(self, bufp, cb, ctx);
428 else
429 return BGZFileWalkBlocksND(self, cb, ctx);
430 }
431
BGZFileGetSize(BGZFile const * const self)432 static uint64_t BGZFileGetSize(BGZFile const *const self)
433 {
434 return self->fsize;
435 }
436
BGZFileWhack(BGZFile * self)437 static void BGZFileWhack(BGZFile *self)
438 {
439 inflateEnd(&self->zs);
440 KFileRelease(self->kfp);
441 if (self->_buf)
442 free(self->_buf);
443 }
444
BGZFileInit(BGZFile * self,const KFile * kfp,BGZFile_vt * vt)445 static rc_t BGZFileInit(BGZFile *self, const KFile *kfp, BGZFile_vt *vt)
446 {
447 int i;
448 rc_t rc;
449 static BGZFile_vt const my_vt = {
450 (rc_t (*)(void *, zlib_block_t, unsigned *))BGZFileRead,
451 (uint64_t (*)(void const *))BGZFileGetPos,
452 (float (*)(void const *))BGZFileProPos,
453 (uint64_t (*)(void const *))BGZFileGetSize,
454 (rc_t (*)(void *, uint64_t))BGZFileSetPos,
455 (void (*)(void *))BGZFileWhack
456 };
457
458 memset(self, 0, sizeof(*self));
459 memset(vt, 0, sizeof(*vt));
460
461 i = inflateInit2(&self->zs, MAX_WBITS + 16); /* max + enable gzip headers */
462 switch (i) {
463 case Z_OK:
464 break;
465 case Z_MEM_ERROR:
466 return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
467 default:
468 return RC(rcAlign, rcFile, rcConstructing, rcNoObj, rcUnexpected);
469 }
470
471 rc = KFileSize(kfp, &self->fsize);
472 if (rc)
473 return rc;
474
475 self->_buf = malloc(MEM_CHUNK_SIZE + MEM_ALIGN_SIZE);
476 if (self->_buf == NULL)
477 return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
478 self->malign = (MEM_ALIGN_SIZE - ((intptr_t)self->_buf & (MEM_ALIGN_SIZE - 1))) & (MEM_ALIGN_SIZE - 1);
479 self->buf = self->_buf + self->malign;
480
481 self->kfp = kfp;
482 KFileAddRef(kfp);
483
484 *vt = my_vt;
485
486 return 0;
487 }
488
489 #ifndef WINDOWS
490
491 /* MARK: BGZThreadFile *** Start *** */
492
493 #include <kproc/thread.h>
494 #include <kproc/lock.h>
495 #include <kproc/cond.h>
496
497 typedef struct BGZThreadFile_s BGZThreadFile;
498
499 #define BUFFER_COUNT (3)
500
501 typedef struct BGZThreadFileWorkQElem_s BGZThreadFileWorkQElem;
502
503 struct BGZThreadFileWorkQElem_s {
504 uint8_t *buf;
505 uint64_t pos;
506 unsigned bsz;
507 };
508
509 struct BGZThreadFile_s {
510 BGZFile file;
511 KLock *lock;
512 KCondition *have_data;
513 KCondition *need_data;
514 KThread *th;
515 uint64_t pos;
516 BGZThreadFileWorkQElem que[BUFFER_COUNT];
517 rc_t volatile rc;
518 unsigned volatile nque;
519 bool eof;
520 uint8_t buffer[sizeof(zlib_block_t) * BUFFER_COUNT];
521 };
522
BGZThreadFileRead(BGZThreadFile * self,zlib_block_t dst,unsigned * pNumRead)523 static rc_t BGZThreadFileRead(BGZThreadFile *self, zlib_block_t dst, unsigned *pNumRead)
524 {
525 rc_t rc;
526
527 *pNumRead = 0;
528
529 KLockAcquire(self->lock);
530 if ((rc = self->rc) == 0) {
531 while (self->nque == 0 && (rc = self->rc) == 0)
532 KConditionWait(self->have_data, self->lock);
533 if (rc == 0) {
534 BGZThreadFileWorkQElem const work = self->que[0];
535
536 self->pos = work.pos;
537 if (work.buf) {
538 memmove(dst, work.buf, *pNumRead = work.bsz);
539 memmove(&self->que[0], &self->que[1], --self->nque * sizeof(self->que[0]));
540 KConditionSignal(self->need_data);
541 }
542 else {
543 self->eof = true;
544 self->rc = rc = RC(rcAlign, rcFile, rcReading, rcData, rcInsufficient);
545 }
546 }
547 }
548 KLockUnlock(self->lock);
549 return rc;
550 }
551
BGZThreadFileMain(KThread const * const th,void * const vp)552 static rc_t CC BGZThreadFileMain(KThread const *const th, void *const vp)
553 {
554 BGZThreadFile *const self = (BGZThreadFile *)vp;
555 rc_t rc = 0;
556 unsigned bufno;
557
558 KLockAcquire(self->lock);
559 for (bufno = 0; ; bufno = (bufno + 1) % BUFFER_COUNT) {
560 while (self->nque == BUFFER_COUNT)
561 KConditionWait(self->need_data, self->lock);
562 {
563 BGZThreadFileWorkQElem work;
564
565 work.pos = BGZFileGetPos(&self->file);
566 work.buf = &self->buffer[bufno * sizeof(zlib_block_t)];
567 rc = BGZFileRead(&self->file, work.buf, &work.bsz);
568 if ( GetRCObject( rc ) == (enum RCObject)rcData && GetRCState( rc ) == rcInsufficient )
569 work.buf = NULL;
570 else if ( rc != 0 )
571 break;
572 self->que[self->nque++] = work;
573 KConditionSignal(self->have_data);
574 }
575 }
576 self->rc = rc;
577 KLockUnlock(self->lock);
578 return 0;
579 }
580
BGZThreadFileGetPos(BGZThreadFile const * const self)581 static uint64_t BGZThreadFileGetPos(BGZThreadFile const *const self)
582 {
583 return self->pos;
584 }
585
586 /* returns the position as proportion of the whole file */
BGZThreadFileProPos(BGZThreadFile const * const self)587 static float BGZThreadFileProPos(BGZThreadFile const *const self)
588 {
589 return BGZThreadFileGetPos(self) / (double)self->file.fsize;
590 }
591
BGZThreadFileGetSize(BGZThreadFile const * const self)592 static uint64_t BGZThreadFileGetSize(BGZThreadFile const *const self)
593 {
594 return BGZFileGetSize(&self->file);
595 }
596
BGZThreadFileSetPos(BGZThreadFile * const self)597 static rc_t BGZThreadFileSetPos(BGZThreadFile *const self)
598 {
599 return RC(rcAlign, rcFile, rcPositioning, rcFunction, rcUnsupported);
600 }
601
BGZThreadFileWhack(BGZThreadFile * const self)602 static void BGZThreadFileWhack(BGZThreadFile *const self)
603 {
604 KThreadCancel(self->th);
605 KThreadWait(self->th, NULL);
606 BGZFileWhack(&self->file);
607 KConditionRelease(self->need_data);
608 KConditionRelease(self->have_data);
609 KLockRelease(self->lock);
610 KThreadRelease(self->th);
611 }
612
BGZThreadFileInit(BGZThreadFile * self,const KFile * kfp,BGZFile_vt * vt)613 static rc_t BGZThreadFileInit(BGZThreadFile *self, const KFile *kfp, BGZFile_vt *vt)
614 {
615 rc_t rc;
616 static BGZFile_vt const my_vt = {
617 (rc_t (*)(void *, zlib_block_t, unsigned *))BGZThreadFileRead,
618 (uint64_t (*)(void const *))BGZThreadFileGetPos,
619 (float (*)(void const *))BGZThreadFileProPos,
620 (uint64_t (*)(void const *))BGZThreadFileGetSize,
621 (rc_t (*)(void *, uint64_t))BGZThreadFileSetPos,
622 (void (*)(void *))BGZThreadFileWhack
623 };
624
625 memset(self, 0, sizeof(*self));
626
627 rc = BGZFileInit(&self->file, kfp, vt);
628 if (rc == 0) {
629 rc = KLockMake(&self->lock);
630 if (rc == 0) {
631 rc = KConditionMake(&self->have_data);
632 if (rc == 0) {
633 rc = KConditionMake(&self->need_data);
634 if (rc == 0) {
635 rc = KThreadMake(&self->th, BGZThreadFileMain, self);
636 if (rc == 0) {
637 *vt = my_vt;
638 return 0;
639 }
640 KConditionRelease(self->need_data);
641 }
642 KConditionRelease(self->have_data);
643 }
644 KLockRelease(self->lock);
645 }
646 BGZFileWhack(&self->file);
647 }
648 memset(self, 0, sizeof(*self));
649 memset(vt, 0, sizeof(*vt));
650 return rc;
651 }
652
653 #endif
654
655 /* MARK: BAMFile structures */
656 #define MAX_BIN 37449
657
658 typedef struct BAMFileRange {
659 BAMFilePosition start;
660 BAMFilePosition end;
661 } BAMFileRange;
662
663 typedef struct BAMIndexReference {
664 int intervals;
665 int binSize[MAX_BIN];
666 int binStart[MAX_BIN];
667 BAMFilePosition start;
668 BAMFilePosition end;
669 BAMFileRange *bin;
670 BAMFilePosition *interval;
671 } BAMIndexReference;
672
673 struct BAMIndex {
674 int numRefs;
675 BAMFileRange *rng;
676 BAMFilePosition *pos;
677 void *data; // allocated
678 BAMIndexReference ref[1];
679 };
680
681 struct BAMFile {
682 uint64_t fpos_first;
683 uint64_t fpos_cur;
684
685 union {
686 BGZFile plain;
687 #ifndef WINDOWS
688 BGZThreadFile thread;
689 #endif
690 } file;
691 BGZFile_vt vt;
692
693 BAMRefSeq *refSeq; /* pointers into headerData1 except name points into headerData2 */
694 BAMReadGroup *readGroup; /* pointers into headerData1 */
695 char const *version;
696 char const *header;
697 char *headerData1; /* gets used for refSeq and readGroup */
698 uint8_t *headerData2; /* gets used for refSeq */
699 BAMAlignment *bufLocker;
700 BAMAlignment *nocopy; /* used to hold current record for BAMFileRead2 */
701 BAMIndex const *ndx;
702
703 size_t nocopy_size;
704
705 unsigned refSeqs;
706 unsigned readGroups;
707
708 KRefcount refcount;
709 unsigned ucfirst; /* offset of first record in uncompressed buffer */
710 unsigned bufSize; /* current size of uncompressed buffer */
711 unsigned bufCurrent; /* location in uncompressed buffer of read head */
712 bool eof;
713 bool threaded;
714 zlib_block_t buffer; /* uncompressed buffer */
715 };
716
717 /* MARK: Alignment structures */
718
719 struct bam_alignment_s {
720 uint8_t rID[4];
721 uint8_t pos[4];
722 uint8_t read_name_len;
723 uint8_t mapQual;
724 uint8_t bin[2];
725 uint8_t n_cigars[2];
726 uint8_t flags[2];
727 uint8_t read_len[4];
728 uint8_t mate_rID[4];
729 uint8_t mate_pos[4];
730 uint8_t ins_size[4];
731 char read_name[1 /* read_name_len */];
732 /* if you change length of read_name,
733 * adjust calculation of offsets in BAMAlignmentSetOffsets */
734 /* uint32_t cigar[n_cigars];
735 * uint8_t seq[(read_len + 1) / 2];
736 * uint8_t qual[read_len];
737 * uint8_t extra[...];
738 */
739 };
740
741 typedef union bam_alignment_u {
742 struct bam_alignment_s cooked;
743 uint8_t raw[sizeof(struct bam_alignment_s)];
744 } bam_alignment;
745
746 struct offset_size_s {
747 unsigned offset;
748 unsigned size; /* this is the total length of the tag; length of data is size - 3 */
749 };
750
751 struct BAMAlignment {
752 KRefcount refcount;
753
754 BAMFilePosition pos;
755 BAMFile *parent;
756 bam_alignment const *data;
757 uint8_t *storage;
758 unsigned datasize;
759
760 unsigned cigar;
761 unsigned seq;
762 unsigned qual;
763 unsigned numExtra;
764 unsigned hasColor;
765 struct offset_size_s extra[1];
766 };
767
768 static const char cigarChars[] = {
769 ct_Match,
770 ct_Insert,
771 ct_Delete,
772 ct_Skip,
773 ct_SoftClip,
774 ct_HardClip,
775 ct_Padded,
776 ct_Equal,
777 ct_NotEqual
778 /* ct_Overlap must not appear in actual BAM file */
779 };
780
781 /* MARK: Alignment accessors */
782
getRefSeqId(const BAMAlignment * cself)783 static int32_t getRefSeqId(const BAMAlignment *cself) {
784 return LE2HI32(cself->data->cooked.rID);
785 }
786
getPosition(const BAMAlignment * cself)787 static int32_t getPosition(const BAMAlignment *cself) {
788 return LE2HI32(cself->data->cooked.pos);
789 }
790
getReadNameLength(const BAMAlignment * cself)791 static uint8_t getReadNameLength(const BAMAlignment *cself) {
792 return cself->data->cooked.read_name_len;
793 }
794
getBin(const BAMAlignment * cself)795 static uint16_t getBin(const BAMAlignment *cself) {
796 return LE2HUI16(cself->data->cooked.bin);
797 }
798
getMapQual(const BAMAlignment * cself)799 static uint8_t getMapQual(const BAMAlignment *cself) {
800 return cself->data->cooked.mapQual;
801 }
802
getCigarCount(const BAMAlignment * cself)803 static uint16_t getCigarCount(const BAMAlignment *cself) {
804 return LE2HUI16(cself->data->cooked.n_cigars);
805 }
806
getFlags(const BAMAlignment * cself)807 static uint16_t getFlags(const BAMAlignment *cself) {
808 return LE2HUI16(cself->data->cooked.flags);
809 }
810
getReadLen(const BAMAlignment * cself)811 static uint32_t getReadLen(const BAMAlignment *cself) {
812 return LE2HUI32(cself->data->cooked.read_len);
813 }
814
getMateRefSeqId(const BAMAlignment * cself)815 static int32_t getMateRefSeqId(const BAMAlignment *cself) {
816 return LE2HI32(cself->data->cooked.mate_rID);
817 }
818
getMatePos(const BAMAlignment * cself)819 static int32_t getMatePos(const BAMAlignment *cself) {
820 return LE2HI32(cself->data->cooked.mate_pos);
821 }
822
getInsertSize(const BAMAlignment * cself)823 static int32_t getInsertSize(const BAMAlignment *cself) {
824 return LE2HI32(cself->data->cooked.ins_size);
825 }
826
getReadName(const BAMAlignment * cself)827 static char const *getReadName(const BAMAlignment *cself) {
828 return &cself->data->cooked.read_name[0];
829 }
830
getCigarBase(BAMAlignment const * cself)831 static void const *getCigarBase(BAMAlignment const *cself)
832 {
833 return &cself->data->raw[cself->cigar];
834 }
835
opt_tag_cmp(char const a[2],char const b[2])836 static int opt_tag_cmp(char const a[2], char const b[2])
837 {
838 int const d0 = (int)a[0] - (int)b[0];
839 return d0 ? d0 : ((int)a[1] - (int)b[1]);
840 }
841
OptTag_sort(void const * A,void const * B,void * ctx)842 static int64_t CC OptTag_sort(void const *A, void const *B, void *ctx)
843 {
844 BAMAlignment const *const self = ctx;
845 unsigned const a_off = ((struct offset_size_s const *)A)->offset;
846 unsigned const b_off = ((struct offset_size_s const *)B)->offset;
847 char const *const a = (char const *)&self->data->raw[a_off];
848 char const *const b = (char const *)&self->data->raw[b_off];
849 int const diff = opt_tag_cmp(a, b);
850
851 return diff ? (int64_t)diff : (int64_t)a - (int64_t)b;
852 }
853
tag_findfirst(BAMAlignment const * const self,char const tag[2])854 static unsigned tag_findfirst(BAMAlignment const *const self, char const tag[2])
855 {
856 unsigned f = 0;
857 unsigned e = self->numExtra;
858
859 while (f < e) {
860 unsigned const m = (f + e) >> 1;
861 char const *const mtag = (char const *)&self->data->raw[self->extra[m].offset];
862 int const d = opt_tag_cmp(tag, mtag);
863
864 if (d > 0)
865 f = m + 1;
866 else
867 e = m;
868 }
869 return f;
870 }
871
tag_runlength(BAMAlignment const * const self,char const tag[2],unsigned const at)872 static unsigned tag_runlength(BAMAlignment const *const self,
873 char const tag[2],
874 unsigned const at)
875 {
876 unsigned n;
877
878 for (n = 0; n + at < self->numExtra; ++n) {
879 if (opt_tag_cmp(tag, (char const *)&self->data->raw[self->extra[n + at].offset]) != 0)
880 break;
881 }
882 return n;
883 }
884
tag_search(BAMAlignment const * const self,char const tag[2],int const which)885 static struct offset_size_s const *tag_search(BAMAlignment const *const self,
886 char const tag[2],
887 int const which)
888 {
889 unsigned const fnd = tag_findfirst(self, tag);
890 unsigned const run = tag_runlength(self, tag, fnd);
891 unsigned const want = which < 0 ? (run + which) : which;
892
893 return run == 0 ? NULL : &self->extra[fnd + (want % run)];
894 }
895
get_RG(BAMAlignment const * cself)896 static char const *get_RG(BAMAlignment const *cself)
897 {
898 struct offset_size_s const *const x = tag_search(cself, "RG", 0);
899 return (char const *)(x && cself->data->raw[x->offset + 2] == 'Z' ? &cself->data->raw[x->offset + 3] : NULL);
900 }
901
get_CS_info(BAMAlignment const * cself)902 static struct offset_size_s const *get_CS_info(BAMAlignment const *cself)
903 {
904 return tag_search(cself, "CS", 0);
905 }
906
get_CQ_info(BAMAlignment const * cself)907 static struct offset_size_s const *get_CQ_info(BAMAlignment const *cself)
908 {
909 return tag_search(cself, "CQ", 0);
910 }
911
get_CS(BAMAlignment const * cself)912 static char const *get_CS(BAMAlignment const *cself)
913 {
914 struct offset_size_s const *const x = get_CS_info(cself);
915 return (char const *)(x && cself->data->raw[x->offset + 2] == 'Z' ? &cself->data->raw[x->offset + 3] : NULL);
916 }
917
get_CQ(BAMAlignment const * cself)918 static uint8_t const *get_CQ(BAMAlignment const *cself)
919 {
920 struct offset_size_s const *const x = get_CQ_info(cself);
921 return (uint8_t const *)(x && cself->data->raw[x->offset + 2] == 'Z' ? &cself->data->raw[x->offset + 3] : NULL);
922 }
923
get_OQ_info(BAMAlignment const * cself)924 static struct offset_size_s const *get_OQ_info(BAMAlignment const *cself)
925 {
926 return tag_search(cself, "OQ", 0);
927 }
928
get_OQ(BAMAlignment const * cself)929 static uint8_t const *get_OQ(BAMAlignment const *cself)
930 {
931 struct offset_size_s const *const x = get_OQ_info(cself);
932 return (uint8_t const *)(x && cself->data->raw[x->offset + 2] == 'Z' ? &cself->data->raw[x->offset + 3] : NULL);
933 }
934
get_XT(BAMAlignment const * cself)935 static char const *get_XT(BAMAlignment const *cself)
936 {
937 struct offset_size_s const *const x = tag_search(cself, "XT", 0);
938 return (char const *)(x && cself->data->raw[x->offset + 2] == 'Z' ? &cself->data->raw[x->offset + 3] : NULL);
939 }
940
get_XS(BAMAlignment const * cself)941 static uint8_t const *get_XS(BAMAlignment const *cself)
942 {
943 struct offset_size_s const *const x = tag_search(cself, "XS", -1); /* want last one */
944 return (uint8_t const *)(x && cself->data->raw[x->offset + 2] == 'A' ? &cself->data->raw[x->offset + 3] : NULL);
945 }
946
get_CG_ZA_info(BAMAlignment const * cself)947 static struct offset_size_s const *get_CG_ZA_info(BAMAlignment const *cself)
948 {
949 struct offset_size_s const *const x = tag_search(cself, "ZA", 0);
950 return x;
951 }
952
get_CG_ZI_info(BAMAlignment const * cself)953 static struct offset_size_s const *get_CG_ZI_info(BAMAlignment const *cself)
954 {
955 struct offset_size_s const *const x = tag_search(cself, "ZI", 0);
956 return x;
957 }
958
get_CG_GC_info(BAMAlignment const * cself)959 static struct offset_size_s const *get_CG_GC_info(BAMAlignment const *cself)
960 {
961 struct offset_size_s const *const x = tag_search(cself, "GC", 0);
962 return x;
963 }
964
get_CG_GS_info(BAMAlignment const * cself)965 static struct offset_size_s const *get_CG_GS_info(BAMAlignment const *cself)
966 {
967 struct offset_size_s const *const x = tag_search(cself, "GS", 0);
968 return x;
969 }
970
get_CG_GQ_info(BAMAlignment const * cself)971 static struct offset_size_s const *get_CG_GQ_info(BAMAlignment const *cself)
972 {
973 struct offset_size_s const *const x = tag_search(cself, "GQ", 0);
974 return x;
975 }
976
977 /* MARK: BAMFile Reading functions */
978
979 /* returns (rcData, rcInsufficient) if eof */
BAMFileFillBuffer(BAMFile * self)980 static rc_t BAMFileFillBuffer(BAMFile *self)
981 {
982 rc_t const rc = self->vt.FileRead(&self->file, self->buffer, &self->bufSize);
983 if (rc)
984 return rc;
985 if (self->bufSize == 0 || self->bufSize <= self->bufCurrent)
986 return SILENT_RC(rcAlign, rcFile, rcReading, rcData, rcInsufficient);
987 return 0;
988 }
989
BAMFileReadn(BAMFile * self,const unsigned len,uint8_t dst[])990 static rc_t BAMFileReadn(BAMFile *self, const unsigned len, uint8_t dst[/* len */]) {
991 rc_t rc;
992 unsigned cur;
993 unsigned n = 0;
994
995 if (len == 0)
996 return 0;
997
998 for (cur = 0; ; cur += n) {
999 if (self->bufSize > self->bufCurrent) {
1000 n = self->bufSize - self->bufCurrent;
1001 if (cur + n > len)
1002 n = len - cur;
1003 memmove(&dst[cur], &self->buffer[self->bufCurrent], n);
1004 self->bufCurrent += n;
1005 }
1006 if (self->bufCurrent != self->bufSize && self->bufSize != 0)
1007 return 0;
1008 if (self->bufSize != 0) {
1009 /* a seek has not just been done so update the file position.
1010 * if we didn't and a request for the position is made before the
1011 * next read, we will not have the position of the next read.
1012 *
1013 * if a seek had just been done then
1014 * self->fpos_cur == BGZFileGetPos(&self->file)
1015 * is already true.
1016 */
1017 self->fpos_cur = self->vt.FileGetPos(&self->file);
1018 self->bufCurrent = 0;
1019 self->bufSize = 0;
1020 if (cur + n == len)
1021 return 0;
1022 }
1023
1024 rc = BAMFileFillBuffer(self);
1025 if (rc)
1026 return rc;
1027 }
1028 }
1029
BAMFilePeek(BAMFile const * const self,unsigned const offset)1030 static void const *BAMFilePeek(BAMFile const *const self, unsigned const offset)
1031 {
1032 return &self->buffer[self->bufCurrent + offset];
1033 }
1034
BAMFileMaxPeek(BAMFile const * self)1035 static unsigned BAMFileMaxPeek(BAMFile const *self)
1036 {
1037 return self->bufSize > self->bufCurrent ? self->bufSize - self->bufCurrent : 0;
1038 }
1039
BAMFilePeekI32(BAMFile * self)1040 static int32_t BAMFilePeekI32(BAMFile *self)
1041 {
1042 return LE2HI32(BAMFilePeek(self, 0));
1043 }
1044
BAMFileReadI32(BAMFile * self,int32_t * rhs)1045 static rc_t BAMFileReadI32(BAMFile *self, int32_t *rhs)
1046 {
1047 uint8_t buf[sizeof(int32_t)];
1048 rc_t rc = BAMFileReadn(self, sizeof(int32_t), buf);
1049
1050 if (rc == 0)
1051 *rhs = LE2HI32(buf);
1052 return rc;
1053 }
1054
1055 /* MARK: BAM File header parsing functions */
1056
comp_ReadGroup(const void * a,const void * b,void * ignored)1057 static int64_t CC comp_ReadGroup(const void *a, const void *b, void *ignored) {
1058 return strcmp(((BAMReadGroup const *)a)->name, ((BAMReadGroup const *)b)->name);
1059 }
1060
ParseHD(BAMFile * self,char hdata[],size_t hlen,unsigned * used)1061 static rc_t ParseHD(BAMFile *self, char hdata[], size_t hlen, unsigned *used)
1062 {
1063 unsigned i;
1064 unsigned tag;
1065 unsigned value;
1066 int st = 0;
1067 int ws = 1;
1068
1069 for (i = 0; i < hlen; ++i) {
1070 char const cc = hdata[i];
1071
1072 if (ws && isspace(cc))
1073 continue;
1074 ws = 0;
1075
1076 switch (st) {
1077 case 0:
1078 tag = i;
1079 ++st;
1080 break;
1081 case 1:
1082 if (isspace(cc))
1083 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1084 ++st;
1085 break;
1086 case 2:
1087 if (cc != ':')
1088 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1089 hdata[i] = '\0';
1090 value = i + 1;
1091 ++st;
1092 break;
1093 case 3:
1094 if (cc == '\t' || cc == '\r' || cc == '\n') {
1095 hdata[i] = '\0';
1096
1097 if (strcmp(&hdata[tag], "VN") == 0)
1098 self->version = &hdata[value];
1099
1100 ++st;
1101 ws = 1;
1102 }
1103 break;
1104 case 4:
1105 if (cc == '@')
1106 goto DONE;
1107 tag = i;
1108 st = 1;
1109 break;
1110 }
1111 }
1112 if (st == 4) {
1113 DONE:
1114 *used = i;
1115 return 0;
1116 }
1117 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1118 }
1119
ParseSQ(BAMFile * self,char hdata[],size_t hlen,unsigned * used,unsigned const rs_by_name[])1120 static rc_t ParseSQ(BAMFile *self, char hdata[], size_t hlen, unsigned *used, unsigned const rs_by_name[])
1121 {
1122 unsigned i;
1123 unsigned tag;
1124 unsigned value;
1125 int st = 0;
1126 int ws = 1;
1127 BAMRefSeq rs;
1128
1129 memset(&rs, 0, sizeof(rs));
1130
1131 for (i = 0; i < hlen; ++i) {
1132 char const cc = hdata[i];
1133
1134 if (ws && isspace(cc))
1135 continue;
1136 ws = 0;
1137
1138 switch (st) {
1139 case 0:
1140 tag = i;
1141 ++st;
1142 break;
1143 case 1:
1144 if (isspace(cc))
1145 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1146 ++st;
1147 break;
1148 case 2:
1149 #define HACKAMATIC 1
1150 #if HACKAMATIC
1151 if (cc != ':') {
1152 if (i + 1 >= hlen || hdata[i+1] != ':')
1153 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1154 else
1155 ++i;
1156 }
1157 #else
1158 if (cc != ':')
1159 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1160 #endif
1161 hdata[i] = '\0';
1162 value = i + 1;
1163 ++st;
1164 break;
1165 case 3:
1166 if (cc == '\t' || cc == '\r' || cc == '\n') {
1167 unsigned j;
1168
1169 hdata[i] = '\0';
1170
1171 while (value < i && isspace(hdata[value]))
1172 ++value;
1173 for (j = i; value < j && isspace(hdata[j - 1]); )
1174 hdata[--j] = '\0';
1175
1176 if (strcmp(&hdata[tag], "SN") == 0)
1177 rs.name = &hdata[value];
1178 else if (strcmp(&hdata[tag], "LN") == 0)
1179 rs.length = strtou64(&hdata[value], NULL, 10);
1180 else if (strcmp(&hdata[tag], "AS") == 0)
1181 rs.assemblyId = &hdata[value];
1182 #if HACKAMATIC
1183 else if (strcmp(&hdata[tag], "M5") == 0 || strcmp(&hdata[tag], "MD5") == 0)
1184 #else
1185 else if (strcmp(&hdata[tag], "M5") == 0)
1186 #endif
1187 #undef HACKAMATIC
1188 {
1189 unsigned len = j - value;
1190
1191 if ((hdata[value] == '\'' || hdata[value] == '"') && hdata[value + len - 1] == hdata[value]) {
1192 ++value;
1193 len -= 2;
1194 }
1195 if (len == 32) {
1196 rs.checksum = &rs.checksum_array[0];
1197 for (j = 0; j != 16; ++j) {
1198 int const ch1 = toupper(hdata[value + j * 2 + 0]);
1199 int const ch2 = toupper(hdata[value + j * 2 + 1]);
1200
1201 if (isxdigit(ch1) && isxdigit(ch2)) {
1202 rs.checksum_array[j] =
1203 ((ch1 > '9' ? (ch1 - ('A' - 10)) : (ch1 - '0')) << 4) +
1204 (ch2 > '9' ? (ch2 - ('A' - 10)) : (ch2 - '0'));
1205 }
1206 else {
1207 rs.checksum = NULL;
1208 break;
1209 }
1210 }
1211 }
1212 }
1213 else if (strcmp(&hdata[tag], "UR") == 0)
1214 rs.uri = &hdata[value];
1215 else if (strcmp(&hdata[tag], "SP") == 0)
1216 rs.species = &hdata[value];
1217
1218 ++st;
1219 ws = 1;
1220 }
1221 break;
1222 case 4:
1223 if (cc == '@')
1224 goto DONE;
1225 tag = i;
1226 st = 1;
1227 break;
1228 }
1229 }
1230 DONE:
1231 if (st == 4) {
1232 unsigned f = 0;
1233 unsigned e = self->refSeqs;
1234
1235 if (rs.name == NULL) /* required tags */
1236 return RC(rcAlign, rcFile, rcParsing, rcConstraint, rcViolated);
1237
1238 while (f < e) {
1239 unsigned const m = (f + e) >> 1;
1240 BAMRefSeq *const x = &self->refSeq[rs_by_name[m]];
1241 int const cmp = strcmp(rs.name, x->name);
1242
1243 if (cmp < 0)
1244 e = m;
1245 else if (cmp > 0)
1246 f = m + 1;
1247 else {
1248 x->assemblyId = rs.assemblyId;
1249 x->uri = rs.uri;
1250 x->species = rs.species;
1251 if (rs.checksum) {
1252 x->checksum = &x->checksum_array[0];
1253 memmove(x->checksum_array, rs.checksum_array, 16);
1254 }
1255 else
1256 x->checksum = NULL;
1257 break;
1258 }
1259 }
1260 *used = i;
1261 return 0;
1262 }
1263 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1264 }
1265
ParseRG(BAMFile * self,char hdata[],size_t hlen,unsigned * used,BAMReadGroup * dst)1266 static rc_t ParseRG(BAMFile *self, char hdata[], size_t hlen, unsigned *used, BAMReadGroup *dst)
1267 {
1268 unsigned i;
1269 unsigned tag;
1270 unsigned value;
1271 int st = 0;
1272 int ws = 1;
1273 #if _DEBUGGING
1274 /* char const *cur = hdata; */
1275 #endif
1276
1277 memset(dst, 0, sizeof(*dst));
1278
1279 for (i = 0; i < hlen; ++i) {
1280 char const cc = hdata[i];
1281
1282 if (ws && isspace(cc))
1283 continue;
1284 ws = 0;
1285
1286 switch (st) {
1287 case 0:
1288 tag = i;
1289 ++st;
1290 break;
1291 case 1:
1292 if (isspace(cc))
1293 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1294 ++st;
1295 break;
1296 case 2:
1297 if (cc != ':')
1298 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1299 #if _DEBUGGING
1300 /* cur = hdata + i + 1; */
1301 #endif
1302 hdata[i] = '\0';
1303 value = i + 1;
1304 ++st;
1305 break;
1306 case 3:
1307 if (cc == '\t' || cc == '\r' || cc == '\n') {
1308 unsigned j = i;
1309 #if _DEBUGGING
1310 /* cur = hdata + i + 1; */
1311 #endif
1312 hdata[i] = '\0';
1313
1314 while (value < i && isspace(hdata[value]))
1315 ++value;
1316 while (value < j && isspace(hdata[j - 1]))
1317 hdata[--j] = '\0';
1318
1319 if ((hdata[value] == '\"' || hdata[value] == '\'') && hdata[value] == hdata[j - 1]) {
1320 ++value;
1321 hdata[j - 1] = '\0';
1322 }
1323 if (strcmp(&hdata[tag], "ID") == 0)
1324 dst->name = &hdata[value];
1325 else if (strcmp(&hdata[tag], "SM") == 0)
1326 dst->sample = &hdata[value];
1327 else if (strcmp(&hdata[tag], "LB") == 0)
1328 dst->library = &hdata[value];
1329 else if (strcmp(&hdata[tag], "DS") == 0)
1330 dst->description = &hdata[value];
1331 else if (strcmp(&hdata[tag], "PU") == 0)
1332 dst->unit = &hdata[value];
1333 else if (strcmp(&hdata[tag], "PI") == 0)
1334 dst->insertSize = &hdata[value];
1335 else if (strcmp(&hdata[tag], "CN") == 0)
1336 dst->center = &hdata[value];
1337 else if (strcmp(&hdata[tag], "DT") == 0)
1338 dst->runDate = &hdata[value];
1339 else if (strcmp(&hdata[tag], "PL") == 0)
1340 dst->platform = &hdata[value];
1341
1342 ++st;
1343 ws = 1;
1344 }
1345 break;
1346 case 4:
1347 if (cc == '@')
1348 goto DONE;
1349 tag = i;
1350 st = 1;
1351 break;
1352 }
1353 }
1354 if (st == 4) {
1355 DONE:
1356 *used = i;
1357 if (dst->name == NULL) /* required */
1358 return RC(rcAlign, rcFile, rcParsing, rcConstraint, rcViolated);
1359 return 0;
1360 }
1361 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1362 }
1363
ParseHeader(BAMFile * self,char hdata[],size_t hlen,unsigned const rs_by_name[])1364 static rc_t ParseHeader(BAMFile *self, char hdata[], size_t hlen, unsigned const rs_by_name[]) {
1365 unsigned rg = 0;
1366 unsigned i;
1367 unsigned tag;
1368 int st = 0;
1369 int ws = 1;
1370 unsigned used;
1371 rc_t rc;
1372
1373 for (i = 0; i < hlen; ++i) {
1374 char const cc = hdata[i];
1375
1376 if (ws && isspace(cc))
1377 continue;
1378 ws = 0;
1379
1380 switch (st) {
1381 case 0:
1382 if (cc == '@')
1383 ++st;
1384 else
1385 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1386 break;
1387 case 1:
1388 if (isspace(cc))
1389 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1390 tag = i;
1391 ++st;
1392 break;
1393 case 2:
1394 if (isspace(cc)) {
1395 hdata[i] = '\0';
1396 if (i - tag == 2) {
1397 if (strcmp(&hdata[tag], "HD") == 0) {
1398 rc = ParseHD(self, &hdata[i + 1], hlen - i - 1, &used);
1399 if (rc) return rc;
1400 i += used;
1401 st = 0;
1402 }
1403 else if (strcmp(&hdata[tag], "SQ") == 0) {
1404 rc = ParseSQ(self, &hdata[i + 1], hlen - i - 1, &used, rs_by_name);
1405 if (rc) return rc;
1406 i += used;
1407 st = 0;
1408 }
1409 else if (strcmp(&hdata[tag], "RG") == 0) {
1410 rc = ParseRG(self, &hdata[i + 1], hlen - i - 1, &used, &self->readGroup[rg]);
1411 if (GetRCObject(rc) == rcConstraint && GetRCState(rc) == rcViolated) {
1412 (void)LOGERR(klogWarn, rc, "Read Group is missing ID in BAM header");
1413 rc = 0;
1414 if (self->readGroups) --self->readGroups;
1415 }
1416 else if (rc)
1417 return rc;
1418 else
1419 ++rg;
1420 i += used;
1421 st = 0;
1422 }
1423 }
1424 if (st == 2) {
1425 ++st;
1426 ws = 0;
1427 }
1428 }
1429 else if (i - tag > 2)
1430 ++st;
1431 break;
1432 case 3:
1433 if (cc == '\r' || cc == '\n') {
1434 st = 0;
1435 ws = 1;
1436 }
1437 break;
1438 }
1439 }
1440 ksort( self->readGroup, self->readGroups, sizeof(self->readGroup[0]), comp_ReadGroup, NULL );
1441 for (rg = 0; rg != self->readGroups; ++rg) {
1442 if (rg > 0 && strcmp(self->readGroup[rg - 1].name, self->readGroup[rg].name) == 0)
1443 return RC(rcAlign, rcFile, rcParsing, rcConstraint, rcViolated); /* name must be unique */
1444 self->readGroup[rg].id = rg;
1445 }
1446 for (i = 0; i < self->refSeqs; ++i) {
1447 if (self->refSeq[i].length == 0) {
1448 (void)PLOGMSG(klogWarn, (klogWarn, "Reference '$(ref)' has zero length", "ref=%s", self->refSeq[i].name));
1449 }
1450 }
1451
1452 return 0;
1453 }
1454
CountReadGroups(char const txt[],size_t len,unsigned * reads)1455 static rc_t CountReadGroups(char const txt[], size_t len, unsigned *reads) {
1456 const char *const endp = txt + len;
1457
1458 *reads = 0;
1459
1460 do {
1461 while (txt != endp && isspace(*txt))
1462 ++txt;
1463 if (txt == endp || txt + 3 >= endp)
1464 break;
1465
1466 if (txt[0] == '@' && txt[1] == 'R' && txt[2] == 'G')
1467 ++*reads;
1468
1469 txt = memchr(txt, '\n', endp - txt);
1470 } while (txt);
1471 return 0;
1472 }
1473
ReadMagic(BAMFile * self)1474 static rc_t ReadMagic(BAMFile *self)
1475 {
1476 uint8_t sig[4];
1477 rc_t rc = BAMFileReadn(self, 4, sig);
1478
1479 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("BAM signature: '%c%c%c' %u\n", sig[0], sig[1], sig[2], sig[3]));
1480 if (rc == 0 && (sig[0] != 'B' || sig[1] != 'A' || sig[2] != 'M' || sig[3] != 1))
1481 rc = RC(rcAlign, rcFile, rcReading, rcHeader, rcBadVersion);
1482 return rc;
1483 }
1484
ReadHeaders(BAMFile * self,char ** headerText,size_t * headerTextLen,uint8_t ** refData,unsigned * numrefs)1485 static rc_t ReadHeaders(BAMFile *self,
1486 char **headerText, size_t *headerTextLen,
1487 uint8_t **refData, unsigned *numrefs)
1488 {
1489 unsigned hlen;
1490 char *htxt = NULL;
1491 unsigned nrefs;
1492 uint8_t *rdat = NULL;
1493 unsigned rdsz;
1494 unsigned rdms;
1495 unsigned i;
1496 int32_t i32;
1497 rc_t rc = BAMFileReadI32(self, &i32);
1498
1499 if (rc) return rc;
1500
1501 if (i32 < 0) {
1502 rc = RC(rcAlign, rcFile, rcReading, rcHeader, rcInvalid);
1503 goto BAILOUT;
1504 }
1505 hlen = i32;
1506 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("BAM Header text size: %u\n", hlen));
1507 if (hlen) {
1508 htxt = malloc(hlen + 1);
1509 if (htxt == NULL) {
1510 rc = RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
1511 goto BAILOUT;
1512 }
1513
1514 rc = BAMFileReadn(self, hlen, (uint8_t *)htxt); if (rc) goto BAILOUT;
1515 htxt[hlen] = '\0';
1516 }
1517 rc = BAMFileReadI32(self, &i32); if (rc) goto BAILOUT;
1518 if (i32 < 0) {
1519 rc = RC(rcAlign, rcFile, rcReading, rcHeader, rcInvalid);
1520 goto BAILOUT;
1521 }
1522 nrefs = i32;
1523 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("BAM Header reference count: %u\n", nrefs));
1524 if (nrefs) {
1525 rdms = nrefs * 16;
1526 if (rdms < 4096)
1527 rdms = 4096;
1528 rdat = malloc(rdms);
1529 if (rdat == NULL) {
1530 rc = RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
1531 goto BAILOUT;
1532 }
1533 for (i = rdsz = 0; i < nrefs; ++i) {
1534 rc = BAMFileReadI32(self, &i32); if (rc) goto BAILOUT;
1535 if (i32 <= 0) {
1536 rc = RC(rcAlign, rcFile, rcReading, rcHeader, rcInvalid);
1537 goto BAILOUT;
1538 }
1539 if (rdsz + i32 + 8 > rdms) {
1540 void *tmp;
1541
1542 do { rdms <<= 1; } while (rdsz + i32 + 8 > rdms);
1543 tmp = realloc(rdat, rdms);
1544 if (tmp == NULL) {
1545 rc = RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
1546 goto BAILOUT;
1547 }
1548 rdat = tmp;
1549 }
1550 memmove(rdat + rdsz, &i32, 4);
1551 rdsz += 4;
1552 rc = BAMFileReadn(self, i32, &rdat[rdsz]); if (rc) goto BAILOUT;
1553 rdsz += i32;
1554 rc = BAMFileReadI32(self, &i32); if (rc) goto BAILOUT;
1555 memmove(rdat + rdsz, &i32, 4);
1556 rdsz += 4;
1557 }
1558 }
1559 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("BAM Header reference size: %u\n", rdsz));
1560
1561 *headerText = htxt;
1562 *headerTextLen = hlen;
1563 *refData = rdat;
1564 *numrefs = nrefs;
1565 return 0;
1566
1567 BAILOUT:
1568 if (htxt)
1569 free(htxt);
1570 if (rdat)
1571 free(rdat);
1572
1573 return rc;
1574 }
1575
comp_RefSeqName(const void * A,const void * B,void * ignored)1576 static int64_t CC comp_RefSeqName(const void *A, const void *B, void *ignored) {
1577 BAMFile const *self = (BAMFile const *)ignored;
1578 unsigned const a = *(unsigned const *)A;
1579 unsigned const b = *(unsigned const *)B;
1580
1581 return strcmp(self->refSeq[a].name, self->refSeq[b].name);
1582 }
1583
ProcessHeader(BAMFile * self,char const headerText[])1584 static rc_t ProcessHeader(BAMFile *self, char const headerText[])
1585 {
1586 unsigned *rs_by_name = NULL;
1587 unsigned i;
1588 unsigned cp;
1589 char *htxt;
1590 uint8_t *rdat;
1591 size_t hlen;
1592 unsigned nrefs;
1593 rc_t rc = ReadMagic(self);
1594
1595 if (rc) return rc;
1596
1597 rc = ReadHeaders(self, &htxt, &hlen, &rdat, &nrefs);
1598 if (rc) return rc;
1599
1600 self->fpos_first = self->fpos_cur;
1601 self->ucfirst = self->bufCurrent;
1602 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("BAM Data records start at: %lu+%u\n", self->ucfirst, self->fpos_first));
1603
1604 if (headerText) {
1605 free(htxt);
1606 hlen = string_size( headerText );
1607 htxt = malloc(hlen + 1);
1608 if (htxt == NULL) {
1609 free(rdat);
1610 return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
1611 }
1612 memmove(htxt, headerText, hlen + 1);
1613 }
1614
1615 self->headerData2 = rdat;
1616 if (hlen) {
1617 self->header = htxt;
1618 self->headerData1 = malloc(hlen + 1);
1619 if (self->headerData1 == NULL)
1620 return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
1621 memmove(self->headerData1, self->header, hlen + 1);
1622 }
1623 else {
1624 htxt = malloc(1);
1625 htxt[0] = '\0';
1626 self->header = htxt;
1627 self->headerData1 = NULL;
1628 }
1629 self->refSeqs = nrefs;
1630 if (nrefs) {
1631 self->refSeq = calloc(nrefs, sizeof(self->refSeq[0]));
1632 if (self->refSeq == NULL)
1633 return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
1634
1635 rs_by_name = calloc(nrefs, sizeof(rs_by_name[0]));
1636 if (rs_by_name == NULL)
1637 return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
1638
1639 for (i = cp = 0; i < nrefs; ++i) {
1640 uint32_t nlen;
1641 uint32_t rlen;
1642
1643 rs_by_name[i] = i;
1644 self->refSeq[i].id = i;
1645 memmove(&nlen, &self->headerData2[cp], 4);
1646 cp += 4;
1647 self->refSeq[i].name = (char const *)&self->headerData2[cp];
1648 cp += nlen;
1649 memmove(&rlen, &self->headerData2[cp], 4);
1650 self->headerData2[cp] = 0;
1651 cp += 4;
1652 self->refSeq[i].length = rlen;
1653 }
1654 ksort((void *)rs_by_name, self->refSeqs, sizeof(rs_by_name[0]), comp_RefSeqName, self);
1655 }
1656 if (self->headerData1) {
1657 rc = CountReadGroups(self->headerData1, hlen, &self->readGroups);
1658 if (rc == 0) {
1659 self->readGroup = calloc(self->readGroups, sizeof(self->readGroup[0]));
1660 if (self->readGroup != NULL)
1661 rc = ParseHeader(self, self->headerData1, hlen, rs_by_name);
1662 else
1663 rc = RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
1664 }
1665 }
1666 if (rs_by_name != NULL)
1667 free((void *)rs_by_name);
1668
1669 return rc;
1670 }
1671
1672 /* MARK: BAM File destructor */
1673
1674 static rc_t BAMIndexWhack(const BAMIndex *);
1675
BAMFileWhack(BAMFile * self)1676 static rc_t BAMFileWhack(BAMFile *self) {
1677 if (self->refSeq)
1678 free(self->refSeq);
1679 if (self->readGroup)
1680 free(self->readGroup);
1681 if (self->header)
1682 free((void *)self->header);
1683 if (self->headerData1)
1684 free((void *)self->headerData1);
1685 if (self->headerData2)
1686 free((void *)self->headerData2);
1687 if (self->ndx)
1688 BAMIndexWhack(self->ndx);
1689 if (self->nocopy)
1690 free(self->nocopy);
1691 if (self->vt.FileWhack)
1692 self->vt.FileWhack(&self->file);
1693
1694 return 0;
1695 }
1696
1697 /* MARK: BAM File constructors */
1698
1699 /* file is retained */
BAMFileMakeWithKFileAndHeader(BAMFile const ** cself,KFile const * file,char const * headerText,bool threaded)1700 static rc_t BAMFileMakeWithKFileAndHeader(BAMFile const **cself,
1701 KFile const *file,
1702 char const *headerText,
1703 bool threaded)
1704 {
1705 BAMFile *self = calloc(1, sizeof(*self));
1706 rc_t rc;
1707
1708 if (self == NULL)
1709 return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
1710
1711 KRefcountInit(&self->refcount, 1, "BAMFile", "new", "");
1712 #ifndef WINDOWS
1713 if (threaded)
1714 rc = BGZThreadFileInit(&self->file.thread, file, &self->vt);
1715 else
1716 #endif
1717 rc = BGZFileInit(&self->file.plain, file, &self->vt);
1718
1719 if (rc == 0) {
1720 rc = ProcessHeader(self, headerText);
1721 if (rc == 0) {
1722 *cself = self;
1723 return 0;
1724 }
1725 }
1726 BAMFileWhack(self);
1727 return rc;
1728 }
1729
1730 /* file is retained */
BAMFileMakeWithKFile(const BAMFile ** cself,const KFile * file)1731 LIB_EXPORT rc_t CC BAMFileMakeWithKFile(const BAMFile **cself, const KFile *file)
1732 {
1733 return BAMFileMakeWithKFileAndHeader(cself, file, NULL, false);
1734 }
1735
BAMFileVMakeWithDir(const BAMFile ** result,const KDirectory * dir,const char * path,va_list args)1736 LIB_EXPORT rc_t CC BAMFileVMakeWithDir(const BAMFile **result,
1737 const KDirectory *dir,
1738 const char *path,
1739 va_list args
1740 )
1741 {
1742 rc_t rc;
1743 const KFile *kf;
1744
1745 if (result == NULL)
1746 return RC(rcAlign, rcFile, rcOpening, rcParam, rcNull);
1747 *result = NULL;
1748 rc = KDirectoryVOpenFileRead(dir, &kf, path, args);
1749 if (rc == 0) {
1750 rc = BAMFileMakeWithKFile(result, kf);
1751 KFileRelease(kf);
1752 }
1753 return rc;
1754 }
1755
BAMFileMakeWithDir(const BAMFile ** result,const KDirectory * dir,const char * path,...)1756 LIB_EXPORT rc_t CC BAMFileMakeWithDir(const BAMFile **result,
1757 const KDirectory *dir,
1758 const char *path, ...
1759 )
1760 {
1761 va_list args;
1762 rc_t rc;
1763
1764 va_start(args, path);
1765 rc = BAMFileVMakeWithDir(result, dir, path, args);
1766 va_end(args);
1767 return rc;
1768 }
1769
BAMFileMake(const BAMFile ** cself,const char * path,...)1770 LIB_EXPORT rc_t CC BAMFileMake(const BAMFile **cself, const char *path, ...)
1771 {
1772 KDirectory *dir;
1773 va_list args;
1774 rc_t rc;
1775
1776 if (cself == NULL)
1777 return RC(rcAlign, rcFile, rcOpening, rcParam, rcNull);
1778 *cself = NULL;
1779
1780 rc = KDirectoryNativeDir(&dir);
1781 if (rc) return rc;
1782 va_start(args, path);
1783 rc = BAMFileVMakeWithDir(cself, dir, path, args);
1784 va_end(args);
1785 KDirectoryRelease(dir);
1786 return rc;
1787 }
1788
BAMFileMakeWithHeader(const BAMFile ** cself,char const headerText[],char const path[],...)1789 LIB_EXPORT rc_t CC BAMFileMakeWithHeader ( const BAMFile **cself,
1790 char const headerText[],
1791 char const path[], ... )
1792 {
1793 KDirectory *dir;
1794 va_list args;
1795 rc_t rc;
1796 const KFile *kf;
1797
1798 if (cself == NULL)
1799 return RC(rcAlign, rcFile, rcOpening, rcParam, rcNull);
1800 *cself = NULL;
1801
1802 rc = KDirectoryNativeDir(&dir);
1803 if (rc) return rc;
1804 va_start(args, path);
1805 rc = KDirectoryVOpenFileRead(dir, &kf, path, args);
1806 if (rc == 0) {
1807 rc = BAMFileMakeWithKFileAndHeader(cself, kf, headerText, false);
1808 KFileRelease(kf);
1809 }
1810 va_end(args);
1811 KDirectoryRelease(dir);
1812 return rc;
1813 }
1814
BAMFileMakeWithVPath(const BAMFile ** cself,const VPath * path)1815 LIB_EXPORT rc_t CC BAMFileMakeWithVPath(const BAMFile **cself, const VPath *path)
1816 {
1817 VFSManager *vfs = NULL;
1818 KFile const *fp = NULL;
1819 rc_t rc = 0;
1820
1821 rc = VFSManagerMake(&vfs);
1822 if (rc) return rc;
1823
1824 rc = VFSManagerOpenFileRead ( vfs, &fp, path);
1825 VFSManagerRelease(vfs);
1826 if (rc) return rc;
1827
1828 rc = BAMFileMakeWithKFile(cself, fp);
1829 if (rc) return rc;
1830
1831 KFileRelease(fp);
1832 return 0;
1833 }
1834
1835 /* MARK: BAM File ref-counting */
1836
BAMFileAddRef(const BAMFile * cself)1837 LIB_EXPORT rc_t CC BAMFileAddRef(const BAMFile *cself) {
1838 if (cself != NULL)
1839 KRefcountAdd(&cself->refcount, "BAMFile");
1840 return 0;
1841 }
1842
BAMFileRelease(const BAMFile * cself)1843 LIB_EXPORT rc_t CC BAMFileRelease(const BAMFile *cself) {
1844 rc_t rc = 0;
1845 BAMFile *self = (BAMFile *)cself;
1846
1847 if (cself != NULL) {
1848 if (KRefcountDrop(&self->refcount, "BAMFile") == krefWhack) {
1849 rc = BAMFileWhack(self);
1850 free(self);
1851 }
1852 }
1853 return rc;
1854 }
1855
1856 /* MARK: BAM File positioning */
1857
BAMFileGetProportionalPosition(const BAMFile * self)1858 LIB_EXPORT float CC BAMFileGetProportionalPosition(const BAMFile *self)
1859 {
1860 return self->vt.FileProPos(&self->file);
1861 }
1862
BAMFileGetPositionInt(const BAMFile * self)1863 static BAMFilePosition BAMFileGetPositionInt(const BAMFile *self) {
1864 return (self->fpos_cur << 16) | self->bufCurrent;
1865 }
1866
BAMFileGetPosition(const BAMFile * self,BAMFilePosition * pos)1867 LIB_EXPORT rc_t CC BAMFileGetPosition(const BAMFile *self, BAMFilePosition *pos) {
1868 *pos = BAMFileGetPositionInt(self);
1869 return 0;
1870 }
1871
BAMFileSetPositionInt(const BAMFile * cself,uint64_t fpos,uint16_t bpos)1872 static rc_t BAMFileSetPositionInt(const BAMFile *cself, uint64_t fpos, uint16_t bpos)
1873 {
1874 rc_t rc;
1875 BAMFile *self = (BAMFile *)cself;
1876
1877 if (cself->fpos_first > fpos || fpos > cself->vt.FileGetSize(&cself->file) ||
1878 (fpos == cself->fpos_first && bpos < cself->ucfirst))
1879 {
1880 return RC(rcAlign, rcFile, rcPositioning, rcParam, rcInvalid);
1881 }
1882 if (cself->fpos_cur == fpos) {
1883 if (bpos <= cself->bufSize) {
1884 self->eof = false;
1885 self->bufCurrent = bpos;
1886 return 0;
1887 }
1888 return RC(rcAlign, rcFile, rcPositioning, rcParam, rcInvalid);
1889 }
1890 rc = self->vt.FileSetPos(&self->file, fpos);
1891 if (rc == 0) {
1892 self->eof = false;
1893 self->bufSize = 0; /* force re-read */
1894 self->bufCurrent = bpos;
1895 self->fpos_cur = fpos;
1896 }
1897 return rc;
1898 }
1899
BAMFileSetPosition(const BAMFile * cself,const BAMFilePosition * pos)1900 LIB_EXPORT rc_t CC BAMFileSetPosition(const BAMFile *cself, const BAMFilePosition *pos)
1901 {
1902 return BAMFileSetPositionInt(cself, *pos >> 16, (uint16_t)(*pos));
1903 }
1904
BAMFileRewind(const BAMFile * cself)1905 LIB_EXPORT rc_t CC BAMFileRewind(const BAMFile *cself)
1906 {
1907 return BAMFileSetPositionInt(cself, cself->fpos_first, cself->ucfirst);
1908 }
1909
BAMFileAdvance(BAMFile * const self,unsigned distance)1910 static void BAMFileAdvance(BAMFile *const self, unsigned distance)
1911 {
1912 self->bufCurrent += distance;
1913 if (self->bufCurrent == self->bufSize) {
1914 self->fpos_cur = self->vt.FileGetPos(&self->file);
1915 self->bufCurrent = 0;
1916 self->bufSize = 0;
1917 }
1918 }
1919
1920 /* MARK: BAM Alignment contruction */
1921
TagTypeSize(int const type)1922 static int TagTypeSize(int const type)
1923 {
1924 switch (type) {
1925 case dt_ASCII: /* A */
1926 case dt_INT8: /* c */
1927 case dt_UINT8: /* C */
1928 return 1;
1929
1930 case dt_INT16: /* s */
1931 case dt_UINT16: /* S */
1932 return 2;
1933
1934 case dt_INT: /* i */
1935 case dt_UINT: /* I */
1936 case dt_FLOAT32: /* f */
1937 return 4;
1938 #if 0
1939 case dt_FLOAT64: /* d */
1940 return 8;
1941 #endif
1942 case dt_CSTRING: /* Z */
1943 case dt_HEXSTRING: /* H */
1944 return -'S';
1945
1946 case dt_NUM_ARRAY: /* B */
1947 return -'A';
1948 }
1949 return 0;
1950 }
1951
ColorCheck(BAMAlignment * const self,char const tag[2],unsigned const len)1952 static void ColorCheck(BAMAlignment *const self, char const tag[2], unsigned const len)
1953 {
1954 if (tag[0] == 'C' && len != 0) {
1955 int const ch = tag[1];
1956 int const flag = ch == 'Q' ? 2 : ch == 'S' ? 1 : 0;
1957
1958 if (flag)
1959 self->hasColor ^= (len << 2) | flag;
1960 }
1961 }
1962
ParseOptData(BAMAlignment * const self,size_t const maxsize,size_t const xtra,size_t const datasize)1963 static rc_t ParseOptData(BAMAlignment *const self, size_t const maxsize,
1964 size_t const xtra, size_t const datasize)
1965 {
1966 size_t const maxExtra = (maxsize - (sizeof(*self) - sizeof(self->extra))) / sizeof(self->extra[0]);
1967 char const *const base = (char const *)self->data->raw;
1968 unsigned i = 0;
1969 unsigned len;
1970 unsigned offset;
1971
1972 self->numExtra = 0;
1973 for (len = 0, offset = (unsigned)xtra; offset < datasize; offset += len) {
1974 int const valuelen1 = TagTypeSize(base[offset + 2]);
1975 unsigned valuelen;
1976
1977 if (valuelen1 < 0) {
1978 char const *const value = &base[offset + 3];
1979
1980 if (-valuelen1 == 'S') {
1981 valuelen = 0;
1982 while (value[valuelen] != '\0') {
1983 ++valuelen;
1984 if (offset + valuelen >= datasize) {
1985 rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
1986 return rc;
1987 }
1988 }
1989 ColorCheck(self, base + offset, valuelen);
1990 ++valuelen;
1991 }
1992 else {
1993 int const elem_size = TagTypeSize(value[0]);
1994
1995 assert(-valuelen1 == 'A');
1996 if (elem_size <= 0) {
1997 rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcUnexpected);
1998 return rc;
1999 }
2000 else {
2001 int const elem_count = LE2HI32(&value[1]);
2002
2003 valuelen = elem_size * elem_count + 1 + 4;
2004 if (offset + valuelen >= datasize) {
2005 rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2006 return rc;
2007 }
2008 }
2009 }
2010 }
2011 else if (valuelen1 == 0) {
2012 rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcUnexpected);
2013 return rc;
2014 }
2015 else
2016 valuelen = valuelen1;
2017
2018 len = valuelen + 3;
2019 if (i < maxExtra) {
2020 self->extra[i].offset = offset;
2021 self->extra[i].size = len;
2022 }
2023 ++i;
2024 }
2025 self->numExtra = i;
2026 if (2 <= i && i <= maxExtra)
2027 ksort(self->extra, i, sizeof(self->extra[0]), OptTag_sort, self);
2028
2029 return 0;
2030 }
2031
ParseOptDataLog(BAMAlignment * const self,unsigned const maxsize,unsigned const xtra,unsigned const datasize)2032 static rc_t ParseOptDataLog(BAMAlignment *const self, unsigned const maxsize,
2033 unsigned const xtra, unsigned const datasize)
2034 {
2035 unsigned const maxExtra = (maxsize - (sizeof(*self) - sizeof(self->extra))) / sizeof(self->extra[0]);
2036 char const *const base = (char const *)self->data->raw;
2037 unsigned i = 0;
2038 unsigned len;
2039 unsigned offset;
2040
2041 self->numExtra = 0;
2042 for (len = 0, offset = (unsigned)xtra; offset < datasize; offset += len) {
2043 int const type = base[offset + 2];
2044 int const valuelen1 = TagTypeSize(type);
2045 unsigned valuelen;
2046
2047 if (valuelen1 > 0)
2048 valuelen = valuelen1;
2049 else if (valuelen1 < 0) {
2050 char const *const value = &base[offset + 3];
2051
2052 if (-valuelen1 == 'S') {
2053 valuelen = 0;
2054 while (value[valuelen] != '\0') {
2055 ++valuelen;
2056 if (offset + valuelen >= datasize) {
2057 rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2058 (void)LOGERR(klogErr, rc,
2059 "Parsing BAM optional fields: "
2060 "unterminated string");
2061 return rc;
2062 }
2063 }
2064 ColorCheck(self, base + offset, valuelen);
2065 ++valuelen;
2066 }
2067 else {
2068 int const elem_type = value[0];
2069 int const elem_size = TagTypeSize(elem_type);
2070
2071 assert(-valuelen1 == 'A');
2072 if (elem_size <= 0) {
2073 rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcUnexpected);
2074 (void)LOGERR(klogErr, rc,
2075 "Parsing BAM optional fields: "
2076 "unknown array type");
2077 return rc;
2078 }
2079 else {
2080 int const elem_count = LE2HI32(&value[1]);
2081
2082 valuelen = elem_size * elem_count + 1 + 4;
2083 if (offset + valuelen >= datasize) {
2084 rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2085 (void)LOGERR(klogErr, rc,
2086 "Parsing BAM optional fields: "
2087 "array too big");
2088 return rc;
2089 }
2090 }
2091 }
2092 }
2093 else {
2094 rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcUnexpected);
2095 (void)LOGERR(klogErr, rc,
2096 "Parsing BAM optional fields: "
2097 "unknown type");
2098 return rc;
2099 }
2100
2101 len = valuelen + 3;
2102 if (i < maxExtra) {
2103 self->extra[i].offset = offset;
2104 self->extra[i].size = len;
2105 }
2106 ++i;
2107 }
2108 self->numExtra = i;
2109 if (2 <= i && i <= maxExtra)
2110 ksort(self->extra, i, sizeof(self->extra[0]), OptTag_sort, self);
2111
2112 return 0;
2113 }
2114
BAMAlignmentSize(unsigned const max_extra_tags)2115 static unsigned CC BAMAlignmentSize(unsigned const max_extra_tags)
2116 {
2117 BAMAlignment const *const y = NULL;
2118
2119 return sizeof(*y) + (max_extra_tags ? max_extra_tags - 1 : 0) * sizeof(y->extra);
2120 }
2121
BAMAlignmentSetOffsets(BAMAlignment * const self)2122 static unsigned BAMAlignmentSetOffsets(BAMAlignment *const self)
2123 {
2124 unsigned const nameLen = getReadNameLength(self);
2125 unsigned const cigCnt = getCigarCount(self);
2126 unsigned const readLen = getReadLen(self);
2127 unsigned const cigar = (unsigned)(&((struct bam_alignment_s const *)NULL)->read_name[nameLen] - (const char *)NULL);
2128 unsigned const seq = cigar + 4 * cigCnt;
2129 unsigned const qual = seq + (readLen + 1) / 2;
2130 unsigned const xtra = qual + readLen;
2131
2132 self->cigar = cigar;
2133 self->seq = seq;
2134 self->qual = qual;
2135
2136 return xtra;
2137 }
2138
BAMAlignmentInit(BAMAlignment * const self,unsigned const maxsize,BAMFilePosition pos,unsigned const datasize,void const * const data)2139 static bool BAMAlignmentInit(BAMAlignment *const self, unsigned const maxsize,
2140 BAMFilePosition pos,
2141 unsigned const datasize, void const *const data)
2142 {
2143 memset(self, 0, sizeof(*self));
2144 self->pos = pos;
2145 self->data = data;
2146 self->datasize = datasize;
2147 {
2148 unsigned const xtra = BAMAlignmentSetOffsets(self);
2149
2150 if ( datasize >= xtra
2151 && datasize >= self->cigar
2152 && datasize >= self->seq
2153 && datasize >= self->qual)
2154 {
2155 rc_t const rc = ParseOptData(self, maxsize, xtra, datasize);
2156
2157 if (rc == 0)
2158 return true;
2159 }
2160 return false;
2161 }
2162 }
2163
BAMAlignmentInitLog(BAMAlignment * const self,unsigned const maxsize,BAMFilePosition pos,unsigned const datasize,void const * const data)2164 static bool BAMAlignmentInitLog(BAMAlignment *const self, unsigned const maxsize,
2165 BAMFilePosition pos,
2166 unsigned const datasize, void const *const data)
2167 {
2168 memset(self, 0, sizeof(*self));
2169 self->pos = pos;
2170 self->data = data;
2171 self->datasize = datasize;
2172 {
2173 unsigned const xtra = BAMAlignmentSetOffsets(self);
2174
2175 if ( datasize >= xtra
2176 && datasize >= self->cigar
2177 && datasize >= self->seq
2178 && datasize >= self->qual)
2179 {
2180 rc_t const rc = ParseOptDataLog(self, maxsize, xtra, datasize);
2181
2182 if (rc == 0) {
2183 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("{"
2184 "\"BAM record\": "
2185 "{ "
2186 "\"size\": %u, "
2187 "\"name length\": %u, "
2188 "\"cigar count\": %u, "
2189 "\"read length\": %u, "
2190 "\"extra count\": %u "
2191 "}"
2192 "}\n",
2193 (unsigned)datasize,
2194 (unsigned)getReadNameLength(self),
2195 (unsigned)getCigarCount(self),
2196 (unsigned)getReadLen(self),
2197 (unsigned)self->numExtra));
2198 return true;
2199 }
2200 }
2201 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("{"
2202 "\"BAM record\": "
2203 "{ "
2204 "\"size\": %u, "
2205 "\"name length\": %u, "
2206 "\"cigar count\": %u, "
2207 "\"read length\": %u "
2208 "}"
2209 "}\n",
2210 (unsigned)datasize,
2211 (unsigned)getReadNameLength(self),
2212 (unsigned)getCigarCount(self),
2213 (unsigned)getReadLen(self)));
2214 return false;
2215 }
2216 }
2217
BAMAlignmentLogParseError(BAMAlignment const * self)2218 static void BAMAlignmentLogParseError(BAMAlignment const *self)
2219 {
2220 char const *const reason = self->cigar > self->datasize ? "BAM Record CIGAR too long"
2221 : self->seq > self->datasize ? "BAM Record SEQ too long"
2222 : self->qual > self->datasize ? "BAM Record QUAL too long"
2223 : self->qual + getReadLen(self) > self->datasize ? "BAM Record EXTRA too long"
2224 : "BAM Record EXTRA parsing failure";
2225
2226 LOGERR(klogErr, SILENT_RC(rcAlign, rcFile, rcReading, rcRow, rcInvalid), reason);
2227 }
2228
2229 /* MARK: BAM Alignment readers */
2230
2231 /* returns
2232 * (rcAlign, rcFile, rcReading, rcBuffer, rcNotAvailable)
2233 * or
2234 * (rcAlign, rcFile, rcReading, rcBuffer, rcInsufficient)
2235 * if should read with copy
2236 */
2237 static
BAMFileReadNoCopy(BAMFile * const self,unsigned actsize[],BAMAlignment rhs[],unsigned const maxsize)2238 rc_t BAMFileReadNoCopy(BAMFile *const self, unsigned actsize[], BAMAlignment rhs[],
2239 unsigned const maxsize)
2240 {
2241 unsigned const maxPeek = BAMFileMaxPeek(self);
2242 bool isgood;
2243
2244 *actsize = 0;
2245
2246 if (maxPeek == 0) {
2247 rc_t const rc = BAMFileFillBuffer(self);
2248
2249 if (rc == 0)
2250 return BAMFileReadNoCopy(self, actsize, rhs, maxsize);
2251
2252 if ( GetRCObject( rc ) == (enum RCObject)rcData && GetRCState( rc ) == rcInsufficient )
2253 {
2254 self->eof = true;
2255 return SILENT_RC(rcAlign, rcFile, rcReading, rcRow, rcNotFound);
2256 }
2257 return rc;
2258 }
2259 if (maxPeek < 4)
2260 return SILENT_RC(rcAlign, rcFile, rcReading, rcBuffer, rcNotAvailable);
2261 else {
2262 BAMFilePosition curPos = (self->fpos_cur << 16) | self->bufCurrent;
2263 int32_t const i32 = BAMFilePeekI32(self);
2264
2265 if (i32 <= 0)
2266 return RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2267
2268 if (maxPeek < ( uint32_t ) i32 + 4)
2269 return SILENT_RC(rcAlign, rcFile, rcReading, rcBuffer, rcNotAvailable);
2270
2271 isgood = BAMAlignmentInitLog(rhs, maxsize, curPos, i32, BAMFilePeek(self, 4));
2272 rhs[0].parent = self;
2273 KRefcountInit(&rhs->refcount, 1, "BAMAlignment", "ReadNoCopy", "");
2274 }
2275 *actsize = BAMAlignmentSize(rhs[0].numExtra);
2276 if (isgood && *actsize > maxsize)
2277 return SILENT_RC(rcAlign, rcFile, rcReading, rcBuffer, rcInsufficient);
2278
2279 BAMFileAdvance(self, 4 + rhs->datasize);
2280 return isgood ? 0 : RC(rcAlign, rcFile, rcReading, rcRow, rcInvalid);
2281 }
2282
2283 static
BAMAlignmentSizeFromData(unsigned const datasize,void const * data)2284 unsigned BAMAlignmentSizeFromData(unsigned const datasize, void const *data)
2285 {
2286 BAMAlignment temp;
2287
2288 BAMAlignmentInit(&temp, sizeof(temp), 0, datasize, data);
2289
2290 return BAMAlignmentSize(temp.numExtra);
2291 }
2292
BAMAlignmentIsEmpty(BAMAlignment const * const self)2293 static bool BAMAlignmentIsEmpty(BAMAlignment const *const self)
2294 {
2295 if (getReadNameLength(self) == 0)
2296 return true;
2297 if (self->hasColor == 3)
2298 return false;
2299 if (getReadLen(self) != 0)
2300 return false;
2301 if (getCigarCount(self) != 0)
2302 return false;
2303 return true;
2304 }
2305
2306 static
BAMFileReadCopy(BAMFile * const self,BAMAlignment const * rslt[],bool const log)2307 rc_t BAMFileReadCopy(BAMFile *const self, BAMAlignment const *rslt[], bool const log)
2308 {
2309 void *storage;
2310 void const *data;
2311 unsigned datasize;
2312 rc_t rc;
2313 BAMFilePosition curPos = (self->fpos_cur << 16) | self->bufCurrent;
2314
2315 rslt[0] = NULL;
2316 {
2317 int32_t i32;
2318
2319 rc = BAMFileReadI32(self, &i32);
2320 if ( rc != 0 )
2321 {
2322 if ( GetRCObject( rc ) == (enum RCObject)rcData && GetRCState( rc ) == rcInsufficient )
2323 {
2324 self->eof = true;
2325 rc = RC( rcAlign, rcFile, rcReading, rcRow, rcNotFound );
2326 }
2327 return rc;
2328 }
2329 if (i32 <= 0)
2330 return RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2331
2332 datasize = i32;
2333 }
2334 if (BAMFileMaxPeek(self) < datasize) {
2335 data = storage = malloc(datasize);
2336 if (storage == NULL)
2337 return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
2338
2339 rc = BAMFileReadn(self, datasize, storage);
2340 }
2341 else {
2342 storage = NULL;
2343 data = (bam_alignment *)&self->buffer[self->bufCurrent];
2344
2345 BAMFileAdvance(self, datasize);
2346 }
2347 if (rc == 0) {
2348 unsigned const rsltsize = BAMAlignmentSizeFromData(datasize, data);
2349 BAMAlignment *const y = malloc(rsltsize);
2350
2351 if (y) {
2352 if ((log ? BAMAlignmentInitLog : BAMAlignmentInit)(y, rsltsize, curPos, datasize, data)) {
2353 if (storage == NULL)
2354 self->bufLocker = y;
2355 else
2356 y->storage = storage;
2357
2358 y->parent = self;
2359 KRefcountInit(&y->refcount, 1, "BAMAlignment", "ReadCopy", "");
2360 BAMFileAddRef(self);
2361 rslt[0] = y;
2362
2363 if (BAMAlignmentIsEmpty(y))
2364 return RC(rcAlign, rcFile, rcReading, rcRow, rcEmpty);
2365 return 0;
2366 }
2367 rc = RC(rcAlign, rcFile, rcReading, rcRow, rcInvalid);
2368 free(y);
2369 }
2370 else
2371 rc = RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
2372 }
2373 free(storage);
2374
2375 return rc;
2376 }
2377
2378 static
BAMFileBreakLock(BAMFile * const self)2379 rc_t BAMFileBreakLock(BAMFile *const self)
2380 {
2381 if (self->bufLocker != NULL) {
2382 if (self->bufLocker->storage == NULL)
2383 self->bufLocker->storage = malloc(self->bufLocker->datasize);
2384 if (self->bufLocker->storage == NULL)
2385 return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
2386
2387 memmove(self->bufLocker->storage, self->bufLocker->data, self->bufLocker->datasize);
2388 self->bufLocker->data = (bam_alignment *)&self->bufLocker->storage[0];
2389 self->bufLocker = NULL;
2390 }
2391 return 0;
2392 }
2393
BAMFileRead2(const BAMFile * cself,const BAMAlignment ** rhs)2394 LIB_EXPORT rc_t CC BAMFileRead2(const BAMFile *cself, const BAMAlignment **rhs)
2395 {
2396 BAMFile *const self = (BAMFile *)cself;
2397 unsigned actsize = 0;
2398 rc_t rc;
2399
2400 if (self == NULL || rhs == NULL)
2401 return RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
2402
2403 *rhs = NULL;
2404
2405 if (self->bufCurrent >= self->bufSize && self->eof)
2406 return RC(rcAlign, rcFile, rcReading, rcRow, rcNotFound);
2407
2408 rc = BAMFileBreakLock(self);
2409 if (rc)
2410 return rc;
2411
2412 if (self->nocopy_size == 0) {
2413 size_t const size = 4096u;
2414 void *const temp = malloc(size);
2415
2416 if (temp == NULL)
2417 return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
2418
2419 self->nocopy = temp;
2420 self->nocopy_size = size;
2421 }
2422
2423 AGAIN:
2424 rc = BAMFileReadNoCopy(self, &actsize, self->nocopy, (unsigned)self->nocopy_size);
2425 if (rc == 0) {
2426 *rhs = self->nocopy;
2427 if (BAMAlignmentIsEmpty(self->nocopy)) {
2428 rc = RC(rcAlign, rcFile, rcReading, rcRow, rcEmpty);
2429 LOGERR(klogWarn, rc, "BAM Record contains no alignment or sequence data");
2430 }
2431 }
2432 else if ( GetRCObject( rc ) == (enum RCObject)rcBuffer && GetRCState( rc ) == rcInsufficient )
2433 {
2434 unsigned const size = (actsize + 4095u) & ~4095u;
2435 void *const temp = realloc(self->nocopy, size);
2436
2437 if (temp == NULL)
2438 return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
2439
2440 self->nocopy = temp;
2441 self->nocopy_size = size;
2442
2443 goto AGAIN;
2444 }
2445 else if ( GetRCObject( rc ) == (enum RCObject)rcBuffer && GetRCState( rc ) == rcNotAvailable )
2446 {
2447 rc = BAMFileReadCopy( self, rhs, true );
2448 }
2449 else if (GetRCObject(rc) == rcRow && GetRCState(rc) == rcInvalid) {
2450 BAMAlignmentLogParseError(self->nocopy);
2451 }
2452 return rc;
2453 }
2454
BAMFileRead(const BAMFile * cself,const BAMAlignment ** rhs)2455 LIB_EXPORT rc_t CC BAMFileRead(const BAMFile *cself, const BAMAlignment **rhs)
2456 {
2457 BAMFile *const self = (BAMFile *)cself;
2458
2459 if (self == NULL || rhs == NULL)
2460 return RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
2461
2462 *rhs = NULL;
2463
2464 if (self->bufCurrent >= self->bufSize && self->eof)
2465 return RC(rcAlign, rcFile, rcReading, rcRow, rcNotFound);
2466 else {
2467 rc_t const rc = BAMFileBreakLock(self);
2468 if (rc)
2469 return rc;
2470 }
2471 return BAMFileReadCopy(self, rhs, false);
2472 }
2473
2474 /* MARK: BAM File header info accessor */
2475
BAMFileGetRefSeqById(const BAMFile * cself,int32_t id,const BAMRefSeq ** rhs)2476 LIB_EXPORT rc_t CC BAMFileGetRefSeqById(const BAMFile *cself, int32_t id, const BAMRefSeq **rhs)
2477 {
2478 *rhs = NULL;
2479 if (id >= 0 && id < cself->refSeqs)
2480 *rhs = &cself->refSeq[id];
2481 return 0;
2482 }
2483
BAMFileGetReadGroupByName(const BAMFile * cself,const char * name,const BAMReadGroup ** rhs)2484 LIB_EXPORT rc_t CC BAMFileGetReadGroupByName(const BAMFile *cself, const char *name, const BAMReadGroup **rhs)
2485 {
2486 BAMReadGroup rg;
2487
2488 *rhs = NULL;
2489
2490 rg.name = name;
2491 if (rg.name != NULL)
2492 *rhs = kbsearch(&rg, cself->readGroup, cself->readGroups, sizeof(rg), comp_ReadGroup, NULL);
2493
2494 return 0;
2495 }
2496
BAMFileGetRefSeqCount(const BAMFile * cself,unsigned * rhs)2497 LIB_EXPORT rc_t CC BAMFileGetRefSeqCount(const BAMFile *cself, unsigned *rhs)
2498 {
2499 *rhs = cself->refSeqs;
2500 return 0;
2501 }
2502
BAMFileGetRefSeq(const BAMFile * cself,unsigned i,const BAMRefSeq ** rhs)2503 LIB_EXPORT rc_t CC BAMFileGetRefSeq(const BAMFile *cself, unsigned i, const BAMRefSeq **rhs)
2504 {
2505 *rhs = NULL;
2506 if (i < cself->refSeqs)
2507 *rhs = &cself->refSeq[i];
2508 return 0;
2509 }
2510
BAMFileGetReadGroupCount(const BAMFile * cself,unsigned * rhs)2511 LIB_EXPORT rc_t CC BAMFileGetReadGroupCount(const BAMFile *cself, unsigned *rhs)
2512 {
2513 *rhs = cself->readGroups;
2514 return 0;
2515 }
2516
BAMFileGetReadGroup(const BAMFile * cself,unsigned i,const BAMReadGroup ** rhs)2517 LIB_EXPORT rc_t CC BAMFileGetReadGroup(const BAMFile *cself, unsigned i, const BAMReadGroup **rhs)
2518 {
2519 *rhs = NULL;
2520 if (i < cself->readGroups)
2521 *rhs = &cself->readGroup[i];
2522 return 0;
2523 }
2524
BAMFileGetHeaderText(BAMFile const * cself,char const ** header,size_t * header_len)2525 LIB_EXPORT rc_t CC BAMFileGetHeaderText(BAMFile const *cself, char const **header, size_t *header_len)
2526 {
2527 *header = cself->header;
2528 *header_len = *header ? string_size( *header ) : 0;
2529 return 0;
2530 }
2531
2532 /* MARK: BAM Alignment destructor */
2533
BAMAlignmentWhack(BAMAlignment * self)2534 static rc_t BAMAlignmentWhack(BAMAlignment *self)
2535 {
2536 if (self->parent->bufLocker == self)
2537 self->parent->bufLocker = NULL;
2538 if (self != self->parent->nocopy) {
2539 BAMFileRelease(self->parent);
2540 free(self->storage);
2541 free(self);
2542 }
2543 return 0;
2544 }
2545
2546 /* MARK: BAM Alignment ref-counting */
2547
BAMAlignmentAddRef(const BAMAlignment * cself)2548 LIB_EXPORT rc_t CC BAMAlignmentAddRef(const BAMAlignment *cself)
2549 {
2550 if (cself != NULL)
2551 KRefcountAdd(&cself->refcount, "BAMAlignment");
2552 return 0;
2553 }
2554
BAMAlignmentRelease(const BAMAlignment * cself)2555 LIB_EXPORT rc_t CC BAMAlignmentRelease(const BAMAlignment *cself)
2556 {
2557 if (cself && KRefcountDrop(&cself->refcount, "BAMAlignment") == krefWhack)
2558 BAMAlignmentWhack((BAMAlignment *)cself);
2559
2560 return 0;
2561 }
2562
2563 #if 0
2564 LIB_EXPORT uint16_t CC BAMAlignmentIffyFields(const BAMAlignment *self)
2565 {
2566 }
2567
2568 LIB_EXPORT uint16_t CC BAMAlignmentBadFields(const BAMAlignment *self)
2569 {
2570 }
2571 #endif
2572
2573 /* MARK: BAM Alignment accessors */
2574
BAMAlignmentGetFilePos(const BAMAlignment * self)2575 LIB_EXPORT BAMFilePosition BAMAlignmentGetFilePos(const BAMAlignment *self)
2576 {
2577 return self->pos;
2578 }
2579
BAMAlignmentGetCigarElement(const BAMAlignment * self,unsigned i)2580 static uint32_t BAMAlignmentGetCigarElement(const BAMAlignment *self, unsigned i)
2581 {
2582 return LE2HUI32(&((uint8_t const *)getCigarBase(self))[i * 4]);
2583 }
2584
BAMAlignmentGetRefSeqId(const BAMAlignment * cself,int32_t * rhs)2585 LIB_EXPORT rc_t CC BAMAlignmentGetRefSeqId(const BAMAlignment *cself, int32_t *rhs)
2586 {
2587 *rhs = getRefSeqId(cself);
2588 return 0;
2589 }
2590
BAMAlignmentGetPosition(const BAMAlignment * cself,int64_t * rhs)2591 LIB_EXPORT rc_t CC BAMAlignmentGetPosition(const BAMAlignment *cself, int64_t *rhs)
2592 {
2593 *rhs = getPosition(cself);
2594 return 0;
2595 }
2596
BAMAlignmentIsMapped(const BAMAlignment * cself)2597 LIB_EXPORT bool CC BAMAlignmentIsMapped(const BAMAlignment *cself)
2598 {
2599 if (((getFlags(cself) & BAMFlags_SelfIsUnmapped) == 0) && getRefSeqId(cself) >= 0 && getPosition(cself) >= 0 && getCigarCount(cself) > 0)
2600 return true;
2601 return false;
2602 }
2603
2604 /* static bool BAMAlignmentIsMateMapped(const BAMAlignment *cself)
2605 {
2606 if (((getFlags(cself) & BAMFlags_MateIsUnmapped) == 0) && getMateRefSeqId(cself) >= 0 && getMatePos(cself) >= 0)
2607 return true;
2608 return false;
2609 } */
2610
BAMAlignmentGetAlignmentDetail(const BAMAlignment * self,BAMAlignmentDetail * rslt,uint32_t count,uint32_t * actual,int32_t * pfirst,int32_t * plast)2611 LIB_EXPORT rc_t CC BAMAlignmentGetAlignmentDetail(
2612 const BAMAlignment *self,
2613 BAMAlignmentDetail *rslt, uint32_t count, uint32_t *actual,
2614 int32_t *pfirst, int32_t *plast
2615 )
2616 {
2617 unsigned i;
2618 unsigned ccnt; /* cigar count */
2619 int32_t gpos; /* refSeq pos in global coordinates */
2620 unsigned rpos; /* read pos (always local coordinates) */
2621 uint32_t rlen; /* read length */
2622 int32_t first = -1;
2623 int32_t last = -1;
2624
2625 if (!self)
2626 return RC(rcAlign, rcFile, rcReading, rcSelf, rcNull);
2627
2628 rlen = getReadLen(self);
2629 ccnt = getCigarCount(self);
2630 gpos = getPosition(self);
2631
2632 if (gpos < 0)
2633 ccnt = 0;
2634
2635 if (actual)
2636 *actual = ccnt;
2637
2638 if (pfirst)
2639 *pfirst = -1;
2640
2641 if (plast)
2642 *plast = -1;
2643
2644 if (ccnt == 0)
2645 return 0;
2646
2647 if (rslt == NULL) {
2648 if (actual == NULL)
2649 return RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
2650 count = 0;
2651 }
2652
2653 if (count < ccnt)
2654 return RC(rcAlign, rcFile, rcReading, rcBuffer, rcInsufficient);
2655
2656 for (rpos = 0, i = 0; i != ccnt; ++i) {
2657 uint32_t len = BAMAlignmentGetCigarElement(self, i);
2658 int op = len & 0x0F;
2659
2660 if (op > sizeof(cigarChars))
2661 return RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2662
2663 op = cigarChars[op];
2664 len >>= 4;
2665
2666 rslt[i].refSeq_pos = gpos;
2667 rslt[i].read_pos = rpos;
2668 rslt[i].length = len;
2669 rslt[i].type = (BAMCigarType)op;
2670
2671 /* because we're assigning unsigned to signed below */
2672 assert ( ( int32_t ) i >= 0 );
2673
2674 switch ((BAMCigarType)op) {
2675 case ct_Match:
2676 case ct_Equal:
2677 if (first == -1)
2678 first = ( int32_t ) i;
2679 last = ( int32_t ) i;
2680 gpos += len;
2681 rpos += len;
2682 break;
2683 case ct_Insert:
2684 case ct_SoftClip:
2685 gpos += len;
2686 break;
2687 case ct_Delete:
2688 case ct_Skip:
2689 rpos += len;
2690 break;
2691 case ct_HardClip:
2692 case ct_Padded:
2693 rslt[i].refSeq_pos = -1;
2694 rslt[i].read_pos = -1;
2695 break;
2696 default:
2697 break;
2698 }
2699
2700 if (rslt[i].read_pos > rlen)
2701 return RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2702 }
2703 if (pfirst)
2704 *pfirst = first;
2705
2706 if (plast)
2707 *plast = last;
2708
2709 return 0;
2710 }
2711
2712 static
ReferenceLengthFromCIGAR(const BAMAlignment * self)2713 unsigned ReferenceLengthFromCIGAR(const BAMAlignment *self)
2714 {
2715 unsigned i;
2716 unsigned n = getCigarCount(self);
2717 unsigned y;
2718
2719 for (i = 0, y = 0; i != n; ++i) {
2720 uint32_t const len = BAMAlignmentGetCigarElement(self, i);
2721
2722 switch (cigarChars[len & 0x0F]) {
2723 case ct_Match:
2724 case ct_Equal:
2725 case ct_NotEqual:
2726 case ct_Delete:
2727 case ct_Skip:
2728 y += len >> 4;
2729 break;
2730 default:
2731 break;
2732 }
2733 }
2734 return y;
2735 }
2736
2737 static
SequenceLengthFromCIGAR(const BAMAlignment * self)2738 unsigned SequenceLengthFromCIGAR(const BAMAlignment *self)
2739 {
2740 unsigned i;
2741 unsigned n = getCigarCount(self);
2742 unsigned y;
2743
2744 for (i = 0, y = 0; i != n; ++i) {
2745 uint32_t const len = BAMAlignmentGetCigarElement(self, i);
2746
2747 switch (cigarChars[len & 0x0F]) {
2748 case ct_Match:
2749 case ct_Equal:
2750 case ct_NotEqual:
2751 case ct_Insert:
2752 case ct_SoftClip:
2753 y += len >> 4;
2754 break;
2755 default:
2756 break;
2757 }
2758 }
2759 return y;
2760 }
2761
BAMAlignmentGetPosition2(const BAMAlignment * cself,int64_t * rhs,uint32_t * length)2762 LIB_EXPORT rc_t CC BAMAlignmentGetPosition2(const BAMAlignment *cself, int64_t *rhs, uint32_t *length)
2763 {
2764 *rhs = getPosition(cself);
2765 if (*rhs >= 0)
2766 *length = ReferenceLengthFromCIGAR(cself);
2767 return 0;
2768 }
2769
BAMAlignmentGetReadGroupName(const BAMAlignment * cself,const char ** rhs)2770 LIB_EXPORT rc_t CC BAMAlignmentGetReadGroupName(const BAMAlignment *cself, const char **rhs)
2771 {
2772 *rhs = get_RG(cself);
2773 return 0;
2774 }
2775
BAMAlignmentGetReadName(const BAMAlignment * cself,const char ** rhs)2776 LIB_EXPORT rc_t CC BAMAlignmentGetReadName(const BAMAlignment *cself, const char **rhs)
2777 {
2778 *rhs = getReadName(cself);
2779 return 0;
2780 }
2781
BAMAlignmentGetReadName2(const BAMAlignment * cself,const char ** rhs,size_t * length)2782 LIB_EXPORT rc_t CC BAMAlignmentGetReadName2(const BAMAlignment *cself, const char **rhs, size_t *length)
2783 {
2784 *length = getReadNameLength(cself) - 1;
2785 *rhs = getReadName(cself);
2786 return 0;
2787 }
2788
BAMAlignmentGetReadName3(const BAMAlignment * cself,const char ** rhs,size_t * length)2789 LIB_EXPORT rc_t CC BAMAlignmentGetReadName3(const BAMAlignment *cself, const char **rhs, size_t *length)
2790 {
2791 char const *const name = getReadName(cself);
2792 size_t len = getReadNameLength(cself);
2793 size_t i;
2794
2795 for (i = len; i; ) {
2796 int const ch = name[--i];
2797
2798 if (ch == '/') {
2799 len = i;
2800 break;
2801 }
2802 if (!isdigit(ch))
2803 break;
2804 }
2805 *rhs = name;
2806 *length = len;
2807
2808 return 0;
2809 }
2810
BAMAlignmentGetFlags(const BAMAlignment * cself,uint16_t * rhs)2811 LIB_EXPORT rc_t CC BAMAlignmentGetFlags(const BAMAlignment *cself, uint16_t *rhs)
2812 {
2813 *rhs = getFlags(cself);
2814 return 0;
2815 }
2816
BAMAlignmentGetMapQuality(const BAMAlignment * cself,uint8_t * rhs)2817 LIB_EXPORT rc_t CC BAMAlignmentGetMapQuality(const BAMAlignment *cself, uint8_t *rhs)
2818 {
2819 *rhs = getMapQual(cself);
2820 return 0;
2821 }
2822
BAMAlignmentGetCigarCount(const BAMAlignment * cself,unsigned * rhs)2823 LIB_EXPORT rc_t CC BAMAlignmentGetCigarCount(const BAMAlignment *cself, unsigned *rhs)
2824 {
2825 *rhs = getCigarCount(cself);
2826 return 0;
2827 }
2828
BAMAlignmentGetRawCigar(const BAMAlignment * cself,uint32_t const * rslt[],uint32_t * length)2829 LIB_EXPORT rc_t CC BAMAlignmentGetRawCigar(const BAMAlignment *cself, uint32_t const *rslt[], uint32_t *length)
2830 {
2831 *rslt = getCigarBase(cself);
2832 *length = getCigarCount(cself);
2833 return 0;
2834 }
2835
BAMAlignmentGetCigar(const BAMAlignment * cself,uint32_t i,BAMCigarType * type,uint32_t * length)2836 LIB_EXPORT rc_t CC BAMAlignmentGetCigar(const BAMAlignment *cself, uint32_t i, BAMCigarType *type, uint32_t *length)
2837 {
2838 uint32_t x;
2839
2840 if (i >= getCigarCount(cself))
2841 return RC(rcAlign, rcFile, rcReading, rcParam, rcInvalid);
2842
2843 x = BAMAlignmentGetCigarElement(cself, i);
2844 *type = (BAMCigarType)(cigarChars[x & 0x0F]);
2845 *length = x >> 4;
2846 return 0;
2847 }
2848
BAMAlignmentGetReadLength(const BAMAlignment * cself,uint32_t * rhs)2849 LIB_EXPORT rc_t CC BAMAlignmentGetReadLength(const BAMAlignment *cself, uint32_t *rhs)
2850 {
2851 *rhs = getReadLen(cself);
2852 return 0;
2853 }
2854
BAMAlignmentGetSequence2(const BAMAlignment * cself,char * rhs,uint32_t start,uint32_t stop)2855 LIB_EXPORT rc_t CC BAMAlignmentGetSequence2(const BAMAlignment *cself, char *rhs, uint32_t start, uint32_t stop)
2856 {
2857 /*
2858 * = A C M G R S V T W Y H K D B N
2859 * 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
2860 * 1111 1000 0100 1100 0010 1010 0110 1110 0001 1001 0101 1101 0011 1011 0111 0000
2861 * N T G K C Y S B A W R D M H V =
2862 */
2863 static const char tr[16] = "=ACMGRSVTWYHKDBN";
2864 /* static const char ctr[16] = "=TGKCYSBAWRDMHVN"; */
2865 unsigned const n = getReadLen(cself);
2866 const uint8_t * const seq = &cself->data->raw[cself->seq];
2867 unsigned si, di;
2868
2869 if (stop == 0 || stop > n)
2870 stop = n;
2871
2872 for (di = 0, si = start; si != stop; ++si, ++di) {
2873 unsigned const b4na2 = seq[si >> 1];
2874 unsigned const b4na = (si & 1) == 0 ? (b4na2 >> 4) : (b4na2 & 0x0F);
2875
2876 rhs[di] = tr[b4na];
2877 }
2878 return 0;
2879 }
2880
BAMAlignmentGetSequence(const BAMAlignment * cself,char * rhs)2881 LIB_EXPORT rc_t CC BAMAlignmentGetSequence(const BAMAlignment *cself, char *rhs)
2882 {
2883 return BAMAlignmentGetSequence2(cself, rhs, 0, 0);
2884 }
2885
BAMAlignmentHasColorSpace(BAMAlignment const * cself)2886 LIB_EXPORT bool CC BAMAlignmentHasColorSpace(BAMAlignment const *cself)
2887 {
2888 return get_CS(cself) != NULL;
2889 }
2890
BAMAlignmentGetCSKey(BAMAlignment const * cself,char rhs[1])2891 LIB_EXPORT rc_t CC BAMAlignmentGetCSKey(BAMAlignment const *cself, char rhs[1])
2892 {
2893 char const *const vCS = get_CS(cself);
2894
2895 if (vCS)
2896 rhs[0] = vCS[0];
2897 return 0;
2898 }
2899
BAMAlignmentGetCSSeqLen(BAMAlignment const * cself,uint32_t * rhs)2900 LIB_EXPORT rc_t CC BAMAlignmentGetCSSeqLen(BAMAlignment const *cself, uint32_t *rhs)
2901 {
2902 struct offset_size_s const *const vCS = get_CS_info(cself);
2903
2904 *rhs = vCS ? vCS->size - 5 : 0;
2905 return 0;
2906 }
2907
BAMAlignmentGetCSSequence(BAMAlignment const * cself,char rhs[],uint32_t seqlen)2908 LIB_EXPORT rc_t CC BAMAlignmentGetCSSequence(BAMAlignment const *cself, char rhs[], uint32_t seqlen)
2909 {
2910 char const *const vCS = get_CS(cself);
2911
2912 if (vCS) {
2913 unsigned i;
2914
2915 for (i = 0;i != seqlen; ++i) {
2916 char const ch = vCS[i+1];
2917
2918 rhs[i] = (ch == '4') ? '.' : ch;
2919 }
2920 }
2921 return 0;
2922 }
2923
BAMAlignmentGetQuality(const BAMAlignment * cself,const uint8_t ** rhs)2924 LIB_EXPORT rc_t CC BAMAlignmentGetQuality(const BAMAlignment *cself, const uint8_t **rhs)
2925 {
2926 *rhs = &cself->data->raw[cself->qual];
2927 return 0;
2928 }
2929
BAMAlignmentGetQuality2(BAMAlignment const * cself,uint8_t const ** rhs,uint8_t * offset)2930 LIB_EXPORT rc_t CC BAMAlignmentGetQuality2(BAMAlignment const *cself, uint8_t const **rhs, uint8_t *offset)
2931 {
2932 uint8_t const *const OQ = get_OQ(cself);
2933
2934 if (OQ) {
2935 struct offset_size_s const *const oq = get_OQ_info(cself);
2936
2937 if (oq->size - 4 == getReadLen(cself)) {
2938 *offset = 33;
2939 *rhs = OQ;
2940 }
2941 else
2942 return RC(rcAlign, rcRow, rcReading, rcData, rcInconsistent);
2943 }
2944 else {
2945 *offset = 0;
2946 *rhs = &cself->data->raw[cself->qual];
2947 }
2948 return 0;
2949 }
2950
BAMAlignmentGetCSQuality(BAMAlignment const * cself,uint8_t const ** rhs,uint8_t * offset)2951 LIB_EXPORT rc_t CC BAMAlignmentGetCSQuality(BAMAlignment const *cself, uint8_t const **rhs, uint8_t *offset)
2952 {
2953 struct offset_size_s const *const cs = get_CS_info(cself);
2954 struct offset_size_s const *const cq = get_CQ_info(cself);
2955 uint8_t const *const CQ = get_CQ(cself);
2956
2957 if (cs && cq && CQ) {
2958 if (cs->size == cq->size) {
2959 *offset = 33;
2960 *rhs = CQ + 1;
2961 return 0;
2962 }
2963 if (cs->size == cq->size + 1) {
2964 *offset = 33;
2965 *rhs = CQ;
2966 return 0;
2967 }
2968 return RC(rcAlign, rcRow, rcReading, rcData, rcInconsistent);
2969 }
2970 *offset = 0;
2971 *rhs = &cself->data->raw[cself->qual];
2972 return 0;
2973 }
2974
BAMAlignmentGetMateRefSeqId(const BAMAlignment * cself,int32_t * rhs)2975 LIB_EXPORT rc_t CC BAMAlignmentGetMateRefSeqId(const BAMAlignment *cself, int32_t *rhs)
2976 {
2977 *rhs = getMateRefSeqId(cself);
2978 return 0;
2979 }
2980
BAMAlignmentGetMatePosition(const BAMAlignment * cself,int64_t * rhs)2981 LIB_EXPORT rc_t CC BAMAlignmentGetMatePosition(const BAMAlignment *cself, int64_t *rhs)
2982 {
2983 *rhs = getMatePos(cself);
2984 return 0;
2985 }
2986
BAMAlignmentGetInsertSize(const BAMAlignment * cself,int64_t * rhs)2987 LIB_EXPORT rc_t CC BAMAlignmentGetInsertSize(const BAMAlignment *cself, int64_t *rhs)
2988 {
2989 *rhs = getInsertSize(cself);
2990 return 0;
2991 }
2992
FormatOptData(BAMAlignment const * const self,size_t const maxsize,char buffer[])2993 static int FormatOptData(BAMAlignment const *const self,
2994 size_t const maxsize,
2995 char buffer[])
2996 {
2997 char const *const base = (char const *)&self->data->raw[self->qual + getReadLen(self)];
2998 unsigned i;
2999 unsigned offset;
3000 unsigned cur = 0;
3001 int j;
3002
3003 for (i = 0, offset = 0; i < self->numExtra; ++i) {
3004 int type;
3005 union { float f; uint32_t i; } fi;
3006
3007 if (cur + 7 > maxsize)
3008 return -1;
3009 buffer[cur++] = '\t';
3010 buffer[cur++] = base[offset++];
3011 buffer[cur++] = base[offset++];
3012 buffer[cur++] = ':';
3013 type = base[offset++];
3014
3015 switch (type) {
3016 case dt_ASCII: /* A */
3017 buffer[cur++] = 'A';
3018 buffer[cur++] = ':';
3019 buffer[cur++] = base[offset++];
3020 break;
3021
3022 case dt_INT8: /* c */
3023 buffer[cur++] = 'i';
3024 buffer[cur++] = ':';
3025 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)*((int8_t const *)(base + offset)));
3026 if ((cur += j) >= maxsize)
3027 return -1;
3028 offset += 1;
3029 break;
3030
3031 case dt_UINT8: /* C */
3032 buffer[cur++] = 'i';
3033 buffer[cur++] = ':';
3034 j = snprintf(buffer + cur, maxsize - cur, "%u", (unsigned)*((uint8_t const *)(base + offset)));
3035 if ((cur += j) >= maxsize)
3036 return -1;
3037 offset += 1;
3038 break;
3039
3040 case dt_INT16: /* s */
3041 buffer[cur++] = 'i';
3042 buffer[cur++] = ':';
3043 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)LE2HI16(base + offset));
3044 if ((cur += j) >= maxsize)
3045 return -1;
3046 offset += 2;
3047 break;
3048
3049 case dt_UINT16: /* S */
3050 buffer[cur++] = 'i';
3051 buffer[cur++] = ':';
3052 j = snprintf(buffer + cur, maxsize - cur, "%u", (unsigned)LE2HUI16(base + offset));
3053 if ((cur += j) >= maxsize)
3054 return -1;
3055 offset += 2;
3056 break;
3057
3058 case dt_INT: /* i */
3059 buffer[cur++] = 'i';
3060 buffer[cur++] = ':';
3061 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)LE2HI32(base + offset));
3062 if ((cur += j) >= maxsize)
3063 return -1;
3064 offset += 4;
3065 break;
3066
3067 case dt_UINT: /* I */
3068 buffer[cur++] = 'i';
3069 buffer[cur++] = ':';
3070 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)LE2HI32(base + offset));
3071 if ((cur += j) >= maxsize)
3072 return -1;
3073 offset += 4;
3074 break;
3075
3076 case dt_FLOAT32: /* f */
3077 buffer[cur++] = 'f';
3078 buffer[cur++] = ':';
3079 fi.i = LE2HUI32(base + offset);
3080 j = snprintf(buffer + cur, maxsize - cur, "%f", fi.f);
3081 if ((cur += j) >= maxsize)
3082 return -1;
3083 offset += 4;
3084 break;
3085
3086 case dt_HEXSTRING: /* H */
3087 case dt_CSTRING: /* Z */
3088 buffer[cur++] = type == dt_CSTRING ? 'Z' : 'H';
3089 buffer[cur++] = ':';
3090 for ( ; ; ) {
3091 int const ch = base[offset++];
3092
3093 if (ch == '\0')
3094 break;
3095 if (cur >= maxsize)
3096 return -1;
3097 buffer[cur++] = ch;
3098 }
3099 break;
3100
3101 case dt_NUM_ARRAY: /* B */
3102 buffer[cur++] = 'B';
3103 buffer[cur++] = ':';
3104 {
3105 int const elemtype = base[offset++];
3106 unsigned const elemcount = LE2HUI32(base + offset);
3107 unsigned k;
3108
3109 if (cur + 2 >= maxsize)
3110 return -1;
3111 buffer[cur++] = elemtype;
3112 offset += 4;
3113 for (k = 0; k < elemcount; ++k) {
3114 buffer[cur++] = ',';
3115 switch (elemtype) {
3116 case dt_INT8:
3117 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)*((int8_t const *)(base + offset)));
3118 if ((cur += j) >= maxsize)
3119 return -1;
3120 offset += 1;
3121 break;
3122
3123 case dt_UINT8:
3124 j = snprintf(buffer + cur, maxsize - cur, "%u", (unsigned)*((uint8_t const *)(base + offset)));
3125 if ((cur += j) >= maxsize)
3126 return -1;
3127 offset += 1;
3128 break;
3129
3130 case dt_INT16:
3131 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)LE2HI16(base + offset));
3132 if ((cur += j) >= maxsize)
3133 return -1;
3134 offset += 2;
3135 break;
3136
3137 case dt_UINT16:
3138 j = snprintf(buffer + cur, maxsize - cur, "%u", (unsigned)LE2HUI16(base + offset));
3139 if ((cur += j) >= maxsize)
3140 return -1;
3141 offset += 2;
3142 break;
3143
3144 case dt_INT:
3145 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)LE2HI32(base + offset));
3146 if ((cur += j) >= maxsize)
3147 return -1;
3148 offset += 4;
3149 break;
3150
3151 case dt_UINT:
3152 j = snprintf(buffer + cur, maxsize - cur, "%u", (unsigned)LE2HUI32(base + offset));
3153 if ((cur += j) >= maxsize)
3154 return -1;
3155 offset += 4;
3156 break;
3157
3158 case dt_FLOAT32:
3159 fi.i = LE2HUI32(base + offset);
3160 j = snprintf(buffer + cur, maxsize - cur, "%f", fi.f);
3161 if ((cur += j) >= maxsize)
3162 return -1;
3163 offset += 4;
3164
3165 default:
3166 return -1;
3167 break;
3168 }
3169 }
3170 }
3171 break;
3172
3173 default:
3174 return -1;
3175 break;
3176 }
3177 }
3178 return cur;
3179 }
3180
FormatSAM(BAMAlignment const * self,size_t * const actsize,size_t const maxsize,char * const buffer)3181 static rc_t FormatSAM(BAMAlignment const *self,
3182 size_t *const actsize,
3183 size_t const maxsize,
3184 char *const buffer)
3185 {
3186 int i = 0;
3187 size_t cur = 0;
3188 unsigned j;
3189 int const refSeqId = getRefSeqId(self);
3190 int const refPos = getPosition(self);
3191 unsigned const cigCount = getCigarCount(self);
3192 uint32_t const *const cigar = getCigarBase(self);
3193 int const mateRefSeqId = getMateRefSeqId(self);
3194 int const mateRefPos = getMatePos(self);
3195 unsigned const readlen = getReadLen(self);
3196
3197 i = snprintf(&buffer[cur], maxsize - cur,
3198 "%s\t%i\t%s\t%i\t%i\t",
3199 getReadName(self),
3200 getFlags(self),
3201 refSeqId < 0 ? "*" : self->parent->refSeq[refSeqId].name,
3202 refPos < 0 ? 0 : refPos + 1,
3203 getMapQual(self)
3204 );
3205 if ((cur += i) > maxsize)
3206 return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3207
3208 if (cigCount > 0) {
3209 for (j = 0; j < cigCount; ++j) {
3210 uint32_t const el = cigar[j];
3211 BAMCigarType const type = (BAMCigarType)(cigarChars[el & 0x0F]);
3212 unsigned const length = el >> 4;
3213
3214 i = snprintf(&buffer[cur], maxsize - cur, "%u%c", length, type);
3215 if ((cur += i) > maxsize)
3216 return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3217 }
3218 }
3219 else {
3220 if ((cur + 1) > maxsize)
3221 return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3222 buffer[cur++] = '*';
3223 }
3224 i = snprintf(&buffer[cur], maxsize - cur,
3225 "\t%s\t%i\t%i\t",
3226 mateRefSeqId < 0 ? "*" : mateRefSeqId == refSeqId ? "=" : self->parent->refSeq[mateRefSeqId].name,
3227 mateRefPos < 0 ? 0 : mateRefPos + 1,
3228 getInsertSize(self)
3229 );
3230 if ((cur += i) > maxsize)
3231 return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3232 if (readlen) {
3233 uint8_t const *const qual = &self->data->raw[self->qual];
3234
3235 if (cur + 2 * readlen + 1 > maxsize)
3236 return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3237 BAMAlignmentGetSequence(self, &buffer[cur]);
3238 cur += readlen;
3239 buffer[cur] = '\t';
3240 ++cur;
3241
3242 for (j = 0; j < readlen; ++j) {
3243 if (qual[j] != 0xFF)
3244 goto HAS_QUAL;
3245 }
3246 if (1) {
3247 buffer[cur++] = '*';
3248 }
3249 else {
3250 HAS_QUAL:
3251 for (j = 0; j < readlen; ++j)
3252 buffer[cur++] = qual[j] + 33;
3253 }
3254 }
3255 else {
3256 i = snprintf(&buffer[cur], maxsize - cur, "*\t*");
3257 if ((cur += i) > maxsize)
3258 return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3259 }
3260 i = FormatOptData(self, maxsize - cur, &buffer[cur]);
3261 if (i < 0)
3262 return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3263 if ((cur += i) + 2 > maxsize)
3264 return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3265 buffer[cur++] = '\n';
3266 buffer[cur] = '\0';
3267 *actsize = cur;
3268
3269 return 0;
3270 }
3271
3272 #define FORMAT_SAM_SCRATCH_SIZE ((size_t)(64u * 1024u))
FormatSAMBuffer(BAMAlignment const * self,size_t actSize[],size_t const maxsize,char * const buffer)3273 static rc_t FormatSAMBuffer(BAMAlignment const *self,
3274 size_t actSize[],
3275 size_t const maxsize,
3276 char *const buffer)
3277 {
3278 char scratch[FORMAT_SAM_SCRATCH_SIZE];
3279 size_t actsize = 0;
3280 rc_t const rc = FormatSAM(self, &actsize, FORMAT_SAM_SCRATCH_SIZE, scratch);
3281
3282 actSize[0] = actsize;
3283 if (rc) return rc;
3284
3285 if (actsize > maxsize)
3286 return RC(rcAlign, rcReading, rcRow, rcBuffer, rcInsufficient);
3287
3288 memmove(buffer, scratch, actsize);
3289 return 0;
3290 }
3291
BAMAlignmentFormatSAM(BAMAlignment const * self,size_t * const actSize,size_t const maxsize,char * const buffer)3292 LIB_EXPORT rc_t CC BAMAlignmentFormatSAM(BAMAlignment const *self,
3293 size_t *const actSize,
3294 size_t const maxsize,
3295 char *const buffer)
3296 {
3297 if (self == NULL)
3298 return RC(rcAlign, rcReading, rcRow, rcSelf, rcNull);
3299 if (buffer == NULL)
3300 return RC(rcAlign, rcReading, rcRow, rcParam, rcNull);
3301 else {
3302 size_t actsize = 0;
3303 rc_t const rc = (maxsize < FORMAT_SAM_SCRATCH_SIZE ? FormatSAMBuffer : FormatSAM)(self, &actsize, maxsize, buffer);
3304
3305 if (actSize)
3306 *actSize = actsize;
3307 return rc;
3308 }
3309 }
3310
3311 typedef struct OptForEach_ctx_s {
3312 BAMOptData *val;
3313 BAMOptData **alloced;
3314 size_t valsize;
3315 rc_t rc;
3316 BAMOptionalDataFunction user_f;
3317 void *user_ctx;
3318 } OptForEach_ctx_t;
3319
i_OptDataForEach(BAMAlignment const * cself,void * Ctx,char const tag[2],BAMOptDataValueType type,unsigned count,void const * value,unsigned size)3320 static bool i_OptDataForEach(BAMAlignment const *cself, void *Ctx, char const tag[2], BAMOptDataValueType type, unsigned count, void const *value, unsigned size)
3321 {
3322 OptForEach_ctx_t *ctx = (OptForEach_ctx_t *)Ctx;
3323 size_t const need = (size_t)&((BAMOptData const *)NULL)->u.f64[(count * size + sizeof(double) - 1)/sizeof(double)];
3324
3325 if (need > ctx->valsize) {
3326 void *const temp = realloc(ctx->alloced, need);
3327 if (temp == NULL) {
3328 ctx->rc = RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
3329 return true;
3330 }
3331 *ctx->alloced = ctx->val = temp;
3332 ctx->valsize = need;
3333 }
3334 ctx->val->type = type;
3335 ctx->val->element_count = (type == dt_CSTRING || type == dt_HEXSTRING) ? size - 1 : count;
3336
3337 memmove(ctx->val->u.u8, value, size * count);
3338 #if __BYTE_ORDER == __BIG_ENDIAN
3339 {{
3340 unsigned di;
3341 uint32_t elem_count = ctx->val->element_count;
3342
3343 switch (size) {
3344 case 2:
3345 for (di = 0; di != elem_count; ++di)
3346 ctx->val->u.u16[di] = LE2HUI16(&ctx->val->u.u16[di]);
3347 break;
3348 case 4:
3349 for (di = 0; di != elem_count; ++di)
3350 ctx->val->u.u32[di] = LE2HUI32(&ctx->val->u.u32[di]);
3351 break;
3352 case 8:
3353 for (di = 0; di != elem_count; ++di)
3354 ctx->val->u.u64[di] = LE2HUI64(&ctx->val->u.u64[di]);
3355 break;
3356 }
3357 }}
3358 #endif
3359 ctx->rc = ctx->user_f(ctx->user_ctx, tag, ctx->val);
3360 return ctx->rc != 0;
3361 }
3362
BAMAlignmentOptDataForEach(const BAMAlignment * cself,void * user_ctx,BAMOptionalDataFunction f)3363 LIB_EXPORT rc_t CC BAMAlignmentOptDataForEach(const BAMAlignment *cself, void *user_ctx, BAMOptionalDataFunction f)
3364 {
3365 union u {
3366 BAMOptData value;
3367 uint8_t storage[4096];
3368 } value_auto;
3369 OptForEach_ctx_t ctx;
3370 rc_t rc = 0;
3371 unsigned i;
3372
3373 ctx.val = &value_auto.value;
3374 ctx.alloced = NULL;
3375 ctx.valsize = sizeof(value_auto);
3376 ctx.rc = 0;
3377 ctx.user_f = f;
3378 ctx.user_ctx = user_ctx;
3379
3380 for (i = 0; i != cself->numExtra; ++i) {
3381 char const *const tag = (char const *)&cself->data->raw[cself->extra[i].offset];
3382 uint8_t type = tag[2];
3383 uint8_t const *const vp = (uint8_t const *)&tag[3];
3384 unsigned len = cself->extra[i].size - 3;
3385 unsigned size = cself->extra[i].size - 3;
3386 unsigned count = 1;
3387 unsigned offset = 0;
3388
3389 if (type == dt_NUM_ARRAY) {
3390 unsigned elem_size = 0;
3391 uint32_t elem_count = 0;
3392
3393 offset = len = 5;
3394 switch (vp[0]) {
3395 case dt_INT8:
3396 case dt_UINT8:
3397 elem_size = 1;
3398 break;
3399 case dt_INT16:
3400 case dt_UINT16:
3401 elem_size = 2;
3402 break;
3403 case dt_FLOAT32:
3404 case dt_INT:
3405 case dt_UINT:
3406 elem_size = 4;
3407 break;
3408 #if 0
3409 case dt_FLOAT64:
3410 elem_size = 8;
3411 break;
3412 #endif
3413 default:
3414 rc = RC(rcAlign, rcFile, rcReading, rcData, rcUnexpected);
3415 break;
3416 }
3417 if (rc)
3418 break;
3419 elem_count = LE2HUI32(&vp[1]);
3420 len += elem_size * elem_count;
3421 type = vp[0];
3422 count = elem_count;
3423 size = elem_size;
3424 break;
3425 }
3426 if (i_OptDataForEach(cself, &ctx, tag, type, count, &vp[offset], size))
3427 break;
3428 }
3429 rc = rc ? rc : ctx.rc;
3430 if (ctx.alloced)
3431 free(ctx.alloced);
3432 return rc;
3433 }
3434
3435 /* MARK: Complete Genomics stuff */
3436
BAMAlignmentHasCGData(BAMAlignment const * self)3437 LIB_EXPORT bool CC BAMAlignmentHasCGData(BAMAlignment const *self)
3438 {
3439 return get_CG_GC_info(self) && get_CG_GS_info(self) && get_CG_GQ_info(self);
3440 }
3441
BAMAlignmentParseCGTag(BAMAlignment const * self,size_t const max_cg_segs,unsigned cg_segs[])3442 static bool BAMAlignmentParseCGTag(BAMAlignment const *self, size_t const max_cg_segs, unsigned cg_segs[/* max_cg_segs */])
3443 {
3444 /*** patern in cg_segs should be nSnGnSnG - no more then 7 segments **/
3445 struct offset_size_s const *const GCi = get_CG_GC_info(self);
3446 char const *cg = (char const *)&self->data->raw[GCi->offset + 3];
3447 char const *const end = cg + GCi->size - 4;
3448 unsigned iseg = 0;
3449 char last_op = 'S';
3450
3451 memset(cg_segs, 0, max_cg_segs * sizeof(cg_segs[0]));
3452
3453 while (cg < end && iseg < max_cg_segs) {
3454 char *endp;
3455 long const op_len = strtol(cg, &endp, 10);
3456 char const op = *(cg = endp);
3457
3458 ++cg;
3459 if (op==last_op) {
3460 cg_segs[iseg] += op_len;
3461 }
3462 else {
3463 last_op = op;
3464 ++iseg;
3465 cg_segs[iseg] = (unsigned)op_len;
3466 }
3467 }
3468 return true;
3469 }
3470
3471 static
ExtractInt32(BAMAlignment const * self,int32_t * result,struct offset_size_s const * const tag)3472 rc_t ExtractInt32(BAMAlignment const *self, int32_t *result,
3473 struct offset_size_s const *const tag)
3474 {
3475 int64_t y;
3476 int const type = self->data->raw[tag->offset + 2];
3477 void const *const pvalue = &self->data->raw[tag->offset + 3];
3478
3479 switch (type) {
3480 case 'c':
3481 if (tag->size == 4)
3482 y = *((int8_t const *)pvalue);
3483 else
3484 return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3485 break;
3486 case 'C':
3487 if (tag->size == 4)
3488 y = *((uint8_t const *)pvalue);
3489 else
3490 return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3491 break;
3492 case 's':
3493 if (tag->size == 5)
3494 y = LE2HI16(pvalue);
3495 else
3496 return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3497 break;
3498 case 'S':
3499 if (tag->size == 5)
3500 y = LE2HUI16(pvalue);
3501 else
3502 return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3503 break;
3504 case 'i':
3505 if (tag->size == 7)
3506 y = LE2HI32(pvalue);
3507 else
3508 return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3509 break;
3510 case 'I':
3511 if (tag->size == 7)
3512 y = LE2HUI32(pvalue);
3513 else
3514 return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3515 break;
3516 default:
3517 return RC(rcAlign, rcRow, rcReading, rcData, rcNotFound);
3518 }
3519 if (INT32_MIN <= y && y <= INT32_MAX) {
3520 *result = (int32_t)y;
3521 return 0;
3522 }
3523 return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3524 }
3525
3526 LIB_EXPORT
BAMAlignmentGetCGAlignGroup(BAMAlignment const * self,char buffer[],size_t max_size,size_t * act_size)3527 rc_t CC BAMAlignmentGetCGAlignGroup(BAMAlignment const *self,
3528 char buffer[],
3529 size_t max_size,
3530 size_t *act_size)
3531 {
3532 struct offset_size_s const *const ZA = get_CG_ZA_info(self);
3533 struct offset_size_s const *const ZI = get_CG_ZI_info(self);
3534
3535 if (ZA && ZI) {
3536 rc_t rc;
3537 int32_t za;
3538 int32_t zi;
3539
3540 rc = ExtractInt32(self, &za, ZA); if (rc) return rc;
3541 rc = ExtractInt32(self, &zi, ZI); if (rc) return rc;
3542 return string_printf(buffer, max_size, act_size, "%i_%i", zi, za);
3543 }
3544 return RC(rcAlign, rcRow, rcReading, rcData, rcNotFound);
3545 }
3546
3547 LIB_EXPORT
BAMAlignmentGetCGSeqQual(BAMAlignment const * self,char sequence[],uint8_t quality[])3548 rc_t CC BAMAlignmentGetCGSeqQual(BAMAlignment const *self,
3549 char sequence[],
3550 uint8_t quality[])
3551 {
3552 struct offset_size_s const *const GCi = get_CG_GC_info(self);
3553 struct offset_size_s const *const GSi = get_CG_GS_info(self);
3554 struct offset_size_s const *const GQi = get_CG_GQ_info(self);
3555
3556 if (GCi && GSi && GQi) {
3557 char const *const vGS = (char const *)&self->data->raw[GSi->offset + 3];
3558 char const *const GQ = (char const *)&self->data->raw[GQi->offset + 3];
3559 unsigned const GSsize = GSi->size - 4;
3560 unsigned const sn = getReadLen(self);
3561 unsigned cg_segs[2*CG_NUM_SEGS-1]; /** 4 segments + 3gaps **/
3562 unsigned i,G,S;
3563
3564 if (GSi->size != GQi->size)
3565 return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3566
3567 if (SequenceLengthFromCIGAR(self) != sn)
3568 return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3569
3570 if (!BAMAlignmentParseCGTag(self, 2*CG_NUM_SEGS-1, cg_segs))
3571 return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3572
3573 for (S = cg_segs[0], G = 0, i = 1; i < CG_NUM_SEGS; ++i) { /** sum all S and G **/
3574 S += cg_segs[2*i];
3575 G += cg_segs[2*i-1];
3576 }
3577 if (G + G != GSsize || S + G > sn || sn + G != 35) {
3578 /*fprintf(stderr, "GSsize: %u; sn: %u; S: %u; G: %u\n", GSsize, sn, S, G);*/
3579 return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3580 }
3581 if (G > 0) {
3582 unsigned nsi = cg_segs[0]; /** new index into sequence */
3583 unsigned osi = nsi + G; /** old index into sequence */
3584 unsigned k; /** index into inserted sequence **/
3585
3586 /***make room for inserts **/
3587 memmove(sequence + osi, sequence + nsi, sn - nsi);
3588 memmove(quality + osi, quality + nsi, sn - nsi);
3589
3590 for (i = 1, k = 0; i < CG_NUM_SEGS && nsi < osi; ++i) {/*** when osi and nsi meet we are done ***/
3591 unsigned j;
3592
3593 for (j = cg_segs[2*i-1]; j > 0; --j) { /** insert mode **/
3594 sequence[nsi] = vGS[k];
3595 quality [nsi] = GQ[k] - 33;
3596 ++nsi; ++k;
3597 sequence[nsi] = vGS[k];
3598 quality [nsi] = GQ[k] - 33;
3599 ++nsi;
3600 ++osi;
3601 ++k;
3602 }
3603 if (nsi < osi){
3604 for (j = cg_segs[2*i]; j > 0; --j) { /** copy mode **/
3605 sequence[nsi] = sequence[osi];
3606 quality[nsi] = quality[osi];
3607 ++nsi;
3608 ++osi;
3609 }
3610 }
3611 }
3612 }
3613 return 0;
3614 }
3615 return RC(rcAlign, rcRow, rcReading, rcData, rcNotFound);
3616 }
3617
3618
splice(uint32_t cigar[],unsigned n,unsigned at,unsigned out,unsigned in,uint32_t const new_values[])3619 static unsigned splice(uint32_t cigar[], unsigned n, unsigned at, unsigned out, unsigned in, uint32_t const new_values[/* in */])
3620 {
3621 assert(at + out <= n);
3622 memmove(&cigar[at + in], &cigar[at + out], (n - at - out) * 4);
3623 if (in)
3624 memmove(&cigar[at], new_values, in * 4);
3625 return n + in - out;
3626 }
3627
3628 #define OPCODE_2_FIX (0xF)
3629
insert_B(unsigned S,unsigned G,unsigned const n,uint32_t cigar[])3630 static unsigned insert_B(unsigned S, unsigned G, unsigned const n, uint32_t cigar[/* n */])
3631 {
3632 unsigned i;
3633 unsigned pos;
3634 unsigned const T = S + G;
3635
3636 for (pos = i = 0; i < n; ++i) {
3637 int const opcode = cigar[i] & 0xF;
3638
3639 switch (opcode) {
3640 case 0:
3641 case 1:
3642 case 4:
3643 case 7:
3644 case 8:
3645 {{
3646 unsigned const len = cigar[i] >> 4;
3647 unsigned const nxt = pos + len;
3648
3649 if (pos <= T && T <= nxt) {
3650 unsigned const l = T - pos;
3651 unsigned const r = len - l;
3652 unsigned B = i + 2;
3653 unsigned in = 4;
3654 uint32_t Ops[4];
3655 uint32_t *ops = Ops;
3656
3657 Ops[0] = (l << 4) | opcode;
3658 Ops[1] = (G << 4) | 9; /* B */
3659 Ops[2] = (G << 4) | 0; /* M this is not backwards */
3660 Ops[3] = (r << 4) | opcode;
3661
3662 if (r == 0)
3663 --in;
3664 if (l == 0) {
3665 ++ops;
3666 --in;
3667 --B;
3668 }
3669 return splice(cigar, n, i, 1, in, ops);
3670 }
3671 pos = nxt;
3672 }}
3673 break;
3674 default:
3675 break;
3676 }
3677 }
3678 return n;
3679 }
3680
3681 #if 0
3682 static unsigned fix_I(uint32_t cigar[], unsigned n)
3683 {
3684 unsigned i;
3685 /* int last_b = 0; */
3686
3687 for (i = 0; i < n; ++i) {
3688 unsigned const opcode = cigar[i] & 0xF;
3689
3690 if (opcode == 0xF) {
3691 unsigned const oplen = cigar[i] >> 4;
3692 uint32_t ops[2];
3693
3694 if (0/*last_b*/) {
3695 ops[0] = (oplen << 4) | 0; /* M */
3696 ops[1] = (oplen << 4) | 9; /* B */
3697 }
3698 else {
3699 ops[0] = (oplen << 4) | 9; /* B */
3700 ops[1] = (oplen << 4) | 0; /* M */
3701 }
3702
3703 n = splice(cigar, n, i, 1, 2, ops);
3704 ++i;
3705 }
3706 /* else if (opcode == 9)
3707 last_b = 1;
3708 else
3709 last_b = 0; */
3710 }
3711 return n;
3712 }
3713
3714 static unsigned fix_IN(uint32_t cigar[], unsigned n)
3715 {
3716 unsigned i;
3717
3718 for (i = 1; i < n; ++i) {
3719 unsigned const opL = cigar[i-1] & 0xF;
3720 unsigned const opI = cigar[ i ] & 0xF;
3721
3722 if (opL == 1 && opI == 3) {
3723 unsigned const oplen = cigar[i-1] >> 4;
3724 uint32_t ops[2];
3725
3726 ops[0] = (oplen << 4) | 9; /* B */
3727 ops[1] = (oplen << 4) | 0; /* M */
3728
3729 n = splice(cigar, n, i-1, 1, 2, ops);
3730 ++i;
3731 }
3732 else if (opL == 3 && opI == 1) {
3733 unsigned const oplen = cigar[i] >> 4;
3734 uint32_t ops[2];
3735
3736 ops[0] = (oplen << 4) | 9; /* M */
3737 ops[1] = (oplen << 4) | 0; /* B */
3738
3739 n = splice(cigar, n, i, 1, 2, ops);
3740 ++i;
3741 }
3742 }
3743 return n;
3744 }
3745 #endif
3746
canonicalize(uint32_t cigar[],unsigned n)3747 static unsigned canonicalize(uint32_t cigar[], unsigned n)
3748 {
3749 unsigned i;
3750
3751 for (i = n; i > 0; ) {
3752 --i;
3753 if (cigar[i] >> 4 == 0 || (cigar[i] & 0xF) == 6)
3754 n = splice(cigar, n, i, 1, 0, NULL);
3755 }
3756 for (i = 1; i < n; ) {
3757 unsigned const opL = cigar[i-1] & 0xF;
3758 unsigned const opI = cigar[ i ] & 0xF;
3759
3760 if (opI == opL) {
3761 unsigned const oplen = (cigar[i] >> 4) + (cigar[i-1] >> 4);
3762 uint32_t const op = (oplen << 4) | opI;
3763
3764 n = splice(cigar, n, i-1, 2, 1, &op);
3765 }
3766 else
3767 ++i;
3768 }
3769 #if 0
3770 if ((cigar[0] & 0xF) == 1)
3771 cigar[0] = (cigar[0] & ~(uint32_t)0xF) | 4; /* I -> S */
3772 if ((cigar[n - 1] & 0xF) == 1)
3773 cigar[n - 1] = (cigar[n - 1] & ~(uint32_t)0xF) | 4; /* I -> S */
3774 #endif
3775 return n;
3776 }
3777
3778 /* static void reverse(uint32_t cigar[], unsigned n)
3779 {
3780 unsigned i;
3781 unsigned j;
3782
3783 for (j = n - 1, i = 0; i < j; ++i, --j) {
3784 uint32_t const tmp = cigar[i];
3785 cigar[i] = cigar[j];
3786 cigar[j] = tmp;
3787 }
3788 } */
3789
GetCGCigar(BAMAlignment const * self,unsigned const N,uint32_t cigar[])3790 static unsigned GetCGCigar(BAMAlignment const *self, unsigned const N, uint32_t cigar[/* N */])
3791 {
3792 unsigned i;
3793 unsigned G;
3794 unsigned S;
3795 unsigned n = getCigarCount(self);
3796 unsigned cg_segs[2*CG_NUM_SEGS-1]; /** 4 segments + 3 gaps **/
3797
3798 if (!BAMAlignmentParseCGTag(self, 2*CG_NUM_SEGS-1, cg_segs))
3799 return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3800
3801 if (N < n + 5)
3802 return RC(rcAlign, rcRow, rcReading, rcBuffer, rcInsufficient);
3803
3804 memmove(cigar, getCigarBase(self), n * 4);
3805 n = canonicalize(cigar, n); /* just in case */
3806 for (i = 0, S = 0; i < CG_NUM_SEGS - 1; ++i) {
3807 S += cg_segs[2*i];
3808 G = cg_segs[2*i+1];
3809 if (G > 0) {
3810 n = insert_B(S, G, n, cigar);
3811 S += G;
3812 }
3813 }
3814 return n;
3815 }
3816
3817 LIB_EXPORT
BAMAlignmentGetCGCigar(BAMAlignment const * self,uint32_t * cigar,uint32_t cig_max,uint32_t * cig_act)3818 rc_t CC BAMAlignmentGetCGCigar(BAMAlignment const *self,
3819 uint32_t *cigar,
3820 uint32_t cig_max,
3821 uint32_t *cig_act)
3822 {
3823 struct offset_size_s const *const GCi = get_CG_GC_info(self);
3824
3825 *cig_act = 0;
3826
3827 if (GCi) {
3828 *cig_act = GetCGCigar(self, cig_max, cigar);
3829 return 0;
3830 }
3831 return RC(rcAlign, rcRow, rcReading, rcData, rcNotFound);
3832 }
3833
3834 /* MARK: end CG stuff */
3835
BAMAlignmentGetTI(BAMAlignment const * self,uint64_t * ti)3836 LIB_EXPORT rc_t BAMAlignmentGetTI(BAMAlignment const *self, uint64_t *ti)
3837 {
3838 char const *const TI = get_XT(self);
3839 long long unsigned temp;
3840
3841 if (TI && sscanf(TI, "ti|%llu", &temp) == 1) {
3842 *ti = (uint64_t)temp;
3843 return 0;
3844 }
3845 return RC(rcAlign, rcRow, rcReading, rcData, rcNotFound);
3846 }
3847
BAMAlignmentGetRNAStrand(BAMAlignment const * const self,uint8_t * const rslt)3848 LIB_EXPORT rc_t BAMAlignmentGetRNAStrand(BAMAlignment const *const self, uint8_t *const rslt)
3849 {
3850 if (rslt) {
3851 uint8_t const *const XS = get_XS(self);
3852
3853 *rslt = XS ? XS[0] : ' ';
3854 }
3855 return 0;
3856 }
3857
3858 /* MARK: BAMIndex stuff */
3859
get_pos(uint8_t const buf[])3860 static uint64_t get_pos(uint8_t const buf[])
3861 {
3862 return LE2HUI64(buf);
3863 }
3864
3865 #if 0
3866 static uint16_t bin2ival(uint16_t bin)
3867 {
3868 if (bin < 1)
3869 return 0; /* (bin - 0) << 15; */
3870
3871 if (bin < 9)
3872 return (bin - 1) << 12;
3873
3874 if (bin < 73)
3875 return (bin - 9) << 9;
3876
3877 if (bin < 585)
3878 return (bin - 73) << 6;
3879
3880 if (bin < 4681)
3881 return (bin - 585) << 3;
3882
3883 if (bin < 37449)
3884 return (bin - 4681) << 0;
3885
3886 return 0;
3887 }
3888
3889 static uint16_t bin_ival_count(uint16_t bin)
3890 {
3891 if (bin < 1)
3892 return 1 << 15;
3893
3894 if (bin < 9)
3895 return 1 << 12;
3896
3897 if (bin < 73)
3898 return 1 << 9;
3899
3900 if (bin < 585)
3901 return 1 << 6;
3902
3903 if (bin < 4681)
3904 return 1 << 3;
3905
3906 if (bin < 37449)
3907 return 1;
3908
3909 return 0;
3910 }
3911 #endif
3912
3913 enum BAMIndexStructureTypes {
3914 bai_pairs,
3915 bai_intervals
3916 };
3917
3918 typedef rc_t (*WalkIndexStructureCallBack)(const uint8_t data[], size_t dlen,
3919 unsigned refNo,
3920 unsigned refs,
3921 enum BAMIndexStructureTypes type,
3922 unsigned binNo,
3923 unsigned bins,
3924 unsigned elements,
3925 void *ctx);
3926
3927 static
WalkIndexStructure(uint8_t const buf[],size_t const blen,WalkIndexStructureCallBack func,void * ctx)3928 rc_t WalkIndexStructure(uint8_t const buf[], size_t const blen,
3929 WalkIndexStructureCallBack func,
3930 void *ctx
3931 )
3932 {
3933 unsigned cp = 0;
3934 int32_t nrefs;
3935 unsigned i;
3936 rc_t rc;
3937
3938 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index data length: %u", blen));
3939
3940 if (cp + 4 > blen)
3941 return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3942
3943 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index signature: '%c%c%c%u'", buf[cp+0], buf[cp+1], buf[cp+2], buf[cp+3]));
3944 if (memcmp(buf + cp, "BAI\1", 4) != 0)
3945 return RC(rcAlign, rcIndex, rcReading, rcFormat, rcUnknown);
3946
3947 cp += 4;
3948 if (cp + 4 > blen)
3949 return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3950
3951 nrefs = LE2HI32(buf + cp); cp += 4;
3952 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index reference count: %i", nrefs));
3953
3954 if (nrefs == 0)
3955 return RC(rcAlign, rcIndex, rcReading, rcData, rcEmpty);
3956
3957 for (i = 0; i < nrefs; ++i) {
3958 int32_t bins;
3959 int32_t chunks;
3960 int32_t intervals;
3961 unsigned di;
3962
3963 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index reference %u: starts at %u", i, cp));
3964 if (cp + 4 > blen)
3965 return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3966
3967 bins = LE2HI32(buf + cp); cp += 4;
3968 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index reference %u: %i bins", i, nrefs));
3969
3970 for (di = 0; di < bins; ++di) {
3971 uint32_t binNo;
3972
3973 if (cp + 8 > blen)
3974 return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3975
3976 binNo = LE2HUI32(buf + cp); cp += 4;
3977 chunks = LE2HI32(buf + cp); cp += 4;
3978 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index reference %u, bin %u: %i chunks", i, binNo, chunks));
3979
3980 if (cp + 16 * chunks > blen)
3981 return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3982 if (chunks > 0)
3983 rc = func(&buf[cp], 16 * chunks, i, nrefs, bai_pairs, binNo, bins, chunks, ctx);
3984 if (rc)
3985 return rc;
3986 cp += 16 * chunks;
3987 }
3988 if (cp + 4 > blen)
3989 return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3990
3991 intervals = LE2HI32(buf + cp); cp += 4;
3992 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index reference %u: %i intervals", i, intervals));
3993
3994 if (cp + 8 * intervals > blen)
3995 return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3996 rc = func(&buf[cp], 8 * intervals, i, nrefs, bai_intervals, ~(unsigned)0, bins, intervals, ctx);
3997 if (rc)
3998 return rc;
3999 cp += 8 * intervals;
4000 }
4001 if (cp > blen)
4002 return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
4003 return 0;
4004 }
4005
4006 struct MeasureIndex_s {
4007 unsigned refs;
4008 unsigned pairs;
4009 unsigned intervals;
4010 };
4011
4012 struct LoadIndex2_s {
4013 BAMIndex *self;
4014 unsigned cur_rng;
4015 unsigned cur_pos;
4016 };
4017
4018 static
MeasureIndex(const uint8_t data[],size_t dlen,unsigned refNo,unsigned refs,enum BAMIndexStructureTypes type,unsigned binNo,unsigned bins,unsigned elements,void * Ctx)4019 rc_t MeasureIndex(const uint8_t data[], size_t dlen, unsigned refNo,
4020 unsigned refs, enum BAMIndexStructureTypes type,
4021 unsigned binNo, unsigned bins,
4022 unsigned elements, void *Ctx)
4023 {
4024 struct MeasureIndex_s *ctx = (struct MeasureIndex_s *)Ctx;
4025
4026 ctx->refs = refs;
4027 if (elements != 0) {
4028 if (type == bai_intervals) {
4029 ctx->intervals += elements;
4030 }
4031 else if (type == bai_pairs && binNo < MAX_BIN) {
4032 ctx->pairs += elements;
4033 }
4034 }
4035 return 0;
4036 }
4037
4038 static
LoadIndex2(const uint8_t data[],size_t dlen,unsigned refNo,unsigned refs,enum BAMIndexStructureTypes type,unsigned binNo,unsigned bins,unsigned elements,void * Ctx)4039 rc_t LoadIndex2(const uint8_t data[], size_t dlen, unsigned refNo,
4040 unsigned refs, enum BAMIndexStructureTypes type,
4041 unsigned binNo, unsigned bins,
4042 unsigned elements, void *Ctx)
4043 {
4044 struct LoadIndex2_s *const ctx = (struct LoadIndex2_s *)Ctx;
4045 BAMIndexReference *const self = &ctx->self->ref[refNo];
4046 unsigned i;
4047
4048 if (type == bai_pairs) {
4049 if (binNo < MAX_BIN) {
4050 BAMFileRange *const dst = ctx->self->rng + ctx->cur_rng;
4051 if (self->bin == NULL)
4052 self->bin = dst;
4053 self->binSize[binNo] = elements;
4054 self->binStart[binNo] = dst - self->bin;
4055 for (i = 0; i < elements; ++i) {
4056 dst[i].start = get_pos(data + 16 * i + 0);
4057 dst[i].end = get_pos(data + 16 * i + 8);
4058 }
4059 ctx->cur_rng += elements;
4060 }
4061 else if (binNo == MAX_BIN) {
4062 self->start = get_pos(data + 0);
4063 self->end = get_pos(data + 8);
4064 }
4065 }
4066 else if (type == bai_intervals) {
4067 BAMFilePosition *const dst = ctx->self->pos + ctx->cur_pos;
4068 self->intervals = elements;
4069 self->interval = dst;
4070 for (i = 0; i < elements; ++i) {
4071 dst[i] = get_pos(data + 8 * i);
4072 }
4073 ctx->cur_pos += elements;
4074 }
4075 return 0;
4076 }
4077
4078 static
LoadIndex(BAMFile * self,const uint8_t buf[],size_t blen)4079 rc_t LoadIndex(BAMFile *self, const uint8_t buf[], size_t blen)
4080 {
4081 rc_t rc;
4082 struct MeasureIndex_s ctx1;
4083
4084 memset(&ctx1, 0, sizeof(ctx1));
4085 rc = WalkIndexStructure(buf, blen, MeasureIndex, &ctx1);
4086 if (rc == 0) {
4087 size_t indexRootSize = sizeof(BAMIndex) + sizeof(BAMIndexReference) * (ctx1.refs - 1);
4088 size_t indexDataCount = ctx1.pairs * 2 + ctx1.intervals;
4089 BAMIndex *idx = calloc(1, indexRootSize);
4090 if (idx) {
4091 idx->numRefs = ctx1.refs;
4092 idx->data = calloc(indexDataCount, sizeof(BAMFilePosition));
4093 idx->rng = idx->data;
4094 idx->pos = (void *)(idx->rng + ctx1.pairs);
4095 if (idx->data) {
4096 struct LoadIndex2_s ctx2;
4097
4098 ctx2.self = idx;
4099 ctx2.cur_pos = 0;
4100 ctx2.cur_rng = 0;
4101 WalkIndexStructure(buf, blen, LoadIndex2, &ctx2);
4102
4103 if (self->ndx)
4104 BAMIndexWhack(self->ndx);
4105 self->ndx = idx;
4106 }
4107 else
4108 rc = RC(rcAlign, rcIndex, rcReading, rcMemory, rcExhausted);
4109 }
4110 else
4111 rc = RC(rcAlign, rcIndex, rcReading, rcMemory, rcExhausted);
4112 }
4113 return rc;
4114 }
4115
4116 static
BAMFileOpenIndexKFile(const BAMFile * self,KFile const * kf)4117 rc_t BAMFileOpenIndexKFile(const BAMFile *self, KFile const *kf)
4118 {
4119 rc_t rc;
4120 size_t fsize;
4121 uint8_t *buf;
4122 {
4123 uint64_t u64;
4124
4125 rc = KFileSize(kf, &u64);
4126 if (sizeof(size_t) < sizeof(u64) && (size_t)u64 != u64) {
4127 return RC(rcAlign, rcIndex, rcReading, rcData, rcExcessive);
4128 }
4129 fsize = u64;
4130 }
4131 if (rc == 0) {
4132 buf = malloc(fsize);
4133 if (buf != NULL) {
4134 size_t nread;
4135
4136 rc = KFileReadAll(kf, 0, buf, fsize, &nread);
4137 if (rc == 0) {
4138 if (nread == fsize) {
4139 rc = LoadIndex((BAMFile *)self, buf, nread);
4140 free(buf);
4141 return rc;
4142 }
4143 rc = RC(rcAlign, rcIndex, rcReading, rcData, rcInvalid);
4144 }
4145 free(buf);
4146 }
4147 else
4148 rc = RC(rcAlign, rcIndex, rcReading, rcMemory, rcExhausted);
4149 }
4150 return rc;
4151 }
4152
BAMFileOpenIndex(const BAMFile * self,const char * path)4153 LIB_EXPORT rc_t CC BAMFileOpenIndex(const BAMFile *self, const char *path)
4154 {
4155 const KFile *kf;
4156 rc_t rc;
4157 KDirectory *dir;
4158
4159 rc = KDirectoryNativeDir(&dir);
4160 if (rc) return rc;
4161 rc = KDirectoryOpenFileRead(dir, &kf, "%s", path);
4162 KDirectoryRelease(dir);
4163 if (rc) return rc;
4164 rc = BAMFileOpenIndexKFile(self, kf);
4165 KFileRelease(kf);
4166 return rc;
4167 }
4168
BAMFileOpenIndexWithVPath(const BAMFile * self,const VPath * path)4169 LIB_EXPORT rc_t CC BAMFileOpenIndexWithVPath(const BAMFile *self, const VPath *path)
4170 {
4171 VFSManager *vfs = NULL;
4172 KFile const *fp = NULL;
4173 rc_t rc = 0;
4174
4175 rc = VFSManagerMake(&vfs);
4176 if (rc) return rc;
4177
4178 rc = VFSManagerOpenFileRead( vfs, &fp, path );
4179 VFSManagerRelease(vfs);
4180 if (rc) return rc;
4181
4182 rc = BAMFileOpenIndexKFile(self, fp);
4183 KFileRelease(fp);
4184 return rc;
4185 }
4186
BAMFileIsIndexed(const BAMFile * self)4187 LIB_EXPORT bool CC BAMFileIsIndexed(const BAMFile *self)
4188 {
4189 if (self && self->ndx)
4190 return true;
4191 return false;
4192 }
4193
BAMFileIndexHasRefSeqId(const BAMFile * self,uint32_t refSeqId)4194 LIB_EXPORT bool CC BAMFileIndexHasRefSeqId(const BAMFile *self, uint32_t refSeqId)
4195 {
4196 if (self && self->ndx && refSeqId < self->ndx->numRefs)
4197 return true;
4198 return false;
4199 }
4200
4201 struct BAMFileSlice {
4202 unsigned refSeqId;
4203 unsigned sliceStart;
4204 unsigned sliceEnd;
4205 unsigned ranges;
4206 unsigned current;
4207 unsigned started;
4208 struct BAMFileRange range[1 /* ranges */];
4209 };
4210
4211 typedef struct BinRange {
4212 uint16_t beg, end;
4213 } BinRange;
4214
4215 typedef struct BinList {
4216 BinRange range[6];
4217 } BinList;
4218
calcBinList(unsigned const refBeg,unsigned const refEnd)4219 BinList calcBinList(unsigned const refBeg, unsigned const refEnd)
4220 {
4221 BinList rslt;
4222 unsigned size = 1 << 29;
4223 unsigned offset = 0;
4224 unsigned i;
4225
4226 for (i = 0; i < 6; ++i) {
4227 rslt.range[i].beg = offset + refBeg / size;
4228 rslt.range[i].end = offset + (refEnd - 1) / size;
4229 offset += 1 << (3 * i);
4230 size >>= 3;
4231 }
4232 return rslt;
4233 }
4234
BAMFileRange_cmp(void const * const A,void const * const B,void * ctx)4235 static int64_t CC BAMFileRange_cmp(void const *const A, void const *const B, void *ctx)
4236 {
4237 struct BAMFileRange const *const a = (struct BAMFileRange const *)A;
4238 struct BAMFileRange const *const b = (struct BAMFileRange const *)B;
4239 return a->start < b->start ? -1 : b->start < a->start ? 1 : a->end < b->end ? -1 : b->end < a->end ? 1 : 0;
4240 }
4241
4242 #define USE_MAXPOS 0
4243 #define USE_MINPOS 1
4244 #define SLICE_VERBOSE 0
4245
includeRange(BAMFileRange const * const range,BAMFilePosition const minPos,BAMFilePosition const maxPos)4246 static int includeRange(BAMFileRange const *const range, BAMFilePosition const minPos, BAMFilePosition const maxPos)
4247 {
4248 #if USE_MAXPOS
4249 if (maxPos != 0 && range->start >= maxPos)
4250 return 0;
4251 #endif
4252 #if USE_MINPOS
4253 if (minPos != 0 && range->end <= minPos)
4254 return 0;
4255 #endif
4256 return 1;
4257 }
4258
copyRanges(BAMFileSlice * slice,BAMIndexReference const * const refIndex,BAMFilePosition const minPos,BAMFilePosition const maxPos,BinRange const ranges[6])4259 static void copyRanges(BAMFileSlice *slice,
4260 BAMIndexReference const *const refIndex,
4261 BAMFilePosition const minPos,
4262 BAMFilePosition const maxPos,
4263 BinRange const ranges[6])
4264 {
4265 unsigned j = 0;
4266 unsigned i;
4267
4268 for (i = 0; i < 6; ++i) {
4269 BinRange const r = ranges[i];
4270 uint16_t bin = r.beg;
4271 while (bin <= r.end) {
4272 unsigned const binSize = refIndex->binSize[bin];
4273 BAMFileRange const *const range = refIndex->bin + refIndex->binStart[bin];
4274 unsigned k;
4275
4276 for (k = 0; k < binSize; ++k) {
4277 if (includeRange(&range[k], minPos, maxPos)) {
4278 slice->range[j] = range[k];
4279 ++j;
4280 }
4281 }
4282 ++bin;
4283 }
4284 }
4285 assert(j == slice->ranges);
4286 ksort(slice->range, slice->ranges, sizeof(slice->range[0]), BAMFileRange_cmp, NULL);
4287 }
4288
countRanges(BAMIndexReference const * const refIndex,BAMFilePosition const minPos,BAMFilePosition const maxPos,BinRange const ranges[6])4289 static unsigned countRanges(BAMIndexReference const *const refIndex,
4290 BAMFilePosition const minPos,
4291 BAMFilePosition const maxPos,
4292 BinRange const ranges[6])
4293 {
4294 unsigned j = 0;
4295 unsigned i;
4296
4297 #if SLICE_VERBOSE
4298 fprintf(stderr, "min: %016llX; max: %016llX\n", (unsigned long long)minPos, (unsigned long long)maxPos);
4299 #endif
4300
4301 for (i = 0; i < 6; ++i) {
4302 BinRange const r = ranges[i];
4303 uint16_t bin = r.beg;
4304 while (bin <= r.end) {
4305 unsigned const binSize = refIndex->binSize[bin];
4306 BAMFileRange const *const range = refIndex->bin + refIndex->binStart[bin];
4307 unsigned k;
4308
4309 for (k = 0; k < binSize; ++k) {
4310 if (includeRange(&range[k], minPos, maxPos)) {
4311 ++j;
4312 }
4313 }
4314 ++bin;
4315 }
4316 }
4317 return j;
4318 }
4319
makeSlice(BAMFile const * const self,unsigned const refSeqId,unsigned const alignStart,unsigned const alignEnd)4320 static BAMFileSlice *makeSlice(BAMFile const *const self,
4321 unsigned const refSeqId,
4322 unsigned const alignStart,
4323 unsigned const alignEnd)
4324 {
4325 BinList const bins = calcBinList(alignStart, alignEnd);
4326 BAMIndexReference const *const refIndex = &self->ndx->ref[refSeqId];
4327 unsigned const startBin = alignStart >> 14;
4328 unsigned const endBin = ((alignEnd - 1) >> 14) + 1;
4329 BAMFilePosition const minPos = refIndex->interval[startBin];
4330 BAMFilePosition const maxPos = endBin < refIndex->intervals ? refIndex->interval[endBin] : refIndex->end;
4331 unsigned const ranges = countRanges(refIndex, minPos, maxPos, bins.range);
4332 BAMFileSlice *slice;
4333
4334 if (ranges > 0) {
4335 slice = malloc(sizeof(*slice) + (ranges - 1) * sizeof(slice->range[0]));
4336 if (slice) {
4337 slice->refSeqId = refSeqId;
4338 slice->sliceStart = alignStart;
4339 slice->sliceEnd = alignEnd;
4340 slice->ranges = ranges;
4341 slice->current = 0;
4342 slice->started = 0;
4343
4344 copyRanges(slice, refIndex, minPos, maxPos, bins.range);
4345 }
4346 }
4347 else {
4348 slice = calloc(1, sizeof(*slice));
4349 if (slice) {
4350 slice->refSeqId = refSeqId;
4351 slice->sliceStart = alignStart;
4352 slice->sliceEnd = alignEnd;
4353 }
4354 }
4355 return slice;
4356 }
4357
BAMFileMakeSlice(const BAMFile * self,BAMFileSlice ** rslt,uint32_t refSeqId,uint64_t alignStart,uint64_t alignEnd)4358 LIB_EXPORT rc_t CC BAMFileMakeSlice(const BAMFile *self, BAMFileSlice **rslt, uint32_t refSeqId, uint64_t alignStart, uint64_t alignEnd)
4359 {
4360 if (self->ndx == NULL)
4361 return RC(rcAlign, rcFile, rcPositioning, rcIndex, rcNotFound);
4362 if (refSeqId >= self->refSeqs)
4363 return RC(rcAlign, rcFile, rcPositioning, rcData, rcNotFound);
4364 if (self->ndx->numRefs <= refSeqId)
4365 return RC(rcAlign, rcFile, rcPositioning, rcData, rcNotFound);
4366 if (alignStart >= self->refSeq[refSeqId].length)
4367 return RC(rcAlign, rcFile, rcPositioning, rcData, rcNotFound);
4368
4369 if (alignEnd > self->refSeq[refSeqId].length)
4370 alignEnd = self->refSeq[refSeqId].length;
4371
4372 *rslt = makeSlice(self, refSeqId, alignStart, alignEnd);
4373 return 0;
4374 }
4375
BAMFileReadSlice(const BAMFile * cself,const BAMAlignment ** rhs,BAMFileSlice * slice)4376 LIB_EXPORT rc_t CC BAMFileReadSlice(const BAMFile *cself, const BAMAlignment **rhs, BAMFileSlice *slice)
4377 {
4378 assert(cself != NULL);
4379 assert(rhs != NULL);
4380 assert(slice != NULL);
4381 if (cself == NULL || rhs == NULL || slice == NULL)
4382 return RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
4383
4384 while (slice->current < slice->ranges) {
4385 if (slice->started == 0) {
4386 rc_t rc = BAMFileSetPosition(cself, &slice->range[slice->current].start);
4387 if (rc) break;
4388 }
4389 ++slice->started;
4390 {
4391 BAMFilePosition const curPos = BAMFileGetPositionInt(cself);
4392 if (curPos < slice->range[slice->current].end) {
4393 return BAMFileRead2(cself, rhs);
4394 }
4395 #if SLICE_VERBOSE
4396 fprintf(stderr, "slice #%u (%016llX-%016llX) contained %u records\n", slice->current + 1, (unsigned long long)slice->range[slice->current].start, (unsigned long long)slice->range[slice->current].end, slice->started);
4397 #endif
4398 ++slice->current;
4399 if (slice->current == slice->ranges)
4400 break;
4401 if (curPos < slice->range[slice->current].start)
4402 slice->started = 0;
4403 }
4404 }
4405 return RC(rcAlign, rcFile, rcReading, rcRow, rcNotFound);
4406 }
4407
BAMFileSeek(const BAMFile * self,uint32_t refSeqId,uint64_t alignStart,uint64_t alignEnd)4408 LIB_EXPORT rc_t CC BAMFileSeek(const BAMFile *self, uint32_t refSeqId, uint64_t alignStart, uint64_t alignEnd)
4409 {
4410 /* Use MakeSlice and ReadSlice instead */
4411 return RC(rcAlign, rcFile, rcPositioning, rcFunction, rcUnsupported);
4412 }
4413
BAMIndexWhack(const BAMIndex * cself)4414 static rc_t BAMIndexWhack(const BAMIndex *cself) {
4415 free(cself->data);
4416 free((void *)cself);
4417 return 0;
4418 }
4419
4420 /* MARK: BAM Validation Stuff */
4421
OpenVPathRead(const KFile ** fp,struct VPath const * path)4422 static rc_t OpenVPathRead(const KFile **fp, struct VPath const *path)
4423 {
4424 char buffer[4096];
4425 size_t blen;
4426 rc_t rc = VPathReadPath(path, buffer, sizeof(buffer), &blen);
4427
4428 if (rc == 0) {
4429 KDirectory *dir;
4430
4431 rc = KDirectoryNativeDir(&dir);
4432 if (rc == 0) {
4433 rc = KDirectoryOpenFileRead(dir, fp, "%.*s", (int)blen, buffer);
4434 KDirectoryRelease(dir);
4435 }
4436 }
4437 return rc;
4438 }
4439
ReadVPath(void ** data,size_t * dsize,struct VPath const * path)4440 static rc_t ReadVPath(void **data, size_t *dsize, struct VPath const *path)
4441 {
4442 const KFile *fp;
4443 rc_t rc = OpenVPathRead(&fp, path);
4444
4445 if (rc == 0) {
4446 uint8_t *buff;
4447 uint64_t fsz;
4448 size_t bsz = 0;
4449
4450 rc = KFileSize(fp, &fsz);
4451 if (rc == 0) {
4452 if ((size_t)fsz != fsz)
4453 return RC(rcAlign, rcFile, rcReading, rcFile, rcTooBig);
4454 buff = malloc(fsz);
4455 if (buff == NULL)
4456 return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
4457 do {
4458 size_t nread;
4459
4460 rc = KFileRead(fp, 0, buff + bsz, fsz - bsz, &nread);
4461 if (rc)
4462 break;
4463 bsz += nread;
4464 } while (bsz < (size_t)fsz);
4465 if (rc == 0) {
4466 *data = buff;
4467 *dsize = bsz;
4468 return 0;
4469 }
4470 free(buff);
4471 }
4472 }
4473 return rc;
4474 }
4475
VPath2BGZF(BGZFile * bgzf,struct VPath const * path)4476 static rc_t VPath2BGZF(BGZFile *bgzf, struct VPath const *path)
4477 {
4478 const KFile *fp;
4479 BGZFile_vt dummy;
4480 rc_t rc = OpenVPathRead(&fp, path);
4481
4482 if (rc == 0) {
4483 rc = BGZFileInit(bgzf, fp, &dummy);
4484 KFileRelease(fp);
4485 }
4486 return rc;
4487 }
4488
4489 struct index_data {
4490 uint64_t position;
4491 unsigned refNo;
4492 unsigned binNo;
4493 bool found;
4494 };
4495
4496 struct buffer_data {
4497 uint64_t position;
4498 size_t size;
4499 };
4500
4501 typedef struct BAMValidate_ctx_s BAMValidate_ctx_t;
4502 struct BAMValidate_ctx_s {
4503 BAMValidateCallback callback;
4504 void *ctx;
4505 BAMValidateStats *stats;
4506 const uint8_t *bai;
4507 int32_t *refLen;
4508 struct index_data *position;
4509 uint8_t *buf;
4510 uint8_t *nxt;
4511 size_t bsize;
4512 size_t alloced;
4513 size_t dnext;
4514 uint32_t options;
4515 uint32_t lastRefId;
4516 uint32_t lastRefPos;
4517 unsigned npositions;
4518 unsigned mpositions;
4519 unsigned nrefs;
4520 bool cancelled;
4521 };
4522
4523 static
IndexValidateStructure(const uint8_t data[],size_t dlen,unsigned refNo,unsigned refs,enum BAMIndexStructureTypes type,unsigned binNo,unsigned bins,unsigned elements,void * Ctx)4524 rc_t IndexValidateStructure(const uint8_t data[], size_t dlen,
4525 unsigned refNo,
4526 unsigned refs,
4527 enum BAMIndexStructureTypes type,
4528 unsigned binNo,
4529 unsigned bins,
4530 unsigned elements,
4531 void *Ctx)
4532 {
4533 BAMValidate_ctx_t *ctx = Ctx;
4534 rc_t rc = 0;
4535
4536 ctx->stats->baiFilePosition = data - ctx->bai;
4537 rc = ctx->callback(ctx->ctx, 0, ctx->stats);
4538 if (rc)
4539 ctx->cancelled = true;
4540 return rc;
4541 }
4542
comp_index_data(const void * A,const void * B,void * ignored)4543 static int64_t CC comp_index_data(const void *A, const void *B, void *ignored)
4544 {
4545 const struct index_data *a = A;
4546 const struct index_data *b = B;
4547
4548 if (a->position < b->position)
4549 return -1;
4550 else if (a->position > b->position)
4551 return 1;
4552 else
4553 return 0;
4554 }
4555
4556 static
BAMValidateLoadIndex(const uint8_t data[],size_t dlen,unsigned refNo,unsigned refs,enum BAMIndexStructureTypes type,unsigned binNo,unsigned bins,unsigned elements,void * Ctx)4557 rc_t BAMValidateLoadIndex(const uint8_t data[], size_t dlen,
4558 unsigned refNo,
4559 unsigned refs,
4560 enum BAMIndexStructureTypes type,
4561 unsigned binNo,
4562 unsigned bins,
4563 unsigned elements,
4564 void *Ctx)
4565 {
4566 BAMValidate_ctx_t *ctx = Ctx;
4567 unsigned const n = type == bai_intervals ? elements : elements * 2;
4568 unsigned i;
4569 unsigned j;
4570
4571 if (type == bai_pairs && binNo >= MAX_BIN)
4572 return 0;
4573
4574 if (ctx->npositions + elements > ctx->mpositions) {
4575 void *temp;
4576
4577 do { ctx->mpositions <<= 1; } while (ctx->npositions + elements > ctx->mpositions);
4578 temp = realloc(ctx->position, ctx->mpositions * sizeof(ctx->position[0]));
4579 if (temp == NULL)
4580 return RC(rcAlign, rcIndex, rcReading, rcMemory, rcExhausted);
4581 ctx->position = temp;
4582 }
4583 for (j = i = 0; i != n; ++i) {
4584 uint64_t const pos = get_pos(&data[i * 8]);
4585
4586 if (type == bai_pairs && (i & 1) != 0)
4587 continue;
4588
4589 if (pos) {
4590 ctx->position[ctx->npositions + j].refNo = refNo;
4591 ctx->position[ctx->npositions + j].binNo = binNo;
4592 ctx->position[ctx->npositions + j].position = pos;
4593 ++j;
4594 }
4595 }
4596 ctx->npositions += j;
4597 return 0;
4598 }
4599
4600 static
BAMValidateHeader(const uint8_t data[],unsigned dsize,unsigned * header_len,unsigned * refs_start,unsigned * nrefs,unsigned * data_start)4601 rc_t BAMValidateHeader(const uint8_t data[],
4602 unsigned dsize,
4603 unsigned *header_len,
4604 unsigned *refs_start,
4605 unsigned *nrefs,
4606 unsigned *data_start
4607 )
4608 {
4609 int32_t hlen;
4610 int32_t refs;
4611 unsigned i;
4612 unsigned cp;
4613
4614 if (dsize < 8)
4615 return RC(rcAlign, rcFile, rcValidating, rcData, rcIncomplete);
4616
4617 if (memcmp(data, "BAM\1", 4) != 0)
4618 return RC(rcAlign, rcFile, rcValidating, rcFormat, rcUnrecognized);
4619
4620 hlen = LE2HI32(&data[4]);
4621 if (hlen < 0)
4622 return RC(rcAlign, rcFile, rcValidating, rcData, rcInvalid);
4623
4624 if (dsize < hlen + 12)
4625 return RC(rcAlign, rcFile, rcValidating, rcData, rcIncomplete);
4626
4627 refs = LE2HI32(&data[hlen + 8]);
4628 if (refs < 0)
4629 return RC(rcAlign, rcFile, rcValidating, rcData, rcInvalid);
4630
4631 for (cp = hlen + 12, i = 0; i != refs; ++i) {
4632 int32_t nlen;
4633
4634 if (dsize < cp + 4)
4635 return RC(rcAlign, rcFile, rcValidating, rcData, rcIncomplete);
4636
4637 nlen = LE2HI32(&data[cp]);
4638 if (nlen < 0)
4639 return RC(rcAlign, rcFile, rcValidating, rcData, rcInvalid);
4640
4641 if (dsize < cp + nlen + 4)
4642 return RC(rcAlign, rcFile, rcValidating, rcData, rcIncomplete);
4643
4644 cp += nlen + 4;
4645 }
4646
4647 *nrefs = refs;
4648 *refs_start = 12 + (*header_len = hlen);
4649 *data_start = cp;
4650 return 0;
4651 }
4652
BAMValidateIndex(struct VPath const * bampath,struct VPath const * baipath,BAMValidateOption options,BAMValidateCallback callback,void * callbackContext)4653 static rc_t BAMValidateIndex(struct VPath const *bampath,
4654 struct VPath const *baipath,
4655 BAMValidateOption options,
4656 BAMValidateCallback callback,
4657 void *callbackContext
4658 )
4659 {
4660 rc_t rc = 0;
4661 BGZFile bam;
4662 uint8_t *bai = NULL;
4663 size_t bai_size;
4664 BAMValidateStats stats;
4665 BAMValidate_ctx_t ctx;
4666 uint8_t data[2 * ZLIB_BLOCK_SIZE];
4667 uint32_t dsize = 0;
4668 uint64_t pos = 0;
4669 uint32_t temp;
4670 int32_t ref = -1;
4671 int32_t rpos = -1;
4672
4673 if ((options & bvo_IndexOptions) == 0)
4674 return callback(callbackContext, 0, &stats);
4675
4676 rc = ReadVPath((void **)&bai, &bai_size, baipath);
4677 if (rc)
4678 return rc;
4679
4680 memset(&stats, 0, sizeof(stats));
4681 memset(&ctx, 0, sizeof(ctx));
4682
4683 ctx.bai = bai;
4684 ctx.stats = &stats;
4685 ctx.options = options;
4686 ctx.ctx = callbackContext;
4687 ctx.callback = callback;
4688
4689 if ((options & bvo_IndexOptions) == bvo_IndexStructure)
4690 return WalkIndexStructure(bai, bai_size, IndexValidateStructure, &ctx);
4691
4692 rc = VPath2BGZF(&bam, bampath);
4693 if (rc == 0) {
4694 ctx.mpositions = 1024 * 32;
4695 ctx.position = malloc(ctx.mpositions * sizeof(ctx.position[0]));
4696 if (ctx.position == NULL)
4697 return RC(rcAlign, rcIndex, rcReading, rcMemory, rcExhausted);
4698
4699 rc = WalkIndexStructure(bai, bai_size, BAMValidateLoadIndex, &ctx);
4700 free(bai);
4701 if (rc) {
4702 stats.indexStructureIsBad = true;
4703 rc = callback(callbackContext, rc, &stats);
4704 }
4705 else {
4706 unsigned i = 0;
4707
4708 stats.indexStructureIsGood = true;
4709 stats.baiFileSize = ctx.npositions;
4710
4711 ksort(ctx.position, ctx.npositions, sizeof(ctx.position[0]), comp_index_data, 0);
4712
4713 stats.bamFileSize = bam.fsize;
4714
4715 while (i < ctx.npositions) {
4716 uint64_t const ifpos = ctx.position[i].position >> 16;
4717 uint16_t const bpos = (uint16_t)ctx.position[i].position;
4718
4719 stats.baiFilePosition = i;
4720 if (i == 0 || ifpos != pos) {
4721 stats.bamFilePosition = pos = ifpos;
4722 rc = BGZFileSetPos(&bam, pos);
4723 if (rc == 0)
4724 rc = BGZFileRead(&bam, data, &dsize);
4725 if (rc) {
4726 ++stats.indexFileOffset.error;
4727 do {
4728 ++i;
4729 if (i == ctx.npositions)
4730 break;
4731 if (ctx.position[i].position >> 16 != pos)
4732 break;
4733 ++stats.indexFileOffset.error;
4734 } while (1);
4735 }
4736 else
4737 ++stats.indexFileOffset.good;
4738
4739 rc = callback(callbackContext, rc, &stats);
4740 if (rc)
4741 break;
4742 }
4743 else
4744 ++stats.indexFileOffset.good;
4745 if ((options & bvo_IndexOptions) > bvo_IndexOffsets1) {
4746 int32_t rsize = 0;
4747 BAMAlignment algn;
4748
4749 if (bpos >= dsize)
4750 goto BAD_BLOCK_OFFSET;
4751 if (dsize - bpos < 4) {
4752 READ_MORE:
4753 if (dsize > ZLIB_BLOCK_SIZE)
4754 goto BAD_BLOCK_OFFSET;
4755
4756 rc = BGZFileRead(&bam, data + dsize, &temp);
4757 if (rc) {
4758 ++stats.blockCompression.error;
4759 goto BAD_BLOCK_OFFSET;
4760 }
4761 dsize += temp;
4762 if (dsize - bpos < 4 || dsize - bpos < rsize)
4763 goto BAD_BLOCK_OFFSET;
4764 }
4765 rsize = LE2HI32(data + bpos);
4766 if (rsize <= 0)
4767 goto BAD_BLOCK_OFFSET;
4768 if (rsize > 0xFFFF) {
4769 ++stats.indexBlockOffset.warning;
4770 ++i;
4771 continue;
4772 }
4773 if (dsize - bpos < rsize)
4774 goto READ_MORE;
4775 /* rc = BAMAlignmentParse(&algn, data + bpos + 4, rsize); */
4776 if (rc)
4777 goto BAD_BLOCK_OFFSET;
4778 ++stats.indexBlockOffset.good;
4779 if ((options & bvo_IndexOptions) > bvo_IndexOffsets2) {
4780 int32_t const refSeqId = getRefSeqId(&algn);
4781 uint16_t const binNo = getBin(&algn);
4782 int32_t const position = getPosition(&algn);
4783
4784 if (ctx.position[i].refNo == refSeqId &&
4785 (ctx.position[i].binNo == binNo ||
4786 ctx.position[i].binNo == ~((unsigned)0)
4787 ))
4788 ++stats.indexBin.good;
4789 else if (ctx.position[i].refNo == refSeqId)
4790 ++stats.indexBin.warning;
4791 else
4792 ++stats.indexBin.error;
4793
4794 if (refSeqId < ref || position < rpos)
4795 ++stats.inOrder.error;
4796
4797 ref = refSeqId;
4798 rpos = position;
4799 }
4800 }
4801 if (0) {
4802 BAD_BLOCK_OFFSET:
4803 ++stats.indexBlockOffset.error;
4804 }
4805 ++i;
4806 }
4807 }
4808
4809 free(ctx.position);
4810 BGZFileWhack(&bam);
4811 }
4812 stats.bamFilePosition = stats.bamFileSize;
4813 return callback(callbackContext, rc, &stats);
4814 }
4815
BAMValidate3(BAMValidate_ctx_t * ctx,BAMAlignment const * algn)4816 static rc_t BAMValidate3(BAMValidate_ctx_t *ctx,
4817 BAMAlignment const *algn
4818 )
4819 {
4820 rc_t rc = 0;
4821 uint16_t const flags = getFlags(algn);
4822 int32_t const refSeqId = getRefSeqId(algn);
4823 int32_t const refPos = getPosition(algn);
4824 unsigned const mapQ = getMapQual(algn);
4825 bool const aligned =
4826 ((flags & BAMFlags_SelfIsUnmapped) == 0) &&
4827 (refSeqId >= 0) && (refSeqId < ctx->nrefs) &&
4828 (refPos >= 0) && (refPos < ctx->refLen[refSeqId]) && (mapQ > 0);
4829
4830 if (ctx->options & bvo_ExtraFields) {
4831 }
4832 if (aligned) {
4833 if ((ctx->options & bvo_Sorted) != 0) {
4834 if (ctx->lastRefId < refSeqId || (ctx->lastRefId == refSeqId && ctx->lastRefPos <= refPos))
4835 ++ctx->stats->inOrder.good;
4836 else
4837 ++ctx->stats->inOrder.error;
4838 ctx->lastRefId = refSeqId;
4839 ctx->lastRefPos = refPos;
4840 }
4841 if (ctx->options & bvo_CIGARConsistency) {
4842 }
4843 if (ctx->options & bvo_BinConsistency) {
4844 }
4845 }
4846 if (ctx->options & bvo_FlagsConsistency) {
4847 }
4848 if (ctx->options & bvo_QualityValues) {
4849 }
4850 if (ctx->options & bvo_MissingSequence) {
4851 }
4852 if (ctx->options & bvo_MissingQuality) {
4853 }
4854 if (ctx->options & bvo_FlagsStats) {
4855 }
4856 return rc;
4857 }
4858
BAMValidate2(void * Ctx,const BGZFile * file,rc_t rc,uint64_t fpos,const zlib_block_t data,unsigned dsize)4859 static rc_t BAMValidate2(void *Ctx, const BGZFile *file,
4860 rc_t rc, uint64_t fpos,
4861 const zlib_block_t data, unsigned dsize)
4862 {
4863 BAMValidate_ctx_t *ctx = Ctx;
4864 rc_t rc2;
4865 bool fatal = false;
4866
4867 ctx->stats->bamFilePosition = fpos;
4868 if (rc) {
4869 if (ctx->options == bvo_BlockHeaders)
4870 ++ctx->stats->blockHeaders.error;
4871 else
4872 ++ctx->stats->blockCompression.error;
4873 }
4874 else if (ctx->options == bvo_BlockHeaders) {
4875 ++ctx->stats->blockHeaders.good;
4876 }
4877 else if (ctx->options == bvo_BlockCompression) {
4878 ++ctx->stats->blockHeaders.good;
4879 ++ctx->stats->blockCompression.good;
4880 }
4881 else if (dsize) {
4882 ctx->bsize += dsize;
4883 if (!ctx->stats->bamHeaderIsBad && !ctx->stats->bamHeaderIsGood) {
4884 unsigned header_len;
4885 unsigned refs_start;
4886 unsigned nrefs;
4887 unsigned data_start;
4888
4889 rc2 = BAMValidateHeader(ctx->buf, ctx->bsize,
4890 &header_len, &refs_start,
4891 &nrefs, &data_start);
4892
4893 if (rc2 == 0) {
4894 ctx->stats->bamHeaderIsGood = true;
4895 if (ctx->options & bvo_BinConsistency) {
4896 ctx->refLen = malloc(nrefs * sizeof(ctx->refLen[0]));
4897 if (ctx->refLen == NULL) {
4898 rc = RC(rcAlign, rcFile, rcValidating, rcMemory, rcExhausted);
4899 fatal = true;
4900 }
4901 else {
4902 unsigned cp;
4903 unsigned i;
4904
4905 ctx->nrefs = nrefs;
4906 for (i = 0, cp = refs_start; cp != data_start; ++i) {
4907 int32_t len;
4908
4909 memmove(&len, &ctx->buf[cp], 4);
4910 memmove(&ctx->refLen[i], &ctx->buf[cp + 4 + len], 4);
4911 cp += len + 8;
4912 }
4913 }
4914 }
4915 ctx->dnext = data_start;
4916 }
4917 else if ( GetRCState( rc2 ) != rcIncomplete || GetRCObject( rc2 ) != (enum RCObject)rcData)
4918 {
4919 ctx->stats->bamHeaderIsBad = true;
4920 ctx->options = bvo_BlockCompression;
4921 rc = rc2;
4922 }
4923 else
4924 ctx->dnext = ctx->bsize;
4925 }
4926 if (rc == 0) {
4927 if (ctx->stats->bamHeaderIsGood) {
4928 unsigned cp = ctx->dnext;
4929
4930 while (cp + 4 < ctx->bsize) {
4931 int32_t rsize;
4932
4933 rsize = LE2HI32(&ctx->buf[cp]);
4934 if (rsize < 0) {
4935 ++ctx->stats->blockStructure.error;
4936 ctx->options = bvo_BlockStructure;
4937
4938 /* throw away the rest of the current buffer */
4939 if (cp >= ctx->bsize - dsize)
4940 cp = ctx->bsize;
4941 else
4942 cp = ctx->bsize - dsize;
4943
4944 rc = RC(rcAlign, rcFile, rcValidating, rcData, rcInvalid);
4945 break;
4946 }
4947 else if (cp + 4 + rsize < ctx->bsize) {
4948 if (rsize > UINT16_MAX)
4949 ++ctx->stats->blockStructure.warning;
4950 else
4951 ++ctx->stats->blockStructure.good;
4952 if (ctx->options > bvo_BlockStructure) {
4953 BAMAlignment algn;
4954
4955 /* rc = BAMAlignmentParse(&algn, &ctx->buf[cp + 4], rsize); */
4956 if (rc == 0) {
4957 ++ctx->stats->recordStructure.good;
4958 if (ctx->options > bvo_RecordStructure)
4959 rc = BAMValidate3(ctx, &algn);
4960 }
4961 else
4962 ++ctx->stats->recordStructure.error;
4963 }
4964 cp += 4 + rsize;
4965 }
4966 else
4967 break;
4968 }
4969 if (&ctx->buf[cp] >= data) {
4970 if (cp < ctx->bsize) {
4971 ctx->bsize -= cp;
4972 memmove(ctx->buf, &ctx->buf[cp], ctx->bsize);
4973 cp = ctx->bsize;
4974 }
4975 else {
4976 assert(cp == ctx->bsize);
4977 cp = ctx->bsize = 0;
4978 }
4979 }
4980 ctx->dnext = cp;
4981 }
4982 if (ctx->alloced < ctx->bsize + ZLIB_BLOCK_SIZE) {
4983 void *temp;
4984
4985 temp = realloc(ctx->buf, ctx->alloced + ZLIB_BLOCK_SIZE);
4986 if (temp == NULL) {
4987 rc = RC(rcAlign, rcFile, rcValidating, rcMemory, rcExhausted);
4988 fatal = true;
4989 }
4990 else {
4991 ctx->buf = temp;
4992 ctx->alloced += ZLIB_BLOCK_SIZE;
4993 }
4994 }
4995 ctx->nxt = &ctx->buf[ctx->dnext];
4996 }
4997 }
4998 rc2 = ctx->callback(ctx->ctx, rc, ctx->stats);
4999 ctx->cancelled |= rc2 != 0;
5000 return fatal ? rc : rc2;
5001 }
5002
BAMValidateBAM(struct VPath const * bampath,BAMValidateOption options,BAMValidateCallback callback,void * callbackContext)5003 static rc_t BAMValidateBAM(struct VPath const *bampath,
5004 BAMValidateOption options,
5005 BAMValidateCallback callback,
5006 void *callbackContext
5007 )
5008 {
5009 rc_t rc;
5010 BGZFile bam;
5011 BAMValidate_ctx_t ctx;
5012 BAMValidateStats stats;
5013
5014 if (bampath == NULL)
5015 return RC(rcAlign, rcFile, rcValidating, rcParam, rcNull);
5016
5017 memset(&ctx, 0, sizeof(ctx));
5018 memset(&stats, 0, sizeof(stats));
5019
5020 ctx.callback = callback;
5021 ctx.ctx = callbackContext;
5022 ctx.options = options;
5023 ctx.stats = &stats;
5024
5025 if (options > bvo_BlockCompression) {
5026 ctx.alloced = ZLIB_BLOCK_SIZE * 2;
5027 ctx.nxt = ctx.buf = malloc(ctx.alloced);
5028
5029 if (ctx.buf == NULL)
5030 return RC(rcAlign, rcFile, rcValidating, rcMemory, rcExhausted);
5031 }
5032
5033 if (options > bvo_RecordStructure)
5034 options = bvo_RecordStructure | (options & 0xFFF0);
5035
5036 rc = VPath2BGZF(&bam, bampath);
5037 if (rc == 0) {
5038 stats.bamFileSize = bam.fsize;
5039 if ((options & 7) > bvo_BlockHeaders)
5040 rc = BGZFileWalkBlocks(&bam, true, (zlib_block_t *)&ctx.nxt, BAMValidate2, &ctx);
5041 else
5042 rc = BGZFileWalkBlocks(&bam, false, NULL, BAMValidate2, &ctx);
5043 }
5044 BGZFileWhack(&bam);
5045 return rc;
5046 }
5047
dummy_cb(void * ctx,rc_t result,const BAMValidateStats * stats)5048 static rc_t CC dummy_cb(void *ctx, rc_t result, const BAMValidateStats *stats)
5049 {
5050 return 0;
5051 }
5052
BAMValidate(struct VPath const * bampath,struct VPath const * baipath,BAMValidateOption options,BAMValidateCallback callback,void * callbackContext)5053 LIB_EXPORT rc_t CC BAMValidate(struct VPath const *bampath,
5054 struct VPath const *baipath,
5055 BAMValidateOption options,
5056 BAMValidateCallback callback,
5057 void *callbackContext
5058 )
5059 {
5060 if (callback == NULL)
5061 callback = dummy_cb;
5062 if (bampath == NULL)
5063 return RC(rcAlign, rcFile, rcValidating, rcParam, rcNull);
5064 if (baipath == NULL) {
5065 if (options & bvo_IndexOptions)
5066 return RC(rcAlign, rcFile, rcValidating, rcParam, rcNull);
5067 return BAMValidateBAM(bampath, options, callback, callbackContext);
5068 }
5069 return BAMValidateIndex(bampath, baipath, options, callback, callbackContext);
5070 }
5071