1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 #include <align/extern.h>
28 #include <klib/defs.h>
29 #include <klib/debug.h>
30 #include <klib/sort.h>
31 #include <klib/rc.h>
32 #include <kfs/file.h>
33 #include <kfs/directory.h>
34 #include <kfs/mmap.h>
35 #include <klib/printf.h>
36 #include <klib/log.h>
37 #include <klib/text.h>
38 #include <klib/refcount.h>
39 #include <sysalloc.h>
40 
41 #include <atomic32.h>
42 #include <strtol.h>
43 
44 #include <align/bam.h>
45 #include "bam-priv.h"
46 
47 #include <vfs/manager.h>
48 #include <vfs/path.h>
49 
50 #include <limits.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <ctype.h>
54 #include <assert.h>
55 #if 1
56 /*_DEBUGGING*/
57 #include <stdio.h>
58 #endif
59 
60 #include <endian.h>
61 #include <byteswap.h>
62 
63 #include <zlib.h>
64 
65 #include <os-native.h>
66 
67 
68 #if __BYTE_ORDER == __LITTLE_ENDIAN
LE2HUI16(void const * X)69 static uint16_t LE2HUI16(void const *X) { uint16_t y; memmove(&y, X, sizeof(y)); return y; }
LE2HUI32(void const * X)70 static uint32_t LE2HUI32(void const *X) { uint32_t y; memmove(&y, X, sizeof(y)); return y; }
LE2HUI64(void const * X)71 static uint64_t LE2HUI64(void const *X) { uint64_t y; memmove(&y, X, sizeof(y)); return y; }
LE2HI16(void const * X)72 static  int16_t  LE2HI16(void const *X) {  int16_t y; memmove(&y, X, sizeof(y)); return y; }
LE2HI32(void const * X)73 static  int32_t  LE2HI32(void const *X) {  int32_t y; memmove(&y, X, sizeof(y)); return y; }
74 /* static  int64_t  LE2HI64(void const *X) {  int64_t y; memmove(&y, X, sizeof(y)); return y; } */
75 #endif
76 #if __BYTE_ORDER == __BIG_ENDIAN
LE2HUI16(void const * X)77 static uint16_t LE2HUI16(void const *X) { uint16_t y; memmove(&y, X, sizeof(y)); return (uint16_t)bswap_16(y); }
LE2HUI32(void const * X)78 static uint32_t LE2HUI32(void const *X) { uint32_t y; memmove(&y, X, sizeof(y)); return (uint32_t)bswap_32(y); }
LE2HUI64(void const * X)79 static uint64_t LE2HUI64(void const *X) { uint64_t y; memmove(&y, X, sizeof(y)); return (uint64_t)bswap_64(y); }
LE2HI16(void const * X)80 static  int16_t  LE2HI16(void const *X) {  int16_t y; memmove(&y, X, sizeof(y)); return ( int16_t)bswap_16(y); }
LE2HI32(void const * X)81 static  int32_t  LE2HI32(void const *X) {  int32_t y; memmove(&y, X, sizeof(y)); return ( int32_t)bswap_32(y); }
LE2HI64(void const * X)82 static  int64_t  LE2HI64(void const *X) {  int64_t y; memmove(&y, X, sizeof(y)); return ( int64_t)bswap_64(y); }
83 #endif
84 
85 typedef struct BAMIndex BAMIndex;
86 typedef struct BGZFile BGZFile;
87 
88 /* MARK: BGZFile *** Start *** */
89 
90 #define VALIDATE_BGZF_HEADER 1
91 #if (ZLIB_VERNUM < 0x1230)
92 #undef VALIDATE_BGZF_HEADER
93 #warning "zlib too old, inflateGetHeader not available, not validating BGZF headers"
94 #else
95 #endif
96 
97 #define ZLIB_BLOCK_SIZE ( 64 * 1024 )
98 typedef uint8_t zlib_block_t[ZLIB_BLOCK_SIZE];
99 
100 #define MEM_ALIGN_SIZE ( 64 * 1024 )
101 /* MEM_CHUNK_SIZE must be an integer multiple of ZLIB_BLOCK_SIZE.
102  * The multiple must be >= 2 shouldn't be < 3.
103  */
104 #define MEM_CHUNK_SIZE ( 256 * ZLIB_BLOCK_SIZE )
105 #define CG_NUM_SEGS 4
106 
107 typedef struct BGZFile_vt_s {
108     rc_t (*FileRead)(void *, zlib_block_t, unsigned *);
109     uint64_t (*FileGetPos)(void const *);
110     float (*FileProPos)(void const *);
111     uint64_t (*FileGetSize)(void const *);
112     rc_t (*FileSetPos)(void *, uint64_t);
113     void (*FileWhack)(void *);
114 } BGZFile_vt;
115 
116 struct BGZFile {
117     uint64_t fsize;
118     uint64_t fpos;  /* position in file of first byte in buffer */
119     const uint8_t *buf;   /* page aligned or memmapped */
120     const KFile *kfp;
121     uint8_t *_buf;  /* allocated */
122     unsigned malign;
123     size_t bcount;  /* number of valid bytes in buffer */
124     uint32_t bpos;  /* position in buffer of read head */
125     z_stream zs;
126 };
127 
128 static
BGZFileGetMoreBytes(BGZFile * self)129 rc_t BGZFileGetMoreBytes(BGZFile *self)
130 {
131     rc_t rc;
132 
133     self->fpos += self->bpos;
134     self->bpos &= (MEM_ALIGN_SIZE - 1);
135     self->fpos -= self->bpos;
136 
137     rc = KFileRead(self->kfp, self->fpos, self->_buf + self->malign,
138                    MEM_CHUNK_SIZE, &self->bcount);
139     if (rc) {
140         DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("Error reading BAM file: %R\n", rc));
141         return rc;
142     }
143     if (self->bcount == 0 || self->bcount == self->bpos)
144         return RC(rcAlign, rcFile, rcReading, rcData, rcInsufficient);
145 
146     self->zs.avail_in = (uInt)(self->bcount - self->bpos);
147     self->zs.next_in = (Bytef *)&self->buf[self->bpos];
148     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("Read %u bytes from BAM file at position %lu\n", self->zs.avail_in, self->fpos));
149 
150     return 0;
151 }
152 
153 static
BGZFileRead(BGZFile * self,zlib_block_t dst,unsigned * pNumRead)154 rc_t BGZFileRead(BGZFile *self, zlib_block_t dst, unsigned *pNumRead)
155 {
156 #if VALIDATE_BGZF_HEADER
157     uint8_t extra[256];
158     gz_header head;
159 #endif
160     rc_t rc = 0;
161     unsigned loops;
162     int zr;
163 
164     *pNumRead = 0;
165     if (self->bcount == 0 || self->zs.avail_in == 0) {
166         rc = BGZFileGetMoreBytes(self);
167         if (rc)
168             return rc;
169     }
170 
171 #if VALIDATE_BGZF_HEADER
172     memset(&head, 0, sizeof(head));
173     head.extra = extra;
174     head.extra_max = sizeof(extra);
175 
176     zr = inflateGetHeader(&self->zs, &head);
177     assert(zr == Z_OK);
178 #endif
179 
180     self->zs.next_out = (Bytef *)dst;
181     self->zs.avail_out = sizeof(zlib_block_t);
182 
183     for (loops = 0; loops != 2; ++loops) {
184         {
185             uLong const initial = self->zs.total_in;
186 
187             zr = inflate(&self->zs, Z_FINISH);
188             {
189                 uLong const final = self->zs.total_in;
190                 uLong const len = final - initial;
191 
192                 self->bpos += len;
193             }
194         }
195         assert(self->zs.avail_in == self->bcount - self->bpos);
196 
197         switch (zr) {
198         case Z_OK:
199         case Z_BUF_ERROR:
200             rc = BGZFileGetMoreBytes(self);
201             if ( rc != 0 )
202             {
203                 if ( GetRCObject( rc ) == (enum RCObject)rcData && GetRCState( rc ) == rcInsufficient )
204                 {
205                     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("EOF in Zlib block after %lu bytes\n", self->fpos + self->bpos));
206                     rc = RC( rcAlign, rcFile, rcReading, rcFile, rcTooShort );
207                 }
208                 return rc;
209             }
210             break;
211         case Z_STREAM_END:
212             DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("Zlib block size (before/after): %u/%u\n", self->zs.total_in, self->zs.total_out));
213 #if VALIDATE_BGZF_HEADER
214             if (head.done) {
215                 unsigned const extra_len = head.extra_len;
216                 unsigned i;
217                 unsigned bsize = 0;
218 
219                 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("GZIP Header extra length: %u\n", extra_len));
220                 for (i = 0; i < extra_len; ) {
221                     uint8_t const si1 = extra[i + 0];
222                     uint8_t const si2 = extra[i + 1];
223                     unsigned const slen = LE2HUI16(&extra[i + 2]);
224 
225                     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("GZIP Header extra: %c%c(%u)\n", si1, si2, slen));
226                     if (si1 == 'B' && si2 == 'C') {
227                         bsize = 1 + LE2HUI16(&extra[i + 4]);
228                         DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("BGZF Header extra field BC: bsize %u\n", bsize));
229                         break;
230                     }
231                     i += slen + 4;
232                 }
233                 if (bsize == 0 || bsize != self->zs.total_in) {
234                     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("BGZF Header extra field BC not found\n"));
235                     rc = RC(rcAlign, rcFile, rcReading, rcFormat, rcInvalid); /* not BGZF */
236                 }
237             }
238             else {
239                 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("GZIP Header not found\n"));
240                 rc = RC(rcAlign, rcFile, rcReading, rcFile, rcCorrupt);
241             }
242 #endif
243             *pNumRead = (unsigned)self->zs.total_out; /* <= 64k */
244             zr = inflateReset(&self->zs);
245             assert(zr == Z_OK);
246             return rc;
247         default:
248             DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("Unexpected Zlib result %i\n", zr));
249             return RC(rcAlign, rcFile, rcReading, rcFile, rcCorrupt);
250         }
251     }
252     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BGZF), ("Failed reading BAM file after %lu bytes\n", self->fpos + self->bpos));
253     return RC(rcAlign, rcFile, rcReading, rcFile, rcTooShort);
254 }
255 
BGZFileGetPos(const BGZFile * self)256 static uint64_t BGZFileGetPos(const BGZFile *self)
257 {
258     return self->fpos + self->bpos;
259 }
260 
261 /* returns the position as proportion of the whole file */
BGZFileProPos(BGZFile const * const self)262 static float BGZFileProPos(BGZFile const *const self)
263 {
264     return BGZFileGetPos(self) / (double)self->fsize;
265 }
266 
BGZFileSetPos(BGZFile * const self,uint64_t const pos)267 static rc_t BGZFileSetPos(BGZFile *const self, uint64_t const pos)
268 {
269     if (self->fpos > pos || pos >= self->fpos + self->bcount) {
270         /* desired position is outside of current buffer */
271         self->fpos = pos ^ (pos & ((uint64_t)(MEM_ALIGN_SIZE - 1)));
272         self->bpos = (unsigned)(pos - self->fpos);
273         self->bcount = 0; /* force re-read */
274     }
275     else {
276         /* desired position is inside of current buffer */
277         unsigned const bpos = (unsigned)(pos - self->fpos); /* < 64k */
278 
279         self->bpos = bpos; /* pos - self->fpos; */
280         self->zs.avail_in = (uInt)(self->bcount - bpos);
281         self->zs.next_in = (Bytef *)&self->buf[bpos];
282     }
283     return 0;
284 }
285 
286 typedef rc_t (*BGZFileWalkBlocks_cb)(void *ctx, const BGZFile *file,
287                                      rc_t rc, uint64_t fpos,
288                                      const zlib_block_t data, unsigned dsize);
289 
290 /* Without Decompression */
BGZFileWalkBlocksND(BGZFile * const self,BGZFileWalkBlocks_cb const cb,void * const ctx)291 static rc_t BGZFileWalkBlocksND(BGZFile *const self, BGZFileWalkBlocks_cb const cb, void *const ctx)
292 {
293     rc_t rc = 0;
294 #if VALIDATE_BGZF_HEADER
295     uint8_t extra[256];
296     char dummy[64];
297     gz_header head;
298     int zr;
299 
300     memset(&head, 0, sizeof(head));
301     head.extra = extra;
302     head.extra_max = sizeof(extra);
303 
304     do {
305         unsigned loops;
306         unsigned hsize = 0;
307         unsigned bsize = 0;
308         unsigned bsize2;
309         uint64_t const fpos = self->fpos + self->bpos;
310 
311         self->zs.next_out = (Bytef *)dummy;
312         self->zs.avail_out = sizeof(dummy);
313 
314         zr = inflateGetHeader(&self->zs, &head);
315         assert(zr == Z_OK);
316 
317         for (loops = 0; loops != 2; ++loops) {
318             {
319                 uLong const orig = self->zs.total_in;
320 
321                 zr = inflate(&self->zs, Z_BLOCK); /* Z_BLOCK stops at end of header */
322                 {
323                     uLong const final = self->zs.total_in;
324                     uLong const bytes = final - orig;
325 
326                     self->bpos += bytes;
327                     hsize += bytes;
328                 }
329             }
330             if (head.done) {
331                 unsigned i;
332 
333                 for (i = 0; i < head.extra_len; ) {
334                     if (extra[i] == 'B' && extra[i + 1] == 'C') {
335                         bsize = 1 + LE2HUI16(&extra[i + 4]);
336                         break;
337                     }
338                     i += LE2HUI16(&extra[i + 2]);
339                 }
340                 break;
341             }
342             else if (self->zs.avail_in == 0) {
343                 rc = BGZFileGetMoreBytes(self);
344                 if (rc) {
345                     rc = RC(rcAlign, rcFile, rcReading, rcFile, rcTooShort);
346                     goto DONE;
347                 }
348             }
349             else {
350                 rc = RC(rcAlign, rcFile, rcReading, rcFile, rcCorrupt);
351                 goto DONE;
352             }
353         }
354         if (bsize == 0) {
355             rc = RC(rcAlign, rcFile, rcReading, rcFormat, rcInvalid); /* not BGZF */
356             break;
357         }
358         bsize2 = bsize;
359         bsize -= hsize;
360         for ( ; ; ) {
361             unsigned const max = (unsigned)(self->bcount - self->bpos); /* <= 64k */
362             unsigned const len = bsize > max ? max : bsize;
363 
364             self->bpos += len;
365             bsize -= len;
366             if (self->bpos == self->bcount) {
367                 rc = BGZFileGetMoreBytes(self);
368                 if (rc) {
369                     if (bsize)
370                         rc = RC(rcAlign, rcFile, rcReading, rcFile, rcTooShort);
371                     goto DONE;
372                 }
373             }
374             else {
375                 zr = inflateReset(&self->zs);
376                 assert(zr == Z_OK);
377                 self->zs.avail_in = (uInt)(self->bcount - self->bpos);
378                 self->zs.next_in = (Bytef *)&self->buf[self->bpos];
379                 rc = cb(ctx, self, 0, fpos, NULL, bsize2);
380                 break;
381             }
382         }
383     } while (rc == 0);
384 DONE:
385     if ( GetRCState( rc ) == rcInsufficient && GetRCObject( rc ) == (enum RCObject)rcData )
386         rc = 0;
387     rc = cb( ctx, self, rc, self->fpos + self->bpos, NULL, 0 );
388 #endif
389     return rc;
390 }
391 
BGZFileWalkBlocksUnzip(BGZFile * const self,zlib_block_t * const bufp,BGZFileWalkBlocks_cb const cb,void * const ctx)392 static rc_t BGZFileWalkBlocksUnzip(BGZFile *const self, zlib_block_t *const bufp, BGZFileWalkBlocks_cb const cb, void *const ctx)
393 {
394     rc_t rc;
395     rc_t rc2;
396 
397     do {
398         uint64_t const fpos = self->fpos + self->bpos;
399         unsigned dsize;
400 
401         rc2 = BGZFileRead(self, *bufp, &dsize);
402         rc = cb(ctx, self, rc2, fpos, *bufp, dsize);
403     } while (rc == 0 && rc2 == 0);
404     if ( GetRCState( rc2 ) == rcInsufficient && GetRCObject( rc2 ) == (enum RCObject)rcData )
405         rc2 = 0;
406     rc = cb( ctx, self, rc2, self->fpos + self->bpos, NULL, 0 );
407     return rc ? rc : rc2;
408 }
409 
BGZFileWalkBlocks(BGZFile * self,bool decompress,zlib_block_t * bufp,BGZFileWalkBlocks_cb cb,void * ctx)410 static rc_t BGZFileWalkBlocks(BGZFile *self, bool decompress, zlib_block_t *bufp,
411                               BGZFileWalkBlocks_cb cb, void *ctx)
412 {
413     rc_t rc;
414 
415 #if VALIDATE_BGZF_HEADER
416 #else
417     decompress = true;
418 #endif
419     self->fpos = 0;
420     self->bpos = 0;
421 
422     rc = BGZFileGetMoreBytes(self);
423     if (rc)
424         return rc;
425 
426     if (decompress)
427         return BGZFileWalkBlocksUnzip(self, bufp, cb, ctx);
428     else
429         return BGZFileWalkBlocksND(self, cb, ctx);
430 }
431 
BGZFileGetSize(BGZFile const * const self)432 static uint64_t BGZFileGetSize(BGZFile const *const self)
433 {
434     return self->fsize;
435 }
436 
BGZFileWhack(BGZFile * self)437 static void BGZFileWhack(BGZFile *self)
438 {
439     inflateEnd(&self->zs);
440     KFileRelease(self->kfp);
441     if (self->_buf)
442         free(self->_buf);
443 }
444 
BGZFileInit(BGZFile * self,const KFile * kfp,BGZFile_vt * vt)445 static rc_t BGZFileInit(BGZFile *self, const KFile *kfp, BGZFile_vt *vt)
446 {
447     int i;
448     rc_t rc;
449     static BGZFile_vt const my_vt = {
450         (rc_t (*)(void *, zlib_block_t, unsigned *))BGZFileRead,
451         (uint64_t (*)(void const *))BGZFileGetPos,
452         (float (*)(void const *))BGZFileProPos,
453         (uint64_t (*)(void const *))BGZFileGetSize,
454         (rc_t (*)(void *, uint64_t))BGZFileSetPos,
455         (void (*)(void *))BGZFileWhack
456     };
457 
458     memset(self, 0, sizeof(*self));
459     memset(vt, 0, sizeof(*vt));
460 
461     i = inflateInit2(&self->zs, MAX_WBITS + 16); /* max + enable gzip headers */
462     switch (i) {
463     case Z_OK:
464         break;
465     case Z_MEM_ERROR:
466         return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
467     default:
468         return RC(rcAlign, rcFile, rcConstructing, rcNoObj, rcUnexpected);
469     }
470 
471     rc = KFileSize(kfp, &self->fsize);
472     if (rc)
473         return rc;
474 
475     self->_buf = malloc(MEM_CHUNK_SIZE + MEM_ALIGN_SIZE);
476     if (self->_buf == NULL)
477         return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
478     self->malign = (MEM_ALIGN_SIZE - ((intptr_t)self->_buf & (MEM_ALIGN_SIZE - 1))) & (MEM_ALIGN_SIZE - 1);
479     self->buf = self->_buf + self->malign;
480 
481     self->kfp = kfp;
482     KFileAddRef(kfp);
483 
484     *vt = my_vt;
485 
486     return 0;
487 }
488 
489 #ifndef WINDOWS
490 
491 /* MARK: BGZThreadFile *** Start *** */
492 
493 #include <kproc/thread.h>
494 #include <kproc/lock.h>
495 #include <kproc/cond.h>
496 
497 typedef struct BGZThreadFile_s BGZThreadFile;
498 
499 #define BUFFER_COUNT (3)
500 
501 typedef struct BGZThreadFileWorkQElem_s BGZThreadFileWorkQElem;
502 
503 struct BGZThreadFileWorkQElem_s {
504     uint8_t *buf;
505     uint64_t pos;
506     unsigned bsz;
507 };
508 
509 struct BGZThreadFile_s {
510     BGZFile file;
511     KLock *lock;
512     KCondition *have_data;
513     KCondition *need_data;
514     KThread *th;
515     uint64_t pos;
516     BGZThreadFileWorkQElem que[BUFFER_COUNT];
517     rc_t volatile rc;
518     unsigned volatile nque;
519     bool eof;
520     uint8_t buffer[sizeof(zlib_block_t) * BUFFER_COUNT];
521 };
522 
BGZThreadFileRead(BGZThreadFile * self,zlib_block_t dst,unsigned * pNumRead)523 static rc_t BGZThreadFileRead(BGZThreadFile *self, zlib_block_t dst, unsigned *pNumRead)
524 {
525     rc_t rc;
526 
527     *pNumRead = 0;
528 
529     KLockAcquire(self->lock);
530     if ((rc = self->rc) == 0) {
531         while (self->nque == 0 && (rc = self->rc) == 0)
532             KConditionWait(self->have_data, self->lock);
533         if (rc == 0) {
534             BGZThreadFileWorkQElem const work = self->que[0];
535 
536             self->pos = work.pos;
537             if (work.buf) {
538                 memmove(dst, work.buf, *pNumRead = work.bsz);
539                 memmove(&self->que[0], &self->que[1], --self->nque * sizeof(self->que[0]));
540                 KConditionSignal(self->need_data);
541             }
542             else {
543                 self->eof = true;
544                 self->rc = rc = RC(rcAlign, rcFile, rcReading, rcData, rcInsufficient);
545             }
546         }
547     }
548     KLockUnlock(self->lock);
549     return rc;
550 }
551 
BGZThreadFileMain(KThread const * const th,void * const vp)552 static rc_t CC BGZThreadFileMain(KThread const *const th, void *const vp)
553 {
554     BGZThreadFile *const self = (BGZThreadFile *)vp;
555     rc_t rc = 0;
556     unsigned bufno;
557 
558     KLockAcquire(self->lock);
559     for (bufno = 0; ; bufno = (bufno + 1) % BUFFER_COUNT) {
560         while (self->nque == BUFFER_COUNT)
561             KConditionWait(self->need_data, self->lock);
562         {
563             BGZThreadFileWorkQElem work;
564 
565             work.pos = BGZFileGetPos(&self->file);
566             work.buf = &self->buffer[bufno * sizeof(zlib_block_t)];
567             rc = BGZFileRead(&self->file, work.buf, &work.bsz);
568             if ( GetRCObject( rc ) == (enum RCObject)rcData && GetRCState( rc ) == rcInsufficient )
569                 work.buf = NULL;
570             else if ( rc != 0 )
571                 break;
572             self->que[self->nque++] = work;
573             KConditionSignal(self->have_data);
574         }
575     }
576     self->rc = rc;
577     KLockUnlock(self->lock);
578     return 0;
579 }
580 
BGZThreadFileGetPos(BGZThreadFile const * const self)581 static uint64_t BGZThreadFileGetPos(BGZThreadFile const *const self)
582 {
583     return self->pos;
584 }
585 
586 /* returns the position as proportion of the whole file */
BGZThreadFileProPos(BGZThreadFile const * const self)587 static float BGZThreadFileProPos(BGZThreadFile const *const self)
588 {
589     return BGZThreadFileGetPos(self) / (double)self->file.fsize;
590 }
591 
BGZThreadFileGetSize(BGZThreadFile const * const self)592 static uint64_t BGZThreadFileGetSize(BGZThreadFile const *const self)
593 {
594     return BGZFileGetSize(&self->file);
595 }
596 
BGZThreadFileSetPos(BGZThreadFile * const self)597 static rc_t BGZThreadFileSetPos(BGZThreadFile *const self)
598 {
599     return RC(rcAlign, rcFile, rcPositioning, rcFunction, rcUnsupported);
600 }
601 
BGZThreadFileWhack(BGZThreadFile * const self)602 static void BGZThreadFileWhack(BGZThreadFile *const self)
603 {
604     KThreadCancel(self->th);
605     KThreadWait(self->th, NULL);
606     BGZFileWhack(&self->file);
607     KConditionRelease(self->need_data);
608     KConditionRelease(self->have_data);
609     KLockRelease(self->lock);
610     KThreadRelease(self->th);
611 }
612 
BGZThreadFileInit(BGZThreadFile * self,const KFile * kfp,BGZFile_vt * vt)613 static rc_t BGZThreadFileInit(BGZThreadFile *self, const KFile *kfp, BGZFile_vt *vt)
614 {
615     rc_t rc;
616     static BGZFile_vt const my_vt = {
617         (rc_t (*)(void *, zlib_block_t, unsigned *))BGZThreadFileRead,
618         (uint64_t (*)(void const *))BGZThreadFileGetPos,
619         (float (*)(void const *))BGZThreadFileProPos,
620         (uint64_t (*)(void const *))BGZThreadFileGetSize,
621         (rc_t (*)(void *, uint64_t))BGZThreadFileSetPos,
622         (void (*)(void *))BGZThreadFileWhack
623     };
624 
625     memset(self, 0, sizeof(*self));
626 
627     rc = BGZFileInit(&self->file, kfp, vt);
628     if (rc == 0) {
629         rc = KLockMake(&self->lock);
630         if (rc == 0) {
631             rc = KConditionMake(&self->have_data);
632             if (rc == 0) {
633                 rc = KConditionMake(&self->need_data);
634                 if (rc == 0) {
635                     rc = KThreadMake(&self->th, BGZThreadFileMain, self);
636                     if (rc == 0) {
637                         *vt = my_vt;
638                         return 0;
639                     }
640                     KConditionRelease(self->need_data);
641                 }
642                 KConditionRelease(self->have_data);
643             }
644             KLockRelease(self->lock);
645         }
646         BGZFileWhack(&self->file);
647     }
648     memset(self, 0, sizeof(*self));
649     memset(vt, 0, sizeof(*vt));
650     return rc;
651 }
652 
653 #endif
654 
655 /* MARK: BAMFile structures */
656 #define MAX_BIN 37449
657 
658 typedef struct BAMFileRange {
659     BAMFilePosition start;
660     BAMFilePosition end;
661 } BAMFileRange;
662 
663 typedef struct BAMIndexReference {
664     int intervals;
665     int binSize[MAX_BIN];
666     int binStart[MAX_BIN];
667     BAMFilePosition start;
668     BAMFilePosition end;
669     BAMFileRange *bin;
670     BAMFilePosition *interval;
671 } BAMIndexReference;
672 
673 struct BAMIndex {
674     int numRefs;
675     BAMFileRange *rng;
676     BAMFilePosition *pos;
677     void *data; // allocated
678     BAMIndexReference ref[1];
679 };
680 
681 struct BAMFile {
682     uint64_t fpos_first;
683     uint64_t fpos_cur;
684 
685     union {
686         BGZFile plain;
687 #ifndef WINDOWS
688         BGZThreadFile thread;
689 #endif
690     } file;
691     BGZFile_vt vt;
692 
693     BAMRefSeq *refSeq;          /* pointers into headerData1 except name points into headerData2 */
694     BAMReadGroup *readGroup;    /* pointers into headerData1 */
695     char const *version;
696     char const *header;
697     char    *headerData1;       /* gets used for refSeq and readGroup */
698     uint8_t *headerData2;       /* gets used for refSeq */
699     BAMAlignment *bufLocker;
700     BAMAlignment *nocopy;       /* used to hold current record for BAMFileRead2 */
701     BAMIndex const *ndx;
702 
703     size_t nocopy_size;
704 
705     unsigned refSeqs;
706     unsigned readGroups;
707 
708     KRefcount refcount;
709     unsigned ucfirst;           /* offset of first record in uncompressed buffer */
710     unsigned bufSize;           /* current size of uncompressed buffer */
711     unsigned bufCurrent;        /* location in uncompressed buffer of read head */
712     bool eof;
713     bool threaded;
714     zlib_block_t buffer;        /* uncompressed buffer */
715 };
716 
717 /* MARK: Alignment structures */
718 
719 struct bam_alignment_s {
720     uint8_t rID[4];
721     uint8_t pos[4];
722     uint8_t read_name_len;
723     uint8_t mapQual;
724     uint8_t bin[2];
725     uint8_t n_cigars[2];
726     uint8_t flags[2];
727     uint8_t read_len[4];
728     uint8_t mate_rID[4];
729     uint8_t mate_pos[4];
730     uint8_t ins_size[4];
731     char read_name[1 /* read_name_len */];
732 /* if you change length of read_name,
733  * adjust calculation of offsets in BAMAlignmentSetOffsets */
734 /*  uint32_t cigar[n_cigars];
735  *  uint8_t seq[(read_len + 1) / 2];
736  *  uint8_t qual[read_len];
737  *  uint8_t extra[...];
738  */
739 };
740 
741 typedef union bam_alignment_u {
742     struct bam_alignment_s cooked;
743     uint8_t raw[sizeof(struct bam_alignment_s)];
744 } bam_alignment;
745 
746 struct offset_size_s {
747     unsigned offset;
748     unsigned size; /* this is the total length of the tag; length of data is size - 3 */
749 };
750 
751 struct BAMAlignment {
752     KRefcount refcount;
753 
754     BAMFilePosition pos;
755     BAMFile *parent;
756     bam_alignment const *data;
757     uint8_t *storage;
758     unsigned datasize;
759 
760     unsigned cigar;
761     unsigned seq;
762     unsigned qual;
763     unsigned numExtra;
764     unsigned hasColor;
765     struct offset_size_s extra[1];
766 };
767 
768 static const char cigarChars[] = {
769     ct_Match,
770     ct_Insert,
771     ct_Delete,
772     ct_Skip,
773     ct_SoftClip,
774     ct_HardClip,
775     ct_Padded,
776     ct_Equal,
777     ct_NotEqual
778     /* ct_Overlap must not appear in actual BAM file */
779 };
780 
781 /* MARK: Alignment accessors */
782 
getRefSeqId(const BAMAlignment * cself)783 static int32_t getRefSeqId(const BAMAlignment *cself) {
784     return LE2HI32(cself->data->cooked.rID);
785 }
786 
getPosition(const BAMAlignment * cself)787 static int32_t getPosition(const BAMAlignment *cself) {
788     return LE2HI32(cself->data->cooked.pos);
789 }
790 
getReadNameLength(const BAMAlignment * cself)791 static uint8_t getReadNameLength(const BAMAlignment *cself) {
792     return cself->data->cooked.read_name_len;
793 }
794 
getBin(const BAMAlignment * cself)795 static uint16_t getBin(const BAMAlignment *cself) {
796     return LE2HUI16(cself->data->cooked.bin);
797 }
798 
getMapQual(const BAMAlignment * cself)799 static uint8_t getMapQual(const BAMAlignment *cself) {
800     return cself->data->cooked.mapQual;
801 }
802 
getCigarCount(const BAMAlignment * cself)803 static uint16_t getCigarCount(const BAMAlignment *cself) {
804     return LE2HUI16(cself->data->cooked.n_cigars);
805 }
806 
getFlags(const BAMAlignment * cself)807 static uint16_t getFlags(const BAMAlignment *cself) {
808     return LE2HUI16(cself->data->cooked.flags);
809 }
810 
getReadLen(const BAMAlignment * cself)811 static uint32_t getReadLen(const BAMAlignment *cself) {
812     return LE2HUI32(cself->data->cooked.read_len);
813 }
814 
getMateRefSeqId(const BAMAlignment * cself)815 static int32_t getMateRefSeqId(const BAMAlignment *cself) {
816     return LE2HI32(cself->data->cooked.mate_rID);
817 }
818 
getMatePos(const BAMAlignment * cself)819 static int32_t getMatePos(const BAMAlignment *cself) {
820     return LE2HI32(cself->data->cooked.mate_pos);
821 }
822 
getInsertSize(const BAMAlignment * cself)823 static int32_t getInsertSize(const BAMAlignment *cself) {
824     return LE2HI32(cself->data->cooked.ins_size);
825 }
826 
getReadName(const BAMAlignment * cself)827 static char const *getReadName(const BAMAlignment *cself) {
828     return &cself->data->cooked.read_name[0];
829 }
830 
getCigarBase(BAMAlignment const * cself)831 static void const *getCigarBase(BAMAlignment const *cself)
832 {
833     return &cself->data->raw[cself->cigar];
834 }
835 
opt_tag_cmp(char const a[2],char const b[2])836 static int opt_tag_cmp(char const a[2], char const b[2])
837 {
838     int const d0 = (int)a[0] - (int)b[0];
839     return d0 ? d0 : ((int)a[1] - (int)b[1]);
840 }
841 
OptTag_sort(void const * A,void const * B,void * ctx)842 static int64_t CC OptTag_sort(void const *A, void const *B, void *ctx)
843 {
844     BAMAlignment const *const self = ctx;
845     unsigned const a_off = ((struct offset_size_s const *)A)->offset;
846     unsigned const b_off = ((struct offset_size_s const *)B)->offset;
847     char const *const a = (char const *)&self->data->raw[a_off];
848     char const *const b = (char const *)&self->data->raw[b_off];
849     int const diff = opt_tag_cmp(a, b);
850 
851     return diff ? (int64_t)diff : (int64_t)a - (int64_t)b;
852 }
853 
tag_findfirst(BAMAlignment const * const self,char const tag[2])854 static unsigned tag_findfirst(BAMAlignment const *const self, char const tag[2])
855 {
856     unsigned f = 0;
857     unsigned e = self->numExtra;
858 
859     while (f < e) {
860         unsigned const m = (f + e) >> 1;
861         char const *const mtag = (char const *)&self->data->raw[self->extra[m].offset];
862         int const d = opt_tag_cmp(tag, mtag);
863 
864         if (d > 0)
865             f = m + 1;
866         else
867             e = m;
868     }
869     return f;
870 }
871 
tag_runlength(BAMAlignment const * const self,char const tag[2],unsigned const at)872 static unsigned tag_runlength(BAMAlignment const *const self,
873                               char const tag[2],
874                               unsigned const at)
875 {
876     unsigned n;
877 
878     for (n = 0; n + at < self->numExtra; ++n) {
879         if (opt_tag_cmp(tag, (char const *)&self->data->raw[self->extra[n + at].offset]) != 0)
880             break;
881     }
882     return n;
883 }
884 
tag_search(BAMAlignment const * const self,char const tag[2],int const which)885 static struct offset_size_s const *tag_search(BAMAlignment const *const self,
886                                               char const tag[2],
887                                               int const which)
888 {
889     unsigned const fnd = tag_findfirst(self, tag);
890     unsigned const run = tag_runlength(self, tag, fnd);
891     unsigned const want = which < 0 ? (run + which) : which;
892 
893     return run == 0 ? NULL : &self->extra[fnd + (want % run)];
894 }
895 
get_RG(BAMAlignment const * cself)896 static char const *get_RG(BAMAlignment const *cself)
897 {
898     struct offset_size_s const *const x = tag_search(cself, "RG", 0);
899     return (char const *)(x && cself->data->raw[x->offset + 2] == 'Z' ? &cself->data->raw[x->offset + 3] : NULL);
900 }
901 
get_CS_info(BAMAlignment const * cself)902 static struct offset_size_s const *get_CS_info(BAMAlignment const *cself)
903 {
904     return tag_search(cself, "CS", 0);
905 }
906 
get_CQ_info(BAMAlignment const * cself)907 static struct offset_size_s const *get_CQ_info(BAMAlignment const *cself)
908 {
909     return tag_search(cself, "CQ", 0);
910 }
911 
get_CS(BAMAlignment const * cself)912 static char const *get_CS(BAMAlignment const *cself)
913 {
914     struct offset_size_s const *const x = get_CS_info(cself);
915     return (char const *)(x && cself->data->raw[x->offset + 2] == 'Z' ? &cself->data->raw[x->offset + 3] : NULL);
916 }
917 
get_CQ(BAMAlignment const * cself)918 static uint8_t const *get_CQ(BAMAlignment const *cself)
919 {
920     struct offset_size_s const *const x = get_CQ_info(cself);
921     return (uint8_t const *)(x && cself->data->raw[x->offset + 2] == 'Z' ? &cself->data->raw[x->offset + 3] : NULL);
922 }
923 
get_OQ_info(BAMAlignment const * cself)924 static struct offset_size_s const *get_OQ_info(BAMAlignment const *cself)
925 {
926     return tag_search(cself, "OQ", 0);
927 }
928 
get_OQ(BAMAlignment const * cself)929 static uint8_t const *get_OQ(BAMAlignment const *cself)
930 {
931     struct offset_size_s const *const x = get_OQ_info(cself);
932     return (uint8_t const *)(x && cself->data->raw[x->offset + 2] == 'Z' ? &cself->data->raw[x->offset + 3] : NULL);
933 }
934 
get_XT(BAMAlignment const * cself)935 static char const *get_XT(BAMAlignment const *cself)
936 {
937     struct offset_size_s const *const x = tag_search(cself, "XT", 0);
938     return (char const *)(x && cself->data->raw[x->offset + 2] == 'Z' ? &cself->data->raw[x->offset + 3] : NULL);
939 }
940 
get_XS(BAMAlignment const * cself)941 static uint8_t const *get_XS(BAMAlignment const *cself)
942 {
943     struct offset_size_s const *const x = tag_search(cself, "XS", -1); /* want last one */
944     return (uint8_t const *)(x && cself->data->raw[x->offset + 2] == 'A' ? &cself->data->raw[x->offset + 3] : NULL);
945 }
946 
get_CG_ZA_info(BAMAlignment const * cself)947 static struct offset_size_s const *get_CG_ZA_info(BAMAlignment const *cself)
948 {
949     struct offset_size_s const *const x = tag_search(cself, "ZA", 0);
950     return x;
951 }
952 
get_CG_ZI_info(BAMAlignment const * cself)953 static struct offset_size_s const *get_CG_ZI_info(BAMAlignment const *cself)
954 {
955     struct offset_size_s const *const x = tag_search(cself, "ZI", 0);
956     return x;
957 }
958 
get_CG_GC_info(BAMAlignment const * cself)959 static struct offset_size_s const *get_CG_GC_info(BAMAlignment const *cself)
960 {
961     struct offset_size_s const *const x = tag_search(cself, "GC", 0);
962     return x;
963 }
964 
get_CG_GS_info(BAMAlignment const * cself)965 static struct offset_size_s const *get_CG_GS_info(BAMAlignment const *cself)
966 {
967     struct offset_size_s const *const x = tag_search(cself, "GS", 0);
968     return x;
969 }
970 
get_CG_GQ_info(BAMAlignment const * cself)971 static struct offset_size_s const *get_CG_GQ_info(BAMAlignment const *cself)
972 {
973     struct offset_size_s const *const x = tag_search(cself, "GQ", 0);
974     return x;
975 }
976 
977 /* MARK: BAMFile Reading functions */
978 
979 /* returns (rcData, rcInsufficient) if eof */
BAMFileFillBuffer(BAMFile * self)980 static rc_t BAMFileFillBuffer(BAMFile *self)
981 {
982     rc_t const rc = self->vt.FileRead(&self->file, self->buffer, &self->bufSize);
983     if (rc)
984         return rc;
985     if (self->bufSize == 0 || self->bufSize <= self->bufCurrent)
986         return SILENT_RC(rcAlign, rcFile, rcReading, rcData, rcInsufficient);
987     return 0;
988 }
989 
BAMFileReadn(BAMFile * self,const unsigned len,uint8_t dst[])990 static rc_t BAMFileReadn(BAMFile *self, const unsigned len, uint8_t dst[/* len */]) {
991     rc_t rc;
992     unsigned cur;
993     unsigned n = 0;
994 
995     if (len == 0)
996         return 0;
997 
998     for (cur = 0; ; cur += n) {
999         if (self->bufSize > self->bufCurrent) {
1000             n = self->bufSize - self->bufCurrent;
1001             if (cur + n > len)
1002                 n = len - cur;
1003             memmove(&dst[cur], &self->buffer[self->bufCurrent], n);
1004             self->bufCurrent += n;
1005         }
1006         if (self->bufCurrent != self->bufSize && self->bufSize != 0)
1007             return 0;
1008         if (self->bufSize != 0) {
1009             /* a seek has not just been done so update the file position.
1010              * if we didn't and a request for the position is made before the
1011              * next read, we will not have the position of the next read.
1012              *
1013              * if a seek had just been done then
1014              *    self->fpos_cur == BGZFileGetPos(&self->file)
1015              * is already true.
1016              */
1017             self->fpos_cur = self->vt.FileGetPos(&self->file);
1018             self->bufCurrent = 0;
1019             self->bufSize = 0;
1020             if (cur + n == len)
1021                 return 0;
1022         }
1023 
1024         rc = BAMFileFillBuffer(self);
1025         if (rc)
1026             return rc;
1027     }
1028 }
1029 
BAMFilePeek(BAMFile const * const self,unsigned const offset)1030 static void const *BAMFilePeek(BAMFile const *const self, unsigned const offset)
1031 {
1032     return &self->buffer[self->bufCurrent + offset];
1033 }
1034 
BAMFileMaxPeek(BAMFile const * self)1035 static unsigned BAMFileMaxPeek(BAMFile const *self)
1036 {
1037     return self->bufSize > self->bufCurrent ? self->bufSize - self->bufCurrent : 0;
1038 }
1039 
BAMFilePeekI32(BAMFile * self)1040 static int32_t BAMFilePeekI32(BAMFile *self)
1041 {
1042     return LE2HI32(BAMFilePeek(self, 0));
1043 }
1044 
BAMFileReadI32(BAMFile * self,int32_t * rhs)1045 static rc_t BAMFileReadI32(BAMFile *self, int32_t *rhs)
1046 {
1047     uint8_t buf[sizeof(int32_t)];
1048     rc_t rc = BAMFileReadn(self, sizeof(int32_t), buf);
1049 
1050     if (rc == 0)
1051         *rhs = LE2HI32(buf);
1052     return rc;
1053 }
1054 
1055 /* MARK: BAM File header parsing functions */
1056 
comp_ReadGroup(const void * a,const void * b,void * ignored)1057 static int64_t CC comp_ReadGroup(const void *a, const void *b, void *ignored) {
1058     return strcmp(((BAMReadGroup const *)a)->name, ((BAMReadGroup const *)b)->name);
1059 }
1060 
ParseHD(BAMFile * self,char hdata[],size_t hlen,unsigned * used)1061 static rc_t ParseHD(BAMFile *self, char hdata[], size_t hlen, unsigned *used)
1062 {
1063     unsigned i;
1064     unsigned tag;
1065     unsigned value;
1066     int st = 0;
1067     int ws = 1;
1068 
1069     for (i = 0; i < hlen; ++i) {
1070         char const cc = hdata[i];
1071 
1072         if (ws && isspace(cc))
1073             continue;
1074         ws = 0;
1075 
1076         switch (st) {
1077         case 0:
1078             tag = i;
1079             ++st;
1080             break;
1081         case 1:
1082             if (isspace(cc))
1083                 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1084             ++st;
1085             break;
1086         case 2:
1087             if (cc != ':')
1088                 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1089             hdata[i] = '\0';
1090             value = i + 1;
1091             ++st;
1092             break;
1093         case 3:
1094             if (cc == '\t' || cc == '\r' || cc == '\n') {
1095                 hdata[i] = '\0';
1096 
1097                 if (strcmp(&hdata[tag], "VN") == 0)
1098                     self->version = &hdata[value];
1099 
1100                 ++st;
1101                 ws = 1;
1102             }
1103             break;
1104         case 4:
1105             if (cc == '@')
1106                 goto DONE;
1107             tag = i;
1108             st = 1;
1109             break;
1110         }
1111     }
1112     if (st == 4) {
1113 DONE:
1114         *used = i;
1115         return 0;
1116     }
1117     return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1118 }
1119 
ParseSQ(BAMFile * self,char hdata[],size_t hlen,unsigned * used,unsigned const rs_by_name[])1120 static rc_t ParseSQ(BAMFile *self, char hdata[], size_t hlen, unsigned *used, unsigned const rs_by_name[])
1121 {
1122     unsigned i;
1123     unsigned tag;
1124     unsigned value;
1125     int st = 0;
1126     int ws = 1;
1127     BAMRefSeq rs;
1128 
1129     memset(&rs, 0, sizeof(rs));
1130 
1131     for (i = 0; i < hlen; ++i) {
1132         char const cc = hdata[i];
1133 
1134         if (ws && isspace(cc))
1135             continue;
1136         ws = 0;
1137 
1138         switch (st) {
1139         case 0:
1140             tag = i;
1141             ++st;
1142             break;
1143         case 1:
1144             if (isspace(cc))
1145                 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1146             ++st;
1147             break;
1148         case 2:
1149 #define HACKAMATIC 1
1150 #if HACKAMATIC
1151             if (cc != ':') {
1152                 if (i + 1 >= hlen || hdata[i+1] != ':')
1153                     return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1154                 else
1155                     ++i;
1156             }
1157 #else
1158             if (cc != ':')
1159                 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1160 #endif
1161             hdata[i] = '\0';
1162             value = i + 1;
1163             ++st;
1164             break;
1165         case 3:
1166             if (cc == '\t' || cc == '\r' || cc == '\n') {
1167                 unsigned j;
1168 
1169                 hdata[i] = '\0';
1170 
1171                 while (value < i && isspace(hdata[value]))
1172                     ++value;
1173                 for (j = i; value < j && isspace(hdata[j - 1]); )
1174                     hdata[--j] = '\0';
1175 
1176                 if (strcmp(&hdata[tag], "SN") == 0)
1177                     rs.name = &hdata[value];
1178                 else if (strcmp(&hdata[tag], "LN") == 0)
1179                     rs.length = strtou64(&hdata[value], NULL, 10);
1180                 else if (strcmp(&hdata[tag], "AS") == 0)
1181                     rs.assemblyId = &hdata[value];
1182 #if HACKAMATIC
1183                 else if (strcmp(&hdata[tag], "M5") == 0 || strcmp(&hdata[tag], "MD5") == 0)
1184 #else
1185                 else if (strcmp(&hdata[tag], "M5") == 0)
1186 #endif
1187 #undef HACKAMATIC
1188                 {
1189                     unsigned len = j - value;
1190 
1191                     if ((hdata[value] == '\'' || hdata[value] == '"') && hdata[value + len - 1] == hdata[value]) {
1192                         ++value;
1193                         len -= 2;
1194                     }
1195                     if (len == 32) {
1196                         rs.checksum = &rs.checksum_array[0];
1197                         for (j = 0; j != 16; ++j) {
1198                             int const ch1 = toupper(hdata[value + j * 2 + 0]);
1199                             int const ch2 = toupper(hdata[value + j * 2 + 1]);
1200 
1201                             if (isxdigit(ch1) && isxdigit(ch2)) {
1202                                 rs.checksum_array[j] =
1203                                     ((ch1 > '9' ? (ch1 - ('A' - 10)) : (ch1 - '0')) << 4) +
1204                                      (ch2 > '9' ? (ch2 - ('A' - 10)) : (ch2 - '0'));
1205                             }
1206                             else {
1207                                 rs.checksum = NULL;
1208                                 break;
1209                             }
1210                         }
1211                     }
1212                 }
1213                 else if (strcmp(&hdata[tag], "UR") == 0)
1214                     rs.uri = &hdata[value];
1215                 else if (strcmp(&hdata[tag], "SP") == 0)
1216                     rs.species = &hdata[value];
1217 
1218                 ++st;
1219                 ws = 1;
1220             }
1221             break;
1222         case 4:
1223             if (cc == '@')
1224                 goto DONE;
1225             tag = i;
1226             st = 1;
1227             break;
1228         }
1229     }
1230 DONE:
1231     if (st == 4) {
1232         unsigned f = 0;
1233         unsigned e = self->refSeqs;
1234 
1235         if (rs.name == NULL) /* required tags */
1236             return RC(rcAlign, rcFile, rcParsing, rcConstraint, rcViolated);
1237 
1238         while (f < e) {
1239             unsigned const m = (f + e) >> 1;
1240             BAMRefSeq *const x = &self->refSeq[rs_by_name[m]];
1241             int const cmp = strcmp(rs.name, x->name);
1242 
1243             if (cmp < 0)
1244                 e = m;
1245             else if (cmp > 0)
1246                 f = m + 1;
1247             else {
1248                 x->assemblyId = rs.assemblyId;
1249                 x->uri = rs.uri;
1250                 x->species = rs.species;
1251                 if (rs.checksum) {
1252                     x->checksum = &x->checksum_array[0];
1253                     memmove(x->checksum_array, rs.checksum_array, 16);
1254                 }
1255                 else
1256                     x->checksum = NULL;
1257                 break;
1258             }
1259         }
1260         *used = i;
1261         return 0;
1262     }
1263     return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1264 }
1265 
ParseRG(BAMFile * self,char hdata[],size_t hlen,unsigned * used,BAMReadGroup * dst)1266 static rc_t ParseRG(BAMFile *self, char hdata[], size_t hlen, unsigned *used, BAMReadGroup *dst)
1267 {
1268     unsigned i;
1269     unsigned tag;
1270     unsigned value;
1271     int st = 0;
1272     int ws = 1;
1273 #if _DEBUGGING
1274 /*    char const *cur = hdata; */
1275 #endif
1276 
1277     memset(dst, 0, sizeof(*dst));
1278 
1279     for (i = 0; i < hlen; ++i) {
1280         char const cc = hdata[i];
1281 
1282         if (ws && isspace(cc))
1283             continue;
1284         ws = 0;
1285 
1286         switch (st) {
1287         case 0:
1288             tag = i;
1289             ++st;
1290             break;
1291         case 1:
1292             if (isspace(cc))
1293                 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1294             ++st;
1295             break;
1296         case 2:
1297             if (cc != ':')
1298                 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1299 #if _DEBUGGING
1300             /* cur = hdata + i + 1; */
1301 #endif
1302             hdata[i] = '\0';
1303             value = i + 1;
1304             ++st;
1305             break;
1306         case 3:
1307             if (cc == '\t' || cc == '\r' || cc == '\n') {
1308                 unsigned j = i;
1309 #if _DEBUGGING
1310                 /* cur = hdata + i + 1; */
1311 #endif
1312                 hdata[i] = '\0';
1313 
1314                 while (value < i && isspace(hdata[value]))
1315                     ++value;
1316                 while (value < j && isspace(hdata[j - 1]))
1317                     hdata[--j] = '\0';
1318 
1319                 if ((hdata[value] == '\"' || hdata[value] == '\'') && hdata[value] == hdata[j - 1]) {
1320                     ++value;
1321                     hdata[j - 1] = '\0';
1322                 }
1323                 if (strcmp(&hdata[tag], "ID") == 0)
1324                     dst->name = &hdata[value];
1325                 else if (strcmp(&hdata[tag], "SM") == 0)
1326                     dst->sample = &hdata[value];
1327                 else if (strcmp(&hdata[tag], "LB") == 0)
1328                     dst->library = &hdata[value];
1329                 else if (strcmp(&hdata[tag], "DS") == 0)
1330                     dst->description = &hdata[value];
1331                 else if (strcmp(&hdata[tag], "PU") == 0)
1332                     dst->unit = &hdata[value];
1333                 else if (strcmp(&hdata[tag], "PI") == 0)
1334                     dst->insertSize = &hdata[value];
1335                 else if (strcmp(&hdata[tag], "CN") == 0)
1336                     dst->center = &hdata[value];
1337                 else if (strcmp(&hdata[tag], "DT") == 0)
1338                     dst->runDate = &hdata[value];
1339                 else if (strcmp(&hdata[tag], "PL") == 0)
1340                     dst->platform = &hdata[value];
1341 
1342                 ++st;
1343                 ws = 1;
1344             }
1345             break;
1346         case 4:
1347             if (cc == '@')
1348                 goto DONE;
1349             tag = i;
1350             st = 1;
1351             break;
1352         }
1353     }
1354     if (st == 4) {
1355 DONE:
1356         *used = i;
1357         if (dst->name == NULL) /* required */
1358             return RC(rcAlign, rcFile, rcParsing, rcConstraint, rcViolated);
1359         return 0;
1360     }
1361     return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1362 }
1363 
ParseHeader(BAMFile * self,char hdata[],size_t hlen,unsigned const rs_by_name[])1364 static rc_t ParseHeader(BAMFile *self, char hdata[], size_t hlen, unsigned const rs_by_name[]) {
1365     unsigned rg = 0;
1366     unsigned i;
1367     unsigned tag;
1368     int st = 0;
1369     int ws = 1;
1370     unsigned used;
1371     rc_t rc;
1372 
1373     for (i = 0; i < hlen; ++i) {
1374         char const cc = hdata[i];
1375 
1376         if (ws && isspace(cc))
1377             continue;
1378         ws = 0;
1379 
1380         switch (st) {
1381         case 0:
1382             if (cc == '@')
1383                 ++st;
1384             else
1385                 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1386             break;
1387         case 1:
1388             if (isspace(cc))
1389                 return RC(rcAlign, rcFile, rcParsing, rcData, rcInvalid);
1390             tag = i;
1391             ++st;
1392             break;
1393         case 2:
1394             if (isspace(cc)) {
1395                 hdata[i] = '\0';
1396                 if (i - tag == 2) {
1397                     if (strcmp(&hdata[tag], "HD") == 0) {
1398                         rc = ParseHD(self, &hdata[i + 1], hlen - i - 1, &used);
1399                         if (rc) return rc;
1400                         i += used;
1401                         st = 0;
1402                     }
1403                     else if (strcmp(&hdata[tag], "SQ") == 0) {
1404                         rc = ParseSQ(self, &hdata[i + 1], hlen - i - 1, &used, rs_by_name);
1405                         if (rc) return rc;
1406                         i += used;
1407                         st = 0;
1408                     }
1409                     else if (strcmp(&hdata[tag], "RG") == 0) {
1410                         rc = ParseRG(self, &hdata[i + 1], hlen - i - 1, &used, &self->readGroup[rg]);
1411                         if (GetRCObject(rc) == rcConstraint && GetRCState(rc) == rcViolated) {
1412                             (void)LOGERR(klogWarn, rc, "Read Group is missing ID in BAM header");
1413                             rc = 0;
1414                             if (self->readGroups) --self->readGroups;
1415                         }
1416                         else if (rc)
1417                             return rc;
1418                         else
1419                             ++rg;
1420                         i += used;
1421                         st = 0;
1422                     }
1423                 }
1424                 if (st == 2) {
1425                     ++st;
1426                     ws = 0;
1427                 }
1428             }
1429             else if (i - tag > 2)
1430                 ++st;
1431             break;
1432         case 3:
1433             if (cc == '\r' || cc == '\n') {
1434                 st = 0;
1435                 ws = 1;
1436             }
1437             break;
1438         }
1439     }
1440     ksort( self->readGroup, self->readGroups, sizeof(self->readGroup[0]), comp_ReadGroup, NULL );
1441     for (rg = 0; rg != self->readGroups; ++rg) {
1442         if (rg > 0 && strcmp(self->readGroup[rg - 1].name, self->readGroup[rg].name) == 0)
1443             return RC(rcAlign, rcFile, rcParsing, rcConstraint, rcViolated);  /* name must be unique */
1444         self->readGroup[rg].id = rg;
1445     }
1446     for (i = 0; i < self->refSeqs; ++i) {
1447         if (self->refSeq[i].length == 0) {
1448             (void)PLOGMSG(klogWarn, (klogWarn, "Reference '$(ref)' has zero length", "ref=%s", self->refSeq[i].name));
1449         }
1450     }
1451 
1452     return 0;
1453 }
1454 
CountReadGroups(char const txt[],size_t len,unsigned * reads)1455 static rc_t CountReadGroups(char const txt[], size_t len, unsigned *reads) {
1456     const char *const endp = txt + len;
1457 
1458     *reads = 0;
1459 
1460     do {
1461         while (txt != endp && isspace(*txt))
1462             ++txt;
1463         if (txt == endp || txt + 3 >= endp)
1464             break;
1465 
1466         if (txt[0] == '@' && txt[1] == 'R' && txt[2] == 'G')
1467             ++*reads;
1468 
1469         txt = memchr(txt, '\n', endp - txt);
1470     } while (txt);
1471     return 0;
1472 }
1473 
ReadMagic(BAMFile * self)1474 static rc_t ReadMagic(BAMFile *self)
1475 {
1476     uint8_t sig[4];
1477     rc_t rc = BAMFileReadn(self, 4, sig);
1478 
1479     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("BAM signature: '%c%c%c' %u\n", sig[0], sig[1], sig[2], sig[3]));
1480     if (rc == 0 && (sig[0] != 'B' || sig[1] != 'A' || sig[2] != 'M' || sig[3] != 1))
1481         rc = RC(rcAlign, rcFile, rcReading, rcHeader, rcBadVersion);
1482     return rc;
1483 }
1484 
ReadHeaders(BAMFile * self,char ** headerText,size_t * headerTextLen,uint8_t ** refData,unsigned * numrefs)1485 static rc_t ReadHeaders(BAMFile *self,
1486                         char **headerText, size_t *headerTextLen,
1487                         uint8_t **refData, unsigned *numrefs)
1488 {
1489     unsigned hlen;
1490     char *htxt = NULL;
1491     unsigned nrefs;
1492     uint8_t *rdat = NULL;
1493     unsigned rdsz;
1494     unsigned rdms;
1495     unsigned i;
1496     int32_t i32;
1497     rc_t rc = BAMFileReadI32(self, &i32);
1498 
1499     if (rc) return rc;
1500 
1501     if (i32 < 0) {
1502         rc = RC(rcAlign, rcFile, rcReading, rcHeader, rcInvalid);
1503         goto BAILOUT;
1504     }
1505     hlen = i32;
1506     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("BAM Header text size: %u\n", hlen));
1507     if (hlen) {
1508         htxt = malloc(hlen + 1);
1509         if (htxt == NULL) {
1510             rc = RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
1511             goto BAILOUT;
1512         }
1513 
1514         rc = BAMFileReadn(self, hlen, (uint8_t *)htxt); if (rc) goto BAILOUT;
1515         htxt[hlen] = '\0';
1516     }
1517     rc = BAMFileReadI32(self, &i32); if (rc) goto BAILOUT;
1518     if (i32 < 0) {
1519         rc = RC(rcAlign, rcFile, rcReading, rcHeader, rcInvalid);
1520         goto BAILOUT;
1521     }
1522     nrefs = i32;
1523     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("BAM Header reference count: %u\n", nrefs));
1524     if (nrefs) {
1525         rdms = nrefs * 16;
1526         if (rdms < 4096)
1527             rdms = 4096;
1528         rdat = malloc(rdms);
1529         if (rdat == NULL) {
1530             rc = RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
1531             goto BAILOUT;
1532         }
1533         for (i = rdsz = 0; i < nrefs; ++i) {
1534             rc = BAMFileReadI32(self, &i32); if (rc) goto BAILOUT;
1535             if (i32 <= 0) {
1536                 rc = RC(rcAlign, rcFile, rcReading, rcHeader, rcInvalid);
1537                 goto BAILOUT;
1538             }
1539             if (rdsz + i32 + 8 > rdms) {
1540                 void *tmp;
1541 
1542                 do { rdms <<= 1; } while (rdsz + i32 + 8 > rdms);
1543                 tmp = realloc(rdat, rdms);
1544                 if (tmp == NULL) {
1545                     rc = RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
1546                     goto BAILOUT;
1547                 }
1548                 rdat = tmp;
1549             }
1550             memmove(rdat + rdsz, &i32, 4);
1551             rdsz += 4;
1552             rc = BAMFileReadn(self, i32, &rdat[rdsz]); if (rc) goto BAILOUT;
1553             rdsz += i32;
1554             rc = BAMFileReadI32(self, &i32); if (rc) goto BAILOUT;
1555             memmove(rdat + rdsz, &i32, 4);
1556             rdsz += 4;
1557         }
1558     }
1559     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("BAM Header reference size: %u\n", rdsz));
1560 
1561     *headerText = htxt;
1562     *headerTextLen = hlen;
1563     *refData = rdat;
1564     *numrefs = nrefs;
1565     return 0;
1566 
1567 BAILOUT:
1568     if (htxt)
1569         free(htxt);
1570     if (rdat)
1571         free(rdat);
1572 
1573     return rc;
1574 }
1575 
comp_RefSeqName(const void * A,const void * B,void * ignored)1576 static int64_t CC comp_RefSeqName(const void *A, const void *B, void *ignored) {
1577     BAMFile const *self = (BAMFile const *)ignored;
1578     unsigned const a = *(unsigned const *)A;
1579     unsigned const b = *(unsigned const *)B;
1580 
1581     return strcmp(self->refSeq[a].name, self->refSeq[b].name);
1582 }
1583 
ProcessHeader(BAMFile * self,char const headerText[])1584 static rc_t ProcessHeader(BAMFile *self, char const headerText[])
1585 {
1586     unsigned *rs_by_name = NULL;
1587     unsigned i;
1588     unsigned cp;
1589     char *htxt;
1590     uint8_t *rdat;
1591     size_t hlen;
1592     unsigned nrefs;
1593     rc_t rc = ReadMagic(self);
1594 
1595     if (rc) return rc;
1596 
1597     rc = ReadHeaders(self, &htxt, &hlen, &rdat, &nrefs);
1598     if (rc) return rc;
1599 
1600     self->fpos_first = self->fpos_cur;
1601     self->ucfirst = self->bufCurrent;
1602     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("BAM Data records start at: %lu+%u\n", self->ucfirst, self->fpos_first));
1603 
1604     if (headerText) {
1605         free(htxt);
1606         hlen = string_size( headerText );
1607         htxt = malloc(hlen + 1);
1608         if (htxt == NULL) {
1609             free(rdat);
1610             return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
1611         }
1612         memmove(htxt, headerText, hlen + 1);
1613     }
1614 
1615     self->headerData2 = rdat;
1616     if (hlen) {
1617         self->header = htxt;
1618         self->headerData1 = malloc(hlen + 1);
1619         if (self->headerData1 == NULL)
1620             return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
1621         memmove(self->headerData1, self->header, hlen + 1);
1622     }
1623     else {
1624         htxt = malloc(1);
1625         htxt[0] = '\0';
1626         self->header = htxt;
1627         self->headerData1 = NULL;
1628     }
1629     self->refSeqs = nrefs;
1630     if (nrefs) {
1631         self->refSeq = calloc(nrefs, sizeof(self->refSeq[0]));
1632         if (self->refSeq == NULL)
1633             return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
1634 
1635         rs_by_name = calloc(nrefs, sizeof(rs_by_name[0]));
1636         if (rs_by_name == NULL)
1637             return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
1638 
1639         for (i = cp = 0; i < nrefs; ++i) {
1640             uint32_t nlen;
1641             uint32_t rlen;
1642 
1643             rs_by_name[i] = i;
1644             self->refSeq[i].id = i;
1645             memmove(&nlen, &self->headerData2[cp], 4);
1646             cp += 4;
1647             self->refSeq[i].name = (char const *)&self->headerData2[cp];
1648             cp += nlen;
1649             memmove(&rlen, &self->headerData2[cp], 4);
1650             self->headerData2[cp] = 0;
1651             cp += 4;
1652             self->refSeq[i].length = rlen;
1653         }
1654         ksort((void *)rs_by_name, self->refSeqs, sizeof(rs_by_name[0]), comp_RefSeqName, self);
1655     }
1656     if (self->headerData1) {
1657         rc = CountReadGroups(self->headerData1, hlen, &self->readGroups);
1658         if (rc == 0) {
1659             self->readGroup = calloc(self->readGroups, sizeof(self->readGroup[0]));
1660             if (self->readGroup != NULL)
1661                 rc = ParseHeader(self, self->headerData1, hlen, rs_by_name);
1662             else
1663                 rc = RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
1664         }
1665     }
1666     if (rs_by_name != NULL)
1667         free((void *)rs_by_name);
1668 
1669     return rc;
1670 }
1671 
1672 /* MARK: BAM File destructor */
1673 
1674 static rc_t BAMIndexWhack(const BAMIndex *);
1675 
BAMFileWhack(BAMFile * self)1676 static rc_t BAMFileWhack(BAMFile *self) {
1677     if (self->refSeq)
1678         free(self->refSeq);
1679     if (self->readGroup)
1680         free(self->readGroup);
1681     if (self->header)
1682         free((void *)self->header);
1683     if (self->headerData1)
1684         free((void *)self->headerData1);
1685     if (self->headerData2)
1686         free((void *)self->headerData2);
1687     if (self->ndx)
1688         BAMIndexWhack(self->ndx);
1689     if (self->nocopy)
1690         free(self->nocopy);
1691     if (self->vt.FileWhack)
1692         self->vt.FileWhack(&self->file);
1693 
1694     return 0;
1695 }
1696 
1697 /* MARK: BAM File constructors */
1698 
1699 /* file is retained */
BAMFileMakeWithKFileAndHeader(BAMFile const ** cself,KFile const * file,char const * headerText,bool threaded)1700 static rc_t BAMFileMakeWithKFileAndHeader(BAMFile const **cself,
1701                                           KFile const *file,
1702                                           char const *headerText,
1703                                           bool threaded)
1704 {
1705     BAMFile *self = calloc(1, sizeof(*self));
1706     rc_t rc;
1707 
1708     if (self == NULL)
1709         return RC(rcAlign, rcFile, rcConstructing, rcMemory, rcExhausted);
1710 
1711     KRefcountInit(&self->refcount, 1, "BAMFile", "new", "");
1712 #ifndef WINDOWS
1713     if (threaded)
1714         rc = BGZThreadFileInit(&self->file.thread, file, &self->vt);
1715     else
1716 #endif
1717         rc = BGZFileInit(&self->file.plain, file, &self->vt);
1718 
1719     if (rc == 0) {
1720         rc = ProcessHeader(self, headerText);
1721         if (rc == 0) {
1722             *cself = self;
1723             return 0;
1724         }
1725     }
1726     BAMFileWhack(self);
1727     return rc;
1728 }
1729 
1730 /* file is retained */
BAMFileMakeWithKFile(const BAMFile ** cself,const KFile * file)1731 LIB_EXPORT rc_t CC BAMFileMakeWithKFile(const BAMFile **cself, const KFile *file)
1732 {
1733     return BAMFileMakeWithKFileAndHeader(cself, file, NULL, false);
1734 }
1735 
BAMFileVMakeWithDir(const BAMFile ** result,const KDirectory * dir,const char * path,va_list args)1736 LIB_EXPORT rc_t CC BAMFileVMakeWithDir(const BAMFile **result,
1737                                          const KDirectory *dir,
1738                                          const char *path,
1739                                          va_list args
1740                                          )
1741 {
1742     rc_t rc;
1743     const KFile *kf;
1744 
1745     if (result == NULL)
1746         return RC(rcAlign, rcFile, rcOpening, rcParam, rcNull);
1747     *result = NULL;
1748     rc = KDirectoryVOpenFileRead(dir, &kf, path, args);
1749     if (rc == 0) {
1750         rc = BAMFileMakeWithKFile(result, kf);
1751         KFileRelease(kf);
1752     }
1753     return rc;
1754 }
1755 
BAMFileMakeWithDir(const BAMFile ** result,const KDirectory * dir,const char * path,...)1756 LIB_EXPORT rc_t CC BAMFileMakeWithDir(const BAMFile **result,
1757                                         const KDirectory *dir,
1758                                         const char *path, ...
1759                                         )
1760 {
1761     va_list args;
1762     rc_t rc;
1763 
1764     va_start(args, path);
1765     rc = BAMFileVMakeWithDir(result, dir, path, args);
1766     va_end(args);
1767     return rc;
1768 }
1769 
BAMFileMake(const BAMFile ** cself,const char * path,...)1770 LIB_EXPORT rc_t CC BAMFileMake(const BAMFile **cself, const char *path, ...)
1771 {
1772     KDirectory *dir;
1773     va_list args;
1774     rc_t rc;
1775 
1776     if (cself == NULL)
1777         return RC(rcAlign, rcFile, rcOpening, rcParam, rcNull);
1778     *cself = NULL;
1779 
1780     rc = KDirectoryNativeDir(&dir);
1781     if (rc) return rc;
1782     va_start(args, path);
1783     rc = BAMFileVMakeWithDir(cself, dir, path, args);
1784     va_end(args);
1785     KDirectoryRelease(dir);
1786     return rc;
1787 }
1788 
BAMFileMakeWithHeader(const BAMFile ** cself,char const headerText[],char const path[],...)1789 LIB_EXPORT rc_t CC BAMFileMakeWithHeader ( const BAMFile **cself,
1790                                           char const headerText[],
1791                                           char const path[], ... )
1792 {
1793     KDirectory *dir;
1794     va_list args;
1795     rc_t rc;
1796     const KFile *kf;
1797 
1798     if (cself == NULL)
1799         return RC(rcAlign, rcFile, rcOpening, rcParam, rcNull);
1800     *cself = NULL;
1801 
1802     rc = KDirectoryNativeDir(&dir);
1803     if (rc) return rc;
1804     va_start(args, path);
1805     rc = KDirectoryVOpenFileRead(dir, &kf, path, args);
1806     if (rc == 0) {
1807         rc = BAMFileMakeWithKFileAndHeader(cself, kf, headerText, false);
1808         KFileRelease(kf);
1809     }
1810     va_end(args);
1811     KDirectoryRelease(dir);
1812     return rc;
1813 }
1814 
BAMFileMakeWithVPath(const BAMFile ** cself,const VPath * path)1815 LIB_EXPORT rc_t CC BAMFileMakeWithVPath(const BAMFile **cself, const VPath *path)
1816 {
1817     VFSManager *vfs = NULL;
1818     KFile const *fp = NULL;
1819     rc_t rc = 0;
1820 
1821     rc = VFSManagerMake(&vfs);
1822     if (rc) return rc;
1823 
1824     rc = VFSManagerOpenFileRead ( vfs, &fp, path);
1825     VFSManagerRelease(vfs);
1826     if (rc) return rc;
1827 
1828     rc = BAMFileMakeWithKFile(cself, fp);
1829     if (rc) return rc;
1830 
1831     KFileRelease(fp);
1832     return 0;
1833 }
1834 
1835 /* MARK: BAM File ref-counting */
1836 
BAMFileAddRef(const BAMFile * cself)1837 LIB_EXPORT rc_t CC BAMFileAddRef(const BAMFile *cself) {
1838     if (cself != NULL)
1839         KRefcountAdd(&cself->refcount, "BAMFile");
1840     return 0;
1841 }
1842 
BAMFileRelease(const BAMFile * cself)1843 LIB_EXPORT rc_t CC BAMFileRelease(const BAMFile *cself) {
1844     rc_t rc = 0;
1845     BAMFile *self = (BAMFile *)cself;
1846 
1847     if (cself != NULL) {
1848         if (KRefcountDrop(&self->refcount, "BAMFile") == krefWhack) {
1849             rc = BAMFileWhack(self);
1850             free(self);
1851         }
1852     }
1853     return rc;
1854 }
1855 
1856 /* MARK: BAM File positioning */
1857 
BAMFileGetProportionalPosition(const BAMFile * self)1858 LIB_EXPORT float CC BAMFileGetProportionalPosition(const BAMFile *self)
1859 {
1860     return self->vt.FileProPos(&self->file);
1861 }
1862 
BAMFileGetPositionInt(const BAMFile * self)1863 static BAMFilePosition BAMFileGetPositionInt(const BAMFile *self) {
1864     return (self->fpos_cur << 16) | self->bufCurrent;
1865 }
1866 
BAMFileGetPosition(const BAMFile * self,BAMFilePosition * pos)1867 LIB_EXPORT rc_t CC BAMFileGetPosition(const BAMFile *self, BAMFilePosition *pos) {
1868     *pos = BAMFileGetPositionInt(self);
1869     return 0;
1870 }
1871 
BAMFileSetPositionInt(const BAMFile * cself,uint64_t fpos,uint16_t bpos)1872 static rc_t BAMFileSetPositionInt(const BAMFile *cself, uint64_t fpos, uint16_t bpos)
1873 {
1874     rc_t rc;
1875     BAMFile *self = (BAMFile *)cself;
1876 
1877     if (cself->fpos_first > fpos || fpos > cself->vt.FileGetSize(&cself->file) ||
1878         (fpos == cself->fpos_first && bpos < cself->ucfirst))
1879     {
1880         return RC(rcAlign, rcFile, rcPositioning, rcParam, rcInvalid);
1881     }
1882     if (cself->fpos_cur == fpos) {
1883         if (bpos <= cself->bufSize) {
1884             self->eof = false;
1885             self->bufCurrent = bpos;
1886             return 0;
1887         }
1888         return RC(rcAlign, rcFile, rcPositioning, rcParam, rcInvalid);
1889     }
1890     rc = self->vt.FileSetPos(&self->file, fpos);
1891     if (rc == 0) {
1892         self->eof = false;
1893         self->bufSize = 0; /* force re-read */
1894         self->bufCurrent = bpos;
1895         self->fpos_cur = fpos;
1896     }
1897     return rc;
1898 }
1899 
BAMFileSetPosition(const BAMFile * cself,const BAMFilePosition * pos)1900 LIB_EXPORT rc_t CC BAMFileSetPosition(const BAMFile *cself, const BAMFilePosition *pos)
1901 {
1902     return BAMFileSetPositionInt(cself, *pos >> 16, (uint16_t)(*pos));
1903 }
1904 
BAMFileRewind(const BAMFile * cself)1905 LIB_EXPORT rc_t CC BAMFileRewind(const BAMFile *cself)
1906 {
1907     return BAMFileSetPositionInt(cself, cself->fpos_first, cself->ucfirst);
1908 }
1909 
BAMFileAdvance(BAMFile * const self,unsigned distance)1910 static void BAMFileAdvance(BAMFile *const self, unsigned distance)
1911 {
1912     self->bufCurrent += distance;
1913     if (self->bufCurrent == self->bufSize) {
1914         self->fpos_cur = self->vt.FileGetPos(&self->file);
1915         self->bufCurrent = 0;
1916         self->bufSize = 0;
1917     }
1918 }
1919 
1920 /* MARK: BAM Alignment contruction */
1921 
TagTypeSize(int const type)1922 static int TagTypeSize(int const type)
1923 {
1924     switch (type) {
1925         case dt_ASCII:      /* A */
1926         case dt_INT8:       /* c */
1927         case dt_UINT8:      /* C */
1928             return 1;
1929 
1930         case dt_INT16:      /* s */
1931         case dt_UINT16:     /* S */
1932             return 2;
1933 
1934         case dt_INT:        /* i */
1935         case dt_UINT:       /* I */
1936         case dt_FLOAT32:    /* f */
1937             return 4;
1938 #if 0
1939         case dt_FLOAT64:    /* d */
1940             return 8;
1941 #endif
1942         case dt_CSTRING:    /* Z */
1943         case dt_HEXSTRING:  /* H */
1944             return -'S';
1945 
1946         case dt_NUM_ARRAY:  /* B */
1947             return -'A';
1948     }
1949     return 0;
1950 }
1951 
ColorCheck(BAMAlignment * const self,char const tag[2],unsigned const len)1952 static void ColorCheck(BAMAlignment *const self, char const tag[2], unsigned const len)
1953 {
1954     if (tag[0] == 'C' && len != 0) {
1955         int const ch = tag[1];
1956         int const flag = ch == 'Q' ? 2 : ch == 'S' ? 1 : 0;
1957 
1958         if (flag)
1959             self->hasColor ^= (len << 2) | flag;
1960     }
1961 }
1962 
ParseOptData(BAMAlignment * const self,size_t const maxsize,size_t const xtra,size_t const datasize)1963 static rc_t ParseOptData(BAMAlignment *const self, size_t const maxsize,
1964                          size_t const xtra, size_t const datasize)
1965 {
1966     size_t const maxExtra = (maxsize - (sizeof(*self) - sizeof(self->extra))) / sizeof(self->extra[0]);
1967     char const *const base = (char const *)self->data->raw;
1968     unsigned i = 0;
1969     unsigned len;
1970     unsigned offset;
1971 
1972     self->numExtra = 0;
1973     for (len = 0, offset = (unsigned)xtra; offset < datasize; offset += len) {
1974         int const valuelen1 = TagTypeSize(base[offset + 2]);
1975         unsigned valuelen;
1976 
1977         if (valuelen1 < 0) {
1978             char const *const value = &base[offset + 3];
1979 
1980             if (-valuelen1 == 'S') {
1981                 valuelen = 0;
1982                 while (value[valuelen] != '\0') {
1983                     ++valuelen;
1984                     if (offset + valuelen >= datasize) {
1985                         rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
1986                         return rc;
1987                     }
1988                 }
1989                 ColorCheck(self, base + offset, valuelen);
1990                 ++valuelen;
1991             }
1992             else {
1993                 int const elem_size = TagTypeSize(value[0]);
1994 
1995                 assert(-valuelen1 == 'A');
1996                 if (elem_size <= 0) {
1997                     rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcUnexpected);
1998                     return rc;
1999                 }
2000                 else {
2001                     int const elem_count = LE2HI32(&value[1]);
2002 
2003                     valuelen = elem_size * elem_count + 1 + 4;
2004                     if (offset + valuelen >= datasize) {
2005                         rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2006                         return rc;
2007                     }
2008                 }
2009             }
2010         }
2011         else if (valuelen1 == 0) {
2012             rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcUnexpected);
2013             return rc;
2014         }
2015         else
2016             valuelen = valuelen1;
2017 
2018         len = valuelen + 3;
2019         if (i < maxExtra) {
2020             self->extra[i].offset = offset;
2021             self->extra[i].size   = len;
2022         }
2023         ++i;
2024     }
2025     self->numExtra = i;
2026     if (2 <= i && i <= maxExtra)
2027         ksort(self->extra, i, sizeof(self->extra[0]), OptTag_sort, self);
2028 
2029     return 0;
2030 }
2031 
ParseOptDataLog(BAMAlignment * const self,unsigned const maxsize,unsigned const xtra,unsigned const datasize)2032 static rc_t ParseOptDataLog(BAMAlignment *const self, unsigned const maxsize,
2033                             unsigned const xtra, unsigned const datasize)
2034 {
2035     unsigned const maxExtra = (maxsize - (sizeof(*self) - sizeof(self->extra))) / sizeof(self->extra[0]);
2036     char const *const base = (char const *)self->data->raw;
2037     unsigned i = 0;
2038     unsigned len;
2039     unsigned offset;
2040 
2041     self->numExtra = 0;
2042     for (len = 0, offset = (unsigned)xtra; offset < datasize; offset += len) {
2043         int const type = base[offset + 2];
2044         int const valuelen1 = TagTypeSize(type);
2045         unsigned valuelen;
2046 
2047         if (valuelen1 > 0)
2048             valuelen = valuelen1;
2049         else if (valuelen1 < 0) {
2050             char const *const value = &base[offset + 3];
2051 
2052             if (-valuelen1 == 'S') {
2053                 valuelen = 0;
2054                 while (value[valuelen] != '\0') {
2055                     ++valuelen;
2056                     if (offset + valuelen >= datasize) {
2057                         rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2058                         (void)LOGERR(klogErr, rc,
2059                                      "Parsing BAM optional fields: "
2060                                      "unterminated string");
2061                         return rc;
2062                     }
2063                 }
2064                 ColorCheck(self, base + offset, valuelen);
2065                 ++valuelen;
2066             }
2067             else {
2068                 int const elem_type = value[0];
2069                 int const elem_size = TagTypeSize(elem_type);
2070 
2071                 assert(-valuelen1 == 'A');
2072                 if (elem_size <= 0) {
2073                     rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcUnexpected);
2074                     (void)LOGERR(klogErr, rc,
2075                                  "Parsing BAM optional fields: "
2076                                  "unknown array type");
2077                     return rc;
2078                 }
2079                 else {
2080                     int const elem_count = LE2HI32(&value[1]);
2081 
2082                     valuelen = elem_size * elem_count + 1 + 4;
2083                     if (offset + valuelen >= datasize) {
2084                         rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2085                         (void)LOGERR(klogErr, rc,
2086                                      "Parsing BAM optional fields: "
2087                                      "array too big");
2088                         return rc;
2089                     }
2090                 }
2091             }
2092         }
2093         else {
2094             rc_t const rc = RC(rcAlign, rcFile, rcReading, rcData, rcUnexpected);
2095             (void)LOGERR(klogErr, rc,
2096                                     "Parsing BAM optional fields: "
2097                                     "unknown type");
2098             return rc;
2099         }
2100 
2101         len = valuelen + 3;
2102         if (i < maxExtra) {
2103             self->extra[i].offset = offset;
2104             self->extra[i].size   = len;
2105         }
2106         ++i;
2107     }
2108     self->numExtra = i;
2109     if (2 <= i && i <= maxExtra)
2110         ksort(self->extra, i, sizeof(self->extra[0]), OptTag_sort, self);
2111 
2112     return 0;
2113 }
2114 
BAMAlignmentSize(unsigned const max_extra_tags)2115 static unsigned CC BAMAlignmentSize(unsigned const max_extra_tags)
2116 {
2117     BAMAlignment const *const y = NULL;
2118 
2119     return sizeof(*y) + (max_extra_tags ? max_extra_tags - 1 : 0) * sizeof(y->extra);
2120 }
2121 
BAMAlignmentSetOffsets(BAMAlignment * const self)2122 static unsigned BAMAlignmentSetOffsets(BAMAlignment *const self)
2123 {
2124     unsigned const nameLen = getReadNameLength(self);
2125     unsigned const cigCnt  = getCigarCount(self);
2126     unsigned const readLen = getReadLen(self);
2127     unsigned const cigar   = (unsigned)(&((struct bam_alignment_s const *)NULL)->read_name[nameLen] - (const char *)NULL);
2128     unsigned const seq     = cigar + 4 * cigCnt;
2129     unsigned const qual    = seq + (readLen + 1) / 2;
2130     unsigned const xtra    = qual + readLen;
2131 
2132     self->cigar = cigar;
2133     self->seq   = seq;
2134     self->qual  = qual;
2135 
2136     return xtra;
2137 }
2138 
BAMAlignmentInit(BAMAlignment * const self,unsigned const maxsize,BAMFilePosition pos,unsigned const datasize,void const * const data)2139 static bool BAMAlignmentInit(BAMAlignment *const self, unsigned const maxsize,
2140                              BAMFilePosition pos,
2141                              unsigned const datasize, void const *const data)
2142 {
2143     memset(self, 0, sizeof(*self));
2144     self->pos = pos;
2145     self->data = data;
2146     self->datasize = datasize;
2147     {
2148         unsigned const xtra = BAMAlignmentSetOffsets(self);
2149 
2150         if (   datasize >= xtra
2151             && datasize >= self->cigar
2152             && datasize >= self->seq
2153             && datasize >= self->qual)
2154         {
2155             rc_t const rc = ParseOptData(self, maxsize, xtra, datasize);
2156 
2157             if (rc == 0)
2158                 return true;
2159         }
2160         return false;
2161     }
2162 }
2163 
BAMAlignmentInitLog(BAMAlignment * const self,unsigned const maxsize,BAMFilePosition pos,unsigned const datasize,void const * const data)2164 static bool BAMAlignmentInitLog(BAMAlignment *const self, unsigned const maxsize,
2165                                 BAMFilePosition pos,
2166                                 unsigned const datasize, void const *const data)
2167 {
2168     memset(self, 0, sizeof(*self));
2169     self->pos = pos;
2170     self->data = data;
2171     self->datasize = datasize;
2172     {
2173         unsigned const xtra = BAMAlignmentSetOffsets(self);
2174 
2175         if (   datasize >= xtra
2176             && datasize >= self->cigar
2177             && datasize >= self->seq
2178             && datasize >= self->qual)
2179         {
2180             rc_t const rc = ParseOptDataLog(self, maxsize, xtra, datasize);
2181 
2182             if (rc == 0) {
2183                 DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("{"
2184                                                                 "\"BAM record\": "
2185                                                                 "{ "
2186                                                                     "\"size\": %u, "
2187                                                                     "\"name length\": %u, "
2188                                                                     "\"cigar count\": %u, "
2189                                                                     "\"read length\": %u, "
2190                                                                     "\"extra count\": %u "
2191                                                                 "}"
2192                                                             "}\n",
2193                                                             (unsigned)datasize,
2194                                                             (unsigned)getReadNameLength(self),
2195                                                             (unsigned)getCigarCount(self),
2196                                                             (unsigned)getReadLen(self),
2197                                                             (unsigned)self->numExtra));
2198                 return true;
2199             }
2200         }
2201         DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("{"
2202                                                         "\"BAM record\": "
2203                                                         "{ "
2204                                                             "\"size\": %u, "
2205                                                             "\"name length\": %u, "
2206                                                             "\"cigar count\": %u, "
2207                                                             "\"read length\": %u "
2208                                                         "}"
2209                                                     "}\n",
2210                                                     (unsigned)datasize,
2211                                                     (unsigned)getReadNameLength(self),
2212                                                     (unsigned)getCigarCount(self),
2213                                                     (unsigned)getReadLen(self)));
2214         return false;
2215     }
2216 }
2217 
BAMAlignmentLogParseError(BAMAlignment const * self)2218 static void BAMAlignmentLogParseError(BAMAlignment const *self)
2219 {
2220     char const *const reason = self->cigar > self->datasize ? "BAM Record CIGAR too long"
2221                              : self->seq   > self->datasize ? "BAM Record SEQ too long"
2222                              : self->qual  > self->datasize ? "BAM Record QUAL too long"
2223                              : self->qual + getReadLen(self) > self->datasize ? "BAM Record EXTRA too long"
2224                              : "BAM Record EXTRA parsing failure";
2225 
2226     LOGERR(klogErr, SILENT_RC(rcAlign, rcFile, rcReading, rcRow, rcInvalid), reason);
2227 }
2228 
2229 /* MARK: BAM Alignment readers */
2230 
2231 /* returns
2232  *  (rcAlign, rcFile, rcReading, rcBuffer, rcNotAvailable)
2233  * or
2234  *  (rcAlign, rcFile, rcReading, rcBuffer, rcInsufficient)
2235  * if should read with copy
2236  */
2237 static
BAMFileReadNoCopy(BAMFile * const self,unsigned actsize[],BAMAlignment rhs[],unsigned const maxsize)2238 rc_t BAMFileReadNoCopy(BAMFile *const self, unsigned actsize[], BAMAlignment rhs[],
2239                        unsigned const maxsize)
2240 {
2241     unsigned const maxPeek = BAMFileMaxPeek(self);
2242     bool isgood;
2243 
2244     *actsize = 0;
2245 
2246     if (maxPeek == 0) {
2247         rc_t const rc = BAMFileFillBuffer(self);
2248 
2249         if (rc == 0)
2250             return BAMFileReadNoCopy(self, actsize, rhs, maxsize);
2251 
2252         if ( GetRCObject( rc ) == (enum RCObject)rcData && GetRCState( rc ) == rcInsufficient )
2253         {
2254             self->eof = true;
2255             return SILENT_RC(rcAlign, rcFile, rcReading, rcRow, rcNotFound);
2256         }
2257         return rc;
2258     }
2259     if (maxPeek < 4)
2260         return SILENT_RC(rcAlign, rcFile, rcReading, rcBuffer, rcNotAvailable);
2261     else {
2262         BAMFilePosition curPos = (self->fpos_cur << 16) | self->bufCurrent;
2263         int32_t const i32 = BAMFilePeekI32(self);
2264 
2265         if (i32 <= 0)
2266             return RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2267 
2268         if (maxPeek < ( uint32_t ) i32 + 4)
2269             return SILENT_RC(rcAlign, rcFile, rcReading, rcBuffer, rcNotAvailable);
2270 
2271         isgood = BAMAlignmentInitLog(rhs, maxsize, curPos, i32, BAMFilePeek(self, 4));
2272         rhs[0].parent = self;
2273         KRefcountInit(&rhs->refcount, 1, "BAMAlignment", "ReadNoCopy", "");
2274     }
2275     *actsize = BAMAlignmentSize(rhs[0].numExtra);
2276     if (isgood && *actsize > maxsize)
2277         return SILENT_RC(rcAlign, rcFile, rcReading, rcBuffer, rcInsufficient);
2278 
2279     BAMFileAdvance(self, 4 + rhs->datasize);
2280     return isgood ? 0 : RC(rcAlign, rcFile, rcReading, rcRow, rcInvalid);
2281 }
2282 
2283 static
BAMAlignmentSizeFromData(unsigned const datasize,void const * data)2284 unsigned BAMAlignmentSizeFromData(unsigned const datasize, void const *data)
2285 {
2286     BAMAlignment temp;
2287 
2288     BAMAlignmentInit(&temp, sizeof(temp), 0, datasize, data);
2289 
2290     return BAMAlignmentSize(temp.numExtra);
2291 }
2292 
BAMAlignmentIsEmpty(BAMAlignment const * const self)2293 static bool BAMAlignmentIsEmpty(BAMAlignment const *const self)
2294 {
2295     if (getReadNameLength(self) == 0)
2296         return true;
2297     if (self->hasColor == 3)
2298         return false;
2299     if (getReadLen(self) != 0)
2300         return false;
2301     if (getCigarCount(self) != 0)
2302         return false;
2303     return true;
2304 }
2305 
2306 static
BAMFileReadCopy(BAMFile * const self,BAMAlignment const * rslt[],bool const log)2307 rc_t BAMFileReadCopy(BAMFile *const self, BAMAlignment const *rslt[], bool const log)
2308 {
2309     void *storage;
2310     void const *data;
2311     unsigned datasize;
2312     rc_t rc;
2313     BAMFilePosition curPos = (self->fpos_cur << 16) | self->bufCurrent;
2314 
2315     rslt[0] = NULL;
2316     {
2317         int32_t i32;
2318 
2319         rc = BAMFileReadI32(self, &i32);
2320         if ( rc != 0 )
2321         {
2322             if ( GetRCObject( rc ) == (enum RCObject)rcData && GetRCState( rc ) == rcInsufficient )
2323             {
2324                 self->eof = true;
2325                 rc = RC( rcAlign, rcFile, rcReading, rcRow, rcNotFound );
2326             }
2327             return rc;
2328         }
2329         if (i32 <= 0)
2330             return RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2331 
2332         datasize = i32;
2333     }
2334     if (BAMFileMaxPeek(self) < datasize) {
2335         data = storage = malloc(datasize);
2336         if (storage == NULL)
2337             return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
2338 
2339         rc = BAMFileReadn(self, datasize, storage);
2340     }
2341     else {
2342         storage = NULL;
2343         data = (bam_alignment *)&self->buffer[self->bufCurrent];
2344 
2345         BAMFileAdvance(self, datasize);
2346     }
2347     if (rc == 0) {
2348         unsigned const rsltsize = BAMAlignmentSizeFromData(datasize, data);
2349         BAMAlignment *const y = malloc(rsltsize);
2350 
2351         if (y) {
2352             if ((log ? BAMAlignmentInitLog : BAMAlignmentInit)(y, rsltsize, curPos, datasize, data)) {
2353                 if (storage == NULL)
2354                     self->bufLocker = y;
2355                 else
2356                     y->storage = storage;
2357 
2358                 y->parent = self;
2359                 KRefcountInit(&y->refcount, 1, "BAMAlignment", "ReadCopy", "");
2360                 BAMFileAddRef(self);
2361                 rslt[0] = y;
2362 
2363                 if (BAMAlignmentIsEmpty(y))
2364                     return RC(rcAlign, rcFile, rcReading, rcRow, rcEmpty);
2365                 return 0;
2366             }
2367             rc = RC(rcAlign, rcFile, rcReading, rcRow, rcInvalid);
2368             free(y);
2369         }
2370         else
2371             rc = RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
2372     }
2373     free(storage);
2374 
2375     return rc;
2376 }
2377 
2378 static
BAMFileBreakLock(BAMFile * const self)2379 rc_t BAMFileBreakLock(BAMFile *const self)
2380 {
2381     if (self->bufLocker != NULL) {
2382         if (self->bufLocker->storage == NULL)
2383             self->bufLocker->storage = malloc(self->bufLocker->datasize);
2384         if (self->bufLocker->storage == NULL)
2385             return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
2386 
2387         memmove(self->bufLocker->storage, self->bufLocker->data, self->bufLocker->datasize);
2388         self->bufLocker->data = (bam_alignment *)&self->bufLocker->storage[0];
2389         self->bufLocker = NULL;
2390     }
2391     return 0;
2392 }
2393 
BAMFileRead2(const BAMFile * cself,const BAMAlignment ** rhs)2394 LIB_EXPORT rc_t CC BAMFileRead2(const BAMFile *cself, const BAMAlignment **rhs)
2395 {
2396     BAMFile *const self = (BAMFile *)cself;
2397     unsigned actsize = 0;
2398     rc_t rc;
2399 
2400     if (self == NULL || rhs == NULL)
2401         return RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
2402 
2403     *rhs = NULL;
2404 
2405     if (self->bufCurrent >= self->bufSize && self->eof)
2406         return RC(rcAlign, rcFile, rcReading, rcRow, rcNotFound);
2407 
2408     rc = BAMFileBreakLock(self);
2409     if (rc)
2410         return rc;
2411 
2412     if (self->nocopy_size == 0) {
2413         size_t const size = 4096u;
2414         void *const temp = malloc(size);
2415 
2416         if (temp == NULL)
2417             return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
2418 
2419         self->nocopy = temp;
2420         self->nocopy_size = size;
2421     }
2422 
2423 AGAIN:
2424     rc = BAMFileReadNoCopy(self, &actsize, self->nocopy, (unsigned)self->nocopy_size);
2425     if (rc == 0) {
2426         *rhs = self->nocopy;
2427         if (BAMAlignmentIsEmpty(self->nocopy)) {
2428             rc = RC(rcAlign, rcFile, rcReading, rcRow, rcEmpty);
2429             LOGERR(klogWarn, rc, "BAM Record contains no alignment or sequence data");
2430         }
2431     }
2432     else if ( GetRCObject( rc ) == (enum RCObject)rcBuffer && GetRCState( rc ) == rcInsufficient )
2433     {
2434         unsigned const size = (actsize + 4095u) & ~4095u;
2435         void *const temp = realloc(self->nocopy, size);
2436 
2437         if (temp == NULL)
2438             return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
2439 
2440         self->nocopy = temp;
2441         self->nocopy_size = size;
2442 
2443         goto AGAIN;
2444     }
2445     else if ( GetRCObject( rc ) == (enum RCObject)rcBuffer && GetRCState( rc ) == rcNotAvailable )
2446     {
2447         rc = BAMFileReadCopy( self, rhs, true );
2448     }
2449     else if (GetRCObject(rc) == rcRow && GetRCState(rc) == rcInvalid) {
2450         BAMAlignmentLogParseError(self->nocopy);
2451     }
2452     return rc;
2453 }
2454 
BAMFileRead(const BAMFile * cself,const BAMAlignment ** rhs)2455 LIB_EXPORT rc_t CC BAMFileRead(const BAMFile *cself, const BAMAlignment **rhs)
2456 {
2457     BAMFile *const self = (BAMFile *)cself;
2458 
2459     if (self == NULL || rhs == NULL)
2460         return RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
2461 
2462     *rhs = NULL;
2463 
2464     if (self->bufCurrent >= self->bufSize && self->eof)
2465         return RC(rcAlign, rcFile, rcReading, rcRow, rcNotFound);
2466     else {
2467         rc_t const rc = BAMFileBreakLock(self);
2468         if (rc)
2469             return rc;
2470     }
2471     return BAMFileReadCopy(self, rhs, false);
2472 }
2473 
2474 /* MARK: BAM File header info accessor */
2475 
BAMFileGetRefSeqById(const BAMFile * cself,int32_t id,const BAMRefSeq ** rhs)2476 LIB_EXPORT rc_t CC BAMFileGetRefSeqById(const BAMFile *cself, int32_t id, const BAMRefSeq **rhs)
2477 {
2478     *rhs = NULL;
2479     if (id >= 0 && id < cself->refSeqs)
2480         *rhs = &cself->refSeq[id];
2481     return 0;
2482 }
2483 
BAMFileGetReadGroupByName(const BAMFile * cself,const char * name,const BAMReadGroup ** rhs)2484 LIB_EXPORT rc_t CC BAMFileGetReadGroupByName(const BAMFile *cself, const char *name, const BAMReadGroup **rhs)
2485 {
2486     BAMReadGroup rg;
2487 
2488     *rhs = NULL;
2489 
2490     rg.name = name;
2491     if (rg.name != NULL)
2492         *rhs = kbsearch(&rg, cself->readGroup, cself->readGroups, sizeof(rg), comp_ReadGroup, NULL);
2493 
2494     return 0;
2495 }
2496 
BAMFileGetRefSeqCount(const BAMFile * cself,unsigned * rhs)2497 LIB_EXPORT rc_t CC BAMFileGetRefSeqCount(const BAMFile *cself, unsigned *rhs)
2498 {
2499     *rhs = cself->refSeqs;
2500     return 0;
2501 }
2502 
BAMFileGetRefSeq(const BAMFile * cself,unsigned i,const BAMRefSeq ** rhs)2503 LIB_EXPORT rc_t CC BAMFileGetRefSeq(const BAMFile *cself, unsigned i, const BAMRefSeq **rhs)
2504 {
2505     *rhs = NULL;
2506     if (i < cself->refSeqs)
2507         *rhs = &cself->refSeq[i];
2508     return 0;
2509 }
2510 
BAMFileGetReadGroupCount(const BAMFile * cself,unsigned * rhs)2511 LIB_EXPORT rc_t CC BAMFileGetReadGroupCount(const BAMFile *cself, unsigned *rhs)
2512 {
2513     *rhs = cself->readGroups;
2514     return 0;
2515 }
2516 
BAMFileGetReadGroup(const BAMFile * cself,unsigned i,const BAMReadGroup ** rhs)2517 LIB_EXPORT rc_t CC BAMFileGetReadGroup(const BAMFile *cself, unsigned i, const BAMReadGroup **rhs)
2518 {
2519     *rhs = NULL;
2520     if (i < cself->readGroups)
2521         *rhs = &cself->readGroup[i];
2522     return 0;
2523 }
2524 
BAMFileGetHeaderText(BAMFile const * cself,char const ** header,size_t * header_len)2525 LIB_EXPORT rc_t CC BAMFileGetHeaderText(BAMFile const *cself, char const **header, size_t *header_len)
2526 {
2527     *header = cself->header;
2528     *header_len = *header ? string_size( *header ) : 0;
2529     return 0;
2530 }
2531 
2532 /* MARK: BAM Alignment destructor */
2533 
BAMAlignmentWhack(BAMAlignment * self)2534 static rc_t BAMAlignmentWhack(BAMAlignment *self)
2535 {
2536     if (self->parent->bufLocker == self)
2537         self->parent->bufLocker = NULL;
2538     if (self != self->parent->nocopy) {
2539         BAMFileRelease(self->parent);
2540         free(self->storage);
2541         free(self);
2542     }
2543     return 0;
2544 }
2545 
2546 /* MARK: BAM Alignment ref-counting */
2547 
BAMAlignmentAddRef(const BAMAlignment * cself)2548 LIB_EXPORT rc_t CC BAMAlignmentAddRef(const BAMAlignment *cself)
2549 {
2550     if (cself != NULL)
2551         KRefcountAdd(&cself->refcount, "BAMAlignment");
2552     return 0;
2553 }
2554 
BAMAlignmentRelease(const BAMAlignment * cself)2555 LIB_EXPORT rc_t CC BAMAlignmentRelease(const BAMAlignment *cself)
2556 {
2557     if (cself && KRefcountDrop(&cself->refcount, "BAMAlignment") == krefWhack)
2558         BAMAlignmentWhack((BAMAlignment *)cself);
2559 
2560     return 0;
2561 }
2562 
2563 #if 0
2564 LIB_EXPORT uint16_t CC BAMAlignmentIffyFields(const BAMAlignment *self)
2565 {
2566 }
2567 
2568 LIB_EXPORT uint16_t CC BAMAlignmentBadFields(const BAMAlignment *self)
2569 {
2570 }
2571 #endif
2572 
2573 /* MARK: BAM Alignment accessors */
2574 
BAMAlignmentGetFilePos(const BAMAlignment * self)2575 LIB_EXPORT BAMFilePosition BAMAlignmentGetFilePos(const BAMAlignment *self)
2576 {
2577     return self->pos;
2578 }
2579 
BAMAlignmentGetCigarElement(const BAMAlignment * self,unsigned i)2580 static uint32_t BAMAlignmentGetCigarElement(const BAMAlignment *self, unsigned i)
2581 {
2582     return LE2HUI32(&((uint8_t const *)getCigarBase(self))[i * 4]);
2583 }
2584 
BAMAlignmentGetRefSeqId(const BAMAlignment * cself,int32_t * rhs)2585 LIB_EXPORT rc_t CC BAMAlignmentGetRefSeqId(const BAMAlignment *cself, int32_t *rhs)
2586 {
2587     *rhs = getRefSeqId(cself);
2588     return 0;
2589 }
2590 
BAMAlignmentGetPosition(const BAMAlignment * cself,int64_t * rhs)2591 LIB_EXPORT rc_t CC BAMAlignmentGetPosition(const BAMAlignment *cself, int64_t *rhs)
2592 {
2593     *rhs = getPosition(cself);
2594     return 0;
2595 }
2596 
BAMAlignmentIsMapped(const BAMAlignment * cself)2597 LIB_EXPORT bool CC BAMAlignmentIsMapped(const BAMAlignment *cself)
2598 {
2599     if (((getFlags(cself) & BAMFlags_SelfIsUnmapped) == 0) && getRefSeqId(cself) >= 0 && getPosition(cself) >= 0 && getCigarCount(cself) > 0)
2600         return true;
2601     return false;
2602 }
2603 
2604 /* static bool BAMAlignmentIsMateMapped(const BAMAlignment *cself)
2605 {
2606     if (((getFlags(cself) & BAMFlags_MateIsUnmapped) == 0) && getMateRefSeqId(cself) >= 0 && getMatePos(cself) >= 0)
2607         return true;
2608     return false;
2609 } */
2610 
BAMAlignmentGetAlignmentDetail(const BAMAlignment * self,BAMAlignmentDetail * rslt,uint32_t count,uint32_t * actual,int32_t * pfirst,int32_t * plast)2611 LIB_EXPORT rc_t CC BAMAlignmentGetAlignmentDetail(
2612                                                   const BAMAlignment *self,
2613                                                   BAMAlignmentDetail *rslt, uint32_t count, uint32_t *actual,
2614                                                   int32_t *pfirst, int32_t *plast
2615                                                   )
2616 {
2617     unsigned i;
2618     unsigned ccnt; /* cigar count */
2619     int32_t  gpos; /* refSeq pos in global coordinates */
2620     unsigned rpos; /* read pos (always local coordinates) */
2621     uint32_t rlen; /* read length */
2622     int32_t first = -1;
2623     int32_t last = -1;
2624 
2625     if (!self)
2626         return RC(rcAlign, rcFile, rcReading, rcSelf, rcNull);
2627 
2628     rlen = getReadLen(self);
2629     ccnt = getCigarCount(self);
2630     gpos = getPosition(self);
2631 
2632     if (gpos < 0)
2633         ccnt = 0;
2634 
2635     if (actual)
2636         *actual = ccnt;
2637 
2638     if (pfirst)
2639         *pfirst = -1;
2640 
2641     if (plast)
2642         *plast = -1;
2643 
2644     if (ccnt == 0)
2645         return 0;
2646 
2647     if (rslt == NULL) {
2648         if (actual == NULL)
2649             return RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
2650         count = 0;
2651     }
2652 
2653     if (count < ccnt)
2654         return RC(rcAlign, rcFile, rcReading, rcBuffer, rcInsufficient);
2655 
2656     for (rpos = 0, i = 0; i != ccnt; ++i) {
2657         uint32_t len = BAMAlignmentGetCigarElement(self, i);
2658         int op = len & 0x0F;
2659 
2660         if (op > sizeof(cigarChars))
2661             return RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2662 
2663         op = cigarChars[op];
2664         len >>= 4;
2665 
2666         rslt[i].refSeq_pos = gpos;
2667         rslt[i].read_pos = rpos;
2668         rslt[i].length = len;
2669         rslt[i].type = (BAMCigarType)op;
2670 
2671         /* because we're assigning unsigned to signed below */
2672         assert ( ( int32_t ) i >= 0 );
2673 
2674         switch ((BAMCigarType)op) {
2675         case ct_Match:
2676         case ct_Equal:
2677             if (first == -1)
2678                 first = ( int32_t ) i;
2679             last = ( int32_t ) i;
2680             gpos += len;
2681             rpos += len;
2682             break;
2683         case ct_Insert:
2684         case ct_SoftClip:
2685             gpos += len;
2686             break;
2687         case ct_Delete:
2688         case ct_Skip:
2689             rpos += len;
2690             break;
2691         case ct_HardClip:
2692         case ct_Padded:
2693             rslt[i].refSeq_pos = -1;
2694             rslt[i].read_pos = -1;
2695             break;
2696         default:
2697             break;
2698         }
2699 
2700         if (rslt[i].read_pos > rlen)
2701             return RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
2702     }
2703     if (pfirst)
2704         *pfirst = first;
2705 
2706     if (plast)
2707         *plast = last;
2708 
2709     return 0;
2710 }
2711 
2712 static
ReferenceLengthFromCIGAR(const BAMAlignment * self)2713 unsigned ReferenceLengthFromCIGAR(const BAMAlignment *self)
2714 {
2715     unsigned i;
2716     unsigned n = getCigarCount(self);
2717     unsigned y;
2718 
2719     for (i = 0, y = 0; i != n; ++i) {
2720         uint32_t const len = BAMAlignmentGetCigarElement(self, i);
2721 
2722         switch (cigarChars[len & 0x0F]) {
2723         case ct_Match:
2724         case ct_Equal:
2725         case ct_NotEqual:
2726         case ct_Delete:
2727         case ct_Skip:
2728             y += len >> 4;
2729             break;
2730         default:
2731             break;
2732         }
2733     }
2734     return y;
2735 }
2736 
2737 static
SequenceLengthFromCIGAR(const BAMAlignment * self)2738 unsigned SequenceLengthFromCIGAR(const BAMAlignment *self)
2739 {
2740     unsigned i;
2741     unsigned n = getCigarCount(self);
2742     unsigned y;
2743 
2744     for (i = 0, y = 0; i != n; ++i) {
2745         uint32_t const len = BAMAlignmentGetCigarElement(self, i);
2746 
2747         switch (cigarChars[len & 0x0F]) {
2748         case ct_Match:
2749         case ct_Equal:
2750         case ct_NotEqual:
2751         case ct_Insert:
2752         case ct_SoftClip:
2753             y += len >> 4;
2754             break;
2755         default:
2756             break;
2757         }
2758     }
2759     return y;
2760 }
2761 
BAMAlignmentGetPosition2(const BAMAlignment * cself,int64_t * rhs,uint32_t * length)2762 LIB_EXPORT rc_t CC BAMAlignmentGetPosition2(const BAMAlignment *cself, int64_t *rhs, uint32_t *length)
2763 {
2764     *rhs = getPosition(cself);
2765     if (*rhs >= 0)
2766         *length = ReferenceLengthFromCIGAR(cself);
2767     return 0;
2768 }
2769 
BAMAlignmentGetReadGroupName(const BAMAlignment * cself,const char ** rhs)2770 LIB_EXPORT rc_t CC BAMAlignmentGetReadGroupName(const BAMAlignment *cself, const char **rhs)
2771 {
2772     *rhs = get_RG(cself);
2773     return 0;
2774 }
2775 
BAMAlignmentGetReadName(const BAMAlignment * cself,const char ** rhs)2776 LIB_EXPORT rc_t CC BAMAlignmentGetReadName(const BAMAlignment *cself, const char **rhs)
2777 {
2778     *rhs = getReadName(cself);
2779     return 0;
2780 }
2781 
BAMAlignmentGetReadName2(const BAMAlignment * cself,const char ** rhs,size_t * length)2782 LIB_EXPORT rc_t CC BAMAlignmentGetReadName2(const BAMAlignment *cself, const char **rhs, size_t *length)
2783 {
2784     *length = getReadNameLength(cself) - 1;
2785     *rhs = getReadName(cself);
2786     return 0;
2787 }
2788 
BAMAlignmentGetReadName3(const BAMAlignment * cself,const char ** rhs,size_t * length)2789 LIB_EXPORT rc_t CC BAMAlignmentGetReadName3(const BAMAlignment *cself, const char **rhs, size_t *length)
2790 {
2791     char const *const name = getReadName(cself);
2792     size_t len = getReadNameLength(cself);
2793     size_t i;
2794 
2795     for (i = len; i; ) {
2796         int const ch = name[--i];
2797 
2798         if (ch == '/') {
2799             len = i;
2800             break;
2801         }
2802         if (!isdigit(ch))
2803             break;
2804     }
2805     *rhs = name;
2806     *length = len;
2807 
2808     return 0;
2809 }
2810 
BAMAlignmentGetFlags(const BAMAlignment * cself,uint16_t * rhs)2811 LIB_EXPORT rc_t CC BAMAlignmentGetFlags(const BAMAlignment *cself, uint16_t *rhs)
2812 {
2813     *rhs = getFlags(cself);
2814     return 0;
2815 }
2816 
BAMAlignmentGetMapQuality(const BAMAlignment * cself,uint8_t * rhs)2817 LIB_EXPORT rc_t CC BAMAlignmentGetMapQuality(const BAMAlignment *cself, uint8_t *rhs)
2818 {
2819     *rhs = getMapQual(cself);
2820     return 0;
2821 }
2822 
BAMAlignmentGetCigarCount(const BAMAlignment * cself,unsigned * rhs)2823 LIB_EXPORT rc_t CC BAMAlignmentGetCigarCount(const BAMAlignment *cself, unsigned *rhs)
2824 {
2825     *rhs = getCigarCount(cself);
2826     return 0;
2827 }
2828 
BAMAlignmentGetRawCigar(const BAMAlignment * cself,uint32_t const * rslt[],uint32_t * length)2829 LIB_EXPORT rc_t CC BAMAlignmentGetRawCigar(const BAMAlignment *cself, uint32_t const *rslt[], uint32_t *length)
2830 {
2831     *rslt = getCigarBase(cself);
2832     *length = getCigarCount(cself);
2833     return 0;
2834 }
2835 
BAMAlignmentGetCigar(const BAMAlignment * cself,uint32_t i,BAMCigarType * type,uint32_t * length)2836 LIB_EXPORT rc_t CC BAMAlignmentGetCigar(const BAMAlignment *cself, uint32_t i, BAMCigarType *type, uint32_t *length)
2837 {
2838     uint32_t x;
2839 
2840     if (i >= getCigarCount(cself))
2841         return RC(rcAlign, rcFile, rcReading, rcParam, rcInvalid);
2842 
2843     x = BAMAlignmentGetCigarElement(cself, i);
2844     *type = (BAMCigarType)(cigarChars[x & 0x0F]);
2845     *length = x >> 4;
2846     return 0;
2847 }
2848 
BAMAlignmentGetReadLength(const BAMAlignment * cself,uint32_t * rhs)2849 LIB_EXPORT rc_t CC BAMAlignmentGetReadLength(const BAMAlignment *cself, uint32_t *rhs)
2850 {
2851     *rhs = getReadLen(cself);
2852     return 0;
2853 }
2854 
BAMAlignmentGetSequence2(const BAMAlignment * cself,char * rhs,uint32_t start,uint32_t stop)2855 LIB_EXPORT rc_t CC BAMAlignmentGetSequence2(const BAMAlignment *cself, char *rhs, uint32_t start, uint32_t stop)
2856 {
2857     /*
2858      *   =    A    C    M    G    R    S    V    T    W    Y    H    K    D    B    N
2859      * 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
2860      * 1111 1000 0100 1100 0010 1010 0110 1110 0001 1001 0101 1101 0011 1011 0111 0000
2861      *   N    T    G    K    C    Y    S    B    A    W    R    D    M    H    V    =
2862      */
2863     static const char  tr[16] = "=ACMGRSVTWYHKDBN";
2864  /* static const char ctr[16] = "=TGKCYSBAWRDMHVN"; */
2865     unsigned const n = getReadLen(cself);
2866     const uint8_t * const seq = &cself->data->raw[cself->seq];
2867     unsigned si, di;
2868 
2869     if (stop == 0 || stop > n)
2870         stop = n;
2871 
2872     for (di = 0, si = start; si != stop; ++si, ++di) {
2873         unsigned const b4na2 = seq[si >> 1];
2874         unsigned const b4na = (si & 1) == 0 ? (b4na2 >> 4) : (b4na2 & 0x0F);
2875 
2876         rhs[di] = tr[b4na];
2877     }
2878     return 0;
2879 }
2880 
BAMAlignmentGetSequence(const BAMAlignment * cself,char * rhs)2881 LIB_EXPORT rc_t CC BAMAlignmentGetSequence(const BAMAlignment *cself, char *rhs)
2882 {
2883     return BAMAlignmentGetSequence2(cself, rhs, 0, 0);
2884 }
2885 
BAMAlignmentHasColorSpace(BAMAlignment const * cself)2886 LIB_EXPORT bool CC BAMAlignmentHasColorSpace(BAMAlignment const *cself)
2887 {
2888     return get_CS(cself) != NULL;
2889 }
2890 
BAMAlignmentGetCSKey(BAMAlignment const * cself,char rhs[1])2891 LIB_EXPORT rc_t CC BAMAlignmentGetCSKey(BAMAlignment const *cself, char rhs[1])
2892 {
2893     char const *const vCS = get_CS(cself);
2894 
2895     if (vCS)
2896         rhs[0] = vCS[0];
2897     return 0;
2898 }
2899 
BAMAlignmentGetCSSeqLen(BAMAlignment const * cself,uint32_t * rhs)2900 LIB_EXPORT rc_t CC BAMAlignmentGetCSSeqLen(BAMAlignment const *cself, uint32_t *rhs)
2901 {
2902     struct offset_size_s const *const vCS = get_CS_info(cself);
2903 
2904     *rhs = vCS ? vCS->size - 5 : 0;
2905     return 0;
2906 }
2907 
BAMAlignmentGetCSSequence(BAMAlignment const * cself,char rhs[],uint32_t seqlen)2908 LIB_EXPORT rc_t CC BAMAlignmentGetCSSequence(BAMAlignment const *cself, char rhs[], uint32_t seqlen)
2909 {
2910     char const *const vCS = get_CS(cself);
2911 
2912     if (vCS) {
2913         unsigned i;
2914 
2915         for (i = 0;i != seqlen; ++i) {
2916             char const ch = vCS[i+1];
2917 
2918             rhs[i] = (ch == '4') ? '.' : ch;
2919         }
2920     }
2921     return 0;
2922 }
2923 
BAMAlignmentGetQuality(const BAMAlignment * cself,const uint8_t ** rhs)2924 LIB_EXPORT rc_t CC BAMAlignmentGetQuality(const BAMAlignment *cself, const uint8_t **rhs)
2925 {
2926     *rhs = &cself->data->raw[cself->qual];
2927     return 0;
2928 }
2929 
BAMAlignmentGetQuality2(BAMAlignment const * cself,uint8_t const ** rhs,uint8_t * offset)2930 LIB_EXPORT rc_t CC BAMAlignmentGetQuality2(BAMAlignment const *cself, uint8_t const **rhs, uint8_t *offset)
2931 {
2932     uint8_t const *const OQ = get_OQ(cself);
2933 
2934     if (OQ) {
2935         struct offset_size_s const *const oq = get_OQ_info(cself);
2936 
2937         if (oq->size - 4 == getReadLen(cself)) {
2938             *offset = 33;
2939             *rhs = OQ;
2940         }
2941         else
2942             return RC(rcAlign, rcRow, rcReading, rcData, rcInconsistent);
2943     }
2944     else {
2945         *offset = 0;
2946         *rhs = &cself->data->raw[cself->qual];
2947     }
2948     return 0;
2949 }
2950 
BAMAlignmentGetCSQuality(BAMAlignment const * cself,uint8_t const ** rhs,uint8_t * offset)2951 LIB_EXPORT rc_t CC BAMAlignmentGetCSQuality(BAMAlignment const *cself, uint8_t const **rhs, uint8_t *offset)
2952 {
2953     struct offset_size_s const *const cs = get_CS_info(cself);
2954     struct offset_size_s const *const cq = get_CQ_info(cself);
2955     uint8_t const *const CQ = get_CQ(cself);
2956 
2957     if (cs && cq && CQ) {
2958         if (cs->size == cq->size) {
2959             *offset = 33;
2960             *rhs = CQ + 1;
2961             return 0;
2962         }
2963         if (cs->size == cq->size + 1) {
2964             *offset = 33;
2965             *rhs = CQ;
2966             return 0;
2967         }
2968         return RC(rcAlign, rcRow, rcReading, rcData, rcInconsistent);
2969     }
2970     *offset = 0;
2971     *rhs = &cself->data->raw[cself->qual];
2972     return 0;
2973 }
2974 
BAMAlignmentGetMateRefSeqId(const BAMAlignment * cself,int32_t * rhs)2975 LIB_EXPORT rc_t CC BAMAlignmentGetMateRefSeqId(const BAMAlignment *cself, int32_t *rhs)
2976 {
2977     *rhs = getMateRefSeqId(cself);
2978     return 0;
2979 }
2980 
BAMAlignmentGetMatePosition(const BAMAlignment * cself,int64_t * rhs)2981 LIB_EXPORT rc_t CC BAMAlignmentGetMatePosition(const BAMAlignment *cself, int64_t *rhs)
2982 {
2983     *rhs = getMatePos(cself);
2984     return 0;
2985 }
2986 
BAMAlignmentGetInsertSize(const BAMAlignment * cself,int64_t * rhs)2987 LIB_EXPORT rc_t CC BAMAlignmentGetInsertSize(const BAMAlignment *cself, int64_t *rhs)
2988 {
2989     *rhs = getInsertSize(cself);
2990     return 0;
2991 }
2992 
FormatOptData(BAMAlignment const * const self,size_t const maxsize,char buffer[])2993 static int FormatOptData(BAMAlignment const *const self,
2994                          size_t const maxsize,
2995                          char buffer[])
2996 {
2997     char const *const base = (char const *)&self->data->raw[self->qual + getReadLen(self)];
2998     unsigned i;
2999     unsigned offset;
3000     unsigned cur = 0;
3001     int j;
3002 
3003     for (i = 0, offset = 0; i < self->numExtra; ++i) {
3004         int type;
3005         union { float f; uint32_t i; } fi;
3006 
3007         if (cur + 7 > maxsize)
3008             return -1;
3009         buffer[cur++] = '\t';
3010         buffer[cur++] = base[offset++];
3011         buffer[cur++] = base[offset++];
3012         buffer[cur++] = ':';
3013         type = base[offset++];
3014 
3015         switch (type) {
3016             case dt_ASCII:      /* A */
3017                 buffer[cur++] = 'A';
3018                 buffer[cur++] = ':';
3019                 buffer[cur++] = base[offset++];
3020                 break;
3021 
3022             case dt_INT8:       /* c */
3023                 buffer[cur++] = 'i';
3024                 buffer[cur++] = ':';
3025                 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)*((int8_t const *)(base + offset)));
3026                 if ((cur += j) >= maxsize)
3027                     return -1;
3028                 offset += 1;
3029                 break;
3030 
3031             case dt_UINT8:      /* C */
3032                 buffer[cur++] = 'i';
3033                 buffer[cur++] = ':';
3034                 j = snprintf(buffer + cur, maxsize - cur, "%u", (unsigned)*((uint8_t const *)(base + offset)));
3035                 if ((cur += j) >= maxsize)
3036                     return -1;
3037                 offset += 1;
3038                 break;
3039 
3040             case dt_INT16:      /* s */
3041                 buffer[cur++] = 'i';
3042                 buffer[cur++] = ':';
3043                 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)LE2HI16(base + offset));
3044                 if ((cur += j) >= maxsize)
3045                     return -1;
3046                 offset += 2;
3047                 break;
3048 
3049             case dt_UINT16:     /* S */
3050                 buffer[cur++] = 'i';
3051                 buffer[cur++] = ':';
3052                 j = snprintf(buffer + cur, maxsize - cur, "%u", (unsigned)LE2HUI16(base + offset));
3053                 if ((cur += j) >= maxsize)
3054                     return -1;
3055                 offset += 2;
3056                 break;
3057 
3058             case dt_INT:        /* i */
3059                 buffer[cur++] = 'i';
3060                 buffer[cur++] = ':';
3061                 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)LE2HI32(base + offset));
3062                 if ((cur += j) >= maxsize)
3063                     return -1;
3064                 offset += 4;
3065                 break;
3066 
3067             case dt_UINT:       /* I */
3068                 buffer[cur++] = 'i';
3069                 buffer[cur++] = ':';
3070                 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)LE2HI32(base + offset));
3071                 if ((cur += j) >= maxsize)
3072                     return -1;
3073                 offset += 4;
3074                 break;
3075 
3076             case dt_FLOAT32:    /* f */
3077                 buffer[cur++] = 'f';
3078                 buffer[cur++] = ':';
3079                 fi.i = LE2HUI32(base + offset);
3080                 j = snprintf(buffer + cur, maxsize - cur, "%f", fi.f);
3081                 if ((cur += j) >= maxsize)
3082                     return -1;
3083                 offset += 4;
3084                 break;
3085 
3086             case dt_HEXSTRING:  /* H */
3087             case dt_CSTRING:    /* Z */
3088                 buffer[cur++] = type == dt_CSTRING ? 'Z' : 'H';
3089                 buffer[cur++] = ':';
3090                 for ( ; ; ) {
3091                     int const ch = base[offset++];
3092 
3093                     if (ch == '\0')
3094                         break;
3095                     if (cur >= maxsize)
3096                         return -1;
3097                     buffer[cur++] = ch;
3098                 }
3099                 break;
3100 
3101             case dt_NUM_ARRAY:  /* B */
3102                 buffer[cur++] = 'B';
3103                 buffer[cur++] = ':';
3104                 {
3105                     int const elemtype = base[offset++];
3106                     unsigned const elemcount = LE2HUI32(base + offset);
3107                     unsigned k;
3108 
3109                     if (cur + 2 >= maxsize)
3110                         return -1;
3111                     buffer[cur++] = elemtype;
3112                     offset += 4;
3113                     for (k = 0; k < elemcount; ++k) {
3114                         buffer[cur++] = ',';
3115                         switch (elemtype) {
3116                             case dt_INT8:
3117                                 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)*((int8_t const *)(base + offset)));
3118                                 if ((cur += j) >= maxsize)
3119                                     return -1;
3120                                 offset += 1;
3121                                 break;
3122 
3123                             case dt_UINT8:
3124                                 j = snprintf(buffer + cur, maxsize - cur, "%u", (unsigned)*((uint8_t const *)(base + offset)));
3125                                 if ((cur += j) >= maxsize)
3126                                     return -1;
3127                                 offset += 1;
3128                                 break;
3129 
3130                             case dt_INT16:
3131                                 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)LE2HI16(base + offset));
3132                                 if ((cur += j) >= maxsize)
3133                                     return -1;
3134                                 offset += 2;
3135                                 break;
3136 
3137                             case dt_UINT16:
3138                                 j = snprintf(buffer + cur, maxsize - cur, "%u", (unsigned)LE2HUI16(base + offset));
3139                                 if ((cur += j) >= maxsize)
3140                                     return -1;
3141                                 offset += 2;
3142                                 break;
3143 
3144                             case dt_INT:
3145                                 j = snprintf(buffer + cur, maxsize - cur, "%i", (int)LE2HI32(base + offset));
3146                                 if ((cur += j) >= maxsize)
3147                                     return -1;
3148                                 offset += 4;
3149                                 break;
3150 
3151                             case dt_UINT:
3152                                 j = snprintf(buffer + cur, maxsize - cur, "%u", (unsigned)LE2HUI32(base + offset));
3153                                 if ((cur += j) >= maxsize)
3154                                     return -1;
3155                                 offset += 4;
3156                                 break;
3157 
3158                             case dt_FLOAT32:
3159                                 fi.i = LE2HUI32(base + offset);
3160                                 j = snprintf(buffer + cur, maxsize - cur, "%f", fi.f);
3161                                 if ((cur += j) >= maxsize)
3162                                     return -1;
3163                                 offset += 4;
3164 
3165                             default:
3166                                 return -1;
3167                                 break;
3168                         }
3169                     }
3170                 }
3171                 break;
3172 
3173             default:
3174                 return -1;
3175                 break;
3176         }
3177     }
3178     return cur;
3179 }
3180 
FormatSAM(BAMAlignment const * self,size_t * const actsize,size_t const maxsize,char * const buffer)3181 static rc_t FormatSAM(BAMAlignment const *self,
3182                       size_t *const actsize,
3183                       size_t const maxsize,
3184                       char *const buffer)
3185 {
3186     int i = 0;
3187     size_t cur = 0;
3188     unsigned j;
3189     int const refSeqId = getRefSeqId(self);
3190     int const refPos = getPosition(self);
3191     unsigned const cigCount = getCigarCount(self);
3192     uint32_t const *const cigar = getCigarBase(self);
3193     int const mateRefSeqId = getMateRefSeqId(self);
3194     int const mateRefPos = getMatePos(self);
3195     unsigned const readlen = getReadLen(self);
3196 
3197     i = snprintf(&buffer[cur], maxsize - cur,
3198                  "%s\t%i\t%s\t%i\t%i\t",
3199                  getReadName(self),
3200                  getFlags(self),
3201                  refSeqId < 0 ? "*" : self->parent->refSeq[refSeqId].name,
3202                  refPos < 0 ? 0 : refPos + 1,
3203                  getMapQual(self)
3204                  );
3205     if ((cur += i) > maxsize)
3206         return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3207 
3208     if (cigCount > 0) {
3209         for (j = 0; j < cigCount; ++j) {
3210             uint32_t const el = cigar[j];
3211             BAMCigarType const type = (BAMCigarType)(cigarChars[el & 0x0F]);
3212             unsigned const length = el >> 4;
3213 
3214             i = snprintf(&buffer[cur], maxsize - cur, "%u%c", length, type);
3215             if ((cur += i) > maxsize)
3216                 return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3217         }
3218     }
3219     else {
3220         if ((cur + 1) > maxsize)
3221             return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3222         buffer[cur++] = '*';
3223     }
3224     i = snprintf(&buffer[cur], maxsize - cur,
3225                  "\t%s\t%i\t%i\t",
3226                  mateRefSeqId < 0 ? "*" : mateRefSeqId == refSeqId ? "=" : self->parent->refSeq[mateRefSeqId].name,
3227                  mateRefPos < 0 ? 0 : mateRefPos + 1,
3228                  getInsertSize(self)
3229                  );
3230     if ((cur += i) > maxsize)
3231         return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3232     if (readlen) {
3233         uint8_t const *const qual = &self->data->raw[self->qual];
3234 
3235         if (cur + 2 * readlen + 1 > maxsize)
3236             return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3237         BAMAlignmentGetSequence(self, &buffer[cur]);
3238         cur += readlen;
3239         buffer[cur] = '\t';
3240         ++cur;
3241 
3242         for (j = 0; j < readlen; ++j) {
3243             if (qual[j] != 0xFF)
3244                 goto HAS_QUAL;
3245         }
3246         if (1) {
3247             buffer[cur++] = '*';
3248         }
3249         else {
3250     HAS_QUAL:
3251             for (j = 0; j < readlen; ++j)
3252                 buffer[cur++] = qual[j] + 33;
3253         }
3254     }
3255     else {
3256         i = snprintf(&buffer[cur], maxsize - cur, "*\t*");
3257         if ((cur += i) > maxsize)
3258             return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3259     }
3260     i = FormatOptData(self, maxsize - cur, &buffer[cur]);
3261     if (i < 0)
3262         return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3263     if ((cur += i) + 2 > maxsize)
3264         return RC(rcAlign, rcRow, rcReading, rcData, rcExcessive);
3265     buffer[cur++] = '\n';
3266     buffer[cur] = '\0';
3267     *actsize = cur;
3268 
3269     return 0;
3270 }
3271 
3272 #define FORMAT_SAM_SCRATCH_SIZE ((size_t)(64u * 1024u))
FormatSAMBuffer(BAMAlignment const * self,size_t actSize[],size_t const maxsize,char * const buffer)3273 static rc_t FormatSAMBuffer(BAMAlignment const *self,
3274                             size_t actSize[],
3275                             size_t const maxsize,
3276                             char *const buffer)
3277 {
3278     char scratch[FORMAT_SAM_SCRATCH_SIZE];
3279     size_t actsize = 0;
3280     rc_t const rc = FormatSAM(self, &actsize, FORMAT_SAM_SCRATCH_SIZE, scratch);
3281 
3282     actSize[0] = actsize;
3283     if (rc) return rc;
3284 
3285     if (actsize > maxsize)
3286         return RC(rcAlign, rcReading, rcRow, rcBuffer, rcInsufficient);
3287 
3288     memmove(buffer, scratch, actsize);
3289     return 0;
3290 }
3291 
BAMAlignmentFormatSAM(BAMAlignment const * self,size_t * const actSize,size_t const maxsize,char * const buffer)3292 LIB_EXPORT rc_t CC BAMAlignmentFormatSAM(BAMAlignment const *self,
3293                                          size_t *const actSize,
3294                                          size_t const maxsize,
3295                                          char *const buffer)
3296 {
3297     if (self == NULL)
3298         return RC(rcAlign, rcReading, rcRow, rcSelf, rcNull);
3299     if (buffer == NULL)
3300         return RC(rcAlign, rcReading, rcRow, rcParam, rcNull);
3301     else {
3302         size_t actsize = 0;
3303         rc_t const rc = (maxsize < FORMAT_SAM_SCRATCH_SIZE ? FormatSAMBuffer : FormatSAM)(self, &actsize, maxsize, buffer);
3304 
3305         if (actSize)
3306             *actSize = actsize;
3307         return rc;
3308     }
3309 }
3310 
3311 typedef struct OptForEach_ctx_s {
3312     BAMOptData *val;
3313     BAMOptData **alloced;
3314     size_t valsize;
3315     rc_t rc;
3316     BAMOptionalDataFunction user_f;
3317     void *user_ctx;
3318 } OptForEach_ctx_t;
3319 
i_OptDataForEach(BAMAlignment const * cself,void * Ctx,char const tag[2],BAMOptDataValueType type,unsigned count,void const * value,unsigned size)3320 static bool i_OptDataForEach(BAMAlignment const *cself, void *Ctx, char const tag[2], BAMOptDataValueType type, unsigned count, void const *value, unsigned size)
3321 {
3322     OptForEach_ctx_t *ctx = (OptForEach_ctx_t *)Ctx;
3323     size_t const need = (size_t)&((BAMOptData const *)NULL)->u.f64[(count * size + sizeof(double) - 1)/sizeof(double)];
3324 
3325     if (need > ctx->valsize) {
3326         void *const temp = realloc(ctx->alloced, need);
3327         if (temp == NULL) {
3328             ctx->rc = RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
3329             return true;
3330         }
3331         *ctx->alloced = ctx->val = temp;
3332         ctx->valsize = need;
3333     }
3334     ctx->val->type = type;
3335     ctx->val->element_count = (type == dt_CSTRING || type == dt_HEXSTRING) ? size - 1 : count;
3336 
3337     memmove(ctx->val->u.u8, value, size * count);
3338 #if __BYTE_ORDER == __BIG_ENDIAN
3339     {{
3340         unsigned di;
3341         uint32_t elem_count = ctx->val->element_count;
3342 
3343         switch (size) {
3344         case 2:
3345             for (di = 0; di != elem_count; ++di)
3346                 ctx->val->u.u16[di] = LE2HUI16(&ctx->val->u.u16[di]);
3347             break;
3348         case 4:
3349             for (di = 0; di != elem_count; ++di)
3350                 ctx->val->u.u32[di] = LE2HUI32(&ctx->val->u.u32[di]);
3351             break;
3352         case 8:
3353             for (di = 0; di != elem_count; ++di)
3354                 ctx->val->u.u64[di] = LE2HUI64(&ctx->val->u.u64[di]);
3355             break;
3356         }
3357     }}
3358 #endif
3359     ctx->rc = ctx->user_f(ctx->user_ctx, tag, ctx->val);
3360     return ctx->rc != 0;
3361 }
3362 
BAMAlignmentOptDataForEach(const BAMAlignment * cself,void * user_ctx,BAMOptionalDataFunction f)3363 LIB_EXPORT rc_t CC BAMAlignmentOptDataForEach(const BAMAlignment *cself, void *user_ctx, BAMOptionalDataFunction f)
3364 {
3365     union u {
3366         BAMOptData value;
3367         uint8_t storage[4096];
3368     } value_auto;
3369     OptForEach_ctx_t ctx;
3370     rc_t rc = 0;
3371     unsigned i;
3372 
3373     ctx.val = &value_auto.value;
3374     ctx.alloced = NULL;
3375     ctx.valsize = sizeof(value_auto);
3376     ctx.rc = 0;
3377     ctx.user_f = f;
3378     ctx.user_ctx = user_ctx;
3379 
3380     for (i = 0; i != cself->numExtra; ++i) {
3381         char const *const tag = (char const *)&cself->data->raw[cself->extra[i].offset];
3382         uint8_t type = tag[2];
3383         uint8_t const *const vp = (uint8_t const *)&tag[3];
3384         unsigned len = cself->extra[i].size - 3;
3385         unsigned size = cself->extra[i].size - 3;
3386         unsigned count = 1;
3387         unsigned offset = 0;
3388 
3389         if (type == dt_NUM_ARRAY) {
3390             unsigned elem_size = 0;
3391             uint32_t elem_count = 0;
3392 
3393             offset = len = 5;
3394             switch (vp[0]) {
3395             case dt_INT8:
3396             case dt_UINT8:
3397                 elem_size = 1;
3398                 break;
3399             case dt_INT16:
3400             case dt_UINT16:
3401                 elem_size = 2;
3402                 break;
3403             case dt_FLOAT32:
3404             case dt_INT:
3405             case dt_UINT:
3406                 elem_size = 4;
3407                 break;
3408 #if 0
3409             case dt_FLOAT64:
3410                 elem_size = 8;
3411                 break;
3412 #endif
3413             default:
3414                 rc = RC(rcAlign, rcFile, rcReading, rcData, rcUnexpected);
3415                 break;
3416             }
3417             if (rc)
3418                 break;
3419             elem_count = LE2HUI32(&vp[1]);
3420             len += elem_size * elem_count;
3421             type = vp[0];
3422             count = elem_count;
3423             size = elem_size;
3424             break;
3425         }
3426         if (i_OptDataForEach(cself, &ctx, tag, type, count, &vp[offset], size))
3427             break;
3428     }
3429     rc = rc ? rc : ctx.rc;
3430     if (ctx.alloced)
3431         free(ctx.alloced);
3432     return rc;
3433 }
3434 
3435 /* MARK: Complete Genomics stuff */
3436 
BAMAlignmentHasCGData(BAMAlignment const * self)3437 LIB_EXPORT bool CC BAMAlignmentHasCGData(BAMAlignment const *self)
3438 {
3439     return get_CG_GC_info(self) && get_CG_GS_info(self) && get_CG_GQ_info(self);
3440 }
3441 
BAMAlignmentParseCGTag(BAMAlignment const * self,size_t const max_cg_segs,unsigned cg_segs[])3442 static bool BAMAlignmentParseCGTag(BAMAlignment const *self, size_t const max_cg_segs, unsigned cg_segs[/* max_cg_segs */])
3443 {
3444     /*** patern in cg_segs should be nSnGnSnG - no more then 7 segments **/
3445     struct offset_size_s const *const GCi = get_CG_GC_info(self);
3446     char const *cg  = (char const *)&self->data->raw[GCi->offset + 3];
3447     char const *const end = cg + GCi->size - 4;
3448     unsigned iseg = 0;
3449     char last_op = 'S';
3450 
3451     memset(cg_segs, 0, max_cg_segs * sizeof(cg_segs[0]));
3452 
3453     while (cg < end && iseg < max_cg_segs) {
3454         char *endp;
3455         long const op_len = strtol(cg, &endp, 10);
3456         char const op = *(cg = endp);
3457 
3458         ++cg;
3459         if (op==last_op) {
3460             cg_segs[iseg] += op_len;
3461         }
3462         else {
3463             last_op = op;
3464             ++iseg;
3465             cg_segs[iseg] = (unsigned)op_len;
3466         }
3467     }
3468     return true;
3469 }
3470 
3471 static
ExtractInt32(BAMAlignment const * self,int32_t * result,struct offset_size_s const * const tag)3472 rc_t ExtractInt32(BAMAlignment const *self, int32_t *result,
3473                   struct offset_size_s const *const tag)
3474 {
3475     int64_t y;
3476     int const type = self->data->raw[tag->offset + 2];
3477     void const *const pvalue = &self->data->raw[tag->offset + 3];
3478 
3479     switch (type) {
3480     case 'c':
3481         if (tag->size == 4)
3482             y = *((int8_t const *)pvalue);
3483         else
3484             return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3485         break;
3486     case 'C':
3487         if (tag->size == 4)
3488             y = *((uint8_t const *)pvalue);
3489         else
3490             return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3491         break;
3492     case 's':
3493         if (tag->size == 5)
3494             y = LE2HI16(pvalue);
3495         else
3496             return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3497         break;
3498     case 'S':
3499         if (tag->size == 5)
3500             y = LE2HUI16(pvalue);
3501         else
3502             return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3503         break;
3504     case 'i':
3505         if (tag->size == 7)
3506             y = LE2HI32(pvalue);
3507         else
3508             return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3509         break;
3510     case 'I':
3511         if (tag->size == 7)
3512             y = LE2HUI32(pvalue);
3513         else
3514             return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3515         break;
3516     default:
3517         return RC(rcAlign, rcRow, rcReading, rcData, rcNotFound);
3518     }
3519     if (INT32_MIN <= y && y <= INT32_MAX) {
3520         *result = (int32_t)y;
3521         return 0;
3522     }
3523     return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3524 }
3525 
3526 LIB_EXPORT
BAMAlignmentGetCGAlignGroup(BAMAlignment const * self,char buffer[],size_t max_size,size_t * act_size)3527 rc_t CC BAMAlignmentGetCGAlignGroup(BAMAlignment const *self,
3528                                     char buffer[],
3529                                     size_t max_size,
3530                                     size_t *act_size)
3531 {
3532     struct offset_size_s const *const ZA = get_CG_ZA_info(self);
3533     struct offset_size_s const *const ZI = get_CG_ZI_info(self);
3534 
3535     if (ZA && ZI) {
3536         rc_t rc;
3537         int32_t za;
3538         int32_t zi;
3539 
3540         rc = ExtractInt32(self, &za, ZA); if (rc) return rc;
3541         rc = ExtractInt32(self, &zi, ZI); if (rc) return rc;
3542         return string_printf(buffer, max_size, act_size, "%i_%i", zi, za);
3543     }
3544     return RC(rcAlign, rcRow, rcReading, rcData, rcNotFound);
3545 }
3546 
3547 LIB_EXPORT
BAMAlignmentGetCGSeqQual(BAMAlignment const * self,char sequence[],uint8_t quality[])3548 rc_t CC BAMAlignmentGetCGSeqQual(BAMAlignment const *self,
3549                                  char sequence[],
3550                                  uint8_t quality[])
3551 {
3552     struct offset_size_s const *const GCi = get_CG_GC_info(self);
3553     struct offset_size_s const *const GSi = get_CG_GS_info(self);
3554     struct offset_size_s const *const GQi = get_CG_GQ_info(self);
3555 
3556     if (GCi && GSi && GQi) {
3557         char const *const vGS = (char const *)&self->data->raw[GSi->offset + 3];
3558         char const *const GQ  = (char const *)&self->data->raw[GQi->offset + 3];
3559         unsigned const GSsize = GSi->size - 4;
3560         unsigned const sn = getReadLen(self);
3561         unsigned cg_segs[2*CG_NUM_SEGS-1]; /** 4 segments + 3gaps **/
3562         unsigned i,G,S;
3563 
3564         if (GSi->size != GQi->size)
3565             return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3566 
3567         if (SequenceLengthFromCIGAR(self) != sn)
3568             return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3569 
3570         if (!BAMAlignmentParseCGTag(self, 2*CG_NUM_SEGS-1, cg_segs))
3571             return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3572 
3573         for (S = cg_segs[0], G = 0, i = 1; i < CG_NUM_SEGS; ++i) { /** sum all S and G **/
3574             S += cg_segs[2*i];
3575             G += cg_segs[2*i-1];
3576         }
3577         if (G + G != GSsize || S + G > sn || sn + G != 35) {
3578             /*fprintf(stderr, "GSsize: %u; sn: %u; S: %u; G: %u\n", GSsize, sn, S, G);*/
3579             return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3580         }
3581         if (G > 0) {
3582             unsigned nsi = cg_segs[0];   /** new index into sequence */
3583             unsigned osi = nsi + G;      /** old index into sequence */
3584             unsigned k;                  /** index into inserted sequence **/
3585 
3586             /***make room for inserts **/
3587             memmove(sequence + osi, sequence + nsi, sn - nsi);
3588             memmove(quality  + osi, quality  + nsi, sn - nsi);
3589 
3590             for (i = 1, k = 0; i < CG_NUM_SEGS && nsi < osi; ++i) {/*** when osi and nsi meet we are done ***/
3591                 unsigned j;
3592 
3593                 for (j = cg_segs[2*i-1]; j > 0; --j) { /** insert mode **/
3594                     sequence[nsi] = vGS[k];
3595                     quality [nsi] = GQ[k] - 33;
3596                     ++nsi; ++k;
3597                     sequence[nsi] = vGS[k];
3598                     quality [nsi] = GQ[k] - 33;
3599                     ++nsi;
3600                     ++osi;
3601                     ++k;
3602                 }
3603                 if (nsi < osi){
3604                     for (j = cg_segs[2*i]; j > 0; --j) { /** copy mode **/
3605                         sequence[nsi] = sequence[osi];
3606                         quality[nsi]  = quality[osi];
3607                         ++nsi;
3608                         ++osi;
3609                     }
3610                 }
3611             }
3612         }
3613         return 0;
3614     }
3615     return RC(rcAlign, rcRow, rcReading, rcData, rcNotFound);
3616 }
3617 
3618 
splice(uint32_t cigar[],unsigned n,unsigned at,unsigned out,unsigned in,uint32_t const new_values[])3619 static unsigned splice(uint32_t cigar[], unsigned n, unsigned at, unsigned out, unsigned in, uint32_t const new_values[/* in */])
3620 {
3621     assert(at + out <= n);
3622     memmove(&cigar[at + in], &cigar[at + out], (n - at - out) * 4);
3623     if (in)
3624         memmove(&cigar[at], new_values, in * 4);
3625     return n + in - out;
3626 }
3627 
3628 #define OPCODE_2_FIX (0xF)
3629 
insert_B(unsigned S,unsigned G,unsigned const n,uint32_t cigar[])3630 static unsigned insert_B(unsigned S, unsigned G, unsigned const n, uint32_t cigar[/* n */])
3631 {
3632     unsigned i;
3633     unsigned pos;
3634     unsigned const T = S + G;
3635 
3636     for (pos = i = 0; i < n; ++i) {
3637         int const opcode = cigar[i] & 0xF;
3638 
3639         switch (opcode) {
3640         case 0:
3641         case 1:
3642         case 4:
3643         case 7:
3644         case 8:
3645             {{
3646                 unsigned const len = cigar[i] >> 4;
3647                 unsigned const nxt = pos + len;
3648 
3649                 if (pos <= T && T <= nxt) {
3650                     unsigned const l = T - pos;
3651                     unsigned const r = len - l;
3652                     unsigned B = i + 2;
3653                     unsigned in = 4;
3654                     uint32_t Ops[4];
3655                     uint32_t *ops = Ops;
3656 
3657                     Ops[0] = (l << 4) | opcode;
3658                     Ops[1] = (G << 4) | 9; /* B */
3659                     Ops[2] = (G << 4) | 0; /* M this is not backwards */
3660                     Ops[3] = (r << 4) | opcode;
3661 
3662                     if (r == 0)
3663                         --in;
3664                     if (l == 0) {
3665                         ++ops;
3666                         --in;
3667                         --B;
3668                     }
3669                     return splice(cigar, n, i, 1, in, ops);
3670                 }
3671                 pos = nxt;
3672             }}
3673             break;
3674         default:
3675             break;
3676         }
3677     }
3678     return n;
3679 }
3680 
3681 #if 0
3682 static unsigned fix_I(uint32_t cigar[], unsigned n)
3683 {
3684     unsigned i;
3685     /* int last_b = 0; */
3686 
3687     for (i = 0; i < n; ++i) {
3688         unsigned const opcode = cigar[i] & 0xF;
3689 
3690         if (opcode == 0xF) {
3691             unsigned const oplen = cigar[i] >> 4;
3692             uint32_t ops[2];
3693 
3694             if (0/*last_b*/) {
3695                 ops[0] = (oplen << 4) | 0; /* M */
3696                 ops[1] = (oplen << 4) | 9; /* B */
3697             }
3698             else {
3699                 ops[0] = (oplen << 4) | 9; /* B */
3700                 ops[1] = (oplen << 4) | 0; /* M */
3701             }
3702 
3703             n = splice(cigar, n, i, 1, 2, ops);
3704             ++i;
3705         }
3706 /*      else if (opcode == 9)
3707             last_b = 1;
3708         else
3709             last_b = 0; */
3710     }
3711     return n;
3712 }
3713 
3714 static unsigned fix_IN(uint32_t cigar[], unsigned n)
3715 {
3716     unsigned i;
3717 
3718     for (i = 1; i < n; ++i) {
3719         unsigned const opL = cigar[i-1] & 0xF;
3720         unsigned const opI = cigar[ i ] & 0xF;
3721 
3722         if (opL == 1 && opI == 3) {
3723             unsigned const oplen = cigar[i-1] >> 4;
3724             uint32_t ops[2];
3725 
3726             ops[0] = (oplen << 4) | 9; /* B */
3727             ops[1] = (oplen << 4) | 0; /* M */
3728 
3729             n = splice(cigar, n, i-1, 1, 2, ops);
3730             ++i;
3731         }
3732         else if (opL == 3 && opI == 1) {
3733             unsigned const oplen = cigar[i] >> 4;
3734             uint32_t ops[2];
3735 
3736             ops[0] = (oplen << 4) | 9; /* M */
3737             ops[1] = (oplen << 4) | 0; /* B */
3738 
3739             n = splice(cigar, n, i, 1, 2, ops);
3740             ++i;
3741         }
3742     }
3743     return n;
3744 }
3745 #endif
3746 
canonicalize(uint32_t cigar[],unsigned n)3747 static unsigned canonicalize(uint32_t cigar[], unsigned n)
3748 {
3749     unsigned i;
3750 
3751     for (i = n; i > 0; ) {
3752         --i;
3753         if (cigar[i] >> 4 == 0 || (cigar[i] & 0xF) == 6)
3754             n = splice(cigar, n, i, 1, 0, NULL);
3755     }
3756     for (i = 1; i < n; ) {
3757         unsigned const opL = cigar[i-1] & 0xF;
3758         unsigned const opI = cigar[ i ] & 0xF;
3759 
3760         if (opI == opL) {
3761             unsigned const oplen = (cigar[i] >> 4) + (cigar[i-1] >> 4);
3762             uint32_t const op = (oplen << 4) | opI;
3763 
3764             n = splice(cigar, n, i-1, 2, 1, &op);
3765         }
3766         else
3767             ++i;
3768     }
3769 #if 0
3770     if ((cigar[0] & 0xF) == 1)
3771         cigar[0] = (cigar[0] & ~(uint32_t)0xF) | 4; /* I -> S */
3772     if ((cigar[n - 1] & 0xF) == 1)
3773         cigar[n - 1] = (cigar[n - 1] & ~(uint32_t)0xF) | 4; /* I -> S */
3774 #endif
3775     return n;
3776 }
3777 
3778 /* static void reverse(uint32_t cigar[], unsigned n)
3779 {
3780     unsigned i;
3781     unsigned j;
3782 
3783     for (j = n - 1, i = 0; i < j; ++i, --j) {
3784         uint32_t const tmp = cigar[i];
3785         cigar[i] = cigar[j];
3786         cigar[j] = tmp;
3787     }
3788 } */
3789 
GetCGCigar(BAMAlignment const * self,unsigned const N,uint32_t cigar[])3790 static unsigned GetCGCigar(BAMAlignment const *self, unsigned const N, uint32_t cigar[/* N */])
3791 {
3792     unsigned i;
3793     unsigned G;
3794     unsigned S;
3795     unsigned n = getCigarCount(self);
3796     unsigned cg_segs[2*CG_NUM_SEGS-1]; /** 4 segments + 3 gaps **/
3797 
3798     if (!BAMAlignmentParseCGTag(self, 2*CG_NUM_SEGS-1, cg_segs))
3799         return RC(rcAlign, rcRow, rcReading, rcData, rcInvalid);
3800 
3801     if (N < n + 5)
3802         return RC(rcAlign, rcRow, rcReading, rcBuffer, rcInsufficient);
3803 
3804     memmove(cigar, getCigarBase(self), n * 4);
3805     n = canonicalize(cigar, n); /* just in case */
3806     for (i = 0, S = 0; i < CG_NUM_SEGS - 1; ++i) {
3807         S += cg_segs[2*i];
3808         G  = cg_segs[2*i+1];
3809         if (G > 0) {
3810             n = insert_B(S, G, n, cigar);
3811             S += G;
3812         }
3813     }
3814     return n;
3815 }
3816 
3817 LIB_EXPORT
BAMAlignmentGetCGCigar(BAMAlignment const * self,uint32_t * cigar,uint32_t cig_max,uint32_t * cig_act)3818 rc_t CC BAMAlignmentGetCGCigar(BAMAlignment const *self,
3819                                uint32_t *cigar,
3820                                uint32_t cig_max,
3821                                uint32_t *cig_act)
3822 {
3823     struct offset_size_s const *const GCi = get_CG_GC_info(self);
3824 
3825     *cig_act = 0;
3826 
3827     if (GCi) {
3828         *cig_act = GetCGCigar(self, cig_max, cigar);
3829         return 0;
3830     }
3831     return RC(rcAlign, rcRow, rcReading, rcData, rcNotFound);
3832 }
3833 
3834 /* MARK: end CG stuff */
3835 
BAMAlignmentGetTI(BAMAlignment const * self,uint64_t * ti)3836 LIB_EXPORT rc_t BAMAlignmentGetTI(BAMAlignment const *self, uint64_t *ti)
3837 {
3838     char const *const TI = get_XT(self);
3839     long long unsigned temp;
3840 
3841     if (TI && sscanf(TI, "ti|%llu", &temp) == 1) {
3842         *ti = (uint64_t)temp;
3843         return 0;
3844     }
3845     return RC(rcAlign, rcRow, rcReading, rcData, rcNotFound);
3846 }
3847 
BAMAlignmentGetRNAStrand(BAMAlignment const * const self,uint8_t * const rslt)3848 LIB_EXPORT rc_t BAMAlignmentGetRNAStrand(BAMAlignment const *const self, uint8_t *const rslt)
3849 {
3850     if (rslt) {
3851         uint8_t const *const XS = get_XS(self);
3852 
3853 	    *rslt = XS ? XS[0] : ' ';
3854     }
3855     return 0;
3856 }
3857 
3858 /* MARK: BAMIndex stuff */
3859 
get_pos(uint8_t const buf[])3860 static uint64_t get_pos(uint8_t const buf[])
3861 {
3862     return LE2HUI64(buf);
3863 }
3864 
3865 #if 0
3866 static uint16_t bin2ival(uint16_t bin)
3867 {
3868     if (bin < 1)
3869         return 0; /* (bin - 0) << 15; */
3870 
3871     if (bin < 9)
3872         return (bin - 1) << 12;
3873 
3874     if (bin < 73)
3875         return (bin - 9) << 9;
3876 
3877     if (bin < 585)
3878         return (bin - 73) << 6;
3879 
3880     if (bin < 4681)
3881         return (bin - 585) << 3;
3882 
3883     if (bin < 37449)
3884         return (bin - 4681) << 0;
3885 
3886     return 0;
3887 }
3888 
3889 static uint16_t bin_ival_count(uint16_t bin)
3890 {
3891     if (bin < 1)
3892         return 1 << 15;
3893 
3894     if (bin < 9)
3895         return 1 << 12;
3896 
3897     if (bin < 73)
3898         return 1 << 9;
3899 
3900     if (bin < 585)
3901         return 1 << 6;
3902 
3903     if (bin < 4681)
3904         return 1 << 3;
3905 
3906     if (bin < 37449)
3907         return 1;
3908 
3909     return 0;
3910 }
3911 #endif
3912 
3913 enum BAMIndexStructureTypes {
3914     bai_pairs,
3915     bai_intervals
3916 };
3917 
3918 typedef rc_t (*WalkIndexStructureCallBack)(const uint8_t data[], size_t dlen,
3919                                            unsigned refNo,
3920                                            unsigned refs,
3921                                            enum BAMIndexStructureTypes type,
3922                                            unsigned binNo,
3923                                            unsigned bins,
3924                                            unsigned elements,
3925                                            void *ctx);
3926 
3927 static
WalkIndexStructure(uint8_t const buf[],size_t const blen,WalkIndexStructureCallBack func,void * ctx)3928 rc_t WalkIndexStructure(uint8_t const buf[], size_t const blen,
3929                         WalkIndexStructureCallBack func,
3930                         void *ctx
3931                         )
3932 {
3933     unsigned cp = 0;
3934     int32_t nrefs;
3935     unsigned i;
3936     rc_t rc;
3937 
3938     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index data length: %u", blen));
3939 
3940     if (cp + 4 > blen)
3941         return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3942 
3943     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index signature: '%c%c%c%u'", buf[cp+0], buf[cp+1], buf[cp+2], buf[cp+3]));
3944     if (memcmp(buf + cp, "BAI\1", 4) != 0)
3945         return RC(rcAlign, rcIndex, rcReading, rcFormat, rcUnknown);
3946 
3947     cp += 4;
3948     if (cp + 4 > blen)
3949         return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3950 
3951     nrefs = LE2HI32(buf + cp); cp += 4;
3952     DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index reference count: %i", nrefs));
3953 
3954     if (nrefs == 0)
3955         return RC(rcAlign, rcIndex, rcReading, rcData, rcEmpty);
3956 
3957     for (i = 0; i < nrefs; ++i) {
3958         int32_t bins;
3959         int32_t chunks;
3960         int32_t intervals;
3961         unsigned di;
3962 
3963         DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index reference %u: starts at %u", i, cp));
3964         if (cp + 4 > blen)
3965             return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3966 
3967         bins = LE2HI32(buf + cp); cp += 4;
3968         DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index reference %u: %i bins", i, nrefs));
3969 
3970         for (di = 0; di < bins; ++di) {
3971             uint32_t binNo;
3972 
3973             if (cp + 8 > blen)
3974                 return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3975 
3976             binNo = LE2HUI32(buf + cp); cp += 4;
3977             chunks = LE2HI32(buf + cp); cp += 4;
3978             DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index reference %u, bin %u: %i chunks", i, binNo, chunks));
3979 
3980             if (cp + 16 * chunks > blen)
3981                 return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3982             if (chunks > 0)
3983                 rc = func(&buf[cp], 16 * chunks, i, nrefs, bai_pairs, binNo, bins, chunks, ctx);
3984             if (rc)
3985                 return rc;
3986             cp += 16 * chunks;
3987         }
3988         if (cp + 4 > blen)
3989             return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3990 
3991         intervals = LE2HI32(buf + cp); cp += 4;
3992         DBGMSG(DBG_ALIGN, DBG_FLAG(DBG_ALIGN_BAM), ("Index reference %u: %i intervals", i, intervals));
3993 
3994         if (cp + 8 * intervals > blen)
3995             return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
3996         rc = func(&buf[cp], 8 * intervals, i, nrefs, bai_intervals, ~(unsigned)0, bins, intervals, ctx);
3997         if (rc)
3998             return rc;
3999         cp += 8 * intervals;
4000     }
4001     if (cp > blen)
4002         return RC(rcAlign, rcIndex, rcReading, rcData, rcInsufficient);
4003     return 0;
4004 }
4005 
4006 struct MeasureIndex_s {
4007     unsigned refs;
4008     unsigned pairs;
4009     unsigned intervals;
4010 };
4011 
4012 struct LoadIndex2_s {
4013     BAMIndex *self;
4014     unsigned cur_rng;
4015     unsigned cur_pos;
4016 };
4017 
4018 static
MeasureIndex(const uint8_t data[],size_t dlen,unsigned refNo,unsigned refs,enum BAMIndexStructureTypes type,unsigned binNo,unsigned bins,unsigned elements,void * Ctx)4019 rc_t MeasureIndex(const uint8_t data[], size_t dlen, unsigned refNo,
4020                   unsigned refs, enum BAMIndexStructureTypes type,
4021                   unsigned binNo, unsigned bins,
4022                   unsigned elements, void *Ctx)
4023 {
4024     struct MeasureIndex_s *ctx = (struct MeasureIndex_s *)Ctx;
4025 
4026     ctx->refs = refs;
4027     if (elements != 0) {
4028         if (type == bai_intervals) {
4029             ctx->intervals += elements;
4030         }
4031         else if (type == bai_pairs && binNo < MAX_BIN) {
4032             ctx->pairs += elements;
4033         }
4034     }
4035     return 0;
4036 }
4037 
4038 static
LoadIndex2(const uint8_t data[],size_t dlen,unsigned refNo,unsigned refs,enum BAMIndexStructureTypes type,unsigned binNo,unsigned bins,unsigned elements,void * Ctx)4039 rc_t LoadIndex2(const uint8_t data[], size_t dlen, unsigned refNo,
4040                 unsigned refs, enum BAMIndexStructureTypes type,
4041                 unsigned binNo, unsigned bins,
4042                 unsigned elements, void *Ctx)
4043 {
4044     struct LoadIndex2_s *const ctx = (struct LoadIndex2_s *)Ctx;
4045     BAMIndexReference *const self = &ctx->self->ref[refNo];
4046     unsigned i;
4047 
4048     if (type == bai_pairs) {
4049         if (binNo < MAX_BIN) {
4050             BAMFileRange *const dst = ctx->self->rng + ctx->cur_rng;
4051             if (self->bin == NULL)
4052                 self->bin = dst;
4053             self->binSize[binNo] = elements;
4054             self->binStart[binNo] = dst - self->bin;
4055             for (i = 0; i < elements; ++i) {
4056                 dst[i].start = get_pos(data + 16 * i + 0);
4057                 dst[i].end   = get_pos(data + 16 * i + 8);
4058             }
4059             ctx->cur_rng += elements;
4060         }
4061         else if (binNo == MAX_BIN) {
4062             self->start = get_pos(data + 0);
4063             self->end   = get_pos(data + 8);
4064         }
4065     }
4066     else if (type == bai_intervals) {
4067         BAMFilePosition *const dst = ctx->self->pos + ctx->cur_pos;
4068         self->intervals = elements;
4069         self->interval = dst;
4070         for (i = 0; i < elements; ++i) {
4071             dst[i] = get_pos(data + 8 * i);
4072         }
4073         ctx->cur_pos += elements;
4074     }
4075     return 0;
4076 }
4077 
4078 static
LoadIndex(BAMFile * self,const uint8_t buf[],size_t blen)4079 rc_t LoadIndex(BAMFile *self, const uint8_t buf[], size_t blen)
4080 {
4081     rc_t rc;
4082     struct MeasureIndex_s ctx1;
4083 
4084     memset(&ctx1, 0, sizeof(ctx1));
4085     rc = WalkIndexStructure(buf, blen, MeasureIndex, &ctx1);
4086     if (rc == 0) {
4087         size_t indexRootSize = sizeof(BAMIndex) + sizeof(BAMIndexReference) * (ctx1.refs - 1);
4088         size_t indexDataCount = ctx1.pairs * 2 + ctx1.intervals;
4089         BAMIndex *idx = calloc(1, indexRootSize);
4090         if (idx) {
4091             idx->numRefs = ctx1.refs;
4092             idx->data = calloc(indexDataCount, sizeof(BAMFilePosition));
4093             idx->rng = idx->data;
4094             idx->pos = (void *)(idx->rng + ctx1.pairs);
4095             if (idx->data) {
4096                 struct LoadIndex2_s ctx2;
4097 
4098                 ctx2.self = idx;
4099                 ctx2.cur_pos = 0;
4100                 ctx2.cur_rng = 0;
4101                 WalkIndexStructure(buf, blen, LoadIndex2, &ctx2);
4102 
4103                 if (self->ndx)
4104                     BAMIndexWhack(self->ndx);
4105                 self->ndx = idx;
4106             }
4107             else
4108                 rc = RC(rcAlign, rcIndex, rcReading, rcMemory, rcExhausted);
4109         }
4110         else
4111             rc = RC(rcAlign, rcIndex, rcReading, rcMemory, rcExhausted);
4112     }
4113     return rc;
4114 }
4115 
4116 static
BAMFileOpenIndexKFile(const BAMFile * self,KFile const * kf)4117 rc_t BAMFileOpenIndexKFile(const BAMFile *self, KFile const *kf)
4118 {
4119     rc_t rc;
4120     size_t fsize;
4121     uint8_t *buf;
4122     {
4123         uint64_t u64;
4124 
4125         rc = KFileSize(kf, &u64);
4126         if (sizeof(size_t) < sizeof(u64) && (size_t)u64 != u64) {
4127             return RC(rcAlign, rcIndex, rcReading, rcData, rcExcessive);
4128         }
4129         fsize = u64;
4130     }
4131     if (rc == 0) {
4132         buf = malloc(fsize);
4133         if (buf != NULL) {
4134             size_t nread;
4135 
4136             rc = KFileReadAll(kf, 0, buf, fsize, &nread);
4137             if (rc == 0) {
4138                 if (nread == fsize) {
4139                     rc = LoadIndex((BAMFile *)self, buf, nread);
4140                     free(buf);
4141                     return rc;
4142                 }
4143                 rc = RC(rcAlign, rcIndex, rcReading, rcData, rcInvalid);
4144             }
4145             free(buf);
4146         }
4147         else
4148             rc = RC(rcAlign, rcIndex, rcReading, rcMemory, rcExhausted);
4149     }
4150     return rc;
4151 }
4152 
BAMFileOpenIndex(const BAMFile * self,const char * path)4153 LIB_EXPORT rc_t CC BAMFileOpenIndex(const BAMFile *self, const char *path)
4154 {
4155     const KFile *kf;
4156     rc_t rc;
4157     KDirectory *dir;
4158 
4159     rc = KDirectoryNativeDir(&dir);
4160     if (rc) return rc;
4161     rc = KDirectoryOpenFileRead(dir, &kf, "%s", path);
4162     KDirectoryRelease(dir);
4163     if (rc) return rc;
4164     rc = BAMFileOpenIndexKFile(self, kf);
4165     KFileRelease(kf);
4166     return rc;
4167 }
4168 
BAMFileOpenIndexWithVPath(const BAMFile * self,const VPath * path)4169 LIB_EXPORT rc_t CC BAMFileOpenIndexWithVPath(const BAMFile *self, const VPath *path)
4170 {
4171     VFSManager *vfs = NULL;
4172     KFile const *fp = NULL;
4173     rc_t rc = 0;
4174 
4175     rc = VFSManagerMake(&vfs);
4176     if (rc) return rc;
4177 
4178     rc = VFSManagerOpenFileRead( vfs, &fp, path );
4179     VFSManagerRelease(vfs);
4180     if (rc) return rc;
4181 
4182     rc = BAMFileOpenIndexKFile(self, fp);
4183     KFileRelease(fp);
4184     return rc;
4185 }
4186 
BAMFileIsIndexed(const BAMFile * self)4187 LIB_EXPORT bool CC BAMFileIsIndexed(const BAMFile *self)
4188 {
4189 	if (self && self->ndx)
4190 		return true;
4191 	return false;
4192 }
4193 
BAMFileIndexHasRefSeqId(const BAMFile * self,uint32_t refSeqId)4194 LIB_EXPORT bool CC BAMFileIndexHasRefSeqId(const BAMFile *self, uint32_t refSeqId)
4195 {
4196 	if (self && self->ndx && refSeqId < self->ndx->numRefs)
4197 		return true;
4198 	return false;
4199 }
4200 
4201 struct BAMFileSlice {
4202     unsigned refSeqId;
4203     unsigned sliceStart;
4204     unsigned sliceEnd;
4205     unsigned ranges;
4206     unsigned current;
4207     unsigned started;
4208     struct BAMFileRange range[1 /* ranges */];
4209 };
4210 
4211 typedef struct BinRange {
4212     uint16_t beg, end;
4213 } BinRange;
4214 
4215 typedef struct BinList {
4216     BinRange range[6];
4217 } BinList;
4218 
calcBinList(unsigned const refBeg,unsigned const refEnd)4219 BinList calcBinList(unsigned const refBeg, unsigned const refEnd)
4220 {
4221     BinList rslt;
4222     unsigned size = 1 << 29;
4223     unsigned offset = 0;
4224     unsigned i;
4225 
4226     for (i = 0; i < 6; ++i) {
4227         rslt.range[i].beg = offset + refBeg / size;
4228         rslt.range[i].end = offset + (refEnd - 1) / size;
4229         offset += 1 << (3 * i);
4230         size >>= 3;
4231     }
4232     return rslt;
4233 }
4234 
BAMFileRange_cmp(void const * const A,void const * const B,void * ctx)4235 static int64_t CC BAMFileRange_cmp(void const *const A, void const *const B, void *ctx)
4236 {
4237     struct BAMFileRange const *const a = (struct BAMFileRange const *)A;
4238     struct BAMFileRange const *const b = (struct BAMFileRange const *)B;
4239     return a->start < b->start ? -1 : b->start < a->start ? 1 : a->end < b->end ? -1 : b->end < a->end ? 1 : 0;
4240 }
4241 
4242 #define USE_MAXPOS 0
4243 #define USE_MINPOS 1
4244 #define SLICE_VERBOSE 0
4245 
includeRange(BAMFileRange const * const range,BAMFilePosition const minPos,BAMFilePosition const maxPos)4246 static int includeRange(BAMFileRange const *const range, BAMFilePosition const minPos, BAMFilePosition const maxPos)
4247 {
4248 #if USE_MAXPOS
4249     if (maxPos != 0 && range->start >= maxPos)
4250         return 0;
4251 #endif
4252 #if USE_MINPOS
4253     if (minPos != 0 && range->end <= minPos)
4254         return 0;
4255 #endif
4256     return 1;
4257 }
4258 
copyRanges(BAMFileSlice * slice,BAMIndexReference const * const refIndex,BAMFilePosition const minPos,BAMFilePosition const maxPos,BinRange const ranges[6])4259 static void copyRanges(BAMFileSlice *slice,
4260                        BAMIndexReference const *const refIndex,
4261                        BAMFilePosition const minPos,
4262                        BAMFilePosition const maxPos,
4263                        BinRange const ranges[6])
4264 {
4265     unsigned j = 0;
4266     unsigned i;
4267 
4268     for (i = 0; i < 6; ++i) {
4269         BinRange const r = ranges[i];
4270         uint16_t bin = r.beg;
4271         while (bin <= r.end) {
4272             unsigned const binSize = refIndex->binSize[bin];
4273             BAMFileRange const *const range = refIndex->bin + refIndex->binStart[bin];
4274             unsigned k;
4275 
4276             for (k = 0; k < binSize; ++k) {
4277                 if (includeRange(&range[k], minPos, maxPos)) {
4278                     slice->range[j] = range[k];
4279                     ++j;
4280                 }
4281             }
4282             ++bin;
4283         }
4284     }
4285     assert(j == slice->ranges);
4286     ksort(slice->range, slice->ranges, sizeof(slice->range[0]), BAMFileRange_cmp, NULL);
4287 }
4288 
countRanges(BAMIndexReference const * const refIndex,BAMFilePosition const minPos,BAMFilePosition const maxPos,BinRange const ranges[6])4289 static unsigned countRanges(BAMIndexReference const *const refIndex,
4290                             BAMFilePosition const minPos,
4291                             BAMFilePosition const maxPos,
4292                             BinRange const ranges[6])
4293 {
4294     unsigned j = 0;
4295     unsigned i;
4296 
4297 #if SLICE_VERBOSE
4298     fprintf(stderr, "min: %016llX; max: %016llX\n", (unsigned long long)minPos, (unsigned long long)maxPos);
4299 #endif
4300 
4301     for (i = 0; i < 6; ++i) {
4302         BinRange const r = ranges[i];
4303         uint16_t bin = r.beg;
4304         while (bin <= r.end) {
4305             unsigned const binSize = refIndex->binSize[bin];
4306             BAMFileRange const *const range = refIndex->bin + refIndex->binStart[bin];
4307             unsigned k;
4308 
4309             for (k = 0; k < binSize; ++k) {
4310                 if (includeRange(&range[k], minPos, maxPos)) {
4311                     ++j;
4312                 }
4313             }
4314             ++bin;
4315         }
4316     }
4317     return j;
4318 }
4319 
makeSlice(BAMFile const * const self,unsigned const refSeqId,unsigned const alignStart,unsigned const alignEnd)4320 static BAMFileSlice *makeSlice(BAMFile const *const self,
4321                                unsigned const refSeqId,
4322                                unsigned const alignStart,
4323                                unsigned const alignEnd)
4324 {
4325     BinList const bins = calcBinList(alignStart, alignEnd);
4326     BAMIndexReference const *const refIndex = &self->ndx->ref[refSeqId];
4327     unsigned const startBin = alignStart >> 14;
4328     unsigned const endBin = ((alignEnd - 1) >> 14) + 1;
4329     BAMFilePosition const minPos = refIndex->interval[startBin];
4330     BAMFilePosition const maxPos = endBin < refIndex->intervals ? refIndex->interval[endBin] : refIndex->end;
4331     unsigned const ranges = countRanges(refIndex, minPos, maxPos, bins.range);
4332     BAMFileSlice *slice;
4333 
4334     if (ranges > 0) {
4335         slice = malloc(sizeof(*slice) + (ranges - 1) * sizeof(slice->range[0]));
4336         if (slice) {
4337             slice->refSeqId = refSeqId;
4338             slice->sliceStart = alignStart;
4339             slice->sliceEnd = alignEnd;
4340             slice->ranges = ranges;
4341             slice->current = 0;
4342             slice->started = 0;
4343 
4344             copyRanges(slice, refIndex, minPos, maxPos, bins.range);
4345         }
4346     }
4347     else {
4348         slice = calloc(1, sizeof(*slice));
4349         if (slice) {
4350             slice->refSeqId = refSeqId;
4351             slice->sliceStart = alignStart;
4352             slice->sliceEnd = alignEnd;
4353         }
4354     }
4355     return slice;
4356 }
4357 
BAMFileMakeSlice(const BAMFile * self,BAMFileSlice ** rslt,uint32_t refSeqId,uint64_t alignStart,uint64_t alignEnd)4358 LIB_EXPORT rc_t CC BAMFileMakeSlice(const BAMFile *self, BAMFileSlice **rslt, uint32_t refSeqId, uint64_t alignStart, uint64_t alignEnd)
4359 {
4360     if (self->ndx == NULL)
4361         return RC(rcAlign, rcFile, rcPositioning, rcIndex, rcNotFound);
4362     if (refSeqId >= self->refSeqs)
4363         return RC(rcAlign, rcFile, rcPositioning, rcData, rcNotFound);
4364     if (self->ndx->numRefs <= refSeqId)
4365         return RC(rcAlign, rcFile, rcPositioning, rcData, rcNotFound);
4366     if (alignStart >= self->refSeq[refSeqId].length)
4367         return RC(rcAlign, rcFile, rcPositioning, rcData, rcNotFound);
4368 
4369     if (alignEnd > self->refSeq[refSeqId].length)
4370         alignEnd = self->refSeq[refSeqId].length;
4371 
4372     *rslt = makeSlice(self, refSeqId, alignStart, alignEnd);
4373     return 0;
4374 }
4375 
BAMFileReadSlice(const BAMFile * cself,const BAMAlignment ** rhs,BAMFileSlice * slice)4376 LIB_EXPORT rc_t CC BAMFileReadSlice(const BAMFile *cself, const BAMAlignment **rhs, BAMFileSlice *slice)
4377 {
4378     assert(cself != NULL);
4379     assert(rhs != NULL);
4380     assert(slice != NULL);
4381     if (cself == NULL || rhs == NULL || slice == NULL)
4382         return RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
4383 
4384     while (slice->current < slice->ranges) {
4385         if (slice->started == 0) {
4386             rc_t rc = BAMFileSetPosition(cself, &slice->range[slice->current].start);
4387             if (rc) break;
4388         }
4389         ++slice->started;
4390         {
4391             BAMFilePosition const curPos = BAMFileGetPositionInt(cself);
4392             if (curPos < slice->range[slice->current].end) {
4393                 return BAMFileRead2(cself, rhs);
4394             }
4395 #if SLICE_VERBOSE
4396             fprintf(stderr, "slice #%u (%016llX-%016llX) contained %u records\n", slice->current + 1, (unsigned long long)slice->range[slice->current].start, (unsigned long long)slice->range[slice->current].end, slice->started);
4397 #endif
4398             ++slice->current;
4399             if (slice->current == slice->ranges)
4400                 break;
4401             if (curPos < slice->range[slice->current].start)
4402                 slice->started = 0;
4403         }
4404     }
4405     return RC(rcAlign, rcFile, rcReading, rcRow, rcNotFound);
4406 }
4407 
BAMFileSeek(const BAMFile * self,uint32_t refSeqId,uint64_t alignStart,uint64_t alignEnd)4408 LIB_EXPORT rc_t CC BAMFileSeek(const BAMFile *self, uint32_t refSeqId, uint64_t alignStart, uint64_t alignEnd)
4409 {
4410     /* Use MakeSlice and ReadSlice instead */
4411     return RC(rcAlign, rcFile, rcPositioning, rcFunction, rcUnsupported);
4412 }
4413 
BAMIndexWhack(const BAMIndex * cself)4414 static rc_t BAMIndexWhack(const BAMIndex *cself) {
4415     free(cself->data);
4416     free((void *)cself);
4417     return 0;
4418 }
4419 
4420 /* MARK: BAM Validation Stuff */
4421 
OpenVPathRead(const KFile ** fp,struct VPath const * path)4422 static rc_t OpenVPathRead(const KFile **fp, struct VPath const *path)
4423 {
4424     char buffer[4096];
4425     size_t blen;
4426     rc_t rc = VPathReadPath(path, buffer, sizeof(buffer), &blen);
4427 
4428     if (rc == 0) {
4429         KDirectory *dir;
4430 
4431         rc = KDirectoryNativeDir(&dir);
4432         if (rc == 0) {
4433             rc = KDirectoryOpenFileRead(dir, fp, "%.*s", (int)blen, buffer);
4434             KDirectoryRelease(dir);
4435         }
4436     }
4437     return rc;
4438 }
4439 
ReadVPath(void ** data,size_t * dsize,struct VPath const * path)4440 static rc_t ReadVPath(void **data, size_t *dsize, struct VPath const *path)
4441 {
4442     const KFile *fp;
4443     rc_t rc = OpenVPathRead(&fp, path);
4444 
4445     if (rc == 0) {
4446         uint8_t *buff;
4447         uint64_t fsz;
4448         size_t bsz = 0;
4449 
4450         rc = KFileSize(fp, &fsz);
4451         if (rc == 0) {
4452             if ((size_t)fsz != fsz)
4453                 return RC(rcAlign, rcFile, rcReading, rcFile, rcTooBig);
4454             buff = malloc(fsz);
4455             if (buff == NULL)
4456                 return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
4457             do {
4458                 size_t nread;
4459 
4460                 rc = KFileRead(fp, 0, buff + bsz, fsz - bsz, &nread);
4461                 if (rc)
4462                     break;
4463                 bsz += nread;
4464             } while (bsz < (size_t)fsz);
4465             if (rc == 0) {
4466                 *data = buff;
4467                 *dsize = bsz;
4468                 return 0;
4469             }
4470             free(buff);
4471         }
4472     }
4473     return rc;
4474 }
4475 
VPath2BGZF(BGZFile * bgzf,struct VPath const * path)4476 static rc_t VPath2BGZF(BGZFile *bgzf, struct VPath const *path)
4477 {
4478     const KFile *fp;
4479     BGZFile_vt dummy;
4480     rc_t rc = OpenVPathRead(&fp, path);
4481 
4482     if (rc == 0) {
4483         rc = BGZFileInit(bgzf, fp, &dummy);
4484         KFileRelease(fp);
4485     }
4486     return rc;
4487 }
4488 
4489 struct index_data {
4490     uint64_t position;
4491     unsigned refNo;
4492     unsigned binNo;
4493     bool found;
4494 };
4495 
4496 struct buffer_data {
4497     uint64_t position;
4498     size_t size;
4499 };
4500 
4501 typedef struct BAMValidate_ctx_s BAMValidate_ctx_t;
4502 struct BAMValidate_ctx_s {
4503     BAMValidateCallback callback;
4504     void *ctx;
4505     BAMValidateStats *stats;
4506     const uint8_t *bai;
4507     int32_t *refLen;
4508     struct index_data *position;
4509     uint8_t *buf;
4510     uint8_t *nxt;
4511     size_t bsize;
4512     size_t alloced;
4513     size_t dnext;
4514     uint32_t options;
4515     uint32_t lastRefId;
4516     uint32_t lastRefPos;
4517     unsigned npositions;
4518     unsigned mpositions;
4519     unsigned nrefs;
4520     bool cancelled;
4521 };
4522 
4523 static
IndexValidateStructure(const uint8_t data[],size_t dlen,unsigned refNo,unsigned refs,enum BAMIndexStructureTypes type,unsigned binNo,unsigned bins,unsigned elements,void * Ctx)4524 rc_t IndexValidateStructure(const uint8_t data[], size_t dlen,
4525                             unsigned refNo,
4526                             unsigned refs,
4527                             enum BAMIndexStructureTypes type,
4528                             unsigned binNo,
4529                             unsigned bins,
4530                             unsigned elements,
4531                             void *Ctx)
4532 {
4533     BAMValidate_ctx_t *ctx = Ctx;
4534     rc_t rc = 0;
4535 
4536     ctx->stats->baiFilePosition = data - ctx->bai;
4537     rc = ctx->callback(ctx->ctx, 0, ctx->stats);
4538     if (rc)
4539         ctx->cancelled = true;
4540     return rc;
4541 }
4542 
comp_index_data(const void * A,const void * B,void * ignored)4543 static int64_t CC comp_index_data(const void *A, const void *B, void *ignored)
4544 {
4545     const struct index_data *a = A;
4546     const struct index_data *b = B;
4547 
4548     if (a->position < b->position)
4549         return -1;
4550     else if (a->position > b->position)
4551         return 1;
4552     else
4553         return 0;
4554 }
4555 
4556 static
BAMValidateLoadIndex(const uint8_t data[],size_t dlen,unsigned refNo,unsigned refs,enum BAMIndexStructureTypes type,unsigned binNo,unsigned bins,unsigned elements,void * Ctx)4557 rc_t BAMValidateLoadIndex(const uint8_t data[], size_t dlen,
4558                           unsigned refNo,
4559                           unsigned refs,
4560                           enum BAMIndexStructureTypes type,
4561                           unsigned binNo,
4562                           unsigned bins,
4563                           unsigned elements,
4564                           void *Ctx)
4565 {
4566     BAMValidate_ctx_t *ctx = Ctx;
4567     unsigned const n = type == bai_intervals ? elements : elements * 2;
4568     unsigned i;
4569     unsigned j;
4570 
4571     if (type == bai_pairs && binNo >= MAX_BIN)
4572         return 0;
4573 
4574     if (ctx->npositions + elements > ctx->mpositions) {
4575         void *temp;
4576 
4577         do { ctx->mpositions <<= 1; } while (ctx->npositions + elements > ctx->mpositions);
4578         temp = realloc(ctx->position, ctx->mpositions * sizeof(ctx->position[0]));
4579         if (temp == NULL)
4580             return RC(rcAlign, rcIndex, rcReading, rcMemory, rcExhausted);
4581         ctx->position = temp;
4582     }
4583     for (j = i = 0; i != n; ++i) {
4584         uint64_t const pos = get_pos(&data[i * 8]);
4585 
4586         if (type == bai_pairs && (i & 1) != 0)
4587             continue;
4588 
4589         if (pos) {
4590             ctx->position[ctx->npositions + j].refNo = refNo;
4591             ctx->position[ctx->npositions + j].binNo = binNo;
4592             ctx->position[ctx->npositions + j].position = pos;
4593             ++j;
4594         }
4595     }
4596     ctx->npositions += j;
4597     return 0;
4598 }
4599 
4600 static
BAMValidateHeader(const uint8_t data[],unsigned dsize,unsigned * header_len,unsigned * refs_start,unsigned * nrefs,unsigned * data_start)4601 rc_t BAMValidateHeader(const uint8_t data[],
4602                        unsigned dsize,
4603                        unsigned *header_len,
4604                        unsigned *refs_start,
4605                        unsigned *nrefs,
4606                        unsigned *data_start
4607                        )
4608 {
4609     int32_t hlen;
4610     int32_t refs;
4611     unsigned i;
4612     unsigned cp;
4613 
4614     if (dsize < 8)
4615         return RC(rcAlign, rcFile, rcValidating, rcData, rcIncomplete);
4616 
4617     if (memcmp(data, "BAM\1", 4) != 0)
4618         return RC(rcAlign, rcFile, rcValidating, rcFormat, rcUnrecognized);
4619 
4620     hlen = LE2HI32(&data[4]);
4621     if (hlen < 0)
4622         return RC(rcAlign, rcFile, rcValidating, rcData, rcInvalid);
4623 
4624     if (dsize < hlen + 12)
4625         return RC(rcAlign, rcFile, rcValidating, rcData, rcIncomplete);
4626 
4627     refs = LE2HI32(&data[hlen + 8]);
4628     if (refs < 0)
4629         return RC(rcAlign, rcFile, rcValidating, rcData, rcInvalid);
4630 
4631     for (cp = hlen + 12, i = 0; i != refs; ++i) {
4632         int32_t nlen;
4633 
4634         if (dsize < cp + 4)
4635             return RC(rcAlign, rcFile, rcValidating, rcData, rcIncomplete);
4636 
4637         nlen = LE2HI32(&data[cp]);
4638         if (nlen < 0)
4639             return RC(rcAlign, rcFile, rcValidating, rcData, rcInvalid);
4640 
4641         if (dsize < cp + nlen + 4)
4642             return RC(rcAlign, rcFile, rcValidating, rcData, rcIncomplete);
4643 
4644         cp += nlen + 4;
4645     }
4646 
4647     *nrefs = refs;
4648     *refs_start = 12 + (*header_len = hlen);
4649     *data_start = cp;
4650     return 0;
4651 }
4652 
BAMValidateIndex(struct VPath const * bampath,struct VPath const * baipath,BAMValidateOption options,BAMValidateCallback callback,void * callbackContext)4653 static rc_t BAMValidateIndex(struct VPath const *bampath,
4654                              struct VPath const *baipath,
4655                              BAMValidateOption options,
4656                              BAMValidateCallback callback,
4657                              void *callbackContext
4658                              )
4659 {
4660     rc_t rc = 0;
4661     BGZFile bam;
4662     uint8_t *bai = NULL;
4663     size_t bai_size;
4664     BAMValidateStats stats;
4665     BAMValidate_ctx_t ctx;
4666     uint8_t data[2 * ZLIB_BLOCK_SIZE];
4667     uint32_t dsize = 0;
4668     uint64_t pos = 0;
4669     uint32_t temp;
4670     int32_t ref = -1;
4671     int32_t rpos = -1;
4672 
4673     if ((options & bvo_IndexOptions) == 0)
4674         return callback(callbackContext, 0, &stats);
4675 
4676     rc = ReadVPath((void **)&bai, &bai_size, baipath);
4677     if (rc)
4678         return rc;
4679 
4680     memset(&stats, 0, sizeof(stats));
4681     memset(&ctx, 0, sizeof(ctx));
4682 
4683     ctx.bai = bai;
4684     ctx.stats = &stats;
4685     ctx.options = options;
4686     ctx.ctx = callbackContext;
4687     ctx.callback = callback;
4688 
4689     if ((options & bvo_IndexOptions) == bvo_IndexStructure)
4690         return WalkIndexStructure(bai, bai_size, IndexValidateStructure, &ctx);
4691 
4692     rc = VPath2BGZF(&bam, bampath);
4693     if (rc == 0) {
4694         ctx.mpositions = 1024 * 32;
4695         ctx.position = malloc(ctx.mpositions * sizeof(ctx.position[0]));
4696         if (ctx.position == NULL)
4697             return RC(rcAlign, rcIndex, rcReading, rcMemory, rcExhausted);
4698 
4699         rc = WalkIndexStructure(bai, bai_size, BAMValidateLoadIndex, &ctx);
4700         free(bai);
4701         if (rc) {
4702             stats.indexStructureIsBad = true;
4703             rc = callback(callbackContext, rc, &stats);
4704         }
4705         else {
4706             unsigned i = 0;
4707 
4708             stats.indexStructureIsGood = true;
4709             stats.baiFileSize = ctx.npositions;
4710 
4711             ksort(ctx.position, ctx.npositions, sizeof(ctx.position[0]), comp_index_data, 0);
4712 
4713             stats.bamFileSize = bam.fsize;
4714 
4715             while (i < ctx.npositions) {
4716                 uint64_t const ifpos = ctx.position[i].position >> 16;
4717                 uint16_t const bpos = (uint16_t)ctx.position[i].position;
4718 
4719                 stats.baiFilePosition = i;
4720                 if (i == 0 || ifpos != pos) {
4721                     stats.bamFilePosition = pos = ifpos;
4722                     rc = BGZFileSetPos(&bam, pos);
4723                     if (rc == 0)
4724                         rc = BGZFileRead(&bam, data, &dsize);
4725                     if (rc) {
4726                         ++stats.indexFileOffset.error;
4727                         do {
4728                             ++i;
4729                             if (i == ctx.npositions)
4730                                 break;
4731                             if (ctx.position[i].position >> 16 != pos)
4732                                 break;
4733                             ++stats.indexFileOffset.error;
4734                         } while (1);
4735                     }
4736                     else
4737                         ++stats.indexFileOffset.good;
4738 
4739                     rc = callback(callbackContext, rc, &stats);
4740                     if (rc)
4741                         break;
4742                 }
4743                 else
4744                     ++stats.indexFileOffset.good;
4745                 if ((options & bvo_IndexOptions) > bvo_IndexOffsets1) {
4746                     int32_t rsize = 0;
4747                     BAMAlignment algn;
4748 
4749                     if (bpos >= dsize)
4750                         goto BAD_BLOCK_OFFSET;
4751                     if (dsize - bpos < 4) {
4752                     READ_MORE:
4753                         if (dsize > ZLIB_BLOCK_SIZE)
4754                             goto BAD_BLOCK_OFFSET;
4755 
4756                         rc = BGZFileRead(&bam, data + dsize, &temp);
4757                         if (rc) {
4758                             ++stats.blockCompression.error;
4759                             goto BAD_BLOCK_OFFSET;
4760                         }
4761                         dsize += temp;
4762                         if (dsize - bpos < 4 || dsize - bpos < rsize)
4763                             goto BAD_BLOCK_OFFSET;
4764                     }
4765                     rsize = LE2HI32(data + bpos);
4766                     if (rsize <= 0)
4767                         goto BAD_BLOCK_OFFSET;
4768                     if (rsize > 0xFFFF) {
4769                         ++stats.indexBlockOffset.warning;
4770                         ++i;
4771                         continue;
4772                     }
4773                     if (dsize - bpos < rsize)
4774                         goto READ_MORE;
4775 /*                    rc = BAMAlignmentParse(&algn, data + bpos + 4, rsize); */
4776                     if (rc)
4777                         goto BAD_BLOCK_OFFSET;
4778                     ++stats.indexBlockOffset.good;
4779                     if ((options & bvo_IndexOptions) > bvo_IndexOffsets2) {
4780                         int32_t const refSeqId = getRefSeqId(&algn);
4781                         uint16_t const binNo = getBin(&algn);
4782                         int32_t const position = getPosition(&algn);
4783 
4784                         if (ctx.position[i].refNo == refSeqId &&
4785                             (ctx.position[i].binNo == binNo ||
4786                              ctx.position[i].binNo == ~((unsigned)0)
4787                         ))
4788                             ++stats.indexBin.good;
4789                         else if (ctx.position[i].refNo == refSeqId)
4790                             ++stats.indexBin.warning;
4791                         else
4792                             ++stats.indexBin.error;
4793 
4794                         if (refSeqId < ref || position < rpos)
4795                             ++stats.inOrder.error;
4796 
4797                         ref = refSeqId;
4798                         rpos = position;
4799                     }
4800                 }
4801                 if (0) {
4802                 BAD_BLOCK_OFFSET:
4803                     ++stats.indexBlockOffset.error;
4804                 }
4805                 ++i;
4806             }
4807         }
4808 
4809         free(ctx.position);
4810         BGZFileWhack(&bam);
4811     }
4812     stats.bamFilePosition = stats.bamFileSize;
4813     return callback(callbackContext, rc, &stats);
4814 }
4815 
BAMValidate3(BAMValidate_ctx_t * ctx,BAMAlignment const * algn)4816 static rc_t BAMValidate3(BAMValidate_ctx_t *ctx,
4817                          BAMAlignment const *algn
4818                          )
4819 {
4820     rc_t rc = 0;
4821     uint16_t const flags = getFlags(algn);
4822     int32_t const refSeqId = getRefSeqId(algn);
4823     int32_t const refPos = getPosition(algn);
4824     unsigned const mapQ = getMapQual(algn);
4825     bool const aligned =
4826         ((flags & BAMFlags_SelfIsUnmapped) == 0) &&
4827         (refSeqId >= 0) && (refSeqId < ctx->nrefs) &&
4828         (refPos >= 0) && (refPos < ctx->refLen[refSeqId]) && (mapQ > 0);
4829 
4830     if (ctx->options & bvo_ExtraFields) {
4831     }
4832     if (aligned) {
4833         if ((ctx->options & bvo_Sorted) != 0) {
4834             if (ctx->lastRefId < refSeqId || (ctx->lastRefId == refSeqId && ctx->lastRefPos <= refPos))
4835                 ++ctx->stats->inOrder.good;
4836             else
4837                 ++ctx->stats->inOrder.error;
4838             ctx->lastRefId = refSeqId;
4839             ctx->lastRefPos = refPos;
4840         }
4841         if (ctx->options & bvo_CIGARConsistency) {
4842         }
4843         if (ctx->options & bvo_BinConsistency) {
4844         }
4845     }
4846     if (ctx->options & bvo_FlagsConsistency) {
4847     }
4848     if (ctx->options & bvo_QualityValues) {
4849     }
4850     if (ctx->options & bvo_MissingSequence) {
4851     }
4852     if (ctx->options & bvo_MissingQuality) {
4853     }
4854     if (ctx->options & bvo_FlagsStats) {
4855     }
4856     return rc;
4857 }
4858 
BAMValidate2(void * Ctx,const BGZFile * file,rc_t rc,uint64_t fpos,const zlib_block_t data,unsigned dsize)4859 static rc_t BAMValidate2(void *Ctx, const BGZFile *file,
4860                          rc_t rc, uint64_t fpos,
4861                          const zlib_block_t data, unsigned dsize)
4862 {
4863     BAMValidate_ctx_t *ctx = Ctx;
4864     rc_t rc2;
4865     bool fatal = false;
4866 
4867     ctx->stats->bamFilePosition = fpos;
4868     if (rc) {
4869         if (ctx->options == bvo_BlockHeaders)
4870             ++ctx->stats->blockHeaders.error;
4871         else
4872             ++ctx->stats->blockCompression.error;
4873     }
4874     else if (ctx->options == bvo_BlockHeaders) {
4875         ++ctx->stats->blockHeaders.good;
4876     }
4877     else if (ctx->options == bvo_BlockCompression) {
4878         ++ctx->stats->blockHeaders.good;
4879         ++ctx->stats->blockCompression.good;
4880     }
4881     else if (dsize) {
4882         ctx->bsize += dsize;
4883         if (!ctx->stats->bamHeaderIsBad && !ctx->stats->bamHeaderIsGood) {
4884             unsigned header_len;
4885             unsigned refs_start;
4886             unsigned nrefs;
4887             unsigned data_start;
4888 
4889             rc2 = BAMValidateHeader(ctx->buf, ctx->bsize,
4890                                        &header_len, &refs_start,
4891                                        &nrefs, &data_start);
4892 
4893             if (rc2 == 0) {
4894                 ctx->stats->bamHeaderIsGood = true;
4895                 if (ctx->options & bvo_BinConsistency) {
4896                     ctx->refLen = malloc(nrefs * sizeof(ctx->refLen[0]));
4897                     if (ctx->refLen == NULL) {
4898                         rc = RC(rcAlign, rcFile, rcValidating, rcMemory, rcExhausted);
4899                         fatal = true;
4900                     }
4901                     else {
4902                         unsigned cp;
4903                         unsigned i;
4904 
4905                         ctx->nrefs = nrefs;
4906                         for (i = 0, cp = refs_start; cp != data_start; ++i) {
4907                             int32_t len;
4908 
4909                             memmove(&len, &ctx->buf[cp], 4);
4910                             memmove(&ctx->refLen[i], &ctx->buf[cp + 4 + len], 4);
4911                             cp += len + 8;
4912                         }
4913                     }
4914                 }
4915                 ctx->dnext = data_start;
4916             }
4917             else if ( GetRCState( rc2 ) != rcIncomplete || GetRCObject( rc2 ) != (enum RCObject)rcData)
4918             {
4919                 ctx->stats->bamHeaderIsBad = true;
4920                 ctx->options = bvo_BlockCompression;
4921                 rc = rc2;
4922             }
4923             else
4924                 ctx->dnext = ctx->bsize;
4925         }
4926         if (rc == 0) {
4927             if (ctx->stats->bamHeaderIsGood) {
4928                 unsigned cp = ctx->dnext;
4929 
4930                 while (cp + 4 < ctx->bsize) {
4931                     int32_t rsize;
4932 
4933                     rsize = LE2HI32(&ctx->buf[cp]);
4934                     if (rsize < 0) {
4935                         ++ctx->stats->blockStructure.error;
4936                         ctx->options = bvo_BlockStructure;
4937 
4938                         /* throw away the rest of the current buffer */
4939                         if (cp >= ctx->bsize - dsize)
4940                             cp = ctx->bsize;
4941                         else
4942                             cp = ctx->bsize - dsize;
4943 
4944                         rc = RC(rcAlign, rcFile, rcValidating, rcData, rcInvalid);
4945                         break;
4946                     }
4947                     else if (cp + 4 + rsize < ctx->bsize) {
4948                         if (rsize > UINT16_MAX)
4949                             ++ctx->stats->blockStructure.warning;
4950                         else
4951                             ++ctx->stats->blockStructure.good;
4952                         if (ctx->options > bvo_BlockStructure) {
4953                             BAMAlignment algn;
4954 
4955 /*                            rc = BAMAlignmentParse(&algn, &ctx->buf[cp + 4], rsize); */
4956                             if (rc == 0) {
4957                                 ++ctx->stats->recordStructure.good;
4958                                 if (ctx->options > bvo_RecordStructure)
4959                                     rc = BAMValidate3(ctx, &algn);
4960                             }
4961                             else
4962                                 ++ctx->stats->recordStructure.error;
4963                         }
4964                         cp += 4 + rsize;
4965                     }
4966                     else
4967                         break;
4968                 }
4969                 if (&ctx->buf[cp] >= data) {
4970                     if (cp < ctx->bsize) {
4971                         ctx->bsize -= cp;
4972                         memmove(ctx->buf, &ctx->buf[cp], ctx->bsize);
4973                         cp = ctx->bsize;
4974                     }
4975                     else {
4976                         assert(cp == ctx->bsize);
4977                         cp = ctx->bsize = 0;
4978                     }
4979                 }
4980                 ctx->dnext = cp;
4981             }
4982             if (ctx->alloced < ctx->bsize + ZLIB_BLOCK_SIZE) {
4983                 void *temp;
4984 
4985                 temp = realloc(ctx->buf, ctx->alloced + ZLIB_BLOCK_SIZE);
4986                 if (temp == NULL) {
4987                     rc = RC(rcAlign, rcFile, rcValidating, rcMemory, rcExhausted);
4988                     fatal = true;
4989                 }
4990                 else {
4991                     ctx->buf = temp;
4992                     ctx->alloced += ZLIB_BLOCK_SIZE;
4993                 }
4994             }
4995             ctx->nxt = &ctx->buf[ctx->dnext];
4996         }
4997     }
4998     rc2 = ctx->callback(ctx->ctx, rc, ctx->stats);
4999     ctx->cancelled |= rc2 != 0;
5000     return fatal ? rc : rc2;
5001 }
5002 
BAMValidateBAM(struct VPath const * bampath,BAMValidateOption options,BAMValidateCallback callback,void * callbackContext)5003 static rc_t BAMValidateBAM(struct VPath const *bampath,
5004                            BAMValidateOption options,
5005                            BAMValidateCallback callback,
5006                            void *callbackContext
5007                            )
5008 {
5009     rc_t rc;
5010     BGZFile bam;
5011     BAMValidate_ctx_t ctx;
5012     BAMValidateStats stats;
5013 
5014     if (bampath == NULL)
5015         return RC(rcAlign, rcFile, rcValidating, rcParam, rcNull);
5016 
5017     memset(&ctx, 0, sizeof(ctx));
5018     memset(&stats, 0, sizeof(stats));
5019 
5020     ctx.callback = callback;
5021     ctx.ctx = callbackContext;
5022     ctx.options = options;
5023     ctx.stats = &stats;
5024 
5025     if (options > bvo_BlockCompression) {
5026         ctx.alloced = ZLIB_BLOCK_SIZE * 2;
5027         ctx.nxt = ctx.buf = malloc(ctx.alloced);
5028 
5029         if (ctx.buf == NULL)
5030             return RC(rcAlign, rcFile, rcValidating, rcMemory, rcExhausted);
5031     }
5032 
5033     if (options > bvo_RecordStructure)
5034         options = bvo_RecordStructure | (options & 0xFFF0);
5035 
5036     rc = VPath2BGZF(&bam, bampath);
5037     if (rc == 0) {
5038         stats.bamFileSize = bam.fsize;
5039         if ((options & 7) > bvo_BlockHeaders)
5040             rc = BGZFileWalkBlocks(&bam, true, (zlib_block_t *)&ctx.nxt, BAMValidate2, &ctx);
5041         else
5042             rc = BGZFileWalkBlocks(&bam, false, NULL, BAMValidate2, &ctx);
5043     }
5044     BGZFileWhack(&bam);
5045     return rc;
5046 }
5047 
dummy_cb(void * ctx,rc_t result,const BAMValidateStats * stats)5048 static rc_t CC dummy_cb(void *ctx, rc_t result, const BAMValidateStats *stats)
5049 {
5050     return 0;
5051 }
5052 
BAMValidate(struct VPath const * bampath,struct VPath const * baipath,BAMValidateOption options,BAMValidateCallback callback,void * callbackContext)5053 LIB_EXPORT rc_t CC BAMValidate(struct VPath const *bampath,
5054                                struct VPath const *baipath,
5055                                BAMValidateOption options,
5056                                BAMValidateCallback callback,
5057                                void *callbackContext
5058                                )
5059 {
5060     if (callback == NULL)
5061         callback = dummy_cb;
5062     if (bampath == NULL)
5063         return RC(rcAlign, rcFile, rcValidating, rcParam, rcNull);
5064     if (baipath == NULL) {
5065         if (options & bvo_IndexOptions)
5066             return RC(rcAlign, rcFile, rcValidating, rcParam, rcNull);
5067         return BAMValidateBAM(bampath, options, callback, callbackContext);
5068     }
5069     return BAMValidateIndex(bampath, baipath, options, callback, callbackContext);
5070 }
5071