1 /* @source ajbamindex *********************************************************
2 **
3 ** AJAX BAM files indexing/querying functions
4 **
5 ** @version $Revision: 1.12 $
6 ** @modified 2011 Mahmut Uludag ported from samtools (samtools.sf.net)
7 ** @modified $Date: 2012/07/04 07:40:28 $ by $Author: rice $
8 ** @@
9 **
10 ** This library is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU Lesser General Public
12 ** License as published by the Free Software Foundation; either
13 ** version 2.1 of the License, or (at your option) any later version.
14 **
15 ** This library is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ** Lesser General Public License for more details.
19 **
20 ** You should have received a copy of the GNU Lesser General Public
21 ** License along with this library; if not, write to the Free Software
22 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 ** MA  02110-1301,  USA.
24 **
25 ******************************************************************************/
26 
27 /* The MIT License
28 **
29 **   Copyright (c) 2008 Genome Research Ltd (GRL).
30 **
31 **   Permission is hereby granted, free of charge, to any person obtaining
32 **   a copy of this software and associated documentation files (the
33 **   "Software"), to deal in the Software without restriction, including
34 **   without limitation the rights to use, copy, modify, merge, publish,
35 **   distribute, sublicense, and/or sell copies of the Software, and to
36 **   permit persons to whom the Software is furnished to do so, subject to
37 **   the following conditions:
38 **
39 **   The above copyright notice and this permission notice shall be
40 **   included in all copies or substantial portions of the Software.
41 **
42 **   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
43 **   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
44 **   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
45 **   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
46 **   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
47 **   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
48 **   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
49 **   SOFTWARE.
50 */
51 
52 
53 #include "ajlib.h"
54 
55 #include "ajbamindex.h"
56 #include "ajutil.h"
57 
58 #include <ctype.h>
59 #include <assert.h>
60 
61 
62 #ifdef WIN32
63 #define strdup _strdup
64 #endif
65 
66 /*!
67   @header
68 
69   Alignment indexing. Before indexing, BAM must be sorted based on the
70   leftmost coordinate of alignments. In indexing, BAM uses two indices:
71   a UCSC binning index and a simple linear index. The binning index is
72   efficient for alignments spanning long distance, while the auxiliary
73   linear index helps to reduce unnecessary seek calls especially for
74   short alignments.
75 
76   The UCSC binning scheme was suggested by Richard Durbin and Lincoln
77   Stein and is explained by Kent et al. (2002). In this scheme, each bin
78   represents a contiguous genomic region which can be fully contained in
79   another bin; each alignment is associated with a bin which represents
80   the smallest region containing the entire alignment. The binning
81   scheme is essentially another representation of R-tree. A distinct bin
82   uniquely corresponds to a distinct internal node in a R-tree. Bin A is
83   a child of Bin B if region A is contained in B.
84 
85   In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
86   0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
87   585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
88   find the alignments overlapped with a region [rbeg,rend), we need to
89   calculate the list of bins that may be overlapped the region and test
90   the alignments in the bins to confirm the overlaps. If the specified
91   region is short, typically only a few alignments in six bins need to
92   be retrieved. The overlapping alignments can be quickly fetched.
93 
94  */
95 
96 #define BAM_MIN_CHUNK_GAP 32768
97 /* 1<<14 is the size of minimum bin. */
98 
99 #define BAM_LIDX_SHIFT    14
100 
101 #define BAM_MAX_BIN 37450 /* =(8^6-1)/7+1 */
102 
103 
104 struct __bam_iter_t
105 {
106     int from_first; /* read from the first record; no random access */
107     int tid, beg, end, n_off, i, finished;
108     ajulong curr_off;
109     pair64_t *off;
110 };
111 
112 typedef struct __bam_iter_t *bam_iter_t;
113 
114 
115 static void bamBinDel(void** bin);
116 static void bamIndexSave(const AjPBamIndex idx, FILE *fp);
117 static void bamIterDel(bam_iter_t* iter);
118 static bam_iter_t bamIterQuery(const AjPBamIndex idx, int tid,
119 			       int beg, int end);
120 static int bamIterRead(AjPSeqBamBgzf fp, bam_iter_t iter, AjPSeqBam);
121 static AjPBamIndex bamIndexCore(AjPSeqBamBgzf fp);
122 
123 
124 
125 
126 /* @funcstatic bamInsertOffset ************************************************
127 **
128 ** Inserts a new chunk to the binning index h
129 **
130 ** @param [u] h [AjPTable] binning index
131 ** @param [r] bin [ajint] bin number
132 ** @param [r] beg [ajulong] start of target region
133 ** @param [r] end [ajulong] end of target region
134 ** @return [void]
135 **
136 ** @release 6.5.0
137 ** @@
138 ******************************************************************************/
139 
bamInsertOffset(AjPTable h,ajint bin,ajulong beg,ajulong end)140 static inline void bamInsertOffset(AjPTable h, ajint bin, ajulong beg,
141 				   ajulong end)
142 {
143     bam_binlist_t *l = NULL;
144     int* tmp = NULL;
145 
146     l = ajTableFetchmodV(h, &bin);
147 
148     if (!l)
149     {
150 	AJNEW0(l);
151 	l->m = 1;
152 	l->n = 0;
153 	l->list = (pair64_t*)calloc(l->m, 16);
154 
155 	AJNEW0(tmp);
156 	*tmp = bin;
157 
158 	ajTablePut(h, tmp, l);
159     }
160 
161     if (l->n == l->m)
162     {
163 	l->m <<= 1;
164 	l->list = (pair64_t*)realloc(l->list, l->m * 16);
165     }
166 
167     l->list[l->n].u = beg;
168     l->list[l->n++].v = end;
169 }
170 
171 
172 
173 
174 /* @funcstatic bamInsertOffset2 ***********************************************
175 **
176 ** Inserts a new offset to the linear index
177 **
178 ** @param [u] index2 [bam_lidx_t*] linear index
179 ** @param [r] b [const AjPSeqBam] alignment record
180 ** @param [r] offset [ajulong] offset
181 ** @return [void]
182 **
183 ** @release 6.5.0
184 ** @@
185 ******************************************************************************/
186 
bamInsertOffset2(bam_lidx_t * index2,const AjPSeqBam b,ajulong offset)187 static inline void bamInsertOffset2(bam_lidx_t *index2, const AjPSeqBam b,
188 				    ajulong offset)
189 {
190     ajint i;
191     ajint beg;
192     ajint end;
193     ajint old_m;
194 
195     beg = b->core.pos >> BAM_LIDX_SHIFT;
196     end = (ajSeqBamCalend(&b->core, MAJSEQBAMCIGAR(b)) - 1) >> BAM_LIDX_SHIFT;
197 
198     if (index2->m < end + 1)
199     {
200 	old_m = index2->m;
201 	index2->m = end + 1;
202 	kroundup32(index2->m);
203 	index2->offset = (ajulong*)realloc(index2->offset, index2->m * 8);
204 	memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
205     }
206 
207     if (beg == end)
208     {
209 	if (index2->offset[beg] == 0)
210 	    index2->offset[beg] = offset;
211     }
212     else
213     {
214 	for (i = beg; i <= end; ++i)
215 	    if (index2->offset[i] == 0)
216 		index2->offset[i] = offset;
217     }
218 
219     index2->n = end + 1;
220 }
221 
222 
223 
224 
225 /* @funcstatic bamMergeChunks *************************************************
226 **
227 ** merge binning index chunks
228 **
229 ** @param [u] idx [AjPBamIndex] index object
230 ** @return [void]
231 **
232 **
233 ** @release 6.5.0
234 ** @@
235 ******************************************************************************/
236 
bamMergeChunks(AjPBamIndex idx)237 static void bamMergeChunks(AjPBamIndex idx)
238 {
239 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
240     AjPTable bindex = NULL;
241     bam_binlist_t* p = NULL;
242     bam_binlist_t** bins = NULL;
243     ajint** binids = NULL;
244     int i, m;
245     ajulong n;
246     ajuint l;
247     ajulong k =0UL;
248 
249     for (i = 0; i < idx->n; ++i)
250     {
251 	bindex = idx->bindex[i];
252 
253 	n = ajTableToarrayKeysValues(bindex, (void***)&binids, (void***)&bins);
254 
255 	for (k = 0; k<n; ++k)
256 	{
257 
258 	    if(*binids[k] == BAM_MAX_BIN)  /* metadata */
259 		continue;
260 
261 	    p = bins[k];
262 	    m = 0;
263 
264 	    for (l = 1; l < p->n; ++l)
265 	    {
266 
267 #ifdef BAM_TRUE_OFFSET
268 		if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u)
269 		    p->list[m].v = p->list[l].v;
270 #else
271 		if (p->list[m].v>>16 == p->list[l].u>>16)
272 		    p->list[m].v = p->list[l].v;
273 #endif
274 
275 		else
276 		    p->list[++m] = p->list[l];
277 	    }
278 
279 	    p->n = m + 1;
280 
281 	}
282     }
283 
284     AJFREE(bins);
285     AJFREE(binids);
286 
287 #endif
288 }
289 
290 
291 
292 
293 /* @funcstatic bamFillMissing *************************************************
294 **
295 ** Fills missing linear index bits with the values of previous cells
296 ** in the index
297 **
298 ** @param [u] idx [AjPBamIndex] index object
299 ** @return [void]
300 **
301 ** @release 6.5.0
302 ** @@
303 ******************************************************************************/
304 
bamFillMissing(AjPBamIndex idx)305 static void bamFillMissing(AjPBamIndex idx)
306 {
307     bam_lidx_t *idx2 = NULL;
308     ajint i;
309     ajint j;
310 
311     for (i = 0; i < idx->n; ++i)
312     {
313 	idx2 = &idx->index2[i];
314 
315 	for (j = 1; j < idx2->n; ++j)
316 	    if (idx2->offset[j] == 0)
317 		idx2->offset[j] = idx2->offset[j-1];
318     }
319 }
320 
321 
322 
323 
324 /* @funcstatic bamIndexCore ***************************************************
325 **
326 ** Reads BAM index files
327 **
328 ** @param [u] fp [AjPSeqBamBgzf] BAM file handler
329 ** @return [AjPBamIndex] Bam index object
330 **
331 ** @release 6.5.0
332 ** @@
333 ******************************************************************************/
334 
bamIndexCore(AjPSeqBamBgzf fp)335 static AjPBamIndex bamIndexCore(AjPSeqBamBgzf fp)
336 {
337     AjPSeqBam b = NULL;
338     AjPSeqBamHeader h = NULL;
339     AjPBamIndex idx = NULL;
340     AjPSeqBamCore c = NULL;
341     ajint i;
342     ajint ret;
343     ajuint last_bin, save_bin;
344     ajint last_coor;
345     ajint last_tid;
346     ajint save_tid;
347     ajlong last_off;
348     ajulong save_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
349 
350     AJNEW0(idx);
351     AJNEW0(b);
352 
353     h = ajSeqBamHeaderRead(fp);
354 
355     c = &b->core;
356 
357     idx->n = h->n_targets;
358     ajSeqBamHeaderDel(&h);
359 
360     idx->bindex = (AjPTable*)calloc(idx->n, sizeof(AjPTable));
361 
362     for (i = 0; i < idx->n; ++i)
363     {
364 	idx->bindex[i] = ajTableintNew(100);
365     }
366 
367     idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
368 
369     save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
370     save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
371     n_mapped = n_unmapped = n_no_coor = off_end = 0;
372     off_beg = off_end = bam_tell(fp);
373 
374 
375     while ((ret = ajSeqBamRead(fp, b)) >= 0)
376     {
377 	if (c->tid < 0)
378 	    ++n_no_coor;
379 
380 	if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0))
381 	{ /* change of chromosomes/reference-sequences */
382 	    last_tid = c->tid;
383 	    last_bin = 0xffffffffu;
384 	}
385 	else if (last_tid > c->tid)
386 	{
387 	    ajErr("[bam_index_core] the alignment is not sorted (%s):"
388 		    " %d-th chr > %d-th chr\n",
389 		    MAJSEQBAMQNAME(b), last_tid+1, c->tid+1);
390 	    return NULL;
391 	}
392 	else if (c->tid >= 0 && last_coor > c->pos)
393 	{
394 	    ajErr("[bam_index_core] the alignment is not sorted (%s):"
395 		    " %u > %u in %d-th chr\n",
396 		    MAJSEQBAMQNAME(b), last_coor, c->pos, c->tid+1);
397 	    return NULL;
398 	}
399 
400 	if (c->tid >= 0 && !(c->flag & BAM_FUNMAP))
401 	    bamInsertOffset2(&idx->index2[b->core.tid], b, last_off);
402 
403 	if (c->bin != last_bin) /* then possibly write the binning index */
404 	{
405 	    /* save_bin==0xffffffffu only happens to the first record */
406 	    if (save_bin != 0xffffffffu)
407 		bamInsertOffset(idx->bindex[save_tid], save_bin, save_off,
408 		              last_off);
409 
410 	    if (last_bin == 0xffffffffu && save_tid != -1)
411 	    { /* write the meta element */
412 		off_end = last_off;
413 		bamInsertOffset(idx->bindex[save_tid], BAM_MAX_BIN,
414 			off_beg, off_end);
415 		bamInsertOffset(idx->bindex[save_tid], BAM_MAX_BIN,
416 			n_mapped, n_unmapped);
417 		n_mapped = n_unmapped = 0;
418 		off_beg = off_end;
419 	    }
420 
421 	    save_off = last_off;
422 	    save_bin = last_bin = c->bin;
423 	    save_tid = c->tid;
424 
425 	    if (c->tid < 0)
426 		break;
427 	}
428 
429 	if (bam_tell(fp) <= last_off)
430 	{
431 	    ajErr("bamIndexCore: bug in BGZF/RAZF: %llx < %llx\n",
432 	          (unsigned long long)bam_tell(fp),
433 	          (unsigned long long)last_off);
434 	    return NULL;
435 	}
436 
437 	if (c->flag & BAM_FUNMAP)
438 	    ++n_unmapped;
439 	else
440 	    ++n_mapped;
441 
442 	last_off = bam_tell(fp);
443 	last_coor = b->core.pos;
444     }
445 
446     if (save_tid >= 0)
447     {
448 	bamInsertOffset(idx->bindex[save_tid], save_bin, save_off,
449 	              bam_tell(fp));
450 	bamInsertOffset(idx->bindex[save_tid], BAM_MAX_BIN, off_beg,
451 		      bam_tell(fp));
452 	bamInsertOffset(idx->bindex[save_tid], BAM_MAX_BIN, n_mapped,
453 		      n_unmapped);
454     }
455 
456     bamMergeChunks(idx);
457     bamFillMissing(idx);
458 
459     if (ret >= 0)
460     {
461 	while ((ret = ajSeqBamRead(fp, b)) >= 0)
462 	{
463 	    ++n_no_coor;
464 	    if (c->tid >= 0 && n_no_coor)
465 	    {
466 		ajErr("bamIndexCore: the alignment is not sorted:"
467 			" reads without coordinates"
468 			" prior to reads with coordinates.");
469 		return NULL;
470 	    }
471 	}
472     }
473 
474     if (ret < -1)
475 	ajWarn("bamIndexCore: truncated file");
476 
477     AJFREE(b->data);
478     AJFREE(b);
479 
480     idx->n_no_coor = n_no_coor;
481     return idx;
482 }
483 
484 
485 
486 
487 /* @funcstatic bamBinDel ******************************************************
488 **
489 ** Deletes a bin of a binning index
490 **
491 ** is called from ajTableSetDestroyvalue function
492 **
493 ** @param [d] bin [void**] bin object in a binning index
494 ** @return [void]
495 **
496 ** @release 6.5.0
497 ** @@
498 ******************************************************************************/
499 
bamBinDel(void ** bin)500 static void bamBinDel(void** bin)
501 {
502     bam_binlist_t* p = *bin;
503     AJFREE(p->list);
504     AJFREE(p);
505     *bin = NULL;
506 }
507 
508 
509 
510 
511 /* @func ajBamIndexDel ********************************************************
512 **
513 ** Deletes a BAM index object
514 **
515 **
516 ** @param [d] Pidx [AjPBamIndex*] index object
517 ** @return [void]
518 **
519 ** @release 6.5.0
520 ** @@
521 ******************************************************************************/
522 
ajBamIndexDel(AjPBamIndex * Pidx)523 void ajBamIndexDel(AjPBamIndex* Pidx)
524 {
525     AjPBamIndex idx = NULL;
526     int i;
527 
528     idx = *Pidx;
529 
530     if (idx == 0) return;
531 
532     for (i = 0; i < idx->n; ++i)
533     {
534 	bam_lidx_t *index2 = idx->index2 + i;
535 
536 	free(index2->offset);
537 	ajTableSetDestroyvalue(idx->bindex[i], bamBinDel);
538 	ajTableDel(&idx->bindex[i]);
539     }
540 
541     AJFREE(idx->index2);
542     AJFREE(idx->bindex);
543     AJFREE(*Pidx);
544 }
545 
546 
547 
548 
549 /* @funcstatic bamIndexSave ***************************************************
550 **
551 ** Save an index
552 **
553 ** @param [r] idx [const AjPBamIndex] index object
554 ** @param [u] fp [FILE*] file pointer
555 ** @return [void]
556 **
557 ** @release 6.5.0
558 ** @@
559 ******************************************************************************/
560 
bamIndexSave(const AjPBamIndex idx,FILE * fp)561 static void bamIndexSave(const AjPBamIndex idx, FILE *fp)
562 {
563     AjPTable bindex = NULL;
564     bam_lidx_t *index2 = NULL;
565     bam_binlist_t **bins = NULL;
566     ajuint **binids = NULL;
567     ajulong l = 0;
568     ajint i=0;
569     ajuint j=0;
570     ajuint size=0;
571     ajint k=0;
572     ajuint x = 0;
573     ajint bam_is_be = 0;
574 
575     bam_is_be = ajUtilGetBigendian();
576 
577     fwrite("BAI\1", 1, 4, fp);
578 
579     if (bam_is_be)
580     {
581 	x = idx->n;
582 	ajByteRevLen4u(&x);
583 	fwrite(&x, 4, 1, fp);
584     }
585     else
586 	fwrite(&idx->n, 4, 1, fp);
587 
588     for (i = 0; i < idx->n; ++i)
589     {
590 	bindex = idx->bindex[i];
591 	index2 = idx->index2 + i;
592 
593 	/* binning index */
594 
595 	/* # distinct bins */
596 	size = (ajuint) ajTableToarrayKeysValues(bindex,
597 						 (void***)&binids,
598 						 (void***)&bins);
599 
600 	if (bam_is_be) /*big endian*/
601 	{
602 	    x = size;
603 	    ajByteRevLen4u(&x);
604 	    fwrite(&x, 4, 1, fp);
605 	}
606 	else
607 	    fwrite(&size, 4, 1, fp);
608 
609 	for (j = 0; j < size; ++j)
610 	{
611 	    bam_binlist_t *binlist = bins[j];
612 
613 	    if (bam_is_be) /*big endian*/
614 	    {
615 		x = *binids[j];
616 		ajByteRevLen4u(&x);
617 		fwrite(&x, 4, 1, fp);
618 		x = binlist->n;
619 		ajByteRevLen4u(&x);
620 		fwrite(&x, 4, 1, fp);
621 
622 		for (x = 0; x < binlist->n; ++x)
623 		{
624 		    ajByteRevLen8u((ajulong*)&binlist->list[x].u);
625 		    ajByteRevLen8u((ajulong*)&binlist->list[x].v);
626 		}
627 
628 		fwrite(binlist->list, 16, binlist->n, fp);
629 
630 		for (x = 0; x < binlist->n; ++x)
631 		{
632 		    ajByteRevLen8u((ajulong*)&binlist->list[x].u);
633 		    ajByteRevLen8u((ajulong*)&binlist->list[x].v);
634 		}
635 
636 	    }
637 	    else
638 	    {
639 		fwrite(binids[j], 4, 1, fp);
640 		fwrite(&binlist->n, 4, 1, fp);
641 		fwrite(binlist->list, 16, binlist->n, fp);
642 	    }
643 	}
644 
645 	/* write linear index (index2) */
646 	if (bam_is_be)
647 	{
648 	    x = index2->n;
649 	    ajByteRevLen4u(&x);
650 	    fwrite(&x, 4, 1, fp);
651 	}
652 	else
653 	    fwrite(&index2->n, 4, 1, fp);
654 
655 	if (bam_is_be)
656 	{
657 	    for (k = 0; k < index2->n; ++k)
658 		ajByteRevLen8u((ajulong*)&index2->offset[k]);
659 
660 	    fwrite(index2->offset, 8, index2->n, fp);
661 
662 	    for (k = 0; k < index2->n; ++k)
663 		ajByteRevLen8u((ajulong*)&index2->offset[k]);
664 	}
665 	else
666 	    fwrite(index2->offset, 8, index2->n, fp);
667 
668 	AJFREE(bins);
669 	AJFREE(binids);
670     }
671 
672     { /* write the number of reads without coordinate */
673 	l = idx->n_no_coor;
674 
675 	if (bam_is_be)
676 	    ajByteRevLen8u((ajulong*)&l);
677 
678 	fwrite(&l, 8, 1, fp);
679     }
680 
681     fflush(fp);
682 }
683 
684 
685 
686 
687 /* @funcstatic bamIndexLoadCore ***********************************************
688 **
689 ** Load BAM index data from a file
690 **
691 ** @param [u] fp [FILE*] file pointer
692 ** @return [AjPBamIndex] BAM index
693 **
694 ** @release 6.5.0
695 ** @@
696 ******************************************************************************/
697 
bamIndexLoadCore(FILE * fp)698 static AjPBamIndex bamIndexLoadCore(FILE *fp)
699 {
700     AjPBamIndex idx = NULL;
701     ajint i;
702     char magic[4];
703     ajint bam_is_be = 0;
704     bam_lidx_t *index2 = NULL;
705     ajuint binid;
706     ajint n_bin; /* # distinct bins */
707     ajint j;
708     bam_binlist_t *binlist = NULL;
709     ajint* tmp = NULL;
710     ajuint x;
711 
712     bam_is_be = ajUtilGetBigendian();
713 
714     if (fp == 0)
715     {
716 	ajErr("bamIndexLoadCore: fail to load index.\n");
717 	return 0;
718     }
719 
720     fread(magic, 1, 4, fp);
721 
722     if (strncmp(magic, "BAI\1", 4))
723     {
724 	ajErr("bamIndexLoadCore: wrong magic number.\n");
725 	fclose(fp);
726 	return 0;
727     }
728 
729     AJNEW0(idx);
730 
731     fread(&idx->n, 4, 1, fp);
732     if (bam_is_be)
733 	ajByteRevLen4(&idx->n);
734     ajDebug("# reference sequences: %d\n", idx->n);
735 
736     idx->bindex = (AjPTable*)calloc(idx->n, sizeof(AjPTable));
737     idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
738 
739     for (i = 0; i < idx->n; ++i)
740     {
741 	index2 = idx->index2 + i;
742 	binlist = NULL;
743 
744 	idx->bindex[i] = ajTableintNew(10);
745 
746 	/* load binning index */
747 	fread(&n_bin, 4, 1, fp);
748 	if (bam_is_be)
749 	    ajByteRevLen4(&n_bin);
750 	ajDebug("# distinct bins (for the reference sequence %d): %d\n",
751 		i, n_bin);
752 
753 	for (j = 0; j < n_bin; ++j)
754 	{
755 
756 	    fread(&binid, 4, 1, fp);
757 	    if (bam_is_be)
758 		ajByteRevLen4u(&binid);
759 
760 	    AJNEW0(tmp);
761 	    *tmp = binid;
762 	    AJNEW0(binlist);
763 	    ajTablePut(idx->bindex[i], tmp, binlist);
764 
765 	    fread(&binlist->n, 4, 1, fp);
766 	    if (bam_is_be)
767 		ajByteRevLen4u(&binlist->n);
768 	    ajDebug("\tbin %u has %d chunks\n", binid, binlist->n);
769 
770 
771 	    binlist->m = binlist->n;
772 	    binlist->list = (pair64_t*)malloc(binlist->m * 16);
773 	    fread(binlist->list, 16, binlist->n, fp);
774 	    if (bam_is_be)
775 	    {
776 		for (x = 0; x < binlist->n; ++x)
777 		{
778 		    ajByteRevLen8u((ajulong*)&binlist->list[x].u);
779 		    ajByteRevLen8u((ajulong*)&binlist->list[x].v);
780 		}
781 	    }
782 
783 	    if (ajDebugOn())
784 		for (x = 0; x < binlist->n; ++x)
785 		{
786 		    ajlong mChunkStart = binlist->list[x].u;
787 		    ajlong mChunkEnd   = binlist->list[x].v;
788 
789 		    ajDebug("\t\tchunk: %d:%d-%d:%d\n",
790 			    mChunkStart >> 16,
791 			    mChunkStart & 0xFFFF,
792 			    mChunkEnd >> 16,
793 			    mChunkEnd & 0xFFFF);
794 		}
795 
796 	}
797 
798 	/* load linear index */
799 	fread(&index2->n, 4, 1, fp);
800 	if (bam_is_be)
801 	    ajByteRevLen4(&index2->n);
802 	index2->m = index2->n;
803 
804 	ajDebug("\tRef %u has %d 16kbp intervals\n", i, index2->n);
805 
806 	index2->offset = (ajulong*)calloc(index2->m, 8);
807 	fread(index2->offset, index2->n, 8, fp);
808 	if (bam_is_be)
809 	    for (j = 0; j < index2->n; ++j)
810 		ajByteRevLen8u((ajulong*)&index2->offset[j]);
811 
812 
813     }
814 
815     if (fread(&idx->n_no_coor, 8, 1, fp) == 0)
816 	idx->n_no_coor = 0;
817 
818     if (bam_is_be)
819 	ajByteRevLen8u((ajulong*)&idx->n_no_coor);
820 
821     return idx;
822 }
823 
824 
825 
826 
827 /* @func ajBamIndexLoad *******************************************************
828 **
829 ** Load index from file "filename.bai"
830 **
831 ** TODO: ftp and http support not yet implemented
832 **
833 ** @param [r] _fn [const char*] name of the BAM file (NOT the index file)
834 ** @return [AjPBamIndex] BAM index object
835 **
836 ** @release 6.5.0
837 ******************************************************************************/
838 
ajBamIndexLoad(const char * _fn)839 AjPBamIndex ajBamIndexLoad(const char *_fn)
840 {
841     FILE *fp = NULL;
842     AjPBamIndex idx = NULL;
843     char *fnidx = NULL;
844     char *fn = NULL;
845     char *s = NULL;
846     const char *p = NULL;
847     ajint l = 0;
848 
849     if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn)
850     {
851 	l = strlen(_fn);
852 
853 	for (p = _fn + l - 1; p >= _fn; --p)
854 	    if (*p == '/')
855 		break;
856 
857 	fn = strdup(p + 1);
858     }
859     else
860 	fn = strdup(_fn);
861 
862     fnidx = (char*)calloc(strlen(fn) + 5, 1);
863     strcpy(fnidx, fn); strcat(fnidx, ".bai");
864     fp = fopen(fnidx, "rb");
865 
866     if (fp == 0)
867     { /* try "{base}.bai" */
868 	s = strstr(fn, "bam");
869 
870 	if (s == fn + strlen(fn) - 3)
871 	{
872 	    strcpy(fnidx, fn);
873 	    fnidx[strlen(fn)-1] = 'i';
874 	    fp = fopen(fnidx, "rb");
875 	}
876     }
877 
878     free(fnidx);
879     free(fn);
880 
881     if (fp)
882     {
883 	idx = bamIndexLoadCore(fp);
884 	fclose(fp);
885 	return idx;
886     }
887     else
888 	return NULL;
889 }
890 
891 
892 
893 
894 /* @func ajBamIndexBuild ******************************************************
895 **
896 ** Build index for a BAM file, using .bai extension for the index file
897 **
898 ** @param [r] fn [const char*] name of the BAM file
899 ** @return [int] 0 if the index build was successful
900 **
901 **
902 ** @release 6.5.0
903 ******************************************************************************/
904 
ajBamIndexBuild(const char * fn)905 int ajBamIndexBuild(const char *fn)
906 {
907     char *fnidx;
908     FILE *fpidx = NULL;
909     AjPSeqBamBgzf fp = NULL;
910     AjPBamIndex idx = NULL;
911 
912     if(!strcmp(fn, "stdout"))
913 	return -1;
914 
915     if ((fp = ajSeqBamBgzfOpenC(fn, "r")) == 0)
916     {
917 	ajErr("ajBamIndexBuild: fail to open the BAM file '%s'.", fn);
918 	return -1;
919     }
920 
921     idx = bamIndexCore(fp);
922     ajSeqBamBgzfClose(fp);
923 
924     if(idx == 0)
925     {
926 	ajErr("ajBamIndexBuild: fail to index the BAM file '%s'.", fn);
927 	return -1;
928     }
929 
930     fnidx = (char*)calloc(strlen(fn) + 5, 1);
931     strcpy(fnidx, fn); strcat(fnidx, ".bai");
932 
933     fpidx = fopen(fnidx, "wb");
934 
935     if (fpidx == 0)
936     {
937 	ajErr("ajBamIndexBuild: fail to create the index file '%s'.", fn);
938 	free(fnidx);
939 	return -1;
940     }
941 
942     bamIndexSave(idx, fpidx);
943 
944     AJFREE(fnidx);
945 
946     ajBamIndexDel(&idx);
947     fclose(fpidx);
948 
949     return 0;
950 }
951 
952 
953 
954 
955 /* @funcstatic bamReg2bins ****************************************************
956 **
957 ** Calculate the list of bins that may overlap with region [beg, end)
958  * (zero based).
959  *
960  * picard def: Get candidate bins for the specified region.
961 **
962 ** @param [r] beg [ajuint] 1-based start of target region, inclusive
963 ** @param [r] end [ajuint] 1-based end of target region, inclusive
964 ** @param [w] list [ajushort[BAM_MAX_BIN]] list of bins that may overlap
965 **                                         with region
966 ** @return [int] number of bins
967 **
968 ** @release 6.5.0
969 ** @@
970 ******************************************************************************/
971 
bamReg2bins(ajuint beg,ajuint end,ajushort list[BAM_MAX_BIN])972 static inline int bamReg2bins(ajuint beg, ajuint end,
973 			      ajushort list[BAM_MAX_BIN])
974 {
975     ajuint i=0;
976     ajuint k=0;
977 
978     if (beg >= end)
979 	return 0;
980 
981     if (end >= 1u<<29)
982 	end = 1u<<29;
983 
984     --end;
985     list[i++] = 0;
986 
987     for (k =    1 + (beg>>26); k <=    1 + (end>>26); ++k) list[i++] = k;
988     for (k =    9 + (beg>>23); k <=    9 + (end>>23); ++k) list[i++] = k;
989     for (k =   73 + (beg>>20); k <=   73 + (end>>20); ++k) list[i++] = k;
990     for (k =  585 + (beg>>17); k <=  585 + (end>>17); ++k) list[i++] = k;
991     for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
992 
993     return i;
994 }
995 
996 
997 
998 
999 /* @funcstatic bamIsOverlap ***************************************************
1000 **
1001 ** Checks whether a region overlaps a BAM record
1002 **
1003 ** @param [r] beg [ajuint] region start position
1004 ** @param [r] end [ajuint] region end position
1005 ** @param [r] b [const AjPSeqBam] BAM record
1006 ** @return [int] 1 if the region overlaps the BAM record, 0 otherwise
1007 **
1008 ** @release 6.5.0
1009 ** @@
1010 ******************************************************************************/
1011 
bamIsOverlap(ajuint beg,ajuint end,const AjPSeqBam b)1012 static inline int bamIsOverlap(ajuint beg, ajuint end, const AjPSeqBam b)
1013 {
1014     ajuint rbeg = b->core.pos;
1015     ajuint rend = 0;
1016 
1017     if(b->core.n_cigar)
1018 	rend = ajSeqBamCalend(&b->core, MAJSEQBAMCIGAR(b));
1019     else
1020 	rend = b->core.pos + 1;
1021 
1022     return (rend > beg && rbeg < end);
1023 }
1024 
1025 
1026 
1027 
1028 /* @funcstatic bamChunkCompare ************************************************
1029 **
1030 ** Compares the virtual file offset of the start of two chunks
1031 ** for binning indices
1032 **
1033 ** @param [r] a [const void*] first chunk
1034 ** @param [r] b [const void*] second chunk
1035 ** @return [ajint] 1 if the first virtual offset is smaller, 0 otherwise
1036 **
1037 ** @release 6.5.0
1038 ** @@
1039 ******************************************************************************/
1040 
bamChunkCompare(const void * a,const void * b)1041 static ajint bamChunkCompare(const void* a, const void* b)
1042 {
1043     return  (((pair64_t const *)a)->u > ((pair64_t const *)b)->u);
1044 }
1045 
1046 
1047 
1048 
1049 /* @funcstatic bamIterQuery ***************************************************
1050 **
1051 ** query a region on a reference sequence
1052 **
1053 ** @param [r] idx [const AjPBamIndex] BAM index
1054 ** @param [r] tid [int] reference sequence id
1055 ** @param [r] beg [int] 1-based start of target region, inclusive
1056 ** @param [r] end [int] 1-based end of target region, inclusive
1057 ** @return [bam_iter_t] iteration object
1058 **
1059 ** @release 6.5.0
1060 ** @@
1061 ******************************************************************************/
1062 
bamIterQuery(const AjPBamIndex idx,int tid,int beg,int end)1063 static bam_iter_t bamIterQuery(const AjPBamIndex idx, int tid,
1064 			       int beg, int end)
1065 {
1066     pair64_t *off=NULL;
1067     AjPTable bindex = NULL;
1068     AjPSeqBam b = NULL;
1069     ajushort *bins=NULL;
1070     const bam_binlist_t *p = NULL;
1071     ajulong min_off;
1072     bam_iter_t iter = 0;
1073     ajint i=0;
1074     ajint n_bins=0;
1075     ajint n_off=0;
1076     ajint key;
1077     ajint l;
1078     ajint n;
1079     ajuint j;
1080 
1081     if (beg < 0)
1082 	beg = 0;
1083 
1084     if (end < beg)
1085 	return 0;
1086 
1087     /* initialize iter */
1088     iter = calloc(1, sizeof(struct __bam_iter_t));
1089     iter->tid = tid;
1090     iter->beg = beg;
1091     iter->end = end;
1092     iter->i = -1;
1093 
1094     bins = (ajushort*)calloc(BAM_MAX_BIN, 2);
1095     n_bins = bamReg2bins(beg, end, bins);
1096 
1097     bindex = idx->bindex[tid];
1098 
1099     if (idx->index2[tid].n > 0)
1100     {
1101 	if (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)
1102 	    min_off = idx->index2[tid].offset[idx->index2[tid].n-1];
1103 	else
1104 	    min_off = idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
1105 
1106 	if (min_off == 0)
1107 	{ /* improvement for mapindex files built by tabix prior to 0.1.4 */
1108 	    n = beg>>BAM_LIDX_SHIFT;
1109 
1110 	    if (n > idx->index2[tid].n)
1111 		n = idx->index2[tid].n;
1112 
1113 	    for (i = n - 1; i >= 0; --i)
1114 		if (idx->index2[tid].offset[i] != 0)
1115 		    break;
1116 
1117 	    if (i >= 0)
1118 		min_off = idx->index2[tid].offset[i];
1119 	}
1120     }
1121     else
1122 	min_off = 0; /* tabix 0.1.2 may produce such mapindex files */
1123 
1124 
1125     /* iterate over the list of bins that may overlap with region */
1126     for (i = n_off = 0; i < n_bins; ++i)
1127     {
1128 	key = bins[i];
1129 	p =  ajTableFetchV(bindex, &key);
1130 
1131 	if(p)
1132 	{
1133 	    n_off += p->n;
1134 	    ajDebug("bin:%u   # chunks:%d\n", key, p->n);
1135 	}
1136 
1137     }
1138 
1139     ajDebug("n_off:%d\n", n_off);
1140 
1141     if (n_off == 0)
1142     {
1143 	free(bins);
1144 	return iter;
1145     }
1146 
1147     off = (pair64_t*)calloc(n_off, 16);
1148 
1149     for (i = n_off = 0; i < n_bins; ++i)
1150     {
1151 	key = bins[i];
1152 	p = ajTableFetchV(bindex, &key);
1153 
1154 	if(p)
1155 	{
1156 	    for (j = 0; j < p->n; ++j)
1157 	    {
1158 		if (p->list[j].v > min_off)
1159 		    off[n_off++] = p->list[j];
1160 
1161 		ajDebug("p->list[%d].v:%u  min_off:%d\n",
1162 			j, p->list[j].v, min_off);
1163 	    }
1164 	}
1165     }
1166 
1167     free(bins);
1168 
1169     if (n_off == 0)
1170     {
1171 	free(off);
1172 	return iter;
1173     }
1174 
1175     AJNEW0(b);
1176 
1177     qsort(off, n_off, sizeof(pair64_t), bamChunkCompare);
1178 
1179     /* resolve completely contained adjacent blocks */
1180     for (i = 1, l = 0; i < n_off; ++i)
1181 	if (off[l].v < off[i].v)
1182 	    off[++l] = off[i];
1183     n_off = l + 1;
1184 
1185     /* resolve overlaps between adjacent blocks;
1186      * this may happen due to the merge in indexing
1187      */
1188 
1189     for (i = 1; i < n_off; ++i)
1190 	if (off[i-1].v >= off[i].u)
1191 	    off[i-1].v = off[i].u;
1192 
1193     { /* merge adjacent blocks */
1194 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
1195 	    for (i = 1, l = 0; i < n_off; ++i)
1196 	    {
1197 #ifdef BAM_TRUE_OFFSET
1198 		if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u)
1199 		    off[l].v = off[i].v;
1200 #else
1201 		if (off[l].v>>16 == off[i].u>>16)
1202 		    off[l].v = off[i].v;
1203 #endif
1204 		else off[++l] = off[i];
1205 	    }
1206 	    n_off = l + 1;
1207 #endif
1208     }
1209 
1210     ajSeqBamDel(&b);
1211 
1212     iter->n_off = n_off; iter->off = off;
1213 
1214     return iter;
1215 }
1216 
1217 
1218 
1219 
1220 /* @funcstatic bamIterDel *****************************************************
1221 **
1222 ** Deletes a BAM iteration object
1223 **
1224 ** @param [d] iterp [bam_iter_t*] pointer to iteration object to be deleted
1225 ** @return [void]
1226 **
1227 ** @release 6.5.0
1228 ** @@
1229 ******************************************************************************/
1230 
bamIterDel(bam_iter_t * iterp)1231 static void bamIterDel(bam_iter_t* iterp)
1232 {
1233     bam_iter_t iter;
1234 
1235     iter = *iterp;
1236 
1237     if (iter)
1238     {
1239 	free(iter->off);
1240 	free(iter);
1241     }
1242 
1243     *iterp = NULL;
1244 }
1245 
1246 
1247 
1248 
1249 /* @funcstatic bamIterRead ****************************************************
1250 **
1251 ** ajBamFetch helper function retrieves data into alignment record b
1252 **
1253 ** @param [u] fp [AjPSeqBamBgzf] BAM file
1254 ** @param [u] iter [bam_iter_t] BAM iteration object
1255 ** @param [u] b [AjPSeqBam] BAM record
1256 ** @return [int] positive integer (1 or more) if a record successfully read
1257 **
1258 ** @release 6.5.0
1259 ** @@
1260 ******************************************************************************/
1261 
bamIterRead(AjPSeqBamBgzf fp,bam_iter_t iter,AjPSeqBam b)1262 static int bamIterRead(AjPSeqBamBgzf fp, bam_iter_t iter, AjPSeqBam b)
1263 {
1264     int ret;
1265 
1266     if (iter && iter->finished)
1267 	return -1;
1268 
1269     if (iter == 0 || iter->from_first)
1270     {
1271 	ret = ajSeqBamRead(fp, b);
1272 
1273 	if (ret < 0 && iter)
1274 	    iter->finished = 1;
1275 
1276 	return ret;
1277     }
1278 
1279     if (iter->off == 0)
1280 	return -1;
1281 
1282     for (;;)
1283     {
1284 	if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v)
1285 	{ /* then jump to the next chunk */
1286 	    if (iter->i == iter->n_off - 1)
1287 	    {
1288 		ret = -1;
1289 		break; /* no more chunks */
1290 	    }
1291 
1292 	    if (iter->i >= 0)
1293 		assert(iter->curr_off==iter->off[iter->i].v); /*otherwise bug*/
1294 
1295 	    if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u)
1296 	    { /* not adjacent chunks; then seek */
1297 		ajSeqBamBgzfSeek(fp, iter->off[iter->i+1].u, SEEK_SET);
1298 		iter->curr_off = bam_tell(fp);
1299 	    }
1300 	    ++iter->i;
1301 	}
1302 
1303 	if ((ret = ajSeqBamRead(fp, b)) >= 0)
1304 	{
1305 	    iter->curr_off = bam_tell(fp);
1306 
1307 	    if (b->core.tid != iter->tid || b->core.pos >= iter->end)
1308 	    { /* no need to proceed */
1309 
1310 		/* determine whether end of region or error */
1311 		ret = ajSeqBamValidate(NULL, b)? -1 : -5;
1312 		break;
1313 	    }
1314 	    else if (bamIsOverlap(iter->beg, iter->end, b))
1315 		return ret;
1316 	}
1317 	else
1318 	    break; /* end of file or error */
1319     }
1320 
1321     iter->finished = 1;
1322 
1323     return ret;
1324 }
1325 
1326 
1327 
1328 
1329 /* @func ajBamFetch ***********************************************************
1330 **
1331 **  Retrieve the alignments that are overlapped with the specified region.
1332 **
1333 ** @param [u] fp   [AjPSeqBamBgzf] BAM file handler
1334 ** @param [r] idx  [const AjPBamIndex] pointer to the alignment index
1335 ** @param [r] tid  [int]             chromosome ID as is defined in the header
1336 ** @param [r] beg  [int]             start coordinate, 0-based
1337 ** @param [r] end  [int]             end coordinate, 0-based
1338 ** @param [u] data [void*]           user provided data
1339 **                                   (will be transferred to func)
1340 ** @param [f] func [bam_fetch_f]     user defined function
1341 **
1342 ** @return [int] Comparison value. 0 if equal, -1 if first is lower,
1343 **               +1 if first is higher.
1344 **
1345 ** @release 6.5.0
1346 ** @@
1347 ******************************************************************************/
1348 
ajBamFetch(AjPSeqBamBgzf fp,const AjPBamIndex idx,int tid,int beg,int end,void * data,bam_fetch_f func)1349 int ajBamFetch(AjPSeqBamBgzf fp, const AjPBamIndex idx, int tid,
1350 	       int beg, int end, void *data, bam_fetch_f func)
1351 {
1352     int ret;
1353     bam_iter_t iter = NULL;
1354     AjPSeqBam b;
1355 
1356     AJNEW0(b);
1357     iter = bamIterQuery(idx, tid, beg, end);
1358 
1359     while ((ret = bamIterRead(fp, iter, b)) >= 0)
1360 	func(b, data);
1361 
1362     bamIterDel(&iter);
1363     ajSeqBamDel(&b);
1364 
1365     return (ret == -1)? 0 : ret;
1366 }
1367