1 /* @source ajbamindex *********************************************************
2 **
3 ** AJAX BAM files indexing/querying functions
4 **
5 ** @version $Revision: 1.12 $
6 ** @modified 2011 Mahmut Uludag ported from samtools (samtools.sf.net)
7 ** @modified $Date: 2012/07/04 07:40:28 $ by $Author: rice $
8 ** @@
9 **
10 ** This library is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU Lesser General Public
12 ** License as published by the Free Software Foundation; either
13 ** version 2.1 of the License, or (at your option) any later version.
14 **
15 ** This library is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ** Lesser General Public License for more details.
19 **
20 ** You should have received a copy of the GNU Lesser General Public
21 ** License along with this library; if not, write to the Free Software
22 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 ** MA 02110-1301, USA.
24 **
25 ******************************************************************************/
26
27 /* The MIT License
28 **
29 ** Copyright (c) 2008 Genome Research Ltd (GRL).
30 **
31 ** Permission is hereby granted, free of charge, to any person obtaining
32 ** a copy of this software and associated documentation files (the
33 ** "Software"), to deal in the Software without restriction, including
34 ** without limitation the rights to use, copy, modify, merge, publish,
35 ** distribute, sublicense, and/or sell copies of the Software, and to
36 ** permit persons to whom the Software is furnished to do so, subject to
37 ** the following conditions:
38 **
39 ** The above copyright notice and this permission notice shall be
40 ** included in all copies or substantial portions of the Software.
41 **
42 ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
43 ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
44 ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
45 ** NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
46 ** BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
47 ** ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
48 ** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
49 ** SOFTWARE.
50 */
51
52
53 #include "ajlib.h"
54
55 #include "ajbamindex.h"
56 #include "ajutil.h"
57
58 #include <ctype.h>
59 #include <assert.h>
60
61
62 #ifdef WIN32
63 #define strdup _strdup
64 #endif
65
66 /*!
67 @header
68
69 Alignment indexing. Before indexing, BAM must be sorted based on the
70 leftmost coordinate of alignments. In indexing, BAM uses two indices:
71 a UCSC binning index and a simple linear index. The binning index is
72 efficient for alignments spanning long distance, while the auxiliary
73 linear index helps to reduce unnecessary seek calls especially for
74 short alignments.
75
76 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
77 Stein and is explained by Kent et al. (2002). In this scheme, each bin
78 represents a contiguous genomic region which can be fully contained in
79 another bin; each alignment is associated with a bin which represents
80 the smallest region containing the entire alignment. The binning
81 scheme is essentially another representation of R-tree. A distinct bin
82 uniquely corresponds to a distinct internal node in a R-tree. Bin A is
83 a child of Bin B if region A is contained in B.
84
85 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
86 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
87 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
88 find the alignments overlapped with a region [rbeg,rend), we need to
89 calculate the list of bins that may be overlapped the region and test
90 the alignments in the bins to confirm the overlaps. If the specified
91 region is short, typically only a few alignments in six bins need to
92 be retrieved. The overlapping alignments can be quickly fetched.
93
94 */
95
96 #define BAM_MIN_CHUNK_GAP 32768
97 /* 1<<14 is the size of minimum bin. */
98
99 #define BAM_LIDX_SHIFT 14
100
101 #define BAM_MAX_BIN 37450 /* =(8^6-1)/7+1 */
102
103
104 struct __bam_iter_t
105 {
106 int from_first; /* read from the first record; no random access */
107 int tid, beg, end, n_off, i, finished;
108 ajulong curr_off;
109 pair64_t *off;
110 };
111
112 typedef struct __bam_iter_t *bam_iter_t;
113
114
115 static void bamBinDel(void** bin);
116 static void bamIndexSave(const AjPBamIndex idx, FILE *fp);
117 static void bamIterDel(bam_iter_t* iter);
118 static bam_iter_t bamIterQuery(const AjPBamIndex idx, int tid,
119 int beg, int end);
120 static int bamIterRead(AjPSeqBamBgzf fp, bam_iter_t iter, AjPSeqBam);
121 static AjPBamIndex bamIndexCore(AjPSeqBamBgzf fp);
122
123
124
125
126 /* @funcstatic bamInsertOffset ************************************************
127 **
128 ** Inserts a new chunk to the binning index h
129 **
130 ** @param [u] h [AjPTable] binning index
131 ** @param [r] bin [ajint] bin number
132 ** @param [r] beg [ajulong] start of target region
133 ** @param [r] end [ajulong] end of target region
134 ** @return [void]
135 **
136 ** @release 6.5.0
137 ** @@
138 ******************************************************************************/
139
bamInsertOffset(AjPTable h,ajint bin,ajulong beg,ajulong end)140 static inline void bamInsertOffset(AjPTable h, ajint bin, ajulong beg,
141 ajulong end)
142 {
143 bam_binlist_t *l = NULL;
144 int* tmp = NULL;
145
146 l = ajTableFetchmodV(h, &bin);
147
148 if (!l)
149 {
150 AJNEW0(l);
151 l->m = 1;
152 l->n = 0;
153 l->list = (pair64_t*)calloc(l->m, 16);
154
155 AJNEW0(tmp);
156 *tmp = bin;
157
158 ajTablePut(h, tmp, l);
159 }
160
161 if (l->n == l->m)
162 {
163 l->m <<= 1;
164 l->list = (pair64_t*)realloc(l->list, l->m * 16);
165 }
166
167 l->list[l->n].u = beg;
168 l->list[l->n++].v = end;
169 }
170
171
172
173
174 /* @funcstatic bamInsertOffset2 ***********************************************
175 **
176 ** Inserts a new offset to the linear index
177 **
178 ** @param [u] index2 [bam_lidx_t*] linear index
179 ** @param [r] b [const AjPSeqBam] alignment record
180 ** @param [r] offset [ajulong] offset
181 ** @return [void]
182 **
183 ** @release 6.5.0
184 ** @@
185 ******************************************************************************/
186
bamInsertOffset2(bam_lidx_t * index2,const AjPSeqBam b,ajulong offset)187 static inline void bamInsertOffset2(bam_lidx_t *index2, const AjPSeqBam b,
188 ajulong offset)
189 {
190 ajint i;
191 ajint beg;
192 ajint end;
193 ajint old_m;
194
195 beg = b->core.pos >> BAM_LIDX_SHIFT;
196 end = (ajSeqBamCalend(&b->core, MAJSEQBAMCIGAR(b)) - 1) >> BAM_LIDX_SHIFT;
197
198 if (index2->m < end + 1)
199 {
200 old_m = index2->m;
201 index2->m = end + 1;
202 kroundup32(index2->m);
203 index2->offset = (ajulong*)realloc(index2->offset, index2->m * 8);
204 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
205 }
206
207 if (beg == end)
208 {
209 if (index2->offset[beg] == 0)
210 index2->offset[beg] = offset;
211 }
212 else
213 {
214 for (i = beg; i <= end; ++i)
215 if (index2->offset[i] == 0)
216 index2->offset[i] = offset;
217 }
218
219 index2->n = end + 1;
220 }
221
222
223
224
225 /* @funcstatic bamMergeChunks *************************************************
226 **
227 ** merge binning index chunks
228 **
229 ** @param [u] idx [AjPBamIndex] index object
230 ** @return [void]
231 **
232 **
233 ** @release 6.5.0
234 ** @@
235 ******************************************************************************/
236
bamMergeChunks(AjPBamIndex idx)237 static void bamMergeChunks(AjPBamIndex idx)
238 {
239 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
240 AjPTable bindex = NULL;
241 bam_binlist_t* p = NULL;
242 bam_binlist_t** bins = NULL;
243 ajint** binids = NULL;
244 int i, m;
245 ajulong n;
246 ajuint l;
247 ajulong k =0UL;
248
249 for (i = 0; i < idx->n; ++i)
250 {
251 bindex = idx->bindex[i];
252
253 n = ajTableToarrayKeysValues(bindex, (void***)&binids, (void***)&bins);
254
255 for (k = 0; k<n; ++k)
256 {
257
258 if(*binids[k] == BAM_MAX_BIN) /* metadata */
259 continue;
260
261 p = bins[k];
262 m = 0;
263
264 for (l = 1; l < p->n; ++l)
265 {
266
267 #ifdef BAM_TRUE_OFFSET
268 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u)
269 p->list[m].v = p->list[l].v;
270 #else
271 if (p->list[m].v>>16 == p->list[l].u>>16)
272 p->list[m].v = p->list[l].v;
273 #endif
274
275 else
276 p->list[++m] = p->list[l];
277 }
278
279 p->n = m + 1;
280
281 }
282 }
283
284 AJFREE(bins);
285 AJFREE(binids);
286
287 #endif
288 }
289
290
291
292
293 /* @funcstatic bamFillMissing *************************************************
294 **
295 ** Fills missing linear index bits with the values of previous cells
296 ** in the index
297 **
298 ** @param [u] idx [AjPBamIndex] index object
299 ** @return [void]
300 **
301 ** @release 6.5.0
302 ** @@
303 ******************************************************************************/
304
bamFillMissing(AjPBamIndex idx)305 static void bamFillMissing(AjPBamIndex idx)
306 {
307 bam_lidx_t *idx2 = NULL;
308 ajint i;
309 ajint j;
310
311 for (i = 0; i < idx->n; ++i)
312 {
313 idx2 = &idx->index2[i];
314
315 for (j = 1; j < idx2->n; ++j)
316 if (idx2->offset[j] == 0)
317 idx2->offset[j] = idx2->offset[j-1];
318 }
319 }
320
321
322
323
324 /* @funcstatic bamIndexCore ***************************************************
325 **
326 ** Reads BAM index files
327 **
328 ** @param [u] fp [AjPSeqBamBgzf] BAM file handler
329 ** @return [AjPBamIndex] Bam index object
330 **
331 ** @release 6.5.0
332 ** @@
333 ******************************************************************************/
334
bamIndexCore(AjPSeqBamBgzf fp)335 static AjPBamIndex bamIndexCore(AjPSeqBamBgzf fp)
336 {
337 AjPSeqBam b = NULL;
338 AjPSeqBamHeader h = NULL;
339 AjPBamIndex idx = NULL;
340 AjPSeqBamCore c = NULL;
341 ajint i;
342 ajint ret;
343 ajuint last_bin, save_bin;
344 ajint last_coor;
345 ajint last_tid;
346 ajint save_tid;
347 ajlong last_off;
348 ajulong save_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
349
350 AJNEW0(idx);
351 AJNEW0(b);
352
353 h = ajSeqBamHeaderRead(fp);
354
355 c = &b->core;
356
357 idx->n = h->n_targets;
358 ajSeqBamHeaderDel(&h);
359
360 idx->bindex = (AjPTable*)calloc(idx->n, sizeof(AjPTable));
361
362 for (i = 0; i < idx->n; ++i)
363 {
364 idx->bindex[i] = ajTableintNew(100);
365 }
366
367 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
368
369 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
370 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
371 n_mapped = n_unmapped = n_no_coor = off_end = 0;
372 off_beg = off_end = bam_tell(fp);
373
374
375 while ((ret = ajSeqBamRead(fp, b)) >= 0)
376 {
377 if (c->tid < 0)
378 ++n_no_coor;
379
380 if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0))
381 { /* change of chromosomes/reference-sequences */
382 last_tid = c->tid;
383 last_bin = 0xffffffffu;
384 }
385 else if (last_tid > c->tid)
386 {
387 ajErr("[bam_index_core] the alignment is not sorted (%s):"
388 " %d-th chr > %d-th chr\n",
389 MAJSEQBAMQNAME(b), last_tid+1, c->tid+1);
390 return NULL;
391 }
392 else if (c->tid >= 0 && last_coor > c->pos)
393 {
394 ajErr("[bam_index_core] the alignment is not sorted (%s):"
395 " %u > %u in %d-th chr\n",
396 MAJSEQBAMQNAME(b), last_coor, c->pos, c->tid+1);
397 return NULL;
398 }
399
400 if (c->tid >= 0 && !(c->flag & BAM_FUNMAP))
401 bamInsertOffset2(&idx->index2[b->core.tid], b, last_off);
402
403 if (c->bin != last_bin) /* then possibly write the binning index */
404 {
405 /* save_bin==0xffffffffu only happens to the first record */
406 if (save_bin != 0xffffffffu)
407 bamInsertOffset(idx->bindex[save_tid], save_bin, save_off,
408 last_off);
409
410 if (last_bin == 0xffffffffu && save_tid != -1)
411 { /* write the meta element */
412 off_end = last_off;
413 bamInsertOffset(idx->bindex[save_tid], BAM_MAX_BIN,
414 off_beg, off_end);
415 bamInsertOffset(idx->bindex[save_tid], BAM_MAX_BIN,
416 n_mapped, n_unmapped);
417 n_mapped = n_unmapped = 0;
418 off_beg = off_end;
419 }
420
421 save_off = last_off;
422 save_bin = last_bin = c->bin;
423 save_tid = c->tid;
424
425 if (c->tid < 0)
426 break;
427 }
428
429 if (bam_tell(fp) <= last_off)
430 {
431 ajErr("bamIndexCore: bug in BGZF/RAZF: %llx < %llx\n",
432 (unsigned long long)bam_tell(fp),
433 (unsigned long long)last_off);
434 return NULL;
435 }
436
437 if (c->flag & BAM_FUNMAP)
438 ++n_unmapped;
439 else
440 ++n_mapped;
441
442 last_off = bam_tell(fp);
443 last_coor = b->core.pos;
444 }
445
446 if (save_tid >= 0)
447 {
448 bamInsertOffset(idx->bindex[save_tid], save_bin, save_off,
449 bam_tell(fp));
450 bamInsertOffset(idx->bindex[save_tid], BAM_MAX_BIN, off_beg,
451 bam_tell(fp));
452 bamInsertOffset(idx->bindex[save_tid], BAM_MAX_BIN, n_mapped,
453 n_unmapped);
454 }
455
456 bamMergeChunks(idx);
457 bamFillMissing(idx);
458
459 if (ret >= 0)
460 {
461 while ((ret = ajSeqBamRead(fp, b)) >= 0)
462 {
463 ++n_no_coor;
464 if (c->tid >= 0 && n_no_coor)
465 {
466 ajErr("bamIndexCore: the alignment is not sorted:"
467 " reads without coordinates"
468 " prior to reads with coordinates.");
469 return NULL;
470 }
471 }
472 }
473
474 if (ret < -1)
475 ajWarn("bamIndexCore: truncated file");
476
477 AJFREE(b->data);
478 AJFREE(b);
479
480 idx->n_no_coor = n_no_coor;
481 return idx;
482 }
483
484
485
486
487 /* @funcstatic bamBinDel ******************************************************
488 **
489 ** Deletes a bin of a binning index
490 **
491 ** is called from ajTableSetDestroyvalue function
492 **
493 ** @param [d] bin [void**] bin object in a binning index
494 ** @return [void]
495 **
496 ** @release 6.5.0
497 ** @@
498 ******************************************************************************/
499
bamBinDel(void ** bin)500 static void bamBinDel(void** bin)
501 {
502 bam_binlist_t* p = *bin;
503 AJFREE(p->list);
504 AJFREE(p);
505 *bin = NULL;
506 }
507
508
509
510
511 /* @func ajBamIndexDel ********************************************************
512 **
513 ** Deletes a BAM index object
514 **
515 **
516 ** @param [d] Pidx [AjPBamIndex*] index object
517 ** @return [void]
518 **
519 ** @release 6.5.0
520 ** @@
521 ******************************************************************************/
522
ajBamIndexDel(AjPBamIndex * Pidx)523 void ajBamIndexDel(AjPBamIndex* Pidx)
524 {
525 AjPBamIndex idx = NULL;
526 int i;
527
528 idx = *Pidx;
529
530 if (idx == 0) return;
531
532 for (i = 0; i < idx->n; ++i)
533 {
534 bam_lidx_t *index2 = idx->index2 + i;
535
536 free(index2->offset);
537 ajTableSetDestroyvalue(idx->bindex[i], bamBinDel);
538 ajTableDel(&idx->bindex[i]);
539 }
540
541 AJFREE(idx->index2);
542 AJFREE(idx->bindex);
543 AJFREE(*Pidx);
544 }
545
546
547
548
549 /* @funcstatic bamIndexSave ***************************************************
550 **
551 ** Save an index
552 **
553 ** @param [r] idx [const AjPBamIndex] index object
554 ** @param [u] fp [FILE*] file pointer
555 ** @return [void]
556 **
557 ** @release 6.5.0
558 ** @@
559 ******************************************************************************/
560
bamIndexSave(const AjPBamIndex idx,FILE * fp)561 static void bamIndexSave(const AjPBamIndex idx, FILE *fp)
562 {
563 AjPTable bindex = NULL;
564 bam_lidx_t *index2 = NULL;
565 bam_binlist_t **bins = NULL;
566 ajuint **binids = NULL;
567 ajulong l = 0;
568 ajint i=0;
569 ajuint j=0;
570 ajuint size=0;
571 ajint k=0;
572 ajuint x = 0;
573 ajint bam_is_be = 0;
574
575 bam_is_be = ajUtilGetBigendian();
576
577 fwrite("BAI\1", 1, 4, fp);
578
579 if (bam_is_be)
580 {
581 x = idx->n;
582 ajByteRevLen4u(&x);
583 fwrite(&x, 4, 1, fp);
584 }
585 else
586 fwrite(&idx->n, 4, 1, fp);
587
588 for (i = 0; i < idx->n; ++i)
589 {
590 bindex = idx->bindex[i];
591 index2 = idx->index2 + i;
592
593 /* binning index */
594
595 /* # distinct bins */
596 size = (ajuint) ajTableToarrayKeysValues(bindex,
597 (void***)&binids,
598 (void***)&bins);
599
600 if (bam_is_be) /*big endian*/
601 {
602 x = size;
603 ajByteRevLen4u(&x);
604 fwrite(&x, 4, 1, fp);
605 }
606 else
607 fwrite(&size, 4, 1, fp);
608
609 for (j = 0; j < size; ++j)
610 {
611 bam_binlist_t *binlist = bins[j];
612
613 if (bam_is_be) /*big endian*/
614 {
615 x = *binids[j];
616 ajByteRevLen4u(&x);
617 fwrite(&x, 4, 1, fp);
618 x = binlist->n;
619 ajByteRevLen4u(&x);
620 fwrite(&x, 4, 1, fp);
621
622 for (x = 0; x < binlist->n; ++x)
623 {
624 ajByteRevLen8u((ajulong*)&binlist->list[x].u);
625 ajByteRevLen8u((ajulong*)&binlist->list[x].v);
626 }
627
628 fwrite(binlist->list, 16, binlist->n, fp);
629
630 for (x = 0; x < binlist->n; ++x)
631 {
632 ajByteRevLen8u((ajulong*)&binlist->list[x].u);
633 ajByteRevLen8u((ajulong*)&binlist->list[x].v);
634 }
635
636 }
637 else
638 {
639 fwrite(binids[j], 4, 1, fp);
640 fwrite(&binlist->n, 4, 1, fp);
641 fwrite(binlist->list, 16, binlist->n, fp);
642 }
643 }
644
645 /* write linear index (index2) */
646 if (bam_is_be)
647 {
648 x = index2->n;
649 ajByteRevLen4u(&x);
650 fwrite(&x, 4, 1, fp);
651 }
652 else
653 fwrite(&index2->n, 4, 1, fp);
654
655 if (bam_is_be)
656 {
657 for (k = 0; k < index2->n; ++k)
658 ajByteRevLen8u((ajulong*)&index2->offset[k]);
659
660 fwrite(index2->offset, 8, index2->n, fp);
661
662 for (k = 0; k < index2->n; ++k)
663 ajByteRevLen8u((ajulong*)&index2->offset[k]);
664 }
665 else
666 fwrite(index2->offset, 8, index2->n, fp);
667
668 AJFREE(bins);
669 AJFREE(binids);
670 }
671
672 { /* write the number of reads without coordinate */
673 l = idx->n_no_coor;
674
675 if (bam_is_be)
676 ajByteRevLen8u((ajulong*)&l);
677
678 fwrite(&l, 8, 1, fp);
679 }
680
681 fflush(fp);
682 }
683
684
685
686
687 /* @funcstatic bamIndexLoadCore ***********************************************
688 **
689 ** Load BAM index data from a file
690 **
691 ** @param [u] fp [FILE*] file pointer
692 ** @return [AjPBamIndex] BAM index
693 **
694 ** @release 6.5.0
695 ** @@
696 ******************************************************************************/
697
bamIndexLoadCore(FILE * fp)698 static AjPBamIndex bamIndexLoadCore(FILE *fp)
699 {
700 AjPBamIndex idx = NULL;
701 ajint i;
702 char magic[4];
703 ajint bam_is_be = 0;
704 bam_lidx_t *index2 = NULL;
705 ajuint binid;
706 ajint n_bin; /* # distinct bins */
707 ajint j;
708 bam_binlist_t *binlist = NULL;
709 ajint* tmp = NULL;
710 ajuint x;
711
712 bam_is_be = ajUtilGetBigendian();
713
714 if (fp == 0)
715 {
716 ajErr("bamIndexLoadCore: fail to load index.\n");
717 return 0;
718 }
719
720 fread(magic, 1, 4, fp);
721
722 if (strncmp(magic, "BAI\1", 4))
723 {
724 ajErr("bamIndexLoadCore: wrong magic number.\n");
725 fclose(fp);
726 return 0;
727 }
728
729 AJNEW0(idx);
730
731 fread(&idx->n, 4, 1, fp);
732 if (bam_is_be)
733 ajByteRevLen4(&idx->n);
734 ajDebug("# reference sequences: %d\n", idx->n);
735
736 idx->bindex = (AjPTable*)calloc(idx->n, sizeof(AjPTable));
737 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
738
739 for (i = 0; i < idx->n; ++i)
740 {
741 index2 = idx->index2 + i;
742 binlist = NULL;
743
744 idx->bindex[i] = ajTableintNew(10);
745
746 /* load binning index */
747 fread(&n_bin, 4, 1, fp);
748 if (bam_is_be)
749 ajByteRevLen4(&n_bin);
750 ajDebug("# distinct bins (for the reference sequence %d): %d\n",
751 i, n_bin);
752
753 for (j = 0; j < n_bin; ++j)
754 {
755
756 fread(&binid, 4, 1, fp);
757 if (bam_is_be)
758 ajByteRevLen4u(&binid);
759
760 AJNEW0(tmp);
761 *tmp = binid;
762 AJNEW0(binlist);
763 ajTablePut(idx->bindex[i], tmp, binlist);
764
765 fread(&binlist->n, 4, 1, fp);
766 if (bam_is_be)
767 ajByteRevLen4u(&binlist->n);
768 ajDebug("\tbin %u has %d chunks\n", binid, binlist->n);
769
770
771 binlist->m = binlist->n;
772 binlist->list = (pair64_t*)malloc(binlist->m * 16);
773 fread(binlist->list, 16, binlist->n, fp);
774 if (bam_is_be)
775 {
776 for (x = 0; x < binlist->n; ++x)
777 {
778 ajByteRevLen8u((ajulong*)&binlist->list[x].u);
779 ajByteRevLen8u((ajulong*)&binlist->list[x].v);
780 }
781 }
782
783 if (ajDebugOn())
784 for (x = 0; x < binlist->n; ++x)
785 {
786 ajlong mChunkStart = binlist->list[x].u;
787 ajlong mChunkEnd = binlist->list[x].v;
788
789 ajDebug("\t\tchunk: %d:%d-%d:%d\n",
790 mChunkStart >> 16,
791 mChunkStart & 0xFFFF,
792 mChunkEnd >> 16,
793 mChunkEnd & 0xFFFF);
794 }
795
796 }
797
798 /* load linear index */
799 fread(&index2->n, 4, 1, fp);
800 if (bam_is_be)
801 ajByteRevLen4(&index2->n);
802 index2->m = index2->n;
803
804 ajDebug("\tRef %u has %d 16kbp intervals\n", i, index2->n);
805
806 index2->offset = (ajulong*)calloc(index2->m, 8);
807 fread(index2->offset, index2->n, 8, fp);
808 if (bam_is_be)
809 for (j = 0; j < index2->n; ++j)
810 ajByteRevLen8u((ajulong*)&index2->offset[j]);
811
812
813 }
814
815 if (fread(&idx->n_no_coor, 8, 1, fp) == 0)
816 idx->n_no_coor = 0;
817
818 if (bam_is_be)
819 ajByteRevLen8u((ajulong*)&idx->n_no_coor);
820
821 return idx;
822 }
823
824
825
826
827 /* @func ajBamIndexLoad *******************************************************
828 **
829 ** Load index from file "filename.bai"
830 **
831 ** TODO: ftp and http support not yet implemented
832 **
833 ** @param [r] _fn [const char*] name of the BAM file (NOT the index file)
834 ** @return [AjPBamIndex] BAM index object
835 **
836 ** @release 6.5.0
837 ******************************************************************************/
838
ajBamIndexLoad(const char * _fn)839 AjPBamIndex ajBamIndexLoad(const char *_fn)
840 {
841 FILE *fp = NULL;
842 AjPBamIndex idx = NULL;
843 char *fnidx = NULL;
844 char *fn = NULL;
845 char *s = NULL;
846 const char *p = NULL;
847 ajint l = 0;
848
849 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn)
850 {
851 l = strlen(_fn);
852
853 for (p = _fn + l - 1; p >= _fn; --p)
854 if (*p == '/')
855 break;
856
857 fn = strdup(p + 1);
858 }
859 else
860 fn = strdup(_fn);
861
862 fnidx = (char*)calloc(strlen(fn) + 5, 1);
863 strcpy(fnidx, fn); strcat(fnidx, ".bai");
864 fp = fopen(fnidx, "rb");
865
866 if (fp == 0)
867 { /* try "{base}.bai" */
868 s = strstr(fn, "bam");
869
870 if (s == fn + strlen(fn) - 3)
871 {
872 strcpy(fnidx, fn);
873 fnidx[strlen(fn)-1] = 'i';
874 fp = fopen(fnidx, "rb");
875 }
876 }
877
878 free(fnidx);
879 free(fn);
880
881 if (fp)
882 {
883 idx = bamIndexLoadCore(fp);
884 fclose(fp);
885 return idx;
886 }
887 else
888 return NULL;
889 }
890
891
892
893
894 /* @func ajBamIndexBuild ******************************************************
895 **
896 ** Build index for a BAM file, using .bai extension for the index file
897 **
898 ** @param [r] fn [const char*] name of the BAM file
899 ** @return [int] 0 if the index build was successful
900 **
901 **
902 ** @release 6.5.0
903 ******************************************************************************/
904
ajBamIndexBuild(const char * fn)905 int ajBamIndexBuild(const char *fn)
906 {
907 char *fnidx;
908 FILE *fpidx = NULL;
909 AjPSeqBamBgzf fp = NULL;
910 AjPBamIndex idx = NULL;
911
912 if(!strcmp(fn, "stdout"))
913 return -1;
914
915 if ((fp = ajSeqBamBgzfOpenC(fn, "r")) == 0)
916 {
917 ajErr("ajBamIndexBuild: fail to open the BAM file '%s'.", fn);
918 return -1;
919 }
920
921 idx = bamIndexCore(fp);
922 ajSeqBamBgzfClose(fp);
923
924 if(idx == 0)
925 {
926 ajErr("ajBamIndexBuild: fail to index the BAM file '%s'.", fn);
927 return -1;
928 }
929
930 fnidx = (char*)calloc(strlen(fn) + 5, 1);
931 strcpy(fnidx, fn); strcat(fnidx, ".bai");
932
933 fpidx = fopen(fnidx, "wb");
934
935 if (fpidx == 0)
936 {
937 ajErr("ajBamIndexBuild: fail to create the index file '%s'.", fn);
938 free(fnidx);
939 return -1;
940 }
941
942 bamIndexSave(idx, fpidx);
943
944 AJFREE(fnidx);
945
946 ajBamIndexDel(&idx);
947 fclose(fpidx);
948
949 return 0;
950 }
951
952
953
954
955 /* @funcstatic bamReg2bins ****************************************************
956 **
957 ** Calculate the list of bins that may overlap with region [beg, end)
958 * (zero based).
959 *
960 * picard def: Get candidate bins for the specified region.
961 **
962 ** @param [r] beg [ajuint] 1-based start of target region, inclusive
963 ** @param [r] end [ajuint] 1-based end of target region, inclusive
964 ** @param [w] list [ajushort[BAM_MAX_BIN]] list of bins that may overlap
965 ** with region
966 ** @return [int] number of bins
967 **
968 ** @release 6.5.0
969 ** @@
970 ******************************************************************************/
971
bamReg2bins(ajuint beg,ajuint end,ajushort list[BAM_MAX_BIN])972 static inline int bamReg2bins(ajuint beg, ajuint end,
973 ajushort list[BAM_MAX_BIN])
974 {
975 ajuint i=0;
976 ajuint k=0;
977
978 if (beg >= end)
979 return 0;
980
981 if (end >= 1u<<29)
982 end = 1u<<29;
983
984 --end;
985 list[i++] = 0;
986
987 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
988 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
989 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
990 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
991 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
992
993 return i;
994 }
995
996
997
998
999 /* @funcstatic bamIsOverlap ***************************************************
1000 **
1001 ** Checks whether a region overlaps a BAM record
1002 **
1003 ** @param [r] beg [ajuint] region start position
1004 ** @param [r] end [ajuint] region end position
1005 ** @param [r] b [const AjPSeqBam] BAM record
1006 ** @return [int] 1 if the region overlaps the BAM record, 0 otherwise
1007 **
1008 ** @release 6.5.0
1009 ** @@
1010 ******************************************************************************/
1011
bamIsOverlap(ajuint beg,ajuint end,const AjPSeqBam b)1012 static inline int bamIsOverlap(ajuint beg, ajuint end, const AjPSeqBam b)
1013 {
1014 ajuint rbeg = b->core.pos;
1015 ajuint rend = 0;
1016
1017 if(b->core.n_cigar)
1018 rend = ajSeqBamCalend(&b->core, MAJSEQBAMCIGAR(b));
1019 else
1020 rend = b->core.pos + 1;
1021
1022 return (rend > beg && rbeg < end);
1023 }
1024
1025
1026
1027
1028 /* @funcstatic bamChunkCompare ************************************************
1029 **
1030 ** Compares the virtual file offset of the start of two chunks
1031 ** for binning indices
1032 **
1033 ** @param [r] a [const void*] first chunk
1034 ** @param [r] b [const void*] second chunk
1035 ** @return [ajint] 1 if the first virtual offset is smaller, 0 otherwise
1036 **
1037 ** @release 6.5.0
1038 ** @@
1039 ******************************************************************************/
1040
bamChunkCompare(const void * a,const void * b)1041 static ajint bamChunkCompare(const void* a, const void* b)
1042 {
1043 return (((pair64_t const *)a)->u > ((pair64_t const *)b)->u);
1044 }
1045
1046
1047
1048
1049 /* @funcstatic bamIterQuery ***************************************************
1050 **
1051 ** query a region on a reference sequence
1052 **
1053 ** @param [r] idx [const AjPBamIndex] BAM index
1054 ** @param [r] tid [int] reference sequence id
1055 ** @param [r] beg [int] 1-based start of target region, inclusive
1056 ** @param [r] end [int] 1-based end of target region, inclusive
1057 ** @return [bam_iter_t] iteration object
1058 **
1059 ** @release 6.5.0
1060 ** @@
1061 ******************************************************************************/
1062
bamIterQuery(const AjPBamIndex idx,int tid,int beg,int end)1063 static bam_iter_t bamIterQuery(const AjPBamIndex idx, int tid,
1064 int beg, int end)
1065 {
1066 pair64_t *off=NULL;
1067 AjPTable bindex = NULL;
1068 AjPSeqBam b = NULL;
1069 ajushort *bins=NULL;
1070 const bam_binlist_t *p = NULL;
1071 ajulong min_off;
1072 bam_iter_t iter = 0;
1073 ajint i=0;
1074 ajint n_bins=0;
1075 ajint n_off=0;
1076 ajint key;
1077 ajint l;
1078 ajint n;
1079 ajuint j;
1080
1081 if (beg < 0)
1082 beg = 0;
1083
1084 if (end < beg)
1085 return 0;
1086
1087 /* initialize iter */
1088 iter = calloc(1, sizeof(struct __bam_iter_t));
1089 iter->tid = tid;
1090 iter->beg = beg;
1091 iter->end = end;
1092 iter->i = -1;
1093
1094 bins = (ajushort*)calloc(BAM_MAX_BIN, 2);
1095 n_bins = bamReg2bins(beg, end, bins);
1096
1097 bindex = idx->bindex[tid];
1098
1099 if (idx->index2[tid].n > 0)
1100 {
1101 if (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)
1102 min_off = idx->index2[tid].offset[idx->index2[tid].n-1];
1103 else
1104 min_off = idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
1105
1106 if (min_off == 0)
1107 { /* improvement for mapindex files built by tabix prior to 0.1.4 */
1108 n = beg>>BAM_LIDX_SHIFT;
1109
1110 if (n > idx->index2[tid].n)
1111 n = idx->index2[tid].n;
1112
1113 for (i = n - 1; i >= 0; --i)
1114 if (idx->index2[tid].offset[i] != 0)
1115 break;
1116
1117 if (i >= 0)
1118 min_off = idx->index2[tid].offset[i];
1119 }
1120 }
1121 else
1122 min_off = 0; /* tabix 0.1.2 may produce such mapindex files */
1123
1124
1125 /* iterate over the list of bins that may overlap with region */
1126 for (i = n_off = 0; i < n_bins; ++i)
1127 {
1128 key = bins[i];
1129 p = ajTableFetchV(bindex, &key);
1130
1131 if(p)
1132 {
1133 n_off += p->n;
1134 ajDebug("bin:%u # chunks:%d\n", key, p->n);
1135 }
1136
1137 }
1138
1139 ajDebug("n_off:%d\n", n_off);
1140
1141 if (n_off == 0)
1142 {
1143 free(bins);
1144 return iter;
1145 }
1146
1147 off = (pair64_t*)calloc(n_off, 16);
1148
1149 for (i = n_off = 0; i < n_bins; ++i)
1150 {
1151 key = bins[i];
1152 p = ajTableFetchV(bindex, &key);
1153
1154 if(p)
1155 {
1156 for (j = 0; j < p->n; ++j)
1157 {
1158 if (p->list[j].v > min_off)
1159 off[n_off++] = p->list[j];
1160
1161 ajDebug("p->list[%d].v:%u min_off:%d\n",
1162 j, p->list[j].v, min_off);
1163 }
1164 }
1165 }
1166
1167 free(bins);
1168
1169 if (n_off == 0)
1170 {
1171 free(off);
1172 return iter;
1173 }
1174
1175 AJNEW0(b);
1176
1177 qsort(off, n_off, sizeof(pair64_t), bamChunkCompare);
1178
1179 /* resolve completely contained adjacent blocks */
1180 for (i = 1, l = 0; i < n_off; ++i)
1181 if (off[l].v < off[i].v)
1182 off[++l] = off[i];
1183 n_off = l + 1;
1184
1185 /* resolve overlaps between adjacent blocks;
1186 * this may happen due to the merge in indexing
1187 */
1188
1189 for (i = 1; i < n_off; ++i)
1190 if (off[i-1].v >= off[i].u)
1191 off[i-1].v = off[i].u;
1192
1193 { /* merge adjacent blocks */
1194 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
1195 for (i = 1, l = 0; i < n_off; ++i)
1196 {
1197 #ifdef BAM_TRUE_OFFSET
1198 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u)
1199 off[l].v = off[i].v;
1200 #else
1201 if (off[l].v>>16 == off[i].u>>16)
1202 off[l].v = off[i].v;
1203 #endif
1204 else off[++l] = off[i];
1205 }
1206 n_off = l + 1;
1207 #endif
1208 }
1209
1210 ajSeqBamDel(&b);
1211
1212 iter->n_off = n_off; iter->off = off;
1213
1214 return iter;
1215 }
1216
1217
1218
1219
1220 /* @funcstatic bamIterDel *****************************************************
1221 **
1222 ** Deletes a BAM iteration object
1223 **
1224 ** @param [d] iterp [bam_iter_t*] pointer to iteration object to be deleted
1225 ** @return [void]
1226 **
1227 ** @release 6.5.0
1228 ** @@
1229 ******************************************************************************/
1230
bamIterDel(bam_iter_t * iterp)1231 static void bamIterDel(bam_iter_t* iterp)
1232 {
1233 bam_iter_t iter;
1234
1235 iter = *iterp;
1236
1237 if (iter)
1238 {
1239 free(iter->off);
1240 free(iter);
1241 }
1242
1243 *iterp = NULL;
1244 }
1245
1246
1247
1248
1249 /* @funcstatic bamIterRead ****************************************************
1250 **
1251 ** ajBamFetch helper function retrieves data into alignment record b
1252 **
1253 ** @param [u] fp [AjPSeqBamBgzf] BAM file
1254 ** @param [u] iter [bam_iter_t] BAM iteration object
1255 ** @param [u] b [AjPSeqBam] BAM record
1256 ** @return [int] positive integer (1 or more) if a record successfully read
1257 **
1258 ** @release 6.5.0
1259 ** @@
1260 ******************************************************************************/
1261
bamIterRead(AjPSeqBamBgzf fp,bam_iter_t iter,AjPSeqBam b)1262 static int bamIterRead(AjPSeqBamBgzf fp, bam_iter_t iter, AjPSeqBam b)
1263 {
1264 int ret;
1265
1266 if (iter && iter->finished)
1267 return -1;
1268
1269 if (iter == 0 || iter->from_first)
1270 {
1271 ret = ajSeqBamRead(fp, b);
1272
1273 if (ret < 0 && iter)
1274 iter->finished = 1;
1275
1276 return ret;
1277 }
1278
1279 if (iter->off == 0)
1280 return -1;
1281
1282 for (;;)
1283 {
1284 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v)
1285 { /* then jump to the next chunk */
1286 if (iter->i == iter->n_off - 1)
1287 {
1288 ret = -1;
1289 break; /* no more chunks */
1290 }
1291
1292 if (iter->i >= 0)
1293 assert(iter->curr_off==iter->off[iter->i].v); /*otherwise bug*/
1294
1295 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u)
1296 { /* not adjacent chunks; then seek */
1297 ajSeqBamBgzfSeek(fp, iter->off[iter->i+1].u, SEEK_SET);
1298 iter->curr_off = bam_tell(fp);
1299 }
1300 ++iter->i;
1301 }
1302
1303 if ((ret = ajSeqBamRead(fp, b)) >= 0)
1304 {
1305 iter->curr_off = bam_tell(fp);
1306
1307 if (b->core.tid != iter->tid || b->core.pos >= iter->end)
1308 { /* no need to proceed */
1309
1310 /* determine whether end of region or error */
1311 ret = ajSeqBamValidate(NULL, b)? -1 : -5;
1312 break;
1313 }
1314 else if (bamIsOverlap(iter->beg, iter->end, b))
1315 return ret;
1316 }
1317 else
1318 break; /* end of file or error */
1319 }
1320
1321 iter->finished = 1;
1322
1323 return ret;
1324 }
1325
1326
1327
1328
1329 /* @func ajBamFetch ***********************************************************
1330 **
1331 ** Retrieve the alignments that are overlapped with the specified region.
1332 **
1333 ** @param [u] fp [AjPSeqBamBgzf] BAM file handler
1334 ** @param [r] idx [const AjPBamIndex] pointer to the alignment index
1335 ** @param [r] tid [int] chromosome ID as is defined in the header
1336 ** @param [r] beg [int] start coordinate, 0-based
1337 ** @param [r] end [int] end coordinate, 0-based
1338 ** @param [u] data [void*] user provided data
1339 ** (will be transferred to func)
1340 ** @param [f] func [bam_fetch_f] user defined function
1341 **
1342 ** @return [int] Comparison value. 0 if equal, -1 if first is lower,
1343 ** +1 if first is higher.
1344 **
1345 ** @release 6.5.0
1346 ** @@
1347 ******************************************************************************/
1348
ajBamFetch(AjPSeqBamBgzf fp,const AjPBamIndex idx,int tid,int beg,int end,void * data,bam_fetch_f func)1349 int ajBamFetch(AjPSeqBamBgzf fp, const AjPBamIndex idx, int tid,
1350 int beg, int end, void *data, bam_fetch_f func)
1351 {
1352 int ret;
1353 bam_iter_t iter = NULL;
1354 AjPSeqBam b;
1355
1356 AJNEW0(b);
1357 iter = bamIterQuery(idx, tid, beg, end);
1358
1359 while ((ret = bamIterRead(fp, iter, b)) >= 0)
1360 func(b, data);
1361
1362 bamIterDel(&iter);
1363 ajSeqBamDel(&b);
1364
1365 return (ret == -1)? 0 : ret;
1366 }
1367