1 #include <ctype.h>
2 #include <assert.h>
3 #include "bam.h"
4 #include "khash.h"
5 #include "ksort.h"
6 #include "bam_endian.h"
7 #ifdef _USE_KNETFILE
8 #include "knetfile.h"
9 #endif
10
11 /*!
12 @header
13
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
efficient for alignments spanning long distances, while the auxiliary
linear index helps to reduce unnecessary seek calls, especially for
short alignments.
20
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in a R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
29
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
find the alignments overlapping a region [rbeg,rend), we need to
calculate the list of bins that may overlap the region and test
the alignments in those bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
38
39 */
40
// Minimum gap (in true file offsets) below which adjacent chunks are merged.
#define BAM_MIN_CHUNK_GAP 32768
// 1<<14 is the size of minimum bin.
#define BAM_LIDX_SHIFT 14

// One more than the largest bin number (=(8^6-1)/7+1); the extra slot
// (bin 37450) is used below as a pseudo-bin carrying per-target metadata.
#define BAM_MAX_BIN 37450

// A [u,v) pair of (virtual) file offsets delimiting one chunk of records.
typedef struct {
	uint64_t u, v;
} pair64_t;

// Order chunks by start offset; instantiates ks_introsort(off, ...).
#define pair64_lt(a,b) ((a).u < (b).u)
KSORT_INIT(off, pair64_t, pair64_lt)
53
// Growable list of chunks belonging to a single bin.
typedef struct {
	uint32_t m, n; // m: allocated capacity; n: chunks in use
	pair64_t *list;
} bam_binlist_t;

// Linear index: for each 16kbp window of a target, the file offset of the
// first alignment overlapping that window (0 = not yet filled).
typedef struct {
	int32_t n, m; // n: valid entries; m: allocated capacity
	uint64_t *offset;
} bam_lidx_t;

KHASH_MAP_INIT_INT(i, bam_binlist_t)

// Complete BAM index: one bin->chunk-list hash plus one linear index per target.
struct __bam_index_t {
	int32_t n;          // number of reference sequences (targets)
	uint64_t n_no_coor; // unmapped reads without coordinate
	khash_t(i) **index; // binning index, one hash per target
	bam_lidx_t *index2; // linear index, one per target
};
72
73 // requirement: len <= LEN_MASK
insert_offset(khash_t (i)* h,int bin,uint64_t beg,uint64_t end)74 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
75 {
76 khint_t k;
77 bam_binlist_t *l;
78 int ret;
79 k = kh_put(i, h, bin, &ret);
80 l = &kh_value(h, k);
81 if (ret) { // not present
82 l->m = 1; l->n = 0;
83 l->list = (pair64_t*)calloc(l->m, 16);
84 }
85 if (l->n == l->m) {
86 l->m <<= 1;
87 l->list = (pair64_t*)realloc(l->list, l->m * 16);
88 }
89 l->list[l->n].u = beg; l->list[l->n++].v = end;
90 }
91
// Record `offset` (the file offset at which alignment `b` starts) in the
// linear index for every 16kbp window the alignment overlaps, growing the
// offset array as needed.  Only the first offset seen per window is kept.
static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
{
	int i, beg, end;
	beg = b->core.pos >> BAM_LIDX_SHIFT;
	end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
	if (index2->m < end + 1) {
		int old_m = index2->m;
		index2->m = end + 1;
		kroundup32(index2->m); // round capacity up to the next power of two
		index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
		memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); // zero the new tail
	}
	if (beg == end) { // common case: alignment falls inside one window
		if (index2->offset[beg] == 0) index2->offset[beg] = offset;
	} else {
		for (i = beg; i <= end; ++i)
			if (index2->offset[i] == 0) index2->offset[i] = offset;
	}
	// NOTE(review): n is set to end+1 unconditionally; this assumes input is
	// coordinate-sorted so n only ever grows -- confirm at the call site.
	index2->n = end + 1;
}
112
// Merge adjacent chunks within each bin: when offsets are true file offsets,
// chunks closer than BAM_MIN_CHUNK_GAP are coalesced; with 16-bit virtual
// offsets, chunks whose boundary falls inside the same compressed block are
// coalesced.  The meta pseudo-bin (BAM_MAX_BIN) is skipped because its
// pairs hold counters, not offsets.  Compiled out for other offset models.
static void merge_chunks(bam_index_t *idx)
{
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
	khash_t(i) *index;
	int i, l, m;
	khint_t k;
	for (i = 0; i < idx->n; ++i) {
		index = idx->index[i];
		for (k = kh_begin(index); k != kh_end(index); ++k) {
			bam_binlist_t *p;
			if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
			p = &kh_value(index, k);
			m = 0; // index of the last merged chunk kept
			for (l = 1; l < p->n; ++l) {
#ifdef BAM_TRUE_OFFSET
				if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
#else
				// top 48 bits of a virtual offset are the compressed block start
				if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
#endif
				else p->list[++m] = p->list[l];
			} // ~for(l)
			p->n = m + 1;
		} // ~for(k)
	} // ~for(i)
#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
}
139
// Forward-fill empty slots in every linear index: a 16kbp window with no
// recorded offset inherits the offset of the window before it, so lookups
// never land on a zero hole after slot 0.
static void fill_missing(bam_index_t *idx)
{
	int tid;
	for (tid = 0; tid < idx->n; ++tid) {
		bam_lidx_t *lin = idx->index2 + tid;
		int w = 1;
		while (w < lin->n) {
			if (lin->offset[w] == 0)
				lin->offset[w] = lin->offset[w - 1];
			++w;
		}
	}
}
150
// Scan a coordinate-sorted BAM stream and build the full index: the binning
// index (plus a per-target meta pseudo-bin holding [off_beg,off_end) and
// (n_mapped, n_unmapped)) and the linear index.  Returns NULL if the input
// is detected to be unsorted or the underlying file offsets go backwards.
// NOTE(review): on those NULL returns `idx`, `b` and their contents leak;
// callers abort the whole operation, but a cleanup path would be safer -- TODO.
bam_index_t *bam_index_core(bamFile fp)
{
	bam1_t *b;
	bam_header_t *h;
	int i, ret;
	bam_index_t *idx;
	uint32_t last_bin, save_bin;
	int32_t last_coor, last_tid, save_tid;
	bam1_core_t *c;
	uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;

	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
	b = (bam1_t*)calloc(1, sizeof(bam1_t));
	h = bam_header_read(fp);
	c = &b->core;

	// one hash (binning index) and one linear index per target
	idx->n = h->n_targets;
	bam_header_destroy(h);
	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
	for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));

	// 0xffffffffu sentinels mark "no record seen yet"
	save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
	save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
	n_mapped = n_unmapped = n_no_coor = off_end = 0;
	off_beg = off_end = bam_tell(fp);
	while ((ret = bam_read1(fp, b)) >= 0) {
		if (c->tid < 0) ++n_no_coor;
		if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes
			last_tid = c->tid;
			last_bin = 0xffffffffu;
		} else if ((uint32_t)last_tid > (uint32_t)c->tid) { // target ids must be non-decreasing
			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
					bam1_qname(b), last_tid+1, c->tid+1);
			return NULL;
		} else if ((int32_t)c->tid >= 0 && last_coor > c->pos) { // positions must be non-decreasing within a target
			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
					bam1_qname(b), last_coor, c->pos, c->tid+1);
			return NULL;
		}
		// mapped, placed reads feed the linear index
		if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
		if (c->bin != last_bin) { // then possibly write the binning index
			if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
				insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
			if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
				off_end = last_off;
				insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
				insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
				n_mapped = n_unmapped = 0;
				off_beg = off_end;
			}
			save_off = last_off;
			save_bin = last_bin = c->bin;
			save_tid = c->tid;
			if (save_tid < 0) break; // reached the unplaced-read tail
		}
		if (bam_tell(fp) <= last_off) { // file offset must advance per record
			fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
					(unsigned long long)bam_tell(fp), (unsigned long long)last_off);
			return NULL;
		}
		if (c->flag & BAM_FUNMAP) ++n_unmapped;
		else ++n_mapped;
		last_off = bam_tell(fp);
		last_coor = b->core.pos;
	}
	if (save_tid >= 0) { // flush the final bin and the last target's meta element
		insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
		insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
		insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
	}
	merge_chunks(idx);
	fill_missing(idx);
	if (ret >= 0) { // count the trailing coordinate-less reads
		while ((ret = bam_read1(fp, b)) >= 0) {
			++n_no_coor;
			if (c->tid >= 0 && n_no_coor) {
				fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
				return NULL;
			}
		}
	}
	if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
	free(b->data); free(b);
	idx->n_no_coor = n_no_coor;
	return idx;
}
238
// Release every chunk list, bin hash and linear-index array owned by `idx`,
// then the index container itself.  A NULL argument is a no-op.
void bam_index_destroy(bam_index_t *idx)
{
	int tid;
	khint_t it;
	if (idx == 0) return;
	for (tid = 0; tid < idx->n; ++tid) {
		khash_t(i) *h = idx->index[tid];
		for (it = kh_begin(h); it != kh_end(h); ++it)
			if (kh_exist(h, it))
				free(kh_value(h, it).list);
		kh_destroy(i, h);
		free(idx->index2[tid].offset);
	}
	free(idx->index);
	free(idx->index2);
	free(idx);
}
257
// Serialize `idx` to `fp` in BAI format: magic, n_ref, then per target the
// binning index (bin id, chunk count, chunk pairs) followed by the linear
// index, and finally the count of coordinate-less reads.  Integers are
// written little-endian; on big-endian hosts arrays are byte-swapped in
// place before writing and swapped back afterwards.
// NOTE(review): fwrite() return values are unchecked, so a write failure
// (e.g. full disk) is silent -- consider checking ferror(fp) after fflush.
void bam_index_save(const bam_index_t *idx, FILE *fp)
{
	int32_t i, size;
	khint_t k;
	fwrite("BAI\1", 1, 4, fp); // magic
	if (bam_is_be) {
		uint32_t x = idx->n;
		fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
	} else fwrite(&idx->n, 4, 1, fp);
	for (i = 0; i < idx->n; ++i) {
		khash_t(i) *index = idx->index[i];
		bam_lidx_t *index2 = idx->index2 + i;
		// write binning index
		size = kh_size(index);
		if (bam_is_be) { // big endian
			uint32_t x = size;
			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
		} else fwrite(&size, 4, 1, fp);
		for (k = kh_begin(index); k != kh_end(index); ++k) {
			if (kh_exist(index, k)) {
				bam_binlist_t *p = &kh_value(index, k);
				if (bam_is_be) { // big endian
					uint32_t x;
					x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
					x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
					// swap the chunk offsets in place, write, then swap back
					for (x = 0; (int)x < p->n; ++x) {
						bam_swap_endian_8p(&p->list[x].u);
						bam_swap_endian_8p(&p->list[x].v);
					}
					fwrite(p->list, 16, p->n, fp);
					for (x = 0; (int)x < p->n; ++x) {
						bam_swap_endian_8p(&p->list[x].u);
						bam_swap_endian_8p(&p->list[x].v);
					}
				} else {
					fwrite(&kh_key(index, k), 4, 1, fp);
					fwrite(&p->n, 4, 1, fp);
					fwrite(p->list, 16, p->n, fp);
				}
			}
		}
		// write linear index (index2)
		if (bam_is_be) {
			int x = index2->n;
			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
		} else fwrite(&index2->n, 4, 1, fp);
		if (bam_is_be) { // big endian: swap in place, write, swap back
			int x;
			for (x = 0; (int)x < index2->n; ++x)
				bam_swap_endian_8p(&index2->offset[x]);
			fwrite(index2->offset, 8, index2->n, fp);
			for (x = 0; (int)x < index2->n; ++x)
				bam_swap_endian_8p(&index2->offset[x]);
		} else fwrite(index2->offset, 8, index2->n, fp);
	}
	{ // write the number of reads coor-less records.
		uint64_t x = idx->n_no_coor;
		if (bam_is_be) bam_swap_endian_8p(&x);
		fwrite(&x, 8, 1, fp);
	}
	fflush(fp);
}
320
bam_index_load_core(FILE * fp)321 static bam_index_t *bam_index_load_core(FILE *fp)
322 {
323 int i;
324 char magic[4];
325 bam_index_t *idx;
326 if (fp == 0) {
327 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
328 return 0;
329 }
330 fread(magic, 1, 4, fp);
331 if (strncmp(magic, "BAI\1", 4)) {
332 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
333 fclose(fp);
334 return 0;
335 }
336 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
337 fread(&idx->n, 4, 1, fp);
338 if (bam_is_be) bam_swap_endian_4p(&idx->n);
339 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
340 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
341 for (i = 0; i < idx->n; ++i) {
342 khash_t(i) *index;
343 bam_lidx_t *index2 = idx->index2 + i;
344 uint32_t key, size;
345 khint_t k;
346 int j, ret;
347 bam_binlist_t *p;
348 index = idx->index[i] = kh_init(i);
349 // load binning index
350 fread(&size, 4, 1, fp);
351 if (bam_is_be) bam_swap_endian_4p(&size);
352 for (j = 0; j < (int)size; ++j) {
353 fread(&key, 4, 1, fp);
354 if (bam_is_be) bam_swap_endian_4p(&key);
355 k = kh_put(i, index, key, &ret);
356 p = &kh_value(index, k);
357 fread(&p->n, 4, 1, fp);
358 if (bam_is_be) bam_swap_endian_4p(&p->n);
359 p->m = p->n;
360 p->list = (pair64_t*)malloc(p->m * 16);
361 fread(p->list, 16, p->n, fp);
362 if (bam_is_be) {
363 int x;
364 for (x = 0; x < p->n; ++x) {
365 bam_swap_endian_8p(&p->list[x].u);
366 bam_swap_endian_8p(&p->list[x].v);
367 }
368 }
369 }
370 // load linear index
371 fread(&index2->n, 4, 1, fp);
372 if (bam_is_be) bam_swap_endian_4p(&index2->n);
373 index2->m = index2->n;
374 index2->offset = (uint64_t*)calloc(index2->m, 8);
375 fread(index2->offset, index2->n, 8, fp);
376 if (bam_is_be)
377 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
378 }
379 if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
380 if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
381 return idx;
382 }
383
bam_index_load_local(const char * _fn)384 bam_index_t *bam_index_load_local(const char *_fn)
385 {
386 FILE *fp;
387 char *fnidx, *fn;
388
389 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
390 const char *p;
391 int l = strlen(_fn);
392 for (p = _fn + l - 1; p >= _fn; --p)
393 if (*p == '/') break;
394 fn = strdup(p + 1);
395 } else fn = strdup(_fn);
396 fnidx = (char*)calloc(strlen(fn) + 5, 1);
397 strcpy(fnidx, fn); strcat(fnidx, ".bai");
398 fp = fopen(fnidx, "rb");
399 if (fp == 0) { // try "{base}.bai"
400 char *s = strstr(fn, "bam");
401 if (s == fn + strlen(fn) - 3) {
402 strcpy(fnidx, fn);
403 fnidx[strlen(fn)-1] = 'i';
404 fp = fopen(fnidx, "rb");
405 }
406 }
407 free(fnidx); free(fn);
408 if (fp) {
409 bam_index_t *idx = bam_index_load_core(fp);
410 fclose(fp);
411 return idx;
412 } else return 0;
413 }
414
#ifdef _USE_KNETFILE
// Download `url` (ftp:// or http:// only; anything else is ignored) into a
// file of the same basename in the current working directory.  Failures are
// reported to stderr; no partial-file cleanup is attempted.
static void download_from_remote(const char *url)
{
	const int buf_size = 1 * 1024 * 1024;
	char *fn;
	FILE *fp;
	uint8_t *buf;
	knetFile *fp_remote;
	int l;
	if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
	l = strlen(url);
	for (fn = (char*)url + l - 1; fn >= url; --fn)
		if (*fn == '/') break;
	++fn; // fn now points to the file name
	fp_remote = knet_open(url, "r");
	if (fp_remote == 0) {
		fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
		return;
	}
	if ((fp = fopen(fn, "wb")) == 0) {
		fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
		knet_close(fp_remote);
		return;
	}
	buf = (uint8_t*)calloc(buf_size, 1);
	// stop on EOF (0) and also on read errors (negative); the old `!= 0`
	// test would pass a negative length to fwrite() as a huge size_t
	while ((l = knet_read(fp_remote, buf, buf_size)) > 0)
		fwrite(buf, 1, l, fp);
	free(buf);
	fclose(fp);
	knet_close(fp_remote);
}
#else
// knetfile support compiled out: remote download is a no-op.
static void download_from_remote(const char *url)
{
	return;
}
#endif
452
bam_index_load(const char * fn)453 bam_index_t *bam_index_load(const char *fn)
454 {
455 bam_index_t *idx;
456 idx = bam_index_load_local(fn);
457 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
458 char *fnidx = calloc(strlen(fn) + 5, 1);
459 strcat(strcpy(fnidx, fn), ".bai");
460 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
461 download_from_remote(fnidx);
462 idx = bam_index_load_local(fn);
463 }
464 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
465 return idx;
466 }
467
bam_index_build2(const char * fn,const char * _fnidx)468 int bam_index_build2(const char *fn, const char *_fnidx)
469 {
470 char *fnidx;
471 FILE *fpidx;
472 bamFile fp;
473 bam_index_t *idx;
474 if ((fp = bam_open(fn, "r")) == 0) {
475 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
476 return -1;
477 }
478 idx = bam_index_core(fp);
479 bam_close(fp);
480 if(idx == 0) {
481 fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n");
482 return -1;
483 }
484 if (_fnidx == 0) {
485 fnidx = (char*)calloc(strlen(fn) + 5, 1);
486 strcpy(fnidx, fn); strcat(fnidx, ".bai");
487 } else fnidx = strdup(_fnidx);
488 fpidx = fopen(fnidx, "wb");
489 if (fpidx == 0) {
490 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
491 free(fnidx);
492 return -1;
493 }
494 bam_index_save(idx, fpidx);
495 bam_index_destroy(idx);
496 fclose(fpidx);
497 free(fnidx);
498 return 0;
499 }
500
// Build "{fn}.bai" next to the BAM file; see bam_index_build2() for the
// variant taking an explicit index filename.  Returns 0 on success.
int bam_index_build(const char *fn)
{
	return bam_index_build2(fn, NULL);
}
505
// CLI entry point for "samtools index".  Returns 0 on success, non-zero on
// a usage error or indexing failure (previously build failures were
// discarded and 0 was returned regardless).
int bam_index(int argc, char *argv[])
{
	int ret;
	if (argc < 2) {
		fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
		return 1;
	}
	if (argc >= 3) ret = bam_index_build2(argv[1], argv[2]);
	else ret = bam_index_build(argv[1]);
	return ret == 0? 0 : 1;
}
516
bam_idxstats(int argc,char * argv[])517 int bam_idxstats(int argc, char *argv[])
518 {
519 bam_index_t *idx;
520 bam_header_t *header;
521 bamFile fp;
522 int i;
523 if (argc < 2) {
524 fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
525 return 1;
526 }
527 fp = bam_open(argv[1], "r");
528 if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
529 header = bam_header_read(fp);
530 bam_close(fp);
531 idx = bam_index_load(argv[1]);
532 if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
533 for (i = 0; i < idx->n; ++i) {
534 khint_t k;
535 khash_t(i) *h = idx->index[i];
536 printf("%s\t%d", header->target_name[i], header->target_len[i]);
537 k = kh_get(i, h, BAM_MAX_BIN);
538 if (k != kh_end(h))
539 printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
540 else printf("\t0\t0");
541 putchar('\n');
542 }
543 printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
544 bam_header_destroy(header);
545 bam_index_destroy(idx);
546 return 0;
547 }
548
// Fill `list` with every bin number that may contain alignments overlapping
// [beg,end) and return how many bins were written.  Bin layout: 0 (512Mbp),
// 1-8 (64Mbp), 9-72 (8Mbp), 73-584 (1Mbp), 585-4680 (128Kbp),
// 4681-37449 (16Kbp).
static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
{
	// one (first bin id, shift) entry per level below the root
	static const struct { int first, shift; } level[] = {
		{1, 26}, {9, 23}, {73, 20}, {585, 17}, {4681, 14}
	};
	int n = 0, li, b;
	if (beg >= end) return 0;
	if (end >= 1u<<29) end = 1u<<29;
	--end; // make the range inclusive
	list[n++] = 0; // the root bin always overlaps
	for (li = 0; li < 5; ++li)
		for (b = level[li].first + (beg >> level[li].shift);
		     b <= level[li].first + (end >> level[li].shift); ++b)
			list[n++] = b;
	return n;
}
563
is_overlap(uint32_t beg,uint32_t end,const bam1_t * b)564 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
565 {
566 uint32_t rbeg = b->core.pos;
567 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
568 return (rend > beg && rbeg < end);
569 }
570
// State of a region iterator created by bam_iter_query() and advanced by
// bam_iter_read().
struct __bam_iter_t {
	int from_first; // read from the first record; no random access
	int tid, beg, end, n_off, i, finished; // region; chunk count; chunk cursor; exhausted flag
	uint64_t curr_off; // current file offset after the last read
	pair64_t *off;     // sorted, merged chunk list to visit (owned)
};
577
// bam_fetch helper function retrieves
// Create an iterator over alignments overlapping [beg,end) on target `tid`:
// collect chunks from all candidate bins, discard chunks ending before the
// linear-index lower bound, then sort, deduplicate and merge them.
// Returns NULL when end < beg.  NOTE(review): `tid` is not range-checked
// against idx->n here -- confirm callers validate it.
bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
{
	uint16_t *bins;
	int i, n_bins, n_off;
	pair64_t *off;
	khint_t k;
	khash_t(i) *index;
	uint64_t min_off;
	bam_iter_t iter = 0;

	if (beg < 0) beg = 0;
	if (end < beg) return 0;
	// initialize iter
	iter = calloc(1, sizeof(struct __bam_iter_t));
	iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
	//
	bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
	n_bins = reg2bins(beg, end, bins);
	index = idx->index[tid];
	if (idx->index2[tid].n > 0) {
		// lower bound on the file offset, from the linear index
		min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
			: idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
		if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
			int n = beg>>BAM_LIDX_SHIFT;
			if (n > idx->index2[tid].n) n = idx->index2[tid].n;
			for (i = n - 1; i >= 0; --i)
				if (idx->index2[tid].offset[i] != 0) break;
			if (i >= 0) min_off = idx->index2[tid].offset[i];
		}
	} else min_off = 0; // tabix 0.1.2 may produce such index files
	// first pass: count candidate chunks across all overlapping bins
	for (i = n_off = 0; i < n_bins; ++i) {
		if ((k = kh_get(i, index, bins[i])) != kh_end(index))
			n_off += kh_value(index, k).n;
	}
	if (n_off == 0) {
		free(bins); return iter;
	}
	off = (pair64_t*)calloc(n_off, 16);
	// second pass: keep chunks that end after the lower bound
	for (i = n_off = 0; i < n_bins; ++i) {
		if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
			int j;
			bam_binlist_t *p = &kh_value(index, k);
			for (j = 0; j < p->n; ++j)
				if (p->list[j].v > min_off) off[n_off++] = p->list[j];
		}
	}
	free(bins);
	if (n_off == 0) {
		free(off); return iter;
	}
	{
		// NOTE(review): `b` is allocated and destroyed but never used -- dead code
		bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
		int l;
		ks_introsort(off, n_off, off);
		// resolve completely contained adjacent blocks
		for (i = 1, l = 0; i < n_off; ++i)
			if (off[l].v < off[i].v)
				off[++l] = off[i];
		n_off = l + 1;
		// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
		for (i = 1; i < n_off; ++i)
			if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
		{ // merge adjacent blocks
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
			for (i = 1, l = 0; i < n_off; ++i) {
#ifdef BAM_TRUE_OFFSET
				if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
#else
				if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
#endif
				else off[++l] = off[i];
			}
			n_off = l + 1;
#endif
		}
		bam_destroy1(b);
	}
	iter->n_off = n_off; iter->off = off;
	return iter;
}
659
get_chunk_coordinates(const bam_index_t * idx,int tid,int beg,int end,int * cnt_off)660 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
661 { // for pysam compatibility
662 bam_iter_t iter;
663 pair64_t *off;
664 iter = bam_iter_query(idx, tid, beg, end);
665 off = iter->off; *cnt_off = iter->n_off;
666 free(iter);
667 return off;
668 }
669
// Free an iterator and its chunk list; a NULL iterator is a no-op.
void bam_iter_destroy(bam_iter_t iter)
{
	if (iter == 0) return;
	free(iter->off);
	free(iter);
}
674
// Read the next alignment overlapping the iterator's region into `b`,
// seeking between chunks as needed.  Returns the bam_read1() length on
// success; -1 at end of region/file; -5 when the record terminating the
// scan fails bam_validate1().  iter->finished is set once exhausted.
// A NULL iterator or one with from_first set degrades to plain sequential
// reading from the current file position.
int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
{
	int ret;
	if (iter && iter->finished) return -1;
	if (iter == 0 || iter->from_first) { // sequential mode: no random access
		ret = bam_read1(fp, b);
		if (ret < 0 && iter) iter->finished = 1;
		return ret;
	}
	if (iter->off == 0) return -1; // no chunks were collected for the region
	for (;;) {
		if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
			if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
			if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
			if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
				bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
				iter->curr_off = bam_tell(fp);
			}
			++iter->i;
		}
		if ((ret = bam_read1(fp, b)) >= 0) {
			iter->curr_off = bam_tell(fp);
			if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
				ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
				break;
			}
			else if (is_overlap(iter->beg, iter->end, b)) return ret;
			// records before/after the window inside a chunk are skipped silently
		} else break; // end of file or error
	}
	iter->finished = 1;
	return ret;
}
707
// Apply `func` to every alignment overlapping [beg,end) on target `tid`.
// Returns 0 when the region is exhausted normally, otherwise the negative
// status from bam_iter_read().
int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
{
	int status;
	bam1_t *rec = bam_init1();
	bam_iter_t it = bam_iter_query(idx, tid, beg, end);
	for (;;) {
		status = bam_iter_read(fp, it, rec);
		if (status < 0) break;
		func(rec, data);
	}
	bam_iter_destroy(it);
	bam_destroy1(rec);
	return status == -1? 0 : status;
}
720