#include <ctype.h>
#include <assert.h>
#include "bam.h"
#include "khash.h"
#include "ksort.h"
#include "bam_endian.h"
#ifdef _USE_KNETFILE
#include "knetfile.h"
#endif

/*!
  @header

  Alignment indexing. Before indexing, the BAM must be sorted by the
  leftmost coordinate of the alignments. The index consists of two parts:
  a UCSC binning index and a simple linear index. The binning index is
  efficient for alignments spanning long distances, while the auxiliary
  linear index helps to reduce unnecessary seek calls, especially for
  short alignments.

  The UCSC binning scheme was suggested by Richard Durbin and Lincoln
  Stein and is explained by Kent et al. (2002). In this scheme, each bin
  represents a contiguous genomic region which can be fully contained in
  another bin; each alignment is associated with the bin that represents
  the smallest region containing the entire alignment. The binning
  scheme is essentially another representation of an R-tree: a distinct
  bin uniquely corresponds to a distinct internal node of an R-tree, and
  bin A is a child of bin B if region A is contained in B.

  In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
  0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
  585-4680 128Kbp and bins 4681-37448 span 16Kbp regions. To find the
  alignments overlapping a region [rbeg,rend), we calculate the list of
  bins that may overlap the region and then test the alignments in those
  bins to confirm the overlaps. If the specified region is short,
  typically only a few alignments in six bins need to be retrieved, so
  the overlapping alignments can be fetched quickly.

 */
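
/*
  Illustrative sketch (not used elsewhere in this file): compute the smallest
  bin that fully contains the zero-based, half-open interval [beg,end),
  following the scheme described above. The function name is ours; an
  equivalent helper (bam_reg2bin) is declared in bam.h.
*/
static inline int example_reg2bin(uint32_t beg, uint32_t end)
{
	--end; // last base covered by the interval
	if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); // 16Kbp bins: 4681..37448
	if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); // 128Kbp bins: 585..4680
	if (beg>>20 == end>>20) return ((1<<9)-1)/7  + (beg>>20); // 1Mbp bins:   73..584
	if (beg>>23 == end>>23) return ((1<<6)-1)/7  + (beg>>23); // 8Mbp bins:   9..72
	if (beg>>26 == end>>26) return ((1<<3)-1)/7  + (beg>>26); // 64Mbp bins:  1..8
	return 0; // bin 0: the whole 512Mbp range
}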

#define BAM_MIN_CHUNK_GAP 32768
// 1<<14 is the size of the smallest bin.
#define BAM_LIDX_SHIFT    14

#define BAM_MAX_BIN 37450 // =(8^6-1)/7+1

typedef struct {
	uint64_t u, v;
} pair64_t;

#define pair64_lt(a,b) ((a).u < (b).u)
KSORT_INIT(off, pair64_t, pair64_lt)

typedef struct {
	uint32_t m, n;
	pair64_t *list;
} bam_binlist_t;

typedef struct {
	int32_t n, m;
	uint64_t *offset;
} bam_lidx_t;

KHASH_MAP_INIT_INT(i, bam_binlist_t)

struct __bam_index_t {
	int32_t n;
	uint64_t n_no_coor; // unmapped reads without coordinate
	khash_t(i) **index;
	bam_lidx_t *index2;
};

// requirement: len <= LEN_MASK
static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
{
	khint_t k;
	bam_binlist_t *l;
	int ret;
	k = kh_put(i, h, bin, &ret);
	l = &kh_value(h, k);
	if (ret) { // not present
		l->m = 1; l->n = 0;
		l->list = (pair64_t*)calloc(l->m, 16);
	}
	if (l->n == l->m) {
		l->m <<= 1;
		l->list = (pair64_t*)realloc(l->list, l->m * 16);
	}
	l->list[l->n].u = beg; l->list[l->n++].v = end;
}

static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
{
	int i, beg, end;
	beg = b->core.pos >> BAM_LIDX_SHIFT;
	end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
	if (index2->m < end + 1) {
		int old_m = index2->m;
		index2->m = end + 1;
		kroundup32(index2->m);
		index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
		memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
	}
	if (beg == end) {
		if (index2->offset[beg] == 0) index2->offset[beg] = offset;
	} else {
		for (i = beg; i <= end; ++i)
			if (index2->offset[i] == 0) index2->offset[i] = offset;
	}
	index2->n = end + 1;
}
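
/*
  Illustrative sketch (not called anywhere): how the linear index built above is
  consulted. For a query starting at qbeg, offset[qbeg>>BAM_LIDX_SHIFT] is a
  lower bound on the file offset of any alignment overlapping that 16Kbp window,
  so chunks that end before it can be skipped (see bam_iter_query()).
*/
static inline uint64_t example_linear_min_offset(const bam_lidx_t *index2, uint32_t qbeg)
{
	int w = qbeg >> BAM_LIDX_SHIFT; // 16Kbp window containing the query start
	if (index2->n == 0) return 0;   // empty linear index: no lower bound known
	if (w >= index2->n) w = index2->n - 1;
	return index2->offset[w];
}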

static void merge_chunks(bam_index_t *idx)
{
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
	khash_t(i) *index;
	int i, l, m;
	khint_t k;
	for (i = 0; i < idx->n; ++i) {
		index = idx->index[i];
		for (k = kh_begin(index); k != kh_end(index); ++k) {
			bam_binlist_t *p;
			if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
			p = &kh_value(index, k);
			m = 0;
			for (l = 1; l < p->n; ++l) {
#ifdef BAM_TRUE_OFFSET
				if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
#else
				if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
#endif
				else p->list[++m] = p->list[l];
			} // ~for(l)
			p->n = m + 1;
		} // ~for(k)
	} // ~for(i)
#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
}
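
/*
  Note on the `>>16` test above (illustrative, assuming the standard BGZF
  virtual-offset layout): a 64-bit virtual offset packs the file offset of the
  start of a compressed BGZF block in its upper 48 bits and the position within
  the uncompressed block in its lower 16 bits. Two chunk boundaries that agree
  in the upper 48 bits therefore fall in the same compressed block, so reading
  the chunks separately would decompress that block twice; merging them is free.
*/
static inline uint64_t example_make_voffset(uint64_t coffset, int uoffset)
{
	return (coffset << 16) | (uint64_t)(uoffset & 0xffff); // assumed BGZF packing
}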

static void fill_missing(bam_index_t *idx)
{
	int i, j;
	for (i = 0; i < idx->n; ++i) {
		bam_lidx_t *idx2 = &idx->index2[i];
		for (j = 1; j < idx2->n; ++j)
			if (idx2->offset[j] == 0)
				idx2->offset[j] = idx2->offset[j-1];
	}
}

bam_index_t *bam_index_core(bamFile fp)
{
	bam1_t *b;
	bam_header_t *h;
	int i, ret;
	bam_index_t *idx;
	uint32_t last_bin, save_bin;
	int32_t last_coor, last_tid, save_tid;
	bam1_core_t *c;
	uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;

	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
	b = (bam1_t*)calloc(1, sizeof(bam1_t));
	h = bam_header_read(fp);
	c = &b->core;

	idx->n = h->n_targets;
	bam_header_destroy(h);
	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
	for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));

	save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
	save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
	n_mapped = n_unmapped = n_no_coor = off_end = 0;
	off_beg = off_end = bam_tell(fp);
	while ((ret = bam_read1(fp, b)) >= 0) {
		if (c->tid < 0) ++n_no_coor;
		if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosome
			last_tid = c->tid;
			last_bin = 0xffffffffu;
		} else if ((uint32_t)last_tid > (uint32_t)c->tid) {
			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
					bam1_qname(b), last_tid+1, c->tid+1);
			return NULL;
		} else if ((int32_t)c->tid >= 0 && last_coor > c->pos) {
			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
					bam1_qname(b), last_coor, c->pos, c->tid+1);
			return NULL;
		}
		if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
		if (c->bin != last_bin) { // then possibly write the binning index
			if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens for the first record
				insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
			if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
				off_end = last_off;
				insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
				insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
				n_mapped = n_unmapped = 0;
				off_beg = off_end;
			}
			save_off = last_off;
			save_bin = last_bin = c->bin;
			save_tid = c->tid;
			if (save_tid < 0) break;
		}
		if (bam_tell(fp) <= last_off) {
			fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
					(unsigned long long)bam_tell(fp), (unsigned long long)last_off);
			return NULL;
		}
		if (c->flag & BAM_FUNMAP) ++n_unmapped;
		else ++n_mapped;
		last_off = bam_tell(fp);
		last_coor = b->core.pos;
	}
	if (save_tid >= 0) {
		insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
		insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
		insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
	}
	merge_chunks(idx);
	fill_missing(idx);
	if (ret >= 0) {
		while ((ret = bam_read1(fp, b)) >= 0) {
			++n_no_coor;
			if (c->tid >= 0 && n_no_coor) {
				fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
				return NULL;
			}
		}
	}
	if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continuing anyway. (%d)\n", ret);
	free(b->data); free(b);
	idx->n_no_coor = n_no_coor;
	return idx;
}

void bam_index_destroy(bam_index_t *idx)
{
	khint_t k;
	int i;
	if (idx == 0) return;
	for (i = 0; i < idx->n; ++i) {
		khash_t(i) *index = idx->index[i];
		bam_lidx_t *index2 = idx->index2 + i;
		for (k = kh_begin(index); k != kh_end(index); ++k) {
			if (kh_exist(index, k))
				free(kh_value(index, k).list);
		}
		kh_destroy(i, index);
		free(index2->offset);
	}
	free(idx->index); free(idx->index2);
	free(idx);
}

void bam_index_save(const bam_index_t *idx, FILE *fp)
{
	int32_t i, size;
	khint_t k;
	fwrite("BAI\1", 1, 4, fp);
	if (bam_is_be) {
		uint32_t x = idx->n;
		fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
	} else fwrite(&idx->n, 4, 1, fp);
	for (i = 0; i < idx->n; ++i) {
		khash_t(i) *index = idx->index[i];
		bam_lidx_t *index2 = idx->index2 + i;
		// write binning index
		size = kh_size(index);
		if (bam_is_be) { // big endian
			uint32_t x = size;
			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
		} else fwrite(&size, 4, 1, fp);
		for (k = kh_begin(index); k != kh_end(index); ++k) {
			if (kh_exist(index, k)) {
				bam_binlist_t *p = &kh_value(index, k);
				if (bam_is_be) { // big endian
					uint32_t x;
					x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
					x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
					for (x = 0; (int)x < p->n; ++x) {
						bam_swap_endian_8p(&p->list[x].u);
						bam_swap_endian_8p(&p->list[x].v);
					}
					fwrite(p->list, 16, p->n, fp);
					for (x = 0; (int)x < p->n; ++x) { // swap back to restore the in-memory index
						bam_swap_endian_8p(&p->list[x].u);
						bam_swap_endian_8p(&p->list[x].v);
					}
				} else {
					fwrite(&kh_key(index, k), 4, 1, fp);
					fwrite(&p->n, 4, 1, fp);
					fwrite(p->list, 16, p->n, fp);
				}
			}
		}
		// write linear index (index2)
		if (bam_is_be) {
			int x = index2->n;
			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
		} else fwrite(&index2->n, 4, 1, fp);
		if (bam_is_be) { // big endian
			int x;
			for (x = 0; (int)x < index2->n; ++x)
				bam_swap_endian_8p(&index2->offset[x]);
			fwrite(index2->offset, 8, index2->n, fp);
			for (x = 0; (int)x < index2->n; ++x) // swap back after writing
				bam_swap_endian_8p(&index2->offset[x]);
		} else fwrite(index2->offset, 8, index2->n, fp);
	}
	{ // write the number of coordinate-less records
		uint64_t x = idx->n_no_coor;
		if (bam_is_be) bam_swap_endian_8p(&x);
		fwrite(&x, 8, 1, fp);
	}
	fflush(fp);
}

static bam_index_t *bam_index_load_core(FILE *fp)
{
	int i;
	char magic[4];
	bam_index_t *idx;
	if (fp == 0) {
		fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
		return 0;
	}
	fread(magic, 1, 4, fp);
	if (strncmp(magic, "BAI\1", 4)) {
		fprintf(stderr, "[bam_index_load] wrong magic number.\n");
		fclose(fp);
		return 0;
	}
	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
	fread(&idx->n, 4, 1, fp);
	if (bam_is_be) bam_swap_endian_4p(&idx->n);
	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
	for (i = 0; i < idx->n; ++i) {
		khash_t(i) *index;
		bam_lidx_t *index2 = idx->index2 + i;
		uint32_t key, size;
		khint_t k;
		int j, ret;
		bam_binlist_t *p;
		index = idx->index[i] = kh_init(i);
		// load binning index
		fread(&size, 4, 1, fp);
		if (bam_is_be) bam_swap_endian_4p(&size);
		for (j = 0; j < (int)size; ++j) {
			fread(&key, 4, 1, fp);
			if (bam_is_be) bam_swap_endian_4p(&key);
			k = kh_put(i, index, key, &ret);
			p = &kh_value(index, k);
			fread(&p->n, 4, 1, fp);
			if (bam_is_be) bam_swap_endian_4p(&p->n);
			p->m = p->n;
			p->list = (pair64_t*)malloc(p->m * 16);
			fread(p->list, 16, p->n, fp);
			if (bam_is_be) {
				int x;
				for (x = 0; x < p->n; ++x) {
					bam_swap_endian_8p(&p->list[x].u);
					bam_swap_endian_8p(&p->list[x].v);
				}
			}
		}
		// load linear index
		fread(&index2->n, 4, 1, fp);
		if (bam_is_be) bam_swap_endian_4p(&index2->n);
		index2->m = index2->n;
		index2->offset = (uint64_t*)calloc(index2->m, 8);
		fread(index2->offset, 8, index2->n, fp);
		if (bam_is_be)
			for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
	}
	if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
	if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
	return idx;
}

bam_index_t *bam_index_load_local(const char *_fn)
{
	FILE *fp;
	char *fnidx, *fn;

	if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
		const char *p;
		int l = strlen(_fn);
		for (p = _fn + l - 1; p >= _fn; --p)
			if (*p == '/') break;
		fn = strdup(p + 1);
	} else fn = strdup(_fn);
	fnidx = (char*)calloc(strlen(fn) + 5, 1);
	strcpy(fnidx, fn); strcat(fnidx, ".bai");
	fp = fopen(fnidx, "rb");
	if (fp == 0) { // if "xxx.bam.bai" is absent, try "xxx.bai" instead
		char *s = strstr(fn, "bam");
		if (s == fn + strlen(fn) - 3) {
			strcpy(fnidx, fn);
			fnidx[strlen(fn)-1] = 'i';
			fp = fopen(fnidx, "rb");
		}
	}
	free(fnidx); free(fn);
	if (fp) {
		bam_index_t *idx = bam_index_load_core(fp);
		fclose(fp);
		return idx;
	} else return 0;
}

#ifdef _USE_KNETFILE
static void download_from_remote(const char *url)
{
	const int buf_size = 1 * 1024 * 1024;
	char *fn;
	FILE *fp;
	uint8_t *buf;
	knetFile *fp_remote;
	int l;
	if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
	l = strlen(url);
	for (fn = (char*)url + l - 1; fn >= url; --fn)
		if (*fn == '/') break;
	++fn; // fn now points to the file name
	fp_remote = knet_open(url, "r");
	if (fp_remote == 0) {
		fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
		return;
	}
	if ((fp = fopen(fn, "wb")) == 0) {
		fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
		knet_close(fp_remote);
		return;
	}
	buf = (uint8_t*)calloc(buf_size, 1);
	while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
		fwrite(buf, 1, l, fp);
	free(buf);
	fclose(fp);
	knet_close(fp_remote);
}
#else
static void download_from_remote(const char *url)
{
	return;
}
#endif

bam_index_t *bam_index_load(const char *fn)
{
	bam_index_t *idx;
	idx = bam_index_load_local(fn);
	if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
		char *fnidx = calloc(strlen(fn) + 5, 1);
		strcat(strcpy(fnidx, fn), ".bai");
		fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
		download_from_remote(fnidx);
		free(fnidx);
		idx = bam_index_load_local(fn);
	}
	if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
	return idx;
}

int bam_index_build2(const char *fn, const char *_fnidx)
{
	char *fnidx;
	FILE *fpidx;
	bamFile fp;
	bam_index_t *idx;
	if ((fp = bam_open(fn, "r")) == 0) {
		fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
		return -1;
	}
	idx = bam_index_core(fp);
	bam_close(fp);
	if (idx == 0) {
		fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n");
		return -1;
	}
	if (_fnidx == 0) {
		fnidx = (char*)calloc(strlen(fn) + 5, 1);
		strcpy(fnidx, fn); strcat(fnidx, ".bai");
	} else fnidx = strdup(_fnidx);
	fpidx = fopen(fnidx, "wb");
	if (fpidx == 0) {
		fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
		free(fnidx);
		bam_index_destroy(idx);
		return -1;
	}
	bam_index_save(idx, fpidx);
	bam_index_destroy(idx);
	fclose(fpidx);
	free(fnidx);
	return 0;
}

int bam_index_build(const char *fn)
{
	return bam_index_build2(fn, 0);
}

int bam_index(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
		return 1;
	}
	if (argc >= 3) bam_index_build2(argv[1], argv[2]);
	else bam_index_build(argv[1]);
	return 0;
}

int bam_idxstats(int argc, char *argv[])
{
	bam_index_t *idx;
	bam_header_t *header;
	bamFile fp;
	int i;
	if (argc < 2) {
		fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
		return 1;
	}
	fp = bam_open(argv[1], "r");
	if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
	header = bam_header_read(fp);
	bam_close(fp);
	idx = bam_index_load(argv[1]);
	if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
	for (i = 0; i < idx->n; ++i) {
		khint_t k;
		khash_t(i) *h = idx->index[i];
		printf("%s\t%d", header->target_name[i], header->target_len[i]);
		k = kh_get(i, h, BAM_MAX_BIN);
		if (k != kh_end(h)) // the meta bin stores (#mapped, #unmapped) in its second chunk
			printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
		else printf("\t0\t0");
		putchar('\n');
	}
	printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
	bam_header_destroy(header);
	bam_index_destroy(idx);
	return 0;
}

static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
{
	int i = 0, k;
	if (beg >= end) return 0;
	if (end >= 1u<<29) end = 1u<<29;
	--end;
	list[i++] = 0;
	for (k =    1 + (beg>>26); k <=    1 + (end>>26); ++k) list[i++] = k;
	for (k =    9 + (beg>>23); k <=    9 + (end>>23); ++k) list[i++] = k;
	for (k =   73 + (beg>>20); k <=   73 + (end>>20); ++k) list[i++] = k;
	for (k =  585 + (beg>>17); k <=  585 + (end>>17); ++k) list[i++] = k;
	for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
	return i;
}
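
/*
  Worked example (illustrative, not used elsewhere): for the query region
  [1,000,000, 1,000,100), reg2bins() fills the six candidate bins
  {0, 1, 9, 73, 592, 4742}, one per level of the hierarchy, matching the
  "six bins" remark in the header comment. A hypothetical caller:
*/
static inline int example_reg2bins_usage(void)
{
	uint16_t *bins = (uint16_t*)calloc(BAM_MAX_BIN, 2); // same allocation as bam_iter_query()
	int n = reg2bins(1000000u, 1000100u, bins);          // n == 6 for this region
	free(bins);
	return n;
}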

static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
{
	uint32_t rbeg = b->core.pos;
	uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
	return (rend > beg && rbeg < end);
}

struct __bam_iter_t {
	int from_first; // read from the first record; no random access
	int tid, beg, end, n_off, i, finished;
	uint64_t curr_off;
	pair64_t *off;
};

// bam_fetch helper: retrieve the chunks (virtual offset intervals) that may contain alignments overlapping tid:[beg,end)
bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
{
	uint16_t *bins;
	int i, n_bins, n_off;
	pair64_t *off;
	khint_t k;
	khash_t(i) *index;
	uint64_t min_off;
	bam_iter_t iter = 0;

	if (beg < 0) beg = 0;
	if (end < beg) return 0;
	// initialize iter
	iter = calloc(1, sizeof(struct __bam_iter_t));
	iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
	//
	bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
	n_bins = reg2bins(beg, end, bins);
	index = idx->index[tid];
	if (idx->index2[tid].n > 0) {
		min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
			: idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
		if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
			int n = beg>>BAM_LIDX_SHIFT;
			if (n > idx->index2[tid].n) n = idx->index2[tid].n;
			for (i = n - 1; i >= 0; --i)
				if (idx->index2[tid].offset[i] != 0) break;
			if (i >= 0) min_off = idx->index2[tid].offset[i];
		}
	} else min_off = 0; // tabix 0.1.2 may produce such index files
	for (i = n_off = 0; i < n_bins; ++i) {
		if ((k = kh_get(i, index, bins[i])) != kh_end(index))
			n_off += kh_value(index, k).n;
	}
	if (n_off == 0) {
		free(bins); return iter;
	}
	off = (pair64_t*)calloc(n_off, 16);
	for (i = n_off = 0; i < n_bins; ++i) {
		if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
			int j;
			bam_binlist_t *p = &kh_value(index, k);
			for (j = 0; j < p->n; ++j)
				if (p->list[j].v > min_off) off[n_off++] = p->list[j];
		}
	}
	free(bins);
	if (n_off == 0) {
		free(off); return iter;
	}
	{
		bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
		int l;
		ks_introsort(off, n_off, off);
		// resolve completely contained adjacent blocks
		for (i = 1, l = 0; i < n_off; ++i)
			if (off[l].v < off[i].v)
				off[++l] = off[i];
		n_off = l + 1;
		// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
		for (i = 1; i < n_off; ++i)
			if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
		{ // merge adjacent blocks
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
			for (i = 1, l = 0; i < n_off; ++i) {
#ifdef BAM_TRUE_OFFSET
				if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
#else
				if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
#endif
				else off[++l] = off[i];
			}
			n_off = l + 1;
#endif
		}
		bam_destroy1(b);
	}
	iter->n_off = n_off; iter->off = off;
	return iter;
}

pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
{ // for pysam compatibility
	bam_iter_t iter;
	pair64_t *off;
	iter = bam_iter_query(idx, tid, beg, end);
	off = iter->off; *cnt_off = iter->n_off;
	free(iter);
	return off;
}

void bam_iter_destroy(bam_iter_t iter)
{
	if (iter) { free(iter->off); free(iter); }
}

int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
{
	int ret;
	if (iter && iter->finished) return -1;
	if (iter == 0 || iter->from_first) {
		ret = bam_read1(fp, b);
		if (ret < 0 && iter) iter->finished = 1;
		return ret;
	}
	if (iter->off == 0) return -1;
	for (;;) {
		if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
			if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
			if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
			if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
				bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
				iter->curr_off = bam_tell(fp);
			}
			++iter->i;
		}
		if ((ret = bam_read1(fp, b)) >= 0) {
			iter->curr_off = bam_tell(fp);
			if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
				ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
				break;
			}
			else if (is_overlap(iter->beg, iter->end, b)) return ret;
		} else break; // end of file or error
	}
	iter->finished = 1;
	return ret;
}

int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
{
	int ret;
	bam_iter_t iter;
	bam1_t *b;
	b = bam_init1();
	iter = bam_iter_query(idx, tid, beg, end);
	while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
	bam_iter_destroy(iter);
	bam_destroy1(b);
	return (ret == -1)? 0 : ret;
}
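
/*
  Usage sketch (illustrative, not part of the library): count the alignments
  overlapping a region via the callback API above. The callback signature is
  assumed to match the bam_fetch_f typedef in bam.h; the numeric tid/beg/end
  are placeholders that a real caller would obtain from the BAM header (for
  example with bam_parse_region()).
*/
static int example_count_cb(const bam1_t *b, void *data)
{
	(void)b;       // the record itself is not inspected in this sketch
	++*(int*)data; // just count overlapping records
	return 0;
}

static int example_fetch_count(const char *fn, int tid, int beg, int end)
{
	int n = 0;
	bamFile fp = bam_open(fn, "r");
	bam_index_t *idx;
	bam_header_t *h;
	if (fp == 0) return -1;
	h = bam_header_read(fp);  // consume the header before random access
	idx = bam_index_load(fn); // loads "fn.bai" (or downloads it for FTP/HTTP URLs)
	if (idx == 0) { bam_header_destroy(h); bam_close(fp); return -1; }
	bam_fetch(fp, idx, tid, beg, end, &n, example_count_cb);
	bam_index_destroy(idx);
	bam_header_destroy(h);
	bam_close(fp);
	return n; // number of records overlapping tid:[beg,end)
}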