1 #include <stdlib.h>
2 #include <stdint.h>
3 #include <string.h>
4 #include <stdio.h>
5 #include <zlib.h>
6 
7 #ifdef _WIN32
8 #define drand48() ((double)rand() / RAND_MAX)
9 #endif
10 
11 #include "ksort.h"
12 KSORT_INIT_GENERIC(uint64_t)
13 
14 #include "kseq.h"
15 KSTREAM_INIT(gzFile, gzread, 8192)
16 
17 typedef struct {
18 	int n, m;
19 	uint64_t *a;
20 	int *idx;
21 } bed_reglist_t;
22 
23 #include "khash.h"
24 KHASH_MAP_INIT_STR(reg, bed_reglist_t)
25 
26 #define LIDX_SHIFT 13
27 
28 typedef kh_reg_t reghash_t;
29 
bed_index_core(int n,uint64_t * a,int * n_idx)30 int *bed_index_core(int n, uint64_t *a, int *n_idx)
31 {
32 	int i, j, m, *idx;
33 	m = *n_idx = 0; idx = 0;
34 	for (i = 0; i < n; ++i) {
35 		int beg, end;
36 		beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT;
37 		if (m < end + 1) {
38 			int oldm = m;
39 			m = end + 1;
40 			kroundup32(m);
41 			idx = realloc(idx, m * sizeof(int));
42 			for (j = oldm; j < m; ++j) idx[j] = -1;
43 		}
44 		if (beg == end) {
45 			if (idx[beg] < 0) idx[beg] = i;
46 		} else {
47 			for (j = beg; j <= end; ++j)
48 				if (idx[j] < 0) idx[j] = i;
49 		}
50 		*n_idx = end + 1;
51 	}
52 	return idx;
53 }
54 
bed_index(void * _h)55 void bed_index(void *_h)
56 {
57 	reghash_t *h = (reghash_t*)_h;
58 	khint_t k;
59 	for (k = 0; k < kh_end(h); ++k) {
60 		if (kh_exist(h, k)) {
61 			bed_reglist_t *p = &kh_val(h, k);
62 			if (p->idx) free(p->idx);
63 			ks_introsort(uint64_t, p->n, p->a);
64 			p->idx = bed_index_core(p->n, p->a, &p->m);
65 		}
66 	}
67 }
68 
bed_overlap_core(const bed_reglist_t * p,int beg,int end)69 int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
70 {
71 	int i, min_off;
72 	if (p->n == 0) return 0;
73 	min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
74 	if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
75 		int n = beg>>LIDX_SHIFT;
76 		if (n > p->n) n = p->n;
77 		for (i = n - 1; i >= 0; --i)
78 			if (p->idx[i] >= 0) break;
79 		min_off = i >= 0? p->idx[i] : 0;
80 	}
81 	for (i = min_off; i < p->n; ++i) {
82 		if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
83 		if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
84 			return 1; // find the overlap; return
85 	}
86 	return 0;
87 }
88 
bed_overlap(const void * _h,const char * chr,int beg,int end)89 int bed_overlap(const void *_h, const char *chr, int beg, int end)
90 {
91 	const reghash_t *h = (const reghash_t*)_h;
92 	khint_t k;
93 	if (!h) return 0;
94 	k = kh_get(reg, h, chr);
95 	if (k == kh_end(h)) return 0;
96 	return bed_overlap_core(&kh_val(h, k), beg, end);
97 }
98 
bed_read(const char * fn)99 void *bed_read(const char *fn)
100 {
101 	reghash_t *h = kh_init(reg);
102 	gzFile fp;
103 	kstream_t *ks;
104 	int dret;
105 	kstring_t *str;
106 	// read the list
107 	fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
108 	if (fp == 0) return 0;
109 	str = calloc(1, sizeof(kstring_t));
110 	ks = ks_init(fp);
111 	while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
112 		int beg = -1, end = -1;
113 		bed_reglist_t *p;
114 		khint_t k = kh_get(reg, h, str->s);
115 		if (k == kh_end(h)) { // absent from the hash table
116 			int ret;
117 			char *s = strdup(str->s);
118 			k = kh_put(reg, h, s, &ret);
119 			memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
120 		}
121 		p = &kh_val(h, k);
122 		if (dret != '\n') { // if the lines has other characters
123 			if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
124 				beg = atoi(str->s); // begin
125 				if (dret != '\n') {
126 					if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
127 						end = atoi(str->s); // end
128 						if (end < beg) end = -1;
129 					}
130 				}
131 			}
132 		}
133 		if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
134 		if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
135 		if (beg >= 0 && end > beg) {
136 			if (p->n == p->m) {
137 				p->m = p->m? p->m<<1 : 4;
138 				p->a = realloc(p->a, p->m * 8);
139 			}
140 			p->a[p->n++] = (uint64_t)beg<<32 | end;
141 		}
142 	}
143 	ks_destroy(ks);
144 	gzclose(fp);
145 	free(str->s); free(str);
146 	bed_index(h);
147 	return h;
148 }
149 
bed_destroy(void * _h)150 void bed_destroy(void *_h)
151 {
152 	reghash_t *h = (reghash_t*)_h;
153 	khint_t k;
154 	for (k = 0; k < kh_end(h); ++k) {
155 		if (kh_exist(h, k)) {
156 			free(kh_val(h, k).a);
157 			free(kh_val(h, k).idx);
158 			free((char*)kh_key(h, k));
159 		}
160 	}
161 	kh_destroy(reg, h);
162 }
163