1 #include <stdlib.h>
2 #include <string.h>
3 #include <assert.h>
4 #include <stdio.h>
5 #include <zlib.h>
6 #include "rle.h"
7 #include "rope.h"
8 
9 /*******************
10  *** Memory Pool ***
11  *******************/
12 
13 #define MP_CHUNK_SIZE 0x100000 // 1MB per chunk
14 
15 typedef struct { // memory pool for fast and compact memory allocation (no free)
16 	int size, i, n_elems;
17 	int64_t top, max;
18 	uint8_t **mem;
19 } mempool_t;
20 
mp_init(int size)21 static mempool_t *mp_init(int size)
22 {
23 	mempool_t *mp;
24 	mp = calloc(1, sizeof(mempool_t));
25 	mp->size = size;
26 	mp->i = mp->n_elems = MP_CHUNK_SIZE / size;
27 	mp->top = -1;
28 	return mp;
29 }
30 
mp_destroy(mempool_t * mp)31 static void mp_destroy(mempool_t *mp)
32 {
33 	int64_t i;
34 	for (i = 0; i <= mp->top; ++i) free(mp->mem[i]);
35 	free(mp->mem); free(mp);
36 }
37 
mp_alloc(mempool_t * mp)38 static inline void *mp_alloc(mempool_t *mp)
39 {
40 	if (mp->i == mp->n_elems) {
41 		if (++mp->top == mp->max) {
42 			mp->max = mp->max? mp->max<<1 : 1;
43 			mp->mem = realloc(mp->mem, sizeof(void*) * mp->max);
44 		}
45 		mp->mem[mp->top] = calloc(mp->n_elems, mp->size);
46 		mp->i = 0;
47 	}
48 	return mp->mem[mp->top] + (mp->i++) * mp->size;
49 }
50 
51 /***************
52  *** B+ rope ***
53  ***************/
54 
rope_init(int max_nodes,int block_len)55 rope_t *rope_init(int max_nodes, int block_len)
56 {
57 	rope_t *rope;
58 	rope = calloc(1, sizeof(rope_t));
59 	if (block_len < 32) block_len = 32;
60 	rope->max_nodes = (max_nodes+ 1)>>1<<1;
61 	rope->block_len = (block_len + 7) >> 3 << 3;
62 	rope->node = mp_init(sizeof(rpnode_t) * rope->max_nodes);
63 	rope->leaf = mp_init(rope->block_len);
64 	rope->root = mp_alloc(rope->node);
65 	rope->root->n = 1;
66 	rope->root->is_bottom = 1;
67 	rope->root->p = mp_alloc(rope->leaf);
68 	return rope;
69 }
70 
rope_destroy(rope_t * rope)71 void rope_destroy(rope_t *rope)
72 {
73 	mp_destroy(rope->node);
74 	mp_destroy(rope->leaf);
75 	free(rope);
76 }
77 
split_node(rope_t * rope,rpnode_t * u,rpnode_t * v)78 static inline rpnode_t *split_node(rope_t *rope, rpnode_t *u, rpnode_t *v)
79 { // split $v's child. $u is the first node in the bucket. $v and $u are in the same bucket. IMPORTANT: there is always enough room in $u
80 	int j, i = v - u;
81 	rpnode_t *w; // $w is the sibling of $v
82 	if (u == 0) { // only happens at the root; add a new root
83 		u = v = mp_alloc(rope->node);
84 		v->n = 1; v->p = rope->root; // the new root has the old root as the only child
85 		memcpy(v->c, rope->c, 48);
86 		for (j = 0; j < 6; ++j) v->l += v->c[j];
87 		rope->root = v;
88 	}
89 	if (i != u->n - 1) // then make room for a new node
90 		memmove(v + 2, v + 1, sizeof(rpnode_t) * (u->n - i - 1));
91 	++u->n; w = v + 1;
92 	memset(w, 0, sizeof(rpnode_t));
93 	w->p = mp_alloc(u->is_bottom? rope->leaf : rope->node);
94 	if (u->is_bottom) { // we are at the bottom level; $v->p is a string instead of a node
95 		uint8_t *p = (uint8_t*)v->p, *q = (uint8_t*)w->p;
96 		rle_split(p, q);
97 		rle_count(q, w->c);
98 	} else { // $v->p is a node, not a string
99 		rpnode_t *p = v->p, *q = w->p; // $v and $w are siblings and thus $p and $q are cousins
100 		p->n -= rope->max_nodes>>1;
101 		memcpy(q, p + p->n, sizeof(rpnode_t) * (rope->max_nodes>>1));
102 		q->n = rope->max_nodes>>1; // NB: this line must below memcpy() as $q->n and $q->is_bottom are modified by memcpy()
103 		q->is_bottom = p->is_bottom;
104 		for (i = 0; i < q->n; ++i)
105 			for (j = 0; j < 6; ++j)
106 				w->c[j] += q[i].c[j];
107 	}
108 	for (j = 0; j < 6; ++j) // compute $w->l and update $v->c
109 		w->l += w->c[j], v->c[j] -= w->c[j];
110 	v->l -= w->l; // update $v->c
111 	return v;
112 }
113 
rope_insert_run(rope_t * rope,int64_t x,int a,int64_t rl,rpcache_t * cache)114 int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache)
115 { // insert $a after $x symbols in $rope and the returns rank(a, x)
116 	rpnode_t *u = 0, *v = 0, *p = rope->root; // $v is the parent of $p; $u and $v are at the same level and $u is the first node in the bucket
117 	int64_t y = 0, z = 0, cnt[6];
118 	int n_runs;
119 	do { // top-down update. Searching and node splitting are done together in one pass.
120 		if (p->n == rope->max_nodes) { // node is full; split
121 			v = split_node(rope, u, v); // $v points to the parent of $p; when a new root is added, $v points to the root
122 			if (y + v->l < x) // if $v is not long enough after the split, we need to move both $p and its parent $v
123 				y += v->l, z += v->c[a], ++v, p = v->p;
124 		}
125 		u = p;
126 		if (v && x - y > v->l>>1) { // then search backwardly for the right node to descend
127 			p += p->n - 1; y += v->l; z += v->c[a];
128 			for (; y >= x; --p) y -= p->l, z -= p->c[a];
129 			++p;
130 		} else for (; y + p->l < x; ++p) y += p->l, z += p->c[a]; // then search forwardly
131 		assert(p - u < u->n);
132 		if (v) v->c[a] += rl, v->l += rl; // we should not change p->c[a] because this may cause troubles when p's child is split
133 		v = p; p = p->p; // descend
134 	} while (!u->is_bottom);
135 	rope->c[a] += rl; // $rope->c should be updated after the loop as adding a new root needs the old $rope->c counts
136 	if (cache) {
137 		if (cache->p != (uint8_t*)p) memset(cache, 0, sizeof(rpcache_t));
138 		n_runs = rle_insert_cached((uint8_t*)p, x - y, a, rl, cnt, v->c, &cache->beg, cache->bc);
139 		cache->p = (uint8_t*)p;
140 	} else n_runs = rle_insert((uint8_t*)p, x - y, a, rl, cnt, v->c);
141 	z += cnt[a];
142 	v->c[a] += rl; v->l += rl; // this should be after rle_insert(); otherwise rle_insert() won't work
143 	if (n_runs + RLE_MIN_SPACE > rope->block_len) {
144 		split_node(rope, u, v);
145 		if (cache) memset(cache, 0, sizeof(rpcache_t));
146 	}
147 	return z;
148 }
149 
rope_count_to_leaf(const rope_t * rope,int64_t x,int64_t cx[6],int64_t * rest)150 static rpnode_t *rope_count_to_leaf(const rope_t *rope, int64_t x, int64_t cx[6], int64_t *rest)
151 {
152 	rpnode_t *u, *v = 0, *p = rope->root;
153 	int64_t y = 0;
154 	int a;
155 
156 	memset(cx, 0, 48);
157 	do {
158 		u = p;
159 		if (v && x - y > v->l>>1) {
160 			p += p->n - 1; y += v->l;
161 			for (a = 0; a != 6; ++a) cx[a] += v->c[a];
162 			for (; y >= x; --p) {
163 				y -= p->l;
164 				for (a = 0; a != 6; ++a) cx[a] -= p->c[a];
165 			}
166 			++p;
167 		} else {
168 			for (; y + p->l < x; ++p) {
169 				y += p->l;
170 				for (a = 0; a != 6; ++a) cx[a] += p->c[a];
171 			}
172 		}
173 		v = p; p = p->p;
174 	} while (!u->is_bottom);
175 	*rest = x - y;
176 	return v;
177 }
178 
rope_rank2a(const rope_t * rope,int64_t x,int64_t y,int64_t * cx,int64_t * cy)179 void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy)
180 {
181 	rpnode_t *v;
182 	int64_t rest;
183 	v = rope_count_to_leaf(rope, x, cx, &rest);
184 	if (y < x || cy == 0) {
185 		rle_rank1a((const uint8_t*)v->p, rest, cx, v->c);
186 	} else if (rest + (y - x) <= v->l) {
187 		memcpy(cy, cx, 48);
188 		rle_rank2a((const uint8_t*)v->p, rest, rest + (y - x), cx, cy, v->c);
189 	} else {
190 		rle_rank1a((const uint8_t*)v->p, rest, cx, v->c);
191 		v = rope_count_to_leaf(rope, y, cy, &rest);
192 		rle_rank1a((const uint8_t*)v->p, rest, cy, v->c);
193 	}
194 }
195 
196 /*********************
197  *** Rope iterator ***
198  *********************/
199 
rope_itr_first(const rope_t * rope,rpitr_t * i)200 void rope_itr_first(const rope_t *rope, rpitr_t *i)
201 {
202 	memset(i, 0, sizeof(rpitr_t));
203 	i->rope = rope;
204 	for (i->pa[i->d] = rope->root; !i->pa[i->d]->is_bottom;) // descend to the leftmost leaf
205 		++i->d, i->pa[i->d] = i->pa[i->d - 1]->p;
206 }
207 
rope_itr_next_block(rpitr_t * i)208 const uint8_t *rope_itr_next_block(rpitr_t *i)
209 {
210 	const uint8_t *ret;
211 	assert(i->d < ROPE_MAX_DEPTH); // a B+ tree should not be that tall
212 	if (i->d < 0) return 0;
213 	ret = (uint8_t*)i->pa[i->d][i->ia[i->d]].p;
214 	while (i->d >= 0 && ++i->ia[i->d] == i->pa[i->d]->n) i->ia[i->d--] = 0; // backtracking
215 	if (i->d >= 0)
216 		while (!i->pa[i->d]->is_bottom) // descend to the leftmost leaf
217 			++i->d, i->pa[i->d] = i->pa[i->d - 1][i->ia[i->d - 1]].p;
218 	return ret;
219 }
220 
221 /***********
222  *** I/O ***
223  ***********/
224 
rope_print_node(const rpnode_t * p)225 void rope_print_node(const rpnode_t *p)
226 {
227 	if (p->is_bottom) {
228 		int i;
229 		putchar('(');
230 		for (i = 0; i < p->n; ++i) {
231 			uint8_t *block = (uint8_t*)p[i].p;
232 			const uint8_t *q = block + 2, *end = block + 2 + *rle_nptr(block);
233 			if (i) putchar(',');
234 			while (q < end) {
235 				int c = 0;
236 				int64_t j, l;
237 				rle_dec1(q, c, l);
238 				for (j = 0; j < l; ++j) putchar("$ACGTN"[c]);
239 			}
240 		}
241 		putchar(')');
242 	} else {
243 		int i;
244 		putchar('(');
245 		for (i = 0; i < p->n; ++i) {
246 			if (i) putchar(',');
247 			rope_print_node(p[i].p);
248 		}
249 		putchar(')');
250 	}
251 }
252 
rope_dump_node(const rpnode_t * p,FILE * fp)253 void rope_dump_node(const rpnode_t *p, FILE *fp)
254 {
255 	int16_t i, n = p->n;
256 	uint8_t is_bottom = p->is_bottom;
257 	fwrite(&is_bottom, 1, 1, fp);
258 	fwrite(&n, 2, 1, fp);
259 	if (is_bottom) {
260 		for (i = 0; i < n; ++i) {
261 			fwrite(p[i].c, 8, 6, fp);
262 			fwrite(p[i].p, 1, *rle_nptr(p[i].p) + 2, fp);
263 		}
264 	} else {
265 		for (i = 0; i < p->n; ++i)
266 			rope_dump_node(p[i].p, fp);
267 	}
268 }
269 
rope_dump(const rope_t * r,FILE * fp)270 void rope_dump(const rope_t *r, FILE *fp)
271 {
272 	fwrite(&r->max_nodes, 4, 1, fp);
273 	fwrite(&r->block_len, 4, 1, fp);
274 	rope_dump_node(r->root, fp);
275 }
276 
rope_restore_node(const rope_t * r,FILE * fp,int64_t c[6])277 rpnode_t *rope_restore_node(const rope_t *r, FILE *fp, int64_t c[6])
278 {
279 	uint8_t is_bottom, a;
280 	int16_t i, n;
281 	rpnode_t *p;
282 	fread(&is_bottom, 1, 1, fp);
283 	fread(&n, 2, 1, fp);
284 	p = mp_alloc(r->node);
285 	p->is_bottom = is_bottom, p->n = n;
286 	if (is_bottom) {
287 		for (i = 0; i < n; ++i) {
288 			uint16_t *q;
289 			p[i].p = mp_alloc(r->leaf);
290 			q = rle_nptr(p[i].p);
291 			fread(p[i].c, 8, 6, fp);
292 			fread(q, 2, 1, fp);
293 			fread(q + 1, 1, *q, fp);
294 		}
295 	} else {
296 		for (i = 0; i < n; ++i)
297 			p[i].p = rope_restore_node(r, fp, p[i].c);
298 	}
299 	memset(c, 0, 48);
300 	for (i = 0; i < n; ++i) {
301 		p[i].l = 0;
302 		for (a = 0; a < 6; ++a)
303 			c[a] += p[i].c[a], p[i].l += p[i].c[a];
304 	}
305 	return p;
306 }
307 
rope_restore(FILE * fp)308 rope_t *rope_restore(FILE *fp)
309 {
310 	rope_t *r;
311 	r = calloc(1, sizeof(rope_t));
312 	fread(&r->max_nodes, 4, 1, fp);
313 	fread(&r->block_len, 4, 1, fp);
314 	r->node = mp_init(sizeof(rpnode_t) * r->max_nodes);
315 	r->leaf = mp_init(r->block_len);
316 	r->root = rope_restore_node(r, fp, r->c);
317 	return r;
318 }
319