1 #include <stdlib.h>
2 #include <assert.h>
3 #if defined(WIN32) || defined(_WIN32)
4 #include <io.h> // for open(2)
5 #else
6 #include <unistd.h>
7 #endif
8 #include <fcntl.h>
9 #include <stdio.h>
10 #define __STDC_LIMIT_MACROS
11 #include "kthread.h"
12 #include "bseq.h"
13 #include "minimap.h"
14 #include "mmpriv.h"
15 #include "kvec.h"
16 #include "khash.h"
17 
18 #define idx_hash(a) ((a)>>1)
19 #define idx_eq(a, b) ((a)>>1 == (b)>>1)
20 KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
21 typedef khash_t(idx) idxhash_t;
22 
23 KHASH_MAP_INIT_STR(str, uint32_t)
24 
25 #define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
26 
27 typedef struct mm_idx_bucket_s {
28 	mm128_v a;   // (minimizer, position) array
29 	int32_t n;   // size of the _p_ array
30 	uint64_t *p; // position array for minimizers appearing >1 times
31 	void *h;     // hash table indexing _p_ and minimizers appearing once
32 } mm_idx_bucket_t;
33 
34 typedef struct {
35 	int32_t st, en, max; // max is not used for now
36 	int32_t score:30, strand:2;
37 } mm_idx_intv1_t;
38 
39 typedef struct mm_idx_intv_s {
40 	int32_t n, m;
41 	mm_idx_intv1_t *a;
42 } mm_idx_intv_t;
43 
mm_idx_init(int w,int k,int b,int flag)44 mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
45 {
46 	mm_idx_t *mi;
47 	if (k*2 < b) b = k * 2;
48 	if (w < 1) w = 1;
49 	mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
50 	mi->w = w, mi->k = k, mi->b = b, mi->flag = flag;
51 	mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t));
52 	if (!(mm_dbg_flag & 1)) mi->km = km_init();
53 	return mi;
54 }
55 
mm_idx_destroy(mm_idx_t * mi)56 void mm_idx_destroy(mm_idx_t *mi)
57 {
58 	uint32_t i;
59 	if (mi == 0) return;
60 	if (mi->h) kh_destroy(str, (khash_t(str)*)mi->h);
61 	if (mi->B) {
62 		for (i = 0; i < 1U<<mi->b; ++i) {
63 			free(mi->B[i].p);
64 			free(mi->B[i].a.a);
65 			kh_destroy(idx, (idxhash_t*)mi->B[i].h);
66 		}
67 	}
68 	if (mi->I) {
69 		for (i = 0; i < mi->n_seq; ++i)
70 			free(mi->I[i].a);
71 		free(mi->I);
72 	}
73 	if (!mi->km) {
74 		for (i = 0; i < mi->n_seq; ++i)
75 			free(mi->seq[i].name);
76 		free(mi->seq);
77 	} else km_destroy(mi->km);
78 	free(mi->B); free(mi->S); free(mi);
79 }
80 
mm_idx_get(const mm_idx_t * mi,uint64_t minier,int * n)81 const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n)
82 {
83 	int mask = (1<<mi->b) - 1;
84 	khint_t k;
85 	mm_idx_bucket_t *b = &mi->B[minier&mask];
86 	idxhash_t *h = (idxhash_t*)b->h;
87 	*n = 0;
88 	if (h == 0) return 0;
89 	k = kh_get(idx, h, minier>>mi->b<<1);
90 	if (k == kh_end(h)) return 0;
91 	if (kh_key(h, k)&1) { // special casing when there is only one k-mer
92 		*n = 1;
93 		return &kh_val(h, k);
94 	} else {
95 		*n = (uint32_t)kh_val(h, k);
96 		return &b->p[kh_val(h, k)>>32];
97 	}
98 }
99 
mm_idx_stat(const mm_idx_t * mi)100 void mm_idx_stat(const mm_idx_t *mi)
101 {
102 	int n = 0, n1 = 0;
103 	uint32_t i;
104 	uint64_t sum = 0, len = 0;
105 	fprintf(stderr, "[M::%s] kmer size: %d; skip: %d; is_hpc: %d; #seq: %d\n", __func__, mi->k, mi->w, mi->flag&MM_I_HPC, mi->n_seq);
106 	for (i = 0; i < mi->n_seq; ++i)
107 		len += mi->seq[i].len;
108 	for (i = 0; i < 1U<<mi->b; ++i)
109 		if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
110 	for (i = 0; i < 1U<<mi->b; ++i) {
111 		idxhash_t *h = (idxhash_t*)mi->B[i].h;
112 		khint_t k;
113 		if (h == 0) continue;
114 		for (k = 0; k < kh_end(h); ++k)
115 			if (kh_exist(h, k)) {
116 				sum += kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
117 				if (kh_key(h, k)&1) ++n1;
118 			}
119 	}
120 	fprintf(stderr, "[M::%s::%.3f*%.2f] distinct minimizers: %d (%.2f%% are singletons); average occurrences: %.3lf; average spacing: %.3lf; total length: %ld\n",
121 			__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), n, 100.0*n1/n, (double)sum / n, (double)len / sum, (long)len);
122 }
123 
mm_idx_index_name(mm_idx_t * mi)124 int mm_idx_index_name(mm_idx_t *mi)
125 {
126 	khash_t(str) *h;
127 	uint32_t i;
128 	int has_dup = 0, absent;
129 	if (mi->h) return 0;
130 	h = kh_init(str);
131 	for (i = 0; i < mi->n_seq; ++i) {
132 		khint_t k;
133 		k = kh_put(str, h, mi->seq[i].name, &absent);
134 		if (absent) kh_val(h, k) = i;
135 		else has_dup = 1;
136 	}
137 	mi->h = h;
138 	if (has_dup && mm_verbose >= 2)
139 		fprintf(stderr, "[WARNING] some database sequences have identical sequence names\n");
140 	return has_dup;
141 }
142 
mm_idx_name2id(const mm_idx_t * mi,const char * name)143 int mm_idx_name2id(const mm_idx_t *mi, const char *name)
144 {
145 	khash_t(str) *h = (khash_t(str)*)mi->h;
146 	khint_t k;
147 	if (h == 0) return -2;
148 	k = kh_get(str, h, name);
149 	return k == kh_end(h)? -1 : kh_val(h, k);
150 }
151 
mm_idx_getseq(const mm_idx_t * mi,uint32_t rid,uint32_t st,uint32_t en,uint8_t * seq)152 int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
153 {
154 	uint64_t i, st1, en1;
155 	if (rid >= mi->n_seq || st >= mi->seq[rid].len) return -1;
156 	if (en > mi->seq[rid].len) en = mi->seq[rid].len;
157 	st1 = mi->seq[rid].offset + st;
158 	en1 = mi->seq[rid].offset + en;
159 	for (i = st1; i < en1; ++i)
160 		seq[i - st1] = mm_seq4_get(mi->S, i);
161 	return en - st;
162 }
163 
mm_idx_getseq_rev(const mm_idx_t * mi,uint32_t rid,uint32_t st,uint32_t en,uint8_t * seq)164 int mm_idx_getseq_rev(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
165 {
166 	uint64_t i, st1, en1;
167 	const mm_idx_seq_t *s;
168 	if (rid >= mi->n_seq || st >= mi->seq[rid].len) return -1;
169 	s = &mi->seq[rid];
170 	if (en > s->len) en = s->len;
171 	st1 = s->offset + (s->len - en);
172 	en1 = s->offset + (s->len - st);
173 	for (i = st1; i < en1; ++i) {
174 		uint8_t c = mm_seq4_get(mi->S, i);
175 		seq[en1 - i - 1] = c < 4? 3 - c : c;
176 	}
177 	return en - st;
178 }
179 
mm_idx_getseq2(const mm_idx_t * mi,int is_rev,uint32_t rid,uint32_t st,uint32_t en,uint8_t * seq)180 int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
181 {
182 	if (is_rev) return mm_idx_getseq_rev(mi, rid, st, en, seq);
183 	else return mm_idx_getseq(mi, rid, st, en, seq);
184 }
185 
mm_idx_cal_max_occ(const mm_idx_t * mi,float f)186 int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
187 {
188 	int i;
189 	size_t n = 0;
190 	uint32_t thres;
191 	khint_t *a, k;
192 	if (f <= 0.) return INT32_MAX;
193 	for (i = 0; i < 1<<mi->b; ++i)
194 		if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
195 	a = (uint32_t*)malloc(n * 4);
196 	for (i = n = 0; i < 1<<mi->b; ++i) {
197 		idxhash_t *h = (idxhash_t*)mi->B[i].h;
198 		if (h == 0) continue;
199 		for (k = 0; k < kh_end(h); ++k) {
200 			if (!kh_exist(h, k)) continue;
201 			a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
202 		}
203 	}
204 	thres = ks_ksmall_uint32_t(n, a, (uint32_t)((1. - f) * n)) + 1;
205 	free(a);
206 	return thres;
207 }
208 
209 /*********************************
210  * Sort and generate hash tables *
211  *********************************/
212 
worker_post(void * g,long i,int tid)213 static void worker_post(void *g, long i, int tid)
214 {
215 	int n, n_keys;
216 	size_t j, start_a, start_p;
217 	idxhash_t *h;
218 	mm_idx_t *mi = (mm_idx_t*)g;
219 	mm_idx_bucket_t *b = &mi->B[i];
220 	if (b->a.n == 0) return;
221 
222 	// sort by minimizer
223 	radix_sort_128x(b->a.a, b->a.a + b->a.n);
224 
225 	// count and preallocate
226 	for (j = 1, n = 1, n_keys = 0, b->n = 0; j <= b->a.n; ++j) {
227 		if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
228 			++n_keys;
229 			if (n > 1) b->n += n;
230 			n = 1;
231 		} else ++n;
232 	}
233 	h = kh_init(idx);
234 	kh_resize(idx, h, n_keys);
235 	b->p = (uint64_t*)calloc(b->n, 8);
236 
237 	// create the hash table
238 	for (j = 1, n = 1, start_a = start_p = 0; j <= b->a.n; ++j) {
239 		if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
240 			khint_t itr;
241 			int absent;
242 			mm128_t *p = &b->a.a[j-1];
243 			itr = kh_put(idx, h, p->x>>8>>mi->b<<1, &absent);
244 			assert(absent && j == start_a + n);
245 			if (n == 1) {
246 				kh_key(h, itr) |= 1;
247 				kh_val(h, itr) = p->y;
248 			} else {
249 				int k;
250 				for (k = 0; k < n; ++k)
251 					b->p[start_p + k] = b->a.a[start_a + k].y;
252 				radix_sort_64(&b->p[start_p], &b->p[start_p + n]); // sort by position; needed as in-place radix_sort_128x() is not stable
253 				kh_val(h, itr) = (uint64_t)start_p<<32 | n;
254 				start_p += n;
255 			}
256 			start_a = j, n = 1;
257 		} else ++n;
258 	}
259 	b->h = h;
260 	assert(b->n == (int32_t)start_p);
261 
262 	// deallocate and clear b->a
263 	kfree(0, b->a.a);
264 	b->a.n = b->a.m = 0, b->a.a = 0;
265 }
266 
mm_idx_post(mm_idx_t * mi,int n_threads)267 static void mm_idx_post(mm_idx_t *mi, int n_threads)
268 {
269 	kt_for(n_threads, worker_post, mi, 1<<mi->b);
270 }
271 
272 /******************
273  * Generate index *
274  ******************/
275 
276 #include <string.h>
277 #include <zlib.h>
278 #include "bseq.h"
279 
280 typedef struct {
281 	int mini_batch_size;
282 	uint64_t batch_size, sum_len;
283 	mm_bseq_file_t *fp;
284 	mm_idx_t *mi;
285 } pipeline_t;
286 
287 typedef struct {
288     int n_seq;
289 	mm_bseq1_t *seq;
290 	mm128_v a;
291 } step_t;
292 
mm_idx_add(mm_idx_t * mi,int n,const mm128_t * a)293 static void mm_idx_add(mm_idx_t *mi, int n, const mm128_t *a)
294 {
295 	int i, mask = (1<<mi->b) - 1;
296 	for (i = 0; i < n; ++i) {
297 		mm128_v *p = &mi->B[a[i].x>>8&mask].a;
298 		kv_push(mm128_t, 0, *p, a[i]);
299 	}
300 }
301 
worker_pipeline(void * shared,int step,void * in)302 static void *worker_pipeline(void *shared, int step, void *in)
303 {
304 	int i;
305     pipeline_t *p = (pipeline_t*)shared;
306     if (step == 0) { // step 0: read sequences
307         step_t *s;
308 		if (p->sum_len > p->batch_size) return 0;
309         s = (step_t*)calloc(1, sizeof(step_t));
310 		s->seq = mm_bseq_read(p->fp, p->mini_batch_size, 0, &s->n_seq); // read a mini-batch
311 		if (s->seq) {
312 			uint32_t old_m, m;
313 			assert((uint64_t)p->mi->n_seq + s->n_seq <= UINT32_MAX); // to prevent integer overflow
314 			// make room for p->mi->seq
315 			old_m = p->mi->n_seq, m = p->mi->n_seq + s->n_seq;
316 			kroundup32(m); kroundup32(old_m);
317 			if (old_m != m)
318 				p->mi->seq = (mm_idx_seq_t*)krealloc(p->mi->km, p->mi->seq, m * sizeof(mm_idx_seq_t));
319 			// make room for p->mi->S
320 			if (!(p->mi->flag & MM_I_NO_SEQ)) {
321 				uint64_t sum_len, old_max_len, max_len;
322 				for (i = 0, sum_len = 0; i < s->n_seq; ++i) sum_len += s->seq[i].l_seq;
323 				old_max_len = (p->sum_len + 7) / 8;
324 				max_len = (p->sum_len + sum_len + 7) / 8;
325 				kroundup64(old_max_len); kroundup64(max_len);
326 				if (old_max_len != max_len) {
327 					p->mi->S = (uint32_t*)realloc(p->mi->S, max_len * 4);
328 					memset(&p->mi->S[old_max_len], 0, 4 * (max_len - old_max_len));
329 				}
330 			}
331 			// populate p->mi->seq
332 			for (i = 0; i < s->n_seq; ++i) {
333 				mm_idx_seq_t *seq = &p->mi->seq[p->mi->n_seq];
334 				uint32_t j;
335 				if (!(p->mi->flag & MM_I_NO_NAME)) {
336 					seq->name = (char*)kmalloc(p->mi->km, strlen(s->seq[i].name) + 1);
337 					strcpy(seq->name, s->seq[i].name);
338 				} else seq->name = 0;
339 				seq->len = s->seq[i].l_seq;
340 				seq->offset = p->sum_len;
341 				seq->is_alt = 0;
342 				// copy the sequence
343 				if (!(p->mi->flag & MM_I_NO_SEQ)) {
344 					for (j = 0; j < seq->len; ++j) { // TODO: this is not the fastest way, but let's first see if speed matters here
345 						uint64_t o = p->sum_len + j;
346 						int c = seq_nt4_table[(uint8_t)s->seq[i].seq[j]];
347 						mm_seq4_set(p->mi->S, o, c);
348 					}
349 				}
350 				// update p->sum_len and p->mi->n_seq
351 				p->sum_len += seq->len;
352 				s->seq[i].rid = p->mi->n_seq++;
353 			}
354 			return s;
355 		} else free(s);
356     } else if (step == 1) { // step 1: compute sketch
357         step_t *s = (step_t*)in;
358 		for (i = 0; i < s->n_seq; ++i) {
359 			mm_bseq1_t *t = &s->seq[i];
360 			if (t->l_seq > 0)
361 				mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a);
362 			else if (mm_verbose >= 2)
363 				fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name);
364 			free(t->seq); free(t->name);
365 		}
366 		free(s->seq); s->seq = 0;
367 		return s;
368     } else if (step == 2) { // dispatch sketch to buckets
369         step_t *s = (step_t*)in;
370 		mm_idx_add(p->mi, s->a.n, s->a.a);
371 		kfree(0, s->a.a); free(s);
372 	}
373     return 0;
374 }
375 
mm_idx_gen(mm_bseq_file_t * fp,int w,int k,int b,int flag,int mini_batch_size,int n_threads,uint64_t batch_size)376 mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size)
377 {
378 	pipeline_t pl;
379 	if (fp == 0 || mm_bseq_eof(fp)) return 0;
380 	memset(&pl, 0, sizeof(pipeline_t));
381 	pl.mini_batch_size = (uint64_t)mini_batch_size < batch_size? mini_batch_size : batch_size;
382 	pl.batch_size = batch_size;
383 	pl.fp = fp;
384 	pl.mi = mm_idx_init(w, k, b, flag);
385 
386 	kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
387 	if (mm_verbose >= 3)
388 		fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
389 
390 	mm_idx_post(pl.mi, n_threads);
391 	if (mm_verbose >= 3)
392 		fprintf(stderr, "[M::%s::%.3f*%.2f] sorted minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
393 
394 	return pl.mi;
395 }
396 
mm_idx_build(const char * fn,int w,int k,int flag,int n_threads)397 mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) // a simpler interface; deprecated
398 {
399 	mm_bseq_file_t *fp;
400 	mm_idx_t *mi;
401 	fp = mm_bseq_open(fn);
402 	if (fp == 0) return 0;
403 	mi = mm_idx_gen(fp, w, k, 14, flag, 1<<18, n_threads, UINT64_MAX);
404 	mm_bseq_close(fp);
405 	return mi;
406 }
407 
mm_idx_str(int w,int k,int is_hpc,int bucket_bits,int n,const char ** seq,const char ** name)408 mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name)
409 {
410 	uint64_t sum_len = 0;
411 	mm128_v a = {0,0,0};
412 	mm_idx_t *mi;
413 	khash_t(str) *h;
414 	int i, flag = 0;
415 
416 	if (n <= 0) return 0;
417 	for (i = 0; i < n; ++i) // get the total length
418 		sum_len += strlen(seq[i]);
419 	if (is_hpc) flag |= MM_I_HPC;
420 	if (name == 0) flag |= MM_I_NO_NAME;
421 	if (bucket_bits < 0) bucket_bits = 14;
422 	mi = mm_idx_init(w, k, bucket_bits, flag);
423 	mi->n_seq = n;
424 	mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, n, sizeof(mm_idx_seq_t)); // ->seq is allocated from km
425 	mi->S = (uint32_t*)calloc((sum_len + 7) / 8, 4);
426 	mi->h = h = kh_init(str);
427 	for (i = 0, sum_len = 0; i < n; ++i) {
428 		const char *s = seq[i];
429 		mm_idx_seq_t *p = &mi->seq[i];
430 		uint32_t j;
431 		if (name && name[i]) {
432 			int absent;
433 			p->name = (char*)kmalloc(mi->km, strlen(name[i]) + 1);
434 			strcpy(p->name, name[i]);
435 			kh_put(str, h, p->name, &absent);
436 			assert(absent);
437 		}
438 		p->offset = sum_len;
439 		p->len = strlen(s);
440 		p->is_alt = 0;
441 		for (j = 0; j < p->len; ++j) {
442 			int c = seq_nt4_table[(uint8_t)s[j]];
443 			uint64_t o = sum_len + j;
444 			mm_seq4_set(mi->S, o, c);
445 		}
446 		sum_len += p->len;
447 		if (p->len > 0) {
448 			a.n = 0;
449 			mm_sketch(0, s, p->len, w, k, i, is_hpc, &a);
450 			mm_idx_add(mi, a.n, a.a);
451 		}
452 	}
453 	free(a.a);
454 	mm_idx_post(mi, 1);
455 	return mi;
456 }
457 
458 /*************
459  * index I/O *
460  *************/
461 
mm_idx_dump(FILE * fp,const mm_idx_t * mi)462 void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
463 {
464 	uint64_t sum_len = 0;
465 	uint32_t x[5], i;
466 
467 	x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->flag;
468 	fwrite(MM_IDX_MAGIC, 1, 4, fp);
469 	fwrite(x, 4, 5, fp);
470 	for (i = 0; i < mi->n_seq; ++i) {
471 		if (mi->seq[i].name) {
472 			uint8_t l = strlen(mi->seq[i].name);
473 			fwrite(&l, 1, 1, fp);
474 			fwrite(mi->seq[i].name, 1, l, fp);
475 		} else {
476 			uint8_t l = 0;
477 			fwrite(&l, 1, 1, fp);
478 		}
479 		fwrite(&mi->seq[i].len, 4, 1, fp);
480 		sum_len += mi->seq[i].len;
481 	}
482 	for (i = 0; i < 1<<mi->b; ++i) {
483 		mm_idx_bucket_t *b = &mi->B[i];
484 		khint_t k;
485 		idxhash_t *h = (idxhash_t*)b->h;
486 		uint32_t size = h? h->size : 0;
487 		fwrite(&b->n, 4, 1, fp);
488 		fwrite(b->p, 8, b->n, fp);
489 		fwrite(&size, 4, 1, fp);
490 		if (size == 0) continue;
491 		for (k = 0; k < kh_end(h); ++k) {
492 			uint64_t x[2];
493 			if (!kh_exist(h, k)) continue;
494 			x[0] = kh_key(h, k), x[1] = kh_val(h, k);
495 			fwrite(x, 8, 2, fp);
496 		}
497 	}
498 	if (!(mi->flag & MM_I_NO_SEQ))
499 		fwrite(mi->S, 4, (sum_len + 7) / 8, fp);
500 	fflush(fp);
501 }
502 
mm_idx_load(FILE * fp)503 mm_idx_t *mm_idx_load(FILE *fp)
504 {
505 	char magic[4];
506 	uint32_t x[5], i;
507 	uint64_t sum_len = 0;
508 	mm_idx_t *mi;
509 
510 	if (fread(magic, 1, 4, fp) != 4) return 0;
511 	if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
512 	if (fread(x, 4, 5, fp) != 5) return 0;
513 	mi = mm_idx_init(x[0], x[1], x[2], x[4]);
514 	mi->n_seq = x[3];
515 	mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, mi->n_seq, sizeof(mm_idx_seq_t));
516 	for (i = 0; i < mi->n_seq; ++i) {
517 		uint8_t l;
518 		mm_idx_seq_t *s = &mi->seq[i];
519 		fread(&l, 1, 1, fp);
520 		if (l) {
521 			s->name = (char*)kmalloc(mi->km, l + 1);
522 			fread(s->name, 1, l, fp);
523 			s->name[l] = 0;
524 		}
525 		fread(&s->len, 4, 1, fp);
526 		s->offset = sum_len;
527 		s->is_alt = 0;
528 		sum_len += s->len;
529 	}
530 	for (i = 0; i < 1<<mi->b; ++i) {
531 		mm_idx_bucket_t *b = &mi->B[i];
532 		uint32_t j, size;
533 		khint_t k;
534 		idxhash_t *h;
535 		fread(&b->n, 4, 1, fp);
536 		b->p = (uint64_t*)malloc(b->n * 8);
537 		fread(b->p, 8, b->n, fp);
538 		fread(&size, 4, 1, fp);
539 		if (size == 0) continue;
540 		b->h = h = kh_init(idx);
541 		kh_resize(idx, h, size);
542 		for (j = 0; j < size; ++j) {
543 			uint64_t x[2];
544 			int absent;
545 			fread(x, 8, 2, fp);
546 			k = kh_put(idx, h, x[0], &absent);
547 			assert(absent);
548 			kh_val(h, k) = x[1];
549 		}
550 	}
551 	if (!(mi->flag & MM_I_NO_SEQ)) {
552 		mi->S = (uint32_t*)malloc((sum_len + 7) / 8 * 4);
553 		fread(mi->S, 4, (sum_len + 7) / 8, fp);
554 	}
555 	return mi;
556 }
557 
mm_idx_is_idx(const char * fn)558 int64_t mm_idx_is_idx(const char *fn)
559 {
560 	int fd, is_idx = 0;
561 	int64_t ret, off_end;
562 	char magic[4];
563 
564 	if (strcmp(fn, "-") == 0) return 0; // read from pipe; not an index
565 	fd = open(fn, O_RDONLY);
566 	if (fd < 0) return -1; // error
567 #ifdef WIN32
568 	if ((off_end = _lseeki64(fd, 0, SEEK_END)) >= 4) {
569 		_lseeki64(fd, 0, SEEK_SET);
570 #else
571 	if ((off_end = lseek(fd, 0, SEEK_END)) >= 4) {
572 		lseek(fd, 0, SEEK_SET);
573 #endif // WIN32
574 		ret = read(fd, magic, 4);
575 		if (ret == 4 && strncmp(magic, MM_IDX_MAGIC, 4) == 0)
576 			is_idx = 1;
577 	}
578 	close(fd);
579 	return is_idx? off_end : 0;
580 }
581 
582 mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out)
583 {
584 	int64_t is_idx;
585 	mm_idx_reader_t *r;
586 	is_idx = mm_idx_is_idx(fn);
587 	if (is_idx < 0) return 0; // failed to open the index
588 	r = (mm_idx_reader_t*)calloc(1, sizeof(mm_idx_reader_t));
589 	r->is_idx = is_idx;
590 	if (opt) r->opt = *opt;
591 	else mm_idxopt_init(&r->opt);
592 	if (r->is_idx) {
593 		r->fp.idx = fopen(fn, "rb");
594 		r->idx_size = is_idx;
595 	} else r->fp.seq = mm_bseq_open(fn);
596 	if (fn_out) r->fp_out = fopen(fn_out, "wb");
597 	return r;
598 }
599 
600 void mm_idx_reader_close(mm_idx_reader_t *r)
601 {
602 	if (r->is_idx) fclose(r->fp.idx);
603 	else mm_bseq_close(r->fp.seq);
604 	if (r->fp_out) fclose(r->fp_out);
605 	free(r);
606 }
607 
608 mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
609 {
610 	mm_idx_t *mi;
611 	if (r->is_idx) {
612 		mi = mm_idx_load(r->fp.idx);
613 		if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || (mi->flag&MM_I_HPC) != (r->opt.flag&MM_I_HPC)))
614 			fprintf(stderr, "[WARNING]\033[1;31m Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\033[0m\n");
615 	} else
616 		mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size);
617 	if (mi) {
618 		if (r->fp_out) mm_idx_dump(r->fp_out, mi);
619 		mi->index = r->n_parts++;
620 	}
621 	return mi;
622 }
623 
624 int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases, mm_bseq_eof() might not work
625 {
626 	return r->is_idx? (feof(r->fp.idx) || ftell(r->fp.idx) == r->idx_size) : mm_bseq_eof(r->fp.seq);
627 }
628 
629 #include <ctype.h>
630 #include <zlib.h>
631 #include "ksort.h"
632 #include "kseq.h"
633 KSTREAM_DECLARE(gzFile, gzread)
634 
635 int mm_idx_alt_read(mm_idx_t *mi, const char *fn)
636 {
637 	int n_alt = 0;
638 	gzFile fp;
639 	kstream_t *ks;
640 	kstring_t str = {0,0,0};
641 	fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
642 	if (fp == 0) return -1;
643 	ks = ks_init(fp);
644 	if (mi->h == 0) mm_idx_index_name(mi);
645 	while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
646 		char *p;
647 		int id;
648 		for (p = str.s; *p && !isspace(*p); ++p) { }
649 		*p = 0;
650 		id = mm_idx_name2id(mi, str.s);
651 		if (id >= 0) mi->seq[id].is_alt = 1, ++n_alt;
652 	}
653 	mi->n_alt = n_alt;
654 	if (mm_verbose >= 3)
655 		fprintf(stderr, "[M::%s] found %d ALT contigs\n", __func__, n_alt);
656 	return n_alt;
657 }
658 
659 #define sort_key_bed(a) ((a).st)
660 KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4)
661 
662 mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc)
663 {
664 	gzFile fp;
665 	kstream_t *ks;
666 	kstring_t str = {0,0,0};
667 	mm_idx_intv_t *I;
668 
669 	fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
670 	if (fp == 0) return 0;
671 	I = (mm_idx_intv_t*)calloc(mi->n_seq, sizeof(*I));
672 	ks = ks_init(fp);
673 	while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
674 		mm_idx_intv_t *r;
675 		mm_idx_intv1_t t = {-1,-1,-1,-1,0};
676 		char *p, *q, *bl, *bs;
677 		int32_t i, id = -1, n_blk = 0;
678 		for (p = q = str.s, i = 0;; ++p) {
679 			if (*p == 0 || *p == '\t') {
680 				int32_t c = *p;
681 				*p = 0;
682 				if (i == 0) { // chr
683 					id = mm_idx_name2id(mi, q);
684 					if (id < 0) break; // unknown name; TODO: throw a warning
685 				} else if (i == 1) { // start
686 					t.st = atol(q); // TODO: watch out integer overflow!
687 					if (t.st < 0) break;
688 				} else if (i == 2) { // end
689 					t.en = atol(q);
690 					if (t.en < 0) break;
691 				} else if (i == 4) { // BED score
692 					t.score = atol(q);
693 				} else if (i == 5) { // strand
694 					t.strand = *q == '+'? 1 : *q == '-'? -1 : 0;
695 				} else if (i == 9) {
696 					if (!isdigit(*q)) break;
697 					n_blk = atol(q);
698 				} else if (i == 10) {
699 					bl = q;
700 				} else if (i == 11) {
701 					bs = q;
702 					break;
703 				}
704 				if (c == 0) break;
705 				++i, q = p + 1;
706 			}
707 		}
708 		if (id < 0 || t.st < 0 || t.st >= t.en) continue;
709 		r = &I[id];
710 		if (i >= 11 && read_junc) { // BED12
711 			int32_t st, sz, en;
712 			st = strtol(bs, &bs, 10); ++bs;
713 			sz = strtol(bl, &bl, 10); ++bl;
714 			en = t.st + st + sz;
715 			for (i = 1; i < n_blk; ++i) {
716 				mm_idx_intv1_t s = t;
717 				if (r->n == r->m) {
718 					r->m = r->m? r->m + (r->m>>1) : 16;
719 					r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m);
720 				}
721 				st = strtol(bs, &bs, 10); ++bs;
722 				sz = strtol(bl, &bl, 10); ++bl;
723 				s.st = en, s.en = t.st + st;
724 				en = t.st + st + sz;
725 				if (s.en > s.st) r->a[r->n++] = s;
726 			}
727 		} else {
728 			if (r->n == r->m) {
729 				r->m = r->m? r->m + (r->m>>1) : 16;
730 				r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m);
731 			}
732 			r->a[r->n++] = t;
733 		}
734 	}
735 	free(str.s);
736 	ks_destroy(ks);
737 	gzclose(fp);
738 	return I;
739 }
740 
741 int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc)
742 {
743 	int32_t i;
744 	if (mi->h == 0) mm_idx_index_name(mi);
745 	mi->I = mm_idx_read_bed(mi, fn, read_junc);
746 	if (mi->I == 0) return -1;
747 	for (i = 0; i < mi->n_seq; ++i) // TODO: eliminate redundant intervals
748 		radix_sort_bed(mi->I[i].a, mi->I[i].a + mi->I[i].n);
749 	return 0;
750 }
751 
752 int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s)
753 {
754 	int32_t i, left, right;
755 	mm_idx_intv_t *r;
756 	memset(s, 0, en - st);
757 	if (mi->I == 0 || ctg < 0 || ctg >= mi->n_seq) return -1;
758 	r = &mi->I[ctg];
759 	left = 0, right = r->n;
760 	while (right > left) {
761 		int32_t mid = left + ((right - left) >> 1);
762 		if (r->a[mid].st >= st) right = mid;
763 		else left = mid + 1;
764 	}
765 	for (i = left; i < r->n; ++i) {
766 		if (st <= r->a[i].st && en >= r->a[i].en && r->a[i].strand != 0) {
767 			if (r->a[i].strand > 0) {
768 				s[r->a[i].st - st] |= 1, s[r->a[i].en - 1 - st] |= 2;
769 			} else {
770 				s[r->a[i].st - st] |= 8, s[r->a[i].en - 1 - st] |= 4;
771 			}
772 		}
773 	}
774 	return left;
775 }
776