1 #include "mmpriv.h"
2 #include "kalloc.h"
3 #include "ksort.h"
4 
mm_seed_mz_flt(void * km,mm128_v * mv,int32_t q_occ_max,float q_occ_frac)5 void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac)
6 {
7 	mm128_t *a;
8 	size_t i, j, st;
9 	if (mv->n <= q_occ_max || q_occ_frac <= 0.0f || q_occ_max <= 0) return;
10 	KMALLOC(km, a, mv->n);
11 	for (i = 0; i < mv->n; ++i)
12 		a[i].x = mv->a[i].x, a[i].y = i;
13 	radix_sort_128x(a, a + mv->n);
14 	for (st = 0, i = 1; i <= mv->n; ++i) {
15 		if (i == mv->n || a[i].x != a[st].x) {
16 			int32_t cnt = i - st;
17 			if (cnt > q_occ_max && cnt > mv->n * q_occ_frac)
18 				for (j = st; j < i; ++j)
19 					mv->a[a[j].y].x = 0;
20 			st = i;
21 		}
22 	}
23 	kfree(km, a);
24 	for (i = j = 0; i < mv->n; ++i)
25 		if (mv->a[i].x != 0)
26 			mv->a[j++] = mv->a[i];
27 	mv->n = j;
28 }
29 
mm_seed_collect_all(void * km,const mm_idx_t * mi,const mm128_v * mv,int32_t * n_m_)30 mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_)
31 {
32 	mm_seed_t *m;
33 	size_t i;
34 	int32_t k;
35 	m = (mm_seed_t*)kmalloc(km, mv->n * sizeof(mm_seed_t));
36 	for (i = k = 0; i < mv->n; ++i) {
37 		const uint64_t *cr;
38 		mm_seed_t *q;
39 		mm128_t *p = &mv->a[i];
40 		uint32_t q_pos = (uint32_t)p->y, q_span = p->x & 0xff;
41 		int t;
42 		cr = mm_idx_get(mi, p->x>>8, &t);
43 		if (t == 0) continue;
44 		q = &m[k++];
45 		q->q_pos = q_pos, q->q_span = q_span, q->cr = cr, q->n = t, q->seg_id = p->y >> 32;
46 		q->is_tandem = q->flt = 0;
47 		if (i > 0 && p->x>>8 == mv->a[i - 1].x>>8) q->is_tandem = 1;
48 		if (i < mv->n - 1 && p->x>>8 == mv->a[i + 1].x>>8) q->is_tandem = 1;
49 	}
50 	*n_m_ = k;
51 	return m;
52 }
53 
54 #define MAX_MAX_HIGH_OCC 128
55 
mm_seed_select(int32_t n,mm_seed_t * a,int len,int max_occ,int max_max_occ,int dist)56 void mm_seed_select(int32_t n, mm_seed_t *a, int len, int max_occ, int max_max_occ, int dist)
57 { // for high-occ minimizers, choose up to max_high_occ in each high-occ streak
58 	extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*);
59 	extern void ks_heapmake_uint64_t(size_t n, uint64_t*);
60 	int32_t i, last0, m;
61 	uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation
62 
63 	if (n == 0 || n == 1) return;
64 	for (i = m = 0; i < n; ++i)
65 		if (a[i].n > max_occ) ++m;
66 	if (m == 0) return; // no high-frequency k-mers; do nothing
67 	for (i = 0, last0 = -1; i <= n; ++i) {
68 		if (i == n || a[i].n <= max_occ) {
69 			if (i - last0 > 1) {
70 				int32_t ps = last0 < 0? 0 : (uint32_t)a[last0].q_pos>>1;
71 				int32_t pe = i == n? len : (uint32_t)a[i].q_pos>>1;
72 				int32_t j, k, st = last0 + 1, en = i;
73 				int32_t max_high_occ = (int32_t)((double)(pe - ps) / dist + .499);
74 				if (max_high_occ > 0) {
75 					if (max_high_occ > MAX_MAX_HIGH_OCC)
76 						max_high_occ = MAX_MAX_HIGH_OCC;
77 					for (j = st, k = 0; j < en && k < max_high_occ; ++j, ++k)
78 						b[k] = (uint64_t)a[j].n<<32 | j;
79 					ks_heapmake_uint64_t(k, b); // initialize the binomial heap
80 					for (; j < en; ++j) { // if there are more, choose top max_high_occ
81 						if (a[j].n < (int32_t)(b[0]>>32)) { // then update the heap
82 							b[0] = (uint64_t)a[j].n<<32 | j;
83 							ks_heapdown_uint64_t(0, k, b);
84 						}
85 					}
86 					for (j = 0; j < k; ++j) a[(uint32_t)b[j]].flt = 1;
87 				}
88 				for (j = st; j < en; ++j) a[j].flt ^= 1;
89 				for (j = st; j < en; ++j)
90 					if (a[j].n > max_max_occ)
91 						a[j].flt = 1;
92 			}
93 			last0 = i;
94 		}
95 	}
96 }
97 
mm_collect_matches(void * km,int * _n_m,int qlen,int max_occ,int max_max_occ,int dist,const mm_idx_t * mi,const mm128_v * mv,int64_t * n_a,int * rep_len,int * n_mini_pos,uint64_t ** mini_pos)98 mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos)
99 {
100 	int rep_st = 0, rep_en = 0, n_m, n_m0;
101 	size_t i;
102 	mm_seed_t *m;
103 	*n_mini_pos = 0;
104 	*mini_pos = (uint64_t*)kmalloc(km, mv->n * sizeof(uint64_t));
105 	m = mm_seed_collect_all(km, mi, mv, &n_m0);
106 	if (dist > 0 && max_max_occ > max_occ) {
107 		mm_seed_select(n_m0, m, qlen, max_occ, max_max_occ, dist);
108 	} else {
109 		for (i = 0; i < n_m0; ++i)
110 			if (m[i].n > max_occ)
111 				m[i].flt = 1;
112 	}
113 	for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < n_m0; ++i) {
114 		mm_seed_t *q = &m[i];
115 		//fprintf(stderr, "X\t%d\t%d\t%d\n", q->q_pos>>1, q->n, q->flt);
116 		if (q->flt) {
117 			int en = (q->q_pos >> 1) + 1, st = en - q->q_span;
118 			if (st > rep_en) {
119 				*rep_len += rep_en - rep_st;
120 				rep_st = st, rep_en = en;
121 			} else rep_en = en;
122 		} else {
123 			*n_a += q->n;
124 			(*mini_pos)[(*n_mini_pos)++] = (uint64_t)q->q_span<<32 | q->q_pos>>1;
125 			m[n_m++] = *q;
126 		}
127 	}
128 	*rep_len += rep_en - rep_st;
129 	*_n_m = n_m;
130 	return m;
131 }
132