1 /*
2  *
3  * Copyright (c) 2011, Jue Ruan <ruanjue@gmail.com>
4  *
5  *
6  * This program is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "rainbow.h"
21 
lend_ulist_div(Div * div)22 u32list* lend_ulist_div(Div *div){
23 	u32list *list;
24 	if(!pop_u32slist(div->cache, &list)) list = init_u32list(4);
25 	return list;
26 }
27 
return_ulist_div(Div * div,u32list * list)28 void return_ulist_div(Div *div, u32list *list){ if(list){ clear_u32list(list); push_u32slist(div->cache, list); } }
29 
30 typedef struct {
31 	uint32_t cnt;
32 	uint32_t base;
33 } BaseCnt;
34 
C_N_2(uint32_t n)35 static inline uint32_t C_N_2(uint32_t n){
36 	if(n == 0) return 0;
37 	else return n * (n - 1) / 2;
38 }
_call_key_col(Div * div,uint32_t gid)39 uint32_t _call_key_col(Div *div, uint32_t gid){
40 	ReadInfo *rd;
41 	u32list *grp;
42 	uint32_t i, j, col, row, key, c, max_non, tol, base;
43 	BaseCnt cnts[4];
44 	key = div->n_col;
45 	base = 0;
46 	grp = get_u32slist(div->grps, gid);
47 	max_non = 0;
48 	for(col=0;col<div->n_col;col++){
49 		for(i=0;i<4;i++){ cnts[i].base = i; cnts[i].cnt = 0; }
50 		for(row=0;row<count_u32list(grp);row++){
51 			rd = ref_rilist(div->rds, get_u32list(grp, row));
52 			if(rd->seqlen1 <= col) continue;
53 //			c  = base_bit_table[(int)get_u8list(div->seqs, rd->seqoff + col)];
54 			c = div->seqs->buffer[rd->seqoff + col];
55 			cnts[c&0x03].cnt ++;
56 		}
57 		tol = cnts[0].cnt + cnts[1].cnt + cnts[2].cnt + cnts[3].cnt;
58 		if(tol == 0) break;
59 		for(i=0;i<2;i++){
60 			for(j=3;j>i;j--){
61 				if(cnts[j].cnt > cnts[j-1].cnt){
62 					swap_tmp(cnts[j].cnt, cnts[j-1].cnt, c);
63 					swap_tmp(cnts[j].base, cnts[j-1].base, c);
64 				}
65 			}
66 		}
67 		if(cnts[1].cnt < div->k_allele) continue;
68 		if(cnts[1].cnt < div->K_allele && cnts[1].cnt < div->min_freq * tol) continue;
69 		if(cnts[1].cnt > max_non){
70 			max_non = cnts[1].cnt;
71 			key = col;
72 			base = cnts[1].base;
73 		}
74 	}
75 	return (key << 2) | base;
76 }
77 
call_key_col(Div * div,uint32_t gid)78 uint32_t call_key_col(Div *div, uint32_t gid){
79 	ReadInfo *rd;
80 	u32list *grp;
81 	uint32_t i, j, k, col, row, key, c, tol, base, s1, s2;
82 	BaseCnt cnts[4];
83 	col_base_t *cb;
84 	uint64_t MM1, MM2;
85 	uint32_t n_p1, n_p2, idx;
86 	double min_mm, mm1, mm2;
87 	key = div->n_col;
88 	base = 0;
89 	grp = get_u32slist(div->grps, gid);
90 	clear_cbv(div->cbs);
91 	for(col=0;col<div->n_col;col++){
92 		for(i=0;i<4;i++){ cnts[i].base = i; cnts[i].cnt = 0; }
93 		for(row=0;row<count_u32list(grp);row++){
94 			rd = ref_rilist(div->rds, get_u32list(grp, row));
95 			if(rd->seqlen1 <= col) continue;
96 			c = div->seqs->buffer[rd->seqoff + col];
97 			cnts[c&0x03].cnt ++;
98 		}
99 		tol = cnts[0].cnt + cnts[1].cnt + cnts[2].cnt + cnts[3].cnt;
100 		if(tol == 0) break;
101 		for(i=0;i<2;i++){
102 			for(j=3;j>i;j--){
103 				if(cnts[j].cnt > cnts[j-1].cnt){
104 					swap_tmp(cnts[j].cnt, cnts[j-1].cnt, c);
105 					swap_tmp(cnts[j].base, cnts[j-1].base, c);
106 				}
107 			}
108 		}
109 		if(cnts[1].cnt < div->k_allele) continue;
110 		if(cnts[1].cnt < div->K_allele && cnts[1].cnt < div->min_freq * tol) continue;
111 		cb = next_ref_cbv(div->cbs);
112 		cb->col  = col;
113 		cb->base = cnts[1].base;
114 		cb->cnt  = cnts[1].cnt;
115 	}
116 	if(div->cbs->size == 1){
117 		key = ref_cbv(div->cbs, 0)->col;
118 		base = ref_cbv(div->cbs, 0)->base;
119 	}
120 	if(div->cbs->size > 1){
121 		encap_u32list(div->ps1, div->n_col * 4);
122 		encap_u32list(div->ps2, div->n_col * 4);
123 		min_mm = 10000000;
124 		for(i=0;i<div->cbs->size;i++){
125 			cb = ref_cbv(div->cbs, i);
126 			n_p1 = cb->cnt;
127 			n_p2 = grp->size - cb->cnt;
128 			memset(div->ps1->buffer, 0, div->n_col * 4 * 4);
129 			memset(div->ps2->buffer, 0, div->n_col * 4 * 4);
130 			for(row=0;row<grp->size;row++){
131 				rd = ref_rilist(div->rds, get_u32list(grp, row));
132 				if(rd->seqlen1 <= cb->col) idx = 1;
133 				else idx = (div->seqs->buffer[rd->seqoff + cb->col] != cb->base);
134 				if(idx){
135 					for(j=0;j<div->n_col;j++){
136 						div->ps2->buffer[div->seqs->buffer[rd->seqoff + j] + 4 * j] ++;
137 					}
138 				} else {
139 					for(j=0;j<div->n_col;j++){
140 						div->ps1->buffer[div->seqs->buffer[rd->seqoff + j] + 4 * j] ++;
141 					}
142 				}
143 			}
144 			MM1 = MM2 = 0;
145 			for(j=0;j<div->n_col;j++){
146 				if(j == i) continue;
147 				s1 = C_N_2(n_p1);
148 				s2 = C_N_2(n_p2);
149 				for (k = 0; k < 4; k++) {
150 					s1 -= C_N_2(div->ps1->buffer[k + 4 * j]);
151 					s2 -= C_N_2(div->ps2->buffer[k + 4 * j]);
152 				}
153 				MM1 += s1;
154 				MM2 += s2;
155 				//fprintf(stdout, " -- %u %u in %s -- %s:%d --\n", s1, s2, __FUNCTION__, __FILE__, __LINE__);
156 			}
157 			//fprintf(stdout, "col%d mm1 %lld\n", cb->col, MM1);
158 			//fprintf(stdout, "col%d mm2 %lld\n", cb->col, MM2);
159 			mm1 = ((long double)MM1) / (n_p1*(n_p1-1)/2);
160 			mm2 = ((long double)MM2) / (n_p2*(n_p2-1)/2);
161 			if(mm1 < mm2) mm1 = mm2;
162 			//fprintf(stdout, "gid%u col%d %f\n", gid, cb->col, mm1);
163 			if(mm1 - min_mm < 0.00000000001){
164 				min_mm = mm1;
165 				key = cb->col;
166 				base = cb->base;
167 			}
168 		}
169 	}
170 	return (key << 2) | base;
171 }
172 
dividing_core(Div * div,uint32_t gid,int dep)173 void dividing_core(Div *div, uint32_t gid, int dep){
174 	ReadInfo *rd;
175 	u32list *grp, *sub;
176 //	uint64_t mark0;
177 	uint32_t i, j, col, rid, gids[2], b;
178 	col = _call_key_col(div, gid);
179 	b = col & 0x03;
180 	col >>= 2;
181 	if(col >= div->n_col || div->rds->size < div->K_allele || dep > 255){
182 		push_u32list(div->gids, gid);
183 		push_u32list(div->deps, dep);
184 		return;
185 	}
186 	for(i=0;i<2;i++){
187 		gids[i] = count_u32slist(div->grps);
188 		sub = lend_ulist_div(div);
189 		push_u32slist(div->grps, sub);
190 	}
191 	grp = get_u32slist(div->grps, gid);
192 	/*
193 	char str[257];
194 	for(i=0;(int)i<dep;i++){
195 		mark0 = get_u64list(div->markers[i/64], gid);
196 		str[i] = '0' + ((mark0 >> (i%64))& 0x01);
197 	}
198 	str[i] = '\0';
199 	fprintf(stderr, "%s\t%d\t%c\n", str, col, "ACGT"[b]);
200 	for (j = 0; j < 4; j++) {
201 		push_u64list(div->markers[j], get_u64list(div->markers[j], gid));
202 		push_u64list(div->markers[j], get_u64list(div->markers[j], gid));
203 	}
204 	if (dep <= 255) {
205 		set_u64list(div->markers[dep/64], gid+1, get_u64list(div->markers[j], gid) | (1LLU << (dep%64)));
206 	}
207 	*/
208 
209 //	if (dep <= 255) {
210 	for (j = 0; (int)j < dep/64; j++) {
211 		push_u64list(div->markers[j], get_u64list(div->markers[j], gid));
212 		push_u64list(div->markers[j], get_u64list(div->markers[j], gid));
213 	}
214 		push_u64list(div->markers[j], get_u64list(div->markers[j], gid));
215 		push_u64list(div->markers[j], get_u64list(div->markers[j], gid) | (1LLU << (dep%64)));
216 	j++;
217 	for (; j < 4; j++) {
218 		push_u64list(div->markers[j], 0);
219 		push_u64list(div->markers[j], 0);
220 	}
221 //	}
222 
223 	for(i=0;i<count_u32list(grp);i++){
224 		rid = get_u32list(grp, i);
225 		rd = ref_rilist(div->rds, rid);
226 		if(rd->seqlen1 >= col && div->seqs->buffer[rd->seqoff + col] == b){
227 			push_u32list(get_u32slist(div->grps, gids[1]), rid);
228 		} else {
229 			push_u32list(get_u32slist(div->grps, gids[0]), rid);
230 		}
231 	}
232 	for(i=0;i<2;i++){
233 //		if(count_u32list(get_u32slist(div->grps, gids[i])) > 2 * div->k_allele) dividing_core(div, gids[i], dep + 1);
234 		dividing_core(div, gids[i], dep + 1);
235 	}
236 }
237 
dividing(Div * div,uint32_t old_gid,FILE * out)238 void dividing(Div *div, uint32_t old_gid, FILE *out){
239 	ReadInfo *rd;
240 	u32list *grp;
241 	uint64_t marker;
242 	uint32_t i, j, k, gid, dep;
243 	char route[257];
244 	String *seq1, *seq2;
245 	seq1 = init_string(1024);
246 	seq2 = init_string(1024);
247 	for (i = 0; i < 4; i++) {
248 		clear_u64list(div->markers[i]);
249 		push_u64list(div->markers[i], 0);
250 	}
251 	dividing_core(div, 0, 0);
252 	for(i=0;i<count_u32list(div->gids);i++){
253 		grp = get_u32slist(div->grps, get_u32list(div->gids, i));
254 		dep = get_u32list(div->deps, i);
255 		if (dep>255) dep = 255;
256 //		marker1 = get_u64list(div->markers1, get_u32list(div->gids, i));
257 //		marker2 = get_u64list(div->markers2, get_u32list(div->gids, i));
258 		for(j=0;j<dep;j++){
259 			marker = get_u64list(div->markers[j/64], get_u32list(div->gids, i));
260 			route[j] = '0' + ((marker >> (j%64)) & 0x01);
261 		}
262 		route[dep] = 0;
263 		gid = ++div->gidoff;
264 		for(j=0;j<count_u32list(grp);j++){
265 			rd = ref_rilist(div->rds, get_u32list(grp, j));
266 			for(k=0;k<rd->seqlen1;k++) seq1->string[k] = bit_base_table[div->seqs->buffer[rd->seqoff + k]];
267 			seq1->string[k] = 0;
268 			for(k=0;k<rd->seqlen2;k++) seq2->string[k] = bit_base_table[div->seqs->buffer[rd->seqoff + rd->seqlen1 + k]];
269 			seq2->string[k] = 0;
270 			fprintf(out, "%u\t%u\t%s\t%s\t%u\t%s\n",
271 				rd->seqid, gid, seq1->string, seq2->string, old_gid, route);
272 		}
273 	}
274 	fflush(out);
275 	old_gid = old_gid;
276 	free_string(seq1);
277 	free_string(seq2);
278 }
279 
init_div(uint32_t k_allele,uint32_t K_allele,float min_freq)280 Div* init_div(uint32_t k_allele, uint32_t K_allele, float min_freq){
281 	Div *div; int i;
282 	div = malloc(sizeof(Div));
283 	div->gidoff   = 0;
284 	div->n_col    = 0;
285 	div->k_allele = k_allele;
286 	div->K_allele = K_allele;
287 	div->min_freq = min_freq;
288 	div->rds   = init_rilist(128);
289 	div->seqs  = init_u8list(128 * 80);
290 	div->grps  = init_u32slist(64);
291 	for (i = 0; i < 4; i++) {
292 		div->markers[i] = init_u64list(64);
293 	}
294 	div->deps = init_u32list(64);
295 	div->cache = init_u32slist(64);
296 	div->gids  = init_u32list(8);
297 	div->cbs = init_cbv(12);
298 	div->ps1 = init_u32list(32);
299 	div->ps2 = init_u32list(32);
300 	return div;
301 }
302 
reset_div(Div * div)303 void reset_div(Div *div){
304 	uint32_t i;
305 	clear_rilist(div->rds);
306 	clear_u8list(div->seqs);
307 	for(i=0;i<count_u32slist(div->grps);i++){
308 		return_ulist_div(div, get_u32slist(div->grps, i));
309 	}
310 	clear_u32slist(div->grps);
311 	clear_u32list(div->gids);
312 	for (i = 0; i < 4; i++) {
313 		clear_u64list(div->markers[i]);
314 	}
315 	clear_u32list(div->deps);
316 	div->n_col = 0;
317 }
318 
free_div(Div * div)319 void free_div(Div *div){
320 	uint32_t i;
321 	reset_div(div);
322 	free_rilist(div->rds);
323 	free_u8list(div->seqs);
324 	free_u32slist(div->grps);
325 	for(i=0;i<count_u32slist(div->cache);i++){
326 		free_u32list(get_u32slist(div->cache, i));
327 	}
328 	free_u32list(div->ps1);
329 	free_u32list(div->ps2);
330 	free_cbv(div->cbs);
331 	free_u32slist(div->cache);
332 	free_u32list(div->gids);
333 	for (i = 0; i < 4; i++) {
334 		free_u64list(div->markers[i]);
335 	}
336 	free_u32list(div->deps);
337 	free(div);
338 }
339 
div_reads(Div * div,FileReader * fr,FILE * out)340 uint32_t div_reads(Div *div, FileReader *fr, FILE *out){
341 	ReadInfo *rd;
342 	uint32_t seqid, rank, gid, last_gid, rid, ret;
343 	char *seq1, *seq2;
344 	int i;
345 	last_gid = 0;
346 	ret = 0;
347 	while(fread_table(fr) != -1){
348 		seqid = atoll(get_col_str(fr, 0));
349 		rank  = 1;
350 		gid   = atoll(get_col_str(fr, 1));
351 		seq1  = get_col_str(fr, 2);
352 		seq2  = get_col_str(fr, 3);
353 		if(gid != last_gid){
354 			ret ++;
355 			if(last_gid) dividing(div, last_gid, out);
356 			last_gid = gid;
357 			reset_div(div);
358 			push_u32slist(div->grps, lend_ulist_div(div));
359 		}
360 		if(get_col_len(fr, 2) > (int)div->n_col) div->n_col = get_col_len(fr, 2);
361 		rid = count_rilist(div->rds);
362 		rd  = next_ref_rilist(div->rds);
363 		rd->seqid   = seqid;
364 		rd->rank    = rank;
365 		rd->seqoff  = div->seqs->size;
366 		rd->seqlen1 = get_col_len(fr, 2);
367 		rd->seqlen2 = get_col_len(fr, 3);
368 		for(i=0;i<rd->seqlen1;i++) push_u8list(div->seqs, base_bit_table[(int)seq1[i]]);
369 		for(i=0;i<rd->seqlen2;i++) push_u8list(div->seqs, base_bit_table[(int)seq2[i]]);
370 		push_u32list(get_u32slist(div->grps, 0), rid);
371 	}
372 	if(last_gid) dividing(div, last_gid, out);
373 	return ret;
374 }
375 
376