1 /*
2 *
3 * Copyright (c) 2011, Jue Ruan <ruanjue@gmail.com>
4 *
5 *
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "rainbow.h"
21
lend_ulist_div(Div * div)22 u32list* lend_ulist_div(Div *div){
23 u32list *list;
24 if(!pop_u32slist(div->cache, &list)) list = init_u32list(4);
25 return list;
26 }
27
return_ulist_div(Div * div,u32list * list)28 void return_ulist_div(Div *div, u32list *list){ if(list){ clear_u32list(list); push_u32slist(div->cache, list); } }
29
30 typedef struct {
31 uint32_t cnt;
32 uint32_t base;
33 } BaseCnt;
34
C_N_2(uint32_t n)35 static inline uint32_t C_N_2(uint32_t n){
36 if(n == 0) return 0;
37 else return n * (n - 1) / 2;
38 }
_call_key_col(Div * div,uint32_t gid)39 uint32_t _call_key_col(Div *div, uint32_t gid){
40 ReadInfo *rd;
41 u32list *grp;
42 uint32_t i, j, col, row, key, c, max_non, tol, base;
43 BaseCnt cnts[4];
44 key = div->n_col;
45 base = 0;
46 grp = get_u32slist(div->grps, gid);
47 max_non = 0;
48 for(col=0;col<div->n_col;col++){
49 for(i=0;i<4;i++){ cnts[i].base = i; cnts[i].cnt = 0; }
50 for(row=0;row<count_u32list(grp);row++){
51 rd = ref_rilist(div->rds, get_u32list(grp, row));
52 if(rd->seqlen1 <= col) continue;
53 // c = base_bit_table[(int)get_u8list(div->seqs, rd->seqoff + col)];
54 c = div->seqs->buffer[rd->seqoff + col];
55 cnts[c&0x03].cnt ++;
56 }
57 tol = cnts[0].cnt + cnts[1].cnt + cnts[2].cnt + cnts[3].cnt;
58 if(tol == 0) break;
59 for(i=0;i<2;i++){
60 for(j=3;j>i;j--){
61 if(cnts[j].cnt > cnts[j-1].cnt){
62 swap_tmp(cnts[j].cnt, cnts[j-1].cnt, c);
63 swap_tmp(cnts[j].base, cnts[j-1].base, c);
64 }
65 }
66 }
67 if(cnts[1].cnt < div->k_allele) continue;
68 if(cnts[1].cnt < div->K_allele && cnts[1].cnt < div->min_freq * tol) continue;
69 if(cnts[1].cnt > max_non){
70 max_non = cnts[1].cnt;
71 key = col;
72 base = cnts[1].base;
73 }
74 }
75 return (key << 2) | base;
76 }
77
call_key_col(Div * div,uint32_t gid)78 uint32_t call_key_col(Div *div, uint32_t gid){
79 ReadInfo *rd;
80 u32list *grp;
81 uint32_t i, j, k, col, row, key, c, tol, base, s1, s2;
82 BaseCnt cnts[4];
83 col_base_t *cb;
84 uint64_t MM1, MM2;
85 uint32_t n_p1, n_p2, idx;
86 double min_mm, mm1, mm2;
87 key = div->n_col;
88 base = 0;
89 grp = get_u32slist(div->grps, gid);
90 clear_cbv(div->cbs);
91 for(col=0;col<div->n_col;col++){
92 for(i=0;i<4;i++){ cnts[i].base = i; cnts[i].cnt = 0; }
93 for(row=0;row<count_u32list(grp);row++){
94 rd = ref_rilist(div->rds, get_u32list(grp, row));
95 if(rd->seqlen1 <= col) continue;
96 c = div->seqs->buffer[rd->seqoff + col];
97 cnts[c&0x03].cnt ++;
98 }
99 tol = cnts[0].cnt + cnts[1].cnt + cnts[2].cnt + cnts[3].cnt;
100 if(tol == 0) break;
101 for(i=0;i<2;i++){
102 for(j=3;j>i;j--){
103 if(cnts[j].cnt > cnts[j-1].cnt){
104 swap_tmp(cnts[j].cnt, cnts[j-1].cnt, c);
105 swap_tmp(cnts[j].base, cnts[j-1].base, c);
106 }
107 }
108 }
109 if(cnts[1].cnt < div->k_allele) continue;
110 if(cnts[1].cnt < div->K_allele && cnts[1].cnt < div->min_freq * tol) continue;
111 cb = next_ref_cbv(div->cbs);
112 cb->col = col;
113 cb->base = cnts[1].base;
114 cb->cnt = cnts[1].cnt;
115 }
116 if(div->cbs->size == 1){
117 key = ref_cbv(div->cbs, 0)->col;
118 base = ref_cbv(div->cbs, 0)->base;
119 }
120 if(div->cbs->size > 1){
121 encap_u32list(div->ps1, div->n_col * 4);
122 encap_u32list(div->ps2, div->n_col * 4);
123 min_mm = 10000000;
124 for(i=0;i<div->cbs->size;i++){
125 cb = ref_cbv(div->cbs, i);
126 n_p1 = cb->cnt;
127 n_p2 = grp->size - cb->cnt;
128 memset(div->ps1->buffer, 0, div->n_col * 4 * 4);
129 memset(div->ps2->buffer, 0, div->n_col * 4 * 4);
130 for(row=0;row<grp->size;row++){
131 rd = ref_rilist(div->rds, get_u32list(grp, row));
132 if(rd->seqlen1 <= cb->col) idx = 1;
133 else idx = (div->seqs->buffer[rd->seqoff + cb->col] != cb->base);
134 if(idx){
135 for(j=0;j<div->n_col;j++){
136 div->ps2->buffer[div->seqs->buffer[rd->seqoff + j] + 4 * j] ++;
137 }
138 } else {
139 for(j=0;j<div->n_col;j++){
140 div->ps1->buffer[div->seqs->buffer[rd->seqoff + j] + 4 * j] ++;
141 }
142 }
143 }
144 MM1 = MM2 = 0;
145 for(j=0;j<div->n_col;j++){
146 if(j == i) continue;
147 s1 = C_N_2(n_p1);
148 s2 = C_N_2(n_p2);
149 for (k = 0; k < 4; k++) {
150 s1 -= C_N_2(div->ps1->buffer[k + 4 * j]);
151 s2 -= C_N_2(div->ps2->buffer[k + 4 * j]);
152 }
153 MM1 += s1;
154 MM2 += s2;
155 //fprintf(stdout, " -- %u %u in %s -- %s:%d --\n", s1, s2, __FUNCTION__, __FILE__, __LINE__);
156 }
157 //fprintf(stdout, "col%d mm1 %lld\n", cb->col, MM1);
158 //fprintf(stdout, "col%d mm2 %lld\n", cb->col, MM2);
159 mm1 = ((long double)MM1) / (n_p1*(n_p1-1)/2);
160 mm2 = ((long double)MM2) / (n_p2*(n_p2-1)/2);
161 if(mm1 < mm2) mm1 = mm2;
162 //fprintf(stdout, "gid%u col%d %f\n", gid, cb->col, mm1);
163 if(mm1 - min_mm < 0.00000000001){
164 min_mm = mm1;
165 key = cb->col;
166 base = cb->base;
167 }
168 }
169 }
170 return (key << 2) | base;
171 }
172
dividing_core(Div * div,uint32_t gid,int dep)173 void dividing_core(Div *div, uint32_t gid, int dep){
174 ReadInfo *rd;
175 u32list *grp, *sub;
176 // uint64_t mark0;
177 uint32_t i, j, col, rid, gids[2], b;
178 col = _call_key_col(div, gid);
179 b = col & 0x03;
180 col >>= 2;
181 if(col >= div->n_col || div->rds->size < div->K_allele || dep > 255){
182 push_u32list(div->gids, gid);
183 push_u32list(div->deps, dep);
184 return;
185 }
186 for(i=0;i<2;i++){
187 gids[i] = count_u32slist(div->grps);
188 sub = lend_ulist_div(div);
189 push_u32slist(div->grps, sub);
190 }
191 grp = get_u32slist(div->grps, gid);
192 /*
193 char str[257];
194 for(i=0;(int)i<dep;i++){
195 mark0 = get_u64list(div->markers[i/64], gid);
196 str[i] = '0' + ((mark0 >> (i%64))& 0x01);
197 }
198 str[i] = '\0';
199 fprintf(stderr, "%s\t%d\t%c\n", str, col, "ACGT"[b]);
200 for (j = 0; j < 4; j++) {
201 push_u64list(div->markers[j], get_u64list(div->markers[j], gid));
202 push_u64list(div->markers[j], get_u64list(div->markers[j], gid));
203 }
204 if (dep <= 255) {
205 set_u64list(div->markers[dep/64], gid+1, get_u64list(div->markers[j], gid) | (1LLU << (dep%64)));
206 }
207 */
208
209 // if (dep <= 255) {
210 for (j = 0; (int)j < dep/64; j++) {
211 push_u64list(div->markers[j], get_u64list(div->markers[j], gid));
212 push_u64list(div->markers[j], get_u64list(div->markers[j], gid));
213 }
214 push_u64list(div->markers[j], get_u64list(div->markers[j], gid));
215 push_u64list(div->markers[j], get_u64list(div->markers[j], gid) | (1LLU << (dep%64)));
216 j++;
217 for (; j < 4; j++) {
218 push_u64list(div->markers[j], 0);
219 push_u64list(div->markers[j], 0);
220 }
221 // }
222
223 for(i=0;i<count_u32list(grp);i++){
224 rid = get_u32list(grp, i);
225 rd = ref_rilist(div->rds, rid);
226 if(rd->seqlen1 >= col && div->seqs->buffer[rd->seqoff + col] == b){
227 push_u32list(get_u32slist(div->grps, gids[1]), rid);
228 } else {
229 push_u32list(get_u32slist(div->grps, gids[0]), rid);
230 }
231 }
232 for(i=0;i<2;i++){
233 // if(count_u32list(get_u32slist(div->grps, gids[i])) > 2 * div->k_allele) dividing_core(div, gids[i], dep + 1);
234 dividing_core(div, gids[i], dep + 1);
235 }
236 }
237
dividing(Div * div,uint32_t old_gid,FILE * out)238 void dividing(Div *div, uint32_t old_gid, FILE *out){
239 ReadInfo *rd;
240 u32list *grp;
241 uint64_t marker;
242 uint32_t i, j, k, gid, dep;
243 char route[257];
244 String *seq1, *seq2;
245 seq1 = init_string(1024);
246 seq2 = init_string(1024);
247 for (i = 0; i < 4; i++) {
248 clear_u64list(div->markers[i]);
249 push_u64list(div->markers[i], 0);
250 }
251 dividing_core(div, 0, 0);
252 for(i=0;i<count_u32list(div->gids);i++){
253 grp = get_u32slist(div->grps, get_u32list(div->gids, i));
254 dep = get_u32list(div->deps, i);
255 if (dep>255) dep = 255;
256 // marker1 = get_u64list(div->markers1, get_u32list(div->gids, i));
257 // marker2 = get_u64list(div->markers2, get_u32list(div->gids, i));
258 for(j=0;j<dep;j++){
259 marker = get_u64list(div->markers[j/64], get_u32list(div->gids, i));
260 route[j] = '0' + ((marker >> (j%64)) & 0x01);
261 }
262 route[dep] = 0;
263 gid = ++div->gidoff;
264 for(j=0;j<count_u32list(grp);j++){
265 rd = ref_rilist(div->rds, get_u32list(grp, j));
266 for(k=0;k<rd->seqlen1;k++) seq1->string[k] = bit_base_table[div->seqs->buffer[rd->seqoff + k]];
267 seq1->string[k] = 0;
268 for(k=0;k<rd->seqlen2;k++) seq2->string[k] = bit_base_table[div->seqs->buffer[rd->seqoff + rd->seqlen1 + k]];
269 seq2->string[k] = 0;
270 fprintf(out, "%u\t%u\t%s\t%s\t%u\t%s\n",
271 rd->seqid, gid, seq1->string, seq2->string, old_gid, route);
272 }
273 }
274 fflush(out);
275 old_gid = old_gid;
276 free_string(seq1);
277 free_string(seq2);
278 }
279
init_div(uint32_t k_allele,uint32_t K_allele,float min_freq)280 Div* init_div(uint32_t k_allele, uint32_t K_allele, float min_freq){
281 Div *div; int i;
282 div = malloc(sizeof(Div));
283 div->gidoff = 0;
284 div->n_col = 0;
285 div->k_allele = k_allele;
286 div->K_allele = K_allele;
287 div->min_freq = min_freq;
288 div->rds = init_rilist(128);
289 div->seqs = init_u8list(128 * 80);
290 div->grps = init_u32slist(64);
291 for (i = 0; i < 4; i++) {
292 div->markers[i] = init_u64list(64);
293 }
294 div->deps = init_u32list(64);
295 div->cache = init_u32slist(64);
296 div->gids = init_u32list(8);
297 div->cbs = init_cbv(12);
298 div->ps1 = init_u32list(32);
299 div->ps2 = init_u32list(32);
300 return div;
301 }
302
reset_div(Div * div)303 void reset_div(Div *div){
304 uint32_t i;
305 clear_rilist(div->rds);
306 clear_u8list(div->seqs);
307 for(i=0;i<count_u32slist(div->grps);i++){
308 return_ulist_div(div, get_u32slist(div->grps, i));
309 }
310 clear_u32slist(div->grps);
311 clear_u32list(div->gids);
312 for (i = 0; i < 4; i++) {
313 clear_u64list(div->markers[i]);
314 }
315 clear_u32list(div->deps);
316 div->n_col = 0;
317 }
318
free_div(Div * div)319 void free_div(Div *div){
320 uint32_t i;
321 reset_div(div);
322 free_rilist(div->rds);
323 free_u8list(div->seqs);
324 free_u32slist(div->grps);
325 for(i=0;i<count_u32slist(div->cache);i++){
326 free_u32list(get_u32slist(div->cache, i));
327 }
328 free_u32list(div->ps1);
329 free_u32list(div->ps2);
330 free_cbv(div->cbs);
331 free_u32slist(div->cache);
332 free_u32list(div->gids);
333 for (i = 0; i < 4; i++) {
334 free_u64list(div->markers[i]);
335 }
336 free_u32list(div->deps);
337 free(div);
338 }
339
div_reads(Div * div,FileReader * fr,FILE * out)340 uint32_t div_reads(Div *div, FileReader *fr, FILE *out){
341 ReadInfo *rd;
342 uint32_t seqid, rank, gid, last_gid, rid, ret;
343 char *seq1, *seq2;
344 int i;
345 last_gid = 0;
346 ret = 0;
347 while(fread_table(fr) != -1){
348 seqid = atoll(get_col_str(fr, 0));
349 rank = 1;
350 gid = atoll(get_col_str(fr, 1));
351 seq1 = get_col_str(fr, 2);
352 seq2 = get_col_str(fr, 3);
353 if(gid != last_gid){
354 ret ++;
355 if(last_gid) dividing(div, last_gid, out);
356 last_gid = gid;
357 reset_div(div);
358 push_u32slist(div->grps, lend_ulist_div(div));
359 }
360 if(get_col_len(fr, 2) > (int)div->n_col) div->n_col = get_col_len(fr, 2);
361 rid = count_rilist(div->rds);
362 rd = next_ref_rilist(div->rds);
363 rd->seqid = seqid;
364 rd->rank = rank;
365 rd->seqoff = div->seqs->size;
366 rd->seqlen1 = get_col_len(fr, 2);
367 rd->seqlen2 = get_col_len(fr, 3);
368 for(i=0;i<rd->seqlen1;i++) push_u8list(div->seqs, base_bit_table[(int)seq1[i]]);
369 for(i=0;i<rd->seqlen2;i++) push_u8list(div->seqs, base_bit_table[(int)seq2[i]]);
370 push_u32list(get_u32slist(div->grps, 0), rid);
371 }
372 if(last_gid) dividing(div, last_gid, out);
373 return ret;
374 }
375
376