1 /*
2 Copyright (C) 2017 Genome Research Ltd.
3
4 Author: Petr Danecek <pd3@sanger.ac.uk>
5
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 THE SOFTWARE.
23 */
24
25 #include <config.h>
26
27 #include <strings.h>
28
29 #include "bcf_sr_sort.h"
30 #include "htslib/khash_str2int.h"
31
32 #define SR_REF 1
33 #define SR_SNP 2
34 #define SR_INDEL 4
35 #define SR_OTHER 8
36 #define SR_SCORE(srt,a,b) (srt)->score[((a)<<4)|(b)]
37
38 // Resize a bit set.
kbs_resize(kbitset_t * bs,size_t ni)39 static inline kbitset_t *kbs_resize(kbitset_t *bs, size_t ni)
40 {
41 if ( !bs ) return kbs_init(ni);
42 size_t n = (ni + KBS_ELTBITS-1) / KBS_ELTBITS;
43 if ( n==bs->n ) return bs;
44
45 bs = (kbitset_t *) realloc(bs, sizeof(kbitset_t) + n * sizeof(unsigned long));
46 if ( bs==NULL ) return NULL;
47 if ( n > bs->n )
48 memset(bs->b + bs->n, 0, (n - bs->n) * sizeof (unsigned long));
49 bs->n = n;
50 bs->b[n] = ~0UL;
51 return bs;
52 }
53
54 // Logical AND
kbs_logical_and(kbitset_t * bs1,kbitset_t * bs2)55 static inline int kbs_logical_and(kbitset_t *bs1, kbitset_t *bs2)
56 {
57 // General case, bitsets of unequal size:
58 // int i, n = bs1->n < bs2->n ? bs1->n : bs2->n;
59 int i, n = bs1->n;
60
61 for (i=0; i<n; i++) if ( bs1->b[i] & bs2->b[i] ) return 1;
62 return 0;
63 }
64
65 // Bitwise OR, dst will be modified, src will be left unchanged
kbs_bitwise_or(kbitset_t * dst,kbitset_t * src)66 static inline void kbs_bitwise_or(kbitset_t *dst, kbitset_t *src)
67 {
68 int i;
69 for (i=0; i<dst->n; i++) dst->b[i] |= src->b[i];
70 }
71
72
bcf_sr_init_scores(sr_sort_t * srt)73 static void bcf_sr_init_scores(sr_sort_t *srt)
74 {
75 int i,jbit,kbit;
76
77 // lower number = lower priority, zero means forbidden
78
79 if ( srt->pair & BCF_SR_PAIR_ANY ) srt->pair |= (BCF_SR_PAIR_SNPS | BCF_SR_PAIR_INDELS | BCF_SR_PAIR_SNP_REF | BCF_SR_PAIR_INDEL_REF);
80 if ( srt->pair & BCF_SR_PAIR_SNPS ) SR_SCORE(srt,SR_SNP,SR_SNP) = 3;
81 if ( srt->pair & BCF_SR_PAIR_INDELS ) SR_SCORE(srt,SR_INDEL,SR_INDEL) = 3;
82 if ( srt->pair & BCF_SR_PAIR_SNP_REF )
83 {
84 SR_SCORE(srt,SR_SNP,SR_REF) = 2;
85 SR_SCORE(srt,SR_REF,SR_SNP) = 2;
86 }
87 if ( srt->pair & BCF_SR_PAIR_INDEL_REF )
88 {
89 SR_SCORE(srt,SR_INDEL,SR_REF) = 2;
90 SR_SCORE(srt,SR_REF,SR_INDEL) = 2;
91 }
92 if ( srt->pair & BCF_SR_PAIR_ANY )
93 {
94 for (i=0; i<256; i++)
95 if ( !srt->score[i] ) srt->score[i] = 1;
96 }
97
98 // set all combinations
99 for (i=0; i<256; i++)
100 {
101 if ( srt->score[i] ) continue; // already set
102 int max = 0;
103 for (jbit=0; jbit<4; jbit++) // high bits
104 {
105 int j = 1<<jbit;
106 if ( !(i & (j<<4)) ) continue;
107 for (kbit=0; kbit<4; kbit++) // low bits
108 {
109 int k = 1<<kbit;
110 if ( !(i & k) ) continue;
111 if ( max < SR_SCORE(srt,j,k) ) max = SR_SCORE(srt,j,k);
112 }
113 }
114 srt->score[i] = max;
115 }
116 }
multi_is_exact(var_t * avar,var_t * bvar)117 static int multi_is_exact(var_t *avar, var_t *bvar)
118 {
119 if ( avar->nalt != bvar->nalt ) return 0;
120
121 int alen = strlen(avar->str);
122 int blen = strlen(bvar->str);
123 if ( alen != blen ) return 0;
124
125 char *abeg = avar->str;
126 while ( *abeg )
127 {
128 char *aend = abeg;
129 while ( *aend && *aend!=',' ) aend++;
130
131 char *bbeg = bvar->str;
132 while ( *bbeg )
133 {
134 char *bend = bbeg;
135 while ( *bend && *bend!=',' ) bend++;
136 if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) break;
137 bbeg = *bend ? bend+1 : bend;
138 }
139 if ( !*bbeg ) return 0;
140
141 abeg = *aend ? aend+1 : aend;
142 }
143 return 1;
144 }
multi_is_subset(var_t * avar,var_t * bvar)145 static int multi_is_subset(var_t *avar, var_t *bvar)
146 {
147 char *abeg = avar->str;
148 while ( *abeg )
149 {
150 char *aend = abeg;
151 while ( *aend && *aend!=',' ) aend++;
152
153 char *bbeg = bvar->str;
154 while ( *bbeg )
155 {
156 char *bend = bbeg;
157 while ( *bend && *bend!=',' ) bend++;
158 if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) return 1;
159 bbeg = *bend ? bend+1 : bend;
160 }
161 abeg = *aend ? aend+1 : aend;
162 }
163 return 0;
164 }
pairing_score(sr_sort_t * srt,int ivset,int jvset)165 int32_t pairing_score(sr_sort_t *srt, int ivset, int jvset)
166 {
167 varset_t *iv = &srt->vset[ivset];
168 varset_t *jv = &srt->vset[jvset];
169
170 // Restrictive logic: the strictest type from a group is selected,
171 // so that, for example, snp+ref does not lead to the inclusion of an indel
172 int i,j;
173 uint32_t min = UINT32_MAX;
174 for (i=0; i<iv->nvar; i++)
175 {
176 var_t *ivar = &srt->var[iv->var[i]];
177 for (j=0; j<jv->nvar; j++)
178 {
179 var_t *jvar = &srt->var[jv->var[j]];
180 if ( srt->pair & BCF_SR_PAIR_EXACT )
181 {
182 if ( ivar->type != jvar->type ) continue;
183 if ( !strcmp(ivar->str,jvar->str) ) return UINT32_MAX; // exact match, best possibility
184 if ( multi_is_exact(ivar,jvar) ) return UINT32_MAX; // identical alleles
185 continue;
186 }
187 if ( ivar->type==jvar->type && !strcmp(ivar->str,jvar->str) ) return UINT32_MAX; // exact match, best possibility
188 if ( ivar->type & jvar->type && multi_is_subset(ivar,jvar) ) return UINT32_MAX; // one of the alleles is identical
189
190 uint32_t score = SR_SCORE(srt,ivar->type,jvar->type);
191 if ( !score ) return 0; // some of the varsets in the two groups are not compatible, will not pair
192 if ( min>score ) min = score;
193 }
194 }
195 if ( srt->pair & BCF_SR_PAIR_EXACT ) return 0;
196
197 assert( min!=UINT32_MAX );
198
199 uint32_t cnt = 0;
200 for (i=0; i<iv->nvar; i++) cnt += srt->var[iv->var[i]].nvcf;
201 for (j=0; j<jv->nvar; j++) cnt += srt->var[jv->var[j]].nvcf;
202
203 return (1<<(28+min)) + cnt;
204 }
remove_vset(sr_sort_t * srt,int jvset)205 void remove_vset(sr_sort_t *srt, int jvset)
206 {
207 if ( jvset+1 < srt->nvset )
208 {
209 varset_t tmp = srt->vset[jvset];
210 memmove(&srt->vset[jvset], &srt->vset[jvset+1], sizeof(varset_t)*(srt->nvset - jvset - 1));
211 srt->vset[srt->nvset-1] = tmp;
212
213 int *jmat = srt->pmat + jvset*srt->ngrp;
214 memmove(jmat, &jmat[srt->ngrp],sizeof(int)*(srt->nvset - jvset - 1)*srt->ngrp);
215
216 memmove(&srt->cnt[jvset], &srt->cnt[jvset+1], sizeof(int)*(srt->nvset - jvset - 1));
217 }
218 srt->nvset--;
219 }
merge_vsets(sr_sort_t * srt,int ivset,int jvset)220 int merge_vsets(sr_sort_t *srt, int ivset, int jvset)
221 {
222 int i,j;
223 if ( ivset > jvset ) { i = ivset; ivset = jvset; jvset = i; }
224
225 varset_t *iv = &srt->vset[ivset];
226 varset_t *jv = &srt->vset[jvset];
227
228 kbs_bitwise_or(iv->mask,jv->mask);
229
230 i = iv->nvar;
231 iv->nvar += jv->nvar;
232 hts_expand(int, iv->nvar, iv->mvar, iv->var);
233 for (j=0; j<jv->nvar; j++,i++) iv->var[i] = jv->var[j];
234
235 int *imat = srt->pmat + ivset*srt->ngrp;
236 int *jmat = srt->pmat + jvset*srt->ngrp;
237 for (i=0; i<srt->ngrp; i++) imat[i] += jmat[i];
238 srt->cnt[ivset] += srt->cnt[jvset];
239
240 remove_vset(srt, jvset);
241
242 return ivset;
243 }
push_vset(sr_sort_t * srt,int ivset)244 void push_vset(sr_sort_t *srt, int ivset)
245 {
246 varset_t *iv = &srt->vset[ivset];
247 int i,j;
248 for (i=0; i<srt->sr->nreaders; i++)
249 {
250 vcf_buf_t *buf = &srt->vcf_buf[i];
251 buf->nrec++;
252 hts_expand(bcf1_t*,buf->nrec,buf->mrec,buf->rec);
253 buf->rec[buf->nrec-1] = NULL;
254 }
255 for (i=0; i<iv->nvar; i++)
256 {
257 var_t *var = &srt->var[ iv->var[i] ];
258 for (j=0; j<var->nvcf; j++)
259 {
260 int jvcf = var->vcf[j];
261 vcf_buf_t *buf = &srt->vcf_buf[jvcf];
262 buf->rec[buf->nrec-1] = var->rec[j];
263 }
264 }
265 remove_vset(srt, ivset);
266 }
267
cmpstringp(const void * p1,const void * p2)268 static int cmpstringp(const void *p1, const void *p2)
269 {
270 return strcmp(* (char * const *) p1, * (char * const *) p2);
271 }
272
273 #if DEBUG_VSETS
debug_vsets(sr_sort_t * srt)274 void debug_vsets(sr_sort_t *srt)
275 {
276 int i,j,k;
277 for (i=0; i<srt->nvset; i++)
278 {
279 fprintf(stderr,"dbg_vset %d:", i);
280 for (j=0; j<srt->vset[i].mask->n; j++) fprintf(stderr,"%c%lu",j==0?' ':':',srt->vset[i].mask->b[j]);
281 fprintf(stderr,"\t");
282 for (j=0; j<srt->vset[i].nvar; j++)
283 {
284 var_t *var = &srt->var[srt->vset[i].var[j]];
285 fprintf(stderr,"\t%s",var->str);
286 for (k=0; k<var->nvcf; k++)
287 fprintf(stderr,"%c%d", k==0?':':',',var->vcf[k]);
288 }
289 fprintf(stderr,"\n");
290 }
291 }
292 #endif
293
294 #if DEBUG_VBUF
debug_vbuf(sr_sort_t * srt)295 void debug_vbuf(sr_sort_t *srt)
296 {
297 int i, j;
298 for (j=0; j<srt->vcf_buf[0].nrec; j++)
299 {
300 fprintf(stderr,"dbg_vbuf %d:\t", j);
301 for (i=0; i<srt->sr->nreaders; i++)
302 {
303 vcf_buf_t *buf = &srt->vcf_buf[i];
304 fprintf(stderr,"\t%d", buf->rec[j] ? buf->rec[j]->pos+1 : 0);
305 }
306 fprintf(stderr,"\n");
307 }
308 }
309 #endif
310
grp_create_key(sr_sort_t * srt)311 char *grp_create_key(sr_sort_t *srt)
312 {
313 if ( !srt->str.l ) return strdup("");
314 int i;
315 hts_expand(char*,srt->noff,srt->mcharp,srt->charp);
316 for (i=0; i<srt->noff; i++)
317 {
318 srt->charp[i] = srt->str.s + srt->off[i];
319 if ( i>0 ) srt->charp[i][-1] = 0;
320 }
321 qsort(srt->charp, srt->noff, sizeof(*srt->charp), cmpstringp);
322 char *ret = (char*) malloc(srt->str.l + 1), *ptr = ret;
323 for (i=0; i<srt->noff; i++)
324 {
325 int len = strlen(srt->charp[i]);
326 memcpy(ptr, srt->charp[i], len);
327 ptr += len + 1;
328 ptr[-1] = i+1==srt->noff ? 0 : ';';
329 }
330 return ret;
331 }
bcf_sr_sort_set_active(sr_sort_t * srt,int idx)332 int bcf_sr_sort_set_active(sr_sort_t *srt, int idx)
333 {
334 hts_expand(int,idx+1,srt->mactive,srt->active);
335 srt->nactive = 1;
336 srt->active[srt->nactive - 1] = idx;
337 return 0;
338 }
bcf_sr_sort_add_active(sr_sort_t * srt,int idx)339 int bcf_sr_sort_add_active(sr_sort_t *srt, int idx)
340 {
341 hts_expand(int,idx+1,srt->mactive,srt->active);
342 srt->nactive++;
343 srt->active[srt->nactive - 1] = idx;
344 return 0;
345 }
bcf_sr_sort_set(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,int min_pos)346 static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
347 {
348 if ( !srt->grp_str2int )
349 {
350 // first time here, initialize
351 if ( !srt->pair )
352 {
353 if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
354 bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
355 }
356 bcf_sr_init_scores(srt);
357 srt->grp_str2int = khash_str2int_init();
358 srt->var_str2int = khash_str2int_init();
359 }
360 int k;
361 khash_t(str2int) *hash;
362 hash = srt->grp_str2int;
363 for (k=0; k < kh_end(hash); k++)
364 if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
365 hash = srt->var_str2int;
366 for (k=0; k < kh_end(hash); k++)
367 if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
368 kh_clear(str2int, srt->grp_str2int);
369 kh_clear(str2int, srt->var_str2int);
370 srt->ngrp = srt->nvar = srt->nvset = 0;
371
372 grp_t grp;
373 memset(&grp,0,sizeof(grp_t));
374
375 // group VCFs into groups, each with a unique combination of variants in the duplicate lines
376 int ireader,ivar,irec,igrp,ivset,iact;
377 for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
378 for (iact=0; iact<srt->nactive; iact++)
379 {
380 ireader = srt->active[iact];
381 bcf_sr_t *reader = &readers->readers[ireader];
382 int rid = bcf_hdr_name2id(reader->header, chr);
383 grp.nvar = 0;
384 hts_expand(int,reader->nbuffer,srt->moff,srt->off);
385 srt->noff = 0;
386 srt->str.l = 0;
387 for (irec=1; irec<=reader->nbuffer; irec++)
388 {
389 bcf1_t *line = reader->buffer[irec];
390 if ( line->rid!=rid || line->pos!=min_pos ) break;
391
392 if ( srt->str.l ) kputc(';',&srt->str);
393 srt->off[srt->noff++] = srt->str.l;
394 size_t beg = srt->str.l;
395 for (ivar=1; ivar<line->n_allele; ivar++)
396 {
397 if ( ivar>1 ) kputc(',',&srt->str);
398 kputs(line->d.allele[0],&srt->str);
399 kputc('>',&srt->str);
400 kputs(line->d.allele[ivar],&srt->str);
401 }
402 if ( line->n_allele==1 )
403 {
404 kputs(line->d.allele[0],&srt->str);
405 kputsn(">.",2,&srt->str);
406 }
407
408 // Create new variant or attach to existing one. But careful, there can be duplicate
409 // records with the same POS,REF,ALT (e.g. in dbSNP-b142)
410 char *var_str = beg + srt->str.s;
411 int ret, var_idx = 0, var_end = srt->str.l;
412 while ( 1 )
413 {
414 ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
415 if ( ret==-1 ) break;
416
417 var_t *var = &srt->var[ivar];
418 if ( var->vcf[var->nvcf-1] != ireader ) break;
419
420 srt->str.l = var_end;
421 kputw(var_idx, &srt->str);
422 var_str = beg + srt->str.s;
423 var_idx++;
424 }
425 if ( ret==-1 )
426 {
427 ivar = srt->nvar++;
428 hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
429 srt->var[ivar].nvcf = 0;
430 khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
431 free(srt->var[ivar].str); // possible left-over from the previous position
432 }
433 var_t *var = &srt->var[ivar];
434 var->nalt = line->n_allele - 1;
435 var->type = bcf_get_variant_types(line);
436 srt->str.s[var_end] = 0;
437 if ( ret==-1 )
438 var->str = strdup(var_str);
439
440 int mvcf = var->mvcf;
441 var->nvcf++;
442 hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
443 if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
444 var->vcf[var->nvcf-1] = ireader;
445 var->rec[var->nvcf-1] = line;
446
447 grp.nvar++;
448 hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
449 grp.var[grp.nvar-1] = ivar;
450 }
451 char *grp_key = grp_create_key(srt);
452 int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
453 if ( ret==-1 )
454 {
455 igrp = srt->ngrp++;
456 hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
457 free(srt->grp[igrp].var);
458 srt->grp[igrp] = grp;
459 srt->grp[igrp].key = grp_key;
460 khash_str2int_set(srt->grp_str2int, grp_key, igrp);
461 memset(&grp,0,sizeof(grp_t));
462 }
463 else
464 free(grp_key);
465 srt->grp[igrp].nvcf++;
466 }
467 free(grp.var);
468
469 // initialize bitmask - which groups is the variant present in
470 for (ivar=0; ivar<srt->nvar; ivar++)
471 {
472 srt->var[ivar].mask = kbs_resize(srt->var[ivar].mask, srt->ngrp);
473 kbs_clear(srt->var[ivar].mask);
474 }
475 for (igrp=0; igrp<srt->ngrp; igrp++)
476 {
477 for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
478 {
479 int i = srt->grp[igrp].var[ivar];
480 kbs_insert(srt->var[i].mask, igrp);
481 }
482 }
483
484 // create the initial list of variant sets
485 for (ivar=0; ivar<srt->nvar; ivar++)
486 {
487 ivset = srt->nvset++;
488 hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);
489
490 varset_t *vset = &srt->vset[ivset];
491 vset->nvar = 1;
492 hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
493 vset->var[vset->nvar-1] = ivar;
494 var_t *var = &srt->var[ivar];
495 vset->cnt = var->nvcf;
496 vset->mask = kbs_resize(vset->mask, srt->ngrp);
497 kbs_clear(vset->mask);
498 kbs_bitwise_or(vset->mask, var->mask);
499
500 int type = 0;
501 if ( var->type==VCF_REF ) type |= SR_REF;
502 else
503 {
504 if ( var->type & VCF_SNP ) type |= SR_SNP;
505 if ( var->type & VCF_MNP ) type |= SR_SNP;
506 if ( var->type & VCF_INDEL ) type |= SR_INDEL;
507 if ( var->type & VCF_OTHER ) type |= SR_OTHER;
508 }
509 var->type = type;
510 }
511 #if DEBUG_VSETS
512 debug_vsets(srt);
513 #endif
514
515 // initialize the pairing matrix
516 hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
517 hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
518 memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);
519 for (ivset=0; ivset<srt->nvset; ivset++)
520 {
521 varset_t *vset = &srt->vset[ivset];
522 for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
523 srt->cnt[ivset] = vset->cnt;
524 }
525
526 // pair the lines
527 while ( srt->nvset )
528 {
529 #if DEBUG_VSETS
530 fprintf(stderr,"\n");
531 debug_vsets(srt);
532 #endif
533
534 int imax = 0;
535 for (ivset=1; ivset<srt->nvset; ivset++)
536 if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;
537
538 int ipair = -1;
539 uint32_t max_score = 0;
540 for (ivset=0; ivset<srt->nvset; ivset++)
541 {
542 if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue; // cannot be merged
543 uint32_t score = pairing_score(srt, imax, ivset);
544 // fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
545 if ( max_score < score ) { max_score = score; ipair = ivset; }
546 }
547
548 // merge rows creating a new variant set this way
549 if ( ipair!=-1 && ipair!=imax )
550 {
551 imax = merge_vsets(srt, imax, ipair);
552 continue;
553 }
554
555 push_vset(srt, imax);
556 }
557
558 srt->chr = chr;
559 srt->pos = min_pos;
560 }
561
bcf_sr_sort_next(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,int min_pos)562 int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
563 {
564 int i,j;
565 assert( srt->nactive>0 );
566
567 if ( srt->nsr != readers->nreaders )
568 {
569 srt->sr = readers;
570 if ( srt->nsr < readers->nreaders )
571 {
572 srt->vcf_buf = (vcf_buf_t*) realloc(srt->vcf_buf,readers->nreaders*sizeof(vcf_buf_t));
573 memset(srt->vcf_buf + srt->nsr, 0, sizeof(vcf_buf_t)*(readers->nreaders - srt->nsr));
574 if ( srt->msr < srt->nsr ) srt->msr = srt->nsr;
575 }
576 srt->nsr = readers->nreaders;
577 srt->chr = NULL;
578 }
579 if ( srt->nactive == 1 )
580 {
581 if ( readers->nreaders>1 )
582 memset(readers->has_line, 0, readers->nreaders*sizeof(*readers->has_line));
583 bcf_sr_t *reader = &readers->readers[srt->active[0]];
584 assert( reader->buffer[1]->pos==min_pos );
585 bcf1_t *tmp = reader->buffer[0];
586 for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
587 reader->buffer[ reader->nbuffer ] = tmp;
588 reader->nbuffer--;
589 readers->has_line[srt->active[0]] = 1;
590 return 1;
591 }
592 if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos);
593
594 if ( !srt->vcf_buf[0].nrec ) return 0;
595
596 #if DEBUG_VBUF
597 debug_vbuf(srt);
598 #endif
599
600 int nret = 0;
601 for (i=0; i<srt->sr->nreaders; i++)
602 {
603 vcf_buf_t *buf = &srt->vcf_buf[i];
604
605 if ( buf->rec[0] )
606 {
607 bcf_sr_t *reader = &srt->sr->readers[i];
608 for (j=1; j<=reader->nbuffer; j++)
609 if ( reader->buffer[j] == buf->rec[0] ) break;
610
611 assert( j<=reader->nbuffer );
612
613 bcf1_t *tmp = reader->buffer[0];
614 reader->buffer[0] = reader->buffer[j++];
615 for (; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
616 reader->buffer[ reader->nbuffer ] = tmp;
617 reader->nbuffer--;
618
619 nret++;
620 srt->sr->has_line[i] = 1;
621 }
622 else
623 srt->sr->has_line[i] = 0;
624
625 buf->nrec--;
626 if ( buf->nrec > 0 )
627 memmove(buf->rec, &buf->rec[1], buf->nrec*sizeof(bcf1_t*));
628 }
629 return nret;
630 }
bcf_sr_sort_remove_reader(bcf_srs_t * readers,sr_sort_t * srt,int i)631 void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i)
632 {
633 //vcf_buf is allocated only in bcf_sr_sort_next
634 //So, a call to bcf_sr_add_reader() followed immediately by bcf_sr_remove_reader()
635 //would cause the program to crash in this segment
636 if (srt->vcf_buf)
637 {
638 free(srt->vcf_buf[i].rec);
639 if ( i+1 < srt->nsr )
640 memmove(&srt->vcf_buf[i], &srt->vcf_buf[i+1], (srt->nsr - i - 1)*sizeof(vcf_buf_t));
641 memset(srt->vcf_buf + srt->nsr - 1, 0, sizeof(vcf_buf_t));
642 }
643 }
bcf_sr_sort_init(sr_sort_t * srt)644 sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt)
645 {
646 if ( !srt ) return calloc(1,sizeof(sr_sort_t));
647 memset(srt,0,sizeof(sr_sort_t));
648 return srt;
649 }
bcf_sr_sort_reset(sr_sort_t * srt)650 void bcf_sr_sort_reset(sr_sort_t *srt)
651 {
652 srt->chr = NULL;
653 }
bcf_sr_sort_destroy(sr_sort_t * srt)654 void bcf_sr_sort_destroy(sr_sort_t *srt)
655 {
656 free(srt->active);
657 if ( srt->var_str2int ) khash_str2int_destroy_free(srt->var_str2int);
658 if ( srt->grp_str2int ) khash_str2int_destroy_free(srt->grp_str2int);
659 int i;
660 for (i=0; i<srt->nsr; i++) free(srt->vcf_buf[i].rec);
661 free(srt->vcf_buf);
662 for (i=0; i<srt->mvar; i++)
663 {
664 free(srt->var[i].str);
665 free(srt->var[i].vcf);
666 free(srt->var[i].rec);
667 kbs_destroy(srt->var[i].mask);
668 }
669 free(srt->var);
670 for (i=0; i<srt->mgrp; i++)
671 free(srt->grp[i].var);
672 free(srt->grp);
673 for (i=0; i<srt->mvset; i++)
674 {
675 kbs_destroy(srt->vset[i].mask);
676 free(srt->vset[i].var);
677 }
678 free(srt->vset);
679 free(srt->str.s);
680 free(srt->off);
681 free(srt->charp);
682 free(srt->cnt);
683 free(srt->pmat);
684 memset(srt,0,sizeof(*srt));
685 }
686
687