1 /*
2 Copyright (C) 2017 Genome Research Ltd.
3
4 Author: Petr Danecek <pd3@sanger.ac.uk>
5
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 THE SOFTWARE.
23 */
24
25 #include <strings.h>
26
27 #include "bcf_sr_sort.h"
28 #include "htslib/khash_str2int.h"
29
30 #define SR_REF 1
31 #define SR_SNP 2
32 #define SR_INDEL 4
33 #define SR_OTHER 8
34 #define SR_SCORE(srt,a,b) (srt)->score[((a)<<4)|(b)]
35
36 // Resize a bit set.
kbs_resize(kbitset_t * bs,size_t ni)37 static inline kbitset_t *kbs_resize(kbitset_t *bs, size_t ni)
38 {
39 if ( !bs ) return kbs_init(ni);
40 size_t n = (ni + KBS_ELTBITS-1) / KBS_ELTBITS;
41 if ( n==bs->n ) return bs;
42
43 bs = (kbitset_t *) realloc(bs, sizeof(kbitset_t) + n * sizeof(unsigned long));
44 if ( bs==NULL ) return NULL;
45 if ( n > bs->n )
46 memset(bs->b + bs->n, 0, (n - bs->n) * sizeof (unsigned long));
47 bs->n = n;
48 bs->b[n] = ~0UL;
49 return bs;
50 }
51
52 // Logical AND
kbs_logical_and(kbitset_t * bs1,kbitset_t * bs2)53 static inline int kbs_logical_and(kbitset_t *bs1, kbitset_t *bs2)
54 {
55 // General case, bitsets of unequal size:
56 // int i, n = bs1->n < bs2->n ? bs1->n : bs2->n;
57 int i, n = bs1->n;
58
59 for (i=0; i<n; i++) if ( bs1->b[i] & bs2->b[i] ) return 1;
60 return 0;
61 }
62
63 // Bitwise OR, dst will be modified, src will be left unchanged
kbs_bitwise_or(kbitset_t * dst,kbitset_t * src)64 static inline void kbs_bitwise_or(kbitset_t *dst, kbitset_t *src)
65 {
66 int i;
67 for (i=0; i<dst->n; i++) dst->b[i] |= src->b[i];
68 }
69
70
bcf_sr_init_scores(sr_sort_t * srt)71 static void bcf_sr_init_scores(sr_sort_t *srt)
72 {
73 int i,jbit,kbit;
74
75 // lower number = lower priority, zero means forbidden
76
77 if ( srt->pair & BCF_SR_PAIR_ANY ) srt->pair |= (BCF_SR_PAIR_SNPS | BCF_SR_PAIR_INDELS | BCF_SR_PAIR_SNP_REF | BCF_SR_PAIR_INDEL_REF);
78 if ( srt->pair & BCF_SR_PAIR_SNPS ) SR_SCORE(srt,SR_SNP,SR_SNP) = 3;
79 if ( srt->pair & BCF_SR_PAIR_INDELS ) SR_SCORE(srt,SR_INDEL,SR_INDEL) = 3;
80 if ( srt->pair & BCF_SR_PAIR_SNP_REF )
81 {
82 SR_SCORE(srt,SR_SNP,SR_REF) = 2;
83 SR_SCORE(srt,SR_REF,SR_SNP) = 2;
84 }
85 if ( srt->pair & BCF_SR_PAIR_INDEL_REF )
86 {
87 SR_SCORE(srt,SR_INDEL,SR_REF) = 2;
88 SR_SCORE(srt,SR_REF,SR_INDEL) = 2;
89 }
90 if ( srt->pair & BCF_SR_PAIR_ANY )
91 {
92 for (i=0; i<256; i++)
93 if ( !srt->score[i] ) srt->score[i] = 1;
94 }
95
96 // set all combinations
97 for (i=0; i<256; i++)
98 {
99 if ( srt->score[i] ) continue; // already set
100 int max = 0;
101 for (jbit=0; jbit<4; jbit++) // high bits
102 {
103 int j = 1<<jbit;
104 if ( !(i & (j<<4)) ) continue;
105 for (kbit=0; kbit<4; kbit++) // low bits
106 {
107 int k = 1<<kbit;
108 if ( !(i & k) ) continue;
109 if ( max < SR_SCORE(srt,j,k) ) max = SR_SCORE(srt,j,k);
110 }
111 }
112 srt->score[i] = max;
113 }
114 }
multi_is_exact(var_t * avar,var_t * bvar)115 static int multi_is_exact(var_t *avar, var_t *bvar)
116 {
117 if ( avar->nalt != bvar->nalt ) return 0;
118
119 int alen = strlen(avar->str);
120 int blen = strlen(bvar->str);
121 if ( alen != blen ) return 0;
122
123 char *abeg = avar->str;
124 while ( *abeg )
125 {
126 char *aend = abeg;
127 while ( *aend && *aend!=',' ) aend++;
128
129 char *bbeg = bvar->str;
130 while ( *bbeg )
131 {
132 char *bend = bbeg;
133 while ( *bend && *bend!=',' ) bend++;
134 if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) break;
135 bbeg = *bend ? bend+1 : bend;
136 }
137 if ( !*bbeg ) return 0;
138
139 abeg = *aend ? aend+1 : aend;
140 }
141 return 1;
142 }
multi_is_subset(var_t * avar,var_t * bvar)143 static int multi_is_subset(var_t *avar, var_t *bvar)
144 {
145 char *abeg = avar->str;
146 while ( *abeg )
147 {
148 char *aend = abeg;
149 while ( *aend && *aend!=',' ) aend++;
150
151 char *bbeg = bvar->str;
152 while ( *bbeg )
153 {
154 char *bend = bbeg;
155 while ( *bend && *bend!=',' ) bend++;
156 if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) return 1;
157 bbeg = *bend ? bend+1 : bend;
158 }
159 abeg = *aend ? aend+1 : aend;
160 }
161 return 0;
162 }
pairing_score(sr_sort_t * srt,int ivset,int jvset)163 int32_t pairing_score(sr_sort_t *srt, int ivset, int jvset)
164 {
165 varset_t *iv = &srt->vset[ivset];
166 varset_t *jv = &srt->vset[jvset];
167
168 // Restrictive logic: the strictest type from a group is selected,
169 // so that, for example, snp+ref does not lead to the inclusion of an indel
170 int i,j;
171 uint32_t min = UINT32_MAX;
172 for (i=0; i<iv->nvar; i++)
173 {
174 var_t *ivar = &srt->var[iv->var[i]];
175 for (j=0; j<jv->nvar; j++)
176 {
177 var_t *jvar = &srt->var[jv->var[j]];
178 if ( srt->pair & BCF_SR_PAIR_EXACT )
179 {
180 if ( ivar->type != jvar->type ) continue;
181 if ( !strcmp(ivar->str,jvar->str) ) return UINT32_MAX; // exact match, best possibility
182 if ( multi_is_exact(ivar,jvar) ) return UINT32_MAX; // identical alleles
183 continue;
184 }
185 if ( ivar->type==jvar->type && !strcmp(ivar->str,jvar->str) ) return UINT32_MAX; // exact match, best possibility
186 if ( ivar->type & jvar->type && multi_is_subset(ivar,jvar) ) return UINT32_MAX; // one of the alleles is identical
187
188 uint32_t score = SR_SCORE(srt,ivar->type,jvar->type);
189 if ( !score ) return 0; // some of the varsets in the two groups are not compatible, will not pair
190 if ( min>score ) min = score;
191 }
192 }
193 if ( srt->pair & BCF_SR_PAIR_EXACT ) return 0;
194
195 assert( min!=UINT32_MAX );
196
197 uint32_t cnt = 0;
198 for (i=0; i<iv->nvar; i++) cnt += srt->var[iv->var[i]].nvcf;
199 for (j=0; j<jv->nvar; j++) cnt += srt->var[jv->var[j]].nvcf;
200
201 return (1<<(28+min)) + cnt;
202 }
remove_vset(sr_sort_t * srt,int jvset)203 void remove_vset(sr_sort_t *srt, int jvset)
204 {
205 if ( jvset+1 < srt->nvset )
206 {
207 varset_t tmp = srt->vset[jvset];
208 memmove(&srt->vset[jvset], &srt->vset[jvset+1], sizeof(varset_t)*(srt->nvset - jvset - 1));
209 srt->vset[srt->nvset-1] = tmp;
210
211 int *jmat = srt->pmat + jvset*srt->ngrp;
212 memmove(jmat, &jmat[srt->ngrp],sizeof(int)*(srt->nvset - jvset - 1)*srt->ngrp);
213
214 memmove(&srt->cnt[jvset], &srt->cnt[jvset+1], sizeof(int)*(srt->nvset - jvset - 1));
215 }
216 srt->nvset--;
217 }
merge_vsets(sr_sort_t * srt,int ivset,int jvset)218 int merge_vsets(sr_sort_t *srt, int ivset, int jvset)
219 {
220 int i,j;
221 if ( ivset > jvset ) { i = ivset; ivset = jvset; jvset = i; }
222
223 varset_t *iv = &srt->vset[ivset];
224 varset_t *jv = &srt->vset[jvset];
225
226 kbs_bitwise_or(iv->mask,jv->mask);
227
228 i = iv->nvar;
229 iv->nvar += jv->nvar;
230 hts_expand(int, iv->nvar, iv->mvar, iv->var);
231 for (j=0; j<jv->nvar; j++,i++) iv->var[i] = jv->var[j];
232
233 int *imat = srt->pmat + ivset*srt->ngrp;
234 int *jmat = srt->pmat + jvset*srt->ngrp;
235 for (i=0; i<srt->ngrp; i++) imat[i] += jmat[i];
236 srt->cnt[ivset] += srt->cnt[jvset];
237
238 remove_vset(srt, jvset);
239
240 return ivset;
241 }
push_vset(sr_sort_t * srt,int ivset)242 void push_vset(sr_sort_t *srt, int ivset)
243 {
244 varset_t *iv = &srt->vset[ivset];
245 int i,j;
246 for (i=0; i<srt->sr->nreaders; i++)
247 {
248 vcf_buf_t *buf = &srt->vcf_buf[i];
249 buf->nrec++;
250 hts_expand(bcf1_t*,buf->nrec,buf->mrec,buf->rec);
251 buf->rec[buf->nrec-1] = NULL;
252 }
253 for (i=0; i<iv->nvar; i++)
254 {
255 var_t *var = &srt->var[ iv->var[i] ];
256 for (j=0; j<var->nvcf; j++)
257 {
258 int jvcf = var->vcf[j];
259 vcf_buf_t *buf = &srt->vcf_buf[jvcf];
260 buf->rec[buf->nrec-1] = var->rec[j];
261 }
262 }
263 remove_vset(srt, ivset);
264 }
265
cmpstringp(const void * p1,const void * p2)266 static int cmpstringp(const void *p1, const void *p2)
267 {
268 return strcmp(* (char * const *) p1, * (char * const *) p2);
269 }
270
271 #if DEBUG_VSETS
debug_vsets(sr_sort_t * srt)272 void debug_vsets(sr_sort_t *srt)
273 {
274 int i,j,k;
275 for (i=0; i<srt->nvset; i++)
276 {
277 fprintf(stderr,"dbg_vset %d:", i);
278 for (j=0; j<srt->vset[i].mask->n; j++) fprintf(stderr,"%c%lu",j==0?' ':':',srt->vset[i].mask->b[j]);
279 fprintf(stderr,"\t");
280 for (j=0; j<srt->vset[i].nvar; j++)
281 {
282 var_t *var = &srt->var[srt->vset[i].var[j]];
283 fprintf(stderr,"\t%s",var->str);
284 for (k=0; k<var->nvcf; k++)
285 fprintf(stderr,"%c%d", k==0?':':',',var->vcf[k]);
286 }
287 fprintf(stderr,"\n");
288 }
289 }
290 #endif
291
292 #if DEBUG_VBUF
debug_vbuf(sr_sort_t * srt)293 void debug_vbuf(sr_sort_t *srt)
294 {
295 int i, j;
296 for (j=0; j<srt->vcf_buf[0].nrec; j++)
297 {
298 fprintf(stderr,"dbg_vbuf %d:\t", j);
299 for (i=0; i<srt->sr->nreaders; i++)
300 {
301 vcf_buf_t *buf = &srt->vcf_buf[i];
302 fprintf(stderr,"\t%d", buf->rec[j] ? buf->rec[j]->pos+1 : 0);
303 }
304 fprintf(stderr,"\n");
305 }
306 }
307 #endif
308
grp_create_key(sr_sort_t * srt)309 char *grp_create_key(sr_sort_t *srt)
310 {
311 if ( !srt->str.l ) return strdup("");
312 int i;
313 hts_expand(char*,srt->noff,srt->mcharp,srt->charp);
314 for (i=0; i<srt->noff; i++)
315 {
316 srt->charp[i] = srt->str.s + srt->off[i];
317 if ( i>0 ) srt->charp[i][-1] = 0;
318 }
319 qsort(srt->charp, srt->noff, sizeof(*srt->charp), cmpstringp);
320 char *ret = (char*) malloc(srt->str.l + 1), *ptr = ret;
321 for (i=0; i<srt->noff; i++)
322 {
323 int len = strlen(srt->charp[i]);
324 memcpy(ptr, srt->charp[i], len);
325 ptr += len + 1;
326 ptr[-1] = i+1==srt->noff ? 0 : ';';
327 }
328 return ret;
329 }
bcf_sr_sort_set_active(sr_sort_t * srt,int idx)330 int bcf_sr_sort_set_active(sr_sort_t *srt, int idx)
331 {
332 hts_expand(int,idx+1,srt->mactive,srt->active);
333 srt->nactive = 1;
334 srt->active[srt->nactive - 1] = idx;
335 return 0;
336 }
bcf_sr_sort_add_active(sr_sort_t * srt,int idx)337 int bcf_sr_sort_add_active(sr_sort_t *srt, int idx)
338 {
339 hts_expand(int,idx+1,srt->mactive,srt->active);
340 srt->nactive++;
341 srt->active[srt->nactive - 1] = idx;
342 return 0;
343 }
bcf_sr_sort_set(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,int min_pos)344 static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
345 {
346 if ( !srt->grp_str2int )
347 {
348 // first time here, initialize
349 if ( !srt->pair )
350 {
351 if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
352 bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
353 }
354 bcf_sr_init_scores(srt);
355 srt->grp_str2int = khash_str2int_init();
356 srt->var_str2int = khash_str2int_init();
357 }
358 int k;
359 khash_t(str2int) *hash;
360 hash = srt->grp_str2int;
361 for (k=0; k < kh_end(hash); k++)
362 if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
363 hash = srt->var_str2int;
364 for (k=0; k < kh_end(hash); k++)
365 if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
366 kh_clear(str2int, srt->grp_str2int);
367 kh_clear(str2int, srt->var_str2int);
368 srt->ngrp = srt->nvar = srt->nvset = 0;
369
370 grp_t grp;
371 memset(&grp,0,sizeof(grp_t));
372
373 // group VCFs into groups, each with a unique combination of variants in the duplicate lines
374 int ireader,ivar,irec,igrp,ivset,iact;
375 for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
376 for (iact=0; iact<srt->nactive; iact++)
377 {
378 ireader = srt->active[iact];
379 bcf_sr_t *reader = &readers->readers[ireader];
380 int rid = bcf_hdr_name2id(reader->header, chr);
381 grp.nvar = 0;
382 hts_expand(int,reader->nbuffer,srt->moff,srt->off);
383 srt->noff = 0;
384 srt->str.l = 0;
385 for (irec=1; irec<=reader->nbuffer; irec++)
386 {
387 bcf1_t *line = reader->buffer[irec];
388 if ( line->rid!=rid || line->pos!=min_pos ) break;
389
390 if ( srt->str.l ) kputc(';',&srt->str);
391 srt->off[srt->noff++] = srt->str.l;
392 size_t beg = srt->str.l;
393 for (ivar=1; ivar<line->n_allele; ivar++)
394 {
395 if ( ivar>1 ) kputc(',',&srt->str);
396 kputs(line->d.allele[0],&srt->str);
397 kputc('>',&srt->str);
398 kputs(line->d.allele[ivar],&srt->str);
399 }
400 if ( line->n_allele==1 )
401 {
402 kputs(line->d.allele[0],&srt->str);
403 kputsn(">.",2,&srt->str);
404 }
405
406 // Create new variant or attach to existing one. But careful, there can be duplicate
407 // records with the same POS,REF,ALT (e.g. in dbSNP-b142)
408 char *var_str = beg + srt->str.s;
409 int ret, var_idx = 0, var_end = srt->str.l;
410 while ( 1 )
411 {
412 ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
413 if ( ret==-1 ) break;
414
415 var_t *var = &srt->var[ivar];
416 if ( var->vcf[var->nvcf-1] != ireader ) break;
417
418 srt->str.l = var_end;
419 kputw(var_idx, &srt->str);
420 var_str = beg + srt->str.s;
421 var_idx++;
422 }
423 if ( ret==-1 )
424 {
425 ivar = srt->nvar++;
426 hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
427 srt->var[ivar].nvcf = 0;
428 khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
429 free(srt->var[ivar].str); // possible left-over from the previous position
430 }
431 var_t *var = &srt->var[ivar];
432 var->nalt = line->n_allele - 1;
433 var->type = bcf_get_variant_types(line);
434 srt->str.s[var_end] = 0;
435 if ( ret==-1 )
436 var->str = strdup(var_str);
437
438 int mvcf = var->mvcf;
439 var->nvcf++;
440 hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
441 if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
442 var->vcf[var->nvcf-1] = ireader;
443 var->rec[var->nvcf-1] = line;
444
445 grp.nvar++;
446 hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
447 grp.var[grp.nvar-1] = ivar;
448 }
449 char *grp_key = grp_create_key(srt);
450 int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
451 if ( ret==-1 )
452 {
453 igrp = srt->ngrp++;
454 hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
455 free(srt->grp[igrp].var);
456 srt->grp[igrp] = grp;
457 srt->grp[igrp].key = grp_key;
458 khash_str2int_set(srt->grp_str2int, grp_key, igrp);
459 memset(&grp,0,sizeof(grp_t));
460 }
461 else
462 free(grp_key);
463 srt->grp[igrp].nvcf++;
464 }
465 free(grp.var);
466
467 // initialize bitmask - which groups is the variant present in
468 for (ivar=0; ivar<srt->nvar; ivar++)
469 {
470 srt->var[ivar].mask = kbs_resize(srt->var[ivar].mask, srt->ngrp);
471 kbs_clear(srt->var[ivar].mask);
472 }
473 for (igrp=0; igrp<srt->ngrp; igrp++)
474 {
475 for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
476 {
477 int i = srt->grp[igrp].var[ivar];
478 kbs_insert(srt->var[i].mask, igrp);
479 }
480 }
481
482 // create the initial list of variant sets
483 for (ivar=0; ivar<srt->nvar; ivar++)
484 {
485 ivset = srt->nvset++;
486 hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);
487
488 varset_t *vset = &srt->vset[ivset];
489 vset->nvar = 1;
490 hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
491 vset->var[vset->nvar-1] = ivar;
492 var_t *var = &srt->var[ivar];
493 vset->cnt = var->nvcf;
494 vset->mask = kbs_resize(vset->mask, srt->ngrp);
495 kbs_clear(vset->mask);
496 kbs_bitwise_or(vset->mask, var->mask);
497
498 int type = 0;
499 if ( var->type==VCF_REF ) type |= SR_REF;
500 else
501 {
502 if ( var->type & VCF_SNP ) type |= SR_SNP;
503 if ( var->type & VCF_MNP ) type |= SR_SNP;
504 if ( var->type & VCF_INDEL ) type |= SR_INDEL;
505 if ( var->type & VCF_OTHER ) type |= SR_OTHER;
506 }
507 var->type = type;
508 }
509 #if DEBUG_VSETS
510 debug_vsets(srt);
511 #endif
512
513 // initialize the pairing matrix
514 hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
515 hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
516 memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);
517 for (ivset=0; ivset<srt->nvset; ivset++)
518 {
519 varset_t *vset = &srt->vset[ivset];
520 for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
521 srt->cnt[ivset] = vset->cnt;
522 }
523
524 // pair the lines
525 while ( srt->nvset )
526 {
527 #if DEBUG_VSETS
528 fprintf(stderr,"\n");
529 debug_vsets(srt);
530 #endif
531
532 int imax = 0;
533 for (ivset=1; ivset<srt->nvset; ivset++)
534 if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;
535
536 int ipair = -1;
537 uint32_t max_score = 0;
538 for (ivset=0; ivset<srt->nvset; ivset++)
539 {
540 if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue; // cannot be merged
541 uint32_t score = pairing_score(srt, imax, ivset);
542 // fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
543 if ( max_score < score ) { max_score = score; ipair = ivset; }
544 }
545
546 // merge rows creating a new variant set this way
547 if ( ipair!=-1 && ipair!=imax )
548 {
549 imax = merge_vsets(srt, imax, ipair);
550 continue;
551 }
552
553 push_vset(srt, imax);
554 }
555
556 srt->chr = chr;
557 srt->pos = min_pos;
558 }
559
bcf_sr_sort_next(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,int min_pos)560 int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
561 {
562 int i,j;
563 assert( srt->nactive>0 );
564
565 if ( srt->nsr != readers->nreaders )
566 {
567 srt->sr = readers;
568 if ( srt->nsr < readers->nreaders )
569 {
570 srt->vcf_buf = (vcf_buf_t*) realloc(srt->vcf_buf,readers->nreaders*sizeof(vcf_buf_t));
571 memset(srt->vcf_buf + srt->nsr, 0, sizeof(vcf_buf_t)*(readers->nreaders - srt->nsr));
572 if ( srt->msr < srt->nsr ) srt->msr = srt->nsr;
573 }
574 srt->nsr = readers->nreaders;
575 srt->chr = NULL;
576 }
577 if ( srt->nactive == 1 )
578 {
579 if ( readers->nreaders>1 )
580 memset(readers->has_line, 0, readers->nreaders*sizeof(*readers->has_line));
581 bcf_sr_t *reader = &readers->readers[srt->active[0]];
582 assert( reader->buffer[1]->pos==min_pos );
583 bcf1_t *tmp = reader->buffer[0];
584 for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
585 reader->buffer[ reader->nbuffer ] = tmp;
586 reader->nbuffer--;
587 readers->has_line[srt->active[0]] = 1;
588 return 1;
589 }
590 if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos);
591
592 if ( !srt->vcf_buf[0].nrec ) return 0;
593
594 #if DEBUG_VBUF
595 debug_vbuf(srt);
596 #endif
597
598 int nret = 0;
599 for (i=0; i<srt->sr->nreaders; i++)
600 {
601 vcf_buf_t *buf = &srt->vcf_buf[i];
602
603 if ( buf->rec[0] )
604 {
605 bcf_sr_t *reader = &srt->sr->readers[i];
606 for (j=1; j<=reader->nbuffer; j++)
607 if ( reader->buffer[j] == buf->rec[0] ) break;
608
609 assert( j<=reader->nbuffer );
610
611 bcf1_t *tmp = reader->buffer[0];
612 reader->buffer[0] = reader->buffer[j++];
613 for (; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
614 reader->buffer[ reader->nbuffer ] = tmp;
615 reader->nbuffer--;
616
617 nret++;
618 srt->sr->has_line[i] = 1;
619 }
620 else
621 srt->sr->has_line[i] = 0;
622
623 buf->nrec--;
624 if ( buf->nrec > 0 )
625 memmove(buf->rec, &buf->rec[1], buf->nrec*sizeof(bcf1_t*));
626 }
627 return nret;
628 }
bcf_sr_sort_remove_reader(bcf_srs_t * readers,sr_sort_t * srt,int i)629 void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i)
630 {
631 free(srt->vcf_buf[i].rec);
632 if ( i+1 < srt->nsr )
633 memmove(&srt->vcf_buf[i], &srt->vcf_buf[i+1], (srt->nsr - i - 1)*sizeof(vcf_buf_t));
634 memset(srt->vcf_buf + srt->nsr - 1, 0, sizeof(vcf_buf_t));
635 }
bcf_sr_sort_init(sr_sort_t * srt)636 sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt)
637 {
638 if ( !srt ) return calloc(1,sizeof(sr_sort_t));
639 memset(srt,0,sizeof(sr_sort_t));
640 return srt;
641 }
bcf_sr_sort_destroy(sr_sort_t * srt)642 void bcf_sr_sort_destroy(sr_sort_t *srt)
643 {
644 free(srt->active);
645 if ( srt->var_str2int ) khash_str2int_destroy_free(srt->var_str2int);
646 if ( srt->grp_str2int ) khash_str2int_destroy_free(srt->grp_str2int);
647 int i;
648 for (i=0; i<srt->nsr; i++) free(srt->vcf_buf[i].rec);
649 free(srt->vcf_buf);
650 for (i=0; i<srt->mvar; i++)
651 {
652 free(srt->var[i].str);
653 free(srt->var[i].vcf);
654 free(srt->var[i].rec);
655 kbs_destroy(srt->var[i].mask);
656 }
657 free(srt->var);
658 for (i=0; i<srt->mgrp; i++)
659 free(srt->grp[i].var);
660 free(srt->grp);
661 for (i=0; i<srt->mvset; i++)
662 {
663 kbs_destroy(srt->vset[i].mask);
664 free(srt->vset[i].var);
665 }
666 free(srt->vset);
667 free(srt->str.s);
668 free(srt->off);
669 free(srt->charp);
670 free(srt->cnt);
671 free(srt->pmat);
672 memset(srt,0,sizeof(*srt));
673 }
674
675