1 /*
2     Copyright (C) 2017 Genome Research Ltd.
3 
4     Author: Petr Danecek <pd3@sanger.ac.uk>
5 
6     Permission is hereby granted, free of charge, to any person obtaining a copy
7     of this software and associated documentation files (the "Software"), to deal
8     in the Software without restriction, including without limitation the rights
9     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10     copies of the Software, and to permit persons to whom the Software is
11     furnished to do so, subject to the following conditions:
12 
13     The above copyright notice and this permission notice shall be included in
14     all copies or substantial portions of the Software.
15 
16     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22     THE SOFTWARE.
23 */
24 
25 #include <strings.h>
26 
27 #include "bcf_sr_sort.h"
28 #include "htslib/khash_str2int.h"
29 
30 #define SR_REF   1
31 #define SR_SNP   2
32 #define SR_INDEL 4
33 #define SR_OTHER 8
34 #define SR_SCORE(srt,a,b) (srt)->score[((a)<<4)|(b)]
35 
36 // Resize a bit set.
kbs_resize(kbitset_t * bs,size_t ni)37 static inline kbitset_t *kbs_resize(kbitset_t *bs, size_t ni)
38 {
39     if ( !bs ) return kbs_init(ni);
40     size_t n = (ni + KBS_ELTBITS-1) / KBS_ELTBITS;
41     if ( n==bs->n ) return bs;
42 
43     bs = (kbitset_t *) realloc(bs, sizeof(kbitset_t) + n * sizeof(unsigned long));
44     if ( bs==NULL ) return NULL;
45     if ( n > bs->n )
46         memset(bs->b + bs->n, 0, (n - bs->n) * sizeof (unsigned long));
47     bs->n = n;
48     bs->b[n] = ~0UL;
49     return bs;
50 }
51 
52 // Logical AND
kbs_logical_and(kbitset_t * bs1,kbitset_t * bs2)53 static inline int kbs_logical_and(kbitset_t *bs1, kbitset_t *bs2)
54 {
55     // General case, bitsets of unequal size:
56     //  int i, n = bs1->n < bs2->n ? bs1->n : bs2->n;
57     int i, n = bs1->n;
58 
59     for (i=0; i<n; i++) if ( bs1->b[i] & bs2->b[i] ) return 1;
60     return 0;
61 }
62 
63 // Bitwise OR, dst will be modified, src will be left unchanged
kbs_bitwise_or(kbitset_t * dst,kbitset_t * src)64 static inline void kbs_bitwise_or(kbitset_t *dst, kbitset_t *src)
65 {
66     int i;
67     for (i=0; i<dst->n; i++) dst->b[i] |= src->b[i];
68 }
69 
70 
bcf_sr_init_scores(sr_sort_t * srt)71 static void bcf_sr_init_scores(sr_sort_t *srt)
72 {
73     int i,jbit,kbit;
74 
75     // lower number = lower priority, zero means forbidden
76 
77     if ( srt->pair & BCF_SR_PAIR_ANY ) srt->pair |= (BCF_SR_PAIR_SNPS | BCF_SR_PAIR_INDELS | BCF_SR_PAIR_SNP_REF | BCF_SR_PAIR_INDEL_REF);
78     if ( srt->pair & BCF_SR_PAIR_SNPS ) SR_SCORE(srt,SR_SNP,SR_SNP) = 3;
79     if ( srt->pair & BCF_SR_PAIR_INDELS ) SR_SCORE(srt,SR_INDEL,SR_INDEL) = 3;
80     if ( srt->pair & BCF_SR_PAIR_SNP_REF )
81     {
82         SR_SCORE(srt,SR_SNP,SR_REF) = 2;
83         SR_SCORE(srt,SR_REF,SR_SNP) = 2;
84     }
85     if ( srt->pair & BCF_SR_PAIR_INDEL_REF )
86     {
87         SR_SCORE(srt,SR_INDEL,SR_REF) = 2;
88         SR_SCORE(srt,SR_REF,SR_INDEL) = 2;
89     }
90     if ( srt->pair & BCF_SR_PAIR_ANY )
91     {
92         for (i=0; i<256; i++)
93             if ( !srt->score[i] ) srt->score[i] = 1;
94     }
95 
96     // set all combinations
97     for (i=0; i<256; i++)
98     {
99         if ( srt->score[i] ) continue;      // already set
100         int max = 0;
101         for (jbit=0; jbit<4; jbit++)        // high bits
102         {
103             int j = 1<<jbit;
104             if ( !(i & (j<<4)) ) continue;
105             for (kbit=0; kbit<4; kbit++)    // low bits
106             {
107                 int k = 1<<kbit;
108                 if ( !(i & k) ) continue;
109                 if ( max < SR_SCORE(srt,j,k) ) max = SR_SCORE(srt,j,k);
110             }
111         }
112         srt->score[i] = max;
113     }
114 }
multi_is_exact(var_t * avar,var_t * bvar)115 static int multi_is_exact(var_t *avar, var_t *bvar)
116 {
117     if ( avar->nalt != bvar->nalt ) return 0;
118 
119     int alen = strlen(avar->str);
120     int blen = strlen(bvar->str);
121     if ( alen != blen ) return 0;
122 
123     char *abeg = avar->str;
124     while ( *abeg )
125     {
126         char *aend = abeg;
127         while ( *aend && *aend!=',' ) aend++;
128 
129         char *bbeg = bvar->str;
130         while ( *bbeg )
131         {
132             char *bend = bbeg;
133             while ( *bend && *bend!=',' ) bend++;
134             if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) break;
135             bbeg = *bend ? bend+1 : bend;
136         }
137         if ( !*bbeg ) return 0;
138 
139         abeg = *aend ? aend+1 : aend;
140     }
141     return 1;
142 }
multi_is_subset(var_t * avar,var_t * bvar)143 static int multi_is_subset(var_t *avar, var_t *bvar)
144 {
145     char *abeg = avar->str;
146     while ( *abeg )
147     {
148         char *aend = abeg;
149         while ( *aend && *aend!=',' ) aend++;
150 
151         char *bbeg = bvar->str;
152         while ( *bbeg )
153         {
154             char *bend = bbeg;
155             while ( *bend && *bend!=',' ) bend++;
156             if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) return 1;
157             bbeg = *bend ? bend+1 : bend;
158         }
159         abeg = *aend ? aend+1 : aend;
160     }
161     return 0;
162 }
pairing_score(sr_sort_t * srt,int ivset,int jvset)163 int32_t pairing_score(sr_sort_t *srt, int ivset, int jvset)
164 {
165     varset_t *iv = &srt->vset[ivset];
166     varset_t *jv = &srt->vset[jvset];
167 
168     // Restrictive logic: the strictest type from a group is selected,
169     // so that, for example, snp+ref does not lead to the inclusion of an indel
170     int i,j;
171     uint32_t min = UINT32_MAX;
172     for (i=0; i<iv->nvar; i++)
173     {
174         var_t *ivar = &srt->var[iv->var[i]];
175         for (j=0; j<jv->nvar; j++)
176         {
177             var_t *jvar = &srt->var[jv->var[j]];
178             if ( srt->pair & BCF_SR_PAIR_EXACT )
179             {
180                 if ( ivar->type != jvar->type ) continue;
181                 if ( !strcmp(ivar->str,jvar->str) ) return UINT32_MAX;  // exact match, best possibility
182                 if ( multi_is_exact(ivar,jvar) ) return UINT32_MAX; // identical alleles
183                 continue;
184             }
185             if ( ivar->type==jvar->type && !strcmp(ivar->str,jvar->str) ) return UINT32_MAX;  // exact match, best possibility
186             if ( ivar->type & jvar->type && multi_is_subset(ivar,jvar) ) return UINT32_MAX; // one of the alleles is identical
187 
188             uint32_t score = SR_SCORE(srt,ivar->type,jvar->type);
189             if ( !score ) return 0;     // some of the varsets in the two groups are not compatible, will not pair
190             if ( min>score ) min = score;
191         }
192     }
193     if ( srt->pair & BCF_SR_PAIR_EXACT ) return 0;
194 
195     assert( min!=UINT32_MAX );
196 
197     uint32_t cnt = 0;
198     for (i=0; i<iv->nvar; i++) cnt += srt->var[iv->var[i]].nvcf;
199     for (j=0; j<jv->nvar; j++) cnt += srt->var[jv->var[j]].nvcf;
200 
201     return (1<<(28+min)) + cnt;
202 }
remove_vset(sr_sort_t * srt,int jvset)203 void remove_vset(sr_sort_t *srt, int jvset)
204 {
205     if ( jvset+1 < srt->nvset )
206     {
207         varset_t tmp = srt->vset[jvset];
208         memmove(&srt->vset[jvset], &srt->vset[jvset+1], sizeof(varset_t)*(srt->nvset - jvset - 1));
209         srt->vset[srt->nvset-1] = tmp;
210 
211         int *jmat = srt->pmat + jvset*srt->ngrp;
212         memmove(jmat, &jmat[srt->ngrp],sizeof(int)*(srt->nvset - jvset - 1)*srt->ngrp);
213 
214         memmove(&srt->cnt[jvset], &srt->cnt[jvset+1], sizeof(int)*(srt->nvset - jvset - 1));
215     }
216     srt->nvset--;
217 }
merge_vsets(sr_sort_t * srt,int ivset,int jvset)218 int merge_vsets(sr_sort_t *srt, int ivset, int jvset)
219 {
220     int i,j;
221     if ( ivset > jvset ) { i = ivset; ivset = jvset; jvset = i; }
222 
223     varset_t *iv = &srt->vset[ivset];
224     varset_t *jv = &srt->vset[jvset];
225 
226     kbs_bitwise_or(iv->mask,jv->mask);
227 
228     i = iv->nvar;
229     iv->nvar += jv->nvar;
230     hts_expand(int, iv->nvar, iv->mvar, iv->var);
231     for (j=0; j<jv->nvar; j++,i++) iv->var[i] = jv->var[j];
232 
233     int *imat = srt->pmat + ivset*srt->ngrp;
234     int *jmat = srt->pmat + jvset*srt->ngrp;
235     for (i=0; i<srt->ngrp; i++) imat[i] += jmat[i];
236     srt->cnt[ivset] += srt->cnt[jvset];
237 
238     remove_vset(srt, jvset);
239 
240     return ivset;
241 }
push_vset(sr_sort_t * srt,int ivset)242 void push_vset(sr_sort_t *srt, int ivset)
243 {
244     varset_t *iv = &srt->vset[ivset];
245     int i,j;
246     for (i=0; i<srt->sr->nreaders; i++)
247     {
248         vcf_buf_t *buf = &srt->vcf_buf[i];
249         buf->nrec++;
250         hts_expand(bcf1_t*,buf->nrec,buf->mrec,buf->rec);
251         buf->rec[buf->nrec-1] = NULL;
252     }
253     for (i=0; i<iv->nvar; i++)
254     {
255         var_t *var = &srt->var[ iv->var[i] ];
256         for (j=0; j<var->nvcf; j++)
257         {
258             int jvcf = var->vcf[j];
259             vcf_buf_t *buf = &srt->vcf_buf[jvcf];
260             buf->rec[buf->nrec-1] = var->rec[j];
261         }
262     }
263     remove_vset(srt, ivset);
264 }
265 
cmpstringp(const void * p1,const void * p2)266 static int cmpstringp(const void *p1, const void *p2)
267 {
268     return strcmp(* (char * const *) p1, * (char * const *) p2);
269 }
debug_vsets(sr_sort_t * srt)270 void debug_vsets(sr_sort_t *srt)
271 {
272     int i,j,k;
273     for (i=0; i<srt->nvset; i++)
274     {
275         fprintf(stderr,"dbg_vset %d:", i);
276         for (j=0; j<srt->vset[i].mask->n; j++) fprintf(stderr,"%c%lu",j==0?' ':':',srt->vset[i].mask->b[j]);
277         fprintf(stderr,"\t");
278         for (j=0; j<srt->vset[i].nvar; j++)
279         {
280             var_t *var = &srt->var[srt->vset[i].var[j]];
281             fprintf(stderr,"\t%s",var->str);
282             for (k=0; k<var->nvcf; k++)
283                 fprintf(stderr,"%c%d", k==0?':':',',var->vcf[k]);
284         }
285         fprintf(stderr,"\n");
286     }
287 }
debug_vbuf(sr_sort_t * srt)288 void debug_vbuf(sr_sort_t *srt)
289 {
290     int i, j;
291     for (j=0; j<srt->vcf_buf[0].nrec; j++)
292     {
293         fprintf(stderr,"dbg_vbuf %d:\t", j);
294         for (i=0; i<srt->sr->nreaders; i++)
295         {
296             vcf_buf_t *buf = &srt->vcf_buf[i];
297             fprintf(stderr,"\t%d", buf->rec[j] ? buf->rec[j]->pos+1 : 0);
298         }
299         fprintf(stderr,"\n");
300     }
301 }
grp_create_key(sr_sort_t * srt)302 char *grp_create_key(sr_sort_t *srt)
303 {
304     if ( !srt->str.l ) return strdup("");
305     int i;
306     hts_expand(char*,srt->noff,srt->mcharp,srt->charp);
307     for (i=0; i<srt->noff; i++)
308     {
309         srt->charp[i] = srt->str.s + srt->off[i];
310         if ( i>0 ) srt->charp[i][-1] = 0;
311     }
312     qsort(srt->charp, srt->noff, sizeof(*srt->charp), cmpstringp);
313     char *ret = (char*) malloc(srt->str.l + 1), *ptr = ret;
314     for (i=0; i<srt->noff; i++)
315     {
316         int len = strlen(srt->charp[i]);
317         memcpy(ptr, srt->charp[i], len);
318         ptr += len + 1;
319         ptr[-1] = i+1==srt->noff ? 0 : ';';
320     }
321     return ret;
322 }
bcf_sr_sort_set(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,int min_pos)323 static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
324 {
325     if ( !srt->grp_str2int )
326     {
327         // first time here, initialize
328         if ( !srt->pair )
329         {
330             if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
331             bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
332         }
333         bcf_sr_init_scores(srt);
334         srt->grp_str2int = khash_str2int_init();
335         srt->var_str2int = khash_str2int_init();
336     }
337     int k;
338     khash_t(str2int) *hash;
339     hash = srt->grp_str2int;
340     for (k=0; k < kh_end(hash); k++)
341         if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
342     hash = srt->var_str2int;
343     for (k=0; k < kh_end(hash); k++)
344         if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
345     kh_clear(str2int, srt->grp_str2int);
346     kh_clear(str2int, srt->var_str2int);
347     srt->ngrp = srt->nvar = srt->nvset = 0;
348 
349     grp_t grp;
350     memset(&grp,0,sizeof(grp_t));
351 
352     // group VCFs into groups, each with a unique combination of variants in the duplicate lines
353     int ireader,ivar,irec,igrp,ivset;
354     for (ireader=0; ireader<readers->nreaders; ireader++)
355     {
356         srt->vcf_buf[ireader].nrec = 0;
357 
358         bcf_sr_t *reader = &readers->readers[ireader];
359         int rid   = bcf_hdr_name2id(reader->header, chr);
360         grp.nvar  = 0;
361         hts_expand(int,reader->nbuffer,srt->moff,srt->off);
362         srt->noff  = 0;
363         srt->str.l = 0;
364         for (irec=1; irec<=reader->nbuffer; irec++)
365         {
366             bcf1_t *line = reader->buffer[irec];
367             if ( line->rid!=rid || line->pos!=min_pos ) break;
368 
369             if ( srt->str.l ) kputc(';',&srt->str);
370             srt->off[srt->noff++] = srt->str.l;
371             size_t beg = srt->str.l;
372             for (ivar=1; ivar<line->n_allele; ivar++)
373             {
374                 if ( ivar>1 ) kputc(',',&srt->str);
375                 kputs(line->d.allele[0],&srt->str);
376                 kputc('>',&srt->str);
377                 kputs(line->d.allele[ivar],&srt->str);
378             }
379             if ( line->n_allele==1 )
380             {
381                 kputs(line->d.allele[0],&srt->str);
382                 kputsn(">.",2,&srt->str);
383             }
384 
385             // Create new variant or attach to existing one. But careful, there can be duplicate
386             // records with the same POS,REF,ALT (e.g. in dbSNP-b142)
387             char *var_str = beg + srt->str.s;
388             int ret, var_idx = 0, var_end = srt->str.l;
389             while ( 1 )
390             {
391                 ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
392                 if ( ret==-1 ) break;
393 
394                 var_t *var = &srt->var[ivar];
395                 if ( var->vcf[var->nvcf-1] != ireader ) break;
396 
397                 srt->str.l = var_end;
398                 kputw(var_idx, &srt->str);
399                 var_str = beg + srt->str.s;
400                 var_idx++;
401             }
402             if ( ret==-1 )
403             {
404                 ivar = srt->nvar++;
405                 hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
406                 srt->var[ivar].nvcf = 0;
407                 khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
408                 free(srt->var[ivar].str);   // possible left-over from the previous position
409             }
410             var_t *var = &srt->var[ivar];
411             var->nalt = line->n_allele - 1;
412             var->type = bcf_get_variant_types(line);
413             srt->str.s[var_end] = 0;
414             if ( ret==-1 )
415                 var->str = strdup(var_str);
416 
417             int mvcf = var->mvcf;
418             var->nvcf++;
419             hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
420             if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
421             var->vcf[var->nvcf-1] = ireader;
422             var->rec[var->nvcf-1] = line;
423 
424             grp.nvar++;
425             hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
426             grp.var[grp.nvar-1] = ivar;
427         }
428         char *grp_key = grp_create_key(srt);
429         int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
430         if ( ret==-1 )
431         {
432             igrp = srt->ngrp++;
433             hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
434             free(srt->grp[igrp].var);
435             srt->grp[igrp] = grp;
436             srt->grp[igrp].key = grp_key;
437             khash_str2int_set(srt->grp_str2int, grp_key, igrp);
438             memset(&grp,0,sizeof(grp_t));
439         }
440         else
441             free(grp_key);
442         srt->grp[igrp].nvcf++;
443     }
444     free(grp.var);
445 
446     // initialize bitmask - which groups is the variant present in
447     for (ivar=0; ivar<srt->nvar; ivar++)
448     {
449         srt->var[ivar].mask = kbs_resize(srt->var[ivar].mask, srt->ngrp);
450         kbs_clear(srt->var[ivar].mask);
451     }
452     for (igrp=0; igrp<srt->ngrp; igrp++)
453     {
454         for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
455         {
456             int i = srt->grp[igrp].var[ivar];
457             kbs_insert(srt->var[i].mask, igrp);
458         }
459     }
460 
461     // create the initial list of variant sets
462     for (ivar=0; ivar<srt->nvar; ivar++)
463     {
464         ivset = srt->nvset++;
465         hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);
466 
467         varset_t *vset = &srt->vset[ivset];
468         vset->nvar = 1;
469         hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
470         vset->var[vset->nvar-1] = ivar;
471         var_t *var  = &srt->var[ivar];
472         vset->cnt   = var->nvcf;
473         vset->mask  = kbs_resize(vset->mask, srt->ngrp);
474         kbs_clear(vset->mask);
475         kbs_bitwise_or(vset->mask, var->mask);
476 
477         int type = 0;
478         if ( var->type==VCF_REF ) type |= SR_REF;
479         else
480         {
481             if ( var->type & VCF_SNP ) type |= SR_SNP;
482             if ( var->type & VCF_MNP ) type |= SR_SNP;
483             if ( var->type & VCF_INDEL ) type |= SR_INDEL;
484             if ( var->type & VCF_OTHER ) type |= SR_OTHER;
485         }
486         var->type = type;
487     }
488     // debug_vsets(srt);
489 
490     // initialize the pairing matrix
491     hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
492     hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
493     memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);
494     for (ivset=0; ivset<srt->nvset; ivset++)
495     {
496         varset_t *vset = &srt->vset[ivset];
497         for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
498         srt->cnt[ivset] = vset->cnt;
499     }
500 
501     // pair the lines
502     while ( srt->nvset )
503     {
504         // fprintf(stderr,"\n"); debug_vsets(srt);
505 
506         int imax = 0;
507         for (ivset=1; ivset<srt->nvset; ivset++)
508             if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;
509 
510         int ipair = -1;
511         uint32_t max_score = 0;
512         for (ivset=0; ivset<srt->nvset; ivset++)
513         {
514             if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue;   // cannot be merged
515             uint32_t score = pairing_score(srt, imax, ivset);
516             // fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
517             if ( max_score < score ) { max_score = score; ipair = ivset; }
518         }
519 
520         // merge rows creating a new variant set this way
521         if ( ipair!=-1 && ipair!=imax )
522         {
523             imax = merge_vsets(srt, imax, ipair);
524             continue;
525         }
526 
527         push_vset(srt, imax);
528     }
529 
530     srt->chr = chr;
531     srt->pos = min_pos;
532 }
533 
bcf_sr_sort_next(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,int min_pos)534 int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
535 {
536     int i,j;
537     if ( readers->nreaders == 1 )
538     {
539         srt->nsr = 1;
540         if ( !srt->msr )
541         {
542             srt->vcf_buf = (vcf_buf_t*) calloc(1,sizeof(vcf_buf_t));   // first time here
543             srt->msr = 1;
544         }
545         bcf_sr_t *reader = &readers->readers[0];
546         assert( reader->buffer[1]->pos==min_pos );
547         bcf1_t *tmp = reader->buffer[0];
548         for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
549         reader->buffer[ reader->nbuffer ] = tmp;
550         reader->nbuffer--;
551         readers->has_line[0] = 1;
552         return 1;
553     }
554     if ( srt->nsr != readers->nreaders )
555     {
556         srt->sr = readers;
557         if ( srt->nsr < readers->nreaders )
558         {
559             srt->vcf_buf = (vcf_buf_t*) realloc(srt->vcf_buf,readers->nreaders*sizeof(vcf_buf_t));
560             memset(srt->vcf_buf + srt->nsr, 0, sizeof(vcf_buf_t)*(readers->nreaders - srt->nsr));
561             if ( srt->msr < srt->nsr ) srt->msr = srt->nsr;
562         }
563         srt->nsr = readers->nreaders;
564         srt->chr = NULL;
565     }
566     if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos);
567 
568     if ( !srt->vcf_buf[0].nrec ) return 0;
569 
570     // debug_vbuf(srt);
571 
572     int nret = 0;
573     for (i=0; i<srt->sr->nreaders; i++)
574     {
575         vcf_buf_t *buf = &srt->vcf_buf[i];
576 
577         if ( buf->rec[0] )
578         {
579             bcf_sr_t *reader = &srt->sr->readers[i];
580             for (j=1; j<=reader->nbuffer; j++)
581                 if ( reader->buffer[j] == buf->rec[0] ) break;
582 
583             assert( j<=reader->nbuffer );
584 
585             bcf1_t *tmp = reader->buffer[0];
586             reader->buffer[0] = reader->buffer[j++];
587             for (; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
588             reader->buffer[ reader->nbuffer ] = tmp;
589             reader->nbuffer--;
590 
591             nret++;
592             srt->sr->has_line[i] = 1;
593         }
594         else
595             srt->sr->has_line[i] = 0;
596 
597         buf->nrec--;
598         if ( buf->nrec > 0 )
599             memmove(buf->rec, &buf->rec[1], buf->nrec*sizeof(bcf1_t*));
600     }
601     return nret;
602 }
bcf_sr_sort_remove_reader(bcf_srs_t * readers,sr_sort_t * srt,int i)603 void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i)
604 {
605     free(srt->vcf_buf[i].rec);
606     if ( i+1 < srt->nsr )
607         memmove(&srt->vcf_buf[i], &srt->vcf_buf[i+1], (srt->nsr - i - 1)*sizeof(vcf_buf_t));
608     memset(srt->vcf_buf + srt->nsr - 1, 0, sizeof(vcf_buf_t));
609 }
bcf_sr_sort_init(sr_sort_t * srt)610 sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt)
611 {
612     if ( !srt ) return calloc(1,sizeof(sr_sort_t));
613     memset(srt,0,sizeof(sr_sort_t));
614     return srt;
615 }
bcf_sr_sort_destroy(sr_sort_t * srt)616 void bcf_sr_sort_destroy(sr_sort_t *srt)
617 {
618     if ( srt->var_str2int ) khash_str2int_destroy_free(srt->var_str2int);
619     if ( srt->grp_str2int ) khash_str2int_destroy_free(srt->grp_str2int);
620     int i;
621     for (i=0; i<srt->nsr; i++) free(srt->vcf_buf[i].rec);
622     free(srt->vcf_buf);
623     for (i=0; i<srt->mvar; i++)
624     {
625         free(srt->var[i].str);
626         free(srt->var[i].vcf);
627         free(srt->var[i].rec);
628         kbs_destroy(srt->var[i].mask);
629     }
630     free(srt->var);
631     for (i=0; i<srt->mgrp; i++)
632         free(srt->grp[i].var);
633     free(srt->grp);
634     for (i=0; i<srt->mvset; i++)
635     {
636         kbs_destroy(srt->vset[i].mask);
637         free(srt->vset[i].var);
638     }
639     free(srt->vset);
640     free(srt->str.s);
641     free(srt->off);
642     free(srt->charp);
643     free(srt->cnt);
644     free(srt->pmat);
645     memset(srt,0,sizeof(*srt));
646 }
647 
648