1 /*
2     Copyright (C) 2017 Genome Research Ltd.
3 
4     Author: Petr Danecek <pd3@sanger.ac.uk>
5 
6     Permission is hereby granted, free of charge, to any person obtaining a copy
7     of this software and associated documentation files (the "Software"), to deal
8     in the Software without restriction, including without limitation the rights
9     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10     copies of the Software, and to permit persons to whom the Software is
11     furnished to do so, subject to the following conditions:
12 
13     The above copyright notice and this permission notice shall be included in
14     all copies or substantial portions of the Software.
15 
16     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22     THE SOFTWARE.
23 */
24 
25 #include <config.h>
26 
27 #include <strings.h>
28 
29 #include "bcf_sr_sort.h"
30 #include "htslib/khash_str2int.h"
31 
32 #define SR_REF   1
33 #define SR_SNP   2
34 #define SR_INDEL 4
35 #define SR_OTHER 8
36 #define SR_SCORE(srt,a,b) (srt)->score[((a)<<4)|(b)]
37 
38 // Resize a bit set.
kbs_resize(kbitset_t * bs,size_t ni)39 static inline kbitset_t *kbs_resize(kbitset_t *bs, size_t ni)
40 {
41     if ( !bs ) return kbs_init(ni);
42     size_t n = (ni + KBS_ELTBITS-1) / KBS_ELTBITS;
43     if ( n==bs->n ) return bs;
44 
45     bs = (kbitset_t *) realloc(bs, sizeof(kbitset_t) + n * sizeof(unsigned long));
46     if ( bs==NULL ) return NULL;
47     if ( n > bs->n )
48         memset(bs->b + bs->n, 0, (n - bs->n) * sizeof (unsigned long));
49     bs->n = n;
50     bs->b[n] = ~0UL;
51     return bs;
52 }
53 
54 // Logical AND
kbs_logical_and(kbitset_t * bs1,kbitset_t * bs2)55 static inline int kbs_logical_and(kbitset_t *bs1, kbitset_t *bs2)
56 {
57     // General case, bitsets of unequal size:
58     //  int i, n = bs1->n < bs2->n ? bs1->n : bs2->n;
59     int i, n = bs1->n;
60 
61     for (i=0; i<n; i++) if ( bs1->b[i] & bs2->b[i] ) return 1;
62     return 0;
63 }
64 
65 // Bitwise OR, dst will be modified, src will be left unchanged
kbs_bitwise_or(kbitset_t * dst,kbitset_t * src)66 static inline void kbs_bitwise_or(kbitset_t *dst, kbitset_t *src)
67 {
68     int i;
69     for (i=0; i<dst->n; i++) dst->b[i] |= src->b[i];
70 }
71 
72 
bcf_sr_init_scores(sr_sort_t * srt)73 static void bcf_sr_init_scores(sr_sort_t *srt)
74 {
75     int i,jbit,kbit;
76 
77     // lower number = lower priority, zero means forbidden
78 
79     if ( srt->pair & BCF_SR_PAIR_ANY ) srt->pair |= (BCF_SR_PAIR_SNPS | BCF_SR_PAIR_INDELS | BCF_SR_PAIR_SNP_REF | BCF_SR_PAIR_INDEL_REF);
80     if ( srt->pair & BCF_SR_PAIR_SNPS ) SR_SCORE(srt,SR_SNP,SR_SNP) = 3;
81     if ( srt->pair & BCF_SR_PAIR_INDELS ) SR_SCORE(srt,SR_INDEL,SR_INDEL) = 3;
82     if ( srt->pair & BCF_SR_PAIR_SNP_REF )
83     {
84         SR_SCORE(srt,SR_SNP,SR_REF) = 2;
85         SR_SCORE(srt,SR_REF,SR_SNP) = 2;
86     }
87     if ( srt->pair & BCF_SR_PAIR_INDEL_REF )
88     {
89         SR_SCORE(srt,SR_INDEL,SR_REF) = 2;
90         SR_SCORE(srt,SR_REF,SR_INDEL) = 2;
91     }
92     if ( srt->pair & BCF_SR_PAIR_ANY )
93     {
94         for (i=0; i<256; i++)
95             if ( !srt->score[i] ) srt->score[i] = 1;
96     }
97 
98     // set all combinations
99     for (i=0; i<256; i++)
100     {
101         if ( srt->score[i] ) continue;      // already set
102         int max = 0;
103         for (jbit=0; jbit<4; jbit++)        // high bits
104         {
105             int j = 1<<jbit;
106             if ( !(i & (j<<4)) ) continue;
107             for (kbit=0; kbit<4; kbit++)    // low bits
108             {
109                 int k = 1<<kbit;
110                 if ( !(i & k) ) continue;
111                 if ( max < SR_SCORE(srt,j,k) ) max = SR_SCORE(srt,j,k);
112             }
113         }
114         srt->score[i] = max;
115     }
116 }
multi_is_exact(var_t * avar,var_t * bvar)117 static int multi_is_exact(var_t *avar, var_t *bvar)
118 {
119     if ( avar->nalt != bvar->nalt ) return 0;
120 
121     int alen = strlen(avar->str);
122     int blen = strlen(bvar->str);
123     if ( alen != blen ) return 0;
124 
125     char *abeg = avar->str;
126     while ( *abeg )
127     {
128         char *aend = abeg;
129         while ( *aend && *aend!=',' ) aend++;
130 
131         char *bbeg = bvar->str;
132         while ( *bbeg )
133         {
134             char *bend = bbeg;
135             while ( *bend && *bend!=',' ) bend++;
136             if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) break;
137             bbeg = *bend ? bend+1 : bend;
138         }
139         if ( !*bbeg ) return 0;
140 
141         abeg = *aend ? aend+1 : aend;
142     }
143     return 1;
144 }
multi_is_subset(var_t * avar,var_t * bvar)145 static int multi_is_subset(var_t *avar, var_t *bvar)
146 {
147     char *abeg = avar->str;
148     while ( *abeg )
149     {
150         char *aend = abeg;
151         while ( *aend && *aend!=',' ) aend++;
152 
153         char *bbeg = bvar->str;
154         while ( *bbeg )
155         {
156             char *bend = bbeg;
157             while ( *bend && *bend!=',' ) bend++;
158             if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) return 1;
159             bbeg = *bend ? bend+1 : bend;
160         }
161         abeg = *aend ? aend+1 : aend;
162     }
163     return 0;
164 }
pairing_score(sr_sort_t * srt,int ivset,int jvset)165 int32_t pairing_score(sr_sort_t *srt, int ivset, int jvset)
166 {
167     varset_t *iv = &srt->vset[ivset];
168     varset_t *jv = &srt->vset[jvset];
169 
170     // Restrictive logic: the strictest type from a group is selected,
171     // so that, for example, snp+ref does not lead to the inclusion of an indel
172     int i,j;
173     uint32_t min = UINT32_MAX;
174     for (i=0; i<iv->nvar; i++)
175     {
176         var_t *ivar = &srt->var[iv->var[i]];
177         for (j=0; j<jv->nvar; j++)
178         {
179             var_t *jvar = &srt->var[jv->var[j]];
180             if ( srt->pair & BCF_SR_PAIR_EXACT )
181             {
182                 if ( ivar->type != jvar->type ) continue;
183                 if ( !strcmp(ivar->str,jvar->str) ) return UINT32_MAX;  // exact match, best possibility
184                 if ( multi_is_exact(ivar,jvar) ) return UINT32_MAX; // identical alleles
185                 continue;
186             }
187             if ( ivar->type==jvar->type && !strcmp(ivar->str,jvar->str) ) return UINT32_MAX;  // exact match, best possibility
188             if ( ivar->type & jvar->type && multi_is_subset(ivar,jvar) ) return UINT32_MAX; // one of the alleles is identical
189 
190             uint32_t score = SR_SCORE(srt,ivar->type,jvar->type);
191             if ( !score ) return 0;     // some of the varsets in the two groups are not compatible, will not pair
192             if ( min>score ) min = score;
193         }
194     }
195     if ( srt->pair & BCF_SR_PAIR_EXACT ) return 0;
196 
197     assert( min!=UINT32_MAX );
198 
199     uint32_t cnt = 0;
200     for (i=0; i<iv->nvar; i++) cnt += srt->var[iv->var[i]].nvcf;
201     for (j=0; j<jv->nvar; j++) cnt += srt->var[jv->var[j]].nvcf;
202 
203     return (1<<(28+min)) + cnt;
204 }
remove_vset(sr_sort_t * srt,int jvset)205 void remove_vset(sr_sort_t *srt, int jvset)
206 {
207     if ( jvset+1 < srt->nvset )
208     {
209         varset_t tmp = srt->vset[jvset];
210         memmove(&srt->vset[jvset], &srt->vset[jvset+1], sizeof(varset_t)*(srt->nvset - jvset - 1));
211         srt->vset[srt->nvset-1] = tmp;
212 
213         int *jmat = srt->pmat + jvset*srt->ngrp;
214         memmove(jmat, &jmat[srt->ngrp],sizeof(int)*(srt->nvset - jvset - 1)*srt->ngrp);
215 
216         memmove(&srt->cnt[jvset], &srt->cnt[jvset+1], sizeof(int)*(srt->nvset - jvset - 1));
217     }
218     srt->nvset--;
219 }
merge_vsets(sr_sort_t * srt,int ivset,int jvset)220 int merge_vsets(sr_sort_t *srt, int ivset, int jvset)
221 {
222     int i,j;
223     if ( ivset > jvset ) { i = ivset; ivset = jvset; jvset = i; }
224 
225     varset_t *iv = &srt->vset[ivset];
226     varset_t *jv = &srt->vset[jvset];
227 
228     kbs_bitwise_or(iv->mask,jv->mask);
229 
230     i = iv->nvar;
231     iv->nvar += jv->nvar;
232     hts_expand(int, iv->nvar, iv->mvar, iv->var);
233     for (j=0; j<jv->nvar; j++,i++) iv->var[i] = jv->var[j];
234 
235     int *imat = srt->pmat + ivset*srt->ngrp;
236     int *jmat = srt->pmat + jvset*srt->ngrp;
237     for (i=0; i<srt->ngrp; i++) imat[i] += jmat[i];
238     srt->cnt[ivset] += srt->cnt[jvset];
239 
240     remove_vset(srt, jvset);
241 
242     return ivset;
243 }
push_vset(sr_sort_t * srt,int ivset)244 void push_vset(sr_sort_t *srt, int ivset)
245 {
246     varset_t *iv = &srt->vset[ivset];
247     int i,j;
248     for (i=0; i<srt->sr->nreaders; i++)
249     {
250         vcf_buf_t *buf = &srt->vcf_buf[i];
251         buf->nrec++;
252         hts_expand(bcf1_t*,buf->nrec,buf->mrec,buf->rec);
253         buf->rec[buf->nrec-1] = NULL;
254     }
255     for (i=0; i<iv->nvar; i++)
256     {
257         var_t *var = &srt->var[ iv->var[i] ];
258         for (j=0; j<var->nvcf; j++)
259         {
260             int jvcf = var->vcf[j];
261             vcf_buf_t *buf = &srt->vcf_buf[jvcf];
262             buf->rec[buf->nrec-1] = var->rec[j];
263         }
264     }
265     remove_vset(srt, ivset);
266 }
267 
cmpstringp(const void * p1,const void * p2)268 static int cmpstringp(const void *p1, const void *p2)
269 {
270     return strcmp(* (char * const *) p1, * (char * const *) p2);
271 }
272 
273 #if DEBUG_VSETS
debug_vsets(sr_sort_t * srt)274 void debug_vsets(sr_sort_t *srt)
275 {
276     int i,j,k;
277     for (i=0; i<srt->nvset; i++)
278     {
279         fprintf(stderr,"dbg_vset %d:", i);
280         for (j=0; j<srt->vset[i].mask->n; j++) fprintf(stderr,"%c%lu",j==0?' ':':',srt->vset[i].mask->b[j]);
281         fprintf(stderr,"\t");
282         for (j=0; j<srt->vset[i].nvar; j++)
283         {
284             var_t *var = &srt->var[srt->vset[i].var[j]];
285             fprintf(stderr,"\t%s",var->str);
286             for (k=0; k<var->nvcf; k++)
287                 fprintf(stderr,"%c%d", k==0?':':',',var->vcf[k]);
288         }
289         fprintf(stderr,"\n");
290     }
291 }
292 #endif
293 
294 #if DEBUG_VBUF
debug_vbuf(sr_sort_t * srt)295 void debug_vbuf(sr_sort_t *srt)
296 {
297     int i, j;
298     for (j=0; j<srt->vcf_buf[0].nrec; j++)
299     {
300         fprintf(stderr,"dbg_vbuf %d:\t", j);
301         for (i=0; i<srt->sr->nreaders; i++)
302         {
303             vcf_buf_t *buf = &srt->vcf_buf[i];
304             fprintf(stderr,"\t%d", buf->rec[j] ? buf->rec[j]->pos+1 : 0);
305         }
306         fprintf(stderr,"\n");
307     }
308 }
309 #endif
310 
grp_create_key(sr_sort_t * srt)311 char *grp_create_key(sr_sort_t *srt)
312 {
313     if ( !srt->str.l ) return strdup("");
314     int i;
315     hts_expand(char*,srt->noff,srt->mcharp,srt->charp);
316     for (i=0; i<srt->noff; i++)
317     {
318         srt->charp[i] = srt->str.s + srt->off[i];
319         if ( i>0 ) srt->charp[i][-1] = 0;
320     }
321     qsort(srt->charp, srt->noff, sizeof(*srt->charp), cmpstringp);
322     char *ret = (char*) malloc(srt->str.l + 1), *ptr = ret;
323     for (i=0; i<srt->noff; i++)
324     {
325         int len = strlen(srt->charp[i]);
326         memcpy(ptr, srt->charp[i], len);
327         ptr += len + 1;
328         ptr[-1] = i+1==srt->noff ? 0 : ';';
329     }
330     return ret;
331 }
bcf_sr_sort_set_active(sr_sort_t * srt,int idx)332 int bcf_sr_sort_set_active(sr_sort_t *srt, int idx)
333 {
334     hts_expand(int,idx+1,srt->mactive,srt->active);
335     srt->nactive = 1;
336     srt->active[srt->nactive - 1] = idx;
337     return 0;
338 }
bcf_sr_sort_add_active(sr_sort_t * srt,int idx)339 int bcf_sr_sort_add_active(sr_sort_t *srt, int idx)
340 {
341     hts_expand(int,idx+1,srt->mactive,srt->active);
342     srt->nactive++;
343     srt->active[srt->nactive - 1] = idx;
344     return 0;
345 }
bcf_sr_sort_set(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,int min_pos)346 static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
347 {
348     if ( !srt->grp_str2int )
349     {
350         // first time here, initialize
351         if ( !srt->pair )
352         {
353             if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
354             bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
355         }
356         bcf_sr_init_scores(srt);
357         srt->grp_str2int = khash_str2int_init();
358         srt->var_str2int = khash_str2int_init();
359     }
360     int k;
361     khash_t(str2int) *hash;
362     hash = srt->grp_str2int;
363     for (k=0; k < kh_end(hash); k++)
364         if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
365     hash = srt->var_str2int;
366     for (k=0; k < kh_end(hash); k++)
367         if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
368     kh_clear(str2int, srt->grp_str2int);
369     kh_clear(str2int, srt->var_str2int);
370     srt->ngrp = srt->nvar = srt->nvset = 0;
371 
372     grp_t grp;
373     memset(&grp,0,sizeof(grp_t));
374 
375     // group VCFs into groups, each with a unique combination of variants in the duplicate lines
376     int ireader,ivar,irec,igrp,ivset,iact;
377     for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
378     for (iact=0; iact<srt->nactive; iact++)
379     {
380         ireader = srt->active[iact];
381         bcf_sr_t *reader = &readers->readers[ireader];
382         int rid   = bcf_hdr_name2id(reader->header, chr);
383         grp.nvar  = 0;
384         hts_expand(int,reader->nbuffer,srt->moff,srt->off);
385         srt->noff  = 0;
386         srt->str.l = 0;
387         for (irec=1; irec<=reader->nbuffer; irec++)
388         {
389             bcf1_t *line = reader->buffer[irec];
390             if ( line->rid!=rid || line->pos!=min_pos ) break;
391 
392             if ( srt->str.l ) kputc(';',&srt->str);
393             srt->off[srt->noff++] = srt->str.l;
394             size_t beg = srt->str.l;
395             for (ivar=1; ivar<line->n_allele; ivar++)
396             {
397                 if ( ivar>1 ) kputc(',',&srt->str);
398                 kputs(line->d.allele[0],&srt->str);
399                 kputc('>',&srt->str);
400                 kputs(line->d.allele[ivar],&srt->str);
401             }
402             if ( line->n_allele==1 )
403             {
404                 kputs(line->d.allele[0],&srt->str);
405                 kputsn(">.",2,&srt->str);
406             }
407 
408             // Create new variant or attach to existing one. But careful, there can be duplicate
409             // records with the same POS,REF,ALT (e.g. in dbSNP-b142)
410             char *var_str = beg + srt->str.s;
411             int ret, var_idx = 0, var_end = srt->str.l;
412             while ( 1 )
413             {
414                 ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
415                 if ( ret==-1 ) break;
416 
417                 var_t *var = &srt->var[ivar];
418                 if ( var->vcf[var->nvcf-1] != ireader ) break;
419 
420                 srt->str.l = var_end;
421                 kputw(var_idx, &srt->str);
422                 var_str = beg + srt->str.s;
423                 var_idx++;
424             }
425             if ( ret==-1 )
426             {
427                 ivar = srt->nvar++;
428                 hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
429                 srt->var[ivar].nvcf = 0;
430                 khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
431                 free(srt->var[ivar].str);   // possible left-over from the previous position
432             }
433             var_t *var = &srt->var[ivar];
434             var->nalt = line->n_allele - 1;
435             var->type = bcf_get_variant_types(line);
436             srt->str.s[var_end] = 0;
437             if ( ret==-1 )
438                 var->str = strdup(var_str);
439 
440             int mvcf = var->mvcf;
441             var->nvcf++;
442             hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
443             if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
444             var->vcf[var->nvcf-1] = ireader;
445             var->rec[var->nvcf-1] = line;
446 
447             grp.nvar++;
448             hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
449             grp.var[grp.nvar-1] = ivar;
450         }
451         char *grp_key = grp_create_key(srt);
452         int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
453         if ( ret==-1 )
454         {
455             igrp = srt->ngrp++;
456             hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
457             free(srt->grp[igrp].var);
458             srt->grp[igrp] = grp;
459             srt->grp[igrp].key = grp_key;
460             khash_str2int_set(srt->grp_str2int, grp_key, igrp);
461             memset(&grp,0,sizeof(grp_t));
462         }
463         else
464             free(grp_key);
465         srt->grp[igrp].nvcf++;
466     }
467     free(grp.var);
468 
469     // initialize bitmask - which groups is the variant present in
470     for (ivar=0; ivar<srt->nvar; ivar++)
471     {
472         srt->var[ivar].mask = kbs_resize(srt->var[ivar].mask, srt->ngrp);
473         kbs_clear(srt->var[ivar].mask);
474     }
475     for (igrp=0; igrp<srt->ngrp; igrp++)
476     {
477         for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
478         {
479             int i = srt->grp[igrp].var[ivar];
480             kbs_insert(srt->var[i].mask, igrp);
481         }
482     }
483 
484     // create the initial list of variant sets
485     for (ivar=0; ivar<srt->nvar; ivar++)
486     {
487         ivset = srt->nvset++;
488         hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);
489 
490         varset_t *vset = &srt->vset[ivset];
491         vset->nvar = 1;
492         hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
493         vset->var[vset->nvar-1] = ivar;
494         var_t *var  = &srt->var[ivar];
495         vset->cnt   = var->nvcf;
496         vset->mask  = kbs_resize(vset->mask, srt->ngrp);
497         kbs_clear(vset->mask);
498         kbs_bitwise_or(vset->mask, var->mask);
499 
500         int type = 0;
501         if ( var->type==VCF_REF ) type |= SR_REF;
502         else
503         {
504             if ( var->type & VCF_SNP ) type |= SR_SNP;
505             if ( var->type & VCF_MNP ) type |= SR_SNP;
506             if ( var->type & VCF_INDEL ) type |= SR_INDEL;
507             if ( var->type & VCF_OTHER ) type |= SR_OTHER;
508         }
509         var->type = type;
510     }
511 #if DEBUG_VSETS
512     debug_vsets(srt);
513 #endif
514 
515     // initialize the pairing matrix
516     hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
517     hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
518     memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);
519     for (ivset=0; ivset<srt->nvset; ivset++)
520     {
521         varset_t *vset = &srt->vset[ivset];
522         for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
523         srt->cnt[ivset] = vset->cnt;
524     }
525 
526     // pair the lines
527     while ( srt->nvset )
528     {
529 #if DEBUG_VSETS
530     fprintf(stderr,"\n");
531     debug_vsets(srt);
532 #endif
533 
534         int imax = 0;
535         for (ivset=1; ivset<srt->nvset; ivset++)
536             if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;
537 
538         int ipair = -1;
539         uint32_t max_score = 0;
540         for (ivset=0; ivset<srt->nvset; ivset++)
541         {
542             if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue;   // cannot be merged
543             uint32_t score = pairing_score(srt, imax, ivset);
544             // fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
545             if ( max_score < score ) { max_score = score; ipair = ivset; }
546         }
547 
548         // merge rows creating a new variant set this way
549         if ( ipair!=-1 && ipair!=imax )
550         {
551             imax = merge_vsets(srt, imax, ipair);
552             continue;
553         }
554 
555         push_vset(srt, imax);
556     }
557 
558     srt->chr = chr;
559     srt->pos = min_pos;
560 }
561 
bcf_sr_sort_next(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,int min_pos)562 int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
563 {
564     int i,j;
565     assert( srt->nactive>0 );
566 
567     if ( srt->nsr != readers->nreaders )
568     {
569         srt->sr = readers;
570         if ( srt->nsr < readers->nreaders )
571         {
572             srt->vcf_buf = (vcf_buf_t*) realloc(srt->vcf_buf,readers->nreaders*sizeof(vcf_buf_t));
573             memset(srt->vcf_buf + srt->nsr, 0, sizeof(vcf_buf_t)*(readers->nreaders - srt->nsr));
574             if ( srt->msr < srt->nsr ) srt->msr = srt->nsr;
575         }
576         srt->nsr = readers->nreaders;
577         srt->chr = NULL;
578     }
579     if ( srt->nactive == 1 )
580     {
581         if ( readers->nreaders>1 )
582             memset(readers->has_line, 0, readers->nreaders*sizeof(*readers->has_line));
583         bcf_sr_t *reader = &readers->readers[srt->active[0]];
584         assert( reader->buffer[1]->pos==min_pos );
585         bcf1_t *tmp = reader->buffer[0];
586         for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
587         reader->buffer[ reader->nbuffer ] = tmp;
588         reader->nbuffer--;
589         readers->has_line[srt->active[0]] = 1;
590         return 1;
591     }
592     if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos);
593 
594     if ( !srt->vcf_buf[0].nrec ) return 0;
595 
596 #if DEBUG_VBUF
597     debug_vbuf(srt);
598 #endif
599 
600     int nret = 0;
601     for (i=0; i<srt->sr->nreaders; i++)
602     {
603         vcf_buf_t *buf = &srt->vcf_buf[i];
604 
605         if ( buf->rec[0] )
606         {
607             bcf_sr_t *reader = &srt->sr->readers[i];
608             for (j=1; j<=reader->nbuffer; j++)
609                 if ( reader->buffer[j] == buf->rec[0] ) break;
610 
611             assert( j<=reader->nbuffer );
612 
613             bcf1_t *tmp = reader->buffer[0];
614             reader->buffer[0] = reader->buffer[j++];
615             for (; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
616             reader->buffer[ reader->nbuffer ] = tmp;
617             reader->nbuffer--;
618 
619             nret++;
620             srt->sr->has_line[i] = 1;
621         }
622         else
623             srt->sr->has_line[i] = 0;
624 
625         buf->nrec--;
626         if ( buf->nrec > 0 )
627             memmove(buf->rec, &buf->rec[1], buf->nrec*sizeof(bcf1_t*));
628     }
629     return nret;
630 }
bcf_sr_sort_remove_reader(bcf_srs_t * readers,sr_sort_t * srt,int i)631 void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i)
632 {
633     //vcf_buf is allocated only in bcf_sr_sort_next
634     //So, a call to bcf_sr_add_reader() followed immediately by bcf_sr_remove_reader()
635     //would cause the program to crash in this segment
636     if (srt->vcf_buf)
637     {
638         free(srt->vcf_buf[i].rec);
639         if ( i+1 < srt->nsr )
640             memmove(&srt->vcf_buf[i], &srt->vcf_buf[i+1], (srt->nsr - i - 1)*sizeof(vcf_buf_t));
641         memset(srt->vcf_buf + srt->nsr - 1, 0, sizeof(vcf_buf_t));
642     }
643 }
bcf_sr_sort_init(sr_sort_t * srt)644 sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt)
645 {
646     if ( !srt ) return calloc(1,sizeof(sr_sort_t));
647     memset(srt,0,sizeof(sr_sort_t));
648     return srt;
649 }
bcf_sr_sort_reset(sr_sort_t * srt)650 void bcf_sr_sort_reset(sr_sort_t *srt)
651 {
652     srt->chr = NULL;
653 }
bcf_sr_sort_destroy(sr_sort_t * srt)654 void bcf_sr_sort_destroy(sr_sort_t *srt)
655 {
656     free(srt->active);
657     if ( srt->var_str2int ) khash_str2int_destroy_free(srt->var_str2int);
658     if ( srt->grp_str2int ) khash_str2int_destroy_free(srt->grp_str2int);
659     int i;
660     for (i=0; i<srt->nsr; i++) free(srt->vcf_buf[i].rec);
661     free(srt->vcf_buf);
662     for (i=0; i<srt->mvar; i++)
663     {
664         free(srt->var[i].str);
665         free(srt->var[i].vcf);
666         free(srt->var[i].rec);
667         kbs_destroy(srt->var[i].mask);
668     }
669     free(srt->var);
670     for (i=0; i<srt->mgrp; i++)
671         free(srt->grp[i].var);
672     free(srt->grp);
673     for (i=0; i<srt->mvset; i++)
674     {
675         kbs_destroy(srt->vset[i].mask);
676         free(srt->vset[i].var);
677     }
678     free(srt->vset);
679     free(srt->str.s);
680     free(srt->off);
681     free(srt->charp);
682     free(srt->cnt);
683     free(srt->pmat);
684     memset(srt,0,sizeof(*srt));
685 }
686 
687