1 /*
2     Copyright (C) 2017 Genome Research Ltd.
3 
4     Author: Petr Danecek <pd3@sanger.ac.uk>
5 
6     Permission is hereby granted, free of charge, to any person obtaining a copy
7     of this software and associated documentation files (the "Software"), to deal
8     in the Software without restriction, including without limitation the rights
9     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10     copies of the Software, and to permit persons to whom the Software is
11     furnished to do so, subject to the following conditions:
12 
13     The above copyright notice and this permission notice shall be included in
14     all copies or substantial portions of the Software.
15 
16     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22     THE SOFTWARE.
23 */
24 
25 #include <strings.h>
26 
27 #include "bcf_sr_sort.h"
28 #include "htslib/khash_str2int.h"
29 
30 #define SR_REF   1
31 #define SR_SNP   2
32 #define SR_INDEL 4
33 #define SR_OTHER 8
34 #define SR_SCORE(srt,a,b) (srt)->score[((a)<<4)|(b)]
35 
36 // Resize a bit set.
kbs_resize(kbitset_t * bs,size_t ni)37 static inline kbitset_t *kbs_resize(kbitset_t *bs, size_t ni)
38 {
39     if ( !bs ) return kbs_init(ni);
40     size_t n = (ni + KBS_ELTBITS-1) / KBS_ELTBITS;
41     if ( n==bs->n ) return bs;
42 
43     bs = (kbitset_t *) realloc(bs, sizeof(kbitset_t) + n * sizeof(unsigned long));
44     if ( bs==NULL ) return NULL;
45     if ( n > bs->n )
46         memset(bs->b + bs->n, 0, (n - bs->n) * sizeof (unsigned long));
47     bs->n = n;
48     bs->b[n] = ~0UL;
49     return bs;
50 }
51 
52 // Logical AND
kbs_logical_and(kbitset_t * bs1,kbitset_t * bs2)53 static inline int kbs_logical_and(kbitset_t *bs1, kbitset_t *bs2)
54 {
55     // General case, bitsets of unequal size:
56     //  int i, n = bs1->n < bs2->n ? bs1->n : bs2->n;
57     int i, n = bs1->n;
58 
59     for (i=0; i<n; i++) if ( bs1->b[i] & bs2->b[i] ) return 1;
60     return 0;
61 }
62 
63 // Bitwise OR, dst will be modified, src will be left unchanged
kbs_bitwise_or(kbitset_t * dst,kbitset_t * src)64 static inline void kbs_bitwise_or(kbitset_t *dst, kbitset_t *src)
65 {
66     int i;
67     for (i=0; i<dst->n; i++) dst->b[i] |= src->b[i];
68 }
69 
70 
bcf_sr_init_scores(sr_sort_t * srt)71 static void bcf_sr_init_scores(sr_sort_t *srt)
72 {
73     int i,jbit,kbit;
74 
75     // lower number = lower priority, zero means forbidden
76 
77     if ( srt->pair & BCF_SR_PAIR_ANY ) srt->pair |= (BCF_SR_PAIR_SNPS | BCF_SR_PAIR_INDELS | BCF_SR_PAIR_SNP_REF | BCF_SR_PAIR_INDEL_REF);
78     if ( srt->pair & BCF_SR_PAIR_SNPS ) SR_SCORE(srt,SR_SNP,SR_SNP) = 3;
79     if ( srt->pair & BCF_SR_PAIR_INDELS ) SR_SCORE(srt,SR_INDEL,SR_INDEL) = 3;
80     if ( srt->pair & BCF_SR_PAIR_SNP_REF )
81     {
82         SR_SCORE(srt,SR_SNP,SR_REF) = 2;
83         SR_SCORE(srt,SR_REF,SR_SNP) = 2;
84     }
85     if ( srt->pair & BCF_SR_PAIR_INDEL_REF )
86     {
87         SR_SCORE(srt,SR_INDEL,SR_REF) = 2;
88         SR_SCORE(srt,SR_REF,SR_INDEL) = 2;
89     }
90     if ( srt->pair & BCF_SR_PAIR_ANY )
91     {
92         for (i=0; i<256; i++)
93             if ( !srt->score[i] ) srt->score[i] = 1;
94     }
95 
96     // set all combinations
97     for (i=0; i<256; i++)
98     {
99         if ( srt->score[i] ) continue;      // already set
100         int max = 0;
101         for (jbit=0; jbit<4; jbit++)        // high bits
102         {
103             int j = 1<<jbit;
104             if ( !(i & (j<<4)) ) continue;
105             for (kbit=0; kbit<4; kbit++)    // low bits
106             {
107                 int k = 1<<kbit;
108                 if ( !(i & k) ) continue;
109                 if ( max < SR_SCORE(srt,j,k) ) max = SR_SCORE(srt,j,k);
110             }
111         }
112         srt->score[i] = max;
113     }
114 }
multi_is_exact(var_t * avar,var_t * bvar)115 static int multi_is_exact(var_t *avar, var_t *bvar)
116 {
117     if ( avar->nalt != bvar->nalt ) return 0;
118 
119     int alen = strlen(avar->str);
120     int blen = strlen(bvar->str);
121     if ( alen != blen ) return 0;
122 
123     char *abeg = avar->str;
124     while ( *abeg )
125     {
126         char *aend = abeg;
127         while ( *aend && *aend!=',' ) aend++;
128 
129         char *bbeg = bvar->str;
130         while ( *bbeg )
131         {
132             char *bend = bbeg;
133             while ( *bend && *bend!=',' ) bend++;
134             if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) break;
135             bbeg = *bend ? bend+1 : bend;
136         }
137         if ( !*bbeg ) return 0;
138 
139         abeg = *aend ? aend+1 : aend;
140     }
141     return 1;
142 }
multi_is_subset(var_t * avar,var_t * bvar)143 static int multi_is_subset(var_t *avar, var_t *bvar)
144 {
145     char *abeg = avar->str;
146     while ( *abeg )
147     {
148         char *aend = abeg;
149         while ( *aend && *aend!=',' ) aend++;
150 
151         char *bbeg = bvar->str;
152         while ( *bbeg )
153         {
154             char *bend = bbeg;
155             while ( *bend && *bend!=',' ) bend++;
156             if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) return 1;
157             bbeg = *bend ? bend+1 : bend;
158         }
159         abeg = *aend ? aend+1 : aend;
160     }
161     return 0;
162 }
pairing_score(sr_sort_t * srt,int ivset,int jvset)163 int32_t pairing_score(sr_sort_t *srt, int ivset, int jvset)
164 {
165     varset_t *iv = &srt->vset[ivset];
166     varset_t *jv = &srt->vset[jvset];
167 
168     // Restrictive logic: the strictest type from a group is selected,
169     // so that, for example, snp+ref does not lead to the inclusion of an indel
170     int i,j;
171     uint32_t min = UINT32_MAX;
172     for (i=0; i<iv->nvar; i++)
173     {
174         var_t *ivar = &srt->var[iv->var[i]];
175         for (j=0; j<jv->nvar; j++)
176         {
177             var_t *jvar = &srt->var[jv->var[j]];
178             if ( srt->pair & BCF_SR_PAIR_EXACT )
179             {
180                 if ( ivar->type != jvar->type ) continue;
181                 if ( !strcmp(ivar->str,jvar->str) ) return UINT32_MAX;  // exact match, best possibility
182                 if ( multi_is_exact(ivar,jvar) ) return UINT32_MAX; // identical alleles
183                 continue;
184             }
185             if ( ivar->type==jvar->type && !strcmp(ivar->str,jvar->str) ) return UINT32_MAX;  // exact match, best possibility
186             if ( ivar->type & jvar->type && multi_is_subset(ivar,jvar) ) return UINT32_MAX; // one of the alleles is identical
187 
188             uint32_t score = SR_SCORE(srt,ivar->type,jvar->type);
189             if ( !score ) return 0;     // some of the varsets in the two groups are not compatible, will not pair
190             if ( min>score ) min = score;
191         }
192     }
193     if ( srt->pair & BCF_SR_PAIR_EXACT ) return 0;
194 
195     assert( min!=UINT32_MAX );
196 
197     uint32_t cnt = 0;
198     for (i=0; i<iv->nvar; i++) cnt += srt->var[iv->var[i]].nvcf;
199     for (j=0; j<jv->nvar; j++) cnt += srt->var[jv->var[j]].nvcf;
200 
201     return (1<<(28+min)) + cnt;
202 }
remove_vset(sr_sort_t * srt,int jvset)203 void remove_vset(sr_sort_t *srt, int jvset)
204 {
205     if ( jvset+1 < srt->nvset )
206     {
207         varset_t tmp = srt->vset[jvset];
208         memmove(&srt->vset[jvset], &srt->vset[jvset+1], sizeof(varset_t)*(srt->nvset - jvset - 1));
209         srt->vset[srt->nvset-1] = tmp;
210 
211         int *jmat = srt->pmat + jvset*srt->ngrp;
212         memmove(jmat, &jmat[srt->ngrp],sizeof(int)*(srt->nvset - jvset - 1)*srt->ngrp);
213 
214         memmove(&srt->cnt[jvset], &srt->cnt[jvset+1], sizeof(int)*(srt->nvset - jvset - 1));
215     }
216     srt->nvset--;
217 }
merge_vsets(sr_sort_t * srt,int ivset,int jvset)218 int merge_vsets(sr_sort_t *srt, int ivset, int jvset)
219 {
220     int i,j;
221     if ( ivset > jvset ) { i = ivset; ivset = jvset; jvset = i; }
222 
223     varset_t *iv = &srt->vset[ivset];
224     varset_t *jv = &srt->vset[jvset];
225 
226     kbs_bitwise_or(iv->mask,jv->mask);
227 
228     i = iv->nvar;
229     iv->nvar += jv->nvar;
230     hts_expand(int, iv->nvar, iv->mvar, iv->var);
231     for (j=0; j<jv->nvar; j++,i++) iv->var[i] = jv->var[j];
232 
233     int *imat = srt->pmat + ivset*srt->ngrp;
234     int *jmat = srt->pmat + jvset*srt->ngrp;
235     for (i=0; i<srt->ngrp; i++) imat[i] += jmat[i];
236     srt->cnt[ivset] += srt->cnt[jvset];
237 
238     remove_vset(srt, jvset);
239 
240     return ivset;
241 }
push_vset(sr_sort_t * srt,int ivset)242 void push_vset(sr_sort_t *srt, int ivset)
243 {
244     varset_t *iv = &srt->vset[ivset];
245     int i,j;
246     for (i=0; i<srt->sr->nreaders; i++)
247     {
248         vcf_buf_t *buf = &srt->vcf_buf[i];
249         buf->nrec++;
250         hts_expand(bcf1_t*,buf->nrec,buf->mrec,buf->rec);
251         buf->rec[buf->nrec-1] = NULL;
252     }
253     for (i=0; i<iv->nvar; i++)
254     {
255         var_t *var = &srt->var[ iv->var[i] ];
256         for (j=0; j<var->nvcf; j++)
257         {
258             int jvcf = var->vcf[j];
259             vcf_buf_t *buf = &srt->vcf_buf[jvcf];
260             buf->rec[buf->nrec-1] = var->rec[j];
261         }
262     }
263     remove_vset(srt, ivset);
264 }
265 
cmpstringp(const void * p1,const void * p2)266 static int cmpstringp(const void *p1, const void *p2)
267 {
268     return strcmp(* (char * const *) p1, * (char * const *) p2);
269 }
270 
271 #if DEBUG_VSETS
debug_vsets(sr_sort_t * srt)272 void debug_vsets(sr_sort_t *srt)
273 {
274     int i,j,k;
275     for (i=0; i<srt->nvset; i++)
276     {
277         fprintf(stderr,"dbg_vset %d:", i);
278         for (j=0; j<srt->vset[i].mask->n; j++) fprintf(stderr,"%c%lu",j==0?' ':':',srt->vset[i].mask->b[j]);
279         fprintf(stderr,"\t");
280         for (j=0; j<srt->vset[i].nvar; j++)
281         {
282             var_t *var = &srt->var[srt->vset[i].var[j]];
283             fprintf(stderr,"\t%s",var->str);
284             for (k=0; k<var->nvcf; k++)
285                 fprintf(stderr,"%c%d", k==0?':':',',var->vcf[k]);
286         }
287         fprintf(stderr,"\n");
288     }
289 }
290 #endif
291 
292 #if DEBUG_VBUF
debug_vbuf(sr_sort_t * srt)293 void debug_vbuf(sr_sort_t *srt)
294 {
295     int i, j;
296     for (j=0; j<srt->vcf_buf[0].nrec; j++)
297     {
298         fprintf(stderr,"dbg_vbuf %d:\t", j);
299         for (i=0; i<srt->sr->nreaders; i++)
300         {
301             vcf_buf_t *buf = &srt->vcf_buf[i];
302             fprintf(stderr,"\t%d", buf->rec[j] ? buf->rec[j]->pos+1 : 0);
303         }
304         fprintf(stderr,"\n");
305     }
306 }
307 #endif
308 
grp_create_key(sr_sort_t * srt)309 char *grp_create_key(sr_sort_t *srt)
310 {
311     if ( !srt->str.l ) return strdup("");
312     int i;
313     hts_expand(char*,srt->noff,srt->mcharp,srt->charp);
314     for (i=0; i<srt->noff; i++)
315     {
316         srt->charp[i] = srt->str.s + srt->off[i];
317         if ( i>0 ) srt->charp[i][-1] = 0;
318     }
319     qsort(srt->charp, srt->noff, sizeof(*srt->charp), cmpstringp);
320     char *ret = (char*) malloc(srt->str.l + 1), *ptr = ret;
321     for (i=0; i<srt->noff; i++)
322     {
323         int len = strlen(srt->charp[i]);
324         memcpy(ptr, srt->charp[i], len);
325         ptr += len + 1;
326         ptr[-1] = i+1==srt->noff ? 0 : ';';
327     }
328     return ret;
329 }
bcf_sr_sort_set_active(sr_sort_t * srt,int idx)330 int bcf_sr_sort_set_active(sr_sort_t *srt, int idx)
331 {
332     hts_expand(int,idx+1,srt->mactive,srt->active);
333     srt->nactive = 1;
334     srt->active[srt->nactive - 1] = idx;
335     return 0;
336 }
bcf_sr_sort_add_active(sr_sort_t * srt,int idx)337 int bcf_sr_sort_add_active(sr_sort_t *srt, int idx)
338 {
339     hts_expand(int,idx+1,srt->mactive,srt->active);
340     srt->nactive++;
341     srt->active[srt->nactive - 1] = idx;
342     return 0;
343 }
bcf_sr_sort_set(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,int min_pos)344 static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
345 {
346     if ( !srt->grp_str2int )
347     {
348         // first time here, initialize
349         if ( !srt->pair )
350         {
351             if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
352             bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
353         }
354         bcf_sr_init_scores(srt);
355         srt->grp_str2int = khash_str2int_init();
356         srt->var_str2int = khash_str2int_init();
357     }
358     int k;
359     khash_t(str2int) *hash;
360     hash = srt->grp_str2int;
361     for (k=0; k < kh_end(hash); k++)
362         if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
363     hash = srt->var_str2int;
364     for (k=0; k < kh_end(hash); k++)
365         if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
366     kh_clear(str2int, srt->grp_str2int);
367     kh_clear(str2int, srt->var_str2int);
368     srt->ngrp = srt->nvar = srt->nvset = 0;
369 
370     grp_t grp;
371     memset(&grp,0,sizeof(grp_t));
372 
373     // group VCFs into groups, each with a unique combination of variants in the duplicate lines
374     int ireader,ivar,irec,igrp,ivset,iact;
375     for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
376     for (iact=0; iact<srt->nactive; iact++)
377     {
378         ireader = srt->active[iact];
379         bcf_sr_t *reader = &readers->readers[ireader];
380         int rid   = bcf_hdr_name2id(reader->header, chr);
381         grp.nvar  = 0;
382         hts_expand(int,reader->nbuffer,srt->moff,srt->off);
383         srt->noff  = 0;
384         srt->str.l = 0;
385         for (irec=1; irec<=reader->nbuffer; irec++)
386         {
387             bcf1_t *line = reader->buffer[irec];
388             if ( line->rid!=rid || line->pos!=min_pos ) break;
389 
390             if ( srt->str.l ) kputc(';',&srt->str);
391             srt->off[srt->noff++] = srt->str.l;
392             size_t beg = srt->str.l;
393             for (ivar=1; ivar<line->n_allele; ivar++)
394             {
395                 if ( ivar>1 ) kputc(',',&srt->str);
396                 kputs(line->d.allele[0],&srt->str);
397                 kputc('>',&srt->str);
398                 kputs(line->d.allele[ivar],&srt->str);
399             }
400             if ( line->n_allele==1 )
401             {
402                 kputs(line->d.allele[0],&srt->str);
403                 kputsn(">.",2,&srt->str);
404             }
405 
406             // Create new variant or attach to existing one. But careful, there can be duplicate
407             // records with the same POS,REF,ALT (e.g. in dbSNP-b142)
408             char *var_str = beg + srt->str.s;
409             int ret, var_idx = 0, var_end = srt->str.l;
410             while ( 1 )
411             {
412                 ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
413                 if ( ret==-1 ) break;
414 
415                 var_t *var = &srt->var[ivar];
416                 if ( var->vcf[var->nvcf-1] != ireader ) break;
417 
418                 srt->str.l = var_end;
419                 kputw(var_idx, &srt->str);
420                 var_str = beg + srt->str.s;
421                 var_idx++;
422             }
423             if ( ret==-1 )
424             {
425                 ivar = srt->nvar++;
426                 hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
427                 srt->var[ivar].nvcf = 0;
428                 khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
429                 free(srt->var[ivar].str);   // possible left-over from the previous position
430             }
431             var_t *var = &srt->var[ivar];
432             var->nalt = line->n_allele - 1;
433             var->type = bcf_get_variant_types(line);
434             srt->str.s[var_end] = 0;
435             if ( ret==-1 )
436                 var->str = strdup(var_str);
437 
438             int mvcf = var->mvcf;
439             var->nvcf++;
440             hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
441             if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
442             var->vcf[var->nvcf-1] = ireader;
443             var->rec[var->nvcf-1] = line;
444 
445             grp.nvar++;
446             hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
447             grp.var[grp.nvar-1] = ivar;
448         }
449         char *grp_key = grp_create_key(srt);
450         int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
451         if ( ret==-1 )
452         {
453             igrp = srt->ngrp++;
454             hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
455             free(srt->grp[igrp].var);
456             srt->grp[igrp] = grp;
457             srt->grp[igrp].key = grp_key;
458             khash_str2int_set(srt->grp_str2int, grp_key, igrp);
459             memset(&grp,0,sizeof(grp_t));
460         }
461         else
462             free(grp_key);
463         srt->grp[igrp].nvcf++;
464     }
465     free(grp.var);
466 
467     // initialize bitmask - which groups is the variant present in
468     for (ivar=0; ivar<srt->nvar; ivar++)
469     {
470         srt->var[ivar].mask = kbs_resize(srt->var[ivar].mask, srt->ngrp);
471         kbs_clear(srt->var[ivar].mask);
472     }
473     for (igrp=0; igrp<srt->ngrp; igrp++)
474     {
475         for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
476         {
477             int i = srt->grp[igrp].var[ivar];
478             kbs_insert(srt->var[i].mask, igrp);
479         }
480     }
481 
482     // create the initial list of variant sets
483     for (ivar=0; ivar<srt->nvar; ivar++)
484     {
485         ivset = srt->nvset++;
486         hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);
487 
488         varset_t *vset = &srt->vset[ivset];
489         vset->nvar = 1;
490         hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
491         vset->var[vset->nvar-1] = ivar;
492         var_t *var  = &srt->var[ivar];
493         vset->cnt   = var->nvcf;
494         vset->mask  = kbs_resize(vset->mask, srt->ngrp);
495         kbs_clear(vset->mask);
496         kbs_bitwise_or(vset->mask, var->mask);
497 
498         int type = 0;
499         if ( var->type==VCF_REF ) type |= SR_REF;
500         else
501         {
502             if ( var->type & VCF_SNP ) type |= SR_SNP;
503             if ( var->type & VCF_MNP ) type |= SR_SNP;
504             if ( var->type & VCF_INDEL ) type |= SR_INDEL;
505             if ( var->type & VCF_OTHER ) type |= SR_OTHER;
506         }
507         var->type = type;
508     }
509 #if DEBUG_VSETS
510     debug_vsets(srt);
511 #endif
512 
513     // initialize the pairing matrix
514     hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
515     hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
516     memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);
517     for (ivset=0; ivset<srt->nvset; ivset++)
518     {
519         varset_t *vset = &srt->vset[ivset];
520         for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
521         srt->cnt[ivset] = vset->cnt;
522     }
523 
524     // pair the lines
525     while ( srt->nvset )
526     {
527 #if DEBUG_VSETS
528     fprintf(stderr,"\n");
529     debug_vsets(srt);
530 #endif
531 
532         int imax = 0;
533         for (ivset=1; ivset<srt->nvset; ivset++)
534             if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;
535 
536         int ipair = -1;
537         uint32_t max_score = 0;
538         for (ivset=0; ivset<srt->nvset; ivset++)
539         {
540             if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue;   // cannot be merged
541             uint32_t score = pairing_score(srt, imax, ivset);
542             // fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
543             if ( max_score < score ) { max_score = score; ipair = ivset; }
544         }
545 
546         // merge rows creating a new variant set this way
547         if ( ipair!=-1 && ipair!=imax )
548         {
549             imax = merge_vsets(srt, imax, ipair);
550             continue;
551         }
552 
553         push_vset(srt, imax);
554     }
555 
556     srt->chr = chr;
557     srt->pos = min_pos;
558 }
559 
bcf_sr_sort_next(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,int min_pos)560 int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
561 {
562     int i,j;
563     assert( srt->nactive>0 );
564 
565     if ( srt->nsr != readers->nreaders )
566     {
567         srt->sr = readers;
568         if ( srt->nsr < readers->nreaders )
569         {
570             srt->vcf_buf = (vcf_buf_t*) realloc(srt->vcf_buf,readers->nreaders*sizeof(vcf_buf_t));
571             memset(srt->vcf_buf + srt->nsr, 0, sizeof(vcf_buf_t)*(readers->nreaders - srt->nsr));
572             if ( srt->msr < srt->nsr ) srt->msr = srt->nsr;
573         }
574         srt->nsr = readers->nreaders;
575         srt->chr = NULL;
576     }
577     if ( srt->nactive == 1 )
578     {
579         if ( readers->nreaders>1 )
580             memset(readers->has_line, 0, readers->nreaders*sizeof(*readers->has_line));
581         bcf_sr_t *reader = &readers->readers[srt->active[0]];
582         assert( reader->buffer[1]->pos==min_pos );
583         bcf1_t *tmp = reader->buffer[0];
584         for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
585         reader->buffer[ reader->nbuffer ] = tmp;
586         reader->nbuffer--;
587         readers->has_line[srt->active[0]] = 1;
588         return 1;
589     }
590     if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos);
591 
592     if ( !srt->vcf_buf[0].nrec ) return 0;
593 
594 #if DEBUG_VBUF
595     debug_vbuf(srt);
596 #endif
597 
598     int nret = 0;
599     for (i=0; i<srt->sr->nreaders; i++)
600     {
601         vcf_buf_t *buf = &srt->vcf_buf[i];
602 
603         if ( buf->rec[0] )
604         {
605             bcf_sr_t *reader = &srt->sr->readers[i];
606             for (j=1; j<=reader->nbuffer; j++)
607                 if ( reader->buffer[j] == buf->rec[0] ) break;
608 
609             assert( j<=reader->nbuffer );
610 
611             bcf1_t *tmp = reader->buffer[0];
612             reader->buffer[0] = reader->buffer[j++];
613             for (; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
614             reader->buffer[ reader->nbuffer ] = tmp;
615             reader->nbuffer--;
616 
617             nret++;
618             srt->sr->has_line[i] = 1;
619         }
620         else
621             srt->sr->has_line[i] = 0;
622 
623         buf->nrec--;
624         if ( buf->nrec > 0 )
625             memmove(buf->rec, &buf->rec[1], buf->nrec*sizeof(bcf1_t*));
626     }
627     return nret;
628 }
bcf_sr_sort_remove_reader(bcf_srs_t * readers,sr_sort_t * srt,int i)629 void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i)
630 {
631     free(srt->vcf_buf[i].rec);
632     if ( i+1 < srt->nsr )
633         memmove(&srt->vcf_buf[i], &srt->vcf_buf[i+1], (srt->nsr - i - 1)*sizeof(vcf_buf_t));
634     memset(srt->vcf_buf + srt->nsr - 1, 0, sizeof(vcf_buf_t));
635 }
bcf_sr_sort_init(sr_sort_t * srt)636 sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt)
637 {
638     if ( !srt ) return calloc(1,sizeof(sr_sort_t));
639     memset(srt,0,sizeof(sr_sort_t));
640     return srt;
641 }
bcf_sr_sort_destroy(sr_sort_t * srt)642 void bcf_sr_sort_destroy(sr_sort_t *srt)
643 {
644     free(srt->active);
645     if ( srt->var_str2int ) khash_str2int_destroy_free(srt->var_str2int);
646     if ( srt->grp_str2int ) khash_str2int_destroy_free(srt->grp_str2int);
647     int i;
648     for (i=0; i<srt->nsr; i++) free(srt->vcf_buf[i].rec);
649     free(srt->vcf_buf);
650     for (i=0; i<srt->mvar; i++)
651     {
652         free(srt->var[i].str);
653         free(srt->var[i].vcf);
654         free(srt->var[i].rec);
655         kbs_destroy(srt->var[i].mask);
656     }
657     free(srt->var);
658     for (i=0; i<srt->mgrp; i++)
659         free(srt->grp[i].var);
660     free(srt->grp);
661     for (i=0; i<srt->mvset; i++)
662     {
663         kbs_destroy(srt->vset[i].mask);
664         free(srt->vset[i].var);
665     }
666     free(srt->vset);
667     free(srt->str.s);
668     free(srt->off);
669     free(srt->charp);
670     free(srt->cnt);
671     free(srt->pmat);
672     memset(srt,0,sizeof(*srt));
673 }
674 
675