1 /*
2     Copyright (C) 2017-2021 Genome Research Ltd.
3 
4     Author: Petr Danecek <pd3@sanger.ac.uk>
5 
6     Permission is hereby granted, free of charge, to any person obtaining a copy
7     of this software and associated documentation files (the "Software"), to deal
8     in the Software without restriction, including without limitation the rights
9     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10     copies of the Software, and to permit persons to whom the Software is
11     furnished to do so, subject to the following conditions:
12 
13     The above copyright notice and this permission notice shall be included in
14     all copies or substantial portions of the Software.
15 
16     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22     THE SOFTWARE.
23 */
24 
25 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
26 #include <config.h>
27 
28 #include <assert.h>
29 #include <strings.h>
30 
31 #include "bcf_sr_sort.h"
32 #include "htslib/khash_str2int.h"
33 #include "htslib/kbitset.h"
34 
35 #define SR_REF   1
36 #define SR_SNP   2
37 #define SR_INDEL 4
38 #define SR_OTHER 8
39 #define SR_SCORE(srt,a,b) (srt)->score[((a)<<4)|(b)]
40 
41 // Logical AND
kbs_logical_and(kbitset_t * bs1,kbitset_t * bs2)42 static inline int kbs_logical_and(kbitset_t *bs1, kbitset_t *bs2)
43 {
44     // General case, bitsets of unequal size:
45     //  int i, n = bs1->n < bs2->n ? bs1->n : bs2->n;
46     int i, n = bs1->n;
47 
48     for (i=0; i<n; i++) if ( bs1->b[i] & bs2->b[i] ) return 1;
49     return 0;
50 }
51 
52 // Bitwise OR, dst will be modified, src will be left unchanged
kbs_bitwise_or(kbitset_t * dst,kbitset_t * src)53 static inline void kbs_bitwise_or(kbitset_t *dst, kbitset_t *src)
54 {
55     int i;
56     for (i=0; i<dst->n; i++) dst->b[i] |= src->b[i];
57 }
58 
59 
bcf_sr_init_scores(sr_sort_t * srt)60 static void bcf_sr_init_scores(sr_sort_t *srt)
61 {
62     int i,jbit,kbit;
63 
64     // lower number = lower priority, zero means forbidden
65 
66     if ( srt->pair & BCF_SR_PAIR_ANY ) srt->pair |= (BCF_SR_PAIR_SNPS | BCF_SR_PAIR_INDELS | BCF_SR_PAIR_SNP_REF | BCF_SR_PAIR_INDEL_REF);
67     if ( srt->pair & BCF_SR_PAIR_SNPS ) SR_SCORE(srt,SR_SNP,SR_SNP) = 3;
68     if ( srt->pair & BCF_SR_PAIR_INDELS ) SR_SCORE(srt,SR_INDEL,SR_INDEL) = 3;
69     if ( srt->pair & BCF_SR_PAIR_SNP_REF )
70     {
71         SR_SCORE(srt,SR_SNP,SR_REF) = 2;
72         SR_SCORE(srt,SR_REF,SR_SNP) = 2;
73     }
74     if ( srt->pair & BCF_SR_PAIR_INDEL_REF )
75     {
76         SR_SCORE(srt,SR_INDEL,SR_REF) = 2;
77         SR_SCORE(srt,SR_REF,SR_INDEL) = 2;
78     }
79     if ( srt->pair & BCF_SR_PAIR_ANY )
80     {
81         for (i=0; i<256; i++)
82             if ( !srt->score[i] ) srt->score[i] = 1;
83     }
84 
85     // set all combinations
86     for (i=0; i<256; i++)
87     {
88         if ( srt->score[i] ) continue;      // already set
89         int max = 0;
90         for (jbit=0; jbit<4; jbit++)        // high bits
91         {
92             int j = 1<<jbit;
93             if ( !(i & (j<<4)) ) continue;
94             for (kbit=0; kbit<4; kbit++)    // low bits
95             {
96                 int k = 1<<kbit;
97                 if ( !(i & k) ) continue;
98                 if ( max < SR_SCORE(srt,j,k) ) max = SR_SCORE(srt,j,k);
99             }
100         }
101         srt->score[i] = max;
102     }
103 }
multi_is_exact(var_t * avar,var_t * bvar)104 static int multi_is_exact(var_t *avar, var_t *bvar)
105 {
106     if ( avar->nalt != bvar->nalt ) return 0;
107 
108     int alen = strlen(avar->str);
109     int blen = strlen(bvar->str);
110     if ( alen != blen ) return 0;
111 
112     char *abeg = avar->str;
113     while ( *abeg )
114     {
115         char *aend = abeg;
116         while ( *aend && *aend!=',' ) aend++;
117 
118         char *bbeg = bvar->str;
119         while ( *bbeg )
120         {
121             char *bend = bbeg;
122             while ( *bend && *bend!=',' ) bend++;
123             if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) break;
124             bbeg = *bend ? bend+1 : bend;
125         }
126         if ( !*bbeg ) return 0;
127 
128         abeg = *aend ? aend+1 : aend;
129     }
130     return 1;
131 }
multi_is_subset(var_t * avar,var_t * bvar)132 static int multi_is_subset(var_t *avar, var_t *bvar)
133 {
134     char *abeg = avar->str;
135     while ( *abeg )
136     {
137         char *aend = abeg;
138         while ( *aend && *aend!=',' ) aend++;
139 
140         char *bbeg = bvar->str;
141         while ( *bbeg )
142         {
143             char *bend = bbeg;
144             while ( *bend && *bend!=',' ) bend++;
145             if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) return 1;
146             bbeg = *bend ? bend+1 : bend;
147         }
148         abeg = *aend ? aend+1 : aend;
149     }
150     return 0;
151 }
pairing_score(sr_sort_t * srt,int ivset,int jvset)152 static uint32_t pairing_score(sr_sort_t *srt, int ivset, int jvset)
153 {
154     varset_t *iv = &srt->vset[ivset];
155     varset_t *jv = &srt->vset[jvset];
156 
157     // Restrictive logic: the strictest type from a group is selected,
158     // so that, for example, snp+ref does not lead to the inclusion of an indel
159     int i,j;
160     uint32_t min = UINT32_MAX;
161     for (i=0; i<iv->nvar; i++)
162     {
163         var_t *ivar = &srt->var[iv->var[i]];
164         for (j=0; j<jv->nvar; j++)
165         {
166             var_t *jvar = &srt->var[jv->var[j]];
167             if ( srt->pair & BCF_SR_PAIR_EXACT )
168             {
169                 if ( ivar->type != jvar->type ) continue;
170                 if ( !strcmp(ivar->str,jvar->str) ) return UINT32_MAX;  // exact match, best possibility
171                 if ( multi_is_exact(ivar,jvar) ) return UINT32_MAX; // identical alleles
172                 continue;
173             }
174             if ( ivar->type==jvar->type && !strcmp(ivar->str,jvar->str) ) return UINT32_MAX;  // exact match, best possibility
175             if ( ivar->type & jvar->type && multi_is_subset(ivar,jvar) ) return UINT32_MAX; // one of the alleles is identical
176 
177             uint32_t score = SR_SCORE(srt,ivar->type,jvar->type);
178             if ( !score ) return 0;     // some of the varsets in the two groups are not compatible, will not pair
179             if ( min>score ) min = score;
180         }
181     }
182     if ( srt->pair & BCF_SR_PAIR_EXACT ) return 0;
183 
184     assert( min!=UINT32_MAX );
185 
186     uint32_t cnt = 0;
187     for (i=0; i<iv->nvar; i++) cnt += srt->var[iv->var[i]].nvcf;
188     for (j=0; j<jv->nvar; j++) cnt += srt->var[jv->var[j]].nvcf;
189 
190     return (1u<<(28+min)) + cnt;
191 }
remove_vset(sr_sort_t * srt,int jvset)192 static void remove_vset(sr_sort_t *srt, int jvset)
193 {
194     if ( jvset+1 < srt->nvset )
195     {
196         varset_t tmp = srt->vset[jvset];
197         memmove(&srt->vset[jvset], &srt->vset[jvset+1], sizeof(varset_t)*(srt->nvset - jvset - 1));
198         srt->vset[srt->nvset-1] = tmp;
199 
200         int *jmat = srt->pmat + jvset*srt->ngrp;
201         memmove(jmat, &jmat[srt->ngrp],sizeof(int)*(srt->nvset - jvset - 1)*srt->ngrp);
202 
203         memmove(&srt->cnt[jvset], &srt->cnt[jvset+1], sizeof(int)*(srt->nvset - jvset - 1));
204     }
205     srt->nvset--;
206 }
merge_vsets(sr_sort_t * srt,int ivset,int jvset)207 static int merge_vsets(sr_sort_t *srt, int ivset, int jvset)
208 {
209     int i,j;
210     if ( ivset > jvset ) { i = ivset; ivset = jvset; jvset = i; }
211 
212     varset_t *iv = &srt->vset[ivset];
213     varset_t *jv = &srt->vset[jvset];
214 
215     kbs_bitwise_or(iv->mask,jv->mask);
216 
217     i = iv->nvar;
218     iv->nvar += jv->nvar;
219     hts_expand(int, iv->nvar, iv->mvar, iv->var);
220     for (j=0; j<jv->nvar; j++,i++) iv->var[i] = jv->var[j];
221 
222     int *imat = srt->pmat + ivset*srt->ngrp;
223     int *jmat = srt->pmat + jvset*srt->ngrp;
224     for (i=0; i<srt->ngrp; i++) imat[i] += jmat[i];
225     srt->cnt[ivset] += srt->cnt[jvset];
226 
227     remove_vset(srt, jvset);
228 
229     return ivset;
230 }
231 
push_vset(sr_sort_t * srt,int ivset)232 static int push_vset(sr_sort_t *srt, int ivset)
233 {
234     varset_t *iv = &srt->vset[ivset];
235     int i,j;
236     for (i=0; i<srt->sr->nreaders; i++)
237     {
238         vcf_buf_t *buf = &srt->vcf_buf[i];
239         buf->nrec++;
240         hts_expand(bcf1_t*,buf->nrec,buf->mrec,buf->rec);
241         buf->rec[buf->nrec-1] = NULL;
242     }
243     for (i=0; i<iv->nvar; i++)
244     {
245         var_t *var = &srt->var[ iv->var[i] ];
246         for (j=0; j<var->nvcf; j++)
247         {
248             int jvcf = var->vcf[j];
249             vcf_buf_t *buf = &srt->vcf_buf[jvcf];
250             buf->rec[buf->nrec-1] = var->rec[j];
251         }
252     }
253     remove_vset(srt, ivset);
254     return 0; // FIXME: check for errs in this function
255 }
256 
cmpstringp(const void * p1,const void * p2)257 static int cmpstringp(const void *p1, const void *p2)
258 {
259     return strcmp(* (char * const *) p1, * (char * const *) p2);
260 }
261 
262 #define DEBUG_VSETS 0
263 #if DEBUG_VSETS
debug_vsets(sr_sort_t * srt)264 void debug_vsets(sr_sort_t *srt)
265 {
266     int i,j,k;
267     for (i=0; i<srt->nvset; i++)
268     {
269         fprintf(stderr,"dbg_vset %d:", i);
270         for (j=0; j<srt->vset[i].mask->n; j++) fprintf(stderr,"%c%lu",j==0?' ':':',srt->vset[i].mask->b[j]);
271         fprintf(stderr,"\t");
272         for (j=0; j<srt->vset[i].nvar; j++)
273         {
274             var_t *var = &srt->var[srt->vset[i].var[j]];
275             fprintf(stderr,"\t%s",var->str);
276             for (k=0; k<var->nvcf; k++)
277                 fprintf(stderr,"%c%d", k==0?':':',',var->vcf[k]);
278         }
279         fprintf(stderr,"\n");
280     }
281 }
282 #endif
283 
284 #define DEBUG_VBUF 0
285 #if DEBUG_VBUF
debug_vbuf(sr_sort_t * srt)286 void debug_vbuf(sr_sort_t *srt)
287 {
288     int i, j;
289     for (j=0; j<srt->vcf_buf[0].nrec; j++)
290     {
291         fprintf(stderr,"dbg_vbuf %d:\t", j);
292         for (i=0; i<srt->sr->nreaders; i++)
293         {
294             vcf_buf_t *buf = &srt->vcf_buf[i];
295             fprintf(stderr,"\t%"PRIhts_pos, buf->rec[j] ? buf->rec[j]->pos+1 : 0);
296         }
297         fprintf(stderr,"\n");
298     }
299 }
300 #endif
301 
grp_create_key(sr_sort_t * srt)302 static char *grp_create_key(sr_sort_t *srt)
303 {
304     if ( !srt->str.l ) return strdup("");
305     int i;
306     hts_expand(char*,srt->noff,srt->mcharp,srt->charp);
307     for (i=0; i<srt->noff; i++)
308     {
309         srt->charp[i] = srt->str.s + srt->off[i];
310         if ( i>0 ) srt->charp[i][-1] = 0;
311     }
312     qsort(srt->charp, srt->noff, sizeof(*srt->charp), cmpstringp);
313     char *ret = (char*) malloc(srt->str.l + 1), *ptr = ret;
314     for (i=0; i<srt->noff; i++)
315     {
316         int len = strlen(srt->charp[i]);
317         memcpy(ptr, srt->charp[i], len);
318         ptr += len + 1;
319         ptr[-1] = i+1==srt->noff ? 0 : ';';
320     }
321     return ret;
322 }
bcf_sr_sort_set_active(sr_sort_t * srt,int idx)323 int bcf_sr_sort_set_active(sr_sort_t *srt, int idx)
324 {
325     hts_expand(int,idx+1,srt->mactive,srt->active);
326     srt->nactive = 1;
327     srt->active[srt->nactive - 1] = idx;
328     return 0; // FIXME: check for errs in this function
329 }
bcf_sr_sort_add_active(sr_sort_t * srt,int idx)330 int bcf_sr_sort_add_active(sr_sort_t *srt, int idx)
331 {
332     hts_expand(int,idx+1,srt->mactive,srt->active);
333     srt->nactive++;
334     srt->active[srt->nactive - 1] = idx;
335     return 0; // FIXME: check for errs in this function
336 }
bcf_sr_sort_set(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,hts_pos_t min_pos)337 static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos)
338 {
339     if ( !srt->grp_str2int )
340     {
341         // first time here, initialize
342         if ( !srt->pair )
343         {
344             if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
345             bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
346         }
347         bcf_sr_init_scores(srt);
348         srt->grp_str2int = khash_str2int_init();
349         srt->var_str2int = khash_str2int_init();
350     }
351     int k;
352     khash_t(str2int) *hash;
353     hash = srt->grp_str2int;
354     for (k=0; k < kh_end(hash); k++)
355         if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
356     hash = srt->var_str2int;
357     for (k=0; k < kh_end(hash); k++)
358         if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
359     kh_clear(str2int, srt->grp_str2int);
360     kh_clear(str2int, srt->var_str2int);
361     srt->ngrp = srt->nvar = srt->nvset = 0;
362 
363     grp_t grp;
364     memset(&grp,0,sizeof(grp_t));
365 
366     // group VCFs into groups, each with a unique combination of variants in the duplicate lines
367     int ireader,ivar,irec,igrp,ivset,iact;
368     for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
369     for (iact=0; iact<srt->nactive; iact++)
370     {
371         ireader = srt->active[iact];
372         bcf_sr_t *reader = &readers->readers[ireader];
373         int rid   = bcf_hdr_name2id(reader->header, chr);
374         grp.nvar  = 0;
375         hts_expand(int,reader->nbuffer,srt->moff,srt->off);
376         srt->noff  = 0;
377         srt->str.l = 0;
378         for (irec=1; irec<=reader->nbuffer; irec++)
379         {
380             bcf1_t *line = reader->buffer[irec];
381             if ( line->rid!=rid || line->pos!=min_pos ) break;
382 
383             if ( srt->str.l ) kputc(';',&srt->str);
384             srt->off[srt->noff++] = srt->str.l;
385             size_t beg  = srt->str.l;
386             int end_pos = -1;
387             for (ivar=1; ivar<line->n_allele; ivar++)
388             {
389                 if ( ivar>1 ) kputc(',',&srt->str);
390                 kputs(line->d.allele[0],&srt->str);
391                 kputc('>',&srt->str);
392                 kputs(line->d.allele[ivar],&srt->str);
393 
394                 // If symbolic allele, check also the END tag in case there are multiple events,
395                 // such as <DEL>s, starting at the same positions
396                 if ( line->d.allele[ivar][0]=='<' )
397                 {
398                     if ( end_pos==-1 )
399                     {
400                         bcf_info_t *end_info = bcf_get_info(reader->header,line,"END");
401                         if ( end_info )
402                             end_pos = (int)end_info->v1.i;  // this is only to create a unique id, we don't mind a potential int64 overflow
403                         else
404                             end_pos = 0;
405                     }
406                     if ( end_pos )
407                     {
408                         kputc('/',&srt->str);
409                         kputw(end_pos, &srt->str);
410                     }
411                 }
412             }
413             if ( line->n_allele==1 )
414             {
415                 kputs(line->d.allele[0],&srt->str);
416                 kputsn(">.",2,&srt->str);
417             }
418 
419             // Create new variant or attach to existing one. But careful, there can be duplicate
420             // records with the same POS,REF,ALT (e.g. in dbSNP-b142)
421             char *var_str = beg + srt->str.s;
422             int ret, var_idx = 0, var_end = srt->str.l;
423             while ( 1 )
424             {
425                 ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
426                 if ( ret==-1 ) break;
427 
428                 var_t *var = &srt->var[ivar];
429                 if ( var->vcf[var->nvcf-1] != ireader ) break;
430 
431                 srt->str.l = var_end;
432                 kputw(var_idx, &srt->str);
433                 var_str = beg + srt->str.s;
434                 var_idx++;
435             }
436             if ( ret==-1 )
437             {
438                 ivar = srt->nvar++;
439                 hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
440                 srt->var[ivar].nvcf = 0;
441                 khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
442                 free(srt->var[ivar].str);   // possible left-over from the previous position
443             }
444             var_t *var = &srt->var[ivar];
445             var->nalt = line->n_allele - 1;
446             var->type = bcf_get_variant_types(line);
447             srt->str.s[var_end] = 0;
448             if ( ret==-1 )
449                 var->str = strdup(var_str);
450 
451             int mvcf = var->mvcf;
452             var->nvcf++;
453             hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
454             if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
455             var->vcf[var->nvcf-1] = ireader;
456             var->rec[var->nvcf-1] = line;
457 
458             grp.nvar++;
459             hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
460             grp.var[grp.nvar-1] = ivar;
461         }
462         char *grp_key = grp_create_key(srt);
463         int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
464         if ( ret==-1 )
465         {
466             igrp = srt->ngrp++;
467             hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
468             free(srt->grp[igrp].var);
469             srt->grp[igrp] = grp;
470             srt->grp[igrp].key = grp_key;
471             khash_str2int_set(srt->grp_str2int, grp_key, igrp);
472             memset(&grp,0,sizeof(grp_t));
473         }
474         else
475             free(grp_key);
476         srt->grp[igrp].nvcf++;
477     }
478     free(grp.var);
479 
480     // initialize bitmask - which groups is the variant present in
481     for (ivar=0; ivar<srt->nvar; ivar++)
482     {
483         if ( kbs_resize(&srt->var[ivar].mask, srt->ngrp) < 0 )
484         {
485             fprintf(stderr, "[%s:%d %s] kbs_resize failed\n", __FILE__,__LINE__,__func__);
486             exit(1);
487         }
488         kbs_clear(srt->var[ivar].mask);
489     }
490     for (igrp=0; igrp<srt->ngrp; igrp++)
491     {
492         for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
493         {
494             int i = srt->grp[igrp].var[ivar];
495             kbs_insert(srt->var[i].mask, igrp);
496         }
497     }
498 
499     // create the initial list of variant sets
500     for (ivar=0; ivar<srt->nvar; ivar++)
501     {
502         ivset = srt->nvset++;
503         hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);
504 
505         varset_t *vset = &srt->vset[ivset];
506         vset->nvar = 1;
507         hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
508         vset->var[vset->nvar-1] = ivar;
509         var_t *var  = &srt->var[ivar];
510         vset->cnt   = var->nvcf;
511         if ( kbs_resize(&vset->mask, srt->ngrp) < 0 )
512         {
513             fprintf(stderr, "[%s:%d %s] kbs_resize failed\n", __FILE__,__LINE__,__func__);
514             exit(1);
515         }
516         kbs_clear(vset->mask);
517         kbs_bitwise_or(vset->mask, var->mask);
518 
519         int type = 0;
520         if ( var->type==VCF_REF ) type |= SR_REF;
521         else
522         {
523             if ( var->type & VCF_SNP ) type |= SR_SNP;
524             if ( var->type & VCF_MNP ) type |= SR_SNP;
525             if ( var->type & VCF_INDEL ) type |= SR_INDEL;
526             if ( var->type & VCF_OTHER ) type |= SR_OTHER;
527         }
528         var->type = type;
529     }
530 #if DEBUG_VSETS
531     debug_vsets(srt);
532 #endif
533 
534     // initialize the pairing matrix
535     hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
536     hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
537     memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);
538     for (ivset=0; ivset<srt->nvset; ivset++)
539     {
540         varset_t *vset = &srt->vset[ivset];
541         for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
542         srt->cnt[ivset] = vset->cnt;
543     }
544 
545     // pair the lines
546     while ( srt->nvset )
547     {
548 #if DEBUG_VSETS
549     fprintf(stderr,"\n");
550     debug_vsets(srt);
551 #endif
552 
553         int imax = 0;
554         for (ivset=1; ivset<srt->nvset; ivset++)
555             if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;
556 
557         int ipair = -1;
558         uint32_t max_score = 0;
559         for (ivset=0; ivset<srt->nvset; ivset++)
560         {
561             if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue;   // cannot be merged
562             uint32_t score = pairing_score(srt, imax, ivset);
563             // fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
564             if ( max_score < score ) { max_score = score; ipair = ivset; }
565         }
566 
567         // merge rows creating a new variant set this way
568         if ( ipair!=-1 && ipair!=imax )
569         {
570             imax = merge_vsets(srt, imax, ipair);
571             continue;
572         }
573 
574         push_vset(srt, imax);
575     }
576 
577     srt->chr = chr;
578     srt->pos = min_pos;
579 
580     return 0;  // FIXME: check for errs in this function
581 }
582 
bcf_sr_sort_next(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,hts_pos_t min_pos)583 int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos)
584 {
585     int i,j;
586     assert( srt->nactive>0 );
587 
588     if ( srt->nsr != readers->nreaders )
589     {
590         srt->sr = readers;
591         if ( srt->nsr < readers->nreaders )
592         {
593             srt->vcf_buf = (vcf_buf_t*) realloc(srt->vcf_buf,readers->nreaders*sizeof(vcf_buf_t));
594             memset(srt->vcf_buf + srt->nsr, 0, sizeof(vcf_buf_t)*(readers->nreaders - srt->nsr));
595             if ( srt->msr < srt->nsr ) srt->msr = srt->nsr;
596         }
597         srt->nsr = readers->nreaders;
598         srt->chr = NULL;
599     }
600     if ( srt->nactive == 1 )
601     {
602         if ( readers->nreaders>1 )
603             memset(readers->has_line, 0, readers->nreaders*sizeof(*readers->has_line));
604         bcf_sr_t *reader = &readers->readers[srt->active[0]];
605         assert( reader->buffer[1]->pos==min_pos );
606         bcf1_t *tmp = reader->buffer[0];
607         for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
608         reader->buffer[ reader->nbuffer ] = tmp;
609         reader->nbuffer--;
610         readers->has_line[srt->active[0]] = 1;
611         return 1;
612     }
613     if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos);
614 
615     if ( !srt->vcf_buf[0].nrec ) return 0;
616 
617 #if DEBUG_VBUF
618     debug_vbuf(srt);
619 #endif
620 
621     int nret = 0;
622     for (i=0; i<srt->sr->nreaders; i++)
623     {
624         vcf_buf_t *buf = &srt->vcf_buf[i];
625 
626         if ( buf->rec[0] )
627         {
628             bcf_sr_t *reader = &srt->sr->readers[i];
629             for (j=1; j<=reader->nbuffer; j++)
630                 if ( reader->buffer[j] == buf->rec[0] ) break;
631 
632             assert( j<=reader->nbuffer );
633 
634             bcf1_t *tmp = reader->buffer[0];
635             reader->buffer[0] = reader->buffer[j++];
636             for (; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
637             reader->buffer[ reader->nbuffer ] = tmp;
638             reader->nbuffer--;
639 
640             nret++;
641             srt->sr->has_line[i] = 1;
642         }
643         else
644             srt->sr->has_line[i] = 0;
645 
646         buf->nrec--;
647         if ( buf->nrec > 0 )
648             memmove(buf->rec, &buf->rec[1], buf->nrec*sizeof(bcf1_t*));
649     }
650     return nret;
651 }
bcf_sr_sort_remove_reader(bcf_srs_t * readers,sr_sort_t * srt,int i)652 void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i)
653 {
654     //vcf_buf is allocated only in bcf_sr_sort_next
655     //So, a call to bcf_sr_add_reader() followed immediately by bcf_sr_remove_reader()
656     //would cause the program to crash in this segment
657     if (srt->vcf_buf)
658     {
659         free(srt->vcf_buf[i].rec);
660         if ( i+1 < srt->nsr )
661             memmove(&srt->vcf_buf[i], &srt->vcf_buf[i+1], (srt->nsr - i - 1)*sizeof(vcf_buf_t));
662         memset(srt->vcf_buf + srt->nsr - 1, 0, sizeof(vcf_buf_t));
663     }
664 }
bcf_sr_sort_init(sr_sort_t * srt)665 sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt)
666 {
667     if ( !srt ) return calloc(1,sizeof(sr_sort_t));
668     memset(srt,0,sizeof(sr_sort_t));
669     return srt;
670 }
bcf_sr_sort_reset(sr_sort_t * srt)671 void bcf_sr_sort_reset(sr_sort_t *srt)
672 {
673     srt->chr = NULL;
674 }
bcf_sr_sort_destroy(sr_sort_t * srt)675 void bcf_sr_sort_destroy(sr_sort_t *srt)
676 {
677     free(srt->active);
678     if ( srt->var_str2int ) khash_str2int_destroy_free(srt->var_str2int);
679     if ( srt->grp_str2int ) khash_str2int_destroy_free(srt->grp_str2int);
680     int i;
681     for (i=0; i<srt->nsr; i++) free(srt->vcf_buf[i].rec);
682     free(srt->vcf_buf);
683     for (i=0; i<srt->mvar; i++)
684     {
685         free(srt->var[i].str);
686         free(srt->var[i].vcf);
687         free(srt->var[i].rec);
688         kbs_destroy(srt->var[i].mask);
689     }
690     free(srt->var);
691     for (i=0; i<srt->mgrp; i++)
692         free(srt->grp[i].var);
693     free(srt->grp);
694     for (i=0; i<srt->mvset; i++)
695     {
696         kbs_destroy(srt->vset[i].mask);
697         free(srt->vset[i].var);
698     }
699     free(srt->vset);
700     free(srt->str.s);
701     free(srt->off);
702     free(srt->charp);
703     free(srt->cnt);
704     free(srt->pmat);
705     memset(srt,0,sizeof(*srt));
706 }
707 
708