1 /*
2 Copyright (C) 2017-2021 Genome Research Ltd.
3
4 Author: Petr Danecek <pd3@sanger.ac.uk>
5
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 THE SOFTWARE.
23 */
24
25 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
26 #include <config.h>
27
28 #include <assert.h>
29 #include <strings.h>
30
31 #include "bcf_sr_sort.h"
32 #include "htslib/khash_str2int.h"
33 #include "htslib/kbitset.h"
34
35 #define SR_REF 1
36 #define SR_SNP 2
37 #define SR_INDEL 4
38 #define SR_OTHER 8
39 #define SR_SCORE(srt,a,b) (srt)->score[((a)<<4)|(b)]
40
41 // Logical AND
kbs_logical_and(kbitset_t * bs1,kbitset_t * bs2)42 static inline int kbs_logical_and(kbitset_t *bs1, kbitset_t *bs2)
43 {
44 // General case, bitsets of unequal size:
45 // int i, n = bs1->n < bs2->n ? bs1->n : bs2->n;
46 int i, n = bs1->n;
47
48 for (i=0; i<n; i++) if ( bs1->b[i] & bs2->b[i] ) return 1;
49 return 0;
50 }
51
52 // Bitwise OR, dst will be modified, src will be left unchanged
kbs_bitwise_or(kbitset_t * dst,kbitset_t * src)53 static inline void kbs_bitwise_or(kbitset_t *dst, kbitset_t *src)
54 {
55 int i;
56 for (i=0; i<dst->n; i++) dst->b[i] |= src->b[i];
57 }
58
59
bcf_sr_init_scores(sr_sort_t * srt)60 static void bcf_sr_init_scores(sr_sort_t *srt)
61 {
62 int i,jbit,kbit;
63
64 // lower number = lower priority, zero means forbidden
65
66 if ( srt->pair & BCF_SR_PAIR_ANY ) srt->pair |= (BCF_SR_PAIR_SNPS | BCF_SR_PAIR_INDELS | BCF_SR_PAIR_SNP_REF | BCF_SR_PAIR_INDEL_REF);
67 if ( srt->pair & BCF_SR_PAIR_SNPS ) SR_SCORE(srt,SR_SNP,SR_SNP) = 3;
68 if ( srt->pair & BCF_SR_PAIR_INDELS ) SR_SCORE(srt,SR_INDEL,SR_INDEL) = 3;
69 if ( srt->pair & BCF_SR_PAIR_SNP_REF )
70 {
71 SR_SCORE(srt,SR_SNP,SR_REF) = 2;
72 SR_SCORE(srt,SR_REF,SR_SNP) = 2;
73 }
74 if ( srt->pair & BCF_SR_PAIR_INDEL_REF )
75 {
76 SR_SCORE(srt,SR_INDEL,SR_REF) = 2;
77 SR_SCORE(srt,SR_REF,SR_INDEL) = 2;
78 }
79 if ( srt->pair & BCF_SR_PAIR_ANY )
80 {
81 for (i=0; i<256; i++)
82 if ( !srt->score[i] ) srt->score[i] = 1;
83 }
84
85 // set all combinations
86 for (i=0; i<256; i++)
87 {
88 if ( srt->score[i] ) continue; // already set
89 int max = 0;
90 for (jbit=0; jbit<4; jbit++) // high bits
91 {
92 int j = 1<<jbit;
93 if ( !(i & (j<<4)) ) continue;
94 for (kbit=0; kbit<4; kbit++) // low bits
95 {
96 int k = 1<<kbit;
97 if ( !(i & k) ) continue;
98 if ( max < SR_SCORE(srt,j,k) ) max = SR_SCORE(srt,j,k);
99 }
100 }
101 srt->score[i] = max;
102 }
103 }
multi_is_exact(var_t * avar,var_t * bvar)104 static int multi_is_exact(var_t *avar, var_t *bvar)
105 {
106 if ( avar->nalt != bvar->nalt ) return 0;
107
108 int alen = strlen(avar->str);
109 int blen = strlen(bvar->str);
110 if ( alen != blen ) return 0;
111
112 char *abeg = avar->str;
113 while ( *abeg )
114 {
115 char *aend = abeg;
116 while ( *aend && *aend!=',' ) aend++;
117
118 char *bbeg = bvar->str;
119 while ( *bbeg )
120 {
121 char *bend = bbeg;
122 while ( *bend && *bend!=',' ) bend++;
123 if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) break;
124 bbeg = *bend ? bend+1 : bend;
125 }
126 if ( !*bbeg ) return 0;
127
128 abeg = *aend ? aend+1 : aend;
129 }
130 return 1;
131 }
multi_is_subset(var_t * avar,var_t * bvar)132 static int multi_is_subset(var_t *avar, var_t *bvar)
133 {
134 char *abeg = avar->str;
135 while ( *abeg )
136 {
137 char *aend = abeg;
138 while ( *aend && *aend!=',' ) aend++;
139
140 char *bbeg = bvar->str;
141 while ( *bbeg )
142 {
143 char *bend = bbeg;
144 while ( *bend && *bend!=',' ) bend++;
145 if ( bend - bbeg == aend - abeg && !strncasecmp(abeg,bbeg,bend-bbeg) ) return 1;
146 bbeg = *bend ? bend+1 : bend;
147 }
148 abeg = *aend ? aend+1 : aend;
149 }
150 return 0;
151 }
pairing_score(sr_sort_t * srt,int ivset,int jvset)152 static uint32_t pairing_score(sr_sort_t *srt, int ivset, int jvset)
153 {
154 varset_t *iv = &srt->vset[ivset];
155 varset_t *jv = &srt->vset[jvset];
156
157 // Restrictive logic: the strictest type from a group is selected,
158 // so that, for example, snp+ref does not lead to the inclusion of an indel
159 int i,j;
160 uint32_t min = UINT32_MAX;
161 for (i=0; i<iv->nvar; i++)
162 {
163 var_t *ivar = &srt->var[iv->var[i]];
164 for (j=0; j<jv->nvar; j++)
165 {
166 var_t *jvar = &srt->var[jv->var[j]];
167 if ( srt->pair & BCF_SR_PAIR_EXACT )
168 {
169 if ( ivar->type != jvar->type ) continue;
170 if ( !strcmp(ivar->str,jvar->str) ) return UINT32_MAX; // exact match, best possibility
171 if ( multi_is_exact(ivar,jvar) ) return UINT32_MAX; // identical alleles
172 continue;
173 }
174 if ( ivar->type==jvar->type && !strcmp(ivar->str,jvar->str) ) return UINT32_MAX; // exact match, best possibility
175 if ( ivar->type & jvar->type && multi_is_subset(ivar,jvar) ) return UINT32_MAX; // one of the alleles is identical
176
177 uint32_t score = SR_SCORE(srt,ivar->type,jvar->type);
178 if ( !score ) return 0; // some of the varsets in the two groups are not compatible, will not pair
179 if ( min>score ) min = score;
180 }
181 }
182 if ( srt->pair & BCF_SR_PAIR_EXACT ) return 0;
183
184 assert( min!=UINT32_MAX );
185
186 uint32_t cnt = 0;
187 for (i=0; i<iv->nvar; i++) cnt += srt->var[iv->var[i]].nvcf;
188 for (j=0; j<jv->nvar; j++) cnt += srt->var[jv->var[j]].nvcf;
189
190 return (1u<<(28+min)) + cnt;
191 }
remove_vset(sr_sort_t * srt,int jvset)192 static void remove_vset(sr_sort_t *srt, int jvset)
193 {
194 if ( jvset+1 < srt->nvset )
195 {
196 varset_t tmp = srt->vset[jvset];
197 memmove(&srt->vset[jvset], &srt->vset[jvset+1], sizeof(varset_t)*(srt->nvset - jvset - 1));
198 srt->vset[srt->nvset-1] = tmp;
199
200 int *jmat = srt->pmat + jvset*srt->ngrp;
201 memmove(jmat, &jmat[srt->ngrp],sizeof(int)*(srt->nvset - jvset - 1)*srt->ngrp);
202
203 memmove(&srt->cnt[jvset], &srt->cnt[jvset+1], sizeof(int)*(srt->nvset - jvset - 1));
204 }
205 srt->nvset--;
206 }
merge_vsets(sr_sort_t * srt,int ivset,int jvset)207 static int merge_vsets(sr_sort_t *srt, int ivset, int jvset)
208 {
209 int i,j;
210 if ( ivset > jvset ) { i = ivset; ivset = jvset; jvset = i; }
211
212 varset_t *iv = &srt->vset[ivset];
213 varset_t *jv = &srt->vset[jvset];
214
215 kbs_bitwise_or(iv->mask,jv->mask);
216
217 i = iv->nvar;
218 iv->nvar += jv->nvar;
219 hts_expand(int, iv->nvar, iv->mvar, iv->var);
220 for (j=0; j<jv->nvar; j++,i++) iv->var[i] = jv->var[j];
221
222 int *imat = srt->pmat + ivset*srt->ngrp;
223 int *jmat = srt->pmat + jvset*srt->ngrp;
224 for (i=0; i<srt->ngrp; i++) imat[i] += jmat[i];
225 srt->cnt[ivset] += srt->cnt[jvset];
226
227 remove_vset(srt, jvset);
228
229 return ivset;
230 }
231
push_vset(sr_sort_t * srt,int ivset)232 static int push_vset(sr_sort_t *srt, int ivset)
233 {
234 varset_t *iv = &srt->vset[ivset];
235 int i,j;
236 for (i=0; i<srt->sr->nreaders; i++)
237 {
238 vcf_buf_t *buf = &srt->vcf_buf[i];
239 buf->nrec++;
240 hts_expand(bcf1_t*,buf->nrec,buf->mrec,buf->rec);
241 buf->rec[buf->nrec-1] = NULL;
242 }
243 for (i=0; i<iv->nvar; i++)
244 {
245 var_t *var = &srt->var[ iv->var[i] ];
246 for (j=0; j<var->nvcf; j++)
247 {
248 int jvcf = var->vcf[j];
249 vcf_buf_t *buf = &srt->vcf_buf[jvcf];
250 buf->rec[buf->nrec-1] = var->rec[j];
251 }
252 }
253 remove_vset(srt, ivset);
254 return 0; // FIXME: check for errs in this function
255 }
256
cmpstringp(const void * p1,const void * p2)257 static int cmpstringp(const void *p1, const void *p2)
258 {
259 return strcmp(* (char * const *) p1, * (char * const *) p2);
260 }
261
262 #define DEBUG_VSETS 0
263 #if DEBUG_VSETS
debug_vsets(sr_sort_t * srt)264 void debug_vsets(sr_sort_t *srt)
265 {
266 int i,j,k;
267 for (i=0; i<srt->nvset; i++)
268 {
269 fprintf(stderr,"dbg_vset %d:", i);
270 for (j=0; j<srt->vset[i].mask->n; j++) fprintf(stderr,"%c%lu",j==0?' ':':',srt->vset[i].mask->b[j]);
271 fprintf(stderr,"\t");
272 for (j=0; j<srt->vset[i].nvar; j++)
273 {
274 var_t *var = &srt->var[srt->vset[i].var[j]];
275 fprintf(stderr,"\t%s",var->str);
276 for (k=0; k<var->nvcf; k++)
277 fprintf(stderr,"%c%d", k==0?':':',',var->vcf[k]);
278 }
279 fprintf(stderr,"\n");
280 }
281 }
282 #endif
283
284 #define DEBUG_VBUF 0
285 #if DEBUG_VBUF
debug_vbuf(sr_sort_t * srt)286 void debug_vbuf(sr_sort_t *srt)
287 {
288 int i, j;
289 for (j=0; j<srt->vcf_buf[0].nrec; j++)
290 {
291 fprintf(stderr,"dbg_vbuf %d:\t", j);
292 for (i=0; i<srt->sr->nreaders; i++)
293 {
294 vcf_buf_t *buf = &srt->vcf_buf[i];
295 fprintf(stderr,"\t%"PRIhts_pos, buf->rec[j] ? buf->rec[j]->pos+1 : 0);
296 }
297 fprintf(stderr,"\n");
298 }
299 }
300 #endif
301
grp_create_key(sr_sort_t * srt)302 static char *grp_create_key(sr_sort_t *srt)
303 {
304 if ( !srt->str.l ) return strdup("");
305 int i;
306 hts_expand(char*,srt->noff,srt->mcharp,srt->charp);
307 for (i=0; i<srt->noff; i++)
308 {
309 srt->charp[i] = srt->str.s + srt->off[i];
310 if ( i>0 ) srt->charp[i][-1] = 0;
311 }
312 qsort(srt->charp, srt->noff, sizeof(*srt->charp), cmpstringp);
313 char *ret = (char*) malloc(srt->str.l + 1), *ptr = ret;
314 for (i=0; i<srt->noff; i++)
315 {
316 int len = strlen(srt->charp[i]);
317 memcpy(ptr, srt->charp[i], len);
318 ptr += len + 1;
319 ptr[-1] = i+1==srt->noff ? 0 : ';';
320 }
321 return ret;
322 }
bcf_sr_sort_set_active(sr_sort_t * srt,int idx)323 int bcf_sr_sort_set_active(sr_sort_t *srt, int idx)
324 {
325 hts_expand(int,idx+1,srt->mactive,srt->active);
326 srt->nactive = 1;
327 srt->active[srt->nactive - 1] = idx;
328 return 0; // FIXME: check for errs in this function
329 }
bcf_sr_sort_add_active(sr_sort_t * srt,int idx)330 int bcf_sr_sort_add_active(sr_sort_t *srt, int idx)
331 {
332 hts_expand(int,idx+1,srt->mactive,srt->active);
333 srt->nactive++;
334 srt->active[srt->nactive - 1] = idx;
335 return 0; // FIXME: check for errs in this function
336 }
bcf_sr_sort_set(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,hts_pos_t min_pos)337 static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos)
338 {
339 if ( !srt->grp_str2int )
340 {
341 // first time here, initialize
342 if ( !srt->pair )
343 {
344 if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
345 bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
346 }
347 bcf_sr_init_scores(srt);
348 srt->grp_str2int = khash_str2int_init();
349 srt->var_str2int = khash_str2int_init();
350 }
351 int k;
352 khash_t(str2int) *hash;
353 hash = srt->grp_str2int;
354 for (k=0; k < kh_end(hash); k++)
355 if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
356 hash = srt->var_str2int;
357 for (k=0; k < kh_end(hash); k++)
358 if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
359 kh_clear(str2int, srt->grp_str2int);
360 kh_clear(str2int, srt->var_str2int);
361 srt->ngrp = srt->nvar = srt->nvset = 0;
362
363 grp_t grp;
364 memset(&grp,0,sizeof(grp_t));
365
366 // group VCFs into groups, each with a unique combination of variants in the duplicate lines
367 int ireader,ivar,irec,igrp,ivset,iact;
368 for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
369 for (iact=0; iact<srt->nactive; iact++)
370 {
371 ireader = srt->active[iact];
372 bcf_sr_t *reader = &readers->readers[ireader];
373 int rid = bcf_hdr_name2id(reader->header, chr);
374 grp.nvar = 0;
375 hts_expand(int,reader->nbuffer,srt->moff,srt->off);
376 srt->noff = 0;
377 srt->str.l = 0;
378 for (irec=1; irec<=reader->nbuffer; irec++)
379 {
380 bcf1_t *line = reader->buffer[irec];
381 if ( line->rid!=rid || line->pos!=min_pos ) break;
382
383 if ( srt->str.l ) kputc(';',&srt->str);
384 srt->off[srt->noff++] = srt->str.l;
385 size_t beg = srt->str.l;
386 int end_pos = -1;
387 for (ivar=1; ivar<line->n_allele; ivar++)
388 {
389 if ( ivar>1 ) kputc(',',&srt->str);
390 kputs(line->d.allele[0],&srt->str);
391 kputc('>',&srt->str);
392 kputs(line->d.allele[ivar],&srt->str);
393
394 // If symbolic allele, check also the END tag in case there are multiple events,
395 // such as <DEL>s, starting at the same positions
396 if ( line->d.allele[ivar][0]=='<' )
397 {
398 if ( end_pos==-1 )
399 {
400 bcf_info_t *end_info = bcf_get_info(reader->header,line,"END");
401 if ( end_info )
402 end_pos = (int)end_info->v1.i; // this is only to create a unique id, we don't mind a potential int64 overflow
403 else
404 end_pos = 0;
405 }
406 if ( end_pos )
407 {
408 kputc('/',&srt->str);
409 kputw(end_pos, &srt->str);
410 }
411 }
412 }
413 if ( line->n_allele==1 )
414 {
415 kputs(line->d.allele[0],&srt->str);
416 kputsn(">.",2,&srt->str);
417 }
418
419 // Create new variant or attach to existing one. But careful, there can be duplicate
420 // records with the same POS,REF,ALT (e.g. in dbSNP-b142)
421 char *var_str = beg + srt->str.s;
422 int ret, var_idx = 0, var_end = srt->str.l;
423 while ( 1 )
424 {
425 ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
426 if ( ret==-1 ) break;
427
428 var_t *var = &srt->var[ivar];
429 if ( var->vcf[var->nvcf-1] != ireader ) break;
430
431 srt->str.l = var_end;
432 kputw(var_idx, &srt->str);
433 var_str = beg + srt->str.s;
434 var_idx++;
435 }
436 if ( ret==-1 )
437 {
438 ivar = srt->nvar++;
439 hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
440 srt->var[ivar].nvcf = 0;
441 khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
442 free(srt->var[ivar].str); // possible left-over from the previous position
443 }
444 var_t *var = &srt->var[ivar];
445 var->nalt = line->n_allele - 1;
446 var->type = bcf_get_variant_types(line);
447 srt->str.s[var_end] = 0;
448 if ( ret==-1 )
449 var->str = strdup(var_str);
450
451 int mvcf = var->mvcf;
452 var->nvcf++;
453 hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
454 if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
455 var->vcf[var->nvcf-1] = ireader;
456 var->rec[var->nvcf-1] = line;
457
458 grp.nvar++;
459 hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
460 grp.var[grp.nvar-1] = ivar;
461 }
462 char *grp_key = grp_create_key(srt);
463 int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
464 if ( ret==-1 )
465 {
466 igrp = srt->ngrp++;
467 hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
468 free(srt->grp[igrp].var);
469 srt->grp[igrp] = grp;
470 srt->grp[igrp].key = grp_key;
471 khash_str2int_set(srt->grp_str2int, grp_key, igrp);
472 memset(&grp,0,sizeof(grp_t));
473 }
474 else
475 free(grp_key);
476 srt->grp[igrp].nvcf++;
477 }
478 free(grp.var);
479
480 // initialize bitmask - which groups is the variant present in
481 for (ivar=0; ivar<srt->nvar; ivar++)
482 {
483 if ( kbs_resize(&srt->var[ivar].mask, srt->ngrp) < 0 )
484 {
485 fprintf(stderr, "[%s:%d %s] kbs_resize failed\n", __FILE__,__LINE__,__func__);
486 exit(1);
487 }
488 kbs_clear(srt->var[ivar].mask);
489 }
490 for (igrp=0; igrp<srt->ngrp; igrp++)
491 {
492 for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
493 {
494 int i = srt->grp[igrp].var[ivar];
495 kbs_insert(srt->var[i].mask, igrp);
496 }
497 }
498
499 // create the initial list of variant sets
500 for (ivar=0; ivar<srt->nvar; ivar++)
501 {
502 ivset = srt->nvset++;
503 hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);
504
505 varset_t *vset = &srt->vset[ivset];
506 vset->nvar = 1;
507 hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
508 vset->var[vset->nvar-1] = ivar;
509 var_t *var = &srt->var[ivar];
510 vset->cnt = var->nvcf;
511 if ( kbs_resize(&vset->mask, srt->ngrp) < 0 )
512 {
513 fprintf(stderr, "[%s:%d %s] kbs_resize failed\n", __FILE__,__LINE__,__func__);
514 exit(1);
515 }
516 kbs_clear(vset->mask);
517 kbs_bitwise_or(vset->mask, var->mask);
518
519 int type = 0;
520 if ( var->type==VCF_REF ) type |= SR_REF;
521 else
522 {
523 if ( var->type & VCF_SNP ) type |= SR_SNP;
524 if ( var->type & VCF_MNP ) type |= SR_SNP;
525 if ( var->type & VCF_INDEL ) type |= SR_INDEL;
526 if ( var->type & VCF_OTHER ) type |= SR_OTHER;
527 }
528 var->type = type;
529 }
530 #if DEBUG_VSETS
531 debug_vsets(srt);
532 #endif
533
534 // initialize the pairing matrix
535 hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
536 hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
537 memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);
538 for (ivset=0; ivset<srt->nvset; ivset++)
539 {
540 varset_t *vset = &srt->vset[ivset];
541 for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
542 srt->cnt[ivset] = vset->cnt;
543 }
544
545 // pair the lines
546 while ( srt->nvset )
547 {
548 #if DEBUG_VSETS
549 fprintf(stderr,"\n");
550 debug_vsets(srt);
551 #endif
552
553 int imax = 0;
554 for (ivset=1; ivset<srt->nvset; ivset++)
555 if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;
556
557 int ipair = -1;
558 uint32_t max_score = 0;
559 for (ivset=0; ivset<srt->nvset; ivset++)
560 {
561 if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue; // cannot be merged
562 uint32_t score = pairing_score(srt, imax, ivset);
563 // fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
564 if ( max_score < score ) { max_score = score; ipair = ivset; }
565 }
566
567 // merge rows creating a new variant set this way
568 if ( ipair!=-1 && ipair!=imax )
569 {
570 imax = merge_vsets(srt, imax, ipair);
571 continue;
572 }
573
574 push_vset(srt, imax);
575 }
576
577 srt->chr = chr;
578 srt->pos = min_pos;
579
580 return 0; // FIXME: check for errs in this function
581 }
582
bcf_sr_sort_next(bcf_srs_t * readers,sr_sort_t * srt,const char * chr,hts_pos_t min_pos)583 int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos)
584 {
585 int i,j;
586 assert( srt->nactive>0 );
587
588 if ( srt->nsr != readers->nreaders )
589 {
590 srt->sr = readers;
591 if ( srt->nsr < readers->nreaders )
592 {
593 srt->vcf_buf = (vcf_buf_t*) realloc(srt->vcf_buf,readers->nreaders*sizeof(vcf_buf_t));
594 memset(srt->vcf_buf + srt->nsr, 0, sizeof(vcf_buf_t)*(readers->nreaders - srt->nsr));
595 if ( srt->msr < srt->nsr ) srt->msr = srt->nsr;
596 }
597 srt->nsr = readers->nreaders;
598 srt->chr = NULL;
599 }
600 if ( srt->nactive == 1 )
601 {
602 if ( readers->nreaders>1 )
603 memset(readers->has_line, 0, readers->nreaders*sizeof(*readers->has_line));
604 bcf_sr_t *reader = &readers->readers[srt->active[0]];
605 assert( reader->buffer[1]->pos==min_pos );
606 bcf1_t *tmp = reader->buffer[0];
607 for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
608 reader->buffer[ reader->nbuffer ] = tmp;
609 reader->nbuffer--;
610 readers->has_line[srt->active[0]] = 1;
611 return 1;
612 }
613 if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos);
614
615 if ( !srt->vcf_buf[0].nrec ) return 0;
616
617 #if DEBUG_VBUF
618 debug_vbuf(srt);
619 #endif
620
621 int nret = 0;
622 for (i=0; i<srt->sr->nreaders; i++)
623 {
624 vcf_buf_t *buf = &srt->vcf_buf[i];
625
626 if ( buf->rec[0] )
627 {
628 bcf_sr_t *reader = &srt->sr->readers[i];
629 for (j=1; j<=reader->nbuffer; j++)
630 if ( reader->buffer[j] == buf->rec[0] ) break;
631
632 assert( j<=reader->nbuffer );
633
634 bcf1_t *tmp = reader->buffer[0];
635 reader->buffer[0] = reader->buffer[j++];
636 for (; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
637 reader->buffer[ reader->nbuffer ] = tmp;
638 reader->nbuffer--;
639
640 nret++;
641 srt->sr->has_line[i] = 1;
642 }
643 else
644 srt->sr->has_line[i] = 0;
645
646 buf->nrec--;
647 if ( buf->nrec > 0 )
648 memmove(buf->rec, &buf->rec[1], buf->nrec*sizeof(bcf1_t*));
649 }
650 return nret;
651 }
bcf_sr_sort_remove_reader(bcf_srs_t * readers,sr_sort_t * srt,int i)652 void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i)
653 {
654 //vcf_buf is allocated only in bcf_sr_sort_next
655 //So, a call to bcf_sr_add_reader() followed immediately by bcf_sr_remove_reader()
656 //would cause the program to crash in this segment
657 if (srt->vcf_buf)
658 {
659 free(srt->vcf_buf[i].rec);
660 if ( i+1 < srt->nsr )
661 memmove(&srt->vcf_buf[i], &srt->vcf_buf[i+1], (srt->nsr - i - 1)*sizeof(vcf_buf_t));
662 memset(srt->vcf_buf + srt->nsr - 1, 0, sizeof(vcf_buf_t));
663 }
664 }
bcf_sr_sort_init(sr_sort_t * srt)665 sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt)
666 {
667 if ( !srt ) return calloc(1,sizeof(sr_sort_t));
668 memset(srt,0,sizeof(sr_sort_t));
669 return srt;
670 }
bcf_sr_sort_reset(sr_sort_t * srt)671 void bcf_sr_sort_reset(sr_sort_t *srt)
672 {
673 srt->chr = NULL;
674 }
bcf_sr_sort_destroy(sr_sort_t * srt)675 void bcf_sr_sort_destroy(sr_sort_t *srt)
676 {
677 free(srt->active);
678 if ( srt->var_str2int ) khash_str2int_destroy_free(srt->var_str2int);
679 if ( srt->grp_str2int ) khash_str2int_destroy_free(srt->grp_str2int);
680 int i;
681 for (i=0; i<srt->nsr; i++) free(srt->vcf_buf[i].rec);
682 free(srt->vcf_buf);
683 for (i=0; i<srt->mvar; i++)
684 {
685 free(srt->var[i].str);
686 free(srt->var[i].vcf);
687 free(srt->var[i].rec);
688 kbs_destroy(srt->var[i].mask);
689 }
690 free(srt->var);
691 for (i=0; i<srt->mgrp; i++)
692 free(srt->grp[i].var);
693 free(srt->grp);
694 for (i=0; i<srt->mvset; i++)
695 {
696 kbs_destroy(srt->vset[i].mask);
697 free(srt->vset[i].var);
698 }
699 free(srt->vset);
700 free(srt->str.s);
701 free(srt->off);
702 free(srt->charp);
703 free(srt->cnt);
704 free(srt->pmat);
705 memset(srt,0,sizeof(*srt));
706 }
707
708