1 /* The MIT License
2
3 Copyright (c) 2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.
24
25 */
26
27 #include <assert.h>
28 #include <strings.h>
29 #include <htslib/vcf.h>
30 #include <ctype.h>
31 #include "bcftools.h"
32 #include "abuf.h"
33 #include "rbuf.h"
34
35 typedef enum
36 {
37 M_FIRST, M_SUM
38 }
39 merge_rule_t;
40
41 typedef struct
42 {
43 kstring_t ref, alt;
44 int ial; // the index of the original ALT allele, 1-based
45 int beg, end; // 0-based inclusive offsets to ref,alt
46 }
47 atom_t;
48
49 typedef struct
50 {
51 bcf1_t *rec;
52 int nori, nout; // number of ALTs in the input, and VCF rows on output
53 uint8_t *tbl; // nori columns, nout rows; indicates allele contribution to output rows, see "The atomization works as follows" below
54 uint8_t *overlaps; // is the star allele needed for this variant?
55 atom_t **atoms;
56 int matoms, mtbl, moverlaps;
57 char *info_tag;
58 }
59 split_t;
60
61 struct _abuf_t
62 {
63 abuf_opt_t mode;
64 split_t split;
65 atom_t *atoms;
66 int natoms, matoms;
67 const bcf_hdr_t *hdr;
68 bcf_hdr_t *out_hdr;
69 bcf1_t **vcf; // dimensions stored in rbuf
70 rbuf_t rbuf;
71
72 kstring_t tmps;
73 void *tmp, *tmp2;
74 int32_t *gt, *tmpi;
75 int ngt, mgt, ntmpi, mtmpi, mtmp, mtmp2;
76 int star_allele;
77 };
78
abuf_init(const bcf_hdr_t * hdr,abuf_opt_t mode)79 abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode)
80 {
81 if ( mode!=SPLIT ) error("todo\n");
82 abuf_t *buf = (abuf_t*) calloc(1,sizeof(abuf_t));
83 buf->hdr = hdr;
84 buf->out_hdr = (bcf_hdr_t*) hdr;
85 buf->mode = mode;
86 buf->star_allele = 1;
87 rbuf_init(&buf->rbuf, 0);
88 return buf;
89 }
90
abuf_destroy(abuf_t * buf)91 void abuf_destroy(abuf_t *buf)
92 {
93 int i;
94 for (i=0; i<buf->matoms; i++)
95 {
96 free(buf->atoms[i].ref.s);
97 free(buf->atoms[i].alt.s);
98 }
99 free(buf->atoms);
100 free(buf->split.atoms);
101 free(buf->split.overlaps);
102 free(buf->split.tbl);
103 for (i=0; i<buf->rbuf.m; i++)
104 if ( buf->vcf[i] ) bcf_destroy(buf->vcf[i]);
105 free(buf->vcf);
106 free(buf->gt);
107 free(buf->tmpi);
108 free(buf->tmp);
109 free(buf->tmp2);
110 free(buf->tmps.s);
111 free(buf);
112 }
113
abuf_set(abuf_t * buf,abuf_opt_t key,void * value)114 void abuf_set(abuf_t *buf, abuf_opt_t key, void *value)
115 {
116 if ( key==BCF_HDR ) { buf->out_hdr = *((bcf_hdr_t**)value); return; }
117 if ( key==INFO_TAG )
118 {
119 buf->split.info_tag = *((char**)value);
120 bcf_hdr_printf(buf->out_hdr,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX\">",buf->split.info_tag);
121 return;
122 }
123 if ( key==STAR_ALLELE ) { buf->star_allele = *((int*)value); return; }
124 }
125
126 /*
127 Split alleles into primitivs, e.g.
128 CC>TT becomes C>T,C>T
129 GCGT>GTGA becomes C>T,T>A
130
131 There is no sequence alignment, just trimming and hungry matching
132 from left side.
133 */
_atomize_allele(abuf_t * buf,bcf1_t * rec,int ial)134 static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial)
135 {
136 // Trim identical sequence from right
137 char *ref = rec->d.allele[0];
138 char *alt = rec->d.allele[ial];
139 int rlen = strlen(ref);
140 int alen = strlen(alt);
141 while ( rlen>1 && alen>1 && ref[rlen-1]==alt[alen-1] ) rlen--, alen--;
142 int Mlen = rlen > alen ? rlen : alen;
143
144 atom_t *atom = NULL;
145 int i;
146 for (i=0; i<Mlen; i++)
147 {
148 char refb = i<rlen ? ref[i] : '-';
149 char altb = i<alen ? alt[i] : '-';
150 if ( refb!=altb )
151 {
152 if ( refb=='-' || altb=='-' )
153 {
154 assert(atom);
155 if ( altb!='-' ) kputc(altb, &atom->alt);
156 if ( refb!='-' ) { kputc(refb, &atom->ref); atom->end++; }
157 }
158 else
159 {
160 buf->natoms++;
161 hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
162 atom = &buf->atoms[buf->natoms-1];
163 atom->ref.l = 0;
164 atom->alt.l = 0;
165 kputc(refb, &atom->ref);
166 kputc(altb, &atom->alt);
167 atom->beg = atom->end = i;
168 atom->ial = ial;
169 }
170 continue;
171 }
172 if ( i+1>=rlen || i+1>=alen ) // is the next base a deletion?
173 {
174 buf->natoms++;
175 hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
176 atom = &buf->atoms[buf->natoms-1];
177 atom->ref.l = 0;
178 atom->alt.l = 0;
179 kputc(refb, &atom->ref);
180 kputc(altb, &atom->alt);
181 atom->beg = atom->end = i;
182 atom->ial = ial;
183 }
184 }
185 }
_atoms_inconsistent(const atom_t * a,const atom_t * b)186 static int _atoms_inconsistent(const atom_t *a, const atom_t *b)
187 {
188 if ( a->beg < b->beg ) return -1;
189 if ( a->beg > b->beg ) return 1;
190 int rcmp = strcasecmp(a->ref.s,b->ref.s);
191 if ( rcmp ) return rcmp;
192 return strcasecmp(a->alt.s,b->alt.s);
193 }
194 /*
195 For reproducibility of tests on different platforms, we need to guarantee the same order of identical
196 atoms originating from different source ALTs. Even though they are consistent, different values can be
197 picked for VCF annotations as currently the values from the one that comes first are used.
198 */
_cmp_atoms(const void * aptr,const void * bptr)199 static int _cmp_atoms(const void *aptr, const void *bptr)
200 {
201 const atom_t *a = (const atom_t*) aptr;
202 const atom_t *b = (const atom_t*) bptr;
203 int rcmp = _atoms_inconsistent(a,b);
204 if ( rcmp ) return rcmp;
205 if ( a->ial < b->ial ) return -1;
206 if ( a->ial > b->ial ) return 1;
207 return 0;
208 }
_split_table_init(abuf_t * buf,bcf1_t * rec,int natoms)209 static void _split_table_init(abuf_t *buf, bcf1_t *rec, int natoms)
210 {
211 buf->split.rec = rec;
212 buf->split.nori = rec->n_allele - 1;
213 buf->split.nout = 0;
214 hts_expand(uint8_t,buf->split.nori*natoms,buf->split.mtbl,buf->split.tbl);
215 hts_expand(atom_t*,natoms,buf->split.matoms,buf->split.atoms);
216 hts_expand(uint8_t,natoms,buf->split.moverlaps,buf->split.overlaps);
217 memset(buf->split.overlaps,0,sizeof(*buf->split.overlaps)*natoms);
218 }
_split_table_new(abuf_t * buf,atom_t * atom)219 static void _split_table_new(abuf_t *buf, atom_t *atom)
220 {
221 int i, iout = buf->split.nout++;
222 buf->split.atoms[iout] = atom;
223 uint8_t *ptr = buf->split.tbl + iout*buf->split.nori;
224 for (i=0; i<buf->split.nori; i++) ptr[i] = 0;
225 ptr[atom->ial-1] = 1;
226 }
_split_table_overlap(abuf_t * buf,int iout,atom_t * atom)227 static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom)
228 {
229 uint8_t *ptr = buf->split.tbl + iout*buf->split.nori;
230 ptr[atom->ial-1] = _atoms_inconsistent(atom,buf->split.atoms[iout]) ? 2 : 1;
231 buf->split.overlaps[iout] = 1;
232 }
233 #if 0
234 static void _split_table_print(abuf_t *buf)
235 {
236 int i,j;
237 for (i=0; i<buf->split.nout; i++)
238 {
239 atom_t *atom = buf->split.atoms[i];
240 uint8_t *ptr = buf->split.tbl + i*buf->split.nori;
241 fprintf(stderr,"%d\t%s\t%s",(int)buf->split.rec->pos+1+atom->beg,atom->ref.s,atom->alt.s);
242 for (j=0; j<buf->split.nori; j++) fprintf(stderr,"\t%d",(int)ptr[j]);
243 fprintf(stderr,"\n");
244 }
245 }
246 static void _split_table_print_atoms(abuf_t *buf)
247 {
248 int i;
249 for (i=0; i<buf->natoms; i++)
250 {
251 atom_t *atom = &buf->atoms[i];
252 fprintf(stderr,"atom%d %p: ialt=%d %s>%s %d-%d\n",i,atom,atom->ial,atom->ref.s,atom->alt.s,atom->beg,atom->end);
253 }
254 }
255 #endif
_has_star_allele(abuf_t * buf,int iout)256 static inline uint8_t _has_star_allele(abuf_t *buf, int iout)
257 {
258 if ( !buf->star_allele ) return 0;
259 return buf->split.overlaps[iout];
260 }
_split_table_get_ial(abuf_t * buf,int irow,int ial)261 static inline int _split_table_get_ial(abuf_t *buf, int irow, int ial)
262 {
263 if ( !ial ) return ial;
264 return buf->split.tbl[irow*buf->split.nori + ial - 1];
265 }
_split_table_set_chrom_qual(abuf_t * buf)266 static void _split_table_set_chrom_qual(abuf_t *buf)
267 {
268 int iout,j;
269 bcf1_t *rec = buf->split.rec;
270 for (iout=0; iout<buf->split.nout; iout++)
271 {
272 rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
273 j = rbuf_append(&buf->rbuf);
274 if ( !buf->vcf[j] ) buf->vcf[j] = bcf_init1();
275 bcf1_t *out = buf->vcf[j];
276 bcf_clear1(out);
277
278 atom_t *atom = buf->split.atoms[iout];
279 out->rid = rec->rid;
280 out->pos = rec->pos + atom->beg;
281 bcf_update_id(buf->out_hdr, out, rec->d.id);
282
283 const char *als[3];
284 als[0] = atom->ref.s;
285 als[1] = atom->alt.s;
286 als[2] = "*";
287 int nals = _has_star_allele(buf,iout) ? 3 : 2;
288 bcf_update_alleles(buf->out_hdr, out, als, nals);
289
290 if ( bcf_float_is_missing(rec->qual) )
291 bcf_float_set_missing(out->qual);
292 else
293 out->qual = rec->qual;
294
295 bcf_update_filter(buf->out_hdr, out, rec->d.flt, rec->d.n_flt);
296 }
297 }
298 int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst);
_split_table_set_info(abuf_t * buf,bcf_info_t * info,merge_rule_t mode)299 static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mode)
300 {
301 const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key);
302 int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key);
303 int len = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key);
304 if ( len==BCF_VL_G ) return; // todo: Number=G INFO tags
305 if ( type==BCF_HT_LONG ) return; // todo: 64bit integers
306
307 bcf1_t *rec = buf->split.rec;
308 int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/4 : buf->mtmp;
309 int nval = bcf_get_info_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
310 if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*4;
311
312 // Check for incorrect number of values. Note this check does not consider all values missing
313 // and will remove annotations that don't pass.
314 if ( type==BCF_HT_INT || type==BCF_HT_REAL )
315 {
316 if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return;
317 }
318
319 if ( buf->mtmp2 < buf->mtmp )
320 {
321 buf->tmp2 = realloc(buf->tmp2, buf->mtmp);
322 if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", buf->mtmp);
323 buf->mtmp2 = buf->mtmp;
324 }
325
326 const int num_size = 4;
327 assert( num_size==sizeof(int32_t) && num_size==sizeof(float) );
328 int32_t missing = bcf_int32_missing;
329 void *missing_ptr = (void*)&missing;
330 if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
331 int32_t vector_end = bcf_int32_vector_end;
332 void *vector_end_ptr = (void*)&vector_end;
333 if ( type==BCF_HT_REAL ) bcf_float_set_vector_end(*((float*)vector_end_ptr));
334
335 int iout,i;
336 for (iout=0; iout<buf->split.nout; iout++)
337 {
338 bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
339 int star_allele = _has_star_allele(buf,iout);
340 int ret = 0;
341 if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
342 ret = bcf_update_info(buf->out_hdr, out, tag, type==BCF_HT_FLAG ? NULL : buf->tmp, nval, type);
343 else if ( len==BCF_VL_A && type!=BCF_HT_STR )
344 {
345 int iori = buf->split.atoms[iout]->ial - 1;
346 assert( iori<nval );
347 if ( !memcmp(vector_end_ptr,buf->tmp+num_size*iori,num_size) )
348 memcpy(buf->tmp2,missing_ptr,num_size);
349 else
350 memcpy(buf->tmp2,buf->tmp+num_size*iori,num_size);
351 if ( star_allele )
352 memcpy(buf->tmp2+num_size,missing_ptr,num_size);
353 ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type);
354 }
355 else if ( len==BCF_VL_A && type==BCF_HT_STR )
356 {
357 int iori = buf->split.atoms[iout]->ial - 1;
358 kstring_t dst;
359 dst.l = 0; dst.m = buf->mtmp2; dst.s = (char*)buf->tmp2;
360 kputc('.',&dst);
361 if ( star_allele ) kputs(",.",&dst);
362 copy_string_field(buf->tmp, iori, nval, &dst, 0);
363 if ( star_allele ) copy_string_field(".", 0, 1, &dst, 1);
364 buf->mtmp2 = dst.m;
365 buf->tmp2 = dst.s;
366 ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
367 }
368 else if ( len==BCF_VL_R && type!=BCF_HT_STR )
369 {
370 memcpy(buf->tmp2,buf->tmp,num_size); // REF contributes to all records
371 int iori = buf->split.atoms[iout]->ial;
372 assert( iori<nval && iori<=buf->split.nori );
373 if ( !memcmp(vector_end_ptr,buf->tmp+num_size*iori,num_size) )
374 memcpy(buf->tmp2+num_size,missing_ptr,num_size);
375 else
376 memcpy(buf->tmp2+num_size,buf->tmp+num_size*iori,num_size);
377 if ( type==BCF_HT_INT && mode==M_SUM )
378 {
379 uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
380 for (i=iori; i<buf->split.nori; i++)
381 {
382 if ( tbl[i]==1 ) ((int32_t*)buf->tmp2)[1] += ((int32_t*)buf->tmp)[i+1];
383 }
384 }
385 if ( star_allele )
386 memcpy(buf->tmp2+2*num_size,missing_ptr,num_size);
387 ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type);
388 }
389 else if ( len==BCF_VL_R && type==BCF_HT_STR )
390 {
391 int iori = buf->split.atoms[iout]->ial - 1;
392 kstring_t dst;
393 dst.l = 0; dst.m = buf->mtmp2; dst.s = (char*)buf->tmp2;
394 kputs(".,.",&dst);
395 if ( star_allele ) kputs(",.",&dst);
396 copy_string_field(buf->tmp, 0, nval, &dst, 0);
397 copy_string_field(buf->tmp, iori+1, nval, &dst, 1);
398 if ( star_allele ) copy_string_field(".", 0, 1, &dst, 2);
399 buf->mtmp2 = dst.m;
400 buf->tmp2 = dst.s;
401 ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
402 }
403 if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
404 }
405 }
_split_table_set_history(abuf_t * buf)406 static void _split_table_set_history(abuf_t *buf)
407 {
408 int i,j;
409 bcf1_t *rec = buf->split.rec;
410 buf->tmps.l = 0;
411 ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
412 for (i=1; i<rec->n_allele; i++)
413 {
414 kputs(rec->d.allele[i],&buf->tmps);
415 if ( i+1<rec->n_allele ) kputc(',',&buf->tmps);
416 else kputc(',',&buf->tmps);
417 }
418 int len = buf->tmps.l;
419 buf->tmps.s[buf->tmps.l-1] = '|';
420
421 for (i=0; i<buf->split.nout; i++)
422 {
423 buf->tmps.l = len;
424 bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)];
425 uint8_t *ptr = buf->split.tbl + i*buf->split.nori;
426 for (j=0; j<buf->split.nori; j++)
427 {
428 if ( ptr[j]!=1 ) continue;
429 kputw(j+1,&buf->tmps);
430 kputc(',',&buf->tmps);
431 }
432 buf->tmps.s[--buf->tmps.l] = 0;
433 if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
434 error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
435 }
436 }
_split_table_set_gt(abuf_t * buf)437 static void _split_table_set_gt(abuf_t *buf)
438 {
439 int nsmpl = bcf_hdr_nsamples(buf->hdr);
440 if ( !nsmpl ) return;
441
442 bcf1_t *rec = buf->split.rec;
443 buf->ngt = bcf_get_genotypes(buf->hdr, rec, &buf->gt, &buf->mgt);
444 if ( buf->ngt<=0 ) return;
445 else
446 hts_expand(int32_t,buf->ngt,buf->mtmpi,buf->tmpi);
447
448 int iout,i,j;
449 for (iout=0; iout<buf->split.nout; iout++)
450 {
451 bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
452 int star_allele = _has_star_allele(buf,iout);
453 int max_ploidy = buf->ngt/nsmpl;
454 int32_t *src = buf->gt, *dst = buf->tmpi;
455 for (i=0; i<nsmpl; i++)
456 {
457 for (j=0; j<max_ploidy; j++)
458 {
459 if ( src[j]==bcf_int32_vector_end || bcf_gt_is_missing(src[j]) )
460 {
461 dst[j] = src[j];
462 continue;
463 }
464 int iori = bcf_gt_allele(src[j]);
465 if ( iori<0 || iori>=rec->n_allele )
466 error("Out-of-bounds genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(buf->hdr,rec),rec->pos+1);
467 int ial = _split_table_get_ial(buf,iout,iori);
468 if ( ial==2 && !star_allele )
469 dst[j] = bcf_gt_missing;
470 else
471 dst[j] = bcf_gt_is_phased(src[j]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial);
472 }
473 src += max_ploidy;
474 dst += max_ploidy;
475 }
476 bcf_update_genotypes(buf->out_hdr,out,buf->tmpi,buf->ngt);
477 }
478 }
_split_table_set_format(abuf_t * buf,bcf_fmt_t * fmt,merge_rule_t mode)479 static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mode)
480 {
481 int nsmpl = bcf_hdr_nsamples(buf->hdr);
482 if ( !nsmpl ) return;
483
484 const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,fmt->id);
485 if ( tag[0]=='G' && tag[1]=='T' && !tag[2] ) // FORMAT/GT
486 {
487 _split_table_set_gt(buf);
488 return;
489 }
490
491 int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id);
492 int len = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id);
493 if ( type==BCF_HT_STR && len==BCF_VL_G ) return; // possible todo: Number=G for strings
494 if ( type==BCF_HT_LONG ) return; // todo: 64bit integers
495
496 const int num_size = 4;
497 assert( num_size==sizeof(int32_t) && num_size==sizeof(float) );
498 int32_t missing = bcf_int32_missing;
499 void *missing_ptr = (void*)&missing;
500 if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
501 int32_t vector_end = bcf_int32_vector_end;
502 void *vector_end_ptr = (void*)&vector_end;
503 if ( type==BCF_HT_REAL ) bcf_float_set_vector_end(*((float*)vector_end_ptr));
504
505 bcf1_t *rec = buf->split.rec;
506 int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/num_size : buf->mtmp; // number of items
507 int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
508 if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size; // number of bytes
509
510 if ( type==BCF_HT_INT || type==BCF_HT_REAL )
511 {
512 if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return; // not haploid nor diploid
513
514 // Check for incorrect number of values. Note this check does not consider all values missing
515 // and will remove annotations that don't pass.
516 if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return;
517 }
518
519 // Increase buffer size to accommodate star allele
520 int nval1 = nval / nsmpl;
521 mtmp = buf->mtmp;
522 if ( type==BCF_HT_INT || type==BCF_HT_REAL )
523 {
524 if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele
525 else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3);
526 }
527 else if ( type==BCF_HT_STR )
528 {
529 if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < nsmpl*(nval1+2) ) mtmp = nsmpl*(nval1+2); // +2 for the possibility of the star allele, ",."
530 else if ( len==BCF_VL_G && mtmp < nsmpl*(nval1+6) ) mtmp = nsmpl*(nval1+6);
531 }
532
533 if ( buf->mtmp2 < mtmp )
534 {
535 buf->tmp2 = realloc(buf->tmp2, mtmp);
536 if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", mtmp);
537 buf->mtmp2 = mtmp;
538 }
539
540 int iout, i, j;
541 for (iout=0; iout<buf->split.nout; iout++)
542 {
543 int star_allele = _has_star_allele(buf,iout);
544 bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
545 int ret = 0;
546 if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
547 ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type);
548 else if ( len==BCF_VL_A && type!=BCF_HT_STR )
549 {
550 int iori = buf->split.atoms[iout]->ial - 1;
551 assert( iori<nval );
552 for (i=0; i<nsmpl; i++)
553 {
554 void *src = buf->tmp + nval1*num_size*i;
555 void *dst = buf->tmp2 + num_size*i*(star_allele+1);
556 if ( !memcmp(vector_end_ptr,src+iori*num_size,num_size) )
557 memcpy(dst,missing_ptr,num_size);
558 else
559 memcpy(dst,src+iori*num_size,num_size);
560 if ( star_allele )
561 memcpy(dst+num_size,missing_ptr,num_size);
562 }
563 ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type);
564 }
565 else if ( (len==BCF_VL_A || len==BCF_VL_R) && type==BCF_HT_STR )
566 {
567 int ioff = len==BCF_VL_R ? 1 : 0;
568 int iori = buf->split.atoms[iout]->ial - 1;
569 int nval1_dst = star_allele ? nval1 + 2 : nval1;
570 memset(buf->tmp2,0,nval1_dst*nsmpl);
571 for (i=0; i<nsmpl; i++)
572 {
573 kstring_t dst;
574 dst.l = 0; dst.m = nval1_dst; dst.s = (char*)buf->tmp2 + nval1_dst*i;
575 kputc_('.',&dst);
576 if ( star_allele ) kputsn_(",.",2,&dst);
577 if ( len==BCF_VL_R )
578 {
579 kputsn_(",.",2,&dst);
580 copy_string_field(buf->tmp+nval1*i, 0, nval1, &dst, 0);
581 }
582 copy_string_field(buf->tmp+nval1*i, iori+ioff, nval1, &dst, 0+ioff);
583 if ( star_allele ) copy_string_field(".", 0, 1, &dst, 1+ioff);
584 }
585 ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nval1_dst*nsmpl, type);
586 }
587 else if ( len==BCF_VL_R && type!=BCF_HT_STR )
588 {
589 int iori = buf->split.atoms[iout]->ial;
590 assert( iori<=nval );
591 for (i=0; i<nsmpl; i++)
592 {
593 void *src = buf->tmp + nval1*num_size*i;
594 void *dst = buf->tmp2 + num_size*i*(star_allele+2);
595 memcpy(dst,src,num_size);
596 memcpy(dst+num_size,src+iori*num_size,num_size);
597 if ( type==BCF_HT_INT && mode==M_SUM )
598 {
599 uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
600 for (j=iori; j<buf->split.nori; j++)
601 if ( tbl[j]==1 ) ((int32_t*)dst)[1] += ((int32_t*)src)[j+1];
602 }
603 if ( star_allele )
604 memcpy(dst+num_size*2,missing_ptr,num_size);
605 }
606 ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type);
607 }
608 else if ( len==BCF_VL_G && type!=BCF_HT_STR )
609 {
610 int iori = buf->split.atoms[iout]->ial;
611 int i01 = bcf_alleles2gt(0,iori);
612 int i11 = bcf_alleles2gt(iori,iori);
613 assert( iori<nval );
614 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end) { \
615 for (i=0; i<nsmpl; i++) \
616 { \
617 type_t *src = (type_t*)buf->tmp + i*nval1; \
618 type_t *dst = (type_t*)buf->tmp2 + i*3*(1+star_allele); \
619 int n=0; /* determine ploidy of this genotype */ \
620 while ( n<nval1 && !(is_vector_end) ) { n++; src++; } \
621 src = (type_t*)buf->tmp + i*nval1; \
622 memcpy(dst++,src,sizeof(type)); \
623 int nmiss = 0, nend = 0; \
624 if ( n==rec->n_allele ) /* haploid */ \
625 { \
626 memcpy(dst++,src+iori,sizeof(type)); \
627 if ( star_allele ) { nmiss = 1; nend = 3; } \
628 else nend = 1; \
629 } \
630 else if ( n==nval1 ) \
631 { \
632 memcpy(dst++,src+i01,sizeof(type)); \
633 memcpy(dst++,src+i11,sizeof(type)); \
634 if ( star_allele ) nmiss = 3; \
635 } \
636 else if ( n==1 && is_missing ) \
637 { \
638 if ( star_allele ) nend = 5; \
639 else nend = 2; \
640 } \
641 else \
642 error("Incorrect number of values at %s:%"PRIhts_pos" .. tag=FORMAT/%s Number=G nAlleles=%d nValues=%d, %d-th sample\n", \
643 bcf_seqname(buf->hdr,rec),rec->pos+1,tag,rec->n_allele,n,i+1); \
644 for (j=0; j<nmiss; j++) { set_missing; dst++; } \
645 for (j=0; j<nend; j++) { set_vector_end; dst++; } \
646 } \
647 }
648 switch (type)
649 {
650 case BCF_HT_INT: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *dst=bcf_int32_missing, *dst=bcf_int32_vector_end); break;
651 case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*dst), bcf_float_set_vector_end(*dst)); break;
652 default: error("Unexpected case: %d\n", type);
653 }
654 #undef BRANCH
655 ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
656 }
657 if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
658 }
659 }
_is_acgtn(char * seq)660 static inline int _is_acgtn(char *seq)
661 {
662 while ( *seq )
663 {
664 char c = toupper(*seq);
665 if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 0;
666 seq++;
667 }
668 return 1;
669 }
670 /*
671 The atomization works as follows:
672 - Atomize each alternate allele separately by leaving out sequence identical to the reference. No
673 alignment is performed, just greedy trimming of the end, then from left. This operation returns
674 a list of atoms (atom_t) which carry fragments of REF,ALT and their positions as 0-based offsets
675 to the original REF allele
676 - Sort atoms by POS, REF and ALT. Each unique atom (POS+REF+ALT) forms a new VCF record, each
677 with a single ALT.
678 - For each new VCF record determine how to translate the original allele index (iori) to this new
679 record:
680 - 1: the original allele matches the atom
681 - 0: the original allele does not overlap this atom or the overlapping part matches the REF
682 allele
683 - 2 (or equivalently "."): there is a mismatch between the original allele and the atom
684 The mapping is encoded in a table with columns corresponding to the original ALTs and rows
685 to the new POS+ALTs (atoms). The table is initialized to 0, then we set 1's for matching
686 atoms and 2's for overlapping mismatching atoms.
687
688 Note that different ALT alleles can result in the same atom (the same output line) and this code
689 does not know how to reconcile possibly conflicting VCF annotations. This could be improved
690 and merge logic provided, similarly to `merge -l`. For example, the allelic depths (AD) should
691 be summed for the same atomized output allele. However, this level of complexity is not addressed
692 in this initial draft. Higher priority for now is to provide the inverse "join" operation.
693
694 Update 2021-04-09:
695 Tags QS,AD are now automatically incremented as they should be, for both INFO and FORMAT.
696 Note that the code will fail on missing values (todo) and it needs to be generalized and
697 made customizable.
698 */
_abuf_split(abuf_t * buf,bcf1_t * rec)699 void _abuf_split(abuf_t *buf, bcf1_t *rec)
700 {
701 int i,j;
702 if ( rec->n_allele < 2 )
703 {
704 rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
705 int j = rbuf_append(&buf->rbuf);
706 if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]);
707 buf->vcf[j] = bcf_dup(rec);
708 return;
709 }
710 for (i=1; i<rec->n_allele; i++)
711 {
712 if ( _is_acgtn(rec->d.allele[i]) ) continue;
713 rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
714 int j = rbuf_append(&buf->rbuf);
715 if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]);
716 buf->vcf[j] = bcf_dup(rec);
717 return;
718 }
719
720 buf->natoms = 0;
721 for (i=1; i<rec->n_allele; i++) _atomize_allele(buf,rec,i);
722 qsort(buf->atoms,buf->natoms,sizeof(*buf->atoms),_cmp_atoms);
723 _split_table_init(buf,rec,buf->natoms);
724 for (i=0; i<buf->natoms; i++)
725 {
726 if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue;
727 _split_table_new(buf, &buf->atoms[i]); // add a new unique output atom
728 }
729 for (i=0; i<buf->natoms; i++)
730 {
731 // Looping over sorted list of all atoms with possible duplicates from different source ALT alleles
732 atom_t *atom = &buf->atoms[i];
733 for (j=0; j<buf->split.nout; j++)
734 {
735 atom_t *out = buf->split.atoms[j];
736 if ( atom == out ) continue; // table already set to 1
737 if ( atom->beg > out->end ) continue; // cannot overlap this output atom
738 if ( atom->end < out->beg ) break; // this atom is ahead of all subsequent output records
739 _split_table_overlap(buf, j, atom);
740 }
741 }
742 assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode
743
744 // Create the output records, transferring all annotations:
745 // CHROM-QUAL
746 _split_table_set_chrom_qual(buf);
747
748 // INFO
749 for (i=0; i<rec->n_info; i++)
750 {
751 // this implementation of merging rules is temporary: generalize and made customizable through the API
752 merge_rule_t mode = M_FIRST;
753 const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.info[i].key);
754 if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM;
755
756 _split_table_set_info(buf, &rec->d.info[i], mode);
757 }
758
759 // Set INFO tag showing the original record
760 if ( buf->split.info_tag )
761 _split_table_set_history(buf);
762
763 // FORMAT
764 for (i=0; i<rec->n_fmt; i++)
765 {
766 // this implementation of merging rules is temporary: generalize and made customizable through the API
767 merge_rule_t mode = M_FIRST;
768 const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.fmt[i].id);
769 if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM;
770
771 _split_table_set_format(buf, &rec->d.fmt[i], mode);
772 }
773
774 // Check that at least one FORMAT field was added, if not, the number of samples must be set manually
775 for (i=0; i<buf->split.nout; i++)
776 {
777 bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)];
778 if ( !out->n_sample ) out->n_sample = rec->n_sample;
779 }
780 }
781
abuf_push(abuf_t * buf,bcf1_t * rec)782 void abuf_push(abuf_t *buf, bcf1_t *rec)
783 {
784 bcf_unpack(rec, BCF_UN_ALL);
785 if ( buf->mode==SPLIT ) _abuf_split(buf,rec);
786 }
787
abuf_flush(abuf_t * buf,int flush_all)788 bcf1_t *abuf_flush(abuf_t *buf, int flush_all)
789 {
790 int i;
791
792 if ( buf->rbuf.n==0 ) return NULL;
793 if ( flush_all ) goto ret;
794
795 ret:
796 i = rbuf_shift(&buf->rbuf);
797 return buf->vcf[i];
798 }
799
800