1 /* The MIT License
2 
3    Copyright (c) 2021 Genome Research Ltd.
4 
5    Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7    Permission is hereby granted, free of charge, to any person obtaining a copy
8    of this software and associated documentation files (the "Software"), to deal
9    in the Software without restriction, including without limitation the rights
10    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11    copies of the Software, and to permit persons to whom the Software is
12    furnished to do so, subject to the following conditions:
13 
14    The above copyright notice and this permission notice shall be included in
15    all copies or substantial portions of the Software.
16 
17    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23    THE SOFTWARE.
24 
25  */
26 
27 #include <assert.h>
28 #include <strings.h>
29 #include <htslib/vcf.h>
30 #include <ctype.h>
31 #include "bcftools.h"
32 #include "abuf.h"
33 #include "rbuf.h"
34 
35 typedef enum
36 {
37     M_FIRST, M_SUM
38 }
39 merge_rule_t;
40 
41 typedef struct
42 {
43     kstring_t ref, alt;
44     int ial;        // the index of the original ALT allele, 1-based
45     int beg, end;   // 0-based inclusive offsets to ref,alt
46 }
47 atom_t;
48 
49 typedef struct
50 {
51     bcf1_t *rec;
52     int nori, nout;     // number of ALTs in the input, and VCF rows on output
53     uint8_t *tbl;       // nori columns, nout rows; indicates allele contribution to output rows, see "The atomization works as follows" below
54     uint8_t *overlaps;  // is the star allele needed for this variant?
55     atom_t **atoms;
56     int matoms, mtbl, moverlaps;
57     char *info_tag;
58 }
59 split_t;
60 
61 struct _abuf_t
62 {
63     abuf_opt_t mode;
64     split_t split;
65     atom_t *atoms;
66     int natoms, matoms;
67     const bcf_hdr_t *hdr;
68     bcf_hdr_t *out_hdr;
69     bcf1_t **vcf;       // dimensions stored in rbuf
70     rbuf_t rbuf;
71 
72     kstring_t tmps;
73     void *tmp, *tmp2;
74     int32_t *gt, *tmpi;
75     int ngt, mgt, ntmpi, mtmpi, mtmp, mtmp2;
76     int star_allele;
77 };
78 
abuf_init(const bcf_hdr_t * hdr,abuf_opt_t mode)79 abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode)
80 {
81     if ( mode!=SPLIT ) error("todo\n");
82     abuf_t *buf = (abuf_t*) calloc(1,sizeof(abuf_t));
83     buf->hdr  = hdr;
84     buf->out_hdr = (bcf_hdr_t*) hdr;
85     buf->mode = mode;
86     buf->star_allele = 1;
87     rbuf_init(&buf->rbuf, 0);
88     return buf;
89 }
90 
abuf_destroy(abuf_t * buf)91 void abuf_destroy(abuf_t *buf)
92 {
93     int i;
94     for (i=0; i<buf->matoms; i++)
95     {
96         free(buf->atoms[i].ref.s);
97         free(buf->atoms[i].alt.s);
98     }
99     free(buf->atoms);
100     free(buf->split.atoms);
101     free(buf->split.overlaps);
102     free(buf->split.tbl);
103     for (i=0; i<buf->rbuf.m; i++)
104         if ( buf->vcf[i] ) bcf_destroy(buf->vcf[i]);
105     free(buf->vcf);
106     free(buf->gt);
107     free(buf->tmpi);
108     free(buf->tmp);
109     free(buf->tmp2);
110     free(buf->tmps.s);
111     free(buf);
112 }
113 
abuf_set(abuf_t * buf,abuf_opt_t key,void * value)114 void abuf_set(abuf_t *buf, abuf_opt_t key, void *value)
115 {
116     if ( key==BCF_HDR ) { buf->out_hdr = *((bcf_hdr_t**)value); return; }
117     if ( key==INFO_TAG )
118     {
119         buf->split.info_tag = *((char**)value);
120         bcf_hdr_printf(buf->out_hdr,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX\">",buf->split.info_tag);
121         return;
122     }
123     if ( key==STAR_ALLELE ) { buf->star_allele = *((int*)value); return; }
124 }
125 
126 /*
127     Split alleles into primitivs, e.g.
128         CC>TT  becomes  C>T,C>T
129         GCGT>GTGA  becomes C>T,T>A
130 
131     There is no sequence alignment, just trimming and hungry matching
132     from left side.
133 */
_atomize_allele(abuf_t * buf,bcf1_t * rec,int ial)134 static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial)
135 {
136     // Trim identical sequence from right
137     char *ref = rec->d.allele[0];
138     char *alt = rec->d.allele[ial];
139     int rlen = strlen(ref);
140     int alen = strlen(alt);
141     while ( rlen>1 && alen>1 && ref[rlen-1]==alt[alen-1] ) rlen--, alen--;
142     int Mlen = rlen > alen ? rlen : alen;
143 
144     atom_t *atom = NULL;
145     int i;
146     for (i=0; i<Mlen; i++)
147     {
148         char refb = i<rlen ? ref[i] : '-';
149         char altb = i<alen ? alt[i] : '-';
150         if ( refb!=altb )
151         {
152             if ( refb=='-' || altb=='-' )
153             {
154                 assert(atom);
155                 if ( altb!='-' ) kputc(altb, &atom->alt);
156                 if ( refb!='-' ) { kputc(refb, &atom->ref); atom->end++; }
157             }
158             else
159             {
160                 buf->natoms++;
161                 hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
162                 atom = &buf->atoms[buf->natoms-1];
163                 atom->ref.l = 0;
164                 atom->alt.l = 0;
165                 kputc(refb, &atom->ref);
166                 kputc(altb, &atom->alt);
167                 atom->beg = atom->end = i;
168                 atom->ial = ial;
169             }
170             continue;
171         }
172         if ( i+1>=rlen || i+1>=alen )   // is the next base a deletion?
173         {
174             buf->natoms++;
175             hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
176             atom = &buf->atoms[buf->natoms-1];
177             atom->ref.l = 0;
178             atom->alt.l = 0;
179             kputc(refb, &atom->ref);
180             kputc(altb, &atom->alt);
181             atom->beg = atom->end = i;
182             atom->ial = ial;
183         }
184     }
185 }
_atoms_inconsistent(const atom_t * a,const atom_t * b)186 static int _atoms_inconsistent(const atom_t *a, const atom_t *b)
187 {
188     if ( a->beg < b->beg ) return -1;
189     if ( a->beg > b->beg ) return 1;
190     int rcmp = strcasecmp(a->ref.s,b->ref.s);
191     if ( rcmp ) return rcmp;
192     return strcasecmp(a->alt.s,b->alt.s);
193 }
194 /*
195     For reproducibility of tests on different platforms, we need to guarantee the same order of identical
196     atoms originating from different source ALTs.  Even though they are consistent, different values can be
197     picked for VCF annotations as currently the values from the one that comes first are used.
198 */
_cmp_atoms(const void * aptr,const void * bptr)199 static int _cmp_atoms(const void *aptr, const void *bptr)
200 {
201     const atom_t *a = (const atom_t*) aptr;
202     const atom_t *b = (const atom_t*) bptr;
203     int rcmp = _atoms_inconsistent(a,b);
204     if ( rcmp ) return rcmp;
205     if ( a->ial < b->ial ) return -1;
206     if ( a->ial > b->ial ) return 1;
207     return 0;
208 }
_split_table_init(abuf_t * buf,bcf1_t * rec,int natoms)209 static void _split_table_init(abuf_t *buf, bcf1_t *rec, int natoms)
210 {
211     buf->split.rec  = rec;
212     buf->split.nori = rec->n_allele - 1;
213     buf->split.nout = 0;
214     hts_expand(uint8_t,buf->split.nori*natoms,buf->split.mtbl,buf->split.tbl);
215     hts_expand(atom_t*,natoms,buf->split.matoms,buf->split.atoms);
216     hts_expand(uint8_t,natoms,buf->split.moverlaps,buf->split.overlaps);
217     memset(buf->split.overlaps,0,sizeof(*buf->split.overlaps)*natoms);
218 }
_split_table_new(abuf_t * buf,atom_t * atom)219 static void _split_table_new(abuf_t *buf, atom_t *atom)
220 {
221     int i, iout = buf->split.nout++;
222     buf->split.atoms[iout] = atom;
223     uint8_t *ptr = buf->split.tbl + iout*buf->split.nori;
224     for (i=0; i<buf->split.nori; i++) ptr[i] = 0;
225     ptr[atom->ial-1] = 1;
226 }
_split_table_overlap(abuf_t * buf,int iout,atom_t * atom)227 static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom)
228 {
229     uint8_t *ptr = buf->split.tbl + iout*buf->split.nori;
230     ptr[atom->ial-1] = _atoms_inconsistent(atom,buf->split.atoms[iout]) ? 2 : 1;
231     buf->split.overlaps[iout] = 1;
232 }
233 #if 0
234 static void _split_table_print(abuf_t *buf)
235 {
236     int i,j;
237     for (i=0; i<buf->split.nout; i++)
238     {
239         atom_t *atom = buf->split.atoms[i];
240         uint8_t *ptr = buf->split.tbl + i*buf->split.nori;
241         fprintf(stderr,"%d\t%s\t%s",(int)buf->split.rec->pos+1+atom->beg,atom->ref.s,atom->alt.s);
242         for (j=0; j<buf->split.nori; j++) fprintf(stderr,"\t%d",(int)ptr[j]);
243         fprintf(stderr,"\n");
244     }
245 }
246 static void _split_table_print_atoms(abuf_t *buf)
247 {
248     int i;
249     for (i=0; i<buf->natoms; i++)
250     {
251         atom_t *atom = &buf->atoms[i];
252         fprintf(stderr,"atom%d %p: ialt=%d %s>%s %d-%d\n",i,atom,atom->ial,atom->ref.s,atom->alt.s,atom->beg,atom->end);
253     }
254 }
255 #endif
_has_star_allele(abuf_t * buf,int iout)256 static inline uint8_t _has_star_allele(abuf_t *buf, int iout)
257 {
258     if ( !buf->star_allele ) return 0;
259     return buf->split.overlaps[iout];
260 }
_split_table_get_ial(abuf_t * buf,int irow,int ial)261 static inline int _split_table_get_ial(abuf_t *buf, int irow, int ial)
262 {
263     if ( !ial ) return ial;
264     return buf->split.tbl[irow*buf->split.nori + ial - 1];
265 }
_split_table_set_chrom_qual(abuf_t * buf)266 static void _split_table_set_chrom_qual(abuf_t *buf)
267 {
268     int iout,j;
269     bcf1_t *rec = buf->split.rec;
270     for (iout=0; iout<buf->split.nout; iout++)
271     {
272         rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
273         j = rbuf_append(&buf->rbuf);
274         if ( !buf->vcf[j] ) buf->vcf[j] = bcf_init1();
275         bcf1_t *out = buf->vcf[j];
276         bcf_clear1(out);
277 
278         atom_t *atom = buf->split.atoms[iout];
279         out->rid = rec->rid;
280         out->pos = rec->pos + atom->beg;
281         bcf_update_id(buf->out_hdr, out, rec->d.id);
282 
283         const char *als[3];
284         als[0] = atom->ref.s;
285         als[1] = atom->alt.s;
286         als[2] = "*";
287         int nals = _has_star_allele(buf,iout) ? 3 : 2;
288         bcf_update_alleles(buf->out_hdr, out, als, nals);
289 
290         if ( bcf_float_is_missing(rec->qual) )
291             bcf_float_set_missing(out->qual);
292         else
293             out->qual = rec->qual;
294 
295         bcf_update_filter(buf->out_hdr, out, rec->d.flt, rec->d.n_flt);
296     }
297 }
298 int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst);
_split_table_set_info(abuf_t * buf,bcf_info_t * info,merge_rule_t mode)299 static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mode)
300 {
301     const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key);
302     int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key);
303     int len  = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key);
304     if ( len==BCF_VL_G ) return;                                                // todo: Number=G INFO tags
305     if ( type==BCF_HT_LONG ) return;                                            // todo: 64bit integers
306 
307     bcf1_t *rec = buf->split.rec;
308     int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/4 : buf->mtmp;
309     int nval = bcf_get_info_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
310     if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*4;
311 
312     // Check for incorrect number of values. Note this check does not consider all values missing
313     // and will remove annotations that don't pass.
314     if ( type==BCF_HT_INT || type==BCF_HT_REAL )
315     {
316         if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return;
317     }
318 
319     if ( buf->mtmp2 < buf->mtmp )
320     {
321         buf->tmp2  = realloc(buf->tmp2, buf->mtmp);
322         if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", buf->mtmp);
323         buf->mtmp2 = buf->mtmp;
324     }
325 
326     const int num_size = 4;
327     assert( num_size==sizeof(int32_t) && num_size==sizeof(float) );
328     int32_t missing = bcf_int32_missing;
329     void *missing_ptr = (void*)&missing;
330     if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
331     int32_t vector_end = bcf_int32_vector_end;
332     void *vector_end_ptr = (void*)&vector_end;
333     if ( type==BCF_HT_REAL ) bcf_float_set_vector_end(*((float*)vector_end_ptr));
334 
335     int iout,i;
336     for (iout=0; iout<buf->split.nout; iout++)
337     {
338         bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
339         int star_allele = _has_star_allele(buf,iout);
340         int ret = 0;
341         if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
342             ret = bcf_update_info(buf->out_hdr, out, tag, type==BCF_HT_FLAG ? NULL : buf->tmp, nval, type);
343         else if ( len==BCF_VL_A && type!=BCF_HT_STR )
344         {
345             int iori = buf->split.atoms[iout]->ial - 1;
346             assert( iori<nval );
347             if ( !memcmp(vector_end_ptr,buf->tmp+num_size*iori,num_size) )
348                 memcpy(buf->tmp2,missing_ptr,num_size);
349             else
350                 memcpy(buf->tmp2,buf->tmp+num_size*iori,num_size);
351             if ( star_allele )
352                 memcpy(buf->tmp2+num_size,missing_ptr,num_size);
353             ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type);
354         }
355         else if ( len==BCF_VL_A && type==BCF_HT_STR )
356         {
357             int iori = buf->split.atoms[iout]->ial - 1;
358             kstring_t dst;
359             dst.l = 0; dst.m = buf->mtmp2; dst.s = (char*)buf->tmp2;
360             kputc('.',&dst);
361             if ( star_allele ) kputs(",.",&dst);
362             copy_string_field(buf->tmp, iori, nval, &dst, 0);
363             if ( star_allele ) copy_string_field(".", 0, 1, &dst, 1);
364             buf->mtmp2 = dst.m;
365             buf->tmp2  = dst.s;
366             ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
367         }
368         else if ( len==BCF_VL_R && type!=BCF_HT_STR )
369         {
370             memcpy(buf->tmp2,buf->tmp,num_size);   // REF contributes to all records
371             int iori = buf->split.atoms[iout]->ial;
372             assert( iori<nval && iori<=buf->split.nori );
373             if ( !memcmp(vector_end_ptr,buf->tmp+num_size*iori,num_size) )
374                 memcpy(buf->tmp2+num_size,missing_ptr,num_size);
375             else
376                 memcpy(buf->tmp2+num_size,buf->tmp+num_size*iori,num_size);
377             if ( type==BCF_HT_INT && mode==M_SUM )
378             {
379                 uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
380                 for (i=iori; i<buf->split.nori; i++)
381                 {
382                     if ( tbl[i]==1 ) ((int32_t*)buf->tmp2)[1] += ((int32_t*)buf->tmp)[i+1];
383                 }
384             }
385             if ( star_allele )
386                 memcpy(buf->tmp2+2*num_size,missing_ptr,num_size);
387             ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type);
388         }
389         else if ( len==BCF_VL_R && type==BCF_HT_STR )
390         {
391             int iori = buf->split.atoms[iout]->ial - 1;
392             kstring_t dst;
393             dst.l = 0; dst.m = buf->mtmp2; dst.s = (char*)buf->tmp2;
394             kputs(".,.",&dst);
395             if ( star_allele ) kputs(",.",&dst);
396             copy_string_field(buf->tmp, 0, nval, &dst, 0);
397             copy_string_field(buf->tmp, iori+1, nval, &dst, 1);
398             if ( star_allele ) copy_string_field(".", 0, 1, &dst, 2);
399             buf->mtmp2 = dst.m;
400             buf->tmp2  = dst.s;
401             ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
402         }
403         if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
404     }
405 }
_split_table_set_history(abuf_t * buf)406 static void _split_table_set_history(abuf_t *buf)
407 {
408     int i,j;
409     bcf1_t *rec = buf->split.rec;
410     buf->tmps.l = 0;
411     ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
412     for (i=1; i<rec->n_allele; i++)
413     {
414         kputs(rec->d.allele[i],&buf->tmps);
415         if ( i+1<rec->n_allele ) kputc(',',&buf->tmps);
416         else kputc(',',&buf->tmps);
417     }
418     int len = buf->tmps.l;
419     buf->tmps.s[buf->tmps.l-1] = '|';
420 
421     for (i=0; i<buf->split.nout; i++)
422     {
423         buf->tmps.l = len;
424         bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)];
425         uint8_t *ptr = buf->split.tbl + i*buf->split.nori;
426         for (j=0; j<buf->split.nori; j++)
427         {
428             if ( ptr[j]!=1 ) continue;
429             kputw(j+1,&buf->tmps);
430             kputc(',',&buf->tmps);
431         }
432         buf->tmps.s[--buf->tmps.l] = 0;
433         if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
434             error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
435     }
436 }
_split_table_set_gt(abuf_t * buf)437 static void _split_table_set_gt(abuf_t *buf)
438 {
439     int nsmpl = bcf_hdr_nsamples(buf->hdr);
440     if ( !nsmpl ) return;
441 
442     bcf1_t *rec = buf->split.rec;
443     buf->ngt = bcf_get_genotypes(buf->hdr, rec, &buf->gt, &buf->mgt);
444     if ( buf->ngt<=0 ) return;
445     else
446         hts_expand(int32_t,buf->ngt,buf->mtmpi,buf->tmpi);
447 
448     int iout,i,j;
449     for (iout=0; iout<buf->split.nout; iout++)
450     {
451         bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
452         int star_allele = _has_star_allele(buf,iout);
453         int max_ploidy = buf->ngt/nsmpl;
454         int32_t *src = buf->gt, *dst = buf->tmpi;
455         for (i=0; i<nsmpl; i++)
456         {
457             for (j=0; j<max_ploidy; j++)
458             {
459                 if ( src[j]==bcf_int32_vector_end || bcf_gt_is_missing(src[j]) )
460                 {
461                     dst[j] = src[j];
462                     continue;
463                 }
464                 int iori = bcf_gt_allele(src[j]);
465                 if ( iori<0 || iori>=rec->n_allele )
466                     error("Out-of-bounds genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(buf->hdr,rec),rec->pos+1);
467                 int ial = _split_table_get_ial(buf,iout,iori);
468                 if ( ial==2 && !star_allele )
469                     dst[j] = bcf_gt_missing;
470                 else
471                     dst[j] = bcf_gt_is_phased(src[j]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial);
472             }
473             src += max_ploidy;
474             dst += max_ploidy;
475         }
476         bcf_update_genotypes(buf->out_hdr,out,buf->tmpi,buf->ngt);
477     }
478 }
_split_table_set_format(abuf_t * buf,bcf_fmt_t * fmt,merge_rule_t mode)479 static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mode)
480 {
481     int nsmpl = bcf_hdr_nsamples(buf->hdr);
482     if ( !nsmpl ) return;
483 
484     const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,fmt->id);
485     if ( tag[0]=='G' && tag[1]=='T' && !tag[2] )        // FORMAT/GT
486     {
487         _split_table_set_gt(buf);
488         return;
489     }
490 
491     int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id);
492     int len  = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id);
493     if ( type==BCF_HT_STR && len==BCF_VL_G ) return;                            // possible todo: Number=G for strings
494     if ( type==BCF_HT_LONG ) return;                                            // todo: 64bit integers
495 
496     const int num_size = 4;
497     assert( num_size==sizeof(int32_t) && num_size==sizeof(float) );
498     int32_t missing = bcf_int32_missing;
499     void *missing_ptr = (void*)&missing;
500     if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
501     int32_t vector_end = bcf_int32_vector_end;
502     void *vector_end_ptr = (void*)&vector_end;
503     if ( type==BCF_HT_REAL ) bcf_float_set_vector_end(*((float*)vector_end_ptr));
504 
505     bcf1_t *rec = buf->split.rec;
506     int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/num_size : buf->mtmp;  // number of items
507     int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
508     if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size;                 // number of bytes
509 
510     if ( type==BCF_HT_INT || type==BCF_HT_REAL )
511     {
512         if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return;      // not haploid nor diploid
513 
514         // Check for incorrect number of values. Note this check does not consider all values missing
515         // and will remove annotations that don't pass.
516         if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return;
517     }
518 
519     // Increase buffer size to accommodate star allele
520     int nval1 = nval / nsmpl;
521     mtmp = buf->mtmp;
522     if ( type==BCF_HT_INT || type==BCF_HT_REAL )
523     {
524         if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele
525         else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3);
526     }
527     else if ( type==BCF_HT_STR )
528     {
529         if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < nsmpl*(nval1+2) ) mtmp = nsmpl*(nval1+2); // +2 for the possibility of the star allele, ",."
530         else if ( len==BCF_VL_G && mtmp < nsmpl*(nval1+6) ) mtmp = nsmpl*(nval1+6);
531     }
532 
533     if ( buf->mtmp2 < mtmp )
534     {
535         buf->tmp2  = realloc(buf->tmp2, mtmp);
536         if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", mtmp);
537         buf->mtmp2 = mtmp;
538     }
539 
540     int iout, i, j;
541     for (iout=0; iout<buf->split.nout; iout++)
542     {
543         int star_allele = _has_star_allele(buf,iout);
544         bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
545         int ret = 0;
546         if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
547             ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type);
548         else if ( len==BCF_VL_A && type!=BCF_HT_STR )
549         {
550             int iori = buf->split.atoms[iout]->ial - 1;
551             assert( iori<nval );
552             for (i=0; i<nsmpl; i++)
553             {
554                 void *src = buf->tmp  + nval1*num_size*i;
555                 void *dst = buf->tmp2 + num_size*i*(star_allele+1);
556                 if ( !memcmp(vector_end_ptr,src+iori*num_size,num_size) )
557                     memcpy(dst,missing_ptr,num_size);
558                 else
559                     memcpy(dst,src+iori*num_size,num_size);
560                 if ( star_allele )
561                     memcpy(dst+num_size,missing_ptr,num_size);
562             }
563             ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type);
564         }
565         else if ( (len==BCF_VL_A || len==BCF_VL_R) && type==BCF_HT_STR )
566         {
567             int ioff = len==BCF_VL_R ? 1 : 0;
568             int iori = buf->split.atoms[iout]->ial - 1;
569             int nval1_dst = star_allele ? nval1 + 2 : nval1;
570             memset(buf->tmp2,0,nval1_dst*nsmpl);
571             for (i=0; i<nsmpl; i++)
572             {
573                 kstring_t dst;
574                 dst.l = 0; dst.m = nval1_dst; dst.s = (char*)buf->tmp2 + nval1_dst*i;
575                 kputc_('.',&dst);
576                 if ( star_allele ) kputsn_(",.",2,&dst);
577                 if ( len==BCF_VL_R )
578                 {
579                     kputsn_(",.",2,&dst);
580                     copy_string_field(buf->tmp+nval1*i, 0, nval1, &dst, 0);
581                 }
582                 copy_string_field(buf->tmp+nval1*i, iori+ioff, nval1, &dst, 0+ioff);
583                 if ( star_allele ) copy_string_field(".", 0, 1, &dst, 1+ioff);
584             }
585             ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nval1_dst*nsmpl, type);
586         }
587         else if ( len==BCF_VL_R && type!=BCF_HT_STR )
588         {
589             int iori = buf->split.atoms[iout]->ial;
590             assert( iori<=nval );
591             for (i=0; i<nsmpl; i++)
592             {
593                 void *src = buf->tmp  + nval1*num_size*i;
594                 void *dst = buf->tmp2 + num_size*i*(star_allele+2);
595                 memcpy(dst,src,num_size);
596                 memcpy(dst+num_size,src+iori*num_size,num_size);
597                 if ( type==BCF_HT_INT && mode==M_SUM )
598                 {
599                     uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
600                     for (j=iori; j<buf->split.nori; j++)
601                         if ( tbl[j]==1 ) ((int32_t*)dst)[1] += ((int32_t*)src)[j+1];
602                 }
603                 if ( star_allele )
604                     memcpy(dst+num_size*2,missing_ptr,num_size);
605             }
606             ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type);
607         }
608         else if ( len==BCF_VL_G && type!=BCF_HT_STR )
609         {
610             int iori = buf->split.atoms[iout]->ial;
611             int i01  = bcf_alleles2gt(0,iori);
612             int i11  = bcf_alleles2gt(iori,iori);
613             assert( iori<nval );
614             #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end) { \
615                 for (i=0; i<nsmpl; i++) \
616                 { \
617                     type_t *src = (type_t*)buf->tmp + i*nval1; \
618                     type_t *dst = (type_t*)buf->tmp2 + i*3*(1+star_allele); \
619                     int n=0; /* determine ploidy of this genotype */ \
620                     while ( n<nval1 && !(is_vector_end) ) { n++; src++; } \
621                     src = (type_t*)buf->tmp + i*nval1; \
622                     memcpy(dst++,src,sizeof(type)); \
623                     int nmiss = 0, nend = 0; \
624                     if ( n==rec->n_allele ) /* haploid */ \
625                     { \
626                         memcpy(dst++,src+iori,sizeof(type)); \
627                         if ( star_allele ) { nmiss = 1; nend = 3; } \
628                         else nend = 1; \
629                     } \
630                     else if ( n==nval1 ) \
631                     { \
632                         memcpy(dst++,src+i01,sizeof(type)); \
633                         memcpy(dst++,src+i11,sizeof(type)); \
634                         if ( star_allele ) nmiss = 3; \
635                     } \
636                     else if ( n==1 && is_missing ) \
637                     { \
638                         if ( star_allele ) nend = 5; \
639                         else nend = 2; \
640                     } \
641                     else  \
642                         error("Incorrect number of values at %s:%"PRIhts_pos" .. tag=FORMAT/%s Number=G nAlleles=%d nValues=%d, %d-th sample\n", \
643                                 bcf_seqname(buf->hdr,rec),rec->pos+1,tag,rec->n_allele,n,i+1); \
644                     for (j=0; j<nmiss; j++) { set_missing; dst++; } \
645                     for (j=0; j<nend; j++) { set_vector_end; dst++; } \
646                 } \
647             }
648             switch (type)
649             {
650                 case BCF_HT_INT:  BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *dst=bcf_int32_missing, *dst=bcf_int32_vector_end); break;
651                 case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*dst), bcf_float_set_vector_end(*dst)); break;
652                 default: error("Unexpected case: %d\n", type);
653             }
654             #undef BRANCH
655             ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
656         }
657         if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
658     }
659 }
_is_acgtn(char * seq)660 static inline int _is_acgtn(char *seq)
661 {
662     while ( *seq )
663     {
664         char c = toupper(*seq);
665         if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 0;
666         seq++;
667     }
668     return 1;
669 }
670 /*
671     The atomization works as follows:
672     - Atomize each alternate allele separately by leaving out sequence identical to the reference. No
673       alignment is performed, just greedy trimming of the end, then from left. This operation returns
674       a list of atoms (atom_t) which carry fragments of REF,ALT and their positions as 0-based offsets
675       to the original REF allele
676     - Sort atoms by POS, REF and ALT. Each unique atom (POS+REF+ALT) forms a new VCF record, each
677       with a single ALT.
678     - For each new VCF record determine how to translate the original allele index (iori) to this new
679       record:
680         - 1: the original allele matches the atom
681         - 0: the original allele does not overlap this atom or the overlapping part matches the REF
682              allele
683         - 2 (or equivalently "."): there is a mismatch between the original allele and the atom
684       The mapping is encoded in a table with columns corresponding to the original ALTs and rows
685       to the new POS+ALTs (atoms). The table is initialized to 0, then we set 1's for matching
686       atoms and 2's for overlapping mismatching atoms.
687 
688     Note that different ALT alleles can result in the same atom (the same output line) and this code
689     does not know how to reconcile possibly conflicting VCF annotations. This could be improved
690     and merge logic provided, similarly to `merge -l`. For example, the allelic depths (AD) should
691     be summed for the same atomized output allele. However, this level of complexity is not addressed
692     in this initial draft. Higher priority for now is to provide the inverse "join" operation.
693 
694     Update 2021-04-09:
695         Tags QS,AD are now automatically incremented as they should be, for both INFO and FORMAT.
696         Note that the code will fail on missing values (todo) and it needs to be generalized and
697         made customizable.
698 */
_abuf_split(abuf_t * buf,bcf1_t * rec)699 void _abuf_split(abuf_t *buf, bcf1_t *rec)
700 {
701     int i,j;
702     if ( rec->n_allele < 2 )
703     {
704         rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
705         int j = rbuf_append(&buf->rbuf);
706         if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]);
707         buf->vcf[j] = bcf_dup(rec);
708         return;
709     }
710     for (i=1; i<rec->n_allele; i++)
711     {
712         if ( _is_acgtn(rec->d.allele[i]) ) continue;
713         rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
714         int j = rbuf_append(&buf->rbuf);
715         if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]);
716         buf->vcf[j] = bcf_dup(rec);
717         return;
718     }
719 
720     buf->natoms = 0;
721     for (i=1; i<rec->n_allele; i++) _atomize_allele(buf,rec,i);
722     qsort(buf->atoms,buf->natoms,sizeof(*buf->atoms),_cmp_atoms);
723     _split_table_init(buf,rec,buf->natoms);
724     for (i=0; i<buf->natoms; i++)
725     {
726         if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue;
727         _split_table_new(buf, &buf->atoms[i]);  // add a new unique output atom
728     }
729     for (i=0; i<buf->natoms; i++)
730     {
731         // Looping over sorted list of all atoms with possible duplicates from different source ALT alleles
732         atom_t *atom = &buf->atoms[i];
733         for (j=0; j<buf->split.nout; j++)
734         {
735             atom_t *out = buf->split.atoms[j];
736             if ( atom == out ) continue;            // table already set to 1
737             if ( atom->beg > out->end ) continue;   // cannot overlap this output atom
738             if ( atom->end < out->beg ) break;      // this atom is ahead of all subsequent output records
739             _split_table_overlap(buf, j, atom);
740         }
741     }
742     assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode
743 
744     // Create the output records, transferring all annotations:
745     // CHROM-QUAL
746     _split_table_set_chrom_qual(buf);
747 
748     // INFO
749     for (i=0; i<rec->n_info; i++)
750     {
751         // this implementation of merging rules is temporary: generalize and made customizable through the API
752         merge_rule_t mode = M_FIRST;
753         const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.info[i].key);
754         if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM;
755 
756         _split_table_set_info(buf, &rec->d.info[i], mode);
757     }
758 
759     // Set INFO tag showing the original record
760     if ( buf->split.info_tag )
761         _split_table_set_history(buf);
762 
763     // FORMAT
764     for (i=0; i<rec->n_fmt; i++)
765     {
766         // this implementation of merging rules is temporary: generalize and made customizable through the API
767         merge_rule_t mode = M_FIRST;
768         const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.fmt[i].id);
769         if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM;
770 
771         _split_table_set_format(buf, &rec->d.fmt[i], mode);
772     }
773 
774     // Check that at least one FORMAT field was added, if not, the number of samples must be set manually
775     for (i=0; i<buf->split.nout; i++)
776     {
777         bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)];
778         if ( !out->n_sample ) out->n_sample = rec->n_sample;
779     }
780 }
781 
abuf_push(abuf_t * buf,bcf1_t * rec)782 void abuf_push(abuf_t *buf, bcf1_t *rec)
783 {
784     bcf_unpack(rec, BCF_UN_ALL);
785     if ( buf->mode==SPLIT ) _abuf_split(buf,rec);
786 }
787 
abuf_flush(abuf_t * buf,int flush_all)788 bcf1_t *abuf_flush(abuf_t *buf, int flush_all)
789 {
790     int i;
791 
792     if ( buf->rbuf.n==0 ) return NULL;
793     if ( flush_all ) goto ret;
794 
795 ret:
796     i = rbuf_shift(&buf->rbuf);
797     return buf->vcf[i];
798 }
799 
800