1 /*  plugins/setGT.c -- set gentoypes to given values
2 
3     Copyright (C) 2015-2021 Genome Research Ltd.
4 
5     Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE.  */
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <htslib/vcf.h>
28 #include <htslib/vcfutils.h>
29 #include <htslib/kfunc.h>
30 #include <inttypes.h>
31 #include <getopt.h>
32 #include <ctype.h>
33 #include "bcftools.h"
34 #include "filter.h"
35 
36 // Logic of the filters: include or exclude sites which match the filters?
37 #define FLT_INCLUDE 1
38 #define FLT_EXCLUDE 2
39 
40 typedef int (*cmp_f)(double a, double b);
41 
cmp_eq(double a,double b)42 static int cmp_eq(double a, double b) { return a==b ? 1 : 0; }
cmp_le(double a,double b)43 static int cmp_le(double a, double b) { return a<=b ? 1 : 0; }
cmp_ge(double a,double b)44 static int cmp_ge(double a, double b) { return a>=b ? 1 : 0; }
cmp_lt(double a,double b)45 static int cmp_lt(double a, double b) { return a<b ? 1 : 0; }
cmp_gt(double a,double b)46 static int cmp_gt(double a, double b) { return a>b ? 1 : 0; }
47 
48 typedef struct
49 {
50     bcf_hdr_t *in_hdr, *out_hdr;
51     int32_t *gts, mgts, *iarr, miarr;
52     int *arr, marr;
53     uint64_t nchanged;
54     int tgt_mask, new_mask, new_gt;
55     filter_t *filter;
56     char *filter_str;
57     struct {
58         int m_allele, M_allele, *gt, *phased, ploidy;
59         char *gt_str;
60     } custom;
61     int filter_logic;
62     uint8_t *smpl_pass;
63     double binom_val;
64     char *binom_tag;
65     cmp_f binom_cmp;
66 }
67 args_t;
68 
69 args_t *args = NULL;
70 
71 #define GT_MISSING   1
72 #define GT_PARTIAL  (1<<1)
73 #define GT_REF      (1<<2)
74 #define GT_MAJOR    (1<<3)
75 #define GT_PHASED   (1<<4)
76 #define GT_UNPHASED (1<<5)
77 #define GT_ALL      (1<<6)
78 #define GT_QUERY    (1<<7)
79 #define GT_BINOM    (1<<8)
80 #define GT_MINOR    (1<<9)
81 #define GT_CUSTOM   (1<<10)
82 
83 #define MINOR_ALLELE -1
84 #define MAJOR_ALLELE -2
85 
about(void)86 const char *about(void)
87 {
88     return "Set genotypes: partially missing to missing, missing to ref/major allele, etc.\n";
89 }
90 
usage(void)91 const char *usage(void)
92 {
93     return
94         "About: Sets genotypes. The target genotypes can be specified as:\n"
95         "           ./.  .. completely missing (\".\" or \"./.\", depending on ploidy)\n"
96         "           ./x  .. partially missing (e.g., \"./0\" or \".|1\" but not \"./.\")\n"
97         "           .    .. partially or completely missing\n"
98         "           a    .. all genotypes\n"
99         "           b    .. heterozygous genotypes failing two-tailed binomial test (example below)\n"
100         "           q    .. select genotypes using -i/-e options\n"
101         "       and the new genotype can be one of:\n"
102         "           .    .. missing (\".\" or \"./.\", keeps ploidy)\n"
103         "           0    .. reference allele (e.g. 0/0 or 0, keeps ploidy)\n"
104         "           c:GT .. custom genotype (e.g. 0/0, 0, 0/1, m/M, overrides ploidy)\n"
105         "           m    .. minor (the second most common) allele (e.g. 1/1 or 1, keeps ploidy)\n"
106         "           M    .. major allele (e.g. 1/1 or 1, keeps ploidy)\n"
107         "           p    .. phase genotype (0/1 becomes 0|1)\n"
108         "           u    .. unphase genotype and sort by allele (1|0 becomes 0/1)\n"
109         "Usage: bcftools +setGT [General Options] -- [Plugin Options]\n"
110         "Options:\n"
111         "   run \"bcftools plugin\" for a list of common options\n"
112         "\n"
113         "Plugin options:\n"
114         "   -e, --exclude <expr>        Exclude a genotype if true (requires -t q)\n"
115         "   -i, --include <expr>        include a genotype if true (requires -t q)\n"
116         "   -n, --new-gt <type>         Genotypes to set, see above\n"
117         "   -t, --target-gt <type>      Genotypes to change, see above\n"
118         "\n"
119         "Example:\n"
120         "   # set missing genotypes (\"./.\") to phased ref genotypes (\"0|0\")\n"
121         "   bcftools +setGT in.vcf -- -t . -n 0p\n"
122         "\n"
123         "   # set missing genotypes with DP>0 and GQ>20 to ref genotypes (\"0/0\")\n"
124         "   bcftools +setGT in.vcf -- -t q -n 0 -i 'GT=\".\" && FMT/DP>0 && GQ>20'\n"
125         "\n"
126         "   # set partially missing genotypes to completely missing\n"
127         "   bcftools +setGT in.vcf -- -t ./x -n .\n"
128         "\n"
129         "   # set heterozygous genotypes to 0/0 if binom.test(nAlt,nRef+nAlt,0.5)<1e-3\n"
130         "   bcftools +setGT in.vcf -- -t \"b:AD<1e-3\" -n 0\n"
131         "\n"
132         "   # force unphased heterozygous genotype if binom.test(nAlt,nRef+nAlt,0.5)>0.1\n"
133         "   bcftools +setGT in.vcf -- -t ./x -n c:'m/M'\n"
134         "\n";
135 }
136 
_parse_binom_expr_error(char * str)137 static void _parse_binom_expr_error(char *str)
138 {
139     error(
140             "Error parsing the expression: %s\n"
141             "Expected TAG CMP VAL, where\n"
142             "   TAG .. one of the format tags\n"
143             "   CMP .. operator, one of <, <=, >, >=\n"
144             "   VAL .. value\n"
145             "For example:\n"
146             "   bcftools +setGT in.vcf -- -t \"b:AD>1e-3\" -n 0\n"
147             "\n", str
148          );
149 }
parse_binom_expr(args_t * args,char * str)150 void parse_binom_expr(args_t *args, char *str)
151 {
152     if ( str[1]!=':' ) _parse_binom_expr_error(str);
153 
154     char *beg = str+2;
155     while ( *beg && isspace(*beg) ) beg++;
156     if ( !*beg ) _parse_binom_expr_error(str);
157     char *end = beg;
158     while ( *end )
159     {
160         if ( isspace(*end) || *end=='<' || *end=='=' || *end=='>' ) break;
161         end++;
162     }
163     if ( !*end ) _parse_binom_expr_error(str);
164     args->binom_tag = (char*) calloc(1,end-beg+1);
165     memcpy(args->binom_tag,beg,end-beg);
166     int tag_id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,args->binom_tag);
167     if ( !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,tag_id) ) error("The FORMAT tag \"%s\" is not present in the VCF\n", args->binom_tag);
168 
169     while ( *end && isspace(*end) ) end++;
170     if ( !*end ) _parse_binom_expr_error(str);
171 
172     if ( !strncmp(end,"<=",2) ) { args->binom_cmp = cmp_le; beg = end+2; }
173     else if ( !strncmp(end,">=",2) ) { args->binom_cmp = cmp_ge; beg = end+2; }
174     else if ( !strncmp(end,"==",2) ) { args->binom_cmp = cmp_eq; beg = end+2; }
175     else if ( !strncmp(end,"<",1) ) { args->binom_cmp = cmp_lt; beg = end+1; }
176     else if ( !strncmp(end,">",1) ) { args->binom_cmp = cmp_gt; beg = end+1; }
177     else if ( !strncmp(end,"=",1) ) { args->binom_cmp = cmp_eq; beg = end+1; }
178     else _parse_binom_expr_error(str);
179 
180     while ( *beg && isspace(*beg) ) beg++;
181     if ( !*beg ) _parse_binom_expr_error(str);
182 
183     args->binom_val = strtod(beg, &end);
184     while ( *end && isspace(*end) ) end++;
185     if ( *end ) _parse_binom_expr_error(str);
186 
187     args->tgt_mask |= GT_BINOM;
188     return;
189 }
190 
init(int argc,char ** argv,bcf_hdr_t * in,bcf_hdr_t * out)191 int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
192 {
193     args = (args_t*) calloc(1,sizeof(args_t));
194     args->in_hdr  = in;
195     args->out_hdr = out;
196 
197     int c;
198     static struct option loptions[] =
199     {
200         {"include",required_argument,NULL,'i'},
201         {"exclude",required_argument,NULL,'e'},
202         {"new-gt",required_argument,NULL,'n'},
203         {"target-gt",required_argument,NULL,'t'},
204         {NULL,0,NULL,0}
205     };
206     while ((c = getopt_long(argc, argv, "?hn:t:i:e:",loptions,NULL)) >= 0)
207     {
208         switch (c)
209         {
210             case 'e':
211                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
212                 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
213             case 'i':
214                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
215                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
216             case 'n': args->new_mask = 0;
217                 if ( strchr(optarg,'.') ) args->new_mask |= GT_MISSING;
218                 if ( strchr(optarg,'0') ) args->new_mask |= GT_REF;
219                 if ( strchr(optarg,'m') ) args->new_mask |= GT_MINOR;
220                 if ( strchr(optarg,'M') ) args->new_mask |= GT_MAJOR;
221                 if ( strchr(optarg,'p') ) args->new_mask |= GT_PHASED;
222                 if ( strchr(optarg,'u') ) args->new_mask |= GT_UNPHASED;
223                 if ( !strncmp(optarg,"c:",2) ) { args->new_mask |= GT_CUSTOM; args->custom.gt_str = optarg; }
224                 if ( args->new_mask==0 ) error("Unknown parameter to --new-gt: %s\n", optarg);
225                 break;
226             case 't':
227                 if ( !strcmp(optarg,".") ) args->tgt_mask |= GT_MISSING|GT_PARTIAL;
228                 if ( !strcmp(optarg,"./x") ) args->tgt_mask |= GT_PARTIAL;
229                 if ( !strcmp(optarg,"./.") ) args->tgt_mask |= GT_MISSING;
230                 if ( !strcmp(optarg,"a") ) args->tgt_mask |= GT_ALL;
231                 if ( !strcmp(optarg,"q") ) args->tgt_mask |= GT_QUERY;
232                 if ( !strcmp(optarg,"?") ) args->tgt_mask |= GT_QUERY;        // for backward compatibility
233                 if ( strchr(optarg,'b') ) parse_binom_expr(args, strchr(optarg,'b'));
234                 if ( args->tgt_mask==0 ) error("Unknown parameter to --target-gt: %s\n", optarg);
235                 break;
236             case 'h':
237             case '?':
238             default: fprintf(stderr,"%s", usage()); exit(1); break;
239         }
240     }
241 
242     if ( !args->new_mask ) error("Expected -n option\n");
243     if ( !args->tgt_mask ) error("Expected -t option\n");
244 
245     if ( args->new_mask & GT_MISSING ) args->new_gt = bcf_gt_missing;
246     if ( args->new_mask & GT_REF ) args->new_gt = args->new_mask&GT_PHASED ? bcf_gt_phased(0) : bcf_gt_unphased(0);
247     if ( args->new_mask & GT_CUSTOM )
248     {
249         char *ptr = args->custom.gt_str + 2;
250         while ( *ptr )
251         {
252             int allele;
253             if ( *ptr=='m' ) { allele = MINOR_ALLELE; args->new_mask |= GT_MINOR; }
254             else if ( *ptr=='M' ) { allele = MAJOR_ALLELE; args->new_mask |= GT_MAJOR; }
255             else if ( *ptr=='/' || *ptr=='|' )
256             {
257                 if ( !args->custom.ploidy ) error("Could not parse the genotype: %s\n",args->custom.gt_str);
258                 args->custom.phased[args->custom.ploidy-1] = *ptr=='|' ? 1 : 0;
259                 ptr++;
260                 continue;
261             }
262             else
263             {
264                 char *end;
265                 allele = strtol(ptr,&end,10);
266                 if ( end==ptr || (*end && *end!='/' && *end!='|') ) error("Could not parse the genotype: %s\n",args->custom.gt_str);
267                 ptr = end - 1;
268             }
269             args->custom.ploidy++;
270             args->custom.gt = realloc(args->custom.gt,sizeof(*args->custom.gt)*args->custom.ploidy);
271             args->custom.phased = realloc(args->custom.phased,sizeof(*args->custom.phased)*args->custom.ploidy);
272             args->custom.gt[args->custom.ploidy-1] = allele;
273             args->custom.phased[args->custom.ploidy-1] = 0;
274             ptr++;
275         }
276         // the phasing sign comes before the allele, move to the correct one
277         int i;
278         for (i=args->custom.ploidy-1; i>0; i--)
279             args->custom.phased[i] = args->custom.phased[i-1];
280     }
281 
282     if ( args->filter_str  && !(args->tgt_mask&GT_QUERY) ) error("Expected -tq with -i/-e\n");
283     if ( !args->filter_str && args->tgt_mask&GT_QUERY ) error("Expected -i/-e with -tq\n");
284     if ( args->filter_str ) args->filter = filter_init(in,args->filter_str);
285 
286     // Check the existence of FORMAT/GT tag, add it if not present
287     int id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,"GT");
288     if ( !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) )
289         bcf_hdr_printf(args->out_hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
290 
291     return 0;
292 }
293 
phase_gt(int32_t * ptr,int ngts)294 static inline int phase_gt(int32_t *ptr, int ngts)
295 {
296     int j, changed = 0;
297     for (j=0; j<ngts; j++)
298     {
299         if ( ptr[j]==bcf_int32_vector_end ) break;
300         if ( bcf_gt_is_phased(ptr[j]) ) continue;
301         ptr[j] = bcf_gt_phased(bcf_gt_allele(ptr[j]));    // add phasing; this may need a fix, I think the flag should be set for one allele only?
302         changed++;
303     }
304     return changed;
305 }
306 
unphase_gt(int32_t * ptr,int ngts)307 static inline int unphase_gt(int32_t *ptr, int ngts)
308 {
309     int j, changed = 0;
310     for (j=0; j<ngts; j++)
311     {
312         if ( ptr[j]==bcf_int32_vector_end ) break;
313         if ( !bcf_gt_is_phased(ptr[j]) ) continue;
314         ptr[j] = bcf_gt_unphased(bcf_gt_allele(ptr[j]));    // remove phasing
315         changed++;
316     }
317 
318     // insertion sort
319     int k, l;
320     for (k=1; k<j; k++)
321     {
322         int32_t x = ptr[k];
323         l = k;
324         while ( l>0 && ptr[l-1]>x )
325         {
326             ptr[l] = ptr[l-1];
327             l--;
328         }
329         ptr[l] = x;
330     }
331     return changed;
332 }
set_gt(int32_t * ptr,int ngts,int gt)333 static inline int set_gt(int32_t *ptr, int ngts, int gt)
334 {
335     int j, changed = 0;
336     for (j=0; j<ngts; j++)
337     {
338         if ( ptr[j]==bcf_int32_vector_end ) break;
339         if ( ptr[j] != gt ) changed++;
340         ptr[j] = gt;
341     }
342     return changed;
343 }
set_gt_custom(args_t * args,int32_t * ptr,int ngts,int nals)344 static inline int set_gt_custom(args_t *args, int32_t *ptr, int ngts, int nals)
345 {
346     int i, changed = 0, new_allele = 0;
347     for (i=0; i<args->custom.ploidy; i++)
348     {
349         if ( args->custom.gt[i]==MINOR_ALLELE ) new_allele = args->custom.m_allele;
350         else if ( args->custom.gt[i]==MAJOR_ALLELE ) new_allele = args->custom.M_allele;
351         else new_allele = args->custom.gt[i];
352         if ( new_allele >= nals ) continue;
353         new_allele = args->custom.phased[i] ? bcf_gt_phased(new_allele) : bcf_gt_unphased(new_allele);
354         if ( ptr[i]==new_allele ) continue;
355         ptr[i] = new_allele;
356         changed = 1;
357     }
358     while ( i<ngts )
359         ptr[i++] = bcf_int32_vector_end;
360     return changed;
361 }
362 
calc_binom(int na,int nb)363 static inline double calc_binom(int na, int nb)
364 {
365     if ( na + nb == 0 ) return 1;
366 
367     /*
368         kfunc.h implements kf_betai, which is the regularized beta function I_x(a,b) = P(X<=a/(a+b))
369     */
370     double prob = na > nb ? 2*kf_betai(na, nb + 1, 0.5) : 2*kf_betai(nb, na + 1, 0.5);
371     if ( prob > 1 ) prob = 1;
372 
373     return prob;
374 }
375 
process(bcf1_t * rec)376 bcf1_t *process(bcf1_t *rec)
377 {
378     if ( !rec->n_sample ) return rec;
379 
380     int ngts = bcf_get_genotypes(args->in_hdr, rec, &args->gts, &args->mgts);
381     ngts /= rec->n_sample;
382     int i, j, changed = 0;
383 
384     if ( args->new_mask & GT_CUSTOM )
385     {
386         if ( args->custom.ploidy > ngts ) // increased ploidy, expand the array
387         {
388             if ( args->mgts < args->custom.ploidy * rec->n_sample )
389             {
390                 args->mgts = args->custom.ploidy * rec->n_sample;
391                 args->gts  = (int32_t*)realloc(args->gts,args->mgts*sizeof(*args->gts));
392             }
393             for (i=0; i<rec->n_sample; i++)
394             {
395                 int32_t *src = args->gts + (rec->n_sample-i-1)*ngts;
396                 int32_t *dst = args->gts + (rec->n_sample-i-1)*args->custom.ploidy;
397                 for (j=0; j<ngts; j++) dst[j] = src[j];
398                 for (; j<args->custom.ploidy; j++) dst[j] = bcf_int32_vector_end;
399             }
400             ngts = args->custom.ploidy;
401         }
402     }
403 
404     int nbinom = 0;
405     if ( args->tgt_mask & GT_BINOM )
406     {
407         nbinom = bcf_get_format_int32(args->in_hdr, rec, args->binom_tag, &args->iarr, &args->miarr);
408         if ( nbinom<0 ) nbinom = 0;
409         nbinom /= rec->n_sample;
410     }
411 
412     // Calculating allele frequency for each allele and determining major allele
413     // only do this if use_major is true
414     if ( args->new_mask & GT_MAJOR )
415     {
416         int maxAC = -1, majorAllele = -1;
417         hts_expand(int,rec->n_allele,args->marr,args->arr);
418         int ret = bcf_calc_ac(args->in_hdr,rec,args->arr,BCF_UN_FMT);
419         if ( ret<= 0 )
420             error("Could not calculate allele count at %s:%"PRId64"\n", bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1);
421 
422         for (i=0; i < rec->n_allele; ++i)
423         {
424             if (args->arr[i] > maxAC)
425             {
426                 maxAC = args->arr[i];
427                 majorAllele = i;
428             }
429         }
430 
431         // replacing new_gt by major allele
432         args->new_gt = args->new_mask & GT_PHASED ?  bcf_gt_phased(majorAllele) : bcf_gt_unphased(majorAllele);
433         if ( args->new_mask & GT_CUSTOM ) args->custom.M_allele = majorAllele;
434     }
435     if ( args->new_mask & GT_MINOR )
436     {
437         hts_expand(int,rec->n_allele,args->marr,args->arr);
438         int ret = bcf_calc_ac(args->in_hdr,rec,args->arr,BCF_UN_FMT);
439         if ( ret<= 0 )
440             error("Could not calculate allele count at %s:%"PRId64"\n", bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1);
441 
442         int imax = 0;
443         for (i=1; i<rec->n_allele; i++)
444             if ( args->arr[imax] < args->arr[i] ) imax = i;
445         int imax2 = imax>0 ? 0 : (rec->n_allele>1 ? 1 : 0);
446         for (i=0; i<rec->n_allele; i++)
447             if ( i!=imax && args->arr[imax2] < args->arr[i] ) imax2 = i;
448 
449         // replacing new_gt by major allele
450         args->new_gt = args->new_mask & GT_PHASED ?  bcf_gt_phased(imax2) : bcf_gt_unphased(imax2);
451         if ( args->new_mask & GT_CUSTOM ) args->custom.m_allele = imax2;
452     }
453 
454     // replace gts
455     if ( nbinom && ngts>=2 )    // only diploid genotypes are considered: higher ploidy ignored further, haploid here
456     {
457         if ( args->filter ) filter_test(args->filter,rec,(const uint8_t **)&args->smpl_pass);
458         for (i=0; i<rec->n_sample; i++)
459         {
460             if ( args->smpl_pass )
461             {
462                 if ( !args->smpl_pass[i] && args->filter_logic==FLT_INCLUDE ) continue;
463                 if (  args->smpl_pass[i] && args->filter_logic==FLT_EXCLUDE ) continue;
464             }
465             int32_t *ptr = args->gts + i*ngts;
466             if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end ) continue;
467             if ( ptr[0]==ptr[1] ) continue; // a hom
468             int ia = bcf_gt_allele(ptr[0]);
469             int ib = bcf_gt_allele(ptr[1]);
470             if ( ia>=nbinom || ib>=nbinom )
471                 error("The sample %s has incorrect number of %s fields at %s:%"PRId64"\n",
472                         args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1);
473 
474             double prob = calc_binom(args->iarr[i*nbinom+ia],args->iarr[i*nbinom+ib]);
475             if ( !args->binom_cmp(prob,args->binom_val) ) continue;
476 
477             if ( args->new_mask&GT_UNPHASED )
478                 changed += unphase_gt(ptr, ngts);
479             else if ( args->new_mask==GT_PHASED )
480                 changed += phase_gt(ptr, ngts);
481             else if ( args->new_mask & GT_CUSTOM )
482                 changed += set_gt_custom(args, ptr, ngts, rec->n_allele);
483             else
484                 changed += set_gt(ptr, ngts, args->new_gt);
485         }
486     }
487     else if ( args->tgt_mask&GT_QUERY )
488     {
489         int pass_site = filter_test(args->filter,rec,(const uint8_t **)&args->smpl_pass);
490         if ( pass_site && args->filter_logic==FLT_EXCLUDE )
491         {
492             // -i can include a site but exclude a sample, -e exclude a site but include a sample
493             if ( pass_site )
494             {
495                 if ( !args->smpl_pass ) return rec;
496                 pass_site = 0;
497                 for (i=0; i<rec->n_sample; i++)
498                 {
499                     if ( args->smpl_pass[i] ) args->smpl_pass[i] = 0;
500                     else { args->smpl_pass[i] = 1; pass_site = 1; }
501                 }
502                 if ( !pass_site ) return rec;
503             }
504             else if ( args->smpl_pass )
505                 for (i=0; i<rec->n_sample; i++) args->smpl_pass[i] = 1;
506         }
507         else if ( !pass_site ) return rec;
508 
509         for (i=0; i<rec->n_sample; i++)
510         {
511             if ( args->smpl_pass && !args->smpl_pass[i] ) continue;
512             if ( args->new_mask&GT_UNPHASED )
513                 changed += unphase_gt(args->gts + i*ngts, ngts);
514             else if ( args->new_mask==GT_PHASED )
515                 changed += phase_gt(args->gts + i*ngts, ngts);
516             else if ( args->new_mask & GT_CUSTOM )
517                 changed += set_gt_custom(args, args->gts + i*ngts, ngts, rec->n_allele);
518             else
519                 changed += set_gt(args->gts + i*ngts, ngts, args->new_gt);
520         }
521     }
522     else
523     {
524         for (i=0; i<rec->n_sample; i++)
525         {
526             int ploidy = 0, nmiss = 0;
527             int32_t *ptr = args->gts + i*ngts;
528             for (j=0; j<ngts; j++)
529             {
530                 if ( ptr[j]==bcf_int32_vector_end ) break;
531                 ploidy++;
532                 if ( ptr[j]==bcf_gt_missing ) nmiss++;
533             }
534 
535             int do_set = 0;
536             if ( args->tgt_mask&GT_ALL ) do_set = 1;
537             else if ( args->tgt_mask&GT_PARTIAL && nmiss ) do_set = 1;
538             else if ( args->tgt_mask&GT_MISSING && ploidy==nmiss ) do_set = 1;
539 
540             if ( !do_set ) continue;
541 
542             if ( args->new_mask&GT_UNPHASED )
543                 changed += unphase_gt(ptr, ngts);
544             else if ( args->new_mask==GT_PHASED )
545                 changed += phase_gt(ptr, ngts);
546             else if ( args->new_mask & GT_CUSTOM )
547                 changed += set_gt_custom(args, args->gts + i*ngts, ngts, rec->n_allele);
548             else
549                 changed += set_gt(ptr, ngts, args->new_gt);
550         }
551     }
552     args->nchanged += changed;
553     if ( changed )
554     {
555         int ret = bcf_update_genotypes(args->out_hdr, rec, args->gts, ngts*rec->n_sample);
556         if ( ret!=0 ) error("Error: failed to update genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(args->in_hdr,rec),rec->pos+1);
557     }
558     return rec;
559 }
560 
destroy(void)561 void destroy(void)
562 {
563     fprintf(stderr,"Filled %"PRId64" alleles\n", args->nchanged);
564     free(args->custom.gt);
565     free(args->custom.phased);
566     free(args->binom_tag);
567     if ( args->filter ) filter_destroy(args->filter);
568     free(args->arr);
569     free(args->iarr);
570     free(args->gts);
571     free(args);
572 }
573 
574 
575