1 /* plugins/setGT.c -- set gentoypes to given values
2
3 Copyright (C) 2015-2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <htslib/vcf.h>
28 #include <htslib/vcfutils.h>
29 #include <htslib/kfunc.h>
30 #include <inttypes.h>
31 #include <getopt.h>
32 #include <ctype.h>
33 #include "bcftools.h"
34 #include "filter.h"
35
36 // Logic of the filters: include or exclude sites which match the filters?
37 #define FLT_INCLUDE 1
38 #define FLT_EXCLUDE 2
39
40 typedef int (*cmp_f)(double a, double b);
41
cmp_eq(double a,double b)42 static int cmp_eq(double a, double b) { return a==b ? 1 : 0; }
cmp_le(double a,double b)43 static int cmp_le(double a, double b) { return a<=b ? 1 : 0; }
cmp_ge(double a,double b)44 static int cmp_ge(double a, double b) { return a>=b ? 1 : 0; }
cmp_lt(double a,double b)45 static int cmp_lt(double a, double b) { return a<b ? 1 : 0; }
cmp_gt(double a,double b)46 static int cmp_gt(double a, double b) { return a>b ? 1 : 0; }
47
48 typedef struct
49 {
50 bcf_hdr_t *in_hdr, *out_hdr;
51 int32_t *gts, mgts, *iarr, miarr;
52 int *arr, marr;
53 uint64_t nchanged;
54 int tgt_mask, new_mask, new_gt;
55 filter_t *filter;
56 char *filter_str;
57 struct {
58 int m_allele, M_allele, *gt, *phased, ploidy;
59 char *gt_str;
60 } custom;
61 int filter_logic;
62 uint8_t *smpl_pass;
63 double binom_val;
64 char *binom_tag;
65 cmp_f binom_cmp;
66 }
67 args_t;
68
69 args_t *args = NULL;
70
71 #define GT_MISSING 1
72 #define GT_PARTIAL (1<<1)
73 #define GT_REF (1<<2)
74 #define GT_MAJOR (1<<3)
75 #define GT_PHASED (1<<4)
76 #define GT_UNPHASED (1<<5)
77 #define GT_ALL (1<<6)
78 #define GT_QUERY (1<<7)
79 #define GT_BINOM (1<<8)
80 #define GT_MINOR (1<<9)
81 #define GT_CUSTOM (1<<10)
82
83 #define MINOR_ALLELE -1
84 #define MAJOR_ALLELE -2
85
about(void)86 const char *about(void)
87 {
88 return "Set genotypes: partially missing to missing, missing to ref/major allele, etc.\n";
89 }
90
usage(void)91 const char *usage(void)
92 {
93 return
94 "About: Sets genotypes. The target genotypes can be specified as:\n"
95 " ./. .. completely missing (\".\" or \"./.\", depending on ploidy)\n"
96 " ./x .. partially missing (e.g., \"./0\" or \".|1\" but not \"./.\")\n"
97 " . .. partially or completely missing\n"
98 " a .. all genotypes\n"
99 " b .. heterozygous genotypes failing two-tailed binomial test (example below)\n"
100 " q .. select genotypes using -i/-e options\n"
101 " and the new genotype can be one of:\n"
102 " . .. missing (\".\" or \"./.\", keeps ploidy)\n"
103 " 0 .. reference allele (e.g. 0/0 or 0, keeps ploidy)\n"
104 " c:GT .. custom genotype (e.g. 0/0, 0, 0/1, m/M, overrides ploidy)\n"
105 " m .. minor (the second most common) allele (e.g. 1/1 or 1, keeps ploidy)\n"
106 " M .. major allele (e.g. 1/1 or 1, keeps ploidy)\n"
107 " p .. phase genotype (0/1 becomes 0|1)\n"
108 " u .. unphase genotype and sort by allele (1|0 becomes 0/1)\n"
109 "Usage: bcftools +setGT [General Options] -- [Plugin Options]\n"
110 "Options:\n"
111 " run \"bcftools plugin\" for a list of common options\n"
112 "\n"
113 "Plugin options:\n"
114 " -e, --exclude <expr> Exclude a genotype if true (requires -t q)\n"
115 " -i, --include <expr> include a genotype if true (requires -t q)\n"
116 " -n, --new-gt <type> Genotypes to set, see above\n"
117 " -t, --target-gt <type> Genotypes to change, see above\n"
118 "\n"
119 "Example:\n"
120 " # set missing genotypes (\"./.\") to phased ref genotypes (\"0|0\")\n"
121 " bcftools +setGT in.vcf -- -t . -n 0p\n"
122 "\n"
123 " # set missing genotypes with DP>0 and GQ>20 to ref genotypes (\"0/0\")\n"
124 " bcftools +setGT in.vcf -- -t q -n 0 -i 'GT=\".\" && FMT/DP>0 && GQ>20'\n"
125 "\n"
126 " # set partially missing genotypes to completely missing\n"
127 " bcftools +setGT in.vcf -- -t ./x -n .\n"
128 "\n"
129 " # set heterozygous genotypes to 0/0 if binom.test(nAlt,nRef+nAlt,0.5)<1e-3\n"
130 " bcftools +setGT in.vcf -- -t \"b:AD<1e-3\" -n 0\n"
131 "\n"
132 " # force unphased heterozygous genotype if binom.test(nAlt,nRef+nAlt,0.5)>0.1\n"
133 " bcftools +setGT in.vcf -- -t ./x -n c:'m/M'\n"
134 "\n";
135 }
136
_parse_binom_expr_error(char * str)137 static void _parse_binom_expr_error(char *str)
138 {
139 error(
140 "Error parsing the expression: %s\n"
141 "Expected TAG CMP VAL, where\n"
142 " TAG .. one of the format tags\n"
143 " CMP .. operator, one of <, <=, >, >=\n"
144 " VAL .. value\n"
145 "For example:\n"
146 " bcftools +setGT in.vcf -- -t \"b:AD>1e-3\" -n 0\n"
147 "\n", str
148 );
149 }
parse_binom_expr(args_t * args,char * str)150 void parse_binom_expr(args_t *args, char *str)
151 {
152 if ( str[1]!=':' ) _parse_binom_expr_error(str);
153
154 char *beg = str+2;
155 while ( *beg && isspace(*beg) ) beg++;
156 if ( !*beg ) _parse_binom_expr_error(str);
157 char *end = beg;
158 while ( *end )
159 {
160 if ( isspace(*end) || *end=='<' || *end=='=' || *end=='>' ) break;
161 end++;
162 }
163 if ( !*end ) _parse_binom_expr_error(str);
164 args->binom_tag = (char*) calloc(1,end-beg+1);
165 memcpy(args->binom_tag,beg,end-beg);
166 int tag_id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,args->binom_tag);
167 if ( !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,tag_id) ) error("The FORMAT tag \"%s\" is not present in the VCF\n", args->binom_tag);
168
169 while ( *end && isspace(*end) ) end++;
170 if ( !*end ) _parse_binom_expr_error(str);
171
172 if ( !strncmp(end,"<=",2) ) { args->binom_cmp = cmp_le; beg = end+2; }
173 else if ( !strncmp(end,">=",2) ) { args->binom_cmp = cmp_ge; beg = end+2; }
174 else if ( !strncmp(end,"==",2) ) { args->binom_cmp = cmp_eq; beg = end+2; }
175 else if ( !strncmp(end,"<",1) ) { args->binom_cmp = cmp_lt; beg = end+1; }
176 else if ( !strncmp(end,">",1) ) { args->binom_cmp = cmp_gt; beg = end+1; }
177 else if ( !strncmp(end,"=",1) ) { args->binom_cmp = cmp_eq; beg = end+1; }
178 else _parse_binom_expr_error(str);
179
180 while ( *beg && isspace(*beg) ) beg++;
181 if ( !*beg ) _parse_binom_expr_error(str);
182
183 args->binom_val = strtod(beg, &end);
184 while ( *end && isspace(*end) ) end++;
185 if ( *end ) _parse_binom_expr_error(str);
186
187 args->tgt_mask |= GT_BINOM;
188 return;
189 }
190
init(int argc,char ** argv,bcf_hdr_t * in,bcf_hdr_t * out)191 int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
192 {
193 args = (args_t*) calloc(1,sizeof(args_t));
194 args->in_hdr = in;
195 args->out_hdr = out;
196
197 int c;
198 static struct option loptions[] =
199 {
200 {"include",required_argument,NULL,'i'},
201 {"exclude",required_argument,NULL,'e'},
202 {"new-gt",required_argument,NULL,'n'},
203 {"target-gt",required_argument,NULL,'t'},
204 {NULL,0,NULL,0}
205 };
206 while ((c = getopt_long(argc, argv, "?hn:t:i:e:",loptions,NULL)) >= 0)
207 {
208 switch (c)
209 {
210 case 'e':
211 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
212 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
213 case 'i':
214 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
215 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
216 case 'n': args->new_mask = 0;
217 if ( strchr(optarg,'.') ) args->new_mask |= GT_MISSING;
218 if ( strchr(optarg,'0') ) args->new_mask |= GT_REF;
219 if ( strchr(optarg,'m') ) args->new_mask |= GT_MINOR;
220 if ( strchr(optarg,'M') ) args->new_mask |= GT_MAJOR;
221 if ( strchr(optarg,'p') ) args->new_mask |= GT_PHASED;
222 if ( strchr(optarg,'u') ) args->new_mask |= GT_UNPHASED;
223 if ( !strncmp(optarg,"c:",2) ) { args->new_mask |= GT_CUSTOM; args->custom.gt_str = optarg; }
224 if ( args->new_mask==0 ) error("Unknown parameter to --new-gt: %s\n", optarg);
225 break;
226 case 't':
227 if ( !strcmp(optarg,".") ) args->tgt_mask |= GT_MISSING|GT_PARTIAL;
228 if ( !strcmp(optarg,"./x") ) args->tgt_mask |= GT_PARTIAL;
229 if ( !strcmp(optarg,"./.") ) args->tgt_mask |= GT_MISSING;
230 if ( !strcmp(optarg,"a") ) args->tgt_mask |= GT_ALL;
231 if ( !strcmp(optarg,"q") ) args->tgt_mask |= GT_QUERY;
232 if ( !strcmp(optarg,"?") ) args->tgt_mask |= GT_QUERY; // for backward compatibility
233 if ( strchr(optarg,'b') ) parse_binom_expr(args, strchr(optarg,'b'));
234 if ( args->tgt_mask==0 ) error("Unknown parameter to --target-gt: %s\n", optarg);
235 break;
236 case 'h':
237 case '?':
238 default: fprintf(stderr,"%s", usage()); exit(1); break;
239 }
240 }
241
242 if ( !args->new_mask ) error("Expected -n option\n");
243 if ( !args->tgt_mask ) error("Expected -t option\n");
244
245 if ( args->new_mask & GT_MISSING ) args->new_gt = bcf_gt_missing;
246 if ( args->new_mask & GT_REF ) args->new_gt = args->new_mask>_PHASED ? bcf_gt_phased(0) : bcf_gt_unphased(0);
247 if ( args->new_mask & GT_CUSTOM )
248 {
249 char *ptr = args->custom.gt_str + 2;
250 while ( *ptr )
251 {
252 int allele;
253 if ( *ptr=='m' ) { allele = MINOR_ALLELE; args->new_mask |= GT_MINOR; }
254 else if ( *ptr=='M' ) { allele = MAJOR_ALLELE; args->new_mask |= GT_MAJOR; }
255 else if ( *ptr=='/' || *ptr=='|' )
256 {
257 if ( !args->custom.ploidy ) error("Could not parse the genotype: %s\n",args->custom.gt_str);
258 args->custom.phased[args->custom.ploidy-1] = *ptr=='|' ? 1 : 0;
259 ptr++;
260 continue;
261 }
262 else
263 {
264 char *end;
265 allele = strtol(ptr,&end,10);
266 if ( end==ptr || (*end && *end!='/' && *end!='|') ) error("Could not parse the genotype: %s\n",args->custom.gt_str);
267 ptr = end - 1;
268 }
269 args->custom.ploidy++;
270 args->custom.gt = realloc(args->custom.gt,sizeof(*args->custom.gt)*args->custom.ploidy);
271 args->custom.phased = realloc(args->custom.phased,sizeof(*args->custom.phased)*args->custom.ploidy);
272 args->custom.gt[args->custom.ploidy-1] = allele;
273 args->custom.phased[args->custom.ploidy-1] = 0;
274 ptr++;
275 }
276 // the phasing sign comes before the allele, move to the correct one
277 int i;
278 for (i=args->custom.ploidy-1; i>0; i--)
279 args->custom.phased[i] = args->custom.phased[i-1];
280 }
281
282 if ( args->filter_str && !(args->tgt_mask>_QUERY) ) error("Expected -tq with -i/-e\n");
283 if ( !args->filter_str && args->tgt_mask>_QUERY ) error("Expected -i/-e with -tq\n");
284 if ( args->filter_str ) args->filter = filter_init(in,args->filter_str);
285
286 // Check the existence of FORMAT/GT tag, add it if not present
287 int id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,"GT");
288 if ( !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) )
289 bcf_hdr_printf(args->out_hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
290
291 return 0;
292 }
293
phase_gt(int32_t * ptr,int ngts)294 static inline int phase_gt(int32_t *ptr, int ngts)
295 {
296 int j, changed = 0;
297 for (j=0; j<ngts; j++)
298 {
299 if ( ptr[j]==bcf_int32_vector_end ) break;
300 if ( bcf_gt_is_phased(ptr[j]) ) continue;
301 ptr[j] = bcf_gt_phased(bcf_gt_allele(ptr[j])); // add phasing; this may need a fix, I think the flag should be set for one allele only?
302 changed++;
303 }
304 return changed;
305 }
306
unphase_gt(int32_t * ptr,int ngts)307 static inline int unphase_gt(int32_t *ptr, int ngts)
308 {
309 int j, changed = 0;
310 for (j=0; j<ngts; j++)
311 {
312 if ( ptr[j]==bcf_int32_vector_end ) break;
313 if ( !bcf_gt_is_phased(ptr[j]) ) continue;
314 ptr[j] = bcf_gt_unphased(bcf_gt_allele(ptr[j])); // remove phasing
315 changed++;
316 }
317
318 // insertion sort
319 int k, l;
320 for (k=1; k<j; k++)
321 {
322 int32_t x = ptr[k];
323 l = k;
324 while ( l>0 && ptr[l-1]>x )
325 {
326 ptr[l] = ptr[l-1];
327 l--;
328 }
329 ptr[l] = x;
330 }
331 return changed;
332 }
set_gt(int32_t * ptr,int ngts,int gt)333 static inline int set_gt(int32_t *ptr, int ngts, int gt)
334 {
335 int j, changed = 0;
336 for (j=0; j<ngts; j++)
337 {
338 if ( ptr[j]==bcf_int32_vector_end ) break;
339 if ( ptr[j] != gt ) changed++;
340 ptr[j] = gt;
341 }
342 return changed;
343 }
set_gt_custom(args_t * args,int32_t * ptr,int ngts,int nals)344 static inline int set_gt_custom(args_t *args, int32_t *ptr, int ngts, int nals)
345 {
346 int i, changed = 0, new_allele = 0;
347 for (i=0; i<args->custom.ploidy; i++)
348 {
349 if ( args->custom.gt[i]==MINOR_ALLELE ) new_allele = args->custom.m_allele;
350 else if ( args->custom.gt[i]==MAJOR_ALLELE ) new_allele = args->custom.M_allele;
351 else new_allele = args->custom.gt[i];
352 if ( new_allele >= nals ) continue;
353 new_allele = args->custom.phased[i] ? bcf_gt_phased(new_allele) : bcf_gt_unphased(new_allele);
354 if ( ptr[i]==new_allele ) continue;
355 ptr[i] = new_allele;
356 changed = 1;
357 }
358 while ( i<ngts )
359 ptr[i++] = bcf_int32_vector_end;
360 return changed;
361 }
362
calc_binom(int na,int nb)363 static inline double calc_binom(int na, int nb)
364 {
365 if ( na + nb == 0 ) return 1;
366
367 /*
368 kfunc.h implements kf_betai, which is the regularized beta function I_x(a,b) = P(X<=a/(a+b))
369 */
370 double prob = na > nb ? 2*kf_betai(na, nb + 1, 0.5) : 2*kf_betai(nb, na + 1, 0.5);
371 if ( prob > 1 ) prob = 1;
372
373 return prob;
374 }
375
process(bcf1_t * rec)376 bcf1_t *process(bcf1_t *rec)
377 {
378 if ( !rec->n_sample ) return rec;
379
380 int ngts = bcf_get_genotypes(args->in_hdr, rec, &args->gts, &args->mgts);
381 ngts /= rec->n_sample;
382 int i, j, changed = 0;
383
384 if ( args->new_mask & GT_CUSTOM )
385 {
386 if ( args->custom.ploidy > ngts ) // increased ploidy, expand the array
387 {
388 if ( args->mgts < args->custom.ploidy * rec->n_sample )
389 {
390 args->mgts = args->custom.ploidy * rec->n_sample;
391 args->gts = (int32_t*)realloc(args->gts,args->mgts*sizeof(*args->gts));
392 }
393 for (i=0; i<rec->n_sample; i++)
394 {
395 int32_t *src = args->gts + (rec->n_sample-i-1)*ngts;
396 int32_t *dst = args->gts + (rec->n_sample-i-1)*args->custom.ploidy;
397 for (j=0; j<ngts; j++) dst[j] = src[j];
398 for (; j<args->custom.ploidy; j++) dst[j] = bcf_int32_vector_end;
399 }
400 ngts = args->custom.ploidy;
401 }
402 }
403
404 int nbinom = 0;
405 if ( args->tgt_mask & GT_BINOM )
406 {
407 nbinom = bcf_get_format_int32(args->in_hdr, rec, args->binom_tag, &args->iarr, &args->miarr);
408 if ( nbinom<0 ) nbinom = 0;
409 nbinom /= rec->n_sample;
410 }
411
412 // Calculating allele frequency for each allele and determining major allele
413 // only do this if use_major is true
414 if ( args->new_mask & GT_MAJOR )
415 {
416 int maxAC = -1, majorAllele = -1;
417 hts_expand(int,rec->n_allele,args->marr,args->arr);
418 int ret = bcf_calc_ac(args->in_hdr,rec,args->arr,BCF_UN_FMT);
419 if ( ret<= 0 )
420 error("Could not calculate allele count at %s:%"PRId64"\n", bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1);
421
422 for (i=0; i < rec->n_allele; ++i)
423 {
424 if (args->arr[i] > maxAC)
425 {
426 maxAC = args->arr[i];
427 majorAllele = i;
428 }
429 }
430
431 // replacing new_gt by major allele
432 args->new_gt = args->new_mask & GT_PHASED ? bcf_gt_phased(majorAllele) : bcf_gt_unphased(majorAllele);
433 if ( args->new_mask & GT_CUSTOM ) args->custom.M_allele = majorAllele;
434 }
435 if ( args->new_mask & GT_MINOR )
436 {
437 hts_expand(int,rec->n_allele,args->marr,args->arr);
438 int ret = bcf_calc_ac(args->in_hdr,rec,args->arr,BCF_UN_FMT);
439 if ( ret<= 0 )
440 error("Could not calculate allele count at %s:%"PRId64"\n", bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1);
441
442 int imax = 0;
443 for (i=1; i<rec->n_allele; i++)
444 if ( args->arr[imax] < args->arr[i] ) imax = i;
445 int imax2 = imax>0 ? 0 : (rec->n_allele>1 ? 1 : 0);
446 for (i=0; i<rec->n_allele; i++)
447 if ( i!=imax && args->arr[imax2] < args->arr[i] ) imax2 = i;
448
449 // replacing new_gt by major allele
450 args->new_gt = args->new_mask & GT_PHASED ? bcf_gt_phased(imax2) : bcf_gt_unphased(imax2);
451 if ( args->new_mask & GT_CUSTOM ) args->custom.m_allele = imax2;
452 }
453
454 // replace gts
455 if ( nbinom && ngts>=2 ) // only diploid genotypes are considered: higher ploidy ignored further, haploid here
456 {
457 if ( args->filter ) filter_test(args->filter,rec,(const uint8_t **)&args->smpl_pass);
458 for (i=0; i<rec->n_sample; i++)
459 {
460 if ( args->smpl_pass )
461 {
462 if ( !args->smpl_pass[i] && args->filter_logic==FLT_INCLUDE ) continue;
463 if ( args->smpl_pass[i] && args->filter_logic==FLT_EXCLUDE ) continue;
464 }
465 int32_t *ptr = args->gts + i*ngts;
466 if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end ) continue;
467 if ( ptr[0]==ptr[1] ) continue; // a hom
468 int ia = bcf_gt_allele(ptr[0]);
469 int ib = bcf_gt_allele(ptr[1]);
470 if ( ia>=nbinom || ib>=nbinom )
471 error("The sample %s has incorrect number of %s fields at %s:%"PRId64"\n",
472 args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1);
473
474 double prob = calc_binom(args->iarr[i*nbinom+ia],args->iarr[i*nbinom+ib]);
475 if ( !args->binom_cmp(prob,args->binom_val) ) continue;
476
477 if ( args->new_mask>_UNPHASED )
478 changed += unphase_gt(ptr, ngts);
479 else if ( args->new_mask==GT_PHASED )
480 changed += phase_gt(ptr, ngts);
481 else if ( args->new_mask & GT_CUSTOM )
482 changed += set_gt_custom(args, ptr, ngts, rec->n_allele);
483 else
484 changed += set_gt(ptr, ngts, args->new_gt);
485 }
486 }
487 else if ( args->tgt_mask>_QUERY )
488 {
489 int pass_site = filter_test(args->filter,rec,(const uint8_t **)&args->smpl_pass);
490 if ( pass_site && args->filter_logic==FLT_EXCLUDE )
491 {
492 // -i can include a site but exclude a sample, -e exclude a site but include a sample
493 if ( pass_site )
494 {
495 if ( !args->smpl_pass ) return rec;
496 pass_site = 0;
497 for (i=0; i<rec->n_sample; i++)
498 {
499 if ( args->smpl_pass[i] ) args->smpl_pass[i] = 0;
500 else { args->smpl_pass[i] = 1; pass_site = 1; }
501 }
502 if ( !pass_site ) return rec;
503 }
504 else if ( args->smpl_pass )
505 for (i=0; i<rec->n_sample; i++) args->smpl_pass[i] = 1;
506 }
507 else if ( !pass_site ) return rec;
508
509 for (i=0; i<rec->n_sample; i++)
510 {
511 if ( args->smpl_pass && !args->smpl_pass[i] ) continue;
512 if ( args->new_mask>_UNPHASED )
513 changed += unphase_gt(args->gts + i*ngts, ngts);
514 else if ( args->new_mask==GT_PHASED )
515 changed += phase_gt(args->gts + i*ngts, ngts);
516 else if ( args->new_mask & GT_CUSTOM )
517 changed += set_gt_custom(args, args->gts + i*ngts, ngts, rec->n_allele);
518 else
519 changed += set_gt(args->gts + i*ngts, ngts, args->new_gt);
520 }
521 }
522 else
523 {
524 for (i=0; i<rec->n_sample; i++)
525 {
526 int ploidy = 0, nmiss = 0;
527 int32_t *ptr = args->gts + i*ngts;
528 for (j=0; j<ngts; j++)
529 {
530 if ( ptr[j]==bcf_int32_vector_end ) break;
531 ploidy++;
532 if ( ptr[j]==bcf_gt_missing ) nmiss++;
533 }
534
535 int do_set = 0;
536 if ( args->tgt_mask>_ALL ) do_set = 1;
537 else if ( args->tgt_mask>_PARTIAL && nmiss ) do_set = 1;
538 else if ( args->tgt_mask>_MISSING && ploidy==nmiss ) do_set = 1;
539
540 if ( !do_set ) continue;
541
542 if ( args->new_mask>_UNPHASED )
543 changed += unphase_gt(ptr, ngts);
544 else if ( args->new_mask==GT_PHASED )
545 changed += phase_gt(ptr, ngts);
546 else if ( args->new_mask & GT_CUSTOM )
547 changed += set_gt_custom(args, args->gts + i*ngts, ngts, rec->n_allele);
548 else
549 changed += set_gt(ptr, ngts, args->new_gt);
550 }
551 }
552 args->nchanged += changed;
553 if ( changed )
554 {
555 int ret = bcf_update_genotypes(args->out_hdr, rec, args->gts, ngts*rec->n_sample);
556 if ( ret!=0 ) error("Error: failed to update genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(args->in_hdr,rec),rec->pos+1);
557 }
558 return rec;
559 }
560
destroy(void)561 void destroy(void)
562 {
563 fprintf(stderr,"Filled %"PRId64" alleles\n", args->nchanged);
564 free(args->custom.gt);
565 free(args->custom.phased);
566 free(args->binom_tag);
567 if ( args->filter ) filter_destroy(args->filter);
568 free(args->arr);
569 free(args->iarr);
570 free(args->gts);
571 free(args);
572 }
573
574
575