1 /* The MIT License
2
3 Copyright (c) 2018-2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.
24
25 */
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <strings.h>
30 #include <getopt.h>
31 #include <unistd.h> // for isatty
32 #include <inttypes.h>
33 #include <htslib/hts.h>
34 #include <htslib/vcf.h>
35 #include <htslib/kstring.h>
36 #include <htslib/kseq.h>
37 #include <htslib/synced_bcf_reader.h>
38 #include <htslib/vcfutils.h>
39 #include <htslib/kbitset.h>
40 #include <htslib/khash_str2int.h>
41 #include "bcftools.h"
42 #include "filter.h"
43
44
45 // Logic of the filters: include or exclude sites which match the filters?
46 #define FLT_INCLUDE 1
47 #define FLT_EXCLUDE 2
48
49 #define iCHILD 0
50 #define iFATHER 1
51 #define iMOTHER 2
52
53 #define VERBOSE_MENDEL 1
54 #define VERBOSE_TRANSMITTED 2
55
56 typedef struct
57 {
58 int idx[3]; // VCF sample index for father, mother and child
59 int pass; // do all three pass the filters?
60 }
61 trio_t;
62
63 typedef struct
64 {
65 uint32_t
66 npass, // number of genotypes passing the filter
67 nnon_ref, // number of non-reference genotypes
68 nmendel_err, // number of DNMs / mendelian errors
69 nnovel, // a singleton allele, but observed only in the child. Counted as mendel_err as well.
70 nsingleton, // het mother or father different from everyone else
71 ndoubleton, // het mother+child or father+child different from everyone else (transmitted alleles)
72 nts, ntv, // number of transitions and transversions
73 ndnm_recurrent, // number of recurrent DNMs / mendelian errors (counted as GTs, not sites; in ambiguous cases the allele with smaller AF is chosen)
74 ndnm_hom; // number of homozygous DNMs / mendelian errors
75 }
76 trio_stats_t;
77
78 typedef struct
79 {
80 trio_stats_t *stats;
81 filter_t *filter;
82 char *expr;
83 }
84 flt_stats_t;
85
86 typedef struct
87 {
88 kbitset_t *sd_bset; // singleton (1) or doubleton (0) trio?
89 uint32_t
90 nalt, // number of all alternate trios
91 nsd, // number of singleton or doubleton trios
92 *idx; // indexes of the singleton and doubleon trios
93 }
94 alt_trios_t; // for one alt allele
95
96 typedef struct
97 {
98 int max_alt_trios; // maximum number of alternate trios [1]
99 int malt_trios;
100 alt_trios_t *alt_trios;
101 int argc, filter_logic, regions_is_file, targets_is_file;
102 int nflt_str;
103 char *filter_str, **flt_str;
104 char **argv, *ped_fname, *pfm, *output_fname, *fname, *regions, *targets;
105 bcf_srs_t *sr;
106 bcf_hdr_t *hdr;
107 trio_t *trio;
108 int ntrio, mtrio;
109 flt_stats_t *filters;
110 int nfilters;
111 int32_t *gt_arr, *ac, *ac_trio, *dnm_als;
112 int mgt_arr, mac, mac_trio, mdnm_als;
113 int verbose;
114 FILE *fp_out;
115 }
116 args_t;
117
118 args_t args;
119
about(void)120 const char *about(void)
121 {
122 return "Calculate transmission rate and other stats in trio children.\n";
123 }
124
usage_text(void)125 static const char *usage_text(void)
126 {
127 return
128 "\n"
129 "About: Calculate transmission rate in trio children. Use curly brackets to scan\n"
130 " a range of values simultaneously\n"
131 "Usage: bcftools +trio-stats [Plugin Options]\n"
132 "Plugin options:\n"
133 " -a, --alt-trios INT for transmission rate consider only sites with at most this\n"
134 " many alternate trios, 0 for unlimited [0]\n"
135 " -d, --debug TYPE comma-separted list of features: {mendel-errors,transmitted}\n"
136 " -e, --exclude EXPR exclude trios for which the expression is true (one matching sample invalidates a trio)\n"
137 " -i, --include EXPR include trios for which the expression is true (one failing sample invalidates a trio)\n"
138 " -o, --output FILE output file name [stdout]\n"
139 " -p, --ped FILE PED file\n"
140 " -P, --pfm P,F,M sample names of proband, father, and mother\n"
141 " -r, --regions REG restrict to comma-separated list of regions\n"
142 " -R, --regions-file FILE restrict to regions listed in a file\n"
143 " -t, --targets REG similar to -r but streams rather than index-jumps\n"
144 " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
145 "\n"
146 "Example:\n"
147 " bcftools +trio-stats -p file.ped -i 'GQ>{10,20,30,40,50}' file.bcf\n"
148 "\n";
149 }
150
cmp_trios(const void * _a,const void * _b)151 static int cmp_trios(const void *_a, const void *_b)
152 {
153 trio_t *a = (trio_t *) _a;
154 trio_t *b = (trio_t *) _b;
155 int i;
156 int amin = a->idx[0];
157 for (i=1; i<3; i++)
158 if ( amin > a->idx[i] ) amin = a->idx[i];
159 int bmin = b->idx[0];
160 for (i=1; i<3; i++)
161 if ( bmin > b->idx[i] ) bmin = b->idx[i];
162 if ( amin < bmin ) return -1;
163 if ( amin > bmin ) return 1;
164 return 0;
165 }
166
parse_ped(args_t * args,char * fname)167 static void parse_ped(args_t *args, char *fname)
168 {
169 htsFile *fp = hts_open(fname, "r");
170 if ( !fp ) error("Could not read: %s\n", fname);
171
172 kstring_t str = {0,0,0};
173 if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname);
174
175 kstring_t tmp = {0,0,0};
176 void *has_smpl = khash_str2int_init();
177 void *has_trio = khash_str2int_init();
178
179 int dup_trio_warned = 0;
180 int moff = 0, *off = NULL;
181 do
182 {
183 // familyID sampleID paternalID maternalID sex phenotype population relationship siblings secondOrder thirdOrder children comment
184 // BB03 HG01884 HG01885 HG01956 2 0 ACB child 0 0 0 0
185 int ncols = ksplit_core(str.s,0,&moff,&off);
186 if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s);
187
188 int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]);
189 if ( father<0 ) continue;
190 int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]);
191 if ( mother<0 ) continue;
192 int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]);
193 if ( child<0 ) continue;
194
195 tmp.l = 0;
196 ksprintf(&tmp,"%s %s %s",&str.s[off[1]],&str.s[off[2]],&str.s[off[3]]);
197 if ( khash_str2int_has_key(has_trio,tmp.s) )
198 {
199 if ( !dup_trio_warned ) fprintf(stderr,"Warning: The trio \"%s\" is listed multiple times in %s, skipping. (This message is printed only once.)\n",tmp.s,fname);
200 dup_trio_warned = 1;
201 continue;
202 }
203 if ( khash_str2int_has_key(has_smpl,&str.s[off[1]]) )
204 error("Error: The child \"%s\" is listed in two trios\n",&str.s[off[1]]);
205 khash_str2int_inc(has_smpl,strdup(&str.s[off[1]]));
206 khash_str2int_inc(has_trio,strdup(tmp.s));
207
208 args->ntrio++;
209 hts_expand0(trio_t,args->ntrio,args->mtrio,args->trio);
210 trio_t *trio = &args->trio[args->ntrio-1];
211 trio->idx[iFATHER] = father;
212 trio->idx[iMOTHER] = mother;
213 trio->idx[iCHILD] = child;
214 }
215 while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 );
216
217 fprintf(stderr,"Identified %d complete trios in the VCF file\n", args->ntrio);
218 if ( !args->ntrio ) error("No complete trio identified\n");
219
220 // sort the sample by index so that they are accessed more or less sequentially
221 qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios);
222
223 khash_str2int_destroy_free(has_smpl);
224 khash_str2int_destroy_free(has_trio);
225 free(tmp.s);
226 free(str.s);
227 free(off);
228 if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,fname);
229 }
230
parse_filters(args_t * args)231 static void parse_filters(args_t *args)
232 {
233 if ( !args->filter_str ) return;
234 int mflt = 1;
235 args->nflt_str = 1;
236 args->flt_str = (char**) malloc(sizeof(char*));
237 args->flt_str[0] = strdup(args->filter_str);
238 while (1)
239 {
240 int i, expanded = 0;
241 for (i=args->nflt_str-1; i>=0; i--)
242 {
243 char *exp_beg = strchr(args->flt_str[i], '{');
244 if ( !exp_beg ) continue;
245 char *exp_end = strchr(exp_beg+1, '}');
246 if ( !exp_end ) error("Could not parse the expression: %s\n", args->filter_str);
247 char *beg = exp_beg+1, *mid = beg;
248 while ( mid<exp_end )
249 {
250 while ( mid<exp_end && *mid!=',' ) mid++;
251 kstring_t tmp = {0,0,0};
252 kputsn(args->flt_str[i], exp_beg - args->flt_str[i], &tmp);
253 kputsn(beg, mid - beg, &tmp);
254 kputs(exp_end+1, &tmp);
255 args->nflt_str++;
256 hts_expand(char*, args->nflt_str, mflt, args->flt_str);
257 args->flt_str[args->nflt_str-1] = tmp.s;
258 beg = ++mid;
259 }
260 expanded = 1;
261 free(args->flt_str[i]);
262 memmove(&args->flt_str[i], &args->flt_str[i+1], (args->nflt_str-i-1)*sizeof(*args->flt_str));
263 args->nflt_str--;
264 args->flt_str[args->nflt_str] = NULL;
265 }
266 if ( !expanded ) break;
267 }
268
269 fprintf(stderr,"Collecting data for %d filtering expressions\n", args->nflt_str);
270 }
271
init_data(args_t * args)272 static void init_data(args_t *args)
273 {
274 args->sr = bcf_sr_init();
275 if ( args->regions )
276 {
277 args->sr->require_index = 1;
278 if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions);
279 }
280 if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets);
281 if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
282 args->hdr = bcf_sr_get_header(args->sr,0);
283
284 if ( args->ped_fname )
285 parse_ped(args, args->ped_fname);
286 else
287 {
288 args->ntrio = 1;
289 args->trio = (trio_t*) calloc(1,sizeof(trio_t));
290 int ibeg, iend = 0;
291 while ( args->pfm[iend] && args->pfm[iend]!=',' ) iend++;
292 if ( !args->pfm[iend] ) error("Could not parse -P %s\n", args->pfm);
293 args->pfm[iend] = 0;
294 int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm);
295 if ( child<0 ) error("No such sample: \"%s\"\n", args->pfm);
296 args->pfm[iend] = ',';
297 ibeg = ++iend;
298 while ( args->pfm[iend] && args->pfm[iend]!=',' ) iend++;
299 if ( !args->pfm[iend] ) error("Could not parse -P %s\n", args->pfm);
300 args->pfm[iend] = 0;
301 int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm+ibeg);
302 if ( father<0 ) error("No such sample: \"%s\"\n", args->pfm+ibeg);
303 args->pfm[iend] = ',';
304 ibeg = ++iend;
305 int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm+ibeg);
306 if ( mother<0 ) error("No such sample: \"%s\"\n", args->pfm+ibeg);
307 args->trio[0].idx[iFATHER] = father;
308 args->trio[0].idx[iMOTHER] = mother;
309 args->trio[0].idx[iCHILD] = child;
310 }
311 parse_filters(args);
312
313 int i;
314 if ( !args->nflt_str )
315 {
316 args->filters = (flt_stats_t*) calloc(1, sizeof(flt_stats_t));
317 args->nfilters = 1;
318 args->filters[0].expr = strdup("all");
319 }
320 else
321 {
322 args->nfilters = args->nflt_str;
323 args->filters = (flt_stats_t*) calloc(args->nfilters, sizeof(flt_stats_t));
324 for (i=0; i<args->nfilters; i++)
325 {
326 args->filters[i].filter = filter_init(args->hdr, args->flt_str[i]);
327 args->filters[i].expr = strdup(args->flt_str[i]);
328
329 // replace tab's with spaces so that the output stays parsable
330 char *tmp = args->filters[i].expr;
331 while ( *tmp )
332 {
333 if ( *tmp=='\t' ) *tmp = ' ';
334 tmp++;
335 }
336 }
337 }
338 for (i=0; i<args->nfilters; i++)
339 args->filters[i].stats = (trio_stats_t*) calloc(args->ntrio,sizeof(trio_stats_t));
340
341 args->fp_out = !args->output_fname || !strcmp("-",args->output_fname) ? stdout : fopen(args->output_fname,"w");
342 if ( !args->fp_out ) error("Could not open the file for writing: %s\n", args->output_fname);
343 fprintf(args->fp_out,"# CMD line shows the command line used to generate this output\n");
344 fprintf(args->fp_out,"# DEF lines define expressions for all tested thresholds\n");
345 fprintf(args->fp_out,"# FLT* lines report numbers for every threshold and every trio:\n");
346 i = 0;
347 fprintf(args->fp_out,"# %d) filter id\n", ++i);
348 fprintf(args->fp_out,"# %d) child\n", ++i);
349 fprintf(args->fp_out,"# %d) father\n", ++i);
350 fprintf(args->fp_out,"# %d) mother\n", ++i);
351 fprintf(args->fp_out,"# %d) number of valid trio genotypes (all trio members pass filters, all non-missing)\n", ++i);
352 fprintf(args->fp_out,"# %d) number of non-reference trio GTs (at least one trio member carries an alternate allele)\n", ++i);
353 fprintf(args->fp_out,"# %d) number of DNMs/Mendelian errors\n", ++i);
354 fprintf(args->fp_out,"# %d) number of novel singleton alleles in the child (counted also as DNM / Mendelian error)\n", ++i);
355 fprintf(args->fp_out,"# %d) number of untransmitted trio singletons (one alternate allele present in one parent)\n", ++i);
356 fprintf(args->fp_out,"# %d) number of transmitted trio singletons (one alternate allele present in one parent and the child)\n", ++i);
357 fprintf(args->fp_out,"# %d) number of transitions, all distinct ALT alleles present in the trio are considered\n", ++i);
358 fprintf(args->fp_out,"# %d) number of transversions, all distinct ALT alleles present in the trio are considered\n", ++i);
359 fprintf(args->fp_out,"# %d) overall ts/tv, all distinct ALT alleles present in the trio are considered\n", ++i);
360 fprintf(args->fp_out,"# %d) number of homozygous DNMs/Mendelian errors (likely genotyping errors)\n", ++i);
361 fprintf(args->fp_out,"# %d) number of recurrent DNMs/Mendelian errors (non-inherited alleles present in other samples; counts GTs, not sites)\n", ++i);
362 fprintf(args->fp_out, "CMD\t%s", args->argv[0]);
363 for (i=1; i<args->argc; i++) fprintf(args->fp_out, " %s",args->argv[i]);
364 fprintf(args->fp_out, "\n");
365 }
alt_trios_reset(args_t * args,int nals)366 static void alt_trios_reset(args_t *args, int nals)
367 {
368 int i;
369 hts_expand0(alt_trios_t, nals, args->malt_trios, args->alt_trios);
370 for (i=0; i<nals; i++)
371 {
372 alt_trios_t *tr = &args->alt_trios[i];
373 if ( !tr->idx )
374 {
375 tr->idx = (uint32_t*)malloc(sizeof(*tr->idx)*args->ntrio);
376 tr->sd_bset = kbs_init(args->ntrio);
377 }
378 else
379 kbs_clear(tr->sd_bset);
380 tr->nsd = 0;
381 tr->nalt = 0;
382 }
383 }
alt_trios_destroy(args_t * args)384 static void alt_trios_destroy(args_t *args)
385 {
386 if ( !args->max_alt_trios ) return;
387 int i;
388 for (i=0; i<args->malt_trios; i++)
389 {
390 free(args->alt_trios[i].idx);
391 kbs_destroy(args->alt_trios[i].sd_bset);
392 }
393 free(args->alt_trios);
394 }
alt_trios_add(args_t * args,int itrio,int ial,int is_singleton)395 static inline void alt_trios_add(args_t *args, int itrio, int ial, int is_singleton)
396 {
397 alt_trios_t *tr = &args->alt_trios[ial];
398 if ( is_singleton ) kbs_insert(tr->sd_bset, tr->nsd);
399 tr->idx[ tr->nsd++ ] = itrio;
400 }
destroy_data(args_t * args)401 static void destroy_data(args_t *args)
402 {
403 int i;
404 for (i=0; i<args->nfilters; i++)
405 {
406 if ( args->filters[i].filter ) filter_destroy(args->filters[i].filter);
407 free(args->filters[i].stats);
408 free(args->filters[i].expr);
409 }
410 free(args->filters);
411 for (i=0; i<args->nflt_str; i++) free(args->flt_str[i]);
412 free(args->flt_str);
413 bcf_sr_destroy(args->sr);
414 alt_trios_destroy(args);
415 free(args->trio);
416 free(args->ac);
417 free(args->ac_trio);
418 free(args->gt_arr);
419 free(args->dnm_als);
420 if ( fclose(args->fp_out)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? "stdout" : args->output_fname);
421 free(args);
422 }
report_stats(args_t * args)423 static void report_stats(args_t *args)
424 {
425 int i = 0,j;
426 for (i=0; i<args->nfilters; i++)
427 {
428 flt_stats_t *flt = &args->filters[i];
429 fprintf(args->fp_out,"DEF\tFLT%d\t%s\n", i, flt->expr);
430 }
431 for (i=0; i<args->nfilters; i++)
432 {
433 flt_stats_t *flt = &args->filters[i];
434 for (j=0; j<args->ntrio; j++)
435 {
436 fprintf(args->fp_out,"FLT%d", i);
437 fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iCHILD]]);
438 fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iFATHER]]);
439 fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iMOTHER]]);
440 trio_stats_t *stats = &flt->stats[j];
441 fprintf(args->fp_out,"\t%d", stats->npass);
442 fprintf(args->fp_out,"\t%d", stats->nnon_ref);
443 fprintf(args->fp_out,"\t%d", stats->nmendel_err);
444 fprintf(args->fp_out,"\t%d", stats->nnovel);
445 fprintf(args->fp_out,"\t%d", stats->nsingleton);
446 fprintf(args->fp_out,"\t%d", stats->ndoubleton);
447 fprintf(args->fp_out,"\t%d", stats->nts);
448 fprintf(args->fp_out,"\t%d", stats->ntv);
449 fprintf(args->fp_out,"\t%.2f", stats->ntv ? (float)stats->nts/stats->ntv : INFINITY);
450 fprintf(args->fp_out,"\t%d", stats->ndnm_hom);
451 fprintf(args->fp_out,"\t%d", stats->ndnm_recurrent);
452 fprintf(args->fp_out,"\n");
453 }
454 }
455 }
456
parse_genotype(int32_t * arr,int ngt1,int idx,int als[2])457 static inline int parse_genotype(int32_t *arr, int ngt1, int idx, int als[2])
458 {
459 int32_t *ptr = arr + ngt1 * idx;
460 if ( bcf_gt_is_missing(ptr[0]) ) return -1;
461 als[0] = bcf_gt_allele(ptr[0]);
462
463 // treat haploid GTs as homozygous diploid
464 if ( ngt1==1 || ptr[1]==bcf_int32_vector_end ) { als[1] = als[0]; return 0; }
465
466 if ( bcf_gt_is_missing(ptr[1]) ) return -1;
467 als[1] = bcf_gt_allele(ptr[1]);
468
469 return 0;
470 }
471
process_record(args_t * args,bcf1_t * rec,flt_stats_t * flt)472 static void process_record(args_t *args, bcf1_t *rec, flt_stats_t *flt)
473 {
474 int i,j;
475
476 // Find out which trios pass and if the site passes
477 if ( flt->filter )
478 {
479 uint8_t *smpl_pass = NULL;
480 int pass_site = filter_test(flt->filter, rec, (const uint8_t**) &smpl_pass);
481 if ( args->filter_logic & FLT_EXCLUDE )
482 {
483 if ( pass_site )
484 {
485 if ( !smpl_pass ) return;
486 pass_site = 0;
487 for (i=0; i<args->ntrio; i++)
488 {
489 int pass_trio = 1;
490 for (j=0; j<3; j++)
491 {
492 int idx = args->trio[i].idx[j];
493 if ( smpl_pass[idx] ) { pass_trio = 0; break; }
494 }
495 args->trio[i].pass = pass_trio;
496 if ( pass_trio ) pass_site = 1;
497 }
498 if ( !pass_site ) return;
499 }
500 else
501 for (i=0; i<args->ntrio; i++) args->trio[i].pass = 1;
502 }
503 else if ( !pass_site ) return;
504 else if ( smpl_pass )
505 {
506 pass_site = 0;
507 for (i=0; i<args->ntrio; i++)
508 {
509 int pass_trio = 1;
510 for (j=0; j<3; j++)
511 {
512 int idx = args->trio[i].idx[j];
513 if ( !smpl_pass[idx] ) { pass_trio = 0; break; }
514 }
515 args->trio[i].pass = pass_trio;
516 if ( pass_trio ) pass_site = 1;
517 }
518 if ( !pass_site ) return;
519 }
520 else
521 for (i=0; i<args->ntrio; i++) args->trio[i].pass = 1;
522 }
523
524 // Find out the allele counts. Try to use INFO/AC, if not present, determine from the genotypes
525 hts_expand(int, rec->n_allele, args->mac, args->ac);
526 if ( !bcf_calc_ac(args->hdr, rec, args->ac, BCF_UN_INFO|BCF_UN_FMT) ) return;
527 hts_expand(int, rec->n_allele, args->mac_trio, args->ac_trio);
528 hts_expand(int, rec->n_allele, args->mdnm_als, args->dnm_als);
529
530 // Get the genotypes
531 int ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
532 if ( ngt<0 ) return;
533 int ngt1 = ngt / rec->n_sample;
534
535
536 // For ts/tv: numeric code of the reference allele, -1 for insertions
537 int ref = !rec->d.allele[0][1] ? bcf_acgt2int(*rec->d.allele[0]) : -1;
538
539 int star_allele = -1;
540 for (i=1; i<rec->n_allele; i++)
541 if ( !rec->d.allele[i][1] && rec->d.allele[i][0]=='*' ) { star_allele = i; break; }
542
543 // number of non-reference trios
544 if ( args->max_alt_trios ) alt_trios_reset(args, rec->n_allele);
545
546 // Run the stats
547 for (i=0; i<args->ntrio; i++)
548 {
549 if ( flt->filter && !args->trio[i].pass ) continue;
550 trio_stats_t *stats = &flt->stats[i];
551
552 // Determine the alternate allele and the genotypes, skip if any of the alleles is missing.
553 // the order is: child, father, mother
554 int als[6], *als_child = als, *als_father = als+2, *als_mother = als+4;
555 if ( parse_genotype(args->gt_arr, ngt1, args->trio[i].idx[iCHILD], als_child) < 0 ) continue;
556 if ( parse_genotype(args->gt_arr, ngt1, args->trio[i].idx[iFATHER], als_father) < 0 ) continue;
557 if ( parse_genotype(args->gt_arr, ngt1, args->trio[i].idx[iMOTHER], als_mother) < 0 ) continue;
558
559 stats->npass++;
560
561 // Has the trio an alternate allele other than *?
562 int has_star_allele = 0, has_nonref = 0;
563 memset(args->ac_trio,0,rec->n_allele*sizeof(*args->ac_trio));
564 for (j=0; j<6; j++)
565 {
566 if ( als[j]==star_allele ) { has_star_allele = 1; continue; }
567 if ( als[j]!=0 ) has_nonref = 1;
568 args->ac_trio[ als[j] ]++;
569 }
570 if ( !has_nonref ) continue; // only ref or * in this trio
571
572 stats->nnon_ref++;
573
574 // Calculate ts/tv. It does the right thing and handles also HetAA genotypes
575 if ( ref != -1 )
576 {
577 int has_ts = 0, has_tv = 0;
578 for (j=0; j<6; j++)
579 {
580 if ( als[j]==0 || als[j]==star_allele ) continue;
581 if ( als[j] >= rec->n_allele )
582 error("The GT index is out of range at %s:%"PRId64" in %s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->hdr->samples[args->trio[i].idx[j/2]]);
583 if ( rec->d.allele[als[j]][1] ) continue;
584
585 int alt = bcf_acgt2int(rec->d.allele[als[j]][0]);
586 if ( abs(ref-alt)==2 ) has_ts = 1;
587 else has_tv = 1;
588 }
589 if ( has_ts ) stats->nts++;
590 if ( has_tv ) stats->ntv++;
591 }
592
593 // Skip some stats if the star allele is present as it was already checked at the primary record, we do not want to count the same
594 // thing multiple times. There can be other alternate allele, but we ignore that for simplicity.
595 if ( has_star_allele ) continue;
596
597 // Detect mendelian errors
598 int a0F = als_child[0]==als_father[0] || als_child[0]==als_father[1] ? 1 : 0;
599 int a1M = als_child[1]==als_mother[0] || als_child[1]==als_mother[1] ? 1 : 0;
600 if ( !a0F || !a1M )
601 {
602 int a0M = als_child[0]==als_mother[0] || als_child[0]==als_mother[1] ? 1 : 0;
603 int a1F = als_child[1]==als_father[0] || als_child[1]==als_father[1] ? 1 : 0;
604 if ( !a0M || !a1F )
605 {
606 stats->nmendel_err++;
607
608 int dnm_hom = 0;
609 if ( als_child[0]==als_child[1] ) { stats->ndnm_hom++; dnm_hom = 1; }
610
611 int culprit; // neglecting the unlikely possibility of alt het 1/2 DNM genotype
612 if ( !a0F && !a0M ) culprit = als_child[0];
613 else if ( !a1F && !a1M ) culprit = als_child[1];
614 else if ( args->ac[als_child[0]] < args->ac[als_child[1]] ) culprit = als_child[0];
615 else culprit = als_child[1];
616
617 int dnm_recurrent = 0;
618 if ( (!dnm_hom && args->ac[culprit]>1) || (dnm_hom && args->ac[culprit]>2) ) { stats->ndnm_recurrent++; dnm_recurrent = 1; }
619
620 if ( args->verbose & VERBOSE_MENDEL )
621 fprintf(args->fp_out,"MERR\t%s\t%"PRId64"\t%s\t%s\t%s\t%s\t%s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,
622 args->hdr->samples[args->trio[i].idx[iCHILD]],
623 args->hdr->samples[args->trio[i].idx[iFATHER]],
624 args->hdr->samples[args->trio[i].idx[iMOTHER]],
625 dnm_hom ? "HOM" : "-",
626 dnm_recurrent ? "RECURRENT" : "-"
627 );
628 }
629 }
630
631 // Is this a singleton, doubleton, neither?
632 for (j=0; j<rec->n_allele; j++)
633 {
634 if ( !args->ac_trio[j] ) continue;
635 if ( args->max_alt_trios ) args->alt_trios[j].nalt++;
636
637 if ( args->ac_trio[j]==1 ) // singleton (in parent) or novel (in child)
638 {
639 if ( als_child[0]==j || als_child[1]==j ) stats->nnovel++;
640 else
641 {
642 if ( !args->max_alt_trios )
643 {
644 stats->nsingleton++;
645 if ( args->verbose & VERBOSE_TRANSMITTED )
646 fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tNO\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,
647 args->hdr->samples[args->trio[i].idx[iCHILD]],
648 args->hdr->samples[args->trio[i].idx[iFATHER]],
649 args->hdr->samples[args->trio[i].idx[iMOTHER]]
650 );
651 }
652 else alt_trios_add(args, i,j,1);
653 }
654 }
655 else if ( args->ac_trio[j]==2 ) // possibly a doubleton
656 {
657 if ( (als_child[0]!=j && als_child[1]!=j) || (als_child[0]==j && als_child[1]==j) ) continue;
658 if ( (als_father[0]==j && als_father[1]==j) || (als_mother[0]==j && als_mother[1]==j) ) continue;
659 if ( !args->max_alt_trios )
660 {
661 stats->ndoubleton++;
662 if ( args->verbose & VERBOSE_TRANSMITTED )
663 fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tYES\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,
664 args->hdr->samples[args->trio[i].idx[iCHILD]],
665 args->hdr->samples[args->trio[i].idx[iFATHER]],
666 args->hdr->samples[args->trio[i].idx[iMOTHER]]
667 );
668 }
669 else alt_trios_add(args, i,j,0);
670 }
671 }
672 }
673 if ( args->max_alt_trios )
674 {
675 for (j=0; j<rec->n_allele; j++)
676 {
677 alt_trios_t *tr = &args->alt_trios[j];
678 if ( !tr->nsd || tr->nalt > args->max_alt_trios ) continue;
679 for (i=0; i<tr->nsd; i++)
680 {
681 int itr = tr->idx[i];
682 trio_stats_t *stats = &flt->stats[itr];
683 if ( kbs_exists(tr->sd_bset,i) )
684 {
685 stats->nsingleton++;
686 if ( args->verbose & VERBOSE_TRANSMITTED )
687 fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tNO\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,
688 args->hdr->samples[args->trio[itr].idx[iCHILD]],
689 args->hdr->samples[args->trio[itr].idx[iFATHER]],
690 args->hdr->samples[args->trio[itr].idx[iMOTHER]]
691 );
692 }
693 else
694 {
695 stats->ndoubleton++;
696 if ( args->verbose & VERBOSE_TRANSMITTED )
697 fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tYES\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,
698 args->hdr->samples[args->trio[itr].idx[iCHILD]],
699 args->hdr->samples[args->trio[itr].idx[iFATHER]],
700 args->hdr->samples[args->trio[itr].idx[iMOTHER]]
701 );
702 }
703 }
704 }
705 }
706 }
707
run(int argc,char ** argv)708 int run(int argc, char **argv)
709 {
710 args_t *args = (args_t*) calloc(1,sizeof(args_t));
711 args->argc = argc; args->argv = argv;
712 args->output_fname = "-";
713 static struct option loptions[] =
714 {
715 {"debug",required_argument,0,'d'},
716 {"alt-trios",required_argument,0,'a'},
717 {"include",required_argument,0,'i'},
718 {"exclude",required_argument,0,'e'},
719 {"output",required_argument,NULL,'o'},
720 {"ped",required_argument,NULL,'p'},
721 {"pfm",required_argument,NULL,'P'},
722 {"regions",1,0,'r'},
723 {"regions-file",1,0,'R'},
724 {"targets",1,0,'t'},
725 {"targets-file",1,0,'T'},
726 {NULL,0,NULL,0}
727 };
728 int c, i;
729 while ((c = getopt_long(argc, argv, "P:p:o:s:i:e:r:R:t:T:a:d:",loptions,NULL)) >= 0)
730 {
731 switch (c)
732 {
733 case 'd':
734 {
735 int n;
736 char **tmp = hts_readlist(optarg, 0, &n);
737 for(i=0; i<n; i++)
738 {
739 if ( !strcasecmp(tmp[i],"mendel-errors") ) args->verbose |= VERBOSE_MENDEL;
740 else if ( !strcasecmp(tmp[i],"transmitted") ) args->verbose |= VERBOSE_TRANSMITTED;
741 else error("Error: The argument \"%s\" to option --debug is not recognised\n", tmp[i]);
742 free(tmp[i]);
743 }
744 free(tmp);
745 break;
746 }
747 case 'a': args->max_alt_trios = atoi(optarg); break;
748 case 'e':
749 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
750 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
751 case 'i':
752 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
753 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
754 case 't': args->targets = optarg; break;
755 case 'T': args->targets = optarg; args->targets_is_file = 1; break;
756 case 'r': args->regions = optarg; break;
757 case 'R': args->regions = optarg; args->regions_is_file = 1; break;
758 case 'o': args->output_fname = optarg; break;
759 case 'p': args->ped_fname = optarg; break;
760 case 'P': args->pfm = optarg; break;
761 case 'h':
762 case '?':
763 default: error("%s", usage_text()); break;
764 }
765 }
766 if ( optind==argc )
767 {
768 if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin
769 else { error("%s", usage_text()); }
770 }
771 else if ( optind+1!=argc ) error("%s", usage_text());
772 else args->fname = argv[optind];
773
774 if ( !args->ped_fname && !args->pfm ) error("Missing the -p or -P option\n");
775
776 init_data(args);
777
778 while ( bcf_sr_next_line(args->sr) )
779 {
780 bcf1_t *rec = bcf_sr_get_line(args->sr,0);
781 for (i=0; i<args->nfilters; i++)
782 process_record(args, rec, &args->filters[i]);
783 }
784
785 report_stats(args);
786 destroy_data(args);
787
788 return 0;
789 }
790