1 /* The MIT License
2 
3    Copyright (c) 2018-2021 Genome Research Ltd.
4 
5    Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7    Permission is hereby granted, free of charge, to any person obtaining a copy
8    of this software and associated documentation files (the "Software"), to deal
9    in the Software without restriction, including without limitation the rights
10    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11    copies of the Software, and to permit persons to whom the Software is
12    furnished to do so, subject to the following conditions:
13 
14    The above copyright notice and this permission notice shall be included in
15    all copies or substantial portions of the Software.
16 
17    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23    THE SOFTWARE.
24 
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <strings.h>
30 #include <getopt.h>
31 #include <unistd.h>     // for isatty
32 #include <inttypes.h>
33 #include <htslib/hts.h>
34 #include <htslib/vcf.h>
35 #include <htslib/kstring.h>
36 #include <htslib/kseq.h>
37 #include <htslib/synced_bcf_reader.h>
38 #include <htslib/vcfutils.h>
39 #include <htslib/kbitset.h>
40 #include <htslib/khash_str2int.h>
41 #include "bcftools.h"
42 #include "filter.h"
43 
44 
45 // Logic of the filters: include or exclude sites which match the filters?
46 #define FLT_INCLUDE 1
47 #define FLT_EXCLUDE 2
48 
49 #define iCHILD  0
50 #define iFATHER 1
51 #define iMOTHER 2
52 
53 #define VERBOSE_MENDEL 1
54 #define VERBOSE_TRANSMITTED 2
55 
56 typedef struct
57 {
58     int idx[3];     // VCF sample index for father, mother and child
59     int pass;       // do all three pass the filters?
60 }
61 trio_t;
62 
63 typedef struct
64 {
65     uint32_t
66         npass,          // number of genotypes passing the filter
67         nnon_ref,       // number of non-reference genotypes
68         nmendel_err,    // number of DNMs / mendelian errors
69         nnovel,         // a singleton allele, but observed only in the child. Counted as mendel_err as well.
70         nsingleton,     // het mother or father different from everyone else
71         ndoubleton,     // het mother+child or father+child different from everyone else (transmitted alleles)
72         nts, ntv,       // number of transitions and transversions
73         ndnm_recurrent, // number of recurrent DNMs / mendelian errors (counted as GTs, not sites; in ambiguous cases the allele with smaller AF is chosen)
74         ndnm_hom;       // number of homozygous DNMs / mendelian errors
75 }
76 trio_stats_t;
77 
78 typedef struct
79 {
80     trio_stats_t *stats;
81     filter_t *filter;
82     char *expr;
83 }
84 flt_stats_t;
85 
86 typedef struct
87 {
88     kbitset_t *sd_bset; // singleton (1) or doubleton (0) trio?
89     uint32_t
90         nalt,   // number of all alternate trios
91         nsd,    // number of singleton or doubleton trios
92         *idx;   // indexes of the singleton and doubleon trios
93 }
94 alt_trios_t;    // for one alt allele
95 
96 typedef struct
97 {
98     int max_alt_trios;      // maximum number of alternate trios [1]
99     int malt_trios;
100     alt_trios_t *alt_trios;
101     int argc, filter_logic, regions_is_file, targets_is_file;
102     int nflt_str;
103     char *filter_str, **flt_str;
104     char **argv, *ped_fname, *pfm, *output_fname, *fname, *regions, *targets;
105     bcf_srs_t *sr;
106     bcf_hdr_t *hdr;
107     trio_t *trio;
108     int ntrio, mtrio;
109     flt_stats_t *filters;
110     int nfilters;
111     int32_t *gt_arr, *ac, *ac_trio, *dnm_als;
112     int mgt_arr, mac, mac_trio, mdnm_als;
113     int verbose;
114     FILE *fp_out;
115 }
116 args_t;
117 
118 args_t args;
119 
about(void)120 const char *about(void)
121 {
122     return "Calculate transmission rate and other stats in trio children.\n";
123 }
124 
usage_text(void)125 static const char *usage_text(void)
126 {
127     return
128         "\n"
129         "About: Calculate transmission rate in trio children. Use curly brackets to scan\n"
130         "       a range of values simultaneously\n"
131         "Usage: bcftools +trio-stats [Plugin Options]\n"
132         "Plugin options:\n"
133         "   -a, --alt-trios INT         for transmission rate consider only sites with at most this\n"
134         "                                   many alternate trios, 0 for unlimited [0]\n"
135         "   -d, --debug TYPE            comma-separted list of features: {mendel-errors,transmitted}\n"
136         "   -e, --exclude EXPR          exclude trios for which the expression is true (one matching sample invalidates a trio)\n"
137         "   -i, --include EXPR          include trios for which the expression is true (one failing sample invalidates a trio)\n"
138         "   -o, --output FILE           output file name [stdout]\n"
139         "   -p, --ped FILE              PED file\n"
140         "   -P, --pfm P,F,M             sample names of proband, father, and mother\n"
141         "   -r, --regions REG           restrict to comma-separated list of regions\n"
142         "   -R, --regions-file FILE     restrict to regions listed in a file\n"
143         "   -t, --targets REG           similar to -r but streams rather than index-jumps\n"
144         "   -T, --targets-file FILE     similar to -R but streams rather than index-jumps\n"
145         "\n"
146         "Example:\n"
147         "   bcftools +trio-stats -p file.ped -i 'GQ>{10,20,30,40,50}' file.bcf\n"
148         "\n";
149 }
150 
cmp_trios(const void * _a,const void * _b)151 static int cmp_trios(const void *_a, const void *_b)
152 {
153     trio_t *a = (trio_t *) _a;
154     trio_t *b = (trio_t *) _b;
155     int i;
156     int amin = a->idx[0];
157     for (i=1; i<3; i++)
158         if ( amin > a->idx[i] ) amin = a->idx[i];
159     int bmin = b->idx[0];
160     for (i=1; i<3; i++)
161         if ( bmin > b->idx[i] ) bmin = b->idx[i];
162     if ( amin < bmin ) return -1;
163     if ( amin > bmin ) return 1;
164     return 0;
165 }
166 
parse_ped(args_t * args,char * fname)167 static void parse_ped(args_t *args, char *fname)
168 {
169     htsFile *fp = hts_open(fname, "r");
170     if ( !fp ) error("Could not read: %s\n", fname);
171 
172     kstring_t str = {0,0,0};
173     if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname);
174 
175     kstring_t tmp = {0,0,0};
176     void *has_smpl = khash_str2int_init();
177     void *has_trio = khash_str2int_init();
178 
179     int dup_trio_warned = 0;
180     int moff = 0, *off = NULL;
181     do
182     {
183         // familyID    sampleID paternalID maternalID sex   phenotype   population relationship   siblings   secondOrder   thirdOrder   children    comment
184         // BB03    HG01884 HG01885 HG01956 2   0   ACB child   0   0   0   0
185         int ncols = ksplit_core(str.s,0,&moff,&off);
186         if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s);
187 
188         int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]);
189         if ( father<0 ) continue;
190         int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]);
191         if ( mother<0 ) continue;
192         int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]);
193         if ( child<0 ) continue;
194 
195         tmp.l = 0;
196         ksprintf(&tmp,"%s %s %s",&str.s[off[1]],&str.s[off[2]],&str.s[off[3]]);
197         if ( khash_str2int_has_key(has_trio,tmp.s) )
198         {
199             if ( !dup_trio_warned ) fprintf(stderr,"Warning: The trio \"%s\" is listed multiple times in %s, skipping. (This message is printed only once.)\n",tmp.s,fname);
200             dup_trio_warned = 1;
201             continue;
202         }
203         if ( khash_str2int_has_key(has_smpl,&str.s[off[1]]) )
204             error("Error: The child \"%s\" is listed in two trios\n",&str.s[off[1]]);
205         khash_str2int_inc(has_smpl,strdup(&str.s[off[1]]));
206         khash_str2int_inc(has_trio,strdup(tmp.s));
207 
208         args->ntrio++;
209         hts_expand0(trio_t,args->ntrio,args->mtrio,args->trio);
210         trio_t *trio = &args->trio[args->ntrio-1];
211         trio->idx[iFATHER] = father;
212         trio->idx[iMOTHER] = mother;
213         trio->idx[iCHILD]  = child;
214     }
215     while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 );
216 
217     fprintf(stderr,"Identified %d complete trios in the VCF file\n", args->ntrio);
218     if ( !args->ntrio ) error("No complete trio identified\n");
219 
220     // sort the sample by index so that they are accessed more or less sequentially
221     qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios);
222 
223     khash_str2int_destroy_free(has_smpl);
224     khash_str2int_destroy_free(has_trio);
225     free(tmp.s);
226     free(str.s);
227     free(off);
228     if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,fname);
229 }
230 
parse_filters(args_t * args)231 static void parse_filters(args_t *args)
232 {
233     if ( !args->filter_str ) return;
234     int mflt = 1;
235     args->nflt_str = 1;
236     args->flt_str  = (char**) malloc(sizeof(char*));
237     args->flt_str[0] = strdup(args->filter_str);
238     while (1)
239     {
240         int i, expanded = 0;
241         for (i=args->nflt_str-1; i>=0; i--)
242         {
243             char *exp_beg = strchr(args->flt_str[i], '{');
244             if ( !exp_beg ) continue;
245             char *exp_end = strchr(exp_beg+1, '}');
246             if ( !exp_end ) error("Could not parse the expression: %s\n", args->filter_str);
247             char *beg = exp_beg+1, *mid = beg;
248             while ( mid<exp_end )
249             {
250                 while ( mid<exp_end && *mid!=',' ) mid++;
251                 kstring_t tmp = {0,0,0};
252                 kputsn(args->flt_str[i], exp_beg - args->flt_str[i], &tmp);
253                 kputsn(beg, mid - beg, &tmp);
254                 kputs(exp_end+1, &tmp);
255                 args->nflt_str++;
256                 hts_expand(char*, args->nflt_str, mflt, args->flt_str);
257                 args->flt_str[args->nflt_str-1] = tmp.s;
258                 beg = ++mid;
259             }
260             expanded = 1;
261             free(args->flt_str[i]);
262             memmove(&args->flt_str[i], &args->flt_str[i+1], (args->nflt_str-i-1)*sizeof(*args->flt_str));
263             args->nflt_str--;
264             args->flt_str[args->nflt_str] = NULL;
265         }
266         if ( !expanded ) break;
267     }
268 
269     fprintf(stderr,"Collecting data for %d filtering expressions\n", args->nflt_str);
270 }
271 
init_data(args_t * args)272 static void init_data(args_t *args)
273 {
274     args->sr = bcf_sr_init();
275     if ( args->regions )
276     {
277         args->sr->require_index = 1;
278         if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions);
279     }
280     if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets);
281     if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
282     args->hdr = bcf_sr_get_header(args->sr,0);
283 
284     if ( args->ped_fname )
285         parse_ped(args, args->ped_fname);
286     else
287     {
288         args->ntrio = 1;
289         args->trio = (trio_t*) calloc(1,sizeof(trio_t));
290         int ibeg, iend = 0;
291         while ( args->pfm[iend] && args->pfm[iend]!=',' ) iend++;
292         if ( !args->pfm[iend] ) error("Could not parse -P %s\n", args->pfm);
293         args->pfm[iend] = 0;
294         int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm);
295         if ( child<0 ) error("No such sample: \"%s\"\n", args->pfm);
296         args->pfm[iend] = ',';
297         ibeg = ++iend;
298         while ( args->pfm[iend] && args->pfm[iend]!=',' ) iend++;
299         if ( !args->pfm[iend] ) error("Could not parse -P %s\n", args->pfm);
300         args->pfm[iend] = 0;
301         int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm+ibeg);
302         if ( father<0 ) error("No such sample: \"%s\"\n", args->pfm+ibeg);
303         args->pfm[iend] = ',';
304         ibeg = ++iend;
305         int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm+ibeg);
306         if ( mother<0 ) error("No such sample: \"%s\"\n", args->pfm+ibeg);
307         args->trio[0].idx[iFATHER] = father;
308         args->trio[0].idx[iMOTHER] = mother;
309         args->trio[0].idx[iCHILD]  = child;
310     }
311     parse_filters(args);
312 
313     int i;
314     if ( !args->nflt_str )
315     {
316         args->filters = (flt_stats_t*) calloc(1, sizeof(flt_stats_t));
317         args->nfilters = 1;
318         args->filters[0].expr = strdup("all");
319     }
320     else
321     {
322         args->nfilters = args->nflt_str;
323         args->filters = (flt_stats_t*) calloc(args->nfilters, sizeof(flt_stats_t));
324         for (i=0; i<args->nfilters; i++)
325         {
326             args->filters[i].filter = filter_init(args->hdr, args->flt_str[i]);
327             args->filters[i].expr   = strdup(args->flt_str[i]);
328 
329             // replace tab's with spaces so that the output stays parsable
330             char *tmp = args->filters[i].expr;
331             while ( *tmp )
332             {
333                 if ( *tmp=='\t' ) *tmp = ' ';
334                 tmp++;
335             }
336         }
337     }
338     for (i=0; i<args->nfilters; i++)
339         args->filters[i].stats = (trio_stats_t*) calloc(args->ntrio,sizeof(trio_stats_t));
340 
341     args->fp_out = !args->output_fname || !strcmp("-",args->output_fname) ? stdout : fopen(args->output_fname,"w");
342     if ( !args->fp_out ) error("Could not open the file for writing: %s\n", args->output_fname);
343     fprintf(args->fp_out,"# CMD line shows the command line used to generate this output\n");
344     fprintf(args->fp_out,"# DEF lines define expressions for all tested thresholds\n");
345     fprintf(args->fp_out,"# FLT* lines report numbers for every threshold and every trio:\n");
346     i = 0;
347     fprintf(args->fp_out,"#   %d) filter id\n", ++i);
348     fprintf(args->fp_out,"#   %d) child\n", ++i);
349     fprintf(args->fp_out,"#   %d) father\n", ++i);
350     fprintf(args->fp_out,"#   %d) mother\n", ++i);
351     fprintf(args->fp_out,"#   %d) number of valid trio genotypes (all trio members pass filters, all non-missing)\n", ++i);
352     fprintf(args->fp_out,"#   %d) number of non-reference trio GTs (at least one trio member carries an alternate allele)\n", ++i);
353     fprintf(args->fp_out,"#   %d) number of DNMs/Mendelian errors\n", ++i);
354     fprintf(args->fp_out,"#   %d) number of novel singleton alleles in the child (counted also as DNM / Mendelian error)\n", ++i);
355     fprintf(args->fp_out,"#   %d) number of untransmitted trio singletons (one alternate allele present in one parent)\n", ++i);
356     fprintf(args->fp_out,"#   %d) number of transmitted trio singletons (one alternate allele present in one parent and the child)\n", ++i);
357     fprintf(args->fp_out,"#   %d) number of transitions, all distinct ALT alleles present in the trio are considered\n", ++i);
358     fprintf(args->fp_out,"#   %d) number of transversions, all distinct ALT alleles present in the trio are considered\n", ++i);
359     fprintf(args->fp_out,"#   %d) overall ts/tv, all distinct ALT alleles present in the trio are considered\n", ++i);
360     fprintf(args->fp_out,"#   %d) number of homozygous DNMs/Mendelian errors (likely genotyping errors)\n", ++i);
361     fprintf(args->fp_out,"#   %d) number of recurrent DNMs/Mendelian errors (non-inherited alleles present in other samples; counts GTs, not sites)\n", ++i);
362     fprintf(args->fp_out, "CMD\t%s", args->argv[0]);
363     for (i=1; i<args->argc; i++) fprintf(args->fp_out, " %s",args->argv[i]);
364     fprintf(args->fp_out, "\n");
365 }
alt_trios_reset(args_t * args,int nals)366 static void alt_trios_reset(args_t *args, int nals)
367 {
368     int i;
369     hts_expand0(alt_trios_t, nals, args->malt_trios, args->alt_trios);
370     for (i=0; i<nals; i++)
371     {
372         alt_trios_t *tr = &args->alt_trios[i];
373         if ( !tr->idx )
374         {
375             tr->idx = (uint32_t*)malloc(sizeof(*tr->idx)*args->ntrio);
376             tr->sd_bset = kbs_init(args->ntrio);
377         }
378         else
379             kbs_clear(tr->sd_bset);
380         tr->nsd  = 0;
381         tr->nalt = 0;
382     }
383 }
alt_trios_destroy(args_t * args)384 static void alt_trios_destroy(args_t *args)
385 {
386     if ( !args->max_alt_trios ) return;
387     int i;
388     for (i=0; i<args->malt_trios; i++)
389     {
390         free(args->alt_trios[i].idx);
391         kbs_destroy(args->alt_trios[i].sd_bset);
392     }
393     free(args->alt_trios);
394 }
alt_trios_add(args_t * args,int itrio,int ial,int is_singleton)395 static inline void alt_trios_add(args_t *args, int itrio, int ial, int is_singleton)
396 {
397     alt_trios_t *tr = &args->alt_trios[ial];
398     if ( is_singleton ) kbs_insert(tr->sd_bset, tr->nsd);
399     tr->idx[ tr->nsd++ ] = itrio;
400 }
destroy_data(args_t * args)401 static void destroy_data(args_t *args)
402 {
403     int i;
404     for (i=0; i<args->nfilters; i++)
405     {
406         if ( args->filters[i].filter ) filter_destroy(args->filters[i].filter);
407         free(args->filters[i].stats);
408         free(args->filters[i].expr);
409     }
410     free(args->filters);
411     for (i=0; i<args->nflt_str; i++) free(args->flt_str[i]);
412     free(args->flt_str);
413     bcf_sr_destroy(args->sr);
414     alt_trios_destroy(args);
415     free(args->trio);
416     free(args->ac);
417     free(args->ac_trio);
418     free(args->gt_arr);
419     free(args->dnm_als);
420     if ( fclose(args->fp_out)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? "stdout" : args->output_fname);
421     free(args);
422 }
report_stats(args_t * args)423 static void report_stats(args_t *args)
424 {
425     int i = 0,j;
426     for (i=0; i<args->nfilters; i++)
427     {
428         flt_stats_t *flt = &args->filters[i];
429         fprintf(args->fp_out,"DEF\tFLT%d\t%s\n", i, flt->expr);
430     }
431     for (i=0; i<args->nfilters; i++)
432     {
433         flt_stats_t *flt = &args->filters[i];
434         for (j=0; j<args->ntrio; j++)
435         {
436             fprintf(args->fp_out,"FLT%d", i);
437             fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iCHILD]]);
438             fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iFATHER]]);
439             fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iMOTHER]]);
440             trio_stats_t *stats = &flt->stats[j];
441             fprintf(args->fp_out,"\t%d", stats->npass);
442             fprintf(args->fp_out,"\t%d", stats->nnon_ref);
443             fprintf(args->fp_out,"\t%d", stats->nmendel_err);
444             fprintf(args->fp_out,"\t%d", stats->nnovel);
445             fprintf(args->fp_out,"\t%d", stats->nsingleton);
446             fprintf(args->fp_out,"\t%d", stats->ndoubleton);
447             fprintf(args->fp_out,"\t%d", stats->nts);
448             fprintf(args->fp_out,"\t%d", stats->ntv);
449             fprintf(args->fp_out,"\t%.2f", stats->ntv ? (float)stats->nts/stats->ntv : INFINITY);
450             fprintf(args->fp_out,"\t%d", stats->ndnm_hom);
451             fprintf(args->fp_out,"\t%d", stats->ndnm_recurrent);
452             fprintf(args->fp_out,"\n");
453         }
454     }
455 }
456 
parse_genotype(int32_t * arr,int ngt1,int idx,int als[2])457 static inline int parse_genotype(int32_t *arr, int ngt1, int idx, int als[2])
458 {
459     int32_t *ptr = arr + ngt1 * idx;
460     if ( bcf_gt_is_missing(ptr[0]) ) return -1;
461     als[0] = bcf_gt_allele(ptr[0]);
462 
463     // treat haploid GTs as homozygous diploid
464     if ( ngt1==1 || ptr[1]==bcf_int32_vector_end ) { als[1] = als[0]; return 0; }
465 
466     if ( bcf_gt_is_missing(ptr[1]) ) return -1;
467     als[1] = bcf_gt_allele(ptr[1]);
468 
469     return 0;
470 }
471 
process_record(args_t * args,bcf1_t * rec,flt_stats_t * flt)472 static void process_record(args_t *args, bcf1_t *rec, flt_stats_t *flt)
473 {
474     int i,j;
475 
476     // Find out which trios pass and if the site passes
477     if ( flt->filter )
478     {
479         uint8_t *smpl_pass = NULL;
480         int pass_site = filter_test(flt->filter, rec, (const uint8_t**) &smpl_pass);
481         if ( args->filter_logic & FLT_EXCLUDE )
482         {
483             if ( pass_site )
484             {
485                 if ( !smpl_pass ) return;
486                 pass_site = 0;
487                 for (i=0; i<args->ntrio; i++)
488                 {
489                     int pass_trio = 1;
490                     for (j=0; j<3; j++)
491                     {
492                         int idx = args->trio[i].idx[j];
493                         if ( smpl_pass[idx] ) { pass_trio = 0; break; }
494                     }
495                     args->trio[i].pass = pass_trio;
496                     if ( pass_trio ) pass_site = 1;
497                 }
498                 if ( !pass_site ) return;
499             }
500             else
501                 for (i=0; i<args->ntrio; i++) args->trio[i].pass = 1;
502         }
503         else if ( !pass_site ) return;
504         else if ( smpl_pass )
505         {
506             pass_site = 0;
507             for (i=0; i<args->ntrio; i++)
508             {
509                 int pass_trio = 1;
510                 for (j=0; j<3; j++)
511                 {
512                     int idx = args->trio[i].idx[j];
513                     if ( !smpl_pass[idx] ) { pass_trio = 0; break; }
514                 }
515                 args->trio[i].pass = pass_trio;
516                 if ( pass_trio ) pass_site = 1;
517             }
518             if ( !pass_site ) return;
519         }
520         else
521             for (i=0; i<args->ntrio; i++) args->trio[i].pass = 1;
522     }
523 
524     // Find out the allele counts. Try to use INFO/AC, if not present, determine from the genotypes
525     hts_expand(int, rec->n_allele, args->mac, args->ac);
526     if ( !bcf_calc_ac(args->hdr, rec, args->ac, BCF_UN_INFO|BCF_UN_FMT) ) return;
527     hts_expand(int, rec->n_allele, args->mac_trio, args->ac_trio);
528     hts_expand(int, rec->n_allele, args->mdnm_als, args->dnm_als);
529 
530     // Get the genotypes
531     int ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
532     if ( ngt<0 ) return;
533     int ngt1 = ngt / rec->n_sample;
534 
535 
536     // For ts/tv: numeric code of the reference allele, -1 for insertions
537     int ref = !rec->d.allele[0][1] ? bcf_acgt2int(*rec->d.allele[0]) : -1;
538 
539     int star_allele = -1;
540     for (i=1; i<rec->n_allele; i++)
541         if ( !rec->d.allele[i][1] && rec->d.allele[i][0]=='*' ) { star_allele = i; break; }
542 
543     // number of non-reference trios
544     if ( args->max_alt_trios ) alt_trios_reset(args, rec->n_allele);
545 
546     // Run the stats
547     for (i=0; i<args->ntrio; i++)
548     {
549         if ( flt->filter && !args->trio[i].pass ) continue;
550         trio_stats_t *stats = &flt->stats[i];
551 
552         // Determine the alternate allele and the genotypes, skip if any of the alleles is missing.
553         // the order is: child, father, mother
554         int als[6], *als_child = als, *als_father = als+2, *als_mother = als+4;
555         if ( parse_genotype(args->gt_arr, ngt1, args->trio[i].idx[iCHILD], als_child) < 0 ) continue;
556         if ( parse_genotype(args->gt_arr, ngt1, args->trio[i].idx[iFATHER], als_father) < 0 ) continue;
557         if ( parse_genotype(args->gt_arr, ngt1, args->trio[i].idx[iMOTHER], als_mother) < 0 ) continue;
558 
559         stats->npass++;
560 
561         // Has the trio an alternate allele other than *?
562         int has_star_allele = 0, has_nonref = 0;
563         memset(args->ac_trio,0,rec->n_allele*sizeof(*args->ac_trio));
564         for (j=0; j<6; j++)
565         {
566             if ( als[j]==star_allele ) { has_star_allele = 1; continue; }
567             if ( als[j]!=0 ) has_nonref = 1;
568             args->ac_trio[ als[j] ]++;
569         }
570         if ( !has_nonref ) continue;   // only ref or * in this trio
571 
572         stats->nnon_ref++;
573 
574         // Calculate ts/tv. It does the right thing and handles also HetAA genotypes
575         if ( ref != -1 )
576         {
577             int has_ts = 0, has_tv = 0;
578             for (j=0; j<6; j++)
579             {
580                 if ( als[j]==0 || als[j]==star_allele ) continue;
581                 if ( als[j] >= rec->n_allele )
582                     error("The GT index is out of range at %s:%"PRId64" in %s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->hdr->samples[args->trio[i].idx[j/2]]);
583                 if ( rec->d.allele[als[j]][1] ) continue;
584 
585                 int alt = bcf_acgt2int(rec->d.allele[als[j]][0]);
586                 if ( abs(ref-alt)==2 ) has_ts = 1;
587                 else has_tv = 1;
588             }
589             if ( has_ts ) stats->nts++;
590             if ( has_tv ) stats->ntv++;
591         }
592 
593         // Skip some stats if the star allele is present as it was already checked at the primary record, we do not want to count the same
594         // thing multiple times. There can be other alternate allele, but we ignore that for simplicity.
595         if ( has_star_allele ) continue;
596 
597         // Detect mendelian errors
598         int a0F = als_child[0]==als_father[0] || als_child[0]==als_father[1] ? 1 : 0;
599         int a1M = als_child[1]==als_mother[0] || als_child[1]==als_mother[1] ? 1 : 0;
600         if ( !a0F || !a1M )
601         {
602             int a0M = als_child[0]==als_mother[0] || als_child[0]==als_mother[1] ? 1 : 0;
603             int a1F = als_child[1]==als_father[0] || als_child[1]==als_father[1] ? 1 : 0;
604             if ( !a0M || !a1F )
605             {
606                 stats->nmendel_err++;
607 
608                 int dnm_hom = 0;
609                 if ( als_child[0]==als_child[1] ) { stats->ndnm_hom++; dnm_hom = 1; }
610 
611                 int culprit;    // neglecting the unlikely possibility of alt het 1/2 DNM genotype
612                 if ( !a0F && !a0M ) culprit = als_child[0];
613                 else if ( !a1F && !a1M ) culprit = als_child[1];
614                 else if ( args->ac[als_child[0]] < args->ac[als_child[1]] ) culprit = als_child[0];
615                 else culprit = als_child[1];
616 
617                 int dnm_recurrent = 0;
618                 if ( (!dnm_hom && args->ac[culprit]>1) || (dnm_hom && args->ac[culprit]>2) ) { stats->ndnm_recurrent++; dnm_recurrent = 1; }
619 
620                 if ( args->verbose & VERBOSE_MENDEL )
621                     fprintf(args->fp_out,"MERR\t%s\t%"PRId64"\t%s\t%s\t%s\t%s\t%s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,
622                             args->hdr->samples[args->trio[i].idx[iCHILD]],
623                             args->hdr->samples[args->trio[i].idx[iFATHER]],
624                             args->hdr->samples[args->trio[i].idx[iMOTHER]],
625                             dnm_hom ? "HOM" : "-",
626                             dnm_recurrent ? "RECURRENT" : "-"
627                            );
628             }
629         }
630 
631         // Is this a singleton, doubleton, neither?
632         for (j=0; j<rec->n_allele; j++)
633         {
634             if ( !args->ac_trio[j] ) continue;
635             if ( args->max_alt_trios ) args->alt_trios[j].nalt++;
636 
637             if ( args->ac_trio[j]==1 )  // singleton (in parent) or novel (in child)
638             {
639                 if ( als_child[0]==j || als_child[1]==j ) stats->nnovel++;
640                 else
641                 {
642                     if ( !args->max_alt_trios )
643                     {
644                         stats->nsingleton++;
645                         if ( args->verbose & VERBOSE_TRANSMITTED )
646                             fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tNO\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,
647                                     args->hdr->samples[args->trio[i].idx[iCHILD]],
648                                     args->hdr->samples[args->trio[i].idx[iFATHER]],
649                                     args->hdr->samples[args->trio[i].idx[iMOTHER]]
650                                    );
651                     }
652                     else alt_trios_add(args, i,j,1);
653                 }
654             }
655             else if ( args->ac_trio[j]==2 ) // possibly a doubleton
656             {
657                 if ( (als_child[0]!=j && als_child[1]!=j) || (als_child[0]==j && als_child[1]==j) ) continue;
658                 if ( (als_father[0]==j && als_father[1]==j) || (als_mother[0]==j && als_mother[1]==j) ) continue;
659                 if ( !args->max_alt_trios )
660                 {
661                     stats->ndoubleton++;
662                     if ( args->verbose & VERBOSE_TRANSMITTED )
663                         fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tYES\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,
664                                 args->hdr->samples[args->trio[i].idx[iCHILD]],
665                                 args->hdr->samples[args->trio[i].idx[iFATHER]],
666                                 args->hdr->samples[args->trio[i].idx[iMOTHER]]
667                                );
668                 }
669                 else alt_trios_add(args, i,j,0);
670             }
671         }
672     }
673     if ( args->max_alt_trios )
674     {
675         for (j=0; j<rec->n_allele; j++)
676         {
677             alt_trios_t *tr = &args->alt_trios[j];
678             if ( !tr->nsd || tr->nalt > args->max_alt_trios ) continue;
679             for (i=0; i<tr->nsd; i++)
680             {
681                 int itr = tr->idx[i];
682                 trio_stats_t *stats = &flt->stats[itr];
683                 if ( kbs_exists(tr->sd_bset,i) )
684                 {
685                     stats->nsingleton++;
686                     if ( args->verbose & VERBOSE_TRANSMITTED )
687                         fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tNO\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,
688                                 args->hdr->samples[args->trio[itr].idx[iCHILD]],
689                                 args->hdr->samples[args->trio[itr].idx[iFATHER]],
690                                 args->hdr->samples[args->trio[itr].idx[iMOTHER]]
691                                );
692                 }
693                 else
694                 {
695                     stats->ndoubleton++;
696                     if ( args->verbose & VERBOSE_TRANSMITTED )
697                         fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tYES\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,
698                                 args->hdr->samples[args->trio[itr].idx[iCHILD]],
699                                 args->hdr->samples[args->trio[itr].idx[iFATHER]],
700                                 args->hdr->samples[args->trio[itr].idx[iMOTHER]]
701                                );
702                 }
703             }
704         }
705     }
706 }
707 
run(int argc,char ** argv)708 int run(int argc, char **argv)
709 {
710     args_t *args = (args_t*) calloc(1,sizeof(args_t));
711     args->argc   = argc; args->argv = argv;
712     args->output_fname = "-";
713     static struct option loptions[] =
714     {
715         {"debug",required_argument,0,'d'},
716         {"alt-trios",required_argument,0,'a'},
717         {"include",required_argument,0,'i'},
718         {"exclude",required_argument,0,'e'},
719         {"output",required_argument,NULL,'o'},
720         {"ped",required_argument,NULL,'p'},
721         {"pfm",required_argument,NULL,'P'},
722         {"regions",1,0,'r'},
723         {"regions-file",1,0,'R'},
724         {"targets",1,0,'t'},
725         {"targets-file",1,0,'T'},
726         {NULL,0,NULL,0}
727     };
728     int c, i;
729     while ((c = getopt_long(argc, argv, "P:p:o:s:i:e:r:R:t:T:a:d:",loptions,NULL)) >= 0)
730     {
731         switch (c)
732         {
733             case 'd':
734             {
735                 int n;
736                 char **tmp = hts_readlist(optarg, 0, &n);
737                 for(i=0; i<n; i++)
738                 {
739                     if ( !strcasecmp(tmp[i],"mendel-errors") ) args->verbose |= VERBOSE_MENDEL;
740                     else if ( !strcasecmp(tmp[i],"transmitted") ) args->verbose |= VERBOSE_TRANSMITTED;
741                     else error("Error: The argument \"%s\" to option --debug is not recognised\n", tmp[i]);
742                     free(tmp[i]);
743                 }
744                 free(tmp);
745                 break;
746             }
747             case 'a': args->max_alt_trios = atoi(optarg); break;
748             case 'e':
749                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
750                 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
751             case 'i':
752                 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
753                 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
754             case 't': args->targets = optarg; break;
755             case 'T': args->targets = optarg; args->targets_is_file = 1; break;
756             case 'r': args->regions = optarg; break;
757             case 'R': args->regions = optarg; args->regions_is_file = 1; break;
758             case 'o': args->output_fname = optarg; break;
759             case 'p': args->ped_fname = optarg; break;
760             case 'P': args->pfm = optarg; break;
761             case 'h':
762             case '?':
763             default: error("%s", usage_text()); break;
764         }
765     }
766     if ( optind==argc )
767     {
768         if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-";  // reading from stdin
769         else { error("%s", usage_text()); }
770     }
771     else if ( optind+1!=argc ) error("%s", usage_text());
772     else args->fname = argv[optind];
773 
774     if ( !args->ped_fname && !args->pfm ) error("Missing the -p or -P option\n");
775 
776     init_data(args);
777 
778     while ( bcf_sr_next_line(args->sr) )
779     {
780         bcf1_t *rec = bcf_sr_get_line(args->sr,0);
781         for (i=0; i<args->nfilters; i++)
782             process_record(args, rec, &args->filters[i]);
783     }
784 
785     report_stats(args);
786     destroy_data(args);
787 
788     return 0;
789 }
790