1 /*  vcfgtcheck.c -- Check sample identity.
2 
3     Copyright (C) 2013-2021 Genome Research Ltd.
4 
5     Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.  */
24 
25 #include <stdio.h>
26 #include <stdarg.h>
27 #include <unistd.h>
28 #include <getopt.h>
29 #include <assert.h>
30 #include <ctype.h>
31 #include <string.h>
32 #include <strings.h>
33 #include <errno.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
36 #include <math.h>
37 #include <htslib/vcf.h>
38 #include <htslib/synced_bcf_reader.h>
39 #include <htslib/vcfutils.h>
40 #include <htslib/kbitset.h>
41 #include <htslib/hts_os.h>
42 #include <inttypes.h>
43 #include <sys/time.h>
44 #include "bcftools.h"
45 #include "extsort.h"
46 //#include "hclust.h"
47 
48 typedef struct
49 {
50     int iqry, igt;
51 }
52 pair_t;
53 
54 typedef struct
55 {
56     bcf_srs_t *files;           // first reader is the query VCF - single sample normally or multi-sample for cross-check
57     bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF
58     char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples;
59     int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
60     int regions_overlap, targets_overlap;
61     int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
62     double *pdiff, *qry_prob, *gt_prob;
63     uint32_t *ndiff,*ncnt,ncmp, npairs;
64     int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
65     uint8_t *qry_dsg, *gt_dsg;
66     pair_t *pairs;
67     double *hwe_prob, dsg2prob[8][3], pl2prob[256];
68     double min_inter_err, max_intra_err;
69     int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, use_PLs;
70     FILE *fp;
71     unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL;
72 
73     // for --distinctive-sites
74     double distinctive_sites;
75     kbitset_t *kbs_diff;
76     size_t diff_sites_size;
77     extsort_t *es;
78     char *es_tmp_prefix, *es_max_mem;
79 }
80 args_t;
81 
set_cwd(args_t * args)82 static void set_cwd(args_t *args)
83 {
84     int i;
85     char *buf;
86     size_t nbuf = 500;
87     args->cwd = (char*) malloc(sizeof(char)*nbuf);
88     for (i=0; i<5; i++)
89     {
90         if ( (buf = getcwd(args->cwd, nbuf)) ) break;
91         nbuf *= 2;
92         args->cwd = (char*) realloc(args->cwd, sizeof(char)*nbuf);
93     }
94     assert(buf);
95 }
print_header(args_t * args,FILE * fp)96 static void print_header(args_t *args, FILE *fp)
97 {
98     fprintf(fp, "# This file was produced by bcftools (%s+htslib-%s), the command line was:\n", bcftools_version(), hts_version());
99     fprintf(fp, "# \t bcftools %s ", args->argv[0]);
100     int i;
101     for (i=1; i<args->argc; i++)
102         fprintf(fp, " %s",args->argv[i]);
103     fprintf(fp, "\n# and the working directory was:\n");
104     fprintf(fp, "# \t %s\n#\n", args->cwd);
105 }
106 
cmp_int(const void * _a,const void * _b)107 static int cmp_int(const void *_a, const void *_b)
108 {
109     int a = *((int*)_a);
110     int b = *((int*)_b);
111     if ( a < b ) return -1;
112     if ( a > b ) return 1;
113     return 0;
114 }
cmp_pair(const void * _a,const void * _b)115 static int cmp_pair(const void *_a, const void *_b)
116 {
117     pair_t *a = (pair_t*)_a;
118     pair_t *b = (pair_t*)_b;
119     if ( a->iqry < b->iqry ) return -1;
120     if ( a->iqry > b->iqry ) return 1;
121     if ( a->igt < b->igt ) return -1;
122     if ( a->igt > b->igt ) return 1;
123     return 0;
124 }
125 
126 typedef struct
127 {
128     uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosoms
129     unsigned long kbs_dat[1];
130 }
131 diff_sites_t;
132 #if DBG
diff_sites_debug_print(args_t * args,diff_sites_t * ds)133 static void diff_sites_debug_print(args_t *args, diff_sites_t *ds)
134 {
135     int i;
136     memcpy(args->kbs_diff->b,ds->kbs_dat,args->kbs_diff->n*sizeof(unsigned long));
137     fprintf(stderr,"%s:%d\t%d\t",bcf_hdr_id2name(args->qry_hdr,ds->rid),ds->pos+1,ds->ndiff);
138     for (i=0; i<args->npairs; i++) fprintf(stderr,"%d",kbs_exists(args->kbs_diff,i)?1:0);
139     fprintf(stderr,"\n");
140 }
141 #endif
diff_sites_cmp(const void * aptr,const void * bptr)142 static int diff_sites_cmp(const void *aptr, const void *bptr)
143 {
144     diff_sites_t *a = *((diff_sites_t**)aptr);
145     diff_sites_t *b = *((diff_sites_t**)bptr);
146     if ( a->ndiff < b->ndiff ) return 1;        // descending order
147     if ( a->ndiff > b->ndiff ) return -1;
148     if ( a->rand < b->rand ) return -1;
149     if ( a->rand > b->rand ) return 1;
150     return 0;
151 }
diff_sites_init(args_t * args)152 static void diff_sites_init(args_t *args)
153 {
154     int nsites = args->distinctive_sites<=1 ? args->npairs*args->distinctive_sites : args->distinctive_sites;
155     if ( nsites<=0 ) error("The value for --distinctive-sites was set too low: %d\n",nsites);
156     if ( nsites > args->npairs )
157     {
158         fprintf(stderr,"Warning: The value for --distinctive-sites is bigger than is the number of pairs, all discordant sites be printed.\n");
159         nsites = args->npairs;
160         args->distinctive_sites = args->npairs + 1;
161     }
162     else
163         args->distinctive_sites = nsites;
164     args->kbs_diff = kbs_init(args->npairs);
165     size_t n = (args->npairs + KBS_ELTBITS-1) / KBS_ELTBITS;
166     assert( n==args->kbs_diff->n );
167     args->diff_sites_size = sizeof(diff_sites_t) + (n-1)*sizeof(unsigned long);
168     args->es = extsort_alloc();
169     extsort_set_opt(args->es,size_t,DAT_SIZE,args->diff_sites_size);
170     extsort_set_opt(args->es,const char*,TMP_PREFIX,args->es_tmp_prefix);
171     extsort_set_opt(args->es,const char*,MAX_MEM,args->es_max_mem);
172     extsort_set_opt(args->es,extsort_cmp_f,FUNC_CMP,diff_sites_cmp);
173     extsort_init(args->es);
174 }
diff_sites_destroy(args_t * args)175 static void diff_sites_destroy(args_t *args)
176 {
177     kbs_destroy(args->kbs_diff);
178     extsort_destroy(args->es);
179 }
diff_sites_reset(args_t * args)180 static inline void diff_sites_reset(args_t *args)
181 {
182     kbs_clear(args->kbs_diff);
183 }
diff_sites_push(args_t * args,int ndiff,int rid,int pos)184 static inline void diff_sites_push(args_t *args, int ndiff, int rid, int pos)
185 {
186     diff_sites_t *dat = (diff_sites_t*) malloc(args->diff_sites_size);
187     memset(dat,0,sizeof(*dat)); // for debugging: prevent warnings about uninitialized memory coming from struct padding (not needed after rand added)
188     dat->ndiff = ndiff;
189     dat->rid  = rid;
190     dat->pos  = pos;
191     dat->rand = hts_lrand48();
192     memcpy(dat->kbs_dat,args->kbs_diff->b,args->kbs_diff->n*sizeof(unsigned long));
193     extsort_push(args->es,dat);
194 }
diff_sites_shift(args_t * args,int * ndiff,int * rid,int * pos)195 static inline int diff_sites_shift(args_t *args, int *ndiff, int *rid, int *pos)
196 {
197     diff_sites_t *dat = (diff_sites_t*) extsort_shift(args->es);
198     if ( !dat ) return 0;
199     *ndiff = dat->ndiff;
200     *rid   = dat->rid;
201     *pos   = dat->pos;
202     memcpy(args->kbs_diff->b,dat->kbs_dat,args->kbs_diff->n*sizeof(unsigned long));
203     return 1;
204 }
205 
init_samples(char * list,int list_is_file,int ** smpl,int * nsmpl,bcf_hdr_t * hdr,char * vcf_fname)206 static void init_samples(char *list, int list_is_file, int **smpl, int *nsmpl, bcf_hdr_t *hdr, char *vcf_fname)
207 {
208     int i;
209     if ( !strcmp(list,"-") )
210     {
211         *nsmpl = bcf_hdr_nsamples(hdr);
212         *smpl  = (int*) malloc(sizeof(**smpl)*(*nsmpl));
213         for (i=0; i<*nsmpl; i++) (*smpl)[i] = i;
214         return;
215     }
216 
217     char **tmp = hts_readlist(list, list_is_file, nsmpl);
218     if ( !tmp || !*nsmpl ) error("Failed to parse %s\n", list);
219     *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl));
220     for (i=0; i<*nsmpl; i++)
221     {
222         int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, tmp[i]);
223         if ( idx<0 ) error("No such sample in %s: [%s]\n",vcf_fname,tmp[i]);
224         (*smpl)[i] = idx;
225         free(tmp[i]);
226     }
227     free(tmp);
228     qsort(*smpl,*nsmpl,sizeof(**smpl),cmp_int);
229     // check for duplicates
230     for (i=1; i<*nsmpl; i++)
231         if ( (*smpl)[i-1]==(*smpl)[i] )
232             error("Error: the sample \"%s\" is listed twice in %s\n", hdr->samples[(*smpl)[i]],list);
233 }
234 
init_data(args_t * args)235 static void init_data(args_t *args)
236 {
237     hts_srand48(0);
238 
239     args->files = bcf_sr_init();
240     if ( args->regions )
241     {
242         bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
243         if ( bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions);
244     }
245     if ( args->targets )
246     {
247         bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
248         if ( bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets);
249     }
250 
251     if ( args->gt_fname ) bcf_sr_set_opt(args->files, BCF_SR_REQUIRE_IDX);
252     if ( !bcf_sr_add_reader(args->files,args->qry_fname) ) error("Failed to open %s: %s\n", args->qry_fname,bcf_sr_strerror(args->files->errnum));
253     if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) )
254         error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum));
255 
256     args->qry_hdr = bcf_sr_get_header(args->files,0);
257     if ( !bcf_hdr_nsamples(args->qry_hdr) ) error("No samples in %s?\n", args->qry_fname);
258     if ( args->gt_fname )
259     {
260         args->gt_hdr = bcf_sr_get_header(args->files,1);
261         if ( !bcf_hdr_nsamples(args->gt_hdr) ) error("No samples in %s?\n", args->gt_fname);
262     }
263 
264     // Determine whether GT or PL will be used
265     if ( args->qry_use_GT==-1 ) // not set by -u, qry uses PL by default
266     {
267         if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")>=0 )
268             args->qry_use_GT = 0;
269         else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")>=0 )
270             args->qry_use_GT = 1;
271         else
272             error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->qry_fname);
273     }
274     else if ( args->qry_use_GT==1 )
275     {
276         if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")<0 )
277             error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->qry_fname);
278     }
279     else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")<0 )
280         error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->qry_fname);
281 
282     if ( args->gt_hdr )
283     {
284         if ( args->gt_use_GT==-1 ) // not set by -u, gt uses GT by default
285         {
286             if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")>=0 )
287                 args->gt_use_GT = 1;
288             else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")>=0 )
289                 args->gt_use_GT = 0;
290             else
291                 error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->gt_fname);
292         }
293         else if ( args->gt_use_GT==1 )
294         {
295             if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")<0 )
296                 error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->gt_fname);
297         }
298         else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")<0 )
299             error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->gt_fname);
300     }
301     else
302         args->gt_use_GT = args->qry_use_GT;
303 
304     // Prepare samples
305     int i,j;
306     args->nqry_smpl = bcf_hdr_nsamples(args->qry_hdr);
307     if ( args->qry_samples )
308     {
309         init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname);
310     }
311     if ( args->gt_samples )
312     {
313         init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl,
314             args->gt_hdr ? args->gt_hdr : args->qry_hdr,
315             args->gt_fname ? args->gt_fname : args->qry_fname);
316     }
317     else if ( args->pair_samples )
318     {
319         int npairs;
320         char **tmp = hts_readlist(args->pair_samples, args->pair_samples_is_file, &npairs);
321         if ( !tmp || !npairs ) error("Failed to parse %s\n", args->pair_samples);
322         if ( !args->pair_samples_is_file && npairs%2 ) error("Expected even number of comma-delimited samples with -p\n");
323         args->npairs = args->pair_samples_is_file ? npairs : npairs/2;
324         args->pairs  = (pair_t*) calloc(args->npairs,sizeof(*args->pairs));
325         if ( !args->pair_samples_is_file )
326         {
327             for (i=0; i<args->npairs; i++)
328             {
329                 args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i]);
330                 args->pairs[i].igt  = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i+1]);
331                 if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[2*i]);
332                 if ( args->pairs[i].igt  < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,tmp[2*i+1]);
333                 free(tmp[2*i]);
334                 free(tmp[2*i+1]);
335             }
336         }
337         else
338         {
339             for (i=0; i<args->npairs; i++)
340             {
341                 char *ptr = tmp[i];
342                 while ( *ptr && !isspace(*ptr) ) ptr++;
343                 if ( !*ptr ) error("Could not parse %s: %s\n",args->pair_samples,tmp[i]);
344                 *ptr = 0;
345                 args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[i]);
346                 if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[i]);
347                 ptr++;
348                 while ( *ptr && isspace(*ptr) ) ptr++;
349                 args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, ptr);
350                 if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,ptr);
351                 free(tmp[i]);
352             }
353         }
354         free(tmp);
355         qsort(args->pairs,args->npairs,sizeof(*args->pairs),cmp_pair);
356     }
357     else if ( args->gt_hdr )
358         args->ngt_smpl = bcf_hdr_nsamples(args->gt_hdr);
359     if ( !args->ngt_smpl )
360     {
361         args->ngt_smpl = args->nqry_smpl;
362         args->gt_smpl  = args->qry_smpl;
363         args->cross_check = 1;
364     }
365 
366     // The data arrays
367     if ( !args->npairs ) args->npairs = args->cross_check ? args->nqry_smpl*(args->nqry_smpl+1)/2 : args->ngt_smpl*args->nqry_smpl;
368     if ( !args->pair_samples )
369     {
370         args->qry_dsg = (uint8_t*) malloc(args->nqry_smpl);
371         args->gt_dsg  = args->cross_check ? args->qry_dsg : (uint8_t*) malloc(args->ngt_smpl);
372     }
373     if ( args->use_PLs )
374     {
375         args->pdiff = (double*) calloc(args->npairs,sizeof(*args->pdiff));      // log probability of pair samples being the same
376         args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob));
377         args->gt_prob  = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
378 
379         // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing
380         // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
381         // probabilities of 0/0, 0/1, and 1/1 genotypes
382         for (i=0; i<8; i++)
383             for (j=0; j<3; j++)
384                 args->dsg2prob[i][j] = HUGE_VAL;
385         args->dsg2prob[1][0] = -log(1-pow(10,-0.1*args->use_PLs));
386         args->dsg2prob[1][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
387         args->dsg2prob[1][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
388         args->dsg2prob[2][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
389         args->dsg2prob[2][1] = -log(1-pow(10,-0.1*args->use_PLs));
390         args->dsg2prob[2][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
391         args->dsg2prob[4][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
392         args->dsg2prob[4][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
393         args->dsg2prob[4][2] = -log(1-pow(10,-0.1*args->use_PLs));
394 
395         // lookup table to avoid exponentiation
396         for (i=0; i<256; i++) args->pl2prob[i] = pow(10,-0.1*i);
397     }
398     else
399         args->ndiff = (uint32_t*) calloc(args->npairs,sizeof(*args->ndiff));    // number of differing genotypes for each pair of samples
400     args->ncnt  = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt));         // number of comparisons performed (non-missing data)
401     if ( !args->ncnt ) error("Error: failed to allocate %.1f Mb\n", args->npairs*sizeof(*args->ncnt)/1e6);
402     if ( args->calc_hwe_prob )
403     {
404         // prob of the observed sequence of matches given site AFs and HWE
405         args->hwe_prob = (double*) calloc(args->npairs,sizeof(*args->hwe_prob));
406         if ( !args->hwe_prob ) error("Error: failed to allocate %.1f Mb. Run with --no-HWE-prob to save some memory.\n", args->npairs*sizeof(*args->hwe_prob)/1e6);
407     }
408 
409     if ( args->distinctive_sites ) diff_sites_init(args);
410 
411     args->fp = stdout;
412     print_header(args, args->fp);
413 }
414 
destroy_data(args_t * args)415 static void destroy_data(args_t *args)
416 {
417     if ( args->gt_dsg!=args->qry_dsg ) free(args->gt_dsg);
418     free(args->qry_dsg);
419     if ( args->gt_prob!=args->qry_prob ) free(args->gt_prob);
420     free(args->qry_prob);
421     free(args->es_max_mem);
422     fclose(args->fp);
423     if ( args->distinctive_sites ) diff_sites_destroy(args);
424     free(args->hwe_prob);
425     free(args->cwd);
426     free(args->qry_arr);
427     if ( args->gt_hdr ) free(args->gt_arr);
428     free(args->pdiff);
429     free(args->ndiff);
430     free(args->ncnt);
431     free(args->qry_smpl);
432     if ( args->gt_smpl!=args->qry_smpl ) free(args->gt_smpl);
433     free(args->pairs);
434     bcf_sr_destroy(args->files);
435 }
436 
gt_to_dsg(int32_t * ptr)437 static inline uint8_t gt_to_dsg(int32_t *ptr)
438 {
439     if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end ) return 0;
440     uint8_t dsg = (bcf_gt_allele(ptr[0])?1:0) + (bcf_gt_allele(ptr[1])?1:0);
441     return 1<<dsg;
442 }
pl_to_dsg(int32_t * ptr)443 static inline uint8_t pl_to_dsg(int32_t *ptr)
444 {
445     if ( ptr[0]==bcf_int32_missing || ptr[1]==bcf_int32_missing || ptr[2]==bcf_int32_missing ) return 0;
446     if ( ptr[1]==bcf_int32_vector_end || ptr[2]==bcf_int32_vector_end ) return 0;
447     int min_pl = ptr[0]<ptr[1] ? (ptr[0]<ptr[2]?ptr[0]:ptr[2]) : (ptr[1]<ptr[2]?ptr[1]:ptr[2]);
448     uint8_t dsg = 0;
449     if ( ptr[0]==min_pl ) dsg |= 1;
450     if ( ptr[1]==min_pl ) dsg |= 2;
451     if ( ptr[2]==min_pl ) dsg |= 4;
452     return dsg;
453 }
gt_to_prob(args_t * args,int32_t * ptr,double * prob)454 static inline uint8_t gt_to_prob(args_t *args, int32_t *ptr, double *prob)
455 {
456     uint8_t dsg = gt_to_dsg(ptr);
457     if ( dsg )
458     {
459         prob[0] = args->dsg2prob[dsg][0];
460         prob[1] = args->dsg2prob[dsg][1];
461         prob[2] = args->dsg2prob[dsg][2];
462     }
463     return dsg;
464 }
pl_to_prob(args_t * args,int32_t * ptr,double * prob)465 static inline uint8_t pl_to_prob(args_t *args, int32_t *ptr, double *prob)
466 {
467     uint8_t dsg = pl_to_dsg(ptr);
468     if ( dsg )
469     {
470         prob[0] = (ptr[0]>=0 && ptr[0]<255) ? args->pl2prob[ptr[0]] : args->pl2prob[255];
471         prob[1] = (ptr[1]>=0 && ptr[1]<255) ? args->pl2prob[ptr[1]] : args->pl2prob[255];
472         prob[2] = (ptr[2]>=0 && ptr[2]<255) ? args->pl2prob[ptr[2]] : args->pl2prob[255];
473         double sum = prob[0] + prob[1] + prob[2];
474         prob[0] /= sum;
475         prob[1] /= sum;
476         prob[2] /= sum;
477         prob[0] = -log(prob[0]);
478         prob[1] = -log(prob[1]);
479         prob[2] = -log(prob[2]);
480     }
481     return dsg;
482 }
set_data(args_t * args,bcf_hdr_t * hdr,bcf1_t * rec,int32_t ** arr,int32_t * narr,int * narr1,int * use_GT)483 static int set_data(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec, int32_t **arr, int32_t *narr, int *narr1, int *use_GT)
484 {
485     static int warn_dip_GT = 1;
486     static int warn_dip_PL = 1;
487     int i;
488     for (i=0; i<2; i++)
489     {
490         if ( *use_GT )
491         {
492             int ret = bcf_get_genotypes(hdr,rec,arr,narr);
493             if ( ret < 0 )
494             {
495                 if ( !i ) { *use_GT = 0; continue; }
496                 args->nskip_no_data++;
497                 return -1;
498             }
499             if ( ret != 2*bcf_hdr_nsamples(hdr) )
500             {
501                 if ( warn_dip_GT )
502                 {
503                     fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/GT fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1);
504                     warn_dip_GT = 0;
505                 }
506                 args->nskip_dip_GT++;
507                 return -1;
508             }
509             *narr1 = 2;
510             return 0;
511         }
512 
513         int ret = bcf_get_format_int32(hdr,rec,"PL",arr,narr);
514         if ( ret < 0 )
515         {
516             if ( !i ) { *use_GT = 1; continue; }
517             args->nskip_no_data++;
518             return -1;
519         }
520         if ( ret != 3*bcf_hdr_nsamples(hdr) )
521         {
522             if ( warn_dip_PL )
523             {
524                 fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/PL fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1);
525                 warn_dip_PL = 0;
526             }
527             args->nskip_dip_PL++;
528             return -1;
529         }
530         *narr1 = 3;
531         return 0;
532     }
533     return -1;  // should never reach
534 }
process_line(args_t * args)535 static void process_line(args_t *args)
536 {
537     int i,j,k, nqry1, ngt1, ret;
538 
539     bcf1_t *gt_rec = NULL, *qry_rec = bcf_sr_get_line(args->files,0);   // the query file
540     int qry_use_GT = args->qry_use_GT;
541     int gt_use_GT  = args->gt_use_GT;
542 
543     ret = set_data(args, args->qry_hdr, qry_rec, &args->qry_arr, &args->nqry_arr, &nqry1, &qry_use_GT);
544     if ( ret<0 ) return;
545 
546     if ( args->gt_hdr )
547     {
548         gt_rec = bcf_sr_get_line(args->files,1);
549         ret = set_data(args, args->gt_hdr, gt_rec, &args->gt_arr, &args->ngt_arr, &ngt1, &gt_use_GT);
550         if ( ret<0 ) return;
551     }
552     else
553     {
554         ngt1 = nqry1;
555         args->gt_arr = args->qry_arr;
556     }
557 
558     args->ncmp++;
559 
560     double af,hwe_dsg[8];
561     if ( args->calc_hwe_prob )
562     {
563         int ac[2];
564         if ( args->gt_hdr )
565         {
566             if ( bcf_calc_ac(args->gt_hdr, gt_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
567         }
568         else if ( bcf_calc_ac(args->qry_hdr, qry_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
569 
570         // hwe indexes correspond to the bitmask of eight dsg combinations to account for PL uncertainty
571         // for in the extreme case we can have uninformative PL=0,0,0. So the values are the minima of e.g.
572         //      hwe[1,2,4] ..  dsg=0,1,2
573         //      hwe[3]     ..  dsg=0 or 1
574         //      hwe[6]     ..  dsg=1 or 2
575 
576         double hwe[3];
577         const double min_af = 1e-5;             // cap the AF in case we get unrealistic values
578         af = (double)ac[1]/(ac[0]+ac[1]);
579         hwe[0] = af>min_af ? -log(af*af) : -log(min_af*min_af);
580         hwe[1] = af>min_af && af<1-min_af ? -log(2*af*(1-af)) : -log(2*min_af*(1-min_af));
581         hwe[2] = af<(1-min_af) ? -log((1-af)*(1-af)) : -log(min_af*min_af);
582         hwe_dsg[0] = 0;
583         for (i=1; i<8; i++)
584         {
585             hwe_dsg[i] = HUGE_VAL;
586             for (k=0; k<3; k++)
587             {
588                 if ( ((1<<k)&i) && hwe_dsg[i] > hwe[k] ) hwe_dsg[i] = hwe[k];
589             }
590         }
591     }
592 
593     // The sample pairs were given explicitly via -p/-P options
594     if ( args->pairs )
595     {
596         if ( !args->use_PLs )
597         {
598             int ndiff = 0;
599             if ( args->kbs_diff ) diff_sites_reset(args);
600 
601             for (i=0; i<args->npairs; i++)
602             {
603                 int32_t *ptr;
604                 uint8_t qry_dsg, gt_dsg;
605 
606                 ptr = args->gt_arr + args->pairs[i].igt*ngt1;
607                 gt_dsg = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
608                 if ( !gt_dsg ) continue;                        // missing value
609                 if ( args->hom_only && !(gt_dsg&5) ) continue;  // not a hom
610 
611                 ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
612                 qry_dsg = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
613                 if ( !qry_dsg ) continue;                       // missing value
614 
615                 int match = qry_dsg & gt_dsg;
616                 if ( !match )
617                 {
618                     args->ndiff[i]++;
619                     if ( args->kbs_diff ) { ndiff++; kbs_insert(args->kbs_diff, i); }
620                 }
621                 else if ( args->calc_hwe_prob ) args->hwe_prob[i] += hwe_dsg[match];
622                 args->ncnt[i]++;
623             }
624 
625             if ( ndiff ) diff_sites_push(args, ndiff, qry_rec->rid, qry_rec->pos);
626         }
627         else    // use_PLs set
628         {
629             for (i=0; i<args->npairs; i++)
630             {
631                 int32_t *ptr;
632                 double qry_prob[3], gt_prob[3];
633                 uint8_t qry_dsg, gt_dsg;
634 
635                 ptr = args->gt_arr + args->pairs[i].igt*ngt1;
636                 gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob);
637                 if ( !gt_dsg ) continue;                        // missing value
638                 if ( args->hom_only && !(gt_dsg&5) ) continue;  // not a hom
639 
640                 ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
641                 qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob);
642                 if ( !qry_dsg ) continue;                       // missing value
643 
644                 double min = qry_prob[0] + gt_prob[0];
645                 qry_prob[1] += gt_prob[1];
646                 if ( min > qry_prob[1] ) min = qry_prob[1];
647                 qry_prob[2] += gt_prob[2];
648                 if ( min > qry_prob[2] ) min = qry_prob[2];
649                 args->pdiff[i] += min;
650 
651                 if ( args->calc_hwe_prob )
652                 {
653                     int match = qry_dsg & gt_dsg;
654                     args->hwe_prob[i] += hwe_dsg[match];
655                 }
656                 args->ncnt[i]++;
657             }
658         }
659         return;
660     }
661 
662     int idx=0;
663     if ( !args->use_PLs )
664     {
665         for (i=0; i<args->nqry_smpl; i++)
666         {
667             int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
668             int32_t *ptr = args->qry_arr + nqry1*iqry;
669             args->qry_dsg[i] = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
670         }
671         if ( !args->cross_check )   // in this case gt_dsg points to qry_dsg
672         {
673             for (i=0; i<args->ngt_smpl; i++)
674             {
675                 int igt = args->gt_smpl ? args->gt_smpl[i] : i;
676                 int32_t *ptr = args->gt_arr + ngt1*igt;
677                 args->gt_dsg[i] = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
678                 if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0;      // not a hom, set to a missing value
679             }
680         }
681         for (i=0; i<args->nqry_smpl; i++)
682         {
683             int ngt = args->cross_check ? i : args->ngt_smpl;       // two files or a sub-diagonal cross-check mode?
684             if ( !args->qry_dsg[i] ) { idx += ngt; continue; }      // missing value
685             for (j=0; j<ngt; j++)
686             {
687                 if ( !args->gt_dsg[j] ) { idx++; continue; }        // missing value
688                 int match = args->qry_dsg[i] & args->gt_dsg[j];
689                 if ( !match ) args->ndiff[idx]++;
690                 else if ( args->calc_hwe_prob ) args->hwe_prob[idx] += hwe_dsg[match];
691                 args->ncnt[idx]++;
692                 idx++;
693             }
694         }
695     }
696     else    // use_PLs set
697     {
698         for (i=0; i<args->nqry_smpl; i++)
699         {
700             int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
701             int32_t *ptr = args->qry_arr + nqry1*iqry;
702             args->qry_dsg[i] = qry_use_GT ? gt_to_prob(args,ptr,args->qry_prob+i*3) : pl_to_prob(args,ptr,args->qry_prob+i*3);
703         }
704         if ( !args->cross_check )   // in this case gt_dsg points to qry_dsg
705         {
706             for (i=0; i<args->ngt_smpl; i++)
707             {
708                 int igt = args->gt_smpl ? args->gt_smpl[i] : i;
709                 int32_t *ptr = args->gt_arr + ngt1*igt;
710                 args->gt_dsg[i] = gt_use_GT ? gt_to_prob(args,ptr,args->gt_prob+i*3) : pl_to_prob(args,ptr,args->gt_prob+i*3);
711                 if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0;      // not a hom, set to a missing value
712             }
713         }
714         for (i=0; i<args->nqry_smpl; i++)
715         {
716             int ngt = args->cross_check ? i : args->ngt_smpl;       // two files or a sub-diagonal cross-check mode?
717             if ( !args->qry_dsg[i] ) { idx += ngt; continue; }      // missing value
718             for (j=0; j<ngt; j++)
719             {
720                 if ( !args->gt_dsg[j] ) { idx++; continue; }        // missing value
721 
722                 double min = args->qry_prob[i*3] + args->gt_prob[j*3];
723                 if ( min > args->qry_prob[i*3+1] + args->gt_prob[j*3+1] ) min = args->qry_prob[i*3+1] + args->gt_prob[j*3+1];
724                 if ( min > args->qry_prob[i*3+2] + args->gt_prob[j*3+2] ) min = args->qry_prob[i*3+2] + args->gt_prob[j*3+2];
725                 args->pdiff[idx] += min;
726 
727                 if ( args->calc_hwe_prob )
728                 {
729                     int match = args->qry_dsg[i] & args->gt_dsg[j];
730                     args->hwe_prob[idx] += hwe_dsg[match];
731                 }
732                 args->ncnt[idx]++;
733                 idx++;
734             }
735         }
736     }
737 }
738 
739 
740 typedef struct
741 {
742     int ism, idx;
743     double val;
744 }
745 idbl_t;
cmp_idbl(const void * _a,const void * _b)746 static int cmp_idbl(const void *_a, const void *_b)
747 {
748     idbl_t *a = (idbl_t*)_a;
749     idbl_t *b = (idbl_t*)_b;
750     if ( a->val < b->val ) return -1;
751     if ( a->val > b->val ) return 1;
752     return 0;
753 }
report_distinctive_sites(args_t * args)754 static void report_distinctive_sites(args_t *args)
755 {
756     extsort_sort(args->es);
757 
758     fprintf(args->fp,"# DS, distinctive sites:\n");
759     fprintf(args->fp,"#     - chromosome\n");
760     fprintf(args->fp,"#     - position\n");
761     fprintf(args->fp,"#     - cumulative number of pairs distinguished by this block\n");
762     fprintf(args->fp,"#     - block id\n");
763     fprintf(args->fp,"#DS\t[2]Chromosome\t[3]Position\t[4]Cumulative number of distinct pairs\t[5]Block id\n");
764 
765     kbitset_t *kbs_blk = kbs_init(args->npairs);
766     kbitset_iter_t itr;
767     int i,ndiff,rid,pos,ndiff_tot = 0, iblock = 0;
768     int ndiff_min = args->distinctive_sites <= args->npairs ? args->distinctive_sites : args->npairs;
769     while ( diff_sites_shift(args,&ndiff,&rid,&pos) )
770     {
771         int ndiff_new = 0, ndiff_dbg = 0;
772         kbs_start(&itr);
773         while ( (i=kbs_next(args->kbs_diff, &itr))>=0 )
774         {
775             ndiff_dbg++;
776             if ( kbs_exists(kbs_blk,i) ) continue;   // already set
777             kbs_insert(kbs_blk,i);
778             ndiff_new++;
779         }
780         if ( ndiff_dbg!=ndiff ) error("Corrupted data, fixme: %d vs %d\n",ndiff_dbg,ndiff);
781         if ( !ndiff_new ) continue;     // no new pair distinguished by this site
782         ndiff_tot += ndiff_new;
783         fprintf(args->fp,"DS\t%s\t%d\t%d\t%d\n",bcf_hdr_id2name(args->qry_hdr,rid),pos+1,ndiff_tot,iblock);
784         if ( ndiff_tot < ndiff_min ) continue;   // fewer than the requested number of pairs can be distinguished at this point
785         iblock++;
786         ndiff_tot = 0;
787         kbs_clear(kbs_blk);
788     }
789     kbs_destroy(kbs_blk);
790 }
report(args_t * args)791 static void report(args_t *args)
792 {
793     fprintf(args->fp,"INFO\tsites-compared\t%u\n",args->ncmp);
794     fprintf(args->fp,"INFO\tsites-skipped-no-match\t%u\n",args->nskip_no_match);
795     fprintf(args->fp,"INFO\tsites-skipped-multiallelic\t%u\n",args->nskip_not_ba);
796     fprintf(args->fp,"INFO\tsites-skipped-monoallelic\t%u\n",args->nskip_mono);
797     fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
798     fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
799     fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
800     fprintf(args->fp,"# DC, discordance:\n");
801     fprintf(args->fp,"#     - query sample\n");
802     fprintf(args->fp,"#     - genotyped sample\n");
803     fprintf(args->fp,"#     - discordance (number of mismatches; smaller is better)\n");
804     fprintf(args->fp,"#     - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n");
805     fprintf(args->fp,"#     - number of sites compared (bigger is better)\n");
806     fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
807 
808     int trim = args->ntop;
809     if ( !args->pairs )
810     {
811         if ( !args->ngt_smpl && args->nqry_smpl <= args->ntop ) trim = 0;
812         if ( args->ngt_smpl && args->ngt_smpl <= args->ntop  ) trim = 0;
813     }
814 
815     if ( args->pairs )
816     {
817         int i;
818         for (i=0; i<args->npairs; i++)
819         {
820             int iqry = args->pairs[i].iqry;
821             int igt  = args->pairs[i].igt;
822             if ( args->ndiff )
823             {
824                 fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
825                         args->qry_hdr->samples[iqry],
826                         args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
827                         args->ndiff[i],
828                         args->calc_hwe_prob ? args->hwe_prob[i] : 0,
829                         args->ncnt[i]);
830             }
831             else
832             {
833                 fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
834                         args->qry_hdr->samples[iqry],
835                         args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
836                         args->pdiff[i],
837                         args->calc_hwe_prob ? args->hwe_prob[i] : 0,
838                         args->ncnt[i]);
839             }
840         }
841     }
842     else if ( !trim )
843     {
844         int i,j,idx=0;
845         for (i=0; i<args->nqry_smpl; i++)
846         {
847             int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
848             int ngt  = args->cross_check ? i : args->ngt_smpl;
849             for (j=0; j<ngt; j++)
850             {
851                 int igt = args->gt_smpl ? args->gt_smpl[j] : j;
852                 if ( args->ndiff )
853                 {
854                     fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
855                             args->qry_hdr->samples[iqry],
856                             args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
857                             args->ndiff[idx],
858                             args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
859                             args->ncnt[idx]);
860                 }
861                 else
862                 {
863                     fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
864                             args->qry_hdr->samples[iqry],
865                             args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
866                             args->pdiff[idx],
867                             args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
868                             args->ncnt[idx]);
869                 }
870                 idx++;
871             }
872         }
873     }
874     else if ( !args->cross_check )
875     {
876         idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*args->ngt_smpl);
877         int i,j;
878         for (i=0; i<args->nqry_smpl; i++)
879         {
880             int idx  = i*args->ngt_smpl;
881             for (j=0; j<args->ngt_smpl; j++)
882             {
883                 if ( args->sort_by_hwe )
884                     arr[j].val = -args->hwe_prob[idx];
885                 else if ( args->ndiff )
886                     arr[j].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
887                 else
888                     arr[j].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
889                 arr[j].ism = j;
890                 arr[j].idx = idx;
891                 idx++;
892             }
893             qsort(arr, args->ngt_smpl, sizeof(*arr), cmp_idbl);
894             int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
895             for (j=0; j<args->ntop; j++)
896             {
897                 int idx = arr[j].idx;
898                 int igt = args->gt_smpl ? args->gt_smpl[arr[j].ism] : arr[j].ism;
899                 if ( args->ndiff )
900                 {
901                     fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
902                             args->qry_hdr->samples[iqry],
903                             args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
904                             args->ndiff[idx],
905                             args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
906                             args->ncnt[idx]);
907                 }
908                 else
909                 {
910                     fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
911                             args->qry_hdr->samples[iqry],
912                             args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
913                             args->pdiff[idx],
914                             args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
915                             args->ncnt[idx]);
916                 }
917             }
918         }
919         free(arr);
920     }
921     else
922     {
923         int narr = args->nqry_smpl-1;
924         idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*narr);
925         int i,j,k,idx;
926         for (i=0; i<args->nqry_smpl; i++)
927         {
928             k = 0, idx = i*(i-1)/2;
929             for (j=0; j<i; j++)
930             {
931                 if ( args->sort_by_hwe )
932                     arr[k].val = -args->hwe_prob[idx];
933                 else if ( args->ndiff )
934                     arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
935                 else
936                     arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
937                 arr[k].ism = j;
938                 arr[k].idx = idx;
939                 idx++;
940                 k++;
941             }
942             for (; j<narr; j++)
943             {
944                 idx = j*(j+1)/2 + i;
945                 if ( args->sort_by_hwe )
946                     arr[k].val = -args->hwe_prob[idx];
947                 else if ( args->ndiff )
948                     arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
949                 else
950                     arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
951                 arr[k].ism = j + 1;
952                 arr[k].idx = idx;
953                 k++;
954             }
955             qsort(arr, narr, sizeof(*arr), cmp_idbl);
956             int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
957             for (j=0; j<args->ntop; j++)
958             {
959                 if ( i <= arr[j].ism ) continue;
960                 int idx = arr[j].idx;
961                 int igt = args->qry_smpl ? args->qry_smpl[arr[j].ism] : arr[j].ism;
962                 if ( args->ndiff )
963                 {
964                     fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
965                             args->qry_hdr->samples[iqry],
966                             args->qry_hdr->samples[igt],
967                             args->ndiff[idx],
968                             args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
969                             args->ncnt[idx]);
970                 }
971                 else
972                 {
973                     fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
974                             args->qry_hdr->samples[iqry],
975                             args->qry_hdr->samples[igt],
976                             args->pdiff[idx],
977                             args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
978                             args->ncnt[idx]);
979                 }
980             }
981         }
982         free(arr);
983     }
984 }
985 
is_input_okay(args_t * args,int nmatch)986 static int is_input_okay(args_t *args, int nmatch)
987 {
988     int i;
989     const char *msg;
990     bcf_hdr_t *hdr;
991     bcf1_t *rec;
992     if ( args->gt_hdr && nmatch!=2 )
993     {
994         if ( args->nskip_no_match++ ) return 0;
995         for (i=0; i<2; i++)
996         {
997             rec = bcf_sr_get_line(args->files,i);
998             if ( rec ) break;
999         }
1000         hdr = bcf_sr_get_header(args->files,i);
1001         fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", no record with matching POS+ALT. (This is printed only once.)\n",
1002                 bcf_seqname(hdr,rec),rec->pos+1);
1003         return 0;
1004     }
1005     for (i=0; i<2; i++)
1006     {
1007         hdr = bcf_sr_get_header(args->files,i);
1008         rec = bcf_sr_get_line(args->files,i);
1009         if ( rec->n_allele>2 )
1010         {
1011             if ( args->nskip_not_ba++ ) return 0;
1012             msg = "not a biallelic site, run `bcftools norm -m -` first";
1013             goto not_okay;
1014         }
1015         if ( bcf_get_variant_types(rec)==VCF_REF )
1016         {
1017             if ( args->nskip_mono++ ) return 0;
1018             msg = "monoallelic site";
1019             goto not_okay;
1020         }
1021         if ( !args->gt_hdr ) break;
1022     }
1023     return 1;
1024 
1025 not_okay:
1026     fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n",
1027         bcf_seqname(hdr,rec),rec->pos+1,msg);
1028     return 0;
1029 }
1030 
usage(void)1031 static void usage(void)
1032 {
1033     fprintf(stderr, "\n");
1034     fprintf(stderr, "About:   Check sample identity. With no -g BCF given, multi-sample cross-check is performed.\n");
1035     fprintf(stderr, "Usage:   bcftools gtcheck [options] [-g <genotypes.vcf.gz>] <query.vcf.gz>\n");
1036     fprintf(stderr, "\n");
1037     fprintf(stderr, "Options:\n");
1038     //fprintf(stderr, "    -a, --all-sites                  Output comparison for all sites\n");
1039     //fprintf(stderr, "    -c, --cluster MIN,MAX            Min inter- and max intra-sample error [0.23,-0.3]\n");
1040     fprintf(stderr, "        --distinctive-sites            Find sites that can distinguish between at least NUM sample pairs.\n");
1041     fprintf(stderr, "                  NUM[,MEM[,TMP]]          If the number is smaller or equal to 1, it is interpreted as the fraction of pairs.\n");
1042     fprintf(stderr, "                                           The optional MEM string sets the maximum memory used for in-memory sorting [500M]\n");
1043 #ifdef _WIN32
1044     fprintf(stderr, "                                           and TMP is a prefix of temporary files used by external sorting [/bcftools.XXXXXX]\n");
1045 #else
1046     fprintf(stderr, "                                           and TMP is a prefix of temporary files used by external sorting [/tmp/bcftools.XXXXXX]\n");
1047 #endif
1048     fprintf(stderr, "        --dry-run                      Stop after first record to estimate required time\n");
1049     fprintf(stderr, "    -e, --error-probability INT        Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n");
1050     fprintf(stderr, "    -g, --genotypes FILE               Genotypes to compare against\n");
1051     fprintf(stderr, "    -H, --homs-only                    Homozygous genotypes only, useful with low coverage data (requires -g)\n");
1052     fprintf(stderr, "        --n-matches INT                Print only top INT matches for each sample (sorted by average score), 0 for unlimited.\n");
1053     fprintf(stderr, "                                           Use negative value to sort by HWE probability rather than by discordance [0]\n");
1054     fprintf(stderr, "        --no-HWE-prob                  Disable calculation of HWE probability\n");
1055     fprintf(stderr, "    -p, --pairs LIST                   Comma-separated sample pairs to compare (qry,gt[,qry,gt..] with -g or qry,qry[,qry,qry..] w/o)\n");
1056     fprintf(stderr, "    -P, --pairs-file FILE              File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n");
1057     fprintf(stderr, "    -r, --regions REGION               Restrict to comma-separated list of regions\n");
1058     fprintf(stderr, "    -R, --regions-file FILE            Restrict to regions listed in a file\n");
1059     fprintf(stderr, "        --regions-overlap 0|1|2        Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
1060     fprintf(stderr, "    -s, --samples [qry|gt]:LIST        List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n");
1061     fprintf(stderr, "    -S, --samples-file [qry|gt]:FILE   File with the query or -g samples to compare\n");
1062     fprintf(stderr, "    -t, --targets REGION               Similar to -r but streams rather than index-jumps\n");
1063     fprintf(stderr, "    -T, --targets-file FILE            Similar to -R but streams rather than index-jumps\n");
1064     fprintf(stderr, "        --targets-overlap 0|1|2        Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
1065     fprintf(stderr, "    -u, --use TAG1[,TAG2]              Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n");
1066     fprintf(stderr, "Examples:\n");
1067     fprintf(stderr, "   # Check discordance of all samples from B against all sample in A\n");
1068     fprintf(stderr, "   bcftools gtcheck -g A.bcf B.bcf\n");
1069     fprintf(stderr, "\n");
1070     fprintf(stderr, "   # Limit comparisons to the fiven list of samples\n");
1071     fprintf(stderr, "   bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf\n");
1072     fprintf(stderr, "\n");
1073     fprintf(stderr, "   # Compare only two pairs a1,b1 and a1,b2\n");
1074     fprintf(stderr, "   bcftools gtcheck -p a1,b1,a1,b2 -g A.bcf B.bcf\n");
1075     fprintf(stderr, "\n");
1076     exit(1);
1077 }
1078 
main_vcfgtcheck(int argc,char * argv[])1079 int main_vcfgtcheck(int argc, char *argv[])
1080 {
1081     int c;
1082     args_t *args = (args_t*) calloc(1,sizeof(args_t));
1083     args->argc   = argc; args->argv = argv; set_cwd(args);
1084     args->qry_use_GT = -1;
1085     args->gt_use_GT  = -1;
1086     args->calc_hwe_prob = 1;
1087     args->use_PLs = 40;
1088     args->regions_overlap = 1;
1089     args->targets_overlap = 0;
1090 
1091     // external sort for --distinctive-sites
1092 #ifdef _WIN32
1093     args->es_tmp_prefix = NULL;
1094 #else
1095     args->es_tmp_prefix = "/tmp/bcftools-gtcheck";
1096 #endif
1097     args->es_max_mem = strdup("500M");
1098 
1099     // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
1100     //    - min_inter: pairs with smaller err value will be considered identical
1101     //    - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered
1102     //                  different. If negative, the cutoff may be heuristically lowered
1103     args->min_inter_err =  0.23;
1104     args->max_intra_err = -0.3;
1105 
1106     static struct option loptions[] =
1107     {
1108         {"error-probability",1,0,'e'},
1109         {"use",1,0,'u'},
1110         {"cluster",1,0,'c'},
1111         {"GTs-only",1,0,'G'},
1112         {"all-sites",0,0,'a'},
1113         {"homs-only",0,0,'H'},
1114         {"help",0,0,'h'},
1115         {"genotypes",1,0,'g'},
1116         {"plot",1,0,'p'},
1117         {"samples",1,0,'s'},
1118         {"samples-file",1,0,'S'},
1119         {"n-matches",1,0,2},
1120         {"no-HWE-prob",0,0,3},
1121         {"target-sample",1,0,4},
1122         {"dry-run",0,0,5},
1123         {"distinctive-sites",1,0,6},
1124         {"regions",1,0,'r'},
1125         {"regions-file",1,0,'R'},
1126         {"regions-overlap",required_argument,NULL,7},
1127         {"targets",1,0,'t'},
1128         {"targets-file",1,0,'T'},
1129         {"targets-overlap",required_argument,NULL,8},
1130         {"pairs",1,0,'p'},
1131         {"pairs-file",1,0,'P'},
1132         {0,0,0,0}
1133     };
1134     char *tmp;
1135     while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:",loptions,NULL)) >= 0) {
1136         switch (c) {
1137             case 'e':
1138                 args->use_PLs = strtol(optarg,&tmp,10);
1139                 if ( !tmp || *tmp ) error("Could not parse: --error-probability %s\n", optarg);
1140                 break;
1141             case 'u':
1142                 {
1143                     int i,nlist;
1144                     char **list = hts_readlist(optarg, 0, &nlist);
1145                     if ( !list || nlist<=0 || nlist>2 ) error("Failed to parse --use %s\n", optarg);
1146                     if ( !strcasecmp("GT",list[0]) ) args->qry_use_GT = 1;
1147                     else if ( !strcasecmp("PL",list[0]) ) args->qry_use_GT = 0;
1148                     else error("Failed to parse --use %s; only GT and PL are supported\n", optarg);
1149                     if ( nlist==2 )
1150                     {
1151                         if ( !strcasecmp("GT",list[1]) ) args->gt_use_GT = 1;
1152                         else if ( !strcasecmp("PL",list[1]) ) args->gt_use_GT = 0;
1153                         else error("Failed to parse --use %s; only GT and PL are supported\n", optarg);
1154                     }
1155                     else args->gt_use_GT = args->qry_use_GT;
1156                     for (i=0; i<nlist; i++) free(list[i]);
1157                     free(list);
1158                 }
1159                 break;
1160             case 2 :
1161                 args->ntop = strtol(optarg,&tmp,10);
1162                 if ( !tmp || *tmp ) error("Could not parse: --n-matches %s\n", optarg);
1163                 if ( args->ntop < 0 )
1164                 {
1165                     args->sort_by_hwe = 1;
1166                     args->ntop *= -1;
1167                 }
1168                 break;
1169             case 3 : args->calc_hwe_prob = 0; break;
1170             case 4 : error("The option -S, --target-sample has been deprecated\n"); break;
1171             case 5 : args->dry_run = 1; break;
1172             case 6 :
1173                 args->distinctive_sites = strtod(optarg,&tmp);
1174                 if ( *tmp )
1175                 {
1176                     if ( *tmp!=',' ) error("Could not parse: --distinctive-sites %s\n", optarg);
1177                     tmp++;
1178                     free(args->es_max_mem);
1179                     args->es_max_mem = strdup(tmp);
1180                     while ( *tmp && *tmp!=',' ) tmp++;
1181                     if ( *tmp ) { *tmp = 0; args->es_tmp_prefix = tmp+1; }
1182                 }
1183                 args->use_PLs = 0;
1184                 break;
1185             case 'c':
1186                 error("The -c option is to be implemented, please open an issue on github\n");
1187                 args->min_inter_err = strtod(optarg,&tmp);
1188                 if ( *tmp )
1189                 {
1190                     if ( *tmp!=',') error("Could not parse: -c %s\n", optarg);
1191                     args->max_intra_err = strtod(tmp+1,&tmp);
1192                     if ( *tmp ) error("Could not parse: -c %s\n", optarg);
1193                 }
1194                 break;
1195             case 'G': error("The option -G, --GTs-only has been deprecated\n"); break;
1196             case 'a': args->all_sites = 1; error("The -a option is to be implemented, please open an issue on github\n"); break;
1197             case 'H': args->hom_only = 1; break;
1198             case 'g': args->gt_fname = optarg; break;
1199 //            case 'p': args->plot = optarg; break;
1200             case 's':
1201                 if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3;
1202                 else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4;
1203                 else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
1204                 break;
1205             case 'S':
1206                 if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1;
1207                 else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1;
1208                 else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
1209                 break;
1210             case 'p': args->pair_samples = optarg; break;
1211             case 'P': args->pair_samples = optarg; args->pair_samples_is_file = 1; break;
1212             case 'r': args->regions = optarg; break;
1213             case 'R': args->regions = optarg; args->regions_is_file = 1; break;
1214             case 't': args->targets = optarg; break;
1215             case 'T': args->targets = optarg; args->targets_is_file = 1; break;
1216             case  7 :
1217                 if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
1218                 else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
1219                 else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
1220                 else error("Could not parse: --regions-overlap %s\n",optarg);
1221                 break;
1222             case  8 :
1223                 if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
1224                 else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
1225                 else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
1226                 else error("Could not parse: --targets-overlap %s\n",optarg);
1227                 break;
1228             case 'h':
1229             case '?': usage(); break;
1230             default: error("Unknown argument: %s\n", optarg);
1231         }
1232     }
1233     if ( optind==argc )
1234     {
1235         if ( !isatty(fileno((FILE *)stdin)) ) args->qry_fname = "-";  // reading from stdin
1236         else usage();   // no files given
1237     }
1238     else args->qry_fname = argv[optind];
1239     if ( argc>optind+1 ) error("Error: too many files given, run with -h for help\n");  // too many files given
1240     if ( args->pair_samples )
1241     {
1242         if ( args->gt_samples || args->qry_samples ) error("The -p/-P option cannot be combined with -s/-S\n");
1243         if ( args->ntop ) error("The --n-matches option cannot be combined with -p/-P\n");
1244     }
1245     if ( args->distinctive_sites && !args->pair_samples ) error("The experimental option --distinctive-sites requires -p/-P\n");
1246     if ( args->hom_only && !args->gt_fname ) error("The option --homs-only requires --genotypes\n");
1247     if ( args->distinctive_sites && args->use_PLs ) error("The option --distinctive-sites cannot be combined with --error-probability\n");
1248 
1249     init_data(args);
1250 
1251     int ret;
1252     while ( (ret=bcf_sr_next_line(args->files)) )
1253     {
1254         if ( !is_input_okay(args,ret) ) continue;
1255 
1256         // time one record to give the user an estimate with very big files
1257         struct timeval t0, t1;
1258         if ( !args->ncmp )  gettimeofday(&t0, NULL);
1259 
1260         process_line(args);
1261 
1262         if ( args->ncmp==1 )
1263         {
1264             gettimeofday(&t1, NULL);
1265             double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec);
1266             fprintf(stderr,"INFO:\tTime required to process one record .. %f seconds\n",delta/1e6);
1267             fprintf(args->fp,"INFO\tTime required to process one record .. %f seconds\n",delta/1e6);
1268             if ( args->dry_run ) break;
1269         }
1270     }
1271     if ( !args->dry_run )
1272     {
1273         report(args);
1274         if ( args->distinctive_sites ) report_distinctive_sites(args);
1275     }
1276 
1277     destroy_data(args);
1278     free(args);
1279     return 0;
1280 }
1281 
1282