1 /* vcfgtcheck.c -- Check sample identity.
2
3 Copyright (C) 2013-2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE. */
24
25 #include <stdio.h>
26 #include <stdarg.h>
27 #include <unistd.h>
28 #include <getopt.h>
29 #include <assert.h>
30 #include <ctype.h>
31 #include <string.h>
32 #include <strings.h>
33 #include <errno.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
36 #include <math.h>
37 #include <htslib/vcf.h>
38 #include <htslib/synced_bcf_reader.h>
39 #include <htslib/vcfutils.h>
40 #include <htslib/kbitset.h>
41 #include <htslib/hts_os.h>
42 #include <inttypes.h>
43 #include <sys/time.h>
44 #include "bcftools.h"
45 #include "extsort.h"
46 //#include "hclust.h"
47
48 typedef struct
49 {
50 int iqry, igt;
51 }
52 pair_t;
53
54 typedef struct
55 {
56 bcf_srs_t *files; // first reader is the query VCF - single sample normally or multi-sample for cross-check
57 bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF
58 char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples;
59 int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
60 int regions_overlap, targets_overlap;
61 int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
62 double *pdiff, *qry_prob, *gt_prob;
63 uint32_t *ndiff,*ncnt,ncmp, npairs;
64 int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
65 uint8_t *qry_dsg, *gt_dsg;
66 pair_t *pairs;
67 double *hwe_prob, dsg2prob[8][3], pl2prob[256];
68 double min_inter_err, max_intra_err;
69 int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, use_PLs;
70 FILE *fp;
71 unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL;
72
73 // for --distinctive-sites
74 double distinctive_sites;
75 kbitset_t *kbs_diff;
76 size_t diff_sites_size;
77 extsort_t *es;
78 char *es_tmp_prefix, *es_max_mem;
79 }
80 args_t;
81
set_cwd(args_t * args)82 static void set_cwd(args_t *args)
83 {
84 int i;
85 char *buf;
86 size_t nbuf = 500;
87 args->cwd = (char*) malloc(sizeof(char)*nbuf);
88 for (i=0; i<5; i++)
89 {
90 if ( (buf = getcwd(args->cwd, nbuf)) ) break;
91 nbuf *= 2;
92 args->cwd = (char*) realloc(args->cwd, sizeof(char)*nbuf);
93 }
94 assert(buf);
95 }
print_header(args_t * args,FILE * fp)96 static void print_header(args_t *args, FILE *fp)
97 {
98 fprintf(fp, "# This file was produced by bcftools (%s+htslib-%s), the command line was:\n", bcftools_version(), hts_version());
99 fprintf(fp, "# \t bcftools %s ", args->argv[0]);
100 int i;
101 for (i=1; i<args->argc; i++)
102 fprintf(fp, " %s",args->argv[i]);
103 fprintf(fp, "\n# and the working directory was:\n");
104 fprintf(fp, "# \t %s\n#\n", args->cwd);
105 }
106
cmp_int(const void * _a,const void * _b)107 static int cmp_int(const void *_a, const void *_b)
108 {
109 int a = *((int*)_a);
110 int b = *((int*)_b);
111 if ( a < b ) return -1;
112 if ( a > b ) return 1;
113 return 0;
114 }
cmp_pair(const void * _a,const void * _b)115 static int cmp_pair(const void *_a, const void *_b)
116 {
117 pair_t *a = (pair_t*)_a;
118 pair_t *b = (pair_t*)_b;
119 if ( a->iqry < b->iqry ) return -1;
120 if ( a->iqry > b->iqry ) return 1;
121 if ( a->igt < b->igt ) return -1;
122 if ( a->igt > b->igt ) return 1;
123 return 0;
124 }
125
126 typedef struct
127 {
128 uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosoms
129 unsigned long kbs_dat[1];
130 }
131 diff_sites_t;
132 #if DBG
diff_sites_debug_print(args_t * args,diff_sites_t * ds)133 static void diff_sites_debug_print(args_t *args, diff_sites_t *ds)
134 {
135 int i;
136 memcpy(args->kbs_diff->b,ds->kbs_dat,args->kbs_diff->n*sizeof(unsigned long));
137 fprintf(stderr,"%s:%d\t%d\t",bcf_hdr_id2name(args->qry_hdr,ds->rid),ds->pos+1,ds->ndiff);
138 for (i=0; i<args->npairs; i++) fprintf(stderr,"%d",kbs_exists(args->kbs_diff,i)?1:0);
139 fprintf(stderr,"\n");
140 }
141 #endif
diff_sites_cmp(const void * aptr,const void * bptr)142 static int diff_sites_cmp(const void *aptr, const void *bptr)
143 {
144 diff_sites_t *a = *((diff_sites_t**)aptr);
145 diff_sites_t *b = *((diff_sites_t**)bptr);
146 if ( a->ndiff < b->ndiff ) return 1; // descending order
147 if ( a->ndiff > b->ndiff ) return -1;
148 if ( a->rand < b->rand ) return -1;
149 if ( a->rand > b->rand ) return 1;
150 return 0;
151 }
diff_sites_init(args_t * args)152 static void diff_sites_init(args_t *args)
153 {
154 int nsites = args->distinctive_sites<=1 ? args->npairs*args->distinctive_sites : args->distinctive_sites;
155 if ( nsites<=0 ) error("The value for --distinctive-sites was set too low: %d\n",nsites);
156 if ( nsites > args->npairs )
157 {
158 fprintf(stderr,"Warning: The value for --distinctive-sites is bigger than is the number of pairs, all discordant sites be printed.\n");
159 nsites = args->npairs;
160 args->distinctive_sites = args->npairs + 1;
161 }
162 else
163 args->distinctive_sites = nsites;
164 args->kbs_diff = kbs_init(args->npairs);
165 size_t n = (args->npairs + KBS_ELTBITS-1) / KBS_ELTBITS;
166 assert( n==args->kbs_diff->n );
167 args->diff_sites_size = sizeof(diff_sites_t) + (n-1)*sizeof(unsigned long);
168 args->es = extsort_alloc();
169 extsort_set_opt(args->es,size_t,DAT_SIZE,args->diff_sites_size);
170 extsort_set_opt(args->es,const char*,TMP_PREFIX,args->es_tmp_prefix);
171 extsort_set_opt(args->es,const char*,MAX_MEM,args->es_max_mem);
172 extsort_set_opt(args->es,extsort_cmp_f,FUNC_CMP,diff_sites_cmp);
173 extsort_init(args->es);
174 }
diff_sites_destroy(args_t * args)175 static void diff_sites_destroy(args_t *args)
176 {
177 kbs_destroy(args->kbs_diff);
178 extsort_destroy(args->es);
179 }
diff_sites_reset(args_t * args)180 static inline void diff_sites_reset(args_t *args)
181 {
182 kbs_clear(args->kbs_diff);
183 }
diff_sites_push(args_t * args,int ndiff,int rid,int pos)184 static inline void diff_sites_push(args_t *args, int ndiff, int rid, int pos)
185 {
186 diff_sites_t *dat = (diff_sites_t*) malloc(args->diff_sites_size);
187 memset(dat,0,sizeof(*dat)); // for debugging: prevent warnings about uninitialized memory coming from struct padding (not needed after rand added)
188 dat->ndiff = ndiff;
189 dat->rid = rid;
190 dat->pos = pos;
191 dat->rand = hts_lrand48();
192 memcpy(dat->kbs_dat,args->kbs_diff->b,args->kbs_diff->n*sizeof(unsigned long));
193 extsort_push(args->es,dat);
194 }
diff_sites_shift(args_t * args,int * ndiff,int * rid,int * pos)195 static inline int diff_sites_shift(args_t *args, int *ndiff, int *rid, int *pos)
196 {
197 diff_sites_t *dat = (diff_sites_t*) extsort_shift(args->es);
198 if ( !dat ) return 0;
199 *ndiff = dat->ndiff;
200 *rid = dat->rid;
201 *pos = dat->pos;
202 memcpy(args->kbs_diff->b,dat->kbs_dat,args->kbs_diff->n*sizeof(unsigned long));
203 return 1;
204 }
205
init_samples(char * list,int list_is_file,int ** smpl,int * nsmpl,bcf_hdr_t * hdr,char * vcf_fname)206 static void init_samples(char *list, int list_is_file, int **smpl, int *nsmpl, bcf_hdr_t *hdr, char *vcf_fname)
207 {
208 int i;
209 if ( !strcmp(list,"-") )
210 {
211 *nsmpl = bcf_hdr_nsamples(hdr);
212 *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl));
213 for (i=0; i<*nsmpl; i++) (*smpl)[i] = i;
214 return;
215 }
216
217 char **tmp = hts_readlist(list, list_is_file, nsmpl);
218 if ( !tmp || !*nsmpl ) error("Failed to parse %s\n", list);
219 *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl));
220 for (i=0; i<*nsmpl; i++)
221 {
222 int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, tmp[i]);
223 if ( idx<0 ) error("No such sample in %s: [%s]\n",vcf_fname,tmp[i]);
224 (*smpl)[i] = idx;
225 free(tmp[i]);
226 }
227 free(tmp);
228 qsort(*smpl,*nsmpl,sizeof(**smpl),cmp_int);
229 // check for duplicates
230 for (i=1; i<*nsmpl; i++)
231 if ( (*smpl)[i-1]==(*smpl)[i] )
232 error("Error: the sample \"%s\" is listed twice in %s\n", hdr->samples[(*smpl)[i]],list);
233 }
234
init_data(args_t * args)235 static void init_data(args_t *args)
236 {
237 hts_srand48(0);
238
239 args->files = bcf_sr_init();
240 if ( args->regions )
241 {
242 bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
243 if ( bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions);
244 }
245 if ( args->targets )
246 {
247 bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
248 if ( bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets);
249 }
250
251 if ( args->gt_fname ) bcf_sr_set_opt(args->files, BCF_SR_REQUIRE_IDX);
252 if ( !bcf_sr_add_reader(args->files,args->qry_fname) ) error("Failed to open %s: %s\n", args->qry_fname,bcf_sr_strerror(args->files->errnum));
253 if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) )
254 error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum));
255
256 args->qry_hdr = bcf_sr_get_header(args->files,0);
257 if ( !bcf_hdr_nsamples(args->qry_hdr) ) error("No samples in %s?\n", args->qry_fname);
258 if ( args->gt_fname )
259 {
260 args->gt_hdr = bcf_sr_get_header(args->files,1);
261 if ( !bcf_hdr_nsamples(args->gt_hdr) ) error("No samples in %s?\n", args->gt_fname);
262 }
263
264 // Determine whether GT or PL will be used
265 if ( args->qry_use_GT==-1 ) // not set by -u, qry uses PL by default
266 {
267 if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")>=0 )
268 args->qry_use_GT = 0;
269 else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")>=0 )
270 args->qry_use_GT = 1;
271 else
272 error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->qry_fname);
273 }
274 else if ( args->qry_use_GT==1 )
275 {
276 if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")<0 )
277 error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->qry_fname);
278 }
279 else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")<0 )
280 error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->qry_fname);
281
282 if ( args->gt_hdr )
283 {
284 if ( args->gt_use_GT==-1 ) // not set by -u, gt uses GT by default
285 {
286 if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")>=0 )
287 args->gt_use_GT = 1;
288 else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")>=0 )
289 args->gt_use_GT = 0;
290 else
291 error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->gt_fname);
292 }
293 else if ( args->gt_use_GT==1 )
294 {
295 if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")<0 )
296 error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->gt_fname);
297 }
298 else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")<0 )
299 error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->gt_fname);
300 }
301 else
302 args->gt_use_GT = args->qry_use_GT;
303
304 // Prepare samples
305 int i,j;
306 args->nqry_smpl = bcf_hdr_nsamples(args->qry_hdr);
307 if ( args->qry_samples )
308 {
309 init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname);
310 }
311 if ( args->gt_samples )
312 {
313 init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl,
314 args->gt_hdr ? args->gt_hdr : args->qry_hdr,
315 args->gt_fname ? args->gt_fname : args->qry_fname);
316 }
317 else if ( args->pair_samples )
318 {
319 int npairs;
320 char **tmp = hts_readlist(args->pair_samples, args->pair_samples_is_file, &npairs);
321 if ( !tmp || !npairs ) error("Failed to parse %s\n", args->pair_samples);
322 if ( !args->pair_samples_is_file && npairs%2 ) error("Expected even number of comma-delimited samples with -p\n");
323 args->npairs = args->pair_samples_is_file ? npairs : npairs/2;
324 args->pairs = (pair_t*) calloc(args->npairs,sizeof(*args->pairs));
325 if ( !args->pair_samples_is_file )
326 {
327 for (i=0; i<args->npairs; i++)
328 {
329 args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i]);
330 args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i+1]);
331 if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[2*i]);
332 if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,tmp[2*i+1]);
333 free(tmp[2*i]);
334 free(tmp[2*i+1]);
335 }
336 }
337 else
338 {
339 for (i=0; i<args->npairs; i++)
340 {
341 char *ptr = tmp[i];
342 while ( *ptr && !isspace(*ptr) ) ptr++;
343 if ( !*ptr ) error("Could not parse %s: %s\n",args->pair_samples,tmp[i]);
344 *ptr = 0;
345 args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[i]);
346 if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[i]);
347 ptr++;
348 while ( *ptr && isspace(*ptr) ) ptr++;
349 args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, ptr);
350 if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,ptr);
351 free(tmp[i]);
352 }
353 }
354 free(tmp);
355 qsort(args->pairs,args->npairs,sizeof(*args->pairs),cmp_pair);
356 }
357 else if ( args->gt_hdr )
358 args->ngt_smpl = bcf_hdr_nsamples(args->gt_hdr);
359 if ( !args->ngt_smpl )
360 {
361 args->ngt_smpl = args->nqry_smpl;
362 args->gt_smpl = args->qry_smpl;
363 args->cross_check = 1;
364 }
365
366 // The data arrays
367 if ( !args->npairs ) args->npairs = args->cross_check ? args->nqry_smpl*(args->nqry_smpl+1)/2 : args->ngt_smpl*args->nqry_smpl;
368 if ( !args->pair_samples )
369 {
370 args->qry_dsg = (uint8_t*) malloc(args->nqry_smpl);
371 args->gt_dsg = args->cross_check ? args->qry_dsg : (uint8_t*) malloc(args->ngt_smpl);
372 }
373 if ( args->use_PLs )
374 {
375 args->pdiff = (double*) calloc(args->npairs,sizeof(*args->pdiff)); // log probability of pair samples being the same
376 args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob));
377 args->gt_prob = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
378
379 // dsg2prob: the first index is bitmask of 8 possible dsg combinations (only 1<<0,1<<2,1<<3 are set, accessing
380 // anything else indicated an error, this is just to reuse gt_to_dsg()); the second index are the corresponding
381 // probabilities of 0/0, 0/1, and 1/1 genotypes
382 for (i=0; i<8; i++)
383 for (j=0; j<3; j++)
384 args->dsg2prob[i][j] = HUGE_VAL;
385 args->dsg2prob[1][0] = -log(1-pow(10,-0.1*args->use_PLs));
386 args->dsg2prob[1][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
387 args->dsg2prob[1][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
388 args->dsg2prob[2][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
389 args->dsg2prob[2][1] = -log(1-pow(10,-0.1*args->use_PLs));
390 args->dsg2prob[2][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
391 args->dsg2prob[4][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
392 args->dsg2prob[4][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
393 args->dsg2prob[4][2] = -log(1-pow(10,-0.1*args->use_PLs));
394
395 // lookup table to avoid exponentiation
396 for (i=0; i<256; i++) args->pl2prob[i] = pow(10,-0.1*i);
397 }
398 else
399 args->ndiff = (uint32_t*) calloc(args->npairs,sizeof(*args->ndiff)); // number of differing genotypes for each pair of samples
400 args->ncnt = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt)); // number of comparisons performed (non-missing data)
401 if ( !args->ncnt ) error("Error: failed to allocate %.1f Mb\n", args->npairs*sizeof(*args->ncnt)/1e6);
402 if ( args->calc_hwe_prob )
403 {
404 // prob of the observed sequence of matches given site AFs and HWE
405 args->hwe_prob = (double*) calloc(args->npairs,sizeof(*args->hwe_prob));
406 if ( !args->hwe_prob ) error("Error: failed to allocate %.1f Mb. Run with --no-HWE-prob to save some memory.\n", args->npairs*sizeof(*args->hwe_prob)/1e6);
407 }
408
409 if ( args->distinctive_sites ) diff_sites_init(args);
410
411 args->fp = stdout;
412 print_header(args, args->fp);
413 }
414
destroy_data(args_t * args)415 static void destroy_data(args_t *args)
416 {
417 if ( args->gt_dsg!=args->qry_dsg ) free(args->gt_dsg);
418 free(args->qry_dsg);
419 if ( args->gt_prob!=args->qry_prob ) free(args->gt_prob);
420 free(args->qry_prob);
421 free(args->es_max_mem);
422 fclose(args->fp);
423 if ( args->distinctive_sites ) diff_sites_destroy(args);
424 free(args->hwe_prob);
425 free(args->cwd);
426 free(args->qry_arr);
427 if ( args->gt_hdr ) free(args->gt_arr);
428 free(args->pdiff);
429 free(args->ndiff);
430 free(args->ncnt);
431 free(args->qry_smpl);
432 if ( args->gt_smpl!=args->qry_smpl ) free(args->gt_smpl);
433 free(args->pairs);
434 bcf_sr_destroy(args->files);
435 }
436
gt_to_dsg(int32_t * ptr)437 static inline uint8_t gt_to_dsg(int32_t *ptr)
438 {
439 if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end ) return 0;
440 uint8_t dsg = (bcf_gt_allele(ptr[0])?1:0) + (bcf_gt_allele(ptr[1])?1:0);
441 return 1<<dsg;
442 }
pl_to_dsg(int32_t * ptr)443 static inline uint8_t pl_to_dsg(int32_t *ptr)
444 {
445 if ( ptr[0]==bcf_int32_missing || ptr[1]==bcf_int32_missing || ptr[2]==bcf_int32_missing ) return 0;
446 if ( ptr[1]==bcf_int32_vector_end || ptr[2]==bcf_int32_vector_end ) return 0;
447 int min_pl = ptr[0]<ptr[1] ? (ptr[0]<ptr[2]?ptr[0]:ptr[2]) : (ptr[1]<ptr[2]?ptr[1]:ptr[2]);
448 uint8_t dsg = 0;
449 if ( ptr[0]==min_pl ) dsg |= 1;
450 if ( ptr[1]==min_pl ) dsg |= 2;
451 if ( ptr[2]==min_pl ) dsg |= 4;
452 return dsg;
453 }
gt_to_prob(args_t * args,int32_t * ptr,double * prob)454 static inline uint8_t gt_to_prob(args_t *args, int32_t *ptr, double *prob)
455 {
456 uint8_t dsg = gt_to_dsg(ptr);
457 if ( dsg )
458 {
459 prob[0] = args->dsg2prob[dsg][0];
460 prob[1] = args->dsg2prob[dsg][1];
461 prob[2] = args->dsg2prob[dsg][2];
462 }
463 return dsg;
464 }
pl_to_prob(args_t * args,int32_t * ptr,double * prob)465 static inline uint8_t pl_to_prob(args_t *args, int32_t *ptr, double *prob)
466 {
467 uint8_t dsg = pl_to_dsg(ptr);
468 if ( dsg )
469 {
470 prob[0] = (ptr[0]>=0 && ptr[0]<255) ? args->pl2prob[ptr[0]] : args->pl2prob[255];
471 prob[1] = (ptr[1]>=0 && ptr[1]<255) ? args->pl2prob[ptr[1]] : args->pl2prob[255];
472 prob[2] = (ptr[2]>=0 && ptr[2]<255) ? args->pl2prob[ptr[2]] : args->pl2prob[255];
473 double sum = prob[0] + prob[1] + prob[2];
474 prob[0] /= sum;
475 prob[1] /= sum;
476 prob[2] /= sum;
477 prob[0] = -log(prob[0]);
478 prob[1] = -log(prob[1]);
479 prob[2] = -log(prob[2]);
480 }
481 return dsg;
482 }
set_data(args_t * args,bcf_hdr_t * hdr,bcf1_t * rec,int32_t ** arr,int32_t * narr,int * narr1,int * use_GT)483 static int set_data(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec, int32_t **arr, int32_t *narr, int *narr1, int *use_GT)
484 {
485 static int warn_dip_GT = 1;
486 static int warn_dip_PL = 1;
487 int i;
488 for (i=0; i<2; i++)
489 {
490 if ( *use_GT )
491 {
492 int ret = bcf_get_genotypes(hdr,rec,arr,narr);
493 if ( ret < 0 )
494 {
495 if ( !i ) { *use_GT = 0; continue; }
496 args->nskip_no_data++;
497 return -1;
498 }
499 if ( ret != 2*bcf_hdr_nsamples(hdr) )
500 {
501 if ( warn_dip_GT )
502 {
503 fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/GT fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1);
504 warn_dip_GT = 0;
505 }
506 args->nskip_dip_GT++;
507 return -1;
508 }
509 *narr1 = 2;
510 return 0;
511 }
512
513 int ret = bcf_get_format_int32(hdr,rec,"PL",arr,narr);
514 if ( ret < 0 )
515 {
516 if ( !i ) { *use_GT = 1; continue; }
517 args->nskip_no_data++;
518 return -1;
519 }
520 if ( ret != 3*bcf_hdr_nsamples(hdr) )
521 {
522 if ( warn_dip_PL )
523 {
524 fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/PL fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1);
525 warn_dip_PL = 0;
526 }
527 args->nskip_dip_PL++;
528 return -1;
529 }
530 *narr1 = 3;
531 return 0;
532 }
533 return -1; // should never reach
534 }
process_line(args_t * args)535 static void process_line(args_t *args)
536 {
537 int i,j,k, nqry1, ngt1, ret;
538
539 bcf1_t *gt_rec = NULL, *qry_rec = bcf_sr_get_line(args->files,0); // the query file
540 int qry_use_GT = args->qry_use_GT;
541 int gt_use_GT = args->gt_use_GT;
542
543 ret = set_data(args, args->qry_hdr, qry_rec, &args->qry_arr, &args->nqry_arr, &nqry1, &qry_use_GT);
544 if ( ret<0 ) return;
545
546 if ( args->gt_hdr )
547 {
548 gt_rec = bcf_sr_get_line(args->files,1);
549 ret = set_data(args, args->gt_hdr, gt_rec, &args->gt_arr, &args->ngt_arr, &ngt1, >_use_GT);
550 if ( ret<0 ) return;
551 }
552 else
553 {
554 ngt1 = nqry1;
555 args->gt_arr = args->qry_arr;
556 }
557
558 args->ncmp++;
559
560 double af,hwe_dsg[8];
561 if ( args->calc_hwe_prob )
562 {
563 int ac[2];
564 if ( args->gt_hdr )
565 {
566 if ( bcf_calc_ac(args->gt_hdr, gt_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
567 }
568 else if ( bcf_calc_ac(args->qry_hdr, qry_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
569
570 // hwe indexes correspond to the bitmask of eight dsg combinations to account for PL uncertainty
571 // for in the extreme case we can have uninformative PL=0,0,0. So the values are the minima of e.g.
572 // hwe[1,2,4] .. dsg=0,1,2
573 // hwe[3] .. dsg=0 or 1
574 // hwe[6] .. dsg=1 or 2
575
576 double hwe[3];
577 const double min_af = 1e-5; // cap the AF in case we get unrealistic values
578 af = (double)ac[1]/(ac[0]+ac[1]);
579 hwe[0] = af>min_af ? -log(af*af) : -log(min_af*min_af);
580 hwe[1] = af>min_af && af<1-min_af ? -log(2*af*(1-af)) : -log(2*min_af*(1-min_af));
581 hwe[2] = af<(1-min_af) ? -log((1-af)*(1-af)) : -log(min_af*min_af);
582 hwe_dsg[0] = 0;
583 for (i=1; i<8; i++)
584 {
585 hwe_dsg[i] = HUGE_VAL;
586 for (k=0; k<3; k++)
587 {
588 if ( ((1<<k)&i) && hwe_dsg[i] > hwe[k] ) hwe_dsg[i] = hwe[k];
589 }
590 }
591 }
592
593 // The sample pairs were given explicitly via -p/-P options
594 if ( args->pairs )
595 {
596 if ( !args->use_PLs )
597 {
598 int ndiff = 0;
599 if ( args->kbs_diff ) diff_sites_reset(args);
600
601 for (i=0; i<args->npairs; i++)
602 {
603 int32_t *ptr;
604 uint8_t qry_dsg, gt_dsg;
605
606 ptr = args->gt_arr + args->pairs[i].igt*ngt1;
607 gt_dsg = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
608 if ( !gt_dsg ) continue; // missing value
609 if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom
610
611 ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
612 qry_dsg = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
613 if ( !qry_dsg ) continue; // missing value
614
615 int match = qry_dsg & gt_dsg;
616 if ( !match )
617 {
618 args->ndiff[i]++;
619 if ( args->kbs_diff ) { ndiff++; kbs_insert(args->kbs_diff, i); }
620 }
621 else if ( args->calc_hwe_prob ) args->hwe_prob[i] += hwe_dsg[match];
622 args->ncnt[i]++;
623 }
624
625 if ( ndiff ) diff_sites_push(args, ndiff, qry_rec->rid, qry_rec->pos);
626 }
627 else // use_PLs set
628 {
629 for (i=0; i<args->npairs; i++)
630 {
631 int32_t *ptr;
632 double qry_prob[3], gt_prob[3];
633 uint8_t qry_dsg, gt_dsg;
634
635 ptr = args->gt_arr + args->pairs[i].igt*ngt1;
636 gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob);
637 if ( !gt_dsg ) continue; // missing value
638 if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom
639
640 ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
641 qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob);
642 if ( !qry_dsg ) continue; // missing value
643
644 double min = qry_prob[0] + gt_prob[0];
645 qry_prob[1] += gt_prob[1];
646 if ( min > qry_prob[1] ) min = qry_prob[1];
647 qry_prob[2] += gt_prob[2];
648 if ( min > qry_prob[2] ) min = qry_prob[2];
649 args->pdiff[i] += min;
650
651 if ( args->calc_hwe_prob )
652 {
653 int match = qry_dsg & gt_dsg;
654 args->hwe_prob[i] += hwe_dsg[match];
655 }
656 args->ncnt[i]++;
657 }
658 }
659 return;
660 }
661
662 int idx=0;
663 if ( !args->use_PLs )
664 {
665 for (i=0; i<args->nqry_smpl; i++)
666 {
667 int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
668 int32_t *ptr = args->qry_arr + nqry1*iqry;
669 args->qry_dsg[i] = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
670 }
671 if ( !args->cross_check ) // in this case gt_dsg points to qry_dsg
672 {
673 for (i=0; i<args->ngt_smpl; i++)
674 {
675 int igt = args->gt_smpl ? args->gt_smpl[i] : i;
676 int32_t *ptr = args->gt_arr + ngt1*igt;
677 args->gt_dsg[i] = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
678 if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0; // not a hom, set to a missing value
679 }
680 }
681 for (i=0; i<args->nqry_smpl; i++)
682 {
683 int ngt = args->cross_check ? i : args->ngt_smpl; // two files or a sub-diagonal cross-check mode?
684 if ( !args->qry_dsg[i] ) { idx += ngt; continue; } // missing value
685 for (j=0; j<ngt; j++)
686 {
687 if ( !args->gt_dsg[j] ) { idx++; continue; } // missing value
688 int match = args->qry_dsg[i] & args->gt_dsg[j];
689 if ( !match ) args->ndiff[idx]++;
690 else if ( args->calc_hwe_prob ) args->hwe_prob[idx] += hwe_dsg[match];
691 args->ncnt[idx]++;
692 idx++;
693 }
694 }
695 }
696 else // use_PLs set
697 {
698 for (i=0; i<args->nqry_smpl; i++)
699 {
700 int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
701 int32_t *ptr = args->qry_arr + nqry1*iqry;
702 args->qry_dsg[i] = qry_use_GT ? gt_to_prob(args,ptr,args->qry_prob+i*3) : pl_to_prob(args,ptr,args->qry_prob+i*3);
703 }
704 if ( !args->cross_check ) // in this case gt_dsg points to qry_dsg
705 {
706 for (i=0; i<args->ngt_smpl; i++)
707 {
708 int igt = args->gt_smpl ? args->gt_smpl[i] : i;
709 int32_t *ptr = args->gt_arr + ngt1*igt;
710 args->gt_dsg[i] = gt_use_GT ? gt_to_prob(args,ptr,args->gt_prob+i*3) : pl_to_prob(args,ptr,args->gt_prob+i*3);
711 if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0; // not a hom, set to a missing value
712 }
713 }
714 for (i=0; i<args->nqry_smpl; i++)
715 {
716 int ngt = args->cross_check ? i : args->ngt_smpl; // two files or a sub-diagonal cross-check mode?
717 if ( !args->qry_dsg[i] ) { idx += ngt; continue; } // missing value
718 for (j=0; j<ngt; j++)
719 {
720 if ( !args->gt_dsg[j] ) { idx++; continue; } // missing value
721
722 double min = args->qry_prob[i*3] + args->gt_prob[j*3];
723 if ( min > args->qry_prob[i*3+1] + args->gt_prob[j*3+1] ) min = args->qry_prob[i*3+1] + args->gt_prob[j*3+1];
724 if ( min > args->qry_prob[i*3+2] + args->gt_prob[j*3+2] ) min = args->qry_prob[i*3+2] + args->gt_prob[j*3+2];
725 args->pdiff[idx] += min;
726
727 if ( args->calc_hwe_prob )
728 {
729 int match = args->qry_dsg[i] & args->gt_dsg[j];
730 args->hwe_prob[idx] += hwe_dsg[match];
731 }
732 args->ncnt[idx]++;
733 idx++;
734 }
735 }
736 }
737 }
738
739
740 typedef struct
741 {
742 int ism, idx;
743 double val;
744 }
745 idbl_t;
cmp_idbl(const void * _a,const void * _b)746 static int cmp_idbl(const void *_a, const void *_b)
747 {
748 idbl_t *a = (idbl_t*)_a;
749 idbl_t *b = (idbl_t*)_b;
750 if ( a->val < b->val ) return -1;
751 if ( a->val > b->val ) return 1;
752 return 0;
753 }
report_distinctive_sites(args_t * args)754 static void report_distinctive_sites(args_t *args)
755 {
756 extsort_sort(args->es);
757
758 fprintf(args->fp,"# DS, distinctive sites:\n");
759 fprintf(args->fp,"# - chromosome\n");
760 fprintf(args->fp,"# - position\n");
761 fprintf(args->fp,"# - cumulative number of pairs distinguished by this block\n");
762 fprintf(args->fp,"# - block id\n");
763 fprintf(args->fp,"#DS\t[2]Chromosome\t[3]Position\t[4]Cumulative number of distinct pairs\t[5]Block id\n");
764
765 kbitset_t *kbs_blk = kbs_init(args->npairs);
766 kbitset_iter_t itr;
767 int i,ndiff,rid,pos,ndiff_tot = 0, iblock = 0;
768 int ndiff_min = args->distinctive_sites <= args->npairs ? args->distinctive_sites : args->npairs;
769 while ( diff_sites_shift(args,&ndiff,&rid,&pos) )
770 {
771 int ndiff_new = 0, ndiff_dbg = 0;
772 kbs_start(&itr);
773 while ( (i=kbs_next(args->kbs_diff, &itr))>=0 )
774 {
775 ndiff_dbg++;
776 if ( kbs_exists(kbs_blk,i) ) continue; // already set
777 kbs_insert(kbs_blk,i);
778 ndiff_new++;
779 }
780 if ( ndiff_dbg!=ndiff ) error("Corrupted data, fixme: %d vs %d\n",ndiff_dbg,ndiff);
781 if ( !ndiff_new ) continue; // no new pair distinguished by this site
782 ndiff_tot += ndiff_new;
783 fprintf(args->fp,"DS\t%s\t%d\t%d\t%d\n",bcf_hdr_id2name(args->qry_hdr,rid),pos+1,ndiff_tot,iblock);
784 if ( ndiff_tot < ndiff_min ) continue; // fewer than the requested number of pairs can be distinguished at this point
785 iblock++;
786 ndiff_tot = 0;
787 kbs_clear(kbs_blk);
788 }
789 kbs_destroy(kbs_blk);
790 }
report(args_t * args)791 static void report(args_t *args)
792 {
793 fprintf(args->fp,"INFO\tsites-compared\t%u\n",args->ncmp);
794 fprintf(args->fp,"INFO\tsites-skipped-no-match\t%u\n",args->nskip_no_match);
795 fprintf(args->fp,"INFO\tsites-skipped-multiallelic\t%u\n",args->nskip_not_ba);
796 fprintf(args->fp,"INFO\tsites-skipped-monoallelic\t%u\n",args->nskip_mono);
797 fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
798 fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
799 fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
800 fprintf(args->fp,"# DC, discordance:\n");
801 fprintf(args->fp,"# - query sample\n");
802 fprintf(args->fp,"# - genotyped sample\n");
803 fprintf(args->fp,"# - discordance (number of mismatches; smaller is better)\n");
804 fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n");
805 fprintf(args->fp,"# - number of sites compared (bigger is better)\n");
806 fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
807
808 int trim = args->ntop;
809 if ( !args->pairs )
810 {
811 if ( !args->ngt_smpl && args->nqry_smpl <= args->ntop ) trim = 0;
812 if ( args->ngt_smpl && args->ngt_smpl <= args->ntop ) trim = 0;
813 }
814
815 if ( args->pairs )
816 {
817 int i;
818 for (i=0; i<args->npairs; i++)
819 {
820 int iqry = args->pairs[i].iqry;
821 int igt = args->pairs[i].igt;
822 if ( args->ndiff )
823 {
824 fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
825 args->qry_hdr->samples[iqry],
826 args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
827 args->ndiff[i],
828 args->calc_hwe_prob ? args->hwe_prob[i] : 0,
829 args->ncnt[i]);
830 }
831 else
832 {
833 fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
834 args->qry_hdr->samples[iqry],
835 args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
836 args->pdiff[i],
837 args->calc_hwe_prob ? args->hwe_prob[i] : 0,
838 args->ncnt[i]);
839 }
840 }
841 }
842 else if ( !trim )
843 {
844 int i,j,idx=0;
845 for (i=0; i<args->nqry_smpl; i++)
846 {
847 int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
848 int ngt = args->cross_check ? i : args->ngt_smpl;
849 for (j=0; j<ngt; j++)
850 {
851 int igt = args->gt_smpl ? args->gt_smpl[j] : j;
852 if ( args->ndiff )
853 {
854 fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
855 args->qry_hdr->samples[iqry],
856 args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
857 args->ndiff[idx],
858 args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
859 args->ncnt[idx]);
860 }
861 else
862 {
863 fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
864 args->qry_hdr->samples[iqry],
865 args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
866 args->pdiff[idx],
867 args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
868 args->ncnt[idx]);
869 }
870 idx++;
871 }
872 }
873 }
874 else if ( !args->cross_check )
875 {
876 idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*args->ngt_smpl);
877 int i,j;
878 for (i=0; i<args->nqry_smpl; i++)
879 {
880 int idx = i*args->ngt_smpl;
881 for (j=0; j<args->ngt_smpl; j++)
882 {
883 if ( args->sort_by_hwe )
884 arr[j].val = -args->hwe_prob[idx];
885 else if ( args->ndiff )
886 arr[j].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
887 else
888 arr[j].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
889 arr[j].ism = j;
890 arr[j].idx = idx;
891 idx++;
892 }
893 qsort(arr, args->ngt_smpl, sizeof(*arr), cmp_idbl);
894 int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
895 for (j=0; j<args->ntop; j++)
896 {
897 int idx = arr[j].idx;
898 int igt = args->gt_smpl ? args->gt_smpl[arr[j].ism] : arr[j].ism;
899 if ( args->ndiff )
900 {
901 fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
902 args->qry_hdr->samples[iqry],
903 args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
904 args->ndiff[idx],
905 args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
906 args->ncnt[idx]);
907 }
908 else
909 {
910 fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
911 args->qry_hdr->samples[iqry],
912 args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
913 args->pdiff[idx],
914 args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
915 args->ncnt[idx]);
916 }
917 }
918 }
919 free(arr);
920 }
921 else
922 {
923 int narr = args->nqry_smpl-1;
924 idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*narr);
925 int i,j,k,idx;
926 for (i=0; i<args->nqry_smpl; i++)
927 {
928 k = 0, idx = i*(i-1)/2;
929 for (j=0; j<i; j++)
930 {
931 if ( args->sort_by_hwe )
932 arr[k].val = -args->hwe_prob[idx];
933 else if ( args->ndiff )
934 arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
935 else
936 arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
937 arr[k].ism = j;
938 arr[k].idx = idx;
939 idx++;
940 k++;
941 }
942 for (; j<narr; j++)
943 {
944 idx = j*(j+1)/2 + i;
945 if ( args->sort_by_hwe )
946 arr[k].val = -args->hwe_prob[idx];
947 else if ( args->ndiff )
948 arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
949 else
950 arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
951 arr[k].ism = j + 1;
952 arr[k].idx = idx;
953 k++;
954 }
955 qsort(arr, narr, sizeof(*arr), cmp_idbl);
956 int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
957 for (j=0; j<args->ntop; j++)
958 {
959 if ( i <= arr[j].ism ) continue;
960 int idx = arr[j].idx;
961 int igt = args->qry_smpl ? args->qry_smpl[arr[j].ism] : arr[j].ism;
962 if ( args->ndiff )
963 {
964 fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
965 args->qry_hdr->samples[iqry],
966 args->qry_hdr->samples[igt],
967 args->ndiff[idx],
968 args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
969 args->ncnt[idx]);
970 }
971 else
972 {
973 fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
974 args->qry_hdr->samples[iqry],
975 args->qry_hdr->samples[igt],
976 args->pdiff[idx],
977 args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
978 args->ncnt[idx]);
979 }
980 }
981 }
982 free(arr);
983 }
984 }
985
is_input_okay(args_t * args,int nmatch)986 static int is_input_okay(args_t *args, int nmatch)
987 {
988 int i;
989 const char *msg;
990 bcf_hdr_t *hdr;
991 bcf1_t *rec;
992 if ( args->gt_hdr && nmatch!=2 )
993 {
994 if ( args->nskip_no_match++ ) return 0;
995 for (i=0; i<2; i++)
996 {
997 rec = bcf_sr_get_line(args->files,i);
998 if ( rec ) break;
999 }
1000 hdr = bcf_sr_get_header(args->files,i);
1001 fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", no record with matching POS+ALT. (This is printed only once.)\n",
1002 bcf_seqname(hdr,rec),rec->pos+1);
1003 return 0;
1004 }
1005 for (i=0; i<2; i++)
1006 {
1007 hdr = bcf_sr_get_header(args->files,i);
1008 rec = bcf_sr_get_line(args->files,i);
1009 if ( rec->n_allele>2 )
1010 {
1011 if ( args->nskip_not_ba++ ) return 0;
1012 msg = "not a biallelic site, run `bcftools norm -m -` first";
1013 goto not_okay;
1014 }
1015 if ( bcf_get_variant_types(rec)==VCF_REF )
1016 {
1017 if ( args->nskip_mono++ ) return 0;
1018 msg = "monoallelic site";
1019 goto not_okay;
1020 }
1021 if ( !args->gt_hdr ) break;
1022 }
1023 return 1;
1024
1025 not_okay:
1026 fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n",
1027 bcf_seqname(hdr,rec),rec->pos+1,msg);
1028 return 0;
1029 }
1030
usage(void)1031 static void usage(void)
1032 {
1033 fprintf(stderr, "\n");
1034 fprintf(stderr, "About: Check sample identity. With no -g BCF given, multi-sample cross-check is performed.\n");
1035 fprintf(stderr, "Usage: bcftools gtcheck [options] [-g <genotypes.vcf.gz>] <query.vcf.gz>\n");
1036 fprintf(stderr, "\n");
1037 fprintf(stderr, "Options:\n");
1038 //fprintf(stderr, " -a, --all-sites Output comparison for all sites\n");
1039 //fprintf(stderr, " -c, --cluster MIN,MAX Min inter- and max intra-sample error [0.23,-0.3]\n");
1040 fprintf(stderr, " --distinctive-sites Find sites that can distinguish between at least NUM sample pairs.\n");
1041 fprintf(stderr, " NUM[,MEM[,TMP]] If the number is smaller or equal to 1, it is interpreted as the fraction of pairs.\n");
1042 fprintf(stderr, " The optional MEM string sets the maximum memory used for in-memory sorting [500M]\n");
1043 #ifdef _WIN32
1044 fprintf(stderr, " and TMP is a prefix of temporary files used by external sorting [/bcftools.XXXXXX]\n");
1045 #else
1046 fprintf(stderr, " and TMP is a prefix of temporary files used by external sorting [/tmp/bcftools.XXXXXX]\n");
1047 #endif
1048 fprintf(stderr, " --dry-run Stop after first record to estimate required time\n");
1049 fprintf(stderr, " -e, --error-probability INT Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n");
1050 fprintf(stderr, " -g, --genotypes FILE Genotypes to compare against\n");
1051 fprintf(stderr, " -H, --homs-only Homozygous genotypes only, useful with low coverage data (requires -g)\n");
1052 fprintf(stderr, " --n-matches INT Print only top INT matches for each sample (sorted by average score), 0 for unlimited.\n");
1053 fprintf(stderr, " Use negative value to sort by HWE probability rather than by discordance [0]\n");
1054 fprintf(stderr, " --no-HWE-prob Disable calculation of HWE probability\n");
1055 fprintf(stderr, " -p, --pairs LIST Comma-separated sample pairs to compare (qry,gt[,qry,gt..] with -g or qry,qry[,qry,qry..] w/o)\n");
1056 fprintf(stderr, " -P, --pairs-file FILE File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n");
1057 fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
1058 fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
1059 fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
1060 fprintf(stderr, " -s, --samples [qry|gt]:LIST List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n");
1061 fprintf(stderr, " -S, --samples-file [qry|gt]:FILE File with the query or -g samples to compare\n");
1062 fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
1063 fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
1064 fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
1065 fprintf(stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n");
1066 fprintf(stderr, "Examples:\n");
1067 fprintf(stderr, " # Check discordance of all samples from B against all sample in A\n");
1068 fprintf(stderr, " bcftools gtcheck -g A.bcf B.bcf\n");
1069 fprintf(stderr, "\n");
1070 fprintf(stderr, " # Limit comparisons to the fiven list of samples\n");
1071 fprintf(stderr, " bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf\n");
1072 fprintf(stderr, "\n");
1073 fprintf(stderr, " # Compare only two pairs a1,b1 and a1,b2\n");
1074 fprintf(stderr, " bcftools gtcheck -p a1,b1,a1,b2 -g A.bcf B.bcf\n");
1075 fprintf(stderr, "\n");
1076 exit(1);
1077 }
1078
main_vcfgtcheck(int argc,char * argv[])1079 int main_vcfgtcheck(int argc, char *argv[])
1080 {
1081 int c;
1082 args_t *args = (args_t*) calloc(1,sizeof(args_t));
1083 args->argc = argc; args->argv = argv; set_cwd(args);
1084 args->qry_use_GT = -1;
1085 args->gt_use_GT = -1;
1086 args->calc_hwe_prob = 1;
1087 args->use_PLs = 40;
1088 args->regions_overlap = 1;
1089 args->targets_overlap = 0;
1090
1091 // external sort for --distinctive-sites
1092 #ifdef _WIN32
1093 args->es_tmp_prefix = NULL;
1094 #else
1095 args->es_tmp_prefix = "/tmp/bcftools-gtcheck";
1096 #endif
1097 args->es_max_mem = strdup("500M");
1098
1099 // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
1100 // - min_inter: pairs with smaller err value will be considered identical
1101 // - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered
1102 // different. If negative, the cutoff may be heuristically lowered
1103 args->min_inter_err = 0.23;
1104 args->max_intra_err = -0.3;
1105
1106 static struct option loptions[] =
1107 {
1108 {"error-probability",1,0,'e'},
1109 {"use",1,0,'u'},
1110 {"cluster",1,0,'c'},
1111 {"GTs-only",1,0,'G'},
1112 {"all-sites",0,0,'a'},
1113 {"homs-only",0,0,'H'},
1114 {"help",0,0,'h'},
1115 {"genotypes",1,0,'g'},
1116 {"plot",1,0,'p'},
1117 {"samples",1,0,'s'},
1118 {"samples-file",1,0,'S'},
1119 {"n-matches",1,0,2},
1120 {"no-HWE-prob",0,0,3},
1121 {"target-sample",1,0,4},
1122 {"dry-run",0,0,5},
1123 {"distinctive-sites",1,0,6},
1124 {"regions",1,0,'r'},
1125 {"regions-file",1,0,'R'},
1126 {"regions-overlap",required_argument,NULL,7},
1127 {"targets",1,0,'t'},
1128 {"targets-file",1,0,'T'},
1129 {"targets-overlap",required_argument,NULL,8},
1130 {"pairs",1,0,'p'},
1131 {"pairs-file",1,0,'P'},
1132 {0,0,0,0}
1133 };
1134 char *tmp;
1135 while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:",loptions,NULL)) >= 0) {
1136 switch (c) {
1137 case 'e':
1138 args->use_PLs = strtol(optarg,&tmp,10);
1139 if ( !tmp || *tmp ) error("Could not parse: --error-probability %s\n", optarg);
1140 break;
1141 case 'u':
1142 {
1143 int i,nlist;
1144 char **list = hts_readlist(optarg, 0, &nlist);
1145 if ( !list || nlist<=0 || nlist>2 ) error("Failed to parse --use %s\n", optarg);
1146 if ( !strcasecmp("GT",list[0]) ) args->qry_use_GT = 1;
1147 else if ( !strcasecmp("PL",list[0]) ) args->qry_use_GT = 0;
1148 else error("Failed to parse --use %s; only GT and PL are supported\n", optarg);
1149 if ( nlist==2 )
1150 {
1151 if ( !strcasecmp("GT",list[1]) ) args->gt_use_GT = 1;
1152 else if ( !strcasecmp("PL",list[1]) ) args->gt_use_GT = 0;
1153 else error("Failed to parse --use %s; only GT and PL are supported\n", optarg);
1154 }
1155 else args->gt_use_GT = args->qry_use_GT;
1156 for (i=0; i<nlist; i++) free(list[i]);
1157 free(list);
1158 }
1159 break;
1160 case 2 :
1161 args->ntop = strtol(optarg,&tmp,10);
1162 if ( !tmp || *tmp ) error("Could not parse: --n-matches %s\n", optarg);
1163 if ( args->ntop < 0 )
1164 {
1165 args->sort_by_hwe = 1;
1166 args->ntop *= -1;
1167 }
1168 break;
1169 case 3 : args->calc_hwe_prob = 0; break;
1170 case 4 : error("The option -S, --target-sample has been deprecated\n"); break;
1171 case 5 : args->dry_run = 1; break;
1172 case 6 :
1173 args->distinctive_sites = strtod(optarg,&tmp);
1174 if ( *tmp )
1175 {
1176 if ( *tmp!=',' ) error("Could not parse: --distinctive-sites %s\n", optarg);
1177 tmp++;
1178 free(args->es_max_mem);
1179 args->es_max_mem = strdup(tmp);
1180 while ( *tmp && *tmp!=',' ) tmp++;
1181 if ( *tmp ) { *tmp = 0; args->es_tmp_prefix = tmp+1; }
1182 }
1183 args->use_PLs = 0;
1184 break;
1185 case 'c':
1186 error("The -c option is to be implemented, please open an issue on github\n");
1187 args->min_inter_err = strtod(optarg,&tmp);
1188 if ( *tmp )
1189 {
1190 if ( *tmp!=',') error("Could not parse: -c %s\n", optarg);
1191 args->max_intra_err = strtod(tmp+1,&tmp);
1192 if ( *tmp ) error("Could not parse: -c %s\n", optarg);
1193 }
1194 break;
1195 case 'G': error("The option -G, --GTs-only has been deprecated\n"); break;
1196 case 'a': args->all_sites = 1; error("The -a option is to be implemented, please open an issue on github\n"); break;
1197 case 'H': args->hom_only = 1; break;
1198 case 'g': args->gt_fname = optarg; break;
1199 // case 'p': args->plot = optarg; break;
1200 case 's':
1201 if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3;
1202 else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4;
1203 else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
1204 break;
1205 case 'S':
1206 if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1;
1207 else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1;
1208 else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
1209 break;
1210 case 'p': args->pair_samples = optarg; break;
1211 case 'P': args->pair_samples = optarg; args->pair_samples_is_file = 1; break;
1212 case 'r': args->regions = optarg; break;
1213 case 'R': args->regions = optarg; args->regions_is_file = 1; break;
1214 case 't': args->targets = optarg; break;
1215 case 'T': args->targets = optarg; args->targets_is_file = 1; break;
1216 case 7 :
1217 if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
1218 else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
1219 else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
1220 else error("Could not parse: --regions-overlap %s\n",optarg);
1221 break;
1222 case 8 :
1223 if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
1224 else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
1225 else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
1226 else error("Could not parse: --targets-overlap %s\n",optarg);
1227 break;
1228 case 'h':
1229 case '?': usage(); break;
1230 default: error("Unknown argument: %s\n", optarg);
1231 }
1232 }
1233 if ( optind==argc )
1234 {
1235 if ( !isatty(fileno((FILE *)stdin)) ) args->qry_fname = "-"; // reading from stdin
1236 else usage(); // no files given
1237 }
1238 else args->qry_fname = argv[optind];
1239 if ( argc>optind+1 ) error("Error: too many files given, run with -h for help\n"); // too many files given
1240 if ( args->pair_samples )
1241 {
1242 if ( args->gt_samples || args->qry_samples ) error("The -p/-P option cannot be combined with -s/-S\n");
1243 if ( args->ntop ) error("The --n-matches option cannot be combined with -p/-P\n");
1244 }
1245 if ( args->distinctive_sites && !args->pair_samples ) error("The experimental option --distinctive-sites requires -p/-P\n");
1246 if ( args->hom_only && !args->gt_fname ) error("The option --homs-only requires --genotypes\n");
1247 if ( args->distinctive_sites && args->use_PLs ) error("The option --distinctive-sites cannot be combined with --error-probability\n");
1248
1249 init_data(args);
1250
1251 int ret;
1252 while ( (ret=bcf_sr_next_line(args->files)) )
1253 {
1254 if ( !is_input_okay(args,ret) ) continue;
1255
1256 // time one record to give the user an estimate with very big files
1257 struct timeval t0, t1;
1258 if ( !args->ncmp ) gettimeofday(&t0, NULL);
1259
1260 process_line(args);
1261
1262 if ( args->ncmp==1 )
1263 {
1264 gettimeofday(&t1, NULL);
1265 double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec);
1266 fprintf(stderr,"INFO:\tTime required to process one record .. %f seconds\n",delta/1e6);
1267 fprintf(args->fp,"INFO\tTime required to process one record .. %f seconds\n",delta/1e6);
1268 if ( args->dry_run ) break;
1269 }
1270 }
1271 if ( !args->dry_run )
1272 {
1273 report(args);
1274 if ( args->distinctive_sites ) report_distinctive_sites(args);
1275 }
1276
1277 destroy_data(args);
1278 free(args);
1279 return 0;
1280 }
1281
1282