1 /* The MIT License
2
3 Copyright (c) 2018-2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.
24
25 */
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <getopt.h>
30 #include <strings.h>
31 #include <errno.h>
32 #include <unistd.h> // for isatty
33 #include <inttypes.h>
34 #include <htslib/hts.h>
35 #include <htslib/vcf.h>
36 #include <htslib/kstring.h>
37 #include <htslib/kseq.h>
38 #include <htslib/kfunc.h>
39 #include <htslib/synced_bcf_reader.h>
40 #include "bcftools.h"
41 #include "filter.h"
42
43
44 // Logic of the filters: include or exclude sites which match the filters?
45 #define FLT_INCLUDE 1
46 #define FLT_EXCLUDE 2
47
48 #define PRINT_PASSOC (1<<0)
49 #define PRINT_FASSOC (1<<1)
50 #define PRINT_NASSOC (1<<2)
51 #define PRINT_NOVELAL (1<<3)
52 #define PRINT_NOVELGT (1<<4)
53
54 typedef struct
55 {
56 int argc, filter_logic, regions_is_file, targets_is_file, output_type, force_samples, clevel;
57 int regions_overlap, targets_overlap;
58 uint32_t annots;
59 char **argv, *output_fname, *fname, *regions, *targets, *filter_str, *annots_str;
60 char *control_samples_str, *case_samples_str, *max_AC_str;
61 int *control_smpl, *case_smpl, ncontrol_smpl, ncase_smpl;
62 filter_t *filter;
63 bcf_srs_t *sr;
64 bcf_hdr_t *hdr, *hdr_out;
65 htsFile *out_fh;
66 int32_t *gts;
67 int mgts;
68 uint32_t *control_gts;
69 int ncontrol_gts, mcontrol_gts, ntotal, nskipped, ntested, ncase_al, ncase_gt;
70 kstring_t case_als_smpl, case_gts_smpl;
71 int max_AC, nals[4]; // nals: number of control-ref, control-alt, case-ref and case-alt alleles in the region
72 }
73 args_t;
74
75 args_t args;
76
about(void)77 const char *about(void)
78 {
79 return "Find novel alleles and genotypes in two groups of samples.\n";
80 }
81
usage_text(void)82 static const char *usage_text(void)
83 {
84 return
85 "\n"
86 "About: Runs a basic association test, per-site or in a region, and checks for novel alleles and\n"
87 " genotypes in two groups of samples. Adds the following INFO annotations:\n"
88 " - PASSOC .. Fisher's exact test probability of genotypic association (REF vs non-REF allele)\n"
89 " - FASSOC .. proportion of non-REF allele in controls and cases\n"
90 " - NASSOC .. number of control-ref, control-alt, case-ref and case-alt alleles\n"
91 " - NOVELAL .. lists samples with a novel allele not observed in the control group\n"
92 " - NOVELGT .. lists samples with a novel genotype not observed in the control group\n"
93 "Usage: bcftools +contrast [Plugin Options]\n"
94 "Plugin options:\n"
95 " -a, --annots LIST List of annotations to output [PASSOC,FASSOC,NOVELAL]\n"
96 " -0, --control-samples LIST|FILE File or comma-separated list of control (background) samples\n"
97 " -1, --case-samples LIST|FILE File or comma-separated list of samples where novel allele or genotype is expected\n"
98 " -e, --exclude EXPR Exclude sites and samples for which the expression is true\n"
99 " -f, --max-allele-freq NUM Calculate enrichment of rare alleles. Floating point numbers between 0 and 1 are\n"
100 " interpreted as ALT allele frequencies, integers as ALT allele counts\n"
101 " --force-samples Continue even if some samples listed in the -0,-1 files are missing from the VCF\n"
102 " -i, --include EXPR Include sites and samples for which the expression is true\n"
103 " -o, --output FILE Output file name [stdout]\n"
104 " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"
105 " -r, --regions REG Restrict to comma-separated list of regions\n"
106 " -R, --regions-file FILE Restrict to regions listed in a file\n"
107 " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"
108 " -t, --targets REG Similar to -r but streams rather than index-jumps\n"
109 " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
110 " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
111 "\n"
112 "Example:\n"
113 " # Test if any of the samples a,b is different from the samples c,d,e\n"
114 " bcftools +contrast -0 c,d,e -1 a,b file.bcf\n"
115 "\n"
116 " # Same as above, but read samples from a file. In case of a name collision, the sample name\n"
117 " # has precedence: the existence of a file with a list of samples is not checked unless no such\n"
118 " # sample exists in the VCF. Use a full path (e.g. \"./string\" instead of \"string\") to avoid\n"
119 " # name clashes\n"
120 " bcftools +contrast -0 samples0.txt -1 samples1.txt file.bcf\n"
121 "\n"
122 " # The same as above but checks for enrichment of rare alleles, AF<0.001 in this example, in a region\n"
123 " bcftools +contrast -r 20:1000-2000 -f 0.001 -0 samples0.txt -1 samples1.txt file.bcf\n"
124 "\n";
125 }
126
cmp_int(const void * a,const void * b)127 static int cmp_int(const void *a, const void *b)
128 {
129 if ( *((int*)a) < *((int*)b) ) return -1;
130 if ( *((int*)a) > *((int*)b) ) return -1;
131 return 0;
132 }
read_sample_list_or_file(bcf_hdr_t * hdr,const char * str,int ** smpl,int * nsmpl,int force_samples)133 static void read_sample_list_or_file(bcf_hdr_t *hdr, const char *str, int **smpl, int *nsmpl, int force_samples)
134 {
135 char **str_list = NULL;
136 int i,j, *list = NULL, nlist = 0, is_file, nskipped = 0;
137
138 for (is_file=0; is_file<=1; is_file++)
139 {
140 if ( str_list )
141 {
142 for (i=0; i<nlist; i++) free(str_list[i]);
143 free(str_list);
144 free(list);
145 str_list = NULL;
146 list = NULL;
147 nlist = 0;
148 }
149
150 str_list = hts_readlist(str, is_file, &nlist);
151 if ( !str_list )
152 {
153 if ( force_samples ) continue;
154 error("The sample \"%s\", is not present in the VCF\n", str);
155 }
156
157 list = (int*) malloc(sizeof(int)*nlist);
158 for (i=0,j=0; i<nlist; i++,j++)
159 {
160 list[j] = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, str_list[i]);
161 if ( list[j] >= 0 ) continue;
162 if ( is_file )
163 {
164 if ( !force_samples ) error("The sample \"%s\" is not present in the VCF. Use --force-samples to proceed anyway.\n", str_list[i]);
165 j--;
166 nskipped++;
167 continue;
168 }
169 break;
170 }
171 if ( i==nlist ) break;
172 }
173 for (i=0; i<nlist; i++) free(str_list[i]);
174 nlist -= nskipped;
175 if ( !nlist && !force_samples ) error("None of the samples are present in the VCF: %s\n", str);
176 if ( nskipped ) fprintf(stderr,"Warning: using %d sample%s, %d from %s %s not present in the VCF\n", nlist,nlist>1?"s":"",nskipped,str,nskipped>1?"are":"is");
177 free(str_list);
178 qsort(list,nlist,sizeof(*list),cmp_int);
179 *smpl = list;
180 *nsmpl = nlist;
181 }
182
init_data(args_t * args)183 static void init_data(args_t *args)
184 {
185 int ntmp, i;
186 char **tmp = hts_readlist(args->annots_str, 0, &ntmp);
187 for (i=0; i<ntmp; i++)
188 {
189 if ( !strcasecmp("PASSOC",tmp[i]) ) args->annots |= PRINT_PASSOC;
190 else if ( !strcasecmp("FASSOC",tmp[i]) ) args->annots |= PRINT_FASSOC;
191 else if ( !strcasecmp("NASSOC",tmp[i]) ) args->annots |= PRINT_NASSOC;
192 else if ( !strcasecmp("NOVELAL",tmp[i]) ) args->annots |= PRINT_NOVELAL;
193 else if ( !strcasecmp("NOVELGT",tmp[i]) ) args->annots |= PRINT_NOVELGT;
194 else error("The annotation is not recognised: %s\n", tmp[i]);
195 free(tmp[i]);
196 }
197 free(tmp);
198
199 args->sr = bcf_sr_init();
200 if ( args->regions )
201 {
202 args->sr->require_index = 1;
203 bcf_sr_set_opt(args->sr,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
204 if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions);
205 }
206 if ( args->targets )
207 {
208 bcf_sr_set_opt(args->sr,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
209 if ( bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets);
210 }
211 if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
212 args->hdr = bcf_sr_get_header(args->sr,0);
213 args->hdr_out = bcf_hdr_dup(args->hdr);
214 if ( args->annots & PRINT_PASSOC )
215 bcf_hdr_append(args->hdr_out, "##INFO=<ID=PASSOC,Number=1,Type=Float,Description=\"Fisher's exact test probability of genotypic association (REF vs non-REF allele)\">");
216 if ( args->annots & PRINT_FASSOC )
217 bcf_hdr_append(args->hdr_out, "##INFO=<ID=FASSOC,Number=2,Type=Float,Description=\"Proportion of non-REF allele in controls and cases\">");
218 if ( args->annots & PRINT_NASSOC )
219 bcf_hdr_append(args->hdr_out, "##INFO=<ID=NASSOC,Number=4,Type=Integer,Description=\"Number of control-ref, control-alt, case-ref and case-alt alleles\">");
220 if ( args->annots & PRINT_NOVELAL )
221 bcf_hdr_append(args->hdr_out, "##INFO=<ID=NOVELAL,Number=.,Type=String,Description=\"List of samples with novel alleles. Note that samples listed here are not listed in NOVELGT again.\">");
222 if ( args->annots & PRINT_NOVELGT )
223 bcf_hdr_append(args->hdr_out, "##INFO=<ID=NOVELGT,Number=.,Type=String,Description=\"List of samples with novel genotypes\">");
224
225 if ( args->filter_str )
226 args->filter = filter_init(args->hdr, args->filter_str);
227
228 read_sample_list_or_file(args->hdr, args->control_samples_str, &args->control_smpl, &args->ncontrol_smpl, args->force_samples);
229 read_sample_list_or_file(args->hdr, args->case_samples_str, &args->case_smpl, &args->ncase_smpl, args->force_samples);
230
231 char wmode[8];
232 set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
233 args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
234 if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
235 if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
236
237 if ( args->max_AC_str )
238 {
239 char *tmp;
240 args->max_AC = strtol(args->max_AC_str, &tmp, 10);
241 if ( tmp==args->max_AC_str || *tmp )
242 {
243 double val = strtod(args->max_AC_str, &tmp);
244 if ( tmp==args->max_AC_str || *tmp ) error("Could not parse the argument: -f, --max-allele-freq %s\n", args->max_AC_str);
245 if ( val<0 || val>1 ) error("Expected integer or float from the range [0,1]: -f, --max-allele-freq %s\n", args->max_AC_str);
246 args->max_AC = val * bcf_hdr_nsamples(args->hdr);
247 if ( !args->max_AC ) args->max_AC = 1;
248 }
249 }
250 }
destroy_data(args_t * args)251 static void destroy_data(args_t *args)
252 {
253 bcf_hdr_destroy(args->hdr_out);
254 if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
255 free(args->case_als_smpl.s);
256 free(args->case_gts_smpl.s);
257 free(args->gts);
258 free(args->control_gts);
259 free(args->control_smpl);
260 free(args->case_smpl);
261 if ( args->filter ) filter_destroy(args->filter);
262 bcf_sr_destroy(args->sr);
263 free(args);
264 }
binary_search(uint32_t val,uint32_t * dat,int ndat)265 static inline int binary_search(uint32_t val, uint32_t *dat, int ndat)
266 {
267 int i = -1, imin = 0, imax = ndat - 1;
268 while ( imin<=imax )
269 {
270 i = (imin+imax)/2;
271 if ( dat[i] < val ) imin = i + 1;
272 else if ( dat[i] > val ) imax = i - 1;
273 else return 1;
274 }
275 return 0;
276 }
binary_insert(uint32_t val,uint32_t ** dat,int * ndat,int * mdat)277 static inline void binary_insert(uint32_t val, uint32_t **dat, int *ndat, int *mdat)
278 {
279 int i = -1, imin = 0, imax = *ndat - 1;
280 while ( imin<=imax )
281 {
282 i = (imin+imax)/2;
283 if ( (*dat)[i] < val ) imin = i + 1;
284 else if ( (*dat)[i] > val ) imax = i - 1;
285 else return;
286 }
287 while ( i>=0 && (*dat)[i]>val ) i--;
288
289 (*ndat)++;
290 hts_expand(uint32_t, (*ndat), (*mdat), (*dat));
291
292 if ( *ndat > 1 )
293 memmove(*dat + i + 1, *dat + i, sizeof(uint32_t)*(*ndat - i - 1));
294
295 (*dat)[i+1] = val;
296 }
process_record(args_t * args,bcf1_t * rec)297 static int process_record(args_t *args, bcf1_t *rec)
298 {
299 args->ntotal++;
300
301 static int warned = 0;
302 int ngts = bcf_get_genotypes(args->hdr, rec, &args->gts, &args->mgts);
303 ngts /= rec->n_sample;
304 if ( ngts>2 ) error("todo: ploidy=%d\n", ngts);
305
306 args->ncontrol_gts = 0;
307 uint32_t control_als = 0;
308 int32_t nals[4] = {0,0,0,0}; // ctrl-ref, ctrl-alt, case-ref, case-alt
309 int i,j;
310 for (i=0; i<args->ncontrol_smpl; i++)
311 {
312 uint32_t gt = 0;
313 int32_t *ptr = args->gts + args->control_smpl[i]*ngts;
314 for (j=0; j<ngts; j++)
315 {
316 if ( ptr[j]==bcf_int32_vector_end ) break;
317 if ( bcf_gt_is_missing(ptr[j]) ) continue;
318 int ial = bcf_gt_allele(ptr[j]);
319 if ( ial > 31 )
320 {
321 if ( !warned )
322 {
323 fprintf(stderr,"Too many alleles (>32) at %s:%"PRId64", skipping the site.\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
324 warned = 1;
325 }
326 args->nskipped++;
327 return -1;
328 }
329 control_als |= 1<<ial;
330 gt |= 1<<ial;
331 if ( ial ) nals[1]++;
332 else nals[0]++;
333 }
334 if ( args->annots & PRINT_NOVELGT )
335 binary_insert(gt, &args->control_gts, &args->ncontrol_gts, &args->mcontrol_gts);
336 }
337 if ( !control_als && args->ncontrol_smpl )
338 {
339 // all are missing
340 args->nskipped++;
341 return -1;
342 }
343
344 args->case_als_smpl.l = 0;
345 args->case_gts_smpl.l = 0;
346
347 int has_gt = 0;
348 for (i=0; i<args->ncase_smpl; i++)
349 {
350 int case_al = 0;
351 uint32_t gt = 0;
352 int32_t *ptr = args->gts + args->case_smpl[i]*ngts;
353 for (j=0; j<ngts; j++)
354 {
355 if ( ptr[j]==bcf_int32_vector_end ) break;
356 if ( bcf_gt_is_missing(ptr[j]) ) continue;
357 int ial = bcf_gt_allele(ptr[j]);
358 if ( ial > 31 )
359 {
360 if ( !warned )
361 {
362 fprintf(stderr,"Too many alleles (>32) at %s:%"PRId64", skipping. (todo?)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
363 warned = 1;
364 }
365 args->nskipped++;
366 return -1;
367 }
368 if ( !(control_als & (1<<ial)) ) case_al = 1;
369 gt |= 1<<ial;
370 if ( ial ) nals[3]++;
371 else nals[2]++;
372 }
373 if ( !gt ) continue;
374 has_gt = 1;
375
376 char *smpl = args->hdr->samples[ args->case_smpl[i] ];
377 if ( case_al && (args->annots & PRINT_NOVELAL) )
378 {
379 if ( args->case_als_smpl.l ) kputc(',', &args->case_als_smpl);
380 kputs(smpl, &args->case_als_smpl);
381 }
382 else if ( (args->annots & PRINT_NOVELGT) && !binary_search(gt, args->control_gts, args->ncontrol_gts) )
383 {
384 if ( args->case_gts_smpl.l ) kputc(',', &args->case_gts_smpl);
385 kputs(smpl, &args->case_gts_smpl);
386 }
387 }
388 if ( !has_gt && args->ncase_smpl )
389 {
390 // all are missing
391 args->nskipped++;
392 return -1;
393 }
394
395 if ( args->max_AC )
396 {
397 if ( nals[0]+nals[2] > nals[1]+nals[3] )
398 {
399 if ( nals[1]+nals[3] <= args->max_AC )
400 for (i=0; i<4; i++) args->nals[i] += nals[i];
401 }
402 else
403 {
404 if ( nals[0]+nals[2] <= args->max_AC )
405 {
406 args->nals[0] += nals[1];
407 args->nals[1] += nals[0];
408 args->nals[2] += nals[3];
409 args->nals[3] += nals[2];
410 }
411 }
412 }
413
414 float vals[2];
415 if ( (args->annots & PRINT_PASSOC) && args->ncontrol_smpl && args->ncase_smpl )
416 {
417 double left, right, fisher;
418 kt_fisher_exact(nals[0],nals[1],nals[2],nals[3], &left,&right,&fisher);
419 vals[0] = fisher;
420 bcf_update_info_float(args->hdr_out, rec, "PASSOC", vals, 1);
421 }
422 if ( (args->annots & PRINT_FASSOC) && args->ncontrol_smpl && args->ncase_smpl )
423 {
424 if ( nals[0]+nals[1] ) vals[0] = (float)nals[1]/(nals[0]+nals[1]);
425 else bcf_float_set_missing(vals[0]);
426 if ( nals[2]+nals[3] ) vals[1] = (float)nals[3]/(nals[2]+nals[3]);
427 else bcf_float_set_missing(vals[1]);
428 bcf_update_info_float(args->hdr_out, rec, "FASSOC", vals, 2);
429 }
430 if ( args->annots & PRINT_NASSOC )
431 bcf_update_info_int32(args->hdr_out, rec, "NASSOC", nals, 4);
432
433 if ( args->case_als_smpl.l )
434 {
435 bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->case_als_smpl.s);
436 args->ncase_al++;
437 }
438 if ( args->case_gts_smpl.l )
439 {
440 bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->case_gts_smpl.s);
441 args->ncase_gt++;
442 }
443 args->ntested++;
444 return 0;
445 }
446
run(int argc,char ** argv)447 int run(int argc, char **argv)
448 {
449 args_t *args = (args_t*) calloc(1,sizeof(args_t));
450 args->argc = argc; args->argv = argv;
451 args->output_fname = "-";
452 args->annots_str = "PASSOC,FASSOC";
453 args->regions_overlap = 1;
454 args->targets_overlap = 0;
455 args->clevel = -1;
456 static struct option loptions[] =
457 {
458 {"max-allele-freq",required_argument,0,'f'},
459 {"annots",required_argument,0,'a'},
460 {"force-samples",no_argument,0,1},
461 {"bg-samples",required_argument,0,'0'}, // renamed to --control-samples, leaving it in for backward compatibility
462 {"control-samples",required_argument,0,'0'},
463 {"novel-samples",required_argument,0,'1'}, // renamed to --case-samples, leaving it in for backward compatibility
464 {"case-samples",required_argument,0,'1'},
465 {"include",required_argument,0,'i'},
466 {"exclude",required_argument,0,'e'},
467 {"output",required_argument,NULL,'o'},
468 {"output-type",required_argument,NULL,'O'},
469 {"regions",1,0,'r'},
470 {"regions-file",1,0,'R'},
471 {"regions-overlap",required_argument,NULL,3},
472 {"targets",1,0,'t'},
473 {"targets-file",1,0,'T'},
474 {"targets-overlap",required_argument,NULL,4},
475 {NULL,0,NULL,0}
476 };
477 int c;
478 char *tmp;
479 while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:a:f:",loptions,NULL)) >= 0)
480 {
481 switch (c)
482 {
483 case 1 : args->force_samples = 1; break;
484 case 'f': args->max_AC_str = optarg; break;
485 case 'a': args->annots_str = optarg; break;
486 case '0': args->control_samples_str = optarg; break;
487 case '1': args->case_samples_str = optarg; break;
488 case 'e':
489 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
490 args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
491 case 'i':
492 if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
493 args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
494 case 't': args->targets = optarg; break;
495 case 'T': args->targets = optarg; args->targets_is_file = 1; break;
496 case 'r': args->regions = optarg; break;
497 case 'R': args->regions = optarg; args->regions_is_file = 1; break;
498 case 'o': args->output_fname = optarg; break;
499 case 'O':
500 switch (optarg[0]) {
501 case 'b': args->output_type = FT_BCF_GZ; break;
502 case 'u': args->output_type = FT_BCF; break;
503 case 'z': args->output_type = FT_VCF_GZ; break;
504 case 'v': args->output_type = FT_VCF; break;
505 default:
506 {
507 args->clevel = strtol(optarg,&tmp,10);
508 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
509 }
510 };
511 if ( optarg[1] )
512 {
513 args->clevel = strtol(optarg+1,&tmp,10);
514 if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
515 }
516 break;
517 case 3 :
518 if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
519 else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
520 else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
521 else error("Could not parse: --regions-overlap %s\n",optarg);
522 break;
523 case 4 :
524 if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
525 else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
526 else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
527 else error("Could not parse: --targets-overlap %s\n",optarg);
528 break;
529 case 'h':
530 case '?':
531 default: error("%s", usage_text()); break;
532 }
533 }
534 if ( optind==argc )
535 {
536 if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin
537 else { error("%s",usage_text()); }
538 }
539 else if ( optind+1!=argc ) error("%s",usage_text());
540 else args->fname = argv[optind];
541
542 if ( !args->control_samples_str ) error("Error: missing the -0, --control-samples option\n");
543 if ( !args->case_samples_str ) error("Error: missing the -1, --case-samples option\n");
544
545 init_data(args);
546
547 while ( bcf_sr_next_line(args->sr) )
548 {
549 bcf1_t *rec = bcf_sr_get_line(args->sr,0);
550 if ( args->filter )
551 {
552 int pass = filter_test(args->filter, rec, NULL);
553 if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
554 if ( !pass ) continue;
555 }
556 process_record(args, rec);
557 if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
558 }
559
560 fprintf(stderr,"Total/processed/skipped/case_allele/case_gt:\t%d\t%d\t%d\t%d\t%d\n", args->ntotal, args->ntested, args->nskipped, args->ncase_al, args->ncase_gt);
561 if ( args->max_AC )
562 {
563 double val1, val2, fisher;
564 kt_fisher_exact(args->nals[0],args->nals[1],args->nals[2],args->nals[3], &val1,&val2,&fisher);
565 val1 = args->nals[0]+args->nals[1] ? (float)args->nals[1]/(args->nals[0]+args->nals[1]) : 0;
566 val2 = args->nals[2]+args->nals[3] ? (float)args->nals[3]/(args->nals[2]+args->nals[3]) : 0;
567 fprintf(stderr,"max_AC/PASSOC/FASSOC/NASSOC:\t%d\t%e\t%f,%f\t%d,%d,%d,%d\n",args->max_AC,fisher,val1,val2,args->nals[0],args->nals[1],args->nals[2],args->nals[3]);
568 }
569 destroy_data(args);
570
571 return 0;
572 }
573