1 /* The MIT License
2
3 Copyright (C) 2017-2021 Giulio Genovese
4
5 Author: Giulio Genovese <giulio.genovese@gmail.com>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.
24
25 */
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <getopt.h>
30 #include <htslib/vcf.h>
31 #include <htslib/faidx.h>
32 #include <htslib/kfunc.h>
33 #include "mocha.h"
34 #include "bcftools.h"
35
36 #define MOCHATOOLS_VERSION "2021-10-15"
37
38 #define TAG_LIST_DFLT "none"
39 #define GC_WIN_DFLT "200"
40
41 #define INFO_GC (1 << 0)
42 #define INFO_CPG (1 << 1)
43 #define INFO_ALLELE_A (1 << 2)
44 #define INFO_ALLELE_B (1 << 3)
45 #define INFO_AC_SEX (1 << 4)
46 #define INFO_MISSING_SEX (1 << 5)
47 #define INFO_pAC_HET (1 << 6)
48 #define INFO_AD_HET (1 << 7)
49 #define INFO_pBAF_STATS (1 << 8)
50 #define INFO_COR_BAF_LRR (1 << 9)
51
sq(double x)52 static inline double sq(double x) { return x * x; }
53
54 typedef struct {
55 int flags;
56 int adjust; // whether to adjust BAF and LRR
57 int gc_win; // length of window to compute GC and CpG content
58 int *gender;
59 char *as_str; // format field ID to summarize for allelic shift count summary
60 int phase; // whether phase information should be included in the allelic shift
61 char *info_str; // info field for binomial or Fisher's exact test
62 int phred; // whether the test result should be phred scaled
63 int nsmpl, allele_a_id, allele_b_id, adjust_id, gt_id, ad_id, baf_id, lrr_id, info_id, as_id;
64 int8_t *gt_phase_arr, *as_arr;
65 int16_t *gt0_arr, *gt1_arr, *ad0_arr, *ad1_arr;
66 int32_t *info_arr;
67 int m_info;
68 float *adjust_arr;
69 int m_adjust;
70 float *baf_arr[2];
71 int *imap_arr;
72 faidx_t *fai;
73 bcf_hdr_t *in_hdr, *out_hdr;
74 kstring_t summary_str, test_str;
75 } args_t;
76
77 args_t *args;
78
read_computed_gender(bcf_hdr_t * hdr,char * fname)79 static int *read_computed_gender(bcf_hdr_t *hdr, char *fname) {
80 htsFile *fp = hts_open(fname, "r");
81 if (fp == NULL) error("Could not open %s: %s\n", fname, strerror(errno));
82
83 kstring_t str = {0, 0, NULL};
84 if (hts_getline(fp, KS_SEP_LINE, &str) <= 0) error("Empty file: %s\n", fname);
85 tsv_t *tsv = tsv_init_delimiter(str.s, '\t');
86 int idx, computed_gender;
87 if (tsv_register(tsv, "sample_id", tsv_read_sample_id, (void *)&idx) < 0)
88 error("File %s is missing the sample_id column\n", fname);
89 if (tsv_register(tsv, "computed_gender", tsv_read_computed_gender, (void *)&computed_gender) < 0)
90 error("File %s is missing the computed_gender column\n", fname);
91
92 int i = 0;
93 int *gender = (int *)calloc((size_t)bcf_hdr_nsamples(hdr), sizeof(int));
94 while (hts_getline(fp, KS_SEP_LINE, &str) > 0) {
95 if (!tsv_parse_delimiter(tsv, (bcf1_t *)hdr, str.s, '\t')) {
96 if (idx < 0) continue;
97 gender[idx] = computed_gender;
98 i++;
99 } else {
100 error("Could not parse line: %s\n", str.s);
101 }
102 }
103
104 tsv_destroy(tsv);
105 free(str.s);
106 hts_close(fp);
107 return gender;
108 }
109
about(void)110 const char *about(void) { return "MOsaic CHromosomal Alterations tools.\n"; }
111
usage(void)112 const char *usage(void) {
113 return "\n"
114 "About: tools for the MOsaic CHromosomal Alterations pipeline. "
115 "(version " MOCHATOOLS_VERSION
116 " https://github.com/freeseek/mocha)\n"
117 "Usage: bcftools +mochatools [General Options] -- [Plugin Options]\n"
118 "\n"
119 "Options:\n"
120 " run \"bcftools plugin\" for a list of common options\n"
121 "\n"
122 "Plugin options:\n"
123 " -l, --list-tags list available INFO tags with description for VCF output\n"
124 " -t, --tags LIST list of output INFO tags [" TAG_LIST_DFLT
125 "]\n"
126 " --adjust adjust BAF and LRR using INFO/ADJ_COEFF\n"
127 " -f, --fasta-ref <file> reference sequence to compute GC and CpG content\n"
128 " --gc-window-size <int> window size in bp used to compute the GC and CpG content [" GC_WIN_DFLT
129 "]\n"
130 " -x, --sex <file> file including information about the gender of the samples\n"
131 " --summary <tag> allelic shift FORMAT tag to summarize\n"
132 " --phase whether phase information should be included in the allelic shift\n"
133 " --test <tag> performs binomial or Fisher's exact test for INFO tag\n"
134 " --phred reports p-values as phred scaled\n"
135 " -s, --samples [^]<list> comma separated list of samples to include (or exclude with \"^\" "
136 "prefix)\n"
137 " -S, --samples-file [^]<file> file of samples to include (or exclude with \"^\" prefix)\n"
138 " --force-samples only warn about unknown subset samples\n"
139 " -G, --drop-genotypes drop individual genotype information (after running statistical tests)\n"
140 "\n"
141 "Examples:\n"
142 " bcftools +mochatools -- -l\n"
143 " bcftools +mochatools file.bcf -Ob -o output.bcf -- -t GC -f human_g1k_v37.fasta\n"
144 " bcftools +mochatools file.bcf -Ob -o output.bcf -- -t ALLELE_A,ALLELE_B\n"
145 " bcftools +mochatools file.bcf -- --summary AS --test AS --drop-genotypes\n"
146 " bcftools +mochatools file.bcf -- --summary AS --phase --test pAS --drop-genotypes\n"
147 "\n";
148 }
149
parse_tags(const char * str)150 static int parse_tags(const char *str) {
151 if (!args->in_hdr) error("%s", usage());
152
153 int flags = 0, n_tags;
154 if (!strcasecmp(str, "none")) return flags;
155 char **tags = hts_readlist(str, 0, &n_tags);
156 for (int i = 0; i < n_tags; i++) {
157 if (!strcasecmp(tags[i], "GC"))
158 flags |= INFO_GC;
159 else if (!strcasecmp(tags[i], "CpG"))
160 flags |= INFO_CPG;
161 else if (!strcasecmp(tags[i], "ALLELE_A"))
162 flags |= INFO_ALLELE_A;
163 else if (!strcasecmp(tags[i], "ALLELE_B"))
164 flags |= INFO_ALLELE_B;
165 else if (!strcasecmp(tags[i], "AC_Sex"))
166 flags |= INFO_AC_SEX;
167 else if (!strcasecmp(tags[i], "MISSING_Sex"))
168 flags |= INFO_MISSING_SEX;
169 else if (!strcasecmp(tags[i], "pAC_Het"))
170 flags |= INFO_pAC_HET;
171 else if (!strcasecmp(tags[i], "AD_Het"))
172 flags |= INFO_AD_HET;
173 else if (!strcasecmp(tags[i], "pBAF_Stats"))
174 flags |= INFO_pBAF_STATS;
175 else if (!strcasecmp(tags[i], "Cor_BAF_LRR"))
176 flags |= INFO_COR_BAF_LRR;
177 else
178 error("Error parsing \"--tags %s\": the tag \"%s\" is not supported\n", str, tags[i]);
179 free(tags[i]);
180 }
181 if (n_tags) free(tags);
182 return flags;
183 }
184
list_tags(void)185 static void list_tags(void) {
186 error(
187 "INFO/GC Number:1 Type:Float .. GC ratio content around the marker\n"
188 "INFO/CpG Number:1 Type:Float .. CpG ratio content around the marker\n"
189 "INFO/ALLELE_A Number:1 Type:Integer .. A allele\n"
190 "INFO/ALLELE_B Number:1 Type:Integer .. B allele\n"
191 "INFO/AC_Sex Number:4 Type:Integer .. Number of homozygous and heterozygous genotypes by gender\n"
192 "INFO/MISSING_Sex Number:4 Type:Integer .. Number of non-missing and missing genotypes by gender\n"
193 "INFO/pAC_Het Number:2 Type:Integer .. Number of heterozygous genotypes by transmission type\n"
194 "INFO/AD_Het Number:2 Type:Integer .. Allelic depths for the reference and alternate alleles across "
195 "heterozygous genotypes\n"
196 "INFO/pBAF_Stats Number:4 Type:Float .. Welch's t-test and Mann-Whitney U test for BAF by transmission "
197 "type across heterozygous genotypes\n"
198 "INFO/Cor_BAF_LRR Number:3 Type=Float .. Pearson correlation for BAF and LRR at AA, AB, and BB "
199 "genotypes\n");
200 }
201
init(int argc,char ** argv,bcf_hdr_t * in,bcf_hdr_t * out)202 int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) {
203 args = (args_t *)calloc(1, sizeof(args_t));
204 const char *tag_list = TAG_LIST_DFLT;
205 char *tmp;
206 args->gc_win = (int)strtol(GC_WIN_DFLT, NULL, 0);
207 args->in_hdr = in;
208 args->out_hdr = out;
209 args->info_str = NULL;
210 args->as_str = NULL;
211 int sample_is_file = 0;
212 int force_samples = 0;
213 int sites_only = 0;
214 char *sample_names = NULL;
215 char *gender_fname = NULL;
216 char *ref_fname = NULL;
217 kstring_t str = {0, 0, NULL};
218
219 int c;
220 static struct option loptions[] = {{"list-tags", no_argument, NULL, 'l'},
221 {"tags", required_argument, NULL, 't'},
222 {"adjust", no_argument, NULL, 1},
223 {"fasta-ref", required_argument, NULL, 'f'},
224 {"gc-window-size", required_argument, NULL, 2},
225 {"sex", required_argument, NULL, 'x'},
226 {"summary", required_argument, NULL, 3},
227 {"phase", no_argument, NULL, 4},
228 {"test", required_argument, NULL, 5},
229 {"phred", no_argument, NULL, 6},
230 {"samples", required_argument, NULL, 's'},
231 {"samples-file", required_argument, NULL, 'S'},
232 {"force-samples", no_argument, NULL, 7},
233 {"drop-genotypes", no_argument, NULL, 'G'},
234 {NULL, 0, NULL, 0}};
235
236 while ((c = getopt_long(argc, argv, "h?lt:f:x:s:S:G", loptions, NULL)) >= 0) {
237 switch (c) {
238 case 'l':
239 list_tags();
240 break;
241 case 't':
242 tag_list = optarg;
243 break;
244 case 1:
245 args->adjust = 1;
246 break;
247 case 'f':
248 ref_fname = optarg;
249 break;
250 case 2:
251 args->gc_win = (int)strtol(optarg, &tmp, 0);
252 if (*tmp) error("Could not parse: -w %s\n", optarg);
253 if (args->gc_win <= 0) error("Window size is not positive: -w %s\n", optarg);
254 break;
255 case 'x':
256 gender_fname = optarg;
257 break;
258 case 3:
259 args->as_str = optarg;
260 break;
261 case 4:
262 args->phase = 1;
263 break;
264 case 5:
265 args->info_str = optarg;
266 break;
267 case 6:
268 args->phred = 1;
269 break;
270 case 's':
271 sample_names = optarg;
272 break;
273 case 'S':
274 sample_names = optarg;
275 sample_is_file = 1;
276 break;
277 case 7:
278 force_samples = 1;
279 break;
280 case 'G':
281 sites_only = 1;
282 break;
283 case 'h':
284 case '?':
285 default:
286 error("%s", usage());
287 break;
288 }
289 }
290 args->flags |= parse_tags(tag_list);
291
292 // this ugly workaround is required to make sure we can set samples on both headers even
293 // when sample_is_file is true and sample_names is stdin
294 if (sample_names) {
295 int nsmpl;
296 char **smpl = hts_readlist(sample_names[0] == '^' ? sample_names + 1 : sample_names, sample_is_file, &nsmpl);
297 kstring_t tmp = {0, 0, NULL};
298 if (sample_names[0] == '^')
299 ksprintf(&tmp, "^%s", smpl[0]);
300 else
301 ksprintf(&tmp, "%s", smpl[0]);
302 for (int i = 1; i < nsmpl; i++) ksprintf(&tmp, ",%s", smpl[i]);
303 int ret = bcf_hdr_set_samples(args->in_hdr, tmp.s, 0);
304 if (ret < 0)
305 error("Error parsing the sample list\n");
306 else if (ret > 0) {
307 if (force_samples)
308 fprintf(stderr,
309 "Warn: subset called for sample that does not exist in header: "
310 "\"%s\"... skipping\n",
311 smpl[ret - 1]);
312 else
313 error(
314 "Error: subset called for sample that does not exist in header: \"%s\". "
315 "Use \"--force-samples\" to ignore this error.\n",
316 smpl[ret - 1]);
317 }
318 if (bcf_hdr_nsamples(args->in_hdr) == 0) fprintf(stderr, "Warn: subsetting has removed all samples\n");
319 if (bcf_hdr_set_samples(args->out_hdr, tmp.s, 0) < 0) error("Error parsing the sample list\n");
320 free(tmp.s);
321 for (int i = 0; i < nsmpl; i++) free(smpl[i]);
322 free(smpl);
323 }
324
325 if (gender_fname) args->gender = read_computed_gender(args->in_hdr, gender_fname);
326
327 if (args->flags & (INFO_GC | INFO_CPG)) {
328 if (!ref_fname) error("Reference sequence not provided, cannot infer GC or CpG content\n");
329 args->fai = fai_load(ref_fname);
330 if (!args->fai) error("Failed to load the fai index: %s\n", ref_fname);
331 bcf_hdr_append(args->out_hdr,
332 "##INFO=<ID=GC,Number=1,Type=Float,Description=\"GC ratio content around the marker\">");
333 bcf_hdr_append(args->out_hdr,
334 "##INFO=<ID=CpG,Number=1,Type=Float,Description=\"CpG ratio content around the marker\">");
335 }
336
337 args->allele_a_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "ALLELE_A");
338 if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_INFO, args->allele_a_id)) args->allele_a_id = -1;
339 if (args->allele_a_id >= 0 && bcf_hdr_id2type(args->in_hdr, BCF_HL_INFO, args->allele_a_id) != BCF_HT_INT)
340 error("Error: input VCF file ALLELE_A info field is not of integer type\n");
341
342 args->allele_b_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "ALLELE_B");
343 if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_INFO, args->allele_a_id)) args->allele_b_id = -1;
344 if (args->allele_b_id >= 0 && bcf_hdr_id2type(args->in_hdr, BCF_HL_INFO, args->allele_b_id) != BCF_HT_INT)
345 error("Error: input VCF file ALLELE_B info field is not of float type\n");
346
347 args->adjust_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "ADJ_COEFF");
348 if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_INFO, args->adjust_id)) args->adjust_id = -1;
349 if (args->adjust_id >= 0 && bcf_hdr_id2type(args->in_hdr, BCF_HL_INFO, args->adjust_id) != BCF_HT_REAL)
350 error("Error: input VCF file ADJ_COEFF info field is not of integer type\n");
351 if (args->adjust_id >= 0
352 && (bcf_hdr_id2length(args->in_hdr, BCF_HL_INFO, args->adjust_id) != BCF_VL_FIXED
353 || bcf_hdr_id2number(args->in_hdr, BCF_HL_INFO, args->adjust_id) != 9))
354 error("Error: input VCF file ADJ_COEFF info field has wrong number of values\n");
355
356 if (args->adjust && args->adjust_id < 0)
357 error("Error: ADJ_COEFF field is not present, cannot perform --adjust correction\n");
358
359 args->nsmpl = bcf_hdr_nsamples(args->in_hdr);
360
361 args->gt_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "GT");
362 if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->gt_id)) args->gt_id = -1;
363
364 args->ad_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "AD");
365 if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->ad_id)) args->ad_id = -1;
366 if (args->ad_id >= 0 && bcf_hdr_id2type(args->in_hdr, BCF_HL_FMT, args->ad_id) != BCF_HT_INT)
367 error("Error: input VCF file AD format field is not of integer type\n");
368 if (args->ad_id >= 0 && bcf_hdr_id2length(args->in_hdr, BCF_HL_FMT, args->ad_id) != BCF_VL_R)
369 error("Error: input VCF file AD format field has wrong number of values\n");
370
371 args->baf_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "BAF");
372 if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->baf_id)) args->baf_id = -1;
373 if (args->baf_id >= 0 && bcf_hdr_id2type(args->in_hdr, BCF_HL_FMT, args->baf_id) != BCF_HT_REAL)
374 error("Error: input VCF file BAF format field is not of float type\n");
375
376 args->lrr_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "LRR");
377 if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->lrr_id)) args->lrr_id = -1;
378 if (args->lrr_id >= 0 && bcf_hdr_id2type(args->in_hdr, BCF_HL_FMT, args->lrr_id) != BCF_HT_REAL)
379 error("Error: input VCF file LRR format field is not of float type\n");
380
381 if (args->flags & (INFO_ALLELE_A | INFO_ALLELE_B)) {
382 if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->gt_id))
383 error("Error: GT format field is not present, cannot infer ALLELE_A or ALLELE_B\n");
384 if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->baf_id))
385 error("Error: BAF format field is not present, cannot infer ALLELE_A or ALLELE_B\n");
386 if (args->flags & INFO_ALLELE_A) {
387 if (bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "ALLELE_A") >= 0)
388 error("Field ALLELE_A already present in the VCF.\n");
389 bcf_hdr_append(args->out_hdr, "##INFO=<ID=ALLELE_A,Number=1,Type=Integer,Description=\"A allele\">");
390 }
391 if (args->flags & INFO_ALLELE_B) {
392 if (bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "ALLELE_B") >= 0)
393 error("Field ALLELE_B already present in the VCF.\n");
394 if (args->flags & INFO_ALLELE_B)
395 bcf_hdr_append(args->out_hdr, "##INFO=<ID=ALLELE_B,Number=1,Type=Integer,Description=\"B allele\">");
396 }
397 }
398
399 if (args->flags & (INFO_AC_SEX)) {
400 if (!args->gender) error("Error: AC_Sex require --sex\n");
401 bcf_hdr_append(args->out_hdr,
402 "##INFO=<ID=AC_Sex,Number=4,Type=Integer,Description=\"Number of homozygous and heterozygous "
403 "genotypes by gender\">");
404 }
405
406 if (args->flags & (INFO_MISSING_SEX)) {
407 if (!args->gender) error("Error: MISSING_Sex require --sex\n");
408 bcf_hdr_append(args->out_hdr,
409 "##INFO=<ID=MISSING_Sex,Number=4,Type=Integer,Description=\"Number of non-missing and missing "
410 "genotypes by gender\">");
411 }
412
413 if (args->flags & INFO_pAC_HET) {
414 if (args->gt_id < 0) error("Error: GT format field is not present, cannot compute pAC_Het\n");
415 bcf_hdr_append(args->out_hdr,
416 "##INFO=<ID=pAC_Het,Number=2,Type=Integer,Description=\"Number of heterozygous genotypes "
417 "by transmission type\">");
418 }
419
420 if (args->flags & INFO_AD_HET) {
421 if (args->gt_id < 0) error("Error: GT format field is not present, cannot compute AD_Het\n");
422 if (args->ad_id < 0) error("Error: AD format field is not present, cannot compute AD_Het\n");
423 bcf_hdr_append(args->out_hdr,
424 "##INFO=<ID=AD_Het,Number=2,Type=Integer,Description=\"Allelic depths for the reference and "
425 "alternate alleles across heterozygous genotypes\">");
426 }
427
428 if (args->flags & INFO_pBAF_STATS) {
429 if (args->gt_id < 0) error("Error: GT format field is not present, cannot compute pBAF_Stats\n");
430 if (args->ad_id < 0 && args->baf_id < 0)
431 error("Error: Neither AD nor BAF format fields are present, cannot compute pBAF_Stats\n");
432 bcf_hdr_append(
433 args->out_hdr,
434 "##INFO=<ID=pBAF_Stats,Number=4,Type=Float,Description=\"Welch'"
435 "s t-test and Mann-Whitney U test for allelic transmission ratios across heterozygous genotypes\">");
436 }
437
438 if (args->flags & INFO_COR_BAF_LRR) {
439 if (args->allele_a_id < 0)
440 error("Error: ALLELE_A field is not present, cannot perform --cor-BAF-LRR analysis\n");
441 if (args->allele_b_id < 0)
442 error("Error: ALLELE_B field is not present, cannot perform --cor-BAF-LRR analysis\n");
443 if (args->baf_id < 0) error("Error: BAF format is not present, cannot perform --cor-BAF-LRR analysis\n");
444 if (args->lrr_id < 0) error("Error: LRR format is not present, cannot perform --cor-BAF-LRR analysis\n");
445 bcf_hdr_append(args->out_hdr,
446 "##INFO=<ID=Cor_BAF_LRR,Number=3,Type=Float,Description=\"Pearson "
447 "correlation for BAF and LRR at AA, AB, and BB genotypes\">");
448 }
449
450 if (args->as_str) {
451 args->as_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, args->as_str);
452 if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->as_id))
453 error("Error: %s format field is not present, cannot perform --summary\n", args->as_str);
454 if (args->phase) kputc('p', &args->summary_str);
455 kputs(args->as_str, &args->summary_str);
456 ksprintf(&str, "##INFO=<ID=%s,Number=2,Type=Integer,Description=\"%sAllelic shift counts for %s\">",
457 args->summary_str.s, args->phase ? "phased " : "", args->as_str);
458 bcf_hdr_append(args->out_hdr, str.s);
459 }
460
461 // this analysis is performed using a field that can be added by the previous analyses
462 if (args->info_str) {
463 args->info_id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->info_str);
464 if (bcf_hdr_sync(args->out_hdr) < 0) error_errno("[%s] Failed to update header", __func__);
465 if (!bcf_hdr_idinfo_exists(args->out_hdr, BCF_HL_INFO, args->info_id))
466 error("Error: %s info field is not present, cannot perform --test\n", args->info_str);
467 int number = bcf_hdr_id2number(args->out_hdr, BCF_HL_INFO, args->info_id);
468 if (args->phred) kputc('p', &args->test_str);
469 if (number == 2)
470 kputs("binom_", &args->test_str);
471 else if (number == 4)
472 kputs("fisher_", &args->test_str);
473 else
474 error("Error: %s info field must contain 2 or 4 elements but it contains %d\n", args->info_str, number);
475 kputs(args->info_str, &args->test_str);
476 str.l = 0;
477 ksprintf(&str, "##INFO=<ID=%s,Number=1,Type=Float,Description=\"%s%s test for %s\">", args->test_str.s,
478 args->phred ? "Phred scaled " : "", number == 2 ? "Binomial" : "Fisher's exact", args->info_str);
479 bcf_hdr_append(args->out_hdr, str.s);
480 }
481
482 if (sites_only) {
483 if (bcf_hdr_set_samples(args->out_hdr, NULL, 0) < 0) error("Error parsing the sample list\n");
484 bcf_hdr_remove(args->out_hdr, BCF_HL_FMT, NULL);
485 }
486
487 args->gt_phase_arr = (int8_t *)malloc(args->nsmpl * sizeof(int8_t));
488 args->as_arr = (int8_t *)malloc(args->nsmpl * sizeof(int8_t));
489 args->gt0_arr = (int16_t *)malloc(args->nsmpl * sizeof(int16_t));
490 args->gt1_arr = (int16_t *)malloc(args->nsmpl * sizeof(int16_t));
491 args->ad0_arr = (int16_t *)malloc(args->nsmpl * sizeof(int16_t));
492 args->ad1_arr = (int16_t *)malloc(args->nsmpl * sizeof(int16_t));
493 args->baf_arr[0] = (float *)malloc(args->nsmpl * sizeof(float));
494 args->baf_arr[1] = (float *)malloc(args->nsmpl * sizeof(float));
495 args->imap_arr = (int *)malloc(args->nsmpl * sizeof(int));
496
497 free(str.s);
498 return 0;
499 }
500
501 // Heng Li's implementation in htslib/kfunc.c
502 #define KF_GAMMA_EPS 1e-14
503 #define KF_TINY 1e-290
504
log_kf_betai_aux(double a,double b,double x)505 static double log_kf_betai_aux(double a, double b, double x) {
506 double C, D, f;
507 int j;
508 if (x == 0.) return 0.;
509 if (x == 1.) return 1.;
510 f = 1.;
511 C = f;
512 D = 0.;
513 // Modified Lentz's algorithm for computing continued fraction
514 for (j = 1; j < 200; ++j) {
515 double aa, d;
516 int m = j >> 1;
517 aa = (j & 1) ? -(a + m) * (a + b + m) * x / ((a + 2 * m) * (a + 2 * m + 1))
518 : m * (b - m) * x / ((a + 2 * m - 1) * (a + 2 * m));
519 D = 1. + aa * D;
520 if (D < KF_TINY) D = KF_TINY;
521 C = 1. + aa / C;
522 if (C < KF_TINY) C = KF_TINY;
523 D = 1. / D;
524 d = C * D;
525 f *= d;
526 if (fabs(d - 1.) < KF_GAMMA_EPS) break;
527 }
528 return (kf_lgamma(a + b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1. - x)) - log(a * f);
529 }
530
531 // returns -10 * (log(2) + pbinom(min(k, n - k), n, 1/2, log.p = TRUE)) / log(10)
phred_pbinom(int k,int n)532 static double phred_pbinom(int k, int n) {
533 static double *dbinom = NULL, *pbinom = NULL;
534 static size_t n_size = 0, m_dbinom = 0, m_pbinom = 0;
535
536 if (n < 0 && k < 0) {
537 free(dbinom);
538 free(pbinom);
539 return NAN;
540 }
541
542 if (n < 0 || k < 0 || k > n) return NAN;
543
544 if (k == n >> 1) return 0.0;
545
546 if (k << 1 > n) k = n - k;
547
548 if (n > 1000) return -10.0 * M_LOG10E * (M_LN2 + log_kf_betai_aux(n - k, k + 1, .5));
549
550 if (n >= n_size) {
551 size_t len = (size_t)(1 + (1 + (n >> 1)) * ((n + 1) >> 1));
552 hts_expand(double, len, m_dbinom, dbinom);
553 hts_expand(double, len, m_pbinom, pbinom);
554 dbinom[0] = 1.0;
555 for (int i = n_size ? (int)n_size : 1; i < n + 1; i++) {
556 int prev_idx = i - 1 ? 1 + ((i - 1) >> 1) * (i >> 1) : 0;
557 int curr_idx = 1 + (i >> 1) * ((i + 1) >> 1);
558 dbinom[curr_idx] = dbinom[prev_idx] * 0.5;
559 pbinom[curr_idx] = dbinom[curr_idx];
560 for (int j = 1; j < ((i + 1) >> 1); j++) {
561 curr_idx++;
562 dbinom[curr_idx] = (double)i / (double)j * dbinom[prev_idx] * 0.5;
563 pbinom[curr_idx] = pbinom[curr_idx - 1] + dbinom[curr_idx];
564 prev_idx++;
565 }
566 }
567 n_size = (size_t)(n + 1);
568 }
569
570 int idx = 1 + (n >> 1) * ((n + 1) >> 1) + k;
571 return -10.0 * M_LOG10E * (M_LN2 + log(pbinom[idx]));
572 }
573
sample_mean_var(const float * x,int n,double * xm,double * xss)574 static int sample_mean_var(const float *x, int n, double *xm, double *xss) {
575 if (n < 2) return -1;
576 *xm = 0;
577 *xss = 0;
578 int j = 0;
579 for (int i = 0; i < n; i++) {
580 if (!isnan(x[i])) {
581 *xm += (double)x[i];
582 *xss += sq((double)x[i]);
583 j++;
584 }
585 }
586 if (j <= 1) return -1;
587 *xm /= (double)j;
588 *xss -= sq(*xm) * (double)j;
589 *xss /= (double)(j - 1);
590 return 0;
591 }
592
welch_t_test(float * a,float * b,int na,int nb)593 static double welch_t_test(float *a, float *b, int na, int nb) {
594 double mua, mub, sa2, sb2, t, v;
595 if (na < 2 || nb < 2) return HUGE_VAL;
596 sample_mean_var(a, na, &mua, &sa2);
597 sample_mean_var(b, nb, &mub, &sb2);
598 t = (mua - mub) / sqrt(sa2 / na + sb2 / nb);
599 v = (sa2 / na + sb2 / nb);
600 v *= v;
601 v /= sq(sa2) / na / na / (na - 1) + sq(sb2) / nb / nb / (nb - 1);
602 return kf_betai(v / 2.0f, 0.5, v / (v + sq(t)));
603 }
604
605 // Petr Danecek's and James Bonfield's implementation in bcftools/bam2bcf.c
606 double mann_whitney_1947_cdf(int n, int m, int U);
607
cmpfunc(const void * a,const void * b)608 static int cmpfunc(const void *a, const void *b) { return (*(float *)a > *(float *)b) - (*(float *)a < *(float *)b); }
609
610 // it currently does not handle nans
611 // adapted from Petr Danecek's implementation of calc_mwu_bias_cdf() in bcftools/bam2bcf.c
mann_whitney_u(float * a,float * b,int na,int nb)612 static double mann_whitney_u(float *a, float *b, int na, int nb) {
613 qsort(a, (size_t)na, sizeof(float), cmpfunc);
614 qsort(b, (size_t)nb, sizeof(float), cmpfunc);
615
616 int i = 0, j = 0, ca, cb;
617 double U = 0, ties = 0;
618 while (i < na || j < nb) {
619 double curr = (j == nb || (i < na && a[i] < b[j])) ? a[i] : b[j];
620 for (ca = 0; i < na && a[i] == curr; i++) ca++;
621 for (cb = 0; j < nb && b[j] == curr; j++) cb++;
622 U += ca * (j - cb * 0.5);
623 if (ca && cb) {
624 double tie = ca + cb;
625 ties += (sq(tie) - 1.0) * tie;
626 }
627 }
628 if (!na || !nb) return HUGE_VAL;
629
630 double U_min = ((double)na * nb) - U;
631 if (U < U_min) U_min = U;
632
633 if (na == 1) return 2.0 * (floor(U_min) + 1.0) / (double)(nb + 1);
634 if (nb == 1) return 2.0 * (floor(U_min) + 1.0) / (double)(na + 1);
635
636 // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
637 if (na >= 8 || nb >= 8) {
638 double mean = ((double)na * nb) * 0.5;
639 // Correction for ties:
640 double N = na + nb;
641 double var2 = (sq(N) - 1) * N - ties;
642 if (var2 == 0) return 1.0;
643 var2 *= ((double)na * nb) / N / (N - 1) / 12.0;
644 // No correction for ties:
645 // float var2 = ((double)na*nb) * (na + nb + 1) / 12.0;
646 double z = (U_min - mean) / sqrt(2.0 * var2); // z is N(0,1)
647 // return 2.0 - kf_erfc(z); // which is 1 + erf(z)
648 return kf_erfc(-z); // which is 1 - erf(-z)
649 }
650
651 // Exact calculation
652 double pval = 2.0 * mann_whitney_1947_cdf(na, nb, (int)U_min);
653 return pval > 1.0 ? 1.0 : pval;
654 }
655
bcf_int8_is_vector_end(int8_t value)656 static inline int bcf_int8_is_vector_end(int8_t value) { return value == bcf_int8_vector_end; }
bcf_int16_is_vector_end(int16_t value)657 static inline int bcf_int16_is_vector_end(int16_t value) { return value == bcf_int16_vector_end; }
bcf_int32_is_vector_end(int32_t value)658 static inline int bcf_int32_is_vector_end(int32_t value) { return value == bcf_int32_vector_end; }
bcf_int8_is_missing(int8_t value)659 static inline int bcf_int8_is_missing(int8_t value) { return value == bcf_int8_missing; }
bcf_int16_is_missing(int16_t value)660 static inline int bcf_int16_is_missing(int16_t value) { return value == bcf_int16_missing; }
bcf_int32_is_missing(int32_t value)661 static inline int bcf_int32_is_missing(int32_t value) { return value == bcf_int32_missing; }
662
663 // retrieve phase information from BCF record
664 // assumes little endian architecture
bcf_get_format_sign(bcf_fmt_t * fmt,int8_t * as_arr,int nsmpl)665 static int bcf_get_format_sign(bcf_fmt_t *fmt, int8_t *as_arr, int nsmpl) {
666 if (!fmt || fmt->n != 1) return 0;
667
668 #define BRANCH(type_t, is_vector_end, is_missing) \
669 { \
670 type_t *p = (type_t *)fmt->p; \
671 for (int i = 0; i < nsmpl; i++) { \
672 if (is_vector_end(p[i]) || is_missing(p[i])) \
673 as_arr[i] = bcf_int8_missing; \
674 else if (p[i] == (type_t)0) \
675 as_arr[i] = (int8_t)0; \
676 else if (p[i] > (type_t)0) \
677 as_arr[i] = (int8_t)1; \
678 else \
679 as_arr[i] = (int8_t)-1; \
680 } \
681 }
682 switch (fmt->type) {
683 case BCF_BT_INT8:
684 BRANCH(int8_t, bcf_int8_is_vector_end, bcf_int8_is_missing);
685 break;
686 case BCF_BT_INT16:
687 BRANCH(int16_t, bcf_int16_is_vector_end, bcf_int16_is_missing);
688 break;
689 case BCF_BT_INT32:
690 BRANCH(int32_t, bcf_int32_is_vector_end, bcf_int32_is_missing);
691 break;
692 case BCF_BT_FLOAT:
693 BRANCH(int32_t, bcf_float_is_vector_end, bcf_float_is_missing);
694 break;
695 default:
696 error("Unexpected type %d\n", fmt->type);
697 }
698 #undef BRANCH
699
700 return 1;
701 }
702
process(bcf1_t * rec)703 bcf1_t *process(bcf1_t *rec) {
704 // compute GC and CpG content for each site
705 if (args->flags & (INFO_GC | INFO_CPG)) {
706 int fa_len;
707 int at_cnt = 0, cg_cnt = 0, cpg_cnt = 0;
708 const char *ref = rec->d.allele[0];
709 char *fa = faidx_fetch_seq(args->fai, bcf_seqname(args->in_hdr, rec), rec->pos - args->gc_win,
710 rec->pos + (int)strlen(ref) - 1 + args->gc_win, &fa_len);
711 if (!fa)
712 error("fai_fetch_seq failed at %s:%" PRId64 "\n", bcf_hdr_id2name(args->in_hdr, rec->rid), rec->pos + 1);
713 for (int i = 0; i < fa_len; i++) {
714 if (fa[i] > 96) fa[i] = (char)(fa[i] - 32);
715 if (fa[i] == 'A' || fa[i] == 'T') at_cnt++;
716 if (fa[i] == 'C' || fa[i] == 'G') cg_cnt++;
717 if (i > 0)
718 if (fa[i - 1] == 'C' && fa[i] == 'G') cpg_cnt += 2;
719 }
720 free(fa);
721 float ratio;
722 if (args->flags & INFO_GC) {
723 ratio = (float)(cg_cnt) / (float)(at_cnt + cg_cnt);
724 bcf_update_info_float(args->out_hdr, rec, "GC", &ratio, 1);
725 }
726 if (args->flags & INFO_CPG) {
727 ratio = (float)cpg_cnt / (float)(fa_len);
728 bcf_update_info_float(args->out_hdr, rec, "CpG", &ratio, 1);
729 }
730 }
731
732 int ac_sex[] = {0, 0, 0, 0};
733 int missing_sex[] = {0, 0, 0, 0};
734 int pac_het[] = {0, 0};
735 int ad_het[] = {0, 0};
736 int summary[] = {0, 0};
737 int psummary[] = {0, 0};
738 float ret[4];
739
740 // extract format information from VCF format records
741 bcf_fmt_t *gt_fmt = bcf_get_fmt_id(rec, args->gt_id);
742 int gt_phase = bcf_get_genotype_phase(gt_fmt, args->gt_phase_arr, args->nsmpl);
743
744 // if samples or genotypes are not present, skip to the end
745 if (args->nsmpl == 0 || !bcf_get_unphased_genotype_alleles(gt_fmt, args->gt0_arr, args->gt1_arr, args->nsmpl))
746 goto ret;
747
748 bcf_fmt_t *as_fmt = bcf_get_fmt_id(rec, args->as_id);
749 int as_sign = (args->as_str && as_fmt) ? bcf_get_format_sign(as_fmt, args->as_arr, args->nsmpl) : 0;
750
751 bcf_fmt_t *ad_fmt = bcf_get_fmt_id(rec, args->ad_id);
752 int ad = ad_fmt && ad_fmt->n == rec->n_allele ? bcf_get_allelic_depth(ad_fmt, args->gt0_arr, args->gt1_arr,
753 args->ad0_arr, args->ad1_arr, args->nsmpl)
754 : 0;
755
756 bcf_fmt_t *baf_fmt = bcf_get_fmt_id(rec, args->baf_id);
757 int baf = baf_fmt && baf_fmt->n == 1 && baf_fmt->type == BCF_BT_FLOAT ? 1 : 0;
758
759 bcf_fmt_t *lrr_fmt = bcf_get_fmt_id(rec, args->lrr_id);
760 int lrr = lrr_fmt && lrr_fmt->n == 1 && lrr_fmt->type == BCF_BT_FLOAT ? 1 : 0;
761
762 if ((args->flags & (INFO_ALLELE_A | INFO_ALLELE_B)) && baf) {
763 int alleles[2];
764 switch (rec->n_allele) {
765 case 1:
766 alleles[0] = -1;
767 alleles[1] = -1;
768 break;
769 case 2:
770 alleles[0] = 0;
771 alleles[1] = 1;
772 break;
773 case 3:
774 alleles[0] = 1;
775 alleles[1] = 2;
776 break;
777 default:
778 error("Observed wrong number of alleles at %s:%" PRId64 "\n", bcf_hdr_id2name(args->in_hdr, rec->rid),
779 rec->pos + 1);
780 }
781 float median[2] = {NAN, NAN};
782 int alleles_idx[2] = {-1, -1};
783 for (int i = 0; i < 2; i++) {
784 int n = 0;
785 for (int j = 0; j < args->nsmpl; j++)
786 if (args->gt0_arr[j] == alleles[i] && args->gt1_arr[j] == alleles[i]) args->imap_arr[n++] = j;
787 median[i] = get_median((float *)baf_fmt->p, n, args->imap_arr);
788 if (median[i] < .5)
789 alleles_idx[i] = alleles[0];
790 else if (median[i] > .5)
791 alleles_idx[i] = alleles[1];
792 }
793 if (alleles_idx[0] == alleles_idx[1]) {
794 alleles_idx[0] = -1;
795 alleles_idx[1] = -1;
796 fprintf(stderr, "Unable to infer the A and B alleles while parsing the site %s:%" PRId64 "\n",
797 bcf_hdr_id2name(args->in_hdr, rec->rid), rec->pos + 1);
798 } else if (alleles_idx[0] == -1) {
799 alleles_idx[0] = alleles_idx[1] == alleles[0] ? alleles[1] : alleles[0];
800 } else if (alleles_idx[1] == -1) {
801 alleles_idx[1] = alleles_idx[0] == alleles[0] ? alleles[1] : alleles[0];
802 }
803 if (args->flags & INFO_ALLELE_A) bcf_update_info_int32(args->out_hdr, rec, "ALLELE_A", &alleles_idx[0], 1);
804 if (args->flags & INFO_ALLELE_B) bcf_update_info_int32(args->out_hdr, rec, "ALLELE_B", &alleles_idx[1], 1);
805 }
806
807 // determine ALLELE_A and ALLELE_B from VCF record
808 bcf_info_t *allele_a_info = bcf_get_info_id(rec, args->allele_a_id);
809 int8_t allele_a = allele_a_info ? ((int8_t *)allele_a_info->vptr)[0] : -1;
810 bcf_info_t *allele_b_info = bcf_get_info_id(rec, args->allele_b_id);
811 int8_t allele_b = allele_b_info ? ((int8_t *)allele_b_info->vptr)[0] : -1;
812
813 // adjust BAF and LRR values
814 if (args->adjust && baf && lrr) {
815 if (bcf_get_info_float(args->in_hdr, rec, "ADJ_COEFF", &args->adjust_arr, &args->m_adjust) < 0)
816 error("Could not retrieve adjusting coefficients at %s:%" PRId64 "\n",
817 bcf_hdr_id2name(args->in_hdr, rec->rid), rec->pos + 1);
818 if (args->m_adjust < 9)
819 error("Not enough adjusting coefficients at %s:%" PRId64 "\n", bcf_hdr_id2name(args->in_hdr, rec->rid),
820 rec->pos + 1);
821 for (int i = 0; i < args->nsmpl; i++) {
822 int n_a = (args->gt0_arr[i] == allele_a) + (args->gt1_arr[i] == allele_a);
823 int n_b = (args->gt0_arr[i] == allele_b) + (args->gt1_arr[i] == allele_b);
824 if (n_a + n_b != 2) continue;
825 ((float *)baf_fmt->p)[i] -=
826 args->adjust_arr[3 * n_b + 1] * ((float *)lrr_fmt->p)[i] - args->adjust_arr[3 * n_b];
827 ((float *)lrr_fmt->p)[i] -= args->adjust_arr[3 * n_b + 2];
828 }
829 }
830
831 for (int i = 0; i < args->nsmpl; i++) {
832 float curr_baf = NAN;
833
834 int is_missing = args->gt0_arr[i] == bcf_int16_missing || args->gt1_arr[i] == bcf_int16_missing;
835 if (args->gender) {
836 switch (args->gender[i]) {
837 case GENDER_MALE: // male
838 missing_sex[is_missing]++;
839 break;
840 case GENDER_FEMALE: // female
841 missing_sex[2 + is_missing]++;
842 break;
843 default:
844 break;
845 }
846 }
847
848 // if genotype is missing, skip
849 if (is_missing) continue;
850
851 int idx_as_sign =
852 (as_sign && args->as_arr[i] != bcf_int8_missing && args->as_arr[i] != 0) ? (1 - args->as_arr[i]) / 2 : -1;
853 if (idx_as_sign >= 0) summary[idx_as_sign]++;
854
855 int is_het = args->gt0_arr[i] != args->gt1_arr[i];
856 if (args->gender) {
857 switch (args->gender[i]) {
858 case GENDER_MALE: // male
859 ac_sex[is_het]++;
860 break;
861 case GENDER_FEMALE: // female
862 ac_sex[2 + is_het]++;
863 break;
864 default:
865 break;
866 }
867 }
868
869 // if genotype is not heterozygous reference / alternate, skip
870 if (!is_het || (args->gt0_arr[i] != 0 && args->gt1_arr[i] != 0)) continue;
871
872 int idx_gt_phase = gt_phase && (args->gt_phase_arr[i] == -1 || args->gt_phase_arr[i] == 1)
873 ? (1 - args->gt_phase_arr[i]) / 2
874 : -1;
875 if (idx_gt_phase >= 0) pac_het[idx_gt_phase]++;
876
877 int idx_fmt_phase =
878 (idx_gt_phase >= 0 && idx_as_sign >= 0) ? (1 - args->as_arr[i] * args->gt_phase_arr[i]) / 2 : -1;
879 if (idx_fmt_phase >= 0) psummary[idx_fmt_phase]++;
880
881 if (ad) {
882 int ref_cnt = args->ad0_arr[i];
883 int alt_cnt = args->ad1_arr[i];
884 ad_het[0] += ref_cnt;
885 ad_het[1] += alt_cnt;
886 curr_baf = ((float)alt_cnt + 0.5f) / ((float)ref_cnt + (float)alt_cnt + 1.0f);
887 }
888 if (baf) curr_baf = ((float *)baf_fmt->p)[i];
889 if (idx_gt_phase >= 0 && !isnan(curr_baf)) {
890 args->baf_arr[idx_gt_phase][pac_het[idx_gt_phase] - 1] = curr_baf;
891 }
892 }
893
894 if (args->flags & INFO_AC_SEX) bcf_update_info_int32(args->out_hdr, rec, "AC_Sex", &ac_sex, 4);
895 if (args->flags & INFO_MISSING_SEX) bcf_update_info_int32(args->out_hdr, rec, "MISSING_Sex", &missing_sex, 4);
896 if (args->flags & INFO_pAC_HET) bcf_update_info_int32(args->out_hdr, rec, "pAC_Het", &pac_het, 2);
897 if ((args->flags & INFO_AD_HET) && ad) bcf_update_info_int32(args->out_hdr, rec, "AD_Het", &ad_het, 2);
898
899 if ((args->flags & INFO_pBAF_STATS) && (ad || baf)) {
900 ret[0] = get_median(args->baf_arr[0], pac_het[0], NULL);
901 ret[1] = get_median(args->baf_arr[1], pac_het[1], NULL);
902 ret[2] = welch_t_test(args->baf_arr[0], args->baf_arr[1], pac_het[0], pac_het[1]);
903 ret[3] = mann_whitney_u(args->baf_arr[0], args->baf_arr[1], pac_het[0], pac_het[1]);
904 bcf_update_info_float(args->out_hdr, rec, "pBAF_Stats", &ret, 4);
905 }
906
907 if ((args->flags & INFO_COR_BAF_LRR) && baf && lrr) {
908 float rho[3];
909 for (int i = 0; i < 3; i++) {
910 int n = 0;
911 for (int j = 0; j < args->nsmpl; j++) {
912 int n_a = (args->gt0_arr[j] == allele_a) + (args->gt1_arr[j] == allele_a);
913 int n_b = (args->gt0_arr[j] == allele_b) + (args->gt1_arr[j] == allele_b);
914 if (n_a == 2 - i && n_b == i) args->imap_arr[n++] = j;
915 }
916 // compute the Pearson correlation
917 float xss = 0.0f, yss = 0.0f, xyss = 0.0f;
918 get_cov((float *)baf_fmt->p, (float *)lrr_fmt->p, n, args->imap_arr, &xss, &yss, &xyss);
919 rho[i] = xyss / sqrtf(xss * yss);
920 }
921 bcf_update_info_float(args->out_hdr, rec, "Cor_BAF_LRR", &rho, 3);
922 }
923
924 ret:
925 if (args->as_str)
926 bcf_update_info_int32(args->out_hdr, rec, args->summary_str.s, args->phase ? &psummary : &summary, 2);
927
928 // perform binomial or Fisher's exact test on existing INFO field
929 if (args->info_str) {
930 bcf_info_t *info = bcf_get_info_id(rec, args->info_id);
931 if (!(info->type & (BCF_BT_INT8 | BCF_BT_INT16 | BCF_BT_INT32)) || (info->len != 2 && info->len != 4))
932 error("INFO field %s at %s:%" PRId64 " should contain two or four integers\n", args->info_str,
933 bcf_hdr_id2name(args->in_hdr, rec->rid), rec->pos + 1);
934 bcf_get_info_int32(args->out_hdr, rec, args->info_str, &args->info_arr, &args->m_info);
935 if (info->len == 2) {
936 double phred_pval = phred_pbinom(args->info_arr[0], args->info_arr[0] + args->info_arr[1]);
937 ret[0] = (float)(args->phred ? phred_pval : exp(-0.1 * M_LN10 * phred_pval));
938 bcf_update_info_float(args->out_hdr, rec, args->test_str.s, &ret, 1);
939 } else {
940 double left, right, fisher;
941 double pval = kt_fisher_exact(args->info_arr[0], args->info_arr[1], args->info_arr[2], args->info_arr[3],
942 &left, &right, &fisher);
943 ret[0] = (float)(args->phred ? -10.0 * M_LOG10E * log(pval) : pval);
944 }
945 bcf_update_info_float(args->out_hdr, rec, args->test_str.s, &ret, 1);
946 }
947
948 // remove all samples if sites_only was selected
949 if (bcf_hdr_nsamples(args->out_hdr) == 0) bcf_subset(args->out_hdr, rec, 0, NULL);
950 return rec;
951 }
952
destroy(void)953 void destroy(void) {
954 phred_pbinom(-1, -1);
955 free(args->gender);
956 free(args->gt_phase_arr);
957 free(args->as_arr);
958 free(args->gt0_arr);
959 free(args->gt1_arr);
960 free(args->ad0_arr);
961 free(args->ad1_arr);
962 free(args->info_arr);
963 free(args->adjust_arr);
964 free(args->baf_arr[0]);
965 free(args->baf_arr[1]);
966 free(args->imap_arr);
967 free(args->summary_str.s);
968 free(args->test_str.s);
969 free(args);
970 }
971