1 /* The MIT License
2 
3    Copyright (C) 2017-2021 Giulio Genovese
4 
5    Author: Giulio Genovese <giulio.genovese@gmail.com>
6 
7    Permission is hereby granted, free of charge, to any person obtaining a copy
8    of this software and associated documentation files (the "Software"), to deal
9    in the Software without restriction, including without limitation the rights
10    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11    copies of the Software, and to permit persons to whom the Software is
12    furnished to do so, subject to the following conditions:
13 
14    The above copyright notice and this permission notice shall be included in
15    all copies or substantial portions of the Software.
16 
17    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23    THE SOFTWARE.
24 
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <getopt.h>
30 #include <htslib/vcf.h>
31 #include <htslib/faidx.h>
32 #include <htslib/kfunc.h>
33 #include "mocha.h"
34 #include "bcftools.h"
35 
36 #define MOCHATOOLS_VERSION "2021-10-15"
37 
38 #define TAG_LIST_DFLT "none"
39 #define GC_WIN_DFLT "200"
40 
41 #define INFO_GC (1 << 0)
42 #define INFO_CPG (1 << 1)
43 #define INFO_ALLELE_A (1 << 2)
44 #define INFO_ALLELE_B (1 << 3)
45 #define INFO_AC_SEX (1 << 4)
46 #define INFO_MISSING_SEX (1 << 5)
47 #define INFO_pAC_HET (1 << 6)
48 #define INFO_AD_HET (1 << 7)
49 #define INFO_pBAF_STATS (1 << 8)
50 #define INFO_COR_BAF_LRR (1 << 9)
51 
sq(double x)52 static inline double sq(double x) { return x * x; }
53 
54 typedef struct {
55     int flags;
56     int adjust; // whether to adjust BAF and LRR
57     int gc_win; // length of window to compute GC and CpG content
58     int *gender;
59     char *as_str;   // format field ID to summarize for allelic shift count summary
60     int phase;      // whether phase information should be included in the allelic shift
61     char *info_str; // info field for binomial or Fisher's exact test
62     int phred;      // whether the test result should be phred scaled
63     int nsmpl, allele_a_id, allele_b_id, adjust_id, gt_id, ad_id, baf_id, lrr_id, info_id, as_id;
64     int8_t *gt_phase_arr, *as_arr;
65     int16_t *gt0_arr, *gt1_arr, *ad0_arr, *ad1_arr;
66     int32_t *info_arr;
67     int m_info;
68     float *adjust_arr;
69     int m_adjust;
70     float *baf_arr[2];
71     int *imap_arr;
72     faidx_t *fai;
73     bcf_hdr_t *in_hdr, *out_hdr;
74     kstring_t summary_str, test_str;
75 } args_t;
76 
77 args_t *args;
78 
read_computed_gender(bcf_hdr_t * hdr,char * fname)79 static int *read_computed_gender(bcf_hdr_t *hdr, char *fname) {
80     htsFile *fp = hts_open(fname, "r");
81     if (fp == NULL) error("Could not open %s: %s\n", fname, strerror(errno));
82 
83     kstring_t str = {0, 0, NULL};
84     if (hts_getline(fp, KS_SEP_LINE, &str) <= 0) error("Empty file: %s\n", fname);
85     tsv_t *tsv = tsv_init_delimiter(str.s, '\t');
86     int idx, computed_gender;
87     if (tsv_register(tsv, "sample_id", tsv_read_sample_id, (void *)&idx) < 0)
88         error("File %s is missing the sample_id column\n", fname);
89     if (tsv_register(tsv, "computed_gender", tsv_read_computed_gender, (void *)&computed_gender) < 0)
90         error("File %s is missing the computed_gender column\n", fname);
91 
92     int i = 0;
93     int *gender = (int *)calloc((size_t)bcf_hdr_nsamples(hdr), sizeof(int));
94     while (hts_getline(fp, KS_SEP_LINE, &str) > 0) {
95         if (!tsv_parse_delimiter(tsv, (bcf1_t *)hdr, str.s, '\t')) {
96             if (idx < 0) continue;
97             gender[idx] = computed_gender;
98             i++;
99         } else {
100             error("Could not parse line: %s\n", str.s);
101         }
102     }
103 
104     tsv_destroy(tsv);
105     free(str.s);
106     hts_close(fp);
107     return gender;
108 }
109 
about(void)110 const char *about(void) { return "MOsaic CHromosomal Alterations tools.\n"; }
111 
usage(void)112 const char *usage(void) {
113     return "\n"
114            "About: tools for the MOsaic CHromosomal Alterations pipeline. "
115            "(version " MOCHATOOLS_VERSION
116            " https://github.com/freeseek/mocha)\n"
117            "Usage: bcftools +mochatools [General Options] -- [Plugin Options]\n"
118            "\n"
119            "Options:\n"
120            "   run \"bcftools plugin\" for a list of common options\n"
121            "\n"
122            "Plugin options:\n"
123            "   -l, --list-tags               list available INFO tags with description for VCF output\n"
124            "   -t, --tags LIST               list of output INFO tags [" TAG_LIST_DFLT
125            "]\n"
126            "       --adjust                  adjust BAF and LRR using INFO/ADJ_COEFF\n"
127            "   -f, --fasta-ref <file>        reference sequence to compute GC and CpG content\n"
128            "       --gc-window-size <int>    window size in bp used to compute the GC and CpG content [" GC_WIN_DFLT
129            "]\n"
130            "   -x, --sex <file>              file including information about the gender of the samples\n"
131            "       --summary <tag>           allelic shift FORMAT tag to summarize\n"
132            "       --phase                   whether phase information should be included in the allelic shift\n"
133            "       --test <tag>              performs binomial or Fisher's exact test for INFO tag\n"
134            "       --phred                   reports p-values as phred scaled\n"
135            "   -s, --samples [^]<list>       comma separated list of samples to include (or exclude with \"^\" "
136            "prefix)\n"
137            "   -S, --samples-file [^]<file>  file of samples to include (or exclude with \"^\" prefix)\n"
138            "       --force-samples           only warn about unknown subset samples\n"
139            "   -G, --drop-genotypes          drop individual genotype information (after running statistical tests)\n"
140            "\n"
141            "Examples:\n"
142            "    bcftools +mochatools -- -l\n"
143            "    bcftools +mochatools file.bcf -Ob -o output.bcf -- -t GC -f human_g1k_v37.fasta\n"
144            "    bcftools +mochatools file.bcf -Ob -o output.bcf -- -t ALLELE_A,ALLELE_B\n"
145            "    bcftools +mochatools file.bcf -- --summary AS --test AS --drop-genotypes\n"
146            "    bcftools +mochatools file.bcf -- --summary AS --phase --test pAS --drop-genotypes\n"
147            "\n";
148 }
149 
parse_tags(const char * str)150 static int parse_tags(const char *str) {
151     if (!args->in_hdr) error("%s", usage());
152 
153     int flags = 0, n_tags;
154     if (!strcasecmp(str, "none")) return flags;
155     char **tags = hts_readlist(str, 0, &n_tags);
156     for (int i = 0; i < n_tags; i++) {
157         if (!strcasecmp(tags[i], "GC"))
158             flags |= INFO_GC;
159         else if (!strcasecmp(tags[i], "CpG"))
160             flags |= INFO_CPG;
161         else if (!strcasecmp(tags[i], "ALLELE_A"))
162             flags |= INFO_ALLELE_A;
163         else if (!strcasecmp(tags[i], "ALLELE_B"))
164             flags |= INFO_ALLELE_B;
165         else if (!strcasecmp(tags[i], "AC_Sex"))
166             flags |= INFO_AC_SEX;
167         else if (!strcasecmp(tags[i], "MISSING_Sex"))
168             flags |= INFO_MISSING_SEX;
169         else if (!strcasecmp(tags[i], "pAC_Het"))
170             flags |= INFO_pAC_HET;
171         else if (!strcasecmp(tags[i], "AD_Het"))
172             flags |= INFO_AD_HET;
173         else if (!strcasecmp(tags[i], "pBAF_Stats"))
174             flags |= INFO_pBAF_STATS;
175         else if (!strcasecmp(tags[i], "Cor_BAF_LRR"))
176             flags |= INFO_COR_BAF_LRR;
177         else
178             error("Error parsing \"--tags %s\": the tag \"%s\" is not supported\n", str, tags[i]);
179         free(tags[i]);
180     }
181     if (n_tags) free(tags);
182     return flags;
183 }
184 
list_tags(void)185 static void list_tags(void) {
186     error(
187         "INFO/GC           Number:1  Type:Float    ..  GC ratio content around the marker\n"
188         "INFO/CpG          Number:1  Type:Float    ..  CpG ratio content around the marker\n"
189         "INFO/ALLELE_A     Number:1  Type:Integer  ..  A allele\n"
190         "INFO/ALLELE_B     Number:1  Type:Integer  ..  B allele\n"
191         "INFO/AC_Sex       Number:4  Type:Integer  ..  Number of homozygous and heterozygous genotypes by gender\n"
192         "INFO/MISSING_Sex  Number:4  Type:Integer  ..  Number of non-missing and missing genotypes by gender\n"
193         "INFO/pAC_Het      Number:2  Type:Integer  ..  Number of heterozygous genotypes by transmission type\n"
194         "INFO/AD_Het       Number:2  Type:Integer  ..  Allelic depths for the reference and alternate alleles across "
195         "heterozygous genotypes\n"
196         "INFO/pBAF_Stats   Number:4  Type:Float    ..  Welch's t-test and Mann-Whitney U test for BAF by transmission "
197         "type across heterozygous genotypes\n"
198         "INFO/Cor_BAF_LRR  Number:3  Type=Float    ..  Pearson correlation for BAF and LRR at AA, AB, and BB "
199         "genotypes\n");
200 }
201 
init(int argc,char ** argv,bcf_hdr_t * in,bcf_hdr_t * out)202 int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) {
203     args = (args_t *)calloc(1, sizeof(args_t));
204     const char *tag_list = TAG_LIST_DFLT;
205     char *tmp;
206     args->gc_win = (int)strtol(GC_WIN_DFLT, NULL, 0);
207     args->in_hdr = in;
208     args->out_hdr = out;
209     args->info_str = NULL;
210     args->as_str = NULL;
211     int sample_is_file = 0;
212     int force_samples = 0;
213     int sites_only = 0;
214     char *sample_names = NULL;
215     char *gender_fname = NULL;
216     char *ref_fname = NULL;
217     kstring_t str = {0, 0, NULL};
218 
219     int c;
220     static struct option loptions[] = {{"list-tags", no_argument, NULL, 'l'},
221                                        {"tags", required_argument, NULL, 't'},
222                                        {"adjust", no_argument, NULL, 1},
223                                        {"fasta-ref", required_argument, NULL, 'f'},
224                                        {"gc-window-size", required_argument, NULL, 2},
225                                        {"sex", required_argument, NULL, 'x'},
226                                        {"summary", required_argument, NULL, 3},
227                                        {"phase", no_argument, NULL, 4},
228                                        {"test", required_argument, NULL, 5},
229                                        {"phred", no_argument, NULL, 6},
230                                        {"samples", required_argument, NULL, 's'},
231                                        {"samples-file", required_argument, NULL, 'S'},
232                                        {"force-samples", no_argument, NULL, 7},
233                                        {"drop-genotypes", no_argument, NULL, 'G'},
234                                        {NULL, 0, NULL, 0}};
235 
236     while ((c = getopt_long(argc, argv, "h?lt:f:x:s:S:G", loptions, NULL)) >= 0) {
237         switch (c) {
238         case 'l':
239             list_tags();
240             break;
241         case 't':
242             tag_list = optarg;
243             break;
244         case 1:
245             args->adjust = 1;
246             break;
247         case 'f':
248             ref_fname = optarg;
249             break;
250         case 2:
251             args->gc_win = (int)strtol(optarg, &tmp, 0);
252             if (*tmp) error("Could not parse: -w %s\n", optarg);
253             if (args->gc_win <= 0) error("Window size is not positive: -w %s\n", optarg);
254             break;
255         case 'x':
256             gender_fname = optarg;
257             break;
258         case 3:
259             args->as_str = optarg;
260             break;
261         case 4:
262             args->phase = 1;
263             break;
264         case 5:
265             args->info_str = optarg;
266             break;
267         case 6:
268             args->phred = 1;
269             break;
270         case 's':
271             sample_names = optarg;
272             break;
273         case 'S':
274             sample_names = optarg;
275             sample_is_file = 1;
276             break;
277         case 7:
278             force_samples = 1;
279             break;
280         case 'G':
281             sites_only = 1;
282             break;
283         case 'h':
284         case '?':
285         default:
286             error("%s", usage());
287             break;
288         }
289     }
290     args->flags |= parse_tags(tag_list);
291 
292     // this ugly workaround is required to make sure we can set samples on both headers even
293     // when sample_is_file is true and sample_names is stdin
294     if (sample_names) {
295         int nsmpl;
296         char **smpl = hts_readlist(sample_names[0] == '^' ? sample_names + 1 : sample_names, sample_is_file, &nsmpl);
297         kstring_t tmp = {0, 0, NULL};
298         if (sample_names[0] == '^')
299             ksprintf(&tmp, "^%s", smpl[0]);
300         else
301             ksprintf(&tmp, "%s", smpl[0]);
302         for (int i = 1; i < nsmpl; i++) ksprintf(&tmp, ",%s", smpl[i]);
303         int ret = bcf_hdr_set_samples(args->in_hdr, tmp.s, 0);
304         if (ret < 0)
305             error("Error parsing the sample list\n");
306         else if (ret > 0) {
307             if (force_samples)
308                 fprintf(stderr,
309                         "Warn: subset called for sample that does not exist in header: "
310                         "\"%s\"... skipping\n",
311                         smpl[ret - 1]);
312             else
313                 error(
314                     "Error: subset called for sample that does not exist in header: \"%s\". "
315                     "Use \"--force-samples\" to ignore this error.\n",
316                     smpl[ret - 1]);
317         }
318         if (bcf_hdr_nsamples(args->in_hdr) == 0) fprintf(stderr, "Warn: subsetting has removed all samples\n");
319         if (bcf_hdr_set_samples(args->out_hdr, tmp.s, 0) < 0) error("Error parsing the sample list\n");
320         free(tmp.s);
321         for (int i = 0; i < nsmpl; i++) free(smpl[i]);
322         free(smpl);
323     }
324 
325     if (gender_fname) args->gender = read_computed_gender(args->in_hdr, gender_fname);
326 
327     if (args->flags & (INFO_GC | INFO_CPG)) {
328         if (!ref_fname) error("Reference sequence not provided, cannot infer GC or CpG content\n");
329         args->fai = fai_load(ref_fname);
330         if (!args->fai) error("Failed to load the fai index: %s\n", ref_fname);
331         bcf_hdr_append(args->out_hdr,
332                        "##INFO=<ID=GC,Number=1,Type=Float,Description=\"GC ratio content around the marker\">");
333         bcf_hdr_append(args->out_hdr,
334                        "##INFO=<ID=CpG,Number=1,Type=Float,Description=\"CpG ratio content around the marker\">");
335     }
336 
337     args->allele_a_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "ALLELE_A");
338     if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_INFO, args->allele_a_id)) args->allele_a_id = -1;
339     if (args->allele_a_id >= 0 && bcf_hdr_id2type(args->in_hdr, BCF_HL_INFO, args->allele_a_id) != BCF_HT_INT)
340         error("Error: input VCF file ALLELE_A info field is not of integer type\n");
341 
342     args->allele_b_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "ALLELE_B");
343     if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_INFO, args->allele_a_id)) args->allele_b_id = -1;
344     if (args->allele_b_id >= 0 && bcf_hdr_id2type(args->in_hdr, BCF_HL_INFO, args->allele_b_id) != BCF_HT_INT)
345         error("Error: input VCF file ALLELE_B info field is not of float type\n");
346 
347     args->adjust_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "ADJ_COEFF");
348     if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_INFO, args->adjust_id)) args->adjust_id = -1;
349     if (args->adjust_id >= 0 && bcf_hdr_id2type(args->in_hdr, BCF_HL_INFO, args->adjust_id) != BCF_HT_REAL)
350         error("Error: input VCF file ADJ_COEFF info field is not of integer type\n");
351     if (args->adjust_id >= 0
352         && (bcf_hdr_id2length(args->in_hdr, BCF_HL_INFO, args->adjust_id) != BCF_VL_FIXED
353             || bcf_hdr_id2number(args->in_hdr, BCF_HL_INFO, args->adjust_id) != 9))
354         error("Error: input VCF file ADJ_COEFF info field has wrong number of values\n");
355 
356     if (args->adjust && args->adjust_id < 0)
357         error("Error: ADJ_COEFF field is not present, cannot perform --adjust correction\n");
358 
359     args->nsmpl = bcf_hdr_nsamples(args->in_hdr);
360 
361     args->gt_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "GT");
362     if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->gt_id)) args->gt_id = -1;
363 
364     args->ad_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "AD");
365     if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->ad_id)) args->ad_id = -1;
366     if (args->ad_id >= 0 && bcf_hdr_id2type(args->in_hdr, BCF_HL_FMT, args->ad_id) != BCF_HT_INT)
367         error("Error: input VCF file AD format field is not of integer type\n");
368     if (args->ad_id >= 0 && bcf_hdr_id2length(args->in_hdr, BCF_HL_FMT, args->ad_id) != BCF_VL_R)
369         error("Error: input VCF file AD format field has wrong number of values\n");
370 
371     args->baf_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "BAF");
372     if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->baf_id)) args->baf_id = -1;
373     if (args->baf_id >= 0 && bcf_hdr_id2type(args->in_hdr, BCF_HL_FMT, args->baf_id) != BCF_HT_REAL)
374         error("Error: input VCF file BAF format field is not of float type\n");
375 
376     args->lrr_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "LRR");
377     if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->lrr_id)) args->lrr_id = -1;
378     if (args->lrr_id >= 0 && bcf_hdr_id2type(args->in_hdr, BCF_HL_FMT, args->lrr_id) != BCF_HT_REAL)
379         error("Error: input VCF file LRR format field is not of float type\n");
380 
381     if (args->flags & (INFO_ALLELE_A | INFO_ALLELE_B)) {
382         if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->gt_id))
383             error("Error: GT format field is not present, cannot infer ALLELE_A or ALLELE_B\n");
384         if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->baf_id))
385             error("Error: BAF format field is not present, cannot infer ALLELE_A or ALLELE_B\n");
386         if (args->flags & INFO_ALLELE_A) {
387             if (bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "ALLELE_A") >= 0)
388                 error("Field ALLELE_A already present in the VCF.\n");
389             bcf_hdr_append(args->out_hdr, "##INFO=<ID=ALLELE_A,Number=1,Type=Integer,Description=\"A allele\">");
390         }
391         if (args->flags & INFO_ALLELE_B) {
392             if (bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, "ALLELE_B") >= 0)
393                 error("Field ALLELE_B already present in the VCF.\n");
394             if (args->flags & INFO_ALLELE_B)
395                 bcf_hdr_append(args->out_hdr, "##INFO=<ID=ALLELE_B,Number=1,Type=Integer,Description=\"B allele\">");
396         }
397     }
398 
399     if (args->flags & (INFO_AC_SEX)) {
400         if (!args->gender) error("Error: AC_Sex require --sex\n");
401         bcf_hdr_append(args->out_hdr,
402                        "##INFO=<ID=AC_Sex,Number=4,Type=Integer,Description=\"Number of homozygous and heterozygous "
403                        "genotypes by gender\">");
404     }
405 
406     if (args->flags & (INFO_MISSING_SEX)) {
407         if (!args->gender) error("Error: MISSING_Sex require --sex\n");
408         bcf_hdr_append(args->out_hdr,
409                        "##INFO=<ID=MISSING_Sex,Number=4,Type=Integer,Description=\"Number of non-missing and missing "
410                        "genotypes by gender\">");
411     }
412 
413     if (args->flags & INFO_pAC_HET) {
414         if (args->gt_id < 0) error("Error: GT format field is not present, cannot compute pAC_Het\n");
415         bcf_hdr_append(args->out_hdr,
416                        "##INFO=<ID=pAC_Het,Number=2,Type=Integer,Description=\"Number of heterozygous genotypes "
417                        "by transmission type\">");
418     }
419 
420     if (args->flags & INFO_AD_HET) {
421         if (args->gt_id < 0) error("Error: GT format field is not present, cannot compute AD_Het\n");
422         if (args->ad_id < 0) error("Error: AD format field is not present, cannot compute AD_Het\n");
423         bcf_hdr_append(args->out_hdr,
424                        "##INFO=<ID=AD_Het,Number=2,Type=Integer,Description=\"Allelic depths for the reference and "
425                        "alternate alleles across heterozygous genotypes\">");
426     }
427 
428     if (args->flags & INFO_pBAF_STATS) {
429         if (args->gt_id < 0) error("Error: GT format field is not present, cannot compute pBAF_Stats\n");
430         if (args->ad_id < 0 && args->baf_id < 0)
431             error("Error: Neither AD nor BAF format fields are present, cannot compute pBAF_Stats\n");
432         bcf_hdr_append(
433             args->out_hdr,
434             "##INFO=<ID=pBAF_Stats,Number=4,Type=Float,Description=\"Welch'"
435             "s t-test and Mann-Whitney U test for allelic transmission ratios across heterozygous genotypes\">");
436     }
437 
438     if (args->flags & INFO_COR_BAF_LRR) {
439         if (args->allele_a_id < 0)
440             error("Error: ALLELE_A field is not present, cannot perform --cor-BAF-LRR analysis\n");
441         if (args->allele_b_id < 0)
442             error("Error: ALLELE_B field is not present, cannot perform --cor-BAF-LRR analysis\n");
443         if (args->baf_id < 0) error("Error: BAF format is not present, cannot perform --cor-BAF-LRR analysis\n");
444         if (args->lrr_id < 0) error("Error: LRR format is not present, cannot perform --cor-BAF-LRR analysis\n");
445         bcf_hdr_append(args->out_hdr,
446                        "##INFO=<ID=Cor_BAF_LRR,Number=3,Type=Float,Description=\"Pearson "
447                        "correlation for BAF and LRR at AA, AB, and BB genotypes\">");
448     }
449 
450     if (args->as_str) {
451         args->as_id = bcf_hdr_id2int(args->in_hdr, BCF_DT_ID, args->as_str);
452         if (!bcf_hdr_idinfo_exists(args->in_hdr, BCF_HL_FMT, args->as_id))
453             error("Error: %s format field is not present, cannot perform --summary\n", args->as_str);
454         if (args->phase) kputc('p', &args->summary_str);
455         kputs(args->as_str, &args->summary_str);
456         ksprintf(&str, "##INFO=<ID=%s,Number=2,Type=Integer,Description=\"%sAllelic shift counts for %s\">",
457                  args->summary_str.s, args->phase ? "phased " : "", args->as_str);
458         bcf_hdr_append(args->out_hdr, str.s);
459     }
460 
461     // this analysis is performed using a field that can be added by the previous analyses
462     if (args->info_str) {
463         args->info_id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->info_str);
464         if (bcf_hdr_sync(args->out_hdr) < 0) error_errno("[%s] Failed to update header", __func__);
465         if (!bcf_hdr_idinfo_exists(args->out_hdr, BCF_HL_INFO, args->info_id))
466             error("Error: %s info field is not present, cannot perform --test\n", args->info_str);
467         int number = bcf_hdr_id2number(args->out_hdr, BCF_HL_INFO, args->info_id);
468         if (args->phred) kputc('p', &args->test_str);
469         if (number == 2)
470             kputs("binom_", &args->test_str);
471         else if (number == 4)
472             kputs("fisher_", &args->test_str);
473         else
474             error("Error: %s info field must contain 2 or 4 elements but it contains %d\n", args->info_str, number);
475         kputs(args->info_str, &args->test_str);
476         str.l = 0;
477         ksprintf(&str, "##INFO=<ID=%s,Number=1,Type=Float,Description=\"%s%s test for %s\">", args->test_str.s,
478                  args->phred ? "Phred scaled " : "", number == 2 ? "Binomial" : "Fisher's exact", args->info_str);
479         bcf_hdr_append(args->out_hdr, str.s);
480     }
481 
482     if (sites_only) {
483         if (bcf_hdr_set_samples(args->out_hdr, NULL, 0) < 0) error("Error parsing the sample list\n");
484         bcf_hdr_remove(args->out_hdr, BCF_HL_FMT, NULL);
485     }
486 
487     args->gt_phase_arr = (int8_t *)malloc(args->nsmpl * sizeof(int8_t));
488     args->as_arr = (int8_t *)malloc(args->nsmpl * sizeof(int8_t));
489     args->gt0_arr = (int16_t *)malloc(args->nsmpl * sizeof(int16_t));
490     args->gt1_arr = (int16_t *)malloc(args->nsmpl * sizeof(int16_t));
491     args->ad0_arr = (int16_t *)malloc(args->nsmpl * sizeof(int16_t));
492     args->ad1_arr = (int16_t *)malloc(args->nsmpl * sizeof(int16_t));
493     args->baf_arr[0] = (float *)malloc(args->nsmpl * sizeof(float));
494     args->baf_arr[1] = (float *)malloc(args->nsmpl * sizeof(float));
495     args->imap_arr = (int *)malloc(args->nsmpl * sizeof(int));
496 
497     free(str.s);
498     return 0;
499 }
500 
501 // Heng Li's implementation in htslib/kfunc.c
502 #define KF_GAMMA_EPS 1e-14
503 #define KF_TINY 1e-290
504 
log_kf_betai_aux(double a,double b,double x)505 static double log_kf_betai_aux(double a, double b, double x) {
506     double C, D, f;
507     int j;
508     if (x == 0.) return 0.;
509     if (x == 1.) return 1.;
510     f = 1.;
511     C = f;
512     D = 0.;
513     // Modified Lentz's algorithm for computing continued fraction
514     for (j = 1; j < 200; ++j) {
515         double aa, d;
516         int m = j >> 1;
517         aa = (j & 1) ? -(a + m) * (a + b + m) * x / ((a + 2 * m) * (a + 2 * m + 1))
518                      : m * (b - m) * x / ((a + 2 * m - 1) * (a + 2 * m));
519         D = 1. + aa * D;
520         if (D < KF_TINY) D = KF_TINY;
521         C = 1. + aa / C;
522         if (C < KF_TINY) C = KF_TINY;
523         D = 1. / D;
524         d = C * D;
525         f *= d;
526         if (fabs(d - 1.) < KF_GAMMA_EPS) break;
527     }
528     return (kf_lgamma(a + b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1. - x)) - log(a * f);
529 }
530 
531 // returns -10 * (log(2) + pbinom(min(k, n - k), n, 1/2, log.p = TRUE)) / log(10)
phred_pbinom(int k,int n)532 static double phred_pbinom(int k, int n) {
533     static double *dbinom = NULL, *pbinom = NULL;
534     static size_t n_size = 0, m_dbinom = 0, m_pbinom = 0;
535 
536     if (n < 0 && k < 0) {
537         free(dbinom);
538         free(pbinom);
539         return NAN;
540     }
541 
542     if (n < 0 || k < 0 || k > n) return NAN;
543 
544     if (k == n >> 1) return 0.0;
545 
546     if (k << 1 > n) k = n - k;
547 
548     if (n > 1000) return -10.0 * M_LOG10E * (M_LN2 + log_kf_betai_aux(n - k, k + 1, .5));
549 
550     if (n >= n_size) {
551         size_t len = (size_t)(1 + (1 + (n >> 1)) * ((n + 1) >> 1));
552         hts_expand(double, len, m_dbinom, dbinom);
553         hts_expand(double, len, m_pbinom, pbinom);
554         dbinom[0] = 1.0;
555         for (int i = n_size ? (int)n_size : 1; i < n + 1; i++) {
556             int prev_idx = i - 1 ? 1 + ((i - 1) >> 1) * (i >> 1) : 0;
557             int curr_idx = 1 + (i >> 1) * ((i + 1) >> 1);
558             dbinom[curr_idx] = dbinom[prev_idx] * 0.5;
559             pbinom[curr_idx] = dbinom[curr_idx];
560             for (int j = 1; j < ((i + 1) >> 1); j++) {
561                 curr_idx++;
562                 dbinom[curr_idx] = (double)i / (double)j * dbinom[prev_idx] * 0.5;
563                 pbinom[curr_idx] = pbinom[curr_idx - 1] + dbinom[curr_idx];
564                 prev_idx++;
565             }
566         }
567         n_size = (size_t)(n + 1);
568     }
569 
570     int idx = 1 + (n >> 1) * ((n + 1) >> 1) + k;
571     return -10.0 * M_LOG10E * (M_LN2 + log(pbinom[idx]));
572 }
573 
sample_mean_var(const float * x,int n,double * xm,double * xss)574 static int sample_mean_var(const float *x, int n, double *xm, double *xss) {
575     if (n < 2) return -1;
576     *xm = 0;
577     *xss = 0;
578     int j = 0;
579     for (int i = 0; i < n; i++) {
580         if (!isnan(x[i])) {
581             *xm += (double)x[i];
582             *xss += sq((double)x[i]);
583             j++;
584         }
585     }
586     if (j <= 1) return -1;
587     *xm /= (double)j;
588     *xss -= sq(*xm) * (double)j;
589     *xss /= (double)(j - 1);
590     return 0;
591 }
592 
welch_t_test(float * a,float * b,int na,int nb)593 static double welch_t_test(float *a, float *b, int na, int nb) {
594     double mua, mub, sa2, sb2, t, v;
595     if (na < 2 || nb < 2) return HUGE_VAL;
596     sample_mean_var(a, na, &mua, &sa2);
597     sample_mean_var(b, nb, &mub, &sb2);
598     t = (mua - mub) / sqrt(sa2 / na + sb2 / nb);
599     v = (sa2 / na + sb2 / nb);
600     v *= v;
601     v /= sq(sa2) / na / na / (na - 1) + sq(sb2) / nb / nb / (nb - 1);
602     return kf_betai(v / 2.0f, 0.5, v / (v + sq(t)));
603 }
604 
605 // Petr Danecek's and James Bonfield's implementation in bcftools/bam2bcf.c
606 double mann_whitney_1947_cdf(int n, int m, int U);
607 
cmpfunc(const void * a,const void * b)608 static int cmpfunc(const void *a, const void *b) { return (*(float *)a > *(float *)b) - (*(float *)a < *(float *)b); }
609 
610 // it currently does not handle nans
611 // adapted from Petr Danecek's implementation of calc_mwu_bias_cdf() in bcftools/bam2bcf.c
mann_whitney_u(float * a,float * b,int na,int nb)612 static double mann_whitney_u(float *a, float *b, int na, int nb) {
613     qsort(a, (size_t)na, sizeof(float), cmpfunc);
614     qsort(b, (size_t)nb, sizeof(float), cmpfunc);
615 
616     int i = 0, j = 0, ca, cb;
617     double U = 0, ties = 0;
618     while (i < na || j < nb) {
619         double curr = (j == nb || (i < na && a[i] < b[j])) ? a[i] : b[j];
620         for (ca = 0; i < na && a[i] == curr; i++) ca++;
621         for (cb = 0; j < nb && b[j] == curr; j++) cb++;
622         U += ca * (j - cb * 0.5);
623         if (ca && cb) {
624             double tie = ca + cb;
625             ties += (sq(tie) - 1.0) * tie;
626         }
627     }
628     if (!na || !nb) return HUGE_VAL;
629 
630     double U_min = ((double)na * nb) - U;
631     if (U < U_min) U_min = U;
632 
633     if (na == 1) return 2.0 * (floor(U_min) + 1.0) / (double)(nb + 1);
634     if (nb == 1) return 2.0 * (floor(U_min) + 1.0) / (double)(na + 1);
635 
636     // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
637     if (na >= 8 || nb >= 8) {
638         double mean = ((double)na * nb) * 0.5;
639         // Correction for ties:
640         double N = na + nb;
641         double var2 = (sq(N) - 1) * N - ties;
642         if (var2 == 0) return 1.0;
643         var2 *= ((double)na * nb) / N / (N - 1) / 12.0;
644         // No correction for ties:
645         // float var2 = ((double)na*nb) * (na + nb + 1) / 12.0;
646         double z = (U_min - mean) / sqrt(2.0 * var2); // z is N(0,1)
647         // return 2.0 - kf_erfc(z);  // which is 1 + erf(z)
648         return kf_erfc(-z); // which is 1 - erf(-z)
649     }
650 
651     // Exact calculation
652     double pval = 2.0 * mann_whitney_1947_cdf(na, nb, (int)U_min);
653     return pval > 1.0 ? 1.0 : pval;
654 }
655 
bcf_int8_is_vector_end(int8_t value)656 static inline int bcf_int8_is_vector_end(int8_t value) { return value == bcf_int8_vector_end; }
bcf_int16_is_vector_end(int16_t value)657 static inline int bcf_int16_is_vector_end(int16_t value) { return value == bcf_int16_vector_end; }
bcf_int32_is_vector_end(int32_t value)658 static inline int bcf_int32_is_vector_end(int32_t value) { return value == bcf_int32_vector_end; }
bcf_int8_is_missing(int8_t value)659 static inline int bcf_int8_is_missing(int8_t value) { return value == bcf_int8_missing; }
bcf_int16_is_missing(int16_t value)660 static inline int bcf_int16_is_missing(int16_t value) { return value == bcf_int16_missing; }
bcf_int32_is_missing(int32_t value)661 static inline int bcf_int32_is_missing(int32_t value) { return value == bcf_int32_missing; }
662 
663 // retrieve phase information from BCF record
664 // assumes little endian architecture
bcf_get_format_sign(bcf_fmt_t * fmt,int8_t * as_arr,int nsmpl)665 static int bcf_get_format_sign(bcf_fmt_t *fmt, int8_t *as_arr, int nsmpl) {
666     if (!fmt || fmt->n != 1) return 0;
667 
668 #define BRANCH(type_t, is_vector_end, is_missing)                                                                      \
669     {                                                                                                                  \
670         type_t *p = (type_t *)fmt->p;                                                                                  \
671         for (int i = 0; i < nsmpl; i++) {                                                                              \
672             if (is_vector_end(p[i]) || is_missing(p[i]))                                                               \
673                 as_arr[i] = bcf_int8_missing;                                                                          \
674             else if (p[i] == (type_t)0)                                                                                \
675                 as_arr[i] = (int8_t)0;                                                                                 \
676             else if (p[i] > (type_t)0)                                                                                 \
677                 as_arr[i] = (int8_t)1;                                                                                 \
678             else                                                                                                       \
679                 as_arr[i] = (int8_t)-1;                                                                                \
680         }                                                                                                              \
681     }
682     switch (fmt->type) {
683     case BCF_BT_INT8:
684         BRANCH(int8_t, bcf_int8_is_vector_end, bcf_int8_is_missing);
685         break;
686     case BCF_BT_INT16:
687         BRANCH(int16_t, bcf_int16_is_vector_end, bcf_int16_is_missing);
688         break;
689     case BCF_BT_INT32:
690         BRANCH(int32_t, bcf_int32_is_vector_end, bcf_int32_is_missing);
691         break;
692     case BCF_BT_FLOAT:
693         BRANCH(int32_t, bcf_float_is_vector_end, bcf_float_is_missing);
694         break;
695     default:
696         error("Unexpected type %d\n", fmt->type);
697     }
698 #undef BRANCH
699 
700     return 1;
701 }
702 
process(bcf1_t * rec)703 bcf1_t *process(bcf1_t *rec) {
704     // compute GC and CpG content for each site
705     if (args->flags & (INFO_GC | INFO_CPG)) {
706         int fa_len;
707         int at_cnt = 0, cg_cnt = 0, cpg_cnt = 0;
708         const char *ref = rec->d.allele[0];
709         char *fa = faidx_fetch_seq(args->fai, bcf_seqname(args->in_hdr, rec), rec->pos - args->gc_win,
710                                    rec->pos + (int)strlen(ref) - 1 + args->gc_win, &fa_len);
711         if (!fa)
712             error("fai_fetch_seq failed at %s:%" PRId64 "\n", bcf_hdr_id2name(args->in_hdr, rec->rid), rec->pos + 1);
713         for (int i = 0; i < fa_len; i++) {
714             if (fa[i] > 96) fa[i] = (char)(fa[i] - 32);
715             if (fa[i] == 'A' || fa[i] == 'T') at_cnt++;
716             if (fa[i] == 'C' || fa[i] == 'G') cg_cnt++;
717             if (i > 0)
718                 if (fa[i - 1] == 'C' && fa[i] == 'G') cpg_cnt += 2;
719         }
720         free(fa);
721         float ratio;
722         if (args->flags & INFO_GC) {
723             ratio = (float)(cg_cnt) / (float)(at_cnt + cg_cnt);
724             bcf_update_info_float(args->out_hdr, rec, "GC", &ratio, 1);
725         }
726         if (args->flags & INFO_CPG) {
727             ratio = (float)cpg_cnt / (float)(fa_len);
728             bcf_update_info_float(args->out_hdr, rec, "CpG", &ratio, 1);
729         }
730     }
731 
732     int ac_sex[] = {0, 0, 0, 0};
733     int missing_sex[] = {0, 0, 0, 0};
734     int pac_het[] = {0, 0};
735     int ad_het[] = {0, 0};
736     int summary[] = {0, 0};
737     int psummary[] = {0, 0};
738     float ret[4];
739 
740     // extract format information from VCF format records
741     bcf_fmt_t *gt_fmt = bcf_get_fmt_id(rec, args->gt_id);
742     int gt_phase = bcf_get_genotype_phase(gt_fmt, args->gt_phase_arr, args->nsmpl);
743 
744     // if samples or genotypes are not present, skip to the end
745     if (args->nsmpl == 0 || !bcf_get_unphased_genotype_alleles(gt_fmt, args->gt0_arr, args->gt1_arr, args->nsmpl))
746         goto ret;
747 
748     bcf_fmt_t *as_fmt = bcf_get_fmt_id(rec, args->as_id);
749     int as_sign = (args->as_str && as_fmt) ? bcf_get_format_sign(as_fmt, args->as_arr, args->nsmpl) : 0;
750 
751     bcf_fmt_t *ad_fmt = bcf_get_fmt_id(rec, args->ad_id);
752     int ad = ad_fmt && ad_fmt->n == rec->n_allele ? bcf_get_allelic_depth(ad_fmt, args->gt0_arr, args->gt1_arr,
753                                                                           args->ad0_arr, args->ad1_arr, args->nsmpl)
754                                                   : 0;
755 
756     bcf_fmt_t *baf_fmt = bcf_get_fmt_id(rec, args->baf_id);
757     int baf = baf_fmt && baf_fmt->n == 1 && baf_fmt->type == BCF_BT_FLOAT ? 1 : 0;
758 
759     bcf_fmt_t *lrr_fmt = bcf_get_fmt_id(rec, args->lrr_id);
760     int lrr = lrr_fmt && lrr_fmt->n == 1 && lrr_fmt->type == BCF_BT_FLOAT ? 1 : 0;
761 
762     if ((args->flags & (INFO_ALLELE_A | INFO_ALLELE_B)) && baf) {
763         int alleles[2];
764         switch (rec->n_allele) {
765         case 1:
766             alleles[0] = -1;
767             alleles[1] = -1;
768             break;
769         case 2:
770             alleles[0] = 0;
771             alleles[1] = 1;
772             break;
773         case 3:
774             alleles[0] = 1;
775             alleles[1] = 2;
776             break;
777         default:
778             error("Observed wrong number of alleles at %s:%" PRId64 "\n", bcf_hdr_id2name(args->in_hdr, rec->rid),
779                   rec->pos + 1);
780         }
781         float median[2] = {NAN, NAN};
782         int alleles_idx[2] = {-1, -1};
783         for (int i = 0; i < 2; i++) {
784             int n = 0;
785             for (int j = 0; j < args->nsmpl; j++)
786                 if (args->gt0_arr[j] == alleles[i] && args->gt1_arr[j] == alleles[i]) args->imap_arr[n++] = j;
787             median[i] = get_median((float *)baf_fmt->p, n, args->imap_arr);
788             if (median[i] < .5)
789                 alleles_idx[i] = alleles[0];
790             else if (median[i] > .5)
791                 alleles_idx[i] = alleles[1];
792         }
793         if (alleles_idx[0] == alleles_idx[1]) {
794             alleles_idx[0] = -1;
795             alleles_idx[1] = -1;
796             fprintf(stderr, "Unable to infer the A and B alleles while parsing the site %s:%" PRId64 "\n",
797                     bcf_hdr_id2name(args->in_hdr, rec->rid), rec->pos + 1);
798         } else if (alleles_idx[0] == -1) {
799             alleles_idx[0] = alleles_idx[1] == alleles[0] ? alleles[1] : alleles[0];
800         } else if (alleles_idx[1] == -1) {
801             alleles_idx[1] = alleles_idx[0] == alleles[0] ? alleles[1] : alleles[0];
802         }
803         if (args->flags & INFO_ALLELE_A) bcf_update_info_int32(args->out_hdr, rec, "ALLELE_A", &alleles_idx[0], 1);
804         if (args->flags & INFO_ALLELE_B) bcf_update_info_int32(args->out_hdr, rec, "ALLELE_B", &alleles_idx[1], 1);
805     }
806 
807     // determine ALLELE_A and ALLELE_B from VCF record
808     bcf_info_t *allele_a_info = bcf_get_info_id(rec, args->allele_a_id);
809     int8_t allele_a = allele_a_info ? ((int8_t *)allele_a_info->vptr)[0] : -1;
810     bcf_info_t *allele_b_info = bcf_get_info_id(rec, args->allele_b_id);
811     int8_t allele_b = allele_b_info ? ((int8_t *)allele_b_info->vptr)[0] : -1;
812 
813     // adjust BAF and LRR values
814     if (args->adjust && baf && lrr) {
815         if (bcf_get_info_float(args->in_hdr, rec, "ADJ_COEFF", &args->adjust_arr, &args->m_adjust) < 0)
816             error("Could not retrieve adjusting coefficients at %s:%" PRId64 "\n",
817                   bcf_hdr_id2name(args->in_hdr, rec->rid), rec->pos + 1);
818         if (args->m_adjust < 9)
819             error("Not enough adjusting coefficients at %s:%" PRId64 "\n", bcf_hdr_id2name(args->in_hdr, rec->rid),
820                   rec->pos + 1);
821         for (int i = 0; i < args->nsmpl; i++) {
822             int n_a = (args->gt0_arr[i] == allele_a) + (args->gt1_arr[i] == allele_a);
823             int n_b = (args->gt0_arr[i] == allele_b) + (args->gt1_arr[i] == allele_b);
824             if (n_a + n_b != 2) continue;
825             ((float *)baf_fmt->p)[i] -=
826                 args->adjust_arr[3 * n_b + 1] * ((float *)lrr_fmt->p)[i] - args->adjust_arr[3 * n_b];
827             ((float *)lrr_fmt->p)[i] -= args->adjust_arr[3 * n_b + 2];
828         }
829     }
830 
831     for (int i = 0; i < args->nsmpl; i++) {
832         float curr_baf = NAN;
833 
834         int is_missing = args->gt0_arr[i] == bcf_int16_missing || args->gt1_arr[i] == bcf_int16_missing;
835         if (args->gender) {
836             switch (args->gender[i]) {
837             case GENDER_MALE: // male
838                 missing_sex[is_missing]++;
839                 break;
840             case GENDER_FEMALE: // female
841                 missing_sex[2 + is_missing]++;
842                 break;
843             default:
844                 break;
845             }
846         }
847 
848         // if genotype is missing, skip
849         if (is_missing) continue;
850 
851         int idx_as_sign =
852             (as_sign && args->as_arr[i] != bcf_int8_missing && args->as_arr[i] != 0) ? (1 - args->as_arr[i]) / 2 : -1;
853         if (idx_as_sign >= 0) summary[idx_as_sign]++;
854 
855         int is_het = args->gt0_arr[i] != args->gt1_arr[i];
856         if (args->gender) {
857             switch (args->gender[i]) {
858             case GENDER_MALE: // male
859                 ac_sex[is_het]++;
860                 break;
861             case GENDER_FEMALE: // female
862                 ac_sex[2 + is_het]++;
863                 break;
864             default:
865                 break;
866             }
867         }
868 
869         // if genotype is not heterozygous reference / alternate, skip
870         if (!is_het || (args->gt0_arr[i] != 0 && args->gt1_arr[i] != 0)) continue;
871 
872         int idx_gt_phase = gt_phase && (args->gt_phase_arr[i] == -1 || args->gt_phase_arr[i] == 1)
873                                ? (1 - args->gt_phase_arr[i]) / 2
874                                : -1;
875         if (idx_gt_phase >= 0) pac_het[idx_gt_phase]++;
876 
877         int idx_fmt_phase =
878             (idx_gt_phase >= 0 && idx_as_sign >= 0) ? (1 - args->as_arr[i] * args->gt_phase_arr[i]) / 2 : -1;
879         if (idx_fmt_phase >= 0) psummary[idx_fmt_phase]++;
880 
881         if (ad) {
882             int ref_cnt = args->ad0_arr[i];
883             int alt_cnt = args->ad1_arr[i];
884             ad_het[0] += ref_cnt;
885             ad_het[1] += alt_cnt;
886             curr_baf = ((float)alt_cnt + 0.5f) / ((float)ref_cnt + (float)alt_cnt + 1.0f);
887         }
888         if (baf) curr_baf = ((float *)baf_fmt->p)[i];
889         if (idx_gt_phase >= 0 && !isnan(curr_baf)) {
890             args->baf_arr[idx_gt_phase][pac_het[idx_gt_phase] - 1] = curr_baf;
891         }
892     }
893 
894     if (args->flags & INFO_AC_SEX) bcf_update_info_int32(args->out_hdr, rec, "AC_Sex", &ac_sex, 4);
895     if (args->flags & INFO_MISSING_SEX) bcf_update_info_int32(args->out_hdr, rec, "MISSING_Sex", &missing_sex, 4);
896     if (args->flags & INFO_pAC_HET) bcf_update_info_int32(args->out_hdr, rec, "pAC_Het", &pac_het, 2);
897     if ((args->flags & INFO_AD_HET) && ad) bcf_update_info_int32(args->out_hdr, rec, "AD_Het", &ad_het, 2);
898 
899     if ((args->flags & INFO_pBAF_STATS) && (ad || baf)) {
900         ret[0] = get_median(args->baf_arr[0], pac_het[0], NULL);
901         ret[1] = get_median(args->baf_arr[1], pac_het[1], NULL);
902         ret[2] = welch_t_test(args->baf_arr[0], args->baf_arr[1], pac_het[0], pac_het[1]);
903         ret[3] = mann_whitney_u(args->baf_arr[0], args->baf_arr[1], pac_het[0], pac_het[1]);
904         bcf_update_info_float(args->out_hdr, rec, "pBAF_Stats", &ret, 4);
905     }
906 
907     if ((args->flags & INFO_COR_BAF_LRR) && baf && lrr) {
908         float rho[3];
909         for (int i = 0; i < 3; i++) {
910             int n = 0;
911             for (int j = 0; j < args->nsmpl; j++) {
912                 int n_a = (args->gt0_arr[j] == allele_a) + (args->gt1_arr[j] == allele_a);
913                 int n_b = (args->gt0_arr[j] == allele_b) + (args->gt1_arr[j] == allele_b);
914                 if (n_a == 2 - i && n_b == i) args->imap_arr[n++] = j;
915             }
916             // compute the Pearson correlation
917             float xss = 0.0f, yss = 0.0f, xyss = 0.0f;
918             get_cov((float *)baf_fmt->p, (float *)lrr_fmt->p, n, args->imap_arr, &xss, &yss, &xyss);
919             rho[i] = xyss / sqrtf(xss * yss);
920         }
921         bcf_update_info_float(args->out_hdr, rec, "Cor_BAF_LRR", &rho, 3);
922     }
923 
924 ret:
925     if (args->as_str)
926         bcf_update_info_int32(args->out_hdr, rec, args->summary_str.s, args->phase ? &psummary : &summary, 2);
927 
928     // perform binomial or Fisher's exact test on existing INFO field
929     if (args->info_str) {
930         bcf_info_t *info = bcf_get_info_id(rec, args->info_id);
931         if (!(info->type & (BCF_BT_INT8 | BCF_BT_INT16 | BCF_BT_INT32)) || (info->len != 2 && info->len != 4))
932             error("INFO field %s at %s:%" PRId64 " should contain two or four integers\n", args->info_str,
933                   bcf_hdr_id2name(args->in_hdr, rec->rid), rec->pos + 1);
934         bcf_get_info_int32(args->out_hdr, rec, args->info_str, &args->info_arr, &args->m_info);
935         if (info->len == 2) {
936             double phred_pval = phred_pbinom(args->info_arr[0], args->info_arr[0] + args->info_arr[1]);
937             ret[0] = (float)(args->phred ? phred_pval : exp(-0.1 * M_LN10 * phred_pval));
938             bcf_update_info_float(args->out_hdr, rec, args->test_str.s, &ret, 1);
939         } else {
940             double left, right, fisher;
941             double pval = kt_fisher_exact(args->info_arr[0], args->info_arr[1], args->info_arr[2], args->info_arr[3],
942                                           &left, &right, &fisher);
943             ret[0] = (float)(args->phred ? -10.0 * M_LOG10E * log(pval) : pval);
944         }
945         bcf_update_info_float(args->out_hdr, rec, args->test_str.s, &ret, 1);
946     }
947 
948     // remove all samples if sites_only was selected
949     if (bcf_hdr_nsamples(args->out_hdr) == 0) bcf_subset(args->out_hdr, rec, 0, NULL);
950     return rec;
951 }
952 
destroy(void)953 void destroy(void) {
954     phred_pbinom(-1, -1);
955     free(args->gender);
956     free(args->gt_phase_arr);
957     free(args->as_arr);
958     free(args->gt0_arr);
959     free(args->gt1_arr);
960     free(args->ad0_arr);
961     free(args->ad1_arr);
962     free(args->info_arr);
963     free(args->adjust_arr);
964     free(args->baf_arr[0]);
965     free(args->baf_arr[1]);
966     free(args->imap_arr);
967     free(args->summary_str.s);
968     free(args->test_str.s);
969     free(args);
970 }
971