1 #include "bcftools.pysam.h"
2 
3 /*  bam2bcf.c -- variant calling.
4 
5     Copyright (C) 2010-2012 Broad Institute.
6     Copyright (C) 2012-2021 Genome Research Ltd.
7 
8     Author: Heng Li <lh3@sanger.ac.uk>
9 
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16 
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.  */
27 
28 #include <math.h>
29 #include <stdint.h>
30 #include <assert.h>
31 #include <float.h>
32 #include <htslib/hts.h>
33 #include <htslib/sam.h>
34 #include <htslib/kstring.h>
35 #include <htslib/kfunc.h>
36 #include "bam2bcf.h"
37 
38 extern  void ks_introsort_uint32_t(size_t n, uint32_t a[]);
39 
40 #define CALL_DEFTHETA 0.83
41 #define DEF_MAPQ 20
42 
43 #define CAP_DIST 25
44 
bcf_call_init(double theta,int min_baseQ,int max_baseQ,int delta_baseQ)45 bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ,
46                              int delta_baseQ)
47 {
48     bcf_callaux_t *bca;
49     if (theta <= 0.) theta = CALL_DEFTHETA;
50     bca = (bcf_callaux_t*) calloc(1, sizeof(bcf_callaux_t));
51     bca->capQ = 60;
52     bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
53     bca->min_baseQ = min_baseQ;
54     bca->max_baseQ = max_baseQ;
55     bca->delta_baseQ = delta_baseQ;
56     bca->e = errmod_init(1. - theta);
57     bca->min_frac = 0.002;
58     bca->min_support = 1;
59     bca->per_sample_flt = 0;
60     bca->npos = 100;
61     bca->ref_pos = (int*) malloc(bca->npos*sizeof(int));
62     bca->alt_pos = (int*) malloc(bca->npos*sizeof(int));
63     bca->iref_pos= (int*) malloc(bca->npos*sizeof(int));
64     bca->ialt_pos= (int*) malloc(bca->npos*sizeof(int));
65     bca->nqual = 60;
66     bca->ref_mq  = (int*) malloc(bca->nqual*sizeof(int));
67     bca->alt_mq  = (int*) malloc(bca->nqual*sizeof(int));
68     bca->iref_mq = (int*) malloc(bca->nqual*sizeof(int));
69     bca->ialt_mq = (int*) malloc(bca->nqual*sizeof(int));
70     bca->ref_bq  = (int*) malloc(bca->nqual*sizeof(int));
71     bca->alt_bq  = (int*) malloc(bca->nqual*sizeof(int));
72     bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int));
73     bca->rev_mqs = (int*) malloc(bca->nqual*sizeof(int));
74     return bca;
75 }
76 
bcf_call_destroy(bcf_callaux_t * bca)77 void bcf_call_destroy(bcf_callaux_t *bca)
78 {
79     if (bca == 0) return;
80     errmod_destroy(bca->e);
81     if (bca->npos) {
82         free(bca->ref_pos);  free(bca->alt_pos);
83         free(bca->iref_pos); free(bca->ialt_pos);
84         bca->npos = 0;
85     }
86     free(bca->ref_mq); free(bca->alt_mq);
87     free(bca->iref_mq); free(bca->ialt_mq);
88     free(bca->ref_bq); free(bca->alt_bq);
89     free(bca->fwd_mqs); free(bca->rev_mqs);
90     bca->nqual = 0;
91     free(bca->bases); free(bca->inscns); free(bca);
92 }
93 
94 // position in the sequence with respect to the aligned part of the read
get_position(const bam_pileup1_t * p,int * len,int * sc_len,int * sc_dist)95 static int get_position(const bam_pileup1_t *p, int *len,
96                         int *sc_len, int *sc_dist) {
97     int i, j, edist = p->qpos + 1;
98     int sc_left = 0, sc_right = 0;
99     int sc_left_dist = -1, sc_right_dist = -1;
100 
101     // left end
102     for (i = 0; i < p->b->core.n_cigar; i++) {
103         int cig  = bam_get_cigar(p->b)[i] & BAM_CIGAR_MASK;
104         if (cig == BAM_CHARD_CLIP)
105             continue;
106         else if (cig == BAM_CSOFT_CLIP)
107             sc_left += bam_get_cigar(p->b)[i] >> BAM_CIGAR_SHIFT;
108         else
109             break;
110     }
111     if (sc_left)
112         sc_left_dist = p->qpos+1 - sc_left;
113     edist -= sc_left;
114 
115     // right end
116     for (j = p->b->core.n_cigar-1; j >= i; j--) {
117         int cig  = bam_get_cigar(p->b)[j] & BAM_CIGAR_MASK;
118         if (cig == BAM_CHARD_CLIP)
119             continue;
120         else if (cig == BAM_CSOFT_CLIP)
121             sc_right += bam_get_cigar(p->b)[j] >> BAM_CIGAR_SHIFT;
122         else
123             break;
124     }
125     if (sc_right)
126         sc_right_dist = p->b->core.l_qseq - sc_right - p->qpos;
127 
128     // Distance to nearest soft-clips and length of that clip.
129     if (sc_left_dist >= 0) {
130         if (sc_right_dist < 0 || sc_left_dist < sc_right_dist) {
131             *sc_len  = sc_left;
132             *sc_dist = sc_left_dist;
133         }
134     } else if (sc_right_dist >= 0) {
135         *sc_len  = sc_right;
136         *sc_dist = sc_right_dist;
137     } else {
138         *sc_len  = 0;
139         *sc_dist = 0;
140     }
141 
142     *len = p->b->core.l_qseq - sc_left - sc_right;
143     return edist;
144 }
145 
bcf_callaux_clean(bcf_callaux_t * bca,bcf_call_t * call)146 void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
147 {
148     memset(bca->ref_pos,0,sizeof(int)*bca->npos);
149     memset(bca->alt_pos,0,sizeof(int)*bca->npos);
150     memset(bca->iref_pos,0,sizeof(int)*bca->npos);
151     memset(bca->ialt_pos,0,sizeof(int)*bca->npos);
152     memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
153     memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
154     memset(bca->iref_mq,0,sizeof(int)*bca->nqual);
155     memset(bca->ialt_mq,0,sizeof(int)*bca->nqual);
156     memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
157     memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
158     memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
159     memset(bca->rev_mqs,0,sizeof(int)*bca->nqual);
160     if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
161     if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
162     if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
163     memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
164     memset(bca->ref_scl,  0, 100*sizeof(int));
165     memset(bca->alt_scl,  0, 100*sizeof(int));
166     memset(bca->iref_scl, 0, 100*sizeof(int));
167     memset(bca->ialt_scl, 0, 100*sizeof(int));
168 }
169 
170 /*
171     Notes:
172     - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.QS frequencies
173         which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the INFO/QS and FMT/QS annotations.
174         Later it's used for multiallelic calling by `call -m`, `call -mG` and `+trio-dnm`.
175     - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
176  */
177 /*
178  * This function is called once for each sample.
179  * _n is number of pilesups pl contributing reads to this sample
180  * pl is pointer to array of _n pileups (one pileup per read)
181  * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
182  * bca is the settings to perform calls across all samples
183  * r is the returned value of the call
184  */
bcf_call_glfgen(int _n,const bam_pileup1_t * pl,int ref_base,bcf_callaux_t * bca,bcf_callret1_t * r)185 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
186 {
187     int i, n, ref4, is_indel, ori_depth = 0;
188 
189     // clean from previous run
190     r->ori_depth = 0;
191     r->mq0 = 0;
192     memset(r->anno,0,sizeof(double)*16);
193     memset(r->p,0,sizeof(float)*25);
194     r->SCR = 0;
195 
196     if (ref_base >= 0) {
197         ref4 = seq_nt16_int[ref_base];
198         is_indel = 0;
199     } else ref4 = 4, is_indel = 1;
200     if (_n == 0) return -1;
201     // enlarge the bases array if necessary
202     if (bca->max_bases < _n) {
203         bca->max_bases = _n;
204         kroundup32(bca->max_bases);
205         bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
206     }
207 
208     // fill the bases array
209     double nqual_over_60 = bca->nqual / 60.0;
210     int ADR_ref_missed[4] = {0};
211     int ADF_ref_missed[4] = {0};
212     for (i = n = 0; i < _n; ++i) {
213         const bam_pileup1_t *p = pl + i;
214         int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
215         if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
216         if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
217         if (p->is_del && !is_indel) continue;
218         ++ori_depth;
219         if (is_indel)
220         {
221             b = p->aux>>16&0x3f;
222             seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
223             if (q < bca->min_baseQ)
224             {
225                 if (!p->indel && b < 4)
226                 {
227                     if (bam_is_rev(p->b))
228                         ADR_ref_missed[b]++;
229                     else
230                         ADF_ref_missed[b]++;
231                 }
232                 continue;
233             }
234             if (p->indel == 0 && (q < _n/2 || _n > 20)) {
235                 // high quality indel calls without p->indel set aren't
236                 // particularly indicative of being a good REF match either,
237                 // at least not in low coverage.  So require solid coverage
238                 // before we start utilising such quals.
239                 b = 0;
240                 q = (int)bam_get_qual(p->b)[p->qpos];
241                 seqQ = (3*seqQ + 2*q)/8;
242             }
243             if (_n > 20 && seqQ > 40) seqQ = 40;
244             baseQ  = p->aux>>8&0xff;
245 
246             is_diff = (b != 0);
247         }
248         else
249         {
250             b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
251             b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
252 
253             // Lowest of this and neighbour quality values
254             uint8_t *qual = bam_get_qual(p->b);
255             q = qual[p->qpos];
256             if (p->qpos > 0 &&
257                 q > qual[p->qpos-1]+bca->delta_baseQ)
258                 q = qual[p->qpos-1]+bca->delta_baseQ;
259             if (p->qpos+1 < p->b->core.l_qseq &&
260                 q > qual[p->qpos+1]+bca->delta_baseQ)
261                 q = qual[p->qpos+1]+bca->delta_baseQ;
262 
263             if (q < bca->min_baseQ) continue;
264             if (q > bca->max_baseQ) q = bca->max_baseQ;
265             baseQ = q;
266             seqQ  = 99;
267             is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
268         }
269         mapQ  = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
270         if ( !mapQ ) r->mq0++;
271         if (q > seqQ) q = seqQ;
272         mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
273         if (q > mapQ) q = mapQ;
274         if (q > 63) q = 63;
275         if (q < 4) q = 4;       // MQ=0 reads count as BQ=4
276         bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
277         // collect annotations
278         if (b < 4)
279         {
280             r->QS[b] += q;
281             if ( r->ADF )
282             {
283                 if ( bam_is_rev(p->b) )
284                     r->ADR[b]++;
285                 else
286                     r->ADF[b]++;
287             }
288         }
289         ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)];
290         min_dist = p->b->core.l_qseq - 1 - p->qpos;
291         if (min_dist > p->qpos) min_dist = p->qpos;
292         if (min_dist > CAP_DIST) min_dist = CAP_DIST;
293         r->anno[1<<2|is_diff<<1|0] += baseQ;
294         r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
295         r->anno[2<<2|is_diff<<1|0] += mapQ;
296         r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
297         r->anno[3<<2|is_diff<<1|0] += min_dist;
298         r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
299 
300         // collect for bias tests
301         if ( baseQ > 59 ) baseQ = 59;
302         if ( mapQ > 59 ) mapQ = 59;
303         int len, epos = 0, sc_len = 0, sc_dist = 0;
304         if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) )
305         {
306             int pos = get_position(p, &len, &sc_len, &sc_dist);
307             epos = (double)pos/(len+1) * bca->npos;
308 
309             if (sc_len) {
310                 sc_len = 15.0*sc_len / sc_dist;
311                 if (sc_len > 99) sc_len = 99;
312             }
313         }
314 
315         int imq  = mapQ * nqual_over_60;
316         int ibq  = baseQ * nqual_over_60;
317 
318         if ( bam_is_rev(p->b) )
319             bca->rev_mqs[imq]++;
320         else
321             bca->fwd_mqs[imq]++;
322 
323         if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
324         {
325             bca->ref_pos[epos]++;
326             bca->ref_bq[ibq]++;
327             bca->ref_mq[imq]++;
328             bca->ref_scl[sc_len]++;
329         }
330         else
331         {
332             bca->alt_pos[epos]++;
333             bca->alt_bq[ibq]++;
334             bca->alt_mq[imq]++;
335             bca->alt_scl[sc_len]++;
336         }
337     }
338 
339     // Compensate for AD not being counted on low quality REF indel matches.
340     if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 )
341     {
342         for (i=0; i<4; i++)
343         {
344             r->ADR[0] += ADR_ref_missed[i];
345             r->ADF[0] += ADF_ref_missed[i];
346         }
347     }
348     else if ( r->ADF && bca->ambig_reads==B2B_INC_AD )
349     {
350         int dp = 0, dp_ambig = 0;
351         for (i=0; i<4; i++) dp += r->ADR[i];
352         for (i=0; i<4; i++) dp_ambig += ADR_ref_missed[i];
353         if ( dp )
354             for (i=0; i<4; i++) r->ADR[i] += lroundf((float)dp_ambig * r->ADR[i]/dp);
355         dp = 0, dp_ambig = 0;
356         for (i=0; i<4; i++) dp += r->ADF[i];
357         for (i=0; i<4; i++) dp_ambig += ADF_ref_missed[i];
358         if ( dp )
359             for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
360     }
361 
362     r->ori_depth = ori_depth;
363     // glfgen
364     errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
365     return n;
366 }
367 
368 
369 /*
370  *  calc_vdb() - returns value between zero (most biased) and one (no bias)
371  *               on success, or HUGE_VAL when VDB cannot be calculated because
372  *               of insufficient depth (<2x)
373  *
374  *  Variant Distance Bias tests if the variant bases are positioned within the
375  *  reads with sufficient randomness. Unlike other tests, it looks only at
376  *  variant reads and therefore gives different kind of information than Read
377  *  Position Bias for instance. VDB was developed for detecting artefacts in
378  *  RNA-seq calls where reads from spliced transcripts span splice site
379  *  boundaries.  The current implementation differs somewhat from the original
380  *  version described in supplementary material of PMID:22524474, but the idea
381  *  remains the same. (Here the random variable tested is the average distance
382  *  from the averaged position, not the average pairwise distance.)
383  *
384  *  For coverage of 2x, the calculation is exact but is approximated for the
385  *  rest. The result is most accurate between 4-200x. For 3x or >200x, the
386  *  reported values are slightly more favourable than those of a true random
387  *  distribution.
388  */
calc_vdb(int * pos,int npos)389 double calc_vdb(int *pos, int npos)
390 {
391     // Note well: the parameters were obtained by fitting to simulated data of
392     // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen().
393     const int readlen = 100;
394     assert( npos==readlen );
395 
396     #define nparam 15
397     const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5},
398         {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8},
399         {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7},
400         {200,0.7,23.7} };
401 
402     int i, dp = 0;
403     float mean_pos = 0, mean_diff = 0;
404     for (i=0; i<npos; i++)
405     {
406         if ( !pos[i] ) continue;
407         dp += pos[i];
408         mean_pos += pos[i]*i;
409     }
410     if ( dp<2 ) return HUGE_VAL;     // one or zero reads can be placed anywhere
411 
412     mean_pos /= dp;
413     for (i=0; i<npos; i++)
414     {
415         if ( !pos[i] ) continue;
416         mean_diff += pos[i] * fabs(i - mean_pos);
417     }
418     mean_diff /= dp;
419 
420     int ipos = mean_diff;   // tuned for float-to-int implicit conversion
421     if ( dp==2 )
422         return (2*readlen-2*(ipos+1)-1)*(ipos+1)/(readlen-1)/(readlen*0.5);
423 
424     if ( dp>=200 )
425         i = nparam; // shortcut for big depths
426     else
427     {
428         for (i=0; i<nparam; i++)
429             if ( param[i][0]>=dp ) break;
430     }
431     float pshift, pscale;
432     if ( i==nparam )
433     {
434         // the depth is too high, go with 200x
435         pscale = param[nparam-1][1];
436         pshift = param[nparam-1][2];
437     }
438     else if ( i>0 && param[i][0]!=dp )
439     {
440         // linear interpolation of parameters
441         pscale = (param[i-1][1] + param[i][1])*0.5;
442         pshift = (param[i-1][2] + param[i][2])*0.5;
443     }
444     else
445     {
446         pscale = param[i][1];
447         pshift = param[i][2];
448     }
449     return 0.5*kf_erfc(-(mean_diff-pshift)*pscale);
450 }
451 
calc_chisq_bias(int * a,int * b,int n)452 double calc_chisq_bias(int *a, int *b, int n)
453 {
454     int na = 0, nb = 0, i, ndf = n;
455     for (i=0; i<n; i++) na += a[i];
456     for (i=0; i<n; i++) nb += b[i];
457     if ( !na || !nb ) return HUGE_VAL;
458 
459     double chisq = 0;
460     for (i=0; i<n; i++)
461     {
462         if ( !a[i] && !b[i] ) ndf--;
463         else
464         {
465             double tmp = a[i] - b[i];
466             chisq += tmp*tmp/(a[i]+b[i]);
467         }
468     }
469     /*
470         kf_gammq: incomplete gamma function Q(a,x) = 1 - P(a,x) = Gamma(a,x)/Gamma(a)
471         1 if the distributions are identical, 0 if very different
472     */
473     double prob = kf_gammaq(0.5*ndf, 0.5*chisq);
474     return prob;
475 }
476 
mann_whitney_1947_(int n,int m,int U)477 static double mann_whitney_1947_(int n, int m, int U)
478 {
479      if (U<0) return 0;
480      if (n==0||m==0) return U==0 ? 1 : 0;
481     return (double)n/(n+m)*mann_whitney_1947_(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947_(n,m-1,U);
482 }
483 
mann_whitney_1947(int n,int m,int U)484 double mann_whitney_1947(int n, int m, int U)
485 {
486     #include "mw.h"
487 
488     assert(n >= 2 && m >= 2);
489 
490     return (n < 8 && m < 8 && U < 50)
491         ? mw[n-2][m-2][U]
492         : mann_whitney_1947_(n,m,U);
493 }
494 
mann_whitney_1947_cdf(int n,int m,int U)495 double mann_whitney_1947_cdf(int n, int m, int U)
496 {
497     int i;
498     double sum = 0;
499     for (i=0; i<=U; i++)
500         sum += mann_whitney_1947(n,m,i);
501     return sum;
502 }
503 
calc_mwu_bias_cdf(int * a,int * b,int n)504 double calc_mwu_bias_cdf(int *a, int *b, int n)
505 {
506     int na = 0, nb = 0, i;
507     double U = 0;
508     //double ties = 0;
509     for (i=0; i<n; i++)
510     {
511         na += a[i];
512         U  += a[i] * (nb + b[i]*0.5);
513         nb += b[i];
514         // if ( a[i] && b[i] )
515         // {
516         //     double tie = a[i] + b[i];
517         //     ties += (tie*tie-1)*tie;
518         // }
519     }
520     if ( !na || !nb ) return HUGE_VAL;
521 
522     // Always work with the smaller U
523     double U_min = ((double)na * nb) - U;
524     if ( U < U_min ) U_min = U;
525 
526     if ( na==1 ) return 2.0 * (floor(U_min)+1) / (nb+1);
527     if ( nb==1 ) return 2.0 * (floor(U_min)+1) / (na+1);
528 
529     // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
530     if ( na>=8 || nb>=8 )
531     {
532         double mean = ((double)na*nb)*0.5;
533         // Correction for ties:
534         //      double N = na+nb;
535         //      double var2 = (N*N-1)*N-ties;
536         //      if ( var2==0 ) return 1.0;
537         //      var2 *= ((double)na*nb)/N/(N-1)/12.0;
538         // No correction for ties:
539         double var2 = ((double)na*nb)*(na+nb+1)/12.0;
540         double z = (U_min - mean)/sqrt(2*var2);   // z is N(0,1)
541         return 2.0 - kf_erfc(z);  // which is 1 + erf(z)
542     }
543 
544     // Exact calculation
545     double pval = 2*mann_whitney_1947_cdf(na,nb,U_min);
546     return pval>1 ? 1 : pval;
547 }
548 
calc_mwu_bias(int * a,int * b,int n,int left)549 double calc_mwu_bias(int *a, int *b, int n, int left)
550 {
551     int na = 0, nb = 0, i;
552     double U = 0;
553     // double ties = 0;
554     for (i=0; i<n; i++)
555     {
556         if (!a[i]) {
557             if (!b[i]) continue;
558             nb += b[i];
559         } else if (!b[i]) {
560             na += a[i];
561             U  += a[i] * nb;
562         } else {
563             na += a[i];
564             U  += a[i] * (nb + b[i]*0.5);
565             nb += b[i];
566             // double tie = a[i] + b[i];
567             // ties += (tie*tie-1)*tie;
568         }
569     }
570     if ( !na || !nb ) return HUGE_VAL;
571     if ( na==1 || nb==1 ) return 1.0;       // Flat probability, all U values are equally likely
572 
573     double mean = ((double)na*nb)*0.5;
574     if (left && U > mean) return 1; // for MQB which is asymmetrical
575     if ( na==2 || nb==2 )
576     {
577         // Linear approximation
578         return U>mean ? (2.0*mean-U)/mean : U/mean;
579     }
580     // Correction for ties:
581     //      double N = na+nb;
582     //      double var2 = (N*N-1)*N-ties;
583     //      if ( var2==0 ) return 1.0;
584     //      var2 *= ((double)na*nb)/N/(N-1)/12.0;
585     // No correction for ties:
586     double var2 = ((double)na*nb)*(na+nb+1)/12.0;
587     if ( na>=8 || nb>=8 )
588     {
589         // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
590         return exp(-0.5*(U-mean)*(U-mean)/var2);
591     }
592 
593     // Exact calculation
594     return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
595 }
596 
597 // A Z-score version of the above function.
598 //
599 // See "Normal approximation and tie correction" at
600 // https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
601 //
602 // The Z score is the number of standard deviations above or below the mean
603 // with 0 being equality of the two distributions and +ve/-ve from there.
604 //
605 // This is a more robust score to filter on.
calc_mwu_biasZ(int * a,int * b,int n,int left_only,int do_Z)606 double calc_mwu_biasZ(int *a, int *b, int n, int left_only, int do_Z) {
607     int i;
608     int64_t t;
609 
610     // Optimisation
611     for (i = 0; i < n; i++)
612         if (b[i])
613             break;
614     int b_empty = (i == n);
615 
616     // Count equal (e), less-than (l) and greater-than (g) permutations.
617     int e = 0, l = 0, na = 0, nb = 0;
618     if (b_empty) {
619         for (t = 0, i = n-1; i >= 0; i--) {
620             na += a[i];
621             t += (a[i]*a[i]-1)*a[i];  // adjustment score for ties
622         }
623     } else {
624         for (t = 0, i = n-1; i >= 0; i--) {
625             // Combinations of a[i] and b[j] for i==j
626             e += a[i]*b[i];
627 
628             // nb is running total of b[i+1]..b[n-1].
629             // Therefore a[i]*nb is the number of combinations of a[i] and b[j]
630             // for all i < j.
631             l += a[i]*nb;    // a<b
632 
633             na += a[i];
634             nb += b[i];
635             int p = a[i]+b[i];
636             t += (p*p-1)*p;  // adjustment score for ties
637         }
638     }
639 
640     if (!na || !nb)
641         return HUGE_VAL;
642 
643     double U, m;
644     U = l + e*0.5; // Mann-Whitney U score
645     m = na*nb / 2.0;
646 
647     // With ties adjustment
648     double var2 = (na*nb)/12.0 * ((na+nb+1) - t/(double)((na+nb)*(na+nb-1)));
649     // var = na*nb*(na+nb+1)/12.0; // simpler; minus tie adjustment
650     if (var2 <= 0)
651         return do_Z ? 0 : 1;
652 
653     if (do_Z) {
654         // S.D. normalised Z-score
655         //Z = (U - m - (U-m >= 0 ? 0.5 : -0.5)) / sd; // gatk method?
656         return (U - m) / sqrt(var2);
657     }
658 
659     // Else U score, which can be asymmetric for some data types.
660     if (left_only && U > m)
661         return HUGE_VAL; // one-sided, +ve bias is OK, -ve is not.
662 
663     if (na >= 8 || nb >= 8) {
664         // Normal approximation, very good for na>=8 && nb>=8 and
665         // reasonable if na<8 or nb<8
666         return exp(-0.5*(U-m)*(U-m)/var2);
667     }
668 
669     // Exact calculation
670     if (na==1 || nb == 1)
671         return mann_whitney_1947_(na, nb, U) * sqrt(2*M_PI*var2);
672     else
673         return mann_whitney_1947(na, nb, U) * sqrt(2*M_PI*var2);
674 }
675 
logsumexp2(double a,double b)676 static inline double logsumexp2(double a, double b)
677 {
678     if ( a>b )
679         return log(1 + exp(b-a)) + a;
680     else
681         return log(1 + exp(a-b)) + b;
682 }
683 
calc_SegBias(const bcf_callret1_t * bcr,bcf_call_t * call)684 void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call)
685 {
686     call->seg_bias = HUGE_VAL;
687     if ( !bcr ) return;
688 
689     int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads
690     if ( !nr ) return;
691 
692     int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n;    // average depth
693     double M   = floor((double)nr / avg_dp + 0.5);   // an approximate number of variants samples in the population
694     if ( M>call->n ) M = call->n;       // clamp M at the number of samples
695     else if ( M==0 ) M = 1;
696     double f = M / 2. / call->n;        // allele frequency
697     double p = (double) nr / call->n;   // number of variant reads per sample expected if variant not real (poisson)
698     double q = (double) nr / M;         // number of variant reads per sample expected if variant is real (poisson)
699     double sum = 0;
700     const double log2 = log(2.0);
701 
702     // fprintf(bcftools_stderr,"M=%.1f  p=%e q=%e f=%f  dp=%d\n",M,p,q,f,avg_dp);
703     int i;
704     for (i=0; i<call->n; i++)
705     {
706         int oi = bcr[i].anno[2] + bcr[i].anno[3];       // observed number of non-ref reads
707         double tmp;
708         if ( oi )
709         {
710             // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow
711             tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q);
712             tmp += log(f) + oi*log(q/p) - q + p;
713         }
714         else
715             tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p;
716         sum += tmp;
717         // fprintf(bcftools_stderr,"oi=%d %e\n", oi,tmp);
718     }
719     call->seg_bias = sum;
720 }
721 
722 /**
723  *  bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles
724  *  @n:         number of samples
725  *  @calls:     each sample's calls
726  *  @bca:       auxiliary data structure for holding temporary values
727  *  @ref_base:  the reference base
728  *  @call:      filled with the annotations
729  *
730  *  Combines calls across the various samples being studied
731  *  1. For each allele at each base across all samples the quality is summed so
732  *     you end up with a set of quality sums for each allele present 2. The quality
733  *     sums are sorted.
734  *  3. Using the sorted quality sums we now create the allele ordering array
735  *     A\subN. This is done by doing the following:
736  *     a) If the reference allele is known it always comes first, otherwise N
737  *        comes first.
738  *     b) Then the rest of the alleles are output in descending order of quality
739  *        sum (which we already know the qsum array was sorted).  Any allelles with
740  *        qsum 0 will be excluded.
741  *  4. Using the allele ordering array we create the genotype ordering array.
742  *     In the worst case with an unknown reference this will be:  A0/A0 A1/A0 A1/A1
743  *     A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4
744  *  5. The genotype ordering array is then used to extract data from the error
745  *     model 5*5 matrix and is used to produce a Phread likelihood array for each
746  *     sample.
747  */
bcf_call_combine(int n,const bcf_callret1_t * calls,bcf_callaux_t * bca,int ref_base,bcf_call_t * call)748 int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
749 {
750     int ref4, i, j;
751     float qsum[B2B_MAX_ALLELES] = {0,0,0,0,0};
752     if (ref_base >= 0) {
753         call->ori_ref = ref4 = seq_nt16_int[ref_base];
754         if (ref4 > 4) ref4 = 4;
755     } else call->ori_ref = -1, ref4 = 0;
756 
757     // calculate qsum, this is done by summing normalized qsum across all samples,
758     // to account for differences in coverage
759     for (i = 0; i < n; ++i)
760     {
761         float sum = 0;
762         for (j = 0; j < 4; ++j) sum += calls[i].QS[j];
763         if ( sum )
764             for (j = 0; j < 4; j++) qsum[j] += (float)calls[i].QS[j] / sum;
765     }
766 
767     // sort qsum in ascending order (insertion sort)
768     float *ptr[5], *tmp;
769     for (i=0; i<5; i++) ptr[i] = &qsum[i];
770     for (i=1; i<4; i++)
771         for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--)
772             tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp;
773 
774     // Set the reference allele and alternative allele(s)
775     for (i=0; i<5; i++) call->a[i] = -1;
776     for (i=0; i<B2B_MAX_ALLELES; i++) call->qsum[i] = 0;
777     call->unseen = -1;
778     call->a[0] = ref4;
779     for (i=3, j=1; i>=0; i--)   // i: alleles sorted by QS; j, a[j]: output allele ordering
780     {
781         int ipos = ptr[i] - qsum;   // position in sorted qsum array
782         if ( ipos==ref4 )
783             call->qsum[0] = qsum[ipos];    // REF's qsum
784         else
785         {
786             if ( !qsum[ipos] ) break;       // qsum is 0, this and consequent alleles are not seen in the pileup
787             call->qsum[j] = qsum[ipos];
788             call->a[j++]  = ipos;
789         }
790     }
791     if (ref_base >= 0)
792     {
793         // for SNPs, find the "unseen" base
794         if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
795             call->unseen = j, call->a[j++] = ptr[i] - qsum;
796         call->n_alleles = j;
797     }
798     else
799     {
800         call->n_alleles = j;
801         if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
802     }
803     /*
804      * Set the phread likelihood array (call->PL) This array is 15 entries long
805      * for each sample because that is size of an upper or lower triangle of a
806      * worst case 5x5 matrix of possible genotypes. This worst case matrix will
807      * occur when all 4 possible alleles are present and the reference allele
808      * is unknown.  The sides of the matrix will correspond to the reference
809      * allele (if known) followed by the alleles present in descending order of
810      * quality sum
811      */
812     {
813         int x, g[15], z;
814         double sum_min = 0.;
815         x = call->n_alleles * (call->n_alleles + 1) / 2;
816         // get the possible genotypes
817         // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix
818         for (i = z = 0; i < call->n_alleles; ++i) {
819             for (j = 0; j <= i; ++j) {
820                 g[z++] = call->a[j] * 5 + call->a[i];
821             }
822         }
823         // for each sample calculate the PL
824         for (i = 0; i < n; ++i)
825         {
826             int32_t *PL = call->PL + x * i;
827             const bcf_callret1_t *r = calls + i;
828             float min = FLT_MAX;
829             for (j = 0; j < x; ++j) {
830                 if (min > r->p[g[j]]) min = r->p[g[j]];
831             }
832             sum_min += min;
833             for (j = 0; j < x; ++j) {
834                 int y;
835                 y = (int)(r->p[g[j]] - min + .499);
836                 if (y > 255) y = 255;
837                 PL[j] = y;
838             }
839         }
840         if ( call->DP4 )
841         {
842             for (i=0; i<n; i++)
843             {
844                 call->DP4[4*i]   = calls[i].anno[0];
845                 call->DP4[4*i+1] = calls[i].anno[1];
846                 call->DP4[4*i+2] = calls[i].anno[2];
847                 call->DP4[4*i+3] = calls[i].anno[3];
848             }
849         }
850         if ( call->SCR )
851         {
852             for (i=0; i<n; i++)
853             {
854                 call->SCR[0]  += calls[i].SCR;
855                 call->SCR[1+i] = calls[i].SCR;
856             }
857         }
858         if ( call->ADF )
859         {
860             assert( call->n_alleles<=B2B_MAX_ALLELES );   // this is always true for SNPs and so far for indels as well
861 
862             // reorder ADR,ADF to match the allele ordering at this site
863             int32_t tmp[B2B_MAX_ALLELES];
864             int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES;
865             int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES;
866             int32_t *adr_tot = call->ADR;   // the first bin stores total counts per site
867             int32_t *adf_tot = call->ADF;
868             for (i=0; i<n; i++)
869             {
870                 for (j=0; j<call->n_alleles; j++)
871                 {
872                     tmp[j] = adr[ call->a[j] ];
873                     adr_tot[j] += tmp[j];
874                 }
875                 for (j=0; j<call->n_alleles; j++) adr_out[j] = tmp[j];
876                 for (j=0; j<call->n_alleles; j++)
877                 {
878                     tmp[j] = adf[ call->a[j] ];
879                     adf_tot[j] += tmp[j];
880                 }
881                 for (j=0; j<call->n_alleles; j++) adf_out[j] = tmp[j];
882                 adf_out += call->n_alleles;
883                 adr_out += call->n_alleles;
884                 adr += B2B_MAX_ALLELES;
885                 adf += B2B_MAX_ALLELES;
886             }
887         }
888         if ( bca->fmt_flag & B2B_FMT_QS )
889         {
890             assert( call->n_alleles<=B2B_MAX_ALLELES );   // this is always true for SNPs and so far for indels as well
891 
892             // reorder QS to match the allele ordering at this site
893             int32_t tmp[B2B_MAX_ALLELES];
894             int32_t *qs = call->QS, *qs_out = call->QS;
895             for (i=0; i<n; i++)
896             {
897                 for (j=0; j<call->n_alleles; j++) tmp[j] = qs[ call->a[j] ];
898                 for (j=0; j<call->n_alleles; j++) qs_out[j] = tmp[j] < BCF_MAX_BT_INT32 ? tmp[j] : BCF_MAX_BT_INT32;
899                 qs_out += call->n_alleles;
900                 qs += B2B_MAX_ALLELES;
901             }
902         }
903 
904 //      if (ref_base < 0) fprintf(bcftools_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
905         call->shift = (int)(sum_min + .499);
906     }
907     // combine annotations
908     memset(call->anno, 0, 16 * sizeof(double));
909     call->ori_depth = 0;
910     call->depth     = 0;
911     call->mq0       = 0;
912     for (i = 0; i < n; ++i) {
913         call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3];
914         call->ori_depth += calls[i].ori_depth;
915         call->mq0 += calls[i].mq0;
916         for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
917     }
918 
919     calc_SegBias(calls, call);
920 
921     // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
922     // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
923     // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
924 
925     if (bca->fmt_flag & B2B_INFO_ZSCORE) {
926         // U z-normalised as +/- number of standard deviations from mean.
927         if (call->ori_ref < 0) {
928             if (bca->fmt_flag & B2B_INFO_RPB)
929                 call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
930                                                bca->npos, 0, 1);
931             call->mwu_mq  = calc_mwu_biasZ(bca->iref_mq,  bca->ialt_mq,
932                                            bca->nqual,1,1);
933             if ( bca->fmt_flag & B2B_INFO_SCB )
934                 call->mwu_sc  = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl,
935                                                100, 0,1);
936         } else {
937             if (bca->fmt_flag & B2B_INFO_RPB)
938                 call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
939                                                bca->npos, 0, 1);
940             call->mwu_mq  = calc_mwu_biasZ(bca->ref_mq,  bca->alt_mq,
941                                            bca->nqual,1,1);
942             call->mwu_bq  = calc_mwu_biasZ(bca->ref_bq,  bca->alt_bq,
943                                            bca->nqual,0,1);
944             call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
945                                            bca->nqual,0,1);
946             if ( bca->fmt_flag & B2B_INFO_SCB )
947                 call->mwu_sc  = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
948                                                100, 0,1);
949         }
950     } else {
951         // Old method; U as probability between 0 and 1
952         if ( bca->fmt_flag & B2B_INFO_RPB )
953             call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
954                                            bca->npos, 0, 0);
955         call->mwu_mq  = calc_mwu_biasZ(bca->ref_mq,  bca->alt_mq,
956                                        bca->nqual, 1, 0);
957         call->mwu_bq  = calc_mwu_biasZ(bca->ref_bq,  bca->alt_bq,
958                                        bca->nqual, 0, 0);
959         call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
960                                        bca->nqual, 0, 0);
961     }
962 
963 #if CDF_MWU_TESTS
964     // CDF version of MWU tests is not calculated by default
965     if ( bca->fmt_flag & B2B_INFO_RPB )
966         call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos);
967     call->mwu_mq_cdf  = calc_mwu_bias_cdf(bca->ref_mq,  bca->alt_mq,  bca->nqual);
968     call->mwu_bq_cdf  = calc_mwu_bias_cdf(bca->ref_bq,  bca->alt_bq,  bca->nqual);
969     call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
970 #endif
971 
972     if ( bca->fmt_flag & B2B_INFO_VDB )
973         call->vdb = calc_vdb(bca->alt_pos, bca->npos);
974 
975     return 0;
976 }
977 
bcf_call2bcf(bcf_call_t * bc,bcf1_t * rec,bcf_callret1_t * bcr,int fmt_flag,const bcf_callaux_t * bca,const char * ref)978 int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
979 {
980     extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
981     int i, j, nals = 1;
982 
983     bcf_hdr_t *hdr = bc->bcf_hdr;
984     rec->rid  = bc->tid;
985     rec->pos  = bc->pos;
986     rec->qual = 0;
987 
988     bc->tmp.l = 0;
989     if (bc->ori_ref < 0)    // indel
990     {
991         // REF
992         kputc(ref[bc->pos], &bc->tmp);
993         for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
994 
995         // ALT
996         for (i=1; i<4; i++)
997         {
998             if (bc->a[i] < 0) break;
999             kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp);
1000 
1001             if (bca->indel_types[bc->a[i]] < 0) { // deletion
1002                 for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
1003                     kputc(ref[bc->pos+1+j], &bc->tmp);
1004             } else { // insertion; cannot be a reference unless a bug
1005                 char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
1006                 for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
1007                     kputc("ACGTN"[(int)inscns[j]], &bc->tmp);
1008                 for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
1009             }
1010             nals++;
1011         }
1012     }
1013     else    // SNP
1014     {
1015         kputc("ACGTN"[bc->ori_ref], &bc->tmp);
1016         for (i=1; i<5; i++)
1017         {
1018             if (bc->a[i] < 0) break;
1019             kputc(',', &bc->tmp);
1020             if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
1021             else kputc("ACGT"[bc->a[i]], &bc->tmp);
1022             nals++;
1023         }
1024     }
1025     bcf_update_alleles_str(hdr, rec, bc->tmp.s);
1026 
1027     bc->tmp.l = 0;
1028 
1029     // INFO
1030     if (bc->ori_ref < 0)
1031     {
1032         bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
1033         bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
1034         bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
1035     }
1036     bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
1037     if ( fmt_flag&B2B_INFO_ADF )
1038         bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele);
1039     if ( fmt_flag&B2B_INFO_ADR )
1040         bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele);
1041     if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) )
1042     {
1043         for (i=0; i<rec->n_allele; i++) bc->ADF[i] += bc->ADR[i];
1044         if ( fmt_flag&B2B_INFO_AD )
1045             bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele);
1046         if ( fmt_flag&B2B_INFO_DPR )
1047             bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele);
1048     }
1049     if ( fmt_flag&B2B_INFO_SCR )
1050         bcf_update_info_int32(hdr, rec, "SCR", bc->SCR, 1);
1051 
1052     float tmpf[16];
1053     for (i=0; i<16; i++) tmpf[i] = bc->anno[i];
1054     bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
1055     bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
1056 
1057     if ( bc->vdb != HUGE_VAL )      bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
1058     if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
1059 
1060     if (bca->fmt_flag & B2B_INFO_ZSCORE) {
1061         if ( bc->mwu_pos != HUGE_VAL )
1062             bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
1063         if ( bc->mwu_mq != HUGE_VAL )
1064             bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
1065         if ( bc->mwu_mqs != HUGE_VAL )
1066             bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
1067         if ( bc->mwu_bq != HUGE_VAL )
1068             bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
1069         if ( bc->mwu_sc != HUGE_VAL )
1070             bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
1071     } else {
1072         if ( bc->mwu_pos != HUGE_VAL )
1073             bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
1074         if ( bc->mwu_mq != HUGE_VAL )
1075             bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
1076         if ( bc->mwu_mqs != HUGE_VAL )
1077              bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
1078         if ( bc->mwu_bq != HUGE_VAL )
1079             bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
1080     }
1081 
1082     if ( bc->strand_bias != HUGE_VAL )
1083         bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
1084 
1085 #if CDF_MWU_TESTS
1086     if ( bc->mwu_pos_cdf != HUGE_VAL )  bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
1087     if ( bc->mwu_mq_cdf != HUGE_VAL )   bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
1088     if ( bc->mwu_mqs_cdf != HUGE_VAL )  bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
1089     if ( bc->mwu_bq_cdf != HUGE_VAL )   bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
1090 #endif
1091     tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
1092     bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
1093 
1094     // FORMAT
1095     rec->n_sample = bc->n;
1096     bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample);
1097     if ( fmt_flag&B2B_FMT_DP )
1098     {
1099         int32_t *ptr = (int32_t*) bc->fmt_arr;
1100         for (i=0; i<bc->n; i++)
1101             ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3];
1102         bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample);
1103     }
1104     if ( fmt_flag&B2B_FMT_DV )
1105     {
1106         int32_t *ptr = (int32_t*) bc->fmt_arr;
1107         for (i=0; i<bc->n; i++)
1108             ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3];
1109         bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample);
1110     }
1111     if ( fmt_flag&B2B_FMT_SP )
1112     {
1113         int32_t *ptr = (int32_t*) bc->fmt_arr;
1114         for (i=0; i<bc->n; i++)
1115         {
1116             int fwd_ref = bc->DP4[4*i], rev_ref =  bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3];
1117             if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 )
1118                 ptr[i] = 0;
1119             else
1120             {
1121                 double left, right, two;
1122                 kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two);
1123                 int32_t x = (int)(-4.343 * log(two) + .499);
1124                 if (x > 255) x = 255;
1125                 ptr[i] = x;
1126             }
1127         }
1128         bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample);
1129     }
1130     if ( fmt_flag&B2B_FMT_DP4 )
1131         bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4);
1132     if ( fmt_flag&B2B_FMT_ADF )
1133         bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
1134     if ( fmt_flag&B2B_FMT_ADR )
1135         bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
1136     if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) )
1137     {
1138         for (i=0; i<rec->n_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i];
1139         if ( fmt_flag&B2B_FMT_AD )
1140             bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
1141         if ( fmt_flag&B2B_FMT_DPR )
1142             bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
1143     }
1144     if ( fmt_flag&B2B_FMT_SCR )
1145         bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample);
1146     if ( fmt_flag&B2B_FMT_QS )
1147         bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele);
1148 
1149     return 0;
1150 }
1151