/*  bam2bcf.c -- variant calling.

    Copyright (C) 2010-2012 Broad Institute.
    Copyright (C) 2012-2021 Genome Research Ltd.

    Author: Heng Li <lh3@sanger.ac.uk>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.  */

#include <math.h>
#include <stdint.h>
#include <assert.h>
#include <float.h>
#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include <htslib/kfunc.h>
#include "bam2bcf.h"

extern  void ks_introsort_uint32_t(size_t n, uint32_t a[]);

#define CALL_DEFTHETA 0.83
#define DEF_MAPQ 20

#define CAP_DIST 25

bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ,
                             int delta_baseQ)
{
    bcf_callaux_t *bca;
    if (theta <= 0.) theta = CALL_DEFTHETA;
    bca = (bcf_callaux_t*) calloc(1, sizeof(bcf_callaux_t));
    bca->capQ = 60;
    bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
    bca->min_baseQ = min_baseQ;
    bca->max_baseQ = max_baseQ;
    bca->delta_baseQ = delta_baseQ;
    bca->e = errmod_init(1. - theta);
    bca->min_frac = 0.002;
    bca->min_support = 1;
    bca->per_sample_flt = 0;
    bca->npos = 100;
    bca->ref_pos = (int*) malloc(bca->npos*sizeof(int));
    bca->alt_pos = (int*) malloc(bca->npos*sizeof(int));
    bca->iref_pos= (int*) malloc(bca->npos*sizeof(int));
    bca->ialt_pos= (int*) malloc(bca->npos*sizeof(int));
    bca->nqual = 60;
    bca->ref_mq  = (int*) malloc(bca->nqual*sizeof(int));
    bca->alt_mq  = (int*) malloc(bca->nqual*sizeof(int));
    bca->iref_mq = (int*) malloc(bca->nqual*sizeof(int));
    bca->ialt_mq = (int*) malloc(bca->nqual*sizeof(int));
    bca->ref_bq  = (int*) malloc(bca->nqual*sizeof(int));
    bca->alt_bq  = (int*) malloc(bca->nqual*sizeof(int));
    bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int));
    bca->rev_mqs = (int*) malloc(bca->nqual*sizeof(int));
    return bca;
}

void bcf_call_destroy(bcf_callaux_t *bca)
{
    if (bca == 0) return;
    errmod_destroy(bca->e);
    if (bca->npos) {
        free(bca->ref_pos);  free(bca->alt_pos);
        free(bca->iref_pos); free(bca->ialt_pos);
        bca->npos = 0;
    }
    free(bca->ref_mq); free(bca->alt_mq);
    free(bca->iref_mq); free(bca->ialt_mq);
    free(bca->ref_bq); free(bca->alt_bq);
    free(bca->fwd_mqs); free(bca->rev_mqs);
    bca->nqual = 0;
    free(bca->bases); free(bca->inscns); free(bca);
}
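
/*
 * Minimal usage sketch (illustrative only; the real wiring lives in the
 * mpileup driver in bam_plcmd.c, and the argument values below are
 * assumptions rather than the actual defaults):
 *
 *     bcf_callaux_t *bca = bcf_call_init(-1.0, 13, 60, 30); // theta<=0 selects CALL_DEFTHETA
 *     // ... per-site calls into bcf_call_glfgen()/bcf_call_combine() ...
 *     bcf_call_destroy(bca);
 */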

// position in the sequence with respect to the aligned part of the read
static int get_position(const bam_pileup1_t *p, int *len,
                        int *sc_len, int *sc_dist) {
    int i, j, edist = p->qpos + 1;
    int sc_left = 0, sc_right = 0;
    int sc_left_dist = -1, sc_right_dist = -1;

    // left end
    for (i = 0; i < p->b->core.n_cigar; i++) {
        int cig  = bam_get_cigar(p->b)[i] & BAM_CIGAR_MASK;
        if (cig == BAM_CHARD_CLIP)
            continue;
        else if (cig == BAM_CSOFT_CLIP)
            sc_left += bam_get_cigar(p->b)[i] >> BAM_CIGAR_SHIFT;
        else
            break;
    }
    if (sc_left)
        sc_left_dist = p->qpos+1 - sc_left;
    edist -= sc_left;

    // right end
    for (j = p->b->core.n_cigar-1; j >= i; j--) {
        int cig  = bam_get_cigar(p->b)[j] & BAM_CIGAR_MASK;
        if (cig == BAM_CHARD_CLIP)
            continue;
        else if (cig == BAM_CSOFT_CLIP)
            sc_right += bam_get_cigar(p->b)[j] >> BAM_CIGAR_SHIFT;
        else
            break;
    }
    if (sc_right)
        sc_right_dist = p->b->core.l_qseq - sc_right - p->qpos;

    // Distance to nearest soft-clips and length of that clip.
    if (sc_left_dist >= 0) {
        if (sc_right_dist < 0 || sc_left_dist < sc_right_dist) {
            *sc_len  = sc_left;
            *sc_dist = sc_left_dist;
        }
    } else if (sc_right_dist >= 0) {
        *sc_len  = sc_right;
        *sc_dist = sc_right_dist;
    } else {
        *sc_len  = 0;
        *sc_dist = 0;
    }

    *len = p->b->core.l_qseq - sc_left - sc_right;
    return edist;
}
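
/*
 * Worked example: for a 100bp read with CIGAR 5S90M5S and p->qpos == 20,
 * sc_left == sc_right == 5, so *len == 90 and the returned position is
 * 21 - 5 == 16; the nearer (left) clip sets *sc_len == 5, *sc_dist == 16.
 */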

void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
{
    memset(bca->ref_pos,0,sizeof(int)*bca->npos);
    memset(bca->alt_pos,0,sizeof(int)*bca->npos);
    memset(bca->iref_pos,0,sizeof(int)*bca->npos);
    memset(bca->ialt_pos,0,sizeof(int)*bca->npos);
    memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
    memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
    memset(bca->iref_mq,0,sizeof(int)*bca->nqual);
    memset(bca->ialt_mq,0,sizeof(int)*bca->nqual);
    memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
    memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
    memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
    memset(bca->rev_mqs,0,sizeof(int)*bca->nqual);
    if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
    if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
    if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
    memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
    memset(bca->ref_scl,  0, 100*sizeof(int));
    memset(bca->alt_scl,  0, 100*sizeof(int));
    memset(bca->iref_scl, 0, 100*sizeof(int));
    memset(bca->ialt_scl, 0, 100*sizeof(int));
}

/*
    Notes:
    - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.QS frequencies
        which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the INFO/QS and FMT/QS annotations.
        Later it's used for multiallelic calling by `call -m`, `call -mG` and `+trio-dnm`.
    - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
 */
/*
 * This function is called once for each sample.
 * _n is the number of pileups in pl contributing reads to this sample
 * pl is a pointer to an array of _n pileups (one pileup per read)
 * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
 * bca holds the settings used to perform calls across all samples
 * r is the returned value of the call
 */
int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
{
    int i, n, ref4, is_indel, ori_depth = 0;

    // clean from previous run
    r->ori_depth = 0;
    r->mq0 = 0;
    memset(r->anno,0,sizeof(double)*16);
    memset(r->p,0,sizeof(float)*25);
    r->SCR = 0;

    if (ref_base >= 0) {
        ref4 = seq_nt16_int[ref_base];
        is_indel = 0;
    } else ref4 = 4, is_indel = 1;
    if (_n == 0) return -1;
    // enlarge the bases array if necessary
    if (bca->max_bases < _n) {
        bca->max_bases = _n;
        kroundup32(bca->max_bases);
        bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
    }

    // fill the bases array
    double nqual_over_60 = bca->nqual / 60.0;
    int ADR_ref_missed[4] = {0};
    int ADF_ref_missed[4] = {0};
    for (i = n = 0; i < _n; ++i) {
        const bam_pileup1_t *p = pl + i;
        int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
        if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
        if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
        if (p->is_del && !is_indel) continue;
        ++ori_depth;
        if (is_indel)
        {
            b = p->aux>>16&0x3f;
            seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
            if (q < bca->min_baseQ)
            {
                if (!p->indel && b < 4)
                {
                    if (bam_is_rev(p->b))
                        ADR_ref_missed[b]++;
                    else
                        ADF_ref_missed[b]++;
                }
                continue;
            }
            if (p->indel == 0 && (q < _n/2 || _n > 20)) {
                // high quality indel calls without p->indel set aren't
                // particularly indicative of being a good REF match either,
                // at least not in low coverage.  So require solid coverage
                // before we start utilising such quals.
                b = 0;
                q = (int)bam_get_qual(p->b)[p->qpos];
                seqQ = (3*seqQ + 2*q)/8;
            }
            if (_n > 20 && seqQ > 40) seqQ = 40;
            baseQ  = p->aux>>8&0xff;

            is_diff = (b != 0);
        }
        else
        {
            b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
            b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base

            // Lowest of this and neighbour quality values
            uint8_t *qual = bam_get_qual(p->b);
            q = qual[p->qpos];
            if (p->qpos > 0 &&
                q > qual[p->qpos-1]+bca->delta_baseQ)
                q = qual[p->qpos-1]+bca->delta_baseQ;
            if (p->qpos+1 < p->b->core.l_qseq &&
                q > qual[p->qpos+1]+bca->delta_baseQ)
                q = qual[p->qpos+1]+bca->delta_baseQ;

            if (q < bca->min_baseQ) continue;
            if (q > bca->max_baseQ) q = bca->max_baseQ;
            baseQ = q;
            seqQ  = 99;
            is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
        }
        mapQ  = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
        if ( !mapQ ) r->mq0++;
        if (q > seqQ) q = seqQ;
        mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
        if (q > mapQ) q = mapQ;
        if (q > 63) q = 63;
        if (q < 4) q = 4;       // MQ=0 reads count as BQ=4
        bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
        // collect annotations
        if (b < 4)
        {
            r->QS[b] += q;
            if ( r->ADF )
            {
                if ( bam_is_rev(p->b) )
                    r->ADR[b]++;
                else
                    r->ADF[b]++;
            }
        }
        ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)];
        min_dist = p->b->core.l_qseq - 1 - p->qpos;
        if (min_dist > p->qpos) min_dist = p->qpos;
        if (min_dist > CAP_DIST) min_dist = CAP_DIST;
        r->anno[1<<2|is_diff<<1|0] += baseQ;
        r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
        r->anno[2<<2|is_diff<<1|0] += mapQ;
        r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
        r->anno[3<<2|is_diff<<1|0] += min_dist;
        r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;

        // collect for bias tests
        if ( baseQ > 59 ) baseQ = 59;
        if ( mapQ > 59 ) mapQ = 59;
        int len, epos = 0, sc_len = 0, sc_dist = 0;
        if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) )
        {
            int pos = get_position(p, &len, &sc_len, &sc_dist);
            epos = (double)pos/(len+1) * bca->npos;

            if (sc_len) {
                sc_len = 15.0*sc_len / sc_dist;
                if (sc_len > 99) sc_len = 99;
            }
        }

        int imq  = mapQ * nqual_over_60;
        int ibq  = baseQ * nqual_over_60;

        if ( bam_is_rev(p->b) )
            bca->rev_mqs[imq]++;
        else
            bca->fwd_mqs[imq]++;

        if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
        {
            bca->ref_pos[epos]++;
            bca->ref_bq[ibq]++;
            bca->ref_mq[imq]++;
            bca->ref_scl[sc_len]++;
        }
        else
        {
            bca->alt_pos[epos]++;
            bca->alt_bq[ibq]++;
            bca->alt_mq[imq]++;
            bca->alt_scl[sc_len]++;
        }
    }

    // Compensate for AD not being counted on low quality REF indel matches.
    if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 )
    {
        for (i=0; i<4; i++)
        {
            r->ADR[0] += ADR_ref_missed[i];
            r->ADF[0] += ADF_ref_missed[i];
        }
    }
    else if ( r->ADF && bca->ambig_reads==B2B_INC_AD )
    {
        int dp = 0, dp_ambig = 0;
        for (i=0; i<4; i++) dp += r->ADR[i];
        for (i=0; i<4; i++) dp_ambig += ADR_ref_missed[i];
        if ( dp )
            for (i=0; i<4; i++) r->ADR[i] += lroundf((float)dp_ambig * r->ADR[i]/dp);
        dp = 0, dp_ambig = 0;
        for (i=0; i<4; i++) dp += r->ADF[i];
        for (i=0; i<4; i++) dp_ambig += ADF_ref_missed[i];
        if ( dp )
            for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
    }

    r->ori_depth = ori_depth;
    // glfgen
    errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
    return n;
}
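
/*
 * Layout of the packed entries written to bca->bases above (one per used read):
 *
 *     entry = capped_quality << 5 | reverse_strand_flag << 4 | base_code
 *
 * i.e. bits 0-3 carry the base (or indel allele) code, bit 4 the strand and
 * bits 5-10 the quality, which the code above clamps to the range [4,63].
 * This is the form consumed by errmod_cal() to produce the genotype
 * likelihoods in r->p[].
 */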


/*
 *  calc_vdb() - returns value between zero (most biased) and one (no bias)
 *               on success, or HUGE_VAL when VDB cannot be calculated because
 *               of insufficient depth (<2x)
 *
 *  Variant Distance Bias tests if the variant bases are positioned within the
 *  reads with sufficient randomness. Unlike other tests, it looks only at
 *  variant reads and therefore gives a different kind of information than Read
 *  Position Bias for instance. VDB was developed for detecting artefacts in
 *  RNA-seq calls where reads from spliced transcripts span splice site
 *  boundaries.  The current implementation differs somewhat from the original
 *  version described in supplementary material of PMID:22524474, but the idea
 *  remains the same. (Here the random variable tested is the average distance
 *  from the averaged position, not the average pairwise distance.)
 *
 *  For coverage of 2x, the calculation is exact but is approximated for the
 *  rest. The result is most accurate between 4-200x. For 3x or >200x, the
 *  reported values are slightly more favourable than those of a true random
 *  distribution.
 */
double calc_vdb(int *pos, int npos)
{
    // Note well: the parameters were obtained by fitting to simulated data of
    // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen().
    const int readlen = 100;
    assert( npos==readlen );

    #define nparam 15
    const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5},
        {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8},
        {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7},
        {200,0.7,23.7} };

    int i, dp = 0;
    float mean_pos = 0, mean_diff = 0;
    for (i=0; i<npos; i++)
    {
        if ( !pos[i] ) continue;
        dp += pos[i];
        mean_pos += pos[i]*i;
    }
    if ( dp<2 ) return HUGE_VAL;     // one or zero reads can be placed anywhere

    mean_pos /= dp;
    for (i=0; i<npos; i++)
    {
        if ( !pos[i] ) continue;
        mean_diff += pos[i] * fabs(i - mean_pos);
    }
    mean_diff /= dp;

    int ipos = mean_diff;   // tuned for float-to-int implicit conversion
    if ( dp==2 )
        return (2*readlen-2*(ipos+1)-1)*(ipos+1)/(readlen-1)/(readlen*0.5);

    if ( dp>=200 )
        i = nparam; // shortcut for big depths
    else
    {
        for (i=0; i<nparam; i++)
            if ( param[i][0]>=dp ) break;
    }
    float pshift, pscale;
    if ( i==nparam )
    {
        // the depth is too high, go with 200x
        pscale = param[nparam-1][1];
        pshift = param[nparam-1][2];
    }
    else if ( i>0 && param[i][0]!=dp )
    {
        // linear interpolation of parameters
        pscale = (param[i-1][1] + param[i][1])*0.5;
        pshift = (param[i-1][2] + param[i][2])*0.5;
    }
    else
    {
        pscale = param[i][1];
        pshift = param[i][2];
    }
    return 0.5*kf_erfc(-(mean_diff-pshift)*pscale);
}
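
/*
 * Example of the statistic only (the final value also depends on the fitted
 * shift/scale table above): three ALT reads at scaled positions 10, 20 and 60
 * give dp == 3, mean_pos == 30 and mean_diff == (20 + 10 + 30)/3 == 20, which
 * is then mapped to a probability via 0.5*kf_erfc(-(mean_diff-pshift)*pscale).
 */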

double calc_chisq_bias(int *a, int *b, int n)
{
    int na = 0, nb = 0, i, ndf = n;
    for (i=0; i<n; i++) na += a[i];
    for (i=0; i<n; i++) nb += b[i];
    if ( !na || !nb ) return HUGE_VAL;

    double chisq = 0;
    for (i=0; i<n; i++)
    {
        if ( !a[i] && !b[i] ) ndf--;
        else
        {
            double tmp = a[i] - b[i];
            chisq += tmp*tmp/(a[i]+b[i]);
        }
    }
    /*
        kf_gammaq: incomplete gamma function Q(a,x) = 1 - P(a,x) = Gamma(a,x)/Gamma(a)
        1 if the distributions are identical, 0 if very different
    */
    double prob = kf_gammaq(0.5*ndf, 0.5*chisq);
    return prob;
}

static double mann_whitney_1947_(int n, int m, int U)
{
    if (U<0) return 0;
    if (n==0||m==0) return U==0 ? 1 : 0;
    return (double)n/(n+m)*mann_whitney_1947_(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947_(n,m-1,U);
}
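
/*
 * The recursion above is the exact Mann-Whitney (1947) null distribution of U:
 *
 *     P(U; n, m) = n/(n+m) * P(U-m; n-1, m) + m/(n+m) * P(U; n, m-1)
 *
 * with P(U; n, 0) = P(U; 0, m) = [U == 0]. Evaluated naively it is exponential
 * in n+m, hence the mw.h lookup table below and the normal approximation used
 * by the callers for larger group sizes.
 */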

double mann_whitney_1947(int n, int m, int U)
{
    #include "mw.h"

    assert(n >= 2 && m >= 2);

    return (n < 8 && m < 8 && U < 50)
        ? mw[n-2][m-2][U]
        : mann_whitney_1947_(n,m,U);
}

double mann_whitney_1947_cdf(int n, int m, int U)
{
    int i;
    double sum = 0;
    for (i=0; i<=U; i++)
        sum += mann_whitney_1947(n,m,i);
    return sum;
}

double calc_mwu_bias_cdf(int *a, int *b, int n)
{
    int na = 0, nb = 0, i;
    double U = 0;
    //double ties = 0;
    for (i=0; i<n; i++)
    {
        na += a[i];
        U  += a[i] * (nb + b[i]*0.5);
        nb += b[i];
        // if ( a[i] && b[i] )
        // {
        //     double tie = a[i] + b[i];
        //     ties += (tie*tie-1)*tie;
        // }
    }
    if ( !na || !nb ) return HUGE_VAL;

    // Always work with the smaller U
    double U_min = ((double)na * nb) - U;
    if ( U < U_min ) U_min = U;

    if ( na==1 ) return 2.0 * (floor(U_min)+1) / (nb+1);
    if ( nb==1 ) return 2.0 * (floor(U_min)+1) / (na+1);

    // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
    if ( na>=8 || nb>=8 )
    {
        double mean = ((double)na*nb)*0.5;
        // Correction for ties:
        //      double N = na+nb;
        //      double var2 = (N*N-1)*N-ties;
        //      if ( var2==0 ) return 1.0;
        //      var2 *= ((double)na*nb)/N/(N-1)/12.0;
        // No correction for ties:
        double var2 = ((double)na*nb)*(na+nb+1)/12.0;
        double z = (U_min - mean)/sqrt(2*var2);   // z is N(0,1)
        return 2.0 - kf_erfc(z);  // which is 1 + erf(z)
    }

    // Exact calculation
    double pval = 2*mann_whitney_1947_cdf(na,nb,U_min);
    return pval>1 ? 1 : pval;
}
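
/*
 * Note on the normal approximation above: with z = (U_min - mean)/sqrt(2*var2),
 * 2 - erfc(z) equals 1 + erf(z) = 2*Phi((U_min - mean)/sqrt(var2)), i.e. twice
 * the lower-tail normal probability of the smaller U statistic -- the usual
 * two-sided p-value approximation for the test (here without tie correction).
 */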

double calc_mwu_bias(int *a, int *b, int n, int left)
{
    int na = 0, nb = 0, i;
    double U = 0;
    // double ties = 0;
    for (i=0; i<n; i++)
    {
        if (!a[i]) {
            if (!b[i]) continue;
            nb += b[i];
        } else if (!b[i]) {
            na += a[i];
            U  += a[i] * nb;
        } else {
            na += a[i];
            U  += a[i] * (nb + b[i]*0.5);
            nb += b[i];
            // double tie = a[i] + b[i];
            // ties += (tie*tie-1)*tie;
        }
    }
    if ( !na || !nb ) return HUGE_VAL;
    if ( na==1 || nb==1 ) return 1.0;       // Flat probability, all U values are equally likely

    double mean = ((double)na*nb)*0.5;
    if (left && U > mean) return 1; // for MQB which is asymmetrical
    if ( na==2 || nb==2 )
    {
        // Linear approximation
        return U>mean ? (2.0*mean-U)/mean : U/mean;
    }
    // Correction for ties:
    //      double N = na+nb;
    //      double var2 = (N*N-1)*N-ties;
    //      if ( var2==0 ) return 1.0;
    //      var2 *= ((double)na*nb)/N/(N-1)/12.0;
    // No correction for ties:
    double var2 = ((double)na*nb)*(na+nb+1)/12.0;
    if ( na>=8 || nb>=8 )
    {
        // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
        return exp(-0.5*(U-mean)*(U-mean)/var2);
    }

    // Exact calculation
    return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
}

// A Z-score version of the above function.
//
// See "Normal approximation and tie correction" at
// https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
//
// The Z score is the number of standard deviations above or below the mean
// with 0 being equality of the two distributions and +ve/-ve from there.
//
// This is a more robust score to filter on.
double calc_mwu_biasZ(int *a, int *b, int n, int left_only, int do_Z) {
    int i;
    int64_t t;

    // Optimisation
    for (i = 0; i < n; i++)
        if (b[i])
            break;
    int b_empty = (i == n);

    // Count equal (e), less-than (l) and greater-than (g) permutations.
    int e = 0, l = 0, na = 0, nb = 0;
    if (b_empty) {
        for (t = 0, i = n-1; i >= 0; i--) {
            na += a[i];
            t += (a[i]*a[i]-1)*a[i];  // adjustment score for ties
        }
    } else {
        for (t = 0, i = n-1; i >= 0; i--) {
            // Combinations of a[i] and b[j] for i==j
            e += a[i]*b[i];

            // nb is running total of b[i+1]..b[n-1].
            // Therefore a[i]*nb is the number of combinations of a[i] and b[j]
            // for all i < j.
            l += a[i]*nb;    // a<b

            na += a[i];
            nb += b[i];
            int p = a[i]+b[i];
            t += (p*p-1)*p;  // adjustment score for ties
        }
    }

    if (!na || !nb)
        return HUGE_VAL;

    double U, m;
    U = l + e*0.5; // Mann-Whitney U score
    m = na*nb / 2.0;

    // With ties adjustment
    double var2 = (na*nb)/12.0 * ((na+nb+1) - t/(double)((na+nb)*(na+nb-1)));
    // var = na*nb*(na+nb+1)/12.0; // simpler; minus tie adjustment
    if (var2 <= 0)
        return do_Z ? 0 : 1;

    if (do_Z) {
        // S.D. normalised Z-score
        //Z = (U - m - (U-m >= 0 ? 0.5 : -0.5)) / sd; // gatk method?
        return (U - m) / sqrt(var2);
    }

    // Else U score, which can be asymmetric for some data types.
    if (left_only && U > m)
        return HUGE_VAL; // one-sided, +ve bias is OK, -ve is not.

    if (na >= 8 || nb >= 8) {
        // Normal approximation, very good for na>=8 && nb>=8 and
        // reasonable if na<8 or nb<8
        return exp(-0.5*(U-m)*(U-m)/var2);
    }

    // Exact calculation
    if (na==1 || nb == 1)
        return mann_whitney_1947_(na, nb, U) * sqrt(2*M_PI*var2);
    else
        return mann_whitney_1947(na, nb, U) * sqrt(2*M_PI*var2);
}
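
/*
 * The variance used above is the standard tie-corrected form
 *
 *     var(U) = na*nb/12 * ( (na+nb+1) - sum_k(t_k^3 - t_k) / ((na+nb)*(na+nb-1)) )
 *
 * where t_k is the number of observations sharing the k-th value, and
 * Z = (U - na*nb/2) / sqrt(var(U)). For example, na == nb == 4 with no ties
 * gives mean 8 and var(U) == 12, so an extreme U of 0 or 16 corresponds to
 * |Z| of about 2.31.
 */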

// Numerically stable log(exp(a) + exp(b))
static inline double logsumexp2(double a, double b)
{
    if ( a>b )
        return log(1 + exp(b-a)) + a;
    else
        return log(1 + exp(a-b)) + b;
}

void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call)
{
    call->seg_bias = HUGE_VAL;
    if ( !bcr ) return;

    int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads
    if ( !nr ) return;

    int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n;    // average depth
    double M   = floor((double)nr / avg_dp + 0.5);   // an approximate number of variant samples in the population
    if ( M>call->n ) M = call->n;       // clamp M at the number of samples
    else if ( M==0 ) M = 1;
    double f = M / 2. / call->n;        // allele frequency
    double p = (double) nr / call->n;   // number of variant reads per sample expected if variant not real (poisson)
    double q = (double) nr / M;         // number of variant reads per sample expected if variant is real (poisson)
    double sum = 0;
    const double log2 = log(2.0);

    // fprintf(stderr,"M=%.1f  p=%e q=%e f=%f  dp=%d\n",M,p,q,f,avg_dp);
    int i;
    for (i=0; i<call->n; i++)
    {
        int oi = bcr[i].anno[2] + bcr[i].anno[3];       // observed number of non-ref reads
        double tmp;
        if ( oi )
        {
            // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow
            tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q);
            tmp += log(f) + oi*log(q/p) - q + p;
        }
        else
            tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p;
        sum += tmp;
        // fprintf(stderr,"oi=%d %e\n", oi,tmp);
    }
    call->seg_bias = sum;
}
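
/*
 * One reading of the per-sample term above (an interpretation, not taken from
 * the original documentation): with q and 2q the expected ALT depth of a het
 * and a hom-alt carrier, p the expected noise depth and f the allele
 * frequency, each sample contributes the log of
 *
 *     2f(1-f)*Pois(oi;q)/Pois(oi;p) + f^2*Pois(oi;2q)/Pois(oi;p) + (1-f)^2*[oi==0]/Pois(oi;p)
 *
 * i.e. a Hardy-Weinberg mixture of Poisson read-count models for a real
 * variant, relative to the "ALT reads are noise" model; SGB is the sum of
 * these per-sample log ratios.
 */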

/**
 *  bcf_call_combine() - sets the PL array and the VDB and RPB annotations, finds the top alleles
 *  @n:         number of samples
 *  @calls:     each sample's calls
 *  @bca:       auxiliary data structure for holding temporary values
 *  @ref_base:  the reference base
 *  @call:      filled with the annotations
 *
 *  Combines calls across the various samples being studied:
 *  1. For each allele at each base across all samples the quality is summed, so
 *     you end up with a set of quality sums for each allele present.
 *  2. The quality sums are sorted.
 *  3. Using the sorted quality sums we now create the allele ordering array
 *     A0..AN. This is done as follows:
 *     a) If the reference allele is known it always comes first, otherwise N
 *        comes first.
 *     b) Then the rest of the alleles are output in descending order of quality
 *        sum (the qsum array is already sorted). Any alleles with a qsum of 0
 *        are excluded.
 *  4. Using the allele ordering array we create the genotype ordering array.
 *     In the worst case with an unknown reference this will be:  A0/A0 A1/A0 A1/A1
 *     A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4
 *  5. The genotype ordering array is then used to extract data from the error
 *     model 5*5 matrix and is used to produce a Phred-scaled likelihood array
 *     for each sample.
 */
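/*
 * Example of steps 4-5 (illustrative values): with three alleles ordered as
 * call->a = {0,2,3} (REF=A, ALTs=G,T), the genotype index array built below is
 *     g = { a0*5+a0, a0*5+a1, a1*5+a1, a0*5+a2, a1*5+a2, a2*5+a2 }
 *       = { 0, 2, 12, 3, 13, 18 },
 * i.e. PL entries in the usual VCF ordering A/A, A/G, G/G, A/T, G/T, T/T,
 * each indexing into the per-sample 5x5 error-model matrix r->p[].
 */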
int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
{
    int ref4, i, j;
    float qsum[B2B_MAX_ALLELES] = {0,0,0,0,0};
    if (ref_base >= 0) {
        call->ori_ref = ref4 = seq_nt16_int[ref_base];
        if (ref4 > 4) ref4 = 4;
    } else call->ori_ref = -1, ref4 = 0;

    // calculate qsum, this is done by summing normalized qsum across all samples,
    // to account for differences in coverage
    for (i = 0; i < n; ++i)
    {
        float sum = 0;
        for (j = 0; j < 4; ++j) sum += calls[i].QS[j];
        if ( sum )
            for (j = 0; j < 4; j++) qsum[j] += (float)calls[i].QS[j] / sum;
    }

    // sort qsum in ascending order (insertion sort)
    float *ptr[5], *tmp;
    for (i=0; i<5; i++) ptr[i] = &qsum[i];
    for (i=1; i<4; i++)
        for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--)
            tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp;

    // Set the reference allele and alternative allele(s)
    for (i=0; i<5; i++) call->a[i] = -1;
    for (i=0; i<B2B_MAX_ALLELES; i++) call->qsum[i] = 0;
    call->unseen = -1;
    call->a[0] = ref4;
    for (i=3, j=1; i>=0; i--)   // i: alleles sorted by QS; j, a[j]: output allele ordering
    {
        int ipos = ptr[i] - qsum;   // position in sorted qsum array
        if ( ipos==ref4 )
            call->qsum[0] = qsum[ipos];    // REF's qsum
        else
        {
            if ( !qsum[ipos] ) break;       // qsum is 0, this and subsequent alleles are not seen in the pileup
            call->qsum[j] = qsum[ipos];
            call->a[j++]  = ipos;
        }
    }
    if (ref_base >= 0)
    {
        // for SNPs, find the "unseen" base
        if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
            call->unseen = j, call->a[j++] = ptr[i] - qsum;
        call->n_alleles = j;
    }
    else
    {
        call->n_alleles = j;
        if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
    }
    /*
     * Set the Phred-scaled likelihood array (call->PL). This array is 15 entries
     * long for each sample because that is the size of an upper or lower triangle
     * of a worst-case 5x5 matrix of possible genotypes. This worst case will
     * occur when all 4 possible alleles are present and the reference allele
     * is unknown.  The sides of the matrix correspond to the reference
     * allele (if known) followed by the alleles present in descending order of
     * quality sum.
     */
    {
        int x, g[15], z;
        double sum_min = 0.;
        x = call->n_alleles * (call->n_alleles + 1) / 2;
        // get the possible genotypes
        // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix
        for (i = z = 0; i < call->n_alleles; ++i) {
            for (j = 0; j <= i; ++j) {
                g[z++] = call->a[j] * 5 + call->a[i];
            }
        }
        // for each sample calculate the PL
        for (i = 0; i < n; ++i)
        {
            int32_t *PL = call->PL + x * i;
            const bcf_callret1_t *r = calls + i;
            float min = FLT_MAX;
            for (j = 0; j < x; ++j) {
                if (min > r->p[g[j]]) min = r->p[g[j]];
            }
            sum_min += min;
            for (j = 0; j < x; ++j) {
                int y;
                y = (int)(r->p[g[j]] - min + .499);
                if (y > 255) y = 255;
                PL[j] = y;
            }
        }
        if ( call->DP4 )
        {
            for (i=0; i<n; i++)
            {
                call->DP4[4*i]   = calls[i].anno[0];
                call->DP4[4*i+1] = calls[i].anno[1];
                call->DP4[4*i+2] = calls[i].anno[2];
                call->DP4[4*i+3] = calls[i].anno[3];
            }
        }
        if ( call->SCR )
        {
            for (i=0; i<n; i++)
            {
                call->SCR[0]  += calls[i].SCR;
                call->SCR[1+i] = calls[i].SCR;
            }
        }
        if ( call->ADF )
        {
            assert( call->n_alleles<=B2B_MAX_ALLELES );   // this is always true for SNPs and so far for indels as well

            // reorder ADR,ADF to match the allele ordering at this site
            int32_t tmp[B2B_MAX_ALLELES];
            int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES;
            int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES;
            int32_t *adr_tot = call->ADR;   // the first bin stores total counts per site
            int32_t *adf_tot = call->ADF;
            for (i=0; i<n; i++)
            {
                for (j=0; j<call->n_alleles; j++)
                {
                    tmp[j] = adr[ call->a[j] ];
                    adr_tot[j] += tmp[j];
                }
                for (j=0; j<call->n_alleles; j++) adr_out[j] = tmp[j];
                for (j=0; j<call->n_alleles; j++)
                {
                    tmp[j] = adf[ call->a[j] ];
                    adf_tot[j] += tmp[j];
                }
                for (j=0; j<call->n_alleles; j++) adf_out[j] = tmp[j];
                adf_out += call->n_alleles;
                adr_out += call->n_alleles;
                adr += B2B_MAX_ALLELES;
                adf += B2B_MAX_ALLELES;
            }
        }
        if ( bca->fmt_flag & B2B_FMT_QS )
        {
            assert( call->n_alleles<=B2B_MAX_ALLELES );   // this is always true for SNPs and so far for indels as well

            // reorder QS to match the allele ordering at this site
            int32_t tmp[B2B_MAX_ALLELES];
            int32_t *qs = call->QS, *qs_out = call->QS;
            for (i=0; i<n; i++)
            {
                for (j=0; j<call->n_alleles; j++) tmp[j] = qs[ call->a[j] ];
                for (j=0; j<call->n_alleles; j++) qs_out[j] = tmp[j] < BCF_MAX_BT_INT32 ? tmp[j] : BCF_MAX_BT_INT32;
                qs_out += call->n_alleles;
                qs += B2B_MAX_ALLELES;
            }
        }

//      if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
        call->shift = (int)(sum_min + .499);
    }
    // combine annotations
    memset(call->anno, 0, 16 * sizeof(double));
    call->ori_depth = 0;
    call->depth     = 0;
    call->mq0       = 0;
    for (i = 0; i < n; ++i) {
        call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3];
        call->ori_depth += calls[i].ori_depth;
        call->mq0 += calls[i].mq0;
        for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
    }

    calc_SegBias(calls, call);

    // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
    // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
    // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);

    if (bca->fmt_flag & B2B_INFO_ZSCORE) {
        // U z-normalised as +/- number of standard deviations from mean.
        if (call->ori_ref < 0) {
            if (bca->fmt_flag & B2B_INFO_RPB)
                call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
                                               bca->npos, 0, 1);
            call->mwu_mq  = calc_mwu_biasZ(bca->iref_mq,  bca->ialt_mq,
                                           bca->nqual,1,1);
            if ( bca->fmt_flag & B2B_INFO_SCB )
                call->mwu_sc  = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl,
                                               100, 0,1);
        } else {
            if (bca->fmt_flag & B2B_INFO_RPB)
                call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
                                               bca->npos, 0, 1);
            call->mwu_mq  = calc_mwu_biasZ(bca->ref_mq,  bca->alt_mq,
                                           bca->nqual,1,1);
            call->mwu_bq  = calc_mwu_biasZ(bca->ref_bq,  bca->alt_bq,
                                           bca->nqual,0,1);
            call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
                                           bca->nqual,0,1);
            if ( bca->fmt_flag & B2B_INFO_SCB )
                call->mwu_sc  = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
                                               100, 0,1);
        }
    } else {
        // Old method; U as probability between 0 and 1
        if ( bca->fmt_flag & B2B_INFO_RPB )
            call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
                                           bca->npos, 0, 0);
        call->mwu_mq  = calc_mwu_biasZ(bca->ref_mq,  bca->alt_mq,
                                       bca->nqual, 1, 0);
        call->mwu_bq  = calc_mwu_biasZ(bca->ref_bq,  bca->alt_bq,
                                       bca->nqual, 0, 0);
        call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
                                       bca->nqual, 0, 0);
    }

#if CDF_MWU_TESTS
    // CDF version of MWU tests is not calculated by default
    if ( bca->fmt_flag & B2B_INFO_RPB )
        call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos);
    call->mwu_mq_cdf  = calc_mwu_bias_cdf(bca->ref_mq,  bca->alt_mq,  bca->nqual);
    call->mwu_bq_cdf  = calc_mwu_bias_cdf(bca->ref_bq,  bca->alt_bq,  bca->nqual);
    call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
#endif

    if ( bca->fmt_flag & B2B_INFO_VDB )
        call->vdb = calc_vdb(bca->alt_pos, bca->npos);

    return 0;
}

int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
{
    extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
    int i, j, nals = 1;

    bcf_hdr_t *hdr = bc->bcf_hdr;
    rec->rid  = bc->tid;
    rec->pos  = bc->pos;
    rec->qual = 0;

    bc->tmp.l = 0;
    if (bc->ori_ref < 0)    // indel
    {
        // REF
        kputc(ref[bc->pos], &bc->tmp);
        for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);

        // ALT
        for (i=1; i<4; i++)
        {
            if (bc->a[i] < 0) break;
            kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp);

            if (bca->indel_types[bc->a[i]] < 0) { // deletion
                for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
                    kputc(ref[bc->pos+1+j], &bc->tmp);
            } else { // insertion; cannot be the reference allele unless there is a bug
                char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
                for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
                    kputc("ACGTN"[(int)inscns[j]], &bc->tmp);
                for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
            }
            nals++;
        }
    }
    else    // SNP
    {
        kputc("ACGTN"[bc->ori_ref], &bc->tmp);
        for (i=1; i<5; i++)
        {
            if (bc->a[i] < 0) break;
            kputc(',', &bc->tmp);
            if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
            else kputc("ACGT"[bc->a[i]], &bc->tmp);
            nals++;
        }
    }
    bcf_update_alleles_str(hdr, rec, bc->tmp.s);

    bc->tmp.l = 0;

    // INFO
    if (bc->ori_ref < 0)
    {
        bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
        bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
        bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
    }
    bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
    if ( fmt_flag&B2B_INFO_ADF )
        bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele);
    if ( fmt_flag&B2B_INFO_ADR )
        bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele);
    if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) )
    {
        for (i=0; i<rec->n_allele; i++) bc->ADF[i] += bc->ADR[i];
        if ( fmt_flag&B2B_INFO_AD )
            bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele);
        if ( fmt_flag&B2B_INFO_DPR )
            bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele);
    }
    if ( fmt_flag&B2B_INFO_SCR )
        bcf_update_info_int32(hdr, rec, "SCR", bc->SCR, 1);

    float tmpf[16];
    for (i=0; i<16; i++) tmpf[i] = bc->anno[i];
    bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
    bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);

    if ( bc->vdb != HUGE_VAL )      bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
    if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);

    if (bca->fmt_flag & B2B_INFO_ZSCORE) {
        if ( bc->mwu_pos != HUGE_VAL )
            bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
        if ( bc->mwu_mq != HUGE_VAL )
            bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
        if ( bc->mwu_mqs != HUGE_VAL )
            bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
        if ( bc->mwu_bq != HUGE_VAL )
            bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
        if ( bc->mwu_sc != HUGE_VAL )
            bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
    } else {
        if ( bc->mwu_pos != HUGE_VAL )
            bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
        if ( bc->mwu_mq != HUGE_VAL )
            bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
        if ( bc->mwu_mqs != HUGE_VAL )
            bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
        if ( bc->mwu_bq != HUGE_VAL )
            bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
    }

    if ( bc->strand_bias != HUGE_VAL )
        bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);

#if CDF_MWU_TESTS
    if ( bc->mwu_pos_cdf != HUGE_VAL )  bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
    if ( bc->mwu_mq_cdf != HUGE_VAL )   bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
    if ( bc->mwu_mqs_cdf != HUGE_VAL )  bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
    if ( bc->mwu_bq_cdf != HUGE_VAL )   bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
#endif
    tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
    bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);

    // FORMAT
    rec->n_sample = bc->n;
    bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample);
    if ( fmt_flag&B2B_FMT_DP )
    {
        int32_t *ptr = (int32_t*) bc->fmt_arr;
        for (i=0; i<bc->n; i++)
            ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3];
        bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample);
    }
    if ( fmt_flag&B2B_FMT_DV )
    {
        int32_t *ptr = (int32_t*) bc->fmt_arr;
        for (i=0; i<bc->n; i++)
            ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3];
        bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample);
    }
    if ( fmt_flag&B2B_FMT_SP )
    {
        int32_t *ptr = (int32_t*) bc->fmt_arr;
        for (i=0; i<bc->n; i++)
        {
            int fwd_ref = bc->DP4[4*i], rev_ref =  bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3];
            if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 )
                ptr[i] = 0;
            else
            {
                double left, right, two;
                kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two);
                int32_t x = (int)(-4.343 * log(two) + .499);  // -4.343 ~ -10/ln(10): Phred-scale the two-sided P-value
                if (x > 255) x = 255;
                ptr[i] = x;
            }
        }
        bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample);
    }
    if ( fmt_flag&B2B_FMT_DP4 )
        bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4);
    if ( fmt_flag&B2B_FMT_ADF )
        bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
    if ( fmt_flag&B2B_FMT_ADR )
        bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
    if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) )
    {
        for (i=0; i<rec->n_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i];
        if ( fmt_flag&B2B_FMT_AD )
            bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
        if ( fmt_flag&B2B_FMT_DPR )
            bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
    }
    if ( fmt_flag&B2B_FMT_SCR )
        bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample);
    if ( fmt_flag&B2B_FMT_QS )
        bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele);

    return 0;
}
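
/*
 * Overall flow, as a sketch only (the real driver is mpileup in bam_plcmd.c;
 * variable names and the exact ordering of the clean-up call are assumptions):
 *
 *     for each pileup position:
 *         bcf_callaux_clean(bca, &bc);                  // reset per-site counters
 *         for each sample s:
 *             bcf_call_glfgen(n_plp[s], plp[s], ref_base, bca, &bcr[s]);
 *         bcf_call_combine(nsamples, bcr, bca, ref_base, &bc);
 *         bcf_call2bcf(&bc, rec, bcr, fmt_flag, bca, ref);
 */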