1 /* bam2bcf.c -- variant calling.
2
3 Copyright (C) 2010-2012 Broad Institute.
4 Copyright (C) 2012-2021 Genome Research Ltd.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <math.h>
27 #include <stdint.h>
28 #include <assert.h>
29 #include <float.h>
30 #include <htslib/hts.h>
31 #include <htslib/sam.h>
32 #include <htslib/kstring.h>
33 #include <htslib/kfunc.h>
34 #include "bam2bcf.h"
35
36 extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
37
38 #define CALL_DEFTHETA 0.83
39 #define DEF_MAPQ 20
40
41 #define CAP_DIST 25
42
bcf_call_init(double theta,int min_baseQ,int max_baseQ,int delta_baseQ)43 bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ,
44 int delta_baseQ)
45 {
46 bcf_callaux_t *bca;
47 if (theta <= 0.) theta = CALL_DEFTHETA;
48 bca = (bcf_callaux_t*) calloc(1, sizeof(bcf_callaux_t));
49 bca->capQ = 60;
50 bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
51 bca->min_baseQ = min_baseQ;
52 bca->max_baseQ = max_baseQ;
53 bca->delta_baseQ = delta_baseQ;
54 bca->e = errmod_init(1. - theta);
55 bca->min_frac = 0.002;
56 bca->min_support = 1;
57 bca->per_sample_flt = 0;
58 bca->npos = 100;
59 bca->ref_pos = (int*) malloc(bca->npos*sizeof(int));
60 bca->alt_pos = (int*) malloc(bca->npos*sizeof(int));
61 bca->iref_pos= (int*) malloc(bca->npos*sizeof(int));
62 bca->ialt_pos= (int*) malloc(bca->npos*sizeof(int));
63 bca->nqual = 60;
64 bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int));
65 bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int));
66 bca->iref_mq = (int*) malloc(bca->nqual*sizeof(int));
67 bca->ialt_mq = (int*) malloc(bca->nqual*sizeof(int));
68 bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int));
69 bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int));
70 bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int));
71 bca->rev_mqs = (int*) malloc(bca->nqual*sizeof(int));
72 return bca;
73 }
74
bcf_call_destroy(bcf_callaux_t * bca)75 void bcf_call_destroy(bcf_callaux_t *bca)
76 {
77 if (bca == 0) return;
78 errmod_destroy(bca->e);
79 if (bca->npos) {
80 free(bca->ref_pos); free(bca->alt_pos);
81 free(bca->iref_pos); free(bca->ialt_pos);
82 bca->npos = 0;
83 }
84 free(bca->ref_mq); free(bca->alt_mq);
85 free(bca->iref_mq); free(bca->ialt_mq);
86 free(bca->ref_bq); free(bca->alt_bq);
87 free(bca->fwd_mqs); free(bca->rev_mqs);
88 bca->nqual = 0;
89 free(bca->bases); free(bca->inscns); free(bca);
90 }
91
92 // position in the sequence with respect to the aligned part of the read
get_position(const bam_pileup1_t * p,int * len,int * sc_len,int * sc_dist)93 static int get_position(const bam_pileup1_t *p, int *len,
94 int *sc_len, int *sc_dist) {
95 int i, j, edist = p->qpos + 1;
96 int sc_left = 0, sc_right = 0;
97 int sc_left_dist = -1, sc_right_dist = -1;
98
99 // left end
100 for (i = 0; i < p->b->core.n_cigar; i++) {
101 int cig = bam_get_cigar(p->b)[i] & BAM_CIGAR_MASK;
102 if (cig == BAM_CHARD_CLIP)
103 continue;
104 else if (cig == BAM_CSOFT_CLIP)
105 sc_left += bam_get_cigar(p->b)[i] >> BAM_CIGAR_SHIFT;
106 else
107 break;
108 }
109 if (sc_left)
110 sc_left_dist = p->qpos+1 - sc_left;
111 edist -= sc_left;
112
113 // right end
114 for (j = p->b->core.n_cigar-1; j >= i; j--) {
115 int cig = bam_get_cigar(p->b)[j] & BAM_CIGAR_MASK;
116 if (cig == BAM_CHARD_CLIP)
117 continue;
118 else if (cig == BAM_CSOFT_CLIP)
119 sc_right += bam_get_cigar(p->b)[j] >> BAM_CIGAR_SHIFT;
120 else
121 break;
122 }
123 if (sc_right)
124 sc_right_dist = p->b->core.l_qseq - sc_right - p->qpos;
125
126 // Distance to nearest soft-clips and length of that clip.
127 if (sc_left_dist >= 0) {
128 if (sc_right_dist < 0 || sc_left_dist < sc_right_dist) {
129 *sc_len = sc_left;
130 *sc_dist = sc_left_dist;
131 }
132 } else if (sc_right_dist >= 0) {
133 *sc_len = sc_right;
134 *sc_dist = sc_right_dist;
135 } else {
136 *sc_len = 0;
137 *sc_dist = 0;
138 }
139
140 *len = p->b->core.l_qseq - sc_left - sc_right;
141 return edist;
142 }
143
bcf_callaux_clean(bcf_callaux_t * bca,bcf_call_t * call)144 void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
145 {
146 memset(bca->ref_pos,0,sizeof(int)*bca->npos);
147 memset(bca->alt_pos,0,sizeof(int)*bca->npos);
148 memset(bca->iref_pos,0,sizeof(int)*bca->npos);
149 memset(bca->ialt_pos,0,sizeof(int)*bca->npos);
150 memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
151 memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
152 memset(bca->iref_mq,0,sizeof(int)*bca->nqual);
153 memset(bca->ialt_mq,0,sizeof(int)*bca->nqual);
154 memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
155 memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
156 memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
157 memset(bca->rev_mqs,0,sizeof(int)*bca->nqual);
158 if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
159 if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
160 if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
161 memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
162 memset(bca->ref_scl, 0, 100*sizeof(int));
163 memset(bca->alt_scl, 0, 100*sizeof(int));
164 memset(bca->iref_scl, 0, 100*sizeof(int));
165 memset(bca->ialt_scl, 0, 100*sizeof(int));
166 }
167
168 /*
169 Notes:
170 - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.QS frequencies
171 which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the INFO/QS and FMT/QS annotations.
172 Later it's used for multiallelic calling by `call -m`, `call -mG` and `+trio-dnm`.
173 - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
174 */
175 /*
176 * This function is called once for each sample.
177 * _n is number of pilesups pl contributing reads to this sample
178 * pl is pointer to array of _n pileups (one pileup per read)
179 * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
180 * bca is the settings to perform calls across all samples
181 * r is the returned value of the call
182 */
bcf_call_glfgen(int _n,const bam_pileup1_t * pl,int ref_base,bcf_callaux_t * bca,bcf_callret1_t * r)183 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
184 {
185 int i, n, ref4, is_indel, ori_depth = 0;
186
187 // clean from previous run
188 r->ori_depth = 0;
189 r->mq0 = 0;
190 memset(r->anno,0,sizeof(double)*16);
191 memset(r->p,0,sizeof(float)*25);
192 r->SCR = 0;
193
194 if (ref_base >= 0) {
195 ref4 = seq_nt16_int[ref_base];
196 is_indel = 0;
197 } else ref4 = 4, is_indel = 1;
198 if (_n == 0) return -1;
199 // enlarge the bases array if necessary
200 if (bca->max_bases < _n) {
201 bca->max_bases = _n;
202 kroundup32(bca->max_bases);
203 bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
204 }
205
206 // fill the bases array
207 double nqual_over_60 = bca->nqual / 60.0;
208 int ADR_ref_missed[4] = {0};
209 int ADF_ref_missed[4] = {0};
210 for (i = n = 0; i < _n; ++i) {
211 const bam_pileup1_t *p = pl + i;
212 int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
213 if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
214 if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
215 if (p->is_del && !is_indel) continue;
216 ++ori_depth;
217 if (is_indel)
218 {
219 b = p->aux>>16&0x3f;
220 seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
221 if (q < bca->min_baseQ)
222 {
223 if (!p->indel && b < 4)
224 {
225 if (bam_is_rev(p->b))
226 ADR_ref_missed[b]++;
227 else
228 ADF_ref_missed[b]++;
229 }
230 continue;
231 }
232 if (p->indel == 0 && (q < _n/2 || _n > 20)) {
233 // high quality indel calls without p->indel set aren't
234 // particularly indicative of being a good REF match either,
235 // at least not in low coverage. So require solid coverage
236 // before we start utilising such quals.
237 b = 0;
238 q = (int)bam_get_qual(p->b)[p->qpos];
239 seqQ = (3*seqQ + 2*q)/8;
240 }
241 if (_n > 20 && seqQ > 40) seqQ = 40;
242 baseQ = p->aux>>8&0xff;
243
244 is_diff = (b != 0);
245 }
246 else
247 {
248 b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
249 b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
250
251 // Lowest of this and neighbour quality values
252 uint8_t *qual = bam_get_qual(p->b);
253 q = qual[p->qpos];
254 if (p->qpos > 0 &&
255 q > qual[p->qpos-1]+bca->delta_baseQ)
256 q = qual[p->qpos-1]+bca->delta_baseQ;
257 if (p->qpos+1 < p->b->core.l_qseq &&
258 q > qual[p->qpos+1]+bca->delta_baseQ)
259 q = qual[p->qpos+1]+bca->delta_baseQ;
260
261 if (q < bca->min_baseQ) continue;
262 if (q > bca->max_baseQ) q = bca->max_baseQ;
263 baseQ = q;
264 seqQ = 99;
265 is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
266 }
267 mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
268 if ( !mapQ ) r->mq0++;
269 if (q > seqQ) q = seqQ;
270 mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
271 if (q > mapQ) q = mapQ;
272 if (q > 63) q = 63;
273 if (q < 4) q = 4; // MQ=0 reads count as BQ=4
274 bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
275 // collect annotations
276 if (b < 4)
277 {
278 r->QS[b] += q;
279 if ( r->ADF )
280 {
281 if ( bam_is_rev(p->b) )
282 r->ADR[b]++;
283 else
284 r->ADF[b]++;
285 }
286 }
287 ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)];
288 min_dist = p->b->core.l_qseq - 1 - p->qpos;
289 if (min_dist > p->qpos) min_dist = p->qpos;
290 if (min_dist > CAP_DIST) min_dist = CAP_DIST;
291 r->anno[1<<2|is_diff<<1|0] += baseQ;
292 r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
293 r->anno[2<<2|is_diff<<1|0] += mapQ;
294 r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
295 r->anno[3<<2|is_diff<<1|0] += min_dist;
296 r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
297
298 // collect for bias tests
299 if ( baseQ > 59 ) baseQ = 59;
300 if ( mapQ > 59 ) mapQ = 59;
301 int len, epos = 0, sc_len = 0, sc_dist = 0;
302 if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) )
303 {
304 int pos = get_position(p, &len, &sc_len, &sc_dist);
305 epos = (double)pos/(len+1) * bca->npos;
306
307 if (sc_len) {
308 sc_len = 15.0*sc_len / sc_dist;
309 if (sc_len > 99) sc_len = 99;
310 }
311 }
312
313 int imq = mapQ * nqual_over_60;
314 int ibq = baseQ * nqual_over_60;
315
316 if ( bam_is_rev(p->b) )
317 bca->rev_mqs[imq]++;
318 else
319 bca->fwd_mqs[imq]++;
320
321 if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
322 {
323 bca->ref_pos[epos]++;
324 bca->ref_bq[ibq]++;
325 bca->ref_mq[imq]++;
326 bca->ref_scl[sc_len]++;
327 }
328 else
329 {
330 bca->alt_pos[epos]++;
331 bca->alt_bq[ibq]++;
332 bca->alt_mq[imq]++;
333 bca->alt_scl[sc_len]++;
334 }
335 }
336
337 // Compensate for AD not being counted on low quality REF indel matches.
338 if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 )
339 {
340 for (i=0; i<4; i++)
341 {
342 r->ADR[0] += ADR_ref_missed[i];
343 r->ADF[0] += ADF_ref_missed[i];
344 }
345 }
346 else if ( r->ADF && bca->ambig_reads==B2B_INC_AD )
347 {
348 int dp = 0, dp_ambig = 0;
349 for (i=0; i<4; i++) dp += r->ADR[i];
350 for (i=0; i<4; i++) dp_ambig += ADR_ref_missed[i];
351 if ( dp )
352 for (i=0; i<4; i++) r->ADR[i] += lroundf((float)dp_ambig * r->ADR[i]/dp);
353 dp = 0, dp_ambig = 0;
354 for (i=0; i<4; i++) dp += r->ADF[i];
355 for (i=0; i<4; i++) dp_ambig += ADF_ref_missed[i];
356 if ( dp )
357 for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
358 }
359
360 r->ori_depth = ori_depth;
361 // glfgen
362 errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
363 return n;
364 }
365
366
367 /*
368 * calc_vdb() - returns value between zero (most biased) and one (no bias)
369 * on success, or HUGE_VAL when VDB cannot be calculated because
370 * of insufficient depth (<2x)
371 *
372 * Variant Distance Bias tests if the variant bases are positioned within the
373 * reads with sufficient randomness. Unlike other tests, it looks only at
374 * variant reads and therefore gives different kind of information than Read
375 * Position Bias for instance. VDB was developed for detecting artefacts in
376 * RNA-seq calls where reads from spliced transcripts span splice site
377 * boundaries. The current implementation differs somewhat from the original
378 * version described in supplementary material of PMID:22524474, but the idea
379 * remains the same. (Here the random variable tested is the average distance
380 * from the averaged position, not the average pairwise distance.)
381 *
382 * For coverage of 2x, the calculation is exact but is approximated for the
383 * rest. The result is most accurate between 4-200x. For 3x or >200x, the
384 * reported values are slightly more favourable than those of a true random
385 * distribution.
386 */
calc_vdb(int * pos,int npos)387 double calc_vdb(int *pos, int npos)
388 {
389 // Note well: the parameters were obtained by fitting to simulated data of
390 // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen().
391 const int readlen = 100;
392 assert( npos==readlen );
393
394 #define nparam 15
395 const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5},
396 {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8},
397 {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7},
398 {200,0.7,23.7} };
399
400 int i, dp = 0;
401 float mean_pos = 0, mean_diff = 0;
402 for (i=0; i<npos; i++)
403 {
404 if ( !pos[i] ) continue;
405 dp += pos[i];
406 mean_pos += pos[i]*i;
407 }
408 if ( dp<2 ) return HUGE_VAL; // one or zero reads can be placed anywhere
409
410 mean_pos /= dp;
411 for (i=0; i<npos; i++)
412 {
413 if ( !pos[i] ) continue;
414 mean_diff += pos[i] * fabs(i - mean_pos);
415 }
416 mean_diff /= dp;
417
418 int ipos = mean_diff; // tuned for float-to-int implicit conversion
419 if ( dp==2 )
420 return (2*readlen-2*(ipos+1)-1)*(ipos+1)/(readlen-1)/(readlen*0.5);
421
422 if ( dp>=200 )
423 i = nparam; // shortcut for big depths
424 else
425 {
426 for (i=0; i<nparam; i++)
427 if ( param[i][0]>=dp ) break;
428 }
429 float pshift, pscale;
430 if ( i==nparam )
431 {
432 // the depth is too high, go with 200x
433 pscale = param[nparam-1][1];
434 pshift = param[nparam-1][2];
435 }
436 else if ( i>0 && param[i][0]!=dp )
437 {
438 // linear interpolation of parameters
439 pscale = (param[i-1][1] + param[i][1])*0.5;
440 pshift = (param[i-1][2] + param[i][2])*0.5;
441 }
442 else
443 {
444 pscale = param[i][1];
445 pshift = param[i][2];
446 }
447 return 0.5*kf_erfc(-(mean_diff-pshift)*pscale);
448 }
449
calc_chisq_bias(int * a,int * b,int n)450 double calc_chisq_bias(int *a, int *b, int n)
451 {
452 int na = 0, nb = 0, i, ndf = n;
453 for (i=0; i<n; i++) na += a[i];
454 for (i=0; i<n; i++) nb += b[i];
455 if ( !na || !nb ) return HUGE_VAL;
456
457 double chisq = 0;
458 for (i=0; i<n; i++)
459 {
460 if ( !a[i] && !b[i] ) ndf--;
461 else
462 {
463 double tmp = a[i] - b[i];
464 chisq += tmp*tmp/(a[i]+b[i]);
465 }
466 }
467 /*
468 kf_gammq: incomplete gamma function Q(a,x) = 1 - P(a,x) = Gamma(a,x)/Gamma(a)
469 1 if the distributions are identical, 0 if very different
470 */
471 double prob = kf_gammaq(0.5*ndf, 0.5*chisq);
472 return prob;
473 }
474
mann_whitney_1947_(int n,int m,int U)475 static double mann_whitney_1947_(int n, int m, int U)
476 {
477 if (U<0) return 0;
478 if (n==0||m==0) return U==0 ? 1 : 0;
479 return (double)n/(n+m)*mann_whitney_1947_(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947_(n,m-1,U);
480 }
481
mann_whitney_1947(int n,int m,int U)482 double mann_whitney_1947(int n, int m, int U)
483 {
484 #include "mw.h"
485
486 assert(n >= 2 && m >= 2);
487
488 return (n < 8 && m < 8 && U < 50)
489 ? mw[n-2][m-2][U]
490 : mann_whitney_1947_(n,m,U);
491 }
492
mann_whitney_1947_cdf(int n,int m,int U)493 double mann_whitney_1947_cdf(int n, int m, int U)
494 {
495 int i;
496 double sum = 0;
497 for (i=0; i<=U; i++)
498 sum += mann_whitney_1947(n,m,i);
499 return sum;
500 }
501
calc_mwu_bias_cdf(int * a,int * b,int n)502 double calc_mwu_bias_cdf(int *a, int *b, int n)
503 {
504 int na = 0, nb = 0, i;
505 double U = 0;
506 //double ties = 0;
507 for (i=0; i<n; i++)
508 {
509 na += a[i];
510 U += a[i] * (nb + b[i]*0.5);
511 nb += b[i];
512 // if ( a[i] && b[i] )
513 // {
514 // double tie = a[i] + b[i];
515 // ties += (tie*tie-1)*tie;
516 // }
517 }
518 if ( !na || !nb ) return HUGE_VAL;
519
520 // Always work with the smaller U
521 double U_min = ((double)na * nb) - U;
522 if ( U < U_min ) U_min = U;
523
524 if ( na==1 ) return 2.0 * (floor(U_min)+1) / (nb+1);
525 if ( nb==1 ) return 2.0 * (floor(U_min)+1) / (na+1);
526
527 // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
528 if ( na>=8 || nb>=8 )
529 {
530 double mean = ((double)na*nb)*0.5;
531 // Correction for ties:
532 // double N = na+nb;
533 // double var2 = (N*N-1)*N-ties;
534 // if ( var2==0 ) return 1.0;
535 // var2 *= ((double)na*nb)/N/(N-1)/12.0;
536 // No correction for ties:
537 double var2 = ((double)na*nb)*(na+nb+1)/12.0;
538 double z = (U_min - mean)/sqrt(2*var2); // z is N(0,1)
539 return 2.0 - kf_erfc(z); // which is 1 + erf(z)
540 }
541
542 // Exact calculation
543 double pval = 2*mann_whitney_1947_cdf(na,nb,U_min);
544 return pval>1 ? 1 : pval;
545 }
546
calc_mwu_bias(int * a,int * b,int n,int left)547 double calc_mwu_bias(int *a, int *b, int n, int left)
548 {
549 int na = 0, nb = 0, i;
550 double U = 0;
551 // double ties = 0;
552 for (i=0; i<n; i++)
553 {
554 if (!a[i]) {
555 if (!b[i]) continue;
556 nb += b[i];
557 } else if (!b[i]) {
558 na += a[i];
559 U += a[i] * nb;
560 } else {
561 na += a[i];
562 U += a[i] * (nb + b[i]*0.5);
563 nb += b[i];
564 // double tie = a[i] + b[i];
565 // ties += (tie*tie-1)*tie;
566 }
567 }
568 if ( !na || !nb ) return HUGE_VAL;
569 if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely
570
571 double mean = ((double)na*nb)*0.5;
572 if (left && U > mean) return 1; // for MQB which is asymmetrical
573 if ( na==2 || nb==2 )
574 {
575 // Linear approximation
576 return U>mean ? (2.0*mean-U)/mean : U/mean;
577 }
578 // Correction for ties:
579 // double N = na+nb;
580 // double var2 = (N*N-1)*N-ties;
581 // if ( var2==0 ) return 1.0;
582 // var2 *= ((double)na*nb)/N/(N-1)/12.0;
583 // No correction for ties:
584 double var2 = ((double)na*nb)*(na+nb+1)/12.0;
585 if ( na>=8 || nb>=8 )
586 {
587 // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
588 return exp(-0.5*(U-mean)*(U-mean)/var2);
589 }
590
591 // Exact calculation
592 return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
593 }
594
595 // A Z-score version of the above function.
596 //
597 // See "Normal approximation and tie correction" at
598 // https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
599 //
600 // The Z score is the number of standard deviations above or below the mean
601 // with 0 being equality of the two distributions and +ve/-ve from there.
602 //
603 // This is a more robust score to filter on.
calc_mwu_biasZ(int * a,int * b,int n,int left_only,int do_Z)604 double calc_mwu_biasZ(int *a, int *b, int n, int left_only, int do_Z) {
605 int i;
606 int64_t t;
607
608 // Optimisation
609 for (i = 0; i < n; i++)
610 if (b[i])
611 break;
612 int b_empty = (i == n);
613
614 // Count equal (e), less-than (l) and greater-than (g) permutations.
615 int e = 0, l = 0, na = 0, nb = 0;
616 if (b_empty) {
617 for (t = 0, i = n-1; i >= 0; i--) {
618 na += a[i];
619 t += (a[i]*a[i]-1)*a[i]; // adjustment score for ties
620 }
621 } else {
622 for (t = 0, i = n-1; i >= 0; i--) {
623 // Combinations of a[i] and b[j] for i==j
624 e += a[i]*b[i];
625
626 // nb is running total of b[i+1]..b[n-1].
627 // Therefore a[i]*nb is the number of combinations of a[i] and b[j]
628 // for all i < j.
629 l += a[i]*nb; // a<b
630
631 na += a[i];
632 nb += b[i];
633 int p = a[i]+b[i];
634 t += (p*p-1)*p; // adjustment score for ties
635 }
636 }
637
638 if (!na || !nb)
639 return HUGE_VAL;
640
641 double U, m;
642 U = l + e*0.5; // Mann-Whitney U score
643 m = na*nb / 2.0;
644
645 // With ties adjustment
646 double var2 = (na*nb)/12.0 * ((na+nb+1) - t/(double)((na+nb)*(na+nb-1)));
647 // var = na*nb*(na+nb+1)/12.0; // simpler; minus tie adjustment
648 if (var2 <= 0)
649 return do_Z ? 0 : 1;
650
651 if (do_Z) {
652 // S.D. normalised Z-score
653 //Z = (U - m - (U-m >= 0 ? 0.5 : -0.5)) / sd; // gatk method?
654 return (U - m) / sqrt(var2);
655 }
656
657 // Else U score, which can be asymmetric for some data types.
658 if (left_only && U > m)
659 return HUGE_VAL; // one-sided, +ve bias is OK, -ve is not.
660
661 if (na >= 8 || nb >= 8) {
662 // Normal approximation, very good for na>=8 && nb>=8 and
663 // reasonable if na<8 or nb<8
664 return exp(-0.5*(U-m)*(U-m)/var2);
665 }
666
667 // Exact calculation
668 if (na==1 || nb == 1)
669 return mann_whitney_1947_(na, nb, U) * sqrt(2*M_PI*var2);
670 else
671 return mann_whitney_1947(na, nb, U) * sqrt(2*M_PI*var2);
672 }
673
logsumexp2(double a,double b)674 static inline double logsumexp2(double a, double b)
675 {
676 if ( a>b )
677 return log(1 + exp(b-a)) + a;
678 else
679 return log(1 + exp(a-b)) + b;
680 }
681
calc_SegBias(const bcf_callret1_t * bcr,bcf_call_t * call)682 void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call)
683 {
684 call->seg_bias = HUGE_VAL;
685 if ( !bcr ) return;
686
687 int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads
688 if ( !nr ) return;
689
690 int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n; // average depth
691 double M = floor((double)nr / avg_dp + 0.5); // an approximate number of variants samples in the population
692 if ( M>call->n ) M = call->n; // clamp M at the number of samples
693 else if ( M==0 ) M = 1;
694 double f = M / 2. / call->n; // allele frequency
695 double p = (double) nr / call->n; // number of variant reads per sample expected if variant not real (poisson)
696 double q = (double) nr / M; // number of variant reads per sample expected if variant is real (poisson)
697 double sum = 0;
698 const double log2 = log(2.0);
699
700 // fprintf(stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp);
701 int i;
702 for (i=0; i<call->n; i++)
703 {
704 int oi = bcr[i].anno[2] + bcr[i].anno[3]; // observed number of non-ref reads
705 double tmp;
706 if ( oi )
707 {
708 // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow
709 tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q);
710 tmp += log(f) + oi*log(q/p) - q + p;
711 }
712 else
713 tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p;
714 sum += tmp;
715 // fprintf(stderr,"oi=%d %e\n", oi,tmp);
716 }
717 call->seg_bias = sum;
718 }
719
720 /**
721 * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles
722 * @n: number of samples
723 * @calls: each sample's calls
724 * @bca: auxiliary data structure for holding temporary values
725 * @ref_base: the reference base
726 * @call: filled with the annotations
727 *
728 * Combines calls across the various samples being studied
729 * 1. For each allele at each base across all samples the quality is summed so
730 * you end up with a set of quality sums for each allele present 2. The quality
731 * sums are sorted.
732 * 3. Using the sorted quality sums we now create the allele ordering array
733 * A\subN. This is done by doing the following:
734 * a) If the reference allele is known it always comes first, otherwise N
735 * comes first.
736 * b) Then the rest of the alleles are output in descending order of quality
737 * sum (which we already know the qsum array was sorted). Any allelles with
738 * qsum 0 will be excluded.
739 * 4. Using the allele ordering array we create the genotype ordering array.
740 * In the worst case with an unknown reference this will be: A0/A0 A1/A0 A1/A1
741 * A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4
742 * 5. The genotype ordering array is then used to extract data from the error
743 * model 5*5 matrix and is used to produce a Phread likelihood array for each
744 * sample.
745 */
bcf_call_combine(int n,const bcf_callret1_t * calls,bcf_callaux_t * bca,int ref_base,bcf_call_t * call)746 int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
747 {
748 int ref4, i, j;
749 float qsum[B2B_MAX_ALLELES] = {0,0,0,0,0};
750 if (ref_base >= 0) {
751 call->ori_ref = ref4 = seq_nt16_int[ref_base];
752 if (ref4 > 4) ref4 = 4;
753 } else call->ori_ref = -1, ref4 = 0;
754
755 // calculate qsum, this is done by summing normalized qsum across all samples,
756 // to account for differences in coverage
757 for (i = 0; i < n; ++i)
758 {
759 float sum = 0;
760 for (j = 0; j < 4; ++j) sum += calls[i].QS[j];
761 if ( sum )
762 for (j = 0; j < 4; j++) qsum[j] += (float)calls[i].QS[j] / sum;
763 }
764
765 // sort qsum in ascending order (insertion sort)
766 float *ptr[5], *tmp;
767 for (i=0; i<5; i++) ptr[i] = &qsum[i];
768 for (i=1; i<4; i++)
769 for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--)
770 tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp;
771
772 // Set the reference allele and alternative allele(s)
773 for (i=0; i<5; i++) call->a[i] = -1;
774 for (i=0; i<B2B_MAX_ALLELES; i++) call->qsum[i] = 0;
775 call->unseen = -1;
776 call->a[0] = ref4;
777 for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering
778 {
779 int ipos = ptr[i] - qsum; // position in sorted qsum array
780 if ( ipos==ref4 )
781 call->qsum[0] = qsum[ipos]; // REF's qsum
782 else
783 {
784 if ( !qsum[ipos] ) break; // qsum is 0, this and consequent alleles are not seen in the pileup
785 call->qsum[j] = qsum[ipos];
786 call->a[j++] = ipos;
787 }
788 }
789 if (ref_base >= 0)
790 {
791 // for SNPs, find the "unseen" base
792 if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
793 call->unseen = j, call->a[j++] = ptr[i] - qsum;
794 call->n_alleles = j;
795 }
796 else
797 {
798 call->n_alleles = j;
799 if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
800 }
801 /*
802 * Set the phread likelihood array (call->PL) This array is 15 entries long
803 * for each sample because that is size of an upper or lower triangle of a
804 * worst case 5x5 matrix of possible genotypes. This worst case matrix will
805 * occur when all 4 possible alleles are present and the reference allele
806 * is unknown. The sides of the matrix will correspond to the reference
807 * allele (if known) followed by the alleles present in descending order of
808 * quality sum
809 */
810 {
811 int x, g[15], z;
812 double sum_min = 0.;
813 x = call->n_alleles * (call->n_alleles + 1) / 2;
814 // get the possible genotypes
815 // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix
816 for (i = z = 0; i < call->n_alleles; ++i) {
817 for (j = 0; j <= i; ++j) {
818 g[z++] = call->a[j] * 5 + call->a[i];
819 }
820 }
821 // for each sample calculate the PL
822 for (i = 0; i < n; ++i)
823 {
824 int32_t *PL = call->PL + x * i;
825 const bcf_callret1_t *r = calls + i;
826 float min = FLT_MAX;
827 for (j = 0; j < x; ++j) {
828 if (min > r->p[g[j]]) min = r->p[g[j]];
829 }
830 sum_min += min;
831 for (j = 0; j < x; ++j) {
832 int y;
833 y = (int)(r->p[g[j]] - min + .499);
834 if (y > 255) y = 255;
835 PL[j] = y;
836 }
837 }
838 if ( call->DP4 )
839 {
840 for (i=0; i<n; i++)
841 {
842 call->DP4[4*i] = calls[i].anno[0];
843 call->DP4[4*i+1] = calls[i].anno[1];
844 call->DP4[4*i+2] = calls[i].anno[2];
845 call->DP4[4*i+3] = calls[i].anno[3];
846 }
847 }
848 if ( call->SCR )
849 {
850 for (i=0; i<n; i++)
851 {
852 call->SCR[0] += calls[i].SCR;
853 call->SCR[1+i] = calls[i].SCR;
854 }
855 }
856 if ( call->ADF )
857 {
858 assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well
859
860 // reorder ADR,ADF to match the allele ordering at this site
861 int32_t tmp[B2B_MAX_ALLELES];
862 int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES;
863 int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES;
864 int32_t *adr_tot = call->ADR; // the first bin stores total counts per site
865 int32_t *adf_tot = call->ADF;
866 for (i=0; i<n; i++)
867 {
868 for (j=0; j<call->n_alleles; j++)
869 {
870 tmp[j] = adr[ call->a[j] ];
871 adr_tot[j] += tmp[j];
872 }
873 for (j=0; j<call->n_alleles; j++) adr_out[j] = tmp[j];
874 for (j=0; j<call->n_alleles; j++)
875 {
876 tmp[j] = adf[ call->a[j] ];
877 adf_tot[j] += tmp[j];
878 }
879 for (j=0; j<call->n_alleles; j++) adf_out[j] = tmp[j];
880 adf_out += call->n_alleles;
881 adr_out += call->n_alleles;
882 adr += B2B_MAX_ALLELES;
883 adf += B2B_MAX_ALLELES;
884 }
885 }
886 if ( bca->fmt_flag & B2B_FMT_QS )
887 {
888 assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well
889
890 // reorder QS to match the allele ordering at this site
891 int32_t tmp[B2B_MAX_ALLELES];
892 int32_t *qs = call->QS, *qs_out = call->QS;
893 for (i=0; i<n; i++)
894 {
895 for (j=0; j<call->n_alleles; j++) tmp[j] = qs[ call->a[j] ];
896 for (j=0; j<call->n_alleles; j++) qs_out[j] = tmp[j] < BCF_MAX_BT_INT32 ? tmp[j] : BCF_MAX_BT_INT32;
897 qs_out += call->n_alleles;
898 qs += B2B_MAX_ALLELES;
899 }
900 }
901
902 // if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
903 call->shift = (int)(sum_min + .499);
904 }
905 // combine annotations
906 memset(call->anno, 0, 16 * sizeof(double));
907 call->ori_depth = 0;
908 call->depth = 0;
909 call->mq0 = 0;
910 for (i = 0; i < n; ++i) {
911 call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3];
912 call->ori_depth += calls[i].ori_depth;
913 call->mq0 += calls[i].mq0;
914 for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
915 }
916
917 calc_SegBias(calls, call);
918
919 // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
920 // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
921 // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
922
923 if (bca->fmt_flag & B2B_INFO_ZSCORE) {
924 // U z-normalised as +/- number of standard deviations from mean.
925 if (call->ori_ref < 0) {
926 if (bca->fmt_flag & B2B_INFO_RPB)
927 call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
928 bca->npos, 0, 1);
929 call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq,
930 bca->nqual,1,1);
931 if ( bca->fmt_flag & B2B_INFO_SCB )
932 call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl,
933 100, 0,1);
934 } else {
935 if (bca->fmt_flag & B2B_INFO_RPB)
936 call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
937 bca->npos, 0, 1);
938 call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
939 bca->nqual,1,1);
940 call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
941 bca->nqual,0,1);
942 call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
943 bca->nqual,0,1);
944 if ( bca->fmt_flag & B2B_INFO_SCB )
945 call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
946 100, 0,1);
947 }
948 } else {
949 // Old method; U as probability between 0 and 1
950 if ( bca->fmt_flag & B2B_INFO_RPB )
951 call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
952 bca->npos, 0, 0);
953 call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
954 bca->nqual, 1, 0);
955 call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
956 bca->nqual, 0, 0);
957 call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
958 bca->nqual, 0, 0);
959 }
960
961 #if CDF_MWU_TESTS
962 // CDF version of MWU tests is not calculated by default
963 if ( bca->fmt_flag & B2B_INFO_RPB )
964 call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos);
965 call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual);
966 call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual);
967 call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
968 #endif
969
970 if ( bca->fmt_flag & B2B_INFO_VDB )
971 call->vdb = calc_vdb(bca->alt_pos, bca->npos);
972
973 return 0;
974 }
975
bcf_call2bcf(bcf_call_t * bc,bcf1_t * rec,bcf_callret1_t * bcr,int fmt_flag,const bcf_callaux_t * bca,const char * ref)976 int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
977 {
978 extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
979 int i, j, nals = 1;
980
981 bcf_hdr_t *hdr = bc->bcf_hdr;
982 rec->rid = bc->tid;
983 rec->pos = bc->pos;
984 rec->qual = 0;
985
986 bc->tmp.l = 0;
987 if (bc->ori_ref < 0) // indel
988 {
989 // REF
990 kputc(ref[bc->pos], &bc->tmp);
991 for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
992
993 // ALT
994 for (i=1; i<4; i++)
995 {
996 if (bc->a[i] < 0) break;
997 kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp);
998
999 if (bca->indel_types[bc->a[i]] < 0) { // deletion
1000 for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
1001 kputc(ref[bc->pos+1+j], &bc->tmp);
1002 } else { // insertion; cannot be a reference unless a bug
1003 char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
1004 for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
1005 kputc("ACGTN"[(int)inscns[j]], &bc->tmp);
1006 for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
1007 }
1008 nals++;
1009 }
1010 }
1011 else // SNP
1012 {
1013 kputc("ACGTN"[bc->ori_ref], &bc->tmp);
1014 for (i=1; i<5; i++)
1015 {
1016 if (bc->a[i] < 0) break;
1017 kputc(',', &bc->tmp);
1018 if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
1019 else kputc("ACGT"[bc->a[i]], &bc->tmp);
1020 nals++;
1021 }
1022 }
1023 bcf_update_alleles_str(hdr, rec, bc->tmp.s);
1024
1025 bc->tmp.l = 0;
1026
1027 // INFO
1028 if (bc->ori_ref < 0)
1029 {
1030 bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
1031 bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
1032 bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
1033 }
1034 bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
1035 if ( fmt_flag&B2B_INFO_ADF )
1036 bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele);
1037 if ( fmt_flag&B2B_INFO_ADR )
1038 bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele);
1039 if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) )
1040 {
1041 for (i=0; i<rec->n_allele; i++) bc->ADF[i] += bc->ADR[i];
1042 if ( fmt_flag&B2B_INFO_AD )
1043 bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele);
1044 if ( fmt_flag&B2B_INFO_DPR )
1045 bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele);
1046 }
1047 if ( fmt_flag&B2B_INFO_SCR )
1048 bcf_update_info_int32(hdr, rec, "SCR", bc->SCR, 1);
1049
1050 float tmpf[16];
1051 for (i=0; i<16; i++) tmpf[i] = bc->anno[i];
1052 bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
1053 bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
1054
1055 if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
1056 if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
1057
1058 if (bca->fmt_flag & B2B_INFO_ZSCORE) {
1059 if ( bc->mwu_pos != HUGE_VAL )
1060 bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
1061 if ( bc->mwu_mq != HUGE_VAL )
1062 bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
1063 if ( bc->mwu_mqs != HUGE_VAL )
1064 bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
1065 if ( bc->mwu_bq != HUGE_VAL )
1066 bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
1067 if ( bc->mwu_sc != HUGE_VAL )
1068 bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
1069 } else {
1070 if ( bc->mwu_pos != HUGE_VAL )
1071 bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
1072 if ( bc->mwu_mq != HUGE_VAL )
1073 bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
1074 if ( bc->mwu_mqs != HUGE_VAL )
1075 bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
1076 if ( bc->mwu_bq != HUGE_VAL )
1077 bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
1078 }
1079
1080 if ( bc->strand_bias != HUGE_VAL )
1081 bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
1082
1083 #if CDF_MWU_TESTS
1084 if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
1085 if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
1086 if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
1087 if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
1088 #endif
1089 tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
1090 bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
1091
1092 // FORMAT
1093 rec->n_sample = bc->n;
1094 bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample);
1095 if ( fmt_flag&B2B_FMT_DP )
1096 {
1097 int32_t *ptr = (int32_t*) bc->fmt_arr;
1098 for (i=0; i<bc->n; i++)
1099 ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3];
1100 bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample);
1101 }
1102 if ( fmt_flag&B2B_FMT_DV )
1103 {
1104 int32_t *ptr = (int32_t*) bc->fmt_arr;
1105 for (i=0; i<bc->n; i++)
1106 ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3];
1107 bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample);
1108 }
1109 if ( fmt_flag&B2B_FMT_SP )
1110 {
1111 int32_t *ptr = (int32_t*) bc->fmt_arr;
1112 for (i=0; i<bc->n; i++)
1113 {
1114 int fwd_ref = bc->DP4[4*i], rev_ref = bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3];
1115 if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 )
1116 ptr[i] = 0;
1117 else
1118 {
1119 double left, right, two;
1120 kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two);
1121 int32_t x = (int)(-4.343 * log(two) + .499);
1122 if (x > 255) x = 255;
1123 ptr[i] = x;
1124 }
1125 }
1126 bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample);
1127 }
1128 if ( fmt_flag&B2B_FMT_DP4 )
1129 bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4);
1130 if ( fmt_flag&B2B_FMT_ADF )
1131 bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
1132 if ( fmt_flag&B2B_FMT_ADR )
1133 bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
1134 if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) )
1135 {
1136 for (i=0; i<rec->n_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i];
1137 if ( fmt_flag&B2B_FMT_AD )
1138 bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
1139 if ( fmt_flag&B2B_FMT_DPR )
1140 bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
1141 }
1142 if ( fmt_flag&B2B_FMT_SCR )
1143 bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample);
1144 if ( fmt_flag&B2B_FMT_QS )
1145 bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele);
1146
1147 return 0;
1148 }
1149