1 #include "bcftools.pysam.h"
2
3 /* bam2bcf.c -- variant calling.
4
5 Copyright (C) 2010-2012 Broad Institute.
6 Copyright (C) 2012-2021 Genome Research Ltd.
7
8 Author: Heng Li <lh3@sanger.ac.uk>
9
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE. */
27
28 #include <math.h>
29 #include <stdint.h>
30 #include <assert.h>
31 #include <float.h>
32 #include <htslib/hts.h>
33 #include <htslib/sam.h>
34 #include <htslib/kstring.h>
35 #include <htslib/kfunc.h>
36 #include "bam2bcf.h"
37
38 extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
39
40 #define CALL_DEFTHETA 0.83
41 #define DEF_MAPQ 20
42
43 #define CAP_DIST 25
44
bcf_call_init(double theta,int min_baseQ,int max_baseQ,int delta_baseQ)45 bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ,
46 int delta_baseQ)
47 {
48 bcf_callaux_t *bca;
49 if (theta <= 0.) theta = CALL_DEFTHETA;
50 bca = (bcf_callaux_t*) calloc(1, sizeof(bcf_callaux_t));
51 bca->capQ = 60;
52 bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
53 bca->min_baseQ = min_baseQ;
54 bca->max_baseQ = max_baseQ;
55 bca->delta_baseQ = delta_baseQ;
56 bca->e = errmod_init(1. - theta);
57 bca->min_frac = 0.002;
58 bca->min_support = 1;
59 bca->per_sample_flt = 0;
60 bca->npos = 100;
61 bca->ref_pos = (int*) malloc(bca->npos*sizeof(int));
62 bca->alt_pos = (int*) malloc(bca->npos*sizeof(int));
63 bca->iref_pos= (int*) malloc(bca->npos*sizeof(int));
64 bca->ialt_pos= (int*) malloc(bca->npos*sizeof(int));
65 bca->nqual = 60;
66 bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int));
67 bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int));
68 bca->iref_mq = (int*) malloc(bca->nqual*sizeof(int));
69 bca->ialt_mq = (int*) malloc(bca->nqual*sizeof(int));
70 bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int));
71 bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int));
72 bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int));
73 bca->rev_mqs = (int*) malloc(bca->nqual*sizeof(int));
74 return bca;
75 }
76
bcf_call_destroy(bcf_callaux_t * bca)77 void bcf_call_destroy(bcf_callaux_t *bca)
78 {
79 if (bca == 0) return;
80 errmod_destroy(bca->e);
81 if (bca->npos) {
82 free(bca->ref_pos); free(bca->alt_pos);
83 free(bca->iref_pos); free(bca->ialt_pos);
84 bca->npos = 0;
85 }
86 free(bca->ref_mq); free(bca->alt_mq);
87 free(bca->iref_mq); free(bca->ialt_mq);
88 free(bca->ref_bq); free(bca->alt_bq);
89 free(bca->fwd_mqs); free(bca->rev_mqs);
90 bca->nqual = 0;
91 free(bca->bases); free(bca->inscns); free(bca);
92 }
93
94 // position in the sequence with respect to the aligned part of the read
get_position(const bam_pileup1_t * p,int * len,int * sc_len,int * sc_dist)95 static int get_position(const bam_pileup1_t *p, int *len,
96 int *sc_len, int *sc_dist) {
97 int i, j, edist = p->qpos + 1;
98 int sc_left = 0, sc_right = 0;
99 int sc_left_dist = -1, sc_right_dist = -1;
100
101 // left end
102 for (i = 0; i < p->b->core.n_cigar; i++) {
103 int cig = bam_get_cigar(p->b)[i] & BAM_CIGAR_MASK;
104 if (cig == BAM_CHARD_CLIP)
105 continue;
106 else if (cig == BAM_CSOFT_CLIP)
107 sc_left += bam_get_cigar(p->b)[i] >> BAM_CIGAR_SHIFT;
108 else
109 break;
110 }
111 if (sc_left)
112 sc_left_dist = p->qpos+1 - sc_left;
113 edist -= sc_left;
114
115 // right end
116 for (j = p->b->core.n_cigar-1; j >= i; j--) {
117 int cig = bam_get_cigar(p->b)[j] & BAM_CIGAR_MASK;
118 if (cig == BAM_CHARD_CLIP)
119 continue;
120 else if (cig == BAM_CSOFT_CLIP)
121 sc_right += bam_get_cigar(p->b)[j] >> BAM_CIGAR_SHIFT;
122 else
123 break;
124 }
125 if (sc_right)
126 sc_right_dist = p->b->core.l_qseq - sc_right - p->qpos;
127
128 // Distance to nearest soft-clips and length of that clip.
129 if (sc_left_dist >= 0) {
130 if (sc_right_dist < 0 || sc_left_dist < sc_right_dist) {
131 *sc_len = sc_left;
132 *sc_dist = sc_left_dist;
133 }
134 } else if (sc_right_dist >= 0) {
135 *sc_len = sc_right;
136 *sc_dist = sc_right_dist;
137 } else {
138 *sc_len = 0;
139 *sc_dist = 0;
140 }
141
142 *len = p->b->core.l_qseq - sc_left - sc_right;
143 return edist;
144 }
145
bcf_callaux_clean(bcf_callaux_t * bca,bcf_call_t * call)146 void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
147 {
148 memset(bca->ref_pos,0,sizeof(int)*bca->npos);
149 memset(bca->alt_pos,0,sizeof(int)*bca->npos);
150 memset(bca->iref_pos,0,sizeof(int)*bca->npos);
151 memset(bca->ialt_pos,0,sizeof(int)*bca->npos);
152 memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
153 memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
154 memset(bca->iref_mq,0,sizeof(int)*bca->nqual);
155 memset(bca->ialt_mq,0,sizeof(int)*bca->nqual);
156 memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
157 memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
158 memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
159 memset(bca->rev_mqs,0,sizeof(int)*bca->nqual);
160 if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
161 if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
162 if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
163 memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
164 memset(bca->ref_scl, 0, 100*sizeof(int));
165 memset(bca->alt_scl, 0, 100*sizeof(int));
166 memset(bca->iref_scl, 0, 100*sizeof(int));
167 memset(bca->ialt_scl, 0, 100*sizeof(int));
168 }
169
170 /*
171 Notes:
172 - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.QS frequencies
173 which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the INFO/QS and FMT/QS annotations.
174 Later it's used for multiallelic calling by `call -m`, `call -mG` and `+trio-dnm`.
175 - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
176 */
177 /*
178 * This function is called once for each sample.
179 * _n is number of pilesups pl contributing reads to this sample
180 * pl is pointer to array of _n pileups (one pileup per read)
181 * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
182 * bca is the settings to perform calls across all samples
183 * r is the returned value of the call
184 */
bcf_call_glfgen(int _n,const bam_pileup1_t * pl,int ref_base,bcf_callaux_t * bca,bcf_callret1_t * r)185 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
186 {
187 int i, n, ref4, is_indel, ori_depth = 0;
188
189 // clean from previous run
190 r->ori_depth = 0;
191 r->mq0 = 0;
192 memset(r->anno,0,sizeof(double)*16);
193 memset(r->p,0,sizeof(float)*25);
194 r->SCR = 0;
195
196 if (ref_base >= 0) {
197 ref4 = seq_nt16_int[ref_base];
198 is_indel = 0;
199 } else ref4 = 4, is_indel = 1;
200 if (_n == 0) return -1;
201 // enlarge the bases array if necessary
202 if (bca->max_bases < _n) {
203 bca->max_bases = _n;
204 kroundup32(bca->max_bases);
205 bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
206 }
207
208 // fill the bases array
209 double nqual_over_60 = bca->nqual / 60.0;
210 int ADR_ref_missed[4] = {0};
211 int ADF_ref_missed[4] = {0};
212 for (i = n = 0; i < _n; ++i) {
213 const bam_pileup1_t *p = pl + i;
214 int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
215 if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
216 if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
217 if (p->is_del && !is_indel) continue;
218 ++ori_depth;
219 if (is_indel)
220 {
221 b = p->aux>>16&0x3f;
222 seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
223 if (q < bca->min_baseQ)
224 {
225 if (!p->indel && b < 4)
226 {
227 if (bam_is_rev(p->b))
228 ADR_ref_missed[b]++;
229 else
230 ADF_ref_missed[b]++;
231 }
232 continue;
233 }
234 if (p->indel == 0 && (q < _n/2 || _n > 20)) {
235 // high quality indel calls without p->indel set aren't
236 // particularly indicative of being a good REF match either,
237 // at least not in low coverage. So require solid coverage
238 // before we start utilising such quals.
239 b = 0;
240 q = (int)bam_get_qual(p->b)[p->qpos];
241 seqQ = (3*seqQ + 2*q)/8;
242 }
243 if (_n > 20 && seqQ > 40) seqQ = 40;
244 baseQ = p->aux>>8&0xff;
245
246 is_diff = (b != 0);
247 }
248 else
249 {
250 b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
251 b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
252
253 // Lowest of this and neighbour quality values
254 uint8_t *qual = bam_get_qual(p->b);
255 q = qual[p->qpos];
256 if (p->qpos > 0 &&
257 q > qual[p->qpos-1]+bca->delta_baseQ)
258 q = qual[p->qpos-1]+bca->delta_baseQ;
259 if (p->qpos+1 < p->b->core.l_qseq &&
260 q > qual[p->qpos+1]+bca->delta_baseQ)
261 q = qual[p->qpos+1]+bca->delta_baseQ;
262
263 if (q < bca->min_baseQ) continue;
264 if (q > bca->max_baseQ) q = bca->max_baseQ;
265 baseQ = q;
266 seqQ = 99;
267 is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
268 }
269 mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
270 if ( !mapQ ) r->mq0++;
271 if (q > seqQ) q = seqQ;
272 mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
273 if (q > mapQ) q = mapQ;
274 if (q > 63) q = 63;
275 if (q < 4) q = 4; // MQ=0 reads count as BQ=4
276 bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
277 // collect annotations
278 if (b < 4)
279 {
280 r->QS[b] += q;
281 if ( r->ADF )
282 {
283 if ( bam_is_rev(p->b) )
284 r->ADR[b]++;
285 else
286 r->ADF[b]++;
287 }
288 }
289 ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)];
290 min_dist = p->b->core.l_qseq - 1 - p->qpos;
291 if (min_dist > p->qpos) min_dist = p->qpos;
292 if (min_dist > CAP_DIST) min_dist = CAP_DIST;
293 r->anno[1<<2|is_diff<<1|0] += baseQ;
294 r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
295 r->anno[2<<2|is_diff<<1|0] += mapQ;
296 r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
297 r->anno[3<<2|is_diff<<1|0] += min_dist;
298 r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
299
300 // collect for bias tests
301 if ( baseQ > 59 ) baseQ = 59;
302 if ( mapQ > 59 ) mapQ = 59;
303 int len, epos = 0, sc_len = 0, sc_dist = 0;
304 if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) )
305 {
306 int pos = get_position(p, &len, &sc_len, &sc_dist);
307 epos = (double)pos/(len+1) * bca->npos;
308
309 if (sc_len) {
310 sc_len = 15.0*sc_len / sc_dist;
311 if (sc_len > 99) sc_len = 99;
312 }
313 }
314
315 int imq = mapQ * nqual_over_60;
316 int ibq = baseQ * nqual_over_60;
317
318 if ( bam_is_rev(p->b) )
319 bca->rev_mqs[imq]++;
320 else
321 bca->fwd_mqs[imq]++;
322
323 if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
324 {
325 bca->ref_pos[epos]++;
326 bca->ref_bq[ibq]++;
327 bca->ref_mq[imq]++;
328 bca->ref_scl[sc_len]++;
329 }
330 else
331 {
332 bca->alt_pos[epos]++;
333 bca->alt_bq[ibq]++;
334 bca->alt_mq[imq]++;
335 bca->alt_scl[sc_len]++;
336 }
337 }
338
339 // Compensate for AD not being counted on low quality REF indel matches.
340 if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 )
341 {
342 for (i=0; i<4; i++)
343 {
344 r->ADR[0] += ADR_ref_missed[i];
345 r->ADF[0] += ADF_ref_missed[i];
346 }
347 }
348 else if ( r->ADF && bca->ambig_reads==B2B_INC_AD )
349 {
350 int dp = 0, dp_ambig = 0;
351 for (i=0; i<4; i++) dp += r->ADR[i];
352 for (i=0; i<4; i++) dp_ambig += ADR_ref_missed[i];
353 if ( dp )
354 for (i=0; i<4; i++) r->ADR[i] += lroundf((float)dp_ambig * r->ADR[i]/dp);
355 dp = 0, dp_ambig = 0;
356 for (i=0; i<4; i++) dp += r->ADF[i];
357 for (i=0; i<4; i++) dp_ambig += ADF_ref_missed[i];
358 if ( dp )
359 for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
360 }
361
362 r->ori_depth = ori_depth;
363 // glfgen
364 errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
365 return n;
366 }
367
368
369 /*
370 * calc_vdb() - returns value between zero (most biased) and one (no bias)
371 * on success, or HUGE_VAL when VDB cannot be calculated because
372 * of insufficient depth (<2x)
373 *
374 * Variant Distance Bias tests if the variant bases are positioned within the
375 * reads with sufficient randomness. Unlike other tests, it looks only at
376 * variant reads and therefore gives different kind of information than Read
377 * Position Bias for instance. VDB was developed for detecting artefacts in
378 * RNA-seq calls where reads from spliced transcripts span splice site
379 * boundaries. The current implementation differs somewhat from the original
380 * version described in supplementary material of PMID:22524474, but the idea
381 * remains the same. (Here the random variable tested is the average distance
382 * from the averaged position, not the average pairwise distance.)
383 *
384 * For coverage of 2x, the calculation is exact but is approximated for the
385 * rest. The result is most accurate between 4-200x. For 3x or >200x, the
386 * reported values are slightly more favourable than those of a true random
387 * distribution.
388 */
calc_vdb(int * pos,int npos)389 double calc_vdb(int *pos, int npos)
390 {
391 // Note well: the parameters were obtained by fitting to simulated data of
392 // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen().
393 const int readlen = 100;
394 assert( npos==readlen );
395
396 #define nparam 15
397 const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5},
398 {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8},
399 {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7},
400 {200,0.7,23.7} };
401
402 int i, dp = 0;
403 float mean_pos = 0, mean_diff = 0;
404 for (i=0; i<npos; i++)
405 {
406 if ( !pos[i] ) continue;
407 dp += pos[i];
408 mean_pos += pos[i]*i;
409 }
410 if ( dp<2 ) return HUGE_VAL; // one or zero reads can be placed anywhere
411
412 mean_pos /= dp;
413 for (i=0; i<npos; i++)
414 {
415 if ( !pos[i] ) continue;
416 mean_diff += pos[i] * fabs(i - mean_pos);
417 }
418 mean_diff /= dp;
419
420 int ipos = mean_diff; // tuned for float-to-int implicit conversion
421 if ( dp==2 )
422 return (2*readlen-2*(ipos+1)-1)*(ipos+1)/(readlen-1)/(readlen*0.5);
423
424 if ( dp>=200 )
425 i = nparam; // shortcut for big depths
426 else
427 {
428 for (i=0; i<nparam; i++)
429 if ( param[i][0]>=dp ) break;
430 }
431 float pshift, pscale;
432 if ( i==nparam )
433 {
434 // the depth is too high, go with 200x
435 pscale = param[nparam-1][1];
436 pshift = param[nparam-1][2];
437 }
438 else if ( i>0 && param[i][0]!=dp )
439 {
440 // linear interpolation of parameters
441 pscale = (param[i-1][1] + param[i][1])*0.5;
442 pshift = (param[i-1][2] + param[i][2])*0.5;
443 }
444 else
445 {
446 pscale = param[i][1];
447 pshift = param[i][2];
448 }
449 return 0.5*kf_erfc(-(mean_diff-pshift)*pscale);
450 }
451
calc_chisq_bias(int * a,int * b,int n)452 double calc_chisq_bias(int *a, int *b, int n)
453 {
454 int na = 0, nb = 0, i, ndf = n;
455 for (i=0; i<n; i++) na += a[i];
456 for (i=0; i<n; i++) nb += b[i];
457 if ( !na || !nb ) return HUGE_VAL;
458
459 double chisq = 0;
460 for (i=0; i<n; i++)
461 {
462 if ( !a[i] && !b[i] ) ndf--;
463 else
464 {
465 double tmp = a[i] - b[i];
466 chisq += tmp*tmp/(a[i]+b[i]);
467 }
468 }
469 /*
470 kf_gammq: incomplete gamma function Q(a,x) = 1 - P(a,x) = Gamma(a,x)/Gamma(a)
471 1 if the distributions are identical, 0 if very different
472 */
473 double prob = kf_gammaq(0.5*ndf, 0.5*chisq);
474 return prob;
475 }
476
mann_whitney_1947_(int n,int m,int U)477 static double mann_whitney_1947_(int n, int m, int U)
478 {
479 if (U<0) return 0;
480 if (n==0||m==0) return U==0 ? 1 : 0;
481 return (double)n/(n+m)*mann_whitney_1947_(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947_(n,m-1,U);
482 }
483
mann_whitney_1947(int n,int m,int U)484 double mann_whitney_1947(int n, int m, int U)
485 {
486 #include "mw.h"
487
488 assert(n >= 2 && m >= 2);
489
490 return (n < 8 && m < 8 && U < 50)
491 ? mw[n-2][m-2][U]
492 : mann_whitney_1947_(n,m,U);
493 }
494
mann_whitney_1947_cdf(int n,int m,int U)495 double mann_whitney_1947_cdf(int n, int m, int U)
496 {
497 int i;
498 double sum = 0;
499 for (i=0; i<=U; i++)
500 sum += mann_whitney_1947(n,m,i);
501 return sum;
502 }
503
calc_mwu_bias_cdf(int * a,int * b,int n)504 double calc_mwu_bias_cdf(int *a, int *b, int n)
505 {
506 int na = 0, nb = 0, i;
507 double U = 0;
508 //double ties = 0;
509 for (i=0; i<n; i++)
510 {
511 na += a[i];
512 U += a[i] * (nb + b[i]*0.5);
513 nb += b[i];
514 // if ( a[i] && b[i] )
515 // {
516 // double tie = a[i] + b[i];
517 // ties += (tie*tie-1)*tie;
518 // }
519 }
520 if ( !na || !nb ) return HUGE_VAL;
521
522 // Always work with the smaller U
523 double U_min = ((double)na * nb) - U;
524 if ( U < U_min ) U_min = U;
525
526 if ( na==1 ) return 2.0 * (floor(U_min)+1) / (nb+1);
527 if ( nb==1 ) return 2.0 * (floor(U_min)+1) / (na+1);
528
529 // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
530 if ( na>=8 || nb>=8 )
531 {
532 double mean = ((double)na*nb)*0.5;
533 // Correction for ties:
534 // double N = na+nb;
535 // double var2 = (N*N-1)*N-ties;
536 // if ( var2==0 ) return 1.0;
537 // var2 *= ((double)na*nb)/N/(N-1)/12.0;
538 // No correction for ties:
539 double var2 = ((double)na*nb)*(na+nb+1)/12.0;
540 double z = (U_min - mean)/sqrt(2*var2); // z is N(0,1)
541 return 2.0 - kf_erfc(z); // which is 1 + erf(z)
542 }
543
544 // Exact calculation
545 double pval = 2*mann_whitney_1947_cdf(na,nb,U_min);
546 return pval>1 ? 1 : pval;
547 }
548
calc_mwu_bias(int * a,int * b,int n,int left)549 double calc_mwu_bias(int *a, int *b, int n, int left)
550 {
551 int na = 0, nb = 0, i;
552 double U = 0;
553 // double ties = 0;
554 for (i=0; i<n; i++)
555 {
556 if (!a[i]) {
557 if (!b[i]) continue;
558 nb += b[i];
559 } else if (!b[i]) {
560 na += a[i];
561 U += a[i] * nb;
562 } else {
563 na += a[i];
564 U += a[i] * (nb + b[i]*0.5);
565 nb += b[i];
566 // double tie = a[i] + b[i];
567 // ties += (tie*tie-1)*tie;
568 }
569 }
570 if ( !na || !nb ) return HUGE_VAL;
571 if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely
572
573 double mean = ((double)na*nb)*0.5;
574 if (left && U > mean) return 1; // for MQB which is asymmetrical
575 if ( na==2 || nb==2 )
576 {
577 // Linear approximation
578 return U>mean ? (2.0*mean-U)/mean : U/mean;
579 }
580 // Correction for ties:
581 // double N = na+nb;
582 // double var2 = (N*N-1)*N-ties;
583 // if ( var2==0 ) return 1.0;
584 // var2 *= ((double)na*nb)/N/(N-1)/12.0;
585 // No correction for ties:
586 double var2 = ((double)na*nb)*(na+nb+1)/12.0;
587 if ( na>=8 || nb>=8 )
588 {
589 // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
590 return exp(-0.5*(U-mean)*(U-mean)/var2);
591 }
592
593 // Exact calculation
594 return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
595 }
596
597 // A Z-score version of the above function.
598 //
599 // See "Normal approximation and tie correction" at
600 // https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
601 //
602 // The Z score is the number of standard deviations above or below the mean
603 // with 0 being equality of the two distributions and +ve/-ve from there.
604 //
605 // This is a more robust score to filter on.
calc_mwu_biasZ(int * a,int * b,int n,int left_only,int do_Z)606 double calc_mwu_biasZ(int *a, int *b, int n, int left_only, int do_Z) {
607 int i;
608 int64_t t;
609
610 // Optimisation
611 for (i = 0; i < n; i++)
612 if (b[i])
613 break;
614 int b_empty = (i == n);
615
616 // Count equal (e), less-than (l) and greater-than (g) permutations.
617 int e = 0, l = 0, na = 0, nb = 0;
618 if (b_empty) {
619 for (t = 0, i = n-1; i >= 0; i--) {
620 na += a[i];
621 t += (a[i]*a[i]-1)*a[i]; // adjustment score for ties
622 }
623 } else {
624 for (t = 0, i = n-1; i >= 0; i--) {
625 // Combinations of a[i] and b[j] for i==j
626 e += a[i]*b[i];
627
628 // nb is running total of b[i+1]..b[n-1].
629 // Therefore a[i]*nb is the number of combinations of a[i] and b[j]
630 // for all i < j.
631 l += a[i]*nb; // a<b
632
633 na += a[i];
634 nb += b[i];
635 int p = a[i]+b[i];
636 t += (p*p-1)*p; // adjustment score for ties
637 }
638 }
639
640 if (!na || !nb)
641 return HUGE_VAL;
642
643 double U, m;
644 U = l + e*0.5; // Mann-Whitney U score
645 m = na*nb / 2.0;
646
647 // With ties adjustment
648 double var2 = (na*nb)/12.0 * ((na+nb+1) - t/(double)((na+nb)*(na+nb-1)));
649 // var = na*nb*(na+nb+1)/12.0; // simpler; minus tie adjustment
650 if (var2 <= 0)
651 return do_Z ? 0 : 1;
652
653 if (do_Z) {
654 // S.D. normalised Z-score
655 //Z = (U - m - (U-m >= 0 ? 0.5 : -0.5)) / sd; // gatk method?
656 return (U - m) / sqrt(var2);
657 }
658
659 // Else U score, which can be asymmetric for some data types.
660 if (left_only && U > m)
661 return HUGE_VAL; // one-sided, +ve bias is OK, -ve is not.
662
663 if (na >= 8 || nb >= 8) {
664 // Normal approximation, very good for na>=8 && nb>=8 and
665 // reasonable if na<8 or nb<8
666 return exp(-0.5*(U-m)*(U-m)/var2);
667 }
668
669 // Exact calculation
670 if (na==1 || nb == 1)
671 return mann_whitney_1947_(na, nb, U) * sqrt(2*M_PI*var2);
672 else
673 return mann_whitney_1947(na, nb, U) * sqrt(2*M_PI*var2);
674 }
675
logsumexp2(double a,double b)676 static inline double logsumexp2(double a, double b)
677 {
678 if ( a>b )
679 return log(1 + exp(b-a)) + a;
680 else
681 return log(1 + exp(a-b)) + b;
682 }
683
calc_SegBias(const bcf_callret1_t * bcr,bcf_call_t * call)684 void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call)
685 {
686 call->seg_bias = HUGE_VAL;
687 if ( !bcr ) return;
688
689 int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads
690 if ( !nr ) return;
691
692 int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n; // average depth
693 double M = floor((double)nr / avg_dp + 0.5); // an approximate number of variants samples in the population
694 if ( M>call->n ) M = call->n; // clamp M at the number of samples
695 else if ( M==0 ) M = 1;
696 double f = M / 2. / call->n; // allele frequency
697 double p = (double) nr / call->n; // number of variant reads per sample expected if variant not real (poisson)
698 double q = (double) nr / M; // number of variant reads per sample expected if variant is real (poisson)
699 double sum = 0;
700 const double log2 = log(2.0);
701
702 // fprintf(bcftools_stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp);
703 int i;
704 for (i=0; i<call->n; i++)
705 {
706 int oi = bcr[i].anno[2] + bcr[i].anno[3]; // observed number of non-ref reads
707 double tmp;
708 if ( oi )
709 {
710 // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow
711 tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q);
712 tmp += log(f) + oi*log(q/p) - q + p;
713 }
714 else
715 tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p;
716 sum += tmp;
717 // fprintf(bcftools_stderr,"oi=%d %e\n", oi,tmp);
718 }
719 call->seg_bias = sum;
720 }
721
722 /**
723 * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles
724 * @n: number of samples
725 * @calls: each sample's calls
726 * @bca: auxiliary data structure for holding temporary values
727 * @ref_base: the reference base
728 * @call: filled with the annotations
729 *
730 * Combines calls across the various samples being studied
731 * 1. For each allele at each base across all samples the quality is summed so
732 * you end up with a set of quality sums for each allele present 2. The quality
733 * sums are sorted.
734 * 3. Using the sorted quality sums we now create the allele ordering array
735 * A\subN. This is done by doing the following:
736 * a) If the reference allele is known it always comes first, otherwise N
737 * comes first.
738 * b) Then the rest of the alleles are output in descending order of quality
739 * sum (which we already know the qsum array was sorted). Any allelles with
740 * qsum 0 will be excluded.
741 * 4. Using the allele ordering array we create the genotype ordering array.
742 * In the worst case with an unknown reference this will be: A0/A0 A1/A0 A1/A1
743 * A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4
744 * 5. The genotype ordering array is then used to extract data from the error
745 * model 5*5 matrix and is used to produce a Phread likelihood array for each
746 * sample.
747 */
bcf_call_combine(int n,const bcf_callret1_t * calls,bcf_callaux_t * bca,int ref_base,bcf_call_t * call)748 int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
749 {
750 int ref4, i, j;
751 float qsum[B2B_MAX_ALLELES] = {0,0,0,0,0};
752 if (ref_base >= 0) {
753 call->ori_ref = ref4 = seq_nt16_int[ref_base];
754 if (ref4 > 4) ref4 = 4;
755 } else call->ori_ref = -1, ref4 = 0;
756
757 // calculate qsum, this is done by summing normalized qsum across all samples,
758 // to account for differences in coverage
759 for (i = 0; i < n; ++i)
760 {
761 float sum = 0;
762 for (j = 0; j < 4; ++j) sum += calls[i].QS[j];
763 if ( sum )
764 for (j = 0; j < 4; j++) qsum[j] += (float)calls[i].QS[j] / sum;
765 }
766
767 // sort qsum in ascending order (insertion sort)
768 float *ptr[5], *tmp;
769 for (i=0; i<5; i++) ptr[i] = &qsum[i];
770 for (i=1; i<4; i++)
771 for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--)
772 tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp;
773
774 // Set the reference allele and alternative allele(s)
775 for (i=0; i<5; i++) call->a[i] = -1;
776 for (i=0; i<B2B_MAX_ALLELES; i++) call->qsum[i] = 0;
777 call->unseen = -1;
778 call->a[0] = ref4;
779 for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering
780 {
781 int ipos = ptr[i] - qsum; // position in sorted qsum array
782 if ( ipos==ref4 )
783 call->qsum[0] = qsum[ipos]; // REF's qsum
784 else
785 {
786 if ( !qsum[ipos] ) break; // qsum is 0, this and consequent alleles are not seen in the pileup
787 call->qsum[j] = qsum[ipos];
788 call->a[j++] = ipos;
789 }
790 }
791 if (ref_base >= 0)
792 {
793 // for SNPs, find the "unseen" base
794 if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
795 call->unseen = j, call->a[j++] = ptr[i] - qsum;
796 call->n_alleles = j;
797 }
798 else
799 {
800 call->n_alleles = j;
801 if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
802 }
803 /*
804 * Set the phread likelihood array (call->PL) This array is 15 entries long
805 * for each sample because that is size of an upper or lower triangle of a
806 * worst case 5x5 matrix of possible genotypes. This worst case matrix will
807 * occur when all 4 possible alleles are present and the reference allele
808 * is unknown. The sides of the matrix will correspond to the reference
809 * allele (if known) followed by the alleles present in descending order of
810 * quality sum
811 */
812 {
813 int x, g[15], z;
814 double sum_min = 0.;
815 x = call->n_alleles * (call->n_alleles + 1) / 2;
816 // get the possible genotypes
817 // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix
818 for (i = z = 0; i < call->n_alleles; ++i) {
819 for (j = 0; j <= i; ++j) {
820 g[z++] = call->a[j] * 5 + call->a[i];
821 }
822 }
823 // for each sample calculate the PL
824 for (i = 0; i < n; ++i)
825 {
826 int32_t *PL = call->PL + x * i;
827 const bcf_callret1_t *r = calls + i;
828 float min = FLT_MAX;
829 for (j = 0; j < x; ++j) {
830 if (min > r->p[g[j]]) min = r->p[g[j]];
831 }
832 sum_min += min;
833 for (j = 0; j < x; ++j) {
834 int y;
835 y = (int)(r->p[g[j]] - min + .499);
836 if (y > 255) y = 255;
837 PL[j] = y;
838 }
839 }
840 if ( call->DP4 )
841 {
842 for (i=0; i<n; i++)
843 {
844 call->DP4[4*i] = calls[i].anno[0];
845 call->DP4[4*i+1] = calls[i].anno[1];
846 call->DP4[4*i+2] = calls[i].anno[2];
847 call->DP4[4*i+3] = calls[i].anno[3];
848 }
849 }
850 if ( call->SCR )
851 {
852 for (i=0; i<n; i++)
853 {
854 call->SCR[0] += calls[i].SCR;
855 call->SCR[1+i] = calls[i].SCR;
856 }
857 }
858 if ( call->ADF )
859 {
860 assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well
861
862 // reorder ADR,ADF to match the allele ordering at this site
863 int32_t tmp[B2B_MAX_ALLELES];
864 int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES;
865 int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES;
866 int32_t *adr_tot = call->ADR; // the first bin stores total counts per site
867 int32_t *adf_tot = call->ADF;
868 for (i=0; i<n; i++)
869 {
870 for (j=0; j<call->n_alleles; j++)
871 {
872 tmp[j] = adr[ call->a[j] ];
873 adr_tot[j] += tmp[j];
874 }
875 for (j=0; j<call->n_alleles; j++) adr_out[j] = tmp[j];
876 for (j=0; j<call->n_alleles; j++)
877 {
878 tmp[j] = adf[ call->a[j] ];
879 adf_tot[j] += tmp[j];
880 }
881 for (j=0; j<call->n_alleles; j++) adf_out[j] = tmp[j];
882 adf_out += call->n_alleles;
883 adr_out += call->n_alleles;
884 adr += B2B_MAX_ALLELES;
885 adf += B2B_MAX_ALLELES;
886 }
887 }
888 if ( bca->fmt_flag & B2B_FMT_QS )
889 {
890 assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well
891
892 // reorder QS to match the allele ordering at this site
893 int32_t tmp[B2B_MAX_ALLELES];
894 int32_t *qs = call->QS, *qs_out = call->QS;
895 for (i=0; i<n; i++)
896 {
897 for (j=0; j<call->n_alleles; j++) tmp[j] = qs[ call->a[j] ];
898 for (j=0; j<call->n_alleles; j++) qs_out[j] = tmp[j] < BCF_MAX_BT_INT32 ? tmp[j] : BCF_MAX_BT_INT32;
899 qs_out += call->n_alleles;
900 qs += B2B_MAX_ALLELES;
901 }
902 }
903
904 // if (ref_base < 0) fprintf(bcftools_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
905 call->shift = (int)(sum_min + .499);
906 }
907 // combine annotations
908 memset(call->anno, 0, 16 * sizeof(double));
909 call->ori_depth = 0;
910 call->depth = 0;
911 call->mq0 = 0;
912 for (i = 0; i < n; ++i) {
913 call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3];
914 call->ori_depth += calls[i].ori_depth;
915 call->mq0 += calls[i].mq0;
916 for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
917 }
918
919 calc_SegBias(calls, call);
920
921 // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
922 // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
923 // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
924
925 if (bca->fmt_flag & B2B_INFO_ZSCORE) {
926 // U z-normalised as +/- number of standard deviations from mean.
927 if (call->ori_ref < 0) {
928 if (bca->fmt_flag & B2B_INFO_RPB)
929 call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
930 bca->npos, 0, 1);
931 call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq,
932 bca->nqual,1,1);
933 if ( bca->fmt_flag & B2B_INFO_SCB )
934 call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl,
935 100, 0,1);
936 } else {
937 if (bca->fmt_flag & B2B_INFO_RPB)
938 call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
939 bca->npos, 0, 1);
940 call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
941 bca->nqual,1,1);
942 call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
943 bca->nqual,0,1);
944 call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
945 bca->nqual,0,1);
946 if ( bca->fmt_flag & B2B_INFO_SCB )
947 call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
948 100, 0,1);
949 }
950 } else {
951 // Old method; U as probability between 0 and 1
952 if ( bca->fmt_flag & B2B_INFO_RPB )
953 call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
954 bca->npos, 0, 0);
955 call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
956 bca->nqual, 1, 0);
957 call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
958 bca->nqual, 0, 0);
959 call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
960 bca->nqual, 0, 0);
961 }
962
963 #if CDF_MWU_TESTS
964 // CDF version of MWU tests is not calculated by default
965 if ( bca->fmt_flag & B2B_INFO_RPB )
966 call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos);
967 call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual);
968 call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual);
969 call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
970 #endif
971
972 if ( bca->fmt_flag & B2B_INFO_VDB )
973 call->vdb = calc_vdb(bca->alt_pos, bca->npos);
974
975 return 0;
976 }
977
bcf_call2bcf(bcf_call_t * bc,bcf1_t * rec,bcf_callret1_t * bcr,int fmt_flag,const bcf_callaux_t * bca,const char * ref)978 int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
979 {
980 extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
981 int i, j, nals = 1;
982
983 bcf_hdr_t *hdr = bc->bcf_hdr;
984 rec->rid = bc->tid;
985 rec->pos = bc->pos;
986 rec->qual = 0;
987
988 bc->tmp.l = 0;
989 if (bc->ori_ref < 0) // indel
990 {
991 // REF
992 kputc(ref[bc->pos], &bc->tmp);
993 for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
994
995 // ALT
996 for (i=1; i<4; i++)
997 {
998 if (bc->a[i] < 0) break;
999 kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp);
1000
1001 if (bca->indel_types[bc->a[i]] < 0) { // deletion
1002 for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
1003 kputc(ref[bc->pos+1+j], &bc->tmp);
1004 } else { // insertion; cannot be a reference unless a bug
1005 char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
1006 for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
1007 kputc("ACGTN"[(int)inscns[j]], &bc->tmp);
1008 for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
1009 }
1010 nals++;
1011 }
1012 }
1013 else // SNP
1014 {
1015 kputc("ACGTN"[bc->ori_ref], &bc->tmp);
1016 for (i=1; i<5; i++)
1017 {
1018 if (bc->a[i] < 0) break;
1019 kputc(',', &bc->tmp);
1020 if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
1021 else kputc("ACGT"[bc->a[i]], &bc->tmp);
1022 nals++;
1023 }
1024 }
1025 bcf_update_alleles_str(hdr, rec, bc->tmp.s);
1026
1027 bc->tmp.l = 0;
1028
1029 // INFO
1030 if (bc->ori_ref < 0)
1031 {
1032 bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
1033 bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
1034 bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
1035 }
1036 bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
1037 if ( fmt_flag&B2B_INFO_ADF )
1038 bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele);
1039 if ( fmt_flag&B2B_INFO_ADR )
1040 bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele);
1041 if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) )
1042 {
1043 for (i=0; i<rec->n_allele; i++) bc->ADF[i] += bc->ADR[i];
1044 if ( fmt_flag&B2B_INFO_AD )
1045 bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele);
1046 if ( fmt_flag&B2B_INFO_DPR )
1047 bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele);
1048 }
1049 if ( fmt_flag&B2B_INFO_SCR )
1050 bcf_update_info_int32(hdr, rec, "SCR", bc->SCR, 1);
1051
1052 float tmpf[16];
1053 for (i=0; i<16; i++) tmpf[i] = bc->anno[i];
1054 bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
1055 bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
1056
1057 if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
1058 if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
1059
1060 if (bca->fmt_flag & B2B_INFO_ZSCORE) {
1061 if ( bc->mwu_pos != HUGE_VAL )
1062 bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
1063 if ( bc->mwu_mq != HUGE_VAL )
1064 bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
1065 if ( bc->mwu_mqs != HUGE_VAL )
1066 bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
1067 if ( bc->mwu_bq != HUGE_VAL )
1068 bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
1069 if ( bc->mwu_sc != HUGE_VAL )
1070 bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
1071 } else {
1072 if ( bc->mwu_pos != HUGE_VAL )
1073 bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
1074 if ( bc->mwu_mq != HUGE_VAL )
1075 bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
1076 if ( bc->mwu_mqs != HUGE_VAL )
1077 bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
1078 if ( bc->mwu_bq != HUGE_VAL )
1079 bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
1080 }
1081
1082 if ( bc->strand_bias != HUGE_VAL )
1083 bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
1084
1085 #if CDF_MWU_TESTS
1086 if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
1087 if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
1088 if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
1089 if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
1090 #endif
1091 tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
1092 bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
1093
1094 // FORMAT
1095 rec->n_sample = bc->n;
1096 bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample);
1097 if ( fmt_flag&B2B_FMT_DP )
1098 {
1099 int32_t *ptr = (int32_t*) bc->fmt_arr;
1100 for (i=0; i<bc->n; i++)
1101 ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3];
1102 bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample);
1103 }
1104 if ( fmt_flag&B2B_FMT_DV )
1105 {
1106 int32_t *ptr = (int32_t*) bc->fmt_arr;
1107 for (i=0; i<bc->n; i++)
1108 ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3];
1109 bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample);
1110 }
1111 if ( fmt_flag&B2B_FMT_SP )
1112 {
1113 int32_t *ptr = (int32_t*) bc->fmt_arr;
1114 for (i=0; i<bc->n; i++)
1115 {
1116 int fwd_ref = bc->DP4[4*i], rev_ref = bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3];
1117 if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 )
1118 ptr[i] = 0;
1119 else
1120 {
1121 double left, right, two;
1122 kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two);
1123 int32_t x = (int)(-4.343 * log(two) + .499);
1124 if (x > 255) x = 255;
1125 ptr[i] = x;
1126 }
1127 }
1128 bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample);
1129 }
1130 if ( fmt_flag&B2B_FMT_DP4 )
1131 bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4);
1132 if ( fmt_flag&B2B_FMT_ADF )
1133 bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
1134 if ( fmt_flag&B2B_FMT_ADR )
1135 bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
1136 if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) )
1137 {
1138 for (i=0; i<rec->n_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i];
1139 if ( fmt_flag&B2B_FMT_AD )
1140 bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
1141 if ( fmt_flag&B2B_FMT_DPR )
1142 bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
1143 }
1144 if ( fmt_flag&B2B_FMT_SCR )
1145 bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample);
1146 if ( fmt_flag&B2B_FMT_QS )
1147 bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele);
1148
1149 return 0;
1150 }
1151