1 /* bam2bcf.c -- variant calling.
2
3 Copyright (C) 2010-2012 Broad Institute.
4 Copyright (C) 2012-2015, 2021 Genome Research Ltd.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <math.h>
29 #include <stdint.h>
30 #include <assert.h>
31 #include <float.h>
32 #include <htslib/hts.h>
33 #include <htslib/sam.h>
34 #include <htslib/kstring.h>
35 #include <htslib/kfunc.h>
36 #include "bam2bcf.h"
37
38 extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
39
40 #define CALL_DEFTHETA 0.83
41 #define DEF_MAPQ 20
42
43 #define CAP_DIST 25
44
bcf_call_init(double theta,int min_baseQ)45 bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
46 {
47 bcf_callaux_t *bca;
48 if (theta <= 0.) theta = CALL_DEFTHETA;
49 bca = calloc(1, sizeof(bcf_callaux_t));
50 bca->capQ = 60;
51 bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
52 bca->min_baseQ = min_baseQ;
53 bca->e = errmod_init(1. - theta);
54 bca->min_frac = 0.002;
55 bca->min_support = 1;
56 bca->per_sample_flt = 0;
57 bca->npos = 100;
58 bca->ref_pos = malloc(bca->npos*sizeof(int));
59 bca->alt_pos = malloc(bca->npos*sizeof(int));
60 bca->nqual = 60;
61 bca->ref_mq = malloc(bca->nqual*sizeof(int));
62 bca->alt_mq = malloc(bca->nqual*sizeof(int));
63 bca->ref_bq = malloc(bca->nqual*sizeof(int));
64 bca->alt_bq = malloc(bca->nqual*sizeof(int));
65 bca->fwd_mqs = malloc(bca->nqual*sizeof(int));
66 bca->rev_mqs = malloc(bca->nqual*sizeof(int));
67 return bca;
68 }
69
bcf_call_destroy(bcf_callaux_t * bca)70 void bcf_call_destroy(bcf_callaux_t *bca)
71 {
72 if (bca == 0) return;
73 errmod_destroy(bca->e);
74 if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; }
75 free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq);
76 free(bca->fwd_mqs); free(bca->rev_mqs);
77 bca->nqual = 0;
78 free(bca->bases); free(bca->inscns); free(bca);
79 }
80
81 // position in the sequence with respect to the aligned part of the read
get_position(const bam_pileup1_t * p,int * len)82 static int get_position(const bam_pileup1_t *p, int *len)
83 {
84 int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
85 for (icig=0; icig<p->b->core.n_cigar; icig++)
86 {
87 int cig = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK;
88 int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT;
89 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
90 {
91 n_tot_bases += ncig;
92 iread += ncig;
93 continue;
94 }
95 if ( cig==BAM_CINS )
96 {
97 n_tot_bases += ncig;
98 iread += ncig;
99 continue;
100 }
101 if ( cig==BAM_CSOFT_CLIP )
102 {
103 iread += ncig;
104 if ( iread<=p->qpos ) edist -= ncig;
105 continue;
106 }
107 if ( cig==BAM_CDEL ) continue;
108 if ( cig==BAM_CHARD_CLIP ) continue;
109 if ( cig==BAM_CPAD ) continue;
110 if ( cig==BAM_CREF_SKIP ) continue;
111 fprintf(stderr,"todo: cigar %d\n", cig);
112 assert(0);
113 }
114 *len = n_tot_bases;
115 return edist;
116 }
117
bcf_callaux_clean(bcf_callaux_t * bca,bcf_call_t * call)118 void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
119 {
120 memset(bca->ref_pos,0,sizeof(int)*bca->npos);
121 memset(bca->alt_pos,0,sizeof(int)*bca->npos);
122 memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
123 memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
124 memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
125 memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
126 memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
127 memset(bca->rev_mqs,0,sizeof(int)*bca->nqual);
128 if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
129 if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
130 }
131
132 /*
133 Notes:
134 - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies
135 which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation.
136 Later it's used for multiallelic calling by bcftools -m
137 - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
138 */
139 /*
140 * This function is called once for each sample.
141 * _n is number of pilesups pl contributing reads to this sample
142 * pl is pointer to array of _n pileups (one pileup per read)
143 * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
144 * bca is the settings to perform calls across all samples
145 * r is the returned value of the call
146 */
bcf_call_glfgen(int _n,const bam_pileup1_t * pl,int ref_base,bcf_callaux_t * bca,bcf_callret1_t * r)147 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
148 {
149 int i, n, ref4, is_indel, ori_depth = 0;
150
151 // clean from previous run
152 r->ori_depth = 0;
153 r->mq0 = 0;
154 memset(r->qsum,0,sizeof(float)*4);
155 memset(r->anno,0,sizeof(double)*16);
156 memset(r->p,0,sizeof(float)*25);
157
158 if (ref_base >= 0) {
159 ref4 = seq_nt16_int[ref_base];
160 is_indel = 0;
161 } else ref4 = 4, is_indel = 1;
162 if (_n == 0) return -1;
163 // enlarge the bases array if necessary
164 if (bca->max_bases < _n) {
165 bca->max_bases = _n;
166 kroundup32(bca->max_bases);
167 bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
168 }
169 // fill the bases array
170 for (i = n = 0; i < _n; ++i) {
171 const bam_pileup1_t *p = pl + i;
172 int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
173 // set base
174 if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
175 ++ori_depth;
176 mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
177 if ( !mapQ ) r->mq0++;
178 baseQ = q = is_indel? p->aux&0xff : (int)bam_get_qual(p->b)[p->qpos]; // base/indel quality
179 seqQ = is_indel? (p->aux>>8&0xff) : 99;
180 if (q < bca->min_baseQ) continue;
181 if (q > seqQ) q = seqQ;
182 mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
183 if (q > mapQ) q = mapQ;
184 if (q > 63) q = 63;
185 if (q < 4) q = 4; // MQ=0 reads count as BQ=4
186 if (!is_indel) {
187 b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
188 b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
189 is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
190 } else {
191 b = p->aux>>16&0x3f;
192 is_diff = (b != 0);
193 }
194 bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
195 // collect annotations
196 if (b < 4)
197 {
198 r->qsum[b] += q;
199 if ( r->ADF )
200 {
201 if ( bam_is_rev(p->b) )
202 r->ADR[b]++;
203 else
204 r->ADF[b]++;
205 }
206 }
207 ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)];
208 min_dist = p->b->core.l_qseq - 1 - p->qpos;
209 if (min_dist > p->qpos) min_dist = p->qpos;
210 if (min_dist > CAP_DIST) min_dist = CAP_DIST;
211 r->anno[1<<2|is_diff<<1|0] += baseQ;
212 r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
213 r->anno[2<<2|is_diff<<1|0] += mapQ;
214 r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
215 r->anno[3<<2|is_diff<<1|0] += min_dist;
216 r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
217
218 // collect for bias tests
219 if ( baseQ > 59 ) baseQ = 59;
220 if ( mapQ > 59 ) mapQ = 59;
221 int len, pos = get_position(p, &len);
222 int epos = (double)pos/(len+1) * bca->npos;
223 int ibq = baseQ/60. * bca->nqual;
224 int imq = mapQ/60. * bca->nqual;
225 if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++;
226 else bca->fwd_mqs[imq]++;
227 if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
228 {
229 bca->ref_pos[epos]++;
230 bca->ref_bq[ibq]++;
231 bca->ref_mq[imq]++;
232 }
233 else
234 {
235 bca->alt_pos[epos]++;
236 bca->alt_bq[ibq]++;
237 bca->alt_mq[imq]++;
238 }
239 }
240 r->ori_depth = ori_depth;
241 // glfgen
242 errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
243 return n;
244 }
245
246
247 /*
248 * calc_vdb() - returns value between zero (most biased) and one (no bias)
249 * on success, or HUGE_VAL when VDB cannot be calculated because
250 * of insufficient depth (<2x)
251 *
252 * Variant Distance Bias tests if the variant bases are positioned within the
253 * reads with sufficient randomness. Unlike other tests, it looks only at
254 * variant reads and therefore gives different kind of information than Read
255 * Position Bias for instance. VDB was developed for detecting artefacts in
256 * RNA-seq calls where reads from spliced transcripts span splice site
257 * boundaries. The current implementation differs somewhat from the original
258 * version described in supplementary material of PMID:22524474, but the idea
259 * remains the same. (Here the random variable tested is the average distance
260 * from the averaged position, not the average pairwise distance.)
261 *
262 * For coverage of 2x, the calculation is exact but is approximated for the
263 * rest. The result is most accurate between 4-200x. For 3x or >200x, the
264 * reported values are slightly more favourable than those of a true random
265 * distribution.
266 */
calc_vdb(int * pos,int npos)267 double calc_vdb(int *pos, int npos)
268 {
269 // Note well: the parameters were obtained by fitting to simulated data of
270 // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen().
271 const int readlen = 100;
272 assert( npos==readlen );
273
274 #define nparam 15
275 const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5},
276 {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8},
277 {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7},
278 {200,0.7,23.7} };
279
280 int i, dp = 0;
281 float mean_pos = 0, mean_diff = 0;
282 for (i=0; i<npos; i++)
283 {
284 if ( !pos[i] ) continue;
285 dp += pos[i];
286 mean_pos += pos[i]*i;
287 }
288 if ( dp<2 ) return HUGE_VAL; // one or zero reads can be placed anywhere
289
290 mean_pos /= dp;
291 for (i=0; i<npos; i++)
292 {
293 if ( !pos[i] ) continue;
294 mean_diff += pos[i] * fabs(i - mean_pos);
295 }
296 mean_diff /= dp;
297
298 int ipos = mean_diff; // tuned for float-to-int implicit conversion
299 if ( dp==2 )
300 return (2*readlen-2*(ipos+1)-1)*(ipos+1)/(readlen-1)/(readlen*0.5);
301
302 if ( dp>=200 )
303 i = nparam; // shortcut for big depths
304 else
305 {
306 for (i=0; i<nparam; i++)
307 if ( param[i][0]>=dp ) break;
308 }
309 float pshift, pscale;
310 if ( i==nparam )
311 {
312 // the depth is too high, go with 200x
313 pscale = param[nparam-1][1];
314 pshift = param[nparam-1][2];
315 }
316 else if ( i>0 && param[i][0]!=dp )
317 {
318 // linear interpolation of parameters
319 pscale = (param[i-1][1] + param[i][1])*0.5;
320 pshift = (param[i-1][2] + param[i][2])*0.5;
321 }
322 else
323 {
324 pscale = param[i][1];
325 pshift = param[i][2];
326 }
327 return 0.5*kf_erfc(-(mean_diff-pshift)*pscale);
328 }
329
calc_chisq_bias(int * a,int * b,int n)330 double calc_chisq_bias(int *a, int *b, int n)
331 {
332 int na = 0, nb = 0, i, ndf = n;
333 for (i=0; i<n; i++) na += a[i];
334 for (i=0; i<n; i++) nb += b[i];
335 if ( !na || !nb ) return HUGE_VAL;
336
337 double chisq = 0;
338 for (i=0; i<n; i++)
339 {
340 if ( !a[i] && !b[i] ) ndf--;
341 else
342 {
343 double tmp = a[i] - b[i];
344 chisq += tmp*tmp/(a[i]+b[i]);
345 }
346 }
347 /*
348 kf_gammq: incomplete gamma function Q(a,x) = 1 - P(a,x) = Gamma(a,x)/Gamma(a)
349 1 if the distributions are identical, 0 if very different
350 */
351 double prob = kf_gammaq(0.5*ndf, 0.5*chisq);
352 return prob;
353 }
354
mann_whitney_1947(int n,int m,int U)355 double mann_whitney_1947(int n, int m, int U)
356 {
357 if (U<0) return 0;
358 if (n==0||m==0) return U==0 ? 1 : 0;
359 return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U);
360 }
361
mann_whitney_1947_cdf(int n,int m,int U)362 double mann_whitney_1947_cdf(int n, int m, int U)
363 {
364 int i;
365 double sum = 0;
366 for (i=0; i<=U; i++)
367 sum += mann_whitney_1947(n,m,i);
368 return sum;
369 }
370
calc_mwu_bias_cdf(int * a,int * b,int n)371 double calc_mwu_bias_cdf(int *a, int *b, int n)
372 {
373 int na = 0, nb = 0, i;
374 double U = 0;
375 for (i=0; i<n; i++)
376 {
377 na += a[i];
378 U += a[i] * (nb + b[i]*0.5);
379 nb += b[i];
380 }
381 if ( !na || !nb ) return HUGE_VAL;
382
383 // Always work with the smaller U
384 double U_min = ((double)na * nb) - U;
385 if ( U < U_min ) U_min = U;
386
387 if ( na==1 ) return 2.0 * (floor(U_min)+1) / (nb+1);
388 if ( nb==1 ) return 2.0 * (floor(U_min)+1) / (na+1);
389
390 // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
391 if ( na>=8 || nb>=8 )
392 {
393 double mean = ((double)na*nb)*0.5;
394 double var2 = ((double)na*nb)*(na+nb+1)/12.0;
395 double z = (U_min - mean)/sqrt(2*var2); // z is N(0,1)
396 return 2.0 - kf_erfc(z); // which is 1 + erf(z)
397 }
398
399 // Exact calculation
400 double pval = 2*mann_whitney_1947_cdf(na,nb,U_min);
401 return pval>1 ? 1 : pval;
402 }
403
calc_mwu_bias(int * a,int * b,int n)404 double calc_mwu_bias(int *a, int *b, int n)
405 {
406 int na = 0, nb = 0, i;
407 double U = 0;
408 for (i=0; i<n; i++)
409 {
410 na += a[i];
411 U += a[i] * (nb + b[i]*0.5);
412 nb += b[i];
413 }
414 if ( !na || !nb ) return HUGE_VAL;
415 if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely
416
417 double mean = ((double)na*nb)*0.5;
418 if ( na==2 || nb==2 )
419 {
420 // Linear approximation
421 return U>mean ? (2.0*mean-U)/mean : U/mean;
422 }
423 double var2 = ((double)na*nb)*(na+nb+1)/12.0;
424 if ( na>=8 || nb>=8 )
425 {
426 // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
427 return exp(-0.5*(U-mean)*(U-mean)/var2);
428 }
429
430 // Exact calculation
431 return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
432 }
433
logsumexp2(double a,double b)434 static inline double logsumexp2(double a, double b)
435 {
436 if ( a>b )
437 return log(1 + exp(b-a)) + a;
438 else
439 return log(1 + exp(a-b)) + b;
440 }
441
calc_SegBias(const bcf_callret1_t * bcr,bcf_call_t * call)442 void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call)
443 {
444 call->seg_bias = HUGE_VAL;
445 if ( !bcr ) return;
446
447 int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads
448 if ( !nr ) return;
449
450 int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n; // average depth
451 double M = floor((double)nr / avg_dp + 0.5); // an approximate number of variants samples in the population
452 if ( M>call->n ) M = call->n; // clamp M at the number of samples
453 else if ( M==0 ) M = 1;
454 double f = M / 2. / call->n; // allele frequency
455 double p = (double) nr / call->n; // number of variant reads per sample expected if variant not real (poisson)
456 double q = (double) nr / M; // number of variant reads per sample expected if variant is real (poisson)
457 double sum = 0;
458 const double log2 = log(2.0);
459
460 // fprintf(stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp);
461 int i;
462 for (i=0; i<call->n; i++)
463 {
464 int oi = bcr[i].anno[2] + bcr[i].anno[3]; // observed number of non-ref reads
465 double tmp;
466 if ( oi )
467 {
468 // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow
469 tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q);
470 tmp += log(f) + oi*log(q/p) - q + p;
471 }
472 else
473 tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p;
474 sum += tmp;
475 // fprintf(stderr,"oi=%d %e\n", oi,tmp);
476 }
477 call->seg_bias = sum;
478 }
479
480 /**
481 * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles
482 * @n: number of samples
483 * @calls: each sample's calls
484 * @bca: auxiliary data structure for holding temporary values
485 * @ref_base: the reference base
486 * @call: filled with the annotations
487 *
488 * Combines calls across the various samples being studied
489 * 1. For each allele at each base across all samples the quality is summed so
490 * you end up with a set of quality sums for each allele present 2. The quality
491 * sums are sorted.
492 * 3. Using the sorted quality sums we now create the allele ordering array
493 * A\subN. This is done by doing the following:
494 * a) If the reference allele is known it always comes first, otherwise N
495 * comes first.
496 * b) Then the rest of the alleles are output in descending order of quality
497 * sum (which we already know the qsum array was sorted). Any allelles with
498 * qsum 0 will be excluded.
499 * 4. Using the allele ordering array we create the genotype ordering array.
500 * In the worst case with an unknown reference this will be: A0/A0 A1/A0 A1/A1
501 * A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4
502 * 5. The genotype ordering array is then used to extract data from the error
503 * model 5*5 matrix and is used to produce a Phread likelihood array for each
504 * sample.
505 */
bcf_call_combine(int n,const bcf_callret1_t * calls,bcf_callaux_t * bca,int ref_base,bcf_call_t * call)506 int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
507 {
508 int ref4, i, j;
509 float qsum[5] = {0,0,0,0,0};
510 if (ref_base >= 0) {
511 call->ori_ref = ref4 = seq_nt16_int[ref_base];
512 if (ref4 > 4) ref4 = 4;
513 } else call->ori_ref = -1, ref4 = 0;
514
515 // calculate qsum, this is done by summing normalized qsum across all samples,
516 // to account for differences in coverage
517 for (i = 0; i < n; ++i)
518 {
519 float sum = 0;
520 for (j = 0; j < 4; ++j) sum += calls[i].qsum[j];
521 if ( sum )
522 for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum;
523 }
524
525 // sort qsum in ascending order (insertion sort)
526 float *ptr[5], *tmp;
527 for (i=0; i<5; i++) ptr[i] = &qsum[i];
528 for (i=1; i<4; i++)
529 for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--)
530 tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp;
531
532 // Set the reference allele and alternative allele(s)
533 for (i=0; i<5; i++) call->a[i] = -1;
534 for (i=0; i<5; i++) call->qsum[i] = 0;
535 call->unseen = -1;
536 call->a[0] = ref4;
537 for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering
538 {
539 int ipos = ptr[i] - qsum; // position in sorted qsum array
540 if ( ipos==ref4 )
541 call->qsum[0] = qsum[ipos]; // REF's qsum
542 else
543 {
544 if ( !qsum[ipos] ) break; // qsum is 0, this and consequent alleles are not seen in the pileup
545 call->qsum[j] = qsum[ipos];
546 call->a[j++] = ipos;
547 }
548 }
549 if (ref_base >= 0)
550 {
551 // for SNPs, find the "unseen" base
552 if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
553 call->unseen = j, call->a[j++] = ptr[i] - qsum;
554 call->n_alleles = j;
555 }
556 else
557 {
558 call->n_alleles = j;
559 if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
560 }
561 /*
562 * Set the phread likelihood array (call->PL) This array is 15 entries long
563 * for each sample because that is size of an upper or lower triangle of a
564 * worst case 5x5 matrix of possible genotypes. This worst case matrix will
565 * occur when all 4 possible alleles are present and the reference allele
566 * is unknown. The sides of the matrix will correspond to the reference
567 * allele (if known) followed by the alleles present in descending order of
568 * quality sum
569 */
570 {
571 int x, g[15], z;
572 double sum_min = 0.;
573 x = call->n_alleles * (call->n_alleles + 1) / 2;
574 // get the possible genotypes
575 // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix
576 for (i = z = 0; i < call->n_alleles; ++i) {
577 for (j = 0; j <= i; ++j) {
578 g[z++] = call->a[j] * 5 + call->a[i];
579 }
580 }
581 // for each sample calculate the PL
582 for (i = 0; i < n; ++i)
583 {
584 int32_t *PL = call->PL + x * i;
585 const bcf_callret1_t *r = calls + i;
586 float min = FLT_MAX;
587 for (j = 0; j < x; ++j) {
588 if (min > r->p[g[j]]) min = r->p[g[j]];
589 }
590 sum_min += min;
591 for (j = 0; j < x; ++j) {
592 int y;
593 y = (int)(r->p[g[j]] - min + .499);
594 if (y > 255) y = 255;
595 PL[j] = y;
596 }
597 }
598 if ( call->DP4 )
599 {
600 for (i=0; i<n; i++)
601 {
602 call->DP4[4*i] = calls[i].anno[0];
603 call->DP4[4*i+1] = calls[i].anno[1];
604 call->DP4[4*i+2] = calls[i].anno[2];
605 call->DP4[4*i+3] = calls[i].anno[3];
606 }
607 }
608 if ( call->ADF )
609 {
610 assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well
611
612 // reorder ADR,ADF to match the allele ordering at this site
613 int32_t tmp[B2B_MAX_ALLELES];
614 int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES;
615 int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES;
616 int32_t *adr_tot = call->ADR; // the first bin stores total counts per site
617 int32_t *adf_tot = call->ADF;
618 for (i=0; i<n; i++)
619 {
620 for (j=0; j<call->n_alleles; j++)
621 {
622 tmp[j] = adr[ call->a[j] ];
623 adr_tot[j] += tmp[j];
624 }
625 for (j=0; j<call->n_alleles; j++) adr_out[j] = tmp[j];
626 for (j=0; j<call->n_alleles; j++)
627 {
628 tmp[j] = adf[ call->a[j] ];
629 adf_tot[j] += tmp[j];
630 }
631 for (j=0; j<call->n_alleles; j++) adf_out[j] = tmp[j];
632 adf_out += call->n_alleles;
633 adr_out += call->n_alleles;
634 adr += B2B_MAX_ALLELES;
635 adf += B2B_MAX_ALLELES;
636 }
637 }
638
639 // if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
640 call->shift = (int)(sum_min + .499);
641 }
642 // combine annotations
643 memset(call->anno, 0, 16 * sizeof(double));
644 call->ori_depth = 0;
645 call->depth = 0;
646 call->mq0 = 0;
647 for (i = 0; i < n; ++i) {
648 call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3];
649 call->ori_depth += calls[i].ori_depth;
650 call->mq0 += calls[i].mq0;
651 for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
652 }
653
654 calc_SegBias(calls, call);
655
656 // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
657 // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
658 // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
659
660 call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos);
661 call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual);
662 call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual);
663 call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
664
665 #if CDF_MWU_TESTS
666 call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos);
667 call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual);
668 call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual);
669 call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
670 #endif
671
672 call->vdb = calc_vdb(bca->alt_pos, bca->npos);
673
674 return 0;
675 }
676
bcf_call2bcf(bcf_call_t * bc,bcf1_t * rec,bcf_callret1_t * bcr,int fmt_flag,const bcf_callaux_t * bca,const char * ref)677 int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
678 {
679 extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
680 int i, j, nals = 1;
681
682 bcf_hdr_t *hdr = bc->bcf_hdr;
683 rec->rid = bc->tid;
684 rec->pos = bc->pos;
685 rec->qual = 0;
686
687 bc->tmp.l = 0;
688 if (bc->ori_ref < 0) // indel
689 {
690 // REF
691 kputc(ref[bc->pos], &bc->tmp);
692 for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
693
694 // ALT
695 for (i=1; i<4; i++)
696 {
697 if (bc->a[i] < 0) break;
698 kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp);
699
700 if (bca->indel_types[bc->a[i]] < 0) { // deletion
701 for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
702 kputc(ref[bc->pos+1+j], &bc->tmp);
703 } else { // insertion; cannot be a reference unless a bug
704 char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
705 for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
706 kputc("ACGTN"[(int)inscns[j]], &bc->tmp);
707 for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
708 }
709 nals++;
710 }
711 }
712 else // SNP
713 {
714 kputc("ACGTN"[bc->ori_ref], &bc->tmp);
715 for (i=1; i<5; i++)
716 {
717 if (bc->a[i] < 0) break;
718 kputc(',', &bc->tmp);
719 if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
720 else kputc("ACGT"[bc->a[i]], &bc->tmp);
721 nals++;
722 }
723 }
724 bcf_update_alleles_str(hdr, rec, bc->tmp.s);
725
726 bc->tmp.l = 0;
727
728 // INFO
729 if (bc->ori_ref < 0)
730 {
731 bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
732 bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
733 bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
734 }
735 bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
736 if ( fmt_flag&B2B_INFO_ADF )
737 bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele);
738 if ( fmt_flag&B2B_INFO_ADR )
739 bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele);
740 if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) )
741 {
742 for (i=0; i<rec->n_allele; i++) bc->ADF[i] += bc->ADR[i];
743 if ( fmt_flag&B2B_INFO_AD )
744 bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele);
745 if ( fmt_flag&B2B_INFO_DPR )
746 bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele);
747 }
748
749 float tmpf[16];
750 for (i=0; i<16; i++) tmpf[i] = bc->anno[i];
751 bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
752 bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
753
754 if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
755 if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
756 if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
757 if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
758 if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
759 if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
760 #if CDF_MWU_TESTS
761 if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
762 if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
763 if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
764 if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
765 #endif
766 tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
767 bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
768
769 // FORMAT
770 rec->n_sample = bc->n;
771 bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample);
772 if ( fmt_flag&B2B_FMT_DP )
773 {
774 int32_t *ptr = (int32_t*) bc->fmt_arr;
775 for (i=0; i<bc->n; i++)
776 ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3];
777 bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample);
778 }
779 if ( fmt_flag&B2B_FMT_DV )
780 {
781 int32_t *ptr = (int32_t*) bc->fmt_arr;
782 for (i=0; i<bc->n; i++)
783 ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3];
784 bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample);
785 }
786 if ( fmt_flag&B2B_FMT_SP )
787 {
788 int32_t *ptr = (int32_t*) bc->fmt_arr;
789 for (i=0; i<bc->n; i++)
790 {
791 int fwd_ref = bc->DP4[4*i], rev_ref = bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3];
792 if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 )
793 ptr[i] = 0;
794 else
795 {
796 double left, right, two;
797 kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two);
798 int32_t x = (int)(-4.343 * log(two) + .499);
799 if (x > 255) x = 255;
800 ptr[i] = x;
801 }
802 }
803 bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample);
804 }
805 if ( fmt_flag&B2B_FMT_DP4 )
806 bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4);
807 if ( fmt_flag&B2B_FMT_ADF )
808 bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
809 if ( fmt_flag&B2B_FMT_ADR )
810 bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
811 if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) )
812 {
813 for (i=0; i<rec->n_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i];
814 if ( fmt_flag&B2B_FMT_AD )
815 bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
816 if ( fmt_flag&B2B_FMT_DPR )
817 bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
818 }
819
820 return 0;
821 }
822