1 #include "samtools.pysam.h"
2
3 /* bam2bcf.c -- variant calling.
4
5 Copyright (C) 2010-2012 Broad Institute.
6 Copyright (C) 2012-2015, 2021 Genome Research Ltd.
7
8 Author: Heng Li <lh3@sanger.ac.uk>
9
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE. */
27
28 #include <config.h>
29
30 #include <math.h>
31 #include <stdint.h>
32 #include <assert.h>
33 #include <float.h>
34 #include <htslib/hts.h>
35 #include <htslib/sam.h>
36 #include <htslib/kstring.h>
37 #include <htslib/kfunc.h>
38 #include "bam2bcf.h"
39
40 extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
41
42 #define CALL_DEFTHETA 0.83
43 #define DEF_MAPQ 20
44
45 #define CAP_DIST 25
46
bcf_call_init(double theta,int min_baseQ)47 bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
48 {
49 bcf_callaux_t *bca;
50 if (theta <= 0.) theta = CALL_DEFTHETA;
51 bca = calloc(1, sizeof(bcf_callaux_t));
52 bca->capQ = 60;
53 bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
54 bca->min_baseQ = min_baseQ;
55 bca->e = errmod_init(1. - theta);
56 bca->min_frac = 0.002;
57 bca->min_support = 1;
58 bca->per_sample_flt = 0;
59 bca->npos = 100;
60 bca->ref_pos = malloc(bca->npos*sizeof(int));
61 bca->alt_pos = malloc(bca->npos*sizeof(int));
62 bca->nqual = 60;
63 bca->ref_mq = malloc(bca->nqual*sizeof(int));
64 bca->alt_mq = malloc(bca->nqual*sizeof(int));
65 bca->ref_bq = malloc(bca->nqual*sizeof(int));
66 bca->alt_bq = malloc(bca->nqual*sizeof(int));
67 bca->fwd_mqs = malloc(bca->nqual*sizeof(int));
68 bca->rev_mqs = malloc(bca->nqual*sizeof(int));
69 return bca;
70 }
71
bcf_call_destroy(bcf_callaux_t * bca)72 void bcf_call_destroy(bcf_callaux_t *bca)
73 {
74 if (bca == 0) return;
75 errmod_destroy(bca->e);
76 if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; }
77 free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq);
78 free(bca->fwd_mqs); free(bca->rev_mqs);
79 bca->nqual = 0;
80 free(bca->bases); free(bca->inscns); free(bca);
81 }
82
83 // position in the sequence with respect to the aligned part of the read
get_position(const bam_pileup1_t * p,int * len)84 static int get_position(const bam_pileup1_t *p, int *len)
85 {
86 int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
87 for (icig=0; icig<p->b->core.n_cigar; icig++)
88 {
89 int cig = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK;
90 int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT;
91 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
92 {
93 n_tot_bases += ncig;
94 iread += ncig;
95 continue;
96 }
97 if ( cig==BAM_CINS )
98 {
99 n_tot_bases += ncig;
100 iread += ncig;
101 continue;
102 }
103 if ( cig==BAM_CSOFT_CLIP )
104 {
105 iread += ncig;
106 if ( iread<=p->qpos ) edist -= ncig;
107 continue;
108 }
109 if ( cig==BAM_CDEL ) continue;
110 if ( cig==BAM_CHARD_CLIP ) continue;
111 if ( cig==BAM_CPAD ) continue;
112 if ( cig==BAM_CREF_SKIP ) continue;
113 fprintf(samtools_stderr,"todo: cigar %d\n", cig);
114 assert(0);
115 }
116 *len = n_tot_bases;
117 return edist;
118 }
119
bcf_callaux_clean(bcf_callaux_t * bca,bcf_call_t * call)120 void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
121 {
122 memset(bca->ref_pos,0,sizeof(int)*bca->npos);
123 memset(bca->alt_pos,0,sizeof(int)*bca->npos);
124 memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
125 memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
126 memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
127 memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
128 memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
129 memset(bca->rev_mqs,0,sizeof(int)*bca->nqual);
130 if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
131 if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
132 }
133
134 /*
135 Notes:
136 - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies
137 which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation.
138 Later it's used for multiallelic calling by bcftools -m
139 - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
140 */
141 /*
142 * This function is called once for each sample.
143 * _n is number of pilesups pl contributing reads to this sample
144 * pl is pointer to array of _n pileups (one pileup per read)
145 * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
146 * bca is the settings to perform calls across all samples
147 * r is the returned value of the call
148 */
bcf_call_glfgen(int _n,const bam_pileup1_t * pl,int ref_base,bcf_callaux_t * bca,bcf_callret1_t * r)149 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
150 {
151 int i, n, ref4, is_indel, ori_depth = 0;
152
153 // clean from previous run
154 r->ori_depth = 0;
155 r->mq0 = 0;
156 memset(r->qsum,0,sizeof(float)*4);
157 memset(r->anno,0,sizeof(double)*16);
158 memset(r->p,0,sizeof(float)*25);
159
160 if (ref_base >= 0) {
161 ref4 = seq_nt16_int[ref_base];
162 is_indel = 0;
163 } else ref4 = 4, is_indel = 1;
164 if (_n == 0) return -1;
165 // enlarge the bases array if necessary
166 if (bca->max_bases < _n) {
167 bca->max_bases = _n;
168 kroundup32(bca->max_bases);
169 bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
170 }
171 // fill the bases array
172 for (i = n = 0; i < _n; ++i) {
173 const bam_pileup1_t *p = pl + i;
174 int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
175 // set base
176 if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
177 ++ori_depth;
178 mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
179 if ( !mapQ ) r->mq0++;
180 baseQ = q = is_indel? p->aux&0xff : (int)bam_get_qual(p->b)[p->qpos]; // base/indel quality
181 seqQ = is_indel? (p->aux>>8&0xff) : 99;
182 if (q < bca->min_baseQ) continue;
183 if (q > seqQ) q = seqQ;
184 mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
185 if (q > mapQ) q = mapQ;
186 if (q > 63) q = 63;
187 if (q < 4) q = 4; // MQ=0 reads count as BQ=4
188 if (!is_indel) {
189 b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
190 b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
191 is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
192 } else {
193 b = p->aux>>16&0x3f;
194 is_diff = (b != 0);
195 }
196 bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
197 // collect annotations
198 if (b < 4)
199 {
200 r->qsum[b] += q;
201 if ( r->ADF )
202 {
203 if ( bam_is_rev(p->b) )
204 r->ADR[b]++;
205 else
206 r->ADF[b]++;
207 }
208 }
209 ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)];
210 min_dist = p->b->core.l_qseq - 1 - p->qpos;
211 if (min_dist > p->qpos) min_dist = p->qpos;
212 if (min_dist > CAP_DIST) min_dist = CAP_DIST;
213 r->anno[1<<2|is_diff<<1|0] += baseQ;
214 r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
215 r->anno[2<<2|is_diff<<1|0] += mapQ;
216 r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
217 r->anno[3<<2|is_diff<<1|0] += min_dist;
218 r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
219
220 // collect for bias tests
221 if ( baseQ > 59 ) baseQ = 59;
222 if ( mapQ > 59 ) mapQ = 59;
223 int len, pos = get_position(p, &len);
224 int epos = (double)pos/(len+1) * bca->npos;
225 int ibq = baseQ/60. * bca->nqual;
226 int imq = mapQ/60. * bca->nqual;
227 if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++;
228 else bca->fwd_mqs[imq]++;
229 if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
230 {
231 bca->ref_pos[epos]++;
232 bca->ref_bq[ibq]++;
233 bca->ref_mq[imq]++;
234 }
235 else
236 {
237 bca->alt_pos[epos]++;
238 bca->alt_bq[ibq]++;
239 bca->alt_mq[imq]++;
240 }
241 }
242 r->ori_depth = ori_depth;
243 // glfgen
244 errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
245 return n;
246 }
247
248
249 /*
250 * calc_vdb() - returns value between zero (most biased) and one (no bias)
251 * on success, or HUGE_VAL when VDB cannot be calculated because
252 * of insufficient depth (<2x)
253 *
254 * Variant Distance Bias tests if the variant bases are positioned within the
255 * reads with sufficient randomness. Unlike other tests, it looks only at
256 * variant reads and therefore gives different kind of information than Read
257 * Position Bias for instance. VDB was developed for detecting artefacts in
258 * RNA-seq calls where reads from spliced transcripts span splice site
259 * boundaries. The current implementation differs somewhat from the original
260 * version described in supplementary material of PMID:22524474, but the idea
261 * remains the same. (Here the random variable tested is the average distance
262 * from the averaged position, not the average pairwise distance.)
263 *
264 * For coverage of 2x, the calculation is exact but is approximated for the
265 * rest. The result is most accurate between 4-200x. For 3x or >200x, the
266 * reported values are slightly more favourable than those of a true random
267 * distribution.
268 */
calc_vdb(int * pos,int npos)269 double calc_vdb(int *pos, int npos)
270 {
271 // Note well: the parameters were obtained by fitting to simulated data of
272 // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen().
273 const int readlen = 100;
274 assert( npos==readlen );
275
276 #define nparam 15
277 const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5},
278 {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8},
279 {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7},
280 {200,0.7,23.7} };
281
282 int i, dp = 0;
283 float mean_pos = 0, mean_diff = 0;
284 for (i=0; i<npos; i++)
285 {
286 if ( !pos[i] ) continue;
287 dp += pos[i];
288 mean_pos += pos[i]*i;
289 }
290 if ( dp<2 ) return HUGE_VAL; // one or zero reads can be placed anywhere
291
292 mean_pos /= dp;
293 for (i=0; i<npos; i++)
294 {
295 if ( !pos[i] ) continue;
296 mean_diff += pos[i] * fabs(i - mean_pos);
297 }
298 mean_diff /= dp;
299
300 int ipos = mean_diff; // tuned for float-to-int implicit conversion
301 if ( dp==2 )
302 return (2*readlen-2*(ipos+1)-1)*(ipos+1)/(readlen-1)/(readlen*0.5);
303
304 if ( dp>=200 )
305 i = nparam; // shortcut for big depths
306 else
307 {
308 for (i=0; i<nparam; i++)
309 if ( param[i][0]>=dp ) break;
310 }
311 float pshift, pscale;
312 if ( i==nparam )
313 {
314 // the depth is too high, go with 200x
315 pscale = param[nparam-1][1];
316 pshift = param[nparam-1][2];
317 }
318 else if ( i>0 && param[i][0]!=dp )
319 {
320 // linear interpolation of parameters
321 pscale = (param[i-1][1] + param[i][1])*0.5;
322 pshift = (param[i-1][2] + param[i][2])*0.5;
323 }
324 else
325 {
326 pscale = param[i][1];
327 pshift = param[i][2];
328 }
329 return 0.5*kf_erfc(-(mean_diff-pshift)*pscale);
330 }
331
calc_chisq_bias(int * a,int * b,int n)332 double calc_chisq_bias(int *a, int *b, int n)
333 {
334 int na = 0, nb = 0, i, ndf = n;
335 for (i=0; i<n; i++) na += a[i];
336 for (i=0; i<n; i++) nb += b[i];
337 if ( !na || !nb ) return HUGE_VAL;
338
339 double chisq = 0;
340 for (i=0; i<n; i++)
341 {
342 if ( !a[i] && !b[i] ) ndf--;
343 else
344 {
345 double tmp = a[i] - b[i];
346 chisq += tmp*tmp/(a[i]+b[i]);
347 }
348 }
349 /*
350 kf_gammq: incomplete gamma function Q(a,x) = 1 - P(a,x) = Gamma(a,x)/Gamma(a)
351 1 if the distributions are identical, 0 if very different
352 */
353 double prob = kf_gammaq(0.5*ndf, 0.5*chisq);
354 return prob;
355 }
356
mann_whitney_1947(int n,int m,int U)357 double mann_whitney_1947(int n, int m, int U)
358 {
359 if (U<0) return 0;
360 if (n==0||m==0) return U==0 ? 1 : 0;
361 return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U);
362 }
363
mann_whitney_1947_cdf(int n,int m,int U)364 double mann_whitney_1947_cdf(int n, int m, int U)
365 {
366 int i;
367 double sum = 0;
368 for (i=0; i<=U; i++)
369 sum += mann_whitney_1947(n,m,i);
370 return sum;
371 }
372
calc_mwu_bias_cdf(int * a,int * b,int n)373 double calc_mwu_bias_cdf(int *a, int *b, int n)
374 {
375 int na = 0, nb = 0, i;
376 double U = 0;
377 for (i=0; i<n; i++)
378 {
379 na += a[i];
380 U += a[i] * (nb + b[i]*0.5);
381 nb += b[i];
382 }
383 if ( !na || !nb ) return HUGE_VAL;
384
385 // Always work with the smaller U
386 double U_min = ((double)na * nb) - U;
387 if ( U < U_min ) U_min = U;
388
389 if ( na==1 ) return 2.0 * (floor(U_min)+1) / (nb+1);
390 if ( nb==1 ) return 2.0 * (floor(U_min)+1) / (na+1);
391
392 // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
393 if ( na>=8 || nb>=8 )
394 {
395 double mean = ((double)na*nb)*0.5;
396 double var2 = ((double)na*nb)*(na+nb+1)/12.0;
397 double z = (U_min - mean)/sqrt(2*var2); // z is N(0,1)
398 return 2.0 - kf_erfc(z); // which is 1 + erf(z)
399 }
400
401 // Exact calculation
402 double pval = 2*mann_whitney_1947_cdf(na,nb,U_min);
403 return pval>1 ? 1 : pval;
404 }
405
calc_mwu_bias(int * a,int * b,int n)406 double calc_mwu_bias(int *a, int *b, int n)
407 {
408 int na = 0, nb = 0, i;
409 double U = 0;
410 for (i=0; i<n; i++)
411 {
412 na += a[i];
413 U += a[i] * (nb + b[i]*0.5);
414 nb += b[i];
415 }
416 if ( !na || !nb ) return HUGE_VAL;
417 if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely
418
419 double mean = ((double)na*nb)*0.5;
420 if ( na==2 || nb==2 )
421 {
422 // Linear approximation
423 return U>mean ? (2.0*mean-U)/mean : U/mean;
424 }
425 double var2 = ((double)na*nb)*(na+nb+1)/12.0;
426 if ( na>=8 || nb>=8 )
427 {
428 // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
429 return exp(-0.5*(U-mean)*(U-mean)/var2);
430 }
431
432 // Exact calculation
433 return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
434 }
435
logsumexp2(double a,double b)436 static inline double logsumexp2(double a, double b)
437 {
438 if ( a>b )
439 return log(1 + exp(b-a)) + a;
440 else
441 return log(1 + exp(a-b)) + b;
442 }
443
calc_SegBias(const bcf_callret1_t * bcr,bcf_call_t * call)444 void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call)
445 {
446 call->seg_bias = HUGE_VAL;
447 if ( !bcr ) return;
448
449 int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads
450 if ( !nr ) return;
451
452 int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n; // average depth
453 double M = floor((double)nr / avg_dp + 0.5); // an approximate number of variants samples in the population
454 if ( M>call->n ) M = call->n; // clamp M at the number of samples
455 else if ( M==0 ) M = 1;
456 double f = M / 2. / call->n; // allele frequency
457 double p = (double) nr / call->n; // number of variant reads per sample expected if variant not real (poisson)
458 double q = (double) nr / M; // number of variant reads per sample expected if variant is real (poisson)
459 double sum = 0;
460 const double log2 = log(2.0);
461
462 // fprintf(samtools_stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp);
463 int i;
464 for (i=0; i<call->n; i++)
465 {
466 int oi = bcr[i].anno[2] + bcr[i].anno[3]; // observed number of non-ref reads
467 double tmp;
468 if ( oi )
469 {
470 // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow
471 tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q);
472 tmp += log(f) + oi*log(q/p) - q + p;
473 }
474 else
475 tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p;
476 sum += tmp;
477 // fprintf(samtools_stderr,"oi=%d %e\n", oi,tmp);
478 }
479 call->seg_bias = sum;
480 }
481
482 /**
483 * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles
484 * @n: number of samples
485 * @calls: each sample's calls
486 * @bca: auxiliary data structure for holding temporary values
487 * @ref_base: the reference base
488 * @call: filled with the annotations
489 *
490 * Combines calls across the various samples being studied
491 * 1. For each allele at each base across all samples the quality is summed so
492 * you end up with a set of quality sums for each allele present 2. The quality
493 * sums are sorted.
494 * 3. Using the sorted quality sums we now create the allele ordering array
495 * A\subN. This is done by doing the following:
496 * a) If the reference allele is known it always comes first, otherwise N
497 * comes first.
498 * b) Then the rest of the alleles are output in descending order of quality
499 * sum (which we already know the qsum array was sorted). Any allelles with
500 * qsum 0 will be excluded.
501 * 4. Using the allele ordering array we create the genotype ordering array.
502 * In the worst case with an unknown reference this will be: A0/A0 A1/A0 A1/A1
503 * A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4
504 * 5. The genotype ordering array is then used to extract data from the error
505 * model 5*5 matrix and is used to produce a Phread likelihood array for each
506 * sample.
507 */
bcf_call_combine(int n,const bcf_callret1_t * calls,bcf_callaux_t * bca,int ref_base,bcf_call_t * call)508 int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
509 {
510 int ref4, i, j;
511 float qsum[5] = {0,0,0,0,0};
512 if (ref_base >= 0) {
513 call->ori_ref = ref4 = seq_nt16_int[ref_base];
514 if (ref4 > 4) ref4 = 4;
515 } else call->ori_ref = -1, ref4 = 0;
516
517 // calculate qsum, this is done by summing normalized qsum across all samples,
518 // to account for differences in coverage
519 for (i = 0; i < n; ++i)
520 {
521 float sum = 0;
522 for (j = 0; j < 4; ++j) sum += calls[i].qsum[j];
523 if ( sum )
524 for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum;
525 }
526
527 // sort qsum in ascending order (insertion sort)
528 float *ptr[5], *tmp;
529 for (i=0; i<5; i++) ptr[i] = &qsum[i];
530 for (i=1; i<4; i++)
531 for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--)
532 tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp;
533
534 // Set the reference allele and alternative allele(s)
535 for (i=0; i<5; i++) call->a[i] = -1;
536 for (i=0; i<5; i++) call->qsum[i] = 0;
537 call->unseen = -1;
538 call->a[0] = ref4;
539 for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering
540 {
541 int ipos = ptr[i] - qsum; // position in sorted qsum array
542 if ( ipos==ref4 )
543 call->qsum[0] = qsum[ipos]; // REF's qsum
544 else
545 {
546 if ( !qsum[ipos] ) break; // qsum is 0, this and consequent alleles are not seen in the pileup
547 call->qsum[j] = qsum[ipos];
548 call->a[j++] = ipos;
549 }
550 }
551 if (ref_base >= 0)
552 {
553 // for SNPs, find the "unseen" base
554 if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
555 call->unseen = j, call->a[j++] = ptr[i] - qsum;
556 call->n_alleles = j;
557 }
558 else
559 {
560 call->n_alleles = j;
561 if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
562 }
563 /*
564 * Set the phread likelihood array (call->PL) This array is 15 entries long
565 * for each sample because that is size of an upper or lower triangle of a
566 * worst case 5x5 matrix of possible genotypes. This worst case matrix will
567 * occur when all 4 possible alleles are present and the reference allele
568 * is unknown. The sides of the matrix will correspond to the reference
569 * allele (if known) followed by the alleles present in descending order of
570 * quality sum
571 */
572 {
573 int x, g[15], z;
574 double sum_min = 0.;
575 x = call->n_alleles * (call->n_alleles + 1) / 2;
576 // get the possible genotypes
577 // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix
578 for (i = z = 0; i < call->n_alleles; ++i) {
579 for (j = 0; j <= i; ++j) {
580 g[z++] = call->a[j] * 5 + call->a[i];
581 }
582 }
583 // for each sample calculate the PL
584 for (i = 0; i < n; ++i)
585 {
586 int32_t *PL = call->PL + x * i;
587 const bcf_callret1_t *r = calls + i;
588 float min = FLT_MAX;
589 for (j = 0; j < x; ++j) {
590 if (min > r->p[g[j]]) min = r->p[g[j]];
591 }
592 sum_min += min;
593 for (j = 0; j < x; ++j) {
594 int y;
595 y = (int)(r->p[g[j]] - min + .499);
596 if (y > 255) y = 255;
597 PL[j] = y;
598 }
599 }
600 if ( call->DP4 )
601 {
602 for (i=0; i<n; i++)
603 {
604 call->DP4[4*i] = calls[i].anno[0];
605 call->DP4[4*i+1] = calls[i].anno[1];
606 call->DP4[4*i+2] = calls[i].anno[2];
607 call->DP4[4*i+3] = calls[i].anno[3];
608 }
609 }
610 if ( call->ADF )
611 {
612 assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well
613
614 // reorder ADR,ADF to match the allele ordering at this site
615 int32_t tmp[B2B_MAX_ALLELES];
616 int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES;
617 int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES;
618 int32_t *adr_tot = call->ADR; // the first bin stores total counts per site
619 int32_t *adf_tot = call->ADF;
620 for (i=0; i<n; i++)
621 {
622 for (j=0; j<call->n_alleles; j++)
623 {
624 tmp[j] = adr[ call->a[j] ];
625 adr_tot[j] += tmp[j];
626 }
627 for (j=0; j<call->n_alleles; j++) adr_out[j] = tmp[j];
628 for (j=0; j<call->n_alleles; j++)
629 {
630 tmp[j] = adf[ call->a[j] ];
631 adf_tot[j] += tmp[j];
632 }
633 for (j=0; j<call->n_alleles; j++) adf_out[j] = tmp[j];
634 adf_out += call->n_alleles;
635 adr_out += call->n_alleles;
636 adr += B2B_MAX_ALLELES;
637 adf += B2B_MAX_ALLELES;
638 }
639 }
640
641 // if (ref_base < 0) fprintf(samtools_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
642 call->shift = (int)(sum_min + .499);
643 }
644 // combine annotations
645 memset(call->anno, 0, 16 * sizeof(double));
646 call->ori_depth = 0;
647 call->depth = 0;
648 call->mq0 = 0;
649 for (i = 0; i < n; ++i) {
650 call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3];
651 call->ori_depth += calls[i].ori_depth;
652 call->mq0 += calls[i].mq0;
653 for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
654 }
655
656 calc_SegBias(calls, call);
657
658 // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
659 // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
660 // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
661
662 call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos);
663 call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual);
664 call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual);
665 call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
666
667 #if CDF_MWU_TESTS
668 call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos);
669 call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual);
670 call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual);
671 call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
672 #endif
673
674 call->vdb = calc_vdb(bca->alt_pos, bca->npos);
675
676 return 0;
677 }
678
bcf_call2bcf(bcf_call_t * bc,bcf1_t * rec,bcf_callret1_t * bcr,int fmt_flag,const bcf_callaux_t * bca,const char * ref)679 int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
680 {
681 extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
682 int i, j, nals = 1;
683
684 bcf_hdr_t *hdr = bc->bcf_hdr;
685 rec->rid = bc->tid;
686 rec->pos = bc->pos;
687 rec->qual = 0;
688
689 bc->tmp.l = 0;
690 if (bc->ori_ref < 0) // indel
691 {
692 // REF
693 kputc(ref[bc->pos], &bc->tmp);
694 for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
695
696 // ALT
697 for (i=1; i<4; i++)
698 {
699 if (bc->a[i] < 0) break;
700 kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp);
701
702 if (bca->indel_types[bc->a[i]] < 0) { // deletion
703 for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
704 kputc(ref[bc->pos+1+j], &bc->tmp);
705 } else { // insertion; cannot be a reference unless a bug
706 char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
707 for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
708 kputc("ACGTN"[(int)inscns[j]], &bc->tmp);
709 for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
710 }
711 nals++;
712 }
713 }
714 else // SNP
715 {
716 kputc("ACGTN"[bc->ori_ref], &bc->tmp);
717 for (i=1; i<5; i++)
718 {
719 if (bc->a[i] < 0) break;
720 kputc(',', &bc->tmp);
721 if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
722 else kputc("ACGT"[bc->a[i]], &bc->tmp);
723 nals++;
724 }
725 }
726 bcf_update_alleles_str(hdr, rec, bc->tmp.s);
727
728 bc->tmp.l = 0;
729
730 // INFO
731 if (bc->ori_ref < 0)
732 {
733 bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
734 bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
735 bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
736 }
737 bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
738 if ( fmt_flag&B2B_INFO_ADF )
739 bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele);
740 if ( fmt_flag&B2B_INFO_ADR )
741 bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele);
742 if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) )
743 {
744 for (i=0; i<rec->n_allele; i++) bc->ADF[i] += bc->ADR[i];
745 if ( fmt_flag&B2B_INFO_AD )
746 bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele);
747 if ( fmt_flag&B2B_INFO_DPR )
748 bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele);
749 }
750
751 float tmpf[16];
752 for (i=0; i<16; i++) tmpf[i] = bc->anno[i];
753 bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
754 bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
755
756 if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
757 if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
758 if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
759 if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
760 if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
761 if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
762 #if CDF_MWU_TESTS
763 if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
764 if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
765 if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
766 if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
767 #endif
768 tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
769 bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
770
771 // FORMAT
772 rec->n_sample = bc->n;
773 bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample);
774 if ( fmt_flag&B2B_FMT_DP )
775 {
776 int32_t *ptr = (int32_t*) bc->fmt_arr;
777 for (i=0; i<bc->n; i++)
778 ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3];
779 bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample);
780 }
781 if ( fmt_flag&B2B_FMT_DV )
782 {
783 int32_t *ptr = (int32_t*) bc->fmt_arr;
784 for (i=0; i<bc->n; i++)
785 ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3];
786 bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample);
787 }
788 if ( fmt_flag&B2B_FMT_SP )
789 {
790 int32_t *ptr = (int32_t*) bc->fmt_arr;
791 for (i=0; i<bc->n; i++)
792 {
793 int fwd_ref = bc->DP4[4*i], rev_ref = bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3];
794 if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 )
795 ptr[i] = 0;
796 else
797 {
798 double left, right, two;
799 kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two);
800 int32_t x = (int)(-4.343 * log(two) + .499);
801 if (x > 255) x = 255;
802 ptr[i] = x;
803 }
804 }
805 bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample);
806 }
807 if ( fmt_flag&B2B_FMT_DP4 )
808 bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4);
809 if ( fmt_flag&B2B_FMT_ADF )
810 bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
811 if ( fmt_flag&B2B_FMT_ADR )
812 bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
813 if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) )
814 {
815 for (i=0; i<rec->n_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i];
816 if ( fmt_flag&B2B_FMT_AD )
817 bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
818 if ( fmt_flag&B2B_FMT_DPR )
819 bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
820 }
821
822 return 0;
823 }
824