1 /*  bam2bcf.c -- variant calling.
2 
3     Copyright (C) 2010-2012 Broad Institute.
4     Copyright (C) 2012-2015, 2021 Genome Research Ltd.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <math.h>
29 #include <stdint.h>
30 #include <assert.h>
31 #include <float.h>
32 #include <htslib/hts.h>
33 #include <htslib/sam.h>
34 #include <htslib/kstring.h>
35 #include <htslib/kfunc.h>
36 #include "bam2bcf.h"
37 
38 extern  void ks_introsort_uint32_t(size_t n, uint32_t a[]);
39 
40 #define CALL_DEFTHETA 0.83
41 #define DEF_MAPQ 20
42 
43 #define CAP_DIST 25
44 
bcf_call_init(double theta,int min_baseQ)45 bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
46 {
47     bcf_callaux_t *bca;
48     if (theta <= 0.) theta = CALL_DEFTHETA;
49     bca = calloc(1, sizeof(bcf_callaux_t));
50     bca->capQ = 60;
51     bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
52     bca->min_baseQ = min_baseQ;
53     bca->e = errmod_init(1. - theta);
54     bca->min_frac = 0.002;
55     bca->min_support = 1;
56     bca->per_sample_flt = 0;
57     bca->npos = 100;
58     bca->ref_pos = malloc(bca->npos*sizeof(int));
59     bca->alt_pos = malloc(bca->npos*sizeof(int));
60     bca->nqual = 60;
61     bca->ref_mq  = malloc(bca->nqual*sizeof(int));
62     bca->alt_mq  = malloc(bca->nqual*sizeof(int));
63     bca->ref_bq  = malloc(bca->nqual*sizeof(int));
64     bca->alt_bq  = malloc(bca->nqual*sizeof(int));
65     bca->fwd_mqs = malloc(bca->nqual*sizeof(int));
66     bca->rev_mqs = malloc(bca->nqual*sizeof(int));
67     return bca;
68 }
69 
bcf_call_destroy(bcf_callaux_t * bca)70 void bcf_call_destroy(bcf_callaux_t *bca)
71 {
72     if (bca == 0) return;
73     errmod_destroy(bca->e);
74     if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; }
75     free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq);
76     free(bca->fwd_mqs); free(bca->rev_mqs);
77     bca->nqual = 0;
78     free(bca->bases); free(bca->inscns); free(bca);
79 }
80 
81 // position in the sequence with respect to the aligned part of the read
get_position(const bam_pileup1_t * p,int * len)82 static int get_position(const bam_pileup1_t *p, int *len)
83 {
84     int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
85     for (icig=0; icig<p->b->core.n_cigar; icig++)
86     {
87         int cig  = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK;
88         int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT;
89         if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
90         {
91             n_tot_bases += ncig;
92             iread += ncig;
93             continue;
94         }
95         if ( cig==BAM_CINS )
96         {
97             n_tot_bases += ncig;
98             iread += ncig;
99             continue;
100         }
101         if ( cig==BAM_CSOFT_CLIP )
102         {
103             iread += ncig;
104             if ( iread<=p->qpos ) edist -= ncig;
105             continue;
106         }
107         if ( cig==BAM_CDEL ) continue;
108         if ( cig==BAM_CHARD_CLIP ) continue;
109         if ( cig==BAM_CPAD ) continue;
110         if ( cig==BAM_CREF_SKIP ) continue;
111         fprintf(stderr,"todo: cigar %d\n", cig);
112         assert(0);
113     }
114     *len = n_tot_bases;
115     return edist;
116 }
117 
bcf_callaux_clean(bcf_callaux_t * bca,bcf_call_t * call)118 void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
119 {
120     memset(bca->ref_pos,0,sizeof(int)*bca->npos);
121     memset(bca->alt_pos,0,sizeof(int)*bca->npos);
122     memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
123     memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
124     memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
125     memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
126     memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
127     memset(bca->rev_mqs,0,sizeof(int)*bca->nqual);
128     if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
129     if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
130 }
131 
132 /*
133     Notes:
134     - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies
135         which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation.
136         Later it's used for multiallelic calling by bcftools -m
137     - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
138  */
139 /*
140  * This function is called once for each sample.
141  * _n is number of pilesups pl contributing reads to this sample
142  * pl is pointer to array of _n pileups (one pileup per read)
143  * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
144  * bca is the settings to perform calls across all samples
145  * r is the returned value of the call
146  */
bcf_call_glfgen(int _n,const bam_pileup1_t * pl,int ref_base,bcf_callaux_t * bca,bcf_callret1_t * r)147 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
148 {
149     int i, n, ref4, is_indel, ori_depth = 0;
150 
151     // clean from previous run
152     r->ori_depth = 0;
153     r->mq0 = 0;
154     memset(r->qsum,0,sizeof(float)*4);
155     memset(r->anno,0,sizeof(double)*16);
156     memset(r->p,0,sizeof(float)*25);
157 
158     if (ref_base >= 0) {
159         ref4 = seq_nt16_int[ref_base];
160         is_indel = 0;
161     } else ref4 = 4, is_indel = 1;
162     if (_n == 0) return -1;
163     // enlarge the bases array if necessary
164     if (bca->max_bases < _n) {
165         bca->max_bases = _n;
166         kroundup32(bca->max_bases);
167         bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
168     }
169     // fill the bases array
170     for (i = n = 0; i < _n; ++i) {
171         const bam_pileup1_t *p = pl + i;
172         int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
173         // set base
174         if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
175         ++ori_depth;
176         mapQ  = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
177         if ( !mapQ ) r->mq0++;
178         baseQ = q = is_indel? p->aux&0xff : (int)bam_get_qual(p->b)[p->qpos]; // base/indel quality
179         seqQ = is_indel? (p->aux>>8&0xff) : 99;
180         if (q < bca->min_baseQ) continue;
181         if (q > seqQ) q = seqQ;
182         mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
183         if (q > mapQ) q = mapQ;
184         if (q > 63) q = 63;
185         if (q < 4) q = 4;       // MQ=0 reads count as BQ=4
186         if (!is_indel) {
187             b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
188             b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
189             is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
190         } else {
191             b = p->aux>>16&0x3f;
192             is_diff = (b != 0);
193         }
194         bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
195         // collect annotations
196         if (b < 4)
197         {
198             r->qsum[b] += q;
199             if ( r->ADF )
200             {
201                 if ( bam_is_rev(p->b) )
202                     r->ADR[b]++;
203                 else
204                     r->ADF[b]++;
205             }
206         }
207         ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)];
208         min_dist = p->b->core.l_qseq - 1 - p->qpos;
209         if (min_dist > p->qpos) min_dist = p->qpos;
210         if (min_dist > CAP_DIST) min_dist = CAP_DIST;
211         r->anno[1<<2|is_diff<<1|0] += baseQ;
212         r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
213         r->anno[2<<2|is_diff<<1|0] += mapQ;
214         r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
215         r->anno[3<<2|is_diff<<1|0] += min_dist;
216         r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
217 
218         // collect for bias tests
219         if ( baseQ > 59 ) baseQ = 59;
220         if ( mapQ > 59 ) mapQ = 59;
221         int len, pos = get_position(p, &len);
222         int epos = (double)pos/(len+1) * bca->npos;
223         int ibq  = baseQ/60. * bca->nqual;
224         int imq  = mapQ/60. * bca->nqual;
225         if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++;
226         else bca->fwd_mqs[imq]++;
227         if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
228         {
229             bca->ref_pos[epos]++;
230             bca->ref_bq[ibq]++;
231             bca->ref_mq[imq]++;
232         }
233         else
234         {
235             bca->alt_pos[epos]++;
236             bca->alt_bq[ibq]++;
237             bca->alt_mq[imq]++;
238         }
239     }
240     r->ori_depth = ori_depth;
241     // glfgen
242     errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
243     return n;
244 }
245 
246 
247 /*
248  *  calc_vdb() - returns value between zero (most biased) and one (no bias)
249  *               on success, or HUGE_VAL when VDB cannot be calculated because
250  *               of insufficient depth (<2x)
251  *
252  *  Variant Distance Bias tests if the variant bases are positioned within the
253  *  reads with sufficient randomness. Unlike other tests, it looks only at
254  *  variant reads and therefore gives different kind of information than Read
255  *  Position Bias for instance. VDB was developed for detecting artefacts in
256  *  RNA-seq calls where reads from spliced transcripts span splice site
257  *  boundaries.  The current implementation differs somewhat from the original
258  *  version described in supplementary material of PMID:22524474, but the idea
259  *  remains the same. (Here the random variable tested is the average distance
260  *  from the averaged position, not the average pairwise distance.)
261  *
262  *  For coverage of 2x, the calculation is exact but is approximated for the
263  *  rest. The result is most accurate between 4-200x. For 3x or >200x, the
264  *  reported values are slightly more favourable than those of a true random
265  *  distribution.
266  */
calc_vdb(int * pos,int npos)267 double calc_vdb(int *pos, int npos)
268 {
269     // Note well: the parameters were obtained by fitting to simulated data of
270     // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen().
271     const int readlen = 100;
272     assert( npos==readlen );
273 
274     #define nparam 15
275     const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5},
276         {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8},
277         {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7},
278         {200,0.7,23.7} };
279 
280     int i, dp = 0;
281     float mean_pos = 0, mean_diff = 0;
282     for (i=0; i<npos; i++)
283     {
284         if ( !pos[i] ) continue;
285         dp += pos[i];
286         mean_pos += pos[i]*i;
287     }
288     if ( dp<2 ) return HUGE_VAL;     // one or zero reads can be placed anywhere
289 
290     mean_pos /= dp;
291     for (i=0; i<npos; i++)
292     {
293         if ( !pos[i] ) continue;
294         mean_diff += pos[i] * fabs(i - mean_pos);
295     }
296     mean_diff /= dp;
297 
298     int ipos = mean_diff;   // tuned for float-to-int implicit conversion
299     if ( dp==2 )
300         return (2*readlen-2*(ipos+1)-1)*(ipos+1)/(readlen-1)/(readlen*0.5);
301 
302     if ( dp>=200 )
303         i = nparam; // shortcut for big depths
304     else
305     {
306         for (i=0; i<nparam; i++)
307             if ( param[i][0]>=dp ) break;
308     }
309     float pshift, pscale;
310     if ( i==nparam )
311     {
312         // the depth is too high, go with 200x
313         pscale = param[nparam-1][1];
314         pshift = param[nparam-1][2];
315     }
316     else if ( i>0 && param[i][0]!=dp )
317     {
318         // linear interpolation of parameters
319         pscale = (param[i-1][1] + param[i][1])*0.5;
320         pshift = (param[i-1][2] + param[i][2])*0.5;
321     }
322     else
323     {
324         pscale = param[i][1];
325         pshift = param[i][2];
326     }
327     return 0.5*kf_erfc(-(mean_diff-pshift)*pscale);
328 }
329 
calc_chisq_bias(int * a,int * b,int n)330 double calc_chisq_bias(int *a, int *b, int n)
331 {
332     int na = 0, nb = 0, i, ndf = n;
333     for (i=0; i<n; i++) na += a[i];
334     for (i=0; i<n; i++) nb += b[i];
335     if ( !na || !nb ) return HUGE_VAL;
336 
337     double chisq = 0;
338     for (i=0; i<n; i++)
339     {
340         if ( !a[i] && !b[i] ) ndf--;
341         else
342         {
343             double tmp = a[i] - b[i];
344             chisq += tmp*tmp/(a[i]+b[i]);
345         }
346     }
347     /*
348         kf_gammq: incomplete gamma function Q(a,x) = 1 - P(a,x) = Gamma(a,x)/Gamma(a)
349         1 if the distributions are identical, 0 if very different
350     */
351     double prob = kf_gammaq(0.5*ndf, 0.5*chisq);
352     return prob;
353 }
354 
mann_whitney_1947(int n,int m,int U)355 double mann_whitney_1947(int n, int m, int U)
356 {
357     if (U<0) return 0;
358     if (n==0||m==0) return U==0 ? 1 : 0;
359     return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U);
360 }
361 
mann_whitney_1947_cdf(int n,int m,int U)362 double mann_whitney_1947_cdf(int n, int m, int U)
363 {
364     int i;
365     double sum = 0;
366     for (i=0; i<=U; i++)
367         sum += mann_whitney_1947(n,m,i);
368     return sum;
369 }
370 
calc_mwu_bias_cdf(int * a,int * b,int n)371 double calc_mwu_bias_cdf(int *a, int *b, int n)
372 {
373     int na = 0, nb = 0, i;
374     double U = 0;
375     for (i=0; i<n; i++)
376     {
377         na += a[i];
378         U  += a[i] * (nb + b[i]*0.5);
379         nb += b[i];
380     }
381     if ( !na || !nb ) return HUGE_VAL;
382 
383     // Always work with the smaller U
384     double U_min = ((double)na * nb) - U;
385     if ( U < U_min ) U_min = U;
386 
387     if ( na==1 ) return 2.0 * (floor(U_min)+1) / (nb+1);
388     if ( nb==1 ) return 2.0 * (floor(U_min)+1) / (na+1);
389 
390     // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
391     if ( na>=8 || nb>=8 )
392     {
393         double mean = ((double)na*nb)*0.5;
394         double var2 = ((double)na*nb)*(na+nb+1)/12.0;
395         double z = (U_min - mean)/sqrt(2*var2);   // z is N(0,1)
396         return 2.0 - kf_erfc(z);  // which is 1 + erf(z)
397     }
398 
399     // Exact calculation
400     double pval = 2*mann_whitney_1947_cdf(na,nb,U_min);
401     return pval>1 ? 1 : pval;
402 }
403 
calc_mwu_bias(int * a,int * b,int n)404 double calc_mwu_bias(int *a, int *b, int n)
405 {
406     int na = 0, nb = 0, i;
407     double U = 0;
408     for (i=0; i<n; i++)
409     {
410         na += a[i];
411         U  += a[i] * (nb + b[i]*0.5);
412         nb += b[i];
413     }
414     if ( !na || !nb ) return HUGE_VAL;
415     if ( na==1 || nb==1 ) return 1.0;       // Flat probability, all U values are equally likely
416 
417     double mean = ((double)na*nb)*0.5;
418     if ( na==2 || nb==2 )
419     {
420         // Linear approximation
421         return U>mean ? (2.0*mean-U)/mean : U/mean;
422     }
423     double var2 = ((double)na*nb)*(na+nb+1)/12.0;
424     if ( na>=8 || nb>=8 )
425     {
426         // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
427         return exp(-0.5*(U-mean)*(U-mean)/var2);
428     }
429 
430     // Exact calculation
431     return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
432 }
433 
logsumexp2(double a,double b)434 static inline double logsumexp2(double a, double b)
435 {
436     if ( a>b )
437         return log(1 + exp(b-a)) + a;
438     else
439         return log(1 + exp(a-b)) + b;
440 }
441 
calc_SegBias(const bcf_callret1_t * bcr,bcf_call_t * call)442 void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call)
443 {
444     call->seg_bias = HUGE_VAL;
445     if ( !bcr ) return;
446 
447     int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads
448     if ( !nr ) return;
449 
450     int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n;    // average depth
451     double M   = floor((double)nr / avg_dp + 0.5);   // an approximate number of variants samples in the population
452     if ( M>call->n ) M = call->n;       // clamp M at the number of samples
453     else if ( M==0 ) M = 1;
454     double f = M / 2. / call->n;        // allele frequency
455     double p = (double) nr / call->n;   // number of variant reads per sample expected if variant not real (poisson)
456     double q = (double) nr / M;         // number of variant reads per sample expected if variant is real (poisson)
457     double sum = 0;
458     const double log2 = log(2.0);
459 
460     // fprintf(stderr,"M=%.1f  p=%e q=%e f=%f  dp=%d\n",M,p,q,f,avg_dp);
461     int i;
462     for (i=0; i<call->n; i++)
463     {
464         int oi = bcr[i].anno[2] + bcr[i].anno[3];       // observed number of non-ref reads
465         double tmp;
466         if ( oi )
467         {
468             // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow
469             tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q);
470             tmp += log(f) + oi*log(q/p) - q + p;
471         }
472         else
473             tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p;
474         sum += tmp;
475         // fprintf(stderr,"oi=%d %e\n", oi,tmp);
476     }
477     call->seg_bias = sum;
478 }
479 
480 /**
481  *  bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles
482  *  @n:         number of samples
483  *  @calls:     each sample's calls
484  *  @bca:       auxiliary data structure for holding temporary values
485  *  @ref_base:  the reference base
486  *  @call:      filled with the annotations
487  *
488  *  Combines calls across the various samples being studied
489  *  1. For each allele at each base across all samples the quality is summed so
490  *     you end up with a set of quality sums for each allele present 2. The quality
491  *     sums are sorted.
492  *  3. Using the sorted quality sums we now create the allele ordering array
493  *     A\subN. This is done by doing the following:
494  *     a) If the reference allele is known it always comes first, otherwise N
495  *        comes first.
496  *     b) Then the rest of the alleles are output in descending order of quality
497  *        sum (which we already know the qsum array was sorted).  Any allelles with
498  *        qsum 0 will be excluded.
499  *  4. Using the allele ordering array we create the genotype ordering array.
500  *     In the worst case with an unknown reference this will be:  A0/A0 A1/A0 A1/A1
501  *     A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4
502  *  5. The genotype ordering array is then used to extract data from the error
503  *     model 5*5 matrix and is used to produce a Phread likelihood array for each
504  *     sample.
505  */
bcf_call_combine(int n,const bcf_callret1_t * calls,bcf_callaux_t * bca,int ref_base,bcf_call_t * call)506 int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
507 {
508     int ref4, i, j;
509     float qsum[5] = {0,0,0,0,0};
510     if (ref_base >= 0) {
511         call->ori_ref = ref4 = seq_nt16_int[ref_base];
512         if (ref4 > 4) ref4 = 4;
513     } else call->ori_ref = -1, ref4 = 0;
514 
515     // calculate qsum, this is done by summing normalized qsum across all samples,
516     // to account for differences in coverage
517     for (i = 0; i < n; ++i)
518     {
519         float sum = 0;
520         for (j = 0; j < 4; ++j) sum += calls[i].qsum[j];
521         if ( sum )
522             for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum;
523     }
524 
525     // sort qsum in ascending order (insertion sort)
526     float *ptr[5], *tmp;
527     for (i=0; i<5; i++) ptr[i] = &qsum[i];
528     for (i=1; i<4; i++)
529         for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--)
530             tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp;
531 
532     // Set the reference allele and alternative allele(s)
533     for (i=0; i<5; i++) call->a[i] = -1;
534     for (i=0; i<5; i++) call->qsum[i] = 0;
535     call->unseen = -1;
536     call->a[0] = ref4;
537     for (i=3, j=1; i>=0; i--)   // i: alleles sorted by QS; j, a[j]: output allele ordering
538     {
539         int ipos = ptr[i] - qsum;   // position in sorted qsum array
540         if ( ipos==ref4 )
541             call->qsum[0] = qsum[ipos];    // REF's qsum
542         else
543         {
544             if ( !qsum[ipos] ) break;       // qsum is 0, this and consequent alleles are not seen in the pileup
545             call->qsum[j] = qsum[ipos];
546             call->a[j++]  = ipos;
547         }
548     }
549     if (ref_base >= 0)
550     {
551         // for SNPs, find the "unseen" base
552         if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
553             call->unseen = j, call->a[j++] = ptr[i] - qsum;
554         call->n_alleles = j;
555     }
556     else
557     {
558         call->n_alleles = j;
559         if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
560     }
561     /*
562      * Set the phread likelihood array (call->PL) This array is 15 entries long
563      * for each sample because that is size of an upper or lower triangle of a
564      * worst case 5x5 matrix of possible genotypes. This worst case matrix will
565      * occur when all 4 possible alleles are present and the reference allele
566      * is unknown.  The sides of the matrix will correspond to the reference
567      * allele (if known) followed by the alleles present in descending order of
568      * quality sum
569      */
570     {
571         int x, g[15], z;
572         double sum_min = 0.;
573         x = call->n_alleles * (call->n_alleles + 1) / 2;
574         // get the possible genotypes
575         // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix
576         for (i = z = 0; i < call->n_alleles; ++i) {
577             for (j = 0; j <= i; ++j) {
578                 g[z++] = call->a[j] * 5 + call->a[i];
579             }
580         }
581         // for each sample calculate the PL
582         for (i = 0; i < n; ++i)
583         {
584             int32_t *PL = call->PL + x * i;
585             const bcf_callret1_t *r = calls + i;
586             float min = FLT_MAX;
587             for (j = 0; j < x; ++j) {
588                 if (min > r->p[g[j]]) min = r->p[g[j]];
589             }
590             sum_min += min;
591             for (j = 0; j < x; ++j) {
592                 int y;
593                 y = (int)(r->p[g[j]] - min + .499);
594                 if (y > 255) y = 255;
595                 PL[j] = y;
596             }
597         }
598         if ( call->DP4 )
599         {
600             for (i=0; i<n; i++)
601             {
602                 call->DP4[4*i]   = calls[i].anno[0];
603                 call->DP4[4*i+1] = calls[i].anno[1];
604                 call->DP4[4*i+2] = calls[i].anno[2];
605                 call->DP4[4*i+3] = calls[i].anno[3];
606             }
607         }
608         if ( call->ADF )
609         {
610             assert( call->n_alleles<=B2B_MAX_ALLELES );   // this is always true for SNPs and so far for indels as well
611 
612             // reorder ADR,ADF to match the allele ordering at this site
613             int32_t tmp[B2B_MAX_ALLELES];
614             int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES;
615             int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES;
616             int32_t *adr_tot = call->ADR;   // the first bin stores total counts per site
617             int32_t *adf_tot = call->ADF;
618             for (i=0; i<n; i++)
619             {
620                 for (j=0; j<call->n_alleles; j++)
621                 {
622                     tmp[j] = adr[ call->a[j] ];
623                     adr_tot[j] += tmp[j];
624                 }
625                 for (j=0; j<call->n_alleles; j++) adr_out[j] = tmp[j];
626                 for (j=0; j<call->n_alleles; j++)
627                 {
628                     tmp[j] = adf[ call->a[j] ];
629                     adf_tot[j] += tmp[j];
630                 }
631                 for (j=0; j<call->n_alleles; j++) adf_out[j] = tmp[j];
632                 adf_out += call->n_alleles;
633                 adr_out += call->n_alleles;
634                 adr += B2B_MAX_ALLELES;
635                 adf += B2B_MAX_ALLELES;
636             }
637         }
638 
639 //      if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
640         call->shift = (int)(sum_min + .499);
641     }
642     // combine annotations
643     memset(call->anno, 0, 16 * sizeof(double));
644     call->ori_depth = 0;
645     call->depth     = 0;
646     call->mq0       = 0;
647     for (i = 0; i < n; ++i) {
648         call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3];
649         call->ori_depth += calls[i].ori_depth;
650         call->mq0 += calls[i].mq0;
651         for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
652     }
653 
654     calc_SegBias(calls, call);
655 
656     // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
657     // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
658     // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
659 
660     call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos);
661     call->mwu_mq  = calc_mwu_bias(bca->ref_mq,  bca->alt_mq,  bca->nqual);
662     call->mwu_bq  = calc_mwu_bias(bca->ref_bq,  bca->alt_bq,  bca->nqual);
663     call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
664 
665 #if CDF_MWU_TESTS
666     call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos);
667     call->mwu_mq_cdf  = calc_mwu_bias_cdf(bca->ref_mq,  bca->alt_mq,  bca->nqual);
668     call->mwu_bq_cdf  = calc_mwu_bias_cdf(bca->ref_bq,  bca->alt_bq,  bca->nqual);
669     call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
670 #endif
671 
672     call->vdb = calc_vdb(bca->alt_pos, bca->npos);
673 
674     return 0;
675 }
676 
bcf_call2bcf(bcf_call_t * bc,bcf1_t * rec,bcf_callret1_t * bcr,int fmt_flag,const bcf_callaux_t * bca,const char * ref)677 int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
678 {
679     extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
680     int i, j, nals = 1;
681 
682     bcf_hdr_t *hdr = bc->bcf_hdr;
683     rec->rid  = bc->tid;
684     rec->pos  = bc->pos;
685     rec->qual = 0;
686 
687     bc->tmp.l = 0;
688     if (bc->ori_ref < 0)    // indel
689     {
690         // REF
691         kputc(ref[bc->pos], &bc->tmp);
692         for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
693 
694         // ALT
695         for (i=1; i<4; i++)
696         {
697             if (bc->a[i] < 0) break;
698             kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp);
699 
700             if (bca->indel_types[bc->a[i]] < 0) { // deletion
701                 for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
702                     kputc(ref[bc->pos+1+j], &bc->tmp);
703             } else { // insertion; cannot be a reference unless a bug
704                 char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
705                 for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
706                     kputc("ACGTN"[(int)inscns[j]], &bc->tmp);
707                 for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
708             }
709             nals++;
710         }
711     }
712     else    // SNP
713     {
714         kputc("ACGTN"[bc->ori_ref], &bc->tmp);
715         for (i=1; i<5; i++)
716         {
717             if (bc->a[i] < 0) break;
718             kputc(',', &bc->tmp);
719             if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
720             else kputc("ACGT"[bc->a[i]], &bc->tmp);
721             nals++;
722         }
723     }
724     bcf_update_alleles_str(hdr, rec, bc->tmp.s);
725 
726     bc->tmp.l = 0;
727 
728     // INFO
729     if (bc->ori_ref < 0)
730     {
731         bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
732         bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
733         bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
734     }
735     bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
736     if ( fmt_flag&B2B_INFO_ADF )
737         bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele);
738     if ( fmt_flag&B2B_INFO_ADR )
739         bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele);
740     if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) )
741     {
742         for (i=0; i<rec->n_allele; i++) bc->ADF[i] += bc->ADR[i];
743         if ( fmt_flag&B2B_INFO_AD )
744             bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele);
745         if ( fmt_flag&B2B_INFO_DPR )
746             bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele);
747     }
748 
749     float tmpf[16];
750     for (i=0; i<16; i++) tmpf[i] = bc->anno[i];
751     bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
752     bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
753 
754     if ( bc->vdb != HUGE_VAL )      bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
755     if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
756     if ( bc->mwu_pos != HUGE_VAL )  bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
757     if ( bc->mwu_mq != HUGE_VAL )   bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
758     if ( bc->mwu_mqs != HUGE_VAL )  bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
759     if ( bc->mwu_bq != HUGE_VAL )   bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
760 #if CDF_MWU_TESTS
761     if ( bc->mwu_pos_cdf != HUGE_VAL )  bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
762     if ( bc->mwu_mq_cdf != HUGE_VAL )   bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
763     if ( bc->mwu_mqs_cdf != HUGE_VAL )  bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
764     if ( bc->mwu_bq_cdf != HUGE_VAL )   bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
765 #endif
766     tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
767     bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
768 
769     // FORMAT
770     rec->n_sample = bc->n;
771     bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample);
772     if ( fmt_flag&B2B_FMT_DP )
773     {
774         int32_t *ptr = (int32_t*) bc->fmt_arr;
775         for (i=0; i<bc->n; i++)
776             ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3];
777         bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample);
778     }
779     if ( fmt_flag&B2B_FMT_DV )
780     {
781         int32_t *ptr = (int32_t*) bc->fmt_arr;
782         for (i=0; i<bc->n; i++)
783             ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3];
784         bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample);
785     }
786     if ( fmt_flag&B2B_FMT_SP )
787     {
788         int32_t *ptr = (int32_t*) bc->fmt_arr;
789         for (i=0; i<bc->n; i++)
790         {
791             int fwd_ref = bc->DP4[4*i], rev_ref =  bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3];
792             if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 )
793                 ptr[i] = 0;
794             else
795             {
796                 double left, right, two;
797                 kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two);
798                 int32_t x = (int)(-4.343 * log(two) + .499);
799                 if (x > 255) x = 255;
800                 ptr[i] = x;
801             }
802         }
803         bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample);
804     }
805     if ( fmt_flag&B2B_FMT_DP4 )
806         bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4);
807     if ( fmt_flag&B2B_FMT_ADF )
808         bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
809     if ( fmt_flag&B2B_FMT_ADR )
810         bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
811     if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) )
812     {
813         for (i=0; i<rec->n_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i];
814         if ( fmt_flag&B2B_FMT_AD )
815             bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
816         if ( fmt_flag&B2B_FMT_DPR )
817             bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
818     }
819 
820     return 0;
821 }
822