1 #include "samtools.pysam.h"
2 
3 /*  bam2bcf.c -- variant calling.
4 
5     Copyright (C) 2010-2012 Broad Institute.
6     Copyright (C) 2012-2015, 2021 Genome Research Ltd.
7 
8     Author: Heng Li <lh3@sanger.ac.uk>
9 
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16 
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.  */
27 
28 #include <config.h>
29 
30 #include <math.h>
31 #include <stdint.h>
32 #include <assert.h>
33 #include <float.h>
34 #include <htslib/hts.h>
35 #include <htslib/sam.h>
36 #include <htslib/kstring.h>
37 #include <htslib/kfunc.h>
38 #include "bam2bcf.h"
39 
40 extern  void ks_introsort_uint32_t(size_t n, uint32_t a[]);
41 
42 #define CALL_DEFTHETA 0.83
43 #define DEF_MAPQ 20
44 
45 #define CAP_DIST 25
46 
bcf_call_init(double theta,int min_baseQ)47 bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
48 {
49     bcf_callaux_t *bca;
50     if (theta <= 0.) theta = CALL_DEFTHETA;
51     bca = calloc(1, sizeof(bcf_callaux_t));
52     bca->capQ = 60;
53     bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
54     bca->min_baseQ = min_baseQ;
55     bca->e = errmod_init(1. - theta);
56     bca->min_frac = 0.002;
57     bca->min_support = 1;
58     bca->per_sample_flt = 0;
59     bca->npos = 100;
60     bca->ref_pos = malloc(bca->npos*sizeof(int));
61     bca->alt_pos = malloc(bca->npos*sizeof(int));
62     bca->nqual = 60;
63     bca->ref_mq  = malloc(bca->nqual*sizeof(int));
64     bca->alt_mq  = malloc(bca->nqual*sizeof(int));
65     bca->ref_bq  = malloc(bca->nqual*sizeof(int));
66     bca->alt_bq  = malloc(bca->nqual*sizeof(int));
67     bca->fwd_mqs = malloc(bca->nqual*sizeof(int));
68     bca->rev_mqs = malloc(bca->nqual*sizeof(int));
69     return bca;
70 }
71 
bcf_call_destroy(bcf_callaux_t * bca)72 void bcf_call_destroy(bcf_callaux_t *bca)
73 {
74     if (bca == 0) return;
75     errmod_destroy(bca->e);
76     if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; }
77     free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq);
78     free(bca->fwd_mqs); free(bca->rev_mqs);
79     bca->nqual = 0;
80     free(bca->bases); free(bca->inscns); free(bca);
81 }
82 
83 // position in the sequence with respect to the aligned part of the read
get_position(const bam_pileup1_t * p,int * len)84 static int get_position(const bam_pileup1_t *p, int *len)
85 {
86     int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
87     for (icig=0; icig<p->b->core.n_cigar; icig++)
88     {
89         int cig  = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK;
90         int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT;
91         if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
92         {
93             n_tot_bases += ncig;
94             iread += ncig;
95             continue;
96         }
97         if ( cig==BAM_CINS )
98         {
99             n_tot_bases += ncig;
100             iread += ncig;
101             continue;
102         }
103         if ( cig==BAM_CSOFT_CLIP )
104         {
105             iread += ncig;
106             if ( iread<=p->qpos ) edist -= ncig;
107             continue;
108         }
109         if ( cig==BAM_CDEL ) continue;
110         if ( cig==BAM_CHARD_CLIP ) continue;
111         if ( cig==BAM_CPAD ) continue;
112         if ( cig==BAM_CREF_SKIP ) continue;
113         fprintf(samtools_stderr,"todo: cigar %d\n", cig);
114         assert(0);
115     }
116     *len = n_tot_bases;
117     return edist;
118 }
119 
bcf_callaux_clean(bcf_callaux_t * bca,bcf_call_t * call)120 void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
121 {
122     memset(bca->ref_pos,0,sizeof(int)*bca->npos);
123     memset(bca->alt_pos,0,sizeof(int)*bca->npos);
124     memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
125     memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
126     memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
127     memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
128     memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
129     memset(bca->rev_mqs,0,sizeof(int)*bca->nqual);
130     if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
131     if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
132 }
133 
134 /*
135     Notes:
136     - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies
137         which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation.
138         Later it's used for multiallelic calling by bcftools -m
139     - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
140  */
141 /*
142  * This function is called once for each sample.
143  * _n is number of pilesups pl contributing reads to this sample
144  * pl is pointer to array of _n pileups (one pileup per read)
145  * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
146  * bca is the settings to perform calls across all samples
147  * r is the returned value of the call
148  */
bcf_call_glfgen(int _n,const bam_pileup1_t * pl,int ref_base,bcf_callaux_t * bca,bcf_callret1_t * r)149 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
150 {
151     int i, n, ref4, is_indel, ori_depth = 0;
152 
153     // clean from previous run
154     r->ori_depth = 0;
155     r->mq0 = 0;
156     memset(r->qsum,0,sizeof(float)*4);
157     memset(r->anno,0,sizeof(double)*16);
158     memset(r->p,0,sizeof(float)*25);
159 
160     if (ref_base >= 0) {
161         ref4 = seq_nt16_int[ref_base];
162         is_indel = 0;
163     } else ref4 = 4, is_indel = 1;
164     if (_n == 0) return -1;
165     // enlarge the bases array if necessary
166     if (bca->max_bases < _n) {
167         bca->max_bases = _n;
168         kroundup32(bca->max_bases);
169         bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
170     }
171     // fill the bases array
172     for (i = n = 0; i < _n; ++i) {
173         const bam_pileup1_t *p = pl + i;
174         int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
175         // set base
176         if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
177         ++ori_depth;
178         mapQ  = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
179         if ( !mapQ ) r->mq0++;
180         baseQ = q = is_indel? p->aux&0xff : (int)bam_get_qual(p->b)[p->qpos]; // base/indel quality
181         seqQ = is_indel? (p->aux>>8&0xff) : 99;
182         if (q < bca->min_baseQ) continue;
183         if (q > seqQ) q = seqQ;
184         mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
185         if (q > mapQ) q = mapQ;
186         if (q > 63) q = 63;
187         if (q < 4) q = 4;       // MQ=0 reads count as BQ=4
188         if (!is_indel) {
189             b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
190             b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
191             is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
192         } else {
193             b = p->aux>>16&0x3f;
194             is_diff = (b != 0);
195         }
196         bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
197         // collect annotations
198         if (b < 4)
199         {
200             r->qsum[b] += q;
201             if ( r->ADF )
202             {
203                 if ( bam_is_rev(p->b) )
204                     r->ADR[b]++;
205                 else
206                     r->ADF[b]++;
207             }
208         }
209         ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)];
210         min_dist = p->b->core.l_qseq - 1 - p->qpos;
211         if (min_dist > p->qpos) min_dist = p->qpos;
212         if (min_dist > CAP_DIST) min_dist = CAP_DIST;
213         r->anno[1<<2|is_diff<<1|0] += baseQ;
214         r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
215         r->anno[2<<2|is_diff<<1|0] += mapQ;
216         r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
217         r->anno[3<<2|is_diff<<1|0] += min_dist;
218         r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
219 
220         // collect for bias tests
221         if ( baseQ > 59 ) baseQ = 59;
222         if ( mapQ > 59 ) mapQ = 59;
223         int len, pos = get_position(p, &len);
224         int epos = (double)pos/(len+1) * bca->npos;
225         int ibq  = baseQ/60. * bca->nqual;
226         int imq  = mapQ/60. * bca->nqual;
227         if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++;
228         else bca->fwd_mqs[imq]++;
229         if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
230         {
231             bca->ref_pos[epos]++;
232             bca->ref_bq[ibq]++;
233             bca->ref_mq[imq]++;
234         }
235         else
236         {
237             bca->alt_pos[epos]++;
238             bca->alt_bq[ibq]++;
239             bca->alt_mq[imq]++;
240         }
241     }
242     r->ori_depth = ori_depth;
243     // glfgen
244     errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
245     return n;
246 }
247 
248 
249 /*
250  *  calc_vdb() - returns value between zero (most biased) and one (no bias)
251  *               on success, or HUGE_VAL when VDB cannot be calculated because
252  *               of insufficient depth (<2x)
253  *
254  *  Variant Distance Bias tests if the variant bases are positioned within the
255  *  reads with sufficient randomness. Unlike other tests, it looks only at
256  *  variant reads and therefore gives different kind of information than Read
257  *  Position Bias for instance. VDB was developed for detecting artefacts in
258  *  RNA-seq calls where reads from spliced transcripts span splice site
259  *  boundaries.  The current implementation differs somewhat from the original
260  *  version described in supplementary material of PMID:22524474, but the idea
261  *  remains the same. (Here the random variable tested is the average distance
262  *  from the averaged position, not the average pairwise distance.)
263  *
264  *  For coverage of 2x, the calculation is exact but is approximated for the
265  *  rest. The result is most accurate between 4-200x. For 3x or >200x, the
266  *  reported values are slightly more favourable than those of a true random
267  *  distribution.
268  */
calc_vdb(int * pos,int npos)269 double calc_vdb(int *pos, int npos)
270 {
271     // Note well: the parameters were obtained by fitting to simulated data of
272     // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen().
273     const int readlen = 100;
274     assert( npos==readlen );
275 
276     #define nparam 15
277     const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5},
278         {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8},
279         {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7},
280         {200,0.7,23.7} };
281 
282     int i, dp = 0;
283     float mean_pos = 0, mean_diff = 0;
284     for (i=0; i<npos; i++)
285     {
286         if ( !pos[i] ) continue;
287         dp += pos[i];
288         mean_pos += pos[i]*i;
289     }
290     if ( dp<2 ) return HUGE_VAL;     // one or zero reads can be placed anywhere
291 
292     mean_pos /= dp;
293     for (i=0; i<npos; i++)
294     {
295         if ( !pos[i] ) continue;
296         mean_diff += pos[i] * fabs(i - mean_pos);
297     }
298     mean_diff /= dp;
299 
300     int ipos = mean_diff;   // tuned for float-to-int implicit conversion
301     if ( dp==2 )
302         return (2*readlen-2*(ipos+1)-1)*(ipos+1)/(readlen-1)/(readlen*0.5);
303 
304     if ( dp>=200 )
305         i = nparam; // shortcut for big depths
306     else
307     {
308         for (i=0; i<nparam; i++)
309             if ( param[i][0]>=dp ) break;
310     }
311     float pshift, pscale;
312     if ( i==nparam )
313     {
314         // the depth is too high, go with 200x
315         pscale = param[nparam-1][1];
316         pshift = param[nparam-1][2];
317     }
318     else if ( i>0 && param[i][0]!=dp )
319     {
320         // linear interpolation of parameters
321         pscale = (param[i-1][1] + param[i][1])*0.5;
322         pshift = (param[i-1][2] + param[i][2])*0.5;
323     }
324     else
325     {
326         pscale = param[i][1];
327         pshift = param[i][2];
328     }
329     return 0.5*kf_erfc(-(mean_diff-pshift)*pscale);
330 }
331 
calc_chisq_bias(int * a,int * b,int n)332 double calc_chisq_bias(int *a, int *b, int n)
333 {
334     int na = 0, nb = 0, i, ndf = n;
335     for (i=0; i<n; i++) na += a[i];
336     for (i=0; i<n; i++) nb += b[i];
337     if ( !na || !nb ) return HUGE_VAL;
338 
339     double chisq = 0;
340     for (i=0; i<n; i++)
341     {
342         if ( !a[i] && !b[i] ) ndf--;
343         else
344         {
345             double tmp = a[i] - b[i];
346             chisq += tmp*tmp/(a[i]+b[i]);
347         }
348     }
349     /*
350         kf_gammq: incomplete gamma function Q(a,x) = 1 - P(a,x) = Gamma(a,x)/Gamma(a)
351         1 if the distributions are identical, 0 if very different
352     */
353     double prob = kf_gammaq(0.5*ndf, 0.5*chisq);
354     return prob;
355 }
356 
mann_whitney_1947(int n,int m,int U)357 double mann_whitney_1947(int n, int m, int U)
358 {
359     if (U<0) return 0;
360     if (n==0||m==0) return U==0 ? 1 : 0;
361     return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U);
362 }
363 
mann_whitney_1947_cdf(int n,int m,int U)364 double mann_whitney_1947_cdf(int n, int m, int U)
365 {
366     int i;
367     double sum = 0;
368     for (i=0; i<=U; i++)
369         sum += mann_whitney_1947(n,m,i);
370     return sum;
371 }
372 
calc_mwu_bias_cdf(int * a,int * b,int n)373 double calc_mwu_bias_cdf(int *a, int *b, int n)
374 {
375     int na = 0, nb = 0, i;
376     double U = 0;
377     for (i=0; i<n; i++)
378     {
379         na += a[i];
380         U  += a[i] * (nb + b[i]*0.5);
381         nb += b[i];
382     }
383     if ( !na || !nb ) return HUGE_VAL;
384 
385     // Always work with the smaller U
386     double U_min = ((double)na * nb) - U;
387     if ( U < U_min ) U_min = U;
388 
389     if ( na==1 ) return 2.0 * (floor(U_min)+1) / (nb+1);
390     if ( nb==1 ) return 2.0 * (floor(U_min)+1) / (na+1);
391 
392     // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
393     if ( na>=8 || nb>=8 )
394     {
395         double mean = ((double)na*nb)*0.5;
396         double var2 = ((double)na*nb)*(na+nb+1)/12.0;
397         double z = (U_min - mean)/sqrt(2*var2);   // z is N(0,1)
398         return 2.0 - kf_erfc(z);  // which is 1 + erf(z)
399     }
400 
401     // Exact calculation
402     double pval = 2*mann_whitney_1947_cdf(na,nb,U_min);
403     return pval>1 ? 1 : pval;
404 }
405 
calc_mwu_bias(int * a,int * b,int n)406 double calc_mwu_bias(int *a, int *b, int n)
407 {
408     int na = 0, nb = 0, i;
409     double U = 0;
410     for (i=0; i<n; i++)
411     {
412         na += a[i];
413         U  += a[i] * (nb + b[i]*0.5);
414         nb += b[i];
415     }
416     if ( !na || !nb ) return HUGE_VAL;
417     if ( na==1 || nb==1 ) return 1.0;       // Flat probability, all U values are equally likely
418 
419     double mean = ((double)na*nb)*0.5;
420     if ( na==2 || nb==2 )
421     {
422         // Linear approximation
423         return U>mean ? (2.0*mean-U)/mean : U/mean;
424     }
425     double var2 = ((double)na*nb)*(na+nb+1)/12.0;
426     if ( na>=8 || nb>=8 )
427     {
428         // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
429         return exp(-0.5*(U-mean)*(U-mean)/var2);
430     }
431 
432     // Exact calculation
433     return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
434 }
435 
logsumexp2(double a,double b)436 static inline double logsumexp2(double a, double b)
437 {
438     if ( a>b )
439         return log(1 + exp(b-a)) + a;
440     else
441         return log(1 + exp(a-b)) + b;
442 }
443 
calc_SegBias(const bcf_callret1_t * bcr,bcf_call_t * call)444 void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call)
445 {
446     call->seg_bias = HUGE_VAL;
447     if ( !bcr ) return;
448 
449     int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads
450     if ( !nr ) return;
451 
452     int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n;    // average depth
453     double M   = floor((double)nr / avg_dp + 0.5);   // an approximate number of variants samples in the population
454     if ( M>call->n ) M = call->n;       // clamp M at the number of samples
455     else if ( M==0 ) M = 1;
456     double f = M / 2. / call->n;        // allele frequency
457     double p = (double) nr / call->n;   // number of variant reads per sample expected if variant not real (poisson)
458     double q = (double) nr / M;         // number of variant reads per sample expected if variant is real (poisson)
459     double sum = 0;
460     const double log2 = log(2.0);
461 
462     // fprintf(samtools_stderr,"M=%.1f  p=%e q=%e f=%f  dp=%d\n",M,p,q,f,avg_dp);
463     int i;
464     for (i=0; i<call->n; i++)
465     {
466         int oi = bcr[i].anno[2] + bcr[i].anno[3];       // observed number of non-ref reads
467         double tmp;
468         if ( oi )
469         {
470             // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow
471             tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q);
472             tmp += log(f) + oi*log(q/p) - q + p;
473         }
474         else
475             tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p;
476         sum += tmp;
477         // fprintf(samtools_stderr,"oi=%d %e\n", oi,tmp);
478     }
479     call->seg_bias = sum;
480 }
481 
482 /**
483  *  bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles
484  *  @n:         number of samples
485  *  @calls:     each sample's calls
486  *  @bca:       auxiliary data structure for holding temporary values
487  *  @ref_base:  the reference base
488  *  @call:      filled with the annotations
489  *
490  *  Combines calls across the various samples being studied
491  *  1. For each allele at each base across all samples the quality is summed so
492  *     you end up with a set of quality sums for each allele present 2. The quality
493  *     sums are sorted.
494  *  3. Using the sorted quality sums we now create the allele ordering array
495  *     A\subN. This is done by doing the following:
496  *     a) If the reference allele is known it always comes first, otherwise N
497  *        comes first.
498  *     b) Then the rest of the alleles are output in descending order of quality
499  *        sum (which we already know the qsum array was sorted).  Any allelles with
500  *        qsum 0 will be excluded.
501  *  4. Using the allele ordering array we create the genotype ordering array.
502  *     In the worst case with an unknown reference this will be:  A0/A0 A1/A0 A1/A1
503  *     A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4
504  *  5. The genotype ordering array is then used to extract data from the error
505  *     model 5*5 matrix and is used to produce a Phread likelihood array for each
506  *     sample.
507  */
bcf_call_combine(int n,const bcf_callret1_t * calls,bcf_callaux_t * bca,int ref_base,bcf_call_t * call)508 int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
509 {
510     int ref4, i, j;
511     float qsum[5] = {0,0,0,0,0};
512     if (ref_base >= 0) {
513         call->ori_ref = ref4 = seq_nt16_int[ref_base];
514         if (ref4 > 4) ref4 = 4;
515     } else call->ori_ref = -1, ref4 = 0;
516 
517     // calculate qsum, this is done by summing normalized qsum across all samples,
518     // to account for differences in coverage
519     for (i = 0; i < n; ++i)
520     {
521         float sum = 0;
522         for (j = 0; j < 4; ++j) sum += calls[i].qsum[j];
523         if ( sum )
524             for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum;
525     }
526 
527     // sort qsum in ascending order (insertion sort)
528     float *ptr[5], *tmp;
529     for (i=0; i<5; i++) ptr[i] = &qsum[i];
530     for (i=1; i<4; i++)
531         for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--)
532             tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp;
533 
534     // Set the reference allele and alternative allele(s)
535     for (i=0; i<5; i++) call->a[i] = -1;
536     for (i=0; i<5; i++) call->qsum[i] = 0;
537     call->unseen = -1;
538     call->a[0] = ref4;
539     for (i=3, j=1; i>=0; i--)   // i: alleles sorted by QS; j, a[j]: output allele ordering
540     {
541         int ipos = ptr[i] - qsum;   // position in sorted qsum array
542         if ( ipos==ref4 )
543             call->qsum[0] = qsum[ipos];    // REF's qsum
544         else
545         {
546             if ( !qsum[ipos] ) break;       // qsum is 0, this and consequent alleles are not seen in the pileup
547             call->qsum[j] = qsum[ipos];
548             call->a[j++]  = ipos;
549         }
550     }
551     if (ref_base >= 0)
552     {
553         // for SNPs, find the "unseen" base
554         if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
555             call->unseen = j, call->a[j++] = ptr[i] - qsum;
556         call->n_alleles = j;
557     }
558     else
559     {
560         call->n_alleles = j;
561         if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
562     }
563     /*
564      * Set the phread likelihood array (call->PL) This array is 15 entries long
565      * for each sample because that is size of an upper or lower triangle of a
566      * worst case 5x5 matrix of possible genotypes. This worst case matrix will
567      * occur when all 4 possible alleles are present and the reference allele
568      * is unknown.  The sides of the matrix will correspond to the reference
569      * allele (if known) followed by the alleles present in descending order of
570      * quality sum
571      */
572     {
573         int x, g[15], z;
574         double sum_min = 0.;
575         x = call->n_alleles * (call->n_alleles + 1) / 2;
576         // get the possible genotypes
577         // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix
578         for (i = z = 0; i < call->n_alleles; ++i) {
579             for (j = 0; j <= i; ++j) {
580                 g[z++] = call->a[j] * 5 + call->a[i];
581             }
582         }
583         // for each sample calculate the PL
584         for (i = 0; i < n; ++i)
585         {
586             int32_t *PL = call->PL + x * i;
587             const bcf_callret1_t *r = calls + i;
588             float min = FLT_MAX;
589             for (j = 0; j < x; ++j) {
590                 if (min > r->p[g[j]]) min = r->p[g[j]];
591             }
592             sum_min += min;
593             for (j = 0; j < x; ++j) {
594                 int y;
595                 y = (int)(r->p[g[j]] - min + .499);
596                 if (y > 255) y = 255;
597                 PL[j] = y;
598             }
599         }
600         if ( call->DP4 )
601         {
602             for (i=0; i<n; i++)
603             {
604                 call->DP4[4*i]   = calls[i].anno[0];
605                 call->DP4[4*i+1] = calls[i].anno[1];
606                 call->DP4[4*i+2] = calls[i].anno[2];
607                 call->DP4[4*i+3] = calls[i].anno[3];
608             }
609         }
610         if ( call->ADF )
611         {
612             assert( call->n_alleles<=B2B_MAX_ALLELES );   // this is always true for SNPs and so far for indels as well
613 
614             // reorder ADR,ADF to match the allele ordering at this site
615             int32_t tmp[B2B_MAX_ALLELES];
616             int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES;
617             int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES;
618             int32_t *adr_tot = call->ADR;   // the first bin stores total counts per site
619             int32_t *adf_tot = call->ADF;
620             for (i=0; i<n; i++)
621             {
622                 for (j=0; j<call->n_alleles; j++)
623                 {
624                     tmp[j] = adr[ call->a[j] ];
625                     adr_tot[j] += tmp[j];
626                 }
627                 for (j=0; j<call->n_alleles; j++) adr_out[j] = tmp[j];
628                 for (j=0; j<call->n_alleles; j++)
629                 {
630                     tmp[j] = adf[ call->a[j] ];
631                     adf_tot[j] += tmp[j];
632                 }
633                 for (j=0; j<call->n_alleles; j++) adf_out[j] = tmp[j];
634                 adf_out += call->n_alleles;
635                 adr_out += call->n_alleles;
636                 adr += B2B_MAX_ALLELES;
637                 adf += B2B_MAX_ALLELES;
638             }
639         }
640 
641 //      if (ref_base < 0) fprintf(samtools_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
642         call->shift = (int)(sum_min + .499);
643     }
644     // combine annotations
645     memset(call->anno, 0, 16 * sizeof(double));
646     call->ori_depth = 0;
647     call->depth     = 0;
648     call->mq0       = 0;
649     for (i = 0; i < n; ++i) {
650         call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3];
651         call->ori_depth += calls[i].ori_depth;
652         call->mq0 += calls[i].mq0;
653         for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
654     }
655 
656     calc_SegBias(calls, call);
657 
658     // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
659     // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
660     // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
661 
662     call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos);
663     call->mwu_mq  = calc_mwu_bias(bca->ref_mq,  bca->alt_mq,  bca->nqual);
664     call->mwu_bq  = calc_mwu_bias(bca->ref_bq,  bca->alt_bq,  bca->nqual);
665     call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
666 
667 #if CDF_MWU_TESTS
668     call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos);
669     call->mwu_mq_cdf  = calc_mwu_bias_cdf(bca->ref_mq,  bca->alt_mq,  bca->nqual);
670     call->mwu_bq_cdf  = calc_mwu_bias_cdf(bca->ref_bq,  bca->alt_bq,  bca->nqual);
671     call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
672 #endif
673 
674     call->vdb = calc_vdb(bca->alt_pos, bca->npos);
675 
676     return 0;
677 }
678 
bcf_call2bcf(bcf_call_t * bc,bcf1_t * rec,bcf_callret1_t * bcr,int fmt_flag,const bcf_callaux_t * bca,const char * ref)679 int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
680 {
681     extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
682     int i, j, nals = 1;
683 
684     bcf_hdr_t *hdr = bc->bcf_hdr;
685     rec->rid  = bc->tid;
686     rec->pos  = bc->pos;
687     rec->qual = 0;
688 
689     bc->tmp.l = 0;
690     if (bc->ori_ref < 0)    // indel
691     {
692         // REF
693         kputc(ref[bc->pos], &bc->tmp);
694         for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
695 
696         // ALT
697         for (i=1; i<4; i++)
698         {
699             if (bc->a[i] < 0) break;
700             kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp);
701 
702             if (bca->indel_types[bc->a[i]] < 0) { // deletion
703                 for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
704                     kputc(ref[bc->pos+1+j], &bc->tmp);
705             } else { // insertion; cannot be a reference unless a bug
706                 char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
707                 for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
708                     kputc("ACGTN"[(int)inscns[j]], &bc->tmp);
709                 for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
710             }
711             nals++;
712         }
713     }
714     else    // SNP
715     {
716         kputc("ACGTN"[bc->ori_ref], &bc->tmp);
717         for (i=1; i<5; i++)
718         {
719             if (bc->a[i] < 0) break;
720             kputc(',', &bc->tmp);
721             if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
722             else kputc("ACGT"[bc->a[i]], &bc->tmp);
723             nals++;
724         }
725     }
726     bcf_update_alleles_str(hdr, rec, bc->tmp.s);
727 
728     bc->tmp.l = 0;
729 
730     // INFO
731     if (bc->ori_ref < 0)
732     {
733         bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
734         bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
735         bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
736     }
737     bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
738     if ( fmt_flag&B2B_INFO_ADF )
739         bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele);
740     if ( fmt_flag&B2B_INFO_ADR )
741         bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele);
742     if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) )
743     {
744         for (i=0; i<rec->n_allele; i++) bc->ADF[i] += bc->ADR[i];
745         if ( fmt_flag&B2B_INFO_AD )
746             bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele);
747         if ( fmt_flag&B2B_INFO_DPR )
748             bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele);
749     }
750 
751     float tmpf[16];
752     for (i=0; i<16; i++) tmpf[i] = bc->anno[i];
753     bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
754     bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
755 
756     if ( bc->vdb != HUGE_VAL )      bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
757     if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
758     if ( bc->mwu_pos != HUGE_VAL )  bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
759     if ( bc->mwu_mq != HUGE_VAL )   bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
760     if ( bc->mwu_mqs != HUGE_VAL )  bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
761     if ( bc->mwu_bq != HUGE_VAL )   bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
762 #if CDF_MWU_TESTS
763     if ( bc->mwu_pos_cdf != HUGE_VAL )  bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
764     if ( bc->mwu_mq_cdf != HUGE_VAL )   bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
765     if ( bc->mwu_mqs_cdf != HUGE_VAL )  bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
766     if ( bc->mwu_bq_cdf != HUGE_VAL )   bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
767 #endif
768     tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
769     bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
770 
771     // FORMAT
772     rec->n_sample = bc->n;
773     bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample);
774     if ( fmt_flag&B2B_FMT_DP )
775     {
776         int32_t *ptr = (int32_t*) bc->fmt_arr;
777         for (i=0; i<bc->n; i++)
778             ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3];
779         bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample);
780     }
781     if ( fmt_flag&B2B_FMT_DV )
782     {
783         int32_t *ptr = (int32_t*) bc->fmt_arr;
784         for (i=0; i<bc->n; i++)
785             ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3];
786         bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample);
787     }
788     if ( fmt_flag&B2B_FMT_SP )
789     {
790         int32_t *ptr = (int32_t*) bc->fmt_arr;
791         for (i=0; i<bc->n; i++)
792         {
793             int fwd_ref = bc->DP4[4*i], rev_ref =  bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3];
794             if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 )
795                 ptr[i] = 0;
796             else
797             {
798                 double left, right, two;
799                 kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two);
800                 int32_t x = (int)(-4.343 * log(two) + .499);
801                 if (x > 255) x = 255;
802                 ptr[i] = x;
803             }
804         }
805         bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample);
806     }
807     if ( fmt_flag&B2B_FMT_DP4 )
808         bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4);
809     if ( fmt_flag&B2B_FMT_ADF )
810         bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
811     if ( fmt_flag&B2B_FMT_ADR )
812         bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
813     if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) )
814     {
815         for (i=0; i<rec->n_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i];
816         if ( fmt_flag&B2B_FMT_AD )
817             bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
818         if ( fmt_flag&B2B_FMT_DPR )
819             bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
820     }
821 
822     return 0;
823 }
824