1 /*  mcall.c -- multiallelic and rare variant calling.
2 
3     Copyright (C) 2012-2021 Genome Research Ltd.
4 
5     Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.  */
24 
25 #include <assert.h>
26 #include <math.h>
27 #include <inttypes.h>
28 #include <ctype.h>
29 #include <htslib/kfunc.h>
30 #include <htslib/khash_str2int.h>
31 #include "call.h"
32 #include "prob1.h"
33 
34 // Using priors for GTs does not seem to be mathematically justified. Although
35 // it seems effective in removing false calls, it also flips a significant
36 // proportion of HET genotypes. Better is to filter by FORMAT/GQ using
37 // `bcftools filter`.
38 #define USE_PRIOR_FOR_GTS 0
39 
40 // Go with uniform PLs for samples with no coverage. If unset, missing
41 // genotypes is reported instead.
42 #define FLAT_PDG_FOR_MISSING 0
43 
44 int test16(float *anno16, anno16_t *a);
45 
qcall_init(call_t * call)46 void qcall_init(call_t *call) { return; }
qcall_destroy(call_t * call)47 void qcall_destroy(call_t *call) { return; }
qcall(call_t * call,bcf1_t * rec)48 int qcall(call_t *call, bcf1_t *rec)
49 {
50     // QCall format:
51     //  chromosome, position, reference allele, depth, mapping quality, 0, ..
52     error("TODO: qcall output\n");
53     return 0;
54 }
55 
call_init_pl2p(call_t * call)56 void call_init_pl2p(call_t *call)
57 {
58     int i;
59     for (i=0; i<256; i++)
60         call->pl2p[i] = pow(10., -i/10.);
61 }
62 
63 // Macros for accessing call->trio and call->ntrio
64 #define FTYPE_222 0     // family type: all diploid
65 #define FTYPE_121 1     // chrX, the child is a boy
66 #define FTYPE_122 2     // chrX, a girl
67 #define FTYPE_101 3     // chrY, boy
68 #define FTYPE_100 4     // chrY, girl
69 
70 #define GT_SKIP 0xf     // empty genotype (chrY in females)
71 
72 #define IS_POW2(x) (!((x) & ((x) - 1)))    // zero is permitted
73 #define IS_HOM(x)  IS_POW2(x)
74 
75 // Pkij = P(k|i,j) tells how likely it is to be a het if the parents
76 // are homs etc. The consistency of i,j,k has been already checked.
77 // Parameters are alleles and ploidy of father, mother, kid
78 // Returns 2/Pkij.
calc_Pkij(int fals,int mals,int kals,int fpl,int mpl,int kpl)79 int calc_Pkij(int fals, int mals, int kals, int fpl, int mpl, int kpl)
80 {
81     int als = fals|mals|kals;
82     if ( IS_HOM(als) ) return 2;    // all are the same: child must be a HOM, P=1
83 
84     if ( fpl==1 )
85     {
86         if ( kpl==1 )   // chr X, the child is a boy, the copy is inherited from the mother
87         {
88             if ( IS_HOM(mals) ) return 2;   // 0 11 -> P(1) = 1
89             return 4;                       // 0 01 -> P(0) = P(1) = 1/2
90         }
91         // chr X, the child is a girl
92         if ( IS_HOM(mals) ) return 2;       // 0 11 -> P(01) = 1
93         return 4;                           // 0 01 -> P(00) = P(01) = 1/2
94     }
95 
96     if ( IS_HOM(fals) && IS_HOM(mals) ) return 2;   // 00 11 01, the child must be a HET, P=1
97     if ( !IS_HOM(fals) && !IS_HOM(mals) )
98     {
99         if ( IS_HOM(kals) ) return 8;   // 01 01 00 or 01 01 11, P(k=HOM) = 1/4
100         return 4;                       // 01 01 01, P(k=HET) = 1/2
101     }
102     return 4;   // 00 01, P(k=HET) = P(k=HOM) = 1/2
103 }
104 
105 // Initialize ntrio and trio: ntrio lists the number of possible
106 // genotypes given combination of haploid/diploid genomes and the
107 // number of alleles. trio lists allowed genotype combinations:
108 //      4bit: 2/Pkij, 4: father, 4: mother, 4: child
109 // See also mcall_call_trio_genotypes()
110 //
mcall_init_trios(call_t * call)111 static void mcall_init_trios(call_t *call)
112 {
113     if ( call->prior_AN )
114     {
115         int id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AN);
116         if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AN);
117         if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) )  error("No such FORMAT tag \"%s\"\n", call->prior_AN);
118         id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AC);
119         if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AC);
120         if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) )  error("No such FORMAT tag \"%s\"\n", call->prior_AC);
121     }
122 
123     // 23, 138, 478 possible diploid trio genotypes with 2, 3, 4 alleles
124     call->ntrio[FTYPE_222][2] = 15; call->ntrio[FTYPE_222][3] = 78;  call->ntrio[FTYPE_222][4] = 250;
125     call->ntrio[FTYPE_121][2] = 8;  call->ntrio[FTYPE_121][3] = 27;  call->ntrio[FTYPE_121][4] = 64;
126     call->ntrio[FTYPE_122][2] = 8;  call->ntrio[FTYPE_122][3] = 27;  call->ntrio[FTYPE_122][4] = 64;
127     call->ntrio[FTYPE_101][2] = 2;  call->ntrio[FTYPE_101][3] = 3;   call->ntrio[FTYPE_101][4] = 4;
128     call->ntrio[FTYPE_100][2] = 2;  call->ntrio[FTYPE_100][3] = 3;   call->ntrio[FTYPE_100][4] = 4;
129 
130     int nals, itype;
131     for (itype=0; itype<=4; itype++)
132     {
133         for (nals=2; nals<=4; nals++)
134             call->trio[itype][nals] = (uint16_t*) malloc(sizeof(uint16_t)*call->ntrio[itype][nals]);
135     }
136 
137     // max 10 possible diploid genotypes
138     int gts[10];
139     for (nals=2; nals<=4; nals++)
140     {
141         int i,j,k, n = 0, ngts = 0;
142         for (i=0; i<nals; i++)
143             for (j=0; j<=i; j++)
144                 gts[ngts++] = 1<<i | 1<<j;
145 
146         // 222: all diploid
147         // i,j,k: father, mother, child
148         for (i=0; i<ngts; i++)
149             for (j=0; j<ngts; j++)
150                 for (k=0; k<ngts; k++)
151                 {
152                     if ( ((gts[i]|gts[j])&gts[k]) != gts[k] ) continue;             // k not present in neither i nor j
153                     if ( !(gts[i] & gts[k]) || !(gts[j] & gts[k]) ) continue;       // one copy from father, one from mother
154                     int Pkij = calc_Pkij(gts[i],gts[j],gts[k], 2,2,2);
155                     call->trio[FTYPE_222][nals][n++] = Pkij<<12 | i<<8 | j<<4 | k;  // father, mother, child
156                 }
157         assert( n==call->ntrio[FTYPE_222][nals] );
158 
159         // 121: chrX, boy
160         n = 0;
161         for (i=0; i<ngts; i++)
162             for (j=0; j<ngts; j++)
163                 for (k=0; k<ngts; k++)
164                 {
165                     if ( !IS_HOM(gts[i]) || !IS_HOM(gts[k]) ) continue;   // father nor boy can be diploid
166                     if ( ((gts[i]|gts[j])&gts[k]) != gts[k] ) continue;
167                     if ( !(gts[j] & gts[k]) ) continue;     // boy must inherit the copy from mother
168                     int Pkij = calc_Pkij(gts[i],gts[j],gts[k], 1,2,1);
169                     call->trio[FTYPE_121][nals][n++] = Pkij<<12 | i<<8 | j<<4 | k;
170                 }
171         assert( n==call->ntrio[FTYPE_121][nals] );
172 
173         // 122: chrX, girl
174         n = 0;
175         for (i=0; i<ngts; i++)
176             for (j=0; j<ngts; j++)
177                 for (k=0; k<ngts; k++)
178                 {
179                     if ( !IS_HOM(gts[i]) ) continue;
180                     if ( ((gts[i]|gts[j])&gts[k]) != gts[k] ) continue;
181                     if ( !(gts[i] & gts[k]) ) continue;     // girl must inherit one copy from the father and one from the mother
182                     if ( !(gts[j] & gts[k]) ) continue;
183                     int Pkij = calc_Pkij(gts[i],gts[j],gts[k], 1,2,2);
184                     call->trio[FTYPE_122][nals][n++] = Pkij<<12 | i<<8 | j<<4 | k;
185                 }
186         assert( n==call->ntrio[FTYPE_122][nals] );
187 
188         // 101: chrY, boy
189         n = 0;
190         for (i=0; i<ngts; i++)
191             for (k=0; k<ngts; k++)
192             {
193                 if ( !IS_HOM(gts[i]) || !IS_HOM(gts[k]) ) continue;
194                 if ( (gts[i]&gts[k]) != gts[k] ) continue;
195                 call->trio[FTYPE_101][nals][n++] = 1<<12 | i<<8 | GT_SKIP<<4 | k;
196             }
197         assert( n==call->ntrio[FTYPE_101][nals] );
198 
199         // 100: chrY, girl
200         n = 0;
201         for (i=0; i<ngts; i++)
202         {
203             if ( !IS_POW2(gts[i]) ) continue;
204             call->trio[FTYPE_100][nals][n++] = 1<<12 | i<<8 | GT_SKIP<<4 | GT_SKIP;
205         }
206         assert( n==call->ntrio[FTYPE_100][nals] );
207 
208     }
209     call->GLs = (double*) calloc(bcf_hdr_nsamples(call->hdr)*10,sizeof(double));
210 
211     int i, j;
212     for (i=0; i<call->nfams; i++)
213     {
214         family_t *fam = &call->fams[i];
215         int ploidy[3];
216         for (j=0; j<3; j++)
217             ploidy[j] = call->ploidy[fam->sample[j]];
218 
219         if ( ploidy[FATHER]==2 )    // not X, not Y
220         {
221             if ( ploidy[MOTHER]!=2 || ploidy[CHILD]!=2 )
222                 error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
223             fam->type = FTYPE_222;
224             continue;
225         }
226         if ( ploidy[FATHER]!=1 || ploidy[MOTHER]==1 )
227                 error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
228         if ( ploidy[MOTHER]==2 )    // X
229         {
230             if ( ploidy[CHILD]==0 )
231                 error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
232             fam->type = ploidy[CHILD]==2 ? FTYPE_122 : FTYPE_121;   // a girl or a boy
233         }
234         else    // Y
235         {
236             if ( ploidy[CHILD]==2 )
237                 error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
238             fam->type = ploidy[CHILD]==0 ? FTYPE_100 : FTYPE_101;   // a girl or a boy
239         }
240     }
241 }
mcall_destroy_trios(call_t * call)242 static void mcall_destroy_trios(call_t *call)
243 {
244     int i, j;
245     for (i=2; i<=4; i++)
246         for (j=0; j<=4; j++)
247             free(call->trio[j][i]);
248 }
249 
init_sample_groups(call_t * call)250 static void init_sample_groups(call_t *call)
251 {
252     int i, nsmpl = bcf_hdr_nsamples(call->hdr);
253     if ( !call->sample_groups )
254     {
255         // standard pooled calling, all samples in the same group
256         call->nsmpl_grp = 1;
257         call->smpl_grp  = (smpl_grp_t*)calloc(1,sizeof(*call->smpl_grp));
258         call->smpl_grp[0].nsmpl = nsmpl;
259         call->smpl_grp[0].smpl  = (uint32_t*)calloc(call->smpl_grp[0].nsmpl,sizeof(uint32_t));
260         for (i=0; i<nsmpl; i++)
261             call->smpl_grp[0].smpl[i] = i;
262         return;
263     }
264 
265     if ( call->sample_groups_tag )
266     {
267         // Is the tag defined in the header?
268         int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->sample_groups_tag);
269         if ( tag_id==-1 ) error("No such tag \"%s\"\n",call->sample_groups_tag);
270         if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) )  error("No such FORMAT tag \"%s\"\n", call->sample_groups_tag);
271     }
272     else
273     {
274         int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"QS");
275         if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "QS";
276         else
277         {
278             tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"AD");
279             if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "AD";
280             else error("Error: neither \"AD\" nor \"QS\" FORMAT tag exists and no alternative given with -G\n");
281         }
282     }
283 
284     // Read samples/groups
285     if ( !strcmp("-",call->sample_groups) )
286     {
287         // single-sample calling, each sample creates its own group
288         call->nsmpl_grp = nsmpl;
289         call->smpl_grp  = (smpl_grp_t*)calloc(nsmpl,sizeof(*call->smpl_grp));
290         for (i=0; i<nsmpl; i++)
291         {
292             call->smpl_grp[i].nsmpl = 1;
293             call->smpl_grp[i].smpl  = (uint32_t*)calloc(call->smpl_grp[i].nsmpl,sizeof(uint32_t));
294             call->smpl_grp[i].smpl[0] = i;
295         }
296     }
297     else
298     {
299         int nlines;
300         char **lines = hts_readlist(call->sample_groups, 1, &nlines);
301         if ( !lines ) error("Could not read the file: %s\n", call->sample_groups);
302 
303         uint32_t *smpl2grp = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
304         uint32_t *grp2n = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
305         void *grp2idx = khash_str2int_init();
306 
307         call->nsmpl_grp = 0;
308         for (i=0; i<nlines; i++)
309         {
310             char *ptr = lines[i];
311             while ( *ptr && !isspace(*ptr) ) ptr++;
312             if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
313             char *tmp = ptr;
314             while ( *ptr && isspace(*ptr) ) ptr++;
315             if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
316             *tmp = 0;
317             int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]);
318             if ( ismpl<0 ) continue;
319             if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
320             if ( !khash_str2int_has_key(grp2idx,ptr+1) )
321             {
322                 khash_str2int_set(grp2idx, ptr+1, call->nsmpl_grp);
323                 call->nsmpl_grp++;
324             }
325             int igrp = -1;
326             if ( khash_str2int_get(grp2idx, ptr+1, &igrp)!=0 )
327                 error("This should not happen, fixme: %s\n",ptr+1);
328             grp2n[igrp]++;
329             smpl2grp[ismpl] = igrp+1;   // +1 to distinguish unlisted samples
330         }
331         khash_str2int_destroy(grp2idx);
332         if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", call->sample_groups);
333 
334         call->smpl_grp = (smpl_grp_t*)calloc(call->nsmpl_grp,sizeof(*call->smpl_grp));
335         for (i=0; i<nsmpl; i++)
336         {
337             if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
338             int igrp = smpl2grp[i] - 1;
339             if ( !call->smpl_grp[igrp].nsmpl )
340                 call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t));
341             call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i;
342             call->smpl_grp[igrp].nsmpl++;
343         }
344         free(smpl2grp);
345         free(grp2n);
346         for (i=0; i<nlines; i++) free(lines[i]);
347         free(lines);
348     }
349 }
destroy_sample_groups(call_t * call)350 static void destroy_sample_groups(call_t *call)
351 {
352     int i;
353     for (i=0; i<call->nsmpl_grp; i++)
354     {
355         free(call->smpl_grp[i].qsum);
356         free(call->smpl_grp[i].smpl);
357     }
358     free(call->smpl_grp);
359 }
360 
mcall_init(call_t * call)361 void mcall_init(call_t *call)
362 {
363     init_sample_groups(call);
364     call_init_pl2p(call);
365 
366     call->nals_map = 5;
367     call->als_map  = (int*) malloc(sizeof(int)*call->nals_map);
368     call->npl_map  = 5*(5+1)/2;     // will be expanded later if necessary
369     call->pl_map   = (int*) malloc(sizeof(int)*call->npl_map);
370     call->gts  = (int32_t*) calloc(bcf_hdr_nsamples(call->hdr)*2,sizeof(int32_t));   // assuming at most diploid everywhere
371 
372     if ( call->flag & CALL_CONSTR_TRIO )
373     {
374         call->cgts = (int32_t*) calloc(bcf_hdr_nsamples(call->hdr),sizeof(int32_t));
375         call->ugts = (int32_t*) calloc(bcf_hdr_nsamples(call->hdr),sizeof(int32_t));
376         mcall_init_trios(call);
377         bcf_hdr_append(call->hdr,"##FORMAT=<ID=CGT,Number=1,Type=Integer,Description=\"Constrained Genotype (0-based index to Number=G ordering).\">");
378         bcf_hdr_append(call->hdr,"##FORMAT=<ID=UGT,Number=1,Type=Integer,Description=\"Unconstrained Genotype (0-based index to Number=G ordering).\">");
379     }
380     if ( call->flag & CALL_CONSTR_ALLELES ) call->vcmp = vcmp_init();
381 
382     bcf_hdr_append(call->hdr,"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
383     if ( call->output_tags & CALL_FMT_GQ )
384         bcf_hdr_append(call->hdr,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Phred-scaled Genotype Quality\">");
385     if ( call->output_tags & CALL_FMT_GP )
386         bcf_hdr_append(call->hdr,"##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype posterior probabilities in the range 0 to 1\">");
387     if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
388         call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr));
389     bcf_hdr_append(call->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">");
390     bcf_hdr_append(call->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
391     bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">");
392     bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Average mapping quality\">");
393     if ( call->output_tags & CALL_FMT_PV4 )
394         bcf_hdr_append(call->hdr,"##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n");
395 
396     // init the prior
397     if ( call->theta>0 )
398     {
399         int i, n = 0;
400         if ( !call->ploidy ) n = 2*bcf_hdr_nsamples(call->hdr); // all are diploid
401         else
402         {
403             for (i=0; i<bcf_hdr_nsamples(call->hdr); i++)
404                 n += call->ploidy[i];
405         }
406         // Watterson factor, here aM_1 = aM_2 = 1
407         double aM = 1;
408         for (i=2; i<n; i++) aM += 1./i;
409         call->theta *= aM;
410         if ( call->theta >= 1 )
411         {
412             fprintf(stderr,"The prior is too big (theta*aM=%.2f), going with 0.99\n", call->theta);
413             call->theta = 0.99;
414         }
415         call->theta = log(call->theta);
416     }
417 }
418 
mcall_destroy(call_t * call)419 void mcall_destroy(call_t *call)
420 {
421     destroy_sample_groups(call);
422     if (call->vcmp) vcmp_destroy(call->vcmp);
423     free(call->itmp);
424     mcall_destroy_trios(call);
425     free(call->GPs);
426     free(call->ADs);
427     free(call->GLs);
428     free(call->GQs);
429     free(call->anno16);
430     free(call->PLs);
431     free(call->als_map);
432     free(call->pl_map);
433     free(call->gts); free(call->cgts); free(call->ugts);
434     free(call->pdg);
435     free(call->als);
436     free(call->ac);
437     return;
438 }
439 
440 
441 // Inits P(D|G): convert PLs from log space and normalize. In case of zero
442 // depth, missing PLs are all zero. In this case, pdg's are set to 0
443 // so that the corresponding genotypes can be set as missing and the
444 // qual calculation is not affected.
445 // Missing values are replaced by generic likelihoods when X (unseen allele) is
446 // present.
447 // NB: While the -m callig model uses the pdgs in canonical order,
448 // the original samtools -c calling code uses pdgs in reverse order (AA comes
449 // first, RR last).
450 // NB: Ploidy is not taken into account here, which is incorrect.
set_pdg(double * pl2p,int * PLs,double * pdg,int n_smpl,int n_gt,int unseen)451 void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unseen)
452 {
453     int i, j, nals;
454 
455     // find out the number of alleles, expecting diploid genotype likelihoods
456     bcf_gt2alleles(n_gt-1, &i, &nals);
457     assert( i==nals );
458     nals++;
459 
460     for (i=0; i<n_smpl; i++)
461     {
462         double sum = 0;
463         for (j=0; j<n_gt; j++)
464         {
465             if ( PLs[j]==bcf_int32_vector_end )
466             {
467                 // We expect diploid genotype likelihoods. If not diploid, treat as missing
468                 j = 0;
469                 break;
470             }
471             if ( PLs[j]==bcf_int32_missing ) break;
472             pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.);
473             sum += pdg[j];
474         }
475 
476         if ( j==0 )
477         {
478             // First value is missing (LK of RR), this indicates that
479             // all values are missing.
480             j = sum = n_gt;
481         }
482         else if ( j<n_gt && unseen<0 )
483         {
484             // Some of the values are missing and the unseen allele LK is not
485             // available. In such a case, we set LK to a very small value.
486             sum = 0;
487             for (j=0; j<n_gt; j++)
488             {
489                 assert( PLs[j]!=bcf_int32_vector_end );
490                 if ( PLs[j]==bcf_int32_missing ) PLs[j] = 255;
491                 pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.);
492                 sum += pdg[j];
493             }
494         }
495         if ( j<n_gt )
496         {
497             // Missing values present, fill with unseen allele LK. This can be only
498             // as good as the merge was.
499             int ia,ib, k;
500             j = 0;
501             sum = 0;
502             for (ia=0; ia<nals; ia++)
503             {
504                 for (ib=0; ib<=ia; ib++)
505                 {
506                     if ( PLs[j]==bcf_int32_missing )
507                     {
508                         k = bcf_alleles2gt(ia,unseen);
509                         if ( PLs[k]==bcf_int32_missing ) k = bcf_alleles2gt(ib,unseen);
510                         if ( PLs[k]==bcf_int32_missing ) k = bcf_alleles2gt(unseen,unseen);
511                         if ( PLs[k]==bcf_int32_missing )
512                         {
513                             // The PLs for unseen allele X are not present as well as for ia, ib.
514                             // This can happen with incremental calling, when one of the merged
515                             // files had all alleles A,C,G,T, in such a case, X was not present.
516                             // Use a very small value instead.
517                             PLs[j] = 255;
518                         }
519                         else
520                             PLs[j] = PLs[k];
521                     }
522                     pdg[j] = pl2p[ PLs[j] ];
523                     sum += pdg[j];
524                     j++;
525                 }
526             }
527         }
528         // Normalize: sum_i pdg_i = 1
529         if ( sum==n_gt )
530         {
531             // all missing
532             #if FLAT_PDG_FOR_MISSING
533                 for (j=0; j<n_gt; j++) pdg[j] = 1./n_gt;
534             #else
535                 for (j=0; j<n_gt; j++) pdg[j] = 0;
536             #endif
537         }
538         else
539             for (j=0; j<n_gt; j++) pdg[j] /= sum;
540 
541         PLs += n_gt;
542         pdg += n_gt;
543     }
544 }
545 
546 // Create mapping between old and new (trimmed) alleles
init_allele_trimming_maps(call_t * call,int nals_ori,int als_out)547 void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out)
548 {
549     int i, j, nout = 0;
550 
551     // als_map: old(i) -> new(j)
552     for (i=0; i<nals_ori; i++)
553     {
554         if ( als_out & (1<<i) ) call->als_map[i] = nout++;
555         else call->als_map[i] = -1;
556     }
557 
558     if ( !call->pl_map ) return;
559 
560     // pl_map: new(k) -> old(l)
561     int k = 0, l = 0;
562     for (i=0; i<nals_ori; i++)
563     {
564         for (j=0; j<=i; j++)
565         {
566             if ( (als_out & (1<<i)) && (als_out & (1<<j)) ) call->pl_map[k++] = l;
567             l++;
568         }
569     }
570 }
571 
572 /** log(exp(a)+exp(b)) */
logsumexp2(double a,double b)573 static inline double logsumexp2(double a, double b)
574 {
575     if ( a>b )
576         return log(1 + exp(b-a)) + a;
577     else
578         return log(1 + exp(a-b)) + b;
579 }
580 
581 // Macro to set the most likely alleles
582 #define UPDATE_MAX_LKs(als,sum) { \
583      if ( max_lk<lk_tot && lk_tot_set ) { max_lk = lk_tot; max_als = (als); } \
584      if ( sum ) lk_sum = logsumexp2(lk_tot,lk_sum); \
585 }
586 
587 #define SWAP(type_t,x,y) {type_t tmp; tmp = x; x = y; y = tmp; }
588 
589 // Determine the most likely combination of alleles. In this implementation,
590 // at most tri-allelic sites are considered. Returns the number of alleles.
mcall_find_best_alleles(call_t * call,int nals,smpl_grp_t * grp)591 static int mcall_find_best_alleles(call_t *call, int nals, smpl_grp_t *grp)
592 {
593     int ia,ib,ic;   // iterators over up to three alleles
594     int max_als=0;  // most likely combination of alleles
595     double ref_lk = -HUGE_VAL, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles
596     double lk_sum = -HUGE_VAL;    // for normalizing the likelihoods
597     int nsmpl = grp->nsmpl;
598     int ngts  = nals*(nals+1)/2;
599 
600     // Single allele
601     for (ia=0; ia<nals; ia++)
602     {
603         double lk_tot  = 0;
604         int lk_tot_set = 0;
605         int iaa = (ia+1)*(ia+2)/2-1;    // index in PL which corresponds to the homozygous "ia/ia" genotype
606         int ismpl;
607         for (ismpl=0; ismpl<nsmpl; ismpl++)
608         {
609             double *pdg = call->pdg + grp->smpl[ismpl]*ngts + iaa;
610             if ( *pdg ) { lk_tot += log(*pdg); lk_tot_set = 1; }
611         }
612         if ( ia==0 ) ref_lk = lk_tot;   // likelihood of 0/0 for all samples
613         else lk_tot += call->theta; // the prior
614         UPDATE_MAX_LKs(1<<ia, ia>0 && lk_tot_set);
615     }
616 
617     // Two alleles
618     if ( nals>1 )
619     {
620         for (ia=0; ia<nals; ia++)
621         {
622             if ( grp->qsum[ia]==0 ) continue;
623             int iaa = (ia+1)*(ia+2)/2-1;
624             for (ib=0; ib<ia; ib++)
625             {
626                 if ( grp->qsum[ib]==0 ) continue;
627                 double lk_tot  = 0;
628                 int lk_tot_set = 0;
629                 double fa  = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib]);
630                 double fb  = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib]);
631                 double fa2 = fa*fa;
632                 double fb2 = fb*fb;
633                 double fab = 2*fa*fb;
634                 int is, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
635                 for (is=0; is<nsmpl; is++)
636                 {
637                     int ismpl = grp->smpl[is];
638                     double *pdg = call->pdg + ismpl*ngts;
639                     double val = 0;
640                     if ( !call->ploidy || call->ploidy[ismpl]==2 )
641                         val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab];
642                     else if ( call->ploidy && call->ploidy[ismpl]==1 )
643                         val = fa*pdg[iaa] + fb*pdg[ibb];
644                     if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
645                 }
646                 if ( ia!=0 ) lk_tot += call->theta;    // the prior
647                 if ( ib!=0 ) lk_tot += call->theta;
648                 UPDATE_MAX_LKs(1<<ia|1<<ib, lk_tot_set);
649             }
650         }
651     }
652 
653     // Three alleles
654     if ( nals>2 )
655     {
656         for (ia=0; ia<nals; ia++)
657         {
658             if ( grp->qsum[ia]==0 ) continue;
659             int iaa = (ia+1)*(ia+2)/2-1;
660             for (ib=0; ib<ia; ib++)
661             {
662                 if ( grp->qsum[ib]==0 ) continue;
663                 int ibb = (ib+1)*(ib+2)/2-1;
664                 int iab = iaa - ia + ib;
665                 for (ic=0; ic<ib; ic++)
666                 {
667                     if ( grp->qsum[ic]==0 ) continue;
668                     double lk_tot  = 0;
669                     int lk_tot_set = 0;
670 
671                     double fa  = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
672                     double fb  = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
673                     double fc  = grp->qsum[ic]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
674                     double fa2 = fa*fa;
675                     double fb2 = fb*fb;
676                     double fc2 = fc*fc;
677                     double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc;
678                     int is, icc = (ic+1)*(ic+2)/2-1;
679                     int iac = iaa - ia + ic, ibc = ibb - ib + ic;
680                     for (is=0; is<nsmpl; is++)
681                     {
682                         int ismpl = grp->smpl[is];
683                         double *pdg = call->pdg + ismpl*ngts;
684                         double val = 0;
685                         if ( !call->ploidy || call->ploidy[ismpl]==2 )
686                             val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
687                         else if ( call->ploidy && call->ploidy[ismpl]==1 )
688                             val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc];
689                         if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
690                     }
691                     if ( ia!=0 ) lk_tot += call->theta;    // the prior
692                     if ( ib!=0 ) lk_tot += call->theta;    // the prior
693                     if ( ic!=0 ) lk_tot += call->theta;    // the prior
694                     UPDATE_MAX_LKs(1<<ia|1<<ib|1<<ic, lk_tot_set);
695                 }
696             }
697         }
698     }
699 
700     int i, n = 0;
701     for (i=0; i<nals; i++) if ( max_als & 1<<i) n++;
702 
703     grp->max_lk = max_lk;
704     grp->ref_lk = ref_lk;
705     grp->lk_sum = lk_sum;
706     grp->als  = max_als;
707     grp->nals = n;
708 
709     return n;
710 }
711 
712 // Sets GT=0/0 or GT=. if PL=0,0,0
mcall_set_ref_genotypes(call_t * call,int nals_ori)713 static void mcall_set_ref_genotypes(call_t *call, int nals_ori)
714 {
715     int i;
716     int ngts  = nals_ori*(nals_ori+1)/2;            // need this to distinguish between GT=0/0 vs GT=.
717     int nsmpl = bcf_hdr_nsamples(call->hdr);
718 
719     for (i=0; i<nals_ori; i++) call->ac[i] = 0;     // nals_new<=nals_ori, never mind setting extra 0's
720 
721     // Set all genotypes to 0/0 or 0
722     int *gts    = call->gts;
723     double *pdg = call->pdg;
724     int isample;
725     for (isample = 0; isample < nsmpl; isample++)
726     {
727         int ploidy = call->ploidy ? call->ploidy[isample] : 2;
728         for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
729         if ( i==ngts || !ploidy )
730         {
731             gts[0] = bcf_gt_missing;
732             gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
733         }
734         else
735         {
736             gts[0] = bcf_gt_unphased(0);
737             gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end;
738             call->ac[0] += ploidy;
739         }
740         gts += 2;
741         pdg += ngts;
742     }
743 }
744 
mcall_call_genotypes(call_t * call,int nals_ori,smpl_grp_t * grp)745 static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp)
746 {
747     int ia, ib, i;
748     int ngts_ori = nals_ori*(nals_ori+1)/2;
749     int ngts_new = call->nals_new*(call->nals_new+1)/2;
750     int nsmpl = grp->nsmpl;
751 
752     #if USE_PRIOR_FOR_GTS
753         float prior = exp(call->theta);
754     #endif
755 
756     int is;
757     for (is = 0; is < nsmpl; is++)
758     {
759         int ismpl   = grp->smpl[is];
760         double *pdg = call->pdg + ismpl*ngts_ori;
761         float *gps  = call->GPs + ismpl*ngts_new;
762         int *gts    = call->gts + ismpl*2;
763 
764         int ploidy = call->ploidy ? call->ploidy[ismpl] : 2;
765         assert( ploidy>=0 && ploidy<=2 );
766 
767         if ( !ploidy )
768         {
769             gts[0] = bcf_gt_missing;
770             gts[1] = bcf_int32_vector_end;
771             gps[0] = -1;
772             continue;
773         }
774 
775         #if !FLAT_PDG_FOR_MISSING
776             // Skip samples with zero depth, they have all pdg's equal to 0
777             for (i=0; i<ngts_ori; i++) if ( pdg[i]!=0.0 ) break;
778             if ( i==ngts_ori )
779             {
780                 gts[0] = bcf_gt_missing;
781                 gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
782                 gps[0] = -1;
783                 continue;
784             }
785         #endif
786 
787         // Default fallback for the case all LKs are the same
788         gts[0] = bcf_gt_unphased(0);
789         gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end;
790 
791         // Non-zero depth, determine the most likely genotype
792         double best_lk = 0;
793         for (ia=0; ia<nals_ori; ia++)
794         {
795             if ( !(grp->als & 1<<ia) ) continue;    // ia-th allele not in the final selection, skip
796             int iaa = (ia+1)*(ia+2)/2-1;                // PL index of the ia/ia genotype
797             double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia];
798             #if USE_PRIOR_FOR_GTS
799                 if ( ia!=0 ) lk *= prior;
800             #endif
801             int igt  = ploidy==2 ? bcf_alleles2gt(call->als_map[ia],call->als_map[ia]) : call->als_map[ia];
802             gps[igt] = lk;
803             if ( best_lk < lk )
804             {
805                 best_lk = lk;
806                 gts[0] = bcf_gt_unphased(call->als_map[ia]);
807             }
808         }
809         if ( ploidy==2 )
810         {
811             gts[1] = gts[0];
812             for (ia=0; ia<nals_ori; ia++)
813             {
814                 if ( !(grp->als & 1<<ia) ) continue;
815                 int iaa = (ia+1)*(ia+2)/2-1;
816                 for (ib=0; ib<ia; ib++)
817                 {
818                     if ( !(grp->als & 1<<ib) ) continue;
819                     int iab = iaa - ia + ib;
820                     double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib];
821                     #if USE_PRIOR_FOR_GTS
822                         if ( ia!=0 ) lk *= prior;
823                         if ( ib!=0 ) lk *= prior;
824                     #endif
825                     int igt  = bcf_alleles2gt(call->als_map[ia],call->als_map[ib]);
826                     gps[igt] = lk;
827                     if ( best_lk < lk )
828                     {
829                         best_lk = lk;
830                         gts[0] = bcf_gt_unphased(call->als_map[ib]);
831                         gts[1] = bcf_gt_unphased(call->als_map[ia]);
832                     }
833                 }
834             }
835         }
836         else
837             gts[1] = bcf_int32_vector_end;
838 
839         call->ac[ bcf_gt_allele(gts[0]) ]++;
840         if ( gts[1]!=bcf_int32_vector_end ) call->ac[ bcf_gt_allele(gts[1]) ]++;
841     }
842     if ( !(call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP)) ) return;
843     double max, sum;
844     for (is=0; is<nsmpl; is++)
845     {
846         int ismpl  = grp->smpl[is];
847         float *gps = call->GPs + ismpl*ngts_new;
848 
849         int nmax;
850         if ( call->ploidy )
851         {
852             if ( call->ploidy[ismpl]==2 ) nmax = ngts_new;
853             else if ( call->ploidy[ismpl]==1 ) nmax = grp->nals;
854             else nmax = 0;
855         }
856         else nmax = ngts_new;
857 
858         max = gps[0];
859         if ( max<0 || nmax==0 )
860         {
861             // no call
862             if ( call->output_tags & CALL_FMT_GP )
863             {
864                 for (i=0; i<nmax; i++) gps[i] = 0;
865                 if ( nmax==0 ) { bcf_float_set_missing(gps[i]); nmax++; }
866                 if ( nmax < ngts_new ) bcf_float_set_vector_end(gps[nmax]);
867             }
868             call->GQs[ismpl] = 0;
869             continue;
870         }
871         sum = gps[0];
872         for (i=1; i<nmax; i++)
873         {
874             if ( max < gps[i] ) max = gps[i];
875             sum += gps[i];
876         }
877         max = -4.34294*log(1 - max/sum);
878         call->GQs[ismpl] = max<=INT8_MAX ? max : INT8_MAX;
879         if ( call->output_tags & CALL_FMT_GP )
880         {
881             assert( max );
882             for (i=0; i<nmax; i++) gps[i] = gps[i]/sum;
883             for (; i<ngts_new; i++) bcf_float_set_vector_end(gps[i]);
884         }
885     }
886 }
887 
888 
889 /**
890     Pm = P(mendelian) .. parameter to vary, 1-Pm is the probability of novel mutation.
891                          When trio_Pm_ins is negative, Pm is calculated dynamically
892                          according to indel length. For simplicity, only the
893                          first ALT is considered.
894     Pkij = P(k|i,j)   .. probability that the genotype combination i,j,k is consistent
895                          with mendelian inheritance (the likelihood that offspring
896                          of two HETs is a HOM is smaller than it being a HET)
897 
898     P_uc(F=i,M=j,K=k) = P(F=i) . P(M=j) . P(K=k)  .. unconstrained P
899     P_c(F=i,M=j,K=k) = P_uc . Pkij                .. constrained P
900     P(F=i,M=j,K=k) = P_uc . (1 - Pm) + P_c . Pm
901                    = P_uc . [1 - Pm + Pkij . Pm]
902 
903     We choose genotype combination i,j,k which maximizes P(F=i,M=j,K=k). This
904     probability gives the quality GQ(Trio).
905     Individual qualities are calculated as
906         GQ(F=i,M=j,K=k) = P(F=i,M=j,K=k) / \sum_{x,y} P(F=i,M=x,K=y)
907  */
908 #if 0
909 static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nals_new, int als_new)
910 {
911     int ia, ib, i;
912     int nsmpl    = bcf_hdr_nsamples(call->hdr);
913     int ngts     = nals*(nals+1)/2;
914     int nout_gts = nals_new*(nals_new+1)/2;
915     double *gls  = call->GLs - nout_gts;
916     double *pdg  = call->pdg - ngts;
917 
918     // Calculate individuals' genotype likelihoods P(X=i)
919     int isample;
920     for (isample = 0; isample < nsmpl; isample++)
921     {
922         int ploidy = call->ploidy ? call->ploidy[isample] : 2;
923         int32_t *gts = call->ugts + isample;
924 
925         gls += nout_gts;
926         pdg += ngts;
927 
928         // Skip samples with all pdg's equal to 1. These have zero depth.
929         for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
930         if ( i==ngts || !ploidy )
931         {
932             gts[0] = -1;
933             gls[0] = 1;
934             continue;
935         }
936 
937         for (i=0; i<nout_gts; i++) gls[i] = -HUGE_VAL;
938 
939         grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]];
940         double sum_lk  = 0;
941         double best_lk = 0;
942         for (ia=0; ia<nals; ia++)
943         {
944             if ( !(als_new & 1<<ia) ) continue;     // ia-th allele not in the final selection, skip
945             int iaa   = bcf_alleles2gt(ia,ia);      // PL index of the ia/ia genotype
946             int idx   = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
947             double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia];
948             sum_lk   += lk;
949             gls[idx]  = lk;
950             if ( best_lk < lk )
951             {
952                 best_lk = lk;
953                 gts[0] = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
954             }
955         }
956         if ( ploidy==2 )
957         {
958             for (ia=0; ia<nals; ia++)
959             {
960                 if ( !(als_new & 1<<ia) ) continue;
961                 for (ib=0; ib<ia; ib++)
962                 {
963                     if ( !(als_new & 1<<ib) ) continue;
964                     int iab   = bcf_alleles2gt(ia,ib);
965                     int idx   = bcf_alleles2gt(call->als_map[ia],call->als_map[ib]);
966                     double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib];
967                     sum_lk   += lk;
968                     gls[idx]  = lk;
969                     if ( best_lk < lk )
970                     {
971                         best_lk = lk;
972                         gts[0] = bcf_alleles2gt(call->als_map[ib],call->als_map[ia]);
973                     }
974                 }
975             }
976         }
977         for (i=0; i<nout_gts; i++)
978             if ( gls[i]!=-HUGE_VAL ) gls[i] = log(gls[i]/sum_lk);
979     }
980 
981     // Set novel mutation rate for this site: using first ALT allele for simplicity.
982     double trio_Pm;
983     if ( call->trio_Pm_ins<0 && call->trio_Pm_del<0 ) trio_Pm = call->trio_Pm_SNPs;     // the same Pm for indels and SNPs requested
984     else
985     {
986         int ret = bcf_get_variant_types(rec);
987         if ( !(ret & VCF_INDEL) ) trio_Pm = call->trio_Pm_SNPs;
988         else
989         {
990             if ( call->trio_Pm_ins<0 )  // dynamic calculation, trio_Pm_del holds the scaling factor
991             {
992                 trio_Pm = rec->d.var[1].n<0 ? -21.9313 - 0.2856*rec->d.var[1].n : -22.8689 + 0.2994*rec->d.var[1].n;
993                 trio_Pm = 1 - call->trio_Pm_del * exp(trio_Pm);
994             }
995             else                        // snps and indels set explicitly
996             {
997                 trio_Pm = rec->d.var[1].n<0 ? call->trio_Pm_del : call->trio_Pm_ins;
998             }
999         }
1000     }
1001 
1002     // Calculate constrained likelihoods and determine genotypes
1003     int ifm;
1004     for (ifm=0; ifm<call->nfams; ifm++)
1005     {
1006         family_t *fam = &call->fams[ifm];
1007         int ntrio = call->ntrio[fam->type][nals_new];
1008         uint16_t *trio = call->trio[fam->type][nals_new];
1009 
1010         // Unconstrained likelihood
1011         int uc_itr = 0;
1012         double uc_lk = 0;
1013         for (i=0; i<3; i++)     // for father, mother, child
1014         {
1015             int ismpl = fam->sample[i];
1016             double *gl = call->GLs + nout_gts*ismpl;
1017             if ( gl[0]==1 ) continue;
1018             int j, jmax = 0;
1019             double max  = gl[0];
1020             for (j=1; j<nout_gts; j++)
1021                 if ( max < gl[j] ) { max = gl[j]; jmax = j; }
1022             uc_lk += max;
1023             uc_itr |= jmax << ((2-i)*4);
1024         }
1025 
1026         // Best constrained likelihood
1027         int c_itr = -1, itr, uc_is_mendelian = 0;
1028         double c_lk = -HUGE_VAL;
1029         for (itr=0; itr<ntrio; itr++)   // for each trio genotype combination
1030         {
1031             double lk = 0;
1032             int npresent = 0;
1033             for (i=0; i<3; i++)     // for father, mother, child
1034             {
1035                 int ismpl = fam->sample[i];
1036                 double *gl = call->GLs + nout_gts*ismpl;
1037                 if ( gl[0]==1 ) continue;
1038                 int igt = trio[itr]>>((2-i)*4) & 0xf;
1039                 assert( !call->ploidy || call->ploidy[ismpl]>0 );
1040                 if ( igt==GT_SKIP ) continue;
1041                 lk += gl[igt];
1042                 npresent++;
1043                 // fprintf(stderr," %e", gl[igt]);
1044             }
1045             // fprintf(stderr,"\t\t");
1046             double Pkij = npresent==3 ? (double)2/(trio[itr]>>12) : 1;  // with missing genotypes Pkij's are different
1047             lk += log(1 - trio_Pm * (1 - Pkij));
1048             // fprintf(stderr,"%d%d%d\t%e\t%.2f\n", trio[itr]>>8&0xf,trio[itr]>>4&0xf,trio[itr]&0xf, lk, Pkij);
1049             if ( c_lk < lk ) { c_lk = lk; c_itr = trio[itr]; }
1050             if ( uc_itr==trio[itr] ) uc_is_mendelian = 1;
1051         }
1052 
1053         if ( !uc_is_mendelian )
1054         {
1055             uc_lk += log(1 - trio_Pm);
1056             // fprintf(stderr,"c_lk=%e uc_lk=%e c_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,uc_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf);
1057             if ( c_lk < uc_lk ) { c_lk = uc_lk; c_itr = uc_itr; }
1058         }
1059         // fprintf(stderr,"best_lk=%e best_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf);
1060 
1061         // Set genotypes for father, mother, child and calculate genotype qualities
1062         for (i=0; i<3; i++)
1063         {
1064             // GT
1065             int ismpl    = fam->sample[i];
1066             int igt      = c_itr>>((2-i)*4) & 0xf;
1067             double *gl   = call->GLs + nout_gts*ismpl;
1068             int32_t *gts = call->cgts + ismpl;
1069             if ( gl[0]==1 || igt==GT_SKIP )    // zero depth, set missing genotypes
1070             {
1071                 gts[0] = -1;
1072                 // bcf_float_set_missing(call->GQs[ismpl]);
1073                 continue;
1074             }
1075             gts[0] = igt;
1076 
1077             #if 0
1078                 // todo: Genotype Qualities
1079                 //
1080                 // GQ: for each family member i sum over all genotypes j,k keeping igt fixed
1081                 double lk_sum = 0;
1082                 for (itr=0; itr<ntrio; itr++)
1083                 {
1084                     if ( igt != (trio[itr]>>((2-i)*4) & 0xf) ) continue;
1085                     double lk = 0;
1086                     int j;
1087                     for (j=0; j<3; j++)
1088                     {
1089                         int jsmpl = fam->sample[j];
1090                         double *gl = call->GLs + ngts*jsmpl;
1091                         if ( gl[0]==1 ) continue;
1092                         int jgt = trio[itr]>>((2-j)*4) & 0xf;
1093                         if ( jgt==GT_SKIP ) continue;
1094                         lk += gl[jgt];
1095                     }
1096                     double Pkij = (double)2/(trio[itr]>>12);
1097                     lk += log(1 - trio_Pm * (1 - Pkij));
1098                     lk_sum = logsumexp2(lk_sum, lk);
1099                 }
1100                 if ( !uc_is_mendelian && (best_itr>>((2-i)*4)&0xf)==(uc_itr>>((2-i)*4)&0xf) ) lk_sum = logsumexp2(lk_sum,uc_lk);
1101                 call->GQs[ismpl] = -4.3429*(best_lk - lk_sum);
1102             #endif
1103         }
1104     }
1105 
1106     for (i=0; i<4; i++) call->ac[i] = 0;
1107     call->nhets = 0;
1108     call->ndiploid = 0;
1109 
1110     // Test if CGT,UGT are needed
1111     int ucgts_needed = 0;
1112     int32_t *cgts = call->cgts - 1;
1113     int32_t *ugts = call->ugts - 1;
1114     int32_t *gts  = call->gts - 2;
1115     for (isample = 0; isample < nsmpl; isample++)
1116     {
1117         int ploidy = call->ploidy ? call->ploidy[isample] : 2;
1118         cgts++;
1119         ugts++;
1120         gts += 2;
1121         if ( ugts[0]==-1 )
1122         {
1123             gts[0] = bcf_gt_missing;
1124             gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
1125             continue;
1126         }
1127         int a,b;
1128         if ( cgts[0]!=ugts[0] )
1129         {
1130             bcf_gt2alleles(cgts[0], &a, &b);
1131             gts[0] = bcf_gt_unphased(a);
1132             gts[1] = ploidy==1 ? bcf_int32_vector_end : bcf_gt_unphased(b);
1133         }
1134         else
1135         {
1136             bcf_gt2alleles(ugts[0], &a, &b);
1137             gts[0] = bcf_gt_unphased(a);
1138             gts[1] = ploidy==1 ? bcf_int32_vector_end : bcf_gt_unphased(b);
1139         }
1140         if ( cgts[0]!=ugts[0] ) ucgts_needed = 1;
1141         call->ac[a]++;
1142         if ( ploidy==2 )
1143         {
1144             call->ac[b]++;
1145             call->ndiploid++;
1146             if ( a!=b ) call->nhets++;
1147         }
1148     }
1149     if ( ucgts_needed )
1150     {
1151         // Some GTs are different
1152         bcf_update_format_int32(call->hdr,rec,"UGT",call->ugts,nsmpl);
1153         bcf_update_format_int32(call->hdr,rec,"CGT",call->cgts,nsmpl);
1154     }
1155 }
1156 #endif
1157 
mcall_trim_and_update_PLs(call_t * call,bcf1_t * rec,int nals_ori,int nals_new)1158 static void mcall_trim_and_update_PLs(call_t *call, bcf1_t *rec, int nals_ori, int nals_new)
1159 {
1160     int npls_src = nals_ori*(nals_ori+1)/2;
1161     int npls_dst = nals_new*(nals_new+1)/2;     // number of PL values in diploid samples, ori and new
1162     if ( call->all_diploid && npls_src == npls_dst ) return;
1163 
1164     int *pls_src = call->PLs, *pls_dst = call->PLs;
1165 
1166     int nsmpl = bcf_hdr_nsamples(call->hdr);
1167     int isample, ia;
1168     for (isample = 0; isample < nsmpl; isample++)
1169     {
1170         int ploidy = call->ploidy ? call->ploidy[isample] : 2;
1171         if ( ploidy==2 )
1172         {
1173             for (ia=0; ia<npls_dst; ia++)
1174                 pls_dst[ia] =  pls_src[ call->pl_map[ia] ];
1175         }
1176         else if ( ploidy==1 )
1177         {
1178             for (ia=0; ia<nals_new; ia++)
1179             {
1180                 int isrc = (ia+1)*(ia+2)/2-1;
1181                 pls_dst[ia] = pls_src[ call->pl_map[isrc] ];
1182             }
1183             if ( ia<npls_dst ) pls_dst[ia] = bcf_int32_vector_end;
1184         }
1185         else
1186         {
1187             pls_dst[0] = bcf_int32_missing;
1188             pls_dst[1] = bcf_int32_vector_end;  // relying on nals_new>1 in mcall()
1189         }
1190         pls_src += npls_src;
1191         pls_dst += npls_dst;
1192     }
1193     bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*nsmpl);
1194 }
1195 
mcall_trim_and_update_numberR(call_t * call,bcf1_t * rec,int nals_ori,int nals_new)1196 void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new)
1197 {
1198     if ( nals_ori==nals_new ) return;
1199 
1200     int i,j, nret, size = sizeof(float);
1201 
1202     void *tmp_ori = call->itmp, *tmp_new = call->PLs;  // reusing PLs storage which is not used at this point
1203     int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs;
1204 
1205     // INFO fields
1206     for (i=0; i<rec->n_info; i++)
1207     {
1208         bcf_info_t *info = &rec->d.info[i];
1209         int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_INFO,info->key);
1210         if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
1211 
1212         int type  = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key);
1213         const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key);
1214         nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
1215         if ( nret<=0 ) continue;
1216 
1217         if ( nals_new==1 )
1218             bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1);     // has to be the REF, the order could not change
1219         else
1220         {
1221             for (j=0; j<nals_ori; j++)
1222             {
1223                 int k = call->als_map[j];
1224                 if ( k==-1 ) continue;   // to be dropped
1225                 memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size);
1226             }
1227             bcf_update_info_int32(call->hdr, rec, key, tmp_new, nals_new);
1228         }
1229     }
1230 
1231     // FORMAT fields
1232     for (i=0; i<rec->n_fmt; i++)
1233     {
1234         bcf_fmt_t *fmt = &rec->d.fmt[i];
1235         int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id);
1236         if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
1237 
1238         int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id);
1239         const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id);
1240         nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
1241         if (nret<=0) continue;
1242         int nsmpl = bcf_hdr_nsamples(call->hdr);
1243 
1244         assert( nret==nals_ori*nsmpl );
1245 
1246         for (j=0; j<nsmpl; j++)
1247         {
1248             char *ptr_src = (char *)tmp_ori + j*nals_ori*size;
1249             char *ptr_dst = (char *)tmp_new + j*nals_new*size;
1250             int k;
1251             for (k=0; k<nals_ori; k++)
1252             {
1253                 int l = call->als_map[k];
1254                 if ( l==-1 ) continue;   // to be dropped
1255                 memcpy(ptr_dst+size*l, ptr_src+size*k, size);
1256             }
1257         }
1258         bcf_update_format_int32(call->hdr, rec, key, tmp_new, nals_new*nsmpl);
1259     }
1260 
1261     call->PLs    = (int32_t*) tmp_new;
1262     call->mPLs   = ntmp_new;
1263     call->itmp   = (int32_t*) tmp_ori;
1264     call->n_itmp = ntmp_ori;
1265 }
1266 
1267 
1268 // NB: in this function we temporarily use calls->als_map for a different
1269 // purpose to store mapping from new (target) alleles to original alleles.
1270 //
mcall_constrain_alleles(call_t * call,bcf1_t * rec,int * unseen)1271 static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
1272 {
1273     assert( call->tgt_als->n );
1274     if ( call->tgt_als->n>5 ) error("Maximum accepted number of alleles is 5, got %d\n", call->tgt_als->n);
1275     hts_expand(char*,call->tgt_als->n+1,call->nals,call->als);
1276 
1277     int has_new = 0;
1278 
1279     int i, j, nals = 1;
1280     for (i=1; i<call->nals_map; i++) call->als_map[i] = -1;
1281 
1282     if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], call->tgt_als->allele[0]) < 0 )
1283         error("The reference alleles are not compatible at %s:%d .. %s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,call->tgt_als->allele[0],rec->d.allele[0]);
1284 
1285     // create mapping from new to old alleles
1286     call->als[0] = call->tgt_als->allele[0];
1287     call->als_map[0] = 0;
1288 
1289     for (i=1; i<call->tgt_als->n; i++)
1290     {
1291         call->als[nals] = call->tgt_als->allele[i];
1292         j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]);
1293 
1294         // if ( j+1==*unseen )
1295         // {
1296         //     fprintf(stderr,"Fixme? Cannot constrain to %d-th allele (%s); j=%d,unseen=%d. VCF=",i,call->tgt_als->allele[i],j,*unseen);
1297         //     int k;
1298         //     for (k=0; k<rec->n_allele; k++) fprintf(stderr,"%s%s",k==0?"":",",rec->d.allele[k]);
1299         //     fprintf(stderr,"\tTAB=");
1300         //     for (k=0; k<call->tgt_als->n; k++) fprintf(stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]);
1301         //     fprintf(stderr,"\n");
1302         //     return -1;
1303         // }
1304 
1305         if ( j>=0 )
1306         {
1307             // existing allele
1308             call->als_map[nals] = j+1;
1309         }
1310         else
1311         {
1312             // There is a new allele in targets which is not present in VCF.
1313             // We use the X allele to estimate PLs. Note that X may not be
1314             // present at multiallelic indels sites. In that case we use the
1315             // last allele anyway, because the least likely allele comes last
1316             // in mpileup's ALT output.
1317             call->als_map[nals] = (*unseen)>=0 ? *unseen : rec->n_allele - 1;
1318             has_new = 1;
1319         }
1320         nals++;
1321     }
1322     if ( *unseen )
1323     {
1324         call->als_map[nals] = *unseen;
1325         call->als[nals] = rec->d.allele[*unseen];
1326         nals++;
1327     }
1328 
1329     if ( !has_new && nals==rec->n_allele ) return 0;
1330     bcf_update_alleles(call->hdr, rec, (const char**)call->als, nals);
1331 
1332     // create mapping from new PL to old PL
1333     int k = 0;
1334     for (i=0; i<nals; i++)
1335     {
1336         for (j=0; j<=i; j++)
1337         {
1338             int a = call->als_map[i], b = call->als_map[j];
1339             call->pl_map[k++] = a>b ? a*(a+1)/2 + b : b*(b+1)/2 + a;
1340         }
1341     }
1342 
1343     // update PL
1344     call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
1345     int nsmpl  = bcf_hdr_nsamples(call->hdr);
1346     int npls_ori = call->nPLs / nsmpl;
1347     int npls_new = k;
1348     hts_expand(int32_t,npls_new*nsmpl,call->n_itmp,call->itmp);
1349     int *ori_pl = call->PLs, *new_pl = call->itmp;
1350     for (i=0; i<nsmpl; i++)
1351     {
1352         for (k=0; k<npls_new; k++)
1353         {
1354             new_pl[k] = ori_pl[call->pl_map[k]];
1355             if ( new_pl[k]==bcf_int32_missing && *unseen>=0 )
1356             {
1357                 // missing value, and there is an unseen allele: identify the
1358                 // alleles and use the lk of either AX or XX
1359                 int k_ori = call->pl_map[k], ia, ib;
1360                 bcf_gt2alleles(k_ori, &ia, &ib);
1361                 k_ori = bcf_alleles2gt(ia,*unseen);
1362                 if ( ori_pl[k_ori]==bcf_int32_missing ) k_ori = bcf_alleles2gt(ib,*unseen);
1363                 if ( ori_pl[k_ori]==bcf_int32_missing ) k_ori = bcf_alleles2gt(*unseen,*unseen);
1364                 new_pl[k] = ori_pl[k_ori];
1365             }
1366             if ( !k && new_pl[k]==bcf_int32_vector_end ) new_pl[k]=bcf_int32_missing;
1367         }
1368         ori_pl += npls_ori;
1369         new_pl += npls_new;
1370     }
1371     bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl);
1372 
1373     // update QS, use temporarily call->GPs to store the values
1374     int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum);
1375     hts_expand(float,nals,call->nGPs,call->GPs);
1376     for (i=0; i<nals; i++)
1377         call->GPs[i] = call->als_map[i]<nqs ? call->smpl_grp[0].qsum[call->als_map[i]] : 0;
1378     bcf_update_info_float(call->hdr, rec, "QS", call->GPs, nals);
1379 
1380     // update any Number=R tags
1381     void *tmp_ori = call->itmp, *tmp_new = call->PLs;  // reusing PLs storage which is not used at this point
1382     int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs;
1383     for (i=0; i<rec->n_fmt; i++)
1384     {
1385         bcf_fmt_t *fmt = &rec->d.fmt[i];
1386         int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id);
1387         if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
1388 
1389         // NB:works only for BCF_HT_INT and BCF_HT_REAL
1390         int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id);
1391         assert( type==BCF_HT_INT || type==BCF_HT_REAL );
1392         assert( sizeof(float)==sizeof(int32_t) );
1393 
1394         const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id);
1395         int nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
1396         if (nret<=0) continue;
1397         int nsmpl = bcf_hdr_nsamples(call->hdr);
1398         int size1 = sizeof(float);
1399         hts_expand(float, nsmpl * nals, ntmp_new, tmp_new);
1400         for (j=0; j<nsmpl; j++)
1401         {
1402             uint8_t *ptr_ori = (uint8_t *) tmp_ori + j*size1*fmt->n;
1403             uint8_t *ptr_new = (uint8_t *) tmp_new + j*nals*size1;
1404             for (k=0; k<nals; k++)
1405             {
1406                 uint8_t *dst = ptr_new + size1*k;
1407                 uint8_t *src = ptr_ori + size1*call->als_map[k];
1408                 memcpy(dst,src,size1);
1409             }
1410         }
1411         nret = bcf_update_format(call->hdr, rec, key, tmp_new, nsmpl*nals, type);
1412         assert( nret==0 );
1413     }
1414     call->PLs    = (int32_t*) tmp_new;
1415     call->mPLs   = ntmp_new;
1416     call->itmp   = (int32_t*) tmp_ori;
1417     call->n_itmp = ntmp_ori;
1418 
1419     if ( *unseen ) *unseen = nals-1;
1420     return 0;
1421 }
1422 
1423 
1424 /**
1425   *  This function implements the multiallelic calling model. It has two major parts:
1426   *   1) determine the most likely set of alleles and calculate the quality of ref/non-ref site
1427   *   2) determine and set the genotypes
1428   *  In various places in between, the BCF record gets updated.
1429   */
mcall(call_t * call,bcf1_t * rec)1430 int mcall(call_t *call, bcf1_t *rec)
1431 {
1432     int i,j, unseen = call->unseen;
1433 
1434     // Force alleles when calling genotypes given alleles was requested
1435     if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2;
1436 
1437     int nsmpl    = bcf_hdr_nsamples(call->hdr);
1438     int nals_ori = rec->n_allele;
1439     hts_expand(int,nals_ori,call->nac,call->ac);
1440     hts_expand(int,nals_ori,call->nals_map,call->als_map);
1441     hts_expand(int,nals_ori*(nals_ori+1)/2,call->npl_map,call->pl_map);
1442 
1443     // Get the genotype likelihoods
1444     call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
1445     if ( call->nPLs!=nsmpl*nals_ori*(nals_ori+1)/2 && call->nPLs!=nsmpl*nals_ori )  // a mixture of diploid and haploid or haploid only
1446         error("Wrong number of PL fields? nals=%d npl=%d\n", nals_ori,call->nPLs);
1447 
1448     // Convert PLs to probabilities
1449     int ngts_ori = nals_ori*(nals_ori+1)/2;
1450     hts_expand(double, call->nPLs, call->npdg, call->pdg);
1451     set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts_ori, unseen);
1452 
1453     // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes.
1454     if ( call->nsmpl_grp == 1  )
1455     {
1456         int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum);
1457         if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
1458         if ( nqs < nals_ori )
1459         {
1460             // Some of the listed alleles do not have the corresponding QS field. This is
1461             // typically ref-only site with <*> in ALT.
1462             hts_expand(float,nals_ori,call->smpl_grp[0].nqsum,call->smpl_grp[0].qsum);
1463             for (i=nqs; i<nals_ori; i++) call->smpl_grp[0].qsum[i] = 0;
1464         }
1465     }
1466     else
1467     {
1468         for (j=0; j<call->nsmpl_grp; j++)
1469         {
1470             hts_expand(float,nals_ori,call->smpl_grp[j].nqsum,call->smpl_grp[j].qsum);
1471             memset(call->smpl_grp[j].qsum, 0, sizeof(float)*nals_ori);
1472         }
1473 
1474         // Use FORMAT/AD or FORMAT/QS
1475         int nad = bcf_get_format_int32(call->hdr, rec, call->sample_groups_tag, &call->ADs, &call->nADs);
1476         if ( nad<1 ) error("Error: FORMAT/%s is required with the -G option, mpileup must be run with \"-a AD\" or \"-a QS\"\n",call->sample_groups_tag);
1477         nad /= bcf_hdr_nsamples(call->hdr);
1478         for (i=0; i<call->nsmpl_grp; i++)
1479         {
1480             int is;
1481             smpl_grp_t *grp = &call->smpl_grp[i];
1482             hts_expand(float,nals_ori,grp->nqsum,grp->qsum);
1483             for (j=0; j<nals_ori; j++) grp->qsum[j] = 0;
1484             for (is=0; is<grp->nsmpl; is++)
1485             {
1486                 int ismpl = grp->smpl[is];
1487                 int32_t *ptr = call->ADs + ismpl*nad;
1488                 float sum = 0;
1489                 for (j=0; j<nad; j++)
1490                 {
1491                     if ( ptr[j]==bcf_int32_vector_end ) break;
1492                     if ( ptr[j]!=bcf_int32_missing ) sum += ptr[j];
1493                 }
1494                 if ( sum )
1495                 {
1496                     for (j=0; j<nad; j++)
1497                     {
1498                         if ( ptr[j]==bcf_int32_vector_end ) break;
1499                         if ( ptr[j]!=bcf_int32_missing ) grp->qsum[j] += ptr[j]/sum;
1500                     }
1501                 }
1502             }
1503         }
1504     }
1505 
1506     // If available, take into account reference panel AFs
1507     if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
1508     {
1509         int an = call->ac[0];   // number of alleles total, procede only if not zero; reuse call->ac
1510         if ( an > 0 && bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals_ori-1 )    // number of ALT alleles
1511         {
1512             int ac0 = an;       // this will become the number of REFs
1513             for (i=0; i<nals_ori-1; i++)
1514             {
1515                 if ( call->ac[i]==bcf_int32_vector_end ) break;
1516                 if ( call->ac[i]==bcf_int32_missing ) continue;
1517                 ac0 -= call->ac[i];
1518 
1519                 // here an*0.5 is the number of samples in the populatio and ac*0.5 is the AF weighted by the number of samples
1520                 for (j=0; j<call->nsmpl_grp; j++)
1521                     call->smpl_grp[j].qsum[i+1] = (call->smpl_grp[j].qsum[i+1] + 0.5*call->ac[i]) / (call->smpl_grp[j].nsmpl + 0.5*an);
1522             }
1523             if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1);
1524             for (j=0; j<call->nsmpl_grp; j++)
1525                 call->smpl_grp[j].qsum[0] = (call->smpl_grp[j].qsum[0] + 0.5*ac0) / (call->smpl_grp[j].nsmpl + 0.5*an);
1526         }
1527     }
1528 
1529     // normalize so that QS sums to 1 for each group
1530     for (j=0; j<call->nsmpl_grp; j++)
1531     {
1532         float sum = 0;
1533         for (i=0; i<nals_ori; i++) sum += call->smpl_grp[j].qsum[i];
1534         if ( sum ) for (i=0; i<nals_ori; i++) call->smpl_grp[j].qsum[i] /= sum;
1535     }
1536 
1537     bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0);      // remove QS tag
1538 
1539     if ( nals_ori > 8*sizeof(call->als_new) )
1540     {
1541         fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
1542         return 0;
1543     }
1544 
1545     // For each group find the best combination of alleles
1546     call->als_new = 0;
1547     double ref_lk = -HUGE_VAL, lk_sum = -HUGE_VAL, max_qual = -HUGE_VAL;
1548     for (j=0; j<call->nsmpl_grp; j++)
1549     {
1550         smpl_grp_t *grp = &call->smpl_grp[j];
1551         mcall_find_best_alleles(call, nals_ori, grp);
1552         call->als_new |= grp->als;
1553         if ( grp->max_lk==-HUGE_VAL ) continue;
1554         double qual = -4.343*(grp->ref_lk - logsumexp2(grp->lk_sum,grp->ref_lk));
1555         if ( max_qual < qual )
1556         {
1557             max_qual = qual;
1558             lk_sum = grp->lk_sum;
1559             ref_lk = grp->ref_lk;
1560         }
1561     }
1562 
1563     // Make sure the REF allele is always present
1564     if ( !(call->als_new&1) ) call->als_new |= 1;
1565 
1566     int is_variant = call->als_new==1 ? 0 : 1;
1567     if ( call->flag & CALL_VARONLY && !is_variant ) return 0;
1568 
1569     call->nals_new = 0;
1570     for (i=0; i<nals_ori; i++)
1571     {
1572         if ( i>0 && i==unseen ) continue;
1573         if ( call->flag & CALL_KEEPALT ) call->als_new |= 1<<i;
1574         if ( call->als_new & (1<<i) ) call->nals_new++;
1575     }
1576 
1577     init_allele_trimming_maps(call,nals_ori,call->als_new);
1578 
1579     int nAC = 0;
1580     if ( call->als_new==1 )   // only REF allele on output
1581     {
1582         mcall_set_ref_genotypes(call,nals_ori);
1583         bcf_update_format_int32(call->hdr, rec, "PL", NULL, 0);    // remove PL, useless now
1584     }
1585     else if ( !is_variant )
1586     {
1587         mcall_set_ref_genotypes(call,nals_ori);     // running with -A, prevent mcall_call_genotypes from putting some ALT back
1588         mcall_trim_and_update_PLs(call, rec, nals_ori, call->nals_new);
1589     }
1590     else
1591     {
1592         // The most likely set of alleles includes non-reference allele (or was enforced), call genotypes.
1593         // Note that it is a valid outcome if the called genotypes exclude some of the ALTs.
1594         int ngts_new = call->nals_new*(call->nals_new+1)/2;
1595         hts_expand(float,ngts_new*nsmpl,call->nGPs,call->GPs);
1596         for (i=0; i<call->nals_new; i++) call->ac[i] = 0;
1597 
1598         if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 )
1599         {
1600             fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
1601             return 0;
1602         }
1603         if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
1604         {
1605             memset(call->GPs,0,nsmpl*ngts_new*sizeof(*call->GPs));
1606             memset(call->GQs,0,nsmpl*sizeof(*call->GQs));
1607         }
1608         for (i=0; i<call->nsmpl_grp; i++)
1609         {
1610             if ( call->flag & CALL_CONSTR_TRIO )
1611                 error("todo: constrained trio calling temporarily disabled\n");   //mcall_call_trio_genotypes(call,rec,nals,&call->smpl_grp[i]);
1612             else
1613                 mcall_call_genotypes(call,nals_ori,&call->smpl_grp[i]);
1614         }
1615 
1616         // Skip the site if all samples are 0/0. This can happen occasionally.
1617         for (i=1; i<call->nals_new; i++) nAC += call->ac[i];
1618         if ( !nAC && call->flag & CALL_VARONLY ) return 0;
1619 
1620         if ( call->output_tags & CALL_FMT_GP )
1621             bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*ngts_new);
1622         if ( call->output_tags & CALL_FMT_GQ )
1623             bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl);
1624 
1625         mcall_trim_and_update_PLs(call,rec,nals_ori,call->nals_new);
1626     }
1627     if ( nals_ori!=call->nals_new )
1628         mcall_trim_and_update_numberR(call,rec,nals_ori,call->nals_new);
1629 
1630     // Set QUAL
1631     if ( nAC )
1632     {
1633         // Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set
1634         rec->qual = max_qual;
1635     }
1636     else
1637     {
1638         // Set the quality of a REF site
1639         if ( lk_sum!=-HUGE_VAL )  // no support from (high quality) reads, so QUAL=1-prior
1640             rec->qual = -4.343*(lk_sum - logsumexp2(lk_sum,ref_lk));
1641         else if ( call->ac[0] )
1642             rec->qual = call->theta ? -4.343*call->theta : 0;
1643         else
1644             bcf_float_set_missing(rec->qual);
1645     }
1646 
1647     // AC, AN
1648     if ( call->nals_new>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, call->nals_new-1);
1649     nAC += call->ac[0];
1650     bcf_update_info_int32(call->hdr, rec, "AN", &nAC, 1);
1651 
1652     // Remove unused alleles
1653     hts_expand(char*,call->nals_new,call->nals,call->als);
1654     for (i=0; i<nals_ori; i++)
1655         if ( call->als_map[i]>=0 ) call->als[call->als_map[i]] = rec->d.allele[i];
1656     bcf_update_alleles(call->hdr, rec, (const char**)call->als, call->nals_new);
1657     bcf_update_genotypes(call->hdr, rec, call->gts, nsmpl*2);
1658 
1659     // DP4 and PV4 tags
1660     if ( bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16)==16 )
1661     {
1662         int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3];
1663         bcf_update_info_int32(call->hdr, rec, "DP4", dp, 4);
1664 
1665         int32_t mq = (call->anno16[8]+call->anno16[10])/(call->anno16[0]+call->anno16[1]+call->anno16[2]+call->anno16[3]);
1666         bcf_update_info_int32(call->hdr, rec, "MQ", &mq, 1);
1667 
1668         if ( call->output_tags & CALL_FMT_PV4 )
1669         {
1670             anno16_t a;
1671             float tmpf[4];
1672             int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 1 : 0;
1673             if ( is_tested )
1674             {
1675                 for (i=0; i<4; i++) tmpf[i] = a.p[i];
1676                 bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4);
1677             }
1678         }
1679     }
1680 
1681     bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0);     // remove I16 tag
1682 
1683     return call->nals_new;
1684 }
1685 
1686