1 /* mcall.c -- multiallelic and rare variant calling.
2
3 Copyright (C) 2012-2021 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE. */
24
25 #include <assert.h>
26 #include <math.h>
27 #include <inttypes.h>
28 #include <ctype.h>
29 #include <htslib/kfunc.h>
30 #include <htslib/khash_str2int.h>
31 #include "call.h"
32 #include "prob1.h"
33
34 // Using priors for GTs does not seem to be mathematically justified. Although
35 // it seems effective in removing false calls, it also flips a significant
36 // proportion of HET genotypes. Better is to filter by FORMAT/GQ using
37 // `bcftools filter`.
38 #define USE_PRIOR_FOR_GTS 0
39
40 // Go with uniform PLs for samples with no coverage. If unset, missing
41 // genotypes is reported instead.
42 #define FLAT_PDG_FOR_MISSING 0
43
44 int test16(float *anno16, anno16_t *a);
45
qcall_init(call_t * call)46 void qcall_init(call_t *call) { return; }
qcall_destroy(call_t * call)47 void qcall_destroy(call_t *call) { return; }
qcall(call_t * call,bcf1_t * rec)48 int qcall(call_t *call, bcf1_t *rec)
49 {
50 // QCall format:
51 // chromosome, position, reference allele, depth, mapping quality, 0, ..
52 error("TODO: qcall output\n");
53 return 0;
54 }
55
call_init_pl2p(call_t * call)56 void call_init_pl2p(call_t *call)
57 {
58 int i;
59 for (i=0; i<256; i++)
60 call->pl2p[i] = pow(10., -i/10.);
61 }
62
63 // Macros for accessing call->trio and call->ntrio
64 #define FTYPE_222 0 // family type: all diploid
65 #define FTYPE_121 1 // chrX, the child is a boy
66 #define FTYPE_122 2 // chrX, a girl
67 #define FTYPE_101 3 // chrY, boy
68 #define FTYPE_100 4 // chrY, girl
69
70 #define GT_SKIP 0xf // empty genotype (chrY in females)
71
72 #define IS_POW2(x) (!((x) & ((x) - 1))) // zero is permitted
73 #define IS_HOM(x) IS_POW2(x)
74
75 // Pkij = P(k|i,j) tells how likely it is to be a het if the parents
76 // are homs etc. The consistency of i,j,k has been already checked.
77 // Parameters are alleles and ploidy of father, mother, kid
78 // Returns 2/Pkij.
calc_Pkij(int fals,int mals,int kals,int fpl,int mpl,int kpl)79 int calc_Pkij(int fals, int mals, int kals, int fpl, int mpl, int kpl)
80 {
81 int als = fals|mals|kals;
82 if ( IS_HOM(als) ) return 2; // all are the same: child must be a HOM, P=1
83
84 if ( fpl==1 )
85 {
86 if ( kpl==1 ) // chr X, the child is a boy, the copy is inherited from the mother
87 {
88 if ( IS_HOM(mals) ) return 2; // 0 11 -> P(1) = 1
89 return 4; // 0 01 -> P(0) = P(1) = 1/2
90 }
91 // chr X, the child is a girl
92 if ( IS_HOM(mals) ) return 2; // 0 11 -> P(01) = 1
93 return 4; // 0 01 -> P(00) = P(01) = 1/2
94 }
95
96 if ( IS_HOM(fals) && IS_HOM(mals) ) return 2; // 00 11 01, the child must be a HET, P=1
97 if ( !IS_HOM(fals) && !IS_HOM(mals) )
98 {
99 if ( IS_HOM(kals) ) return 8; // 01 01 00 or 01 01 11, P(k=HOM) = 1/4
100 return 4; // 01 01 01, P(k=HET) = 1/2
101 }
102 return 4; // 00 01, P(k=HET) = P(k=HOM) = 1/2
103 }
104
105 // Initialize ntrio and trio: ntrio lists the number of possible
106 // genotypes given combination of haploid/diploid genomes and the
107 // number of alleles. trio lists allowed genotype combinations:
108 // 4bit: 2/Pkij, 4: father, 4: mother, 4: child
109 // See also mcall_call_trio_genotypes()
110 //
mcall_init_trios(call_t * call)111 static void mcall_init_trios(call_t *call)
112 {
113 if ( call->prior_AN )
114 {
115 int id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AN);
116 if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AN);
117 if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AN);
118 id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AC);
119 if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AC);
120 if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AC);
121 }
122
123 // 23, 138, 478 possible diploid trio genotypes with 2, 3, 4 alleles
124 call->ntrio[FTYPE_222][2] = 15; call->ntrio[FTYPE_222][3] = 78; call->ntrio[FTYPE_222][4] = 250;
125 call->ntrio[FTYPE_121][2] = 8; call->ntrio[FTYPE_121][3] = 27; call->ntrio[FTYPE_121][4] = 64;
126 call->ntrio[FTYPE_122][2] = 8; call->ntrio[FTYPE_122][3] = 27; call->ntrio[FTYPE_122][4] = 64;
127 call->ntrio[FTYPE_101][2] = 2; call->ntrio[FTYPE_101][3] = 3; call->ntrio[FTYPE_101][4] = 4;
128 call->ntrio[FTYPE_100][2] = 2; call->ntrio[FTYPE_100][3] = 3; call->ntrio[FTYPE_100][4] = 4;
129
130 int nals, itype;
131 for (itype=0; itype<=4; itype++)
132 {
133 for (nals=2; nals<=4; nals++)
134 call->trio[itype][nals] = (uint16_t*) malloc(sizeof(uint16_t)*call->ntrio[itype][nals]);
135 }
136
137 // max 10 possible diploid genotypes
138 int gts[10];
139 for (nals=2; nals<=4; nals++)
140 {
141 int i,j,k, n = 0, ngts = 0;
142 for (i=0; i<nals; i++)
143 for (j=0; j<=i; j++)
144 gts[ngts++] = 1<<i | 1<<j;
145
146 // 222: all diploid
147 // i,j,k: father, mother, child
148 for (i=0; i<ngts; i++)
149 for (j=0; j<ngts; j++)
150 for (k=0; k<ngts; k++)
151 {
152 if ( ((gts[i]|gts[j])>s[k]) != gts[k] ) continue; // k not present in neither i nor j
153 if ( !(gts[i] & gts[k]) || !(gts[j] & gts[k]) ) continue; // one copy from father, one from mother
154 int Pkij = calc_Pkij(gts[i],gts[j],gts[k], 2,2,2);
155 call->trio[FTYPE_222][nals][n++] = Pkij<<12 | i<<8 | j<<4 | k; // father, mother, child
156 }
157 assert( n==call->ntrio[FTYPE_222][nals] );
158
159 // 121: chrX, boy
160 n = 0;
161 for (i=0; i<ngts; i++)
162 for (j=0; j<ngts; j++)
163 for (k=0; k<ngts; k++)
164 {
165 if ( !IS_HOM(gts[i]) || !IS_HOM(gts[k]) ) continue; // father nor boy can be diploid
166 if ( ((gts[i]|gts[j])>s[k]) != gts[k] ) continue;
167 if ( !(gts[j] & gts[k]) ) continue; // boy must inherit the copy from mother
168 int Pkij = calc_Pkij(gts[i],gts[j],gts[k], 1,2,1);
169 call->trio[FTYPE_121][nals][n++] = Pkij<<12 | i<<8 | j<<4 | k;
170 }
171 assert( n==call->ntrio[FTYPE_121][nals] );
172
173 // 122: chrX, girl
174 n = 0;
175 for (i=0; i<ngts; i++)
176 for (j=0; j<ngts; j++)
177 for (k=0; k<ngts; k++)
178 {
179 if ( !IS_HOM(gts[i]) ) continue;
180 if ( ((gts[i]|gts[j])>s[k]) != gts[k] ) continue;
181 if ( !(gts[i] & gts[k]) ) continue; // girl must inherit one copy from the father and one from the mother
182 if ( !(gts[j] & gts[k]) ) continue;
183 int Pkij = calc_Pkij(gts[i],gts[j],gts[k], 1,2,2);
184 call->trio[FTYPE_122][nals][n++] = Pkij<<12 | i<<8 | j<<4 | k;
185 }
186 assert( n==call->ntrio[FTYPE_122][nals] );
187
188 // 101: chrY, boy
189 n = 0;
190 for (i=0; i<ngts; i++)
191 for (k=0; k<ngts; k++)
192 {
193 if ( !IS_HOM(gts[i]) || !IS_HOM(gts[k]) ) continue;
194 if ( (gts[i]>s[k]) != gts[k] ) continue;
195 call->trio[FTYPE_101][nals][n++] = 1<<12 | i<<8 | GT_SKIP<<4 | k;
196 }
197 assert( n==call->ntrio[FTYPE_101][nals] );
198
199 // 100: chrY, girl
200 n = 0;
201 for (i=0; i<ngts; i++)
202 {
203 if ( !IS_POW2(gts[i]) ) continue;
204 call->trio[FTYPE_100][nals][n++] = 1<<12 | i<<8 | GT_SKIP<<4 | GT_SKIP;
205 }
206 assert( n==call->ntrio[FTYPE_100][nals] );
207
208 }
209 call->GLs = (double*) calloc(bcf_hdr_nsamples(call->hdr)*10,sizeof(double));
210
211 int i, j;
212 for (i=0; i<call->nfams; i++)
213 {
214 family_t *fam = &call->fams[i];
215 int ploidy[3];
216 for (j=0; j<3; j++)
217 ploidy[j] = call->ploidy[fam->sample[j]];
218
219 if ( ploidy[FATHER]==2 ) // not X, not Y
220 {
221 if ( ploidy[MOTHER]!=2 || ploidy[CHILD]!=2 )
222 error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
223 fam->type = FTYPE_222;
224 continue;
225 }
226 if ( ploidy[FATHER]!=1 || ploidy[MOTHER]==1 )
227 error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
228 if ( ploidy[MOTHER]==2 ) // X
229 {
230 if ( ploidy[CHILD]==0 )
231 error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
232 fam->type = ploidy[CHILD]==2 ? FTYPE_122 : FTYPE_121; // a girl or a boy
233 }
234 else // Y
235 {
236 if ( ploidy[CHILD]==2 )
237 error("Incorrect ploidy: %d %d %d\n", ploidy[FATHER],ploidy[MOTHER],ploidy[CHILD]);
238 fam->type = ploidy[CHILD]==0 ? FTYPE_100 : FTYPE_101; // a girl or a boy
239 }
240 }
241 }
mcall_destroy_trios(call_t * call)242 static void mcall_destroy_trios(call_t *call)
243 {
244 int i, j;
245 for (i=2; i<=4; i++)
246 for (j=0; j<=4; j++)
247 free(call->trio[j][i]);
248 }
249
init_sample_groups(call_t * call)250 static void init_sample_groups(call_t *call)
251 {
252 int i, nsmpl = bcf_hdr_nsamples(call->hdr);
253 if ( !call->sample_groups )
254 {
255 // standard pooled calling, all samples in the same group
256 call->nsmpl_grp = 1;
257 call->smpl_grp = (smpl_grp_t*)calloc(1,sizeof(*call->smpl_grp));
258 call->smpl_grp[0].nsmpl = nsmpl;
259 call->smpl_grp[0].smpl = (uint32_t*)calloc(call->smpl_grp[0].nsmpl,sizeof(uint32_t));
260 for (i=0; i<nsmpl; i++)
261 call->smpl_grp[0].smpl[i] = i;
262 return;
263 }
264
265 if ( call->sample_groups_tag )
266 {
267 // Is the tag defined in the header?
268 int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->sample_groups_tag);
269 if ( tag_id==-1 ) error("No such tag \"%s\"\n",call->sample_groups_tag);
270 if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) error("No such FORMAT tag \"%s\"\n", call->sample_groups_tag);
271 }
272 else
273 {
274 int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"QS");
275 if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "QS";
276 else
277 {
278 tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"AD");
279 if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "AD";
280 else error("Error: neither \"AD\" nor \"QS\" FORMAT tag exists and no alternative given with -G\n");
281 }
282 }
283
284 // Read samples/groups
285 if ( !strcmp("-",call->sample_groups) )
286 {
287 // single-sample calling, each sample creates its own group
288 call->nsmpl_grp = nsmpl;
289 call->smpl_grp = (smpl_grp_t*)calloc(nsmpl,sizeof(*call->smpl_grp));
290 for (i=0; i<nsmpl; i++)
291 {
292 call->smpl_grp[i].nsmpl = 1;
293 call->smpl_grp[i].smpl = (uint32_t*)calloc(call->smpl_grp[i].nsmpl,sizeof(uint32_t));
294 call->smpl_grp[i].smpl[0] = i;
295 }
296 }
297 else
298 {
299 int nlines;
300 char **lines = hts_readlist(call->sample_groups, 1, &nlines);
301 if ( !lines ) error("Could not read the file: %s\n", call->sample_groups);
302
303 uint32_t *smpl2grp = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
304 uint32_t *grp2n = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
305 void *grp2idx = khash_str2int_init();
306
307 call->nsmpl_grp = 0;
308 for (i=0; i<nlines; i++)
309 {
310 char *ptr = lines[i];
311 while ( *ptr && !isspace(*ptr) ) ptr++;
312 if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
313 char *tmp = ptr;
314 while ( *ptr && isspace(*ptr) ) ptr++;
315 if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
316 *tmp = 0;
317 int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]);
318 if ( ismpl<0 ) continue;
319 if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
320 if ( !khash_str2int_has_key(grp2idx,ptr+1) )
321 {
322 khash_str2int_set(grp2idx, ptr+1, call->nsmpl_grp);
323 call->nsmpl_grp++;
324 }
325 int igrp = -1;
326 if ( khash_str2int_get(grp2idx, ptr+1, &igrp)!=0 )
327 error("This should not happen, fixme: %s\n",ptr+1);
328 grp2n[igrp]++;
329 smpl2grp[ismpl] = igrp+1; // +1 to distinguish unlisted samples
330 }
331 khash_str2int_destroy(grp2idx);
332 if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", call->sample_groups);
333
334 call->smpl_grp = (smpl_grp_t*)calloc(call->nsmpl_grp,sizeof(*call->smpl_grp));
335 for (i=0; i<nsmpl; i++)
336 {
337 if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
338 int igrp = smpl2grp[i] - 1;
339 if ( !call->smpl_grp[igrp].nsmpl )
340 call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t));
341 call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i;
342 call->smpl_grp[igrp].nsmpl++;
343 }
344 free(smpl2grp);
345 free(grp2n);
346 for (i=0; i<nlines; i++) free(lines[i]);
347 free(lines);
348 }
349 }
destroy_sample_groups(call_t * call)350 static void destroy_sample_groups(call_t *call)
351 {
352 int i;
353 for (i=0; i<call->nsmpl_grp; i++)
354 {
355 free(call->smpl_grp[i].qsum);
356 free(call->smpl_grp[i].smpl);
357 }
358 free(call->smpl_grp);
359 }
360
mcall_init(call_t * call)361 void mcall_init(call_t *call)
362 {
363 init_sample_groups(call);
364 call_init_pl2p(call);
365
366 call->nals_map = 5;
367 call->als_map = (int*) malloc(sizeof(int)*call->nals_map);
368 call->npl_map = 5*(5+1)/2; // will be expanded later if necessary
369 call->pl_map = (int*) malloc(sizeof(int)*call->npl_map);
370 call->gts = (int32_t*) calloc(bcf_hdr_nsamples(call->hdr)*2,sizeof(int32_t)); // assuming at most diploid everywhere
371
372 if ( call->flag & CALL_CONSTR_TRIO )
373 {
374 call->cgts = (int32_t*) calloc(bcf_hdr_nsamples(call->hdr),sizeof(int32_t));
375 call->ugts = (int32_t*) calloc(bcf_hdr_nsamples(call->hdr),sizeof(int32_t));
376 mcall_init_trios(call);
377 bcf_hdr_append(call->hdr,"##FORMAT=<ID=CGT,Number=1,Type=Integer,Description=\"Constrained Genotype (0-based index to Number=G ordering).\">");
378 bcf_hdr_append(call->hdr,"##FORMAT=<ID=UGT,Number=1,Type=Integer,Description=\"Unconstrained Genotype (0-based index to Number=G ordering).\">");
379 }
380 if ( call->flag & CALL_CONSTR_ALLELES ) call->vcmp = vcmp_init();
381
382 bcf_hdr_append(call->hdr,"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
383 if ( call->output_tags & CALL_FMT_GQ )
384 bcf_hdr_append(call->hdr,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Phred-scaled Genotype Quality\">");
385 if ( call->output_tags & CALL_FMT_GP )
386 bcf_hdr_append(call->hdr,"##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype posterior probabilities in the range 0 to 1\">");
387 if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
388 call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr));
389 bcf_hdr_append(call->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">");
390 bcf_hdr_append(call->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
391 bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">");
392 bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Average mapping quality\">");
393 if ( call->output_tags & CALL_FMT_PV4 )
394 bcf_hdr_append(call->hdr,"##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n");
395
396 // init the prior
397 if ( call->theta>0 )
398 {
399 int i, n = 0;
400 if ( !call->ploidy ) n = 2*bcf_hdr_nsamples(call->hdr); // all are diploid
401 else
402 {
403 for (i=0; i<bcf_hdr_nsamples(call->hdr); i++)
404 n += call->ploidy[i];
405 }
406 // Watterson factor, here aM_1 = aM_2 = 1
407 double aM = 1;
408 for (i=2; i<n; i++) aM += 1./i;
409 call->theta *= aM;
410 if ( call->theta >= 1 )
411 {
412 fprintf(stderr,"The prior is too big (theta*aM=%.2f), going with 0.99\n", call->theta);
413 call->theta = 0.99;
414 }
415 call->theta = log(call->theta);
416 }
417 }
418
mcall_destroy(call_t * call)419 void mcall_destroy(call_t *call)
420 {
421 destroy_sample_groups(call);
422 if (call->vcmp) vcmp_destroy(call->vcmp);
423 free(call->itmp);
424 mcall_destroy_trios(call);
425 free(call->GPs);
426 free(call->ADs);
427 free(call->GLs);
428 free(call->GQs);
429 free(call->anno16);
430 free(call->PLs);
431 free(call->als_map);
432 free(call->pl_map);
433 free(call->gts); free(call->cgts); free(call->ugts);
434 free(call->pdg);
435 free(call->als);
436 free(call->ac);
437 return;
438 }
439
440
441 // Inits P(D|G): convert PLs from log space and normalize. In case of zero
442 // depth, missing PLs are all zero. In this case, pdg's are set to 0
443 // so that the corresponding genotypes can be set as missing and the
444 // qual calculation is not affected.
445 // Missing values are replaced by generic likelihoods when X (unseen allele) is
446 // present.
447 // NB: While the -m callig model uses the pdgs in canonical order,
448 // the original samtools -c calling code uses pdgs in reverse order (AA comes
449 // first, RR last).
450 // NB: Ploidy is not taken into account here, which is incorrect.
set_pdg(double * pl2p,int * PLs,double * pdg,int n_smpl,int n_gt,int unseen)451 void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unseen)
452 {
453 int i, j, nals;
454
455 // find out the number of alleles, expecting diploid genotype likelihoods
456 bcf_gt2alleles(n_gt-1, &i, &nals);
457 assert( i==nals );
458 nals++;
459
460 for (i=0; i<n_smpl; i++)
461 {
462 double sum = 0;
463 for (j=0; j<n_gt; j++)
464 {
465 if ( PLs[j]==bcf_int32_vector_end )
466 {
467 // We expect diploid genotype likelihoods. If not diploid, treat as missing
468 j = 0;
469 break;
470 }
471 if ( PLs[j]==bcf_int32_missing ) break;
472 pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.);
473 sum += pdg[j];
474 }
475
476 if ( j==0 )
477 {
478 // First value is missing (LK of RR), this indicates that
479 // all values are missing.
480 j = sum = n_gt;
481 }
482 else if ( j<n_gt && unseen<0 )
483 {
484 // Some of the values are missing and the unseen allele LK is not
485 // available. In such a case, we set LK to a very small value.
486 sum = 0;
487 for (j=0; j<n_gt; j++)
488 {
489 assert( PLs[j]!=bcf_int32_vector_end );
490 if ( PLs[j]==bcf_int32_missing ) PLs[j] = 255;
491 pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.);
492 sum += pdg[j];
493 }
494 }
495 if ( j<n_gt )
496 {
497 // Missing values present, fill with unseen allele LK. This can be only
498 // as good as the merge was.
499 int ia,ib, k;
500 j = 0;
501 sum = 0;
502 for (ia=0; ia<nals; ia++)
503 {
504 for (ib=0; ib<=ia; ib++)
505 {
506 if ( PLs[j]==bcf_int32_missing )
507 {
508 k = bcf_alleles2gt(ia,unseen);
509 if ( PLs[k]==bcf_int32_missing ) k = bcf_alleles2gt(ib,unseen);
510 if ( PLs[k]==bcf_int32_missing ) k = bcf_alleles2gt(unseen,unseen);
511 if ( PLs[k]==bcf_int32_missing )
512 {
513 // The PLs for unseen allele X are not present as well as for ia, ib.
514 // This can happen with incremental calling, when one of the merged
515 // files had all alleles A,C,G,T, in such a case, X was not present.
516 // Use a very small value instead.
517 PLs[j] = 255;
518 }
519 else
520 PLs[j] = PLs[k];
521 }
522 pdg[j] = pl2p[ PLs[j] ];
523 sum += pdg[j];
524 j++;
525 }
526 }
527 }
528 // Normalize: sum_i pdg_i = 1
529 if ( sum==n_gt )
530 {
531 // all missing
532 #if FLAT_PDG_FOR_MISSING
533 for (j=0; j<n_gt; j++) pdg[j] = 1./n_gt;
534 #else
535 for (j=0; j<n_gt; j++) pdg[j] = 0;
536 #endif
537 }
538 else
539 for (j=0; j<n_gt; j++) pdg[j] /= sum;
540
541 PLs += n_gt;
542 pdg += n_gt;
543 }
544 }
545
546 // Create mapping between old and new (trimmed) alleles
init_allele_trimming_maps(call_t * call,int nals_ori,int als_out)547 void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out)
548 {
549 int i, j, nout = 0;
550
551 // als_map: old(i) -> new(j)
552 for (i=0; i<nals_ori; i++)
553 {
554 if ( als_out & (1<<i) ) call->als_map[i] = nout++;
555 else call->als_map[i] = -1;
556 }
557
558 if ( !call->pl_map ) return;
559
560 // pl_map: new(k) -> old(l)
561 int k = 0, l = 0;
562 for (i=0; i<nals_ori; i++)
563 {
564 for (j=0; j<=i; j++)
565 {
566 if ( (als_out & (1<<i)) && (als_out & (1<<j)) ) call->pl_map[k++] = l;
567 l++;
568 }
569 }
570 }
571
572 /** log(exp(a)+exp(b)) */
logsumexp2(double a,double b)573 static inline double logsumexp2(double a, double b)
574 {
575 if ( a>b )
576 return log(1 + exp(b-a)) + a;
577 else
578 return log(1 + exp(a-b)) + b;
579 }
580
581 // Macro to set the most likely alleles
582 #define UPDATE_MAX_LKs(als,sum) { \
583 if ( max_lk<lk_tot && lk_tot_set ) { max_lk = lk_tot; max_als = (als); } \
584 if ( sum ) lk_sum = logsumexp2(lk_tot,lk_sum); \
585 }
586
587 #define SWAP(type_t,x,y) {type_t tmp; tmp = x; x = y; y = tmp; }
588
589 // Determine the most likely combination of alleles. In this implementation,
590 // at most tri-allelic sites are considered. Returns the number of alleles.
mcall_find_best_alleles(call_t * call,int nals,smpl_grp_t * grp)591 static int mcall_find_best_alleles(call_t *call, int nals, smpl_grp_t *grp)
592 {
593 int ia,ib,ic; // iterators over up to three alleles
594 int max_als=0; // most likely combination of alleles
595 double ref_lk = -HUGE_VAL, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles
596 double lk_sum = -HUGE_VAL; // for normalizing the likelihoods
597 int nsmpl = grp->nsmpl;
598 int ngts = nals*(nals+1)/2;
599
600 // Single allele
601 for (ia=0; ia<nals; ia++)
602 {
603 double lk_tot = 0;
604 int lk_tot_set = 0;
605 int iaa = (ia+1)*(ia+2)/2-1; // index in PL which corresponds to the homozygous "ia/ia" genotype
606 int ismpl;
607 for (ismpl=0; ismpl<nsmpl; ismpl++)
608 {
609 double *pdg = call->pdg + grp->smpl[ismpl]*ngts + iaa;
610 if ( *pdg ) { lk_tot += log(*pdg); lk_tot_set = 1; }
611 }
612 if ( ia==0 ) ref_lk = lk_tot; // likelihood of 0/0 for all samples
613 else lk_tot += call->theta; // the prior
614 UPDATE_MAX_LKs(1<<ia, ia>0 && lk_tot_set);
615 }
616
617 // Two alleles
618 if ( nals>1 )
619 {
620 for (ia=0; ia<nals; ia++)
621 {
622 if ( grp->qsum[ia]==0 ) continue;
623 int iaa = (ia+1)*(ia+2)/2-1;
624 for (ib=0; ib<ia; ib++)
625 {
626 if ( grp->qsum[ib]==0 ) continue;
627 double lk_tot = 0;
628 int lk_tot_set = 0;
629 double fa = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib]);
630 double fb = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib]);
631 double fa2 = fa*fa;
632 double fb2 = fb*fb;
633 double fab = 2*fa*fb;
634 int is, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
635 for (is=0; is<nsmpl; is++)
636 {
637 int ismpl = grp->smpl[is];
638 double *pdg = call->pdg + ismpl*ngts;
639 double val = 0;
640 if ( !call->ploidy || call->ploidy[ismpl]==2 )
641 val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab];
642 else if ( call->ploidy && call->ploidy[ismpl]==1 )
643 val = fa*pdg[iaa] + fb*pdg[ibb];
644 if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
645 }
646 if ( ia!=0 ) lk_tot += call->theta; // the prior
647 if ( ib!=0 ) lk_tot += call->theta;
648 UPDATE_MAX_LKs(1<<ia|1<<ib, lk_tot_set);
649 }
650 }
651 }
652
653 // Three alleles
654 if ( nals>2 )
655 {
656 for (ia=0; ia<nals; ia++)
657 {
658 if ( grp->qsum[ia]==0 ) continue;
659 int iaa = (ia+1)*(ia+2)/2-1;
660 for (ib=0; ib<ia; ib++)
661 {
662 if ( grp->qsum[ib]==0 ) continue;
663 int ibb = (ib+1)*(ib+2)/2-1;
664 int iab = iaa - ia + ib;
665 for (ic=0; ic<ib; ic++)
666 {
667 if ( grp->qsum[ic]==0 ) continue;
668 double lk_tot = 0;
669 int lk_tot_set = 0;
670
671 double fa = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
672 double fb = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
673 double fc = grp->qsum[ic]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
674 double fa2 = fa*fa;
675 double fb2 = fb*fb;
676 double fc2 = fc*fc;
677 double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc;
678 int is, icc = (ic+1)*(ic+2)/2-1;
679 int iac = iaa - ia + ic, ibc = ibb - ib + ic;
680 for (is=0; is<nsmpl; is++)
681 {
682 int ismpl = grp->smpl[is];
683 double *pdg = call->pdg + ismpl*ngts;
684 double val = 0;
685 if ( !call->ploidy || call->ploidy[ismpl]==2 )
686 val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
687 else if ( call->ploidy && call->ploidy[ismpl]==1 )
688 val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc];
689 if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
690 }
691 if ( ia!=0 ) lk_tot += call->theta; // the prior
692 if ( ib!=0 ) lk_tot += call->theta; // the prior
693 if ( ic!=0 ) lk_tot += call->theta; // the prior
694 UPDATE_MAX_LKs(1<<ia|1<<ib|1<<ic, lk_tot_set);
695 }
696 }
697 }
698 }
699
700 int i, n = 0;
701 for (i=0; i<nals; i++) if ( max_als & 1<<i) n++;
702
703 grp->max_lk = max_lk;
704 grp->ref_lk = ref_lk;
705 grp->lk_sum = lk_sum;
706 grp->als = max_als;
707 grp->nals = n;
708
709 return n;
710 }
711
712 // Sets GT=0/0 or GT=. if PL=0,0,0
mcall_set_ref_genotypes(call_t * call,int nals_ori)713 static void mcall_set_ref_genotypes(call_t *call, int nals_ori)
714 {
715 int i;
716 int ngts = nals_ori*(nals_ori+1)/2; // need this to distinguish between GT=0/0 vs GT=.
717 int nsmpl = bcf_hdr_nsamples(call->hdr);
718
719 for (i=0; i<nals_ori; i++) call->ac[i] = 0; // nals_new<=nals_ori, never mind setting extra 0's
720
721 // Set all genotypes to 0/0 or 0
722 int *gts = call->gts;
723 double *pdg = call->pdg;
724 int isample;
725 for (isample = 0; isample < nsmpl; isample++)
726 {
727 int ploidy = call->ploidy ? call->ploidy[isample] : 2;
728 for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
729 if ( i==ngts || !ploidy )
730 {
731 gts[0] = bcf_gt_missing;
732 gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
733 }
734 else
735 {
736 gts[0] = bcf_gt_unphased(0);
737 gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end;
738 call->ac[0] += ploidy;
739 }
740 gts += 2;
741 pdg += ngts;
742 }
743 }
744
mcall_call_genotypes(call_t * call,int nals_ori,smpl_grp_t * grp)745 static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp)
746 {
747 int ia, ib, i;
748 int ngts_ori = nals_ori*(nals_ori+1)/2;
749 int ngts_new = call->nals_new*(call->nals_new+1)/2;
750 int nsmpl = grp->nsmpl;
751
752 #if USE_PRIOR_FOR_GTS
753 float prior = exp(call->theta);
754 #endif
755
756 int is;
757 for (is = 0; is < nsmpl; is++)
758 {
759 int ismpl = grp->smpl[is];
760 double *pdg = call->pdg + ismpl*ngts_ori;
761 float *gps = call->GPs + ismpl*ngts_new;
762 int *gts = call->gts + ismpl*2;
763
764 int ploidy = call->ploidy ? call->ploidy[ismpl] : 2;
765 assert( ploidy>=0 && ploidy<=2 );
766
767 if ( !ploidy )
768 {
769 gts[0] = bcf_gt_missing;
770 gts[1] = bcf_int32_vector_end;
771 gps[0] = -1;
772 continue;
773 }
774
775 #if !FLAT_PDG_FOR_MISSING
776 // Skip samples with zero depth, they have all pdg's equal to 0
777 for (i=0; i<ngts_ori; i++) if ( pdg[i]!=0.0 ) break;
778 if ( i==ngts_ori )
779 {
780 gts[0] = bcf_gt_missing;
781 gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
782 gps[0] = -1;
783 continue;
784 }
785 #endif
786
787 // Default fallback for the case all LKs are the same
788 gts[0] = bcf_gt_unphased(0);
789 gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end;
790
791 // Non-zero depth, determine the most likely genotype
792 double best_lk = 0;
793 for (ia=0; ia<nals_ori; ia++)
794 {
795 if ( !(grp->als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
796 int iaa = (ia+1)*(ia+2)/2-1; // PL index of the ia/ia genotype
797 double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia];
798 #if USE_PRIOR_FOR_GTS
799 if ( ia!=0 ) lk *= prior;
800 #endif
801 int igt = ploidy==2 ? bcf_alleles2gt(call->als_map[ia],call->als_map[ia]) : call->als_map[ia];
802 gps[igt] = lk;
803 if ( best_lk < lk )
804 {
805 best_lk = lk;
806 gts[0] = bcf_gt_unphased(call->als_map[ia]);
807 }
808 }
809 if ( ploidy==2 )
810 {
811 gts[1] = gts[0];
812 for (ia=0; ia<nals_ori; ia++)
813 {
814 if ( !(grp->als & 1<<ia) ) continue;
815 int iaa = (ia+1)*(ia+2)/2-1;
816 for (ib=0; ib<ia; ib++)
817 {
818 if ( !(grp->als & 1<<ib) ) continue;
819 int iab = iaa - ia + ib;
820 double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib];
821 #if USE_PRIOR_FOR_GTS
822 if ( ia!=0 ) lk *= prior;
823 if ( ib!=0 ) lk *= prior;
824 #endif
825 int igt = bcf_alleles2gt(call->als_map[ia],call->als_map[ib]);
826 gps[igt] = lk;
827 if ( best_lk < lk )
828 {
829 best_lk = lk;
830 gts[0] = bcf_gt_unphased(call->als_map[ib]);
831 gts[1] = bcf_gt_unphased(call->als_map[ia]);
832 }
833 }
834 }
835 }
836 else
837 gts[1] = bcf_int32_vector_end;
838
839 call->ac[ bcf_gt_allele(gts[0]) ]++;
840 if ( gts[1]!=bcf_int32_vector_end ) call->ac[ bcf_gt_allele(gts[1]) ]++;
841 }
842 if ( !(call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP)) ) return;
843 double max, sum;
844 for (is=0; is<nsmpl; is++)
845 {
846 int ismpl = grp->smpl[is];
847 float *gps = call->GPs + ismpl*ngts_new;
848
849 int nmax;
850 if ( call->ploidy )
851 {
852 if ( call->ploidy[ismpl]==2 ) nmax = ngts_new;
853 else if ( call->ploidy[ismpl]==1 ) nmax = grp->nals;
854 else nmax = 0;
855 }
856 else nmax = ngts_new;
857
858 max = gps[0];
859 if ( max<0 || nmax==0 )
860 {
861 // no call
862 if ( call->output_tags & CALL_FMT_GP )
863 {
864 for (i=0; i<nmax; i++) gps[i] = 0;
865 if ( nmax==0 ) { bcf_float_set_missing(gps[i]); nmax++; }
866 if ( nmax < ngts_new ) bcf_float_set_vector_end(gps[nmax]);
867 }
868 call->GQs[ismpl] = 0;
869 continue;
870 }
871 sum = gps[0];
872 for (i=1; i<nmax; i++)
873 {
874 if ( max < gps[i] ) max = gps[i];
875 sum += gps[i];
876 }
877 max = -4.34294*log(1 - max/sum);
878 call->GQs[ismpl] = max<=INT8_MAX ? max : INT8_MAX;
879 if ( call->output_tags & CALL_FMT_GP )
880 {
881 assert( max );
882 for (i=0; i<nmax; i++) gps[i] = gps[i]/sum;
883 for (; i<ngts_new; i++) bcf_float_set_vector_end(gps[i]);
884 }
885 }
886 }
887
888
889 /**
890 Pm = P(mendelian) .. parameter to vary, 1-Pm is the probability of novel mutation.
891 When trio_Pm_ins is negative, Pm is calculated dynamically
892 according to indel length. For simplicity, only the
893 first ALT is considered.
894 Pkij = P(k|i,j) .. probability that the genotype combination i,j,k is consistent
895 with mendelian inheritance (the likelihood that offspring
896 of two HETs is a HOM is smaller than it being a HET)
897
898 P_uc(F=i,M=j,K=k) = P(F=i) . P(M=j) . P(K=k) .. unconstrained P
899 P_c(F=i,M=j,K=k) = P_uc . Pkij .. constrained P
900 P(F=i,M=j,K=k) = P_uc . (1 - Pm) + P_c . Pm
901 = P_uc . [1 - Pm + Pkij . Pm]
902
903 We choose genotype combination i,j,k which maximizes P(F=i,M=j,K=k). This
904 probability gives the quality GQ(Trio).
905 Individual qualities are calculated as
906 GQ(F=i,M=j,K=k) = P(F=i,M=j,K=k) / \sum_{x,y} P(F=i,M=x,K=y)
907 */
908 #if 0
909 static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nals_new, int als_new)
910 {
911 int ia, ib, i;
912 int nsmpl = bcf_hdr_nsamples(call->hdr);
913 int ngts = nals*(nals+1)/2;
914 int nout_gts = nals_new*(nals_new+1)/2;
915 double *gls = call->GLs - nout_gts;
916 double *pdg = call->pdg - ngts;
917
918 // Calculate individuals' genotype likelihoods P(X=i)
919 int isample;
920 for (isample = 0; isample < nsmpl; isample++)
921 {
922 int ploidy = call->ploidy ? call->ploidy[isample] : 2;
923 int32_t *gts = call->ugts + isample;
924
925 gls += nout_gts;
926 pdg += ngts;
927
928 // Skip samples with all pdg's equal to 1. These have zero depth.
929 for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
930 if ( i==ngts || !ploidy )
931 {
932 gts[0] = -1;
933 gls[0] = 1;
934 continue;
935 }
936
937 for (i=0; i<nout_gts; i++) gls[i] = -HUGE_VAL;
938
939 grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]];
940 double sum_lk = 0;
941 double best_lk = 0;
942 for (ia=0; ia<nals; ia++)
943 {
944 if ( !(als_new & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
945 int iaa = bcf_alleles2gt(ia,ia); // PL index of the ia/ia genotype
946 int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
947 double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia];
948 sum_lk += lk;
949 gls[idx] = lk;
950 if ( best_lk < lk )
951 {
952 best_lk = lk;
953 gts[0] = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
954 }
955 }
956 if ( ploidy==2 )
957 {
958 for (ia=0; ia<nals; ia++)
959 {
960 if ( !(als_new & 1<<ia) ) continue;
961 for (ib=0; ib<ia; ib++)
962 {
963 if ( !(als_new & 1<<ib) ) continue;
964 int iab = bcf_alleles2gt(ia,ib);
965 int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ib]);
966 double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib];
967 sum_lk += lk;
968 gls[idx] = lk;
969 if ( best_lk < lk )
970 {
971 best_lk = lk;
972 gts[0] = bcf_alleles2gt(call->als_map[ib],call->als_map[ia]);
973 }
974 }
975 }
976 }
977 for (i=0; i<nout_gts; i++)
978 if ( gls[i]!=-HUGE_VAL ) gls[i] = log(gls[i]/sum_lk);
979 }
980
981 // Set novel mutation rate for this site: using first ALT allele for simplicity.
982 double trio_Pm;
983 if ( call->trio_Pm_ins<0 && call->trio_Pm_del<0 ) trio_Pm = call->trio_Pm_SNPs; // the same Pm for indels and SNPs requested
984 else
985 {
986 int ret = bcf_get_variant_types(rec);
987 if ( !(ret & VCF_INDEL) ) trio_Pm = call->trio_Pm_SNPs;
988 else
989 {
990 if ( call->trio_Pm_ins<0 ) // dynamic calculation, trio_Pm_del holds the scaling factor
991 {
992 trio_Pm = rec->d.var[1].n<0 ? -21.9313 - 0.2856*rec->d.var[1].n : -22.8689 + 0.2994*rec->d.var[1].n;
993 trio_Pm = 1 - call->trio_Pm_del * exp(trio_Pm);
994 }
995 else // snps and indels set explicitly
996 {
997 trio_Pm = rec->d.var[1].n<0 ? call->trio_Pm_del : call->trio_Pm_ins;
998 }
999 }
1000 }
1001
1002 // Calculate constrained likelihoods and determine genotypes
1003 int ifm;
1004 for (ifm=0; ifm<call->nfams; ifm++)
1005 {
1006 family_t *fam = &call->fams[ifm];
1007 int ntrio = call->ntrio[fam->type][nals_new];
1008 uint16_t *trio = call->trio[fam->type][nals_new];
1009
1010 // Unconstrained likelihood
1011 int uc_itr = 0;
1012 double uc_lk = 0;
1013 for (i=0; i<3; i++) // for father, mother, child
1014 {
1015 int ismpl = fam->sample[i];
1016 double *gl = call->GLs + nout_gts*ismpl;
1017 if ( gl[0]==1 ) continue;
1018 int j, jmax = 0;
1019 double max = gl[0];
1020 for (j=1; j<nout_gts; j++)
1021 if ( max < gl[j] ) { max = gl[j]; jmax = j; }
1022 uc_lk += max;
1023 uc_itr |= jmax << ((2-i)*4);
1024 }
1025
1026 // Best constrained likelihood
1027 int c_itr = -1, itr, uc_is_mendelian = 0;
1028 double c_lk = -HUGE_VAL;
1029 for (itr=0; itr<ntrio; itr++) // for each trio genotype combination
1030 {
1031 double lk = 0;
1032 int npresent = 0;
1033 for (i=0; i<3; i++) // for father, mother, child
1034 {
1035 int ismpl = fam->sample[i];
1036 double *gl = call->GLs + nout_gts*ismpl;
1037 if ( gl[0]==1 ) continue;
1038 int igt = trio[itr]>>((2-i)*4) & 0xf;
1039 assert( !call->ploidy || call->ploidy[ismpl]>0 );
1040 if ( igt==GT_SKIP ) continue;
1041 lk += gl[igt];
1042 npresent++;
1043 // fprintf(stderr," %e", gl[igt]);
1044 }
1045 // fprintf(stderr,"\t\t");
1046 double Pkij = npresent==3 ? (double)2/(trio[itr]>>12) : 1; // with missing genotypes Pkij's are different
1047 lk += log(1 - trio_Pm * (1 - Pkij));
1048 // fprintf(stderr,"%d%d%d\t%e\t%.2f\n", trio[itr]>>8&0xf,trio[itr]>>4&0xf,trio[itr]&0xf, lk, Pkij);
1049 if ( c_lk < lk ) { c_lk = lk; c_itr = trio[itr]; }
1050 if ( uc_itr==trio[itr] ) uc_is_mendelian = 1;
1051 }
1052
1053 if ( !uc_is_mendelian )
1054 {
1055 uc_lk += log(1 - trio_Pm);
1056 // fprintf(stderr,"c_lk=%e uc_lk=%e c_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,uc_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf);
1057 if ( c_lk < uc_lk ) { c_lk = uc_lk; c_itr = uc_itr; }
1058 }
1059 // fprintf(stderr,"best_lk=%e best_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf);
1060
1061 // Set genotypes for father, mother, child and calculate genotype qualities
1062 for (i=0; i<3; i++)
1063 {
1064 // GT
1065 int ismpl = fam->sample[i];
1066 int igt = c_itr>>((2-i)*4) & 0xf;
1067 double *gl = call->GLs + nout_gts*ismpl;
1068 int32_t *gts = call->cgts + ismpl;
1069 if ( gl[0]==1 || igt==GT_SKIP ) // zero depth, set missing genotypes
1070 {
1071 gts[0] = -1;
1072 // bcf_float_set_missing(call->GQs[ismpl]);
1073 continue;
1074 }
1075 gts[0] = igt;
1076
1077 #if 0
1078 // todo: Genotype Qualities
1079 //
1080 // GQ: for each family member i sum over all genotypes j,k keeping igt fixed
1081 double lk_sum = 0;
1082 for (itr=0; itr<ntrio; itr++)
1083 {
1084 if ( igt != (trio[itr]>>((2-i)*4) & 0xf) ) continue;
1085 double lk = 0;
1086 int j;
1087 for (j=0; j<3; j++)
1088 {
1089 int jsmpl = fam->sample[j];
1090 double *gl = call->GLs + ngts*jsmpl;
1091 if ( gl[0]==1 ) continue;
1092 int jgt = trio[itr]>>((2-j)*4) & 0xf;
1093 if ( jgt==GT_SKIP ) continue;
1094 lk += gl[jgt];
1095 }
1096 double Pkij = (double)2/(trio[itr]>>12);
1097 lk += log(1 - trio_Pm * (1 - Pkij));
1098 lk_sum = logsumexp2(lk_sum, lk);
1099 }
1100 if ( !uc_is_mendelian && (best_itr>>((2-i)*4)&0xf)==(uc_itr>>((2-i)*4)&0xf) ) lk_sum = logsumexp2(lk_sum,uc_lk);
1101 call->GQs[ismpl] = -4.3429*(best_lk - lk_sum);
1102 #endif
1103 }
1104 }
1105
1106 for (i=0; i<4; i++) call->ac[i] = 0;
1107 call->nhets = 0;
1108 call->ndiploid = 0;
1109
1110 // Test if CGT,UGT are needed
1111 int ucgts_needed = 0;
1112 int32_t *cgts = call->cgts - 1;
1113 int32_t *ugts = call->ugts - 1;
1114 int32_t *gts = call->gts - 2;
1115 for (isample = 0; isample < nsmpl; isample++)
1116 {
1117 int ploidy = call->ploidy ? call->ploidy[isample] : 2;
1118 cgts++;
1119 ugts++;
1120 gts += 2;
1121 if ( ugts[0]==-1 )
1122 {
1123 gts[0] = bcf_gt_missing;
1124 gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
1125 continue;
1126 }
1127 int a,b;
1128 if ( cgts[0]!=ugts[0] )
1129 {
1130 bcf_gt2alleles(cgts[0], &a, &b);
1131 gts[0] = bcf_gt_unphased(a);
1132 gts[1] = ploidy==1 ? bcf_int32_vector_end : bcf_gt_unphased(b);
1133 }
1134 else
1135 {
1136 bcf_gt2alleles(ugts[0], &a, &b);
1137 gts[0] = bcf_gt_unphased(a);
1138 gts[1] = ploidy==1 ? bcf_int32_vector_end : bcf_gt_unphased(b);
1139 }
1140 if ( cgts[0]!=ugts[0] ) ucgts_needed = 1;
1141 call->ac[a]++;
1142 if ( ploidy==2 )
1143 {
1144 call->ac[b]++;
1145 call->ndiploid++;
1146 if ( a!=b ) call->nhets++;
1147 }
1148 }
1149 if ( ucgts_needed )
1150 {
1151 // Some GTs are different
1152 bcf_update_format_int32(call->hdr,rec,"UGT",call->ugts,nsmpl);
1153 bcf_update_format_int32(call->hdr,rec,"CGT",call->cgts,nsmpl);
1154 }
1155 }
1156 #endif
1157
mcall_trim_and_update_PLs(call_t * call,bcf1_t * rec,int nals_ori,int nals_new)1158 static void mcall_trim_and_update_PLs(call_t *call, bcf1_t *rec, int nals_ori, int nals_new)
1159 {
1160 int npls_src = nals_ori*(nals_ori+1)/2;
1161 int npls_dst = nals_new*(nals_new+1)/2; // number of PL values in diploid samples, ori and new
1162 if ( call->all_diploid && npls_src == npls_dst ) return;
1163
1164 int *pls_src = call->PLs, *pls_dst = call->PLs;
1165
1166 int nsmpl = bcf_hdr_nsamples(call->hdr);
1167 int isample, ia;
1168 for (isample = 0; isample < nsmpl; isample++)
1169 {
1170 int ploidy = call->ploidy ? call->ploidy[isample] : 2;
1171 if ( ploidy==2 )
1172 {
1173 for (ia=0; ia<npls_dst; ia++)
1174 pls_dst[ia] = pls_src[ call->pl_map[ia] ];
1175 }
1176 else if ( ploidy==1 )
1177 {
1178 for (ia=0; ia<nals_new; ia++)
1179 {
1180 int isrc = (ia+1)*(ia+2)/2-1;
1181 pls_dst[ia] = pls_src[ call->pl_map[isrc] ];
1182 }
1183 if ( ia<npls_dst ) pls_dst[ia] = bcf_int32_vector_end;
1184 }
1185 else
1186 {
1187 pls_dst[0] = bcf_int32_missing;
1188 pls_dst[1] = bcf_int32_vector_end; // relying on nals_new>1 in mcall()
1189 }
1190 pls_src += npls_src;
1191 pls_dst += npls_dst;
1192 }
1193 bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*nsmpl);
1194 }
1195
mcall_trim_and_update_numberR(call_t * call,bcf1_t * rec,int nals_ori,int nals_new)1196 void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new)
1197 {
1198 if ( nals_ori==nals_new ) return;
1199
1200 int i,j, nret, size = sizeof(float);
1201
1202 void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point
1203 int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs;
1204
1205 // INFO fields
1206 for (i=0; i<rec->n_info; i++)
1207 {
1208 bcf_info_t *info = &rec->d.info[i];
1209 int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_INFO,info->key);
1210 if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
1211
1212 int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key);
1213 const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key);
1214 nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
1215 if ( nret<=0 ) continue;
1216
1217 if ( nals_new==1 )
1218 bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1); // has to be the REF, the order could not change
1219 else
1220 {
1221 for (j=0; j<nals_ori; j++)
1222 {
1223 int k = call->als_map[j];
1224 if ( k==-1 ) continue; // to be dropped
1225 memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size);
1226 }
1227 bcf_update_info_int32(call->hdr, rec, key, tmp_new, nals_new);
1228 }
1229 }
1230
1231 // FORMAT fields
1232 for (i=0; i<rec->n_fmt; i++)
1233 {
1234 bcf_fmt_t *fmt = &rec->d.fmt[i];
1235 int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id);
1236 if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
1237
1238 int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id);
1239 const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id);
1240 nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
1241 if (nret<=0) continue;
1242 int nsmpl = bcf_hdr_nsamples(call->hdr);
1243
1244 assert( nret==nals_ori*nsmpl );
1245
1246 for (j=0; j<nsmpl; j++)
1247 {
1248 char *ptr_src = (char *)tmp_ori + j*nals_ori*size;
1249 char *ptr_dst = (char *)tmp_new + j*nals_new*size;
1250 int k;
1251 for (k=0; k<nals_ori; k++)
1252 {
1253 int l = call->als_map[k];
1254 if ( l==-1 ) continue; // to be dropped
1255 memcpy(ptr_dst+size*l, ptr_src+size*k, size);
1256 }
1257 }
1258 bcf_update_format_int32(call->hdr, rec, key, tmp_new, nals_new*nsmpl);
1259 }
1260
1261 call->PLs = (int32_t*) tmp_new;
1262 call->mPLs = ntmp_new;
1263 call->itmp = (int32_t*) tmp_ori;
1264 call->n_itmp = ntmp_ori;
1265 }
1266
1267
1268 // NB: in this function we temporarily use calls->als_map for a different
1269 // purpose to store mapping from new (target) alleles to original alleles.
1270 //
mcall_constrain_alleles(call_t * call,bcf1_t * rec,int * unseen)1271 static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
1272 {
1273 assert( call->tgt_als->n );
1274 if ( call->tgt_als->n>5 ) error("Maximum accepted number of alleles is 5, got %d\n", call->tgt_als->n);
1275 hts_expand(char*,call->tgt_als->n+1,call->nals,call->als);
1276
1277 int has_new = 0;
1278
1279 int i, j, nals = 1;
1280 for (i=1; i<call->nals_map; i++) call->als_map[i] = -1;
1281
1282 if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], call->tgt_als->allele[0]) < 0 )
1283 error("The reference alleles are not compatible at %s:%d .. %s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,call->tgt_als->allele[0],rec->d.allele[0]);
1284
1285 // create mapping from new to old alleles
1286 call->als[0] = call->tgt_als->allele[0];
1287 call->als_map[0] = 0;
1288
1289 for (i=1; i<call->tgt_als->n; i++)
1290 {
1291 call->als[nals] = call->tgt_als->allele[i];
1292 j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]);
1293
1294 // if ( j+1==*unseen )
1295 // {
1296 // fprintf(stderr,"Fixme? Cannot constrain to %d-th allele (%s); j=%d,unseen=%d. VCF=",i,call->tgt_als->allele[i],j,*unseen);
1297 // int k;
1298 // for (k=0; k<rec->n_allele; k++) fprintf(stderr,"%s%s",k==0?"":",",rec->d.allele[k]);
1299 // fprintf(stderr,"\tTAB=");
1300 // for (k=0; k<call->tgt_als->n; k++) fprintf(stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]);
1301 // fprintf(stderr,"\n");
1302 // return -1;
1303 // }
1304
1305 if ( j>=0 )
1306 {
1307 // existing allele
1308 call->als_map[nals] = j+1;
1309 }
1310 else
1311 {
1312 // There is a new allele in targets which is not present in VCF.
1313 // We use the X allele to estimate PLs. Note that X may not be
1314 // present at multiallelic indels sites. In that case we use the
1315 // last allele anyway, because the least likely allele comes last
1316 // in mpileup's ALT output.
1317 call->als_map[nals] = (*unseen)>=0 ? *unseen : rec->n_allele - 1;
1318 has_new = 1;
1319 }
1320 nals++;
1321 }
1322 if ( *unseen )
1323 {
1324 call->als_map[nals] = *unseen;
1325 call->als[nals] = rec->d.allele[*unseen];
1326 nals++;
1327 }
1328
1329 if ( !has_new && nals==rec->n_allele ) return 0;
1330 bcf_update_alleles(call->hdr, rec, (const char**)call->als, nals);
1331
1332 // create mapping from new PL to old PL
1333 int k = 0;
1334 for (i=0; i<nals; i++)
1335 {
1336 for (j=0; j<=i; j++)
1337 {
1338 int a = call->als_map[i], b = call->als_map[j];
1339 call->pl_map[k++] = a>b ? a*(a+1)/2 + b : b*(b+1)/2 + a;
1340 }
1341 }
1342
1343 // update PL
1344 call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
1345 int nsmpl = bcf_hdr_nsamples(call->hdr);
1346 int npls_ori = call->nPLs / nsmpl;
1347 int npls_new = k;
1348 hts_expand(int32_t,npls_new*nsmpl,call->n_itmp,call->itmp);
1349 int *ori_pl = call->PLs, *new_pl = call->itmp;
1350 for (i=0; i<nsmpl; i++)
1351 {
1352 for (k=0; k<npls_new; k++)
1353 {
1354 new_pl[k] = ori_pl[call->pl_map[k]];
1355 if ( new_pl[k]==bcf_int32_missing && *unseen>=0 )
1356 {
1357 // missing value, and there is an unseen allele: identify the
1358 // alleles and use the lk of either AX or XX
1359 int k_ori = call->pl_map[k], ia, ib;
1360 bcf_gt2alleles(k_ori, &ia, &ib);
1361 k_ori = bcf_alleles2gt(ia,*unseen);
1362 if ( ori_pl[k_ori]==bcf_int32_missing ) k_ori = bcf_alleles2gt(ib,*unseen);
1363 if ( ori_pl[k_ori]==bcf_int32_missing ) k_ori = bcf_alleles2gt(*unseen,*unseen);
1364 new_pl[k] = ori_pl[k_ori];
1365 }
1366 if ( !k && new_pl[k]==bcf_int32_vector_end ) new_pl[k]=bcf_int32_missing;
1367 }
1368 ori_pl += npls_ori;
1369 new_pl += npls_new;
1370 }
1371 bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl);
1372
1373 // update QS, use temporarily call->GPs to store the values
1374 int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum);
1375 hts_expand(float,nals,call->nGPs,call->GPs);
1376 for (i=0; i<nals; i++)
1377 call->GPs[i] = call->als_map[i]<nqs ? call->smpl_grp[0].qsum[call->als_map[i]] : 0;
1378 bcf_update_info_float(call->hdr, rec, "QS", call->GPs, nals);
1379
1380 // update any Number=R tags
1381 void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point
1382 int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs;
1383 for (i=0; i<rec->n_fmt; i++)
1384 {
1385 bcf_fmt_t *fmt = &rec->d.fmt[i];
1386 int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id);
1387 if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
1388
1389 // NB:works only for BCF_HT_INT and BCF_HT_REAL
1390 int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id);
1391 assert( type==BCF_HT_INT || type==BCF_HT_REAL );
1392 assert( sizeof(float)==sizeof(int32_t) );
1393
1394 const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id);
1395 int nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
1396 if (nret<=0) continue;
1397 int nsmpl = bcf_hdr_nsamples(call->hdr);
1398 int size1 = sizeof(float);
1399 hts_expand(float, nsmpl * nals, ntmp_new, tmp_new);
1400 for (j=0; j<nsmpl; j++)
1401 {
1402 uint8_t *ptr_ori = (uint8_t *) tmp_ori + j*size1*fmt->n;
1403 uint8_t *ptr_new = (uint8_t *) tmp_new + j*nals*size1;
1404 for (k=0; k<nals; k++)
1405 {
1406 uint8_t *dst = ptr_new + size1*k;
1407 uint8_t *src = ptr_ori + size1*call->als_map[k];
1408 memcpy(dst,src,size1);
1409 }
1410 }
1411 nret = bcf_update_format(call->hdr, rec, key, tmp_new, nsmpl*nals, type);
1412 assert( nret==0 );
1413 }
1414 call->PLs = (int32_t*) tmp_new;
1415 call->mPLs = ntmp_new;
1416 call->itmp = (int32_t*) tmp_ori;
1417 call->n_itmp = ntmp_ori;
1418
1419 if ( *unseen ) *unseen = nals-1;
1420 return 0;
1421 }
1422
1423
1424 /**
1425 * This function implements the multiallelic calling model. It has two major parts:
1426 * 1) determine the most likely set of alleles and calculate the quality of ref/non-ref site
1427 * 2) determine and set the genotypes
1428 * In various places in between, the BCF record gets updated.
1429 */
mcall(call_t * call,bcf1_t * rec)1430 int mcall(call_t *call, bcf1_t *rec)
1431 {
1432 int i,j, unseen = call->unseen;
1433
1434 // Force alleles when calling genotypes given alleles was requested
1435 if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2;
1436
1437 int nsmpl = bcf_hdr_nsamples(call->hdr);
1438 int nals_ori = rec->n_allele;
1439 hts_expand(int,nals_ori,call->nac,call->ac);
1440 hts_expand(int,nals_ori,call->nals_map,call->als_map);
1441 hts_expand(int,nals_ori*(nals_ori+1)/2,call->npl_map,call->pl_map);
1442
1443 // Get the genotype likelihoods
1444 call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
1445 if ( call->nPLs!=nsmpl*nals_ori*(nals_ori+1)/2 && call->nPLs!=nsmpl*nals_ori ) // a mixture of diploid and haploid or haploid only
1446 error("Wrong number of PL fields? nals=%d npl=%d\n", nals_ori,call->nPLs);
1447
1448 // Convert PLs to probabilities
1449 int ngts_ori = nals_ori*(nals_ori+1)/2;
1450 hts_expand(double, call->nPLs, call->npdg, call->pdg);
1451 set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts_ori, unseen);
1452
1453 // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes.
1454 if ( call->nsmpl_grp == 1 )
1455 {
1456 int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum);
1457 if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
1458 if ( nqs < nals_ori )
1459 {
1460 // Some of the listed alleles do not have the corresponding QS field. This is
1461 // typically ref-only site with <*> in ALT.
1462 hts_expand(float,nals_ori,call->smpl_grp[0].nqsum,call->smpl_grp[0].qsum);
1463 for (i=nqs; i<nals_ori; i++) call->smpl_grp[0].qsum[i] = 0;
1464 }
1465 }
1466 else
1467 {
1468 for (j=0; j<call->nsmpl_grp; j++)
1469 {
1470 hts_expand(float,nals_ori,call->smpl_grp[j].nqsum,call->smpl_grp[j].qsum);
1471 memset(call->smpl_grp[j].qsum, 0, sizeof(float)*nals_ori);
1472 }
1473
1474 // Use FORMAT/AD or FORMAT/QS
1475 int nad = bcf_get_format_int32(call->hdr, rec, call->sample_groups_tag, &call->ADs, &call->nADs);
1476 if ( nad<1 ) error("Error: FORMAT/%s is required with the -G option, mpileup must be run with \"-a AD\" or \"-a QS\"\n",call->sample_groups_tag);
1477 nad /= bcf_hdr_nsamples(call->hdr);
1478 for (i=0; i<call->nsmpl_grp; i++)
1479 {
1480 int is;
1481 smpl_grp_t *grp = &call->smpl_grp[i];
1482 hts_expand(float,nals_ori,grp->nqsum,grp->qsum);
1483 for (j=0; j<nals_ori; j++) grp->qsum[j] = 0;
1484 for (is=0; is<grp->nsmpl; is++)
1485 {
1486 int ismpl = grp->smpl[is];
1487 int32_t *ptr = call->ADs + ismpl*nad;
1488 float sum = 0;
1489 for (j=0; j<nad; j++)
1490 {
1491 if ( ptr[j]==bcf_int32_vector_end ) break;
1492 if ( ptr[j]!=bcf_int32_missing ) sum += ptr[j];
1493 }
1494 if ( sum )
1495 {
1496 for (j=0; j<nad; j++)
1497 {
1498 if ( ptr[j]==bcf_int32_vector_end ) break;
1499 if ( ptr[j]!=bcf_int32_missing ) grp->qsum[j] += ptr[j]/sum;
1500 }
1501 }
1502 }
1503 }
1504 }
1505
1506 // If available, take into account reference panel AFs
1507 if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
1508 {
1509 int an = call->ac[0]; // number of alleles total, procede only if not zero; reuse call->ac
1510 if ( an > 0 && bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals_ori-1 ) // number of ALT alleles
1511 {
1512 int ac0 = an; // this will become the number of REFs
1513 for (i=0; i<nals_ori-1; i++)
1514 {
1515 if ( call->ac[i]==bcf_int32_vector_end ) break;
1516 if ( call->ac[i]==bcf_int32_missing ) continue;
1517 ac0 -= call->ac[i];
1518
1519 // here an*0.5 is the number of samples in the populatio and ac*0.5 is the AF weighted by the number of samples
1520 for (j=0; j<call->nsmpl_grp; j++)
1521 call->smpl_grp[j].qsum[i+1] = (call->smpl_grp[j].qsum[i+1] + 0.5*call->ac[i]) / (call->smpl_grp[j].nsmpl + 0.5*an);
1522 }
1523 if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1);
1524 for (j=0; j<call->nsmpl_grp; j++)
1525 call->smpl_grp[j].qsum[0] = (call->smpl_grp[j].qsum[0] + 0.5*ac0) / (call->smpl_grp[j].nsmpl + 0.5*an);
1526 }
1527 }
1528
1529 // normalize so that QS sums to 1 for each group
1530 for (j=0; j<call->nsmpl_grp; j++)
1531 {
1532 float sum = 0;
1533 for (i=0; i<nals_ori; i++) sum += call->smpl_grp[j].qsum[i];
1534 if ( sum ) for (i=0; i<nals_ori; i++) call->smpl_grp[j].qsum[i] /= sum;
1535 }
1536
1537 bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
1538
1539 if ( nals_ori > 8*sizeof(call->als_new) )
1540 {
1541 fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
1542 return 0;
1543 }
1544
1545 // For each group find the best combination of alleles
1546 call->als_new = 0;
1547 double ref_lk = -HUGE_VAL, lk_sum = -HUGE_VAL, max_qual = -HUGE_VAL;
1548 for (j=0; j<call->nsmpl_grp; j++)
1549 {
1550 smpl_grp_t *grp = &call->smpl_grp[j];
1551 mcall_find_best_alleles(call, nals_ori, grp);
1552 call->als_new |= grp->als;
1553 if ( grp->max_lk==-HUGE_VAL ) continue;
1554 double qual = -4.343*(grp->ref_lk - logsumexp2(grp->lk_sum,grp->ref_lk));
1555 if ( max_qual < qual )
1556 {
1557 max_qual = qual;
1558 lk_sum = grp->lk_sum;
1559 ref_lk = grp->ref_lk;
1560 }
1561 }
1562
1563 // Make sure the REF allele is always present
1564 if ( !(call->als_new&1) ) call->als_new |= 1;
1565
1566 int is_variant = call->als_new==1 ? 0 : 1;
1567 if ( call->flag & CALL_VARONLY && !is_variant ) return 0;
1568
1569 call->nals_new = 0;
1570 for (i=0; i<nals_ori; i++)
1571 {
1572 if ( i>0 && i==unseen ) continue;
1573 if ( call->flag & CALL_KEEPALT ) call->als_new |= 1<<i;
1574 if ( call->als_new & (1<<i) ) call->nals_new++;
1575 }
1576
1577 init_allele_trimming_maps(call,nals_ori,call->als_new);
1578
1579 int nAC = 0;
1580 if ( call->als_new==1 ) // only REF allele on output
1581 {
1582 mcall_set_ref_genotypes(call,nals_ori);
1583 bcf_update_format_int32(call->hdr, rec, "PL", NULL, 0); // remove PL, useless now
1584 }
1585 else if ( !is_variant )
1586 {
1587 mcall_set_ref_genotypes(call,nals_ori); // running with -A, prevent mcall_call_genotypes from putting some ALT back
1588 mcall_trim_and_update_PLs(call, rec, nals_ori, call->nals_new);
1589 }
1590 else
1591 {
1592 // The most likely set of alleles includes non-reference allele (or was enforced), call genotypes.
1593 // Note that it is a valid outcome if the called genotypes exclude some of the ALTs.
1594 int ngts_new = call->nals_new*(call->nals_new+1)/2;
1595 hts_expand(float,ngts_new*nsmpl,call->nGPs,call->GPs);
1596 for (i=0; i<call->nals_new; i++) call->ac[i] = 0;
1597
1598 if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 )
1599 {
1600 fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
1601 return 0;
1602 }
1603 if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
1604 {
1605 memset(call->GPs,0,nsmpl*ngts_new*sizeof(*call->GPs));
1606 memset(call->GQs,0,nsmpl*sizeof(*call->GQs));
1607 }
1608 for (i=0; i<call->nsmpl_grp; i++)
1609 {
1610 if ( call->flag & CALL_CONSTR_TRIO )
1611 error("todo: constrained trio calling temporarily disabled\n"); //mcall_call_trio_genotypes(call,rec,nals,&call->smpl_grp[i]);
1612 else
1613 mcall_call_genotypes(call,nals_ori,&call->smpl_grp[i]);
1614 }
1615
1616 // Skip the site if all samples are 0/0. This can happen occasionally.
1617 for (i=1; i<call->nals_new; i++) nAC += call->ac[i];
1618 if ( !nAC && call->flag & CALL_VARONLY ) return 0;
1619
1620 if ( call->output_tags & CALL_FMT_GP )
1621 bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*ngts_new);
1622 if ( call->output_tags & CALL_FMT_GQ )
1623 bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl);
1624
1625 mcall_trim_and_update_PLs(call,rec,nals_ori,call->nals_new);
1626 }
1627 if ( nals_ori!=call->nals_new )
1628 mcall_trim_and_update_numberR(call,rec,nals_ori,call->nals_new);
1629
1630 // Set QUAL
1631 if ( nAC )
1632 {
1633 // Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set
1634 rec->qual = max_qual;
1635 }
1636 else
1637 {
1638 // Set the quality of a REF site
1639 if ( lk_sum!=-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior
1640 rec->qual = -4.343*(lk_sum - logsumexp2(lk_sum,ref_lk));
1641 else if ( call->ac[0] )
1642 rec->qual = call->theta ? -4.343*call->theta : 0;
1643 else
1644 bcf_float_set_missing(rec->qual);
1645 }
1646
1647 // AC, AN
1648 if ( call->nals_new>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, call->nals_new-1);
1649 nAC += call->ac[0];
1650 bcf_update_info_int32(call->hdr, rec, "AN", &nAC, 1);
1651
1652 // Remove unused alleles
1653 hts_expand(char*,call->nals_new,call->nals,call->als);
1654 for (i=0; i<nals_ori; i++)
1655 if ( call->als_map[i]>=0 ) call->als[call->als_map[i]] = rec->d.allele[i];
1656 bcf_update_alleles(call->hdr, rec, (const char**)call->als, call->nals_new);
1657 bcf_update_genotypes(call->hdr, rec, call->gts, nsmpl*2);
1658
1659 // DP4 and PV4 tags
1660 if ( bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16)==16 )
1661 {
1662 int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3];
1663 bcf_update_info_int32(call->hdr, rec, "DP4", dp, 4);
1664
1665 int32_t mq = (call->anno16[8]+call->anno16[10])/(call->anno16[0]+call->anno16[1]+call->anno16[2]+call->anno16[3]);
1666 bcf_update_info_int32(call->hdr, rec, "MQ", &mq, 1);
1667
1668 if ( call->output_tags & CALL_FMT_PV4 )
1669 {
1670 anno16_t a;
1671 float tmpf[4];
1672 int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 1 : 0;
1673 if ( is_tested )
1674 {
1675 for (i=0; i<4; i++) tmpf[i] = a.p[i];
1676 bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4);
1677 }
1678 }
1679 }
1680
1681 bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
1682
1683 return call->nals_new;
1684 }
1685
1686