1 package org.broadinstitute.hellbender.tools.walkers.annotator;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import htsjdk.variant.variantcontext.Allele;
5 import htsjdk.variant.variantcontext.Genotype;
6 import htsjdk.variant.variantcontext.GenotypesContext;
7 import htsjdk.variant.variantcontext.VariantContext;
8 import org.broadinstitute.barclay.help.DocumentedFeature;
9 import org.broadinstitute.hellbender.engine.ReferenceContext;
10 import org.broadinstitute.hellbender.utils.MathUtils;
11 import org.broadinstitute.hellbender.utils.Utils;
12 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
13 import org.broadinstitute.hellbender.utils.help.HelpConstants;
14 import org.broadinstitute.hellbender.utils.read.GATKRead;
15 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
16 
17 import java.util.Collections;
18 import java.util.List;
19 import java.util.Map;
20 
21 /**
22  * Variant confidence normalized by unfiltered depth of variant samples
23  *
24  * <p>This annotation puts the variant confidence QUAL score into perspective by normalizing for the amount of coverage available. Because each read contributes a little to the QUAL score, variants in regions with deep coverage can have artificially inflated QUAL scores, giving the impression that the call is supported by more evidence than it really is. To compensate for this, we normalize the variant confidence by depth, which gives us a more objective picture of how well supported the call is.</p>
25  *
26  * <h3>Statistical notes</h3>
27  * <p>The QD is the QUAL score normalized by allele depth (AD) for a variant. For a single sample, the HaplotypeCaller calculates the QD by taking QUAL/AD. For multiple samples, HaplotypeCaller and GenotypeGVCFs calculate the QD by taking QUAL/AD of samples with a non hom-ref genotype call. The reason we leave out the samples with a hom-ref call is to not penalize the QUAL for the other samples with the variant call.</p>
28  * <p>Here is a single sample example:</p>
29  * <pre>2	37629	.	C	G	1063.77	.	AC=2;AF=1.00;AN=2;DP=31;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=58.50;QD=34.32;SOR=2.376	GT:AD:DP:GQ:PL:QSS	1/1:0,31:31:93:1092,93,0:0,960</pre>
30    <p>QUAL/AD = 1063.77/31 = 34.32 = QD</p>
31  * <p>Here is a multi-sample example:</p>
32  * <pre>10	8046	.	C	T	4107.13	.	AC=1;AF=0.167;AN=6;BaseQRankSum=-3.717;DP=1063;FS=1.616;MLEAC=1;MLEAF=0.167;QD=11.54
33    GT:AD:DP:GQ:PL:QSS	0/0:369,4:373:99:0,1007,12207:10548,98	    0/0:331,1:332:99:0,967,11125:9576,27	    0/1:192,164:356:99:4138,0,5291:5501,4505</pre>
34  * <p>QUAL/AD = 4107.13/356 = 11.54 = QD</p>
35  *
36  * <h3>Caveat</h3>
37  * <p>This annotation can only be calculated for sites for which at least one sample was genotyped as carrying a variant allele.</p>
38  *
39  * <h3>Related annotations</h3>
40  * <ul>
41  *     <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_Coverage.php">Coverage</a></b> gives the filtered depth of coverage for each sample and the unfiltered depth across all samples.</li>
42  *     <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_DepthPerAlleleBySample.php">DepthPerAlleleBySample</a></b> calculates depth of coverage for each allele per sample (AD).</li>
43  * </ul>
44  */
45 @DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Variant confidence normalized by unfiltered depth of variant samples (QD)")
46 public final class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation {
47 
48     @VisibleForTesting
49     static final double MAX_QD_BEFORE_FIXING = 35;
50 
51     @VisibleForTesting
52     static final double IDEAL_HIGH_QD = 30;
53     private static final double JITTER_SIGMA = 3;
54 
55     @Override
annotate(final ReferenceContext ref, final VariantContext vc, final AlleleLikelihoods<GATKRead, Allele> likelihoods)56     public Map<String, Object> annotate(final ReferenceContext ref,
57                                         final VariantContext vc,
58                                         final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
59         Utils.nonNull(vc);
60         if ( !vc.hasLog10PError() ) {
61             return Collections.emptyMap();
62         }
63 
64         final GenotypesContext genotypes = vc.getGenotypes();
65         if ( genotypes == null || genotypes.isEmpty() ) {
66             return Collections.emptyMap();
67         }
68 
69         final int depth = getDepth(genotypes, likelihoods);
70 
71         if ( depth == 0 ) {
72             return Collections.emptyMap();
73         }
74 
75         final double qual = -10.0 * vc.getLog10PError();
76         double QD = qual / depth;
77 
78         // Hack: see note in the fixTooHighQD method below
79         QD = fixTooHighQD(QD);
80 
81         return Collections.singletonMap(getKeyNames().get(0), String.format("%.2f", QD));
82     }
83 
84     /**
85      * The haplotype caller generates very high quality scores when multiple events are on the
86      * same haplotype.  This causes some very good variants to have unusually high QD values,
87      * and VQSR will filter these out.  This code looks at the QD value, and if it is above
88      * threshold we map it down to the mean high QD value, with some jittering
89      *
90      * @param QD the raw QD score
91      * @return a QD value
92      */
fixTooHighQD(final double QD)93     public static double fixTooHighQD(final double QD) {
94         if ( QD < MAX_QD_BEFORE_FIXING ) {
95             return QD;
96         } else {
97             return IDEAL_HIGH_QD + Utils.getRandomGenerator().nextGaussian() * JITTER_SIGMA;
98         }
99     }
100 
getDepth(final GenotypesContext genotypes, final AlleleLikelihoods<GATKRead, Allele> likelihoods)101     public static int getDepth(final GenotypesContext genotypes, final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
102         int depth = 0;
103         int ADrestrictedDepth = 0;
104 
105         for ( final Genotype genotype : genotypes ) {
106             // we care only about variant calls with likelihoods
107             if ( !genotype.isHet() && !genotype.isHomVar() ) {
108                 continue;
109             }
110 
111             // if we have the AD values for this sample, let's make sure that the variant depth is greater than 1!
112             if ( genotype.hasAD() ) {
113                 final int[] AD = genotype.getAD();
114                 final int totalADdepth = (int) MathUtils.sum(AD);
115                 if ( totalADdepth != 0 ) {
116                     if (totalADdepth - AD[0] > 1) {
117                         ADrestrictedDepth += totalADdepth;
118                     }
119                     depth += totalADdepth;
120                     continue;
121                 }
122             }
123             // if there is no AD value or it is a dummy value, we want to look to other means to get the depth
124             if (likelihoods != null) {
125                 depth += likelihoods.sampleEvidenceCount(likelihoods.indexOfSample(genotype.getSampleName()));
126             } else if ( genotype.hasDP() ) {
127                 depth += genotype.getDP();
128             }
129         }
130 
131         // if the AD-restricted depth is a usable value (i.e. not zero), then we should use that one going forward
132         if ( ADrestrictedDepth > 0 ) {
133             depth = ADrestrictedDepth;
134         }
135         return depth;
136     }
137 
138     @Override
getKeyNames()139     public List<String> getKeyNames() { return Collections.singletonList(GATKVCFConstants.QUAL_BY_DEPTH_KEY); }
140 }
141