1 package org.broadinstitute.hellbender.tools.walkers.annotator; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import htsjdk.variant.variantcontext.Allele; 5 import htsjdk.variant.variantcontext.Genotype; 6 import htsjdk.variant.variantcontext.GenotypesContext; 7 import htsjdk.variant.variantcontext.VariantContext; 8 import org.broadinstitute.barclay.help.DocumentedFeature; 9 import org.broadinstitute.hellbender.engine.ReferenceContext; 10 import org.broadinstitute.hellbender.utils.MathUtils; 11 import org.broadinstitute.hellbender.utils.Utils; 12 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; 13 import org.broadinstitute.hellbender.utils.help.HelpConstants; 14 import org.broadinstitute.hellbender.utils.read.GATKRead; 15 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; 16 17 import java.util.Collections; 18 import java.util.List; 19 import java.util.Map; 20 21 /** 22 * Variant confidence normalized by unfiltered depth of variant samples 23 * 24 * <p>This annotation puts the variant confidence QUAL score into perspective by normalizing for the amount of coverage available. Because each read contributes a little to the QUAL score, variants in regions with deep coverage can have artificially inflated QUAL scores, giving the impression that the call is supported by more evidence than it really is. To compensate for this, we normalize the variant confidence by depth, which gives us a more objective picture of how well supported the call is.</p> 25 * 26 * <h3>Statistical notes</h3> 27 * <p>The QD is the QUAL score normalized by allele depth (AD) for a variant. For a single sample, the HaplotypeCaller calculates the QD by taking QUAL/AD. For multiple samples, HaplotypeCaller and GenotypeGVCFs calculate the QD by taking QUAL/AD of samples with a non hom-ref genotype call. The reason we leave out the samples with a hom-ref call is to not penalize the QUAL for the other samples with the variant call.</p> 28 * <p>Here is a single sample example:</p> 29 * <pre>2 37629 . C G 1063.77 . AC=2;AF=1.00;AN=2;DP=31;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=58.50;QD=34.32;SOR=2.376 GT:AD:DP:GQ:PL:QSS 1/1:0,31:31:93:1092,93,0:0,960</pre> 30 <p>QUAL/AD = 1063.77/31 = 34.32 = QD</p> 31 * <p>Here is a multi-sample example:</p> 32 * <pre>10 8046 . C T 4107.13 . AC=1;AF=0.167;AN=6;BaseQRankSum=-3.717;DP=1063;FS=1.616;MLEAC=1;MLEAF=0.167;QD=11.54 33 GT:AD:DP:GQ:PL:QSS 0/0:369,4:373:99:0,1007,12207:10548,98 0/0:331,1:332:99:0,967,11125:9576,27 0/1:192,164:356:99:4138,0,5291:5501,4505</pre> 34 * <p>QUAL/AD = 4107.13/356 = 11.54 = QD</p> 35 * 36 * <h3>Caveat</h3> 37 * <p>This annotation can only be calculated for sites for which at least one sample was genotyped as carrying a variant allele.</p> 38 * 39 * <h3>Related annotations</h3> 40 * <ul> 41 * <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_Coverage.php">Coverage</a></b> gives the filtered depth of coverage for each sample and the unfiltered depth across all samples.</li> 42 * <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_DepthPerAlleleBySample.php">DepthPerAlleleBySample</a></b> calculates depth of coverage for each allele per sample (AD).</li> 43 * </ul> 44 */ 45 @DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Variant confidence normalized by unfiltered depth of variant samples (QD)") 46 public final class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation { 47 48 @VisibleForTesting 49 static final double MAX_QD_BEFORE_FIXING = 35; 50 51 @VisibleForTesting 52 static final double IDEAL_HIGH_QD = 30; 53 private static final double JITTER_SIGMA = 3; 54 55 @Override annotate(final ReferenceContext ref, final VariantContext vc, final AlleleLikelihoods<GATKRead, Allele> likelihoods)56 public Map<String, Object> annotate(final ReferenceContext ref, 57 final VariantContext vc, 58 final AlleleLikelihoods<GATKRead, Allele> likelihoods) { 59 Utils.nonNull(vc); 60 if ( !vc.hasLog10PError() ) { 61 return Collections.emptyMap(); 62 } 63 64 final GenotypesContext genotypes = vc.getGenotypes(); 65 if ( genotypes == null || genotypes.isEmpty() ) { 66 return Collections.emptyMap(); 67 } 68 69 final int depth = getDepth(genotypes, likelihoods); 70 71 if ( depth == 0 ) { 72 return Collections.emptyMap(); 73 } 74 75 final double qual = -10.0 * vc.getLog10PError(); 76 double QD = qual / depth; 77 78 // Hack: see note in the fixTooHighQD method below 79 QD = fixTooHighQD(QD); 80 81 return Collections.singletonMap(getKeyNames().get(0), String.format("%.2f", QD)); 82 } 83 84 /** 85 * The haplotype caller generates very high quality scores when multiple events are on the 86 * same haplotype. This causes some very good variants to have unusually high QD values, 87 * and VQSR will filter these out. This code looks at the QD value, and if it is above 88 * threshold we map it down to the mean high QD value, with some jittering 89 * 90 * @param QD the raw QD score 91 * @return a QD value 92 */ fixTooHighQD(final double QD)93 public static double fixTooHighQD(final double QD) { 94 if ( QD < MAX_QD_BEFORE_FIXING ) { 95 return QD; 96 } else { 97 return IDEAL_HIGH_QD + Utils.getRandomGenerator().nextGaussian() * JITTER_SIGMA; 98 } 99 } 100 getDepth(final GenotypesContext genotypes, final AlleleLikelihoods<GATKRead, Allele> likelihoods)101 public static int getDepth(final GenotypesContext genotypes, final AlleleLikelihoods<GATKRead, Allele> likelihoods) { 102 int depth = 0; 103 int ADrestrictedDepth = 0; 104 105 for ( final Genotype genotype : genotypes ) { 106 // we care only about variant calls with likelihoods 107 if ( !genotype.isHet() && !genotype.isHomVar() ) { 108 continue; 109 } 110 111 // if we have the AD values for this sample, let's make sure that the variant depth is greater than 1! 112 if ( genotype.hasAD() ) { 113 final int[] AD = genotype.getAD(); 114 final int totalADdepth = (int) MathUtils.sum(AD); 115 if ( totalADdepth != 0 ) { 116 if (totalADdepth - AD[0] > 1) { 117 ADrestrictedDepth += totalADdepth; 118 } 119 depth += totalADdepth; 120 continue; 121 } 122 } 123 // if there is no AD value or it is a dummy value, we want to look to other means to get the depth 124 if (likelihoods != null) { 125 depth += likelihoods.sampleEvidenceCount(likelihoods.indexOfSample(genotype.getSampleName())); 126 } else if ( genotype.hasDP() ) { 127 depth += genotype.getDP(); 128 } 129 } 130 131 // if the AD-restricted depth is a usable value (i.e. not zero), then we should use that one going forward 132 if ( ADrestrictedDepth > 0 ) { 133 depth = ADrestrictedDepth; 134 } 135 return depth; 136 } 137 138 @Override getKeyNames()139 public List<String> getKeyNames() { return Collections.singletonList(GATKVCFConstants.QUAL_BY_DEPTH_KEY); } 140 } 141