package org.broadinstitute.hellbender.tools.walkers.annotator;

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.*;
import java.util.stream.Collectors;
/**
 * Depth of informative coverage for each sample.
 *
 * <p>This annotation is similar to the sample-level DP annotation, which counts read depth after general filtering, but with an extra layer of stringency. Its purpose is to provide the count of reads that are actually considered informative by HaplotypeCaller (HC), using the per-read likelihoods that are produced internally by HC.</p>
 * <p>In this context, an informative read is defined as one that allows the allele it carries to be easily distinguished. In contrast, a read might be considered uninformative if, for example, it only partially overlaps a short tandem repeat and it is not clear whether the read contains the reference allele or an extra repeat.</p>
 *
 * <p>See the method documentation on <a href="http://www.broadinstitute.org/gatk/guide/article?id=4721">using coverage information</a> for important interpretation details.</p>
 *
 * <h3>Caveats</h3>
 * <ul>
 *     <li>This annotation can only be generated by HaplotypeCaller (it will not work when called from VariantAnnotator).</li>
 * </ul>
 *
 * <h3>Related annotations</h3>
 * <ul>
 *     <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_DepthPerAlleleBySample.php">DepthPerAlleleBySample</a></b> calculates depth of coverage for each allele per sample (AD).</li>
 *     <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_Coverage.php">Coverage</a></b> gives the filtered depth of coverage for each sample and the unfiltered depth across all samples.</li>
 * </ul>
 *
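 * <h3>Example</h3>
 * <p>In the VCF output, this annotation appears as the per-sample DP FORMAT field. The values below are purely
 * illustrative; note that the informative depth matches the per-allele AD counts (sum(AD) = DP):</p>
 * <pre>
 *     GT:AD:DP:GQ    0/1:10,12:22:99
 * </pre>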
 */
@DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Depth of informative coverage for each sample (DP)")
public final class DepthPerSampleHC extends GenotypeAnnotation implements StandardHCAnnotation, StandardMutectAnnotation {
    private static final Logger logger = LogManager.getLogger(DepthPerSampleHC.class);

    @Override
    public void annotate( final ReferenceContext ref,
                          final VariantContext vc,
                          final Genotype g,
                          final GenotypeBuilder gb,
                          final AlleleLikelihoods<GATKRead, Allele> likelihoods ) {
        Utils.nonNull(vc);
        Utils.nonNull(g);
        Utils.nonNull(gb);

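        // without per-read likelihoods (e.g. when run from VariantAnnotator rather than HaplotypeCaller) or a
        // called genotype, there is no informative depth to compute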
        if ( likelihoods == null || !g.isCalled() ) {
            logger.warn(AnnotationUtils.generateMissingDataWarning(vc, g, likelihoods));
            return;
        }

        // check that there are reads
        final String sample = g.getSampleName();
        if (likelihoods.sampleEvidenceCount(likelihoods.indexOfSample(sample)) == 0) {
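            // report an explicit DP of 0 for a sample with no reads, rather than leaving the field missing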
            gb.DP(0);
            return;
        }

        final Set<Allele> alleles = new LinkedHashSet<>(vc.getAlleles());

        // make sure that there's a meaningful relationship between the alleles in the likelihoods and our VariantContext
        if ( !likelihoods.alleles().containsAll(alleles) ) {
            logger.warn("VC alleles " + alleles + " not a subset of AlleleLikelihoods alleles " + likelihoods.alleles());
            return;
        }

        // the depth for the HC is the count of informative reads across all alleles at this site.  It's not perfect (as we cannot
        // differentiate between reads that align over the event but aren't informative vs. those that aren't even
        // close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP).
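        // restrict the likelihoods to the alleles in the VariantContext; the mapping is 1:1, so marginalizing
        // simply drops any alleles that are not part of this call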
        final Map<Allele, List<Allele>> alleleSubset = alleles.stream().collect(Collectors.toMap(a -> a, Collections::singletonList));
        final AlleleLikelihoods<GATKRead, Allele> subsettedLikelihoods = likelihoods.marginalize(alleleSubset);
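        // a read counts toward DP only if its best allele is informative, i.e. the best likelihood exceeds the
        // next best by enough of a margin to confidently assign the read to that allele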
        final int depth = (int) subsettedLikelihoods.bestAllelesBreakingTies(sample).stream().filter(ba -> ba.isInformative()).count();
        gb.DP(depth);
    }

    @Override
    public List<String> getKeyNames() {
        return Collections.singletonList(VCFConstants.DEPTH_KEY);
    }

    @Override
    public List<VCFFormatHeaderLine> getDescriptions() {
        return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY));
    }
}