1 package org.broadinstitute.hellbender.tools.walkers.annotator;
2 
3 import htsjdk.variant.variantcontext.Allele;
4 import htsjdk.variant.variantcontext.Genotype;
5 import htsjdk.variant.variantcontext.GenotypeBuilder;
6 import htsjdk.variant.variantcontext.VariantContext;
7 import htsjdk.variant.vcf.VCFConstants;
8 import htsjdk.variant.vcf.VCFFormatHeaderLine;
9 import htsjdk.variant.vcf.VCFStandardHeaderLines;
10 import org.broadinstitute.barclay.help.DocumentedFeature;
11 import org.broadinstitute.hellbender.engine.ReferenceContext;
12 import org.broadinstitute.hellbender.utils.Utils;
13 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
14 import org.broadinstitute.hellbender.utils.help.HelpConstants;
15 import org.broadinstitute.hellbender.utils.pileup.PileupElement;
16 import org.broadinstitute.hellbender.utils.read.GATKRead;
17 
18 import java.util.*;
19 import java.util.stream.Collectors;
20 
21 /**
22  * Depth of coverage of each allele per sample
23  *
24  * <p>Also known as the allele depth, this annotation gives the unfiltered count of reads that support a given allele for an individual sample. The values in the field are ordered to match the order of alleles specified in the REF and ALT fields: REF, ALT1, ALT2 and so on if there are multiple ALT alleles.</p>
25  *
26  * <p>See the method documentation on <a href="http://www.broadinstitute.org/gatk/guide/article?id=4721">using coverage information</a> for important interpretation details.</p>
27  *
28  * <h3>Caveats</h3>
29  * <ul>
30  *     <li>The AD calculation as performed by HaplotypeCaller may not yield exact results because only reads that statistically favor one allele over the other are counted. Due to this fact, the sum of AD may be different than the individual sample depth, especially when there are many non-informative reads.</li>
31  *     <li>Because the AD includes reads and bases that were filtered by the caller (and in case of indels, is based on a statistical computation), it  should not be used to make assumptions about the genotype that it is associated with. Ultimately, the phred-scaled genotype likelihoods (PLs) are what determines the genotype calls.</li>
32  * </ul>
33  *
34  * <h3>Related annotations</h3>
35  * <ul>
36  *     <li><b><a href="https://www.broadinstitute.org/gatk/guide/tooldocs/org_broadinstitute_gatk_tools_walkers_annotator_Coverage.php">Coverage</a></b> gives the filtered depth of coverage for each sample and the unfiltered depth across all samples.</li>
37  * </ul>
38  */
39 @DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Depth of coverage of each allele per sample (AD)")
40 public final class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation, StandardMutectAnnotation {
41 
42     @Override
annotate(final ReferenceContext ref, final VariantContext vc, final Genotype g, final GenotypeBuilder gb, final AlleleLikelihoods<GATKRead, Allele> likelihoods)43     public void annotate(final ReferenceContext ref,
44                          final VariantContext vc,
45                          final Genotype g,
46                          final GenotypeBuilder gb,
47                          final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
48         Utils.nonNull(gb, "gb is null");
49         Utils.nonNull(vc, "vc is null");
50 
51         if ( g == null || !g.isCalled() || likelihoods == null) {
52             return;
53         }
54         final Set<Allele> alleles = new LinkedHashSet<>(vc.getAlleles());
55 
56         // make sure that there's a meaningful relationship between the alleles in the likelihoods and our VariantContext
57         Utils.validateArg(likelihoods.alleles().containsAll(alleles), () -> "VC alleles " + alleles + " not a  subset of AlleleLikelihoods alleles " + likelihoods.alleles());
58 
59         gb.AD(annotateWithLikelihoods(vc, g, alleles, likelihoods));
60     }
61 
annotateWithLikelihoods(VariantContext vc, Genotype g, Set<Allele> alleles, final AlleleLikelihoods<GATKRead, Allele> likelihoods)62     protected int[] annotateWithLikelihoods(VariantContext vc, Genotype g, Set<Allele> alleles, final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
63 
64         final Map<Allele, Integer> alleleCounts = new LinkedHashMap<>();
65         for ( final Allele allele : vc.getAlleles() ) {
66             alleleCounts.put(allele, 0);
67         }
68         final Map<Allele, List<Allele>> alleleSubset = alleles.stream().collect(Collectors.toMap(a -> a, Arrays::asList));
69         final AlleleLikelihoods<GATKRead, Allele> subsettedLikelihoods = likelihoods.marginalize(alleleSubset);
70         subsettedLikelihoods.bestAllelesBreakingTies(g.getSampleName()).stream()
71                 .filter(ba -> ba.isInformative())
72                 .forEach(ba -> alleleCounts.compute(ba.allele, (allele,prevCount) -> prevCount + 1));
73 
74         final int[] counts = new int[alleleCounts.size()];
75         counts[0] = alleleCounts.get(vc.getReference()); //first one in AD is always ref
76         for (int i = 0; i < vc.getNAlleles() -1; i++) {
77             counts[i + 1] = alleleCounts.get(vc.getAlternateAllele(i));
78         }
79 
80         return counts;
81     }
82 
83     @Override
getKeyNames()84     public List<String> getKeyNames() { return Collections.singletonList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); }
85 
86     @Override
getDescriptions()87     public List<VCFFormatHeaderLine> getDescriptions() {
88         return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(getKeyNames().get(0)));
89     }
90 }
91