1 package org.broadinstitute.hellbender.tools.walkers.annotator;
2 
3 import htsjdk.variant.variantcontext.Allele;
4 import htsjdk.variant.variantcontext.Genotype;
5 import htsjdk.variant.variantcontext.GenotypeBuilder;
6 import htsjdk.variant.variantcontext.VariantContext;
7 import htsjdk.variant.vcf.VCFFormatHeaderLine;
8 import org.apache.commons.lang.mutable.MutableInt;
9 import org.apache.logging.log4j.LogManager;
10 import org.apache.logging.log4j.Logger;
11 import org.broadinstitute.barclay.help.DocumentedFeature;
12 import org.broadinstitute.hellbender.engine.ReferenceContext;
13 import org.broadinstitute.hellbender.utils.QualityUtils;
14 import org.broadinstitute.hellbender.utils.Utils;
15 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
16 import org.broadinstitute.hellbender.utils.help.HelpConstants;
17 import org.broadinstitute.hellbender.utils.pileup.PileupElement;
18 import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
19 import org.broadinstitute.hellbender.utils.read.GATKRead;
20 import org.broadinstitute.hellbender.utils.read.ReadUtils;
21 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
22 import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
23 import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
24 
25 import java.util.*;
26 import java.util.function.Function;
27 import java.util.stream.Collectors;
28 
29 /**
30  *  Count of read pairs in the F1R2 and F2R1 configurations supporting the reference and alternate alleles
31  *
32  *  <p>This is an annotation that gathers information about the read pair configuration for the reads supporting each
33  *  allele. It can be used along with downstream filtering steps to identify and filter out erroneous variants that occur
34  *  with higher frequency in one read pair orientation.</p>
35  *
36  *  <h3>References</h3>
37  *  <p>For more details about the mechanism of oxoG artifact generation, see <a href='http://www.ncbi.nlm.nih.gov/pubmed/23303777' target='_blank'>
38  *      <i></i>Discovery and characterization of artefactual mutations in deep coverage targeted capture sequencing data due to oxidative DNA damage during sample preparation.</i>
39  *  by Costello et al, doi: 10.1093/nar/gks1443</a></p>
40  */
41 @DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Count of read pairs in the F1R2 and F2R1 configurations supporting REF and ALT alleles (F1R2, F2R1)")
42 public final class OrientationBiasReadCounts extends GenotypeAnnotation implements StandardMutectAnnotation {
43 
44     private static final Logger logger = LogManager.getLogger(OrientationBiasReadCounts.class);
45 
46     private static final int MINIMUM_BASE_QUALITY = 20;
47 
48     @Override
getKeyNames()49     public List<String> getKeyNames() {
50         return Arrays.asList(GATKVCFConstants.F1R2_KEY, GATKVCFConstants.F2R1_KEY);
51     }
52 
53     @Override
getDescriptions()54     public List<VCFFormatHeaderLine> getDescriptions() {
55         return Arrays.asList(
56                 GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.F1R2_KEY),
57                 GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.F2R1_KEY));
58     }
59 
60     @Override
annotate(final ReferenceContext refContext, final VariantContext vc, final Genotype g, final GenotypeBuilder gb, final AlleleLikelihoods<GATKRead, Allele> likelihoods)61     public void annotate(final ReferenceContext refContext,
62                                   final VariantContext vc,
63                                   final Genotype g,
64                                   final GenotypeBuilder gb,
65                                   final AlleleLikelihoods<GATKRead, Allele> likelihoods){
66         Utils.nonNull(gb, "gb is null");
67         Utils.nonNull(vc, "vc is null");
68 
69         if (g == null || likelihoods == null) {
70             return;
71         }
72 
73         final Map<Allele, MutableInt> f1r2Counts = likelihoods.alleles().stream()
74                 .collect(Collectors.toMap(a -> a, a -> new MutableInt(0)));
75 
76         final Map<Allele, MutableInt> f2r1Counts = likelihoods.alleles().stream()
77                 .collect(Collectors.toMap(a -> a, a -> new MutableInt(0)));
78 
79         Utils.stream(likelihoods.bestAllelesBreakingTies(g.getSampleName()))
80                 .filter(ba -> ba.isInformative() && isUsableRead(ba.evidence) && BaseQualityRankSumTest.getReadBaseQuality(ba.evidence, vc).orElse(0) >= MINIMUM_BASE_QUALITY)
81                 .forEach(ba -> (ReadUtils.isF2R1(ba.evidence) ? f2r1Counts : f1r2Counts).get(ba.allele).increment());
82 
83         final int[] f1r2 = vc.getAlleles().stream().mapToInt(a -> f1r2Counts.get(a).intValue()).toArray();
84 
85         final int[] f2r1 = vc.getAlleles().stream().mapToInt(a -> f2r1Counts.get(a).intValue()).toArray();
86 
87         gb.attribute(GATKVCFConstants.F1R2_KEY, f1r2);
88         gb.attribute(GATKVCFConstants.F2R1_KEY, f2r1);
89     }
90 
91     /**
92      *  Annotate the given variant context with the OxoG read count attributes, directly from the read pileup.
93      *
94      *  This method may be slow and should be considered EXPERIMENTAL, especially with regard to indels and complex/mixed
95      *    variants.
96      *
97      * @param vc variant context for the genotype.  Necessary so that we can see all alleles.
98      * @param gb genotype builder to put the annotations into.
99      * @param readPileup pileup of the reads at this vc.  Note that this pileup does not have to match the
100      *                   genotype.  In other words, this tool does not check that the pileup was generated from the
101      *                   genotype sample.
102      */
annotateSingleVariant(final VariantContext vc, final GenotypeBuilder gb, final ReadPileup readPileup, int meanBaseQualityCutoff)103     public static void annotateSingleVariant(final VariantContext vc, final GenotypeBuilder gb,
104                                              final ReadPileup readPileup, int meanBaseQualityCutoff) {
105         Utils.nonNull(gb, "gb is null");
106         Utils.nonNull(vc, "vc is null");
107 
108         // Create a list of unique alleles
109         final List<Allele> variantAllelesWithDupes = vc.getAlleles();
110         final Set<Allele> alleleSet = new LinkedHashSet<>(variantAllelesWithDupes);
111         final List<Allele> variantAlleles = new ArrayList<>(alleleSet);
112 
113         // Initialize the mappings
114         final Map<Allele, MutableInt> f1r2Counts = variantAlleles.stream()
115                 .collect(Collectors.toMap(Function.identity(), a -> new MutableInt(0)));
116 
117         final Map<Allele, MutableInt> f2r1Counts = variantAlleles.stream()
118                 .collect(Collectors.toMap(Function.identity(), a -> new MutableInt(0)));
119 
120         final List<Allele> referenceAlleles = variantAlleles.stream().filter(a -> a.isReference() && !a.isSymbolic()).collect(Collectors.toList());
121         final List<Allele> altAlleles = variantAlleles.stream().filter(a -> a.isNonReference() && !a.isSymbolic()).collect(Collectors.toList());
122 
123         if (referenceAlleles.size() != 1) {
124             logger.warn("Number of reference alleles does not equal  for VC: " + vc);
125         }
126 
127         // We MUST have exactly 1 non-symbolic reference allele and a read pileup,
128         if ((referenceAlleles.size() == 1) && (readPileup != null) && !referenceAlleles.get(0).isSymbolic()) {
129             final Allele referenceAllele = referenceAlleles.get(0);
130             Utils.stream(readPileup)
131                     .filter(pe -> isUsableRead(pe.getRead()))
132                     .forEach(pe -> incrementCounts(pe, f1r2Counts, f2r1Counts, referenceAllele, altAlleles, meanBaseQualityCutoff));
133         }
134 
135         final int[] f1r2 = variantAlleles.stream().mapToInt(a -> f1r2Counts.get(a).intValue()).toArray();
136 
137         final int[] f2r1 = variantAlleles.stream().mapToInt(a -> f2r1Counts.get(a).intValue()).toArray();
138 
139         gb.attribute(GATKVCFConstants.F1R2_KEY, f1r2);
140         gb.attribute(GATKVCFConstants.F2R1_KEY, f2r1);
141     }
142 
143     /**
144      *  If the allele is not in the count mappings, then it is not counted.  No exception will be thrown
145      *  Modifies count variables in place.
146      *
147      * @param pileupElement pileup overlapping the alleles
148      * @param f1r2Counts a mapping of allele to f1r2 counts
149      * @param f2r1Counts a mapping of allele to f2r1 counts
150      */
incrementCounts(final PileupElement pileupElement, final Map<Allele, MutableInt> f1r2Counts, final Map<Allele, MutableInt> f2r1Counts, final Allele referenceAllele, final List<Allele> altAlleles, int minBaseQualityCutoff)151     private static void incrementCounts(final PileupElement pileupElement, final Map<Allele, MutableInt> f1r2Counts,
152                                     final Map<Allele, MutableInt> f2r1Counts, final Allele referenceAllele,
153                                         final List<Allele> altAlleles, int minBaseQualityCutoff) {
154 
155         final Map<Allele, MutableInt> countMap = ReadUtils.isF2R1(pileupElement.getRead()) ? f2r1Counts : f1r2Counts;
156 
157         final Allele pileupAllele = GATKVariantContextUtils.chooseAlleleForRead(pileupElement, referenceAllele, altAlleles, minBaseQualityCutoff);
158 
159         if (pileupAllele == null) {
160             return;
161         }
162 
163         if (countMap.containsKey(pileupAllele)) {
164             countMap.get(pileupAllele).increment();
165         }
166     }
167 
isUsableRead(final GATKRead read)168     protected static boolean isUsableRead(final GATKRead read) {
169         return read.getMappingQuality() != 0 && read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE;
170     }
171 
172 }