1 package org.broadinstitute.hellbender.tools.walkers.annotator; 2 3 import htsjdk.variant.variantcontext.Allele; 4 import htsjdk.variant.variantcontext.Genotype; 5 import htsjdk.variant.variantcontext.GenotypeBuilder; 6 import htsjdk.variant.variantcontext.VariantContext; 7 import htsjdk.variant.vcf.VCFFormatHeaderLine; 8 import org.apache.commons.lang.mutable.MutableInt; 9 import org.apache.logging.log4j.LogManager; 10 import org.apache.logging.log4j.Logger; 11 import org.broadinstitute.barclay.help.DocumentedFeature; 12 import org.broadinstitute.hellbender.engine.ReferenceContext; 13 import org.broadinstitute.hellbender.utils.QualityUtils; 14 import org.broadinstitute.hellbender.utils.Utils; 15 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; 16 import org.broadinstitute.hellbender.utils.help.HelpConstants; 17 import org.broadinstitute.hellbender.utils.pileup.PileupElement; 18 import org.broadinstitute.hellbender.utils.pileup.ReadPileup; 19 import org.broadinstitute.hellbender.utils.read.GATKRead; 20 import org.broadinstitute.hellbender.utils.read.ReadUtils; 21 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; 22 import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; 23 import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; 24 25 import java.util.*; 26 import java.util.function.Function; 27 import java.util.stream.Collectors; 28 29 /** 30 * Count of read pairs in the F1R2 and F2R1 configurations supporting the reference and alternate alleles 31 * 32 * <p>This is an annotation that gathers information about the read pair configuration for the reads supporting each 33 * allele. It can be used along with downstream filtering steps to identify and filter out erroneous variants that occur 34 * with higher frequency in one read pair orientation.</p> 35 * 36 * <h3>References</h3> 37 * <p>For more details about the mechanism of oxoG artifact generation, see <a href='http://www.ncbi.nlm.nih.gov/pubmed/23303777' target='_blank'> 38 * <i></i>Discovery and characterization of artefactual mutations in deep coverage targeted capture sequencing data due to oxidative DNA damage during sample preparation.</i> 39 * by Costello et al, doi: 10.1093/nar/gks1443</a></p> 40 */ 41 @DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Count of read pairs in the F1R2 and F2R1 configurations supporting REF and ALT alleles (F1R2, F2R1)") 42 public final class OrientationBiasReadCounts extends GenotypeAnnotation implements StandardMutectAnnotation { 43 44 private static final Logger logger = LogManager.getLogger(OrientationBiasReadCounts.class); 45 46 private static final int MINIMUM_BASE_QUALITY = 20; 47 48 @Override getKeyNames()49 public List<String> getKeyNames() { 50 return Arrays.asList(GATKVCFConstants.F1R2_KEY, GATKVCFConstants.F2R1_KEY); 51 } 52 53 @Override getDescriptions()54 public List<VCFFormatHeaderLine> getDescriptions() { 55 return Arrays.asList( 56 GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.F1R2_KEY), 57 GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.F2R1_KEY)); 58 } 59 60 @Override annotate(final ReferenceContext refContext, final VariantContext vc, final Genotype g, final GenotypeBuilder gb, final AlleleLikelihoods<GATKRead, Allele> likelihoods)61 public void annotate(final ReferenceContext refContext, 62 final VariantContext vc, 63 final Genotype g, 64 final GenotypeBuilder gb, 65 final AlleleLikelihoods<GATKRead, Allele> likelihoods){ 66 Utils.nonNull(gb, "gb is null"); 67 Utils.nonNull(vc, "vc is null"); 68 69 if (g == null || likelihoods == null) { 70 return; 71 } 72 73 final Map<Allele, MutableInt> f1r2Counts = likelihoods.alleles().stream() 74 .collect(Collectors.toMap(a -> a, a -> new MutableInt(0))); 75 76 final Map<Allele, MutableInt> f2r1Counts = likelihoods.alleles().stream() 77 .collect(Collectors.toMap(a -> a, a -> new MutableInt(0))); 78 79 Utils.stream(likelihoods.bestAllelesBreakingTies(g.getSampleName())) 80 .filter(ba -> ba.isInformative() && isUsableRead(ba.evidence) && BaseQualityRankSumTest.getReadBaseQuality(ba.evidence, vc).orElse(0) >= MINIMUM_BASE_QUALITY) 81 .forEach(ba -> (ReadUtils.isF2R1(ba.evidence) ? f2r1Counts : f1r2Counts).get(ba.allele).increment()); 82 83 final int[] f1r2 = vc.getAlleles().stream().mapToInt(a -> f1r2Counts.get(a).intValue()).toArray(); 84 85 final int[] f2r1 = vc.getAlleles().stream().mapToInt(a -> f2r1Counts.get(a).intValue()).toArray(); 86 87 gb.attribute(GATKVCFConstants.F1R2_KEY, f1r2); 88 gb.attribute(GATKVCFConstants.F2R1_KEY, f2r1); 89 } 90 91 /** 92 * Annotate the given variant context with the OxoG read count attributes, directly from the read pileup. 93 * 94 * This method may be slow and should be considered EXPERIMENTAL, especially with regard to indels and complex/mixed 95 * variants. 96 * 97 * @param vc variant context for the genotype. Necessary so that we can see all alleles. 98 * @param gb genotype builder to put the annotations into. 99 * @param readPileup pileup of the reads at this vc. Note that this pileup does not have to match the 100 * genotype. In other words, this tool does not check that the pileup was generated from the 101 * genotype sample. 102 */ annotateSingleVariant(final VariantContext vc, final GenotypeBuilder gb, final ReadPileup readPileup, int meanBaseQualityCutoff)103 public static void annotateSingleVariant(final VariantContext vc, final GenotypeBuilder gb, 104 final ReadPileup readPileup, int meanBaseQualityCutoff) { 105 Utils.nonNull(gb, "gb is null"); 106 Utils.nonNull(vc, "vc is null"); 107 108 // Create a list of unique alleles 109 final List<Allele> variantAllelesWithDupes = vc.getAlleles(); 110 final Set<Allele> alleleSet = new LinkedHashSet<>(variantAllelesWithDupes); 111 final List<Allele> variantAlleles = new ArrayList<>(alleleSet); 112 113 // Initialize the mappings 114 final Map<Allele, MutableInt> f1r2Counts = variantAlleles.stream() 115 .collect(Collectors.toMap(Function.identity(), a -> new MutableInt(0))); 116 117 final Map<Allele, MutableInt> f2r1Counts = variantAlleles.stream() 118 .collect(Collectors.toMap(Function.identity(), a -> new MutableInt(0))); 119 120 final List<Allele> referenceAlleles = variantAlleles.stream().filter(a -> a.isReference() && !a.isSymbolic()).collect(Collectors.toList()); 121 final List<Allele> altAlleles = variantAlleles.stream().filter(a -> a.isNonReference() && !a.isSymbolic()).collect(Collectors.toList()); 122 123 if (referenceAlleles.size() != 1) { 124 logger.warn("Number of reference alleles does not equal for VC: " + vc); 125 } 126 127 // We MUST have exactly 1 non-symbolic reference allele and a read pileup, 128 if ((referenceAlleles.size() == 1) && (readPileup != null) && !referenceAlleles.get(0).isSymbolic()) { 129 final Allele referenceAllele = referenceAlleles.get(0); 130 Utils.stream(readPileup) 131 .filter(pe -> isUsableRead(pe.getRead())) 132 .forEach(pe -> incrementCounts(pe, f1r2Counts, f2r1Counts, referenceAllele, altAlleles, meanBaseQualityCutoff)); 133 } 134 135 final int[] f1r2 = variantAlleles.stream().mapToInt(a -> f1r2Counts.get(a).intValue()).toArray(); 136 137 final int[] f2r1 = variantAlleles.stream().mapToInt(a -> f2r1Counts.get(a).intValue()).toArray(); 138 139 gb.attribute(GATKVCFConstants.F1R2_KEY, f1r2); 140 gb.attribute(GATKVCFConstants.F2R1_KEY, f2r1); 141 } 142 143 /** 144 * If the allele is not in the count mappings, then it is not counted. No exception will be thrown 145 * Modifies count variables in place. 146 * 147 * @param pileupElement pileup overlapping the alleles 148 * @param f1r2Counts a mapping of allele to f1r2 counts 149 * @param f2r1Counts a mapping of allele to f2r1 counts 150 */ incrementCounts(final PileupElement pileupElement, final Map<Allele, MutableInt> f1r2Counts, final Map<Allele, MutableInt> f2r1Counts, final Allele referenceAllele, final List<Allele> altAlleles, int minBaseQualityCutoff)151 private static void incrementCounts(final PileupElement pileupElement, final Map<Allele, MutableInt> f1r2Counts, 152 final Map<Allele, MutableInt> f2r1Counts, final Allele referenceAllele, 153 final List<Allele> altAlleles, int minBaseQualityCutoff) { 154 155 final Map<Allele, MutableInt> countMap = ReadUtils.isF2R1(pileupElement.getRead()) ? f2r1Counts : f1r2Counts; 156 157 final Allele pileupAllele = GATKVariantContextUtils.chooseAlleleForRead(pileupElement, referenceAllele, altAlleles, minBaseQualityCutoff); 158 159 if (pileupAllele == null) { 160 return; 161 } 162 163 if (countMap.containsKey(pileupAllele)) { 164 countMap.get(pileupAllele).increment(); 165 } 166 } 167 isUsableRead(final GATKRead read)168 protected static boolean isUsableRead(final GATKRead read) { 169 return read.getMappingQuality() != 0 && read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE; 170 } 171 172 }