1 package org.broadinstitute.hellbender.tools.walkers.annotator; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import htsjdk.variant.variantcontext.Allele; 5 import htsjdk.variant.variantcontext.VariantContext; 6 import org.apache.logging.log4j.LogManager; 7 import org.apache.logging.log4j.Logger; 8 import org.broadinstitute.barclay.help.DocumentedFeature; 9 import org.broadinstitute.hellbender.engine.GATKPath; 10 import org.broadinstitute.hellbender.engine.ReferenceContext; 11 import org.broadinstitute.hellbender.utils.Utils; 12 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; 13 import org.broadinstitute.hellbender.utils.help.HelpConstants; 14 import org.broadinstitute.hellbender.utils.read.GATKRead; 15 import org.broadinstitute.hellbender.utils.samples.MendelianViolation; 16 import org.broadinstitute.hellbender.utils.samples.Trio; 17 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; 18 19 import java.util.*; 20 21 /** 22 * Existence of a de novo mutation in at least one of the given families 23 * 24 * <p>This annotation uses the genotype information from individuals in family trios to identify possible de novo mutations and the sample(s) in which they occur. This works best if the genotypes have been processed according to the <a href="https://www.broadinstitute.org/gatk/guide/article?id=4723">Genotype Refinement workflow</a>.</p> 25 * 26 * <h3>Caveats</h3> 27 * <ul> 28 * <li>The calculation assumes that the organism is diploid.</li> 29 * <li>This annotation requires a valid pedigree file.</li> 30 * <li>Only reports possible de novos for children whose genotypes have not been tagged as filtered (which is most appropriate if parent likelihoods 31 * have already been factored in using PhaseByTransmission).</li> 32 * <li>When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios.</li> 33 * <li>This annotation can only be used from the Variant Annotator. If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.</li> 34 * </ul> 35 */ 36 @DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Existence of a de novo mutation in at least one of the given families (hiConfDeNovo, loConfDeNovo)") 37 public final class PossibleDeNovo extends PedigreeAnnotation { 38 protected final Logger warning = LogManager.getLogger(this.getClass()); 39 private final MendelianViolation mendelianViolation; 40 private Set<Trio> trios; 41 42 @VisibleForTesting PossibleDeNovo(final Set<Trio> trios, final double minGenotypeQualityP)43 public PossibleDeNovo(final Set<Trio> trios, final double minGenotypeQualityP) { 44 super((Set<String>) null); 45 this.trios = Collections.unmodifiableSet(new LinkedHashSet<>(trios)); 46 mendelianViolation = new MendelianViolation(minGenotypeQualityP); 47 } 48 PossibleDeNovo(final GATKPath pedigreeFile)49 public PossibleDeNovo(final GATKPath pedigreeFile){ 50 super(pedigreeFile); 51 mendelianViolation = new MendelianViolation(DEFAULT_MIN_GENOTYPE_QUALITY_P); 52 } 53 PossibleDeNovo()54 public PossibleDeNovo(){ 55 super((Set<String>) null); 56 mendelianViolation = new MendelianViolation(DEFAULT_MIN_GENOTYPE_QUALITY_P); 57 } 58 59 @Override validateArguments(Collection<String> founderIds, GATKPath pedigreeFile)60 void validateArguments(Collection<String> founderIds, GATKPath pedigreeFile) { 61 if (pedigreeFile == null) { 62 if ((founderIds != null && !founderIds.isEmpty())) { 63 warning.warn("PossibleDenovo annotation will not be calculated, must provide a valid PED file (-ped). Founder-id arguments cannot be used for this annotation"); 64 } else { 65 warning.warn("PossibleDenovo Annotation will not be calculated, must provide a valid PED file (-ped) from the command line."); 66 } 67 } else { 68 if ((founderIds != null && !founderIds.isEmpty())) { 69 warning.warn("PossibleDenovo annotation does not take founder-id arguments, trio information will be extracted only from the provided PED file"); 70 } 71 } 72 } 73 74 // Static thresholds for the denovo calculation 75 public final static double DEFAULT_MIN_GENOTYPE_QUALITY_P = 0; // TODO should this be exposed as a command line argument? 76 private static final int hi_GQ_threshold = 20; //WARNING - If you change this value, update the description in GATKVCFHeaderLines 77 private static final int lo_GQ_threshold = 10; //WARNING - If you change this value, update the description in GATKVCFHeaderLines 78 private static final double percentOfSamplesCutoff = 0.001; //for many, many samples use 0.1% of samples as allele frequency threshold for de novos 79 private static final int flatNumberOfSamplesCutoff = 4; 80 initializeAndGetTrios()81 private Set<Trio> initializeAndGetTrios() { 82 if (trios == null) { 83 trios = getTrios(); 84 } 85 return trios; 86 } 87 88 @Override getKeyNames()89 public List<String> getKeyNames() { 90 return Arrays.asList( 91 GATKVCFConstants.HI_CONF_DENOVO_KEY, 92 GATKVCFConstants.LO_CONF_DENOVO_KEY); 93 } 94 95 @Override annotate(final ReferenceContext ref, final VariantContext vc, final AlleleLikelihoods<GATKRead, Allele> likelihoods)96 public Map<String, Object> annotate(final ReferenceContext ref, 97 final VariantContext vc, 98 final AlleleLikelihoods<GATKRead, Allele> likelihoods) { 99 Utils.nonNull(vc); 100 Set<Trio> trioSet = initializeAndGetTrios(); 101 if (trioSet.isEmpty()){ 102 return Collections.emptyMap(); 103 } 104 final List<String> highConfDeNovoChildren = new ArrayList<>(); 105 final List<String> lowConfDeNovoChildren = new ArrayList<>(); 106 for (final Trio trio : trioSet) { 107 if (vc.isBiallelic() && 108 PossibleDeNovo.contextHasTrioLikelihoods(vc, trio) && 109 mendelianViolation.isViolation(trio.getMother(), trio.getFather(), trio.getChild(), vc) && 110 mendelianViolation.getParentsRefRefChildHet() > 0) { 111 112 final int childGQ = vc.getGenotype(trio.getChildID()).getGQ(); 113 final int momGQ = vc.getGenotype(trio.getMaternalID()).getGQ(); 114 final int dadGQ = vc.getGenotype(trio.getPaternalID()).getGQ(); 115 116 if (childGQ >= hi_GQ_threshold && momGQ >= hi_GQ_threshold && dadGQ >= hi_GQ_threshold) { 117 highConfDeNovoChildren.add(trio.getChildID()); 118 } else if (childGQ >= lo_GQ_threshold && momGQ > 0 && dadGQ > 0) { 119 lowConfDeNovoChildren.add(trio.getChildID()); 120 } 121 } 122 } 123 124 final double percentNumberOfSamplesCutoff = vc.getNSamples()*percentOfSamplesCutoff; 125 final double AFcutoff = Math.max(flatNumberOfSamplesCutoff, percentNumberOfSamplesCutoff); 126 final int deNovoAlleleCount = vc.getCalledChrCount(vc.getAlternateAllele(0)); //we assume we're biallelic above so use the first alt 127 128 final Map<String,Object> attributeMap = new LinkedHashMap<>(2); 129 if ( !highConfDeNovoChildren.isEmpty() && deNovoAlleleCount < AFcutoff ) { 130 attributeMap.put(GATKVCFConstants.HI_CONF_DENOVO_KEY, highConfDeNovoChildren); 131 } 132 if ( !lowConfDeNovoChildren.isEmpty() && deNovoAlleleCount < AFcutoff ) { 133 attributeMap.put(GATKVCFConstants.LO_CONF_DENOVO_KEY, lowConfDeNovoChildren); 134 } 135 return attributeMap; 136 } 137 contextHasTrioLikelihoods(final VariantContext vc, final Trio trio)138 private static boolean contextHasTrioLikelihoods(final VariantContext vc, final Trio trio) { 139 final String mom = trio.getMaternalID(); 140 final String dad = trio.getPaternalID(); 141 final String kid = trio.getChildID(); 142 143 return (!mom.isEmpty() && vc.hasGenotype(mom) && vc.getGenotype(mom).hasLikelihoods()) 144 && (!dad.isEmpty() && vc.hasGenotype(dad) && vc.getGenotype(dad).hasLikelihoods()) 145 && (!kid.isEmpty() && vc.hasGenotype(kid) && vc.getGenotype(kid).hasLikelihoods()); 146 } 147 148 } 149