1 package org.broadinstitute.hellbender.tools.walkers.annotator; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import htsjdk.variant.variantcontext.Allele; 5 import htsjdk.variant.variantcontext.GenotypesContext; 6 import htsjdk.variant.variantcontext.VariantContext; 7 import org.apache.commons.lang3.tuple.Pair; 8 import org.broadinstitute.barclay.help.DocumentedFeature; 9 import org.broadinstitute.hellbender.engine.GATKPath; 10 import org.broadinstitute.hellbender.engine.ReferenceContext; 11 import org.broadinstitute.hellbender.utils.GenotypeCounts; 12 import org.broadinstitute.hellbender.utils.GenotypeUtils; 13 import org.broadinstitute.hellbender.utils.Utils; 14 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; 15 import org.broadinstitute.hellbender.utils.help.HelpConstants; 16 import org.broadinstitute.hellbender.utils.logging.OneShotLogger; 17 import org.broadinstitute.hellbender.utils.read.GATKRead; 18 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; 19 20 import java.util.Collections; 21 import java.util.List; 22 import java.util.Map; 23 import java.util.Set; 24 25 26 /** 27 * Likelihood-based test for the consanguinuity among samples 28 * 29 * <p>This annotation estimates whether there is evidence of consanguinuity in a population. The higher the score, the 30 * higher the chance that some samples are related. If samples are known to be related, a pedigree file can be provided so 31 * that the calculation is only performed on founders and offspring are excluded.</p> 32 * 33 * <h3>Details</h3> 34 * <p>The output is the inbreeding coefficient 'F' (fixation) statistic, which for large sample sizes converges to the probability 35 * that an individual's two alleles are identical by descent, provided that cosanguinity is the only source of deviation from Hardy-Weinberg equilibrium. 36 * If this assumption is not true F may be negative and the excess heterozygosity often indicates an artifactual variant. 37 * It is calculated as F = 1 - (# of het genotypes)/(# of het genotypes expected under Hardy-Weinberg equilibrium). The number of het genotypes expeced under Hardy-Weinberg equilibrium 38 * is 2*(# of samples)*(ref allele frequency)*(alt allele frequency), where allele frequencies are calculated from the samples' genotypes.</p> 39 * 40 * <h3>Caveats</h3> 41 * <ul> 42 * <li>The Inbreeding Coefficient annotation can only be calculated for cohorts containing at least 10 founder samples.</li> 43 * <li>The Inbreeding Coefficient annotation can only be calculated for diploid samples.</li> 44 * </ul> 45 * 46 * <h3>Related annotations</h3> 47 * <p><b>ExcessHet</b> also describes the heterozygosity of the called samples, giving a probability of excess heterozygosity being observed</p> 48 */ 49 @DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Likelihood-based test for the consanguinity among samples (InbreedingCoeff)") 50 public final class InbreedingCoeff extends PedigreeAnnotation implements StandardAnnotation { 51 52 private static final OneShotLogger oneShotLogger = new OneShotLogger(InbreedingCoeff.class); 53 private static final int MIN_SAMPLES = 10; 54 private static final boolean ROUND_GENOTYPE_COUNTS = false; 55 InbreedingCoeff()56 public InbreedingCoeff(){ 57 super((Set<String>) null); 58 } 59 InbreedingCoeff(final Set<String> founderIds)60 public InbreedingCoeff(final Set<String> founderIds){ 61 super(founderIds); 62 } 63 InbreedingCoeff(final GATKPath pedigreeFile)64 public InbreedingCoeff(final GATKPath pedigreeFile){ 65 super(pedigreeFile); 66 } 67 68 @Override annotate(final ReferenceContext ref, final VariantContext vc, final AlleleLikelihoods<GATKRead, Allele> likelihoods)69 public Map<String, Object> annotate(final ReferenceContext ref, 70 final VariantContext vc, 71 final AlleleLikelihoods<GATKRead, Allele> likelihoods) { 72 Utils.nonNull(vc); 73 final GenotypesContext genotypes = getFounderGenotypes(vc); 74 if (genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant()) { 75 oneShotLogger.warn("InbreedingCoeff will not be calculated at position " + vc.getContig() + ":" + vc.getStart() + 76 " and possibly subsequent; at least " + MIN_SAMPLES + " samples must have called genotypes"); 77 return Collections.emptyMap(); 78 } 79 final Pair<Integer, Double> sampleCountCoeff = calculateIC(vc, genotypes); 80 final int sampleCount = sampleCountCoeff.getLeft(); 81 final double F = sampleCountCoeff.getRight(); 82 if (sampleCount < MIN_SAMPLES) { 83 oneShotLogger.warn("InbreedingCoeff will not be calculated for at least one position; at least " + MIN_SAMPLES + " samples must have called genotypes"); 84 return Collections.emptyMap(); 85 } 86 return Collections.singletonMap(getKeyNames().get(0), String.format("%.4f", F)); 87 } 88 89 @VisibleForTesting calculateIC(final VariantContext vc, final GenotypesContext genotypes)90 static Pair<Integer, Double> calculateIC(final VariantContext vc, final GenotypesContext genotypes) { 91 final GenotypeCounts t = GenotypeUtils.computeDiploidGenotypeCounts(vc, genotypes, ROUND_GENOTYPE_COUNTS); 92 93 final double refCount = t.getRefs(); 94 final double hetCount = t.getHets(); 95 final double homCount = t.getHoms(); 96 // number of samples that have likelihoods 97 final int sampleCount = (int) genotypes.stream().filter(g-> GenotypeUtils.isDiploidWithLikelihoods(g)).count(); 98 99 final double p = ( 2.0 * refCount + hetCount ) / ( 2.0 * (refCount + hetCount + homCount) ); // expected reference allele frequency 100 final double q = 1.0 - p; // expected alternative allele frequency 101 final double expectedHets = 2.0 * p * q * sampleCount; //numbers of hets that would be expected based on the allele frequency (asuming Hardy Weinberg Equilibrium) 102 final double F = 1.0 - ( hetCount / expectedHets ); // inbreeding coefficient 103 104 return Pair.of(sampleCount, F); 105 } 106 107 @Override getKeyNames()108 public List<String> getKeyNames() { return Collections.singletonList(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY); } 109 }