1 package org.broadinstitute.hellbender.tools.walkers.annotator;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import htsjdk.variant.variantcontext.Allele;
5 import htsjdk.variant.variantcontext.GenotypesContext;
6 import htsjdk.variant.variantcontext.VariantContext;
7 import org.apache.commons.lang3.tuple.Pair;
8 import org.broadinstitute.barclay.help.DocumentedFeature;
9 import org.broadinstitute.hellbender.engine.GATKPath;
10 import org.broadinstitute.hellbender.engine.ReferenceContext;
11 import org.broadinstitute.hellbender.utils.GenotypeCounts;
12 import org.broadinstitute.hellbender.utils.GenotypeUtils;
13 import org.broadinstitute.hellbender.utils.Utils;
14 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
15 import org.broadinstitute.hellbender.utils.help.HelpConstants;
16 import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
17 import org.broadinstitute.hellbender.utils.read.GATKRead;
18 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
19 
20 import java.util.Collections;
21 import java.util.List;
22 import java.util.Map;
23 import java.util.Set;
24 
25 
26 /**
27  * Likelihood-based test for the consanguinuity among samples
28  *
29  * <p>This annotation estimates whether there is evidence of consanguinuity in a population. The higher the score, the
30  * higher the chance that some samples are related. If samples are known to be related, a pedigree file can be provided so
31  * that the calculation is only performed on founders and offspring are excluded.</p>
32  *
33  * <h3>Details</h3>
34  * <p>The output is the inbreeding coefficient 'F' (fixation) statistic, which for large sample sizes converges to the probability
35  * that an individual's two alleles are identical by descent, provided that cosanguinity is the only source of deviation from Hardy-Weinberg equilibrium.
36  * If this assumption is not true F may be negative and the excess heterozygosity often indicates an artifactual variant.
37  * It is calculated as F = 1 - (# of het genotypes)/(# of het genotypes expected under Hardy-Weinberg equilibrium).  The number of het genotypes expeced under Hardy-Weinberg equilibrium
38  * is 2*(# of samples)*(ref allele frequency)*(alt allele frequency), where allele frequencies are calculated from the samples' genotypes.</p>
39  *
40  * <h3>Caveats</h3>
41  * <ul>
42  * <li>The Inbreeding Coefficient annotation can only be calculated for cohorts containing at least 10 founder samples.</li>
43  * <li>The Inbreeding Coefficient annotation can only be calculated for diploid samples.</li>
44  * </ul>
45  *
46  * <h3>Related annotations</h3>
47  * <p><b>ExcessHet</b> also describes the heterozygosity of the called samples, giving a probability of excess heterozygosity being observed</p>
48  */
49 @DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Likelihood-based test for the consanguinity among samples (InbreedingCoeff)")
50 public final class InbreedingCoeff extends PedigreeAnnotation implements StandardAnnotation {
51 
52     private static final OneShotLogger oneShotLogger = new OneShotLogger(InbreedingCoeff.class);
53     private static final int MIN_SAMPLES = 10;
54     private static final boolean ROUND_GENOTYPE_COUNTS = false;
55 
InbreedingCoeff()56     public InbreedingCoeff(){
57         super((Set<String>) null);
58     }
59 
InbreedingCoeff(final Set<String> founderIds)60     public InbreedingCoeff(final Set<String> founderIds){
61         super(founderIds);
62      }
63 
InbreedingCoeff(final GATKPath pedigreeFile)64     public InbreedingCoeff(final GATKPath pedigreeFile){
65         super(pedigreeFile);
66     }
67 
68     @Override
annotate(final ReferenceContext ref, final VariantContext vc, final AlleleLikelihoods<GATKRead, Allele> likelihoods)69     public Map<String, Object> annotate(final ReferenceContext ref,
70                                         final VariantContext vc,
71                                         final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
72         Utils.nonNull(vc);
73         final GenotypesContext genotypes = getFounderGenotypes(vc);
74         if (genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant()) {
75             oneShotLogger.warn("InbreedingCoeff will not be calculated at position " + vc.getContig() + ":" + vc.getStart() +
76                     " and possibly subsequent; at least " + MIN_SAMPLES + " samples must have called genotypes");
77             return Collections.emptyMap();
78         }
79         final Pair<Integer, Double> sampleCountCoeff = calculateIC(vc, genotypes);
80         final int sampleCount = sampleCountCoeff.getLeft();
81         final double F = sampleCountCoeff.getRight();
82         if (sampleCount < MIN_SAMPLES) {
83             oneShotLogger.warn("InbreedingCoeff will not be calculated for at least one position; at least " + MIN_SAMPLES + " samples must have called genotypes");
84             return Collections.emptyMap();
85         }
86         return Collections.singletonMap(getKeyNames().get(0), String.format("%.4f", F));
87     }
88 
89     @VisibleForTesting
calculateIC(final VariantContext vc, final GenotypesContext genotypes)90     static Pair<Integer, Double> calculateIC(final VariantContext vc, final GenotypesContext genotypes) {
91         final GenotypeCounts t = GenotypeUtils.computeDiploidGenotypeCounts(vc, genotypes, ROUND_GENOTYPE_COUNTS);
92 
93         final double refCount = t.getRefs();
94         final double hetCount = t.getHets();
95         final double homCount = t.getHoms();
96         // number of samples that have likelihoods
97         final int sampleCount = (int) genotypes.stream().filter(g-> GenotypeUtils.isDiploidWithLikelihoods(g)).count();
98 
99         final double p = ( 2.0 * refCount + hetCount ) / ( 2.0 * (refCount + hetCount + homCount) ); // expected reference allele frequency
100         final double q = 1.0 - p; // expected alternative allele frequency
101         final double expectedHets = 2.0 * p * q * sampleCount; //numbers of hets that would be expected based on the allele frequency (asuming Hardy Weinberg Equilibrium)
102         final double F = 1.0 - ( hetCount / expectedHets ); // inbreeding coefficient
103 
104         return Pair.of(sampleCount, F);
105     }
106 
107     @Override
getKeyNames()108     public List<String> getKeyNames() { return Collections.singletonList(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY); }
109 }