1 
2 package org.broadinstitute.hellbender.tools.walkers.annotator;
3 
4 import com.google.common.annotations.VisibleForTesting;
5 import htsjdk.variant.variantcontext.Allele;
6 import htsjdk.variant.variantcontext.GenotypesContext;
7 import htsjdk.variant.variantcontext.VariantContext;
8 import org.apache.commons.lang3.tuple.Pair;
9 import org.apache.commons.math3.stat.StatUtils;
10 import org.broadinstitute.barclay.help.DocumentedFeature;
11 import org.broadinstitute.hellbender.engine.GATKPath;
12 import org.broadinstitute.hellbender.engine.ReferenceContext;
13 import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotationData;
14 import org.broadinstitute.hellbender.utils.GenotypeCounts;
15 import org.broadinstitute.hellbender.utils.GenotypeUtils;
16 import org.broadinstitute.hellbender.utils.Utils;
17 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
18 import org.broadinstitute.hellbender.utils.help.HelpConstants;
19 import org.broadinstitute.hellbender.utils.read.GATKRead;
20 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
21 
22 import java.util.*;
23 
24 /**
25  * Phred-scaled p-value for exact test of excess heterozygosity.
26  *
27  * <p>This annotation estimates the probability of the called samples exhibiting excess heterozygosity with respect to the null hypothesis that the samples are unrelated. The higher the score, the
28  * higher the chance that the variant is a technical artifact or that there is consanguinity among the samples. In
29  * contrast to Inbreeding Coefficient, there is no minimal number of samples for this annotation. If samples are known to be related, a pedigree file can be provided so
30  * that the calculation is only performed on founders and offspring are excluded.</p>
31  *
32  * <h3>Statistical notes</h3>
33  * <p>This annotation uses the implementation from
34  * <a href='http://www.sciencedirect.com/science/article/pii/S0002929707607356?via%3Dihub'>Wigginton JE, Cutler DJ, Abecasis GR. <i>A Note on Exact Tests of Hardy-Weinberg Equilibrium. American Journal of Human Genetics</i>. 2005;76(5):887-893</a>.
35  *
36  * <h3>Caveat</h3>
37  * <p>The Excess Heterozygosity annotation can only be calculated for diploid samples.</p>
38  *
39  * <h3>Related annotations</h3>
40  * <p><b>InbreedingCoeff</b> also describes the heterozygosity of the called samples, though without explicitly taking into account the number of samples</p>
41  */
42 @DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Phred-scaled p-value for exact test of excess heterozygosity (ExcessHet)")
43 public final class ExcessHet extends PedigreeAnnotation implements StandardAnnotation {
44 
45     private static final double MIN_NEEDED_VALUE = 1.0E-16;
46     private static final boolean ROUND_GENOTYPE_COUNTS = true;
47 
48     public static final double PHRED_SCALED_MIN_P_VALUE = -10.0 * Math.log10(MIN_NEEDED_VALUE);
49     public static final int NUMBER_OF_GENOTYPE_COUNTS = 3;
50 
ExcessHet(final Set<String> founderIds)51     public ExcessHet(final Set<String> founderIds){
52         super(founderIds);
53     }
54 
ExcessHet(final GATKPath pedigreeFile)55     public ExcessHet(final GATKPath pedigreeFile){
56         super(pedigreeFile);
57     }
58 
ExcessHet()59     public ExcessHet() {
60         this((Set<String>) null);
61     }
62 
63     @Override
annotate(final ReferenceContext ref, final VariantContext vc, final AlleleLikelihoods<GATKRead, Allele> likelihoods)64     public Map<String, Object> annotate(final ReferenceContext ref,
65                                         final VariantContext vc,
66                                         final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
67         GenotypesContext genotypes = getFounderGenotypes(vc);
68         if (genotypes == null || !vc.isVariant()) {
69             return Collections.emptyMap();
70         }
71         final Pair<Integer, Double> sampleCountEH = calculateEH(vc, genotypes);
72         final int sampleCount = sampleCountEH.getLeft();
73         final double eh =  sampleCountEH.getRight();
74 
75         if (sampleCount < 1) {
76             return Collections.emptyMap();
77         }
78         return Collections.singletonMap(getKeyNames().get(0), (Object) String.format("%.4f", eh));
79     }
80 
81     @VisibleForTesting
calculateEH(final VariantContext vc, final GenotypesContext genotypes)82     static Pair<Integer, Double> calculateEH(final VariantContext vc, final GenotypesContext genotypes) {
83         final GenotypeCounts t = GenotypeUtils.computeDiploidGenotypeCounts(vc, genotypes, ROUND_GENOTYPE_COUNTS);
84         // number of samples that have likelihoods
85         final int sampleCount = (int) genotypes.stream().filter(g->GenotypeUtils.isDiploidWithLikelihoods(g)).count();
86 
87         return calculateEH(vc, t, sampleCount);
88     }
89 
90     @VisibleForTesting
calculateEH(final VariantContext vc, final GenotypeCounts t, final int sampleCount)91     public static Pair<Integer, Double> calculateEH(final VariantContext vc, final GenotypeCounts t, final int sampleCount) {
92         final int refCount = (int)t.getRefs();
93         final int hetCount = (int)t.getHets();
94         final int homCount = (int)t.getHoms();
95 
96         final double pval = exactTest(hetCount, refCount, homCount);
97 
98         //If the actual phredPval would be infinity we will probably still filter out just a very large number
99         //Since the method does not guarantee precision for any p-value smaller than 1e-16, we can return the phred scaled version
100         if (pval < 10e-60) {
101             return Pair.of(sampleCount, PHRED_SCALED_MIN_P_VALUE);
102         }
103         final double phredPval = -10.0 * Math.log10(pval);
104 
105         return Pair.of(sampleCount, phredPval);
106     }
107 
108     /**
109      * Note that this method is not accurate for very small p-values. Beyond 1.0E-16 there is no guarantee that the
110      * p-value is accurate, just that it is in fact smaller than 1.0E-16 (and therefore we should filter it). It would
111      * be more computationally expensive to calculate accuracy beyond a given threshold. Here we have enough accuracy
112      * to filter anything below a p-value of 10E-6.
113      *
114      * @param hetCount Number of observed hets (n_ab)
115      * @param refCount Number of observed homRefs (n_aa)
116      * @param homCount Number of observed homVars (n_bb)
117      * @return Right sided p-value or the probability of getting the observed or higher number of hets given the sample
118      * size (N) and the observed number of allele a (rareCopies)
119      */
120     @VisibleForTesting
exactTest(final int hetCount, final int refCount, final int homCount)121     static double exactTest(final int hetCount, final int refCount, final int homCount) {
122         Utils.validateArg(hetCount >= 0, "Het count cannot be less than 0");
123         Utils.validateArg(refCount >= 0, "Ref count cannot be less than 0");
124         Utils.validateArg(homCount >= 0, "Hom count cannot be less than 0");
125 
126         //Split into observed common allele and rare allele
127         final int obsHomR;
128         final int obsHomC;
129         if (refCount < homCount) {
130             obsHomR = refCount;
131             obsHomC = homCount;
132         } else {
133             obsHomR = homCount;
134             obsHomC = refCount;
135         }
136 
137         final int rareCopies = 2 * obsHomR + hetCount;
138         final int N = hetCount + obsHomC + obsHomR;
139 
140         //If the probability distribution has only 1 point, then the mid p-value is .5
141         if (rareCopies <= 1) {
142             return .5;
143         }
144 
145         final double[] probs = new double[rareCopies + 1];
146 
147         //Find (something close to the) mode for the midpoint
148         int mid = (int) Math.floor(rareCopies * (2.0 * N - rareCopies) / (2.0 * N - 1.0));
149         if ((mid % 2) != (rareCopies % 2)) {
150             mid++;
151         }
152 
153         probs[mid] = 1.0;
154         double mysum = 1.0;
155 
156         //Calculate probabilities from midpoint down
157         int currHets = mid;
158         int currHomR = (rareCopies - mid) / 2;
159         int currHomC = N - currHets - currHomR;
160 
161         while (currHets >= 2) {
162             final double potentialProb = probs[currHets] * currHets * (currHets - 1.0) / (4.0 * (currHomR + 1.0) * (currHomC + 1.0));
163             if (potentialProb < MIN_NEEDED_VALUE) {
164                 break;
165             }
166 
167             probs[currHets - 2] = potentialProb;
168             mysum = mysum + probs[currHets - 2];
169 
170             //2 fewer hets means one additional homR and homC each
171             currHets = currHets - 2;
172             currHomR = currHomR + 1;
173             currHomC = currHomC + 1;
174         }
175 
176         //Calculate probabilities from midpoint up
177         currHets = mid;
178         currHomR = (rareCopies - mid) / 2;
179         currHomC = N - currHets - currHomR;
180 
181         while (currHets <= rareCopies - 2) {
182             final double potentialProb = probs[currHets] * 4.0 * currHomR * currHomC / ((currHets + 2.0) * (currHets + 1.0));
183             if (potentialProb < MIN_NEEDED_VALUE) {
184                 break;
185             }
186 
187             probs[currHets + 2] = potentialProb;
188             mysum = mysum + probs[currHets + 2];
189 
190             //2 more hets means 1 fewer homR and homC each
191             currHets = currHets + 2;
192             currHomR = currHomR - 1;
193             currHomC = currHomC - 1;
194         }
195 
196         final double rightPval = probs[hetCount] / (2.0 * mysum);
197         //Check if we observed the highest possible number of hets
198         if (hetCount == rareCopies) {
199             return rightPval;
200         }
201         return rightPval + StatUtils.sum(Arrays.copyOfRange(probs, hetCount + 1, probs.length)) / mysum;
202     }
203 
204     @Override
getKeyNames()205     public List<String> getKeyNames() {
206         return Collections.singletonList(GATKVCFConstants.EXCESS_HET_KEY);
207     }
208 
209     //@Override
getRawKeyName()210     public String getRawKeyName() {
211         return GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY;
212     }
213 
214     /**
215      * Generate the raw data necessary to calculate the annotation. Raw data is the final endpoint for gVCFs.
216      *
217      * @param ref         the reference context for this annotation
218      * @param vc          the variant context to annotate
219      * @param likelihoods likelihoods indexed by sample, allele, and read within sample
220      */
221     //@Override
annotateRawData(ReferenceContext ref, VariantContext vc, AlleleLikelihoods<GATKRead, Allele> likelihoods)222     public Map<String, Object> annotateRawData(ReferenceContext ref, VariantContext vc, AlleleLikelihoods<GATKRead, Allele> likelihoods) {
223         return null;
224     }
225 
226     /**
227      * Combine raw data, typically during the merging of raw data contained in multiple gVCFs as in CombineGVCFs and the
228      * preliminary merge for GenotypeGVCFs
229      *
230      * @param allelesList   The merged allele list across all variants being combined/merged
231      * @param listOfRawData The raw data for all the variants being combined/merged
232      * @return A key, Object map containing the annotations generated by this combine operation
233      */
234     //@Override
combineRawData(List<Allele> allelesList, List<ReducibleAnnotationData<?>> listOfRawData)235     public Map<String, Object> combineRawData(List<Allele> allelesList, List<ReducibleAnnotationData<?>> listOfRawData) {
236         return null;
237     }
238 
239     /**
240      * Calculate the final annotation value from the raw data which was generated by either annotateRawData or calculateRawData
241      *
242      * @param vc         -- contains the final set of alleles, possibly subset by GenotypeGVCFs
243      * @param originalVC -- used to get all the alleles for all gVCFs
244      * @return A key, Object map containing the finalized annotations generated by this operation to be added to the code
245      */
246    //@Override
finalizeRawData(VariantContext vc, VariantContext originalVC)247     public Map<String, Object> finalizeRawData(VariantContext vc, VariantContext originalVC) {
248         if (vc.hasAttribute(getRawKeyName())) {
249             List<Integer> counts = vc.getAttributeAsIntList(getRawKeyName(), 0);
250             if (counts.size() != NUMBER_OF_GENOTYPE_COUNTS) {
251                 throw new IllegalStateException("Genotype counts for ExcessHet (" + getRawKeyName() + ") should have three values: homozygous reference, heterozygous with one ref allele, and homozygous variant/heterozygous non-reference");
252             }
253             final GenotypeCounts t = new GenotypeCounts(counts.get(0), counts.get(1), counts.get(2));
254             final Pair<Integer, Double> sampleCountEH = calculateEH(vc, t, counts.get(0)+counts.get(1)+counts.get(2));
255             final int sampleCount = sampleCountEH.getLeft();
256             final double eh =  sampleCountEH.getRight();
257 
258             if (sampleCount < 1) {
259                 return Collections.emptyMap();
260             }
261             return Collections.singletonMap(getKeyNames().get(0), (Object) String.format("%.4f", eh));
262         }
263         return null;
264     }
265 }
266