package org.broadinstitute.hellbender.tools.walkers.annotator;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.stat.StatUtils;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotationData;
import org.broadinstitute.hellbender.utils.GenotypeCounts;
import org.broadinstitute.hellbender.utils.GenotypeUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.util.*;

/**
 * Phred-scaled p-value for exact test of excess heterozygosity.
 *
 * <p>This annotation estimates the probability of the called samples exhibiting excess heterozygosity with respect to
 * the null hypothesis that the samples are unrelated. The higher the score, the higher the chance that the variant is
 * a technical artifact or that there is consanguinity among the samples. In contrast to Inbreeding Coefficient, there
 * is no minimal number of samples for this annotation. If samples are known to be related, a pedigree file can be
 * provided so that the calculation is only performed on founders and offspring are excluded.</p>
 *
 * <h3>Statistical notes</h3>
 * <p>This annotation uses the implementation from
 * <a href='http://www.sciencedirect.com/science/article/pii/S0002929707607356?via%3Dihub'>Wigginton JE, Cutler DJ, Abecasis GR. <i>A Note on Exact Tests of Hardy-Weinberg Equilibrium. American Journal of Human Genetics</i>. 2005;76(5):887-893</a>.
 *
 * <h3>Caveat</h3>
 * <p>The Excess Heterozygosity annotation can only be calculated for diploid samples.</p>
 *
 * <h3>Related annotations</h3>
 * <p><b>InbreedingCoeff</b> also describes the heterozygosity of the called samples, though without explicitly taking into account the number of samples</p>
 */
@DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Phred-scaled p-value for exact test of excess heterozygosity (ExcessHet)")
public final class ExcessHet extends PedigreeAnnotation implements StandardAnnotation {

    // The exact test below only tracks probability terms down to this magnitude; smaller terms are dropped.
    private static final double MIN_NEEDED_VALUE = 1.0E-16;
    private static final boolean ROUND_GENOTYPE_COUNTS = true;

    // Phred-scaled equivalent of MIN_NEEDED_VALUE: the largest ExcessHet value this annotation will report.
    public static final double PHRED_SCALED_MIN_P_VALUE = -10.0 * Math.log10(MIN_NEEDED_VALUE);
    // Raw genotype counts are a triple: hom-ref, het (with one ref allele), hom-var/het-non-ref.
    public static final int NUMBER_OF_GENOTYPE_COUNTS = 3;

    /**
     * @param founderIds sample IDs of pedigree founders; only these samples' genotypes enter the calculation
     */
    public ExcessHet(final Set<String> founderIds){
        super(founderIds);
    }

    /**
     * @param pedigreeFile pedigree used to restrict the calculation to founders (offspring are excluded)
     */
    public ExcessHet(final GATKPath pedigreeFile){
        super(pedigreeFile);
    }

    /** No-pedigree constructor: all samples are treated as founders. */
    public ExcessHet() {
        this((Set<String>) null);
    }

    /**
     * Computes the ExcessHet annotation for a variant site.
     *
     * @param ref the reference context (unused by this annotation)
     * @param vc the variant context to annotate
     * @param likelihoods read likelihoods (unused; the computation is genotype-based)
     * @return singleton map from the ExcessHet key to the formatted phred-scaled p-value, or an empty
     *         map when there are no usable founder genotypes or the site is not variant
     */
    @Override
    public Map<String, Object> annotate(final ReferenceContext ref,
                                        final VariantContext vc,
                                        final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
        GenotypesContext genotypes = getFounderGenotypes(vc);
        if (genotypes == null || !vc.isVariant()) {
            return Collections.emptyMap();
        }
        final Pair<Integer, Double> sampleCountEH = calculateEH(vc, genotypes);
        return formatAnnotation(sampleCountEH);
    }

    /**
     * Formats a (sampleCount, ExcessHet) pair as the annotation map, or an empty map when no sample
     * contributed to the calculation. Shared by {@link #annotate} and {@link #finalizeRawData}.
     */
    private Map<String, Object> formatAnnotation(final Pair<Integer, Double> sampleCountEH) {
        final int sampleCount = sampleCountEH.getLeft();
        final double eh = sampleCountEH.getRight();

        if (sampleCount < 1) {
            return Collections.emptyMap();
        }
        return Collections.singletonMap(getKeyNames().get(0), (Object) String.format("%.4f", eh));
    }

    @VisibleForTesting
    static Pair<Integer, Double> calculateEH(final VariantContext vc, final GenotypesContext genotypes) {
        final GenotypeCounts t = GenotypeUtils.computeDiploidGenotypeCounts(vc, genotypes, ROUND_GENOTYPE_COUNTS);
        // number of samples that have likelihoods
        final int sampleCount = (int) genotypes.stream().filter(GenotypeUtils::isDiploidWithLikelihoods).count();

        return calculateEH(vc, t, sampleCount);
    }

    /**
     * Converts genotype counts into (sampleCount, phred-scaled ExcessHet p-value).
     *
     * @param vc the variant context (unused here; kept for signature compatibility)
     * @param t hom-ref / het / hom-var genotype counts
     * @param sampleCount number of samples that contributed genotype likelihoods
     * @return pair of the sample count and the phred-scaled right-sided p-value
     */
    @VisibleForTesting
    public static Pair<Integer, Double> calculateEH(final VariantContext vc, final GenotypeCounts t, final int sampleCount) {
        final int refCount = (int)t.getRefs();
        final int hetCount = (int)t.getHets();
        final int homCount = (int)t.getHoms();

        final double pval = exactTest(hetCount, refCount, homCount);

        //If the actual phredPval would be infinity we will probably still filter out just a very large number
        //Since the method does not guarantee precision for any p-value smaller than 1e-16, we can return the phred scaled version
        //NOTE: 10e-60 (i.e. 1e-59) is deliberately far below MIN_NEEDED_VALUE; see exactTest's accuracy caveat
        if (pval < 10e-60) {
            return Pair.of(sampleCount, PHRED_SCALED_MIN_P_VALUE);
        }
        final double phredPval = -10.0 * Math.log10(pval);

        return Pair.of(sampleCount, phredPval);
    }

    /**
     * Note that this method is not accurate for very small p-values. Beyond 1.0E-16 there is no guarantee that the
     * p-value is accurate, just that it is in fact smaller than 1.0E-16 (and therefore we should filter it). It would
     * be more computationally expensive to calculate accuracy beyond a given threshold. Here we have enough accuracy
     * to filter anything below a p-value of 10E-6.
     *
     * @param hetCount Number of observed hets (n_ab)
     * @param refCount Number of observed homRefs (n_aa)
     * @param homCount Number of observed homVars (n_bb)
     * @return Right sided p-value or the probability of getting the observed or higher number of hets given the sample
     * size (N) and the observed number of allele a (rareCopies)
     */
    @VisibleForTesting
    static double exactTest(final int hetCount, final int refCount, final int homCount) {
        Utils.validateArg(hetCount >= 0, "Het count cannot be less than 0");
        Utils.validateArg(refCount >= 0, "Ref count cannot be less than 0");
        Utils.validateArg(homCount >= 0, "Hom count cannot be less than 0");

        //Split into observed common allele and rare allele
        final int obsHomR;
        final int obsHomC;
        if (refCount < homCount) {
            obsHomR = refCount;
            obsHomC = homCount;
        } else {
            obsHomR = homCount;
            obsHomC = refCount;
        }

        final int rareCopies = 2 * obsHomR + hetCount;
        final int N = hetCount + obsHomC + obsHomR;

        //If the probability distribution has only 1 point, then the mid p-value is .5
        if (rareCopies <= 1) {
            return .5;
        }

        final double[] probs = new double[rareCopies + 1];

        //Find (something close to the) mode for the midpoint
        int mid = (int) Math.floor(rareCopies * (2.0 * N - rareCopies) / (2.0 * N - 1.0));
        if ((mid % 2) != (rareCopies % 2)) {
            mid++;
        }

        //Probabilities are computed relative to the (unnormalized) mode value of 1.0 and normalized by mysum at the end
        probs[mid] = 1.0;
        double mysum = 1.0;

        //Calculate probabilities from midpoint down
        int currHets = mid;
        int currHomR = (rareCopies - mid) / 2;
        int currHomC = N - currHets - currHomR;

        while (currHets >= 2) {
            //Recurrence from Wigginton et al. (2005): ratio of successive het-count probabilities
            final double potentialProb = probs[currHets] * currHets * (currHets - 1.0) / (4.0 * (currHomR + 1.0) * (currHomC + 1.0));
            if (potentialProb < MIN_NEEDED_VALUE) {
                break;
            }

            probs[currHets - 2] = potentialProb;
            mysum = mysum + probs[currHets - 2];

            //2 fewer hets means one additional homR and homC each
            currHets = currHets - 2;
            currHomR = currHomR + 1;
            currHomC = currHomC + 1;
        }

        //Calculate probabilities from midpoint up
        currHets = mid;
        currHomR = (rareCopies - mid) / 2;
        currHomC = N - currHets - currHomR;

        while (currHets <= rareCopies - 2) {
            final double potentialProb = probs[currHets] * 4.0 * currHomR * currHomC / ((currHets + 2.0) * (currHets + 1.0));
            if (potentialProb < MIN_NEEDED_VALUE) {
                break;
            }

            probs[currHets + 2] = potentialProb;
            mysum = mysum + probs[currHets + 2];

            //2 more hets means 1 fewer homR and homC each
            currHets = currHets + 2;
            currHomR = currHomR - 1;
            currHomC = currHomC - 1;
        }

        //Mid p-value: half the probability of the observed het count, plus the tail above it
        final double rightPval = probs[hetCount] / (2.0 * mysum);
        //Check if we observed the highest possible number of hets
        if (hetCount == rareCopies) {
            return rightPval;
        }
        return rightPval + StatUtils.sum(Arrays.copyOfRange(probs, hetCount + 1, probs.length)) / mysum;
    }

    @Override
    public List<String> getKeyNames() {
        return Collections.singletonList(GATKVCFConstants.EXCESS_HET_KEY);
    }

    //@Override
    public String getRawKeyName() {
        return GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY;
    }

    /**
     * Generate the raw data necessary to calculate the annotation. Raw data is the final endpoint for gVCFs.
     *
     * @param ref the reference context for this annotation
     * @param vc the variant context to annotate
     * @param likelihoods likelihoods indexed by sample, allele, and read within sample
     */
    //@Override
    public Map<String, Object> annotateRawData(ReferenceContext ref, VariantContext vc, AlleleLikelihoods<GATKRead, Allele> likelihoods) {
        return null;
    }

    /**
     * Combine raw data, typically during the merging of raw data contained in multiple gVCFs as in CombineGVCFs and the
     * preliminary merge for GenotypeGVCFs
     *
     * @param allelesList The merged allele list across all variants being combined/merged
     * @param listOfRawData The raw data for all the variants being combined/merged
     * @return A key, Object map containing the annotations generated by this combine operation
     */
    //@Override
    public Map<String, Object> combineRawData(List<Allele> allelesList, List<ReducibleAnnotationData<?>> listOfRawData) {
        return null;
    }

    /**
     * Calculate the final annotation value from the raw data which was generated by either annotateRawData or calculateRawData
     *
     * @param vc -- contains the final set of alleles, possibly subset by GenotypeGVCFs
     * @param originalVC -- used to get all the alleles for all gVCFs
     * @return A key, Object map containing the finalized annotations generated by this operation to be added to the code
     */
    //@Override
    public Map<String, Object> finalizeRawData(VariantContext vc, VariantContext originalVC) {
        if (vc.hasAttribute(getRawKeyName())) {
            List<Integer> counts = vc.getAttributeAsIntList(getRawKeyName(), 0);
            if (counts.size() != NUMBER_OF_GENOTYPE_COUNTS) {
                throw new IllegalStateException("Genotype counts for ExcessHet (" + getRawKeyName() + ") should have three values: homozygous reference, heterozygous with one ref allele, and homozygous variant/heterozygous non-reference");
            }
            final GenotypeCounts t = new GenotypeCounts(counts.get(0), counts.get(1), counts.get(2));
            //Here the sample count is reconstructed from the raw genotype counts rather than from likelihoods
            final Pair<Integer, Double> sampleCountEH = calculateEH(vc, t, counts.get(0)+counts.get(1)+counts.get(2));
            return formatAnnotation(sampleCountEH);
        }
        return null;
    }
}