1 package org.broadinstitute.hellbender.tools.walkers.annotator;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import htsjdk.variant.variantcontext.Allele;
5 import htsjdk.variant.variantcontext.GenotypesContext;
6 import htsjdk.variant.variantcontext.VariantContext;
7 import org.broadinstitute.barclay.help.DocumentedFeature;
8 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
9 import org.broadinstitute.hellbender.utils.help.HelpConstants;
10 import org.broadinstitute.hellbender.utils.read.GATKRead;
11 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
12 
13 import java.util.Collections;
14 import java.util.List;
15 import java.util.Map;
16 
17 import static java.lang.Math.max;
18 import static java.lang.Math.min;
19 
20 /**
21  * Strand bias estimated by the Symmetric Odds Ratio test
22  *
23  * <p>Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in
24  * incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandOddsRatio annotation
25  * is one of several methods that aims to evaluate whether there is strand bias in the data. It is an updated form of
26  * the <a href="https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_annotator_FisherStrand.php">Fisher Strand Test</a>
27  * that is better at taking into account large amounts of data in high coverage situations. It is used to determine if
28  * there is strand bias between forward and reverse strands for the reference or alternate allele(s).</p>
29  *
30  * <h3>Statistical notes</h3>
31  *
32  * <p>The following 2x2 contingency table gives the notation for allele support and strand orientation.</p>
33  *
34  * <table>
35  * <tr><th>&nbsp;</th><th>+ strand&nbsp;&nbsp;&nbsp;</th><th>- strand&nbsp;&nbsp;&nbsp;</th></tr>
36  * <tr><th>REF&nbsp;&nbsp;&nbsp;</th><td>X[0][0]</td><td>X[0][1]</td></tr>
37  * <tr><th>ALT&nbsp;&nbsp;&nbsp;</th><td>X[1][0]</td><td>X[1][1]</td></tr>
38  * </table>
39  *
40  * <p>We can then represent the Odds Ratios with the equation:</p>
41  *
42  * <img src="http://latex.codecogs.com/svg.latex?R = \frac{X[0][0] * X[1][1]}{X[0][1] * X[1][0]}" border="0"/>
43  *
44  * <p>and its inverse:</p>
45  *
46  * <img src="http://latex.codecogs.com/svg.latex?\frac{1}{R} = \frac{X[0][1] * X[1][0]}{X[0][0] * X[1][1]}" border="0"/>
47  *
48  * <p>The sum R + 1/R is used to detect a difference in strand bias for REF and for ALT. The sum makes it symmetric.
49  * A high value is indicative of large difference where one entry is very small compared to the others. A scale factor
50  * of refRatio/altRatio where</p>
51  *
52  * <img src="http://latex.codecogs.com/svg.latex?refRatio = \frac{min(X[0][0], X[0][1])}{max(X[0][0], X[0][1])}" border="0"/>
53  *
54  * <p>and </p>
55  *
56  * <img src="http://latex.codecogs.com/svg.latex?altRatio = \frac{min(X[1][0], X[1][1])}{max(X[1][0], X[1][1])}" border="0"/>
57  *
58  * <p>ensures that the annotation value is large only. The final SOR annotation is given in natural log space.</p>
59  *
60  * <p>See the <a href="http://www.broadinstitute.org/gatk/guide/article?id=4732">method document on statistical tests</a>
61  * for a more detailed explanation of this statistical test.</p>
62  *
63  * <h3>Example calculation</h3>
64  *
65  * <p>Here is a variant record where SOR is 0.592.</p>
66  *
67  * <pre>
68  *     AC=78;AF=2.92135e-02;AN=2670;DP=31492;FS=48.628;MQ=58.02;MQRankSum=-2.02400e+00;MQ_DP=3209;QD=3.03; \
69  *     ReadPosRankSum=-1.66500e-01;SB_TABLE=1450,345,160,212;SOR=0.592;VarDP=2167
70  * </pre>
71  *
72  * <p>Read support shows some strand bias for the reference allele but not
73  * the alternate allele. The SB_TABLE annotation (a non-GATK annotation) indicates 1450 reference alleles on the forward strand, 345
74  * reference alleles on the reverse strand, 160 alternate alleles on the forward strand and 212 alternate alleles on
75  * the reverse strand. The tool uses these counts towards calculating SOR. To avoid multiplying or dividing by zero
76  * values, the tool adds one to each count.</p>
77  *
78  * <pre>
79  * refFw = 1450 + 1 = 1451
80  * refRv = 345 + 1 = 346
81  * altFw = 160 + 1 = 161
82  * altRv = 212 + 1 = 213
83  * </pre>
84  *
85  * <p>Calculate SOR with the following.</p>
86  *
87  * <p><img src="http://latex.codecogs.com/svg.latex?SOR = ln(symmetricalRatio) + ln(refRatio) - ln(altRatio)" border="0"/></p>
88  *
89  * <p>where</p>
90  *
91  * <p><img src="http://latex.codecogs.com/svg.latex?symmetricalRatio = R + \frac{1}{R}" border="0"/></p>
92  * <p><img src="http://latex.codecogs.com/svg.latex?R = \frac{(\frac{refFw}{refRv})}{(\frac{altFw}{altRv})} = \frac{(refFw*altRv)}{(altFw*refRv)}" border="0"/></p>
93  *
94  * <p><img src="http://latex.codecogs.com/svg.latex?refRatio =\frac{(smaller\;of\;refFw\;and\;refRv)}{(larger\;of\;refFw\;and\;refRv)}" border="0"/></p>
95  *
96  * <p>and</p>
97  *
98  * <p><img src="http://latex.codecogs.com/svg.latex?altRatio = \frac{(smaller\;of\;altFw\;and\;altRv)}{(larger\;of\;altFw\;and\;altRv)}" border="0"/></p>
99  *
100  * <p>Fill out the component equations with the example counts to calculate SOR.</p>
101  *
102  * <pre>
103  * symmetricalRatio  = (1451*213)/(161*346) + (161*346)/(1451*213) = 5.7284
104  * refRatio = 346/1451 = 0.2385
105  * altRatio = 161/213 = 0.7559
106  * SOR = ln(5.7284) + ln(0.2385) - ln(0.7559) = 1.7454427755 + (-1.433) - (-0.2798) = 0.592
107  * </pre>
108  *
109  * <h3>Related annotations</h3>
110  * <ul>
111  *     <li><b><a href="https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_annotator_allelespecific_AS_StrandOddsRatio.php">AS_StrandOddsRatio</a></b>
112  *     allele-specific strand bias estimated by the symmetric odds ratio test.</li>
113  *     <li><b><a href="https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_annotator_StrandBiasBySample.php">StrandBiasBySample</a></b>
114  *     outputs counts of read depth per allele for each strand orientation.</li>
115  *     <li><b><a href="https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_annotator_FisherStrand.php">FisherStrand</a></b>
116  *     uses Fisher's Exact Test to evaluate strand bias.</li>
117  * </ul>
118  *
119  */
120 @DocumentedFeature(groupName=HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Strand bias estimated by the symmetric odds ratio test (SOR)")
121 public final class StrandOddsRatio extends StrandBiasTest implements StandardAnnotation {
122 
123     private static final double PSEUDOCOUNT = 1.0;
124     private static final int MIN_COUNT = 0;
125 
126     @Override
calculateAnnotationFromGTfield(final GenotypesContext genotypes)127     protected Map<String, Object> calculateAnnotationFromGTfield(final GenotypesContext genotypes){
128         final int[][] tableFromPerSampleAnnotations = getTableFromSamples(genotypes, MIN_COUNT);
129         return tableFromPerSampleAnnotations != null ? annotationForOneTable(calculateSOR(tableFromPerSampleAnnotations)) : null;
130     }
131 
132 
133     @Override
calculateAnnotationFromLikelihoods(final AlleleLikelihoods<GATKRead, Allele> likelihoods, final VariantContext vc)134     protected Map<String, Object> calculateAnnotationFromLikelihoods(final AlleleLikelihoods<GATKRead, Allele> likelihoods, final VariantContext vc){
135         final int[][] table = getContingencyTable(likelihoods, vc, MIN_COUNT);
136         return annotationForOneTable(calculateSOR(table));
137     }
138 
139     /**
140      * Computes the SOR value of a table after augmentation. Based on the symmetric odds ratio but modified to take on
141      * low values when the reference +/- read count ratio is skewed but the alt count ratio is not.  Natural log is taken
142      * to keep values within roughly the same range as other annotations.
143      *
144      * Adding pseudocounts avoids division by zero.
145      *
146      * @param table The table before adding pseudocounts
147      * @return the SOR annotation value
148      */
calculateSOR(final int[][] table)149     public static double calculateSOR(final int[][] table) {
150         final double t00 = table[0][0] + PSEUDOCOUNT;
151         final double t01 = table[0][1] + PSEUDOCOUNT;
152         final double t11 = table[1][1] + PSEUDOCOUNT;
153         final double t10 = table[1][0] + PSEUDOCOUNT;
154 
155         final double ratio = (t00 / t01) * (t11 / t10) + (t01 / t00) * (t10 / t11);
156 
157         final double refRatio = min(t00, t01)/ max(t00, t01);
158         final double altRatio = min(t10, t11)/ max(t10, t11);
159 
160         return Math.log(ratio) + Math.log(refRatio) - Math.log(altRatio);
161     }
162 
163     /**
164      * Returns an annotation result given a sor
165      *
166      * @param sor the symmetric odds ratio of the contingency table
167      * @return a hash map from SOR
168      */
169     @VisibleForTesting
annotationForOneTable(final double sor)170     Map<String, Object> annotationForOneTable(final double sor) {
171         return Collections.singletonMap(getKeyNames().get(0), formattedValue(sor));
172     }
173 
formattedValue(double sor)174     public static String formattedValue(double sor) {
175         return String.format("%.3f", sor);
176     }
177 
178 
179     @Override
getKeyNames()180     public List<String> getKeyNames() {
181         return Collections.singletonList(GATKVCFConstants.STRAND_ODDS_RATIO_KEY);
182     }
183 }
184