1 package org.broadinstitute.hellbender.tools.walkers.annotator;
2 
3 import htsjdk.variant.variantcontext.Allele;
4 import htsjdk.variant.variantcontext.Genotype;
5 import htsjdk.variant.variantcontext.GenotypesContext;
6 import htsjdk.variant.variantcontext.VariantContext;
7 import org.broadinstitute.hellbender.engine.ReferenceContext;
8 import org.broadinstitute.hellbender.exceptions.GATKException;
9 import org.broadinstitute.hellbender.utils.Utils;
10 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
11 import org.broadinstitute.hellbender.utils.pileup.PileupElement;
12 import org.broadinstitute.hellbender.utils.read.GATKRead;
13 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
14 
15 import java.util.*;
16 
17 /**
18  * Class of tests to detect strand bias.
19  */
20 public abstract class StrandBiasTest extends InfoFieldAnnotation {
21 
22     protected static final int ARRAY_DIM = 2;
23     protected static final int ARRAY_SIZE = ARRAY_DIM * ARRAY_DIM;
24 
25     @Override
26     //template method for calculating strand bias annotations using the three different methods
annotate(final ReferenceContext ref, final VariantContext vc, final AlleleLikelihoods<GATKRead, Allele> likelihoods)27     public Map<String, Object> annotate(final ReferenceContext ref,
28                                         final VariantContext vc,
29                                         final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
30         Utils.nonNull(vc);
31         if ( !vc.isVariant() ) {
32             return Collections.emptyMap();
33         }
34 
35         // if the genotype and strand bias are provided, calculate the annotation from the Genotype (GT) field
36         if ( vc.hasGenotypes() ) {
37             for (final Genotype g : vc.getGenotypes()) {
38                 if (g.hasAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)) {
39                     return calculateAnnotationFromGTfield(vc.getGenotypes());
40                 }
41             }
42         }
43 
44         if (likelihoods != null) {
45             return calculateAnnotationFromLikelihoods(likelihoods, vc);
46         }
47         return Collections.emptyMap();
48     }
49 
calculateAnnotationFromGTfield(final GenotypesContext genotypes)50     protected abstract Map<String, Object> calculateAnnotationFromGTfield(final GenotypesContext genotypes);
51 
calculateAnnotationFromLikelihoods(final AlleleLikelihoods<GATKRead, Allele> likelihoods, final VariantContext vc)52     protected abstract Map<String, Object> calculateAnnotationFromLikelihoods(final AlleleLikelihoods<GATKRead, Allele> likelihoods,
53                                                                               final VariantContext vc);
54 
55     /**
56      * Create the contingency table by retrieving the per-sample strand bias annotation and adding them together
57      * @param genotypes the genotypes from which to pull out the per-sample strand bias annotation
58      * @param minCount minimum threshold for the sample strand bias counts for each ref and alt.
59      *                 If both ref and alt counts are above minCount the whole sample strand bias is added to the resulting table
60      * @return the table used for several strand bias tests, will be null if none of the genotypes contain the per-sample SB annotation
61      */
getTableFromSamples( final GenotypesContext genotypes, final int minCount )62     protected int[][] getTableFromSamples( final GenotypesContext genotypes, final int minCount ) {
63         if( genotypes == null ) {
64             return null;
65         }
66 
67         final int[] sbArray = {0,0,0,0}; // reference-forward-reverse -by- alternate-forward-reverse
68         boolean foundData = false;
69 
70         for( final Genotype g : genotypes ) {
71             if( ! g.hasAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY) ) {
72                 continue;
73             }
74 
75             foundData = true;
76             int[] data = getStrandCounts(g);
77 
78             if ( passesMinimumThreshold(data, minCount) ) {
79                 for( int index = 0; index < sbArray.length; index++ ) {
80                     sbArray[index] += data[index];
81                 }
82             }
83         }
84 
85         return foundData ? decodeSBBS(sbArray) : null ;
86     }
87 
88     @SuppressWarnings("unchecked")
getStrandCounts(final Genotype g)89     public static int[] getStrandCounts(final Genotype g) {
90         int[] data;
91         if ( g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY).getClass().equals(String.class)) {
92             final String sbbsString = (String)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY);
93             data = encodeSBBS(sbbsString);
94         } else if (g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY).getClass().equals(ArrayList.class)) {
95            if (((ArrayList<Object>)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)).get(0) instanceof Integer) {
96                 data = encodeSBBS((ArrayList<Integer>) g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY));
97             }
98             else if (((ArrayList<Object>)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)).get(0) instanceof String) {
99                 final List<Integer> sbbsList = new ArrayList<>();
100                 for (final Object o : (ArrayList<Object>) g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)) {
101                     sbbsList.add(Integer.parseInt(o.toString()));
102                 }
103                 data = encodeSBBS(sbbsList);
104             } else {
105                 throw new GATKException("Unexpected " + GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY + " type");
106             }
107 
108         } else {
109             throw new GATKException("Unexpected " + GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY + " type");
110         }
111         return data;
112     }
113 
114     /**
115      Allocate and fill a 2x2 strand contingency table.  In the end, it'll look something like this:
116      *             fw      rc
117      *   allele1   #       #
118      *   allele2   #       #
119      * @return a 2x2 contingency table
120      */
getContingencyTable( final AlleleLikelihoods<GATKRead, Allele> likelihoods, final VariantContext vc, final int minCount)121     public static int[][] getContingencyTable( final AlleleLikelihoods<GATKRead, Allele> likelihoods,
122                                                final VariantContext vc,
123                                                final int minCount) {
124         return getContingencyTable(likelihoods, vc, minCount, likelihoods.samples());
125     }
126 
127     /**
128      Allocate and fill a 2x2 strand contingency table.  In the end, it'll look something like this:
129      *             fw      rc
130      *   allele1   #       #
131      *   allele2   #       #
132      * @return a 2x2 contingency table
133      */
getContingencyTable( final AlleleLikelihoods<GATKRead, Allele> likelihoods, final VariantContext vc, final int minCount, final Collection<String> samples)134     public static int[][] getContingencyTable( final AlleleLikelihoods<GATKRead, Allele> likelihoods,
135                                                final VariantContext vc,
136                                                final int minCount,
137                                                final Collection<String> samples) {
138         if( likelihoods == null || vc == null) {
139             return null;
140         }
141 
142         final Allele ref = vc.getReference();
143         final List<Allele> allAlts = vc.getAlternateAlleles();
144 
145         final int[][] table = new int[ARRAY_DIM][ARRAY_DIM];
146         for (final String sample : samples) {
147             final int[] sampleTable = new int[ARRAY_SIZE];
148             likelihoods.bestAllelesBreakingTies(sample).stream()
149                     .filter(ba -> ba.isInformative())
150                     .forEach(ba -> updateTable(sampleTable, ba.allele, ba.evidence, ref, allAlts));
151             if (passesMinimumThreshold(sampleTable, minCount)) {
152                 copyToMainTable(sampleTable, table);
153             }
154         }
155 
156         return table;
157     }
158 
159     /**
160      * Helper method to copy the per-sample table to the main table
161      *
162      * @param perSampleTable   per-sample table (single dimension)
163      * @param mainTable        main table (two dimensions)
164      */
copyToMainTable(final int[] perSampleTable, final int[][] mainTable)165     private static void copyToMainTable(final int[] perSampleTable, final int[][] mainTable) {
166         mainTable[0][0] += perSampleTable[0];
167         mainTable[0][1] += perSampleTable[1];
168         mainTable[1][0] += perSampleTable[2];
169         mainTable[1][1] += perSampleTable[3];
170     }
171 
updateTable(final int[] table, final Allele allele, final GATKRead read, final Allele ref, final List<Allele> allAlts)172     private static void updateTable(final int[] table, final Allele allele, final GATKRead read, final Allele ref, final List<Allele> allAlts) {
173         final boolean matchesRef = allele.equals(ref, true);
174         final boolean matchesAnyAlt = allAlts.contains(allele);
175 
176         if ( matchesRef || matchesAnyAlt ) {
177             final int offset = matchesRef ? 0 : ARRAY_DIM;
178 
179             // a normal read with an actual strand
180             final boolean isForward = !read.isReverseStrand();
181             table[offset + (isForward ? 0 : 1)]++;
182         }
183     }
184 
185     /**
186      * Does this strand data array pass the minimum threshold for inclusion?
187      *
188      * @param data  the array
189      * @param minCount The minimum threshold of counts in the array
190      * @return true if it passes the minimum threshold, false otherwise
191      */
passesMinimumThreshold(final int[] data, final int minCount)192     protected static boolean passesMinimumThreshold(final int[] data, final int minCount) {
193         // the ref and alt totals must be greater than MIN_COUNT
194         return data[0] + data[1] + data[2] + data[3] > minCount;
195     }
196 
197     /**
198      * Helper function to parse the genotype annotation into the SB annotation array
199      * @param string the string that is returned by genotype.getAnnotation("SB")
200      * @return the array used by the per-sample Strand Bias annotation
201      */
encodeSBBS( final String string )202     private static int[] encodeSBBS( final String string ) {
203         final int[] array = new int[ARRAY_SIZE];
204         final StringTokenizer tokenizer = new StringTokenizer(string, ",", false);
205         for( int index = 0; index < ARRAY_SIZE; index++ ) {
206             array[index] = Integer.parseInt(tokenizer.nextToken());
207         }
208         return array;
209     }
210 
211     /**
212      * Helper function to parse the genotype annotation into the SB annotation array
213      * @param arrayList the ArrayList returned from StrandBiasBySample.annotate()
214      * @return the array used by the per-sample Strand Bias annotation
215      */
encodeSBBS( final List<Integer> arrayList )216     private static int[] encodeSBBS( final List<Integer> arrayList ) {
217         final int[] array = new int[ARRAY_SIZE];
218         int index = 0;
219         for ( Integer item : arrayList ) {
220             array[index++] = item;
221         }
222 
223         return array;
224     }
225 
226     /**
227      * Helper function to turn the  SB annotation array into a contingency table
228      * @param array the array used by the per-sample Strand Bias annotation
229      * @return the table used by the StrandBiasTest annotation
230      */
decodeSBBS( final int[] array )231     public static int[][] decodeSBBS( final int[] array ) {
232         if(array.length != ARRAY_SIZE) {
233             return null;
234         }
235         final int[][] table = new int[ARRAY_DIM][ARRAY_DIM];
236         table[0][0] = array[0];
237         table[0][1] = array[1];
238         table[1][0] = array[2];
239         table[1][1] = array[3];
240         return table;
241     }
242 
243 
244 }
245