1 package org.broadinstitute.hellbender.tools.walkers.annotator; 2 3 import htsjdk.variant.variantcontext.Allele; 4 import htsjdk.variant.variantcontext.Genotype; 5 import htsjdk.variant.variantcontext.GenotypesContext; 6 import htsjdk.variant.variantcontext.VariantContext; 7 import org.broadinstitute.hellbender.engine.ReferenceContext; 8 import org.broadinstitute.hellbender.exceptions.GATKException; 9 import org.broadinstitute.hellbender.utils.Utils; 10 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; 11 import org.broadinstitute.hellbender.utils.pileup.PileupElement; 12 import org.broadinstitute.hellbender.utils.read.GATKRead; 13 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; 14 15 import java.util.*; 16 17 /** 18 * Class of tests to detect strand bias. 19 */ 20 public abstract class StrandBiasTest extends InfoFieldAnnotation { 21 22 protected static final int ARRAY_DIM = 2; 23 protected static final int ARRAY_SIZE = ARRAY_DIM * ARRAY_DIM; 24 25 @Override 26 //template method for calculating strand bias annotations using the three different methods annotate(final ReferenceContext ref, final VariantContext vc, final AlleleLikelihoods<GATKRead, Allele> likelihoods)27 public Map<String, Object> annotate(final ReferenceContext ref, 28 final VariantContext vc, 29 final AlleleLikelihoods<GATKRead, Allele> likelihoods) { 30 Utils.nonNull(vc); 31 if ( !vc.isVariant() ) { 32 return Collections.emptyMap(); 33 } 34 35 // if the genotype and strand bias are provided, calculate the annotation from the Genotype (GT) field 36 if ( vc.hasGenotypes() ) { 37 for (final Genotype g : vc.getGenotypes()) { 38 if (g.hasAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)) { 39 return calculateAnnotationFromGTfield(vc.getGenotypes()); 40 } 41 } 42 } 43 44 if (likelihoods != null) { 45 return calculateAnnotationFromLikelihoods(likelihoods, vc); 46 } 47 return Collections.emptyMap(); 48 } 49 calculateAnnotationFromGTfield(final GenotypesContext genotypes)50 protected abstract Map<String, Object> calculateAnnotationFromGTfield(final GenotypesContext genotypes); 51 calculateAnnotationFromLikelihoods(final AlleleLikelihoods<GATKRead, Allele> likelihoods, final VariantContext vc)52 protected abstract Map<String, Object> calculateAnnotationFromLikelihoods(final AlleleLikelihoods<GATKRead, Allele> likelihoods, 53 final VariantContext vc); 54 55 /** 56 * Create the contingency table by retrieving the per-sample strand bias annotation and adding them together 57 * @param genotypes the genotypes from which to pull out the per-sample strand bias annotation 58 * @param minCount minimum threshold for the sample strand bias counts for each ref and alt. 59 * If both ref and alt counts are above minCount the whole sample strand bias is added to the resulting table 60 * @return the table used for several strand bias tests, will be null if none of the genotypes contain the per-sample SB annotation 61 */ getTableFromSamples( final GenotypesContext genotypes, final int minCount )62 protected int[][] getTableFromSamples( final GenotypesContext genotypes, final int minCount ) { 63 if( genotypes == null ) { 64 return null; 65 } 66 67 final int[] sbArray = {0,0,0,0}; // reference-forward-reverse -by- alternate-forward-reverse 68 boolean foundData = false; 69 70 for( final Genotype g : genotypes ) { 71 if( ! g.hasAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY) ) { 72 continue; 73 } 74 75 foundData = true; 76 int[] data = getStrandCounts(g); 77 78 if ( passesMinimumThreshold(data, minCount) ) { 79 for( int index = 0; index < sbArray.length; index++ ) { 80 sbArray[index] += data[index]; 81 } 82 } 83 } 84 85 return foundData ? decodeSBBS(sbArray) : null ; 86 } 87 88 @SuppressWarnings("unchecked") getStrandCounts(final Genotype g)89 public static int[] getStrandCounts(final Genotype g) { 90 int[] data; 91 if ( g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY).getClass().equals(String.class)) { 92 final String sbbsString = (String)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY); 93 data = encodeSBBS(sbbsString); 94 } else if (g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY).getClass().equals(ArrayList.class)) { 95 if (((ArrayList<Object>)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)).get(0) instanceof Integer) { 96 data = encodeSBBS((ArrayList<Integer>) g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)); 97 } 98 else if (((ArrayList<Object>)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)).get(0) instanceof String) { 99 final List<Integer> sbbsList = new ArrayList<>(); 100 for (final Object o : (ArrayList<Object>) g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)) { 101 sbbsList.add(Integer.parseInt(o.toString())); 102 } 103 data = encodeSBBS(sbbsList); 104 } else { 105 throw new GATKException("Unexpected " + GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY + " type"); 106 } 107 108 } else { 109 throw new GATKException("Unexpected " + GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY + " type"); 110 } 111 return data; 112 } 113 114 /** 115 Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: 116 * fw rc 117 * allele1 # # 118 * allele2 # # 119 * @return a 2x2 contingency table 120 */ getContingencyTable( final AlleleLikelihoods<GATKRead, Allele> likelihoods, final VariantContext vc, final int minCount)121 public static int[][] getContingencyTable( final AlleleLikelihoods<GATKRead, Allele> likelihoods, 122 final VariantContext vc, 123 final int minCount) { 124 return getContingencyTable(likelihoods, vc, minCount, likelihoods.samples()); 125 } 126 127 /** 128 Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: 129 * fw rc 130 * allele1 # # 131 * allele2 # # 132 * @return a 2x2 contingency table 133 */ getContingencyTable( final AlleleLikelihoods<GATKRead, Allele> likelihoods, final VariantContext vc, final int minCount, final Collection<String> samples)134 public static int[][] getContingencyTable( final AlleleLikelihoods<GATKRead, Allele> likelihoods, 135 final VariantContext vc, 136 final int minCount, 137 final Collection<String> samples) { 138 if( likelihoods == null || vc == null) { 139 return null; 140 } 141 142 final Allele ref = vc.getReference(); 143 final List<Allele> allAlts = vc.getAlternateAlleles(); 144 145 final int[][] table = new int[ARRAY_DIM][ARRAY_DIM]; 146 for (final String sample : samples) { 147 final int[] sampleTable = new int[ARRAY_SIZE]; 148 likelihoods.bestAllelesBreakingTies(sample).stream() 149 .filter(ba -> ba.isInformative()) 150 .forEach(ba -> updateTable(sampleTable, ba.allele, ba.evidence, ref, allAlts)); 151 if (passesMinimumThreshold(sampleTable, minCount)) { 152 copyToMainTable(sampleTable, table); 153 } 154 } 155 156 return table; 157 } 158 159 /** 160 * Helper method to copy the per-sample table to the main table 161 * 162 * @param perSampleTable per-sample table (single dimension) 163 * @param mainTable main table (two dimensions) 164 */ copyToMainTable(final int[] perSampleTable, final int[][] mainTable)165 private static void copyToMainTable(final int[] perSampleTable, final int[][] mainTable) { 166 mainTable[0][0] += perSampleTable[0]; 167 mainTable[0][1] += perSampleTable[1]; 168 mainTable[1][0] += perSampleTable[2]; 169 mainTable[1][1] += perSampleTable[3]; 170 } 171 updateTable(final int[] table, final Allele allele, final GATKRead read, final Allele ref, final List<Allele> allAlts)172 private static void updateTable(final int[] table, final Allele allele, final GATKRead read, final Allele ref, final List<Allele> allAlts) { 173 final boolean matchesRef = allele.equals(ref, true); 174 final boolean matchesAnyAlt = allAlts.contains(allele); 175 176 if ( matchesRef || matchesAnyAlt ) { 177 final int offset = matchesRef ? 0 : ARRAY_DIM; 178 179 // a normal read with an actual strand 180 final boolean isForward = !read.isReverseStrand(); 181 table[offset + (isForward ? 0 : 1)]++; 182 } 183 } 184 185 /** 186 * Does this strand data array pass the minimum threshold for inclusion? 187 * 188 * @param data the array 189 * @param minCount The minimum threshold of counts in the array 190 * @return true if it passes the minimum threshold, false otherwise 191 */ passesMinimumThreshold(final int[] data, final int minCount)192 protected static boolean passesMinimumThreshold(final int[] data, final int minCount) { 193 // the ref and alt totals must be greater than MIN_COUNT 194 return data[0] + data[1] + data[2] + data[3] > minCount; 195 } 196 197 /** 198 * Helper function to parse the genotype annotation into the SB annotation array 199 * @param string the string that is returned by genotype.getAnnotation("SB") 200 * @return the array used by the per-sample Strand Bias annotation 201 */ encodeSBBS( final String string )202 private static int[] encodeSBBS( final String string ) { 203 final int[] array = new int[ARRAY_SIZE]; 204 final StringTokenizer tokenizer = new StringTokenizer(string, ",", false); 205 for( int index = 0; index < ARRAY_SIZE; index++ ) { 206 array[index] = Integer.parseInt(tokenizer.nextToken()); 207 } 208 return array; 209 } 210 211 /** 212 * Helper function to parse the genotype annotation into the SB annotation array 213 * @param arrayList the ArrayList returned from StrandBiasBySample.annotate() 214 * @return the array used by the per-sample Strand Bias annotation 215 */ encodeSBBS( final List<Integer> arrayList )216 private static int[] encodeSBBS( final List<Integer> arrayList ) { 217 final int[] array = new int[ARRAY_SIZE]; 218 int index = 0; 219 for ( Integer item : arrayList ) { 220 array[index++] = item; 221 } 222 223 return array; 224 } 225 226 /** 227 * Helper function to turn the SB annotation array into a contingency table 228 * @param array the array used by the per-sample Strand Bias annotation 229 * @return the table used by the StrandBiasTest annotation 230 */ decodeSBBS( final int[] array )231 public static int[][] decodeSBBS( final int[] array ) { 232 if(array.length != ARRAY_SIZE) { 233 return null; 234 } 235 final int[][] table = new int[ARRAY_DIM][ARRAY_DIM]; 236 table[0][0] = array[0]; 237 table[0][1] = array[1]; 238 table[1][0] = array[2]; 239 table[1][1] = array[3]; 240 return table; 241 } 242 243 244 } 245