1 package org.broadinstitute.hellbender.tools.walkers.annotator; 2 3 import htsjdk.samtools.Cigar; 4 import htsjdk.samtools.CigarElement; 5 import htsjdk.samtools.CigarOperator; 6 import htsjdk.samtools.TextCigarCodec; 7 import htsjdk.variant.variantcontext.*; 8 import org.broadinstitute.hellbender.engine.ReferenceContext; 9 import org.broadinstitute.hellbender.utils.MannWhitneyU; 10 import org.broadinstitute.hellbender.utils.QualityUtils; 11 import org.broadinstitute.hellbender.utils.genotyper.*; 12 import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; 13 import org.broadinstitute.hellbender.utils.read.GATKRead; 14 import org.broadinstitute.hellbender.testutils.ArtificialAnnotationUtils; 15 import org.broadinstitute.hellbender.GATKBaseTest; 16 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; 17 import org.testng.Assert; 18 import org.testng.annotations.BeforeClass; 19 import org.testng.annotations.DataProvider; 20 import org.testng.annotations.Test; 21 22 import java.util.*; 23 24 public final class ReadPosRankSumTestUnitTest extends GATKBaseTest { 25 private final String sample1 = "NA1"; 26 private final String sample2 = "NA2"; 27 private static final String CONTIG = "1"; 28 29 private static final Allele REF = Allele.create("T", true); 30 private static final Allele ALT = Allele.create("A", false); 31 private static final Allele ALT_INSERTION = Allele.create("TTTTT", false); 32 makeVC(final long position)33 private VariantContext makeVC(final long position) { 34 final double[] genotypeLikelihoods1 = {30,0,190}; 35 final GenotypesContext testGC = GenotypesContext.create(2); 36 // sample1 -> A/T with GQ 30 37 testGC.add(new GenotypeBuilder(sample1).alleles(Arrays.asList(REF, ALT)).PL(genotypeLikelihoods1).GQ(30).make()); 38 // sample2 -> A/T with GQ 40 39 testGC.add(new GenotypeBuilder(sample2).alleles(Arrays.asList(REF, ALT)).PL(genotypeLikelihoods1).GQ(40).make()); 40 41 return (new VariantContextBuilder()) 42 .alleles(Arrays.asList(REF, ALT)).chr(CONTIG).start(position).stop(position).genotypes(testGC).make(); 43 } 44 makeRead(final int start, final int mq)45 private GATKRead makeRead(final int start, final int mq) { 46 Cigar cigar = TextCigarCodec.decode("10M"); 47 final GATKRead read = ArtificialReadUtils.createArtificialRead(cigar); 48 read.setMappingQuality(mq); 49 read.setPosition(CONTIG, start); 50 return read; 51 } 52 53 @Test testReadPos()54 public void testReadPos(){ 55 final InfoFieldAnnotation ann = new ReadPosRankSumTest(); 56 final String key = GATKVCFConstants.READ_POS_RANK_SUM_KEY; 57 final MannWhitneyU mannWhitneyU = new MannWhitneyU(); 58 59 final int[] startAlts = {3, 4}; 60 final int[] startRefs = {1, 2}; 61 final List<GATKRead> refReads = Arrays.asList(makeRead(startRefs[0], 30), makeRead(startRefs[1], 30)); 62 final List<GATKRead> altReads = Arrays.asList(makeRead(startAlts[0], 30), makeRead(startAlts[1], 30)); 63 final AlleleLikelihoods<GATKRead, Allele> likelihoods = 64 ArtificialAnnotationUtils.makeLikelihoods(sample1, refReads, altReads, -100.0, -100.0, REF, ALT); 65 66 Assert.assertEquals(ann.getDescriptions().size(), 1); 67 Assert.assertEquals(ann.getDescriptions().get(0).getID(), key); 68 Assert.assertEquals(ann.getKeyNames().size(), 1); 69 Assert.assertEquals(ann.getKeyNames().get(0), key); 70 71 final ReferenceContext ref= null; 72 73 final long position = 5L; //middle of the read 74 final VariantContext vc= makeVC(position); 75 76 final Map<String, Object> annotate = ann.annotate(ref, vc, likelihoods); 77 final double zScore = mannWhitneyU.test(new double[]{position - startAlts[0], position - startAlts[1]}, new double[]{position - startRefs[0], position - startRefs[1]}, MannWhitneyU.TestType.FIRST_DOMINATES).getZ(); 78 final String zScoreStr = String.format("%.3f", zScore); 79 Assert.assertEquals(annotate.get(key), zScoreStr); 80 81 final long positionEnd = 9L; //past middle 82 final VariantContext vcEnd= makeVC(positionEnd); 83 84 //Note: past the middle of the read we compute the position from the end. 85 final Map<String, Object> annotateEnd = ann.annotate(ref, vcEnd, likelihoods); 86 final double zScoreEnd = mannWhitneyU.test(new double[]{startAlts[0], startAlts[1]}, new double[]{startRefs[0], startRefs[1]}, MannWhitneyU.TestType.FIRST_DOMINATES).getZ(); 87 final String zScoreEndStr = String.format("%.3f", zScoreEnd); 88 Assert.assertEquals(annotateEnd.get(key), zScoreEndStr); 89 90 final long positionPastEnd = 20L; //past middle 91 final VariantContext vcPastEnd= makeVC(positionPastEnd); 92 93 //Note: past the end of the read, there's nothing 94 final Map<String, Object> annotatePastEnd = ann.annotate(ref, vcPastEnd, likelihoods); 95 Assert.assertTrue(annotatePastEnd.isEmpty()); 96 } 97 98 @DataProvider(name = "dataIsUsableRead") dataIsUsableRead()99 private Object[][] dataIsUsableRead(){ 100 return new Object[][]{ 101 {"20M6D2M", 10, 1, 0, true}, 102 {"20M6D2M", 10, 1, 27, true}, 103 {"20M6D2M", 10, 1, 29, false}, 104 {"1I20M1S", 10, 1, 0, true}, 105 {"1I20M1S", 0, 1, 0, false}, 106 {"1I20M1S", QualityUtils.MAPPING_QUALITY_UNAVAILABLE, 1, 0, false}, 107 {"1I20M1S", 10, 1, 22, false}, 108 {"1I20M1S", 10, 21, 42, false}, 109 {"1I20M1H", 10, 1, 21, false}, 110 }; 111 } 112 113 @Test(dataProvider = "dataIsUsableRead") testIsUsableRead(final String cigarString, final int mappingQuality, final int start, final int refLoc, final boolean isUsable )114 public void testIsUsableRead(final String cigarString, final int mappingQuality, final int start, final int refLoc, final boolean isUsable ) { 115 final ReadPosRankSumTest readPosRankSumTest = new ReadPosRankSumTest(); 116 final Cigar cigar = TextCigarCodec.decode(cigarString); 117 final GATKRead read = ArtificialReadUtils.createArtificialRead(cigar); 118 read.setMappingQuality(mappingQuality); 119 read.setPosition(CONTIG, start); 120 Assert.assertEquals(readPosRankSumTest.isUsableRead(read, makeVC(refLoc)), isUsable); 121 } 122 123 //Basic aligned read 124 private GATKRead allMatch; 125 126 //Read with insertion and deletion 127 private GATKRead twoIndels; 128 129 //Read with soft clips at start 130 private GATKRead softClipStart; 131 132 //Read with hard clips at start 133 private GATKRead hardClipStart; 134 135 //Read with low quality tail 136 private GATKRead lowQualTail; 137 138 //Read with low quality tail, partially soft clipped 139 private GATKRead lowQualClippedTail; 140 141 //Read with low quality bases at start 142 private GATKRead lowQualStart; 143 144 //Read with low quality bases, partially soft clipped at both ends 145 private GATKRead lowQualBothEnds; 146 147 @BeforeClass init()148 public void init() { 149 List<CigarElement> cigarElements_allMatch = new LinkedList<>(); 150 cigarElements_allMatch.add(new CigarElement(151, CigarOperator.M)); 151 allMatch = ArtificialReadUtils.createArtificialRead(new Cigar(cigarElements_allMatch)); 152 153 List<CigarElement> cigarElements_2indels = new LinkedList<>(); 154 cigarElements_2indels.add(new CigarElement(66, CigarOperator.M)); 155 cigarElements_2indels.add(new CigarElement(10, CigarOperator.I)); 156 cigarElements_2indels.add(new CigarElement(7, CigarOperator.M)); 157 cigarElements_2indels.add(new CigarElement(10, CigarOperator.D)); 158 cigarElements_2indels.add(new CigarElement(68, CigarOperator.M)); 159 twoIndels = ArtificialReadUtils.createArtificialRead(new Cigar(cigarElements_2indels)); 160 161 List<CigarElement> cigarElements_softClipStart = new LinkedList<>(); 162 cigarElements_softClipStart.add(new CigarElement(17, CigarOperator.S)); 163 cigarElements_softClipStart.add(new CigarElement(134, CigarOperator.M)); 164 softClipStart = ArtificialReadUtils.createArtificialRead(new Cigar(cigarElements_softClipStart)); 165 166 List<CigarElement> cigarElements_hardClipStart = new LinkedList<>(); 167 cigarElements_hardClipStart.add(new CigarElement(17, CigarOperator.H)); 168 cigarElements_hardClipStart.add(new CigarElement(134, CigarOperator.M)); 169 hardClipStart = ArtificialReadUtils.createArtificialRead(new Cigar(cigarElements_hardClipStart)); 170 171 172 final byte [] bases_lowQualTail = {'A', 'C', 'T', 'G', 'A', 'A', 'A', 'A', 'A', 'A'}; 173 final byte [] quals_lowQualTail = {30, 15, 25, 30, 2, 2, 2, 2, 2, 2}; 174 lowQualTail = ArtificialReadUtils.createArtificialRead(bases_lowQualTail, quals_lowQualTail, "10M"); 175 176 final byte [] bases_lowQualClippedTail = {'A', 'C', 'T', 'G', 'A', 'A', 'A', 'A', 'A', 'A'}; 177 final byte [] quals_lowQualClippedTail = {30, 15, 25, 30, 2, 2, 2, 2, 2, 2}; 178 lowQualClippedTail = ArtificialReadUtils.createArtificialRead(bases_lowQualClippedTail, quals_lowQualClippedTail, "8M2S"); 179 180 final byte [] bases_lowQualStart = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'T', 'G'}; 181 final byte [] quals_lowQualStart = {2, 2, 2, 2, 2, 2, 30, 15, 25, 30}; 182 lowQualStart = ArtificialReadUtils.createArtificialRead(bases_lowQualStart, quals_lowQualStart, "10M"); 183 184 final byte [] bases_lowQualBothEnds = {'A', 'A', 'A', 'A', 'A', 'A', 'G', 'C', 'T', 'G', 'A', 'A', 'A', 'A', 'A', 'A'}; 185 final byte [] quals_lowQualBothEnds = { 2, 2, 2, 2, 2, 2, 30, 15, 25, 30, 2, 2, 2, 2, 2, 2}; 186 lowQualBothEnds = ArtificialReadUtils.createArtificialRead(bases_lowQualBothEnds, quals_lowQualBothEnds, "2S12M2S"); 187 } 188 189 @Test testLeadingInsertion()190 public void testLeadingInsertion(){ 191 final int start = 100; 192 final VariantContext vc = new VariantContextBuilder().alleles(Arrays.asList(REF, ALT_INSERTION)).chr(CONTIG).start(start-1).stop(start-1).make(); 193 194 final Cigar cigar = TextCigarCodec.decode("10I10M"); 195 final GATKRead read = ArtificialReadUtils.createArtificialRead(cigar); 196 197 read.setPosition("CONTIG", start); 198 199 Assert.assertEquals(ReadPosRankSumTest.getReadPosition(read, vc).getAsDouble(), 0.0); 200 } 201 202 @Test testSNPBetweenTwoDeletions()203 public void testSNPBetweenTwoDeletions(){ 204 final Cigar cigar = TextCigarCodec.decode("10M10D1M10D10M"); 205 final GATKRead read = ArtificialReadUtils.createArtificialRead(cigar); 206 final int start = 100; 207 read.setPosition("CONTIG", start); 208 209 Assert.assertEquals(ReadPosRankSumTest.getReadPosition(read, makeVC(start + 20)).getAsDouble(), 10.0); 210 Assert.assertEquals(ReadPosRankSumTest.getReadPosition(read, makeVC(start + 19)).getAsDouble(), 10.0); 211 } 212 } 213