1 package org.broadinstitute.hellbender.utils.activityprofile; 2 3 import htsjdk.samtools.SAMFileHeader; 4 import htsjdk.samtools.reference.ReferenceSequenceFile; 5 import org.broadinstitute.hellbender.GATKBaseTest; 6 import org.broadinstitute.hellbender.engine.AssemblyRegion; 7 import org.broadinstitute.hellbender.utils.*; 8 import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile; 9 import org.broadinstitute.hellbender.utils.io.IOUtils; 10 import org.testng.Assert; 11 import org.testng.annotations.BeforeClass; 12 import org.testng.annotations.DataProvider; 13 import org.testng.annotations.Test; 14 15 import java.io.IOException; 16 import java.util.*; 17 18 public class ActivityProfileUnitTest extends GATKBaseTest { 19 private GenomeLocParser genomeLocParser; 20 private GenomeLoc startLoc; 21 private SAMFileHeader header; 22 23 private final static int MAX_PROB_PROPAGATION_DISTANCE = 50; 24 private final static double ACTIVE_PROB_THRESHOLD= 0.002; 25 26 @BeforeClass init()27 public void init() throws IOException { 28 // sequence 29 try(final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(IOUtils.getPath(b37_reference_20_21))) { 30 genomeLocParser = new GenomeLocParser(seq); 31 startLoc = genomeLocParser.createGenomeLoc("20", 0, 1, 100); 32 header = new SAMFileHeader(); 33 seq.getSequenceDictionary().getSequences().forEach(sequence -> header.addSequence(sequence)); 34 } 35 } 36 37 // -------------------------------------------------------------------------------- 38 // 39 // Basic tests Provider 40 // 41 // -------------------------------------------------------------------------------- 42 43 private class BasicActivityProfileTestProvider extends TestDataProvider { 44 List<Double> probs; 45 List<AssemblyRegion> expectedRegions; 46 int extension = 0; 47 GenomeLoc regionStart = startLoc; 48 final ProfileType type; 49 BasicActivityProfileTestProvider(final ProfileType type, final List<Double> probs, boolean startActive, int ... startsAndStops)50 public BasicActivityProfileTestProvider(final ProfileType type, final List<Double> probs, boolean startActive, int ... startsAndStops) { 51 super(BasicActivityProfileTestProvider.class); 52 this.type = type; 53 this.probs = probs; 54 this.expectedRegions = toRegions(startActive, startsAndStops); 55 setName(getName()); 56 } 57 getName()58 private String getName() { 59 return String.format("type=%s probs=%s expectedRegions=%s", type, Utils.join(",", probs), Utils.join(",", expectedRegions)); 60 } 61 makeProfile()62 public ActivityProfile makeProfile() { 63 switch ( type ) { 64 case Base: return new ActivityProfile(MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, header); 65 case BandPass: 66 // zero size => equivalent to ActivityProfile 67 return new BandPassActivityProfile(MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, 0, 0.01, false, header); 68 default: throw new IllegalStateException(type.toString()); 69 } 70 } 71 toRegions(boolean isActive, int[] startsAndStops)72 private List<AssemblyRegion> toRegions(boolean isActive, int[] startsAndStops) { 73 List<AssemblyRegion> l = new ArrayList<>(); 74 for ( int i = 0; i < startsAndStops.length - 1; i++) { 75 int start = regionStart.getStart() + startsAndStops[i]; 76 int end = regionStart.getStart() + startsAndStops[i+1] - 1; 77 GenomeLoc activeLoc = genomeLocParser.createGenomeLoc(regionStart.getContig(), start, end); 78 AssemblyRegion r = new AssemblyRegion(new SimpleInterval(activeLoc), isActive, extension, header); 79 l.add(r); 80 isActive = ! isActive; 81 } 82 return l; 83 } 84 } 85 86 private enum ProfileType { 87 Base, BandPass 88 } 89 90 @DataProvider(name = "BasicActivityProfileTestProvider") makeQualIntervalTestProvider()91 public Object[][] makeQualIntervalTestProvider() { 92 for ( final ProfileType type : ProfileType.values() ) { 93 new BasicActivityProfileTestProvider(type, Arrays.asList(1.0), true, 0, 1); 94 new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0), true, 0, 1, 2); 95 new BasicActivityProfileTestProvider(type, Arrays.asList(0.0, 1.0), false, 0, 1, 2); 96 new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3); 97 new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 1.0, 1.0), true, 0, 3); 98 } 99 100 return BasicActivityProfileTestProvider.getTests(BasicActivityProfileTestProvider.class); 101 } 102 103 @Test(dataProvider = "BasicActivityProfileTestProvider") testBasicActivityProfile(BasicActivityProfileTestProvider cfg)104 public void testBasicActivityProfile(BasicActivityProfileTestProvider cfg) { 105 ActivityProfile profile = cfg.makeProfile(); 106 107 Assert.assertTrue(profile.isEmpty()); 108 109 for ( int i = 0; i < cfg.probs.size(); i++ ) { 110 double p = cfg.probs.get(i); 111 GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i); 112 profile.add(new ActivityProfileState(new SimpleInterval(loc), p)); 113 Assert.assertFalse(profile.isEmpty(), "Profile shouldn't be empty after adding a state"); 114 } 115 Assert.assertEquals(genomeLocParser.createGenomeLoc(profile.regionStartLoc), genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ), "Start loc should be the start of the region"); 116 117 Assert.assertEquals(profile.size(), cfg.probs.size(), "Should have exactly the number of states we expected to add"); 118 assertProbsAreEqual(profile.stateList, cfg.probs); 119 120 // TODO -- reanble tests 121 //assertRegionsAreEqual(profile.createActiveRegions(0, 100), cfg.expectedRegions); 122 } 123 assertProbsAreEqual(List<ActivityProfileState> actual, List<Double> expected)124 private void assertProbsAreEqual(List<ActivityProfileState> actual, List<Double> expected) { 125 Assert.assertEquals(actual.size(), expected.size()); 126 for ( int i = 0; i < actual.size(); i++ ) { 127 Assert.assertEquals(actual.get(i).isActiveProb(), (double) expected.get(i)); 128 } 129 } 130 131 // ------------------------------------------------------------------------------------- 132 // 133 // Hardcore tests for adding to the profile and constructing active regions 134 // 135 // ------------------------------------------------------------------------------------- 136 137 private static class SizeToStringList<T> extends ArrayList<T> { 138 private static final long serialVersionUID = 1L; 139 toString()140 @Override public String toString() { return "List[" + size() + "]"; } 141 } 142 143 @DataProvider(name = "RegionCreationTests") makeRegionCreationTests()144 public Object[][] makeRegionCreationTests() { 145 final List<Object[]> tests = new LinkedList<>(); 146 147 final int contigLength = genomeLocParser.getSequenceDictionary().getSequences().get(0).getSequenceLength(); 148 for ( int start : Arrays.asList(1, 10, 100, contigLength - 100, contigLength - 10) ) { 149 for ( int regionSize : Arrays.asList(1, 10, 100, 1000, 10000) ) { 150 for ( int maxRegionSize : Arrays.asList(10, 50, 200) ) { 151 for ( final boolean waitUntilEnd : Arrays.asList(false, true) ) { 152 for ( final boolean forceConversion : Arrays.asList(false, true) ) { 153 // what do I really want to test here? I'd like to test a few cases: 154 // -- region is all active (1.0) 155 // -- region is all inactive (0.0) 156 // -- cut the interval into 1, 2, 3, 4, 5 ... 10 regions, each with alternating activity values 157 for ( final boolean startWithActive : Arrays.asList(true, false) ) { 158 for ( int nParts : Arrays.asList(1, 2, 3, 4, 5, 7, 10, 11, 13) ) { 159 160 // for ( int start : Arrays.asList(1) ) { 161 // for ( int regionSize : Arrays.asList(100) ) { 162 // for ( int maxRegionSize : Arrays.asList(10) ) { 163 // for ( final boolean waitUntilEnd : Arrays.asList(true) ) { 164 // for ( final boolean forceConversion : Arrays.asList(false) ) { 165 // for ( final boolean startWithActive : Arrays.asList(true) ) { 166 // for ( int nParts : Arrays.asList(3) ) { 167 regionSize = Math.min(regionSize, contigLength - start); 168 final List<Boolean> regions = makeRegions(regionSize, startWithActive, nParts); 169 tests.add(new Object[]{ start, regions, maxRegionSize, nParts, forceConversion, waitUntilEnd }); 170 } 171 } 172 } 173 } 174 } 175 } 176 } 177 178 return tests.toArray(new Object[][]{}); 179 } 180 makeRegions(final int totalRegionSize, final boolean startWithActive, final int nParts)181 private List<Boolean> makeRegions(final int totalRegionSize, 182 final boolean startWithActive, 183 final int nParts) { 184 final List<Boolean> regions = new SizeToStringList<Boolean>(); 185 186 boolean isActive = startWithActive; 187 final int activeRegionSize = Math.max(totalRegionSize / nParts, 1); 188 for ( int i = 0; i < totalRegionSize; i += activeRegionSize ) { 189 for ( int j = 0; j < activeRegionSize && j + i < totalRegionSize; j++ ) { 190 regions.add(isActive); 191 } 192 isActive = ! isActive; 193 } 194 195 return regions; 196 } 197 198 199 @Test(dataProvider = "RegionCreationTests") testRegionCreation(final int start, final List<Boolean> probs, int maxRegionSize, final int nParts, final boolean forceConversion, final boolean waitUntilEnd)200 public void testRegionCreation(final int start, final List<Boolean> probs, int maxRegionSize, final int nParts, final boolean forceConversion, final boolean waitUntilEnd) { 201 final ActivityProfile profile = new ActivityProfile(MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, header); 202 Assert.assertNotNull(profile.toString()); 203 204 final String contig = genomeLocParser.getSequenceDictionary().getSequences().get(0).getSequenceName(); 205 final List<Boolean> seenSites = new ArrayList<Boolean>(Collections.nCopies(probs.size(), false)); 206 AssemblyRegion lastRegion = null; 207 for ( int i = 0; i < probs.size(); i++ ) { 208 final boolean isActive = probs.get(i); 209 final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start); 210 final ActivityProfileState state = new ActivityProfileState(new SimpleInterval(loc), isActive ? 1.0 : 0.0); 211 profile.add(state); 212 Assert.assertNotNull(profile.toString()); 213 214 if ( ! waitUntilEnd ) { 215 final List<AssemblyRegion> regions = profile.popReadyAssemblyRegions(0, 1, maxRegionSize, false); 216 lastRegion = assertGoodRegions(start, regions, maxRegionSize, lastRegion, probs, seenSites); 217 } 218 } 219 220 if ( waitUntilEnd || forceConversion ) { 221 final List<AssemblyRegion> regions = profile.popReadyAssemblyRegions(0, 1, maxRegionSize, forceConversion); 222 lastRegion = assertGoodRegions(start, regions, maxRegionSize, lastRegion, probs, seenSites); 223 } 224 225 for ( int i = 0; i < probs.size(); i++ ) { 226 if ( forceConversion || (i + maxRegionSize + profile.getMaxProbPropagationDistance() < probs.size())) 227 // only require a site to be seen if we are forcing conversion or the site is more than maxRegionSize from the end 228 Assert.assertTrue(seenSites.get(i), "Missed site " + i); 229 } 230 231 Assert.assertNotNull(profile.toString()); 232 } 233 assertGoodRegions(final int start, final List<AssemblyRegion> regions, final int maxRegionSize, AssemblyRegion lastRegion, final List<Boolean> probs, final List<Boolean> seenSites)234 private AssemblyRegion assertGoodRegions(final int start, final List<AssemblyRegion> regions, final int maxRegionSize, AssemblyRegion lastRegion, final List<Boolean> probs, final List<Boolean> seenSites) { 235 for ( final AssemblyRegion region : regions ) { 236 Assert.assertTrue(region.getSpan().size() > 0, "Region " + region + " has a bad size"); 237 Assert.assertTrue(region.getSpan().size() <= maxRegionSize, "Region " + region + " has a bad size: it's big than the max region size " + maxRegionSize); 238 if ( lastRegion != null ) { 239 Assert.assertTrue(region.getSpan().getStart() == lastRegion.getSpan().getEnd() + 1, "Region " + region + " doesn't start immediately after previous region" + lastRegion); 240 } 241 242 // check that all active bases are actually active 243 final int regionOffset = region.getSpan().getStart() - start; 244 Assert.assertTrue(regionOffset >= 0 && regionOffset < probs.size(), "Region " + region + " has a bad offset w.r.t. start"); 245 for ( int j = 0; j < region.getSpan().size(); j++ ) { 246 final int siteOffset = j + regionOffset; 247 Assert.assertEquals(region.isActive(), probs.get(siteOffset).booleanValue()); 248 Assert.assertFalse(seenSites.get(siteOffset), "Site " + j + " in " + region + " was seen already"); 249 seenSites.set(siteOffset, true); 250 } 251 252 lastRegion = region; 253 } 254 255 return lastRegion; 256 } 257 258 // ------------------------------------------------------------------------------------- 259 // 260 // Hardcore tests for adding to the profile and constructing active regions 261 // 262 // ------------------------------------------------------------------------------------- 263 264 @DataProvider(name = "SoftClipsTest") makeSoftClipsTest()265 public Object[][] makeSoftClipsTest() { 266 final List<Object[]> tests = new LinkedList<Object[]>(); 267 268 final int contigLength = genomeLocParser.getSequenceDictionary().getSequences().get(0).getSequenceLength(); 269 for ( int start : Arrays.asList(1, 10, 100, contigLength - 100, contigLength - 10, contigLength - 1) ) { 270 for ( int precedingSites: Arrays.asList(0, 1, 10) ) { 271 if ( precedingSites + start < contigLength ) { 272 for ( int softClipSize : Arrays.asList(1, 2, 10, 100) ) { 273 // for ( int start : Arrays.asList(10) ) { 274 // for ( int precedingSites: Arrays.asList(10) ) { 275 // for ( int softClipSize : Arrays.asList(1) ) { 276 tests.add(new Object[]{ start, precedingSites, softClipSize }); 277 } 278 } 279 } 280 } 281 282 return tests.toArray(new Object[][]{}); 283 } 284 285 @Test(dataProvider = "SoftClipsTest") testSoftClips(final int start, int nPrecedingSites, final int softClipSize)286 public void testSoftClips(final int start, int nPrecedingSites, final int softClipSize) { 287 final ActivityProfile profile = new ActivityProfile(MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, header); 288 289 final int contigLength = genomeLocParser.getSequenceDictionary().getSequences().get(0).getSequenceLength(); 290 final String contig = genomeLocParser.getSequenceDictionary().getSequences().get(0).getSequenceName(); 291 for ( int i = 0; i < nPrecedingSites; i++ ) { 292 final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start); 293 final ActivityProfileState state = new ActivityProfileState(new SimpleInterval(loc), 0.0); 294 profile.add(state); 295 } 296 297 final GenomeLoc softClipLoc = genomeLocParser.createGenomeLoc(contig, nPrecedingSites + start); 298 profile.add(new ActivityProfileState(new SimpleInterval(softClipLoc), 1.0, ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS, softClipSize)); 299 300 final int actualNumOfSoftClips = Math.min(softClipSize, profile.getMaxProbPropagationDistance()); 301 if ( nPrecedingSites == 0 ) { 302 final int profileSize = Math.min(start + actualNumOfSoftClips, contigLength) - start + 1; 303 Assert.assertEquals(profile.size(), profileSize, "Wrong number of states in the profile"); 304 } 305 306 for ( int i = 0; i < profile.size(); i++ ) { 307 final ActivityProfileState state = profile.getStateList().get(i); 308 final boolean withinSCRange = genomeLocParser.createGenomeLoc(state.getLoc()).distance(softClipLoc) <= actualNumOfSoftClips; 309 if ( withinSCRange ) { 310 Assert.assertTrue(state.isActiveProb() > 0.0, "active prob should be changed within soft clip size"); 311 } else { 312 Assert.assertEquals(state.isActiveProb(), 0.0, "active prob shouldn't be changed outside of clip size"); 313 } 314 } 315 } 316 317 // ------------------------------------------------------------------------------------- 318 // 319 // Tests to ensure we cut large active regions in the right place 320 // 321 // ------------------------------------------------------------------------------------- 322 addProb(final List<Double> l, final double v)323 private void addProb(final List<Double> l, final double v) { 324 l.add(v); 325 } 326 327 @DataProvider(name = "ActiveRegionCutTests") makeActiveRegionCutTests()328 public Object[][] makeActiveRegionCutTests() { 329 final List<Object[]> tests = new LinkedList<Object[]>(); 330 331 // for ( final int activeRegionSize : Arrays.asList(30) ) { 332 // for ( final int minRegionSize : Arrays.asList(5) ) { 333 for ( final int activeRegionSize : Arrays.asList(10, 12, 20, 30, 40) ) { 334 for ( final int minRegionSize : Arrays.asList(1, 5, 10) ) { 335 final int maxRegionSize = activeRegionSize * 2 / 3; 336 if ( minRegionSize >= maxRegionSize ) continue; 337 { // test flat activity profile 338 final List<Double> probs = Collections.nCopies(activeRegionSize, 1.0); 339 tests.add(new Object[]{minRegionSize, maxRegionSize, maxRegionSize, probs}); 340 } 341 342 { // test point profile is properly handled 343 for ( int end = 1; end < activeRegionSize; end++ ) { 344 final List<Double> probs = Collections.nCopies(end, 1.0); 345 tests.add(new Object[]{minRegionSize, maxRegionSize, Math.min(end, maxRegionSize), probs}); 346 } 347 } 348 349 { // test increasing activity profile 350 final List<Double> probs = new ArrayList<Double>(activeRegionSize); 351 for ( int i = 0; i < activeRegionSize; i++ ) { 352 addProb(probs, (1.0*(i+1))/ activeRegionSize); 353 } 354 tests.add(new Object[]{minRegionSize, maxRegionSize, maxRegionSize, probs}); 355 } 356 357 { // test decreasing activity profile 358 final List<Double> probs = new ArrayList<Double>(activeRegionSize); 359 for ( int i = 0; i < activeRegionSize; i++ ) { 360 addProb(probs, 1 - (1.0*(i+1))/ activeRegionSize); 361 } 362 tests.add(new Object[]{minRegionSize, maxRegionSize, maxRegionSize, probs}); 363 } 364 365 { // test two peaks 366 // for ( final double rootSigma : Arrays.asList(2.0) ) { 367 // int maxPeak1 = 9; { 368 // int maxPeak2 = 16; { 369 for ( final double rootSigma : Arrays.asList(1.0, 2.0, 3.0) ) { 370 for ( int maxPeak1 = 0; maxPeak1 < activeRegionSize / 2; maxPeak1++ ) { 371 for ( int maxPeak2 = activeRegionSize / 2 + 1; maxPeak2 < activeRegionSize; maxPeak2++ ) { 372 final double[] gauss1 = makeGaussian(maxPeak1, activeRegionSize, rootSigma); 373 final double[] gauss2 = makeGaussian(maxPeak2, activeRegionSize, rootSigma+1); 374 final List<Double> probs = new ArrayList<Double>(activeRegionSize); 375 for ( int i = 0; i < activeRegionSize; i++ ) { 376 addProb(probs, gauss1[i] + gauss2[i]); 377 } 378 final int cutSite = findCutSiteForTwoMaxPeaks(probs, minRegionSize); 379 if ( cutSite != -1 && cutSite < maxRegionSize ) 380 tests.add(new Object[]{minRegionSize, maxRegionSize, Math.max(cutSite, minRegionSize), probs}); 381 } 382 } 383 } 384 } 385 386 { // test that the lowest of two minima is taken 387 // looks like a bunch of 1s, 0.5, some 1.0s, 0.75, some more 1s 388 // int firstMin = 0; { 389 // int secondMin = 4; { 390 for ( int firstMin = 1; firstMin < activeRegionSize; firstMin++ ) { 391 for ( int secondMin = firstMin + 1; secondMin < activeRegionSize; secondMin++ ) { 392 final List<Double> probs = new ArrayList<Double>(Collections.nCopies(activeRegionSize, 1.0)); 393 probs.set(firstMin, 0.5); 394 probs.set(secondMin, 0.75); 395 final int expectedCut; 396 if ( firstMin + 1 < minRegionSize ) { 397 if ( firstMin == secondMin - 1 ) // edge case for non-min at minRegionSize 398 expectedCut = maxRegionSize; 399 else 400 expectedCut = secondMin + 1 > maxRegionSize ? maxRegionSize : ( secondMin + 1 < minRegionSize ? maxRegionSize : secondMin + 1); 401 } else if ( firstMin + 1 > maxRegionSize ) 402 expectedCut = maxRegionSize; 403 else { 404 expectedCut = firstMin + 1; 405 } 406 407 Math.min(firstMin + 1, maxRegionSize); 408 tests.add(new Object[]{minRegionSize, maxRegionSize, expectedCut, probs}); 409 } 410 } 411 } 412 } 413 } 414 415 return tests.toArray(new Object[][]{}); 416 } 417 makeGaussian(final int mean, final int range, final double sigma)418 private double[] makeGaussian(final int mean, final int range, final double sigma) { 419 final double[] gauss = new double[range]; 420 for( int iii = 0; iii < range; iii++ ) { 421 gauss[iii] = MathUtils.normalDistribution(mean, sigma, iii) + ACTIVE_PROB_THRESHOLD; 422 } 423 return gauss; 424 } 425 findCutSiteForTwoMaxPeaks(final List<Double> probs, final int minRegionSize)426 private int findCutSiteForTwoMaxPeaks(final List<Double> probs, final int minRegionSize) { 427 for ( int i = probs.size() - 2; i > minRegionSize; i-- ) { 428 double prev = probs.get(i - 1); 429 double next = probs.get(i + 1); 430 double cur = probs.get(i); 431 if ( cur < next && cur < prev ) 432 return i + 1; 433 } 434 435 return -1; 436 } 437 438 @Test(dataProvider = "ActiveRegionCutTests") testActiveRegionCutTests(final int minRegionSize, final int maxRegionSize, final int expectedRegionSize, final List<Double> probs)439 public void testActiveRegionCutTests(final int minRegionSize, final int maxRegionSize, final int expectedRegionSize, final List<Double> probs) { 440 final ActivityProfile profile = new ActivityProfile(MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, header); 441 442 final String contig = genomeLocParser.getSequenceDictionary().getSequences().get(0).getSequenceName(); 443 for ( int i = 0; i <= maxRegionSize + profile.getMaxProbPropagationDistance(); i++ ) { 444 final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + 1); 445 final double prob = i < probs.size() ? probs.get(i) : 0.0; 446 final ActivityProfileState state = new ActivityProfileState(new SimpleInterval(loc), prob); 447 profile.add(state); 448 } 449 450 final List<AssemblyRegion> regions = profile.popReadyAssemblyRegions(0, minRegionSize, maxRegionSize, false); 451 Assert.assertTrue(regions.size() >= 1, "Should only be one regions for this test"); 452 final AssemblyRegion region = regions.get(0); 453 Assert.assertEquals(region.getSpan().getStart(), 1, "Region should start at 1"); 454 Assert.assertEquals(region.getSpan().size(), expectedRegionSize, "Incorrect region size; cut must have been incorrect"); 455 } 456 } 457