1 package org.broadinstitute.hellbender.tools.walkers.readorientation; 2 3 import htsjdk.variant.variantcontext.VariantContext; 4 import org.apache.commons.lang3.tuple.ImmutableTriple; 5 import org.apache.commons.lang3.tuple.Triple; 6 import org.broadinstitute.hellbender.CommandLineProgramTest; 7 import org.broadinstitute.hellbender.GATKBaseTest; 8 import org.broadinstitute.hellbender.Main; 9 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; 10 import org.broadinstitute.hellbender.engine.FeatureDataSource; 11 import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; 12 import org.broadinstitute.hellbender.tools.walkers.SplitIntervals; 13 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection; 14 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.FilterMutectCalls; 15 import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2; 16 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.M2FiltersArgumentCollection; 17 import org.broadinstitute.hellbender.utils.SimpleInterval; 18 import org.broadinstitute.hellbender.utils.io.IOUtils; 19 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; 20 import org.testng.Assert; 21 import org.testng.annotations.DataProvider; 22 import org.testng.annotations.Test; 23 24 import java.io.File; 25 import java.io.IOException; 26 import java.nio.file.Files; 27 import java.nio.file.Paths; 28 import java.util.*; 29 import java.util.stream.Collectors; 30 import java.util.stream.IntStream; 31 import java.util.stream.StreamSupport; 32 33 public class LearnReadOrientationModelIntegrationTest extends CommandLineProgramTest { 34 public static final String testDir = toolsTestDir + "read_orientation_filter/"; 35 public static final String hapmapBamSnippet = testDir + "hapmap-20-plex-chr-20-21-read-orientation.bam"; 36 public static final String intervalList = testDir + "hapmap-20-plex-chr-20-21-read-orientation.interval_list"; 37 38 @DataProvider(name = "scatterCounts") getScatterCounts()39 public Object[][] getScatterCounts(){ 40 return new Object[][]{{1}, {5}}; 41 } 42 /** 43 * Test that the tool sites of orientation bias that are manually picked out. 44 * Also tests scattering CollectF1R2Counts 45 */ 46 @Test(dataProvider = "scatterCounts") testOnRealBam(final int scatterCount)47 public void testOnRealBam(final int scatterCount) throws IOException { 48 final File scatteredDir = createTempDir("scattered"); 49 50 // Step 1: SplitIntervals 51 final File intervalDir = createTempDir("intervals"); 52 53 final ArgumentsBuilder splitIntervalsArgs = new ArgumentsBuilder() 54 .addReference(b37Reference) 55 .addIntervals(new File(intervalList)) 56 .addOutput(intervalDir) 57 .add(SplitIntervals.SCATTER_COUNT_SHORT_NAME, scatterCount); 58 59 runCommandLine(splitIntervalsArgs, SplitIntervals.class.getSimpleName()); 60 61 // Step 2: CollectF1R2Counts 62 final File[] intervals = intervalDir.listFiles(); 63 final List<File> extractedDirs = IntStream.range(0, intervals.length).mapToObj(i -> createTempDir("extracted_" + i)).collect(Collectors.toList()); 64 final List<File> scatteredTarGzs = IntStream.range(0, intervals.length).mapToObj(i -> new File(scatteredDir, "scatter_" + i + ".tar.gz")).collect(Collectors.toList()); 65 for (int i = 0; i < intervals.length; i++){ 66 67 final ArgumentsBuilder collectF1R2CountsArgs = new ArgumentsBuilder() 68 .addReference(b37Reference) 69 .addInput(new File(hapmapBamSnippet)) 70 .addIntervals(intervals[i]) 71 .addOutput(scatteredTarGzs.get(i)); 72 73 runCommandLine(collectF1R2CountsArgs, CollectF1R2Counts.class.getSimpleName()); 74 IOUtils.extractTarGz(scatteredTarGzs.get(i).toPath(), extractedDirs.get(i).toPath()); 75 final File refHist = F1R2CountsCollector.getRefHistogramsFromExtractedTar(extractedDirs.get(i)).get(0); 76 77 // Ensure that we print every bin, even when the count is 0 78 final int lineCount = (int) Files.lines(Paths.get(refHist.getAbsolutePath())).filter(l -> l.matches("^[0-9].+")).count(); 79 Assert.assertEquals(lineCount, F1R2FilterConstants.DEFAULT_MAX_DEPTH); 80 } 81 82 // Step 3: LearnReadOrientationModel 83 final File priorTarGz = createTempFile("prior", ".tar.gz"); 84 final ArgumentsBuilder args = new ArgumentsBuilder() 85 .addOutput(priorTarGz); 86 IntStream.range(0, intervals.length).forEach(n -> args.addInput(scatteredTarGzs.get(n))); 87 88 runCommandLine(args.getArgsList(), LearnReadOrientationModel.class.getSimpleName()); 89 90 final File extractedPriorDir = createTempDir("extracted_priors"); 91 IOUtils.extractTarGz(priorTarGz.toPath(), extractedPriorDir.toPath()); 92 93 94 final ArtifactPriorCollection artifactPriorCollection = ArtifactPriorCollection.readArtifactPriors(extractedPriorDir.listFiles()[0]); 95 96 // Step 4: Mutect 2 97 final File unfilteredVcf = GATKBaseTest.createTempFile("unfiltered", ".vcf"); 98 final File filteredVcf = GATKBaseTest.createTempFile("filtered", ".vcf"); 99 final File bamout = GATKBaseTest.createTempFile("SM-CEMAH", ".bam"); 100 101 final ArgumentsBuilder mutect2Args = new ArgumentsBuilder() 102 .addReference(b37Reference) 103 .addInput(new File(hapmapBamSnippet)) 104 .addOutput(unfilteredVcf) 105 .add(AssemblyBasedCallerArgumentCollection.BAM_OUTPUT_LONG_NAME, bamout); 106 runCommandLine(mutect2Args, Mutect2.class.getSimpleName()); 107 108 final ArgumentsBuilder filterArgs = new ArgumentsBuilder() 109 .addReference(b37Reference) 110 .addVCF(unfilteredVcf) 111 .add(M2FiltersArgumentCollection.ARTIFACT_PRIOR_TABLE_NAME, priorTarGz) 112 .addOutput(filteredVcf); 113 114 runCommandLine(filterArgs, FilterMutectCalls.class.getSimpleName()); 115 116 // These artifacts have been verified manually 117 // The pair is of type (Position, Expected Source of Prior Probability) 118 // Prior for e.g. TGA->A F2R1 should come from TCA->T F1R2 119 final List<Triple<Integer, ReadOrientation, ArtifactState>> knownArtifacts = Arrays.asList( 120 new ImmutableTriple<>(23421079, ReadOrientation.F2R1, ArtifactState.F2R1_G), // CAC->G F2R1 121 new ImmutableTriple<>(34144749, ReadOrientation.F2R1, ArtifactState.F2R1_A), // TGA->A F2R1, ref context is not canonical 122 new ImmutableTriple<>(62165528, ReadOrientation.F1R2, ArtifactState.F1R2_A)); // CGC->A F1R2 123 124 for (final Triple<Integer, ReadOrientation, ArtifactState> artifact : knownArtifacts) { 125 final int position = artifact.getLeft(); 126 127 Optional<VariantContext> variant = StreamSupport.stream(new FeatureDataSource<VariantContext>(filteredVcf).spliterator(), false) 128 .filter(vc -> vc.getStart() == position).findFirst(); 129 Assert.assertTrue(variant.isPresent()); 130 131 // Check that the expected filters were applied 132 Assert.assertTrue(variant.get().getFilters().contains(GATKVCFConstants.READ_ORIENTATION_ARTIFACT_FILTER_NAME)); 133 } 134 } 135 136 @Test testTwoSamples()137 public void testTwoSamples() throws Exception { 138 final File countsTarGz = createTempFile("counts", ".tar.gz"); 139 final File priorsTarGz = createTempFile("priors", ".tar.gz"); 140 final String sample1 = "SAMPLE1"; 141 final String sample2 = "SAMPLE2"; 142 final File sam1 = CollectF1R2CountsIntegrationTest.createSyntheticSam(10, 1, sample1); 143 final File sam2 = CollectF1R2CountsIntegrationTest.createSyntheticSam(20, 2, sample2); 144 145 146 new Main().instanceMain(makeCommandLineArgs(Arrays.asList( 147 "-R", hg19_chr1_1M_Reference, "-I", sam1.getAbsolutePath(), "-I", sam2.getAbsolutePath(), "-O", countsTarGz.getAbsolutePath()), 148 CollectF1R2Counts.class.getSimpleName())); 149 150 final ArgumentsBuilder args = new ArgumentsBuilder() 151 .add(StandardArgumentDefinitions.INPUT_LONG_NAME, countsTarGz.getAbsolutePath()) 152 .add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, priorsTarGz.getAbsolutePath()); 153 154 runCommandLine(args, LearnReadOrientationModel.class.getSimpleName()); 155 156 final File extractedPriorsDir = createTempDir("extracted"); 157 IOUtils.extractTarGz(priorsTarGz.toPath(), extractedPriorsDir.toPath()); 158 159 Assert.assertTrue(new File(extractedPriorsDir, sample1 + LearnReadOrientationModel.ARTIFACT_PRIOR_EXTENSION).exists()); 160 Assert.assertTrue(new File(extractedPriorsDir, sample2 + LearnReadOrientationModel.ARTIFACT_PRIOR_EXTENSION).exists()); 161 } 162 163 // make sure that nothing goes wrong if the target territory is so small that no data exists for some contexts 164 @Test testFewSites()165 public void testFewSites() throws IOException { 166 // Step 1: CollectF1R2Counts 167 final File extractedDir = createTempDir("extracted"); 168 final File scatteredTarGz = createTempFile("counts", ".tar.gz"); 169 170 final ArgumentsBuilder collectF1R2Args = new ArgumentsBuilder() 171 .addReference(b37Reference) 172 .addInput(new File(hapmapBamSnippet)) 173 .addInterval(new SimpleInterval("20:10000-10001")) 174 .addOutput(scatteredTarGz); 175 176 runCommandLine(collectF1R2Args, CollectF1R2Counts.class.getSimpleName()); 177 178 IOUtils.extractTarGz(scatteredTarGz.toPath(), extractedDir.toPath()); 179 180 // Step 2: LearnReadOrientationModel 181 final File priorTarGz = createTempFile("prior", ".tar.gz"); 182 final ArgumentsBuilder args = new ArgumentsBuilder() 183 .addInput(scatteredTarGz) 184 .addOutput(priorTarGz); 185 186 runCommandLine(args, LearnReadOrientationModel.class.getSimpleName()); 187 188 final File extractedPriorDir = createTempDir("extracted_priors"); 189 IOUtils.extractTarGz(priorTarGz.toPath(), extractedPriorDir.toPath()); 190 191 final ArtifactPriorCollection artifactPriorCollection = ArtifactPriorCollection.readArtifactPriors(extractedPriorDir.listFiles()[0]); 192 193 // Step 4: Mutect 2 194 final File unfilteredVcf = GATKBaseTest.createTempFile("unfiltered", ".vcf"); 195 final File filteredVcf = GATKBaseTest.createTempFile("filtered", ".vcf"); 196 final File bamout = GATKBaseTest.createTempFile("SM-CEMAH", ".bam"); 197 198 199 final ArgumentsBuilder mutect2Args = new ArgumentsBuilder() 200 .addReference(b37Reference) 201 .addInput(new File(hapmapBamSnippet)) 202 .addInterval(new SimpleInterval("20:24000000-26000000")) 203 .add(AssemblyBasedCallerArgumentCollection.BAM_OUTPUT_LONG_NAME, bamout) 204 .addOutput(unfilteredVcf); 205 runCommandLine(mutect2Args, Mutect2.class.getSimpleName()); 206 207 final ArgumentsBuilder filterArgs = new ArgumentsBuilder() 208 .addReference(b37Reference) 209 .addVCF(unfilteredVcf) 210 .add(M2FiltersArgumentCollection.ARTIFACT_PRIOR_TABLE_NAME, priorTarGz) 211 .addOutput(filteredVcf); 212 213 runCommandLine(filterArgs, FilterMutectCalls.class.getSimpleName()); 214 } 215 }