1 package org.broadinstitute.hellbender.tools.walkers.readorientation;
2 
3 import htsjdk.variant.variantcontext.VariantContext;
4 import org.apache.commons.lang3.tuple.ImmutableTriple;
5 import org.apache.commons.lang3.tuple.Triple;
6 import org.broadinstitute.hellbender.CommandLineProgramTest;
7 import org.broadinstitute.hellbender.GATKBaseTest;
8 import org.broadinstitute.hellbender.Main;
9 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
10 import org.broadinstitute.hellbender.engine.FeatureDataSource;
11 import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
12 import org.broadinstitute.hellbender.tools.walkers.SplitIntervals;
13 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection;
14 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.FilterMutectCalls;
15 import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2;
16 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.M2FiltersArgumentCollection;
17 import org.broadinstitute.hellbender.utils.SimpleInterval;
18 import org.broadinstitute.hellbender.utils.io.IOUtils;
19 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
20 import org.testng.Assert;
21 import org.testng.annotations.DataProvider;
22 import org.testng.annotations.Test;
23 
24 import java.io.File;
25 import java.io.IOException;
26 import java.nio.file.Files;
27 import java.nio.file.Paths;
28 import java.util.*;
29 import java.util.stream.Collectors;
30 import java.util.stream.IntStream;
31 import java.util.stream.StreamSupport;
32 
33 public class LearnReadOrientationModelIntegrationTest extends CommandLineProgramTest {
34     public static final String testDir = toolsTestDir + "read_orientation_filter/";
35     public static final String hapmapBamSnippet = testDir + "hapmap-20-plex-chr-20-21-read-orientation.bam";
36     public static final String intervalList = testDir + "hapmap-20-plex-chr-20-21-read-orientation.interval_list";
37 
38     @DataProvider(name = "scatterCounts")
getScatterCounts()39     public Object[][] getScatterCounts(){
40         return new Object[][]{{1}, {5}};
41     }
42     /**
43      * Test that the tool sites of orientation bias that are manually picked out.
44      * Also tests scattering CollectF1R2Counts
45      */
46     @Test(dataProvider = "scatterCounts")
testOnRealBam(final int scatterCount)47     public void testOnRealBam(final int scatterCount) throws IOException {
48         final File scatteredDir = createTempDir("scattered");
49 
50         // Step 1: SplitIntervals
51         final File intervalDir = createTempDir("intervals");
52 
53         final ArgumentsBuilder splitIntervalsArgs = new ArgumentsBuilder()
54                 .addReference(b37Reference)
55                 .addIntervals(new File(intervalList))
56                 .addOutput(intervalDir)
57                 .add(SplitIntervals.SCATTER_COUNT_SHORT_NAME, scatterCount);
58 
59         runCommandLine(splitIntervalsArgs, SplitIntervals.class.getSimpleName());
60 
61         // Step 2: CollectF1R2Counts
62         final File[] intervals = intervalDir.listFiles();
63         final List<File> extractedDirs = IntStream.range(0, intervals.length).mapToObj(i -> createTempDir("extracted_" + i)).collect(Collectors.toList());
64         final List<File> scatteredTarGzs = IntStream.range(0, intervals.length).mapToObj(i -> new File(scatteredDir, "scatter_" + i + ".tar.gz")).collect(Collectors.toList());
65         for (int i = 0; i < intervals.length; i++){
66 
67             final ArgumentsBuilder collectF1R2CountsArgs = new ArgumentsBuilder()
68                     .addReference(b37Reference)
69                     .addInput(new File(hapmapBamSnippet))
70                     .addIntervals(intervals[i])
71                     .addOutput(scatteredTarGzs.get(i));
72 
73             runCommandLine(collectF1R2CountsArgs, CollectF1R2Counts.class.getSimpleName());
74             IOUtils.extractTarGz(scatteredTarGzs.get(i).toPath(), extractedDirs.get(i).toPath());
75             final File refHist = F1R2CountsCollector.getRefHistogramsFromExtractedTar(extractedDirs.get(i)).get(0);
76 
77             // Ensure that we print every bin, even when the count is 0
78             final int lineCount = (int) Files.lines(Paths.get(refHist.getAbsolutePath())).filter(l -> l.matches("^[0-9].+")).count();
79             Assert.assertEquals(lineCount, F1R2FilterConstants.DEFAULT_MAX_DEPTH);
80         }
81 
82         // Step 3: LearnReadOrientationModel
83         final File priorTarGz = createTempFile("prior", ".tar.gz");
84         final ArgumentsBuilder args = new ArgumentsBuilder()
85                 .addOutput(priorTarGz);
86         IntStream.range(0, intervals.length).forEach(n -> args.addInput(scatteredTarGzs.get(n)));
87 
88         runCommandLine(args.getArgsList(), LearnReadOrientationModel.class.getSimpleName());
89 
90         final File extractedPriorDir = createTempDir("extracted_priors");
91         IOUtils.extractTarGz(priorTarGz.toPath(), extractedPriorDir.toPath());
92 
93 
94         final ArtifactPriorCollection artifactPriorCollection = ArtifactPriorCollection.readArtifactPriors(extractedPriorDir.listFiles()[0]);
95 
96         // Step 4: Mutect 2
97         final File unfilteredVcf = GATKBaseTest.createTempFile("unfiltered", ".vcf");
98         final File filteredVcf = GATKBaseTest.createTempFile("filtered", ".vcf");
99         final File bamout = GATKBaseTest.createTempFile("SM-CEMAH", ".bam");
100 
101         final ArgumentsBuilder mutect2Args = new ArgumentsBuilder()
102                 .addReference(b37Reference)
103                 .addInput(new File(hapmapBamSnippet))
104                 .addOutput(unfilteredVcf)
105                 .add(AssemblyBasedCallerArgumentCollection.BAM_OUTPUT_LONG_NAME, bamout);
106         runCommandLine(mutect2Args, Mutect2.class.getSimpleName());
107 
108         final ArgumentsBuilder filterArgs = new ArgumentsBuilder()
109                 .addReference(b37Reference)
110                 .addVCF(unfilteredVcf)
111                 .add(M2FiltersArgumentCollection.ARTIFACT_PRIOR_TABLE_NAME, priorTarGz)
112                 .addOutput(filteredVcf);
113 
114         runCommandLine(filterArgs, FilterMutectCalls.class.getSimpleName());
115 
116         // These artifacts have been verified manually
117         // The pair is of type (Position, Expected Source of Prior Probability)
118         // Prior for e.g. TGA->A F2R1 should come from TCA->T F1R2
119         final List<Triple<Integer, ReadOrientation, ArtifactState>> knownArtifacts = Arrays.asList(
120                 new ImmutableTriple<>(23421079, ReadOrientation.F2R1, ArtifactState.F2R1_G), // CAC->G F2R1
121                 new ImmutableTriple<>(34144749, ReadOrientation.F2R1, ArtifactState.F2R1_A), // TGA->A F2R1, ref context is not canonical
122                 new ImmutableTriple<>(62165528, ReadOrientation.F1R2, ArtifactState.F1R2_A)); // CGC->A F1R2
123 
124         for (final Triple<Integer, ReadOrientation, ArtifactState> artifact : knownArtifacts) {
125             final int position = artifact.getLeft();
126 
127             Optional<VariantContext> variant = StreamSupport.stream(new FeatureDataSource<VariantContext>(filteredVcf).spliterator(), false)
128                     .filter(vc -> vc.getStart() == position).findFirst();
129             Assert.assertTrue(variant.isPresent());
130 
131             // Check that the expected filters were applied
132             Assert.assertTrue(variant.get().getFilters().contains(GATKVCFConstants.READ_ORIENTATION_ARTIFACT_FILTER_NAME));
133         }
134     }
135 
136     @Test
testTwoSamples()137     public void testTwoSamples() throws Exception {
138         final File countsTarGz = createTempFile("counts", ".tar.gz");
139         final File priorsTarGz = createTempFile("priors", ".tar.gz");
140         final String sample1 = "SAMPLE1";
141         final String sample2 = "SAMPLE2";
142         final File sam1 = CollectF1R2CountsIntegrationTest.createSyntheticSam(10, 1, sample1);
143         final File sam2 = CollectF1R2CountsIntegrationTest.createSyntheticSam(20, 2, sample2);
144 
145 
146         new Main().instanceMain(makeCommandLineArgs(Arrays.asList(
147                 "-R", hg19_chr1_1M_Reference, "-I", sam1.getAbsolutePath(), "-I", sam2.getAbsolutePath(), "-O", countsTarGz.getAbsolutePath()),
148                 CollectF1R2Counts.class.getSimpleName()));
149 
150         final ArgumentsBuilder args = new ArgumentsBuilder()
151                 .add(StandardArgumentDefinitions.INPUT_LONG_NAME, countsTarGz.getAbsolutePath())
152                 .add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, priorsTarGz.getAbsolutePath());
153 
154         runCommandLine(args, LearnReadOrientationModel.class.getSimpleName());
155 
156         final File extractedPriorsDir = createTempDir("extracted");
157         IOUtils.extractTarGz(priorsTarGz.toPath(), extractedPriorsDir.toPath());
158 
159         Assert.assertTrue(new File(extractedPriorsDir, sample1 + LearnReadOrientationModel.ARTIFACT_PRIOR_EXTENSION).exists());
160         Assert.assertTrue(new File(extractedPriorsDir, sample2 + LearnReadOrientationModel.ARTIFACT_PRIOR_EXTENSION).exists());
161     }
162 
163     // make sure that nothing goes wrong if the target territory is so small that no data exists for some contexts
164     @Test
testFewSites()165     public void testFewSites() throws IOException {
166         // Step 1: CollectF1R2Counts
167         final File extractedDir = createTempDir("extracted");
168         final File scatteredTarGz = createTempFile("counts", ".tar.gz");
169 
170         final ArgumentsBuilder collectF1R2Args = new ArgumentsBuilder()
171                 .addReference(b37Reference)
172                 .addInput(new File(hapmapBamSnippet))
173                 .addInterval(new SimpleInterval("20:10000-10001"))
174                 .addOutput(scatteredTarGz);
175 
176         runCommandLine(collectF1R2Args, CollectF1R2Counts.class.getSimpleName());
177 
178         IOUtils.extractTarGz(scatteredTarGz.toPath(), extractedDir.toPath());
179 
180         // Step 2: LearnReadOrientationModel
181         final File priorTarGz = createTempFile("prior", ".tar.gz");
182         final ArgumentsBuilder args = new ArgumentsBuilder()
183                 .addInput(scatteredTarGz)
184                 .addOutput(priorTarGz);
185 
186         runCommandLine(args, LearnReadOrientationModel.class.getSimpleName());
187 
188         final File extractedPriorDir = createTempDir("extracted_priors");
189         IOUtils.extractTarGz(priorTarGz.toPath(), extractedPriorDir.toPath());
190 
191         final ArtifactPriorCollection artifactPriorCollection = ArtifactPriorCollection.readArtifactPriors(extractedPriorDir.listFiles()[0]);
192 
193         // Step 4: Mutect 2
194         final File unfilteredVcf = GATKBaseTest.createTempFile("unfiltered", ".vcf");
195         final File filteredVcf = GATKBaseTest.createTempFile("filtered", ".vcf");
196         final File bamout = GATKBaseTest.createTempFile("SM-CEMAH", ".bam");
197 
198 
199         final ArgumentsBuilder mutect2Args = new ArgumentsBuilder()
200                 .addReference(b37Reference)
201                 .addInput(new File(hapmapBamSnippet))
202                 .addInterval(new SimpleInterval("20:24000000-26000000"))
203                 .add(AssemblyBasedCallerArgumentCollection.BAM_OUTPUT_LONG_NAME, bamout)
204                 .addOutput(unfilteredVcf);
205         runCommandLine(mutect2Args, Mutect2.class.getSimpleName());
206 
207         final ArgumentsBuilder filterArgs = new ArgumentsBuilder()
208                 .addReference(b37Reference)
209                 .addVCF(unfilteredVcf)
210                 .add(M2FiltersArgumentCollection.ARTIFACT_PRIOR_TABLE_NAME, priorTarGz)
211                 .addOutput(filteredVcf);
212 
213         runCommandLine(filterArgs, FilterMutectCalls.class.getSimpleName());
214     }
215 }