1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs;
2 
3 import com.google.common.base.Strings;
4 import htsjdk.samtools.Cigar;
5 import htsjdk.samtools.CigarElement;
6 import htsjdk.samtools.CigarOperator;
7 import org.broadinstitute.hellbender.GATKBaseTest;
8 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.JunctionTreeLinkedDeBruijnGraph;
9 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
10 import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
11 import org.broadinstitute.hellbender.utils.read.CigarBuilder;
12 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanJavaAligner;
13 import org.testng.Assert;
14 import org.testng.annotations.DataProvider;
15 import org.testng.annotations.Test;
16 
17 import java.util.*;
18 import java.util.stream.Collectors;
19 import java.util.stream.IntStream;
20 
21 public class JunctionTreeKBestHaplotypeFinderUnitTest extends GATKBaseTest {
22 
getBytes(final String alignment)23     public static byte[] getBytes(final String alignment) {
24         return alignment.replace("-","").getBytes();
25     }
26 
27     @DataProvider (name = "loopingReferences")
loopingReferencesRecoverablehaplotypes()28     public static Object[][] loopingReferencesRecoverablehaplotypes() {
29         return new Object[][]{
30                 new Object[]{"GACACACAGTCA", 3, 8, true}, //ACA and CAC are repeated, 9 is enough bases to span the path
31                 //TODO this fails due to new conservative edge pruning code  new Object[]{"GACACACAGTCA", 3, 5, false}, //ACA and CAC are repeated, 8 is not
32                 new Object[]{"GACACTCACGCACGG", 3, 6, true}, //CAC repeated thrice, showing that the loops are handled somewhat sanely
33                 new Object[]{"GACATCGACGG", 3, 11, false}, //GAC repeated twice, starting with GAC, can it recover the reference if it starts on a repeated kmer (should not be recoverable based on our decisions)
34                 new Object[]{"GACATCGACGG", 3, 6, false}, //With non-unique reference start we fall over for looping structures
35                 new Object[]{"GACATCGACGG", 3, 6, false}, //reads too short to be resolvable
36                 new Object[]{"GACATCACATC", 3, 8, true}, //final kmer ATC is repeated, can we recover the reference for repating final kmer
37 
38                 // Some less complicated cases
39                 new Object[]{"ACTGTGGGGGGGGGGGGCCGCG", 5, 14, true}, //Just long enough to be resolvable (12 repeated G's with 1 base anchor on either side)
40                 new Object[]{"ACTGTGGGGGGGGGGGGCCGCG", 5, 12, false}, //Just too short to be resolvable
41 
42                 new Object[]{"ACTGTTCTCTCTCTCTCCCGCG", 5, 14, true}, //Just long enough to be resolvable
43                 //TODO this fails due to new conservative edge pruning code new Object[]{"ACTGTTCTCTCTCTCTCCCGCG", 5, 9, false}, //Just too short to be resolvable
44         };
45     }
46 
47     @Test (dataProvider = "loopingReferences")
testRecoveryOfLoopingReferenceSequences(final String ref, final int kmerSize, final int readlength, final boolean resolvable)48     public void testRecoveryOfLoopingReferenceSequences(final String ref, final int kmerSize, final int readlength, final boolean resolvable) {
49         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(kmerSize);
50         assembler.addSequence("anonymous", getBytes(ref), true);
51         // Add "reads" to the graph
52         for (int i = 0; i + readlength <= ref.length(); i ++) {
53             assembler.addSequence("anonymous", Arrays.copyOfRange(getBytes(ref), i, i + readlength), false);
54         }
55         assembler.buildGraphIfNecessary();
56         assembler.generateJunctionTrees();
57 
58         final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1)
59                 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList());
60 
61         // For all of these loops we expect to recover at least the reference haplotype
62         Assert.assertTrue(bestPaths.contains(ref));
63 
64         if (resolvable) {
65             Assert.assertEquals(bestPaths.size(), 1);
66         } else {
67             Assert.assertTrue(bestPaths.size() > 1);
68         }
69     }
70 
71     @Test (enabled = false) //TODO this test needs to be reconsidered, this is the "baby thrown out with the bathwater" for ignoring reference weight edges wherever possible,
72     // todo                             this loop becomes unresolvable because there is no evidence for the path out of the loop so the chain stops expanding after a point
73     // Asserting that for a very degenerate looping case (the only weight resides in the loop (which could happen due to non-unique kmers for unmerged dangling ends)
74     // note that this particular case is recovered by virtue of the fact that the reference path itself has weight
75     // This test asserts that ref path weight is still taken into accoun
testDegenerateLoopingCase()76     public void testDegenerateLoopingCase() {
77         final String ref = "GACACTCACGCACGG";
78         final int kmerSize = 3;
79         final int readlength = 6;
80 
81         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(kmerSize);
82         assembler.addSequence("anonymous", getBytes(ref), true);
83         // Add "reads" to the graph
84         for (int i = 0; i + readlength < ref.length(); i ++) {
85             assembler.addSequence("anonymous", Arrays.copyOfRange(getBytes(ref), i, i + readlength), false);
86         }
87         assembler.buildGraphIfNecessary();
88         assembler.generateJunctionTrees();
89 
90         final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1)
91                 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList());
92 
93         // For all of these loops we expect to recover at least the reference haplotype
94         Assert.assertTrue(bestPaths.contains(ref));
95 
96         Assert.assertEquals(bestPaths.size(), 5);
97 
98     }
99 
100     @Test
testRecoveryOfAPathDroppedByJunctionTrees()101     public void testRecoveryOfAPathDroppedByJunctionTrees() {
102         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
103         String ref = "AAAACAC"+"C"+"ATGTGCGG"+"T"+"GGGTT"; // the first site has an interesting graph structure and the second site is used to ensure the graph isinterestingg
104 
105         // A simple snip het
106         String read1 = "AAAACAC"+"T"+"CTGTGCGG"+"C"+"GGGTT"; // CGAC merges with the below read
107         String read2 =                "TGTGCGG"+"A"+"GGGTT"; // doesn't show up in the first graph
108 
109         assembler.addSequence("reference", getBytes(ref), true);
110 
111         assembler.addSequence("anonymous", getBytes(read1), false);
112         assembler.addSequence("anonymous", getBytes(read1), false);
113         assembler.addSequence("anonymous", getBytes(read1), false);
114 
115         assembler.addSequence("anonymous", getBytes(read2), false);
116 
117         assembler.buildGraphIfNecessary();
118         assembler.generateJunctionTrees();
119 
120         final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1)
121                 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList());
122 
123         Assert.assertEquals(bestPaths.size(), 2);
124         Assert.assertEquals(bestPaths.get(0), read1);
125         Assert.assertEquals(bestPaths.get(1), "AAAACAC"+"T"+"C" + read2); //asserting that the front of read 1 was appended to read 2 for haplotype construction
126     }
127 
128     @Test
129     // This test documents one of the known edge cases in the pivotal edge recovery code where sometimes an edge might be
130     // dropped if there doesn't happen to be any result path that connects to the root vertex of this dorpped path
testRecoveryOfAPathDroppedByJunctionTreeFailureDueToStillBeingUnreachable()131     public void testRecoveryOfAPathDroppedByJunctionTreeFailureDueToStillBeingUnreachable() {
132         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
133         String ref = "AAAACAC"+"C"+"ATGTGCGG"+"TTTAGAGAG"+"GGGTT"; // the first site has an interesting graph structure and the second site is used to ensure the graph is interestingg
134 
135         // A simple snip het
136         String read1 = "AAAACAC"+"T"+"CTGTGCGG"+"C"+"GGGTT"; // CGAC merges with the below read
137         String read2 =                    "TAGAGTG"+"GGGTT"; // creates a branch off of the uncovered reference path
138 
139         assembler.addSequence("reference", getBytes(ref), true);
140 
141         assembler.addSequence("anonymous", getBytes(read1), false);
142         assembler.addSequence("anonymous", getBytes(read1), false);
143         assembler.addSequence("anonymous", getBytes(read1), false);
144 
145         assembler.addSequence("anonymous", getBytes(read2), false);
146 
147         assembler.buildGraphIfNecessary();
148         assembler.generateJunctionTrees();
149 
150         final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1)
151                 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList());
152 
153         Assert.assertEquals(bestPaths.size(), 1);
154         Assert.assertEquals(bestPaths.get(0), read1);
155     }
156 
157     // TODO this test is disabled because currently we have NOT implemented, so now we are simply using the score of the output path as the
158     // TODO heurisitic for stapling the front onto this path. this might be addressed in the future.
159     @Test (enabled = false)
testRecoveryOfDroppedPathChoosingMostLikePathDespiteThatPathHavingAWorseScore()160     public void testRecoveryOfDroppedPathChoosingMostLikePathDespiteThatPathHavingAWorseScore() {
161         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7);
162         String ref =            "AAAACAC"+"C"+"ATGTGCGG"+"A"+"TTTAGAGAG"+"C"+"GGGTTCC"+"A"+"GAGAGATATA"+"C"+"GAGTTTTGTTT"; // the first site has an interesting graph structure and the second site is used to ensure the graph isinterestingg
163 
164         String completePath1 =  "AAAACAC"+"T"+"ATGTGCGG"+"A"+"TTTAGAGAG"+"A"+"GGGTTCC"+"T"+"GAGAGATATA"+"C"+"GAGTTTTGTTT";
165         String completePath2 =  "AAAACAC"+"G"+"ATGTGCGG"+"A"+"TTTAGAGAG"+"A"+"GGGTTCC"+"A"+"GAGAGATATA"+"G"+"GAGTTTTGTTT";
166         String incompletePath =                "TGTGCGG"+"C"+"TTTAGAGAG"+"A"+"GGGTTCC"+"A"+"GAGAGATATA"+"G"+"GAGTTTTGTTT"; // njote that this point is a copy of path 2 after its first C variant
167 
168         // Ensure that completePath1 will have the highest score by weighting it heavily
169         assembler.addSequence("reference", getBytes(ref), true);
170 
171         assembler.addSequence("anonymous", getBytes(completePath1), false);
172         assembler.addSequence("anonymous", getBytes(completePath1), false);
173         assembler.addSequence("anonymous", getBytes(completePath1), false);
174         assembler.addSequence("anonymous", getBytes(completePath1), false);
175         assembler.addSequence("anonymous", getBytes(completePath1), false);
176 
177         // followed by lower weight complete path 2
178         assembler.addSequence("anonymous", getBytes(completePath2), false);
179         assembler.addSequence("anonymous", getBytes(completePath2), false);
180         assembler.addSequence("anonymous", getBytes(completePath2), false);
181 
182         // And the incomplete path
183         assembler.addSequence("anonymous", getBytes(incompletePath), false);
184         assembler.addSequence("anonymous", getBytes(incompletePath), false);
185 
186         assembler.buildGraphIfNecessary();
187         assembler.generateJunctionTrees();
188 
189         final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1)
190                 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList());
191 
192 
193         Assert.assertEquals(bestPaths.size(), 3);
194         Assert.assertEquals(bestPaths.get(0), completePath1);
195         Assert.assertEquals(bestPaths.get(1), completePath2);
196         Assert.assertEquals(bestPaths.get(2), "AAAACAC"+"G"+"A" + incompletePath); // asserting that path 2 (the most similar path) was used for the head of incomplete path
197     }
198 
199     @Test
200     // This test demonstrates a potential source of infinite looping, specifically the chain extension
testDegernerateLoopingDanglingEndInChainGatheringCode()201     public void testDegernerateLoopingDanglingEndInChainGatheringCode() {
202         final String ref = "GACACTCACGCACGGCC";
203         final String alt = "GACACTCACCCCCCCCC"; // the alt ends with a looping and un-escpaed string that is repetative, this will make a path that has no hope of escape (RUN!)
204         final int kmerSize = 5;
205         final int readlength = 8;
206 
207         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(kmerSize);
208         assembler.addSequence("anonymous", getBytes(ref), true);
209         // Add "reads" to the graph
210         for (int i = 0; i + readlength < ref.length(); i ++) {
211             assembler.addSequence("anonymous", Arrays.copyOfRange(getBytes(alt), i, i + readlength), false);
212         }
213         assembler.buildGraphIfNecessary();
214         assembler.generateJunctionTrees();
215         assembler.pruneJunctionTrees(1);
216 
217         final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1)
218                 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList());
219 
220         Assert.assertTrue(true, "This case could have infinitely looped because the junction trees were all uninformative and pruned on the poly-C branch. There is no condition to stop because the junction tree was removed by pruning");
221     }
222 
223     @Test
224     // Asserting that for a very degenerate looping case (the only weight resides in the loop (which could happen due to non-unique kmers for unmerged dangling ends)
225     // note that this particular case is recovered by virtue of the fact that the reference path itself has weight
testDegernerateLoopingDanglingEnd()226     public void testDegernerateLoopingDanglingEnd() {
227         final String ref = "GACACTCACGCACGGCC";
228         final String alt = "GACACTCACCCCCCCCC"; // the alt ends with a looping and un-escpaed string that is repetative, this will make a path that has no hope of escape (RUN!)
229         final int kmerSize = 5;
230         final int readlength = 8;
231 
232         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(kmerSize);
233         assembler.addSequence("anonymous", getBytes(ref), true);
234         // Add "reads" to the graph
235         for (int i = 0; i + readlength < ref.length(); i ++) {
236             assembler.addSequence("anonymous", Arrays.copyOfRange(getBytes(alt), i, i + readlength), false);
237         }
238         assembler.buildGraphIfNecessary();
239         assembler.generateJunctionTrees();
240 
241         final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1)
242                 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList());
243 
244         // For this graph structure, there is no evidence whatsoever that a path follow the reference, unfortunately this means we can currently not recover the loop in the alt
245         Assert.assertTrue(bestPaths.isEmpty());
246 
247     }
248 
249     @Test
250     // Due to there being no JT covering the reference, we assert that we are only finding the read supported alt path despite the reference path being a reasonable start path
testJunctionTreeRefusesToFollowReferencePathUnlessThereIsNoChoice()251     public void testJunctionTreeRefusesToFollowReferencePathUnlessThereIsNoChoice() {
252         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(4);
253         String ref = "AAAACAC"+"CCGA"+"ATGTGGGG"+"A"+"GGGTT"; // the first site has an interesting graph structure and the second site is used to ensure the graph isinterestingg
254 
255         // A simple snip het
256         String read1 = "AAAACAC"+"TCGA"+"CTGTGGGG"+"C"+"GGGTT"; // CGAC merges with the below read
257         String read2 = "AAAACAC"+"TCGA"+"CTGTGGGG"+"C"+"GGGTT"; // CGAC merges with the above read
258 
259         assembler.addSequence("anonymous", getBytes(ref), true);
260         assembler.addSequence("anonymous", getBytes(read1), false);
261         assembler.addSequence("anonymous", getBytes(read2), false);
262 
263         assembler.buildGraphIfNecessary();
264         assembler.generateJunctionTrees();
265 
266         final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1)
267                 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList());
268 
269         Assert.assertEquals(bestPaths.size(), 1);
270         Assert.assertEquals(new String(bestPaths.get(0).getBytes()), read1);
271     }
272 
273     // TODO this test will soon be obsolete, we will change path termination to be based on loops soon enough (disabled because the number was bumped up to 7 rather than 4 which broke this test)
274     @Test (enabled = false)
275     // This test asserts the current behavior towards junction tree ending because of insufficient junction tree data, if we are past our point of junction tree evidence we will fail
276     // to find paths, thus this test documents the limitations therein.
testPathTerminationBasedOnDistanceFromLastHaplotype()277     public void testPathTerminationBasedOnDistanceFromLastHaplotype() {
278         final JunctionTreeLinkedDeBruijnGraph assembler1 = new JunctionTreeLinkedDeBruijnGraph(6);
279         final JunctionTreeLinkedDeBruijnGraph assembler2 = new JunctionTreeLinkedDeBruijnGraph(6);
280 
281         String ref =  "AAACAC"+"C"+"ATGGCGG"+"A"+"GGAGTT"+"T"+"GCTCGAA"+"G"+"GGCGTA"+"C"+"CCTACCT"; // the first site has an interesting graph structure and the second site is used to ensure the graph isinterestingg
282         String alt1 = "AAACAC"+"A"+"ATGGCGG"+"T"+"GGAGTT"+"G"+"GCTCGAA"+"A"+"GGCGTA"+"G"+"CCTACCT";
283         String alt2 = "AAACAC"+"A"+"ATGGCGG"+"T"+"GGAGTT"+"G"+"GCTCGAA"+"A"+"GGCGTA"+"C"+"CCTACCT"; //No expansion of the last path
284 
285         assembler1.addSequence("anonymous", getBytes(ref), true);
286         assembler2.addSequence("anonymous", getBytes(ref), true);
287         // Add short sequences of reads to generate the appropriate alt paths
288         for (int i = 0; i + 6 < ref.length(); i ++) {
289             assembler1.addSequence("anonymous", Arrays.copyOfRange(getBytes(alt1), i, i + 7), false);
290             assembler2.addSequence("anonymous", Arrays.copyOfRange(getBytes(alt2), i, i + 7), false);
291         }
292 
293         assembler1.buildGraphIfNecessary();
294         assembler2.buildGraphIfNecessary();
295         assembler1.generateJunctionTrees();
296         assembler2.generateJunctionTrees();
297         // Assert that the trees are all pointless
298         assembler1.getReadThreadingJunctionTrees(false).values().forEach(tree -> Assert.assertTrue(tree.getRootNode().hasNoEvidence()));
299         assembler2.getReadThreadingJunctionTrees(false).values().forEach(tree -> Assert.assertTrue(tree.getRootNode().hasNoEvidence()));
300 
301         // We had to close out our paths because none of the junction trees are informative and there are 4 decisions to make
302         final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler1).setJunctionTreeEvidenceWeightThreshold(1)
303                 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList());
304         Assert.assertEquals(bestPaths.size(), 0);
305 
306         // We didn't have to close out our paths because there were < 4 decisions made without junction trees, thus we do recover the path
307         final List<String> bestPaths2 = new JunctionTreeKBestHaplotypeFinder<>(assembler2).setJunctionTreeEvidenceWeightThreshold(1)
308                 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList());
309         Assert.assertEquals(bestPaths2.size(), 1);
310     }
311 
312     @Test
313     // This is a test enforcing that the behavior around nodes are both outDegree > 1 while also having downstream children with inDegree > 1.
314     // We are asserting that the JunctionTree generated by this case lives on the node itself
testPerfectlyPhasedHaplotypeRecoveryRef()315     public void testPerfectlyPhasedHaplotypeRecoveryRef() {
316         int readlength = 20;
317         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7);
318         String ref = "AAACAAG"+"G"+"TTGGGTTCG"+"A"+"GCGGGGTTC"+"T"+"CTCGAAGT"+"T"+"CTTGGTAATAT"+"A"+"GGGGGCCCC"; // Reference with 5 sites all separated by at least kmer size
319         String alt1 = "AAACAAG"+"T"+"TTGGGTTCG"+"G"+"GCGGGGTTC"+"A"+"CTCGAAGT"+"C"+"CTTGGTAATAT"+"G"+"GGGGGCCCC"; // Alt with different values for all sites
320 
321         // Generate some reads that do not span the entire active region
322         assembler.addSequence("anonymous", getBytes(ref), true);
323         for (int i = 0; i + readlength < ref.length(); i++) {
324             assembler.addSequence("anonymous", getBytes(alt1.substring(i, i + readlength)), false);
325             assembler.addSequence("anonymous", getBytes(ref.substring(i, i + readlength)), false);
326         }
327 
328         assembler.buildGraphIfNecessary();
329         assembler.generateJunctionTrees();
330 
331         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
332         Assert.assertEquals(junctionTrees.size(), 10);
333 
334         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler);
335         Assert.assertEquals(finder.sources.size(), 1);
336         Assert.assertEquals(finder.sinks.size(), 1);
337         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes();
338 
339         // We assert that we found all of the haplotypes present in reads and nothing else
340         Assert.assertEquals(haplotypes.size(), 2);
341         Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet());
342         // Asserting that
343         Assert.assertTrue(foundHaplotypes.contains(alt1));
344         Assert.assertTrue(foundHaplotypes.contains(ref));
345     }
346 
347     @Test
348     // This test is intended to illustrate a problem we know causes probems for junction trees, namely that despite there being
349     // evidence for a particular path, if there is insufficient evidence we simply end up with combinatorial expansion like before
testInsufficientJunctionTreeDataCausingCombinatorialExpansion()350     public void testInsufficientJunctionTreeDataCausingCombinatorialExpansion() {
351         int readlength = 20;
352         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7);
353         String ref = "AAACAAG"+"G"+"TTGGGTTCG"+"A"+"GCGGGGTTC"+"T"+"CTCGAAGT"+"T"+"CTTGGTAATAT"+"A"+"GGGGGCCCC"; // Reference with 5 sites all separated by at least kmer size
354         String alt1 = "AAACAAG"+"T"+"TTGGGTTCG"+"G"+"GCGGGGTTC"+"A"+"CTCGAAGT"+"C"+"CTTGGTAATAT"+"G"+"GGGGGCCCC"; // Alt with different values for all sites
355 
356         // Generate some reads that do not span the entire active region
357         assembler.addSequence("anonymous", getBytes(ref), true);
358 
359         assembler.addSequence("anonymous", getBytes(alt1), false);
360         assembler.addSequence("anonymous", getBytes(ref), false);
361 
362         assembler.buildGraphIfNecessary();
363         assembler.generateJunctionTrees();
364 
365         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
366         Assert.assertEquals(junctionTrees.size(), 10);
367 
368         JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler);
369         Assert.assertEquals(finder.sources.size(), 1);
370         Assert.assertEquals(finder.sinks.size(), 1);
371         List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes();
372 
373         // We assert that we found all of the haplotypes present in reads and nothing else
374         Assert.assertEquals(haplotypes.size(), 32); // 2^5 paths from combinatorial expansion
375         Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet());
376         // Asserting that
377         Assert.assertTrue(foundHaplotypes.contains(alt1));
378         Assert.assertTrue(foundHaplotypes.contains(ref));
379 
380     }
381 
382 
383     @Test
384     // This is a test enforcing that the behavior around nodes are both outDegree > 1 while also having downstream children with inDegree > 1.
385     // We are asserting that the JunctionTree generated by this case lives on the node itself
testPerfectlyPhasedHaplotypeRecoveryTwoAlts()386     public void testPerfectlyPhasedHaplotypeRecoveryTwoAlts() {
387         int readlength = 20;
388         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7);
389         String ref = "AAACAAG"+"G"+"TTGGGTTCG"+"A"+"GCGGGGTTC"+"T"+"CTCGAAGT"+"T"+"CTTGGTAATAT"+"A"+"GGGGGCCCC"; // Reference with 5 sites all separated by at least kmer size
390         String alt1 = "AAACAAG"+"T"+"TTGGGTTCG"+"G"+"GCGGGGTTC"+"A"+"CTCGAAGT"+"C"+"CTTGGTAATAT"+"G"+"GGGGGCCCC"; // Alt with different values for all sites
391         String alt2 = "AAACAAG"+"T"+"TTGGGTTCG"+"G"+"GCGGGGTTC"+"C"+"CTCGAAGT"+"C"+"CTTGGTAATAT"+"G"+"GGGGGCCCC"; // Alt with one different value from alt1
392 
393         // Generate some reads that do not span the entire active region
394         assembler.addSequence("anonymous", getBytes(ref), true);
395         for (int i = 0; i + readlength < ref.length(); i++) {
396             assembler.addSequence("anonymous", getBytes(alt1.substring(i, i + readlength)), false);
397             assembler.addSequence("anonymous", getBytes(alt2.substring(i, i + readlength)), false);
398         }
399 
400         assembler.buildGraphIfNecessary();
401         assembler.generateJunctionTrees();
402 
403         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
404         Assert.assertEquals(junctionTrees.size(), 6);
405 
406         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler);
407         Assert.assertEquals(finder.sources.size(), 1);
408         Assert.assertEquals(finder.sinks.size(), 1);
409         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes();
410 
411         // We assert that we found all of the haplotypes present in reads and nothing else
412         Assert.assertEquals(haplotypes.size(), 2);
413         Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet());
414         // Asserting that
415         Assert.assertTrue(foundHaplotypes.contains(alt1));
416         Assert.assertTrue(foundHaplotypes.contains(alt2));
417     }
418 
419     @Test
420     // Asserting that the reference end base is handled with care... This code has the tendency to cut early
421     // TODO this test is disabled
testNonUniqueRefStopPosition()422     public void testNonUniqueRefStopPosition() {
423         int readlength = 15;
424         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7);
425         String ref = "AACTTGGGTGTGTGAAACCCGGGTTGTGTGTGAA"; // The sequence GTGTGTGAA is repeated
426 
427         // Generate some reads that do not span the entire active region
428         assembler.addSequence("anonymous", getBytes(ref), true);
429         for (int i = 0; i <= ref.length(); i+=2) {
430             assembler.addSequence("anonymous", getBytes(ref.substring(i, i + readlength <= ref.length() ? i + readlength : ref.length())), false);
431         }
432 
433         assembler.buildGraphIfNecessary();
434         assembler.generateJunctionTrees();
435 
436         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
437         Assert.assertEquals(junctionTrees.size(), 2);
438 
439         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(3);
440         Assert.assertEquals(finder.sources.size(), 1);
441         Assert.assertEquals(finder.sinks.size(), 1);
442         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(5);
443 
444         Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet());
445         // Asserting that the correct reference haplotype actually existed
446         Assert.assertTrue(foundHaplotypes.contains(ref));
447         // Asserting that we did NOT find the haplotype with 0 loops of the repeat:
448         Assert.assertFalse(foundHaplotypes.contains("AACTTGGGTGTGTG"));
449 
450         // We assert that we found all of the haplotypes present in reads and nothing else
451         Assert.assertEquals(haplotypes.size(), 1);
452     }
453 
454     @Test
testNonUniqueRefStopPositionNoStopJTToResolveIt()455     public void testNonUniqueRefStopPositionNoStopJTToResolveIt() {
456         int readlength = 15;
457         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7);
458         String ref = "AACTTGGGTGTGTGAAACCCGGGTTGTGTGTGAA"; // The sequence GTGTGTGAA is repeated
459 
460         // Generate some reads that do not span the entire active region
461         assembler.addSequence("anonymous", getBytes(ref), true);
462         for (int i = 0; i + readlength < ref.length() - 2; i++) { //the - 2 means that we don't span to the end
463             assembler.addSequence("anonymous", getBytes(ref.substring(i, i + readlength)), false);
464         }
465 
466         assembler.buildGraphIfNecessary();
467         assembler.generateJunctionTrees();
468 
469         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
470         Assert.assertEquals(junctionTrees.size(), 1);
471 
472         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler);
473         Assert.assertEquals(finder.sources.size(), 1);
474         Assert.assertEquals(finder.sinks.size(), 1);
475         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(5);
476 
477         // We assert that we found all of the haplotypes present in reads and nothing else
478         Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet());
479         // Asserting that the correct reference haplotype actually existed
480         Assert.assertTrue(foundHaplotypes.contains(ref));
481         // Asserting that we did NOT find the haplotype with 0 loops of the repeat:
482         Assert.assertFalse(foundHaplotypes.contains("AACTTGGGTGTGTG"));
483         //TODO maybe assert something about the lenght... unclear
484     }
485 
486     @Test
487     // Test asserting that the behavior is reasonable when there is read data past the reference end kmer
testReferenceEndWithErrorSpanningPastEndBase()488     public void testReferenceEndWithErrorSpanningPastEndBase() {
489         int readlength = 15;
490         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7);
491         String ref = "AACTGGGTT"  +  "GCGCGCGTTACCCGT"; // The sequence GTGTGTGAA is repeated
492         String alt = "AACTGGGTT"+"T"+"GCGCGCGTTACCCGTTT"; // Alt contig with error spanning past the end of the reference sequence
493 
494         // Generate some reads that do not span the entire active region
495         assembler.addSequence("anonymous", getBytes(ref), true);
496         for (int i = 0; i + readlength < alt.length(); i++) {
497             assembler.addSequence("anonymous", getBytes(alt.substring(i, i + readlength)), false);
498         }
499 
500         assembler.buildGraphIfNecessary();
501         assembler.generateJunctionTrees();
502 
503         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
504         Assert.assertEquals(junctionTrees.size(), 1); //TODO this will become 2 once the change is implemented
505 
506         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler);
507         Assert.assertEquals(finder.sources.size(), 1);
508         Assert.assertEquals(finder.sinks.size(), 1);
509         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(5);
510 
511         // We assert that we found all of the haplotypes present in reads and nothing else
512         Assert.assertEquals(haplotypes.size(), 1);
513         Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet());
514         // Asserting that we did NOT find the reference haplotype itself
515         Assert.assertFalse(foundHaplotypes.contains(ref));
516         // Asserting that we did find the alt haplotype minus the ending bases
517         Assert.assertTrue(foundHaplotypes.contains("AACTGGGTT"+"T"+"GCGCGCGTTACCCGT"));
518     }
519 
520     @Test
521     // Test asserting that the behavior is reasonable when there is read data past the reference end kmer
testFullReferencePathRecoveryDespiteReadsNotReachingLastBase()522     public void testFullReferencePathRecoveryDespiteReadsNotReachingLastBase() {
523         int readlength = 15;
524         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7);
525         String ref = "AACTGGGTT"  +  "GCGCGCGTTACCCGT"; // The sequence GTGTGTGAA is repeated
526         String alt = "AACTGGGTT"+"T"+"GCGCGCGTTACCC"; // Alt contig that doesn't reach the end of the reference
527 
528         // Generate some reads that do not span the entire active region
529         assembler.addSequence("anonymous", getBytes(ref), true);
530         for (int i = 0; i + readlength < alt.length(); i++) {
531             assembler.addSequence("anonymous", getBytes(alt.substring(i, i + readlength)), false);
532         }
533 
534         assembler.buildGraphIfNecessary();
535         assembler.generateJunctionTrees();
536 
537         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
538         Assert.assertEquals(junctionTrees.size(), 1);
539 
540         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler);
541         Assert.assertEquals(finder.sources.size(), 1);
542         Assert.assertEquals(finder.sinks.size(), 1);
543         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(5);
544 
545         // We assert that we found all of the haplotypes present in reads and nothing else
546         Assert.assertEquals(haplotypes.size(), 1);
547         Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet());
548         // Asserting that we did NOT find the reference haplotype itself
549         Assert.assertFalse(foundHaplotypes.contains(ref));
550         // Asserting that we did find the alt haplotype minus the ending bases
551         Assert.assertTrue(foundHaplotypes.contains("AACTGGGTT"+"T"+"GCGCGCGTTACCCGT"));
552     }
553 
554     @Test
555     // The read evidence at the loop is short (15 bases) and consequently doesn't span the entire loop.
testOfLoopingReferenceReadsTooShortToRecoverIt()556     public void testOfLoopingReferenceReadsTooShortToRecoverIt() {
557         int readlength = 15;
558         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7);
559         String ref = "AAACTTTCGCGGGCCCTTAAACCCGCCCTTAAACCCGCCCTTAAACCGCTGTAAGAAA"; // The sequence GCCCTTAAACCC (12 bases) is repeated
560 
561         // Generate some reads that do not span the entire active region
562         assembler.addSequence("anonymous", getBytes(ref), true);
563         for (int i = 0; i + readlength < ref.length(); i++) {
564             assembler.addSequence("anonymous", getBytes(ref.substring(i, i + readlength)), false);
565             assembler.addSequence("anonymous", getBytes(ref.substring(i, i + readlength)), false);
566         }
567 
568         assembler.buildGraphIfNecessary();
569         assembler.generateJunctionTrees();
570 
571         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
572         Assert.assertEquals(junctionTrees.size(), 2);
573 
574         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler);
575         Assert.assertEquals(finder.sources.size(), 1);
576         Assert.assertEquals(finder.sinks.size(), 1);
577         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(5); //NOTE we only ask for 5 haplotypes here since the graph might loop forever...
578 
579         // We assert that we found all of the haplotypes present in reads and nothing else
580         Assert.assertEquals(haplotypes.size(), 2);
581         Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet());
582         // Asserting that the correct reference haplotype actually existed
583         Assert.assertTrue(foundHaplotypes.contains(ref));
584         // Asserting that we did NOT find the haplotype with 0 loops of the repeat:
585         Assert.assertFalse(foundHaplotypes.contains("AAACTTTCGCGGGCCCTTAAACCGCTGTAAGAAA"));
586     }
587 
588     // TODO this is a lingering and very serious problem that we aim to resolve shortly somehow, unclear as to exaclty how right now
589     @Test (enabled = false)
590     // This test illustrates a current known issue with the new algorithm, where old junction trees with subranchees that don't have a lot of data
591     // are used in place of younger trees that might cointain evidence of new paths. This particular test shows that a variant might be dropped as a result.
testOrphanedSubBranchDueToLackingOldJunctionTree()592     public void testOrphanedSubBranchDueToLackingOldJunctionTree() {
593         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(6);
594 
595         String ref        = "AAAACAC"+"T"+"ATGTGGGG"+"A"+"GGGTTAA"+"A"+"GTCTGAA";
596         String haplotype1 = "AAAACAC"+"G"+"ATGTGGGG"+"T"+"GGGTTAA"+"A"+"GTCTGAA";
597         String haplotype2 = "AAAACAC"+"G"+"ATGTGGGG"+"T"+"GGGTTAA"+"C"+"GTCTGAA";
598         int longReadLength = 20;
599         int shortReadLength = 10;
600 
601         // Add the reference a few times to get it into the junction trees
602         assembler.addSequence("anonymous", getBytes(ref), true);
603         assembler.addSequence("anonymous", getBytes(ref), false);
604         assembler.addSequence("anonymous", getBytes(ref), false);
605         assembler.addSequence("anonymous", getBytes(ref), false);
606 
607         // Add haplotype 1 reads that are long enough to catch the last variant site from the first junction tree (22)
608         for (int i = 0; i + longReadLength < ref.length(); i++) {
609             assembler.addSequence("anonymous", getBytes(haplotype1.substring(i, i + longReadLength)), false);
610         }
611 
612         // Add haplotype 2 reads that are long enough to leapfrog the entire haplotype but insufficient to establish the G->T->C path on the first tree
613         for (int i = 0; i + shortReadLength < ref.length(); i++) {
614             assembler.addSequence("anonymous", getBytes(haplotype2.substring(i, i + shortReadLength)), false);
615         }
616 
617         assembler.generateJunctionTrees();
618 
619         final List<String> haplotypes = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(2)
620                 .findBestHaplotypes(5).stream().map(h -> new String(h.getBases())).collect(Collectors.toList());
621 
622         // Assert that the reference haplotype is recovered
623         Assert.assertTrue(haplotypes.contains(ref));
624         // Assert that haplotype1 is recovered
625         Assert.assertTrue(haplotypes.contains(haplotype1));
626         // Assert that haplotype2 is recovered
627         // NOTE: this gets dropped because the oldest junction tree on the T->A path has data for haplotype1 but not haplotype2
628         Assert.assertTrue(haplotypes.contains(haplotype2));
629     }
630 
631     @Test
632     // This is a test enforcing that the behavior around nodes are both outDegree > 1 while also having downstream children with inDegree > 1.
633     // We are asserting that the JunctionTree generated by this case lives on the node itself
testEdgeCaseInvolvingHighInDegreeAndOutDegreeChars()634     public void testEdgeCaseInvolvingHighInDegreeAndOutDegreeChars() {
635         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(4);
636         String ref = "AAAACAC"+"CCGA"+"ATGTGGGG"+"A"+"GGGTT"; // the first site has an interesting graph structure and the second site is used to ensurethe graph is intersting
637 
638         // A simple snip het
639         String refRead1 = "AAAACAC"+"CCGA"+"ATGTGGGG"+"A"+"GGGTT";
640         String read1 = "AAAACAC"+"CCGA"+"CTGTGGGG"+"C"+"GGGTT"; // CGAC merges with the below read
641         String read2 = "AAAACAC"+"TCGA"+"CTGTGGGG"+"C"+"GGGTT"; // CGAC merges with the above read
642 
643         assembler.addSequence("anonymous", getBytes(ref), true);
644         assembler.addSequence("anonymous", getBytes(refRead1), false);
645         assembler.addSequence("anonymous", getBytes(read1), false);
646         assembler.addSequence("anonymous", getBytes(read2), false);
647 
648         assembler.buildGraphIfNecessary();
649         assembler.generateJunctionTrees();
650 
651         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
652         Assert.assertEquals(junctionTrees.size(), 6);
653 
654         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler);
655         finder.setJunctionTreeEvidenceWeightThreshold(0);
656         Assert.assertEquals(finder.sources.size(), 1);
657         Assert.assertEquals(finder.sinks.size(), 1);
658         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes();
659 
660         // We assert that we found all of the haplotypes present in reads and nothing else
661         Assert.assertEquals(haplotypes.size(), 3);
662         Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet());
663         Assert.assertTrue(foundHaplotypes.contains(refRead1));
664         Assert.assertTrue(foundHaplotypes.contains(read1));
665         Assert.assertTrue(foundHaplotypes.contains(read2));
666     }
667 
668 
669     @Test
670     // This is a test enforcing that the behavior around nodes are both outDegree > 1 while also having downstream children with inDegree > 1.
671     // We are asserting that the JunctionTree generated by this case lives on the node itself
testGraphRecoveryWhenTreeContainsRepeatedKmers()672     public void testGraphRecoveryWhenTreeContainsRepeatedKmers() {
673         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
674         String ref = "AAATCTTCGGGGGGGGGGGGGGTTTCTGGG"; // the first site has an interesting graph structure and the second site is used to ensurethe graph is intersting
675 
676         // A simple snip het
677         String refRead = "AAATCTTCGGGGGGGGGGGGGGTTTCTGGG";
678 
679         assembler.addSequence("anonymous", getBytes(ref), true);
680         assembler.addSequence("anonymous", getBytes(refRead), false);
681 
682         assembler.buildGraphIfNecessary();
683         assembler.generateJunctionTrees();
684 
685         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler);
686         finder.setJunctionTreeEvidenceWeightThreshold(0);
687         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes();
688 
689         // We assert that we found all of the haplotypes present in reads and nothing else
690         Assert.assertEquals(haplotypes.size(), 1);
691         Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet());
692         Assert.assertTrue(foundHaplotypes.contains(refRead));
693     }
694 
695     @Test
testSimpleJunctionTreeIncludeRefInJunctionTreeTwoSites()696     public void testSimpleJunctionTreeIncludeRefInJunctionTreeTwoSites() {
697         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
698         String ref = "GGGAAAT" + "T" + "TCCGGC" + "T" + "CGTTTA"; //Two variant sites in close proximity
699 
700         // A simple snip het
701         String altAARead1 = "GGGAAAT" + "A" + "TCCGGC" + "A" + "CGTTTA"; // Replaces a T with an A, then a T with a A
702         String altAARead2 = "GGGAAAT" + "A" + "TCCGGC" + "A" + "CGTTTA"; // Replaces a T with an A, then a T with a A
703         String altTCRead1 = "GGGAAAT" + "T" + "TCCGGC" + "C" + "CGTTTA"; // Keeps the T, then replaces a T with a C
704         String altTCRead2 = "GGGAAAT" + "T" + "TCCGGC" + "C" + "CGTTTA"; // Keeps the T, then replaces a T with a C
705 
706         assembler.addSequence("anonymous", getBytes(ref), true);
707         assembler.addSequence("anonymous", getBytes(altAARead1), false);
708         assembler.addSequence("anonymous", getBytes(altAARead2), false);
709         assembler.addSequence("anonymous", getBytes(altTCRead1), false);
710         assembler.addSequence("anonymous", getBytes(altTCRead2), false);
711 
712         assembler.buildGraphIfNecessary();
713         assembler.generateJunctionTrees();
714         assembler.pruneJunctionTrees(0);
715 
716         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder1 = new JunctionTreeKBestHaplotypeFinder<>(assembler);
717         Assert.assertEquals(finder1.sources.size(), 1);
718         Assert.assertEquals(finder1.sinks.size(), 1);
719 
720         List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder1.findBestHaplotypes(10);
721         System.out.println();
722     }
723 
724     // Disabled until multi-sink/source edges are supported
725     @Test (enabled = false)
testDeadNode()726     public void testDeadNode(){
727         final JunctionTreeLinkedDeBruijnGraph g = new JunctionTreeLinkedDeBruijnGraph(3);
728         final MultiDeBruijnVertex v1 = new MultiDeBruijnVertex("a".getBytes());
729         final MultiDeBruijnVertex v2 = new MultiDeBruijnVertex("b".getBytes());
730         final MultiDeBruijnVertex v3 = new MultiDeBruijnVertex("c".getBytes());
731         final MultiDeBruijnVertex v4 = new MultiDeBruijnVertex("d".getBytes());
732         final MultiDeBruijnVertex v5 = new MultiDeBruijnVertex("e".getBytes());
733         g.addVertex(v1);   //source
734         g.addVertex(v2);
735         g.addVertex(v3);
736         g.addVertex(v4);  //sink
737         g.addVertex(v5);  //sink
738         g.addEdge(v1, v2);
739         g.addEdge(v2, v3);
740         g.addEdge(v3, v2);//cycle
741         g.addEdge(v2, v5);
742         g.addEdge(v1, v4);
743         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder1 = new JunctionTreeKBestHaplotypeFinder<>(g);
744         Assert.assertEquals(finder1.sources.size(), 1);
745         Assert.assertEquals(finder1.sinks.size(), 2);
746 
747         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder2 = new JunctionTreeKBestHaplotypeFinder<>(g, v1, v4); //v5 is a dead node (can't reach the sink v4)
748         Assert.assertEquals(finder2.sources.size(), 1);
749         Assert.assertEquals(finder2.sinks.size(), 1);
750     }
751 
752     @DataProvider(name = "BasicPathFindingData")
makeBasicPathFindingData()753     public Object[][] makeBasicPathFindingData() {
754         final List<Object[]> tests = new ArrayList<>();
755         for ( final int nStartNodes : Arrays.asList(1, 2, 3) ) {
756             for ( final int nBranchesPerBubble : Arrays.asList(2, 3) ) {
757                 for ( final int nEndNodes : Arrays.asList(1, 2, 3) ) {
758                     tests.add(new Object[]{nStartNodes, nBranchesPerBubble, nEndNodes});
759                 }
760             }
761         }
762         return tests.toArray(new Object[][]{});
763     }
764 
765     private static int weight = 1;
createVertices(final JunctionTreeLinkedDeBruijnGraph graph, final int n, final MultiDeBruijnVertex source, final MultiDeBruijnVertex target)766     final Set<MultiDeBruijnVertex> createVertices(final JunctionTreeLinkedDeBruijnGraph graph, final int n, final MultiDeBruijnVertex source, final MultiDeBruijnVertex target) {
767         final List<String> seqs = Arrays.asList("A", "C", "G", "T");
768         final Set<MultiDeBruijnVertex> vertices = new LinkedHashSet<>();
769         for ( int i = 0; i < n; i++ ) {
770             final MultiDeBruijnVertex v = new MultiDeBruijnVertex(seqs.get(i).getBytes());
771             graph.addVertex(v);
772             vertices.add(v);
773             if ( source != null ) graph.addEdge(source, v, new MultiSampleEdge(false, weight++, 1));
774             if ( target != null ) graph.addEdge(v, target, new MultiSampleEdge(false, weight++, 1));
775         }
776         return vertices;
777     }
778 
779 
780     @Test(dataProvider = "BasicPathFindingData")
testBasicPathFindingNoJunctionTrees(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes)781     public void testBasicPathFindingNoJunctionTrees(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes) {
782         final JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(11);
783 
784         final MultiDeBruijnVertex middleTop = new MultiDeBruijnVertex("GTAC".getBytes());
785         final MultiDeBruijnVertex middleBottom = new MultiDeBruijnVertex("ACTG".getBytes());
786         graph.addVertices(middleTop, middleBottom);
787         final Set<MultiDeBruijnVertex> starts = createVertices(graph, nStartNodes, null, middleTop);
788         @SuppressWarnings("unused")
789         final Set<MultiDeBruijnVertex> bubbles = createVertices(graph, nBranchesPerBubble, middleTop, middleBottom);
790         final Set<MultiDeBruijnVertex> ends = createVertices(graph, nEndNodes, middleBottom, null);
791 
792         final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * nEndNodes;
793         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = new JunctionTreeKBestHaplotypeFinder<>(graph, starts, ends, JunctionTreeKBestHaplotypeFinder.DEFAULT_OUTGOING_JT_EVIDENCE_THRESHOLD_TO_BELEIVE, false).findBestHaplotypes();
794         Assert.assertEquals(haplotypes.size(), expectedNumOfPaths);
795         IntStream.range(1, haplotypes.size()).forEach(n -> Assert.assertTrue(haplotypes.get(n-1).score() >= haplotypes.get(n).score()));
796     }
797 
798 
799     @DataProvider(name = "BasicBubbleDataProvider")
makeBasicBubbleDataProvider()800     public Object[][] makeBasicBubbleDataProvider() {
801         final List<Object[]> tests = new ArrayList<>();
802         for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) {
803             for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) {
804                 tests.add(new Object[]{refBubbleLength, altBubbleLength});
805             }
806         }
807         return tests.toArray(new Object[][]{});
808     }
809 
810     @Test(dataProvider = "BasicBubbleDataProvider")
testBasicBubbleData(final int refBubbleLength, final int altBubbleLength)811     public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) {
812         // Construct the assembly graph
813         JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(4);
814         final String preRef = "ATGG";
815         final String postRef = "GCGGC";
816 
817         final String ref = preRef + Strings.repeat("A", refBubbleLength) + postRef;
818         final String alt = preRef + Strings.repeat("A", altBubbleLength-1) + "T" + postRef;
819 
820         graph.addSequence("anonomyous", ref.getBytes(),  true);
821         for (int i = 0; i < 5; i++) graph.addSequence("anonomyous", ref.getBytes(), 1, false);
822         for (int i = 0; i < 10; i++) graph.addSequence("anonomyous", alt.getBytes(), 1, false);
823 
824         graph.buildGraphIfNecessary();
825         graph.generateJunctionTrees();
826 
827         // Construct the test path
828         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(graph).findBestHaplotypes(5);
829         Assert.assertEquals(bestPaths.size(), 2);
830         KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge> path = bestPaths.get(0);
831 
832         // Construct the actual cigar string implied by the test path
833         final CigarBuilder expectedCigar = new CigarBuilder();
834         expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M));
835         if( refBubbleLength > altBubbleLength ) {
836             expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
837             expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M));
838         } else if ( refBubbleLength < altBubbleLength ) {
839             expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
840             expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength, CigarOperator.I));
841         } else {
842             expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
843         }
844         expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M));
845 
846         Assert.assertEquals(path.calculateCigar(ref.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(), expectedCigar.make().toString(), "Cigar string mismatch");
847     }
848 
849     @DataProvider(name = "TripleBubbleDataProvider")
makeTripleBubbleDataProvider()850     public Object[][] makeTripleBubbleDataProvider() {
851         final List<Object[]> tests = new ArrayList<>();
852         for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) {
853             for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) {
854                 for ( final boolean offRefEnding : Arrays.asList(true, false) ) {
855                     for ( final boolean offRefBeginning : Arrays.asList(false) ) {
856                         tests.add(new Object[]{refBubbleLength, altBubbleLength, offRefBeginning, offRefEnding});
857                     }
858                 }
859             }
860         }
861         return tests.toArray(new Object[][]{});
862     }
863 
864     @Test(dataProvider = "TripleBubbleDataProvider")
865     //TODO figure out what to do here as this test doesn't apply
testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding)866     public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) {
867         // Construct the assembly graph
868         SeqGraph graph = new SeqGraph(11);
869         final String preAltOption = "ATCGATCGATCGATCGATCG";
870         final String postAltOption = "CCCC";
871         final String preRef = "ATGG";
872         final String postRef = "GGCCG";
873         final String midRef1 = "TTCCT";
874         final String midRef2 = "CCCAAAAAAAAAAAA";
875 
876         SeqVertex preV = new SeqVertex(preAltOption);
877         SeqVertex v = new SeqVertex(preRef);
878         SeqVertex v2Ref = new SeqVertex(Strings.repeat("A", refBubbleLength));
879         SeqVertex v2Alt = new SeqVertex(Strings.repeat("A", altBubbleLength - 1) + "T");
880         SeqVertex v4Ref = new SeqVertex(Strings.repeat("C", refBubbleLength));
881         SeqVertex v4Alt = new SeqVertex(Strings.repeat("C", altBubbleLength - 1) + "T");
882         SeqVertex v6Ref = new SeqVertex(Strings.repeat("G", refBubbleLength));
883         SeqVertex v6Alt = new SeqVertex(Strings.repeat("G", altBubbleLength - 1) + "T");
884         SeqVertex v3 = new SeqVertex(midRef1);
885         SeqVertex v5 = new SeqVertex(midRef2);
886         SeqVertex v7 = new SeqVertex(postRef);
887         SeqVertex postV = new SeqVertex(postAltOption);
888 
889         final String ref = preRef + v2Ref.getSequenceString() + midRef1 + v4Ref.getSequenceString() + midRef2 + v6Ref.getSequenceString() + postRef;
890 
891         graph.addVertex(preV);
892         graph.addVertex(v);
893         graph.addVertex(v2Ref);
894         graph.addVertex(v2Alt);
895         graph.addVertex(v3);
896         graph.addVertex(v4Ref);
897         graph.addVertex(v4Alt);
898         graph.addVertex(v5);
899         graph.addVertex(v6Ref);
900         graph.addVertex(v6Alt);
901         graph.addVertex(v7);
902         graph.addVertex(postV);
903         graph.addEdge(preV, v, new BaseEdge(false, 1));
904         graph.addEdge(v, v2Ref, new BaseEdge(true, 10));
905         graph.addEdge(v2Ref, v3, new BaseEdge(true, 10));
906         graph.addEdge(v, v2Alt, new BaseEdge(false, 5));
907         graph.addEdge(v2Alt, v3, new BaseEdge(false, 5));
908         graph.addEdge(v3, v4Ref, new BaseEdge(true, 10));
909         graph.addEdge(v4Ref, v5, new BaseEdge(true, 10));
910         graph.addEdge(v3, v4Alt, new BaseEdge(false, 5));
911         graph.addEdge(v4Alt, v5, new BaseEdge(false, 5));
912         graph.addEdge(v5, v6Ref, new BaseEdge(true, 11));
913         graph.addEdge(v6Ref, v7, new BaseEdge(true, 11));
914         graph.addEdge(v5, v6Alt, new BaseEdge(false, 55));
915         graph.addEdge(v6Alt, v7, new BaseEdge(false, 55));
916         graph.addEdge(v7, postV, new BaseEdge(false, 1));
917 
918         // Construct the test path
919         Path<SeqVertex,BaseEdge> path = new Path<>( (offRefBeginning ? preV : v), graph);
920         if( offRefBeginning )
921             path = new Path<>(path, graph.getEdge(preV, v));
922         path = new Path<>(path, graph.getEdge(v, v2Alt));
923         path = new Path<>(path, graph.getEdge(v2Alt, v3));
924         path = new Path<>(path, graph.getEdge(v3, v4Ref));
925         path = new Path<>(path, graph.getEdge(v4Ref, v5));
926         path = new Path<>(path, graph.getEdge(v5, v6Alt));
927         path = new Path<>(path, graph.getEdge(v6Alt, v7));
928         if( offRefEnding )
929             path = new Path<>(path, graph.getEdge(v7,postV));
930 
931         // Construct the actual cigar string implied by the test path
932         final CigarBuilder expectedCigar = new CigarBuilder();
933         if( offRefBeginning ) {
934             expectedCigar.add(new CigarElement(preAltOption.length(), CigarOperator.I));
935         }
936         expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M));
937         // first bubble
938         if( refBubbleLength > altBubbleLength ) {
939             expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
940             expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M));
941         } else if ( refBubbleLength < altBubbleLength ) {
942             expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
943             expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength, CigarOperator.I));
944         } else {
945             expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
946         }
947         expectedCigar.add(new CigarElement(midRef1.length(), CigarOperator.M));
948         // second bubble is ref path
949         expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
950         expectedCigar.add(new CigarElement(midRef2.length(), CigarOperator.M));
951         // third bubble
952         if( refBubbleLength > altBubbleLength ) {
953             expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D));
954             expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M));
955         } else if ( refBubbleLength < altBubbleLength ) {
956             expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
957             expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength, CigarOperator.I));
958         } else {
959             expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M));
960         }
961         expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M));
962         if( offRefEnding ) {
963             expectedCigar.add(new CigarElement(postAltOption.length(), CigarOperator.I));
964         }
965 
966         Assert.assertEquals(path.calculateCigar(ref.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(),
967                 expectedCigar.make().toString(),
968                 "Cigar string mismatch: ref = " + ref + " alt " + new String(path.getBases()));
969     }
970 
971     @Test (enabled = false)
972     //TODO this test illustrates a problem with junction trees and dangling end recovery, namely the path that the JT points to
973     //TODO is a dead end after dangling tail recovery. This needs to be resolved with either SmithWaterman or by coopting the threading code
testIntraNodeInsertionDeletion()974     public void testIntraNodeInsertionDeletion() {
975         // Construct the assembly graph
976         final JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(5);
977         final String ref = "TTTT" + "CCCCCGGG" + "TTT";
978         final String alt = "TTTT" + "AAACCCCC" + "TTT";
979 
980         graph.addSequence("anonymous", getBytes(ref), true);
981         graph.addSequence("anonymous", getBytes(ref), false);
982         graph.addSequence("anonymous", getBytes(alt), false);
983         graph.buildGraphIfNecessary();
984         graph.recoverDanglingHeads(0, 0,  true, SmithWatermanJavaAligner.getInstance());
985         graph.recoverDanglingTails(0, 0,  true, SmithWatermanJavaAligner.getInstance());
986         graph.generateJunctionTrees();
987 
988         @SuppressWarnings("all")
989         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(graph).findBestHaplotypes();
990         Assert.assertEquals(bestPaths.size(), 2);
991         final Path<MultiDeBruijnVertex,MultiSampleEdge> refPath = bestPaths.get(0);
992         final Path<MultiDeBruijnVertex,MultiSampleEdge> altPath = bestPaths.get(1);
993 
994         Assert.assertEquals(refPath.calculateCigar(ref.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(), "15M");
995         Assert.assertEquals(altPath.calculateCigar(ref.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(), "4M3I5M3D3M");
996     }
997 
998     @Test (enabled = false) //TODO this is disabled due to the k max paths per node optimization not being implemented yet
999     /*
1000      This is a test of what can go awry if path pruning is based on the number of incoming edges to a given vertex.
1001      An illustration of the graph in this test:
1002                / ----- top \
1003               /(33)         \
1004      refStart -(32) -- mid -- midExt ---------------------------- refEnd
1005               \(34)        / (1)                               /
1006                \ ---- bot /- (33) botExt - (15) - botExtTop - /
1007                                         \ (18)               /
1008                                          \ ------ botExtBot /
1009 
1010       The expected best paths are (refStart->top->midExt->refEnd), and (refStart->mid->midExt->refEnd) because the bottom
1011       path is penalized for two extra forks despite it greedily looking the best at the start.
1012 
1013       Because the old behavior used to base pruning on the number of incoming edges < K, the edge (bot->midExt) would be created
1014       first, then (top -> midExt) but (mid -> midExt) would never be created because midExt already has 2 incoming edges.
1015       */
1016 
testDegeneratePathPruningOptimizationCase()1017     public void testDegeneratePathPruningOptimizationCase() {
1018         // Construct an assembly graph demonstrating this issue
1019         final SeqGraph graph = new SeqGraph(11);
1020         final SeqVertex top = new SeqVertex("T");
1021         final SeqVertex mid = new SeqVertex("C");
1022         final SeqVertex midAndTopExt = new SeqVertex("GGG");
1023         final SeqVertex bot = new SeqVertex("G");
1024         final SeqVertex botExt = new SeqVertex("AAA");
1025         final SeqVertex botExtTop = new SeqVertex("A");
1026         final SeqVertex botExtBot = new SeqVertex("T");
1027 
1028         // Ref source and sink vertexes
1029         final SeqVertex refStart = new SeqVertex("CCCCCGGG");
1030         final SeqVertex refEnd = new SeqVertex("TTTT");
1031 
1032         graph.addVertices(top, bot, mid, midAndTopExt, bot, botExt, botExtTop, botExtBot, refStart, refEnd);
1033         // First "diamond" with 3 mostly equivalent cost paths
1034         graph.addEdges(() -> new BaseEdge(false, 34), refStart, bot, botExt);
1035         graph.addEdges(() -> new BaseEdge(true, 33), refStart, top, midAndTopExt);
1036         graph.addEdges(() -> new BaseEdge(false, 32), refStart, mid, midAndTopExt);
1037 
1038         // The best looking path reconnects with a very poor edge multiplicity to midAndTopExt (This is this is what casues the bug)
1039         graph.addEdges(() -> new BaseEdge(false, 1), bot, midAndTopExt);
1040         // There is another diamond at bot ext that will end up discounting that path from being in the k best
1041         graph.addEdges(() -> new BaseEdge(false, 15), botExt, botExtTop, refEnd);
1042         graph.addEdges(() -> new BaseEdge(false, 18), botExt, botExtBot, refEnd);
1043 
1044         // Wheras the path is smooth sailing from refEnd
1045         graph.addEdges(() -> new BaseEdge(true, 65), midAndTopExt, refEnd);
1046 
1047         @SuppressWarnings("all")
1048         final List<KBestHaplotype<SeqVertex, BaseEdge>> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(graph,refStart,refEnd).findBestHaplotypes(2);
1049         Assert.assertEquals(bestPaths.size(), 2);
1050         final Path<SeqVertex,BaseEdge> refPath = bestPaths.get(0);
1051         final Path<SeqVertex,BaseEdge> altPath = bestPaths.get(1);
1052 
1053         Assert.assertEquals(refPath.getVertices().toArray(new SeqVertex[0]), new SeqVertex[]{refStart, top, midAndTopExt, refEnd});
1054         Assert.assertEquals(altPath.getVertices().toArray(new SeqVertex[0]), new SeqVertex[]{refStart, mid, midAndTopExt, refEnd});
1055     }
1056 
1057 
1058     @Test
1059     @SuppressWarnings({"unchecked"})
1060     //TODO this test will make a good base for testing later realignment if the leading Ns are cut back down
testHardSWPath()1061     public void testHardSWPath() {
1062         // Construct the assembly graph
1063         final JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(11);
1064         String ref = "NNNNNNNNNNN"+"TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"+"NNN"; // Alt with one different value from alt1
1065         String alt = "NNNNNNNNNNN"+"ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"+"NNN"; // Alt with one different value from alt1
1066 
1067         // Generate some reads that do not span the entire active region
1068         graph.addSequence("anonymous", getBytes(ref), true);
1069         graph.addSequence("anonymous", getBytes(ref), false);
1070         graph.addSequence("anonymous", getBytes(alt), false);
1071         graph.buildGraphIfNecessary();
1072         graph.recoverDanglingHeads(0, 2, true, SmithWatermanJavaAligner.getInstance());
1073         graph.generateJunctionTrees();
1074 
1075         @SuppressWarnings("all")
1076         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(graph);
1077         finder.setJunctionTreeEvidenceWeightThreshold(1);
1078         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> paths = finder.findBestHaplotypes();
1079 
1080         Assert.assertEquals(paths.size(), 2);
1081 
1082         final Path<MultiDeBruijnVertex, MultiSampleEdge> refPath = paths.get(0);
1083         final Path<MultiDeBruijnVertex, MultiSampleEdge> altPath = paths.get(1);
1084 
1085         logger.warn("RefPath : " + refPath + " cigar " + refPath.calculateCigar(ref.getBytes(),
1086                 SmithWatermanJavaAligner.getInstance()));
1087         logger.warn("AltPath : " + altPath + " cigar " + altPath.calculateCigar(ref.getBytes(),
1088                 SmithWatermanJavaAligner.getInstance()));
1089 
1090         Assert.assertEquals(refPath.calculateCigar(ref.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(), "59M");
1091         Assert.assertEquals(altPath.calculateCigar(ref.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(), "11M6I48M");
1092     }
1093 
1094     @Test
testKmerGraphSimpleReferenceRecovery()1095     public void testKmerGraphSimpleReferenceRecovery() {
1096         // Construct the assembly graph
1097         final JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(5);
1098         final MultiDeBruijnVertex refSource = new MultiDeBruijnVertex( "AAATT".getBytes() );
1099         final MultiDeBruijnVertex k1 = new MultiDeBruijnVertex( "AATTT".getBytes() );
1100         final MultiDeBruijnVertex k2 = new MultiDeBruijnVertex( "ATTTG".getBytes() );
1101         final MultiDeBruijnVertex k3 = new MultiDeBruijnVertex( "TTTGG".getBytes() );
1102         final MultiDeBruijnVertex k4 = new MultiDeBruijnVertex( "TTGGG".getBytes() );
1103         final MultiDeBruijnVertex k5 = new MultiDeBruijnVertex( "TGGGC".getBytes() );
1104         final MultiDeBruijnVertex k6 = new MultiDeBruijnVertex( "GGGCC".getBytes() );
1105         final MultiDeBruijnVertex k7 = new MultiDeBruijnVertex( "GGCCC".getBytes() );
1106         final MultiDeBruijnVertex k8 = new MultiDeBruijnVertex( "GCCCT".getBytes() );
1107         final MultiDeBruijnVertex refEnd = new MultiDeBruijnVertex( "CCCTT".getBytes() );
1108         graph.addVertices(refSource, k1, k2, k3, k4, k5, k6, k7, k8, refEnd);
1109         graph.addEdges(() -> new MultiSampleEdge(true, 1, 1), refSource, k1, k2, k3, k4, k5, k6, k7, k8, refEnd);
1110 
1111         @SuppressWarnings("all")
1112         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> paths = new JunctionTreeKBestHaplotypeFinder<>(graph, refSource, refEnd).findBestHaplotypes();
1113 
1114         Assert.assertEquals(paths.size(), 1);
1115 
1116         final Path<MultiDeBruijnVertex, MultiSampleEdge> refPath = paths.get(0);
1117 
1118         final String refString = "AAATTTGGGCCCTT";
1119 
1120         Assert.assertEquals(refPath.getBases(), refString.getBytes());
1121     }
1122 
1123     @Test
testKmerGraphSimpleReferenceRecoveryWithSNP()1124     public void testKmerGraphSimpleReferenceRecoveryWithSNP() {
1125         // Construct the assembly graph
1126         final JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(5);
1127         final MultiDeBruijnVertex refSource = new MultiDeBruijnVertex( "AAATT".getBytes() );
1128         final MultiDeBruijnVertex k1 = new MultiDeBruijnVertex( "AATTT".getBytes() );
1129         final MultiDeBruijnVertex k2 = new MultiDeBruijnVertex( "ATTTG".getBytes() );
1130         final MultiDeBruijnVertex k3 = new MultiDeBruijnVertex( "TTTGG".getBytes() );
1131         final MultiDeBruijnVertex k4 = new MultiDeBruijnVertex( "TTGGG".getBytes() );
1132         final MultiDeBruijnVertex k5 = new MultiDeBruijnVertex( "TGGGC".getBytes() );
1133         final MultiDeBruijnVertex k6 = new MultiDeBruijnVertex( "GGGCC".getBytes() );
1134         final MultiDeBruijnVertex k7 = new MultiDeBruijnVertex( "GGCCC".getBytes() );
1135         final MultiDeBruijnVertex k8 = new MultiDeBruijnVertex( "GCCCT".getBytes() );
1136         final MultiDeBruijnVertex refEnd = new MultiDeBruijnVertex( "CCCTT".getBytes() );
1137         final MultiDeBruijnVertex v3 = new MultiDeBruijnVertex( "TTTGC".getBytes() );
1138         final MultiDeBruijnVertex v4 = new MultiDeBruijnVertex( "TTGCG".getBytes() );
1139         final MultiDeBruijnVertex v5 = new MultiDeBruijnVertex( "TGCGC".getBytes() );
1140         final MultiDeBruijnVertex v6 = new MultiDeBruijnVertex( "GCGCC".getBytes() );
1141         final MultiDeBruijnVertex v7 = new MultiDeBruijnVertex( "CGCCC".getBytes() );
1142         graph.addVertices(refSource, k1, k2, k3, k4, k5, k6, k7, k8, refEnd, v3, v4, v5, v6, v7);
1143         graph.addEdges(() -> new MultiSampleEdge(true, 1, 4), refSource, k1, k2, k3, k4, k5, k6, k7, k8, refEnd);
1144         graph.addEdges(() -> new MultiSampleEdge(false, 1, 3), k2, v3, v4, v5, v6, v7, k8);
1145 
1146         @SuppressWarnings("all")
1147         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> paths = new JunctionTreeKBestHaplotypeFinder<>(graph, refSource, refEnd).findBestHaplotypes();
1148 
1149         Assert.assertEquals(paths.size(), 1);
1150 
1151         final Path<MultiDeBruijnVertex, MultiSampleEdge> altPath = paths.get(0);
1152 
1153         final String altString = "AAATTTGCGCCCTT";
1154 
1155         Assert.assertEquals(altPath.getBases(), altString.getBytes());
1156     }
1157 
1158     // -----------------------------------------------------------------
1159     //
1160     // Systematic tests to ensure that we get the correct SW result for
1161     // a variety of variants in the ref vs alt bubble
1162     //
1163     // -----------------------------------------------------------------
1164 
1165     @DataProvider(name = "SystematicRefAltSWTestData")
makeSystematicRefAltSWTestData()1166     public Object[][] makeSystematicRefAltSWTestData() {
1167         final List<Object[]> tests = new ArrayList<>();
1168 
1169         final List<List<String>> allDiffs = Arrays.asList(
1170                 Arrays.asList("G", "C", "1M"),
1171                 Arrays.asList("G", "", "1D"),
1172                 Arrays.asList("", "C", "1I"),
1173                 Arrays.asList("AAA", "CGT", "3M"),
1174                 Arrays.asList("TAT", "CAC", "3M"),
1175                 Arrays.asList("GCTG", "GTCG", "4M"),
1176                 Arrays.asList("AAAAA", "", "5D"),
1177                 Arrays.asList("", "AAAAA", "5I"),
1178                 Arrays.asList("AAAAACC", "CCGGGGGG", "5D2M6I")
1179         );
1180 
1181         for ( final String prefix : Arrays.asList("", "X", "XXXXXXXXXXXXX")) {
1182             for ( final String end : Arrays.asList("", "X", "XXXXXXXXXXXXX")) {
1183                 for ( final List<String> diffs : allDiffs )
1184                     tests.add(new Object[]{prefix, end, diffs.get(0), diffs.get(1), diffs.get(2)});
1185             }
1186         }
1187 
1188         return tests.toArray(new Object[][]{});
1189     }
1190 
1191 
1192     /**
1193      * Convenience constructor for testing that creates a path through vertices in graph
1194      */
makePath(final List<T> vertices, final BaseGraph<T, E> graph)1195     private static <T extends BaseVertex, E extends BaseEdge> Path<T,E> makePath(final List<T> vertices, final BaseGraph<T, E> graph) {
1196         Path<T,E> path = new Path<>(vertices.get(0), graph);
1197         for ( int i = 1; i < vertices.size(); i++ ) {
1198             path = new Path<>(path, graph.getEdge(path.getLastVertex(), vertices.get(i)));
1199         }
1200         return path;
1201     }
1202 
1203 
1204     @Test
testCreateMapOfPivotalEdgesInTopoglogicalOrderBasicExample()1205     public void testCreateMapOfPivotalEdgesInTopoglogicalOrderBasicExample() {
1206         int readlength = 20;
1207         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7);
1208         String ref = "TAAACAAG"+"G"+"TTGGGTTCG"+"A"+"GCGGGGTTC"+"T"+"CTCGAAGT"+"T"+"CTTGGTAATAT"+"A"+"GGGGGCCCC"; // Reference with 5 sites all separated by at least kmer size
1209         String alt1 = "TAAACAAG"+"T"+"TTGGGTTCG"+"G"+"GCGGGGTTC"+"A"+"CTCGAAGT"+"C"+"CTTGGTAATAT"+"G"+"GGGGGCCCC"; // Alt with different values for all sites
1210 
1211         // Generate some reads that do not span the entire active region
1212         assembler.addSequence("anonymous", getBytes(ref), true);
1213         for (int i = 0; i + readlength < ref.length(); i++) {
1214             assembler.addSequence("anonymous", getBytes(alt1.substring(i, i + readlength)), false);
1215         }
1216 
1217         assembler.buildGraphIfNecessary();
1218         assembler.generateJunctionTrees();
1219 
1220         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1);
1221 
1222         LinkedHashSet<MultiSampleEdge> pivotalEdges = bestPaths.createMapOfPivotalEdgesInTopologicalOrder();
1223         Iterator<MultiSampleEdge> edgesInOrder = pivotalEdges.iterator();
1224         ListIterator<String> expectedEdgeKmerTargetsFOrPivotalEdges = Arrays.asList(new String[]{"AACAAGT","GGTTCGG","GGGTTCA","CGAAGTC","TAATATG"}).listIterator();
1225 
1226         // Asserting that the order of edges is as expected
1227         Assert.assertEquals(pivotalEdges.size(), 5);
1228         while (edgesInOrder.hasNext()) {
1229             MultiSampleEdge nextEdge = edgesInOrder.next();
1230             String expectedKmerEdge = expectedEdgeKmerTargetsFOrPivotalEdges.next();
1231             String nextTarget = assembler.getEdgeTarget(nextEdge).getSequenceString();
1232             Assert.assertEquals(nextTarget, expectedKmerEdge);
1233         }
1234 
1235     }
1236 
1237     @Test
1238     // this test asserts that included in pivotal edges are edges that are only reachable by following uncovered reference path
testCreateMapOfPivotalEdgesInTopoglogicalOrderHiddenReferenceSequence()1239     public void testCreateMapOfPivotalEdgesInTopoglogicalOrderHiddenReferenceSequence() {
1240         int readlength = 20;
1241         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7);
1242         String ref =  "TAAACAAG"+"G"+"TTGGGTTCG"+"AGCGGT"+"CTCGAAGT"+"T"+"CTTGGTAATAT"+"A"+"GGGGGCCCC"; // Reference with 5 sites all separated by at least kmer size
1243         String alt1 = "TAAACAAG"+"T"+"TTGGGTTCG"+"GGCGGA"+"CTCGAAGT"+"C"+"CTTGGTAATAT"+"G"+"GGGGGCCCC"; // Alt with different values for all sites
1244         String alt2 =                        "AGCGGTCT"+"G"+"CGAAGT"+"T"+"CTTGGTAATAT"+"A"+"GGGGGCCCC"; // A snippet of reference sequence starting after an uncovered reference edge
1245 
1246         // Generate some reads that do not span the entire active region
1247         assembler.addSequence("anonymous", getBytes(ref), true);
1248         for (int i = 0; i + readlength < ref.length(); i++) {
1249             assembler.addSequence("anonymous", getBytes(alt1.substring(i, i + readlength)), false);
1250         }
1251 
1252         for (int i = 0; i + readlength < alt2.length(); i++) {
1253             assembler.addSequence("anonymous", getBytes(alt2.substring(i, i + readlength)), false);
1254         }
1255 
1256         assembler.buildGraphIfNecessary();
1257         assembler.generateJunctionTrees();
1258 
1259         final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1);
1260 
1261         LinkedHashSet<MultiSampleEdge> pivotalEdges = bestPaths.createMapOfPivotalEdgesInTopologicalOrder();
1262         Iterator<MultiSampleEdge> edgesInOrder = pivotalEdges.iterator();
1263         ListIterator<String> expectedEdgeKmerTargetsFOrPivotalEdges = Arrays.asList(new String[]{"AACAAGT","GGTTCGG", // edges corresoinding to the start of alt1 path
1264                 "CGGTCTG", // Edge corresponding to the G insertion (This would be dropped if we didn't include reference edges with no support in this calculation)
1265                 "CGAAGTC",
1266                 "TAATATA","TAATATG" // Edges correspoindg to the ref and alt respectively, the order is arbitrary because the shortest path is what is taken to determine distance
1267         }).listIterator();
1268 
1269         // Asserting that the order of edges is as expected
1270         Assert.assertEquals(pivotalEdges.size(), 6);
1271         while (edgesInOrder.hasNext()) {
1272             MultiSampleEdge nextEdge = edgesInOrder.next();
1273             String expectedKmerEdge = expectedEdgeKmerTargetsFOrPivotalEdges.next();
1274             String nextTarget = assembler.getEdgeTarget(nextEdge).getSequenceString();
1275             Assert.assertEquals(nextTarget, expectedKmerEdge);
1276         }
1277     }
1278 }