1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading;
2 
3 import htsjdk.samtools.SAMFileHeader;
4 import htsjdk.samtools.TextCigarCodec;
5 import org.apache.commons.lang3.tuple.Pair;
6 import org.broadinstitute.hellbender.testutils.BaseTest;
7 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.Kmer;
8 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.*;
9 import org.broadinstitute.hellbender.utils.Utils;
10 import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils;
11 import org.broadinstitute.hellbender.utils.read.GATKRead;
12 import org.broadinstitute.hellbender.utils.read.ReadUtils;
13 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
14 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanJavaAligner;
15 import org.testng.Assert;
16 import org.testng.annotations.DataProvider;
17 import org.testng.annotations.Test;
18 
19 import java.util.*;
20 import java.util.stream.Collectors;
21 
22 public class JunctionTreeLinkedDeBruijnGraphUnitTest extends BaseTest {
23 
24     @DataProvider (name = "loopingReferences")
loopingReferences()25     public static Object[][] loopingReferences() {
26         return new Object[][]{
27                 new Object[]{"GACACACAGTCA", 3}, //ACA and CAC are repeated
28                 new Object[]{"GACACTCACGCACGG", 3}, //CAC repeated twice, showing that the loops are handled somewhat sanely
29                 new Object[]{"GACATCGACGG", 3}, //GAC repeated twice, starting with GAC, can it recover the reference if it starts on a repeated kmer
30                 new Object[]{"GACATCACATC", 3} //final kmer ATC is repeated, can we recover the reference for repating final kmer
31         };
32     }
33 
34     @Test (dataProvider = "loopingReferences")
testRecoveryOfLoopingReferenceSequences(final String ref, final int kmerSize)35     public void testRecoveryOfLoopingReferenceSequences(final String ref, final int kmerSize) {
36         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(kmerSize);
37         assembler.addSequence("anonymous", getBytes(ref), true);
38         assembler.buildGraphIfNecessary();
39         List<MultiDeBruijnVertex> refVertexes = assembler.getReferencePath(ReadThreadingGraph.TraversalDirection.downwards);
40         final StringBuilder builder = new StringBuilder(refVertexes.get(0).getSequenceString());
41         refVertexes.stream().skip(1).forEach(v -> builder.append(v.getSuffixString()));
42         Assert.assertEquals(builder.toString(), ref);
43     }
44 
45     @Test
46     // This test is intended to assert that a potentially dangerous infinite loop is closed
testDanglingEndRecoveryInfiniteHeadLoop()47     public void testDanglingEndRecoveryInfiniteHeadLoop() {
48         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
49         final String ref = "AACTGGGCTAGAGAGCGTT";
50         final String loopingDanglingHead = "TTCGAAGGTCGAAG"; // "TCGAA" is repeated, causing dangling head recovery to fail
51 
52         assembler.addSequence("anonymous", getBytes(ref), true);
53         assembler.addSequence("anonymous", getBytes(ref), false);
54         // Coverage = 3, thus above the prune factor of 2
55         assembler.addSequence("loopingUnconecetedHead", getBytes(loopingDanglingHead), false);
56         assembler.addSequence("loopingUnconecetedHead", getBytes(loopingDanglingHead), false);
57         assembler.addSequence("loopingUnconecetedHead", getBytes(loopingDanglingHead), false);
58 
59         // This graph should have generated 3 junction trees (one at GAAAA, one at TCGGG, and one at AATCG)
60         assembler.buildGraphIfNecessary();
61 
62         //Try dangling head recovery (this might have infinitely looped if we are not careful)
63         assembler.recoverDanglingHeads(2, 0, true, SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA));
64 
65         Assert.assertTrue(true, "If we reached here than the above code didn't infinitely loop");
66     }
67 
68     @Test
69     // As above, pruning the dangling end recovery path can cause the infinite loop recovery to fall over
testDanglingEndRecoveryInfiniteHeadLoopWithPruning()70     public void testDanglingEndRecoveryInfiniteHeadLoopWithPruning() {
71         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
72         final String ref = "AACTGGGCTAGAGAGCGTT";
73         final String loopingDanglingHead = "TTCGAAGGTCGAA"; // "TCGAA" is repeated, causing dangling head recovery to fail
74         final String loopingDanglingHeadShortened = "TTCGAAGGTC"; // "TCGAA" is repeated, causing dangling head recovery to fail
75 
76 
77         assembler.addSequence("anonymous", getBytes(ref), true);
78         assembler.addSequence("anonymous", getBytes(ref), false);
79         assembler.addSequence("loopingUnconecetedHead", getBytes(loopingDanglingHead), false);
80         assembler.addSequence("loopingUnconecetedHead", getBytes(loopingDanglingHead), false);
81         // Now the some of the kimers in the infinite loop should have < 3 coverage, causing pruning to kick in
82         assembler.addSequence("loopingUnconecetedHead", getBytes(loopingDanglingHeadShortened), false);
83 
84         // This graph should have generated 3 junction trees (one at GAAAA, one at TCGGG, and one at AATCG)
85         assembler.buildGraphIfNecessary();
86 
87         //Try dangling head recovery (this might have infinitely looped if we are not careful)
88         assembler.recoverDanglingHeads(3, 0, true, SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA));
89 
90         Assert.assertTrue(true, "If we reached here than the above code didn't infinitely loop");
91     }
92 
93     @Test
testJunctionTreePaperTestExample()94     public void testJunctionTreePaperTestExample() {
95         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
96         String ref = "ACTGATTTCGATGCGATGCGATGCCACGGTGG"; // a loop of length 3 in the middle
97 
98         assembler.addSequence("anonymous", getBytes(ref), true);
99         assembler.addSequence("anonymous", getBytes(ref), false);
100 
101         // This graph should have generated 3 junction trees (one at GAAAA, one at TCGGG, and one at AATCG)
102         assembler.buildGraphIfNecessary();
103         assembler.generateJunctionTrees();
104         Assert.assertTrue(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("TCGAT"))).isPresent());
105         Assert.assertTrue(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("GCGAT"))).isPresent());
106 
107         Assert.assertEquals(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("TCGAT"))).get().getPathsPresentAsBaseChoiceStrings().get(0), "GGC_");
108     }
109 
110     @Test
testGraphPruningSanityCheck()111     public void testGraphPruningSanityCheck() {
112         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
113         String ref = "TTTTTAAAAACCCCCGGGGGATATATCGCGCG"; // the first site has an interesting graph structure and the second site is used to ensurethe graph is intersting
114 
115         String altRead1 = "TTTTTGAAAACCCTCGGGGGATAAATCGCGCG"; // 3 SNPs
116         String altRead2 = "TTTTTGAAAACCCTCGGGGG";
117         String altRead3 = "TTTTTGAAAACCCTCG";
118         String altRead4 = "TTTTTGAAAA";
119         String altRead5 = "TTTTTG";
120 
121         assembler.addSequence("anonymous", getBytes(ref), true);
122         assembler.addSequence("anonymous", getBytes(altRead1), false);
123         assembler.addSequence("anonymous", getBytes(altRead2), false);
124         assembler.addSequence("anonymous", getBytes(altRead3), false);
125         assembler.addSequence("anonymous", getBytes(altRead4), false);
126         assembler.addSequence("anonymous", getBytes(altRead5), false);
127 
128         // This graph should have generated 3 junction trees (one at GAAAA, one at TCGGG, and one at AATCG)
129         assembler.buildGraphIfNecessary();
130         assembler.generateJunctionTrees();
131         Assert.assertTrue(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("GAAAA"))).isPresent());
132         Assert.assertTrue(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("TCGGG"))).isPresent());
133         Assert.assertTrue(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("AATCG"))).isPresent());
134 
135         Assert.assertEquals(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("GAAAA"))).get().getPathsPresentAsBaseChoiceStrings().get(0), "TA_");
136 
137         // Now we prune requiring at least 2 bases of support (which should kill all but the first graph and then only the first two nodes of the first graph)
138         assembler.pruneJunctionTrees(2);
139 
140         Assert.assertTrue(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("GAAAA"))).isPresent());
141         Assert.assertFalse(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("TCGGG"))).isPresent());
142         Assert.assertFalse(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("AATCG"))).isPresent());
143 
144         Assert.assertEquals(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("GAAAA"))).get().getPathsPresentAsBaseChoiceStrings().get(0), "T");
145     }
146 
147     @Test
148     // The simplest case where we expect two uninformative junction trees to be constructed
testSimpleJunctionTreeExample()149     public void testSimpleJunctionTreeExample() {
150         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
151         String ref = "GGGGAAATTTCCCGGGG";
152 
153         // A simple SNP het
154         String refRead1 = "GGGAAATTTCC";
155         String refRead2 = "GAAATTTCCCGGG";
156         String altRead1 = "GGAAATGTC";
157         String altRead2 = "GGAAATGTCCCGGG";
158 
159         assembler.addSequence("anonymous", getBytes(ref), true);
160         assembler.addSequence("anonymous", getBytes(refRead1), false);
161         assembler.addSequence("anonymous", getBytes(refRead2), false);
162         assembler.addSequence("anonymous", getBytes(altRead1), false);
163         assembler.addSequence("anonymous", getBytes(altRead2), false);
164 
165         assembler.buildGraphIfNecessary();
166         assembler.generateJunctionTrees();
167 
168         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
169         Assert.assertEquals(junctionTrees.size(), 2);
170 
171         JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree1 = junctionTrees.get(assembler.findKmer(new Kmer("GTCCC")));
172         JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree2 = junctionTrees.get(assembler.findKmer(new Kmer("TTCCC")));
173         Assert.assertNotNull(tree1); // Make sure we actually have tree data for that site
174         Assert.assertNotNull(tree2); // Make sure we actually have tree data for that site
175 
176         List<String> tree1Choices = tree1.getPathsPresentAsBaseChoiceStrings();
177         List<String> tree2Choices = tree2.getPathsPresentAsBaseChoiceStrings();
178         Assert.assertEquals(tree1Choices, Collections.singletonList(""));
179         Assert.assertEquals(tree2Choices, Collections.singletonList(""));
180 
181         // Since the remaining graphs are empty prune them and assert as much
182         junctionTrees = assembler.getReadThreadingJunctionTrees(true);
183         Assert.assertEquals(junctionTrees.size(), 0);
184     }
185 
186     @Test
187     // This is a test enforcing that the behavior around nodes are both outDegree > 1 while also having downstream children with inDegree > 1.
188     // We are asserting that the JunctionTree generated by this case lives on the node itself
testEdgeCaseInvolvingHighInDegreeAndOutDegreeCars()189     public void testEdgeCaseInvolvingHighInDegreeAndOutDegreeCars() {
190         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(4);
191         String ref = "AAAACAC"+"CCGA"+"ATGTGGGG"+"A"+"GGGTT"; // the first site has an interesting graph structure and the second site is used to ensurethe graph is intersting
192 
193         // A simple snip het
194         String refRead1 = "AAAACAC"+"CCGA"+"ATGTGGGG"+"A"+"GGGTT";
195         String read1 = "AAAACAC"+"CCGA"+"CTGTGGGG"+"C"+"GGGTT"; // CGAC merges with the below read
196         String read2 = "AAAACAC"+"TCGA"+"CTGTGGGG"+"C"+"GGGTT"; // CGAC merges with the above read
197 
198         assembler.addSequence("anonymous", getBytes(ref), true);
199         assembler.addSequence("anonymous", getBytes(refRead1), false);
200         assembler.addSequence("anonymous", getBytes(read1), false);
201         assembler.addSequence("anonymous", getBytes(read2), false);
202 
203         assembler.buildGraphIfNecessary();
204         assembler.generateJunctionTrees();
205 
206         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
207         Assert.assertEquals(junctionTrees.size(), 6);
208 
209         JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree1 = junctionTrees.get(assembler.findKmer(new Kmer("CCGA")));
210         JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree2 = junctionTrees.get(assembler.findKmer(new Kmer("TCGA")));
211         Assert.assertNotNull(tree1); // Make sure we actually have tree data for that site
212         Assert.assertNotNull(tree2); // Make sure we actually have tree data for that site
213 
214         List<String> tree1Choices = tree1.getPathsPresentAsBaseChoiceStrings();
215         List<String> tree2Choices = tree2.getPathsPresentAsBaseChoiceStrings();
216         Assert.assertEquals(new HashSet<>(tree1Choices), new HashSet<>(Arrays.asList("AA_","CC_"))); // The AA and CC correspond to taking A and C out of the current nodes.
217         Assert.assertEquals(tree2Choices, Collections.singletonList("C_"));
218     }
219 
220 
221     @Test
222     // This is a test enforcing that the behavior around nodes are both outDegree > 1 while also having downstream children with inDegree > 1.
223     // We are asserting that the JunctionTree generated by this case lives on the node itself
testGraphRecoveryWhenKmerIsRepeated()224     public void testGraphRecoveryWhenKmerIsRepeated() {
225         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
226         String ref = "AAATCTTCGGGGGGGGGGGGGGTTTCTGGG"; // the first site has an interesting graph structure and the second site is used to ensurethe graph is intersting
227 
228         // A simple snip het
229         String refRead = "AAATCTTCGGGGGGGGGGGGGGTTTCTGGG";
230 
231         assembler.addSequence("anonymous", getBytes(ref), true);
232         assembler.addSequence("anonymous", getBytes(refRead), false);
233 
234         assembler.buildGraphIfNecessary();
235         assembler.generateJunctionTrees();
236 
237         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
238         Assert.assertEquals(junctionTrees.size(), 2);
239 
240         JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree1 = junctionTrees.get(assembler.findKmer(new Kmer("CGGGG")));
241         JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree2 = junctionTrees.get(assembler.findKmer(new Kmer("GGGGG")));
242         Assert.assertNotNull(tree1); // Make sure we actually have tree data for that site
243         Assert.assertNotNull(tree2); // Make sure we actually have tree data for that site
244 
245 
246         List<String> tree1Choices = tree1.getPathsPresentAsBaseChoiceStrings();
247         Assert.assertEquals(tree1Choices, Collections.singletonList("GGGGGGGGGT_"));
248 
249         List<String> tree2Choices = tree2.getPathsPresentAsBaseChoiceStrings();
250         Assert.assertEquals(new HashSet<>(tree2Choices), new HashSet<>(Arrays.asList(
251                  "GGGGGGGGGT_"
252                 ,"GGGGGGGGT_"
253                 ,"GGGGGGGT_"
254                 ,"GGGGGGT_"
255                 ,"GGGGGT_"
256                 ,"GGGGT_"
257                 ,"GGGT_"
258                 ,"GGT_"
259                 ,"GT_"
260                 ,"T_")));
261     }
262 
263     @Test
264     // The simplest case where we expect two uninformative junction trees to be constructed
testPruneGraph()265     public void testPruneGraph() {
266         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
267         String ref = "GGGGAAATTTCCCGGGG";
268 
269         // A simple snip het
270         String refRead1 = "GGGAAATTTCC";
271         String refRead2 = "GAAATTTCCCGGG";
272         String altRead1 = "GGAAATGTC";
273         String altRead2 = "GGAAATGTCCCGGG";
274 
275         assembler.addSequence("anonymous", getBytes(ref), true);
276         assembler.addSequence("anonymous", getBytes(refRead1), false);
277         assembler.addSequence("anonymous", getBytes(refRead2), false);
278         assembler.addSequence("anonymous", getBytes(altRead1), false);
279         assembler.addSequence("anonymous", getBytes(altRead2), false);
280 
281         assembler.buildGraphIfNecessary();
282         assembler.generateJunctionTrees();
283 
284         assembler.pruneJunctionTrees(0);
285 
286         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
287         Assert.assertEquals(junctionTrees.size(), 0);
288     }
289 
290     @Test
testSimpleJunctionTreeIncludeRefInJunctionTreeNoStopEnd()291     public void testSimpleJunctionTreeIncludeRefInJunctionTreeNoStopEnd() {
292         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
293         String ref = "GGGGAAATTTCCCGGGG"; // Ref starts 1 base before/after any of the reads
294 
295         // A simple snip het
296         String altARead1 = "GGGAAATATCC"; // Replaces a T with an A
297         String altARead2 = "GAAATATCCCGGG";
298         String altTRead1 = "GGAAATGTC"; // Replaces a T with a G
299         String altTRead2 = "GGAAATGTCCCGGG";
300 
301         assembler.addSequence("anonymous", getBytes(ref), true);
302         assembler.addSequence("anonymous", getBytes(altTRead1), false);
303         assembler.addSequence("anonymous", getBytes(altTRead2), false);
304         assembler.addSequence("anonymous", getBytes(altARead1), false);
305         assembler.addSequence("anonymous", getBytes(altARead2), false);
306 
307         assembler.buildGraphIfNecessary();
308         assembler.generateJunctionTrees();
309 
310         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
311         Assert.assertEquals(junctionTrees.size(), 2);
312 
313         JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree1 = junctionTrees.get(assembler.findKmer(new Kmer("ATCCC")));
314         JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree2 = junctionTrees.get(assembler.findKmer(new Kmer("GTCCC")));
315         //NOTE there is no "TTCCC" edge here because it only existed as a reference path
316         Assert.assertNotNull(tree1); // Make sure we actually have tree data for that site
317         Assert.assertNotNull(tree2); // Make sure we actually have tree data for that site
318 
319         List<String> tree1Choices = tree1.getPathsPresentAsBaseChoiceStrings();
320         List<String> tree2Choices = tree2.getPathsPresentAsBaseChoiceStrings();
321         Assert.assertEquals(tree1Choices, Collections.singletonList(""));
322         Assert.assertEquals(tree2Choices, Collections.singletonList(""));
323     }
324 
325     @Test
326     // NOTE this test is less useful now that the starting JT has been removed
testSimpleJunctionTreeIncludeRefInJunctionTreeNoWithStartEndChanges()327     public void testSimpleJunctionTreeIncludeRefInJunctionTreeNoWithStartEndChanges() {
328         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
329         String ref = "GGGAAATTTCCCGGG"; // Reads span to the start/stop of the ref, thus we have to worry about symbolic alleles
330 
331         // A simple snip het
332         String altARead1 = "GGGAAATATCC"; // Replaces a T with an A
333         String altARead2 = "GAAATATCCCGGG";
334         String altTRead1 = "GGAAATGTC"; // Replaces a T with a G
335         String altTRead2 = "GGAAATGTCCCGGG";
336 
337         assembler.addSequence("anonymous", getBytes(ref), true);
338         assembler.addSequence("anonymous", getBytes(altTRead1), false);
339         assembler.addSequence("anonymous", getBytes(altTRead2), false);
340         assembler.addSequence("anonymous", getBytes(altARead1), false);
341         assembler.addSequence("anonymous", getBytes(altARead2), false);
342 
343         assembler.buildGraphIfNecessary();
344         assembler.generateJunctionTrees();
345 
346         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
347         Assert.assertEquals(junctionTrees.size(), 2);
348 
349         JunctionTreeLinkedDeBruijnGraph.ThreadingTree starttree = junctionTrees.get(assembler.findKmer(new Kmer("GGGAA")));
350         JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree1 = junctionTrees.get(assembler.findKmer(new Kmer("ATCCC")));
351         JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree2 = junctionTrees.get(assembler.findKmer(new Kmer("GTCCC")));
352         //NOTE there is no "TTCCC" edge here because it only existed as a reference path
353         Assert.assertNotNull(tree1); // Make sure we actually have tree data for that site
354         Assert.assertNotNull(tree2); // Make sure we actually have tree data for that site
355 
356         List<String> tree1Choices = tree1.getPathsPresentAsBaseChoiceStrings();
357         List<String> tree2Choices = tree2.getPathsPresentAsBaseChoiceStrings();
358         Assert.assertEquals(tree1Choices, Collections.singletonList("_"));
359         Assert.assertEquals(tree2Choices, Collections.singletonList("_"));
360     }
361 
362     @Test
testSimpleJunctionTreeIncludeRefInJunctionTreeTwoSites()363     public void testSimpleJunctionTreeIncludeRefInJunctionTreeTwoSites() {
364         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
365         String ref = "GGAAAT"+"T"+"TCCGGC"+"T"+"CGTTTAG"; //Two variant sites in close proximity
366 
367         // A simple snip het
368         String altAARead1 = "GAAAT"+"A"+"TCCGGC"+"A"+"CGTTTA"; // Replaces a T with an A, then a T with a A
369         String altAARead2 = "GAAAT"+"A"+"TCCGGC"+"A"+"CGTTTA"; // Replaces a T with an A, then a T with a A
370         String altTCRead1 = "GAAAT"+"T"+"TCCGGC"+"C"+"CGTTTA"; // Keeps the T, then replaces a T with a C
371         String altTCRead2 = "GAAAT"+"T"+"TCCGGC"+"C"+"CGTTTA"; // Keeps the T, then replaces a T with a C
372 
373         assembler.addSequence("anonymous", getBytes(ref), true);
374         assembler.addSequence("anonymous", getBytes(altAARead1), false);
375         assembler.addSequence("anonymous", getBytes(altAARead2), false);
376         assembler.addSequence("anonymous", getBytes(altTCRead1), false);
377         assembler.addSequence("anonymous", getBytes(altTCRead2), false);
378 
379         assembler.buildGraphIfNecessary();
380         assembler.generateJunctionTrees();
381 
382         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(true);
383         Assert.assertEquals(junctionTrees.size(), 2);
384 
385         JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree1 = junctionTrees.get(assembler.findKmer(new Kmer("ATCCG")));
386         JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree2 = junctionTrees.get(assembler.findKmer(new Kmer("TTCCG")));
387         //NOTE there is no "TTCCC" edge here because it only existed as a reference path
388         Assert.assertNotNull(tree1); // Make sure we actually have tree data for that site
389         Assert.assertNotNull(tree2); // Make sure we actually have tree data for that site
390 
391         List<String> tree1Choices = tree1.getPathsPresentAsBaseChoiceStrings();
392         List<String> tree2Choices = tree2.getPathsPresentAsBaseChoiceStrings();
393         // Perfectly phased variants (site 2 has 2 mismatches to the reference)
394         Assert.assertEquals(tree1Choices, Collections.singletonList("A"));
395         Assert.assertEquals(tree2Choices, Collections.singletonList("C"));
396     }
397 
398     @Test
testSimpleJuncionTreeLoopingReference()399     public void testSimpleJuncionTreeLoopingReference() {
400         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
401         String ref = "ATGGTTAGGGGAAATTTAAATTTAAAGCGCCCCCG"; // AAATT, AATTT, ATTTA, TTTAA, and TTAAA are all repeated in a loop
402 
403         assembler.addSequence("reference", getBytes(ref), true);
404         for (int i = 0; i + 20 < ref.length(); i++) {
405             assembler.addSequence("anonymous", getBytes(ref.substring(i, i + 20)), false);
406         }
407         assembler.buildGraphIfNecessary();
408         assembler.generateJunctionTrees();
409 
410         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
411         Assert.assertEquals(junctionTrees.size(), 2); //
412 
413         JunctionTreeLinkedDeBruijnGraph.ThreadingTree outsideTree = junctionTrees.get(assembler.findKmer(new Kmer("GAAAT")));
414         JunctionTreeLinkedDeBruijnGraph.ThreadingTree insideTree = junctionTrees.get(assembler.findKmer(new Kmer("TAAAT")));
415         Assert.assertNotNull(outsideTree); // Make sure we actually have tree data for that site
416         Assert.assertNotNull(insideTree); // Make sure we actually have tree data for that site
417 
418         List<String> insideChoices = insideTree.getPathsPresentAsBaseChoiceStrings();
419         Assert.assertEquals(insideChoices.size(), 1);
420         Assert.assertTrue(insideChoices.contains("G"));
421 
422         List<String> outsideChoices = outsideTree.getPathsPresentAsBaseChoiceStrings();
423         Assert.assertEquals(outsideChoices.size(), 1);
424         Assert.assertTrue(outsideChoices.contains("TG"));
425     }
426 
427     @Test
testSimpleJuncionTreeThriceLoopingReference()428     public void testSimpleJuncionTreeThriceLoopingReference() {
429         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5);
430         String ref = "ATGGTTAGGGGAAATTTAAATTTAAATTTAAAGCGCCCCCG"; // AAATT, AATTT, ATTTA, TTTAA, and TTAAA are all repeated in a loop
431 
432         assembler.addSequence("reference", getBytes(ref), true);
433         for (int i = 0; i + 23 < ref.length(); i++) {
434             // 20 bases should be exactly enough to recover the whole path through the loop (from GGAAAT to TTAAAG)
435             assembler.addSequence("anonymous", getBytes(ref.substring(i, i + 23)), false);
436         }
437         assembler.buildGraphIfNecessary();
438         assembler.generateJunctionTrees();
439 
440         Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false);
441         Assert.assertEquals(junctionTrees.size(), 2); //
442 
443         JunctionTreeLinkedDeBruijnGraph.ThreadingTree outsideTree = junctionTrees.get(assembler.findKmer(new Kmer("GAAAT")));
444         JunctionTreeLinkedDeBruijnGraph.ThreadingTree insideTree = junctionTrees.get(assembler.findKmer(new Kmer("TAAAT")));
445         Assert.assertNotNull(outsideTree); // Make sure we actually have tree data for that site
446         Assert.assertNotNull(insideTree); // Make sure we actually have tree data for that site
447 
448         List<String> insideChoices = insideTree.getPathsPresentAsBaseChoiceStrings();
449         Assert.assertEquals(insideChoices.size(), 2);
450         Assert.assertTrue(insideChoices.contains("TG"));
451         Assert.assertTrue(insideChoices.contains("G"));
452 
453         List<String> outsideChoices = outsideTree.getPathsPresentAsBaseChoiceStrings();
454         Assert.assertEquals(outsideChoices.size(), 1);
455         Assert.assertTrue(outsideChoices.contains("TTG"));
456     }
457 
458     private static final boolean DEBUG = false;
459 
getBytes(final String alignment)460     public static byte[] getBytes(final String alignment) {
461         return alignment.replace("-","").getBytes();
462     }
463 
assertNonUniqueKmersInvolveLoops(final JunctionTreeLinkedDeBruijnGraph assembler, String... nonUniques)464     private void assertNonUniqueKmersInvolveLoops(final JunctionTreeLinkedDeBruijnGraph assembler, String... nonUniques) {
465         final Set<String> actual = new HashSet<>();
466         assembler.buildGraphIfNecessary();
467         for (String kmer : nonUniques) {
468             MultiDeBruijnVertex vertex = assembler.findKmer(new Kmer(kmer));
469             Assert.assertTrue(assembler.incomingEdgesOf(vertex).size() > 1 || assembler.outgoingEdgesOf(vertex).size() > 1);
470         }
471     }
472 
473     @Test
testSimpleHaplotypeRethreading()474     public void testSimpleHaplotypeRethreading() {
475         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(11);
476         final String ref   = "CATGCACTTTAAAACTTGCCTTTTTAACAAGACTTCCAGATG";
477         final String alt   = "CATGCACTTTAAAACTTGCCGTTTTAACAAGACTTCCAGATG";
478         assembler.addSequence("anonymous", getBytes(ref), true);
479         assembler.addSequence("anonymous", getBytes(alt), false);
480         assembler.buildGraphIfNecessary();
481         Assert.assertNotEquals(ref.length() - 11 + 1, assembler.vertexSet().size(), "the number of vertex in the graph is the same as if there was no alternative sequence");
482         Assert.assertEquals(ref.length() - 11 + 1 + 11, assembler.vertexSet().size(), "the number of vertex in the graph is not the same as if there is an alternative sequence");
483         MultiDeBruijnVertex startAlt = assembler.findKmer(new Kmer(alt.getBytes(),20,11));
484         Assert.assertNotNull(startAlt);
485     }
486 
487     @Test
testJunctionTreeErrorCorrection()488     public void testJunctionTreeErrorCorrection() {
489         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(11);
490         final String ref            = "AAAAAAAAAAACCCCCC"+"G"+"CCCCCTTTTTTTTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT";
491         final String supportedAlt   = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCTTTTTTTTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"A"+"ATATATATAATAT";
492         // This path has an undersupported edge that gets pruned, we want to assert that we can recover the proper junction tree weights regardless
493         final String unSupportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCTTTTTTTTTTTGGGGGGG"+"C"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT";
494         assembler.addSequence("anonymous", getBytes(ref), true);
495         assembler.addSequence("anonymous", getBytes(supportedAlt), false);
496         assembler.addSequence("anonymous", getBytes(supportedAlt), false);
497         assembler.addSequence("anonymous", getBytes(supportedAlt), false);
498         // Only provide a single instance of the path so that its middle C variant gets pruned
499         assembler.addSequence("anonymous", getBytes(unSupportedAlt), false);
500         assembler.buildGraphIfNecessary();
501         new LowWeightChainPruner<MultiDeBruijnVertex, MultiSampleEdge>(2).pruneLowWeightChains(assembler);
502         assembler.generateJunctionTrees();
503 
504         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths =
505                 new JunctionTreeKBestHaplotypeFinder<>(assembler,assembler.getReferenceSourceVertex(),assembler.getReferenceSinkVertex(), 0, false).findBestHaplotypes(10);
506 
507         Assert.assertEquals(bestPaths.size(),2);
508         Assert.assertEquals(new String(bestPaths.get(0).getBases()), supportedAlt);
509         Assert.assertEquals(new String(bestPaths.get(1).getBases()), "AAAAAAAAAAACCCCCC"+"T"+"CCCCCTTTTTTTTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT");
510     }
511 
512     @Test
testJunctionTreeErrorCorrectionUnrecoverableWithInsertion()513     public void testJunctionTreeErrorCorrectionUnrecoverableWithInsertion() {
514         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(11);
515         final String ref            = "AAAAAAAAAAACCCCCC"+"G"+"CCCCCTTTTTTTTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT";
516         final String supportedAlt   = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCTTTTTTTTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"A"+"ATATATATAATAT";
517         // This path has an undersupported edge that gets pruned, we want to assert that we can recover the proper junction tree weights regardless
518         final String unSupportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCTTTTTTTTTTTGGGGGGG"+"CAAT"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT";
519         assembler.addSequence("anonymous", getBytes(ref), true);
520         assembler.addSequence("anonymous", getBytes(supportedAlt), false);
521         assembler.addSequence("anonymous", getBytes(supportedAlt), false);
522         assembler.addSequence("anonymous", getBytes(supportedAlt), false);
523         // Only provide a single instance of the path so that its middle C variant gets pruned
524         assembler.addSequence("anonymous", getBytes(unSupportedAlt), false);
525         assembler.buildGraphIfNecessary();
526         new LowWeightChainPruner<MultiDeBruijnVertex, MultiSampleEdge>(2).pruneLowWeightChains(assembler);
527         assembler.generateJunctionTrees();
528 
529         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths =
530                 new JunctionTreeKBestHaplotypeFinder<>(assembler,assembler.getReferenceSourceVertex(),assembler.getReferenceSinkVertex(), 0, false).findBestHaplotypes(10);
531 
532         // We only see the supported alt path because the unsupported alt path is never recovered
533         Assert.assertEquals(bestPaths.size(),1);
534         Assert.assertEquals(new String(bestPaths.get(0).getBases()), supportedAlt);
535     }
536 
537     @Test
testJunctionTreeErrorCorrectionKmerSizeDistanceAllowance()538     public void testJunctionTreeErrorCorrectionKmerSizeDistanceAllowance() {
539         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(11);
540         final String ref            = "AAAAAAAAAAACCCCCC"+"G"+"CCCCCCTTTTTT"+"A"+"TTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT";
541         final String supportedAlt   = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCCTTTTTT"+"A"+"TTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"A"+"ATATATATAATAT";
542         // This path has two unsupported edges which should be pruned, however they are more than kmer size apart so the junction tree code should still work to recover the connectivity
543         final String unSupportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCCTTTTTT"+"G"+"TTTTTGGGGGGG"+"T"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT";
544         assembler.addSequence("anonymous", getBytes(ref), true);
545         assembler.addSequence("anonymous", getBytes(supportedAlt), false);
546         assembler.addSequence("anonymous", getBytes(supportedAlt), false);
547         assembler.addSequence("anonymous", getBytes(supportedAlt), false);
548         // Only provide a single instance of the path so that its middle C variant gets pruned
549         assembler.addSequence("anonymous", getBytes(unSupportedAlt), false);
550         assembler.buildGraphIfNecessary();
551         new LowWeightChainPruner<MultiDeBruijnVertex, MultiSampleEdge>(2).pruneLowWeightChains(assembler);
552         assembler.generateJunctionTrees();
553 
554         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths =
555                 new JunctionTreeKBestHaplotypeFinder<>(assembler,assembler.getReferenceSourceVertex(),assembler.getReferenceSinkVertex(), 0, false).findBestHaplotypes(10);
556 
557         Assert.assertEquals(bestPaths.size(),2);
558         Assert.assertEquals(new String(bestPaths.get(0).getBases()), supportedAlt);
559         Assert.assertEquals(new String(bestPaths.get(1).getBases()), "AAAAAAAAAAACCCCCC"+"T"+"CCCCCCTTTTTT"+"A"+"TTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT");
560     }
561 
562     @Test
testJuncitonTreeErrorCorrectionFailingIfWithinKmerSizeForConsistencySake()563     public void testJuncitonTreeErrorCorrectionFailingIfWithinKmerSizeForConsistencySake() {
564         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(11);
565         final String ref            = "AAAAAAAAAAACCCCCC"+"G"+"CCCCCCTTTTTT"+"A"+"TTTTGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT";
566         final String supportedAlt   = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCCTTTTTT"+"A"+"TTTTGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"A"+"ATATATATAATAT";
567         // This path has two unsupported edges which should be pruned, however they are less than a kmer size apart and should result in the path being unable to find
568         final String unSupportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCCTTTTTT"+"G"+"TTTTGGGG"+"T"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT";
569         assembler.addSequence("anonymous", getBytes(ref), true);
570         assembler.addSequence("anonymous", getBytes(supportedAlt), false);
571         assembler.addSequence("anonymous", getBytes(supportedAlt), false);
572         assembler.addSequence("anonymous", getBytes(supportedAlt), false);
573         // Only provide a single instance of the path so that its middle C variant gets pruned
574         assembler.addSequence("anonymous", getBytes(unSupportedAlt), false);
575         assembler.buildGraphIfNecessary();
576         new LowWeightChainPruner<MultiDeBruijnVertex, MultiSampleEdge>(2).pruneLowWeightChains(assembler);
577         assembler.generateJunctionTrees();
578 
579         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths =
580                 new JunctionTreeKBestHaplotypeFinder<>(assembler,assembler.getReferenceSourceVertex(),assembler.getReferenceSinkVertex(), 0, false).findBestHaplotypes(10);
581 
582         // We only see the supported alt path because the unsupported alt path is never recovered
583         Assert.assertEquals(bestPaths.size(),1);
584         Assert.assertEquals(new String(bestPaths.get(0).getBases()), supportedAlt);
585     }
586 
587     @Test
588     // test asserting that we don't make spurious edges when we are trying to recover error bases
testJunctionTreeErrorCorrectionNotAddingTreesWhenOverTentativeBases()589     public void testJunctionTreeErrorCorrectionNotAddingTreesWhenOverTentativeBases() {
590         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(11);
591         final String ref             = "AAAAAAAAAAACCCCCC"+"G"+"CCCCCCTTTTTT"+"A"+"TTGG"+"A"+"GGG"+"C"+"GTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT";
592         final String supportedAlt   = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCCTTTTTT"+"A"+"TTGG"+"C"+"GGG"+"C"+"GTGTGTGTGTGCCCGTGTGT"+"A"+"ATATATATAATAT";
593 
594         // This path shold be pruned, but if we aren't careful might end up pairing the first SNP with the middle SNP at this site
595         final String unSupportedAltWithError = "AAAAAAAAAAACCCCCC"+"G"+"CCCCCCTTTTTT"+"C"+"TTGG"+"C"+"GGG"+"A"+"GTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT";
596         assembler.addSequence("anonymous", getBytes(ref), true);
597         assembler.addSequence("anonymous", getBytes(ref), false);
598         assembler.addSequence("anonymous", getBytes(ref), false);
599         assembler.addSequence("anonymous", getBytes(supportedAlt),  false);
600         assembler.addSequence("anonymous", getBytes(supportedAlt),  false);
601         assembler.addSequence("anonymous", getBytes(unSupportedAltWithError), 1, false);
602         assembler.buildGraphIfNecessary();
603         new LowWeightChainPruner<MultiDeBruijnVertex, MultiSampleEdge>(2).pruneLowWeightChains(assembler);
604         assembler.generateJunctionTrees();
605 
606         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths =
607                 new JunctionTreeKBestHaplotypeFinder<>(assembler,assembler.getReferenceSourceVertex(),assembler.getReferenceSinkVertex(), 0, false).setJunctionTreeEvidenceWeightThreshold(1).findBestHaplotypes(10);
608 
609         // We only see the supported alt path because the unsupported alt path is never recovered
610         // If we saw 3 that means the undersupported pruned path ended up in the junciton trees and created a path that shouldn't be there
611         Assert.assertEquals(bestPaths.size(),2);
612         Assert.assertEquals(new String(bestPaths.get(0).getBases()), ref);
613         Assert.assertEquals(new String(bestPaths.get(1).getBases()), supportedAlt);
614     }
615 
616     @Test(enabled = ! DEBUG)
testNonUniqueMiddle()617     public void testNonUniqueMiddle() {
618         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(3);
619         final String ref   = "GACACACAGTCA";
620         final String read1 = "GACAC---GTCA";
621         final String read2 =   "CAC---GTCA";
622         assembler.addSequence(getBytes(ref), true);
623         assembler.addSequence(getBytes(read1), false);
624         assembler.addSequence(getBytes(read2), false);
625         assertNonUniqueKmersInvolveLoops(assembler, "ACA", "CAC");
626     }
627 
628     @Test(enabled = ! DEBUG)
testReadsCreateNonUnique()629     public void testReadsCreateNonUnique() {
630         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(3);
631         final String ref   = "GCAC--GTCA"; // CAC is unique
632         final String read1 = "GCACACGTCA"; // makes CAC non unique because it has a duplication
633         final String read2 =    "CACGTCA"; // shouldn't be allowed to match CAC as start
634         assembler.addSequence(getBytes(ref), true);
635         assembler.addSequence(getBytes(read1), false);
636         assembler.addSequence(getBytes(read2), false);
637 //        assembler.convertToSequenceGraph().printGraph(new File("test.dot"), 0);
638 
639         assertNonUniqueKmersInvolveLoops(assembler, "CAC");
640         //assertSingleBubble(assembler, ref, "CAAAATCGGG");
641     }
642 
643     @Test(enabled = ! DEBUG)
644     // NOTE that now we do still count the reference as edge multiplicity
testCountingOfStartEdges()645     public void testCountingOfStartEdges() {
646         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(3);
647         final String ref   = "NNNGTCAAA"; // ref has some bases before start
648         final String read1 =    "GTCAAA"; // starts at first non N base
649 
650         assembler.addSequence(getBytes(ref), true);
651         assembler.addSequence(getBytes(read1), false);
652         assembler.buildGraphIfNecessary();
653 //        assembler.printGraph(new File("test.dot"), 0);
654 
655         for ( final MultiSampleEdge edge : assembler.edgeSet() ) {
656             final MultiDeBruijnVertex source = assembler.getEdgeSource(edge);
657             final MultiDeBruijnVertex target = assembler.getEdgeTarget(edge);
658             final boolean headerVertex = source.getSuffix() == 'N' || target.getSuffix() == 'N';
659             if ( headerVertex ) {
660                 Assert.assertEquals(edge.getMultiplicity(), 1, "Bases in the unique reference header should have multiplicity of 0");
661             } else {
662                 Assert.assertEquals(edge.getMultiplicity(), 2, "Should have multiplicity of 1 for any edge outside the ref header but got " + edge + " " + source + " -> " + target);
663             }
664         }
665     }
666 
667     @Test(enabled = !DEBUG)
testCountingOfStartEdgesWithMultiplePrefixes()668     public void testCountingOfStartEdgesWithMultiplePrefixes() {
669         final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(3);
670         assembler.setIncreaseCountsThroughBranches(true);
671         final String ref   = "NNNGTCAXX"; // ref has some bases before start
672         final String alt1  = "NNNCTCAXX"; // alt1 has SNP right after N
673         final String read  =     "TCAXX"; // starts right after SNP, but merges right before branch
674 
675         assembler.addSequence(getBytes(ref), true);
676         assembler.addSequence(getBytes(alt1), false);
677         assembler.addSequence(getBytes(read), false);
678         assembler.buildGraphIfNecessary();
679         assembler.printGraph(createTempFile("test",".dot"), 0);
680 
681         final List<String> oneCountVertices = Arrays.asList("NNN", "NNC", "NCT", "NNG", "NGT");
682         final List<String> twoCountVertices = Arrays.asList("CTC", "TCA", "GTC");
683 
684         for ( final MultiSampleEdge edge : assembler.edgeSet() ) {
685             final MultiDeBruijnVertex source = assembler.getEdgeSource(edge);
686             final MultiDeBruijnVertex target = assembler.getEdgeTarget(edge);
687             final int expected;
688 //            if (source.getSequenceString().equals("GTC") && target.getSequenceString().equals("TCA")) {
689 //                expected = 1;
690 //            } else {
691                 expected = oneCountVertices.contains(target.getSequenceString()) ? 1 : ((twoCountVertices.contains(target.getSequenceString()) ? 2 : 3));
692 //            }
693             Assert.assertEquals(edge.getMultiplicity(), expected, "Bases at edge " + edge + " from " + source + " to " + target + " has bad multiplicity");
694         }
695     }
696 
697     @Test(enabled = !DEBUG)
testCyclesInGraph()698     public void testCyclesInGraph() {
699 
700         // b37 20:12655200-12655850
701         final String ref = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTATACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
702 
703         // SNP at 20:12655528 creates a cycle for small kmers
704         final String alt = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTGTACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
705 
706         final List<GATKRead> reads = new ArrayList<>();
707         for ( int index = 0; index < alt.length() - 100; index += 20 ) {
708             reads.add(ArtificialReadUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M"));
709         }
710 
711         // test that there are cycles detected for small kmer
712         final JunctionTreeLinkedDeBruijnGraph rtgraph25 = new JunctionTreeLinkedDeBruijnGraph(25);
713         rtgraph25.addSequence("ref", ref.getBytes(), true);
714         final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader();
715         for ( final GATKRead read : reads ) {
716             rtgraph25.addRead(read, header);
717         }
718         rtgraph25.buildGraphIfNecessary();
719         Assert.assertTrue(rtgraph25.hasCycles());
720 
721         // test that there are no cycles detected for large kmer
722         final JunctionTreeLinkedDeBruijnGraph rtgraph75 = new JunctionTreeLinkedDeBruijnGraph(75);
723         rtgraph75.addSequence("ref", ref.getBytes(), true);
724         for ( final GATKRead read : reads ) {
725             rtgraph75.addRead(read, header);
726         }
727         rtgraph75.buildGraphIfNecessary();
728         Assert.assertFalse(rtgraph75.hasCycles());
729     }
730 
731     @Test(enabled = !DEBUG)
732     // Test showing that if a read gets completely clipped in the ReadThreadingAssembler, that the assembly will not crash
testEmptyReadBeingAddedToGraph()733     public void testEmptyReadBeingAddedToGraph() {
734 
735         // b37 20:12655200-12655850
736         final String ref = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTATACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
737 
738         // SNP at 20:12655528 creates a cycle for small kmers
739         final String alt = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTGTACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG";
740 
741         final List<GATKRead> reads = new ArrayList<>();
742         for ( int index = 0; index < alt.length() - 100; index += 20 ) {
743             reads.add(ArtificialReadUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M"));
744         }
745         reads.add(ReadUtils.emptyRead(reads.get(0)));
746 
747         // test that there are cycles detected for small kmer
748         final JunctionTreeLinkedDeBruijnGraph rtgraph25 = new JunctionTreeLinkedDeBruijnGraph(25);
749         rtgraph25.addSequence("ref", ref.getBytes(), true);
750         final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader();
751         for ( final GATKRead read : reads ) {
752             rtgraph25.addRead(read, header);
753         }
754         rtgraph25.buildGraphIfNecessary();
755     }
756 
757     @Test(enabled = false)
758     // TODO determine what is causing null bases here
testNsInReadsAreNotUsedForGraph()759     public void testNsInReadsAreNotUsedForGraph() {
760 
761         final int length = 100;
762         final byte[] ref = Utils.dupBytes((byte) 'A', length);
763 
764         final JunctionTreeLinkedDeBruijnGraph rtgraph = new JunctionTreeLinkedDeBruijnGraph(25);
765         rtgraph.addSequence("ref", ref, true);
766 
767         // add reads with Ns at any position
768         final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader();
769         for ( int i = 0; i < length; i++ ) {
770             final byte[] bases = ref.clone();
771             bases[i] = 'N';
772             final GATKRead read = ArtificialReadUtils.createArtificialRead(bases, Utils.dupBytes((byte) 30, length), length + "M");
773             rtgraph.addRead(read, header);
774         }
775         rtgraph.buildGraphIfNecessary();
776 
777         //final SeqGraph graph = rtgraph.toSequenceGraph();
778         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> paths = new GraphBasedKBestHaplotypeFinder<>(rtgraph, rtgraph.getReferenceSourceVertex(), rtgraph.getReferenceSinkVertex()).findBestHaplotypes();
779         Assert.assertEquals(paths.size(), 1);
780     }
781 
782 // TODO -- update to use determineKmerSizeAndNonUniques directly
783 //    @DataProvider(name = "KmerSizeData")
784 //    public Object[][] makeKmerSizeDataProvider() {
785 //        List<Object[]> tests = new ArrayList<Object[]>();
786 //
787 //        // this functionality can be adapted to provide input data for whatever you might want in your data
788 //        tests.add(new Object[]{3, 3, 3, Arrays.asList("ACG"), Arrays.asList()});
789 //        tests.add(new Object[]{3, 4, 3, Arrays.asList("CAGACG"), Arrays.asList()});
790 //
791 //        tests.add(new Object[]{3, 3, 3, Arrays.asList("AAAAC"), Arrays.asList("AAA")});
792 //        tests.add(new Object[]{3, 4, 4, Arrays.asList("AAAAC"), Arrays.asList()});
793 //        tests.add(new Object[]{3, 5, 4, Arrays.asList("AAAAC"), Arrays.asList()});
794 //        tests.add(new Object[]{3, 4, 3, Arrays.asList("CAAA"), Arrays.asList()});
795 //        tests.add(new Object[]{3, 4, 4, Arrays.asList("CAAAA"), Arrays.asList()});
796 //        tests.add(new Object[]{3, 5, 4, Arrays.asList("CAAAA"), Arrays.asList()});
797 //        tests.add(new Object[]{3, 5, 5, Arrays.asList("ACGAAAAACG"), Arrays.asList()});
798 //
799 //        for ( int maxSize = 3; maxSize < 20; maxSize++ ) {
800 //            for ( int dupSize = 3; dupSize < 20; dupSize++ ) {
801 //                final int expectedSize = Math.min(maxSize, dupSize);
802 //                final String dup = Utils.dupString("C", dupSize);
803 //                final List<String> nonUnique = dupSize > maxSize ? Arrays.asList(Utils.dupString("C", maxSize)) : Collections.<String>emptyList();
804 //                tests.add(new Object[]{3, maxSize, expectedSize, Arrays.asList("ACGT", "A" + dup + "GT"), nonUnique});
805 //                tests.add(new Object[]{3, maxSize, expectedSize, Arrays.asList("A" + dup + "GT", "ACGT"), nonUnique});
806 //            }
807 //        }
808 //
809 //        return tests.toArray(new Object[][]{});
810 //    }
811 //
812 //    /**
813 //     * Example testng test using MyDataProvider
814 //     */
815 //    @Test(dataProvider = "KmerSizeData")
816 //    public void testDynamicKmerSizing(final int min, final int max, final int expectKmer, final List<String> seqs, final List<String> expectedNonUniques) {
817 //        final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(min, max);
818 //        for ( String seq : seqs ) assembler.addSequence(seq.getBytes(), false);
819 //        assembler.buildGraphIfNecessary();
820 //        Assert.assertEquals(assembler.getKmerSize(), expectKmer);
821 //        assertNonUniqueKmersInvolveLoops(assembler, expectedNonUniques.toArray(new String[]{}));
822 //    }
823 
824     @DataProvider(name = "DanglingTails")
makeDanglingTailsData()825     public Object[][] makeDanglingTailsData() {
826         List<Object[]> tests = new ArrayList<>();
827 
828         // add 1M to the expected CIGAR because it includes the previous (common) base too
829         tests.add(new Object[]{"AAAAAAAAAA", "CAAA", "5M", true, 3});                  // incomplete haplotype
830         tests.add(new Object[]{"AAAAAAAAAA", "CAAAAAAAAAA", "1M1I10M", true, 10});     // insertion
831         tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10});    // deletion
832         tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7});                // 1 snp
833         tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2});                // several snps
834         tests.add(new Object[]{"AAAAA", "C", "1M4D1M", false, -1});                    // funky SW alignment
835         tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", false, 1});                    // very little data
836         tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1});                 // ends in mismatch
837         tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0});       // alignment is too complex
838         tests.add(new Object[]{"AAAAA", "XXXXX", "1M5I", false, -1});                  // insertion
839 
840         return tests.toArray(new Object[][]{});
841     }
842 
843     @Test(dataProvider = "DanglingTails")
testDanglingTails(final String refEnd, final String altEnd, final String cigar, final boolean cigarIsGood, final int mergePointDistanceFromSink)844     public void testDanglingTails(final String refEnd,
845                                   final String altEnd,
846                                   final String cigar,
847                                   final boolean cigarIsGood,
848                                   final int mergePointDistanceFromSink) {
849 
850         final int kmerSize = 15;
851 
852         // construct the haplotypes
853         final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT";
854         final String ref = commonPrefix + refEnd;
855         final String alt = commonPrefix + altEnd;
856 
857         // create the graph and populate it
858         final JunctionTreeLinkedDeBruijnGraph rtgraph = new JunctionTreeLinkedDeBruijnGraph(kmerSize);
859         rtgraph.addSequence("ref", ref.getBytes(), true);
860         final GATKRead read = ArtificialReadUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M");
861         final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader();
862         rtgraph.addRead(read, header);
863         rtgraph.buildGraphIfNecessary();
864 
865         // confirm that we have just a single dangling tail
866         MultiDeBruijnVertex altSink = null;
867         for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) {
868             if ( rtgraph.isSink(v) && !rtgraph.isReferenceNode(v) ) {
869                 Assert.assertTrue(altSink == null, "We found more than one non-reference sink");
870                 altSink = v;
871             }
872         }
873 
874         Assert.assertTrue(altSink != null, "We did not find a non-reference sink");
875 
876         // confirm that the SW alignment agrees with our expectations
877         final JunctionTreeLinkedDeBruijnGraph.DanglingChainMergeHelper result = rtgraph.generateCigarAgainstDownwardsReferencePath(altSink, 0, 4, false, SmithWatermanJavaAligner
878                 .getInstance());
879 
880         if ( result == null ) {
881             Assert.assertFalse(cigarIsGood);
882             return;
883         }
884 
885         Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString());
886 
887         // confirm that the goodness of the cigar agrees with our expectations
888         Assert.assertEquals(JunctionTreeLinkedDeBruijnGraph.cigarIsOkayToMerge(result.cigar, false, true), cigarIsGood);
889 
890         // confirm that the tail merging works as expected
891         if ( cigarIsGood ) {
892             final int mergeResult = rtgraph.mergeDanglingTail(result);
893             Assert.assertTrue(mergeResult == 1 || mergePointDistanceFromSink == -1);
894 
895             // confirm that we created the appropriate edge
896             if ( mergePointDistanceFromSink >= 0 ) {
897                 MultiDeBruijnVertex v = altSink;
898                 for ( int i = 0; i < mergePointDistanceFromSink; i++ ) {
899                     if ( rtgraph.inDegreeOf(v) != 1 )
900                         Assert.fail("Encountered vertex with multiple edges");
901                     v = rtgraph.getEdgeSource(rtgraph.incomingEdgeOf(v));
902                 }
903                 Assert.assertTrue(rtgraph.outDegreeOf(v) > 1);
904             }
905         }
906     }
907 
908     @Test (enabled = false)
909     //TODO this test will need to be reenabled when dangling head recovery is handled with the JTBesthaplotypeFinder
testForkedDanglingEnds()910     public void testForkedDanglingEnds() {
911 
912         final int kmerSize = 15;
913 
914         // construct the haplotypes
915         final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT";
916         final String refEnd = "GCTAGCTAATCG";
917         final String altEnd1 = "ACTAGCTAATCG";
918         final String altEnd2 = "ACTAGATAATCG";
919         final String ref = commonPrefix + refEnd;
920         final String alt1 = commonPrefix + altEnd1;
921         final String alt2 = commonPrefix + altEnd2;
922 
923         // create the graph and populate it
924         final JunctionTreeLinkedDeBruijnGraph rtgraph = new JunctionTreeLinkedDeBruijnGraph(kmerSize);
925         rtgraph.addSequence("ref", ref.getBytes(), true);
926         final GATKRead read1 = ArtificialReadUtils.createArtificialRead(alt1.getBytes(), Utils.dupBytes((byte) 30, alt1.length()), alt1.length() + "M");
927         final GATKRead read2 = ArtificialReadUtils.createArtificialRead(alt2.getBytes(), Utils.dupBytes((byte) 30, alt2.length()), alt2.length() + "M");
928         final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader();
929         rtgraph.addRead(read1, header);
930         rtgraph.addRead(read2, header);
931         rtgraph.buildGraphIfNecessary();
932 
933         Assert.assertEquals(rtgraph.getSinks().size(), 3);
934 
935         for (final MultiDeBruijnVertex altSink : rtgraph.getSinks()) {
936             if (rtgraph.isReferenceNode(altSink)) {
937                 continue;
938             }
939 
940             // confirm that the SW alignment agrees with our expectations
941             final JunctionTreeLinkedDeBruijnGraph.DanglingChainMergeHelper result = rtgraph.generateCigarAgainstDownwardsReferencePath(altSink,
942                     0, 4, true, SmithWatermanJavaAligner.getInstance());
943             Assert.assertNotNull(result);
944             Assert.assertTrue(JunctionTreeLinkedDeBruijnGraph.cigarIsOkayToMerge(result.cigar, false, true));
945 
946             // confirm that the tail merging works as expected
947             final int mergeResult = rtgraph.mergeDanglingTail(result);
948             Assert.assertTrue(mergeResult == 1);
949         }
950 
951         final List<String> paths = new JunctionTreeKBestHaplotypeFinder<>(rtgraph).findBestHaplotypes().stream()
952                 .map(kBestHaplotype -> new String(kBestHaplotype.getBases()))
953                 .distinct()
954                 .sorted()
955                 .collect(Collectors.toList());
956 
957         Assert.assertEquals(paths.size(), 3);
958         Assert.assertEquals(paths, Arrays.asList(ref, alt1, alt2).stream().sorted().collect(Collectors.toList()));
959     }
960 
961     @Test
testWholeTailIsInsertion()962     public void testWholeTailIsInsertion() {
963         final JunctionTreeLinkedDeBruijnGraph rtgraph = new JunctionTreeLinkedDeBruijnGraph(10);
964         final JunctionTreeLinkedDeBruijnGraph.DanglingChainMergeHelper result = new JunctionTreeLinkedDeBruijnGraph.DanglingChainMergeHelper(null, null, "AXXXXX".getBytes(), "AAAAAA".getBytes(), TextCigarCodec.decode("5I1M"));
965         final int mergeResult = rtgraph.mergeDanglingTail(result);
966         Assert.assertEquals(mergeResult, 0);
967     }
968 
969     @Test
testGetBasesForPath()970     public void testGetBasesForPath() {
971 
972         final int kmerSize = 4;
973         final String testString = "AATGGGGCAATACTA";
974 
975         final JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(kmerSize);
976         graph.addSequence(testString.getBytes(), true);
977         graph.buildGraphIfNecessary();
978 
979         final List<MultiDeBruijnVertex> vertexes = new ArrayList<>();
980         MultiDeBruijnVertex v = graph.getReferenceSourceVertex();
981         while ( v != null ) {
982             vertexes.add(v);
983             v = graph.getNextReferenceVertex(v);
984         }
985 
986         final String resultForTails = new String(graph.getBasesForPath(vertexes, false));
987         Assert.assertEquals(resultForTails, testString.substring(kmerSize - 1));
988         final String resultForHeads = new String(graph.getBasesForPath(vertexes, true));
989         Assert.assertEquals(resultForHeads, "GTAAGGGCAATACTA");  // because the source node will be reversed
990     }
991 
992     @DataProvider(name = "DanglingHeads")
makeDanglingHeadsData()993     public Object[][] makeDanglingHeadsData() {
994         List<Object[]> tests = new ArrayList<>();
995 
996         // add 1M to the expected CIGAR because it includes the last (common) base too
997         tests.add(new Object[]{"XXTXXXXAACCGGTTACGT", "AAYCGGTTACGT", "8M", true});        // 1 snp
998         tests.add(new Object[]{"XXXAACCGGTTACGT", "XAAACCGGTTACGT", "7M", false});         // 1 snp
999         tests.add(new Object[]{"XXTXXXXAACCGGTTACGT", "XAACGGTTACGT", "4M1D4M", false});   // deletion
1000         tests.add(new Object[]{"XXTXXXXAACCGGTTACGT", "AYYCGGTTACGT", "8M", true});        // 2 snps
1001         tests.add(new Object[]{"XXTXXXXAACCGGTTACGTAA", "AYCYGGTTACGTAA", "9M", true});    // 2 snps
1002         tests.add(new Object[]{"XXXTXXXAACCGGTTACGT", "AYCGGTTACGT", "7M", true});         // very little data
1003         tests.add(new Object[]{"XXTXXXXAACCGGTTACGT", "YCCGGTTACGT", "6M", true});         // begins in mismatch
1004 
1005         //TODO these tests are diffiuclt to evaluate, as GraphBasedKBestHaplotypeFinder currently cannot handle looping sequence which poses a problem for testing explicitly looping refrence behavior
1006 //        tests.add(new Object[]{"XXTXXXXAACCGGTTACGTTTACGT", "AAYCGGTTACGT", "8M", true});        // dangling head hanging off of a non-unique reference kmer
1007 //        tests.add(new Object[]{"XXTXXXXAACCGGTTACGTCGGTTACGT", "AAYCGGTTACGT", "8M", true});
1008 
1009         return tests.toArray(new Object[][]{});
1010     }
1011 
1012     @Test(dataProvider = "DanglingHeads")
testDanglingHeads(final String ref, final String alt, final String cigar, final boolean shouldBeMerged)1013     public void testDanglingHeads(final String ref,
1014                                   final String alt,
1015                                   final String cigar,
1016                                   final boolean shouldBeMerged) {
1017 
1018         final int kmerSize = 5;
1019 
1020         // create the graph and populate it
1021         final JunctionTreeLinkedDeBruijnGraph rtgraph = new JunctionTreeLinkedDeBruijnGraph(kmerSize);
1022         rtgraph.addSequence("ref", ref.getBytes(), true);
1023         final GATKRead read = ArtificialReadUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M");
1024         final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader();
1025         rtgraph.addRead(read, header);
1026         rtgraph.setMaxMismatchesInDanglingHead(10);
1027         rtgraph.buildGraphIfNecessary();
1028 
1029         // confirm that we have just a single dangling head
1030         MultiDeBruijnVertex altSource = null;
1031         for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) {
1032             if ( rtgraph.isSource(v) && !rtgraph.isReferenceNode(v) ) {
1033                 Assert.assertTrue(altSource == null, "We found more than one non-reference source");
1034                 altSource = v;
1035             }
1036         }
1037 
1038         Assert.assertTrue(altSource != null, "We did not find a non-reference source");
1039 
1040         // confirm that the SW alignment agrees with our expectations
1041         final JunctionTreeLinkedDeBruijnGraph.DanglingChainMergeHelper result = rtgraph.generateCigarAgainstUpwardsReferencePath(altSource, 0, 1, false, SmithWatermanJavaAligner
1042                 .getInstance());
1043 
1044         if ( result == null ) {
1045             Assert.assertFalse(shouldBeMerged);
1046             return;
1047         }
1048 
1049         Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString());
1050 
1051         // confirm that the tail merging works as expected
1052         final int mergeResult = rtgraph.mergeDanglingHeadLegacy(result);
1053         Assert.assertTrue(mergeResult > 0 || !shouldBeMerged);
1054 
1055         // confirm that we created the appropriate bubble in the graph only if expected
1056         rtgraph.cleanNonRefPaths();
1057 //        final SeqGraph seqGraph = rtgraph.toSequenceGraph();
1058         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> paths = new GraphBasedKBestHaplotypeFinder<>(rtgraph, rtgraph.getReferenceSourceVertex(), rtgraph.getReferenceSinkVertex()).findBestHaplotypes();
1059         Assert.assertEquals(paths.size(), shouldBeMerged ? 2 : 1);
1060     }
1061 
1062     @Test(dataProvider = "DanglingHeads", enabled = false)
1063     //TODO this test will need to be enabled when we change the dangling end recovery logic, as of right now definitionally we are
1064     //TODO incapable of recovering dangling heads and that will have to change eventually
testDanglingHeadsWithJTBestHaplotypeFinder(final String ref, final String alt, final String cigar, final boolean shouldBeMerged)1065     public void testDanglingHeadsWithJTBestHaplotypeFinder(final String ref,
1066                                   final String alt,
1067                                   final String cigar,
1068                                   final boolean shouldBeMerged) {
1069 
1070         final int kmerSize = 5;
1071 
1072         // create the graph and populate it
1073         final JunctionTreeLinkedDeBruijnGraph rtgraph = new JunctionTreeLinkedDeBruijnGraph(kmerSize);
1074         rtgraph.addSequence("ref", ref.getBytes(), true);
1075         final GATKRead read = ArtificialReadUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M");
1076         final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader();
1077         rtgraph.addRead(read, header);
1078         rtgraph.setMaxMismatchesInDanglingHead(10);
1079         rtgraph.buildGraphIfNecessary();
1080 
1081         // confirm that we have just a single dangling head
1082         MultiDeBruijnVertex altSource = null;
1083         for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) {
1084             if ( rtgraph.isSource(v) && !rtgraph.isReferenceNode(v) ) {
1085                 Assert.assertTrue(altSource == null, "We found more than one non-reference source");
1086                 altSource = v;
1087             }
1088         }
1089 
1090         Assert.assertTrue(altSource != null, "We did not find a non-reference source");
1091 
1092         // confirm that the SW alignment agrees with our expectations
1093         final JunctionTreeLinkedDeBruijnGraph.DanglingChainMergeHelper result = rtgraph.generateCigarAgainstUpwardsReferencePath(altSource, 0, 1, false, SmithWatermanJavaAligner
1094                 .getInstance());
1095 
1096         if ( result == null ) {
1097             Assert.assertFalse(shouldBeMerged);
1098             return;
1099         }
1100 
1101         Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString());
1102 
1103         // confirm that the tail merging works as expected
1104         final int mergeResult = rtgraph.mergeDanglingHead(result);
1105         Assert.assertTrue(mergeResult > 0 || !shouldBeMerged);
1106 
1107         // confirm that we created the appropriate bubble in the graph only if expected
1108         rtgraph.cleanNonRefPaths();
1109 //        final SeqGraph seqGraph = rtgraph.toSequenceGraph();
1110         final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> paths = new JunctionTreeKBestHaplotypeFinder<>(rtgraph, rtgraph.getReferenceSourceVertex(), rtgraph.getReferenceSinkVertex()).findBestHaplotypes();
1111         Assert.assertEquals(paths.size(), shouldBeMerged ? 2 : 1);
1112     }
1113 
1114     /**
1115      * Find the ending position of the longest uniquely matching
1116      * run of bases of kmer in seq.
1117      *
1118      * for example, if seq = ACGT and kmer is NAC, this function returns 1,2 as we have the following
1119      * match:
1120      *
1121      *  0123
1122      * .ACGT
1123      * NAC..
1124      *
1125      * @param seq a non-null sequence of bytes
1126      * @param kmer a non-null kmer
1127      * @return the ending position and length where kmer matches uniquely in sequence, or null if no
1128      *         unique longest match can be found
1129      */
findLongestUniqueSuffixMatch(final byte[] seq, final byte[] kmer)1130     private static Pair<Integer, Integer> findLongestUniqueSuffixMatch(final byte[] seq, final byte[] kmer) {
1131         int longestPos = -1;
1132         int length = 0;
1133         boolean foundDup = false;
1134 
1135         for ( int i = 0; i < seq.length; i++ ) {
1136             final int matchSize = JunctionTreeLinkedDeBruijnGraph.longestSuffixMatch(seq, kmer, i);
1137             if ( matchSize > length ) {
1138                 longestPos = i;
1139                 length = matchSize;
1140                 foundDup = false;
1141             } else if ( matchSize == length ) {
1142                 foundDup = true;
1143             }
1144         }
1145 
1146         return foundDup ? null : Pair.of(longestPos, length);
1147     }
1148 
1149     /**
1150      * Example testng test using MyDataProvider
1151      */
1152     @Test(dataProvider = "findLongestUniqueMatchData")
testfindLongestUniqueMatch(final String seq, final String kmer, final int start, final int length)1153     public void testfindLongestUniqueMatch(final String seq, final String kmer, final int start, final int length) {
1154         // adaptor this code to do whatever testing you want given the arguments start and size
1155         final Pair<Integer, Integer> actual = findLongestUniqueSuffixMatch(seq.getBytes(), kmer.getBytes());
1156         if ( start == -1 )
1157             Assert.assertNull(actual);
1158         else {
1159             Assert.assertNotNull(actual);
1160             Assert.assertEquals((int)actual.getLeft(), start);
1161             Assert.assertEquals((int)actual.getRight(), length);
1162         }
1163     }
1164 
1165     @DataProvider(name = "findLongestUniqueMatchData")
makefindLongestUniqueMatchData()1166     public Object[][] makefindLongestUniqueMatchData() {
1167         List<Object[]> tests = new ArrayList<>();
1168 
1169         { // test all edge conditions
1170             final String ref = "ACGT";
1171             for ( int start = 0; start < ref.length(); start++ ) {
1172                 for ( int end = start + 1; end <= ref.length(); end++ ) {
1173                     final String kmer = ref.substring(start, end);
1174                     tests.add(new Object[]{ref, kmer, end - 1, end - start});
1175                     tests.add(new Object[]{ref, "N" + kmer, end - 1, end - start});
1176                     tests.add(new Object[]{ref, "NN" + kmer, end - 1, end - start});
1177                     tests.add(new Object[]{ref, kmer + "N", -1, 0});
1178                     tests.add(new Object[]{ref, kmer + "NN", -1, 0});
1179                 }
1180             }
1181         }
1182 
1183         { // multiple matches
1184             final String ref = "AACCGGTT";
1185             for ( final String alt : Arrays.asList("A", "C", "G", "T") )
1186                 tests.add(new Object[]{ref, alt, -1, 0});
1187             tests.add(new Object[]{ref, "AA", 1, 2});
1188             tests.add(new Object[]{ref, "CC", 3, 2});
1189             tests.add(new Object[]{ref, "GG", 5, 2});
1190             tests.add(new Object[]{ref, "TT", 7, 2});
1191         }
1192 
1193         { // complex matches that have unique substrings of lots of parts of kmer in the ref
1194             final String ref = "ACGTACGTACGT";
1195             tests.add(new Object[]{ref, "ACGT", -1, 0});
1196             tests.add(new Object[]{ref, "TACGT", -1, 0});
1197             tests.add(new Object[]{ref, "GTACGT", -1, 0});
1198             tests.add(new Object[]{ref, "CGTACGT", -1, 0});
1199             tests.add(new Object[]{ref, "ACGTACGT", -1, 0});
1200             tests.add(new Object[]{ref, "TACGTACGT", 11, 9});
1201             tests.add(new Object[]{ref, "NTACGTACGT", 11, 9});
1202             tests.add(new Object[]{ref, "GTACGTACGT", 11, 10});
1203             tests.add(new Object[]{ref, "NGTACGTACGT", 11, 10});
1204             tests.add(new Object[]{ref, "CGTACGTACGT", 11, 11});
1205         }
1206 
1207         return tests.toArray(new Object[][]{});
1208     }
1209 }