1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading; 2 3 import htsjdk.samtools.SAMFileHeader; 4 import htsjdk.samtools.TextCigarCodec; 5 import org.apache.commons.lang3.tuple.Pair; 6 import org.broadinstitute.hellbender.testutils.BaseTest; 7 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.Kmer; 8 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.*; 9 import org.broadinstitute.hellbender.utils.Utils; 10 import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; 11 import org.broadinstitute.hellbender.utils.read.GATKRead; 12 import org.broadinstitute.hellbender.utils.read.ReadUtils; 13 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; 14 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanJavaAligner; 15 import org.testng.Assert; 16 import org.testng.annotations.DataProvider; 17 import org.testng.annotations.Test; 18 19 import java.util.*; 20 import java.util.stream.Collectors; 21 22 public class JunctionTreeLinkedDeBruijnGraphUnitTest extends BaseTest { 23 24 @DataProvider (name = "loopingReferences") loopingReferences()25 public static Object[][] loopingReferences() { 26 return new Object[][]{ 27 new Object[]{"GACACACAGTCA", 3}, //ACA and CAC are repeated 28 new Object[]{"GACACTCACGCACGG", 3}, //CAC repeated twice, showing that the loops are handled somewhat sanely 29 new Object[]{"GACATCGACGG", 3}, //GAC repeated twice, starting with GAC, can it recover the reference if it starts on a repeated kmer 30 new Object[]{"GACATCACATC", 3} //final kmer ATC is repeated, can we recover the reference for repating final kmer 31 }; 32 } 33 34 @Test (dataProvider = "loopingReferences") testRecoveryOfLoopingReferenceSequences(final String ref, final int kmerSize)35 public void testRecoveryOfLoopingReferenceSequences(final String ref, final int kmerSize) { 36 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(kmerSize); 37 assembler.addSequence("anonymous", getBytes(ref), true); 38 assembler.buildGraphIfNecessary(); 39 List<MultiDeBruijnVertex> refVertexes = assembler.getReferencePath(ReadThreadingGraph.TraversalDirection.downwards); 40 final StringBuilder builder = new StringBuilder(refVertexes.get(0).getSequenceString()); 41 refVertexes.stream().skip(1).forEach(v -> builder.append(v.getSuffixString())); 42 Assert.assertEquals(builder.toString(), ref); 43 } 44 45 @Test 46 // This test is intended to assert that a potentially dangerous infinite loop is closed testDanglingEndRecoveryInfiniteHeadLoop()47 public void testDanglingEndRecoveryInfiniteHeadLoop() { 48 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 49 final String ref = "AACTGGGCTAGAGAGCGTT"; 50 final String loopingDanglingHead = "TTCGAAGGTCGAAG"; // "TCGAA" is repeated, causing dangling head recovery to fail 51 52 assembler.addSequence("anonymous", getBytes(ref), true); 53 assembler.addSequence("anonymous", getBytes(ref), false); 54 // Coverage = 3, thus above the prune factor of 2 55 assembler.addSequence("loopingUnconecetedHead", getBytes(loopingDanglingHead), false); 56 assembler.addSequence("loopingUnconecetedHead", getBytes(loopingDanglingHead), false); 57 assembler.addSequence("loopingUnconecetedHead", getBytes(loopingDanglingHead), false); 58 59 // This graph should have generated 3 junction trees (one at GAAAA, one at TCGGG, and one at AATCG) 60 assembler.buildGraphIfNecessary(); 61 62 //Try dangling head recovery (this might have infinitely looped if we are not careful) 63 assembler.recoverDanglingHeads(2, 0, true, SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA)); 64 65 Assert.assertTrue(true, "If we reached here than the above code didn't infinitely loop"); 66 } 67 68 @Test 69 // As above, pruning the dangling end recovery path can cause the infinite loop recovery to fall over testDanglingEndRecoveryInfiniteHeadLoopWithPruning()70 public void testDanglingEndRecoveryInfiniteHeadLoopWithPruning() { 71 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 72 final String ref = "AACTGGGCTAGAGAGCGTT"; 73 final String loopingDanglingHead = "TTCGAAGGTCGAA"; // "TCGAA" is repeated, causing dangling head recovery to fail 74 final String loopingDanglingHeadShortened = "TTCGAAGGTC"; // "TCGAA" is repeated, causing dangling head recovery to fail 75 76 77 assembler.addSequence("anonymous", getBytes(ref), true); 78 assembler.addSequence("anonymous", getBytes(ref), false); 79 assembler.addSequence("loopingUnconecetedHead", getBytes(loopingDanglingHead), false); 80 assembler.addSequence("loopingUnconecetedHead", getBytes(loopingDanglingHead), false); 81 // Now the some of the kimers in the infinite loop should have < 3 coverage, causing pruning to kick in 82 assembler.addSequence("loopingUnconecetedHead", getBytes(loopingDanglingHeadShortened), false); 83 84 // This graph should have generated 3 junction trees (one at GAAAA, one at TCGGG, and one at AATCG) 85 assembler.buildGraphIfNecessary(); 86 87 //Try dangling head recovery (this might have infinitely looped if we are not careful) 88 assembler.recoverDanglingHeads(3, 0, true, SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA)); 89 90 Assert.assertTrue(true, "If we reached here than the above code didn't infinitely loop"); 91 } 92 93 @Test testJunctionTreePaperTestExample()94 public void testJunctionTreePaperTestExample() { 95 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 96 String ref = "ACTGATTTCGATGCGATGCGATGCCACGGTGG"; // a loop of length 3 in the middle 97 98 assembler.addSequence("anonymous", getBytes(ref), true); 99 assembler.addSequence("anonymous", getBytes(ref), false); 100 101 // This graph should have generated 3 junction trees (one at GAAAA, one at TCGGG, and one at AATCG) 102 assembler.buildGraphIfNecessary(); 103 assembler.generateJunctionTrees(); 104 Assert.assertTrue(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("TCGAT"))).isPresent()); 105 Assert.assertTrue(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("GCGAT"))).isPresent()); 106 107 Assert.assertEquals(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("TCGAT"))).get().getPathsPresentAsBaseChoiceStrings().get(0), "GGC_"); 108 } 109 110 @Test testGraphPruningSanityCheck()111 public void testGraphPruningSanityCheck() { 112 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 113 String ref = "TTTTTAAAAACCCCCGGGGGATATATCGCGCG"; // the first site has an interesting graph structure and the second site is used to ensurethe graph is intersting 114 115 String altRead1 = "TTTTTGAAAACCCTCGGGGGATAAATCGCGCG"; // 3 SNPs 116 String altRead2 = "TTTTTGAAAACCCTCGGGGG"; 117 String altRead3 = "TTTTTGAAAACCCTCG"; 118 String altRead4 = "TTTTTGAAAA"; 119 String altRead5 = "TTTTTG"; 120 121 assembler.addSequence("anonymous", getBytes(ref), true); 122 assembler.addSequence("anonymous", getBytes(altRead1), false); 123 assembler.addSequence("anonymous", getBytes(altRead2), false); 124 assembler.addSequence("anonymous", getBytes(altRead3), false); 125 assembler.addSequence("anonymous", getBytes(altRead4), false); 126 assembler.addSequence("anonymous", getBytes(altRead5), false); 127 128 // This graph should have generated 3 junction trees (one at GAAAA, one at TCGGG, and one at AATCG) 129 assembler.buildGraphIfNecessary(); 130 assembler.generateJunctionTrees(); 131 Assert.assertTrue(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("GAAAA"))).isPresent()); 132 Assert.assertTrue(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("TCGGG"))).isPresent()); 133 Assert.assertTrue(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("AATCG"))).isPresent()); 134 135 Assert.assertEquals(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("GAAAA"))).get().getPathsPresentAsBaseChoiceStrings().get(0), "TA_"); 136 137 // Now we prune requiring at least 2 bases of support (which should kill all but the first graph and then only the first two nodes of the first graph) 138 assembler.pruneJunctionTrees(2); 139 140 Assert.assertTrue(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("GAAAA"))).isPresent()); 141 Assert.assertFalse(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("TCGGG"))).isPresent()); 142 Assert.assertFalse(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("AATCG"))).isPresent()); 143 144 Assert.assertEquals(assembler.getJunctionTreeForNode(assembler.findKmer( new Kmer("GAAAA"))).get().getPathsPresentAsBaseChoiceStrings().get(0), "T"); 145 } 146 147 @Test 148 // The simplest case where we expect two uninformative junction trees to be constructed testSimpleJunctionTreeExample()149 public void testSimpleJunctionTreeExample() { 150 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 151 String ref = "GGGGAAATTTCCCGGGG"; 152 153 // A simple SNP het 154 String refRead1 = "GGGAAATTTCC"; 155 String refRead2 = "GAAATTTCCCGGG"; 156 String altRead1 = "GGAAATGTC"; 157 String altRead2 = "GGAAATGTCCCGGG"; 158 159 assembler.addSequence("anonymous", getBytes(ref), true); 160 assembler.addSequence("anonymous", getBytes(refRead1), false); 161 assembler.addSequence("anonymous", getBytes(refRead2), false); 162 assembler.addSequence("anonymous", getBytes(altRead1), false); 163 assembler.addSequence("anonymous", getBytes(altRead2), false); 164 165 assembler.buildGraphIfNecessary(); 166 assembler.generateJunctionTrees(); 167 168 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 169 Assert.assertEquals(junctionTrees.size(), 2); 170 171 JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree1 = junctionTrees.get(assembler.findKmer(new Kmer("GTCCC"))); 172 JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree2 = junctionTrees.get(assembler.findKmer(new Kmer("TTCCC"))); 173 Assert.assertNotNull(tree1); // Make sure we actually have tree data for that site 174 Assert.assertNotNull(tree2); // Make sure we actually have tree data for that site 175 176 List<String> tree1Choices = tree1.getPathsPresentAsBaseChoiceStrings(); 177 List<String> tree2Choices = tree2.getPathsPresentAsBaseChoiceStrings(); 178 Assert.assertEquals(tree1Choices, Collections.singletonList("")); 179 Assert.assertEquals(tree2Choices, Collections.singletonList("")); 180 181 // Since the remaining graphs are empty prune them and assert as much 182 junctionTrees = assembler.getReadThreadingJunctionTrees(true); 183 Assert.assertEquals(junctionTrees.size(), 0); 184 } 185 186 @Test 187 // This is a test enforcing that the behavior around nodes are both outDegree > 1 while also having downstream children with inDegree > 1. 188 // We are asserting that the JunctionTree generated by this case lives on the node itself testEdgeCaseInvolvingHighInDegreeAndOutDegreeCars()189 public void testEdgeCaseInvolvingHighInDegreeAndOutDegreeCars() { 190 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(4); 191 String ref = "AAAACAC"+"CCGA"+"ATGTGGGG"+"A"+"GGGTT"; // the first site has an interesting graph structure and the second site is used to ensurethe graph is intersting 192 193 // A simple snip het 194 String refRead1 = "AAAACAC"+"CCGA"+"ATGTGGGG"+"A"+"GGGTT"; 195 String read1 = "AAAACAC"+"CCGA"+"CTGTGGGG"+"C"+"GGGTT"; // CGAC merges with the below read 196 String read2 = "AAAACAC"+"TCGA"+"CTGTGGGG"+"C"+"GGGTT"; // CGAC merges with the above read 197 198 assembler.addSequence("anonymous", getBytes(ref), true); 199 assembler.addSequence("anonymous", getBytes(refRead1), false); 200 assembler.addSequence("anonymous", getBytes(read1), false); 201 assembler.addSequence("anonymous", getBytes(read2), false); 202 203 assembler.buildGraphIfNecessary(); 204 assembler.generateJunctionTrees(); 205 206 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 207 Assert.assertEquals(junctionTrees.size(), 6); 208 209 JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree1 = junctionTrees.get(assembler.findKmer(new Kmer("CCGA"))); 210 JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree2 = junctionTrees.get(assembler.findKmer(new Kmer("TCGA"))); 211 Assert.assertNotNull(tree1); // Make sure we actually have tree data for that site 212 Assert.assertNotNull(tree2); // Make sure we actually have tree data for that site 213 214 List<String> tree1Choices = tree1.getPathsPresentAsBaseChoiceStrings(); 215 List<String> tree2Choices = tree2.getPathsPresentAsBaseChoiceStrings(); 216 Assert.assertEquals(new HashSet<>(tree1Choices), new HashSet<>(Arrays.asList("AA_","CC_"))); // The AA and CC correspond to taking A and C out of the current nodes. 217 Assert.assertEquals(tree2Choices, Collections.singletonList("C_")); 218 } 219 220 221 @Test 222 // This is a test enforcing that the behavior around nodes are both outDegree > 1 while also having downstream children with inDegree > 1. 223 // We are asserting that the JunctionTree generated by this case lives on the node itself testGraphRecoveryWhenKmerIsRepeated()224 public void testGraphRecoveryWhenKmerIsRepeated() { 225 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 226 String ref = "AAATCTTCGGGGGGGGGGGGGGTTTCTGGG"; // the first site has an interesting graph structure and the second site is used to ensurethe graph is intersting 227 228 // A simple snip het 229 String refRead = "AAATCTTCGGGGGGGGGGGGGGTTTCTGGG"; 230 231 assembler.addSequence("anonymous", getBytes(ref), true); 232 assembler.addSequence("anonymous", getBytes(refRead), false); 233 234 assembler.buildGraphIfNecessary(); 235 assembler.generateJunctionTrees(); 236 237 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 238 Assert.assertEquals(junctionTrees.size(), 2); 239 240 JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree1 = junctionTrees.get(assembler.findKmer(new Kmer("CGGGG"))); 241 JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree2 = junctionTrees.get(assembler.findKmer(new Kmer("GGGGG"))); 242 Assert.assertNotNull(tree1); // Make sure we actually have tree data for that site 243 Assert.assertNotNull(tree2); // Make sure we actually have tree data for that site 244 245 246 List<String> tree1Choices = tree1.getPathsPresentAsBaseChoiceStrings(); 247 Assert.assertEquals(tree1Choices, Collections.singletonList("GGGGGGGGGT_")); 248 249 List<String> tree2Choices = tree2.getPathsPresentAsBaseChoiceStrings(); 250 Assert.assertEquals(new HashSet<>(tree2Choices), new HashSet<>(Arrays.asList( 251 "GGGGGGGGGT_" 252 ,"GGGGGGGGT_" 253 ,"GGGGGGGT_" 254 ,"GGGGGGT_" 255 ,"GGGGGT_" 256 ,"GGGGT_" 257 ,"GGGT_" 258 ,"GGT_" 259 ,"GT_" 260 ,"T_"))); 261 } 262 263 @Test 264 // The simplest case where we expect two uninformative junction trees to be constructed testPruneGraph()265 public void testPruneGraph() { 266 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 267 String ref = "GGGGAAATTTCCCGGGG"; 268 269 // A simple snip het 270 String refRead1 = "GGGAAATTTCC"; 271 String refRead2 = "GAAATTTCCCGGG"; 272 String altRead1 = "GGAAATGTC"; 273 String altRead2 = "GGAAATGTCCCGGG"; 274 275 assembler.addSequence("anonymous", getBytes(ref), true); 276 assembler.addSequence("anonymous", getBytes(refRead1), false); 277 assembler.addSequence("anonymous", getBytes(refRead2), false); 278 assembler.addSequence("anonymous", getBytes(altRead1), false); 279 assembler.addSequence("anonymous", getBytes(altRead2), false); 280 281 assembler.buildGraphIfNecessary(); 282 assembler.generateJunctionTrees(); 283 284 assembler.pruneJunctionTrees(0); 285 286 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 287 Assert.assertEquals(junctionTrees.size(), 0); 288 } 289 290 @Test testSimpleJunctionTreeIncludeRefInJunctionTreeNoStopEnd()291 public void testSimpleJunctionTreeIncludeRefInJunctionTreeNoStopEnd() { 292 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 293 String ref = "GGGGAAATTTCCCGGGG"; // Ref starts 1 base before/after any of the reads 294 295 // A simple snip het 296 String altARead1 = "GGGAAATATCC"; // Replaces a T with an A 297 String altARead2 = "GAAATATCCCGGG"; 298 String altTRead1 = "GGAAATGTC"; // Replaces a T with a G 299 String altTRead2 = "GGAAATGTCCCGGG"; 300 301 assembler.addSequence("anonymous", getBytes(ref), true); 302 assembler.addSequence("anonymous", getBytes(altTRead1), false); 303 assembler.addSequence("anonymous", getBytes(altTRead2), false); 304 assembler.addSequence("anonymous", getBytes(altARead1), false); 305 assembler.addSequence("anonymous", getBytes(altARead2), false); 306 307 assembler.buildGraphIfNecessary(); 308 assembler.generateJunctionTrees(); 309 310 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 311 Assert.assertEquals(junctionTrees.size(), 2); 312 313 JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree1 = junctionTrees.get(assembler.findKmer(new Kmer("ATCCC"))); 314 JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree2 = junctionTrees.get(assembler.findKmer(new Kmer("GTCCC"))); 315 //NOTE there is no "TTCCC" edge here because it only existed as a reference path 316 Assert.assertNotNull(tree1); // Make sure we actually have tree data for that site 317 Assert.assertNotNull(tree2); // Make sure we actually have tree data for that site 318 319 List<String> tree1Choices = tree1.getPathsPresentAsBaseChoiceStrings(); 320 List<String> tree2Choices = tree2.getPathsPresentAsBaseChoiceStrings(); 321 Assert.assertEquals(tree1Choices, Collections.singletonList("")); 322 Assert.assertEquals(tree2Choices, Collections.singletonList("")); 323 } 324 325 @Test 326 // NOTE this test is less useful now that the starting JT has been removed testSimpleJunctionTreeIncludeRefInJunctionTreeNoWithStartEndChanges()327 public void testSimpleJunctionTreeIncludeRefInJunctionTreeNoWithStartEndChanges() { 328 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 329 String ref = "GGGAAATTTCCCGGG"; // Reads span to the start/stop of the ref, thus we have to worry about symbolic alleles 330 331 // A simple snip het 332 String altARead1 = "GGGAAATATCC"; // Replaces a T with an A 333 String altARead2 = "GAAATATCCCGGG"; 334 String altTRead1 = "GGAAATGTC"; // Replaces a T with a G 335 String altTRead2 = "GGAAATGTCCCGGG"; 336 337 assembler.addSequence("anonymous", getBytes(ref), true); 338 assembler.addSequence("anonymous", getBytes(altTRead1), false); 339 assembler.addSequence("anonymous", getBytes(altTRead2), false); 340 assembler.addSequence("anonymous", getBytes(altARead1), false); 341 assembler.addSequence("anonymous", getBytes(altARead2), false); 342 343 assembler.buildGraphIfNecessary(); 344 assembler.generateJunctionTrees(); 345 346 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 347 Assert.assertEquals(junctionTrees.size(), 2); 348 349 JunctionTreeLinkedDeBruijnGraph.ThreadingTree starttree = junctionTrees.get(assembler.findKmer(new Kmer("GGGAA"))); 350 JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree1 = junctionTrees.get(assembler.findKmer(new Kmer("ATCCC"))); 351 JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree2 = junctionTrees.get(assembler.findKmer(new Kmer("GTCCC"))); 352 //NOTE there is no "TTCCC" edge here because it only existed as a reference path 353 Assert.assertNotNull(tree1); // Make sure we actually have tree data for that site 354 Assert.assertNotNull(tree2); // Make sure we actually have tree data for that site 355 356 List<String> tree1Choices = tree1.getPathsPresentAsBaseChoiceStrings(); 357 List<String> tree2Choices = tree2.getPathsPresentAsBaseChoiceStrings(); 358 Assert.assertEquals(tree1Choices, Collections.singletonList("_")); 359 Assert.assertEquals(tree2Choices, Collections.singletonList("_")); 360 } 361 362 @Test testSimpleJunctionTreeIncludeRefInJunctionTreeTwoSites()363 public void testSimpleJunctionTreeIncludeRefInJunctionTreeTwoSites() { 364 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 365 String ref = "GGAAAT"+"T"+"TCCGGC"+"T"+"CGTTTAG"; //Two variant sites in close proximity 366 367 // A simple snip het 368 String altAARead1 = "GAAAT"+"A"+"TCCGGC"+"A"+"CGTTTA"; // Replaces a T with an A, then a T with a A 369 String altAARead2 = "GAAAT"+"A"+"TCCGGC"+"A"+"CGTTTA"; // Replaces a T with an A, then a T with a A 370 String altTCRead1 = "GAAAT"+"T"+"TCCGGC"+"C"+"CGTTTA"; // Keeps the T, then replaces a T with a C 371 String altTCRead2 = "GAAAT"+"T"+"TCCGGC"+"C"+"CGTTTA"; // Keeps the T, then replaces a T with a C 372 373 assembler.addSequence("anonymous", getBytes(ref), true); 374 assembler.addSequence("anonymous", getBytes(altAARead1), false); 375 assembler.addSequence("anonymous", getBytes(altAARead2), false); 376 assembler.addSequence("anonymous", getBytes(altTCRead1), false); 377 assembler.addSequence("anonymous", getBytes(altTCRead2), false); 378 379 assembler.buildGraphIfNecessary(); 380 assembler.generateJunctionTrees(); 381 382 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(true); 383 Assert.assertEquals(junctionTrees.size(), 2); 384 385 JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree1 = junctionTrees.get(assembler.findKmer(new Kmer("ATCCG"))); 386 JunctionTreeLinkedDeBruijnGraph.ThreadingTree tree2 = junctionTrees.get(assembler.findKmer(new Kmer("TTCCG"))); 387 //NOTE there is no "TTCCC" edge here because it only existed as a reference path 388 Assert.assertNotNull(tree1); // Make sure we actually have tree data for that site 389 Assert.assertNotNull(tree2); // Make sure we actually have tree data for that site 390 391 List<String> tree1Choices = tree1.getPathsPresentAsBaseChoiceStrings(); 392 List<String> tree2Choices = tree2.getPathsPresentAsBaseChoiceStrings(); 393 // Perfectly phased variants (site 2 has 2 mismatches to the reference) 394 Assert.assertEquals(tree1Choices, Collections.singletonList("A")); 395 Assert.assertEquals(tree2Choices, Collections.singletonList("C")); 396 } 397 398 @Test testSimpleJuncionTreeLoopingReference()399 public void testSimpleJuncionTreeLoopingReference() { 400 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 401 String ref = "ATGGTTAGGGGAAATTTAAATTTAAAGCGCCCCCG"; // AAATT, AATTT, ATTTA, TTTAA, and TTAAA are all repeated in a loop 402 403 assembler.addSequence("reference", getBytes(ref), true); 404 for (int i = 0; i + 20 < ref.length(); i++) { 405 assembler.addSequence("anonymous", getBytes(ref.substring(i, i + 20)), false); 406 } 407 assembler.buildGraphIfNecessary(); 408 assembler.generateJunctionTrees(); 409 410 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 411 Assert.assertEquals(junctionTrees.size(), 2); // 412 413 JunctionTreeLinkedDeBruijnGraph.ThreadingTree outsideTree = junctionTrees.get(assembler.findKmer(new Kmer("GAAAT"))); 414 JunctionTreeLinkedDeBruijnGraph.ThreadingTree insideTree = junctionTrees.get(assembler.findKmer(new Kmer("TAAAT"))); 415 Assert.assertNotNull(outsideTree); // Make sure we actually have tree data for that site 416 Assert.assertNotNull(insideTree); // Make sure we actually have tree data for that site 417 418 List<String> insideChoices = insideTree.getPathsPresentAsBaseChoiceStrings(); 419 Assert.assertEquals(insideChoices.size(), 1); 420 Assert.assertTrue(insideChoices.contains("G")); 421 422 List<String> outsideChoices = outsideTree.getPathsPresentAsBaseChoiceStrings(); 423 Assert.assertEquals(outsideChoices.size(), 1); 424 Assert.assertTrue(outsideChoices.contains("TG")); 425 } 426 427 @Test testSimpleJuncionTreeThriceLoopingReference()428 public void testSimpleJuncionTreeThriceLoopingReference() { 429 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 430 String ref = "ATGGTTAGGGGAAATTTAAATTTAAATTTAAAGCGCCCCCG"; // AAATT, AATTT, ATTTA, TTTAA, and TTAAA are all repeated in a loop 431 432 assembler.addSequence("reference", getBytes(ref), true); 433 for (int i = 0; i + 23 < ref.length(); i++) { 434 // 20 bases should be exactly enough to recover the whole path through the loop (from GGAAAT to TTAAAG) 435 assembler.addSequence("anonymous", getBytes(ref.substring(i, i + 23)), false); 436 } 437 assembler.buildGraphIfNecessary(); 438 assembler.generateJunctionTrees(); 439 440 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 441 Assert.assertEquals(junctionTrees.size(), 2); // 442 443 JunctionTreeLinkedDeBruijnGraph.ThreadingTree outsideTree = junctionTrees.get(assembler.findKmer(new Kmer("GAAAT"))); 444 JunctionTreeLinkedDeBruijnGraph.ThreadingTree insideTree = junctionTrees.get(assembler.findKmer(new Kmer("TAAAT"))); 445 Assert.assertNotNull(outsideTree); // Make sure we actually have tree data for that site 446 Assert.assertNotNull(insideTree); // Make sure we actually have tree data for that site 447 448 List<String> insideChoices = insideTree.getPathsPresentAsBaseChoiceStrings(); 449 Assert.assertEquals(insideChoices.size(), 2); 450 Assert.assertTrue(insideChoices.contains("TG")); 451 Assert.assertTrue(insideChoices.contains("G")); 452 453 List<String> outsideChoices = outsideTree.getPathsPresentAsBaseChoiceStrings(); 454 Assert.assertEquals(outsideChoices.size(), 1); 455 Assert.assertTrue(outsideChoices.contains("TTG")); 456 } 457 458 private static final boolean DEBUG = false; 459 getBytes(final String alignment)460 public static byte[] getBytes(final String alignment) { 461 return alignment.replace("-","").getBytes(); 462 } 463 assertNonUniqueKmersInvolveLoops(final JunctionTreeLinkedDeBruijnGraph assembler, String... nonUniques)464 private void assertNonUniqueKmersInvolveLoops(final JunctionTreeLinkedDeBruijnGraph assembler, String... nonUniques) { 465 final Set<String> actual = new HashSet<>(); 466 assembler.buildGraphIfNecessary(); 467 for (String kmer : nonUniques) { 468 MultiDeBruijnVertex vertex = assembler.findKmer(new Kmer(kmer)); 469 Assert.assertTrue(assembler.incomingEdgesOf(vertex).size() > 1 || assembler.outgoingEdgesOf(vertex).size() > 1); 470 } 471 } 472 473 @Test testSimpleHaplotypeRethreading()474 public void testSimpleHaplotypeRethreading() { 475 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(11); 476 final String ref = "CATGCACTTTAAAACTTGCCTTTTTAACAAGACTTCCAGATG"; 477 final String alt = "CATGCACTTTAAAACTTGCCGTTTTAACAAGACTTCCAGATG"; 478 assembler.addSequence("anonymous", getBytes(ref), true); 479 assembler.addSequence("anonymous", getBytes(alt), false); 480 assembler.buildGraphIfNecessary(); 481 Assert.assertNotEquals(ref.length() - 11 + 1, assembler.vertexSet().size(), "the number of vertex in the graph is the same as if there was no alternative sequence"); 482 Assert.assertEquals(ref.length() - 11 + 1 + 11, assembler.vertexSet().size(), "the number of vertex in the graph is not the same as if there is an alternative sequence"); 483 MultiDeBruijnVertex startAlt = assembler.findKmer(new Kmer(alt.getBytes(),20,11)); 484 Assert.assertNotNull(startAlt); 485 } 486 487 @Test testJunctionTreeErrorCorrection()488 public void testJunctionTreeErrorCorrection() { 489 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(11); 490 final String ref = "AAAAAAAAAAACCCCCC"+"G"+"CCCCCTTTTTTTTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT"; 491 final String supportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCTTTTTTTTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"A"+"ATATATATAATAT"; 492 // This path has an undersupported edge that gets pruned, we want to assert that we can recover the proper junction tree weights regardless 493 final String unSupportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCTTTTTTTTTTTGGGGGGG"+"C"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT"; 494 assembler.addSequence("anonymous", getBytes(ref), true); 495 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 496 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 497 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 498 // Only provide a single instance of the path so that its middle C variant gets pruned 499 assembler.addSequence("anonymous", getBytes(unSupportedAlt), false); 500 assembler.buildGraphIfNecessary(); 501 new LowWeightChainPruner<MultiDeBruijnVertex, MultiSampleEdge>(2).pruneLowWeightChains(assembler); 502 assembler.generateJunctionTrees(); 503 504 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths = 505 new JunctionTreeKBestHaplotypeFinder<>(assembler,assembler.getReferenceSourceVertex(),assembler.getReferenceSinkVertex(), 0, false).findBestHaplotypes(10); 506 507 Assert.assertEquals(bestPaths.size(),2); 508 Assert.assertEquals(new String(bestPaths.get(0).getBases()), supportedAlt); 509 Assert.assertEquals(new String(bestPaths.get(1).getBases()), "AAAAAAAAAAACCCCCC"+"T"+"CCCCCTTTTTTTTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT"); 510 } 511 512 @Test testJunctionTreeErrorCorrectionUnrecoverableWithInsertion()513 public void testJunctionTreeErrorCorrectionUnrecoverableWithInsertion() { 514 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(11); 515 final String ref = "AAAAAAAAAAACCCCCC"+"G"+"CCCCCTTTTTTTTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT"; 516 final String supportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCTTTTTTTTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"A"+"ATATATATAATAT"; 517 // This path has an undersupported edge that gets pruned, we want to assert that we can recover the proper junction tree weights regardless 518 final String unSupportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCTTTTTTTTTTTGGGGGGG"+"CAAT"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT"; 519 assembler.addSequence("anonymous", getBytes(ref), true); 520 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 521 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 522 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 523 // Only provide a single instance of the path so that its middle C variant gets pruned 524 assembler.addSequence("anonymous", getBytes(unSupportedAlt), false); 525 assembler.buildGraphIfNecessary(); 526 new LowWeightChainPruner<MultiDeBruijnVertex, MultiSampleEdge>(2).pruneLowWeightChains(assembler); 527 assembler.generateJunctionTrees(); 528 529 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths = 530 new JunctionTreeKBestHaplotypeFinder<>(assembler,assembler.getReferenceSourceVertex(),assembler.getReferenceSinkVertex(), 0, false).findBestHaplotypes(10); 531 532 // We only see the supported alt path because the unsupported alt path is never recovered 533 Assert.assertEquals(bestPaths.size(),1); 534 Assert.assertEquals(new String(bestPaths.get(0).getBases()), supportedAlt); 535 } 536 537 @Test testJunctionTreeErrorCorrectionKmerSizeDistanceAllowance()538 public void testJunctionTreeErrorCorrectionKmerSizeDistanceAllowance() { 539 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(11); 540 final String ref = "AAAAAAAAAAACCCCCC"+"G"+"CCCCCCTTTTTT"+"A"+"TTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT"; 541 final String supportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCCTTTTTT"+"A"+"TTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"A"+"ATATATATAATAT"; 542 // This path has two unsupported edges which should be pruned, however they are more than kmer size apart so the junction tree code should still work to recover the connectivity 543 final String unSupportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCCTTTTTT"+"G"+"TTTTTGGGGGGG"+"T"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT"; 544 assembler.addSequence("anonymous", getBytes(ref), true); 545 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 546 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 547 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 548 // Only provide a single instance of the path so that its middle C variant gets pruned 549 assembler.addSequence("anonymous", getBytes(unSupportedAlt), false); 550 assembler.buildGraphIfNecessary(); 551 new LowWeightChainPruner<MultiDeBruijnVertex, MultiSampleEdge>(2).pruneLowWeightChains(assembler); 552 assembler.generateJunctionTrees(); 553 554 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths = 555 new JunctionTreeKBestHaplotypeFinder<>(assembler,assembler.getReferenceSourceVertex(),assembler.getReferenceSinkVertex(), 0, false).findBestHaplotypes(10); 556 557 Assert.assertEquals(bestPaths.size(),2); 558 Assert.assertEquals(new String(bestPaths.get(0).getBases()), supportedAlt); 559 Assert.assertEquals(new String(bestPaths.get(1).getBases()), "AAAAAAAAAAACCCCCC"+"T"+"CCCCCCTTTTTT"+"A"+"TTTTTGGGGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT"); 560 } 561 562 @Test testJuncitonTreeErrorCorrectionFailingIfWithinKmerSizeForConsistencySake()563 public void testJuncitonTreeErrorCorrectionFailingIfWithinKmerSizeForConsistencySake() { 564 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(11); 565 final String ref = "AAAAAAAAAAACCCCCC"+"G"+"CCCCCCTTTTTT"+"A"+"TTTTGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT"; 566 final String supportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCCTTTTTT"+"A"+"TTTTGGGG"+"A"+"GGGGTGTGTGTGTGCCCGTGTGT"+"A"+"ATATATATAATAT"; 567 // This path has two unsupported edges which should be pruned, however they are less than a kmer size apart and should result in the path being unable to find 568 final String unSupportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCCTTTTTT"+"G"+"TTTTGGGG"+"T"+"GGGGTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT"; 569 assembler.addSequence("anonymous", getBytes(ref), true); 570 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 571 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 572 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 573 // Only provide a single instance of the path so that its middle C variant gets pruned 574 assembler.addSequence("anonymous", getBytes(unSupportedAlt), false); 575 assembler.buildGraphIfNecessary(); 576 new LowWeightChainPruner<MultiDeBruijnVertex, MultiSampleEdge>(2).pruneLowWeightChains(assembler); 577 assembler.generateJunctionTrees(); 578 579 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths = 580 new JunctionTreeKBestHaplotypeFinder<>(assembler,assembler.getReferenceSourceVertex(),assembler.getReferenceSinkVertex(), 0, false).findBestHaplotypes(10); 581 582 // We only see the supported alt path because the unsupported alt path is never recovered 583 Assert.assertEquals(bestPaths.size(),1); 584 Assert.assertEquals(new String(bestPaths.get(0).getBases()), supportedAlt); 585 } 586 587 @Test 588 // test asserting that we don't make spurious edges when we are trying to recover error bases testJunctionTreeErrorCorrectionNotAddingTreesWhenOverTentativeBases()589 public void testJunctionTreeErrorCorrectionNotAddingTreesWhenOverTentativeBases() { 590 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(11); 591 final String ref = "AAAAAAAAAAACCCCCC"+"G"+"CCCCCCTTTTTT"+"A"+"TTGG"+"A"+"GGG"+"C"+"GTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT"; 592 final String supportedAlt = "AAAAAAAAAAACCCCCC"+"T"+"CCCCCCTTTTTT"+"A"+"TTGG"+"C"+"GGG"+"C"+"GTGTGTGTGTGCCCGTGTGT"+"A"+"ATATATATAATAT"; 593 594 // This path shold be pruned, but if we aren't careful might end up pairing the first SNP with the middle SNP at this site 595 final String unSupportedAltWithError = "AAAAAAAAAAACCCCCC"+"G"+"CCCCCCTTTTTT"+"C"+"TTGG"+"C"+"GGG"+"A"+"GTGTGTGTGTGCCCGTGTGT"+"C"+"ATATATATAATAT"; 596 assembler.addSequence("anonymous", getBytes(ref), true); 597 assembler.addSequence("anonymous", getBytes(ref), false); 598 assembler.addSequence("anonymous", getBytes(ref), false); 599 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 600 assembler.addSequence("anonymous", getBytes(supportedAlt), false); 601 assembler.addSequence("anonymous", getBytes(unSupportedAltWithError), 1, false); 602 assembler.buildGraphIfNecessary(); 603 new LowWeightChainPruner<MultiDeBruijnVertex, MultiSampleEdge>(2).pruneLowWeightChains(assembler); 604 assembler.generateJunctionTrees(); 605 606 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths = 607 new JunctionTreeKBestHaplotypeFinder<>(assembler,assembler.getReferenceSourceVertex(),assembler.getReferenceSinkVertex(), 0, false).setJunctionTreeEvidenceWeightThreshold(1).findBestHaplotypes(10); 608 609 // We only see the supported alt path because the unsupported alt path is never recovered 610 // If we saw 3 that means the undersupported pruned path ended up in the junciton trees and created a path that shouldn't be there 611 Assert.assertEquals(bestPaths.size(),2); 612 Assert.assertEquals(new String(bestPaths.get(0).getBases()), ref); 613 Assert.assertEquals(new String(bestPaths.get(1).getBases()), supportedAlt); 614 } 615 616 @Test(enabled = ! DEBUG) testNonUniqueMiddle()617 public void testNonUniqueMiddle() { 618 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(3); 619 final String ref = "GACACACAGTCA"; 620 final String read1 = "GACAC---GTCA"; 621 final String read2 = "CAC---GTCA"; 622 assembler.addSequence(getBytes(ref), true); 623 assembler.addSequence(getBytes(read1), false); 624 assembler.addSequence(getBytes(read2), false); 625 assertNonUniqueKmersInvolveLoops(assembler, "ACA", "CAC"); 626 } 627 628 @Test(enabled = ! DEBUG) testReadsCreateNonUnique()629 public void testReadsCreateNonUnique() { 630 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(3); 631 final String ref = "GCAC--GTCA"; // CAC is unique 632 final String read1 = "GCACACGTCA"; // makes CAC non unique because it has a duplication 633 final String read2 = "CACGTCA"; // shouldn't be allowed to match CAC as start 634 assembler.addSequence(getBytes(ref), true); 635 assembler.addSequence(getBytes(read1), false); 636 assembler.addSequence(getBytes(read2), false); 637 // assembler.convertToSequenceGraph().printGraph(new File("test.dot"), 0); 638 639 assertNonUniqueKmersInvolveLoops(assembler, "CAC"); 640 //assertSingleBubble(assembler, ref, "CAAAATCGGG"); 641 } 642 643 @Test(enabled = ! DEBUG) 644 // NOTE that now we do still count the reference as edge multiplicity testCountingOfStartEdges()645 public void testCountingOfStartEdges() { 646 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(3); 647 final String ref = "NNNGTCAAA"; // ref has some bases before start 648 final String read1 = "GTCAAA"; // starts at first non N base 649 650 assembler.addSequence(getBytes(ref), true); 651 assembler.addSequence(getBytes(read1), false); 652 assembler.buildGraphIfNecessary(); 653 // assembler.printGraph(new File("test.dot"), 0); 654 655 for ( final MultiSampleEdge edge : assembler.edgeSet() ) { 656 final MultiDeBruijnVertex source = assembler.getEdgeSource(edge); 657 final MultiDeBruijnVertex target = assembler.getEdgeTarget(edge); 658 final boolean headerVertex = source.getSuffix() == 'N' || target.getSuffix() == 'N'; 659 if ( headerVertex ) { 660 Assert.assertEquals(edge.getMultiplicity(), 1, "Bases in the unique reference header should have multiplicity of 0"); 661 } else { 662 Assert.assertEquals(edge.getMultiplicity(), 2, "Should have multiplicity of 1 for any edge outside the ref header but got " + edge + " " + source + " -> " + target); 663 } 664 } 665 } 666 667 @Test(enabled = !DEBUG) testCountingOfStartEdgesWithMultiplePrefixes()668 public void testCountingOfStartEdgesWithMultiplePrefixes() { 669 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(3); 670 assembler.setIncreaseCountsThroughBranches(true); 671 final String ref = "NNNGTCAXX"; // ref has some bases before start 672 final String alt1 = "NNNCTCAXX"; // alt1 has SNP right after N 673 final String read = "TCAXX"; // starts right after SNP, but merges right before branch 674 675 assembler.addSequence(getBytes(ref), true); 676 assembler.addSequence(getBytes(alt1), false); 677 assembler.addSequence(getBytes(read), false); 678 assembler.buildGraphIfNecessary(); 679 assembler.printGraph(createTempFile("test",".dot"), 0); 680 681 final List<String> oneCountVertices = Arrays.asList("NNN", "NNC", "NCT", "NNG", "NGT"); 682 final List<String> twoCountVertices = Arrays.asList("CTC", "TCA", "GTC"); 683 684 for ( final MultiSampleEdge edge : assembler.edgeSet() ) { 685 final MultiDeBruijnVertex source = assembler.getEdgeSource(edge); 686 final MultiDeBruijnVertex target = assembler.getEdgeTarget(edge); 687 final int expected; 688 // if (source.getSequenceString().equals("GTC") && target.getSequenceString().equals("TCA")) { 689 // expected = 1; 690 // } else { 691 expected = oneCountVertices.contains(target.getSequenceString()) ? 1 : ((twoCountVertices.contains(target.getSequenceString()) ? 2 : 3)); 692 // } 693 Assert.assertEquals(edge.getMultiplicity(), expected, "Bases at edge " + edge + " from " + source + " to " + target + " has bad multiplicity"); 694 } 695 } 696 697 @Test(enabled = !DEBUG) testCyclesInGraph()698 public void testCyclesInGraph() { 699 700 // b37 20:12655200-12655850 701 final String ref = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTATACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG"; 702 703 // SNP at 20:12655528 creates a cycle for small kmers 704 final String alt = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTGTACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG"; 705 706 final List<GATKRead> reads = new ArrayList<>(); 707 for ( int index = 0; index < alt.length() - 100; index += 20 ) { 708 reads.add(ArtificialReadUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M")); 709 } 710 711 // test that there are cycles detected for small kmer 712 final JunctionTreeLinkedDeBruijnGraph rtgraph25 = new JunctionTreeLinkedDeBruijnGraph(25); 713 rtgraph25.addSequence("ref", ref.getBytes(), true); 714 final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader(); 715 for ( final GATKRead read : reads ) { 716 rtgraph25.addRead(read, header); 717 } 718 rtgraph25.buildGraphIfNecessary(); 719 Assert.assertTrue(rtgraph25.hasCycles()); 720 721 // test that there are no cycles detected for large kmer 722 final JunctionTreeLinkedDeBruijnGraph rtgraph75 = new JunctionTreeLinkedDeBruijnGraph(75); 723 rtgraph75.addSequence("ref", ref.getBytes(), true); 724 for ( final GATKRead read : reads ) { 725 rtgraph75.addRead(read, header); 726 } 727 rtgraph75.buildGraphIfNecessary(); 728 Assert.assertFalse(rtgraph75.hasCycles()); 729 } 730 731 @Test(enabled = !DEBUG) 732 // Test showing that if a read gets completely clipped in the ReadThreadingAssembler, that the assembly will not crash testEmptyReadBeingAddedToGraph()733 public void testEmptyReadBeingAddedToGraph() { 734 735 // b37 20:12655200-12655850 736 final String ref = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTATACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG"; 737 738 // SNP at 20:12655528 creates a cycle for small kmers 739 final String alt = "CAATTGTCATAGAGAGTGACAAATGTTTCAAAAGCTTATTGACCCCAAGGTGCAGCGGTGCACATTAGAGGGCACCTAAGACAGCCTACAGGGGTCAGAAAAGATGTCTCAGAGGGACTCACACCTGAGCTGAGTTGTGAAGGAAGAGCAGGATAGAATGAGCCAAAGATAAAGACTCCAGGCAAAAGCAAATGAGCCTGAGGGAAACTGGAGCCAAGGCAAGAGCAGCAGAAAAGAGCAAAGCCAGCCGGTGGTCAAGGTGGGCTACTGTGTATGCAGAATGAGGAAGCTGGCCAAGTAGACATGTTTCAGATGATGAACATCCTGTGTACTAGATGCATTGGAACTTTTTTCATCCCCTCAACTCCACCAAGCCTCTGTCCACTCTTGGTACCTCTCTCCAAGTAGACATATTTCAGATCATGAACATCCTGTGTACTAGATGCATTGGAAATTTTTTCATCCCCTCAACTCCACCCAGCCTCTGTCCACACTTGGTACCTCTCTCTATTCATATCTCTGGCCTCAAGGAGGGTATTTGGCATTAGTAAATAAATTCCAGAGATACTAAAGTCAGATTTTCTAAGACTGGGTGAATGACTCCATGGAAGAAGTGAAAAAGAGGAAGTTGTAATAGGGAGACCTCTTCGG"; 740 741 final List<GATKRead> reads = new ArrayList<>(); 742 for ( int index = 0; index < alt.length() - 100; index += 20 ) { 743 reads.add(ArtificialReadUtils.createArtificialRead(Arrays.copyOfRange(alt.getBytes(), index, index + 100), Utils.dupBytes((byte) 30, 100), 100 + "M")); 744 } 745 reads.add(ReadUtils.emptyRead(reads.get(0))); 746 747 // test that there are cycles detected for small kmer 748 final JunctionTreeLinkedDeBruijnGraph rtgraph25 = new JunctionTreeLinkedDeBruijnGraph(25); 749 rtgraph25.addSequence("ref", ref.getBytes(), true); 750 final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader(); 751 for ( final GATKRead read : reads ) { 752 rtgraph25.addRead(read, header); 753 } 754 rtgraph25.buildGraphIfNecessary(); 755 } 756 757 @Test(enabled = false) 758 // TODO determine what is causing null bases here testNsInReadsAreNotUsedForGraph()759 public void testNsInReadsAreNotUsedForGraph() { 760 761 final int length = 100; 762 final byte[] ref = Utils.dupBytes((byte) 'A', length); 763 764 final JunctionTreeLinkedDeBruijnGraph rtgraph = new JunctionTreeLinkedDeBruijnGraph(25); 765 rtgraph.addSequence("ref", ref, true); 766 767 // add reads with Ns at any position 768 final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader(); 769 for ( int i = 0; i < length; i++ ) { 770 final byte[] bases = ref.clone(); 771 bases[i] = 'N'; 772 final GATKRead read = ArtificialReadUtils.createArtificialRead(bases, Utils.dupBytes((byte) 30, length), length + "M"); 773 rtgraph.addRead(read, header); 774 } 775 rtgraph.buildGraphIfNecessary(); 776 777 //final SeqGraph graph = rtgraph.toSequenceGraph(); 778 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> paths = new GraphBasedKBestHaplotypeFinder<>(rtgraph, rtgraph.getReferenceSourceVertex(), rtgraph.getReferenceSinkVertex()).findBestHaplotypes(); 779 Assert.assertEquals(paths.size(), 1); 780 } 781 782 // TODO -- update to use determineKmerSizeAndNonUniques directly 783 // @DataProvider(name = "KmerSizeData") 784 // public Object[][] makeKmerSizeDataProvider() { 785 // List<Object[]> tests = new ArrayList<Object[]>(); 786 // 787 // // this functionality can be adapted to provide input data for whatever you might want in your data 788 // tests.add(new Object[]{3, 3, 3, Arrays.asList("ACG"), Arrays.asList()}); 789 // tests.add(new Object[]{3, 4, 3, Arrays.asList("CAGACG"), Arrays.asList()}); 790 // 791 // tests.add(new Object[]{3, 3, 3, Arrays.asList("AAAAC"), Arrays.asList("AAA")}); 792 // tests.add(new Object[]{3, 4, 4, Arrays.asList("AAAAC"), Arrays.asList()}); 793 // tests.add(new Object[]{3, 5, 4, Arrays.asList("AAAAC"), Arrays.asList()}); 794 // tests.add(new Object[]{3, 4, 3, Arrays.asList("CAAA"), Arrays.asList()}); 795 // tests.add(new Object[]{3, 4, 4, Arrays.asList("CAAAA"), Arrays.asList()}); 796 // tests.add(new Object[]{3, 5, 4, Arrays.asList("CAAAA"), Arrays.asList()}); 797 // tests.add(new Object[]{3, 5, 5, Arrays.asList("ACGAAAAACG"), Arrays.asList()}); 798 // 799 // for ( int maxSize = 3; maxSize < 20; maxSize++ ) { 800 // for ( int dupSize = 3; dupSize < 20; dupSize++ ) { 801 // final int expectedSize = Math.min(maxSize, dupSize); 802 // final String dup = Utils.dupString("C", dupSize); 803 // final List<String> nonUnique = dupSize > maxSize ? Arrays.asList(Utils.dupString("C", maxSize)) : Collections.<String>emptyList(); 804 // tests.add(new Object[]{3, maxSize, expectedSize, Arrays.asList("ACGT", "A" + dup + "GT"), nonUnique}); 805 // tests.add(new Object[]{3, maxSize, expectedSize, Arrays.asList("A" + dup + "GT", "ACGT"), nonUnique}); 806 // } 807 // } 808 // 809 // return tests.toArray(new Object[][]{}); 810 // } 811 // 812 // /** 813 // * Example testng test using MyDataProvider 814 // */ 815 // @Test(dataProvider = "KmerSizeData") 816 // public void testDynamicKmerSizing(final int min, final int max, final int expectKmer, final List<String> seqs, final List<String> expectedNonUniques) { 817 // final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(min, max); 818 // for ( String seq : seqs ) assembler.addSequence(seq.getBytes(), false); 819 // assembler.buildGraphIfNecessary(); 820 // Assert.assertEquals(assembler.getKmerSize(), expectKmer); 821 // assertNonUniqueKmersInvolveLoops(assembler, expectedNonUniques.toArray(new String[]{})); 822 // } 823 824 @DataProvider(name = "DanglingTails") makeDanglingTailsData()825 public Object[][] makeDanglingTailsData() { 826 List<Object[]> tests = new ArrayList<>(); 827 828 // add 1M to the expected CIGAR because it includes the previous (common) base too 829 tests.add(new Object[]{"AAAAAAAAAA", "CAAA", "5M", true, 3}); // incomplete haplotype 830 tests.add(new Object[]{"AAAAAAAAAA", "CAAAAAAAAAA", "1M1I10M", true, 10}); // insertion 831 tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10}); // deletion 832 tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7}); // 1 snp 833 tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2}); // several snps 834 tests.add(new Object[]{"AAAAA", "C", "1M4D1M", false, -1}); // funky SW alignment 835 tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", false, 1}); // very little data 836 tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1}); // ends in mismatch 837 tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0}); // alignment is too complex 838 tests.add(new Object[]{"AAAAA", "XXXXX", "1M5I", false, -1}); // insertion 839 840 return tests.toArray(new Object[][]{}); 841 } 842 843 @Test(dataProvider = "DanglingTails") testDanglingTails(final String refEnd, final String altEnd, final String cigar, final boolean cigarIsGood, final int mergePointDistanceFromSink)844 public void testDanglingTails(final String refEnd, 845 final String altEnd, 846 final String cigar, 847 final boolean cigarIsGood, 848 final int mergePointDistanceFromSink) { 849 850 final int kmerSize = 15; 851 852 // construct the haplotypes 853 final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT"; 854 final String ref = commonPrefix + refEnd; 855 final String alt = commonPrefix + altEnd; 856 857 // create the graph and populate it 858 final JunctionTreeLinkedDeBruijnGraph rtgraph = new JunctionTreeLinkedDeBruijnGraph(kmerSize); 859 rtgraph.addSequence("ref", ref.getBytes(), true); 860 final GATKRead read = ArtificialReadUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M"); 861 final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader(); 862 rtgraph.addRead(read, header); 863 rtgraph.buildGraphIfNecessary(); 864 865 // confirm that we have just a single dangling tail 866 MultiDeBruijnVertex altSink = null; 867 for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) { 868 if ( rtgraph.isSink(v) && !rtgraph.isReferenceNode(v) ) { 869 Assert.assertTrue(altSink == null, "We found more than one non-reference sink"); 870 altSink = v; 871 } 872 } 873 874 Assert.assertTrue(altSink != null, "We did not find a non-reference sink"); 875 876 // confirm that the SW alignment agrees with our expectations 877 final JunctionTreeLinkedDeBruijnGraph.DanglingChainMergeHelper result = rtgraph.generateCigarAgainstDownwardsReferencePath(altSink, 0, 4, false, SmithWatermanJavaAligner 878 .getInstance()); 879 880 if ( result == null ) { 881 Assert.assertFalse(cigarIsGood); 882 return; 883 } 884 885 Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString()); 886 887 // confirm that the goodness of the cigar agrees with our expectations 888 Assert.assertEquals(JunctionTreeLinkedDeBruijnGraph.cigarIsOkayToMerge(result.cigar, false, true), cigarIsGood); 889 890 // confirm that the tail merging works as expected 891 if ( cigarIsGood ) { 892 final int mergeResult = rtgraph.mergeDanglingTail(result); 893 Assert.assertTrue(mergeResult == 1 || mergePointDistanceFromSink == -1); 894 895 // confirm that we created the appropriate edge 896 if ( mergePointDistanceFromSink >= 0 ) { 897 MultiDeBruijnVertex v = altSink; 898 for ( int i = 0; i < mergePointDistanceFromSink; i++ ) { 899 if ( rtgraph.inDegreeOf(v) != 1 ) 900 Assert.fail("Encountered vertex with multiple edges"); 901 v = rtgraph.getEdgeSource(rtgraph.incomingEdgeOf(v)); 902 } 903 Assert.assertTrue(rtgraph.outDegreeOf(v) > 1); 904 } 905 } 906 } 907 908 @Test (enabled = false) 909 //TODO this test will need to be reenabled when dangling head recovery is handled with the JTBesthaplotypeFinder testForkedDanglingEnds()910 public void testForkedDanglingEnds() { 911 912 final int kmerSize = 15; 913 914 // construct the haplotypes 915 final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT"; 916 final String refEnd = "GCTAGCTAATCG"; 917 final String altEnd1 = "ACTAGCTAATCG"; 918 final String altEnd2 = "ACTAGATAATCG"; 919 final String ref = commonPrefix + refEnd; 920 final String alt1 = commonPrefix + altEnd1; 921 final String alt2 = commonPrefix + altEnd2; 922 923 // create the graph and populate it 924 final JunctionTreeLinkedDeBruijnGraph rtgraph = new JunctionTreeLinkedDeBruijnGraph(kmerSize); 925 rtgraph.addSequence("ref", ref.getBytes(), true); 926 final GATKRead read1 = ArtificialReadUtils.createArtificialRead(alt1.getBytes(), Utils.dupBytes((byte) 30, alt1.length()), alt1.length() + "M"); 927 final GATKRead read2 = ArtificialReadUtils.createArtificialRead(alt2.getBytes(), Utils.dupBytes((byte) 30, alt2.length()), alt2.length() + "M"); 928 final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader(); 929 rtgraph.addRead(read1, header); 930 rtgraph.addRead(read2, header); 931 rtgraph.buildGraphIfNecessary(); 932 933 Assert.assertEquals(rtgraph.getSinks().size(), 3); 934 935 for (final MultiDeBruijnVertex altSink : rtgraph.getSinks()) { 936 if (rtgraph.isReferenceNode(altSink)) { 937 continue; 938 } 939 940 // confirm that the SW alignment agrees with our expectations 941 final JunctionTreeLinkedDeBruijnGraph.DanglingChainMergeHelper result = rtgraph.generateCigarAgainstDownwardsReferencePath(altSink, 942 0, 4, true, SmithWatermanJavaAligner.getInstance()); 943 Assert.assertNotNull(result); 944 Assert.assertTrue(JunctionTreeLinkedDeBruijnGraph.cigarIsOkayToMerge(result.cigar, false, true)); 945 946 // confirm that the tail merging works as expected 947 final int mergeResult = rtgraph.mergeDanglingTail(result); 948 Assert.assertTrue(mergeResult == 1); 949 } 950 951 final List<String> paths = new JunctionTreeKBestHaplotypeFinder<>(rtgraph).findBestHaplotypes().stream() 952 .map(kBestHaplotype -> new String(kBestHaplotype.getBases())) 953 .distinct() 954 .sorted() 955 .collect(Collectors.toList()); 956 957 Assert.assertEquals(paths.size(), 3); 958 Assert.assertEquals(paths, Arrays.asList(ref, alt1, alt2).stream().sorted().collect(Collectors.toList())); 959 } 960 961 @Test testWholeTailIsInsertion()962 public void testWholeTailIsInsertion() { 963 final JunctionTreeLinkedDeBruijnGraph rtgraph = new JunctionTreeLinkedDeBruijnGraph(10); 964 final JunctionTreeLinkedDeBruijnGraph.DanglingChainMergeHelper result = new JunctionTreeLinkedDeBruijnGraph.DanglingChainMergeHelper(null, null, "AXXXXX".getBytes(), "AAAAAA".getBytes(), TextCigarCodec.decode("5I1M")); 965 final int mergeResult = rtgraph.mergeDanglingTail(result); 966 Assert.assertEquals(mergeResult, 0); 967 } 968 969 @Test testGetBasesForPath()970 public void testGetBasesForPath() { 971 972 final int kmerSize = 4; 973 final String testString = "AATGGGGCAATACTA"; 974 975 final JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(kmerSize); 976 graph.addSequence(testString.getBytes(), true); 977 graph.buildGraphIfNecessary(); 978 979 final List<MultiDeBruijnVertex> vertexes = new ArrayList<>(); 980 MultiDeBruijnVertex v = graph.getReferenceSourceVertex(); 981 while ( v != null ) { 982 vertexes.add(v); 983 v = graph.getNextReferenceVertex(v); 984 } 985 986 final String resultForTails = new String(graph.getBasesForPath(vertexes, false)); 987 Assert.assertEquals(resultForTails, testString.substring(kmerSize - 1)); 988 final String resultForHeads = new String(graph.getBasesForPath(vertexes, true)); 989 Assert.assertEquals(resultForHeads, "GTAAGGGCAATACTA"); // because the source node will be reversed 990 } 991 992 @DataProvider(name = "DanglingHeads") makeDanglingHeadsData()993 public Object[][] makeDanglingHeadsData() { 994 List<Object[]> tests = new ArrayList<>(); 995 996 // add 1M to the expected CIGAR because it includes the last (common) base too 997 tests.add(new Object[]{"XXTXXXXAACCGGTTACGT", "AAYCGGTTACGT", "8M", true}); // 1 snp 998 tests.add(new Object[]{"XXXAACCGGTTACGT", "XAAACCGGTTACGT", "7M", false}); // 1 snp 999 tests.add(new Object[]{"XXTXXXXAACCGGTTACGT", "XAACGGTTACGT", "4M1D4M", false}); // deletion 1000 tests.add(new Object[]{"XXTXXXXAACCGGTTACGT", "AYYCGGTTACGT", "8M", true}); // 2 snps 1001 tests.add(new Object[]{"XXTXXXXAACCGGTTACGTAA", "AYCYGGTTACGTAA", "9M", true}); // 2 snps 1002 tests.add(new Object[]{"XXXTXXXAACCGGTTACGT", "AYCGGTTACGT", "7M", true}); // very little data 1003 tests.add(new Object[]{"XXTXXXXAACCGGTTACGT", "YCCGGTTACGT", "6M", true}); // begins in mismatch 1004 1005 //TODO these tests are diffiuclt to evaluate, as GraphBasedKBestHaplotypeFinder currently cannot handle looping sequence which poses a problem for testing explicitly looping refrence behavior 1006 // tests.add(new Object[]{"XXTXXXXAACCGGTTACGTTTACGT", "AAYCGGTTACGT", "8M", true}); // dangling head hanging off of a non-unique reference kmer 1007 // tests.add(new Object[]{"XXTXXXXAACCGGTTACGTCGGTTACGT", "AAYCGGTTACGT", "8M", true}); 1008 1009 return tests.toArray(new Object[][]{}); 1010 } 1011 1012 @Test(dataProvider = "DanglingHeads") testDanglingHeads(final String ref, final String alt, final String cigar, final boolean shouldBeMerged)1013 public void testDanglingHeads(final String ref, 1014 final String alt, 1015 final String cigar, 1016 final boolean shouldBeMerged) { 1017 1018 final int kmerSize = 5; 1019 1020 // create the graph and populate it 1021 final JunctionTreeLinkedDeBruijnGraph rtgraph = new JunctionTreeLinkedDeBruijnGraph(kmerSize); 1022 rtgraph.addSequence("ref", ref.getBytes(), true); 1023 final GATKRead read = ArtificialReadUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M"); 1024 final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader(); 1025 rtgraph.addRead(read, header); 1026 rtgraph.setMaxMismatchesInDanglingHead(10); 1027 rtgraph.buildGraphIfNecessary(); 1028 1029 // confirm that we have just a single dangling head 1030 MultiDeBruijnVertex altSource = null; 1031 for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) { 1032 if ( rtgraph.isSource(v) && !rtgraph.isReferenceNode(v) ) { 1033 Assert.assertTrue(altSource == null, "We found more than one non-reference source"); 1034 altSource = v; 1035 } 1036 } 1037 1038 Assert.assertTrue(altSource != null, "We did not find a non-reference source"); 1039 1040 // confirm that the SW alignment agrees with our expectations 1041 final JunctionTreeLinkedDeBruijnGraph.DanglingChainMergeHelper result = rtgraph.generateCigarAgainstUpwardsReferencePath(altSource, 0, 1, false, SmithWatermanJavaAligner 1042 .getInstance()); 1043 1044 if ( result == null ) { 1045 Assert.assertFalse(shouldBeMerged); 1046 return; 1047 } 1048 1049 Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString()); 1050 1051 // confirm that the tail merging works as expected 1052 final int mergeResult = rtgraph.mergeDanglingHeadLegacy(result); 1053 Assert.assertTrue(mergeResult > 0 || !shouldBeMerged); 1054 1055 // confirm that we created the appropriate bubble in the graph only if expected 1056 rtgraph.cleanNonRefPaths(); 1057 // final SeqGraph seqGraph = rtgraph.toSequenceGraph(); 1058 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> paths = new GraphBasedKBestHaplotypeFinder<>(rtgraph, rtgraph.getReferenceSourceVertex(), rtgraph.getReferenceSinkVertex()).findBestHaplotypes(); 1059 Assert.assertEquals(paths.size(), shouldBeMerged ? 2 : 1); 1060 } 1061 1062 @Test(dataProvider = "DanglingHeads", enabled = false) 1063 //TODO this test will need to be enabled when we change the dangling end recovery logic, as of right now definitionally we are 1064 //TODO incapable of recovering dangling heads and that will have to change eventually testDanglingHeadsWithJTBestHaplotypeFinder(final String ref, final String alt, final String cigar, final boolean shouldBeMerged)1065 public void testDanglingHeadsWithJTBestHaplotypeFinder(final String ref, 1066 final String alt, 1067 final String cigar, 1068 final boolean shouldBeMerged) { 1069 1070 final int kmerSize = 5; 1071 1072 // create the graph and populate it 1073 final JunctionTreeLinkedDeBruijnGraph rtgraph = new JunctionTreeLinkedDeBruijnGraph(kmerSize); 1074 rtgraph.addSequence("ref", ref.getBytes(), true); 1075 final GATKRead read = ArtificialReadUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M"); 1076 final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader(); 1077 rtgraph.addRead(read, header); 1078 rtgraph.setMaxMismatchesInDanglingHead(10); 1079 rtgraph.buildGraphIfNecessary(); 1080 1081 // confirm that we have just a single dangling head 1082 MultiDeBruijnVertex altSource = null; 1083 for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) { 1084 if ( rtgraph.isSource(v) && !rtgraph.isReferenceNode(v) ) { 1085 Assert.assertTrue(altSource == null, "We found more than one non-reference source"); 1086 altSource = v; 1087 } 1088 } 1089 1090 Assert.assertTrue(altSource != null, "We did not find a non-reference source"); 1091 1092 // confirm that the SW alignment agrees with our expectations 1093 final JunctionTreeLinkedDeBruijnGraph.DanglingChainMergeHelper result = rtgraph.generateCigarAgainstUpwardsReferencePath(altSource, 0, 1, false, SmithWatermanJavaAligner 1094 .getInstance()); 1095 1096 if ( result == null ) { 1097 Assert.assertFalse(shouldBeMerged); 1098 return; 1099 } 1100 1101 Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString()); 1102 1103 // confirm that the tail merging works as expected 1104 final int mergeResult = rtgraph.mergeDanglingHead(result); 1105 Assert.assertTrue(mergeResult > 0 || !shouldBeMerged); 1106 1107 // confirm that we created the appropriate bubble in the graph only if expected 1108 rtgraph.cleanNonRefPaths(); 1109 // final SeqGraph seqGraph = rtgraph.toSequenceGraph(); 1110 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> paths = new JunctionTreeKBestHaplotypeFinder<>(rtgraph, rtgraph.getReferenceSourceVertex(), rtgraph.getReferenceSinkVertex()).findBestHaplotypes(); 1111 Assert.assertEquals(paths.size(), shouldBeMerged ? 2 : 1); 1112 } 1113 1114 /** 1115 * Find the ending position of the longest uniquely matching 1116 * run of bases of kmer in seq. 1117 * 1118 * for example, if seq = ACGT and kmer is NAC, this function returns 1,2 as we have the following 1119 * match: 1120 * 1121 * 0123 1122 * .ACGT 1123 * NAC.. 1124 * 1125 * @param seq a non-null sequence of bytes 1126 * @param kmer a non-null kmer 1127 * @return the ending position and length where kmer matches uniquely in sequence, or null if no 1128 * unique longest match can be found 1129 */ findLongestUniqueSuffixMatch(final byte[] seq, final byte[] kmer)1130 private static Pair<Integer, Integer> findLongestUniqueSuffixMatch(final byte[] seq, final byte[] kmer) { 1131 int longestPos = -1; 1132 int length = 0; 1133 boolean foundDup = false; 1134 1135 for ( int i = 0; i < seq.length; i++ ) { 1136 final int matchSize = JunctionTreeLinkedDeBruijnGraph.longestSuffixMatch(seq, kmer, i); 1137 if ( matchSize > length ) { 1138 longestPos = i; 1139 length = matchSize; 1140 foundDup = false; 1141 } else if ( matchSize == length ) { 1142 foundDup = true; 1143 } 1144 } 1145 1146 return foundDup ? null : Pair.of(longestPos, length); 1147 } 1148 1149 /** 1150 * Example testng test using MyDataProvider 1151 */ 1152 @Test(dataProvider = "findLongestUniqueMatchData") testfindLongestUniqueMatch(final String seq, final String kmer, final int start, final int length)1153 public void testfindLongestUniqueMatch(final String seq, final String kmer, final int start, final int length) { 1154 // adaptor this code to do whatever testing you want given the arguments start and size 1155 final Pair<Integer, Integer> actual = findLongestUniqueSuffixMatch(seq.getBytes(), kmer.getBytes()); 1156 if ( start == -1 ) 1157 Assert.assertNull(actual); 1158 else { 1159 Assert.assertNotNull(actual); 1160 Assert.assertEquals((int)actual.getLeft(), start); 1161 Assert.assertEquals((int)actual.getRight(), length); 1162 } 1163 } 1164 1165 @DataProvider(name = "findLongestUniqueMatchData") makefindLongestUniqueMatchData()1166 public Object[][] makefindLongestUniqueMatchData() { 1167 List<Object[]> tests = new ArrayList<>(); 1168 1169 { // test all edge conditions 1170 final String ref = "ACGT"; 1171 for ( int start = 0; start < ref.length(); start++ ) { 1172 for ( int end = start + 1; end <= ref.length(); end++ ) { 1173 final String kmer = ref.substring(start, end); 1174 tests.add(new Object[]{ref, kmer, end - 1, end - start}); 1175 tests.add(new Object[]{ref, "N" + kmer, end - 1, end - start}); 1176 tests.add(new Object[]{ref, "NN" + kmer, end - 1, end - start}); 1177 tests.add(new Object[]{ref, kmer + "N", -1, 0}); 1178 tests.add(new Object[]{ref, kmer + "NN", -1, 0}); 1179 } 1180 } 1181 } 1182 1183 { // multiple matches 1184 final String ref = "AACCGGTT"; 1185 for ( final String alt : Arrays.asList("A", "C", "G", "T") ) 1186 tests.add(new Object[]{ref, alt, -1, 0}); 1187 tests.add(new Object[]{ref, "AA", 1, 2}); 1188 tests.add(new Object[]{ref, "CC", 3, 2}); 1189 tests.add(new Object[]{ref, "GG", 5, 2}); 1190 tests.add(new Object[]{ref, "TT", 7, 2}); 1191 } 1192 1193 { // complex matches that have unique substrings of lots of parts of kmer in the ref 1194 final String ref = "ACGTACGTACGT"; 1195 tests.add(new Object[]{ref, "ACGT", -1, 0}); 1196 tests.add(new Object[]{ref, "TACGT", -1, 0}); 1197 tests.add(new Object[]{ref, "GTACGT", -1, 0}); 1198 tests.add(new Object[]{ref, "CGTACGT", -1, 0}); 1199 tests.add(new Object[]{ref, "ACGTACGT", -1, 0}); 1200 tests.add(new Object[]{ref, "TACGTACGT", 11, 9}); 1201 tests.add(new Object[]{ref, "NTACGTACGT", 11, 9}); 1202 tests.add(new Object[]{ref, "GTACGTACGT", 11, 10}); 1203 tests.add(new Object[]{ref, "NGTACGTACGT", 11, 10}); 1204 tests.add(new Object[]{ref, "CGTACGTACGT", 11, 11}); 1205 } 1206 1207 return tests.toArray(new Object[][]{}); 1208 } 1209 }