1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs; 2 3 import com.google.common.base.Strings; 4 import htsjdk.samtools.Cigar; 5 import htsjdk.samtools.CigarElement; 6 import htsjdk.samtools.CigarOperator; 7 import org.broadinstitute.hellbender.GATKBaseTest; 8 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.JunctionTreeLinkedDeBruijnGraph; 9 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; 10 import org.broadinstitute.hellbender.utils.read.AlignmentUtils; 11 import org.broadinstitute.hellbender.utils.read.CigarBuilder; 12 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanJavaAligner; 13 import org.testng.Assert; 14 import org.testng.annotations.DataProvider; 15 import org.testng.annotations.Test; 16 17 import java.util.*; 18 import java.util.stream.Collectors; 19 import java.util.stream.IntStream; 20 21 public class JunctionTreeKBestHaplotypeFinderUnitTest extends GATKBaseTest { 22 getBytes(final String alignment)23 public static byte[] getBytes(final String alignment) { 24 return alignment.replace("-","").getBytes(); 25 } 26 27 @DataProvider (name = "loopingReferences") loopingReferencesRecoverablehaplotypes()28 public static Object[][] loopingReferencesRecoverablehaplotypes() { 29 return new Object[][]{ 30 new Object[]{"GACACACAGTCA", 3, 8, true}, //ACA and CAC are repeated, 9 is enough bases to span the path 31 //TODO this fails due to new conservative edge pruning code new Object[]{"GACACACAGTCA", 3, 5, false}, //ACA and CAC are repeated, 8 is not 32 new Object[]{"GACACTCACGCACGG", 3, 6, true}, //CAC repeated thrice, showing that the loops are handled somewhat sanely 33 new Object[]{"GACATCGACGG", 3, 11, false}, //GAC repeated twice, starting with GAC, can it recover the reference if it starts on a repeated kmer (should not be recoverable based on our decisions) 34 new Object[]{"GACATCGACGG", 3, 6, false}, //With non-unique reference start we fall over for looping structures 35 new Object[]{"GACATCGACGG", 3, 6, false}, //reads too short to be resolvable 36 new Object[]{"GACATCACATC", 3, 8, true}, //final kmer ATC is repeated, can we recover the reference for repating final kmer 37 38 // Some less complicated cases 39 new Object[]{"ACTGTGGGGGGGGGGGGCCGCG", 5, 14, true}, //Just long enough to be resolvable (12 repeated G's with 1 base anchor on either side) 40 new Object[]{"ACTGTGGGGGGGGGGGGCCGCG", 5, 12, false}, //Just too short to be resolvable 41 42 new Object[]{"ACTGTTCTCTCTCTCTCCCGCG", 5, 14, true}, //Just long enough to be resolvable 43 //TODO this fails due to new conservative edge pruning code new Object[]{"ACTGTTCTCTCTCTCTCCCGCG", 5, 9, false}, //Just too short to be resolvable 44 }; 45 } 46 47 @Test (dataProvider = "loopingReferences") testRecoveryOfLoopingReferenceSequences(final String ref, final int kmerSize, final int readlength, final boolean resolvable)48 public void testRecoveryOfLoopingReferenceSequences(final String ref, final int kmerSize, final int readlength, final boolean resolvable) { 49 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(kmerSize); 50 assembler.addSequence("anonymous", getBytes(ref), true); 51 // Add "reads" to the graph 52 for (int i = 0; i + readlength <= ref.length(); i ++) { 53 assembler.addSequence("anonymous", Arrays.copyOfRange(getBytes(ref), i, i + readlength), false); 54 } 55 assembler.buildGraphIfNecessary(); 56 assembler.generateJunctionTrees(); 57 58 final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1) 59 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList()); 60 61 // For all of these loops we expect to recover at least the reference haplotype 62 Assert.assertTrue(bestPaths.contains(ref)); 63 64 if (resolvable) { 65 Assert.assertEquals(bestPaths.size(), 1); 66 } else { 67 Assert.assertTrue(bestPaths.size() > 1); 68 } 69 } 70 71 @Test (enabled = false) //TODO this test needs to be reconsidered, this is the "baby thrown out with the bathwater" for ignoring reference weight edges wherever possible, 72 // todo this loop becomes unresolvable because there is no evidence for the path out of the loop so the chain stops expanding after a point 73 // Asserting that for a very degenerate looping case (the only weight resides in the loop (which could happen due to non-unique kmers for unmerged dangling ends) 74 // note that this particular case is recovered by virtue of the fact that the reference path itself has weight 75 // This test asserts that ref path weight is still taken into accoun testDegenerateLoopingCase()76 public void testDegenerateLoopingCase() { 77 final String ref = "GACACTCACGCACGG"; 78 final int kmerSize = 3; 79 final int readlength = 6; 80 81 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(kmerSize); 82 assembler.addSequence("anonymous", getBytes(ref), true); 83 // Add "reads" to the graph 84 for (int i = 0; i + readlength < ref.length(); i ++) { 85 assembler.addSequence("anonymous", Arrays.copyOfRange(getBytes(ref), i, i + readlength), false); 86 } 87 assembler.buildGraphIfNecessary(); 88 assembler.generateJunctionTrees(); 89 90 final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1) 91 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList()); 92 93 // For all of these loops we expect to recover at least the reference haplotype 94 Assert.assertTrue(bestPaths.contains(ref)); 95 96 Assert.assertEquals(bestPaths.size(), 5); 97 98 } 99 100 @Test testRecoveryOfAPathDroppedByJunctionTrees()101 public void testRecoveryOfAPathDroppedByJunctionTrees() { 102 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 103 String ref = "AAAACAC"+"C"+"ATGTGCGG"+"T"+"GGGTT"; // the first site has an interesting graph structure and the second site is used to ensure the graph isinterestingg 104 105 // A simple snip het 106 String read1 = "AAAACAC"+"T"+"CTGTGCGG"+"C"+"GGGTT"; // CGAC merges with the below read 107 String read2 = "TGTGCGG"+"A"+"GGGTT"; // doesn't show up in the first graph 108 109 assembler.addSequence("reference", getBytes(ref), true); 110 111 assembler.addSequence("anonymous", getBytes(read1), false); 112 assembler.addSequence("anonymous", getBytes(read1), false); 113 assembler.addSequence("anonymous", getBytes(read1), false); 114 115 assembler.addSequence("anonymous", getBytes(read2), false); 116 117 assembler.buildGraphIfNecessary(); 118 assembler.generateJunctionTrees(); 119 120 final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1) 121 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList()); 122 123 Assert.assertEquals(bestPaths.size(), 2); 124 Assert.assertEquals(bestPaths.get(0), read1); 125 Assert.assertEquals(bestPaths.get(1), "AAAACAC"+"T"+"C" + read2); //asserting that the front of read 1 was appended to read 2 for haplotype construction 126 } 127 128 @Test 129 // This test documents one of the known edge cases in the pivotal edge recovery code where sometimes an edge might be 130 // dropped if there doesn't happen to be any result path that connects to the root vertex of this dorpped path testRecoveryOfAPathDroppedByJunctionTreeFailureDueToStillBeingUnreachable()131 public void testRecoveryOfAPathDroppedByJunctionTreeFailureDueToStillBeingUnreachable() { 132 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 133 String ref = "AAAACAC"+"C"+"ATGTGCGG"+"TTTAGAGAG"+"GGGTT"; // the first site has an interesting graph structure and the second site is used to ensure the graph is interestingg 134 135 // A simple snip het 136 String read1 = "AAAACAC"+"T"+"CTGTGCGG"+"C"+"GGGTT"; // CGAC merges with the below read 137 String read2 = "TAGAGTG"+"GGGTT"; // creates a branch off of the uncovered reference path 138 139 assembler.addSequence("reference", getBytes(ref), true); 140 141 assembler.addSequence("anonymous", getBytes(read1), false); 142 assembler.addSequence("anonymous", getBytes(read1), false); 143 assembler.addSequence("anonymous", getBytes(read1), false); 144 145 assembler.addSequence("anonymous", getBytes(read2), false); 146 147 assembler.buildGraphIfNecessary(); 148 assembler.generateJunctionTrees(); 149 150 final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1) 151 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList()); 152 153 Assert.assertEquals(bestPaths.size(), 1); 154 Assert.assertEquals(bestPaths.get(0), read1); 155 } 156 157 // TODO this test is disabled because currently we have NOT implemented, so now we are simply using the score of the output path as the 158 // TODO heurisitic for stapling the front onto this path. this might be addressed in the future. 159 @Test (enabled = false) testRecoveryOfDroppedPathChoosingMostLikePathDespiteThatPathHavingAWorseScore()160 public void testRecoveryOfDroppedPathChoosingMostLikePathDespiteThatPathHavingAWorseScore() { 161 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7); 162 String ref = "AAAACAC"+"C"+"ATGTGCGG"+"A"+"TTTAGAGAG"+"C"+"GGGTTCC"+"A"+"GAGAGATATA"+"C"+"GAGTTTTGTTT"; // the first site has an interesting graph structure and the second site is used to ensure the graph isinterestingg 163 164 String completePath1 = "AAAACAC"+"T"+"ATGTGCGG"+"A"+"TTTAGAGAG"+"A"+"GGGTTCC"+"T"+"GAGAGATATA"+"C"+"GAGTTTTGTTT"; 165 String completePath2 = "AAAACAC"+"G"+"ATGTGCGG"+"A"+"TTTAGAGAG"+"A"+"GGGTTCC"+"A"+"GAGAGATATA"+"G"+"GAGTTTTGTTT"; 166 String incompletePath = "TGTGCGG"+"C"+"TTTAGAGAG"+"A"+"GGGTTCC"+"A"+"GAGAGATATA"+"G"+"GAGTTTTGTTT"; // njote that this point is a copy of path 2 after its first C variant 167 168 // Ensure that completePath1 will have the highest score by weighting it heavily 169 assembler.addSequence("reference", getBytes(ref), true); 170 171 assembler.addSequence("anonymous", getBytes(completePath1), false); 172 assembler.addSequence("anonymous", getBytes(completePath1), false); 173 assembler.addSequence("anonymous", getBytes(completePath1), false); 174 assembler.addSequence("anonymous", getBytes(completePath1), false); 175 assembler.addSequence("anonymous", getBytes(completePath1), false); 176 177 // followed by lower weight complete path 2 178 assembler.addSequence("anonymous", getBytes(completePath2), false); 179 assembler.addSequence("anonymous", getBytes(completePath2), false); 180 assembler.addSequence("anonymous", getBytes(completePath2), false); 181 182 // And the incomplete path 183 assembler.addSequence("anonymous", getBytes(incompletePath), false); 184 assembler.addSequence("anonymous", getBytes(incompletePath), false); 185 186 assembler.buildGraphIfNecessary(); 187 assembler.generateJunctionTrees(); 188 189 final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1) 190 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList()); 191 192 193 Assert.assertEquals(bestPaths.size(), 3); 194 Assert.assertEquals(bestPaths.get(0), completePath1); 195 Assert.assertEquals(bestPaths.get(1), completePath2); 196 Assert.assertEquals(bestPaths.get(2), "AAAACAC"+"G"+"A" + incompletePath); // asserting that path 2 (the most similar path) was used for the head of incomplete path 197 } 198 199 @Test 200 // This test demonstrates a potential source of infinite looping, specifically the chain extension testDegernerateLoopingDanglingEndInChainGatheringCode()201 public void testDegernerateLoopingDanglingEndInChainGatheringCode() { 202 final String ref = "GACACTCACGCACGGCC"; 203 final String alt = "GACACTCACCCCCCCCC"; // the alt ends with a looping and un-escpaed string that is repetative, this will make a path that has no hope of escape (RUN!) 204 final int kmerSize = 5; 205 final int readlength = 8; 206 207 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(kmerSize); 208 assembler.addSequence("anonymous", getBytes(ref), true); 209 // Add "reads" to the graph 210 for (int i = 0; i + readlength < ref.length(); i ++) { 211 assembler.addSequence("anonymous", Arrays.copyOfRange(getBytes(alt), i, i + readlength), false); 212 } 213 assembler.buildGraphIfNecessary(); 214 assembler.generateJunctionTrees(); 215 assembler.pruneJunctionTrees(1); 216 217 final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1) 218 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList()); 219 220 Assert.assertTrue(true, "This case could have infinitely looped because the junction trees were all uninformative and pruned on the poly-C branch. There is no condition to stop because the junction tree was removed by pruning"); 221 } 222 223 @Test 224 // Asserting that for a very degenerate looping case (the only weight resides in the loop (which could happen due to non-unique kmers for unmerged dangling ends) 225 // note that this particular case is recovered by virtue of the fact that the reference path itself has weight testDegernerateLoopingDanglingEnd()226 public void testDegernerateLoopingDanglingEnd() { 227 final String ref = "GACACTCACGCACGGCC"; 228 final String alt = "GACACTCACCCCCCCCC"; // the alt ends with a looping and un-escpaed string that is repetative, this will make a path that has no hope of escape (RUN!) 229 final int kmerSize = 5; 230 final int readlength = 8; 231 232 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(kmerSize); 233 assembler.addSequence("anonymous", getBytes(ref), true); 234 // Add "reads" to the graph 235 for (int i = 0; i + readlength < ref.length(); i ++) { 236 assembler.addSequence("anonymous", Arrays.copyOfRange(getBytes(alt), i, i + readlength), false); 237 } 238 assembler.buildGraphIfNecessary(); 239 assembler.generateJunctionTrees(); 240 241 final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1) 242 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList()); 243 244 // For this graph structure, there is no evidence whatsoever that a path follow the reference, unfortunately this means we can currently not recover the loop in the alt 245 Assert.assertTrue(bestPaths.isEmpty()); 246 247 } 248 249 @Test 250 // Due to there being no JT covering the reference, we assert that we are only finding the read supported alt path despite the reference path being a reasonable start path testJunctionTreeRefusesToFollowReferencePathUnlessThereIsNoChoice()251 public void testJunctionTreeRefusesToFollowReferencePathUnlessThereIsNoChoice() { 252 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(4); 253 String ref = "AAAACAC"+"CCGA"+"ATGTGGGG"+"A"+"GGGTT"; // the first site has an interesting graph structure and the second site is used to ensure the graph isinterestingg 254 255 // A simple snip het 256 String read1 = "AAAACAC"+"TCGA"+"CTGTGGGG"+"C"+"GGGTT"; // CGAC merges with the below read 257 String read2 = "AAAACAC"+"TCGA"+"CTGTGGGG"+"C"+"GGGTT"; // CGAC merges with the above read 258 259 assembler.addSequence("anonymous", getBytes(ref), true); 260 assembler.addSequence("anonymous", getBytes(read1), false); 261 assembler.addSequence("anonymous", getBytes(read2), false); 262 263 assembler.buildGraphIfNecessary(); 264 assembler.generateJunctionTrees(); 265 266 final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1) 267 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList()); 268 269 Assert.assertEquals(bestPaths.size(), 1); 270 Assert.assertEquals(new String(bestPaths.get(0).getBytes()), read1); 271 } 272 273 // TODO this test will soon be obsolete, we will change path termination to be based on loops soon enough (disabled because the number was bumped up to 7 rather than 4 which broke this test) 274 @Test (enabled = false) 275 // This test asserts the current behavior towards junction tree ending because of insufficient junction tree data, if we are past our point of junction tree evidence we will fail 276 // to find paths, thus this test documents the limitations therein. testPathTerminationBasedOnDistanceFromLastHaplotype()277 public void testPathTerminationBasedOnDistanceFromLastHaplotype() { 278 final JunctionTreeLinkedDeBruijnGraph assembler1 = new JunctionTreeLinkedDeBruijnGraph(6); 279 final JunctionTreeLinkedDeBruijnGraph assembler2 = new JunctionTreeLinkedDeBruijnGraph(6); 280 281 String ref = "AAACAC"+"C"+"ATGGCGG"+"A"+"GGAGTT"+"T"+"GCTCGAA"+"G"+"GGCGTA"+"C"+"CCTACCT"; // the first site has an interesting graph structure and the second site is used to ensure the graph isinterestingg 282 String alt1 = "AAACAC"+"A"+"ATGGCGG"+"T"+"GGAGTT"+"G"+"GCTCGAA"+"A"+"GGCGTA"+"G"+"CCTACCT"; 283 String alt2 = "AAACAC"+"A"+"ATGGCGG"+"T"+"GGAGTT"+"G"+"GCTCGAA"+"A"+"GGCGTA"+"C"+"CCTACCT"; //No expansion of the last path 284 285 assembler1.addSequence("anonymous", getBytes(ref), true); 286 assembler2.addSequence("anonymous", getBytes(ref), true); 287 // Add short sequences of reads to generate the appropriate alt paths 288 for (int i = 0; i + 6 < ref.length(); i ++) { 289 assembler1.addSequence("anonymous", Arrays.copyOfRange(getBytes(alt1), i, i + 7), false); 290 assembler2.addSequence("anonymous", Arrays.copyOfRange(getBytes(alt2), i, i + 7), false); 291 } 292 293 assembler1.buildGraphIfNecessary(); 294 assembler2.buildGraphIfNecessary(); 295 assembler1.generateJunctionTrees(); 296 assembler2.generateJunctionTrees(); 297 // Assert that the trees are all pointless 298 assembler1.getReadThreadingJunctionTrees(false).values().forEach(tree -> Assert.assertTrue(tree.getRootNode().hasNoEvidence())); 299 assembler2.getReadThreadingJunctionTrees(false).values().forEach(tree -> Assert.assertTrue(tree.getRootNode().hasNoEvidence())); 300 301 // We had to close out our paths because none of the junction trees are informative and there are 4 decisions to make 302 final List<String> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler1).setJunctionTreeEvidenceWeightThreshold(1) 303 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList()); 304 Assert.assertEquals(bestPaths.size(), 0); 305 306 // We didn't have to close out our paths because there were < 4 decisions made without junction trees, thus we do recover the path 307 final List<String> bestPaths2 = new JunctionTreeKBestHaplotypeFinder<>(assembler2).setJunctionTreeEvidenceWeightThreshold(1) 308 .findBestHaplotypes(5).stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toList()); 309 Assert.assertEquals(bestPaths2.size(), 1); 310 } 311 312 @Test 313 // This is a test enforcing that the behavior around nodes are both outDegree > 1 while also having downstream children with inDegree > 1. 314 // We are asserting that the JunctionTree generated by this case lives on the node itself testPerfectlyPhasedHaplotypeRecoveryRef()315 public void testPerfectlyPhasedHaplotypeRecoveryRef() { 316 int readlength = 20; 317 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7); 318 String ref = "AAACAAG"+"G"+"TTGGGTTCG"+"A"+"GCGGGGTTC"+"T"+"CTCGAAGT"+"T"+"CTTGGTAATAT"+"A"+"GGGGGCCCC"; // Reference with 5 sites all separated by at least kmer size 319 String alt1 = "AAACAAG"+"T"+"TTGGGTTCG"+"G"+"GCGGGGTTC"+"A"+"CTCGAAGT"+"C"+"CTTGGTAATAT"+"G"+"GGGGGCCCC"; // Alt with different values for all sites 320 321 // Generate some reads that do not span the entire active region 322 assembler.addSequence("anonymous", getBytes(ref), true); 323 for (int i = 0; i + readlength < ref.length(); i++) { 324 assembler.addSequence("anonymous", getBytes(alt1.substring(i, i + readlength)), false); 325 assembler.addSequence("anonymous", getBytes(ref.substring(i, i + readlength)), false); 326 } 327 328 assembler.buildGraphIfNecessary(); 329 assembler.generateJunctionTrees(); 330 331 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 332 Assert.assertEquals(junctionTrees.size(), 10); 333 334 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler); 335 Assert.assertEquals(finder.sources.size(), 1); 336 Assert.assertEquals(finder.sinks.size(), 1); 337 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(); 338 339 // We assert that we found all of the haplotypes present in reads and nothing else 340 Assert.assertEquals(haplotypes.size(), 2); 341 Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet()); 342 // Asserting that 343 Assert.assertTrue(foundHaplotypes.contains(alt1)); 344 Assert.assertTrue(foundHaplotypes.contains(ref)); 345 } 346 347 @Test 348 // This test is intended to illustrate a problem we know causes probems for junction trees, namely that despite there being 349 // evidence for a particular path, if there is insufficient evidence we simply end up with combinatorial expansion like before testInsufficientJunctionTreeDataCausingCombinatorialExpansion()350 public void testInsufficientJunctionTreeDataCausingCombinatorialExpansion() { 351 int readlength = 20; 352 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7); 353 String ref = "AAACAAG"+"G"+"TTGGGTTCG"+"A"+"GCGGGGTTC"+"T"+"CTCGAAGT"+"T"+"CTTGGTAATAT"+"A"+"GGGGGCCCC"; // Reference with 5 sites all separated by at least kmer size 354 String alt1 = "AAACAAG"+"T"+"TTGGGTTCG"+"G"+"GCGGGGTTC"+"A"+"CTCGAAGT"+"C"+"CTTGGTAATAT"+"G"+"GGGGGCCCC"; // Alt with different values for all sites 355 356 // Generate some reads that do not span the entire active region 357 assembler.addSequence("anonymous", getBytes(ref), true); 358 359 assembler.addSequence("anonymous", getBytes(alt1), false); 360 assembler.addSequence("anonymous", getBytes(ref), false); 361 362 assembler.buildGraphIfNecessary(); 363 assembler.generateJunctionTrees(); 364 365 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 366 Assert.assertEquals(junctionTrees.size(), 10); 367 368 JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler); 369 Assert.assertEquals(finder.sources.size(), 1); 370 Assert.assertEquals(finder.sinks.size(), 1); 371 List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(); 372 373 // We assert that we found all of the haplotypes present in reads and nothing else 374 Assert.assertEquals(haplotypes.size(), 32); // 2^5 paths from combinatorial expansion 375 Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet()); 376 // Asserting that 377 Assert.assertTrue(foundHaplotypes.contains(alt1)); 378 Assert.assertTrue(foundHaplotypes.contains(ref)); 379 380 } 381 382 383 @Test 384 // This is a test enforcing that the behavior around nodes are both outDegree > 1 while also having downstream children with inDegree > 1. 385 // We are asserting that the JunctionTree generated by this case lives on the node itself testPerfectlyPhasedHaplotypeRecoveryTwoAlts()386 public void testPerfectlyPhasedHaplotypeRecoveryTwoAlts() { 387 int readlength = 20; 388 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7); 389 String ref = "AAACAAG"+"G"+"TTGGGTTCG"+"A"+"GCGGGGTTC"+"T"+"CTCGAAGT"+"T"+"CTTGGTAATAT"+"A"+"GGGGGCCCC"; // Reference with 5 sites all separated by at least kmer size 390 String alt1 = "AAACAAG"+"T"+"TTGGGTTCG"+"G"+"GCGGGGTTC"+"A"+"CTCGAAGT"+"C"+"CTTGGTAATAT"+"G"+"GGGGGCCCC"; // Alt with different values for all sites 391 String alt2 = "AAACAAG"+"T"+"TTGGGTTCG"+"G"+"GCGGGGTTC"+"C"+"CTCGAAGT"+"C"+"CTTGGTAATAT"+"G"+"GGGGGCCCC"; // Alt with one different value from alt1 392 393 // Generate some reads that do not span the entire active region 394 assembler.addSequence("anonymous", getBytes(ref), true); 395 for (int i = 0; i + readlength < ref.length(); i++) { 396 assembler.addSequence("anonymous", getBytes(alt1.substring(i, i + readlength)), false); 397 assembler.addSequence("anonymous", getBytes(alt2.substring(i, i + readlength)), false); 398 } 399 400 assembler.buildGraphIfNecessary(); 401 assembler.generateJunctionTrees(); 402 403 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 404 Assert.assertEquals(junctionTrees.size(), 6); 405 406 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler); 407 Assert.assertEquals(finder.sources.size(), 1); 408 Assert.assertEquals(finder.sinks.size(), 1); 409 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(); 410 411 // We assert that we found all of the haplotypes present in reads and nothing else 412 Assert.assertEquals(haplotypes.size(), 2); 413 Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet()); 414 // Asserting that 415 Assert.assertTrue(foundHaplotypes.contains(alt1)); 416 Assert.assertTrue(foundHaplotypes.contains(alt2)); 417 } 418 419 @Test 420 // Asserting that the reference end base is handled with care... This code has the tendency to cut early 421 // TODO this test is disabled testNonUniqueRefStopPosition()422 public void testNonUniqueRefStopPosition() { 423 int readlength = 15; 424 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7); 425 String ref = "AACTTGGGTGTGTGAAACCCGGGTTGTGTGTGAA"; // The sequence GTGTGTGAA is repeated 426 427 // Generate some reads that do not span the entire active region 428 assembler.addSequence("anonymous", getBytes(ref), true); 429 for (int i = 0; i <= ref.length(); i+=2) { 430 assembler.addSequence("anonymous", getBytes(ref.substring(i, i + readlength <= ref.length() ? i + readlength : ref.length())), false); 431 } 432 433 assembler.buildGraphIfNecessary(); 434 assembler.generateJunctionTrees(); 435 436 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 437 Assert.assertEquals(junctionTrees.size(), 2); 438 439 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(3); 440 Assert.assertEquals(finder.sources.size(), 1); 441 Assert.assertEquals(finder.sinks.size(), 1); 442 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(5); 443 444 Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet()); 445 // Asserting that the correct reference haplotype actually existed 446 Assert.assertTrue(foundHaplotypes.contains(ref)); 447 // Asserting that we did NOT find the haplotype with 0 loops of the repeat: 448 Assert.assertFalse(foundHaplotypes.contains("AACTTGGGTGTGTG")); 449 450 // We assert that we found all of the haplotypes present in reads and nothing else 451 Assert.assertEquals(haplotypes.size(), 1); 452 } 453 454 @Test testNonUniqueRefStopPositionNoStopJTToResolveIt()455 public void testNonUniqueRefStopPositionNoStopJTToResolveIt() { 456 int readlength = 15; 457 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7); 458 String ref = "AACTTGGGTGTGTGAAACCCGGGTTGTGTGTGAA"; // The sequence GTGTGTGAA is repeated 459 460 // Generate some reads that do not span the entire active region 461 assembler.addSequence("anonymous", getBytes(ref), true); 462 for (int i = 0; i + readlength < ref.length() - 2; i++) { //the - 2 means that we don't span to the end 463 assembler.addSequence("anonymous", getBytes(ref.substring(i, i + readlength)), false); 464 } 465 466 assembler.buildGraphIfNecessary(); 467 assembler.generateJunctionTrees(); 468 469 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 470 Assert.assertEquals(junctionTrees.size(), 1); 471 472 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler); 473 Assert.assertEquals(finder.sources.size(), 1); 474 Assert.assertEquals(finder.sinks.size(), 1); 475 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(5); 476 477 // We assert that we found all of the haplotypes present in reads and nothing else 478 Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet()); 479 // Asserting that the correct reference haplotype actually existed 480 Assert.assertTrue(foundHaplotypes.contains(ref)); 481 // Asserting that we did NOT find the haplotype with 0 loops of the repeat: 482 Assert.assertFalse(foundHaplotypes.contains("AACTTGGGTGTGTG")); 483 //TODO maybe assert something about the lenght... unclear 484 } 485 486 @Test 487 // Test asserting that the behavior is reasonable when there is read data past the reference end kmer testReferenceEndWithErrorSpanningPastEndBase()488 public void testReferenceEndWithErrorSpanningPastEndBase() { 489 int readlength = 15; 490 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7); 491 String ref = "AACTGGGTT" + "GCGCGCGTTACCCGT"; // The sequence GTGTGTGAA is repeated 492 String alt = "AACTGGGTT"+"T"+"GCGCGCGTTACCCGTTT"; // Alt contig with error spanning past the end of the reference sequence 493 494 // Generate some reads that do not span the entire active region 495 assembler.addSequence("anonymous", getBytes(ref), true); 496 for (int i = 0; i + readlength < alt.length(); i++) { 497 assembler.addSequence("anonymous", getBytes(alt.substring(i, i + readlength)), false); 498 } 499 500 assembler.buildGraphIfNecessary(); 501 assembler.generateJunctionTrees(); 502 503 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 504 Assert.assertEquals(junctionTrees.size(), 1); //TODO this will become 2 once the change is implemented 505 506 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler); 507 Assert.assertEquals(finder.sources.size(), 1); 508 Assert.assertEquals(finder.sinks.size(), 1); 509 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(5); 510 511 // We assert that we found all of the haplotypes present in reads and nothing else 512 Assert.assertEquals(haplotypes.size(), 1); 513 Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet()); 514 // Asserting that we did NOT find the reference haplotype itself 515 Assert.assertFalse(foundHaplotypes.contains(ref)); 516 // Asserting that we did find the alt haplotype minus the ending bases 517 Assert.assertTrue(foundHaplotypes.contains("AACTGGGTT"+"T"+"GCGCGCGTTACCCGT")); 518 } 519 520 @Test 521 // Test asserting that the behavior is reasonable when there is read data past the reference end kmer testFullReferencePathRecoveryDespiteReadsNotReachingLastBase()522 public void testFullReferencePathRecoveryDespiteReadsNotReachingLastBase() { 523 int readlength = 15; 524 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7); 525 String ref = "AACTGGGTT" + "GCGCGCGTTACCCGT"; // The sequence GTGTGTGAA is repeated 526 String alt = "AACTGGGTT"+"T"+"GCGCGCGTTACCC"; // Alt contig that doesn't reach the end of the reference 527 528 // Generate some reads that do not span the entire active region 529 assembler.addSequence("anonymous", getBytes(ref), true); 530 for (int i = 0; i + readlength < alt.length(); i++) { 531 assembler.addSequence("anonymous", getBytes(alt.substring(i, i + readlength)), false); 532 } 533 534 assembler.buildGraphIfNecessary(); 535 assembler.generateJunctionTrees(); 536 537 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 538 Assert.assertEquals(junctionTrees.size(), 1); 539 540 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler); 541 Assert.assertEquals(finder.sources.size(), 1); 542 Assert.assertEquals(finder.sinks.size(), 1); 543 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(5); 544 545 // We assert that we found all of the haplotypes present in reads and nothing else 546 Assert.assertEquals(haplotypes.size(), 1); 547 Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet()); 548 // Asserting that we did NOT find the reference haplotype itself 549 Assert.assertFalse(foundHaplotypes.contains(ref)); 550 // Asserting that we did find the alt haplotype minus the ending bases 551 Assert.assertTrue(foundHaplotypes.contains("AACTGGGTT"+"T"+"GCGCGCGTTACCCGT")); 552 } 553 554 @Test 555 // The read evidence at the loop is short (15 bases) and consequently doesn't span the entire loop. testOfLoopingReferenceReadsTooShortToRecoverIt()556 public void testOfLoopingReferenceReadsTooShortToRecoverIt() { 557 int readlength = 15; 558 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7); 559 String ref = "AAACTTTCGCGGGCCCTTAAACCCGCCCTTAAACCCGCCCTTAAACCGCTGTAAGAAA"; // The sequence GCCCTTAAACCC (12 bases) is repeated 560 561 // Generate some reads that do not span the entire active region 562 assembler.addSequence("anonymous", getBytes(ref), true); 563 for (int i = 0; i + readlength < ref.length(); i++) { 564 assembler.addSequence("anonymous", getBytes(ref.substring(i, i + readlength)), false); 565 assembler.addSequence("anonymous", getBytes(ref.substring(i, i + readlength)), false); 566 } 567 568 assembler.buildGraphIfNecessary(); 569 assembler.generateJunctionTrees(); 570 571 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 572 Assert.assertEquals(junctionTrees.size(), 2); 573 574 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler); 575 Assert.assertEquals(finder.sources.size(), 1); 576 Assert.assertEquals(finder.sinks.size(), 1); 577 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(5); //NOTE we only ask for 5 haplotypes here since the graph might loop forever... 578 579 // We assert that we found all of the haplotypes present in reads and nothing else 580 Assert.assertEquals(haplotypes.size(), 2); 581 Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet()); 582 // Asserting that the correct reference haplotype actually existed 583 Assert.assertTrue(foundHaplotypes.contains(ref)); 584 // Asserting that we did NOT find the haplotype with 0 loops of the repeat: 585 Assert.assertFalse(foundHaplotypes.contains("AAACTTTCGCGGGCCCTTAAACCGCTGTAAGAAA")); 586 } 587 588 // TODO this is a lingering and very serious problem that we aim to resolve shortly somehow, unclear as to exaclty how right now 589 @Test (enabled = false) 590 // This test illustrates a current known issue with the new algorithm, where old junction trees with subranchees that don't have a lot of data 591 // are used in place of younger trees that might cointain evidence of new paths. This particular test shows that a variant might be dropped as a result. testOrphanedSubBranchDueToLackingOldJunctionTree()592 public void testOrphanedSubBranchDueToLackingOldJunctionTree() { 593 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(6); 594 595 String ref = "AAAACAC"+"T"+"ATGTGGGG"+"A"+"GGGTTAA"+"A"+"GTCTGAA"; 596 String haplotype1 = "AAAACAC"+"G"+"ATGTGGGG"+"T"+"GGGTTAA"+"A"+"GTCTGAA"; 597 String haplotype2 = "AAAACAC"+"G"+"ATGTGGGG"+"T"+"GGGTTAA"+"C"+"GTCTGAA"; 598 int longReadLength = 20; 599 int shortReadLength = 10; 600 601 // Add the reference a few times to get it into the junction trees 602 assembler.addSequence("anonymous", getBytes(ref), true); 603 assembler.addSequence("anonymous", getBytes(ref), false); 604 assembler.addSequence("anonymous", getBytes(ref), false); 605 assembler.addSequence("anonymous", getBytes(ref), false); 606 607 // Add haplotype 1 reads that are long enough to catch the last variant site from the first junction tree (22) 608 for (int i = 0; i + longReadLength < ref.length(); i++) { 609 assembler.addSequence("anonymous", getBytes(haplotype1.substring(i, i + longReadLength)), false); 610 } 611 612 // Add haplotype 2 reads that are long enough to leapfrog the entire haplotype but insufficient to establish the G->T->C path on the first tree 613 for (int i = 0; i + shortReadLength < ref.length(); i++) { 614 assembler.addSequence("anonymous", getBytes(haplotype2.substring(i, i + shortReadLength)), false); 615 } 616 617 assembler.generateJunctionTrees(); 618 619 final List<String> haplotypes = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(2) 620 .findBestHaplotypes(5).stream().map(h -> new String(h.getBases())).collect(Collectors.toList()); 621 622 // Assert that the reference haplotype is recovered 623 Assert.assertTrue(haplotypes.contains(ref)); 624 // Assert that haplotype1 is recovered 625 Assert.assertTrue(haplotypes.contains(haplotype1)); 626 // Assert that haplotype2 is recovered 627 // NOTE: this gets dropped because the oldest junction tree on the T->A path has data for haplotype1 but not haplotype2 628 Assert.assertTrue(haplotypes.contains(haplotype2)); 629 } 630 631 @Test 632 // This is a test enforcing that the behavior around nodes are both outDegree > 1 while also having downstream children with inDegree > 1. 633 // We are asserting that the JunctionTree generated by this case lives on the node itself testEdgeCaseInvolvingHighInDegreeAndOutDegreeChars()634 public void testEdgeCaseInvolvingHighInDegreeAndOutDegreeChars() { 635 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(4); 636 String ref = "AAAACAC"+"CCGA"+"ATGTGGGG"+"A"+"GGGTT"; // the first site has an interesting graph structure and the second site is used to ensurethe graph is intersting 637 638 // A simple snip het 639 String refRead1 = "AAAACAC"+"CCGA"+"ATGTGGGG"+"A"+"GGGTT"; 640 String read1 = "AAAACAC"+"CCGA"+"CTGTGGGG"+"C"+"GGGTT"; // CGAC merges with the below read 641 String read2 = "AAAACAC"+"TCGA"+"CTGTGGGG"+"C"+"GGGTT"; // CGAC merges with the above read 642 643 assembler.addSequence("anonymous", getBytes(ref), true); 644 assembler.addSequence("anonymous", getBytes(refRead1), false); 645 assembler.addSequence("anonymous", getBytes(read1), false); 646 assembler.addSequence("anonymous", getBytes(read2), false); 647 648 assembler.buildGraphIfNecessary(); 649 assembler.generateJunctionTrees(); 650 651 Map<MultiDeBruijnVertex, JunctionTreeLinkedDeBruijnGraph.ThreadingTree> junctionTrees = assembler.getReadThreadingJunctionTrees(false); 652 Assert.assertEquals(junctionTrees.size(), 6); 653 654 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler); 655 finder.setJunctionTreeEvidenceWeightThreshold(0); 656 Assert.assertEquals(finder.sources.size(), 1); 657 Assert.assertEquals(finder.sinks.size(), 1); 658 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(); 659 660 // We assert that we found all of the haplotypes present in reads and nothing else 661 Assert.assertEquals(haplotypes.size(), 3); 662 Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet()); 663 Assert.assertTrue(foundHaplotypes.contains(refRead1)); 664 Assert.assertTrue(foundHaplotypes.contains(read1)); 665 Assert.assertTrue(foundHaplotypes.contains(read2)); 666 } 667 668 669 @Test 670 // This is a test enforcing that the behavior around nodes are both outDegree > 1 while also having downstream children with inDegree > 1. 671 // We are asserting that the JunctionTree generated by this case lives on the node itself testGraphRecoveryWhenTreeContainsRepeatedKmers()672 public void testGraphRecoveryWhenTreeContainsRepeatedKmers() { 673 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 674 String ref = "AAATCTTCGGGGGGGGGGGGGGTTTCTGGG"; // the first site has an interesting graph structure and the second site is used to ensurethe graph is intersting 675 676 // A simple snip het 677 String refRead = "AAATCTTCGGGGGGGGGGGGGGTTTCTGGG"; 678 679 assembler.addSequence("anonymous", getBytes(ref), true); 680 assembler.addSequence("anonymous", getBytes(refRead), false); 681 682 assembler.buildGraphIfNecessary(); 683 assembler.generateJunctionTrees(); 684 685 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(assembler); 686 finder.setJunctionTreeEvidenceWeightThreshold(0); 687 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder.findBestHaplotypes(); 688 689 // We assert that we found all of the haplotypes present in reads and nothing else 690 Assert.assertEquals(haplotypes.size(), 1); 691 Set<String> foundHaplotypes = haplotypes.stream().map(haplotype -> new String(haplotype.getBases())).collect(Collectors.toSet()); 692 Assert.assertTrue(foundHaplotypes.contains(refRead)); 693 } 694 695 @Test testSimpleJunctionTreeIncludeRefInJunctionTreeTwoSites()696 public void testSimpleJunctionTreeIncludeRefInJunctionTreeTwoSites() { 697 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(5); 698 String ref = "GGGAAAT" + "T" + "TCCGGC" + "T" + "CGTTTA"; //Two variant sites in close proximity 699 700 // A simple snip het 701 String altAARead1 = "GGGAAAT" + "A" + "TCCGGC" + "A" + "CGTTTA"; // Replaces a T with an A, then a T with a A 702 String altAARead2 = "GGGAAAT" + "A" + "TCCGGC" + "A" + "CGTTTA"; // Replaces a T with an A, then a T with a A 703 String altTCRead1 = "GGGAAAT" + "T" + "TCCGGC" + "C" + "CGTTTA"; // Keeps the T, then replaces a T with a C 704 String altTCRead2 = "GGGAAAT" + "T" + "TCCGGC" + "C" + "CGTTTA"; // Keeps the T, then replaces a T with a C 705 706 assembler.addSequence("anonymous", getBytes(ref), true); 707 assembler.addSequence("anonymous", getBytes(altAARead1), false); 708 assembler.addSequence("anonymous", getBytes(altAARead2), false); 709 assembler.addSequence("anonymous", getBytes(altTCRead1), false); 710 assembler.addSequence("anonymous", getBytes(altTCRead2), false); 711 712 assembler.buildGraphIfNecessary(); 713 assembler.generateJunctionTrees(); 714 assembler.pruneJunctionTrees(0); 715 716 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder1 = new JunctionTreeKBestHaplotypeFinder<>(assembler); 717 Assert.assertEquals(finder1.sources.size(), 1); 718 Assert.assertEquals(finder1.sinks.size(), 1); 719 720 List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = finder1.findBestHaplotypes(10); 721 System.out.println(); 722 } 723 724 // Disabled until multi-sink/source edges are supported 725 @Test (enabled = false) testDeadNode()726 public void testDeadNode(){ 727 final JunctionTreeLinkedDeBruijnGraph g = new JunctionTreeLinkedDeBruijnGraph(3); 728 final MultiDeBruijnVertex v1 = new MultiDeBruijnVertex("a".getBytes()); 729 final MultiDeBruijnVertex v2 = new MultiDeBruijnVertex("b".getBytes()); 730 final MultiDeBruijnVertex v3 = new MultiDeBruijnVertex("c".getBytes()); 731 final MultiDeBruijnVertex v4 = new MultiDeBruijnVertex("d".getBytes()); 732 final MultiDeBruijnVertex v5 = new MultiDeBruijnVertex("e".getBytes()); 733 g.addVertex(v1); //source 734 g.addVertex(v2); 735 g.addVertex(v3); 736 g.addVertex(v4); //sink 737 g.addVertex(v5); //sink 738 g.addEdge(v1, v2); 739 g.addEdge(v2, v3); 740 g.addEdge(v3, v2);//cycle 741 g.addEdge(v2, v5); 742 g.addEdge(v1, v4); 743 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder1 = new JunctionTreeKBestHaplotypeFinder<>(g); 744 Assert.assertEquals(finder1.sources.size(), 1); 745 Assert.assertEquals(finder1.sinks.size(), 2); 746 747 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder2 = new JunctionTreeKBestHaplotypeFinder<>(g, v1, v4); //v5 is a dead node (can't reach the sink v4) 748 Assert.assertEquals(finder2.sources.size(), 1); 749 Assert.assertEquals(finder2.sinks.size(), 1); 750 } 751 752 @DataProvider(name = "BasicPathFindingData") makeBasicPathFindingData()753 public Object[][] makeBasicPathFindingData() { 754 final List<Object[]> tests = new ArrayList<>(); 755 for ( final int nStartNodes : Arrays.asList(1, 2, 3) ) { 756 for ( final int nBranchesPerBubble : Arrays.asList(2, 3) ) { 757 for ( final int nEndNodes : Arrays.asList(1, 2, 3) ) { 758 tests.add(new Object[]{nStartNodes, nBranchesPerBubble, nEndNodes}); 759 } 760 } 761 } 762 return tests.toArray(new Object[][]{}); 763 } 764 765 private static int weight = 1; createVertices(final JunctionTreeLinkedDeBruijnGraph graph, final int n, final MultiDeBruijnVertex source, final MultiDeBruijnVertex target)766 final Set<MultiDeBruijnVertex> createVertices(final JunctionTreeLinkedDeBruijnGraph graph, final int n, final MultiDeBruijnVertex source, final MultiDeBruijnVertex target) { 767 final List<String> seqs = Arrays.asList("A", "C", "G", "T"); 768 final Set<MultiDeBruijnVertex> vertices = new LinkedHashSet<>(); 769 for ( int i = 0; i < n; i++ ) { 770 final MultiDeBruijnVertex v = new MultiDeBruijnVertex(seqs.get(i).getBytes()); 771 graph.addVertex(v); 772 vertices.add(v); 773 if ( source != null ) graph.addEdge(source, v, new MultiSampleEdge(false, weight++, 1)); 774 if ( target != null ) graph.addEdge(v, target, new MultiSampleEdge(false, weight++, 1)); 775 } 776 return vertices; 777 } 778 779 780 @Test(dataProvider = "BasicPathFindingData") testBasicPathFindingNoJunctionTrees(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes)781 public void testBasicPathFindingNoJunctionTrees(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes) { 782 final JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(11); 783 784 final MultiDeBruijnVertex middleTop = new MultiDeBruijnVertex("GTAC".getBytes()); 785 final MultiDeBruijnVertex middleBottom = new MultiDeBruijnVertex("ACTG".getBytes()); 786 graph.addVertices(middleTop, middleBottom); 787 final Set<MultiDeBruijnVertex> starts = createVertices(graph, nStartNodes, null, middleTop); 788 @SuppressWarnings("unused") 789 final Set<MultiDeBruijnVertex> bubbles = createVertices(graph, nBranchesPerBubble, middleTop, middleBottom); 790 final Set<MultiDeBruijnVertex> ends = createVertices(graph, nEndNodes, middleBottom, null); 791 792 final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * nEndNodes; 793 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> haplotypes = new JunctionTreeKBestHaplotypeFinder<>(graph, starts, ends, JunctionTreeKBestHaplotypeFinder.DEFAULT_OUTGOING_JT_EVIDENCE_THRESHOLD_TO_BELEIVE, false).findBestHaplotypes(); 794 Assert.assertEquals(haplotypes.size(), expectedNumOfPaths); 795 IntStream.range(1, haplotypes.size()).forEach(n -> Assert.assertTrue(haplotypes.get(n-1).score() >= haplotypes.get(n).score())); 796 } 797 798 799 @DataProvider(name = "BasicBubbleDataProvider") makeBasicBubbleDataProvider()800 public Object[][] makeBasicBubbleDataProvider() { 801 final List<Object[]> tests = new ArrayList<>(); 802 for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) { 803 for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) { 804 tests.add(new Object[]{refBubbleLength, altBubbleLength}); 805 } 806 } 807 return tests.toArray(new Object[][]{}); 808 } 809 810 @Test(dataProvider = "BasicBubbleDataProvider") testBasicBubbleData(final int refBubbleLength, final int altBubbleLength)811 public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) { 812 // Construct the assembly graph 813 JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(4); 814 final String preRef = "ATGG"; 815 final String postRef = "GCGGC"; 816 817 final String ref = preRef + Strings.repeat("A", refBubbleLength) + postRef; 818 final String alt = preRef + Strings.repeat("A", altBubbleLength-1) + "T" + postRef; 819 820 graph.addSequence("anonomyous", ref.getBytes(), true); 821 for (int i = 0; i < 5; i++) graph.addSequence("anonomyous", ref.getBytes(), 1, false); 822 for (int i = 0; i < 10; i++) graph.addSequence("anonomyous", alt.getBytes(), 1, false); 823 824 graph.buildGraphIfNecessary(); 825 graph.generateJunctionTrees(); 826 827 // Construct the test path 828 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(graph).findBestHaplotypes(5); 829 Assert.assertEquals(bestPaths.size(), 2); 830 KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge> path = bestPaths.get(0); 831 832 // Construct the actual cigar string implied by the test path 833 final CigarBuilder expectedCigar = new CigarBuilder(); 834 expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); 835 if( refBubbleLength > altBubbleLength ) { 836 expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); 837 expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M)); 838 } else if ( refBubbleLength < altBubbleLength ) { 839 expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); 840 expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength, CigarOperator.I)); 841 } else { 842 expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); 843 } 844 expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M)); 845 846 Assert.assertEquals(path.calculateCigar(ref.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(), expectedCigar.make().toString(), "Cigar string mismatch"); 847 } 848 849 @DataProvider(name = "TripleBubbleDataProvider") makeTripleBubbleDataProvider()850 public Object[][] makeTripleBubbleDataProvider() { 851 final List<Object[]> tests = new ArrayList<>(); 852 for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) { 853 for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) { 854 for ( final boolean offRefEnding : Arrays.asList(true, false) ) { 855 for ( final boolean offRefBeginning : Arrays.asList(false) ) { 856 tests.add(new Object[]{refBubbleLength, altBubbleLength, offRefBeginning, offRefEnding}); 857 } 858 } 859 } 860 } 861 return tests.toArray(new Object[][]{}); 862 } 863 864 @Test(dataProvider = "TripleBubbleDataProvider") 865 //TODO figure out what to do here as this test doesn't apply testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding)866 public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) { 867 // Construct the assembly graph 868 SeqGraph graph = new SeqGraph(11); 869 final String preAltOption = "ATCGATCGATCGATCGATCG"; 870 final String postAltOption = "CCCC"; 871 final String preRef = "ATGG"; 872 final String postRef = "GGCCG"; 873 final String midRef1 = "TTCCT"; 874 final String midRef2 = "CCCAAAAAAAAAAAA"; 875 876 SeqVertex preV = new SeqVertex(preAltOption); 877 SeqVertex v = new SeqVertex(preRef); 878 SeqVertex v2Ref = new SeqVertex(Strings.repeat("A", refBubbleLength)); 879 SeqVertex v2Alt = new SeqVertex(Strings.repeat("A", altBubbleLength - 1) + "T"); 880 SeqVertex v4Ref = new SeqVertex(Strings.repeat("C", refBubbleLength)); 881 SeqVertex v4Alt = new SeqVertex(Strings.repeat("C", altBubbleLength - 1) + "T"); 882 SeqVertex v6Ref = new SeqVertex(Strings.repeat("G", refBubbleLength)); 883 SeqVertex v6Alt = new SeqVertex(Strings.repeat("G", altBubbleLength - 1) + "T"); 884 SeqVertex v3 = new SeqVertex(midRef1); 885 SeqVertex v5 = new SeqVertex(midRef2); 886 SeqVertex v7 = new SeqVertex(postRef); 887 SeqVertex postV = new SeqVertex(postAltOption); 888 889 final String ref = preRef + v2Ref.getSequenceString() + midRef1 + v4Ref.getSequenceString() + midRef2 + v6Ref.getSequenceString() + postRef; 890 891 graph.addVertex(preV); 892 graph.addVertex(v); 893 graph.addVertex(v2Ref); 894 graph.addVertex(v2Alt); 895 graph.addVertex(v3); 896 graph.addVertex(v4Ref); 897 graph.addVertex(v4Alt); 898 graph.addVertex(v5); 899 graph.addVertex(v6Ref); 900 graph.addVertex(v6Alt); 901 graph.addVertex(v7); 902 graph.addVertex(postV); 903 graph.addEdge(preV, v, new BaseEdge(false, 1)); 904 graph.addEdge(v, v2Ref, new BaseEdge(true, 10)); 905 graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); 906 graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); 907 graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); 908 graph.addEdge(v3, v4Ref, new BaseEdge(true, 10)); 909 graph.addEdge(v4Ref, v5, new BaseEdge(true, 10)); 910 graph.addEdge(v3, v4Alt, new BaseEdge(false, 5)); 911 graph.addEdge(v4Alt, v5, new BaseEdge(false, 5)); 912 graph.addEdge(v5, v6Ref, new BaseEdge(true, 11)); 913 graph.addEdge(v6Ref, v7, new BaseEdge(true, 11)); 914 graph.addEdge(v5, v6Alt, new BaseEdge(false, 55)); 915 graph.addEdge(v6Alt, v7, new BaseEdge(false, 55)); 916 graph.addEdge(v7, postV, new BaseEdge(false, 1)); 917 918 // Construct the test path 919 Path<SeqVertex,BaseEdge> path = new Path<>( (offRefBeginning ? preV : v), graph); 920 if( offRefBeginning ) 921 path = new Path<>(path, graph.getEdge(preV, v)); 922 path = new Path<>(path, graph.getEdge(v, v2Alt)); 923 path = new Path<>(path, graph.getEdge(v2Alt, v3)); 924 path = new Path<>(path, graph.getEdge(v3, v4Ref)); 925 path = new Path<>(path, graph.getEdge(v4Ref, v5)); 926 path = new Path<>(path, graph.getEdge(v5, v6Alt)); 927 path = new Path<>(path, graph.getEdge(v6Alt, v7)); 928 if( offRefEnding ) 929 path = new Path<>(path, graph.getEdge(v7,postV)); 930 931 // Construct the actual cigar string implied by the test path 932 final CigarBuilder expectedCigar = new CigarBuilder(); 933 if( offRefBeginning ) { 934 expectedCigar.add(new CigarElement(preAltOption.length(), CigarOperator.I)); 935 } 936 expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); 937 // first bubble 938 if( refBubbleLength > altBubbleLength ) { 939 expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); 940 expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M)); 941 } else if ( refBubbleLength < altBubbleLength ) { 942 expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); 943 expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength, CigarOperator.I)); 944 } else { 945 expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); 946 } 947 expectedCigar.add(new CigarElement(midRef1.length(), CigarOperator.M)); 948 // second bubble is ref path 949 expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); 950 expectedCigar.add(new CigarElement(midRef2.length(), CigarOperator.M)); 951 // third bubble 952 if( refBubbleLength > altBubbleLength ) { 953 expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); 954 expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M)); 955 } else if ( refBubbleLength < altBubbleLength ) { 956 expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); 957 expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength, CigarOperator.I)); 958 } else { 959 expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); 960 } 961 expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M)); 962 if( offRefEnding ) { 963 expectedCigar.add(new CigarElement(postAltOption.length(), CigarOperator.I)); 964 } 965 966 Assert.assertEquals(path.calculateCigar(ref.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(), 967 expectedCigar.make().toString(), 968 "Cigar string mismatch: ref = " + ref + " alt " + new String(path.getBases())); 969 } 970 971 @Test (enabled = false) 972 //TODO this test illustrates a problem with junction trees and dangling end recovery, namely the path that the JT points to 973 //TODO is a dead end after dangling tail recovery. This needs to be resolved with either SmithWaterman or by coopting the threading code testIntraNodeInsertionDeletion()974 public void testIntraNodeInsertionDeletion() { 975 // Construct the assembly graph 976 final JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(5); 977 final String ref = "TTTT" + "CCCCCGGG" + "TTT"; 978 final String alt = "TTTT" + "AAACCCCC" + "TTT"; 979 980 graph.addSequence("anonymous", getBytes(ref), true); 981 graph.addSequence("anonymous", getBytes(ref), false); 982 graph.addSequence("anonymous", getBytes(alt), false); 983 graph.buildGraphIfNecessary(); 984 graph.recoverDanglingHeads(0, 0, true, SmithWatermanJavaAligner.getInstance()); 985 graph.recoverDanglingTails(0, 0, true, SmithWatermanJavaAligner.getInstance()); 986 graph.generateJunctionTrees(); 987 988 @SuppressWarnings("all") 989 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(graph).findBestHaplotypes(); 990 Assert.assertEquals(bestPaths.size(), 2); 991 final Path<MultiDeBruijnVertex,MultiSampleEdge> refPath = bestPaths.get(0); 992 final Path<MultiDeBruijnVertex,MultiSampleEdge> altPath = bestPaths.get(1); 993 994 Assert.assertEquals(refPath.calculateCigar(ref.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(), "15M"); 995 Assert.assertEquals(altPath.calculateCigar(ref.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(), "4M3I5M3D3M"); 996 } 997 998 @Test (enabled = false) //TODO this is disabled due to the k max paths per node optimization not being implemented yet 999 /* 1000 This is a test of what can go awry if path pruning is based on the number of incoming edges to a given vertex. 1001 An illustration of the graph in this test: 1002 / ----- top \ 1003 /(33) \ 1004 refStart -(32) -- mid -- midExt ---------------------------- refEnd 1005 \(34) / (1) / 1006 \ ---- bot /- (33) botExt - (15) - botExtTop - / 1007 \ (18) / 1008 \ ------ botExtBot / 1009 1010 The expected best paths are (refStart->top->midExt->refEnd), and (refStart->mid->midExt->refEnd) because the bottom 1011 path is penalized for two extra forks despite it greedily looking the best at the start. 1012 1013 Because the old behavior used to base pruning on the number of incoming edges < K, the edge (bot->midExt) would be created 1014 first, then (top -> midExt) but (mid -> midExt) would never be created because midExt already has 2 incoming edges. 1015 */ 1016 testDegeneratePathPruningOptimizationCase()1017 public void testDegeneratePathPruningOptimizationCase() { 1018 // Construct an assembly graph demonstrating this issue 1019 final SeqGraph graph = new SeqGraph(11); 1020 final SeqVertex top = new SeqVertex("T"); 1021 final SeqVertex mid = new SeqVertex("C"); 1022 final SeqVertex midAndTopExt = new SeqVertex("GGG"); 1023 final SeqVertex bot = new SeqVertex("G"); 1024 final SeqVertex botExt = new SeqVertex("AAA"); 1025 final SeqVertex botExtTop = new SeqVertex("A"); 1026 final SeqVertex botExtBot = new SeqVertex("T"); 1027 1028 // Ref source and sink vertexes 1029 final SeqVertex refStart = new SeqVertex("CCCCCGGG"); 1030 final SeqVertex refEnd = new SeqVertex("TTTT"); 1031 1032 graph.addVertices(top, bot, mid, midAndTopExt, bot, botExt, botExtTop, botExtBot, refStart, refEnd); 1033 // First "diamond" with 3 mostly equivalent cost paths 1034 graph.addEdges(() -> new BaseEdge(false, 34), refStart, bot, botExt); 1035 graph.addEdges(() -> new BaseEdge(true, 33), refStart, top, midAndTopExt); 1036 graph.addEdges(() -> new BaseEdge(false, 32), refStart, mid, midAndTopExt); 1037 1038 // The best looking path reconnects with a very poor edge multiplicity to midAndTopExt (This is this is what casues the bug) 1039 graph.addEdges(() -> new BaseEdge(false, 1), bot, midAndTopExt); 1040 // There is another diamond at bot ext that will end up discounting that path from being in the k best 1041 graph.addEdges(() -> new BaseEdge(false, 15), botExt, botExtTop, refEnd); 1042 graph.addEdges(() -> new BaseEdge(false, 18), botExt, botExtBot, refEnd); 1043 1044 // Wheras the path is smooth sailing from refEnd 1045 graph.addEdges(() -> new BaseEdge(true, 65), midAndTopExt, refEnd); 1046 1047 @SuppressWarnings("all") 1048 final List<KBestHaplotype<SeqVertex, BaseEdge>> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(graph,refStart,refEnd).findBestHaplotypes(2); 1049 Assert.assertEquals(bestPaths.size(), 2); 1050 final Path<SeqVertex,BaseEdge> refPath = bestPaths.get(0); 1051 final Path<SeqVertex,BaseEdge> altPath = bestPaths.get(1); 1052 1053 Assert.assertEquals(refPath.getVertices().toArray(new SeqVertex[0]), new SeqVertex[]{refStart, top, midAndTopExt, refEnd}); 1054 Assert.assertEquals(altPath.getVertices().toArray(new SeqVertex[0]), new SeqVertex[]{refStart, mid, midAndTopExt, refEnd}); 1055 } 1056 1057 1058 @Test 1059 @SuppressWarnings({"unchecked"}) 1060 //TODO this test will make a good base for testing later realignment if the leading Ns are cut back down testHardSWPath()1061 public void testHardSWPath() { 1062 // Construct the assembly graph 1063 final JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(11); 1064 String ref = "NNNNNNNNNNN"+"TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"+"NNN"; // Alt with one different value from alt1 1065 String alt = "NNNNNNNNNNN"+"ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"+"NNN"; // Alt with one different value from alt1 1066 1067 // Generate some reads that do not span the entire active region 1068 graph.addSequence("anonymous", getBytes(ref), true); 1069 graph.addSequence("anonymous", getBytes(ref), false); 1070 graph.addSequence("anonymous", getBytes(alt), false); 1071 graph.buildGraphIfNecessary(); 1072 graph.recoverDanglingHeads(0, 2, true, SmithWatermanJavaAligner.getInstance()); 1073 graph.generateJunctionTrees(); 1074 1075 @SuppressWarnings("all") 1076 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> finder = new JunctionTreeKBestHaplotypeFinder<>(graph); 1077 finder.setJunctionTreeEvidenceWeightThreshold(1); 1078 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> paths = finder.findBestHaplotypes(); 1079 1080 Assert.assertEquals(paths.size(), 2); 1081 1082 final Path<MultiDeBruijnVertex, MultiSampleEdge> refPath = paths.get(0); 1083 final Path<MultiDeBruijnVertex, MultiSampleEdge> altPath = paths.get(1); 1084 1085 logger.warn("RefPath : " + refPath + " cigar " + refPath.calculateCigar(ref.getBytes(), 1086 SmithWatermanJavaAligner.getInstance())); 1087 logger.warn("AltPath : " + altPath + " cigar " + altPath.calculateCigar(ref.getBytes(), 1088 SmithWatermanJavaAligner.getInstance())); 1089 1090 Assert.assertEquals(refPath.calculateCigar(ref.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(), "59M"); 1091 Assert.assertEquals(altPath.calculateCigar(ref.getBytes(), SmithWatermanJavaAligner.getInstance()).toString(), "11M6I48M"); 1092 } 1093 1094 @Test testKmerGraphSimpleReferenceRecovery()1095 public void testKmerGraphSimpleReferenceRecovery() { 1096 // Construct the assembly graph 1097 final JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(5); 1098 final MultiDeBruijnVertex refSource = new MultiDeBruijnVertex( "AAATT".getBytes() ); 1099 final MultiDeBruijnVertex k1 = new MultiDeBruijnVertex( "AATTT".getBytes() ); 1100 final MultiDeBruijnVertex k2 = new MultiDeBruijnVertex( "ATTTG".getBytes() ); 1101 final MultiDeBruijnVertex k3 = new MultiDeBruijnVertex( "TTTGG".getBytes() ); 1102 final MultiDeBruijnVertex k4 = new MultiDeBruijnVertex( "TTGGG".getBytes() ); 1103 final MultiDeBruijnVertex k5 = new MultiDeBruijnVertex( "TGGGC".getBytes() ); 1104 final MultiDeBruijnVertex k6 = new MultiDeBruijnVertex( "GGGCC".getBytes() ); 1105 final MultiDeBruijnVertex k7 = new MultiDeBruijnVertex( "GGCCC".getBytes() ); 1106 final MultiDeBruijnVertex k8 = new MultiDeBruijnVertex( "GCCCT".getBytes() ); 1107 final MultiDeBruijnVertex refEnd = new MultiDeBruijnVertex( "CCCTT".getBytes() ); 1108 graph.addVertices(refSource, k1, k2, k3, k4, k5, k6, k7, k8, refEnd); 1109 graph.addEdges(() -> new MultiSampleEdge(true, 1, 1), refSource, k1, k2, k3, k4, k5, k6, k7, k8, refEnd); 1110 1111 @SuppressWarnings("all") 1112 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> paths = new JunctionTreeKBestHaplotypeFinder<>(graph, refSource, refEnd).findBestHaplotypes(); 1113 1114 Assert.assertEquals(paths.size(), 1); 1115 1116 final Path<MultiDeBruijnVertex, MultiSampleEdge> refPath = paths.get(0); 1117 1118 final String refString = "AAATTTGGGCCCTT"; 1119 1120 Assert.assertEquals(refPath.getBases(), refString.getBytes()); 1121 } 1122 1123 @Test testKmerGraphSimpleReferenceRecoveryWithSNP()1124 public void testKmerGraphSimpleReferenceRecoveryWithSNP() { 1125 // Construct the assembly graph 1126 final JunctionTreeLinkedDeBruijnGraph graph = new JunctionTreeLinkedDeBruijnGraph(5); 1127 final MultiDeBruijnVertex refSource = new MultiDeBruijnVertex( "AAATT".getBytes() ); 1128 final MultiDeBruijnVertex k1 = new MultiDeBruijnVertex( "AATTT".getBytes() ); 1129 final MultiDeBruijnVertex k2 = new MultiDeBruijnVertex( "ATTTG".getBytes() ); 1130 final MultiDeBruijnVertex k3 = new MultiDeBruijnVertex( "TTTGG".getBytes() ); 1131 final MultiDeBruijnVertex k4 = new MultiDeBruijnVertex( "TTGGG".getBytes() ); 1132 final MultiDeBruijnVertex k5 = new MultiDeBruijnVertex( "TGGGC".getBytes() ); 1133 final MultiDeBruijnVertex k6 = new MultiDeBruijnVertex( "GGGCC".getBytes() ); 1134 final MultiDeBruijnVertex k7 = new MultiDeBruijnVertex( "GGCCC".getBytes() ); 1135 final MultiDeBruijnVertex k8 = new MultiDeBruijnVertex( "GCCCT".getBytes() ); 1136 final MultiDeBruijnVertex refEnd = new MultiDeBruijnVertex( "CCCTT".getBytes() ); 1137 final MultiDeBruijnVertex v3 = new MultiDeBruijnVertex( "TTTGC".getBytes() ); 1138 final MultiDeBruijnVertex v4 = new MultiDeBruijnVertex( "TTGCG".getBytes() ); 1139 final MultiDeBruijnVertex v5 = new MultiDeBruijnVertex( "TGCGC".getBytes() ); 1140 final MultiDeBruijnVertex v6 = new MultiDeBruijnVertex( "GCGCC".getBytes() ); 1141 final MultiDeBruijnVertex v7 = new MultiDeBruijnVertex( "CGCCC".getBytes() ); 1142 graph.addVertices(refSource, k1, k2, k3, k4, k5, k6, k7, k8, refEnd, v3, v4, v5, v6, v7); 1143 graph.addEdges(() -> new MultiSampleEdge(true, 1, 4), refSource, k1, k2, k3, k4, k5, k6, k7, k8, refEnd); 1144 graph.addEdges(() -> new MultiSampleEdge(false, 1, 3), k2, v3, v4, v5, v6, v7, k8); 1145 1146 @SuppressWarnings("all") 1147 final List<KBestHaplotype<MultiDeBruijnVertex, MultiSampleEdge>> paths = new JunctionTreeKBestHaplotypeFinder<>(graph, refSource, refEnd).findBestHaplotypes(); 1148 1149 Assert.assertEquals(paths.size(), 1); 1150 1151 final Path<MultiDeBruijnVertex, MultiSampleEdge> altPath = paths.get(0); 1152 1153 final String altString = "AAATTTGCGCCCTT"; 1154 1155 Assert.assertEquals(altPath.getBases(), altString.getBytes()); 1156 } 1157 1158 // ----------------------------------------------------------------- 1159 // 1160 // Systematic tests to ensure that we get the correct SW result for 1161 // a variety of variants in the ref vs alt bubble 1162 // 1163 // ----------------------------------------------------------------- 1164 1165 @DataProvider(name = "SystematicRefAltSWTestData") makeSystematicRefAltSWTestData()1166 public Object[][] makeSystematicRefAltSWTestData() { 1167 final List<Object[]> tests = new ArrayList<>(); 1168 1169 final List<List<String>> allDiffs = Arrays.asList( 1170 Arrays.asList("G", "C", "1M"), 1171 Arrays.asList("G", "", "1D"), 1172 Arrays.asList("", "C", "1I"), 1173 Arrays.asList("AAA", "CGT", "3M"), 1174 Arrays.asList("TAT", "CAC", "3M"), 1175 Arrays.asList("GCTG", "GTCG", "4M"), 1176 Arrays.asList("AAAAA", "", "5D"), 1177 Arrays.asList("", "AAAAA", "5I"), 1178 Arrays.asList("AAAAACC", "CCGGGGGG", "5D2M6I") 1179 ); 1180 1181 for ( final String prefix : Arrays.asList("", "X", "XXXXXXXXXXXXX")) { 1182 for ( final String end : Arrays.asList("", "X", "XXXXXXXXXXXXX")) { 1183 for ( final List<String> diffs : allDiffs ) 1184 tests.add(new Object[]{prefix, end, diffs.get(0), diffs.get(1), diffs.get(2)}); 1185 } 1186 } 1187 1188 return tests.toArray(new Object[][]{}); 1189 } 1190 1191 1192 /** 1193 * Convenience constructor for testing that creates a path through vertices in graph 1194 */ makePath(final List<T> vertices, final BaseGraph<T, E> graph)1195 private static <T extends BaseVertex, E extends BaseEdge> Path<T,E> makePath(final List<T> vertices, final BaseGraph<T, E> graph) { 1196 Path<T,E> path = new Path<>(vertices.get(0), graph); 1197 for ( int i = 1; i < vertices.size(); i++ ) { 1198 path = new Path<>(path, graph.getEdge(path.getLastVertex(), vertices.get(i))); 1199 } 1200 return path; 1201 } 1202 1203 1204 @Test testCreateMapOfPivotalEdgesInTopoglogicalOrderBasicExample()1205 public void testCreateMapOfPivotalEdgesInTopoglogicalOrderBasicExample() { 1206 int readlength = 20; 1207 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7); 1208 String ref = "TAAACAAG"+"G"+"TTGGGTTCG"+"A"+"GCGGGGTTC"+"T"+"CTCGAAGT"+"T"+"CTTGGTAATAT"+"A"+"GGGGGCCCC"; // Reference with 5 sites all separated by at least kmer size 1209 String alt1 = "TAAACAAG"+"T"+"TTGGGTTCG"+"G"+"GCGGGGTTC"+"A"+"CTCGAAGT"+"C"+"CTTGGTAATAT"+"G"+"GGGGGCCCC"; // Alt with different values for all sites 1210 1211 // Generate some reads that do not span the entire active region 1212 assembler.addSequence("anonymous", getBytes(ref), true); 1213 for (int i = 0; i + readlength < ref.length(); i++) { 1214 assembler.addSequence("anonymous", getBytes(alt1.substring(i, i + readlength)), false); 1215 } 1216 1217 assembler.buildGraphIfNecessary(); 1218 assembler.generateJunctionTrees(); 1219 1220 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1); 1221 1222 LinkedHashSet<MultiSampleEdge> pivotalEdges = bestPaths.createMapOfPivotalEdgesInTopologicalOrder(); 1223 Iterator<MultiSampleEdge> edgesInOrder = pivotalEdges.iterator(); 1224 ListIterator<String> expectedEdgeKmerTargetsFOrPivotalEdges = Arrays.asList(new String[]{"AACAAGT","GGTTCGG","GGGTTCA","CGAAGTC","TAATATG"}).listIterator(); 1225 1226 // Asserting that the order of edges is as expected 1227 Assert.assertEquals(pivotalEdges.size(), 5); 1228 while (edgesInOrder.hasNext()) { 1229 MultiSampleEdge nextEdge = edgesInOrder.next(); 1230 String expectedKmerEdge = expectedEdgeKmerTargetsFOrPivotalEdges.next(); 1231 String nextTarget = assembler.getEdgeTarget(nextEdge).getSequenceString(); 1232 Assert.assertEquals(nextTarget, expectedKmerEdge); 1233 } 1234 1235 } 1236 1237 @Test 1238 // this test asserts that included in pivotal edges are edges that are only reachable by following uncovered reference path testCreateMapOfPivotalEdgesInTopoglogicalOrderHiddenReferenceSequence()1239 public void testCreateMapOfPivotalEdgesInTopoglogicalOrderHiddenReferenceSequence() { 1240 int readlength = 20; 1241 final JunctionTreeLinkedDeBruijnGraph assembler = new JunctionTreeLinkedDeBruijnGraph(7); 1242 String ref = "TAAACAAG"+"G"+"TTGGGTTCG"+"AGCGGT"+"CTCGAAGT"+"T"+"CTTGGTAATAT"+"A"+"GGGGGCCCC"; // Reference with 5 sites all separated by at least kmer size 1243 String alt1 = "TAAACAAG"+"T"+"TTGGGTTCG"+"GGCGGA"+"CTCGAAGT"+"C"+"CTTGGTAATAT"+"G"+"GGGGGCCCC"; // Alt with different values for all sites 1244 String alt2 = "AGCGGTCT"+"G"+"CGAAGT"+"T"+"CTTGGTAATAT"+"A"+"GGGGGCCCC"; // A snippet of reference sequence starting after an uncovered reference edge 1245 1246 // Generate some reads that do not span the entire active region 1247 assembler.addSequence("anonymous", getBytes(ref), true); 1248 for (int i = 0; i + readlength < ref.length(); i++) { 1249 assembler.addSequence("anonymous", getBytes(alt1.substring(i, i + readlength)), false); 1250 } 1251 1252 for (int i = 0; i + readlength < alt2.length(); i++) { 1253 assembler.addSequence("anonymous", getBytes(alt2.substring(i, i + readlength)), false); 1254 } 1255 1256 assembler.buildGraphIfNecessary(); 1257 assembler.generateJunctionTrees(); 1258 1259 final JunctionTreeKBestHaplotypeFinder<MultiDeBruijnVertex, MultiSampleEdge> bestPaths = new JunctionTreeKBestHaplotypeFinder<>(assembler).setJunctionTreeEvidenceWeightThreshold(1); 1260 1261 LinkedHashSet<MultiSampleEdge> pivotalEdges = bestPaths.createMapOfPivotalEdgesInTopologicalOrder(); 1262 Iterator<MultiSampleEdge> edgesInOrder = pivotalEdges.iterator(); 1263 ListIterator<String> expectedEdgeKmerTargetsFOrPivotalEdges = Arrays.asList(new String[]{"AACAAGT","GGTTCGG", // edges corresoinding to the start of alt1 path 1264 "CGGTCTG", // Edge corresponding to the G insertion (This would be dropped if we didn't include reference edges with no support in this calculation) 1265 "CGAAGTC", 1266 "TAATATA","TAATATG" // Edges correspoindg to the ref and alt respectively, the order is arbitrary because the shortest path is what is taken to determine distance 1267 }).listIterator(); 1268 1269 // Asserting that the order of edges is as expected 1270 Assert.assertEquals(pivotalEdges.size(), 6); 1271 while (edgesInOrder.hasNext()) { 1272 MultiSampleEdge nextEdge = edgesInOrder.next(); 1273 String expectedKmerEdge = expectedEdgeKmerTargetsFOrPivotalEdges.next(); 1274 String nextTarget = assembler.getEdgeTarget(nextEdge).getSequenceString(); 1275 Assert.assertEquals(nextTarget, expectedKmerEdge); 1276 } 1277 } 1278 }