1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import com.google.common.collect.Lists; 5 import com.google.common.collect.Maps; 6 import htsjdk.samtools.util.Locatable; 7 import org.apache.commons.lang3.ArrayUtils; 8 import org.broadinstitute.hellbender.exceptions.UserException; 9 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.Kmer; 10 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.*; 11 import org.broadinstitute.hellbender.utils.Utils; 12 13 import java.io.File; 14 import java.io.FileNotFoundException; 15 import java.io.FileOutputStream; 16 import java.io.PrintStream; 17 import java.util.*; 18 import java.util.stream.Collectors; 19 20 /** 21 * Experimental version of the ReadThreadingGraph with support for threading reads to generate JunctionTrees for resolving 22 * connectivity information at longer ranges. 23 * 24 * Note that many of the non-DeBruijn graph alterations that are made to ReadThreadingGraph are not made here: 25 * - Non-Unique kmers are not duplicated by this graph 26 * - Kmers are not Zipped together to form a SeqGraph 27 * - The reference path is stored in its entirety rather than being calculated on the fly 28 * 29 * For ease of debugging, this graph supports the method {@link #printSimplifiedGraph(File, int)}} which generates a SequenceGraph and 30 * adds the junction trees to the output .dot file. 31 */ 32 public class JunctionTreeLinkedDeBruijnGraph extends AbstractReadThreadingGraph { 33 private static final long serialVersionUID = 1l; 34 private static final MultiDeBruijnVertex SYMBOLIC_END_VETEX = new MultiDeBruijnVertex(new byte[]{'_'}); 35 private MultiSampleEdge SYMBOLIC_END_EDGE; 36 37 private Map<MultiDeBruijnVertex, ThreadingTree> readThreadingJunctionTrees = new HashMap<>(); 38 39 // TODO should this be constructed here or elsewhere 40 private final Set<Kmer> kmers = new HashSet<>(); 41 42 @VisibleForTesting JunctionTreeLinkedDeBruijnGraph(int kmerSize)43 public JunctionTreeLinkedDeBruijnGraph(int kmerSize) { 44 this(kmerSize, false, (byte)6, 1, -1); 45 } 46 47 /** 48 * Create a new ReadThreadingAssembler using kmerSize for matching 49 * @param kmerSize must be >= 1 50 */ JunctionTreeLinkedDeBruijnGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases)51 JunctionTreeLinkedDeBruijnGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases) { 52 super(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples, numDanglingMatchingPrefixBases); 53 } 54 55 /** 56 * Find vertex and its position in seqForKmers where we should start assembling seqForKmers 57 * 58 * @param seqForKmers the sequence we want to thread into the graph 59 * @return the position of the starting vertex in seqForKmer, or -1 if it cannot find one 60 */ 61 findStartForJunctionThreading(final SequenceForKmers seqForKmers)62 protected int findStartForJunctionThreading(final SequenceForKmers seqForKmers) { 63 for ( int i = seqForKmers.start; i < seqForKmers.stop - kmerSize; i++ ) { 64 final Kmer kmer1 = new Kmer(seqForKmers.sequence, i, kmerSize); 65 if ( kmerToVertexMap.containsKey(kmer1) ) { 66 return i; 67 } 68 } 69 70 return -1; 71 } 72 73 @Override 74 // We don't need to track non-uniques here so this is a no-op preprocessReads()75 protected void preprocessReads() { 76 return; 77 } 78 79 @Override 80 // We don't want to remove pending sequences as they are the data we need for read threading shouldRemoveReadsAfterGraphConstruction()81 protected boolean shouldRemoveReadsAfterGraphConstruction() { 82 return false; 83 } 84 85 @Override 86 //TODO come up with some heuristic for when we think one of these graphs is "too difficult" to call properly isLowQualityGraph()87 public boolean isLowQualityGraph() { 88 return false; 89 } 90 91 /** 92 * Checks whether a kmer can be the threading start based on the current threading start location policy. 93 * 94 * @see #setThreadingStartOnlyAtExistingVertex(boolean) 95 * 96 * @param kmer the query kmer. 97 * @return {@code true} if we can start thread the sequence at this kmer, {@code false} otherwise. 98 */ isThreadingStart(final Kmer kmer, final boolean startThreadingOnlyAtExistingVertex)99 protected boolean isThreadingStart(final Kmer kmer, final boolean startThreadingOnlyAtExistingVertex) { 100 Utils.nonNull(kmer); 101 return !startThreadingOnlyAtExistingVertex || kmers.contains(kmer); 102 } 103 104 /** 105 * Finds the path in the graph from this vertex to the reference sink, including this vertex 106 * 107 * @param start the reference vertex to start from 108 * @param direction describes which direction to move in the graph (i.e. down to the reference sink or up to the source) 109 * @param blacklistedEdge edge to ignore in the traversal down; useful to exclude the non-reference dangling paths 110 * @return the path (non-null, non-empty) 111 */ 112 @Override getReferencePath(final MultiDeBruijnVertex start, final TraversalDirection direction, final Optional<MultiSampleEdge> blacklistedEdge)113 protected List<MultiDeBruijnVertex> getReferencePath(final MultiDeBruijnVertex start, 114 final TraversalDirection direction, 115 final Optional<MultiSampleEdge> blacklistedEdge) { 116 if ( direction == TraversalDirection.downwards ) { 117 return getReferencePathForwardFromKmer(start, blacklistedEdge); 118 } else { 119 return getReferencePathBackwardsForKmer(start); 120 } 121 } 122 123 // Since there are no non-unique kmers to worry about we just add it to our map 124 @Override trackKmer(Kmer kmer, MultiDeBruijnVertex newVertex)125 protected void trackKmer(Kmer kmer, MultiDeBruijnVertex newVertex) { 126 kmerToVertexMap.putIfAbsent(kmer, newVertex); 127 } 128 129 @VisibleForTesting getReferencePath(final TraversalDirection direction)130 List<MultiDeBruijnVertex> getReferencePath(final TraversalDirection direction) { 131 return Collections.unmodifiableList(direction == TraversalDirection.downwards ? referencePath : Lists.reverse(referencePath)); 132 } 133 134 /** 135 * Extends the current vertex chain by one, making sure to build junction tree objects at each node with out-degree > 1. 136 * 137 * If a node has an in-degree > 1, then it has a junction tree added to the previous node. 138 * 139 * @param prevVertex Current vertex to extend off of 140 * @param sequence Sequence of kmers to extend 141 * @param kmerStart index of where the current kmer starts 142 * @param alterTrees flag to control if we actually put anything into junction trees as a result of this path 143 * @return The next vertex in the sequence. Null if the corresponding edge doesn't exist. 144 */ extendJunctionThreadingByOne(final MultiDeBruijnVertex prevVertex, final byte[] sequence, final int kmerStart, final JunctionTreeThreadingHelper nodesHelper, final boolean alterTrees)145 protected MultiDeBruijnVertex extendJunctionThreadingByOne(final MultiDeBruijnVertex prevVertex, final byte[] sequence, final int kmerStart, final JunctionTreeThreadingHelper nodesHelper, final boolean alterTrees) { 146 final Set<MultiSampleEdge> outgoingEdges = outgoingEdgesOf(prevVertex); 147 148 final int nextPos = kmerStart + kmerSize - 1; 149 for (final MultiSampleEdge outgoingEdge : outgoingEdges) { 150 final MultiDeBruijnVertex target = getEdgeTarget(outgoingEdge); 151 if (target.getSuffix() == sequence[nextPos]) { 152 if (alterTrees) { 153 // Only want to create a new tree if we walk into a node that meets our junction tree criteria 154 // NOTE: we do this deciding on our path 155 nodesHelper.addTreeIfNecessary(prevVertex); 156 157 // If this node has an out-degree > 1, add the edge we took to existing trees 158 if (outgoingEdges.size() != 1) { 159 nodesHelper.addEdgeToJunctionTreeNodes(outgoingEdge); 160 } 161 } 162 return target; 163 } 164 } 165 return null; 166 } 167 168 /** 169 * NOTE: we override the old behavior here to allow for looping/nonunique references to be handled properly 170 * 171 * @return the reference source vertex pulled from the graph, can be null if it doesn't exist in the graph 172 */ 173 @Override getReferenceSourceVertex( )174 public final MultiDeBruijnVertex getReferenceSourceVertex( ) { 175 return referencePath != null && !referencePath.isEmpty()? referencePath.get(0) : null; 176 } 177 178 /** 179 * NOTE: we override the old behavior here to allow for looping/nonunique references to be handled properly 180 * 181 * @return the reference sink vertex pulled from the graph, can be null if it doesn't exist in the graph 182 */ 183 @Override getReferenceSinkVertex( )184 public final MultiDeBruijnVertex getReferenceSinkVertex( ) { 185 return referencePath != null && !referencePath.isEmpty()? referencePath.get(referencePath.size() - 1) : null; 186 } 187 188 /** 189 * This is a heruistic for deciding what the proper reference path is to align the dangling ends to. The last possible 190 * reference path emanating from the kmer (in the case of the root kmer for a dangling end being a repeated kmer 191 * from the reference) is chose (or the first in the case for dangling heads). This is based on the assumption that 192 * dangling tails worthy of recovering are often a result of the assembly window and thus we choose the last possible 193 * kmer as the option. 194 * 195 * //TODO both of these are a hack to emulate the current behavior when targetkmer isn't actually a reference kmer. The old behaior 196 * //TODO was to return a singleton list of targetKmer. The real solution is to walk back target kmer to the actual ref base 197 * 198 * @param targetKmer vertex corresponding to the root 199 * @return 200 */ getReferencePathForwardFromKmer(final MultiDeBruijnVertex targetKmer, final Optional<MultiSampleEdge> blacklistedEdge)201 private List<MultiDeBruijnVertex> getReferencePathForwardFromKmer(final MultiDeBruijnVertex targetKmer, 202 final Optional<MultiSampleEdge> blacklistedEdge) { 203 List<MultiDeBruijnVertex> extraSequence = new ArrayList<>(2); 204 MultiDeBruijnVertex vert = targetKmer; 205 int finalIndex = referencePath.lastIndexOf(vert); 206 207 while (finalIndex == -1 && vert != null) { 208 // If the current verex is not a reference vertex but exists, add it to our list of extra bases to append to the reference 209 extraSequence.add(vert); 210 211 final Set<MultiSampleEdge> outgoingEdges = outgoingEdgesOf(vert); 212 213 // singleton or empty set 214 final Set<MultiSampleEdge> blacklistedEdgeSet = blacklistedEdge.isPresent() ? Collections.singleton(blacklistedEdge.get()) : Collections.emptySet(); 215 216 // walk forward while the path is unambiguous 217 final List<MultiSampleEdge> edges = outgoingEdges.stream().filter(e -> !blacklistedEdgeSet.contains(e)).limit(2).collect(Collectors.toList()); 218 219 vert = edges.size() == 1 ? getEdgeTarget(edges.get(0)) : null; 220 221 // Defense against loops, kill the attempt if we are stuck in a loop 222 if (extraSequence.contains(vert)) { 223 System.err.println("Dangling End recovery killed because of a loop (getReferencePathForwardFromKmer)"); 224 vert = null; 225 } 226 227 finalIndex = vert == null ? -1 : referencePath.lastIndexOf(vert); 228 } 229 230 // if we found extra sequence append it to the front of the 231 extraSequence.addAll(finalIndex != -1 ? referencePath.subList(finalIndex, referencePath.size()) : Collections.emptyList()); 232 return extraSequence; 233 } 234 235 // TODO this behavior is frankly silly and needs to be fixed, there is no way upwards paths should be dangingling head recovered differently getReferencePathBackwardsForKmer(final MultiDeBruijnVertex targetKmer)236 private List<MultiDeBruijnVertex> getReferencePathBackwardsForKmer(final MultiDeBruijnVertex targetKmer) { 237 int firstIndex = referencePath.indexOf(targetKmer); 238 if (firstIndex == -1) return Collections.singletonList(targetKmer); 239 return Lists.reverse(referencePath.subList(0, firstIndex + 1)); 240 } 241 242 // Generate a SeqGraph that is special 243 @Override toSequenceGraph()244 public SeqGraph toSequenceGraph() { 245 throw new UnsupportedOperationException("Cannot construct a sequence graph using JunctionTreeLinkedDeBruijnGraph"); 246 } 247 248 /** 249 * Print out the graph in the dot language for visualization for the graph after merging the graph into a seq graph. 250 * Junction trees will naively add junction trees to the corresponding zipped SeqGraph node for its root vertex. 251 * 252 * NOTE: this is intended for debugging complex assembly graphs while preserving junction tree data. 253 * @param destination File to write to 254 */ 255 @VisibleForTesting printSimplifiedGraph(final File destination, final int pruneFactor)256 public final void printSimplifiedGraph(final File destination, final int pruneFactor) { 257 try (PrintStream stream = new PrintStream(new FileOutputStream(destination))) { 258 printSimplifiedGraph(stream, true, pruneFactor); 259 } catch ( final FileNotFoundException e ) { 260 throw new UserException.CouldNotReadInputFile(destination.getAbsolutePath(), e); 261 } 262 } printSimplifiedGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor)263 private final void printSimplifiedGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor) { 264 265 /// LOCAL CLASS TO MANAGE PRINTING THE JUNCTION TREES ONTO THE SEQ GRAPH 266 class PrintingSeqGraph extends SeqGraph { 267 private static final long serialVersionUID = 1l; 268 269 private PrintingSeqGraph(int kmerSize) { 270 super(kmerSize); 271 } 272 private Map<MultiDeBruijnVertex, SeqVertex> vertexToOrigionalSeqVertex; 273 private Map<SeqVertex, SeqVertex> originalSeqVertexToMergedVerex; 274 @Override 275 // Extendable method intended to allow for adding extra material to the graph 276 public List<String> getExtraGraphFileLines() { 277 List<String> output = new ArrayList<>(); 278 for( Map.Entry<MultiDeBruijnVertex, ThreadingTree> entry : readThreadingJunctionTrees.entrySet()) { 279 // Resolving the chain of altered vertexes 280 SeqVertex mergedSeqVertex = originalSeqVertexToMergedVerex.get(vertexToOrigionalSeqVertex.get(entry.getKey())); 281 282 // adding the root node to the graph 283 output.add(String.format("\t%s -> %s ", mergedSeqVertex.toString(), entry.getValue().rootNode.getDotName()) + 284 String.format("[color=blue];")); 285 output.add(String.format("\t%s [shape=point];", entry.getValue().rootNode.getDotName())); 286 287 output.addAll(edgesForNodeRecursive(entry.getValue().rootNode)); 288 } 289 return output; 290 } 291 // Helper class to be used for zipping linear chains for easy print outputting 292 private Map<SeqVertex, SeqVertex> zipLinearChainsWithVertexMapping() { 293 Map<SeqVertex, SeqVertex> vertexMapping = new HashMap<>(); 294 // create the list of start sites [doesn't modify graph yet] 295 final Collection<SeqVertex> zipStarts = vertexSet().stream().filter(this::isLinearChainStart).collect(Collectors.toList()); 296 297 if ( zipStarts.isEmpty() ) // nothing to do, as nothing could start a chain 298 { 299 return vertexMapping; 300 } 301 302 // At this point, zipStarts contains all of the vertices in this graph that might start some linear 303 // chain of vertices. We walk through each start, building up the linear chain of vertices and then 304 // zipping them up with mergeLinearChain, if possible 305 for ( final SeqVertex zipStart : zipStarts ) { 306 final LinkedList<SeqVertex> linearChain = traceLinearChain(zipStart); 307 308 SeqVertex mergedOne = mergeLinearChainVertex(linearChain); 309 310 // Add all of the mapped vertexes to the map. 311 if (mergedOne != null) { 312 for (SeqVertex v : linearChain) { 313 vertexMapping.put(v, mergedOne); 314 } 315 } else { 316 // otherwise we indicate that nothing has changed 317 for (SeqVertex v : linearChain) { 318 vertexMapping.put(v, v); 319 } 320 } 321 } 322 return vertexMapping; 323 } 324 }; 325 326 PrintingSeqGraph printingSeqGraph = new PrintingSeqGraph(kmerSize); 327 328 final Map<MultiDeBruijnVertex, SeqVertex> vertexToOrigionalSeqVertex = new HashMap<>(); 329 330 // create all of the equivalent seq graph vertices 331 for ( final MultiDeBruijnVertex dv : vertexSet() ) { 332 final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv))); 333 sv.setAdditionalInfo(dv.getAdditionalInfo()); 334 vertexToOrigionalSeqVertex.put(dv, sv); 335 printingSeqGraph.addVertex(sv); 336 } 337 338 // walk through the nodes and connect them to their equivalent seq vertices 339 for( final MultiSampleEdge e : edgeSet() ) { 340 final SeqVertex seqInV = vertexToOrigionalSeqVertex.get(getEdgeSource(e)); 341 final SeqVertex seqOutV = vertexToOrigionalSeqVertex.get(getEdgeTarget(e)); 342 printingSeqGraph.addEdge(seqInV, seqOutV, new BaseEdge(e.isRef(), e.getMultiplicity())); 343 } 344 345 final Map<SeqVertex, SeqVertex> originalSeqVertexToMergedVerex = printingSeqGraph.zipLinearChainsWithVertexMapping(); 346 printingSeqGraph.originalSeqVertexToMergedVerex = originalSeqVertexToMergedVerex; 347 printingSeqGraph.vertexToOrigionalSeqVertex = vertexToOrigionalSeqVertex; 348 349 350 printingSeqGraph.printGraph(graphWriter, writeHeader, pruneFactor); 351 } 352 353 /** 354 * Walk along the reference path in the graph and pull out the corresponding bases 355 * 356 * NOTE: this attempts to generate the longest sequence of refernce bases in the event that fromVertex or toVertex are non-unique 357 * 358 * @param fromVertex starting vertex 359 * @param toVertex ending vertex 360 * @param includeStart should the starting vertex be included in the path 361 * @param includeStop should the ending vertex be included in the path 362 * @return byte[] array holding the reference bases, this can be null if there are no nodes between the starting and ending vertex (insertions for example) 363 */ 364 @Override getReferenceBytes( final MultiDeBruijnVertex fromVertex, final MultiDeBruijnVertex toVertex, final boolean includeStart, final boolean includeStop )365 public byte[] getReferenceBytes( final MultiDeBruijnVertex fromVertex, final MultiDeBruijnVertex toVertex, final boolean includeStart, final boolean includeStop ) { 366 Utils.nonNull(fromVertex, "Starting vertex in requested path cannot be null."); 367 Utils.nonNull(toVertex, "From vertex in requested path cannot be null."); 368 369 byte[] bytes = null; 370 int fromIndex = referencePath.indexOf(fromVertex); 371 int toIndex = referencePath.lastIndexOf(toVertex); 372 373 if( includeStart ) { 374 bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(fromVertex, true)); 375 } 376 for (int i = fromIndex + 1; i < toIndex; i++) { 377 bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(referencePath.get(i))); 378 } 379 380 if( includeStop ) { 381 bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(toVertex)); 382 } 383 return bytes; 384 } 385 386 @Override 387 // since we don't have to validate unique vertex merging we just find the vertex and pass getNextKmerVertexForChainExtension(final Kmer kmer, final boolean isRef, final MultiDeBruijnVertex prevVertex)388 protected MultiDeBruijnVertex getNextKmerVertexForChainExtension(final Kmer kmer, final boolean isRef, final MultiDeBruijnVertex prevVertex) { 389 return kmerToVertexMap.get(kmer); 390 } 391 392 /** 393 * Generate the junction trees and prune them (optionally printing the graph stages as output) 394 * 395 * @param debugGraphOutputPath path for graph output files 396 * @param refHaplotype ref haplotype location 397 */ postProcessForHaplotypeFinding(final File debugGraphOutputPath, final Locatable refHaplotype)398 public void postProcessForHaplotypeFinding(final File debugGraphOutputPath, final Locatable refHaplotype) { 399 annotateEdgesWithReferenceIndices(); 400 generateJunctionTrees(); 401 if (debugGraphTransformations) { 402 printGraph(new File(debugGraphOutputPath, refHaplotype + "-sequenceGraph." + kmerSize + ".0.4.JT_unpruned.dot"), 10000); 403 } 404 pruneJunctionTrees(JunctionTreeKBestHaplotypeFinder.DEFAULT_MINIMUM_WEIGHT_FOR_JT_BRANCH_TO_NOT_BE_PRUNED); 405 if (debugGraphTransformations) { 406 printGraph(new File(debugGraphOutputPath, refHaplotype + "-sequenceGraph." + kmerSize + ".0.5.JT_pruned.dot"), 10000); 407 } 408 } 409 annotateEdgesWithReferenceIndices()410 private void annotateEdgesWithReferenceIndices() { 411 final List<MultiDeBruijnVertex> referencePath = getReferencePath(TraversalDirection.downwards); 412 MultiDeBruijnVertex lastVert = null; 413 int refIndex = 0; 414 for (MultiDeBruijnVertex nextVert : referencePath) { 415 if (lastVert != null) { 416 getEdge(lastVert, nextVert).addReferenceIndex(refIndex++); 417 } 418 lastVert = nextVert; 419 } 420 } 421 422 423 // Generate threading trees generateJunctionTrees()424 public void generateJunctionTrees() { 425 Utils.validate(alreadyBuilt, "Assembly graph has not been constructed, please call BuildGraphIfNecessary() before trying to thread reads to the graph"); 426 // Adding handle vertex to support symbolic end alleles 427 addVertex(SYMBOLIC_END_VETEX); 428 SYMBOLIC_END_EDGE = addEdge(getReferenceSinkVertex(), SYMBOLIC_END_VETEX); 429 430 readThreadingJunctionTrees = new HashMap<>(); 431 pending.values().stream().flatMap(Collection::stream).forEach(this::threadSequenceForJuncitonTree); 432 } 433 434 /** 435 * Traverse all of the junction trees in the graph and remove branches supported by < minimumEdgeWeight edges recursively. 436 * This will also handle pruning of trees that are uninformative (i.e. empty roots). 437 * 438 * @param minimumEdgeWeight minimum edge weight below which branches are removed 439 */ pruneJunctionTrees(final int minimumEdgeWeight)440 public void pruneJunctionTrees(final int minimumEdgeWeight) { 441 readThreadingJunctionTrees.forEach((key, value) -> value.getRootNode().pruneNode(minimumEdgeWeight)); 442 readThreadingJunctionTrees = Maps.filterValues( readThreadingJunctionTrees, ThreadingTree::isEmptyTree); 443 } 444 445 /** 446 * Takes a contiguous sequence of kmers and threads it through the graph, generating and subsequently adding to 447 * juncition trees at each relevant node. 448 * 449 * @param seqForKmers 450 */ threadSequenceForJuncitonTree(final SequenceForKmers seqForKmers)451 private void threadSequenceForJuncitonTree(final SequenceForKmers seqForKmers) { 452 // Maybe handle this differently, the reference junction tree should be held seperatedly from everything else. 453 if (seqForKmers.isRef) { 454 return; 455 } 456 457 // List we will use to keep track of sequences 458 JunctionTreeThreadingHelper nodeHelper = new JunctionTreeThreadingHelper(); 459 460 // Find the first kmer in the read that exists on the graph 461 final int startPos = findStartForJunctionThreading(seqForKmers); 462 if ( startPos == -1 ) { 463 return; 464 } 465 466 final MultiDeBruijnVertex startingVertex = kmerToVertexMap.get(new Kmer(seqForKmers.sequence, startPos, kmerSize)); 467 468 // loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate 469 MultiDeBruijnVertex lastVertex = startingVertex; 470 boolean hasToRediscoverKmer = false; 471 for ( int i = startPos + 1; i <= seqForKmers.stop - kmerSize; i++ ) { 472 MultiDeBruijnVertex vertex; 473 if (!hasToRediscoverKmer) { 474 vertex = extendJunctionThreadingByOne(lastVertex, seqForKmers.sequence, i, nodeHelper, true); 475 } else { 476 Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize); 477 vertex = kmerToVertexMap.get(kmer); 478 } 479 480 // If we missed the vertex, attempt to recover the path from the graph if there is no ambiguity 481 if (vertex == null && !hasToRediscoverKmer) { 482 final Set<MultiSampleEdge> outgoingEdges = outgoingEdgesOf(lastVertex); 483 MultiDeBruijnVertex tentativeVertex = null; 484 if (outgoingEdges.size()==1) { 485 tentativeVertex = getEdgeTarget(outgoingEdges.stream().findFirst().get()); 486 } 487 // Loop over the tentative path until the end of the read, we have walked over kmersize bases, or there is another missing path in the graph respectively 488 for (int j = i+1; j <= seqForKmers.stop - kmerSize && 489 j <= i + kmerSize && 490 tentativeVertex != null; j++) { 491 tentativeVertex = extendJunctionThreadingByOne(tentativeVertex, seqForKmers.sequence, j, null, false); 492 } 493 // If tentativeVertex is not null then we will be able to successfully traverse the read through the graph 494 if (tentativeVertex != null) { 495 vertex = getEdgeTarget(outgoingEdges.stream().findFirst().get()); 496 } 497 } 498 499 // If for whatever reason vertex = null, then we have fallen off the corrected graph so we don't update anything 500 if (vertex != null) { 501 lastVertex = vertex; 502 hasToRediscoverKmer = false; 503 } else { 504 nodeHelper.clear(); 505 hasToRediscoverKmer = true; 506 } 507 } 508 509 // As a final step, if the last vetex happens to be the ref-stop vertex then we want to append a symbolic node to the junciton trees 510 if (lastVertex == getReferenceSinkVertex()) { 511 nodeHelper.addEdgeToJunctionTreeNodes(SYMBOLIC_END_EDGE); 512 } 513 } 514 515 // Extendable method intended to allow for adding extra material to the graph getExtraGraphFileLines()516 public List<String> getExtraGraphFileLines() { 517 List<String> output = new ArrayList<>(); 518 for( Map.Entry<MultiDeBruijnVertex, ThreadingTree> entry : readThreadingJunctionTrees.entrySet()) { 519 // adding the root node to the graph 520 output.add(String.format("\t%s -> %s ", entry.getKey().toString(), entry.getValue().rootNode.getDotName()) + 521 String.format("[color=blue];")); 522 output.add(String.format("\t%s [shape=point];", entry.getValue().rootNode.getDotName())); 523 524 output.addAll(edgesForNodeRecursive(entry.getValue().rootNode)); 525 } 526 return output; 527 } 528 529 // Recursive search through a threading tree for nodes edgesForNodeRecursive(ThreadingNode node)530 private List<String> edgesForNodeRecursive(ThreadingNode node) { 531 List<String> output = new ArrayList<>(); 532 533 for ( Map.Entry<MultiSampleEdge, ThreadingNode> childrenNode : node.childrenNodes.entrySet() ) { 534 output.add(String.format("\t%s -> %s ", node.getDotName(), childrenNode.getValue().getDotName() + String.format("[color=blue,label=\"%d\"];",childrenNode.getValue().count))); 535 output.add(String.format("\t%s [label=\"%s\",shape=plaintext]", childrenNode.getValue().getDotName(), 536 new String(getEdgeTarget(childrenNode.getKey()).getAdditionalSequence(false)))); 537 output.addAll(edgesForNodeRecursive(childrenNode.getValue())); 538 } 539 return output; 540 } 541 getJunctionTreeForNode(MultiDeBruijnVertex vertex)542 public Optional<ThreadingTree> getJunctionTreeForNode(MultiDeBruijnVertex vertex) { 543 return Optional.ofNullable(readThreadingJunctionTrees.get(vertex)); 544 } 545 546 @VisibleForTesting 547 // Test method for returning all existing junction trees. getReadThreadingJunctionTrees(boolean pruned)548 public Map<MultiDeBruijnVertex, ThreadingTree> getReadThreadingJunctionTrees(boolean pruned) { 549 return pruned ? Maps.filterValues( Collections.unmodifiableMap(readThreadingJunctionTrees), n -> n.isEmptyTree()) 550 : Collections.unmodifiableMap(readThreadingJunctionTrees); 551 } 552 553 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 554 // Classes associated with junction trees 555 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 556 557 /** 558 * Class for storing a junction tree. A Juntion tree is a tree whose nodes correspond to edges in the graph where a 559 * traversal decision was made (i.e. the path forks and one edge was taken). Each node keeps track of a list of 560 * its children nodes, each of which stores a count of the evidence seen supporting the given edge. 561 */ 562 public class ThreadingTree { 563 private final ThreadingNode rootNode; 564 private final MultiDeBruijnVertex treeVertex; // this reference exists purely for toString() reasons 565 ThreadingTree(final MultiDeBruijnVertex treeVertex)566 private ThreadingTree(final MultiDeBruijnVertex treeVertex) { 567 this.rootNode = new ThreadingNode(null); 568 this.treeVertex = treeVertex; 569 } 570 571 /** 572 * Returns the root node for this edge (ensuring that it has had its count intcremented to keep track of the total evidence spanning this tree. 573 * 574 * @return 575 */ getAndIncrementRootNode()576 private ThreadingNode getAndIncrementRootNode() { 577 rootNode.incrementCount(); 578 return rootNode; 579 } 580 581 // Determines if this tree actually has any data associated with it isEmptyTree()582 private boolean isEmptyTree() { 583 return !rootNode.childrenNodes.isEmpty(); 584 } 585 586 // Returns the junction choices as a list of suffixes to follow (this is used for writing human readable tests) 587 @VisibleForTesting getPathsPresentAsBaseChoiceStrings()588 List<String> getPathsPresentAsBaseChoiceStrings() { 589 return rootNode.getEdgesThroughNode().stream() 590 .map(path -> path.stream() 591 .map(edge -> getEdgeTarget(edge).getSuffix()) 592 .map(b -> Character.toString((char)b.byteValue())) 593 .collect(Collectors.joining())) 594 .collect(Collectors.toList()); 595 596 } 597 toString()598 public String toString() { 599 return "ThreadingTree_at_"+treeVertex.getSequenceString()+"_with_evidence_"+rootNode.count; 600 } 601 602 // getter for the root node, TODO to possibly be replaced when the graph gets hidden from prying eyes. getRootNode()603 public ThreadingNode getRootNode() { 604 return rootNode; 605 } 606 } 607 608 // Linked node object for storing tree topography 609 public class ThreadingNode { 610 private Map<MultiSampleEdge, ThreadingNode> childrenNodes; 611 private final MultiSampleEdge prevEdge; // This may be null if this node corresponds to the root of the graph 612 private int count = 0; 613 ThreadingNode(MultiSampleEdge edge)614 private ThreadingNode(MultiSampleEdge edge) { 615 prevEdge = edge; 616 childrenNodes = new HashMap<>(2); 617 } 618 619 /** 620 * Adds an edge to the tree if this node has not been traversed before, or increments the corresponding node 621 * if the edge already exists. Then returns the corresponding next node 622 * 623 * @param edge edge to add to this current node 624 * @return the node corresponding to the one that was added to this graph 625 */ addEdge(MultiSampleEdge edge)626 private ThreadingNode addEdge(MultiSampleEdge edge) { 627 ThreadingNode nextNode = childrenNodes.computeIfAbsent(edge, k -> new ThreadingNode(edge)); 628 nextNode.incrementCount(); 629 return nextNode; 630 } 631 incrementCount()632 void incrementCount() { 633 count++; 634 } 635 incrementNode(List<MultiSampleEdge> edges, int index)636 private void incrementNode(List<MultiSampleEdge> edges, int index) { 637 incrementCount(); 638 if (index < edges.size()) { 639 getNode(edges.get(index)).incrementNode(edges, index + 1); 640 } 641 } 642 getNode(MultiSampleEdge edge)643 private ThreadingNode getNode(MultiSampleEdge edge) { 644 ThreadingNode nextNode = childrenNodes.get(edge); 645 if (nextNode == null) { 646 nextNode = new ThreadingNode(edge); 647 childrenNodes.put(edge, nextNode); 648 } 649 return nextNode; 650 } 651 652 /** 653 * Helper method that returns a list of all the possible branching paths through the tree originiating from this node. 654 * 655 * NOTE: This method is intended for debugging and testing and thus may not be efficient for returning the list 656 * 657 * @return A list of potential paths originating from this node as observed 658 */ 659 @VisibleForTesting getEdgesThroughNode()660 private List<List<MultiSampleEdge>> getEdgesThroughNode() { 661 List<List<MultiSampleEdge>> paths = new ArrayList<>(); 662 if (childrenNodes.isEmpty()) { 663 paths.add(prevEdge == null ? new ArrayList<>() : Lists.newArrayList(prevEdge)); 664 } 665 666 for (ThreadingNode childNode : childrenNodes.values()) { 667 for ( List<MultiSampleEdge> subPath : childNode.getEdgesThroughNode()) { 668 List<MultiSampleEdge> pathRoot = (prevEdge == null ? new ArrayList<>() : Lists.newArrayList(prevEdge)); 669 pathRoot.addAll(subPath); 670 paths.add(pathRoot); 671 } 672 } 673 674 return paths; 675 } 676 677 // Recursively prunes nodes based on the provided threshold, removing branches without sufficient support. pruneNode(final int threshold)678 private void pruneNode(final int threshold) { 679 childrenNodes = Maps.filterValues( childrenNodes, node -> node.getEvidenceCount() >= threshold); 680 childrenNodes.forEach((edge, node) -> node.pruneNode(threshold)); 681 } 682 683 // Returns a unique name based on the memory id that conforms to the restrictions placed on .dot file nodes getDotName()684 public String getDotName() { 685 return "TreadingNode_" + Integer.toHexString(hashCode()); 686 } 687 688 // Getter for external tools to access a node getChildrenNodes()689 public Map<MultiSampleEdge, ThreadingNode> getChildrenNodes() { 690 return Collections.unmodifiableMap(childrenNodes); 691 } 692 693 // Returns true if there are no paths emanating from this node hasNoEvidence()694 public boolean hasNoEvidence() { 695 return childrenNodes.isEmpty(); 696 } 697 698 // Return the count of total evidence supporting this node in the tree getEvidenceCount()699 public int getEvidenceCount() { 700 return count; 701 } 702 703 // Checks if this node is the symbolic end by determining if its previous edge ends on the symbolic edge isSymbolicEnd()704 public boolean isSymbolicEnd() { 705 return prevEdge == SYMBOLIC_END_EDGE; 706 } 707 } 708 709 /** 710 * A convenient helper class designed to pull much of the direct interaction with junction trees in the threading code 711 * into a single place. 712 */ 713 private class JunctionTreeThreadingHelper { 714 final List<ThreadingNode> trackedNodes = new ArrayList<>(3); 715 716 // Helper method used to determine whether a vertex meets the criteria to hold a junction tree 717 // The current criteria, if any outgoing edge from a particular vertex leads to a vertex with inDegree > 1, then it warrants a tree. Or if it is the reference start vertex. 718 // NOTE: this check is necessary to handle the edge cases that may arise when a vertex has multiple exit paths but happens to lead to a vetex that needs a junction tree vertexWarrantsJunctionTree(final MultiDeBruijnVertex vertex)719 private boolean vertexWarrantsJunctionTree(final MultiDeBruijnVertex vertex) { 720 return outgoingEdgesOf(vertex).stream().anyMatch(edge -> inDegreeOf(getEdgeTarget(edge)) > 1); 721 } 722 723 // Helper method that adds a single edge to all of the nodes in nodesToExtend. addEdgeToJunctionTreeNodes(final MultiSampleEdge outgoingEdge)724 private void addEdgeToJunctionTreeNodes(final MultiSampleEdge outgoingEdge) { 725 List<ThreadingNode> newNodes = trackedNodes.stream().map(n -> n.addEdge(outgoingEdge)).collect(Collectors.toList()); 726 trackedNodes.clear(); 727 trackedNodes.addAll(newNodes); 728 } 729 730 // removes all of the nodes currently on the tree for safety if there are gaps in a reads traversal clear()731 private void clear() { 732 trackedNodes.clear(); 733 } 734 735 // Adds a junction tree prevVertex if the vertex warrants a junciton tree addTreeIfNecessary(MultiDeBruijnVertex prevVertex)736 private void addTreeIfNecessary(MultiDeBruijnVertex prevVertex) { 737 if (vertexWarrantsJunctionTree(prevVertex)) { 738 trackedNodes.add(readThreadingJunctionTrees.computeIfAbsent(prevVertex, k -> new ThreadingTree(prevVertex)).getAndIncrementRootNode()); 739 } 740 } 741 } 742 } 743