1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import com.google.common.collect.Lists;
5 import com.google.common.collect.Maps;
6 import htsjdk.samtools.util.Locatable;
7 import org.apache.commons.lang3.ArrayUtils;
8 import org.broadinstitute.hellbender.exceptions.UserException;
9 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.Kmer;
10 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.*;
11 import org.broadinstitute.hellbender.utils.Utils;
12 
13 import java.io.File;
14 import java.io.FileNotFoundException;
15 import java.io.FileOutputStream;
16 import java.io.PrintStream;
17 import java.util.*;
18 import java.util.stream.Collectors;
19 
20 /**
21  * Experimental version of the ReadThreadingGraph with support for threading reads to generate JunctionTrees for resolving
22  * connectivity information at longer ranges.
23  *
24  * Note that many of the non-DeBruijn graph alterations that are made to ReadThreadingGraph are not made here:
25  * - Non-Unique kmers are not duplicated by this graph
26  * - Kmers are not Zipped together to form a SeqGraph
27  * - The reference path is stored in its entirety rather than being calculated on the fly
28  *
29  * For ease of debugging, this graph supports the method {@link #printSimplifiedGraph(File, int)}} which generates a SequenceGraph and
30  * adds the junction trees to the output .dot file.
31  */
32 public class JunctionTreeLinkedDeBruijnGraph extends AbstractReadThreadingGraph {
33     private static final long serialVersionUID = 1l;
34     private static final MultiDeBruijnVertex SYMBOLIC_END_VETEX = new MultiDeBruijnVertex(new byte[]{'_'});
35     private MultiSampleEdge SYMBOLIC_END_EDGE;
36 
37     private Map<MultiDeBruijnVertex, ThreadingTree> readThreadingJunctionTrees = new HashMap<>();
38 
39     // TODO should this be constructed here or elsewhere
40     private final Set<Kmer> kmers = new HashSet<>();
41 
42     @VisibleForTesting
JunctionTreeLinkedDeBruijnGraph(int kmerSize)43     public JunctionTreeLinkedDeBruijnGraph(int kmerSize) {
44         this(kmerSize, false, (byte)6, 1, -1);
45     }
46 
47     /**
48      * Create a new ReadThreadingAssembler using kmerSize for matching
49      * @param kmerSize must be >= 1
50      */
JunctionTreeLinkedDeBruijnGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases)51     JunctionTreeLinkedDeBruijnGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases) {
52         super(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples, numDanglingMatchingPrefixBases);
53     }
54 
55     /**
56      * Find vertex and its position in seqForKmers where we should start assembling seqForKmers
57      *
58      * @param seqForKmers the sequence we want to thread into the graph
59      * @return the position of the starting vertex in seqForKmer, or -1 if it cannot find one
60      */
61 
findStartForJunctionThreading(final SequenceForKmers seqForKmers)62     protected int findStartForJunctionThreading(final SequenceForKmers seqForKmers) {
63         for ( int i = seqForKmers.start; i < seqForKmers.stop - kmerSize; i++ ) {
64             final Kmer kmer1 = new Kmer(seqForKmers.sequence, i, kmerSize);
65             if ( kmerToVertexMap.containsKey(kmer1) ) {
66                 return i;
67             }
68         }
69 
70         return -1;
71     }
72 
73     @Override
74     // We don't need to track non-uniques here so this is a no-op
preprocessReads()75     protected void preprocessReads() {
76         return;
77     }
78 
79     @Override
80     // We don't want to remove pending sequences as they are the data we need for read threading
shouldRemoveReadsAfterGraphConstruction()81     protected boolean shouldRemoveReadsAfterGraphConstruction() {
82         return false;
83     }
84 
85     @Override
86     //TODO come up with some heuristic for when we think one of these graphs is "too difficult" to call properly
isLowQualityGraph()87     public boolean isLowQualityGraph() {
88         return false;
89     }
90 
91     /**
92      * Checks whether a kmer can be the threading start based on the current threading start location policy.
93      *
94      * @see #setThreadingStartOnlyAtExistingVertex(boolean)
95      *
96      * @param kmer the query kmer.
97      * @return {@code true} if we can start thread the sequence at this kmer, {@code false} otherwise.
98      */
isThreadingStart(final Kmer kmer, final boolean startThreadingOnlyAtExistingVertex)99     protected boolean isThreadingStart(final Kmer kmer, final boolean startThreadingOnlyAtExistingVertex) {
100         Utils.nonNull(kmer);
101         return !startThreadingOnlyAtExistingVertex || kmers.contains(kmer);
102     }
103 
104     /**
105      * Finds the path in the graph from this vertex to the reference sink, including this vertex
106      *
107      * @param start   the reference vertex to start from
108      * @param direction describes which direction to move in the graph (i.e. down to the reference sink or up to the source)
109      * @param blacklistedEdge edge to ignore in the traversal down; useful to exclude the non-reference dangling paths
110      * @return the path (non-null, non-empty)
111      */
112     @Override
getReferencePath(final MultiDeBruijnVertex start, final TraversalDirection direction, final Optional<MultiSampleEdge> blacklistedEdge)113     protected List<MultiDeBruijnVertex> getReferencePath(final MultiDeBruijnVertex start,
114                                                          final TraversalDirection direction,
115                                                          final Optional<MultiSampleEdge> blacklistedEdge) {
116         if ( direction == TraversalDirection.downwards ) {
117             return getReferencePathForwardFromKmer(start, blacklistedEdge);
118         } else {
119             return getReferencePathBackwardsForKmer(start);
120         }
121     }
122 
123     // Since there are no non-unique kmers to worry about we just add it to our map
124     @Override
trackKmer(Kmer kmer, MultiDeBruijnVertex newVertex)125     protected void trackKmer(Kmer kmer, MultiDeBruijnVertex newVertex) {
126         kmerToVertexMap.putIfAbsent(kmer, newVertex);
127     }
128 
129     @VisibleForTesting
getReferencePath(final TraversalDirection direction)130     List<MultiDeBruijnVertex> getReferencePath(final TraversalDirection direction) {
131         return Collections.unmodifiableList(direction == TraversalDirection.downwards ? referencePath : Lists.reverse(referencePath));
132     }
133 
134     /**
135      * Extends the current vertex chain by one, making sure to build junction tree objects at each node with out-degree > 1.
136      *
137      * If a node has an in-degree > 1, then it has a junction tree added to the previous node.
138      *
139      * @param prevVertex Current vertex to extend off of
140      * @param sequence Sequence of kmers to extend
141      * @param kmerStart index of where the current kmer starts
142      * @param alterTrees flag to control if we actually put anything into junction trees as a result of this path
143      * @return The next vertex in the sequence. Null if the corresponding edge doesn't exist.
144      */
extendJunctionThreadingByOne(final MultiDeBruijnVertex prevVertex, final byte[] sequence, final int kmerStart, final JunctionTreeThreadingHelper nodesHelper, final boolean alterTrees)145     protected MultiDeBruijnVertex extendJunctionThreadingByOne(final MultiDeBruijnVertex prevVertex, final byte[] sequence, final int kmerStart, final JunctionTreeThreadingHelper nodesHelper, final boolean alterTrees) {
146         final Set<MultiSampleEdge> outgoingEdges = outgoingEdgesOf(prevVertex);
147 
148         final int nextPos = kmerStart + kmerSize - 1;
149         for (final MultiSampleEdge outgoingEdge : outgoingEdges) {
150             final MultiDeBruijnVertex target = getEdgeTarget(outgoingEdge);
151             if (target.getSuffix() == sequence[nextPos]) {
152                 if (alterTrees) {
153                     // Only want to create a new tree if we walk into a node that meets our junction tree criteria
154                     // NOTE: we do this deciding on our path
155                     nodesHelper.addTreeIfNecessary(prevVertex);
156 
157                     // If this node has an out-degree > 1, add the edge we took to existing trees
158                     if (outgoingEdges.size() != 1) {
159                         nodesHelper.addEdgeToJunctionTreeNodes(outgoingEdge);
160                     }
161                 }
162                 return target;
163             }
164         }
165         return null;
166     }
167 
168     /**
169      * NOTE: we override the old behavior here to allow for looping/nonunique references to be handled properly
170      *
171      * @return the reference source vertex pulled from the graph, can be null if it doesn't exist in the graph
172      */
173     @Override
getReferenceSourceVertex( )174     public final MultiDeBruijnVertex getReferenceSourceVertex( ) {
175         return referencePath != null && !referencePath.isEmpty()? referencePath.get(0) : null;
176     }
177 
178     /**
179      * NOTE: we override the old behavior here to allow for looping/nonunique references to be handled properly
180      *
181      * @return the reference sink vertex pulled from the graph, can be null if it doesn't exist in the graph
182      */
183     @Override
getReferenceSinkVertex( )184     public final MultiDeBruijnVertex getReferenceSinkVertex( ) {
185         return referencePath != null && !referencePath.isEmpty()? referencePath.get(referencePath.size() - 1) : null;
186     }
187 
188     /**
189      * This is a heruistic for deciding what the proper reference path is to align the dangling ends to. The last possible
190      * reference path emanating from the kmer (in the case of the root kmer for a dangling end being a repeated kmer
191      * from the reference) is chose (or the first in the case for dangling heads). This is based on the assumption that
192      * dangling tails worthy of recovering are often a result of the assembly window and thus we choose the last possible
193      * kmer as the option.
194      *
195      * //TODO both of these are a hack to emulate the current behavior when targetkmer isn't actually a reference kmer. The old behaior
196      * //TODO was to return a singleton list of targetKmer. The real solution is to walk back target kmer to the actual ref base
197      *
198      * @param targetKmer vertex corresponding to the root
199      * @return
200      */
getReferencePathForwardFromKmer(final MultiDeBruijnVertex targetKmer, final Optional<MultiSampleEdge> blacklistedEdge)201     private List<MultiDeBruijnVertex> getReferencePathForwardFromKmer(final MultiDeBruijnVertex targetKmer,
202                                                                       final Optional<MultiSampleEdge> blacklistedEdge) {
203         List<MultiDeBruijnVertex> extraSequence = new ArrayList<>(2);
204         MultiDeBruijnVertex vert = targetKmer;
205         int finalIndex = referencePath.lastIndexOf(vert);
206 
207         while (finalIndex == -1 &&  vert != null) {
208             // If the current verex is not a reference vertex but exists, add it to our list of extra bases to append to the reference
209             extraSequence.add(vert);
210 
211             final Set<MultiSampleEdge> outgoingEdges = outgoingEdgesOf(vert);
212 
213             // singleton or empty set
214             final Set<MultiSampleEdge> blacklistedEdgeSet = blacklistedEdge.isPresent() ? Collections.singleton(blacklistedEdge.get()) : Collections.emptySet();
215 
216             // walk forward while the path is unambiguous
217             final List<MultiSampleEdge> edges = outgoingEdges.stream().filter(e -> !blacklistedEdgeSet.contains(e)).limit(2).collect(Collectors.toList());
218 
219             vert = edges.size() == 1 ? getEdgeTarget(edges.get(0)) : null;
220 
221             // Defense against loops, kill the attempt if we are stuck in a loop
222             if (extraSequence.contains(vert)) {
223                 System.err.println("Dangling End recovery killed because of a loop (getReferencePathForwardFromKmer)");
224                 vert = null;
225             }
226 
227             finalIndex = vert == null ? -1 : referencePath.lastIndexOf(vert);
228         }
229 
230         // if we found extra sequence append it to the front of the
231         extraSequence.addAll(finalIndex != -1 ? referencePath.subList(finalIndex, referencePath.size()) : Collections.emptyList());
232         return extraSequence;
233     }
234 
235     // TODO this behavior is frankly silly and needs to be fixed, there is no way upwards paths should be dangingling head recovered differently
getReferencePathBackwardsForKmer(final MultiDeBruijnVertex targetKmer)236     private List<MultiDeBruijnVertex> getReferencePathBackwardsForKmer(final MultiDeBruijnVertex targetKmer) {
237         int firstIndex = referencePath.indexOf(targetKmer);
238         if (firstIndex == -1) return Collections.singletonList(targetKmer);
239         return Lists.reverse(referencePath.subList(0, firstIndex + 1));
240     }
241 
242     // Generate a SeqGraph that is special
243     @Override
toSequenceGraph()244     public SeqGraph toSequenceGraph() {
245         throw new UnsupportedOperationException("Cannot construct a sequence graph using JunctionTreeLinkedDeBruijnGraph");
246     }
247 
248     /**
249      * Print out the graph in the dot language for visualization for the graph after merging the graph into a seq graph.
250      * Junction trees will naively add junction trees to the corresponding zipped SeqGraph node for its root vertex.
251      *
252      * NOTE: this is intended for debugging complex assembly graphs while preserving junction tree data.
253      * @param destination File to write to
254      */
255     @VisibleForTesting
printSimplifiedGraph(final File destination, final int pruneFactor)256     public final void printSimplifiedGraph(final File destination, final int pruneFactor) {
257         try (PrintStream stream = new PrintStream(new FileOutputStream(destination))) {
258             printSimplifiedGraph(stream, true, pruneFactor);
259         } catch ( final FileNotFoundException e ) {
260             throw new UserException.CouldNotReadInputFile(destination.getAbsolutePath(), e);
261         }
262     }
printSimplifiedGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor)263     private final void printSimplifiedGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor) {
264 
265         /// LOCAL CLASS TO MANAGE PRINTING THE JUNCTION TREES ONTO THE SEQ GRAPH
266         class PrintingSeqGraph extends SeqGraph {
267             private static final long serialVersionUID = 1l;
268 
269             private PrintingSeqGraph(int kmerSize) {
270                 super(kmerSize);
271             }
272             private Map<MultiDeBruijnVertex, SeqVertex> vertexToOrigionalSeqVertex;
273             private Map<SeqVertex, SeqVertex> originalSeqVertexToMergedVerex;
274             @Override
275             // Extendable method intended to allow for adding extra material to the graph
276             public List<String> getExtraGraphFileLines() {
277                 List<String> output = new ArrayList<>();
278                 for( Map.Entry<MultiDeBruijnVertex, ThreadingTree> entry : readThreadingJunctionTrees.entrySet()) {
279                     // Resolving the chain of altered vertexes
280                     SeqVertex mergedSeqVertex = originalSeqVertexToMergedVerex.get(vertexToOrigionalSeqVertex.get(entry.getKey()));
281 
282                     // adding the root node to the graph
283                     output.add(String.format("\t%s -> %s ", mergedSeqVertex.toString(), entry.getValue().rootNode.getDotName()) +
284                             String.format("[color=blue];"));
285                     output.add(String.format("\t%s [shape=point];", entry.getValue().rootNode.getDotName()));
286 
287                     output.addAll(edgesForNodeRecursive(entry.getValue().rootNode));
288                 }
289                 return output;
290             }
291             // Helper class to be used for zipping linear chains for easy print outputting
292             private Map<SeqVertex, SeqVertex> zipLinearChainsWithVertexMapping() {
293                 Map<SeqVertex, SeqVertex> vertexMapping = new HashMap<>();
294                 // create the list of start sites [doesn't modify graph yet]
295                 final Collection<SeqVertex> zipStarts = vertexSet().stream().filter(this::isLinearChainStart).collect(Collectors.toList());
296 
297                 if ( zipStarts.isEmpty() ) // nothing to do, as nothing could start a chain
298                 {
299                     return vertexMapping;
300                 }
301 
302                 // At this point, zipStarts contains all of the vertices in this graph that might start some linear
303                 // chain of vertices.  We walk through each start, building up the linear chain of vertices and then
304                 // zipping them up with mergeLinearChain, if possible
305                 for ( final SeqVertex zipStart : zipStarts ) {
306                     final LinkedList<SeqVertex> linearChain = traceLinearChain(zipStart);
307 
308                     SeqVertex mergedOne = mergeLinearChainVertex(linearChain);
309 
310                     // Add all of the mapped vertexes to the map.
311                     if (mergedOne != null) {
312                         for (SeqVertex v : linearChain) {
313                             vertexMapping.put(v, mergedOne);
314                         }
315                     } else {
316                         // otherwise we indicate that nothing has changed
317                         for (SeqVertex v : linearChain) {
318                             vertexMapping.put(v, v);
319                         }
320                     }
321                 }
322                 return vertexMapping;
323             }
324         };
325 
326         PrintingSeqGraph printingSeqGraph = new PrintingSeqGraph(kmerSize);
327 
328         final Map<MultiDeBruijnVertex, SeqVertex> vertexToOrigionalSeqVertex = new HashMap<>();
329 
330         // create all of the equivalent seq graph vertices
331         for ( final MultiDeBruijnVertex dv : vertexSet() ) {
332             final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv)));
333             sv.setAdditionalInfo(dv.getAdditionalInfo());
334             vertexToOrigionalSeqVertex.put(dv, sv);
335             printingSeqGraph.addVertex(sv);
336         }
337 
338         // walk through the nodes and connect them to their equivalent seq vertices
339         for( final MultiSampleEdge e : edgeSet() ) {
340             final SeqVertex seqInV = vertexToOrigionalSeqVertex.get(getEdgeSource(e));
341             final SeqVertex seqOutV = vertexToOrigionalSeqVertex.get(getEdgeTarget(e));
342             printingSeqGraph.addEdge(seqInV, seqOutV, new BaseEdge(e.isRef(), e.getMultiplicity()));
343         }
344 
345         final Map<SeqVertex, SeqVertex> originalSeqVertexToMergedVerex = printingSeqGraph.zipLinearChainsWithVertexMapping();
346         printingSeqGraph.originalSeqVertexToMergedVerex = originalSeqVertexToMergedVerex;
347         printingSeqGraph.vertexToOrigionalSeqVertex = vertexToOrigionalSeqVertex;
348 
349 
350         printingSeqGraph.printGraph(graphWriter, writeHeader, pruneFactor);
351     }
352 
353     /**
354      * Walk along the reference path in the graph and pull out the corresponding bases
355      *
356      * NOTE: this attempts to generate the longest sequence of refernce bases in the event that fromVertex or toVertex are non-unique
357      *
358      * @param fromVertex    starting vertex
359      * @param toVertex      ending vertex
360      * @param includeStart  should the starting vertex be included in the path
361      * @param includeStop   should the ending vertex be included in the path
362      * @return              byte[] array holding the reference bases, this can be null if there are no nodes between the starting and ending vertex (insertions for example)
363      */
364     @Override
getReferenceBytes( final MultiDeBruijnVertex fromVertex, final MultiDeBruijnVertex toVertex, final boolean includeStart, final boolean includeStop )365     public byte[] getReferenceBytes( final MultiDeBruijnVertex fromVertex, final MultiDeBruijnVertex toVertex, final boolean includeStart, final boolean includeStop ) {
366         Utils.nonNull(fromVertex, "Starting vertex in requested path cannot be null.");
367         Utils.nonNull(toVertex, "From vertex in requested path cannot be null.");
368 
369         byte[] bytes = null;
370         int fromIndex = referencePath.indexOf(fromVertex);
371         int toIndex = referencePath.lastIndexOf(toVertex);
372 
373         if( includeStart ) {
374             bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(fromVertex, true));
375         }
376         for (int i = fromIndex + 1; i < toIndex; i++) {
377             bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(referencePath.get(i)));
378         }
379 
380         if( includeStop ) {
381             bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(toVertex));
382         }
383         return bytes;
384     }
385 
386     @Override
387     // since we don't have to validate unique vertex merging we just find the vertex and pass
getNextKmerVertexForChainExtension(final Kmer kmer, final boolean isRef, final MultiDeBruijnVertex prevVertex)388     protected MultiDeBruijnVertex getNextKmerVertexForChainExtension(final Kmer kmer, final boolean isRef, final MultiDeBruijnVertex prevVertex) {
389         return kmerToVertexMap.get(kmer);
390     }
391 
392     /**
393      * Generate the junction trees and prune them (optionally printing the graph stages as output)
394      *
395      * @param debugGraphOutputPath path for graph output files
396      * @param refHaplotype ref haplotype location
397      */
postProcessForHaplotypeFinding(final File debugGraphOutputPath, final Locatable refHaplotype)398     public void postProcessForHaplotypeFinding(final File debugGraphOutputPath, final Locatable refHaplotype) {
399         annotateEdgesWithReferenceIndices();
400         generateJunctionTrees();
401         if (debugGraphTransformations) {
402             printGraph(new File(debugGraphOutputPath, refHaplotype + "-sequenceGraph." + kmerSize + ".0.4.JT_unpruned.dot"), 10000);
403         }
404         pruneJunctionTrees(JunctionTreeKBestHaplotypeFinder.DEFAULT_MINIMUM_WEIGHT_FOR_JT_BRANCH_TO_NOT_BE_PRUNED);
405         if (debugGraphTransformations) {
406             printGraph(new File(debugGraphOutputPath, refHaplotype + "-sequenceGraph." + kmerSize + ".0.5.JT_pruned.dot"), 10000);
407         }
408     }
409 
annotateEdgesWithReferenceIndices()410     private void annotateEdgesWithReferenceIndices() {
411         final List<MultiDeBruijnVertex> referencePath = getReferencePath(TraversalDirection.downwards);
412         MultiDeBruijnVertex lastVert = null;
413         int refIndex = 0;
414         for (MultiDeBruijnVertex nextVert : referencePath) {
415             if (lastVert != null) {
416                 getEdge(lastVert, nextVert).addReferenceIndex(refIndex++);
417             }
418             lastVert = nextVert;
419         }
420     }
421 
422 
423     // Generate threading trees
generateJunctionTrees()424     public void generateJunctionTrees() {
425         Utils.validate(alreadyBuilt, "Assembly graph has not been constructed, please call BuildGraphIfNecessary() before trying to thread reads to the graph");
426         // Adding handle vertex to support symbolic end alleles
427         addVertex(SYMBOLIC_END_VETEX);
428         SYMBOLIC_END_EDGE = addEdge(getReferenceSinkVertex(), SYMBOLIC_END_VETEX);
429 
430         readThreadingJunctionTrees = new HashMap<>();
431         pending.values().stream().flatMap(Collection::stream).forEach(this::threadSequenceForJuncitonTree);
432     }
433 
434     /**
435      * Traverse all of the junction trees in the graph and remove branches supported by < minimumEdgeWeight edges recursively.
436      * This will also handle pruning of trees that are uninformative (i.e. empty roots).
437      *
438      * @param minimumEdgeWeight minimum edge weight below which branches are removed
439      */
pruneJunctionTrees(final int minimumEdgeWeight)440     public void pruneJunctionTrees(final int minimumEdgeWeight) {
441         readThreadingJunctionTrees.forEach((key, value) -> value.getRootNode().pruneNode(minimumEdgeWeight));
442         readThreadingJunctionTrees = Maps.filterValues( readThreadingJunctionTrees, ThreadingTree::isEmptyTree);
443     }
444 
445     /**
446      * Takes a contiguous sequence of kmers and threads it through the graph, generating and subsequently adding to
447      * juncition trees at each relevant node.
448      *
449      * @param seqForKmers
450      */
threadSequenceForJuncitonTree(final SequenceForKmers seqForKmers)451     private void threadSequenceForJuncitonTree(final SequenceForKmers seqForKmers) {
452         // Maybe handle this differently, the reference junction tree should be held seperatedly from everything else.
453         if (seqForKmers.isRef) {
454             return;
455         }
456 
457         // List we will use to keep track of sequences
458         JunctionTreeThreadingHelper nodeHelper = new JunctionTreeThreadingHelper();
459 
460         // Find the first kmer in the read that exists on the graph
461         final int startPos = findStartForJunctionThreading(seqForKmers);
462         if ( startPos == -1 ) {
463             return;
464         }
465 
466         final MultiDeBruijnVertex startingVertex = kmerToVertexMap.get(new Kmer(seqForKmers.sequence, startPos, kmerSize));
467 
468         // loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate
469         MultiDeBruijnVertex lastVertex = startingVertex;
470         boolean hasToRediscoverKmer = false;
471         for ( int i = startPos + 1; i <= seqForKmers.stop - kmerSize; i++ ) {
472             MultiDeBruijnVertex vertex;
473             if (!hasToRediscoverKmer) {
474                 vertex = extendJunctionThreadingByOne(lastVertex, seqForKmers.sequence, i, nodeHelper, true);
475             } else {
476                 Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize);
477                 vertex = kmerToVertexMap.get(kmer);
478             }
479 
480             // If we missed the vertex, attempt to recover the path from the graph if there is no ambiguity
481             if (vertex == null && !hasToRediscoverKmer) {
482                 final Set<MultiSampleEdge> outgoingEdges = outgoingEdgesOf(lastVertex);
483                 MultiDeBruijnVertex tentativeVertex = null;
484                 if (outgoingEdges.size()==1) {
485                     tentativeVertex = getEdgeTarget(outgoingEdges.stream().findFirst().get());
486                 }
487                 // Loop over the tentative path until the end of the read, we have walked over kmersize bases, or there is another missing path in the graph respectively
488                 for (int j = i+1; j <= seqForKmers.stop - kmerSize &&
489                                 j <= i + kmerSize &&
490                                 tentativeVertex != null; j++) {
491                     tentativeVertex = extendJunctionThreadingByOne(tentativeVertex, seqForKmers.sequence, j, null, false);
492                 }
493                 // If tentativeVertex is not null then we will be able to successfully traverse the read through the graph
494                 if (tentativeVertex != null) {
495                     vertex = getEdgeTarget(outgoingEdges.stream().findFirst().get());
496                 }
497             }
498 
499             // If for whatever reason vertex = null, then we have fallen off the corrected graph so we don't update anything
500             if (vertex != null) {
501                 lastVertex = vertex;
502                 hasToRediscoverKmer = false;
503             } else {
504                 nodeHelper.clear();
505                 hasToRediscoverKmer = true;
506             }
507         }
508 
509         // As a final step, if the last vetex happens to be the ref-stop vertex then we want to append a symbolic node to the junciton trees
510         if (lastVertex == getReferenceSinkVertex()) {
511             nodeHelper.addEdgeToJunctionTreeNodes(SYMBOLIC_END_EDGE);
512         }
513     }
514 
515     // Extendable method intended to allow for adding extra material to the graph
getExtraGraphFileLines()516     public List<String> getExtraGraphFileLines() {
517         List<String> output = new ArrayList<>();
518         for( Map.Entry<MultiDeBruijnVertex, ThreadingTree> entry : readThreadingJunctionTrees.entrySet()) {
519             // adding the root node to the graph
520             output.add(String.format("\t%s -> %s ", entry.getKey().toString(), entry.getValue().rootNode.getDotName()) +
521             String.format("[color=blue];"));
522             output.add(String.format("\t%s [shape=point];", entry.getValue().rootNode.getDotName()));
523 
524             output.addAll(edgesForNodeRecursive(entry.getValue().rootNode));
525         }
526         return output;
527     }
528 
529     // Recursive search through a threading tree for nodes
edgesForNodeRecursive(ThreadingNode node)530     private List<String> edgesForNodeRecursive(ThreadingNode node) {
531         List<String> output = new ArrayList<>();
532 
533         for ( Map.Entry<MultiSampleEdge, ThreadingNode> childrenNode : node.childrenNodes.entrySet() ) {
534             output.add(String.format("\t%s -> %s ", node.getDotName(), childrenNode.getValue().getDotName() + String.format("[color=blue,label=\"%d\"];",childrenNode.getValue().count)));
535             output.add(String.format("\t%s [label=\"%s\",shape=plaintext]", childrenNode.getValue().getDotName(),
536                     new String(getEdgeTarget(childrenNode.getKey()).getAdditionalSequence(false))));
537             output.addAll(edgesForNodeRecursive(childrenNode.getValue()));
538         }
539         return output;
540     }
541 
getJunctionTreeForNode(MultiDeBruijnVertex vertex)542     public Optional<ThreadingTree> getJunctionTreeForNode(MultiDeBruijnVertex vertex) {
543         return Optional.ofNullable(readThreadingJunctionTrees.get(vertex));
544     }
545 
546     @VisibleForTesting
547     // Test method for returning all existing junction trees.
getReadThreadingJunctionTrees(boolean pruned)548     public Map<MultiDeBruijnVertex, ThreadingTree> getReadThreadingJunctionTrees(boolean pruned) {
549         return pruned ? Maps.filterValues( Collections.unmodifiableMap(readThreadingJunctionTrees), n -> n.isEmptyTree())
550                 : Collections.unmodifiableMap(readThreadingJunctionTrees);
551     }
552 
553     //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
554     // Classes associated with junction trees
555     //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
556 
557     /**
558      * Class for storing a junction tree. A Juntion tree is a tree whose nodes correspond to edges in the graph where a
559      * traversal decision was made (i.e. the path forks and one edge was taken). Each node keeps track of a list of
560      * its children nodes, each of which stores a count of the evidence seen supporting the given edge.
561      */
562     public class ThreadingTree {
563         private final ThreadingNode rootNode;
564         private final MultiDeBruijnVertex treeVertex; // this reference exists purely for toString() reasons
565 
ThreadingTree(final MultiDeBruijnVertex treeVertex)566         private ThreadingTree(final MultiDeBruijnVertex treeVertex) {
567             this.rootNode = new ThreadingNode(null);
568             this.treeVertex = treeVertex;
569         }
570 
571         /**
572          * Returns the root node for this edge (ensuring that it has had its count intcremented to keep track of the total evidence spanning this tree.
573          *
574          * @return
575          */
getAndIncrementRootNode()576         private ThreadingNode getAndIncrementRootNode() {
577             rootNode.incrementCount();
578             return rootNode;
579         }
580 
581         // Determines if this tree actually has any data associated with it
isEmptyTree()582         private boolean isEmptyTree() {
583             return !rootNode.childrenNodes.isEmpty();
584         }
585 
586         // Returns the junction choices as a list of suffixes to follow (this is used for writing human readable tests)
587         @VisibleForTesting
getPathsPresentAsBaseChoiceStrings()588         List<String> getPathsPresentAsBaseChoiceStrings() {
589             return rootNode.getEdgesThroughNode().stream()
590                     .map(path -> path.stream()
591                                     .map(edge -> getEdgeTarget(edge).getSuffix())
592                                     .map(b -> Character.toString((char)b.byteValue()))
593                                     .collect(Collectors.joining()))
594                     .collect(Collectors.toList());
595 
596         }
597 
toString()598         public String toString() {
599             return "ThreadingTree_at_"+treeVertex.getSequenceString()+"_with_evidence_"+rootNode.count;
600         }
601 
602         // getter for the root node, TODO to possibly be replaced when the graph gets hidden from prying eyes.
getRootNode()603         public ThreadingNode getRootNode() {
604             return rootNode;
605         }
606     }
607 
608     // Linked node object for storing tree topography
609     public class ThreadingNode {
610         private Map<MultiSampleEdge, ThreadingNode> childrenNodes;
611         private final MultiSampleEdge prevEdge; // This may be null if this node corresponds to the root of the graph
612         private int count = 0;
613 
ThreadingNode(MultiSampleEdge edge)614         private ThreadingNode(MultiSampleEdge edge) {
615             prevEdge = edge;
616             childrenNodes = new HashMap<>(2);
617         }
618 
619         /**
620          * Adds an edge to the tree if this node has not been traversed before, or increments the corresponding node
621          * if the edge already exists. Then returns the corresponding next node
622          *
623          * @param edge edge to add to this current node
624          * @return the node corresponding to the one that was added to this graph
625          */
addEdge(MultiSampleEdge edge)626         private ThreadingNode addEdge(MultiSampleEdge edge) {
627             ThreadingNode nextNode = childrenNodes.computeIfAbsent(edge, k -> new ThreadingNode(edge));
628             nextNode.incrementCount();
629             return nextNode;
630         }
631 
incrementCount()632         void incrementCount() {
633             count++;
634         }
635 
incrementNode(List<MultiSampleEdge> edges, int index)636         private void incrementNode(List<MultiSampleEdge> edges, int index) {
637             incrementCount();
638             if (index < edges.size()) {
639                 getNode(edges.get(index)).incrementNode(edges, index + 1);
640             }
641         }
642 
getNode(MultiSampleEdge edge)643         private ThreadingNode getNode(MultiSampleEdge edge) {
644             ThreadingNode nextNode = childrenNodes.get(edge);
645             if (nextNode == null) {
646                 nextNode = new ThreadingNode(edge);
647                 childrenNodes.put(edge, nextNode);
648             }
649             return nextNode;
650         }
651 
652         /**
653          * Helper method that returns a list of all the possible branching paths through the tree originiating from this node.
654          *
655          * NOTE: This method is intended for debugging and testing and thus may not be efficient for returning the list
656          *
657          * @return A list of potential paths originating from this node as observed
658          */
659         @VisibleForTesting
getEdgesThroughNode()660         private List<List<MultiSampleEdge>> getEdgesThroughNode() {
661             List<List<MultiSampleEdge>> paths = new ArrayList<>();
662             if (childrenNodes.isEmpty()) {
663                 paths.add(prevEdge == null ? new ArrayList<>() : Lists.newArrayList(prevEdge));
664             }
665 
666             for (ThreadingNode childNode : childrenNodes.values()) {
667                 for ( List<MultiSampleEdge> subPath : childNode.getEdgesThroughNode()) {
668                     List<MultiSampleEdge> pathRoot = (prevEdge == null ? new ArrayList<>() : Lists.newArrayList(prevEdge));
669                     pathRoot.addAll(subPath);
670                     paths.add(pathRoot);
671                 }
672             }
673 
674             return paths;
675         }
676 
677         // Recursively prunes nodes based on the provided threshold, removing branches without sufficient support.
pruneNode(final int threshold)678         private void pruneNode(final int threshold) {
679             childrenNodes = Maps.filterValues( childrenNodes, node -> node.getEvidenceCount() >= threshold);
680             childrenNodes.forEach((edge, node) -> node.pruneNode(threshold));
681         }
682 
683         // Returns a unique name based on the memory id that conforms to the restrictions placed on .dot file nodes
getDotName()684         public String getDotName() {
685             return "TreadingNode_" + Integer.toHexString(hashCode());
686         }
687 
688         // Getter for external tools to access a node
getChildrenNodes()689         public Map<MultiSampleEdge, ThreadingNode> getChildrenNodes() {
690             return Collections.unmodifiableMap(childrenNodes);
691         }
692 
693         // Returns true if there are no paths emanating from this node
hasNoEvidence()694         public boolean hasNoEvidence() {
695             return childrenNodes.isEmpty();
696         }
697 
698         // Return the count of total evidence supporting this node in the tree
getEvidenceCount()699         public int getEvidenceCount() {
700             return count;
701         }
702 
703         // Checks if this node is the symbolic end by determining if its previous edge ends on the symbolic edge
isSymbolicEnd()704         public boolean isSymbolicEnd() {
705             return prevEdge == SYMBOLIC_END_EDGE;
706         }
707     }
708 
709     /**
710      * A convenient helper class designed to pull much of the direct interaction with junction trees in the threading code
711      * into a single place.
712      */
713     private class JunctionTreeThreadingHelper {
714         final List<ThreadingNode> trackedNodes = new ArrayList<>(3);
715 
716         // Helper method used to determine whether a vertex meets the criteria to hold a junction tree
717         // The current criteria, if any outgoing edge from a particular vertex leads to a vertex with inDegree > 1, then it warrants a tree. Or if it is the reference start vertex.
718         // NOTE: this check is necessary to handle the edge cases that may arise when a vertex has multiple exit paths but happens to lead to a vetex that needs a junction tree
vertexWarrantsJunctionTree(final MultiDeBruijnVertex vertex)719         private boolean vertexWarrantsJunctionTree(final MultiDeBruijnVertex vertex) {
720             return outgoingEdgesOf(vertex).stream().anyMatch(edge -> inDegreeOf(getEdgeTarget(edge)) > 1);
721         }
722 
723         // Helper method that adds a single edge to all of the nodes in nodesToExtend.
addEdgeToJunctionTreeNodes(final MultiSampleEdge outgoingEdge)724         private void addEdgeToJunctionTreeNodes(final MultiSampleEdge outgoingEdge) {
725             List<ThreadingNode> newNodes = trackedNodes.stream().map(n -> n.addEdge(outgoingEdge)).collect(Collectors.toList());
726             trackedNodes.clear();
727             trackedNodes.addAll(newNodes);
728         }
729 
730         // removes all of the nodes currently on the tree for safety if there are gaps in a reads traversal
clear()731         private void clear() {
732             trackedNodes.clear();
733         }
734 
735         // Adds a junction tree prevVertex if the vertex warrants a junciton tree
addTreeIfNecessary(MultiDeBruijnVertex prevVertex)736         private void addTreeIfNecessary(MultiDeBruijnVertex prevVertex) {
737             if (vertexWarrantsJunctionTree(prevVertex)) {
738                 trackedNodes.add(readThreadingJunctionTrees.computeIfAbsent(prevVertex, k -> new ThreadingTree(prevVertex)).getAndIncrementRootNode());
739             }
740         }
741     }
742 }
743