1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import com.google.common.collect.Lists;
5 import htsjdk.samtools.Cigar;
6 import htsjdk.samtools.CigarElement;
7 import htsjdk.samtools.CigarOperator;
8 import htsjdk.samtools.SAMFileHeader;
9 import htsjdk.samtools.util.Locatable;
10 import org.apache.commons.lang3.tuple.Pair;
11 import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
12 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.Kmer;
13 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.BaseGraph;
14 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.KmerSearchableGraph;
15 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.MultiSampleEdge;
16 import org.broadinstitute.hellbender.utils.BaseUtils;
17 import org.broadinstitute.hellbender.utils.Utils;
18 import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
19 import org.broadinstitute.hellbender.utils.read.GATKRead;
20 import org.broadinstitute.hellbender.utils.read.ReadUtils;
21 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
22 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
23 import org.jgrapht.EdgeFactory;
24 
25 import java.io.File;
26 import java.util.*;
27 import java.util.function.Function;
28 import java.util.function.Predicate;
29 import java.util.stream.Collectors;
30 
31 /**
32  * Read threading graph class intended to contain duplicated code between {@link ReadThreadingGraph} and {@link JunctionTreeLinkedDeBruijnGraph}.
33  */
34 public abstract class AbstractReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSampleEdge> implements KmerSearchableGraph<MultiDeBruijnVertex, MultiSampleEdge> {
    private static final long serialVersionUID = 1l;
    // sample name used when sequences are added without an explicit sample
    private static final String ANONYMOUS_SAMPLE = "XXX_UNNAMED_XXX";
    // debugging switch: when true, dump the graph to a .dot file after each threaded sequence
    private static final boolean WRITE_GRAPH = false;
    // debugging switch: when true, log the kmer size used for the non-unique kmer determination
    private static final boolean DEBUG_NON_UNIQUE_CALC = false;
    // maximum number of cigar elements allowed when deciding whether a dangling branch may be merged
    private static final int MAX_CIGAR_COMPLEXITY = 3;
    // when true, edge counts upstream of a threading start vertex are also incremented
    private static final boolean INCREASE_COUNTS_BACKWARDS = true;
    // minimum matching bases required to recover dangling ends; a negative value selects the legacy
    // recovery code path (note: the historical typo "Danging" is preserved for caller compatibility)
    private int minMatchingBasesToDangingEndRecovery = -1;

    /**
     * for debugging info printing
     */
    private static int counter = 0;
    /**
     * Sequences added for read threading before we've actually built the graph
     */
    protected final Map<String, List<SequenceForKmers>> pending = new LinkedHashMap<>();
    /**
     * A map from kmers -> their corresponding vertex in the graph
     */
    protected final Map<Kmer, MultiDeBruijnVertex> kmerToVertexMap = new LinkedHashMap<>();
    // when true, vertices record the names of the reads that touched them (debugging aid)
    protected final boolean debugGraphTransformations;
    // bases with quality below this threshold are not used in assembly
    protected final byte minBaseQualityToUseInAssembly;
    // ordered vertices along the reference path; populated while threading the reference sequence
    protected List<MultiDeBruijnVertex> referencePath = null;
    // once true, no further sequences may be added and build steps become no-ops
    protected boolean alreadyBuilt = false;
    // --------------------------------------------------------------------------------
    // state variables, initialized in setToInitialState()
    // NOTE(review): setToInitialState() does not appear in this file — comment may be stale
    // --------------------------------------------------------------------------------
    // the kmer at which the reference sequence was threaded into the graph
    private Kmer refSource = null;
    private boolean startThreadingOnlyAtExistingVertex = false;
    private int maxMismatchesInDanglingHead = -1; // this argument exists purely for testing purposes in constructing helpful tests and is currently not hooked up
    private boolean increaseCountsThroughBranches = false; // this may increase the branches without bounds

    // direction of a chain traversal; used by subclasses (usage is not visible in this chunk)
    protected enum TraversalDirection {
        downwards,
        upwards
    }
71 
    /**
     * Package-private constructor for subclasses supplying their own edge factory.
     * Disables graph-transformation debugging and applies no base-quality filtering.
     *
     * @param kmerSize    the kmer size to use for this graph
     * @param edgeFactory factory used to create edges between vertices
     */
    AbstractReadThreadingGraph(int kmerSize, EdgeFactory<MultiDeBruijnVertex, MultiSampleEdge> edgeFactory) {
        super(kmerSize, edgeFactory);
        debugGraphTransformations = false;
        minBaseQualityToUseInAssembly = 0;
    }
77 
78     /**
79      * Create a new ReadThreadingAssembler using kmerSize for matching
80      *
81      * @param kmerSize must be >= 1
82      */
AbstractReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases)83     public AbstractReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases) {
84         super(kmerSize, new MyEdgeFactory(numPruningSamples));
85 
86         Utils.validateArg(kmerSize > 0, () -> "bad minkKmerSize " + kmerSize);
87 
88         this.debugGraphTransformations = debugGraphTransformations;
89         this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
90         this.minMatchingBasesToDangingEndRecovery = numDanglingMatchingPrefixBases;
91     }
92 
    /**
     * Checks whether a kmer can be the threading start based on the current threading start location policy.
     *
     * @param kmer the query kmer.
     * @return {@code true} if we can start threading the sequence at this kmer, {@code false} otherwise.
     * @see #setThreadingStartOnlyAtExistingVertex(boolean)
     */
    protected abstract boolean isThreadingStart(final Kmer kmer, final boolean startThreadingOnlyAtExistingVertex);

    /**
     * Get the next kmer vertex for chain extension, validating it if necessary.
     *
     * @param kmer       the kmer to find or create a vertex for
     * @param isRef      whether this kmer comes from the reference sequence
     * @param prevVertex the vertex the chain is being extended from
     * @return the vertex with which to extend the chain
     */
    protected abstract MultiDeBruijnVertex getNextKmerVertexForChainExtension(final Kmer kmer, final boolean isRef, final MultiDeBruijnVertex prevVertex);

    /**
     * Perform any necessary preprocessing on the graph (such as non-unique kmer determination)
     * before the graph is constructed.
     */
    protected abstract void preprocessReads();

    /** Heuristic to decide if the graph should be thrown away based on low complexity/poor assembly. */
    public abstract boolean isLowQualityGraph();

    /** @return true if the pending reads should be discarded once the graph has been constructed */
    protected abstract boolean shouldRemoveReadsAfterGraphConstruction();

    /**
     * Called immediately before haplotype finding in the event there are alterations that must be
     * made to the graph based on the implementation.
     *
     * @param debugGraphOutputPath path used for writing debug graph output
     * @param refHaplotype         location of the reference haplotype
     */
    public abstract void postProcessForHaplotypeFinding(File debugGraphOutputPath, Locatable refHaplotype);

    /**
     * Define the behavior for how the graph should keep track of a potentially new kmer.
     *
     * @param kmer      (potentially) new kmer to track
     * @param newVertex corresponding vertex for that kmer
     */
    protected abstract void trackKmer(Kmer kmer, MultiDeBruijnVertex newVertex);
124 
125     /**
126      * Determine whether the provided cigar is okay to merge into the reference path
127      *
128      * @param cigar                the cigar to analyze
129      * @param requireFirstElementM if true, require that the first cigar element be an M operator in order for it to be okay
130      * @param requireLastElementM  if true, require that the last cigar element be an M operator in order for it to be okay
131      * @return true if it's okay to merge, false otherwise
132      */
133     @VisibleForTesting
cigarIsOkayToMerge(final Cigar cigar, final boolean requireFirstElementM, final boolean requireLastElementM)134     static boolean cigarIsOkayToMerge(final Cigar cigar, final boolean requireFirstElementM, final boolean requireLastElementM) {
135         final List<CigarElement> elements = cigar.getCigarElements();
136         final int numElements = elements.size();
137 
138         // don't allow more than a couple of different ops
139         if (numElements == 0 || numElements > MAX_CIGAR_COMPLEXITY) {
140             return false;
141         }
142 
143         // the first element must be an M
144         if (requireFirstElementM && elements.get(0).getOperator() != CigarOperator.M) {
145             return false;
146         }
147 
148         // the last element must be an M
149         if (requireLastElementM && elements.get(numElements - 1).getOperator() != CigarOperator.M) {
150             return false;
151         }
152 
153         // note that there are checks for too many mismatches in the dangling branch later in the process
154 
155         return true;
156     }
157 
158     /**
159      * calculates the longest suffix match between a sequence and a smaller kmer
160      *
161      * @param seq      the (reference) sequence
162      * @param kmer     the smaller kmer sequence
163      * @param seqStart the index (inclusive) on seq to start looking backwards from
164      * @return the longest matching suffix
165      */
166     @VisibleForTesting
longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart)167     static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) {
168         for (int len = 1; len <= kmer.length; len++) {
169             final int seqI = seqStart - len + 1;
170             final int kmerI = kmer.length - len;
171             if (seqI < 0 || seq[seqI] != kmer[kmerI]) {
172                 return len - 1;
173             }
174         }
175         return kmer.length;
176     }
177 
    /** Mark this graph as built, preventing further sequence additions (testing hook). */
    @VisibleForTesting
    protected void setAlreadyBuilt() {
        alreadyBuilt = true;
    }
182 
    /**
     * Override the minimum number of matching bases required for dangling-end recovery (testing hook).
     * A negative value selects the legacy recovery code path.
     */
    @VisibleForTesting
    void setMinMatchingBasesToDangingEndRecovery(final int minMatchingBasesToDangingEndRecovery) {
        this.minMatchingBasesToDangingEndRecovery = minMatchingBasesToDangingEndRecovery;
    }
187 
    /**
     * Override the maximum mismatches allowed in a dangling head (testing hook; per the field's
     * own comment this setting is currently not hooked up).
     */
    @VisibleForTesting
    void setMaxMismatchesInDanglingHead(final int maxMismatchesInDanglingHead) {
        this.maxMismatchesInDanglingHead = maxMismatchesInDanglingHead;
    }
192 
193     /**
194      * Find vertex and its position in seqForKmers where we should start assembling seqForKmers
195      *
196      * @param seqForKmers the sequence we want to thread into the graph
197      * @return the position of the starting vertex in seqForKmer, or -1 if it cannot find one
198      */
findStart(final SequenceForKmers seqForKmers)199     protected int findStart(final SequenceForKmers seqForKmers) {
200         if (seqForKmers.isRef) {
201             return 0;
202         }
203 
204         for (int i = seqForKmers.start; i < seqForKmers.stop - kmerSize; i++) {
205             final Kmer kmer1 = new Kmer(seqForKmers.sequence, i, kmerSize);
206             if (isThreadingStart(kmer1, startThreadingOnlyAtExistingVertex)) {
207                 return i;
208             }
209         }
210 
211         return -1;
212     }
213 
    /**
     * Add all the bases in sequence to the graph
     *
     * @param sequence a non-null sequence
     * @param isRef    is this the reference sequence?
     */
    @VisibleForTesting
    public final void addSequence(final byte[] sequence, final boolean isRef) {
        // the sequence name is only used for debugging output
        addSequence("anonymous", sequence, isRef);
    }
224 
    /**
     * Add all bases in sequence to this graph, with a representative count of 1 (a single read)
     *
     * @see #addSequence(String, String, byte[], int, int, int, boolean) for full information
     */
    public final void addSequence(final String seqName, final byte[] sequence, final boolean isRef) {
        addSequence(seqName, sequence, 1, isRef);
    }
233 
    /**
     * Add all bases in sequence to this graph, spanning the whole sequence under the anonymous sample
     *
     * @see #addSequence(String, String, byte[], int, int, int, boolean) for full information
     */
    public final void addSequence(final String seqName, final byte[] sequence, final int count, final boolean isRef) {
        addSequence(seqName, ANONYMOUS_SAMPLE, sequence, 0, sequence.length, count, isRef);
    }
242 
243     /**
244      * Add bases in sequence to this graph
245      *
246      * @param seqName  a useful seqName for this read, for debugging purposes
247      * @param sequence non-null sequence of bases
248      * @param start    the first base offset in sequence that we should use for constructing the graph using this sequence, inclusive
249      * @param stop     the last base offset in sequence that we should use for constructing the graph using this sequence, exclusive
250      * @param count    the representative count of this sequence (to use as the weight)
251      * @param isRef    is this the reference sequence.
252      */
addSequence(final String seqName, final String sampleName, final byte[] sequence, final int start, final int stop, final int count, final boolean isRef)253     protected void addSequence(final String seqName, final String sampleName, final byte[] sequence, final int start, final int stop, final int count, final boolean isRef) {
254         // note that argument testing is taken care of in SequenceForKmers
255         Utils.validate(!alreadyBuilt, "Attempting to add sequence to a graph that has already been built");
256 
257         // get the list of sequences for this sample
258         List<SequenceForKmers> sampleSequences = pending.computeIfAbsent(sampleName, s -> new LinkedList<>());
259 
260         // add the new sequence to the list of sequences for sample
261         sampleSequences.add(new SequenceForKmers(seqName, sequence, start, stop, count, isRef));
262     }
263 
    /**
     * Thread sequence seqForKmers through the current graph, updating the graph as appropriate
     *
     * @param seqForKmers a non-null sequence
     */
    private void threadSequence(final SequenceForKmers seqForKmers) {
        final int startPos = findStart(seqForKmers);
        // no usable starting kmer => this sequence contributes nothing to the graph
        if (startPos == -1) {
            return;
        }

        final MultiDeBruijnVertex startingVertex = getOrCreateKmerVertex(seqForKmers.sequence, startPos);

        // increase the counts of all edges incoming into the starting vertex supported by going back in sequence
        // (the kmerSize - 2 offset is passed into the helper; exact meaning not visible in this chunk)
        if (INCREASE_COUNTS_BACKWARDS) {
            increaseCountsInMatchedKmers(seqForKmers, startingVertex, startingVertex.getSequence(), kmerSize - 2);
        }

        if (debugGraphTransformations) {
            startingVertex.addRead(seqForKmers.name);
        }

        // keep track of information about the reference source; only one reference may be threaded
        if (seqForKmers.isRef) {
            if (refSource != null) {
                throw new IllegalStateException("Found two refSources! prev: " + refSource + ", new: " + startingVertex);
            }
            referencePath = new ArrayList<>(seqForKmers.sequence.length - kmerSize);
            referencePath.add(startingVertex);
            refSource = new Kmer(seqForKmers.sequence, seqForKmers.start, kmerSize);
        }

        // loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate
        MultiDeBruijnVertex vertex = startingVertex;
        for (int i = startPos + 1; i <= seqForKmers.stop - kmerSize; i++) {
            vertex = extendChainByOne(vertex, seqForKmers.sequence, i, seqForKmers.count, seqForKmers.isRef);
            if (seqForKmers.isRef) {
                referencePath.add(vertex);
            }
            if (debugGraphTransformations) {
                vertex.addRead(seqForKmers.name);
            }
        }
        // freeze the reference path once the reference has been fully threaded
        if (seqForKmers.isRef) {
            referencePath = Collections.unmodifiableList(referencePath);
        }
    }
311 
    /**
     * Changes the threading start location policy.
     *
     * @param value {@code true} if threading will start only at existing vertices in the graph, {@code false} if
     *              it can start at any unique kmer.
     */
    public final void setThreadingStartOnlyAtExistingVertex(final boolean value) {
        startThreadingOnlyAtExistingVertex = value;
    }
321 
    /**
     * Build the read threaded assembly graph if it hasn't already been constructed from the sequences that have
     * been added to the graph. Safe to call repeatedly; subsequent calls are no-ops.
     */
    public final void buildGraphIfNecessary() {
        if (alreadyBuilt) {
            return;
        }

        // Capture the set of non-unique kmers for the given kmer size (if applicable)
        preprocessReads();

        if (DEBUG_NON_UNIQUE_CALC) {
            ReadThreadingGraph.logger.info("using " + kmerSize + " kmer size for this assembly with the following non-uniques");
        }

        // go through the pending sequences, and add them to the graph
        for (final List<SequenceForKmers> sequencesForSample : pending.values()) {
            for (final SequenceForKmers sequenceForKmers : sequencesForSample) {
                threadSequence(sequenceForKmers);
                if (WRITE_GRAPH) {
                    printGraph(new File("threading." + counter++ + '.' + sequenceForKmers.name.replace(" ", "_") + ".dot"), 0);
                }
            }

            // flush the single sample edge values from the graph after each sample's sequences are threaded
            for (final MultiSampleEdge e : edgeSet()) {
                e.flushSingleSampleMultiplicity();
            }
        }

        // clear the pending reads pile to conserve memory
        if (shouldRemoveReadsAfterGraphConstruction()) {
            pending.clear();
        }
        alreadyBuilt = true;
        // append a '+' marker to each indexed vertex's additional info (meaning of the marker
        // is not visible in this chunk)
        for (final MultiDeBruijnVertex v : kmerToVertexMap.values()) {
            v.setAdditionalInfo(v.getAdditionalInfo() + '+');
        }
    }
362 
363     @Override
removeVertex(final MultiDeBruijnVertex V)364     public boolean removeVertex(final MultiDeBruijnVertex V) {
365         final boolean result = super.removeVertex(V);
366         if (result) {
367             final byte[] sequence = V.getSequence();
368             final Kmer kmer = new Kmer(sequence);
369             kmerToVertexMap.remove(kmer);
370         }
371         return result;
372     }
373 
374     @Override
removeSingletonOrphanVertices()375     public void removeSingletonOrphanVertices() {
376         // Run through the graph and clean up singular orphaned nodes
377         final Collection<MultiDeBruijnVertex> verticesToRemove = new LinkedList<>();
378         for (final MultiDeBruijnVertex v : vertexSet()) {
379             if (inDegreeOf(v) == 0 && outDegreeOf(v) == 0) {
380                 verticesToRemove.add(v);
381             }
382         }
383         removeVertex(null);
384         removeAllVertices(verticesToRemove);
385     }
386 
    /**
     * Control whether edge counts are increased through branch points (testing hook; per the
     * field's own comment this may increase the branches without bounds).
     */
    @VisibleForTesting
    void setIncreaseCountsThroughBranches(final boolean increaseCountsThroughBranches) {
        this.increaseCountsThroughBranches = increaseCountsThroughBranches;
    }
391 
392     /**
393      * Try to recover dangling tails
394      *
395      * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
396      * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
397      * @param recoverAll              recover even branches with forks
398      * @param aligner
399      */
recoverDanglingTails(final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner)400     public void recoverDanglingTails(final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner) {
401         Utils.validateArg(pruneFactor >= 0, () -> "pruneFactor must be non-negative but was " + pruneFactor);
402         Utils.validateArg(minDanglingBranchLength >= 0, () -> "minDanglingBranchLength must be non-negative but was " + minDanglingBranchLength);
403 
404         if (!alreadyBuilt) {
405             throw new IllegalStateException("recoverDanglingTails requires the graph be already built");
406         }
407 
408         int attempted = 0;
409         int nRecovered = 0;
410         for (final MultiDeBruijnVertex v : vertexSet()) {
411             if (outDegreeOf(v) == 0 && !isRefSink(v)) {
412                 attempted++;
413                 nRecovered += recoverDanglingTail(v, pruneFactor, minDanglingBranchLength, recoverAll, aligner);
414             }
415         }
416 
417         ReadThreadingGraph.logger.debug(String.format("Recovered %d of %d dangling tails", nRecovered, attempted));
418     }
419 
420     /**
421      * Try to recover dangling heads
422      *
423      * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
424      * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
425      * @param recoverAll              recover even branches with forks
426      * @param aligner
427      */
recoverDanglingHeads(final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner)428     public void recoverDanglingHeads(final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner) {
429         Utils.validateArg(pruneFactor >= 0, () -> "pruneFactor must be non-negative but was " + pruneFactor);
430         Utils.validateArg(minDanglingBranchLength >= 0, () -> "minDanglingBranchLength must be non-negative but was " + minDanglingBranchLength);
431         if (!alreadyBuilt) {
432             throw new IllegalStateException("recoverDanglingHeads requires the graph be already built");
433         }
434 
435         // we need to build a list of dangling heads because that process can modify the graph (and otherwise generate
436         // a ConcurrentModificationException if we do it while iterating over the vertexes)
437         final Collection<MultiDeBruijnVertex> danglingHeads = vertexSet().stream()
438                 .filter(v -> inDegreeOf(v) == 0 && !isRefSource(v))
439                 .collect(Collectors.toList());
440 
441         // now we can try to recover the dangling heads
442         int attempted = 0;
443         int nRecovered = 0;
444         for (final MultiDeBruijnVertex v : danglingHeads) {
445             attempted++;
446             nRecovered += recoverDanglingHead(v, pruneFactor, minDanglingBranchLength, recoverAll, aligner);
447         }
448 
449         ReadThreadingGraph.logger.debug(String.format("Recovered %d of %d dangling heads", nRecovered, attempted));
450     }
451 
452     /**
453      * Attempt to attach vertex with out-degree == 0 to the graph
454      *
455      * @param vertex                  the vertex to recover
456      * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
457      * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
458      * @param aligner
459      * @return 1 if we successfully recovered the vertex and 0 otherwise
460      */
recoverDanglingTail(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner)461     private int recoverDanglingTail(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner) {
462         if (outDegreeOf(vertex) != 0) {
463             throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0");
464         }
465 
466         // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths
467         final DanglingChainMergeHelper danglingTailMergeResult = generateCigarAgainstDownwardsReferencePath(vertex, pruneFactor, minDanglingBranchLength, recoverAll, aligner);
468 
469         // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path
470         if (danglingTailMergeResult == null || !cigarIsOkayToMerge(danglingTailMergeResult.cigar, false, true)) {
471             return 0;
472         }
473 
474         // merge
475         return mergeDanglingTail(danglingTailMergeResult);
476     }
477 
478     /**
479      * Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging.
480      * Requires that at a minimum there are at least #getMinMatchingBases() matches between the reference and the read
481      * at the end in order to emit an alignment offset.
482      *
483      * @param cigarElements cigar elements corresponding to the alignment between path1 and path2
484      * @param path1  the first path
485      * @param path2  the second path
486      * @return an integer pair object where the key is the offset into path1 and the value is offset into path2 (both -1 if no path is found)
487      */
bestPrefixMatch(final List<CigarElement> cigarElements, final byte[] path1, final byte[] path2)488     private Pair<Integer, Integer> bestPrefixMatch(final List<CigarElement> cigarElements, final byte[] path1, final byte[] path2) {
489         final int minMismatchingBases = getMinMatchingBases();
490 
491         int refIdx = cigarElements.stream().mapToInt(ce -> ce.getOperator().consumesReferenceBases()? ce.getLength() : 0).sum() - 1;
492         int readIdx = path2.length - 1;
493 
494         // NOTE: this only works when the last cigar element has a sufficient number of M bases, so no indels within min-mismatches of the edge.
495         cigarLoop:
496         for (final CigarElement ce : Lists.reverse(cigarElements)) {
497             if (!(ce.getOperator().consumesReadBases() && ce.getOperator().consumesReferenceBases())) {
498                 break;
499             }
500             for (int j = 0; j < ce.getLength(); j++, refIdx--, readIdx--) {
501                 if (path1[refIdx] != path2[readIdx]) {
502                     break cigarLoop;
503                 }
504             }
505         }
506 
507         final int matches = path2.length - 1 - readIdx;
508         return matches < minMismatchingBases ? Pair.of(-1,-1) : Pair.of(refIdx, readIdx);
509     }
510 
    /**
     * The minimum number of matches to be considered allowable for recovering dangling ends
     *
     * @return the configured threshold; negative values indicate the legacy recovery behavior
     */
    private int getMinMatchingBases() {
        return minMatchingBasesToDangingEndRecovery;
    }
517 
518     /**
519      * Attempt to attach vertex with in-degree == 0, or a vertex on its path, to the graph
520      *
521      * @param vertex                  the vertex to recover
522      * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
523      * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
524      * @param recoverAll              recover even branches with forks
525      * @param aligner
526      * @return 1 if we successfully recovered a vertex and 0 otherwise
527      */
recoverDanglingHead(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner)528     private int recoverDanglingHead(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner) {
529         if (inDegreeOf(vertex) != 0) {
530             throw new IllegalStateException("Attempting to recover a dangling head for " + vertex + " but it has in-degree > 0");
531         }
532 
533         // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths
534         final DanglingChainMergeHelper danglingHeadMergeResult = generateCigarAgainstUpwardsReferencePath(vertex, pruneFactor, minDanglingBranchLength, recoverAll, aligner);
535 
536         // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path
537         if (danglingHeadMergeResult == null || !cigarIsOkayToMerge(danglingHeadMergeResult.cigar, true, false)) {
538             return 0;
539         }
540 
541         // merge
542         return minMatchingBasesToDangingEndRecovery >= 0 ? mergeDanglingHead(danglingHeadMergeResult) : mergeDanglingHeadLegacy(danglingHeadMergeResult);
543     }
544 
    /**
     * Actually merge the dangling tail if possible
     *
     * @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference
     * @return 1 if merge was successful, 0 otherwise
     */
    @VisibleForTesting
    final int mergeDanglingTail(final DanglingChainMergeHelper danglingTailMergeResult) {

        final List<CigarElement> elements = danglingTailMergeResult.cigar.getCigarElements();
        final CigarElement lastElement = elements.get(elements.size() - 1);
        Utils.validateArg(lastElement.getOperator() == CigarOperator.M, "The last Cigar element must be an M");

        // the usable suffix match is the longest exact suffix match between the two path strings,
        // capped by the length of the final M element of the alignment
        final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1;
        final int matchingSuffix = Math.min(longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength());
        // with a non-negative threshold we require at least that many matching bases;
        // the legacy behavior only requires a non-empty match
        if (minMatchingBasesToDangingEndRecovery >= 0 ? matchingSuffix < minMatchingBasesToDangingEndRecovery : matchingSuffix == 0 ) {
            return 0;
        }

        // index into the dangling path at which to attach the new edge
        final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0);

        // there is an important edge condition that we need to handle here: Smith-Waterman correctly calculates that there is a
        // deletion, that deletion is left-aligned such that the LCA node is part of that deletion, and the rest of the dangling
        // tail is a perfect match to the suffix of the reference path.  In this case we need to push the reference index to merge
        // down one position so that we don't incorrectly cut a base off of the deletion.
        final boolean firstElementIsDeletion = elements.get(0).getOperator() == CigarOperator.D;
        final boolean mustHandleLeadingDeletionCase = firstElementIsDeletion && (elements.get(0).getLength() + matchingSuffix == lastRefIndex + 1);
        final int refIndexToMerge = lastRefIndex - matchingSuffix + 1 + (mustHandleLeadingDeletionCase ? 1 : 0);

        // another edge condition occurs here: if Smith-Waterman places the whole tail into an insertion then it will try to
        // merge back to the LCA, which results in a cycle in the graph.  So we do not want to merge in such a case.
        if (refIndexToMerge == 0) {
            return 0;
        }

        // it's safe to merge now: add a single non-reference edge from the dangling path into the reference path
        addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), ((MyEdgeFactory) getEdgeFactory()).createEdge(false, 1));

        return 1;
    }
585 
    /**
     * Actually merge the dangling head if possible; this is the old codepath that does not handle indels.
     *
     * @param danglingHeadMergeResult   the result from generating a Cigar for the dangling head against the reference
     * @return 1 if merge was successful, 0 otherwise
     */
    @VisibleForTesting
    int mergeDanglingHeadLegacy(final DanglingChainMergeHelper danglingHeadMergeResult) {

        // The alignment must begin with a match so the head anchors cleanly onto the reference.
        final List<CigarElement> elements = danglingHeadMergeResult.cigar.getCigarElements();
        final CigarElement firstElement = elements.get(0);
        Utils.validateArg(firstElement.getOperator() == CigarOperator.M, "The first Cigar element must be an M");

        // determine how far along the leading M element the two paths agree (within the mismatch budget)
        final int indexesToMerge = bestPrefixMatchLegacy(danglingHeadMergeResult.referencePathString, danglingHeadMergeResult.danglingPathString, firstElement.getLength());
        if (indexesToMerge <= 0) {
            return 0;
        }

        // we can't push back the reference path
        if (indexesToMerge >= danglingHeadMergeResult.referencePath.size() - 1) {
            return 0;
        }

        // but we can manipulate the dangling path if we need to
        if (indexesToMerge >= danglingHeadMergeResult.danglingPath.size() &&
                !extendDanglingPathAgainstReference(danglingHeadMergeResult, indexesToMerge - danglingHeadMergeResult.danglingPath.size() + 2, elements)) {
            return 0;
        }

        // connect the reference vertex just past the merge point to the matching dangling-path vertex
        addEdge(danglingHeadMergeResult.referencePath.get(indexesToMerge + 1), danglingHeadMergeResult.danglingPath.get(indexesToMerge), ((MyEdgeFactory) getEdgeFactory()).createEdge(false, 1));

        return 1;
    }
619 
620 
    /**
     * Actually merge the dangling head if possible.
     *
     * @param danglingHeadMergeResult   the result from generating a Cigar for the dangling head against the reference
     * @return 1 if merge was successful, 0 otherwise
     */
    @VisibleForTesting
    int mergeDanglingHead(final DanglingChainMergeHelper danglingHeadMergeResult) {

        // The alignment must begin with a match so the head anchors cleanly onto the reference.
        final List<CigarElement> elements = danglingHeadMergeResult.cigar.getCigarElements();
        final CigarElement firstElement = elements.get(0);
        Utils.validateArg( firstElement.getOperator() == CigarOperator.M, "The first Cigar element must be an M");

        // key indexes the reference path, value indexes the dangling path (the two can differ when the
        // alignment contains indels)
        final Pair<Integer, Integer> indexesToMerge = bestPrefixMatch(elements, danglingHeadMergeResult.referencePathString, danglingHeadMergeResult.danglingPathString);
        if ( indexesToMerge.getKey() <= 0 ||  indexesToMerge.getValue() <= 0 ) {
            return 0;
        }

        // we can't push back the reference path
        if ( indexesToMerge.getKey() >= danglingHeadMergeResult.referencePath.size() - 1 ) {
            return 0;
        }

        // but we can manipulate the dangling path if we need to
        if ( indexesToMerge.getValue() >= danglingHeadMergeResult.danglingPath.size() &&
                ! extendDanglingPathAgainstReference(danglingHeadMergeResult, indexesToMerge.getValue() - danglingHeadMergeResult.danglingPath.size() + 2, elements) ) {
            return 0;
        }

        // connect the reference vertex just past the merge point to the matching dangling-path vertex
        addEdge(danglingHeadMergeResult.referencePath.get(indexesToMerge.getKey() + 1), danglingHeadMergeResult.danglingPath.get(indexesToMerge.getValue()), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1));

        return 1;
    }
654 
    /**
     * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
     * provided vertex is the sink) and the reference path.
     *
     * @param aligner                 the Smith-Waterman aligner used to align the dangling path against the reference path
     * @param vertex                  the sink of the dangling chain
     * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
     * @param minDanglingBranchLength the minimum length of a dangling branch to consider for recovery
     * @param recoverAll              recover even branches with forks
     * @return a SmithWaterman object which can be null if no proper alignment could be generated
     */
    @VisibleForTesting
    DanglingChainMergeHelper generateCigarAgainstDownwardsReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, SmithWatermanAligner aligner) {
        final int minTailPathLength = Math.max(1, minDanglingBranchLength); // while heads can be 0, tails absolutely cannot

        // find the lowest common ancestor path between this vertex and the diverging master path if available
        final List<MultiDeBruijnVertex> altPath = findPathUpwardsToLowestCommonAncestor(vertex, pruneFactor, !recoverAll);
        if (altPath == null || isRefSource(altPath.get(0)) || altPath.size() < minTailPathLength + 1) // add 1 to include the LCA
        {
            return null;
        }

        // now get the reference path from the LCA, blacklisting the edge into the dangling branch itself
        final List<MultiDeBruijnVertex> refPath = getReferencePath(altPath.get(0), TraversalDirection.downwards, Optional.ofNullable(getHeaviestIncomingEdge(altPath.get(1))));

        // create the Smith-Waterman strings to use
        final byte[] refBases = getBasesForPath(refPath, false);
        final byte[] altBases = getBasesForPath(altPath, false);

        // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting)
        final SmithWatermanAlignment alignment = aligner.align(refBases, altBases, SmithWatermanAligner.STANDARD_NGS, SWOverhangStrategy.LEADING_INDEL);
        return new DanglingChainMergeHelper(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar()));
    }
687 
    /**
     * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
     * provided vertex is the source) and the reference path.
     *
     * @param aligner                 the Smith-Waterman aligner used to align the dangling path against the reference path
     * @param vertex                  the source of the dangling head
     * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
     * @param minDanglingBranchLength the minimum length of a dangling branch to consider for recovery (may be 0 for heads)
     * @param recoverAll              recover even branches with forks
     * @return a SmithWaterman object which can be null if no proper alignment could be generated
     */
    @VisibleForTesting
    DanglingChainMergeHelper generateCigarAgainstUpwardsReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, SmithWatermanAligner aligner) {

        // find the highest common descendant path between vertex and the reference source if available
        final List<MultiDeBruijnVertex> altPath = findPathDownwardsToHighestCommonDescendantOfReference(vertex, pruneFactor, !recoverAll);
        if (altPath == null || isRefSink(altPath.get(0)) || altPath.size() < minDanglingBranchLength + 1) // add 1 to include the LCA
        {
            return null;
        }

        // now get the reference path from the LCA
        final List<MultiDeBruijnVertex> refPath = getReferencePath(altPath.get(0), TraversalDirection.upwards, Optional.empty());

        // create the Smith-Waterman strings to use (expanding the source vertex since we are walking upwards)
        final byte[] refBases = getBasesForPath(refPath, true);
        final byte[] altBases = getBasesForPath(altPath, true);

        // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting)
        final SmithWatermanAlignment alignment = aligner.align(refBases, altBases, SmithWatermanAligner.STANDARD_NGS, SWOverhangStrategy.LEADING_INDEL);
        return new DanglingChainMergeHelper(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar()));
    }
719 
720     /**
721      * Finds the path upwards in the graph from this vertex to the first diverging node, including that (lowest common ancestor) vertex.
722      * Note that nodes are excluded if their pruning weight is less than the pruning factor.
723      *
724      * @param vertex         the original vertex
725      * @param pruneFactor    the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
726      * @param giveUpAtBranch stop trying to find a path if a vertex with multiple incoming or outgoing edge is found
727      * @return the path if it can be determined or null if this vertex either doesn't merge onto another path or
728      * has an ancestor with multiple incoming edges before hitting the reference path
729      */
findPathUpwardsToLowestCommonAncestor(final MultiDeBruijnVertex vertex, final int pruneFactor, final boolean giveUpAtBranch)730     private List<MultiDeBruijnVertex> findPathUpwardsToLowestCommonAncestor(final MultiDeBruijnVertex vertex, final int pruneFactor, final boolean giveUpAtBranch) {
731         return giveUpAtBranch ? findPath(vertex, pruneFactor, v -> inDegreeOf(v) != 1 || outDegreeOf(v) >= 2, v -> outDegreeOf(v) > 1, v -> incomingEdgeOf(v), e -> getEdgeSource(e))
732                 : findPath(vertex, pruneFactor, v -> hasIncidentRefEdge(v) || inDegreeOf(v) == 0, v -> outDegreeOf(v) > 1 && hasIncidentRefEdge(v), this::getHeaviestIncomingEdge, e -> getEdgeSource(e));
733     }
734 
hasIncidentRefEdge(final MultiDeBruijnVertex v)735     private boolean hasIncidentRefEdge(final MultiDeBruijnVertex v) {
736         for (final MultiSampleEdge edge : incomingEdgesOf(v)) {
737             if (edge.isRef()) {
738                 return true;
739             }
740         }
741         return false;
742     }
743 
getHeaviestIncomingEdge(final MultiDeBruijnVertex v)744     private MultiSampleEdge getHeaviestIncomingEdge(final MultiDeBruijnVertex v) {
745         final Set<MultiSampleEdge> incomingEdges = incomingEdgesOf(v);
746         return incomingEdges.size() == 1 ? incomingEdges.iterator().next() :
747                 incomingEdges.stream().max(Comparator.comparingInt(MultiSampleEdge::getMultiplicity)).get();
748     }
749 
750     /**
751      * Finds the path downwards in the graph from this vertex to the reference sequence, including the highest common descendant vertex.
752      * However note that the path is reversed so that this vertex ends up at the end of the path.
753      * Also note that nodes are excluded if their pruning weight is less than the pruning factor.
754      *
755      * @param vertex         the original vertex
756      * @param pruneFactor    the prune factor to use in ignoring chain pieces
757      * @param giveUpAtBranch stop trying to find a path if a vertex with multiple incoming or outgoing edge is found
758      * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or
759      * has a descendant with multiple outgoing edges before hitting the reference path
760      */
findPathDownwardsToHighestCommonDescendantOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor, final boolean giveUpAtBranch)761     private List<MultiDeBruijnVertex> findPathDownwardsToHighestCommonDescendantOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor, final boolean giveUpAtBranch) {
762         return giveUpAtBranch ? findPath(vertex, pruneFactor, v -> isReferenceNode(v) || outDegreeOf(v) != 1, v -> isReferenceNode(v), v -> outgoingEdgeOf(v), e -> getEdgeTarget(e))
763                 : findPath(vertex, pruneFactor, v -> isReferenceNode(v) || outDegreeOf(v) == 0, v -> isReferenceNode(v), this::getHeaviestOutgoingEdge, e -> getEdgeTarget(e));
764     }
765 
getHeaviestOutgoingEdge(final MultiDeBruijnVertex v)766     private MultiSampleEdge getHeaviestOutgoingEdge(final MultiDeBruijnVertex v) {
767         final Set<MultiSampleEdge> outgoing = outgoingEdgesOf(v);
768         return outgoing.size() == 1 ? outgoing.iterator().next() :
769                 outgoing.stream().max(Comparator.comparingInt(MultiSampleEdge::getMultiplicity)).get();
770     }
771 
    /**
     * Finds a path starting from a given vertex and satisfying various predicates
     *
     * @param vertex      the original vertex
     * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
     * @param done        test for whether a vertex is at the end of the path
     * @param returnPath  test for whether to return a found path based on its terminal vertex
     * @param nextEdge    function on vertices returning the next edge in the path
     * @param nextNode    function of edges returning the next vertex in the path
     * @return a path, if one satisfying all predicates is found, {@code null} otherwise
     */
    private List<MultiDeBruijnVertex> findPath(final MultiDeBruijnVertex vertex, final int pruneFactor,
                                               final Predicate<MultiDeBruijnVertex> done,
                                               final Predicate<MultiDeBruijnVertex> returnPath,
                                               final Function<MultiDeBruijnVertex, MultiSampleEdge> nextEdge,
                                               final Function<MultiSampleEdge, MultiDeBruijnVertex> nextNode) {
        final LinkedList<MultiDeBruijnVertex> path = new LinkedList<>();
        // Vertices discarded from the path because of pruning; remembered so the loop check below still
        // catches cycles that pass through them.
        final Set<MultiDeBruijnVertex> visitedNodes = new HashSet<>();

        MultiDeBruijnVertex v = vertex;
        while (!done.test(v)) {
            final MultiSampleEdge edge = nextEdge.apply(v);
            // if it has too low a weight, don't use it (or previous vertexes) for the path
            if (edge.getPruningMultiplicity() < pruneFactor) {
                // save the previously visited notes to protect us from riding forever 'neath the streets of boston
                visitedNodes.addAll(path);
                path.clear();
            } else {
                path.addFirst(v);
            }
            v = nextNode.apply(edge);
            // Check that we aren't stuck in a loop
            if (path.contains(v) || visitedNodes.contains(v)) {
                // NOTE(review): raw stderr write from library code; consider routing through a logger instead.
                System.err.println("Dangling End recovery killed because of a loop (findPath)");
                return null;
            }
        }
        // include the terminal vertex (the one that satisfied the done predicate) at the front of the path
        path.addFirst(v);

        return returnPath.test(v) ? path : null;
    }
813 
814     /**
815      * Finds the path in the graph from this vertex to the reference sink, including this vertex
816      *
817      * @param start           the reference vertex to start from
818      * @param direction       describes which direction to move in the graph (i.e. down to the reference sink or up to the source)
819      * @param blacklistedEdge edge to ignore in the traversal down; useful to exclude the non-reference dangling paths
820      * @return the path (non-null, non-empty)
821      */
getReferencePath(final MultiDeBruijnVertex start, final TraversalDirection direction, final Optional<MultiSampleEdge> blacklistedEdge)822     protected List<MultiDeBruijnVertex> getReferencePath(final MultiDeBruijnVertex start,
823                                                          final TraversalDirection direction,
824                                                          final Optional<MultiSampleEdge> blacklistedEdge) {
825 
826         final List<MultiDeBruijnVertex> path = new ArrayList<>();
827 
828         MultiDeBruijnVertex v = start;
829         while (v != null) {
830             path.add(v);
831             v = (direction == TraversalDirection.downwards ? getNextReferenceVertex(v, true, blacklistedEdge) : getPrevReferenceVertex(v));
832         }
833 
834         return path;
835     }
836 
837     /**
838      * The base sequence for the given path.
839      *
840      * @param path         the list of vertexes that make up the path
841      * @param expandSource if true and if we encounter a source node, then expand (and reverse) the character sequence for that node
842      * @return non-null sequence of bases corresponding to the given path
843      */
844     @VisibleForTesting
getBasesForPath(final List<MultiDeBruijnVertex> path, final boolean expandSource)845     byte[] getBasesForPath(final List<MultiDeBruijnVertex> path, final boolean expandSource) {
846         Utils.nonNull(path, "Path cannot be null");
847 
848         final StringBuilder sb = new StringBuilder();
849         for (final MultiDeBruijnVertex v : path) {
850             if (expandSource && isSource(v)) {
851                 final String seq = v.getSequenceString();
852                 sb.append(new StringBuilder(seq).reverse().toString());
853             } else {
854                 sb.append((char) v.getSuffix());
855             }
856         }
857 
858         return sb.toString().getBytes();
859     }
860 
    /**
     * Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging.
     * Assumes that path1.length >= maxIndex and path2.length >= maxIndex.
     *
     * @param path1    the first path
     * @param path2    the second path
     * @param maxIndex the maximum index to traverse (not inclusive)
     * @return the index of the ideal prefix match or -1 if it cannot find one, must be less than maxIndex
     */
    private int bestPrefixMatchLegacy(final byte[] path1, final byte[] path2, final int maxIndex) {
        final int maxMismatches = getMaxMismatchesLegacy(maxIndex);
        int mismatches = 0;
        int index = 0;
        int lastGoodIndex = -1;
        while (index < maxIndex) {
            if (path1[index] != path2[index]) {
                // too many mismatches: no usable prefix match
                if (++mismatches > maxMismatches) {
                    return -1;
                }
                // NOTE(review): despite its name, lastGoodIndex records the position of each *mismatch*,
                // so the method returns the last tolerated mismatch position -- and -1 (treated by the
                // caller as "no merge") when the two paths agree perfectly.  This looks surprising;
                // confirm it is the intended legacy behavior before changing it.
                lastGoodIndex = index;
            }
            index++;
        }
        // if we got here then we hit the max index without exhausting the mismatch budget
        return lastGoodIndex;
    }
887 
888     /**
889      * NOTE: this method is only used for dangling heads and not tails.
890      *
891      * Determine the maximum number of mismatches permitted on the branch.
892      * Unless it's preset (e.g. by unit tests) it should be the length of the branch divided by the kmer size.
893      *
894      * @param lengthOfDanglingBranch the length of the branch itself
895      * @return positive integer
896      */
getMaxMismatchesLegacy(final int lengthOfDanglingBranch)897     private int getMaxMismatchesLegacy(final int lengthOfDanglingBranch) {
898         return maxMismatchesInDanglingHead > 0 ? maxMismatchesInDanglingHead : Math.max(1, (lengthOfDanglingBranch / kmerSize));
899     }
900 
    /**
     * Extends the dangling-head path with additional vertices built from reference bases so that the
     * merge point computed by the caller falls within the dangling path.
     *
     * @param danglingHeadMergeResult holder for the dangling/reference paths and their alignment
     * @param numNodesToExtend        number of new vertices to add upstream of the current dangling source
     * @param elements                the CIGAR elements of the dangling-vs-reference alignment
     * @return true if the path was successfully extended, false otherwise
     */
    private boolean extendDanglingPathAgainstReference(final DanglingChainMergeHelper danglingHeadMergeResult, final int numNodesToExtend, List<CigarElement> elements) {

        final int indexOfLastDanglingNode = danglingHeadMergeResult.danglingPath.size() - 1;
        // net reference-minus-read length of the alignment: positive when the reference consumes more
        // bases than the dangling path (deletions outweigh insertions)
        final int offsetForRefEndToDanglingEnd = elements.stream().mapToInt(ce -> (ce.getOperator().consumesReferenceBases()? ce.getLength() : 0) - (ce.getOperator().consumesReadBases()? ce.getLength() : 0)).sum();
        final int indexOfRefNodeToUse = indexOfLastDanglingNode + offsetForRefEndToDanglingEnd + numNodesToExtend;
        // cannot extend past the end of the available reference path
        if (indexOfRefNodeToUse >= danglingHeadMergeResult.referencePath.size()) {
            return false;
        }

        // build the extended sequence: numNodesToExtend reference bases followed by the removed
        // dangling source's own kmer sequence
        final MultiDeBruijnVertex danglingSource = danglingHeadMergeResult.danglingPath.remove(indexOfLastDanglingNode);
        final StringBuilder sb = new StringBuilder();
        final byte[] refSourceSequence = danglingHeadMergeResult.referencePath.get(indexOfRefNodeToUse).getSequence();
        for (int i = 0; i < numNodesToExtend; i++) {
            sb.append((char) refSourceSequence[i]);
        }
        sb.append(danglingSource.getSequenceString());
        final byte[] sequenceToExtend = sb.toString().getBytes();

        // clean up the source and edge
        final MultiSampleEdge sourceEdge = getHeaviestOutgoingEdge(danglingSource);
        MultiDeBruijnVertex prevV = getEdgeTarget(sourceEdge);
        removeEdge(danglingSource, prevV);

        // extend the path, walking backwards so each new vertex links into the previously created one;
        // each new edge inherits the multiplicity of the edge that was removed above
        for (int i = numNodesToExtend; i > 0; i--) {
            final MultiDeBruijnVertex newV = new MultiDeBruijnVertex(Arrays.copyOfRange(sequenceToExtend, i, i + kmerSize));
            addVertex(newV);
            final MultiSampleEdge newE = addEdge(newV, prevV);
            newE.setMultiplicity(sourceEdge.getMultiplicity());
            danglingHeadMergeResult.danglingPath.add(newV);
            prevV = newV;
        }

        return true;
    }
936 
    /**
     * Recursively walks backwards through the graph from {@code vertex}, following incoming edges whose
     * source vertex's suffix base matches the corresponding base of the original kmer, and increments
     * each matching edge's multiplicity by the sequence's observation count.
     *
     * @param seqForKmers  the sequence whose observation count is being added
     * @param vertex       the current vertex whose incoming edges are examined
     * @param originalKmer the kmer originally matched in the graph
     * @param offset       index into originalKmer of the base to compare next; -1 terminates the recursion
     */
    private void increaseCountsInMatchedKmers(final SequenceForKmers seqForKmers,
                                              final MultiDeBruijnVertex vertex,
                                              final byte[] originalKmer,
                                              final int offset) {
        // ran off the front of the kmer -- nothing more to update
        if (offset == -1) {
            return;
        }

        for (final MultiSampleEdge edge : incomingEdgesOf(vertex)) {
            final MultiDeBruijnVertex prev = getEdgeSource(edge);
            final byte suffix = prev.getSuffix();
            final byte seqBase = originalKmer[offset];
            // only follow unambiguous chains unless branch traversal is explicitly enabled
            if (suffix == seqBase && (increaseCountsThroughBranches || inDegreeOf(vertex) == 1)) {
                edge.incMultiplicity(seqForKmers.count);
                increaseCountsInMatchedKmers(seqForKmers, prev, originalKmer, offset - 1);
            }
        }
    }
955 
956     /**
957      * Get the vertex for the kmer in sequence starting at start
958      *
959      * @param sequence the sequence
960      * @param start    the position of the kmer start
961      * @return a non-null vertex
962      */
getOrCreateKmerVertex(final byte[] sequence, final int start)963     private MultiDeBruijnVertex getOrCreateKmerVertex(final byte[] sequence, final int start) {
964         final Kmer kmer = new Kmer(sequence, start, kmerSize);
965         final MultiDeBruijnVertex vertex = getKmerVertex(kmer, true);
966         return (vertex != null) ? vertex : createVertex(kmer);
967     }
968 
969     /**
970      * Get the unique vertex for kmer, or null if not possible.
971      *
972      * @param allowRefSource if true, we will allow kmer to match the reference source vertex
973      * @return a vertex for kmer, or null (either because it doesn't exist or is non-unique for graphs that have such a distinction)
974      */
getKmerVertex(final Kmer kmer, final boolean allowRefSource)975     protected MultiDeBruijnVertex getKmerVertex(final Kmer kmer, final boolean allowRefSource) {
976         if (!allowRefSource && kmer.equals(refSource)) {
977             return null;
978         }
979 
980         return kmerToVertexMap.get(kmer);
981     }
982 
983     /**
984      * Create a new vertex for kmer.  Add it to the kmerToVertexMap map if appropriate.
985      *
986      * @param kmer the kmer we want to create a vertex for
987      * @return the non-null created vertex
988      */
createVertex(final Kmer kmer)989     private MultiDeBruijnVertex createVertex(final Kmer kmer) {
990         final MultiDeBruijnVertex newVertex = new MultiDeBruijnVertex(kmer.bases());
991         final int prevSize = vertexSet().size();
992         addVertex(newVertex);
993 
994         // make sure we aren't adding duplicates (would be a bug)
995         if (vertexSet().size() != prevSize + 1) {
996             throw new IllegalStateException("Adding vertex " + newVertex + " to graph didn't increase the graph size");
997         }
998         trackKmer(kmer, newVertex);
999 
1000         return newVertex;
1001     }
1002 
1003     /**
1004      * Workhorse routine of the assembler.  Given a sequence whose last vertex is anchored in the graph, extend
1005      * the graph one bp according to the bases in sequence.
1006      *
1007      * @param prevVertex a non-null vertex where sequence was last anchored in the graph
1008      * @param sequence   the sequence we're threading through the graph
1009      * @param kmerStart  the start of the current kmer in graph we'd like to add
1010      * @param count      the number of observations of this kmer in graph (can be > 1 for GGA)
1011      * @param isRef      is this the reference sequence?
1012      * @return a non-null vertex connecting prevVertex to in the graph based on sequence
1013      */
extendChainByOne(final MultiDeBruijnVertex prevVertex, final byte[] sequence, final int kmerStart, final int count, final boolean isRef)1014     protected MultiDeBruijnVertex extendChainByOne(final MultiDeBruijnVertex prevVertex, final byte[] sequence, final int kmerStart, final int count, final boolean isRef) {
1015         final Set<MultiSampleEdge> outgoingEdges = outgoingEdgesOf(prevVertex);
1016 
1017         final int nextPos = kmerStart + kmerSize - 1;
1018         for (final MultiSampleEdge outgoingEdge : outgoingEdges) {
1019             final MultiDeBruijnVertex target = getEdgeTarget(outgoingEdge);
1020             if (target.getSuffix() == sequence[nextPos]) {
1021                 // we've got a match in the chain, so simply increase the count of the edge by 1 and continue
1022                 outgoingEdge.incMultiplicity(count);
1023                 return target;
1024             }
1025         }
1026 
1027         // none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in
1028         final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize);
1029         final MultiDeBruijnVertex mergeVertex = getNextKmerVertexForChainExtension(kmer, isRef, prevVertex);
1030 
1031         // either use our merge vertex, or create a new one in the chain
1032         final MultiDeBruijnVertex nextVertex = mergeVertex == null ? createVertex(kmer) : mergeVertex;
1033         addEdge(prevVertex, nextVertex, ((MyEdgeFactory) getEdgeFactory()).createEdge(isRef, count));
1034         return nextVertex;
1035     }
1036 
    /**
     * Add a read to the sequence graph.  Finds maximal consecutive runs of bases with sufficient quality
     * and applies {@code addSequence} to these subreads if they are longer than the kmer size.
     *
     * @param read   a non-null read
     * @param header SAM file header used to resolve the read's sample name
     */
    @VisibleForTesting
    void addRead(final GATKRead read, final SAMFileHeader header) {
        final byte[] sequence = read.getBases();
        final byte[] qualities = read.getBaseQualities();

        int lastGood = -1; // start index of the current run of usable bases, or -1 when not inside a run
        // iterate one past the end so the final run is flushed by the end-of-sequence branch
        for (int end = 0; end <= sequence.length; end++) {
            if (end == sequence.length || !baseIsUsableForAssembly(sequence[end], qualities[end])) {
                // the first good base is at lastGood, can be -1 if last base was bad
                final int start = lastGood;
                // the stop base is end - 1 (if we're not at the end of the sequence)
                final int len = end - start;

                if (start != -1 && len >= kmerSize) {
                    // if the sequence is long enough to get some value out of, add it to the graph
                    final String name = read.getName() + '_' + start + '_' + end;
                    addSequence(name, ReadUtils.getSampleName(read, header), sequence, start, end, 1, false);
                }

                lastGood = -1; // reset the last good base
            } else if (lastGood == -1) {
                lastGood = end; // we're at a good base, the last good one is us
            }
        }
    }
1068 
1069     /**
1070      * Determines whether a base can safely be used for assembly.
1071      * Currently disallows Ns and/or those with low quality
1072      *
1073      * @param base  the base under consideration
1074      * @param qual  the quality of that base
1075      * @return true if the base can be used for assembly, false otherwise
1076      */
baseIsUsableForAssembly(final byte base, final byte qual)1077     protected boolean baseIsUsableForAssembly(final byte base, final byte qual) {
1078         return base != BaseUtils.Base.N.base && qual >= minBaseQualityToUseInAssembly;
1079     }
1080 
1081     @Override
findKmer(final Kmer k)1082     public MultiDeBruijnVertex findKmer(final Kmer k) {
1083         return kmerToVertexMap.get(k);
1084     }
1085 
1086     /**
1087      * Edge factory that encapsulates the numPruningSamples assembly parameter
1088      */
1089     protected static final class MyEdgeFactory implements EdgeFactory<MultiDeBruijnVertex, MultiSampleEdge> {
1090         final int numPruningSamples;
1091 
MyEdgeFactory(final int numPruningSamples)1092         MyEdgeFactory(final int numPruningSamples) {
1093             this.numPruningSamples = numPruningSamples;
1094         }
1095 
1096         @Override
createEdge(final MultiDeBruijnVertex sourceVertex, final MultiDeBruijnVertex targetVertex)1097         public MultiSampleEdge createEdge(final MultiDeBruijnVertex sourceVertex, final MultiDeBruijnVertex targetVertex) {
1098             return new MultiSampleEdge(false, 1, numPruningSamples);
1099         }
1100 
createEdge(final boolean isRef, final int multiplicity)1101         MultiSampleEdge createEdge(final boolean isRef, final int multiplicity) {
1102             return new MultiSampleEdge(isRef, multiplicity, numPruningSamples);
1103         }
1104     }
1105 
1106     /**
1107      * Class to keep track of the important dangling chain merging data
1108      */
1109     static final class DanglingChainMergeHelper {
1110         final List<MultiDeBruijnVertex> danglingPath;
1111         final List<MultiDeBruijnVertex> referencePath;
1112         final byte[] danglingPathString;
1113         final byte[] referencePathString;
1114         final Cigar cigar;
1115 
DanglingChainMergeHelper(final List<MultiDeBruijnVertex> danglingPath, final List<MultiDeBruijnVertex> referencePath, final byte[] danglingPathString, final byte[] referencePathString, final Cigar cigar)1116         DanglingChainMergeHelper(final List<MultiDeBruijnVertex> danglingPath,
1117                                  final List<MultiDeBruijnVertex> referencePath,
1118                                  final byte[] danglingPathString,
1119                                  final byte[] referencePathString,
1120                                  final Cigar cigar) {
1121             this.danglingPath = danglingPath;
1122             this.referencePath = referencePath;
1123             this.danglingPathString = danglingPathString;
1124             this.referencePathString = referencePathString;
1125             this.cigar = cigar;
1126         }
1127     }
1128 
1129     /**
1130      * Keeps track of the information needed to add a sequence to the read threading assembly graph
1131      */
1132     static final class SequenceForKmers {
1133         final String name;
1134         final byte[] sequence;
1135         final int start;
1136         final int stop;
1137         final int count;
1138         final boolean isRef;
1139 
1140         /**
1141          * Create a new sequence for creating kmers
1142          */
SequenceForKmers(final String name, final byte[] sequence, final int start, final int stop, final int count, final boolean ref)1143         SequenceForKmers(final String name, final byte[] sequence, final int start, final int stop, final int count, final boolean ref) {
1144             Utils.nonNull(sequence, "Sequence is null ");
1145             Utils.validateArg(start >= 0, () -> "Invalid start " + start);
1146             Utils.validateArg(stop >= start, () -> "Invalid stop " + stop);
1147             Utils.validateArg(count > 0, "Invalid count " + count);
1148 
1149             this.name = name;
1150             this.sequence = sequence;
1151             this.start = start;
1152             this.stop = stop;
1153             this.count = count;
1154             isRef = ref;
1155         }
1156     }
1157 }
1158