1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import com.google.common.collect.Lists; 5 import htsjdk.samtools.Cigar; 6 import htsjdk.samtools.CigarElement; 7 import htsjdk.samtools.CigarOperator; 8 import htsjdk.samtools.SAMFileHeader; 9 import htsjdk.samtools.util.Locatable; 10 import org.apache.commons.lang3.tuple.Pair; 11 import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy; 12 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.Kmer; 13 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.BaseGraph; 14 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.KmerSearchableGraph; 15 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.MultiSampleEdge; 16 import org.broadinstitute.hellbender.utils.BaseUtils; 17 import org.broadinstitute.hellbender.utils.Utils; 18 import org.broadinstitute.hellbender.utils.read.AlignmentUtils; 19 import org.broadinstitute.hellbender.utils.read.GATKRead; 20 import org.broadinstitute.hellbender.utils.read.ReadUtils; 21 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; 22 import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment; 23 import org.jgrapht.EdgeFactory; 24 25 import java.io.File; 26 import java.util.*; 27 import java.util.function.Function; 28 import java.util.function.Predicate; 29 import java.util.stream.Collectors; 30 31 /** 32 * Read threading graph class intended to contain duplicated code between {@link ReadThreadingGraph} and {@link JunctionTreeLinkedDeBruijnGraph}. 
33 */ 34 public abstract class AbstractReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSampleEdge> implements KmerSearchableGraph<MultiDeBruijnVertex, MultiSampleEdge> { 35 private static final long serialVersionUID = 1l; 36 private static final String ANONYMOUS_SAMPLE = "XXX_UNNAMED_XXX"; 37 private static final boolean WRITE_GRAPH = false; 38 private static final boolean DEBUG_NON_UNIQUE_CALC = false; 39 private static final int MAX_CIGAR_COMPLEXITY = 3; 40 private static final boolean INCREASE_COUNTS_BACKWARDS = true; 41 private int minMatchingBasesToDangingEndRecovery = -1; 42 43 /** 44 * for debugging info printing 45 */ 46 private static int counter = 0; 47 /** 48 * Sequences added for read threading before we've actually built the graph 49 */ 50 protected final Map<String, List<SequenceForKmers>> pending = new LinkedHashMap<>(); 51 /** 52 * A map from kmers -> their corresponding vertex in the graph 53 */ 54 protected final Map<Kmer, MultiDeBruijnVertex> kmerToVertexMap = new LinkedHashMap<>(); 55 protected final boolean debugGraphTransformations; 56 protected final byte minBaseQualityToUseInAssembly; 57 protected List<MultiDeBruijnVertex> referencePath = null; 58 protected boolean alreadyBuilt = false; 59 // -------------------------------------------------------------------------------- 60 // state variables, initialized in setToInitialState() 61 // -------------------------------------------------------------------------------- 62 private Kmer refSource = null; 63 private boolean startThreadingOnlyAtExistingVertex = false; 64 private int maxMismatchesInDanglingHead = -1; // this argument exists purely for testing purposes in constructing helpful tests and is currently not hooked up 65 private boolean increaseCountsThroughBranches = false; // this may increase the branches without bounds 66 67 protected enum TraversalDirection { 68 downwards, 69 upwards 70 } 71 AbstractReadThreadingGraph(int kmerSize, EdgeFactory<MultiDeBruijnVertex, 
MultiSampleEdge> edgeFactory)72 AbstractReadThreadingGraph(int kmerSize, EdgeFactory<MultiDeBruijnVertex, MultiSampleEdge> edgeFactory) { 73 super(kmerSize, edgeFactory); 74 debugGraphTransformations = false; 75 minBaseQualityToUseInAssembly = 0; 76 } 77 78 /** 79 * Create a new ReadThreadingAssembler using kmerSize for matching 80 * 81 * @param kmerSize must be >= 1 82 */ AbstractReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases)83 public AbstractReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases) { 84 super(kmerSize, new MyEdgeFactory(numPruningSamples)); 85 86 Utils.validateArg(kmerSize > 0, () -> "bad minkKmerSize " + kmerSize); 87 88 this.debugGraphTransformations = debugGraphTransformations; 89 this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; 90 this.minMatchingBasesToDangingEndRecovery = numDanglingMatchingPrefixBases; 91 } 92 93 /** 94 * Checks whether a kmer can be the threading start based on the current threading start location policy. 95 * 96 * @param kmer the query kmer. 97 * @return {@code true} if we can start thread the sequence at this kmer, {@code false} otherwise. 98 * @see #setThreadingStartOnlyAtExistingVertex(boolean) 99 */ isThreadingStart(final Kmer kmer, final boolean startThreadingOnlyAtExistingVertex)100 protected abstract boolean isThreadingStart(final Kmer kmer, final boolean startThreadingOnlyAtExistingVertex); 101 102 // get the next kmerVertex for ChainExtension and validate if necessary. 
getNextKmerVertexForChainExtension(final Kmer kmer, final boolean isRef, final MultiDeBruijnVertex prevVertex)103 protected abstract MultiDeBruijnVertex getNextKmerVertexForChainExtension(final Kmer kmer, final boolean isRef, final MultiDeBruijnVertex prevVertex); 104 105 // perform any necessary preprocessing on the graph (such as non-unique kmer determination) before the graph is constructed preprocessReads()106 protected abstract void preprocessReads(); 107 108 // heuristic to decide if the graph should be thown away based on low complexity/poor assembly isLowQualityGraph()109 public abstract boolean isLowQualityGraph(); 110 111 // whether reads are needed after graph construction shouldRemoveReadsAfterGraphConstruction()112 protected abstract boolean shouldRemoveReadsAfterGraphConstruction(); 113 114 // Method that will be called immediately before haplotype finding in the event there are alteations that must be made to the graph based on implementation postProcessForHaplotypeFinding(File debugGraphOutputPath, Locatable refHaplotype)115 public abstract void postProcessForHaplotypeFinding(File debugGraphOutputPath, Locatable refHaplotype); 116 117 /** 118 * Define the behavior for how the graph should keep track of a potentially new kmer. 
119 * 120 * @param kmer (potentially) new kmer to track 121 * @param newVertex corresponding vertex for that kmer 122 */ trackKmer(Kmer kmer, MultiDeBruijnVertex newVertex)123 protected abstract void trackKmer(Kmer kmer, MultiDeBruijnVertex newVertex); 124 125 /** 126 * Determine whether the provided cigar is okay to merge into the reference path 127 * 128 * @param cigar the cigar to analyze 129 * @param requireFirstElementM if true, require that the first cigar element be an M operator in order for it to be okay 130 * @param requireLastElementM if true, require that the last cigar element be an M operator in order for it to be okay 131 * @return true if it's okay to merge, false otherwise 132 */ 133 @VisibleForTesting cigarIsOkayToMerge(final Cigar cigar, final boolean requireFirstElementM, final boolean requireLastElementM)134 static boolean cigarIsOkayToMerge(final Cigar cigar, final boolean requireFirstElementM, final boolean requireLastElementM) { 135 final List<CigarElement> elements = cigar.getCigarElements(); 136 final int numElements = elements.size(); 137 138 // don't allow more than a couple of different ops 139 if (numElements == 0 || numElements > MAX_CIGAR_COMPLEXITY) { 140 return false; 141 } 142 143 // the first element must be an M 144 if (requireFirstElementM && elements.get(0).getOperator() != CigarOperator.M) { 145 return false; 146 } 147 148 // the last element must be an M 149 if (requireLastElementM && elements.get(numElements - 1).getOperator() != CigarOperator.M) { 150 return false; 151 } 152 153 // note that there are checks for too many mismatches in the dangling branch later in the process 154 155 return true; 156 } 157 158 /** 159 * calculates the longest suffix match between a sequence and a smaller kmer 160 * 161 * @param seq the (reference) sequence 162 * @param kmer the smaller kmer sequence 163 * @param seqStart the index (inclusive) on seq to start looking backwards from 164 * @return the longest matching suffix 165 */ 166 
@VisibleForTesting longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart)167 static int longestSuffixMatch(final byte[] seq, final byte[] kmer, final int seqStart) { 168 for (int len = 1; len <= kmer.length; len++) { 169 final int seqI = seqStart - len + 1; 170 final int kmerI = kmer.length - len; 171 if (seqI < 0 || seq[seqI] != kmer[kmerI]) { 172 return len - 1; 173 } 174 } 175 return kmer.length; 176 } 177 178 @VisibleForTesting setAlreadyBuilt()179 protected void setAlreadyBuilt() { 180 alreadyBuilt = true; 181 } 182 183 @VisibleForTesting setMinMatchingBasesToDangingEndRecovery(final int minMatchingBasesToDangingEndRecovery)184 void setMinMatchingBasesToDangingEndRecovery(final int minMatchingBasesToDangingEndRecovery) { 185 this.minMatchingBasesToDangingEndRecovery = minMatchingBasesToDangingEndRecovery; 186 } 187 188 @VisibleForTesting setMaxMismatchesInDanglingHead(final int maxMismatchesInDanglingHead)189 void setMaxMismatchesInDanglingHead(final int maxMismatchesInDanglingHead) { 190 this.maxMismatchesInDanglingHead = maxMismatchesInDanglingHead; 191 } 192 193 /** 194 * Find vertex and its position in seqForKmers where we should start assembling seqForKmers 195 * 196 * @param seqForKmers the sequence we want to thread into the graph 197 * @return the position of the starting vertex in seqForKmer, or -1 if it cannot find one 198 */ findStart(final SequenceForKmers seqForKmers)199 protected int findStart(final SequenceForKmers seqForKmers) { 200 if (seqForKmers.isRef) { 201 return 0; 202 } 203 204 for (int i = seqForKmers.start; i < seqForKmers.stop - kmerSize; i++) { 205 final Kmer kmer1 = new Kmer(seqForKmers.sequence, i, kmerSize); 206 if (isThreadingStart(kmer1, startThreadingOnlyAtExistingVertex)) { 207 return i; 208 } 209 } 210 211 return -1; 212 } 213 214 /** 215 * Add the all bases in sequence to the graph 216 * 217 * @param sequence a non-null sequence 218 * @param isRef is this the reference sequence? 
     */
    @VisibleForTesting
    public final void addSequence(final byte[] sequence, final boolean isRef) {
        // convenience overload: unnamed sequence threaded with a default count of 1
        addSequence("anonymous", sequence, isRef);
    }

    /**
     * Add all bases in sequence to this graph
     *
     * @see #addSequence(String, String, byte[], int, int, int, boolean) for full information
     */
    public final void addSequence(final String seqName, final byte[] sequence, final boolean isRef) {
        // delegate with a representative count of 1
        addSequence(seqName, sequence, 1, isRef);
    }

    /**
     * Add all bases in sequence to this graph
     *
     * @see #addSequence(String, String, byte[], int, int, int, boolean) for full information
     */
    public final void addSequence(final String seqName, final byte[] sequence, final int count, final boolean isRef) {
        // delegate using the anonymous sample name and the full extent of the sequence
        addSequence(seqName, ANONYMOUS_SAMPLE, sequence, 0, sequence.length, count, isRef);
    }

    /**
     * Add bases in sequence to this graph
     *
     * @param seqName  a useful seqName for this read, for debugging purposes
     * @param sequence non-null sequence of bases
     * @param start    the first base offset in sequence that we should use for constructing the graph using this sequence, inclusive
     * @param stop     the last base offset in sequence that we should use for constructing the graph using this sequence, exclusive
     * @param count    the representative count of this sequence (to use as the weight)
     * @param isRef    is this the reference sequence.
     */
    protected void addSequence(final String seqName, final String sampleName, final byte[] sequence, final int start, final int stop, final int count, final boolean isRef) {
        // note that argument testing is taken care of in SequenceForKmers
        Utils.validate(!alreadyBuilt, "Attempting to add sequence to a graph that has already been built");

        // get the list of sequences for this sample, creating it on first use
        List<SequenceForKmers> sampleSequences = pending.computeIfAbsent(sampleName, s -> new LinkedList<>());

        // add the new sequence to the list of sequences for sample; actual threading happens in buildGraphIfNecessary()
        sampleSequences.add(new SequenceForKmers(seqName, sequence, start, stop, count, isRef));
    }

    /**
     * Thread sequence seqForKmers through the current graph, updating the graph as appropriate
     *
     * @param seqForKmers a non-null sequence
     */
    private void threadSequence(final SequenceForKmers seqForKmers) {
        // find the first offset where threading may begin; bail out if no legal start exists
        final int startPos = findStart(seqForKmers);
        if (startPos == -1) {
            return;
        }

        final MultiDeBruijnVertex startingVertex = getOrCreateKmerVertex(seqForKmers.sequence, startPos);

        // increase the counts of all edges incoming into the starting vertex supported by going back in sequence
        if (INCREASE_COUNTS_BACKWARDS) {
            increaseCountsInMatchedKmers(seqForKmers, startingVertex, startingVertex.getSequence(), kmerSize - 2);
        }

        if (debugGraphTransformations) {
            startingVertex.addRead(seqForKmers.name);
        }

        // keep track of information about the reference source; only one reference sequence may ever be threaded
        if (seqForKmers.isRef) {
            if (refSource != null) {
                throw new IllegalStateException("Found two refSources! prev: " + refSource + ", new: " + startingVertex);
            }
            referencePath = new ArrayList<>(seqForKmers.sequence.length - kmerSize);
            referencePath.add(startingVertex);
            refSource = new Kmer(seqForKmers.sequence, seqForKmers.start, kmerSize);
        }

        // loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate
        MultiDeBruijnVertex vertex = startingVertex;
        for (int i = startPos + 1; i <= seqForKmers.stop - kmerSize; i++) {
            vertex = extendChainByOne(vertex, seqForKmers.sequence, i, seqForKmers.count, seqForKmers.isRef);
            if (seqForKmers.isRef) {
                referencePath.add(vertex);
            }
            if (debugGraphTransformations) {
                vertex.addRead(seqForKmers.name);
            }
        }
        // freeze the reference path once the whole reference has been threaded
        if (seqForKmers.isRef) {
            referencePath = Collections.unmodifiableList(referencePath);
        }
    }

    /**
     * Changes the threading start location policy.
     *
     * @param value {@code true} if threading will start only at existing vertices in the graph, {@code false} if
     *              it can start at any unique kmer.
     */
    public final void setThreadingStartOnlyAtExistingVertex(final boolean value) {
        startThreadingOnlyAtExistingVertex = value;
    }

    /**
     * Build the read threaded assembly graph if it hasn't already been constructed from the sequences that have
     * been added to the graph.
325 */ buildGraphIfNecessary()326 public final void buildGraphIfNecessary() { 327 if (alreadyBuilt) { 328 return; 329 } 330 331 // Capture the set of non-unique kmers for the given kmer size (if applicable) 332 preprocessReads(); 333 334 if (DEBUG_NON_UNIQUE_CALC) { 335 ReadThreadingGraph.logger.info("using " + kmerSize + " kmer size for this assembly with the following non-uniques"); 336 } 337 338 // go through the pending sequences, and add them to the graph 339 for (final List<SequenceForKmers> sequencesForSample : pending.values()) { 340 for (final SequenceForKmers sequenceForKmers : sequencesForSample) { 341 threadSequence(sequenceForKmers); 342 if (WRITE_GRAPH) { 343 printGraph(new File("threading." + counter++ + '.' + sequenceForKmers.name.replace(" ", "_") + ".dot"), 0); 344 } 345 } 346 347 // flush the single sample edge values from the graph 348 for (final MultiSampleEdge e : edgeSet()) { 349 e.flushSingleSampleMultiplicity(); 350 } 351 } 352 353 // clear the pending reads pile to conserve memory 354 if (shouldRemoveReadsAfterGraphConstruction()) { 355 pending.clear(); 356 } 357 alreadyBuilt = true; 358 for (final MultiDeBruijnVertex v : kmerToVertexMap.values()) { 359 v.setAdditionalInfo(v.getAdditionalInfo() + '+'); 360 } 361 } 362 363 @Override removeVertex(final MultiDeBruijnVertex V)364 public boolean removeVertex(final MultiDeBruijnVertex V) { 365 final boolean result = super.removeVertex(V); 366 if (result) { 367 final byte[] sequence = V.getSequence(); 368 final Kmer kmer = new Kmer(sequence); 369 kmerToVertexMap.remove(kmer); 370 } 371 return result; 372 } 373 374 @Override removeSingletonOrphanVertices()375 public void removeSingletonOrphanVertices() { 376 // Run through the graph and clean up singular orphaned nodes 377 final Collection<MultiDeBruijnVertex> verticesToRemove = new LinkedList<>(); 378 for (final MultiDeBruijnVertex v : vertexSet()) { 379 if (inDegreeOf(v) == 0 && outDegreeOf(v) == 0) { 380 verticesToRemove.add(v); 381 } 382 } 383 
removeVertex(null); 384 removeAllVertices(verticesToRemove); 385 } 386 387 @VisibleForTesting setIncreaseCountsThroughBranches(final boolean increaseCountsThroughBranches)388 void setIncreaseCountsThroughBranches(final boolean increaseCountsThroughBranches) { 389 this.increaseCountsThroughBranches = increaseCountsThroughBranches; 390 } 391 392 /** 393 * Try to recover dangling tails 394 * 395 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor 396 * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it 397 * @param recoverAll recover even branches with forks 398 * @param aligner 399 */ recoverDanglingTails(final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner)400 public void recoverDanglingTails(final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner) { 401 Utils.validateArg(pruneFactor >= 0, () -> "pruneFactor must be non-negative but was " + pruneFactor); 402 Utils.validateArg(minDanglingBranchLength >= 0, () -> "minDanglingBranchLength must be non-negative but was " + minDanglingBranchLength); 403 404 if (!alreadyBuilt) { 405 throw new IllegalStateException("recoverDanglingTails requires the graph be already built"); 406 } 407 408 int attempted = 0; 409 int nRecovered = 0; 410 for (final MultiDeBruijnVertex v : vertexSet()) { 411 if (outDegreeOf(v) == 0 && !isRefSink(v)) { 412 attempted++; 413 nRecovered += recoverDanglingTail(v, pruneFactor, minDanglingBranchLength, recoverAll, aligner); 414 } 415 } 416 417 ReadThreadingGraph.logger.debug(String.format("Recovered %d of %d dangling tails", nRecovered, attempted)); 418 } 419 420 /** 421 * Try to recover dangling heads 422 * 423 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor 424 * @param minDanglingBranchLength the minimum length 
of a dangling branch for us to try to merge it 425 * @param recoverAll recover even branches with forks 426 * @param aligner 427 */ recoverDanglingHeads(final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner)428 public void recoverDanglingHeads(final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner) { 429 Utils.validateArg(pruneFactor >= 0, () -> "pruneFactor must be non-negative but was " + pruneFactor); 430 Utils.validateArg(minDanglingBranchLength >= 0, () -> "minDanglingBranchLength must be non-negative but was " + minDanglingBranchLength); 431 if (!alreadyBuilt) { 432 throw new IllegalStateException("recoverDanglingHeads requires the graph be already built"); 433 } 434 435 // we need to build a list of dangling heads because that process can modify the graph (and otherwise generate 436 // a ConcurrentModificationException if we do it while iterating over the vertexes) 437 final Collection<MultiDeBruijnVertex> danglingHeads = vertexSet().stream() 438 .filter(v -> inDegreeOf(v) == 0 && !isRefSource(v)) 439 .collect(Collectors.toList()); 440 441 // now we can try to recover the dangling heads 442 int attempted = 0; 443 int nRecovered = 0; 444 for (final MultiDeBruijnVertex v : danglingHeads) { 445 attempted++; 446 nRecovered += recoverDanglingHead(v, pruneFactor, minDanglingBranchLength, recoverAll, aligner); 447 } 448 449 ReadThreadingGraph.logger.debug(String.format("Recovered %d of %d dangling heads", nRecovered, attempted)); 450 } 451 452 /** 453 * Attempt to attach vertex with out-degree == 0 to the graph 454 * 455 * @param vertex the vertex to recover 456 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor 457 * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it 458 * @param aligner 459 * @return 1 if we successfully recovered the 
vertex and 0 otherwise 460 */ recoverDanglingTail(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner)461 private int recoverDanglingTail(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner) { 462 if (outDegreeOf(vertex) != 0) { 463 throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0"); 464 } 465 466 // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths 467 final DanglingChainMergeHelper danglingTailMergeResult = generateCigarAgainstDownwardsReferencePath(vertex, pruneFactor, minDanglingBranchLength, recoverAll, aligner); 468 469 // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path 470 if (danglingTailMergeResult == null || !cigarIsOkayToMerge(danglingTailMergeResult.cigar, false, true)) { 471 return 0; 472 } 473 474 // merge 475 return mergeDanglingTail(danglingTailMergeResult); 476 } 477 478 /** 479 * Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging. 480 * Requires that at a minimum there are at least #getMinMatchingBases() matches between the reference and the read 481 * at the end in order to emit an alignment offset. 
482 * 483 * @param cigarElements cigar elements corresponding to the alignment between path1 and path2 484 * @param path1 the first path 485 * @param path2 the second path 486 * @return an integer pair object where the key is the offset into path1 and the value is offset into path2 (both -1 if no path is found) 487 */ bestPrefixMatch(final List<CigarElement> cigarElements, final byte[] path1, final byte[] path2)488 private Pair<Integer, Integer> bestPrefixMatch(final List<CigarElement> cigarElements, final byte[] path1, final byte[] path2) { 489 final int minMismatchingBases = getMinMatchingBases(); 490 491 int refIdx = cigarElements.stream().mapToInt(ce -> ce.getOperator().consumesReferenceBases()? ce.getLength() : 0).sum() - 1; 492 int readIdx = path2.length - 1; 493 494 // NOTE: this only works when the last cigar element has a sufficient number of M bases, so no indels within min-mismatches of the edge. 495 cigarLoop: 496 for (final CigarElement ce : Lists.reverse(cigarElements)) { 497 if (!(ce.getOperator().consumesReadBases() && ce.getOperator().consumesReferenceBases())) { 498 break; 499 } 500 for (int j = 0; j < ce.getLength(); j++, refIdx--, readIdx--) { 501 if (path1[refIdx] != path2[readIdx]) { 502 break cigarLoop; 503 } 504 } 505 } 506 507 final int matches = path2.length - 1 - readIdx; 508 return matches < minMismatchingBases ? 
Pair.of(-1,-1) : Pair.of(refIdx, readIdx); 509 } 510 511 /** 512 * The minimum number of matches to be considered allowable for recovering dangling ends 513 */ getMinMatchingBases()514 private int getMinMatchingBases() { 515 return minMatchingBasesToDangingEndRecovery; 516 } 517 518 /** 519 * Attempt to attach vertex with in-degree == 0, or a vertex on its path, to the graph 520 * 521 * @param vertex the vertex to recover 522 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor 523 * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it 524 * @param recoverAll recover even branches with forks 525 * @param aligner 526 * @return 1 if we successfully recovered a vertex and 0 otherwise 527 */ recoverDanglingHead(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner)528 private int recoverDanglingHead(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, final SmithWatermanAligner aligner) { 529 if (inDegreeOf(vertex) != 0) { 530 throw new IllegalStateException("Attempting to recover a dangling head for " + vertex + " but it has in-degree > 0"); 531 } 532 533 // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths 534 final DanglingChainMergeHelper danglingHeadMergeResult = generateCigarAgainstUpwardsReferencePath(vertex, pruneFactor, minDanglingBranchLength, recoverAll, aligner); 535 536 // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path 537 if (danglingHeadMergeResult == null || !cigarIsOkayToMerge(danglingHeadMergeResult.cigar, true, false)) { 538 return 0; 539 } 540 541 // merge 542 return minMatchingBasesToDangingEndRecovery >= 0 ? 
                mergeDanglingHead(danglingHeadMergeResult) : mergeDanglingHeadLegacy(danglingHeadMergeResult);
    }

    /**
     * Actually merge the dangling tail if possible
     *
     * @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference
     * @return 1 if merge was successful, 0 otherwise
     */
    @VisibleForTesting
    final int mergeDanglingTail(final DanglingChainMergeHelper danglingTailMergeResult) {

        final List<CigarElement> elements = danglingTailMergeResult.cigar.getCigarElements();
        final CigarElement lastElement = elements.get(elements.size() - 1);
        Utils.validateArg(lastElement.getOperator() == CigarOperator.M, "The last Cigar element must be an M");

        // the merge point is set by the longest suffix of the dangling path that matches the reference,
        // capped by the length of the final M element of the alignment
        final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1;
        final int matchingSuffix = Math.min(longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength());
        // when a minimum-match threshold is configured (>= 0) enforce it; otherwise just require a non-empty match
        if (minMatchingBasesToDangingEndRecovery >= 0 ? matchingSuffix < minMatchingBasesToDangingEndRecovery : matchingSuffix == 0) {
            return 0;
        }

        final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0);

        // there is an important edge condition that we need to handle here: Smith-Waterman correctly calculates that there is a
        // deletion, that deletion is left-aligned such that the LCA node is part of that deletion, and the rest of the dangling
        // tail is a perfect match to the suffix of the reference path. In this case we need to push the reference index to merge
        // down one position so that we don't incorrectly cut a base off of the deletion.
        final boolean firstElementIsDeletion = elements.get(0).getOperator() == CigarOperator.D;
        final boolean mustHandleLeadingDeletionCase = firstElementIsDeletion && (elements.get(0).getLength() + matchingSuffix == lastRefIndex + 1);
        final int refIndexToMerge = lastRefIndex - matchingSuffix + 1 + (mustHandleLeadingDeletionCase ? 1 : 0);

        // another edge condition occurs here: if Smith-Waterman places the whole tail into an insertion then it will try to
        // merge back to the LCA, which results in a cycle in the graph. So we do not want to merge in such a case.
        if (refIndexToMerge == 0) {
            return 0;
        }

        // it's safe to merge now
        addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), ((MyEdgeFactory) getEdgeFactory()).createEdge(false, 1));

        return 1;
    }

    /**
     * Actually merge the dangling head if possible, this is the old codepath that does not handle indels
     *
     * @param danglingHeadMergeResult the result from generating a Cigar for the dangling head against the reference
     * @return 1 if merge was successful, 0 otherwise
     */
    @VisibleForTesting
    int mergeDanglingHeadLegacy(final DanglingChainMergeHelper danglingHeadMergeResult) {

        final List<CigarElement> elements = danglingHeadMergeResult.cigar.getCigarElements();
        final CigarElement firstElement = elements.get(0);
        Utils.validateArg(firstElement.getOperator() == CigarOperator.M, "The first Cigar element must be an M");

        // find how far into the two paths the prefixes match (no indel handling in this legacy code path)
        final int indexesToMerge = bestPrefixMatchLegacy(danglingHeadMergeResult.referencePathString, danglingHeadMergeResult.danglingPathString, firstElement.getLength());
        if (indexesToMerge <= 0) {
            return 0;
        }

        // we can't push back the reference path
        if (indexesToMerge >= danglingHeadMergeResult.referencePath.size() - 1) {
            return 0;
        }

        // but we can manipulate the dangling path if we need to
        if (indexesToMerge >= danglingHeadMergeResult.danglingPath.size() &&
                !extendDanglingPathAgainstReference(danglingHeadMergeResult, indexesToMerge - danglingHeadMergeResult.danglingPath.size() + 2, elements)) {
            return 0;
        }

        // connect the reference vertex just upstream of the merge point to the matching dangling vertex
        addEdge(danglingHeadMergeResult.referencePath.get(indexesToMerge + 1), danglingHeadMergeResult.danglingPath.get(indexesToMerge), ((MyEdgeFactory) getEdgeFactory()).createEdge(false, 1));

        return 1;
    }


    /**
     * Actually merge the dangling head if possible
     *
     * @param danglingHeadMergeResult the result from generating a Cigar for the dangling head against the reference
     * @return 1 if merge was successful, 0 otherwise
     */
    @VisibleForTesting
    int mergeDanglingHead(final DanglingChainMergeHelper danglingHeadMergeResult) {

        final List<CigarElement> elements = danglingHeadMergeResult.cigar.getCigarElements();
        final CigarElement firstElement = elements.get(0);
        Utils.validateArg( firstElement.getOperator() == CigarOperator.M, "The first Cigar element must be an M");

        // find the offsets into the reference (key) and dangling (value) paths where the prefixes best match
        final Pair<Integer, Integer> indexesToMerge = bestPrefixMatch(elements, danglingHeadMergeResult.referencePathString, danglingHeadMergeResult.danglingPathString);
        if ( indexesToMerge.getKey() <= 0 || indexesToMerge.getValue() <= 0 ) {
            return 0;
        }

        // we can't push back the reference path
        if ( indexesToMerge.getKey() >= danglingHeadMergeResult.referencePath.size() - 1 ) {
            return 0;
        }

        // but we can manipulate the dangling path if we need to
        if ( indexesToMerge.getValue() >= danglingHeadMergeResult.danglingPath.size() &&
                !extendDanglingPathAgainstReference(danglingHeadMergeResult, indexesToMerge.getValue() - danglingHeadMergeResult.danglingPath.size() + 2, elements) ) {
            return 0;
        }

        addEdge(danglingHeadMergeResult.referencePath.get(indexesToMerge.getKey() + 1), danglingHeadMergeResult.danglingPath.get(indexesToMerge.getValue()), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1));

        return 1;
    }

    /**
     * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
     * provided vertex is the sink) and the reference path.
     *
     * @param aligner     the Smith-Waterman aligner to use
     * @param vertex      the sink of the dangling chain
     * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
     * @param recoverAll  recover even branches with forks
     * @return a SmithWaterman object which can be null if no proper alignment could be generated
     */
    @VisibleForTesting
    DanglingChainMergeHelper generateCigarAgainstDownwardsReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, SmithWatermanAligner aligner) {
        final int minTailPathLength = Math.max(1, minDanglingBranchLength); // while heads can be 0, tails absolutely cannot

        // find the lowest common ancestor path between this vertex and the diverging master path if available
        final List<MultiDeBruijnVertex> altPath = findPathUpwardsToLowestCommonAncestor(vertex, pruneFactor, !recoverAll);
        if (altPath == null || isRefSource(altPath.get(0)) || altPath.size() < minTailPathLength + 1) // add 1 to include the LCA
        {
            return null;
        }

        // now get the reference path from the LCA
        final List<MultiDeBruijnVertex> refPath =
getReferencePath(altPath.get(0), TraversalDirection.downwards, Optional.ofNullable(getHeaviestIncomingEdge(altPath.get(1)))); 678 679 // create the Smith-Waterman strings to use 680 final byte[] refBases = getBasesForPath(refPath, false); 681 final byte[] altBases = getBasesForPath(altPath, false); 682 683 // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting) 684 final SmithWatermanAlignment alignment = aligner.align(refBases, altBases, SmithWatermanAligner.STANDARD_NGS, SWOverhangStrategy.LEADING_INDEL); 685 return new DanglingChainMergeHelper(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar())); 686 } 687 688 /** 689 * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the 690 * provided vertex is the source) and the reference path. 691 * 692 * @param aligner 693 * @param vertex the source of the dangling head 694 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor 695 * @param recoverAll recover even branches with forks 696 * @return a SmithWaterman object which can be null if no proper alignment could be generated 697 */ 698 @VisibleForTesting generateCigarAgainstUpwardsReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, SmithWatermanAligner aligner)699 DanglingChainMergeHelper generateCigarAgainstUpwardsReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor, final int minDanglingBranchLength, final boolean recoverAll, SmithWatermanAligner aligner) { 700 701 // find the highest common descendant path between vertex and the reference source if available 702 final List<MultiDeBruijnVertex> altPath = findPathDownwardsToHighestCommonDescendantOfReference(vertex, pruneFactor, !recoverAll); 703 if (altPath == null || isRefSink(altPath.get(0)) || altPath.size() < 
minDanglingBranchLength + 1) // add 1 to include the LCA 704 { 705 return null; 706 } 707 708 // now get the reference path from the LCA 709 final List<MultiDeBruijnVertex> refPath = getReferencePath(altPath.get(0), TraversalDirection.upwards, Optional.empty()); 710 711 // create the Smith-Waterman strings to use 712 final byte[] refBases = getBasesForPath(refPath, true); 713 final byte[] altBases = getBasesForPath(altPath, true); 714 715 // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting) 716 final SmithWatermanAlignment alignment = aligner.align(refBases, altBases, SmithWatermanAligner.STANDARD_NGS, SWOverhangStrategy.LEADING_INDEL); 717 return new DanglingChainMergeHelper(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar())); 718 } 719 720 /** 721 * Finds the path upwards in the graph from this vertex to the first diverging node, including that (lowest common ancestor) vertex. 722 * Note that nodes are excluded if their pruning weight is less than the pruning factor. 723 * 724 * @param vertex the original vertex 725 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor 726 * @param giveUpAtBranch stop trying to find a path if a vertex with multiple incoming or outgoing edge is found 727 * @return the path if it can be determined or null if this vertex either doesn't merge onto another path or 728 * has an ancestor with multiple incoming edges before hitting the reference path 729 */ findPathUpwardsToLowestCommonAncestor(final MultiDeBruijnVertex vertex, final int pruneFactor, final boolean giveUpAtBranch)730 private List<MultiDeBruijnVertex> findPathUpwardsToLowestCommonAncestor(final MultiDeBruijnVertex vertex, final int pruneFactor, final boolean giveUpAtBranch) { 731 return giveUpAtBranch ? 
findPath(vertex, pruneFactor, v -> inDegreeOf(v) != 1 || outDegreeOf(v) >= 2, v -> outDegreeOf(v) > 1, v -> incomingEdgeOf(v), e -> getEdgeSource(e)) 732 : findPath(vertex, pruneFactor, v -> hasIncidentRefEdge(v) || inDegreeOf(v) == 0, v -> outDegreeOf(v) > 1 && hasIncidentRefEdge(v), this::getHeaviestIncomingEdge, e -> getEdgeSource(e)); 733 } 734 hasIncidentRefEdge(final MultiDeBruijnVertex v)735 private boolean hasIncidentRefEdge(final MultiDeBruijnVertex v) { 736 for (final MultiSampleEdge edge : incomingEdgesOf(v)) { 737 if (edge.isRef()) { 738 return true; 739 } 740 } 741 return false; 742 } 743 getHeaviestIncomingEdge(final MultiDeBruijnVertex v)744 private MultiSampleEdge getHeaviestIncomingEdge(final MultiDeBruijnVertex v) { 745 final Set<MultiSampleEdge> incomingEdges = incomingEdgesOf(v); 746 return incomingEdges.size() == 1 ? incomingEdges.iterator().next() : 747 incomingEdges.stream().max(Comparator.comparingInt(MultiSampleEdge::getMultiplicity)).get(); 748 } 749 750 /** 751 * Finds the path downwards in the graph from this vertex to the reference sequence, including the highest common descendant vertex. 752 * However note that the path is reversed so that this vertex ends up at the end of the path. 753 * Also note that nodes are excluded if their pruning weight is less than the pruning factor. 
754 * 755 * @param vertex the original vertex 756 * @param pruneFactor the prune factor to use in ignoring chain pieces 757 * @param giveUpAtBranch stop trying to find a path if a vertex with multiple incoming or outgoing edge is found 758 * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or 759 * has a descendant with multiple outgoing edges before hitting the reference path 760 */ findPathDownwardsToHighestCommonDescendantOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor, final boolean giveUpAtBranch)761 private List<MultiDeBruijnVertex> findPathDownwardsToHighestCommonDescendantOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor, final boolean giveUpAtBranch) { 762 return giveUpAtBranch ? findPath(vertex, pruneFactor, v -> isReferenceNode(v) || outDegreeOf(v) != 1, v -> isReferenceNode(v), v -> outgoingEdgeOf(v), e -> getEdgeTarget(e)) 763 : findPath(vertex, pruneFactor, v -> isReferenceNode(v) || outDegreeOf(v) == 0, v -> isReferenceNode(v), this::getHeaviestOutgoingEdge, e -> getEdgeTarget(e)); 764 } 765 getHeaviestOutgoingEdge(final MultiDeBruijnVertex v)766 private MultiSampleEdge getHeaviestOutgoingEdge(final MultiDeBruijnVertex v) { 767 final Set<MultiSampleEdge> outgoing = outgoingEdgesOf(v); 768 return outgoing.size() == 1 ? 
outgoing.iterator().next() : 769 outgoing.stream().max(Comparator.comparingInt(MultiSampleEdge::getMultiplicity)).get(); 770 } 771 772 /** 773 * Finds a path starting from a given vertex and satisfying various predicates 774 * 775 * @param vertex the original vertex 776 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor 777 * @param done test for whether a vertex is at the end of the path 778 * @param returnPath test for whether to return a found path based on its terminal vertex 779 * @param nextEdge function on vertices returning the next edge in the path 780 * @param nextNode function of edges returning the next vertex in the path 781 * @return a path, if one satisfying all predicates is found, {@code null} otherwise 782 */ findPath(final MultiDeBruijnVertex vertex, final int pruneFactor, final Predicate<MultiDeBruijnVertex> done, final Predicate<MultiDeBruijnVertex> returnPath, final Function<MultiDeBruijnVertex, MultiSampleEdge> nextEdge, final Function<MultiSampleEdge, MultiDeBruijnVertex> nextNode)783 private List<MultiDeBruijnVertex> findPath(final MultiDeBruijnVertex vertex, final int pruneFactor, 784 final Predicate<MultiDeBruijnVertex> done, 785 final Predicate<MultiDeBruijnVertex> returnPath, 786 final Function<MultiDeBruijnVertex, MultiSampleEdge> nextEdge, 787 final Function<MultiSampleEdge, MultiDeBruijnVertex> nextNode) { 788 final LinkedList<MultiDeBruijnVertex> path = new LinkedList<>(); 789 final Set<MultiDeBruijnVertex> visitedNodes = new HashSet<>(); // This code is necessary to 790 791 MultiDeBruijnVertex v = vertex; 792 while (!done.test(v)) { 793 final MultiSampleEdge edge = nextEdge.apply(v); 794 // if it has too low a weight, don't use it (or previous vertexes) for the path 795 if (edge.getPruningMultiplicity() < pruneFactor) { 796 // save the previously visited notes to protect us from riding forever 'neath the streets of boston 797 visitedNodes.addAll(path); 798 path.clear(); 799 } 
else { 800 path.addFirst(v); 801 } 802 v = nextNode.apply(edge); 803 // Check that we aren't stuck in a loop 804 if (path.contains(v) || visitedNodes.contains(v)) { 805 System.err.println("Dangling End recovery killed because of a loop (findPath)"); 806 return null; 807 } 808 } 809 path.addFirst(v); 810 811 return returnPath.test(v) ? path : null; 812 } 813 814 /** 815 * Finds the path in the graph from this vertex to the reference sink, including this vertex 816 * 817 * @param start the reference vertex to start from 818 * @param direction describes which direction to move in the graph (i.e. down to the reference sink or up to the source) 819 * @param blacklistedEdge edge to ignore in the traversal down; useful to exclude the non-reference dangling paths 820 * @return the path (non-null, non-empty) 821 */ getReferencePath(final MultiDeBruijnVertex start, final TraversalDirection direction, final Optional<MultiSampleEdge> blacklistedEdge)822 protected List<MultiDeBruijnVertex> getReferencePath(final MultiDeBruijnVertex start, 823 final TraversalDirection direction, 824 final Optional<MultiSampleEdge> blacklistedEdge) { 825 826 final List<MultiDeBruijnVertex> path = new ArrayList<>(); 827 828 MultiDeBruijnVertex v = start; 829 while (v != null) { 830 path.add(v); 831 v = (direction == TraversalDirection.downwards ? getNextReferenceVertex(v, true, blacklistedEdge) : getPrevReferenceVertex(v)); 832 } 833 834 return path; 835 } 836 837 /** 838 * The base sequence for the given path. 
839 * 840 * @param path the list of vertexes that make up the path 841 * @param expandSource if true and if we encounter a source node, then expand (and reverse) the character sequence for that node 842 * @return non-null sequence of bases corresponding to the given path 843 */ 844 @VisibleForTesting getBasesForPath(final List<MultiDeBruijnVertex> path, final boolean expandSource)845 byte[] getBasesForPath(final List<MultiDeBruijnVertex> path, final boolean expandSource) { 846 Utils.nonNull(path, "Path cannot be null"); 847 848 final StringBuilder sb = new StringBuilder(); 849 for (final MultiDeBruijnVertex v : path) { 850 if (expandSource && isSource(v)) { 851 final String seq = v.getSequenceString(); 852 sb.append(new StringBuilder(seq).reverse().toString()); 853 } else { 854 sb.append((char) v.getSuffix()); 855 } 856 } 857 858 return sb.toString().getBytes(); 859 } 860 861 /** 862 * Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging. 863 * Assumes that path1.length >= maxIndex and path2.length >= maxIndex. 
864 * 865 * @param path1 the first path 866 * @param path2 the second path 867 * @param maxIndex the maximum index to traverse (not inclusive) 868 * @return the index of the ideal prefix match or -1 if it cannot find one, must be less than maxIndex 869 */ bestPrefixMatchLegacy(final byte[] path1, final byte[] path2, final int maxIndex)870 private int bestPrefixMatchLegacy(final byte[] path1, final byte[] path2, final int maxIndex) { 871 final int maxMismatches = getMaxMismatchesLegacy(maxIndex); 872 int mismatches = 0; 873 int index = 0; 874 int lastGoodIndex = -1; 875 while (index < maxIndex) { 876 if (path1[index] != path2[index]) { 877 if (++mismatches > maxMismatches) { 878 return -1; 879 } 880 lastGoodIndex = index; 881 } 882 index++; 883 } 884 // if we got here then we hit the max index 885 return lastGoodIndex; 886 } 887 888 /** 889 * NOTE: this method is only used for dangling heads and not tails. 890 * 891 * Determine the maximum number of mismatches permitted on the branch. 892 * Unless it's preset (e.g. by unit tests) it should be the length of the branch divided by the kmer size. 893 * 894 * @param lengthOfDanglingBranch the length of the branch itself 895 * @return positive integer 896 */ getMaxMismatchesLegacy(final int lengthOfDanglingBranch)897 private int getMaxMismatchesLegacy(final int lengthOfDanglingBranch) { 898 return maxMismatchesInDanglingHead > 0 ? 
maxMismatchesInDanglingHead : Math.max(1, (lengthOfDanglingBranch / kmerSize)); 899 } 900 extendDanglingPathAgainstReference(final DanglingChainMergeHelper danglingHeadMergeResult, final int numNodesToExtend, List<CigarElement> elements)901 private boolean extendDanglingPathAgainstReference(final DanglingChainMergeHelper danglingHeadMergeResult, final int numNodesToExtend, List<CigarElement> elements) { 902 903 final int indexOfLastDanglingNode = danglingHeadMergeResult.danglingPath.size() - 1; 904 final int offsetForRefEndToDanglingEnd = elements.stream().mapToInt(ce -> (ce.getOperator().consumesReferenceBases()? ce.getLength() : 0) - (ce.getOperator().consumesReadBases()? ce.getLength() : 0)).sum(); 905 final int indexOfRefNodeToUse = indexOfLastDanglingNode + offsetForRefEndToDanglingEnd + numNodesToExtend; 906 if (indexOfRefNodeToUse >= danglingHeadMergeResult.referencePath.size()) { 907 return false; 908 } 909 910 final MultiDeBruijnVertex danglingSource = danglingHeadMergeResult.danglingPath.remove(indexOfLastDanglingNode); 911 final StringBuilder sb = new StringBuilder(); 912 final byte[] refSourceSequence = danglingHeadMergeResult.referencePath.get(indexOfRefNodeToUse).getSequence(); 913 for (int i = 0; i < numNodesToExtend; i++) { 914 sb.append((char) refSourceSequence[i]); 915 } 916 sb.append(danglingSource.getSequenceString()); 917 final byte[] sequenceToExtend = sb.toString().getBytes(); 918 919 // clean up the source and edge 920 final MultiSampleEdge sourceEdge = getHeaviestOutgoingEdge(danglingSource); 921 MultiDeBruijnVertex prevV = getEdgeTarget(sourceEdge); 922 removeEdge(danglingSource, prevV); 923 924 // extend the path 925 for (int i = numNodesToExtend; i > 0; i--) { 926 final MultiDeBruijnVertex newV = new MultiDeBruijnVertex(Arrays.copyOfRange(sequenceToExtend, i, i + kmerSize)); 927 addVertex(newV); 928 final MultiSampleEdge newE = addEdge(newV, prevV); 929 newE.setMultiplicity(sourceEdge.getMultiplicity()); 930 
danglingHeadMergeResult.danglingPath.add(newV); 931 prevV = newV; 932 } 933 934 return true; 935 } 936 increaseCountsInMatchedKmers(final SequenceForKmers seqForKmers, final MultiDeBruijnVertex vertex, final byte[] originalKmer, final int offset)937 private void increaseCountsInMatchedKmers(final SequenceForKmers seqForKmers, 938 final MultiDeBruijnVertex vertex, 939 final byte[] originalKmer, 940 final int offset) { 941 if (offset == -1) { 942 return; 943 } 944 945 for (final MultiSampleEdge edge : incomingEdgesOf(vertex)) { 946 final MultiDeBruijnVertex prev = getEdgeSource(edge); 947 final byte suffix = prev.getSuffix(); 948 final byte seqBase = originalKmer[offset]; 949 if (suffix == seqBase && (increaseCountsThroughBranches || inDegreeOf(vertex) == 1)) { 950 edge.incMultiplicity(seqForKmers.count); 951 increaseCountsInMatchedKmers(seqForKmers, prev, originalKmer, offset - 1); 952 } 953 } 954 } 955 956 /** 957 * Get the vertex for the kmer in sequence starting at start 958 * 959 * @param sequence the sequence 960 * @param start the position of the kmer start 961 * @return a non-null vertex 962 */ getOrCreateKmerVertex(final byte[] sequence, final int start)963 private MultiDeBruijnVertex getOrCreateKmerVertex(final byte[] sequence, final int start) { 964 final Kmer kmer = new Kmer(sequence, start, kmerSize); 965 final MultiDeBruijnVertex vertex = getKmerVertex(kmer, true); 966 return (vertex != null) ? vertex : createVertex(kmer); 967 } 968 969 /** 970 * Get the unique vertex for kmer, or null if not possible. 
971 * 972 * @param allowRefSource if true, we will allow kmer to match the reference source vertex 973 * @return a vertex for kmer, or null (either because it doesn't exist or is non-unique for graphs that have such a distinction) 974 */ getKmerVertex(final Kmer kmer, final boolean allowRefSource)975 protected MultiDeBruijnVertex getKmerVertex(final Kmer kmer, final boolean allowRefSource) { 976 if (!allowRefSource && kmer.equals(refSource)) { 977 return null; 978 } 979 980 return kmerToVertexMap.get(kmer); 981 } 982 983 /** 984 * Create a new vertex for kmer. Add it to the kmerToVertexMap map if appropriate. 985 * 986 * @param kmer the kmer we want to create a vertex for 987 * @return the non-null created vertex 988 */ createVertex(final Kmer kmer)989 private MultiDeBruijnVertex createVertex(final Kmer kmer) { 990 final MultiDeBruijnVertex newVertex = new MultiDeBruijnVertex(kmer.bases()); 991 final int prevSize = vertexSet().size(); 992 addVertex(newVertex); 993 994 // make sure we aren't adding duplicates (would be a bug) 995 if (vertexSet().size() != prevSize + 1) { 996 throw new IllegalStateException("Adding vertex " + newVertex + " to graph didn't increase the graph size"); 997 } 998 trackKmer(kmer, newVertex); 999 1000 return newVertex; 1001 } 1002 1003 /** 1004 * Workhorse routine of the assembler. Given a sequence whose last vertex is anchored in the graph, extend 1005 * the graph one bp according to the bases in sequence. 1006 * 1007 * @param prevVertex a non-null vertex where sequence was last anchored in the graph 1008 * @param sequence the sequence we're threading through the graph 1009 * @param kmerStart the start of the current kmer in graph we'd like to add 1010 * @param count the number of observations of this kmer in graph (can be > 1 for GGA) 1011 * @param isRef is this the reference sequence? 
1012 * @return a non-null vertex connecting prevVertex to in the graph based on sequence 1013 */ extendChainByOne(final MultiDeBruijnVertex prevVertex, final byte[] sequence, final int kmerStart, final int count, final boolean isRef)1014 protected MultiDeBruijnVertex extendChainByOne(final MultiDeBruijnVertex prevVertex, final byte[] sequence, final int kmerStart, final int count, final boolean isRef) { 1015 final Set<MultiSampleEdge> outgoingEdges = outgoingEdgesOf(prevVertex); 1016 1017 final int nextPos = kmerStart + kmerSize - 1; 1018 for (final MultiSampleEdge outgoingEdge : outgoingEdges) { 1019 final MultiDeBruijnVertex target = getEdgeTarget(outgoingEdge); 1020 if (target.getSuffix() == sequence[nextPos]) { 1021 // we've got a match in the chain, so simply increase the count of the edge by 1 and continue 1022 outgoingEdge.incMultiplicity(count); 1023 return target; 1024 } 1025 } 1026 1027 // none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in 1028 final Kmer kmer = new Kmer(sequence, kmerStart, kmerSize); 1029 final MultiDeBruijnVertex mergeVertex = getNextKmerVertexForChainExtension(kmer, isRef, prevVertex); 1030 1031 // either use our merge vertex, or create a new one in the chain 1032 final MultiDeBruijnVertex nextVertex = mergeVertex == null ? createVertex(kmer) : mergeVertex; 1033 addEdge(prevVertex, nextVertex, ((MyEdgeFactory) getEdgeFactory()).createEdge(isRef, count)); 1034 return nextVertex; 1035 } 1036 1037 /** 1038 * Add a read to the sequence graph. Finds maximal consecutive runs of bases with sufficient quality 1039 * and applies {@see addSequence} to these subreads if they are longer than the kmer size. 
1040 * 1041 * @param read a non-null read 1042 */ 1043 @VisibleForTesting addRead(final GATKRead read, final SAMFileHeader header)1044 void addRead(final GATKRead read, final SAMFileHeader header) { 1045 final byte[] sequence = read.getBases(); 1046 final byte[] qualities = read.getBaseQualities(); 1047 1048 int lastGood = -1; 1049 for (int end = 0; end <= sequence.length; end++) { 1050 if (end == sequence.length || !baseIsUsableForAssembly(sequence[end], qualities[end])) { 1051 // the first good base is at lastGood, can be -1 if last base was bad 1052 final int start = lastGood; 1053 // the stop base is end - 1 (if we're not at the end of the sequence) 1054 final int len = end - start; 1055 1056 if (start != -1 && len >= kmerSize) { 1057 // if the sequence is long enough to get some value out of, add it to the graph 1058 final String name = read.getName() + '_' + start + '_' + end; 1059 addSequence(name, ReadUtils.getSampleName(read, header), sequence, start, end, 1, false); 1060 } 1061 1062 lastGood = -1; // reset the last good base 1063 } else if (lastGood == -1) { 1064 lastGood = end; // we're at a good base, the last good one is us 1065 } 1066 } 1067 } 1068 1069 /** 1070 * Determines whether a base can safely be used for assembly. 
1071 * Currently disallows Ns and/or those with low quality 1072 * 1073 * @param base the base under consideration 1074 * @param qual the quality of that base 1075 * @return true if the base can be used for assembly, false otherwise 1076 */ baseIsUsableForAssembly(final byte base, final byte qual)1077 protected boolean baseIsUsableForAssembly(final byte base, final byte qual) { 1078 return base != BaseUtils.Base.N.base && qual >= minBaseQualityToUseInAssembly; 1079 } 1080 1081 @Override findKmer(final Kmer k)1082 public MultiDeBruijnVertex findKmer(final Kmer k) { 1083 return kmerToVertexMap.get(k); 1084 } 1085 1086 /** 1087 * Edge factory that encapsulates the numPruningSamples assembly parameter 1088 */ 1089 protected static final class MyEdgeFactory implements EdgeFactory<MultiDeBruijnVertex, MultiSampleEdge> { 1090 final int numPruningSamples; 1091 MyEdgeFactory(final int numPruningSamples)1092 MyEdgeFactory(final int numPruningSamples) { 1093 this.numPruningSamples = numPruningSamples; 1094 } 1095 1096 @Override createEdge(final MultiDeBruijnVertex sourceVertex, final MultiDeBruijnVertex targetVertex)1097 public MultiSampleEdge createEdge(final MultiDeBruijnVertex sourceVertex, final MultiDeBruijnVertex targetVertex) { 1098 return new MultiSampleEdge(false, 1, numPruningSamples); 1099 } 1100 createEdge(final boolean isRef, final int multiplicity)1101 MultiSampleEdge createEdge(final boolean isRef, final int multiplicity) { 1102 return new MultiSampleEdge(isRef, multiplicity, numPruningSamples); 1103 } 1104 } 1105 1106 /** 1107 * Class to keep track of the important dangling chain merging data 1108 */ 1109 static final class DanglingChainMergeHelper { 1110 final List<MultiDeBruijnVertex> danglingPath; 1111 final List<MultiDeBruijnVertex> referencePath; 1112 final byte[] danglingPathString; 1113 final byte[] referencePathString; 1114 final Cigar cigar; 1115 DanglingChainMergeHelper(final List<MultiDeBruijnVertex> danglingPath, final List<MultiDeBruijnVertex> 
referencePath, final byte[] danglingPathString, final byte[] referencePathString, final Cigar cigar)1116 DanglingChainMergeHelper(final List<MultiDeBruijnVertex> danglingPath, 1117 final List<MultiDeBruijnVertex> referencePath, 1118 final byte[] danglingPathString, 1119 final byte[] referencePathString, 1120 final Cigar cigar) { 1121 this.danglingPath = danglingPath; 1122 this.referencePath = referencePath; 1123 this.danglingPathString = danglingPathString; 1124 this.referencePathString = referencePathString; 1125 this.cigar = cigar; 1126 } 1127 } 1128 1129 /** 1130 * Keeps track of the information needed to add a sequence to the read threading assembly graph 1131 */ 1132 static final class SequenceForKmers { 1133 final String name; 1134 final byte[] sequence; 1135 final int start; 1136 final int stop; 1137 final int count; 1138 final boolean isRef; 1139 1140 /** 1141 * Create a new sequence for creating kmers 1142 */ SequenceForKmers(final String name, final byte[] sequence, final int start, final int stop, final int count, final boolean ref)1143 SequenceForKmers(final String name, final byte[] sequence, final int start, final int stop, final int count, final boolean ref) { 1144 Utils.nonNull(sequence, "Sequence is null "); 1145 Utils.validateArg(start >= 0, () -> "Invalid start " + start); 1146 Utils.validateArg(stop >= start, () -> "Invalid stop " + stop); 1147 Utils.validateArg(count > 0, "Invalid count " + count); 1148 1149 this.name = name; 1150 this.sequence = sequence; 1151 this.start = start; 1152 this.stop = stop; 1153 this.count = count; 1154 isRef = ref; 1155 } 1156 } 1157 } 1158