1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import htsjdk.samtools.util.Locatable; 5 import org.apache.logging.log4j.LogManager; 6 import org.apache.logging.log4j.Logger; 7 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.Kmer; 8 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.MultiSampleEdge; 9 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.SeqGraph; 10 import org.broadinstitute.hellbender.utils.BaseUtils; 11 import org.broadinstitute.hellbender.utils.Utils; 12 import org.jgrapht.EdgeFactory; 13 14 import java.io.File; 15 import java.util.*; 16 import java.util.stream.Collectors; 17 18 /** 19 * Note: not final but only intended to be subclassed for testing. 20 */ 21 public class ReadThreadingGraph extends AbstractReadThreadingGraph { 22 23 protected static final Logger logger = LogManager.getLogger(ReadThreadingGraph.class); 24 25 private static final long serialVersionUID = 1l; 26 27 /** 28 * A set of non-unique kmers that cannot be used as merge points in the graph 29 */ 30 protected Set<Kmer> nonUniqueKmers; 31 32 /** 33 * Constructs an empty read-threading-grpah provided the kmerSize. 34 * @param kmerSize 1 or greater. 35 * 36 * @throws IllegalArgumentException if (@code kmerSize) < 1. 37 */ ReadThreadingGraph(final int kmerSize)38 public ReadThreadingGraph(final int kmerSize) { 39 this(kmerSize, false, (byte)6, 1, -1); 40 } 41 42 /** 43 * Constructs a read-threading-graph for a string representation. 44 * 45 * <p> 46 * Note: only used for testing. 47 * </p> 48 */ 49 @VisibleForTesting ReadThreadingGraph(final int kmerSizeFromString, final EdgeFactory<MultiDeBruijnVertex, MultiSampleEdge> edgeFactory)50 protected ReadThreadingGraph(final int kmerSizeFromString, final EdgeFactory<MultiDeBruijnVertex, MultiSampleEdge> edgeFactory) { 51 super(kmerSizeFromString, new MyEdgeFactory(1)); 52 nonUniqueKmers = null; 53 } 54 55 /** 56 * Create a new ReadThreadingAssembler using kmerSize for matching 57 * @param kmerSize must be >= 1 58 */ ReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases)59 ReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases) { 60 super(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples, numDanglingMatchingPrefixBases); 61 nonUniqueKmers = null; 62 } 63 64 /** 65 * Since we want to duplicate non-unique kmers in the graph code we must determine what those kmers are 66 */ 67 @Override preprocessReads()68 protected void preprocessReads() { 69 nonUniqueKmers = determineNonUniques(kmerSize, getAllPendingSequences()); 70 } 71 72 @Override shouldRemoveReadsAfterGraphConstruction()73 protected boolean shouldRemoveReadsAfterGraphConstruction() { 74 return true; 75 } 76 77 /** 78 * Does the graph not have enough complexity? We define low complexity as a situation where the number 79 * of non-unique kmers is more than 20% of the total number of kmers. 80 * 81 * @return true if the graph has low complexity, false otherwise 82 */ 83 @Override isLowQualityGraph()84 public boolean isLowQualityGraph() { 85 return nonUniqueKmers.size() * 4 > kmerToVertexMap.size(); 86 } 87 88 // only add the new kmer to the map if it exists and isn't in our non-unique kmer list 89 @Override trackKmer(final Kmer kmer, final MultiDeBruijnVertex newVertex)90 protected void trackKmer(final Kmer kmer, final MultiDeBruijnVertex newVertex) { 91 if ( ! nonUniqueKmers.contains(kmer) && ! kmerToVertexMap.containsKey(kmer) ) { 92 kmerToVertexMap.put(kmer, newVertex); 93 } 94 } 95 96 @Override clone()97 public ReadThreadingGraph clone() { 98 return (ReadThreadingGraph) super.clone(); 99 } 100 101 /** 102 * Checks whether a kmer can be the threading start based on the current threading start location policy. 103 * 104 * @see #setThreadingStartOnlyAtExistingVertex(boolean) 105 * 106 * @param kmer the query kmer. 107 * @return {@code true} if we can start thread the sequence at this kmer, {@code false} otherwise. 108 */ isThreadingStart(final Kmer kmer, final boolean startThreadingOnlyAtExistingVertex)109 protected boolean isThreadingStart(final Kmer kmer, final boolean startThreadingOnlyAtExistingVertex) { 110 Utils.nonNull(kmer); 111 return startThreadingOnlyAtExistingVertex ? kmerToVertexMap.containsKey(kmer) : !nonUniqueKmers.contains(kmer); 112 } 113 114 /** 115 * Compute the smallest kmer size >= minKmerSize and <= maxKmerSize that has no non-unique kmers 116 * among all sequences added to the current graph. Will always return a result for maxKmerSize if 117 * all smaller kmers had non-unique kmers. 118 * 119 * @param kmerSize the kmer size to check for non-unique kmers of 120 * @return a non-null NonUniqueResult 121 */ determineNonUniques(final int kmerSize, Collection<SequenceForKmers> withNonUniques)122 private static Set<Kmer> determineNonUniques(final int kmerSize, Collection<SequenceForKmers> withNonUniques) { 123 final Set<Kmer> nonUniqueKmers = new HashSet<>(); 124 125 // loop over all sequences that have non-unique kmers in them from the previous iterator 126 final Iterator<SequenceForKmers> it = withNonUniques.iterator(); 127 while ( it.hasNext() ) { 128 final SequenceForKmers sequenceForKmers = it.next(); 129 130 // determine the non-unique kmers for this sequence 131 final Collection<Kmer> nonUniquesFromSeq = determineNonUniqueKmers(sequenceForKmers, kmerSize); 132 if ( nonUniquesFromSeq.isEmpty() ) { 133 // remove this sequence from future consideration 134 it.remove(); 135 } else { 136 // keep track of the non-uniques for this kmerSize, and keep it in the list of sequences that have non-uniques 137 nonUniqueKmers.addAll(nonUniquesFromSeq); 138 } 139 } 140 141 return nonUniqueKmers; 142 } 143 144 /** 145 * Get the collection of all sequences for kmers across all samples in no particular order 146 * @return non-null Collection 147 */ getAllPendingSequences()148 private Collection<SequenceForKmers> getAllPendingSequences() { 149 return pending.values().stream().flatMap(oneSampleWorth -> oneSampleWorth.stream()).collect(Collectors.toList()); 150 } 151 152 /** 153 * Get the collection of non-unique kmers from sequence for kmer size kmerSize 154 * @param seqForKmers a sequence to get kmers from 155 * @param kmerSize the size of the kmers 156 * @return a non-null collection of non-unique kmers in sequence 157 */ determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize)158 static Collection<Kmer> determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) { 159 // count up occurrences of kmers within each read 160 final Set<Kmer> allKmers = new LinkedHashSet<>(); 161 final List<Kmer> nonUniqueKmers = new ArrayList<>(); 162 final int stopPosition = seqForKmers.stop - kmerSize; 163 for (int i = 0; i <= stopPosition; i++) { 164 final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize); 165 if (!allKmers.add(kmer)) { 166 nonUniqueKmers.add(kmer); 167 } 168 } 169 return nonUniqueKmers; 170 } 171 172 @Override toSequenceGraph()173 public SeqGraph toSequenceGraph() { 174 buildGraphIfNecessary(); 175 return super.toSequenceGraph(); 176 } 177 178 /** 179 * Get the set of non-unique kmers in this graph. For debugging purposes 180 * @return a non-null set of kmers 181 */ 182 @VisibleForTesting getNonUniqueKmers()183 Set<Kmer> getNonUniqueKmers() { 184 return nonUniqueKmers; 185 } 186 187 @Override getNextKmerVertexForChainExtension(final Kmer kmer, final boolean isRef, final MultiDeBruijnVertex prevVertex)188 protected MultiDeBruijnVertex getNextKmerVertexForChainExtension(final Kmer kmer, final boolean isRef, final MultiDeBruijnVertex prevVertex) { 189 final MultiDeBruijnVertex uniqueMergeVertex = getKmerVertex(kmer, false); 190 191 Utils.validate(!(isRef && uniqueMergeVertex != null), "Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + uniqueMergeVertex); 192 193 return uniqueMergeVertex; 194 } 195 196 @Override postProcessForHaplotypeFinding(final File debugGraphOutputPath, final Locatable refHaplotype)197 public void postProcessForHaplotypeFinding(final File debugGraphOutputPath, final Locatable refHaplotype) { 198 return; // There is no processing required for this graph so simply return 199 } 200 201 @Override toString()202 public String toString() { 203 return "ReadThreadingAssembler{kmerSize=" + kmerSize + '}'; 204 } 205 206 }