1 package org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import htsjdk.samtools.util.Locatable;
5 import org.apache.logging.log4j.LogManager;
6 import org.apache.logging.log4j.Logger;
7 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.Kmer;
8 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.MultiSampleEdge;
9 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.SeqGraph;
10 import org.broadinstitute.hellbender.utils.BaseUtils;
11 import org.broadinstitute.hellbender.utils.Utils;
12 import org.jgrapht.EdgeFactory;
13 
14 import java.io.File;
15 import java.util.*;
16 import java.util.stream.Collectors;
17 
18 /**
19  * Note: not final but only intended to be subclassed for testing.
20  */
21 public class ReadThreadingGraph extends AbstractReadThreadingGraph {
22 
23     protected static final Logger logger = LogManager.getLogger(ReadThreadingGraph.class);
24 
25     private static final long serialVersionUID = 1l;
26 
27     /**
28      * A set of non-unique kmers that cannot be used as merge points in the graph
29      */
30     protected Set<Kmer> nonUniqueKmers;
31 
32     /**
33      * Constructs an empty read-threading-grpah provided the kmerSize.
34      * @param kmerSize 1 or greater.
35      *
36      * @throws IllegalArgumentException if (@code kmerSize) < 1.
37      */
ReadThreadingGraph(final int kmerSize)38     public ReadThreadingGraph(final int kmerSize) {
39         this(kmerSize, false, (byte)6, 1, -1);
40     }
41 
42     /**
43      * Constructs a read-threading-graph for a string representation.
44      *
45      * <p>
46      *     Note: only used for testing.
47      * </p>
48      */
49     @VisibleForTesting
ReadThreadingGraph(final int kmerSizeFromString, final EdgeFactory<MultiDeBruijnVertex, MultiSampleEdge> edgeFactory)50     protected ReadThreadingGraph(final int kmerSizeFromString, final EdgeFactory<MultiDeBruijnVertex, MultiSampleEdge> edgeFactory) {
51         super(kmerSizeFromString, new MyEdgeFactory(1));
52         nonUniqueKmers = null;
53     }
54 
55     /**
56      * Create a new ReadThreadingAssembler using kmerSize for matching
57      * @param kmerSize must be >= 1
58      */
ReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases)59     ReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases) {
60         super(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples, numDanglingMatchingPrefixBases);
61         nonUniqueKmers = null;
62     }
63 
64     /**
65      * Since we want to duplicate non-unique kmers in the graph code we must determine what those kmers are
66      */
67     @Override
preprocessReads()68     protected void preprocessReads() {
69         nonUniqueKmers = determineNonUniques(kmerSize, getAllPendingSequences());
70     }
71 
72     @Override
shouldRemoveReadsAfterGraphConstruction()73     protected boolean shouldRemoveReadsAfterGraphConstruction() {
74         return true;
75     }
76 
77     /**
78      * Does the graph not have enough complexity?  We define low complexity as a situation where the number
79      * of non-unique kmers is more than 20% of the total number of kmers.
80      *
81      * @return true if the graph has low complexity, false otherwise
82      */
83     @Override
isLowQualityGraph()84     public boolean isLowQualityGraph() {
85         return nonUniqueKmers.size() * 4 > kmerToVertexMap.size();
86     }
87 
88     // only add the new kmer to the map if it exists and isn't in our non-unique kmer list
89     @Override
trackKmer(final Kmer kmer, final MultiDeBruijnVertex newVertex)90     protected void trackKmer(final Kmer kmer, final MultiDeBruijnVertex newVertex) {
91         if ( ! nonUniqueKmers.contains(kmer) && ! kmerToVertexMap.containsKey(kmer) ) {
92             kmerToVertexMap.put(kmer, newVertex);
93         }
94     }
95 
96     @Override
clone()97     public ReadThreadingGraph clone() {
98         return (ReadThreadingGraph) super.clone();
99     }
100 
101     /**
102      * Checks whether a kmer can be the threading start based on the current threading start location policy.
103      *
104      * @see #setThreadingStartOnlyAtExistingVertex(boolean)
105      *
106      * @param kmer the query kmer.
107      * @return {@code true} if we can start thread the sequence at this kmer, {@code false} otherwise.
108      */
isThreadingStart(final Kmer kmer, final boolean startThreadingOnlyAtExistingVertex)109     protected boolean isThreadingStart(final Kmer kmer, final boolean startThreadingOnlyAtExistingVertex) {
110         Utils.nonNull(kmer);
111         return startThreadingOnlyAtExistingVertex ? kmerToVertexMap.containsKey(kmer) : !nonUniqueKmers.contains(kmer);
112     }
113 
114     /**
115      * Compute the smallest kmer size >= minKmerSize and <= maxKmerSize that has no non-unique kmers
116      * among all sequences added to the current graph.  Will always return a result for maxKmerSize if
117      * all smaller kmers had non-unique kmers.
118      *
119      * @param kmerSize the kmer size to check for non-unique kmers of
120      * @return a non-null NonUniqueResult
121      */
determineNonUniques(final int kmerSize, Collection<SequenceForKmers> withNonUniques)122     private static Set<Kmer> determineNonUniques(final int kmerSize, Collection<SequenceForKmers> withNonUniques) {
123         final Set<Kmer> nonUniqueKmers = new HashSet<>();
124 
125         // loop over all sequences that have non-unique kmers in them from the previous iterator
126         final Iterator<SequenceForKmers> it = withNonUniques.iterator();
127         while ( it.hasNext() ) {
128             final SequenceForKmers sequenceForKmers = it.next();
129 
130             // determine the non-unique kmers for this sequence
131             final Collection<Kmer> nonUniquesFromSeq = determineNonUniqueKmers(sequenceForKmers, kmerSize);
132             if ( nonUniquesFromSeq.isEmpty() ) {
133                 // remove this sequence from future consideration
134                 it.remove();
135             } else {
136                 // keep track of the non-uniques for this kmerSize, and keep it in the list of sequences that have non-uniques
137                 nonUniqueKmers.addAll(nonUniquesFromSeq);
138             }
139         }
140 
141         return nonUniqueKmers;
142     }
143 
144     /**
145      * Get the collection of all sequences for kmers across all samples in no particular order
146      * @return non-null Collection
147      */
getAllPendingSequences()148     private Collection<SequenceForKmers> getAllPendingSequences() {
149         return pending.values().stream().flatMap(oneSampleWorth -> oneSampleWorth.stream()).collect(Collectors.toList());
150     }
151 
152     /**
153      * Get the collection of non-unique kmers from sequence for kmer size kmerSize
154      * @param seqForKmers a sequence to get kmers from
155      * @param kmerSize the size of the kmers
156      * @return a non-null collection of non-unique kmers in sequence
157      */
determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize)158     static Collection<Kmer> determineNonUniqueKmers(final SequenceForKmers seqForKmers, final int kmerSize) {
159         // count up occurrences of kmers within each read
160         final Set<Kmer> allKmers = new LinkedHashSet<>();
161         final List<Kmer> nonUniqueKmers = new ArrayList<>();
162         final int stopPosition = seqForKmers.stop - kmerSize;
163         for (int i = 0; i <= stopPosition; i++) {
164             final Kmer kmer = new Kmer(seqForKmers.sequence, i, kmerSize);
165             if (!allKmers.add(kmer)) {
166                 nonUniqueKmers.add(kmer);
167             }
168         }
169         return nonUniqueKmers;
170     }
171 
172     @Override
toSequenceGraph()173     public SeqGraph toSequenceGraph() {
174         buildGraphIfNecessary();
175         return super.toSequenceGraph();
176     }
177 
178     /**
179      * Get the set of non-unique kmers in this graph.  For debugging purposes
180      * @return a non-null set of kmers
181      */
182     @VisibleForTesting
getNonUniqueKmers()183     Set<Kmer> getNonUniqueKmers() {
184         return nonUniqueKmers;
185     }
186 
187     @Override
getNextKmerVertexForChainExtension(final Kmer kmer, final boolean isRef, final MultiDeBruijnVertex prevVertex)188     protected MultiDeBruijnVertex getNextKmerVertexForChainExtension(final Kmer kmer, final boolean isRef, final MultiDeBruijnVertex prevVertex) {
189         final MultiDeBruijnVertex uniqueMergeVertex = getKmerVertex(kmer, false);
190 
191         Utils.validate(!(isRef && uniqueMergeVertex != null), "Found a unique vertex to merge into the reference graph " + prevVertex + " -> " + uniqueMergeVertex);
192 
193         return uniqueMergeVertex;
194     }
195 
196     @Override
postProcessForHaplotypeFinding(final File debugGraphOutputPath, final Locatable refHaplotype)197     public void postProcessForHaplotypeFinding(final File debugGraphOutputPath, final Locatable refHaplotype) {
198         return; // There is no processing required for this graph so simply return
199     }
200 
201     @Override
toString()202     public String toString() {
203         return "ReadThreadingAssembler{kmerSize=" + kmerSize + '}';
204     }
205 
206 }