1 package org.broadinstitute.hellbender.testutils; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.MultiSampleEdge; 5 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; 6 import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingGraph; 7 import org.broadinstitute.hellbender.utils.Utils; 8 9 import java.util.*; 10 import java.util.regex.Matcher; 11 import java.util.regex.Pattern; 12 13 public final class TestingReadThreadingGraph extends ReadThreadingGraph { 14 /************************************************************* 15 * Simple string representation support for testing purposes * 16 *************************************************************/ 17 18 private static final Pattern PROPERTIES_PATTERN = Pattern.compile("^\\s*\\[[^\\]]*\\]"); 19 private static final Pattern PATH_PATTERN = Pattern.compile("\\{((\\S+):)?([^\\}]*)\\}"); 20 private static final Pattern KMERSIZE_EXTRACTOR_PATTERN = Pattern.compile("^\\s*\\[[^\\]]*(ks|kmerSize)\\s*=\\s*(\\d+)\\s*[,\\]]"); 21 private static final long serialVersionUID = 1l; 22 23 24 /** 25 * Constructs a read-threading-graph for a string representation. 26 * 27 * <p> 28 * Note: only used for testing. 29 * Checkout {@see HaplotypeGraphUnitTest} for examples. 30 * </p> 31 * @param s the string representation of the graph {@code null}. 32 */ TestingReadThreadingGraph(final String s)33 public TestingReadThreadingGraph(final String s) { 34 super(kmerSizeFromString(s)); 35 applyString(s); 36 setAlreadyBuilt(); 37 } 38 39 /** 40 * Obtain the kmer size for the string representation. 41 * @param str the source string representation. 42 * @return 1 or greater. 43 * @throws IllegalArgumentException if {@code} str does not contain a valid representation. 44 */ kmerSizeFromString(final String str)45 private static int kmerSizeFromString(final String str) { 46 final Matcher matcher = KMERSIZE_EXTRACTOR_PATTERN.matcher(str); 47 if (matcher.find()) { 48 return Integer.parseInt(matcher.group(2)); 49 } else { 50 throw new IllegalArgumentException("the input graph spec does not indicate the kmerSize"); 51 } 52 } 53 54 /** 55 * Apply description string into the graph. 56 * 57 * <p> 58 * Note: this is done just for testing purposes. 59 * Checkout {@see HaplotypeGraphUnitTest} for examples. 60 * </p> 61 * @param str the string representation. 62 */ applyString(final String str)63 private void applyString(final String str) { 64 final Matcher propertiesSectionMatcher = PROPERTIES_PATTERN.matcher(str); 65 final int pathStart = propertiesSectionMatcher.find() ? propertiesSectionMatcher.end() : 0; 66 67 final String pathString = str.substring(pathStart); 68 final Matcher pathMatcher = PATH_PATTERN.matcher(pathString); 69 70 boolean referenceFound = false; 71 final Map<String, MultiDeBruijnVertex> vertexById = new HashMap<>(); 72 73 // Loop between path strings and add them one by one. 74 while (pathMatcher.find()) { 75 final String label = pathMatcher.group(2); 76 final boolean isReference = "REF".equals(label); 77 if (referenceFound) { 78 Utils.validateArg(!isReference, "there are two reference paths"); 79 } else if ( isReference ) { 80 referenceFound = true; 81 } 82 83 // Divide each path into its elements getting a list of sequences and labels if applies: 84 final String elementsString = pathMatcher.group(3); 85 final String[] elements = elementsString.split("\\s*->\\s*"); 86 Utils.validateArg(elements.length > 0, "empty path not allowed"); 87 final String[] seqs = new String[elements.length]; 88 final String[] ids = new String[elements.length]; 89 for (int i = 0; i < elements.length; i++) { 90 ids[i] = pathElementId(elements[i]); 91 seqs[i] = pathElementSeq(elements[i]); 92 Utils.validateArg(!(seqs[i].isEmpty() && ids[i] == null), "path with empty element without an id"); 93 } 94 final boolean isSource = ids[0] == null || !vertexById.containsKey(ids[0]); 95 if (isSource && seqs[0].length() != kmerSize) { 96 throw new IllegalArgumentException("source sequence length must be the same as the kmerSize " 97 + ids[0] + ' ' + seqs[0] + ' ' + pathMatcher.group()); 98 } 99 final MultiDeBruijnVertex firstVertex; 100 if (ids[0] != null && vertexById.containsKey(ids[0])) { 101 firstVertex = vertexById.get(ids[0]); 102 } else { 103 firstVertex = new MultiDeBruijnVertex(seqs[0].getBytes()); 104 addVertex(firstVertex); 105 if (ids[0] != null) { 106 vertexById.put(ids[0], firstVertex); 107 } 108 } 109 if (!seqs[0].isEmpty() && 110 ((isSource && !firstVertex.getSequenceString().equals(seqs[0])) 111 || (!isSource && firstVertex.getSuffix() != seqs[0].getBytes()[0]))) { 112 throw new IllegalArgumentException("mismatched first element sequence"); 113 } 114 115 MultiDeBruijnVertex lastVertex = firstVertex; 116 for (int i = 1; i < elements.length; i++) { 117 //TODO: code and comment disagree 118 Utils.validateArg(seqs[i].length() <= 1, "non-source vertex sequence must have length 1"); 119 final MultiDeBruijnVertex nextVertex; 120 if (ids[i] == null || !vertexById.containsKey(ids[i])) { 121 final Set<MultiDeBruijnVertex> nextVertices = getNextVertices(lastVertex,seqs[i].getBytes()[0]); 122 if (nextVertices.isEmpty()) { 123 nextVertex = new MultiDeBruijnVertex(extendSequence(lastVertex.getSequence(),seqs[i].getBytes()[0])); 124 addVertex(nextVertex); 125 } else { 126 nextVertex = nextVertices.iterator().next(); 127 } 128 if (ids[i] != null) { 129 vertexById.put(ids[i], nextVertex); 130 } 131 } else { 132 nextVertex = vertexById.get(ids[i]); 133 } 134 final MultiSampleEdge edge = addEdge(lastVertex,nextVertex); 135 if (isReference) { 136 edge.setIsRef(true); 137 } 138 lastVertex = nextVertex; 139 } 140 } 141 } 142 143 /** 144 * Return the collection of outgoing vertices that expand this vertex with a particular base. 145 * 146 * @param v original vertex. 147 * @param b expanding base. 148 * @return never null, but perhaps an empty set. You cannot assume that you can modify the result. 149 */ 150 @VisibleForTesting getNextVertices(final MultiDeBruijnVertex v, final byte b)151 Set<MultiDeBruijnVertex> getNextVertices(final MultiDeBruijnVertex v, final byte b) { 152 Utils.nonNull(v, "the input vertex cannot be null"); 153 Utils.validateArg(vertexSet().contains(v), "the vertex must be present in the graph"); 154 final List<MultiDeBruijnVertex> result = new LinkedList<>(); 155 for (final MultiDeBruijnVertex w : outgoingVerticesOf(v)) { 156 if (w.getSuffix() == b) { 157 result.add(w); 158 } 159 } 160 switch (result.size()) { 161 case 0: return Collections.emptySet(); 162 case 1: return Collections.singleton(result.get(0)); 163 default: 164 return new HashSet<>(result); 165 } 166 } 167 pathElementId(final String element)168 private static String pathElementId(final String element) { 169 final int openBracketPosition = element.indexOf('('); 170 171 if (openBracketPosition == -1) { 172 return null; 173 } 174 175 final int closeBracketPosition = element.lastIndexOf(')'); 176 Utils.validateArg(closeBracketPosition != -1, () -> "non-closed id parantesys found in element: " + element); 177 final String result = element.substring(openBracketPosition + 1,closeBracketPosition).trim(); 178 Utils.validateArg(!result.isEmpty(), () -> "empty id found in element: " + element); 179 return result; 180 } 181 182 /** 183 * Returns the lenght of a path element in the string representation. 184 * @param element the query element. 185 * @return 0 or greater. 186 */ pathElementSeq(final String element)187 private static String pathElementSeq(final String element) { 188 final int parentesysPos = element.indexOf('('); 189 190 if (parentesysPos == -1) { 191 return element.trim(); 192 } 193 194 return element.substring(0,parentesysPos).trim(); 195 } 196 197 /** 198 * Add a base to the end of a byte sequence. 199 * @param sequence sequence where to add the base to. 200 * @param b base to add. 201 * @return never {@code null}, a new array each time. 202 */ extendSequence(final byte[] sequence, final byte b)203 private static byte[] extendSequence(final byte[] sequence, final byte b) { 204 final byte[] result = new byte[sequence.length]; 205 System.arraycopy(sequence, 1, result, 0, sequence.length - 1); 206 result[result.length - 1] = b; 207 return result; 208 } 209 210 @Override clone()211 public TestingReadThreadingGraph clone() { 212 return (TestingReadThreadingGraph) super.clone(); 213 } 214 } 215