1 package org.broadinstitute.hellbender.tools.spark.sv.utils; 2 3 import htsjdk.samtools.*; 4 import htsjdk.variant.variantcontext.VariantContext; 5 import htsjdk.variant.vcf.VCFConstants; 6 import org.broadinstitute.hellbender.exceptions.UserException; 7 import org.broadinstitute.hellbender.tools.spark.utils.HopscotchSet; 8 import org.broadinstitute.hellbender.tools.spark.utils.LongIterator; 9 import org.broadinstitute.hellbender.utils.SimpleInterval; 10 import org.broadinstitute.hellbender.utils.Utils; 11 import org.broadinstitute.hellbender.utils.io.IOUtils; 12 13 import javax.annotation.Nonnull; 14 import java.io.IOException; 15 import java.math.BigInteger; 16 import java.nio.file.Files; 17 import java.util.*; 18 import java.util.function.Predicate; 19 import java.util.stream.Collector; 20 import java.util.stream.Collectors; 21 import java.util.stream.Stream; 22 23 /** 24 * Useful scraps of this and that. 25 */ 26 public final class SVUtils { 27 28 public static final String GATKSV_CONTIG_ALIGNMENTS_READ_GROUP_ID = "GATKSVContigAlignments"; 29 getSampleId(final SAMFileHeader header)30 public static String getSampleId(final SAMFileHeader header) { 31 final List<SAMReadGroupRecord> readGroups = header.getReadGroups(); 32 final Set<String> sampleSet = readGroups.stream().map(SAMReadGroupRecord::getSample).collect(Collectors.toSet()); 33 34 Utils.validate(sampleSet.size() == 1, 35 "Read groups must contain reads from one and only one sample, " + 36 "but we are finding the following ones in the given header: \t" + sampleSet.toString()); 37 38 final String sample = sampleSet.iterator().next(); 39 return sample; 40 } 41 42 /** 43 * Given {@code sortOrder}, provide appropriate comparator. 44 * Currently only support coordinate or query-name order, 45 * and throws UserException if other values are specified. 46 */ getSamRecordComparator(final SAMFileHeader.SortOrder sortOrder)47 public static SAMRecordComparator getSamRecordComparator(final SAMFileHeader.SortOrder sortOrder) { 48 final SAMRecordComparator samRecordComparator; 49 switch (sortOrder) { 50 case coordinate: 51 samRecordComparator = new SAMRecordCoordinateComparator(); 52 break; 53 case queryname: 54 samRecordComparator = new SAMRecordQueryNameComparator(); 55 break; 56 default: 57 throw new UserException("Unsupported assembly alignment sort order specified"); 58 } 59 return samRecordComparator; 60 } 61 62 /** 63 * todo: this should be fixed in a new major version of htsjdk 64 * this exist because for whatever reason, 65 * VC.getAttributeAsStringList() sometimes returns a giant single string, while using 66 * VC.getAttributeAsString() gives back an array..... 67 */ getAttributeAsStringList(final VariantContext vc, final String attributeKey)68 public static List<String> getAttributeAsStringList(final VariantContext vc, final String attributeKey) { 69 return getAttributeAsStringStream(vc, attributeKey) 70 .collect(Collectors.toList()); 71 } 72 getAttributeAsStringStream(final VariantContext vc, final String attributeKey)73 public static Stream<String> getAttributeAsStringStream(final VariantContext vc, final String attributeKey) { 74 if ( ! vc.hasAttribute(attributeKey) ) 75 return Stream.empty(); 76 return vc.getAttributeAsStringList(attributeKey, "").stream() 77 .flatMap(s -> { 78 if ( s.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR) ) { 79 return Arrays.stream( s.split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR) ); 80 } else { 81 return Stream.of(s); 82 } 83 }); 84 } 85 86 /** 87 * Given chromosome name and requested position (1-based coordinate system), 88 * return a 1 bp long interval that spans only the requested base. 89 */ 90 public static SimpleInterval makeOneBpInterval(final String chr, final int pos) { 91 return new SimpleInterval(chr, pos, pos); 92 } 93 94 /** 95 * Canonical chromosomes are defined, for homo sapiens, as chromosomes 1-22, chromosome X and Y. 96 */ 97 public static Set<String> getCanonicalChromosomes(final String nonCanonicalContigNamesFile, 98 @Nonnull final SAMSequenceDictionary dictionary) { 99 final LinkedHashSet<String> allContigs = Utils.nonNull(dictionary).getSequences().stream().map(SAMSequenceRecord::getSequenceName) 100 .collect(Collectors.toCollection(LinkedHashSet::new)); 101 if (nonCanonicalContigNamesFile == null) 102 return allContigs; 103 104 try (final Stream<String> nonCanonical = Files.lines(IOUtils.getPath(( Utils.nonNull(nonCanonicalContigNamesFile) )))) { 105 nonCanonical.forEach(allContigs::remove); 106 return allContigs; 107 } catch ( final IOException ioe ) { 108 throw new UserException("Can't read nonCanonicalContigNamesFile file "+nonCanonicalContigNamesFile, ioe); 109 } 110 } 111 112 // ================================================================================================================= 113 114 /** return a good initialCapacity for a HashMap that will hold a given number of elements */ 115 public static int hashMapCapacity( final int nElements ) 116 { 117 return (int)((nElements*4L)/3) + 1; 118 } 119 120 /** count the number of items available from an iterator */ 121 public static <T> int iteratorSize( final Iterator<T> itr ) { 122 int result = 0; 123 while ( itr.hasNext() ) { result += 1; itr.next(); } 124 return result; 125 } 126 127 public static int iteratorSize( final LongIterator itr ) { 128 int result = 0; 129 while ( itr.hasNext() ) { result += 1; itr.next(); } 130 return result; 131 } 132 133 public static <T> Iterator<T> singletonIterator( final T t ) { 134 return Collections.singletonList(t).iterator(); 135 } 136 137 public static Collection<SVKmer> uniquify(final Collection<SVKmer> coll1, final Collection<SVKmer> coll2) { 138 Utils.nonNull(coll1, "first collection of kmers is null"); 139 Utils.nonNull(coll2, "second collection of kmers is null"); 140 141 final HopscotchSet<SVKmer> kmers = new HopscotchSet<>(coll1.size() + coll2.size()); 142 kmers.addAll(coll1); 143 kmers.addAll(coll2); 144 return kmers; 145 } 146 147 /** Concatenate two lists. */ 148 public static <T extends Object> List<T> concatenateLists(final List<T> list1, final List<T> list2) { 149 Utils.validateArg(list1.getClass().equals(list2.getClass()), "Lists to be concatenated are of different classes"); 150 151 final List<T> result = new ArrayList<T>(list1.size() + list2.size()); 152 result.addAll(list1); 153 result.addAll(list2); 154 return result; 155 } 156 157 public static class IteratorFilter<T> implements Iterator<T> { 158 private final Iterator<T> itr; 159 private final Predicate<T> predicate; 160 private T obj; 161 162 public IteratorFilter( final Iterator<T> itr, final Predicate<T> predicate ) { 163 this.itr = itr; 164 this.predicate = predicate; 165 advance(); 166 } 167 168 @Override public boolean hasNext() { return obj != null; } 169 170 @Override 171 public T next() { 172 if ( !hasNext() ) { 173 throw new NoSuchElementException("IteratorFilter is exhausted."); 174 } 175 final T result = obj; 176 advance(); 177 return result; 178 } 179 180 private void advance() { 181 obj = null; 182 while ( itr.hasNext() ) { 183 final T next = itr.next(); 184 if ( predicate.test(next) ) { 185 obj = next; 186 break; 187 } 188 } 189 } 190 } 191 192 /** 193 * Provides a stream collector that will collect items into an array list with a given initial capacity. 194 */ 195 public static <T> Collector<T, ?, ArrayList<T>> arrayListCollector(final int size) { 196 return Collectors.toCollection( () -> new ArrayList<>(size)); 197 } 198 199 public static <T extends Enum<T>> EnumMap<T, Long> getZeroInitializedEnumMap(final Class<T> clazz) { 200 EnumMap<T, Long> map = new EnumMap<>(clazz); 201 for (final T t : clazz.getEnumConstants()) { 202 map.put(t, 0L); 203 } 204 return map; 205 } 206 207 // ================================================================================================================= 208 209 //Workaround for seed 14695981039346656037 that doesn't fit in a signed long 210 private static final long FNV64_DEFAULT_SEED = new BigInteger("14695981039346656037").longValue(); 211 212 /** 213 * 64-bit FNV-1a hash for long's 214 */ 215 216 public static long fnvLong64( final long toHash ) { 217 return fnvLong64(FNV64_DEFAULT_SEED, toHash); 218 } 219 220 public static long fnvLong64( long start, final long toHash ) { 221 final long mult = 1099511628211L; 222 start ^= (toHash >> 56) & 0xffL; 223 start *= mult; 224 start ^= (toHash >> 48) & 0xffL; 225 start *= mult; 226 start ^= (toHash >> 40) & 0xffL; 227 start *= mult; 228 start ^= (toHash >> 32) & 0xffL; 229 start *= mult; 230 start ^= (toHash >> 24) & 0xffL; 231 start *= mult; 232 start ^= (toHash >> 16) & 0xffL; 233 start *= mult; 234 start ^= (toHash >> 8) & 0xffL; 235 start *= mult; 236 start ^= toHash & 0xffL; 237 start *= mult; 238 return start; 239 } 240 241 /** 242 * 64-bit FNV-1a hash for byte arrays 243 */ 244 public static long fnvByteArray64(final byte[] toHash) { 245 // TODO: this is a mistake: the constant should be the FNV64_DEFAULT_SEED, but it's the multiplier instead. 246 return fnvByteArray64(1099511628211L, toHash); 247 } 248 249 public static long fnvByteArray64(long start, final byte[] toHash) { 250 for (int i = 0; i < toHash.length; i += 8) { 251 long val = 0; 252 for (int j = 0; j < 8 && i + j < toHash.length; j++) { 253 val = (val << 8) | toHash[i + j]; 254 } 255 start = fnvLong64(start, val); 256 } 257 return start; 258 } 259 260 } 261