1 package org.broadinstitute.hellbender.tools.spark.sv.utils;
2 
3 import htsjdk.samtools.*;
4 import htsjdk.variant.variantcontext.VariantContext;
5 import htsjdk.variant.vcf.VCFConstants;
6 import org.broadinstitute.hellbender.exceptions.UserException;
7 import org.broadinstitute.hellbender.tools.spark.utils.HopscotchSet;
8 import org.broadinstitute.hellbender.tools.spark.utils.LongIterator;
9 import org.broadinstitute.hellbender.utils.SimpleInterval;
10 import org.broadinstitute.hellbender.utils.Utils;
11 import org.broadinstitute.hellbender.utils.io.IOUtils;
12 
13 import javax.annotation.Nonnull;
14 import java.io.IOException;
15 import java.math.BigInteger;
16 import java.nio.file.Files;
17 import java.util.*;
18 import java.util.function.Predicate;
19 import java.util.stream.Collector;
20 import java.util.stream.Collectors;
21 import java.util.stream.Stream;
22 
23 /**
24  * Useful scraps of this and that.
25  */
26 public final class SVUtils {
27 
28     public static final String GATKSV_CONTIG_ALIGNMENTS_READ_GROUP_ID = "GATKSVContigAlignments";
29 
getSampleId(final SAMFileHeader header)30     public static String getSampleId(final SAMFileHeader header) {
31         final List<SAMReadGroupRecord> readGroups = header.getReadGroups();
32         final Set<String> sampleSet = readGroups.stream().map(SAMReadGroupRecord::getSample).collect(Collectors.toSet());
33 
34         Utils.validate(sampleSet.size() == 1,
35                 "Read groups must contain reads from one and only one sample, " +
36                         "but we are finding the following ones in the given header: \t" + sampleSet.toString());
37 
38         final String sample = sampleSet.iterator().next();
39         return sample;
40     }
41 
42     /**
43      * Given {@code sortOrder}, provide appropriate comparator.
44      * Currently only support coordinate or query-name order,
45      * and throws UserException if other values are specified.
46      */
getSamRecordComparator(final SAMFileHeader.SortOrder sortOrder)47     public static SAMRecordComparator getSamRecordComparator(final SAMFileHeader.SortOrder sortOrder) {
48         final SAMRecordComparator samRecordComparator;
49         switch (sortOrder) {
50             case coordinate:
51                 samRecordComparator = new SAMRecordCoordinateComparator();
52                 break;
53             case queryname:
54                 samRecordComparator = new SAMRecordQueryNameComparator();
55                 break;
56             default:
57                 throw new UserException("Unsupported assembly alignment sort order specified");
58         }
59         return samRecordComparator;
60     }
61 
62     /**
63      * todo: this should be fixed in a new major version of htsjdk
64      * this exist because for whatever reason,
65      * VC.getAttributeAsStringList() sometimes returns a giant single string, while using
66      * VC.getAttributeAsString() gives back an array.....
67      */
getAttributeAsStringList(final VariantContext vc, final String attributeKey)68     public static List<String> getAttributeAsStringList(final VariantContext vc, final String attributeKey) {
69         return getAttributeAsStringStream(vc, attributeKey)
70                 .collect(Collectors.toList());
71     }
72 
getAttributeAsStringStream(final VariantContext vc, final String attributeKey)73     public static Stream<String> getAttributeAsStringStream(final VariantContext vc, final String attributeKey) {
74         if ( ! vc.hasAttribute(attributeKey) )
75             return Stream.empty();
76         return vc.getAttributeAsStringList(attributeKey, "").stream()
77                 .flatMap(s -> {
78                     if ( s.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR) ) {
79                         return Arrays.stream( s.split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR) );
80                     } else {
81                         return Stream.of(s);
82                     }
83                 });
84     }
85 
86     /**
87      * Given chromosome name and requested position (1-based coordinate system),
88      * return a 1 bp long interval that spans only the requested base.
89      */
90     public static SimpleInterval makeOneBpInterval(final String chr, final int pos) {
91         return new SimpleInterval(chr, pos, pos);
92     }
93 
94     /**
95      * Canonical chromosomes are defined, for homo sapiens, as chromosomes 1-22, chromosome X and Y.
96      */
97     public static Set<String> getCanonicalChromosomes(final String nonCanonicalContigNamesFile,
98                                                       @Nonnull final SAMSequenceDictionary dictionary) {
99         final LinkedHashSet<String> allContigs = Utils.nonNull(dictionary).getSequences().stream().map(SAMSequenceRecord::getSequenceName)
100                 .collect(Collectors.toCollection(LinkedHashSet::new));
101         if (nonCanonicalContigNamesFile == null)
102             return allContigs;
103 
104         try (final Stream<String> nonCanonical = Files.lines(IOUtils.getPath(( Utils.nonNull(nonCanonicalContigNamesFile) )))) {
105             nonCanonical.forEach(allContigs::remove);
106             return allContigs;
107         } catch ( final IOException ioe ) {
108             throw new UserException("Can't read nonCanonicalContigNamesFile file "+nonCanonicalContigNamesFile, ioe);
109         }
110     }
111 
112     // =================================================================================================================
113 
114     /** return a good initialCapacity for a HashMap that will hold a given number of elements */
115     public static int hashMapCapacity( final int nElements )
116     {
117         return (int)((nElements*4L)/3) + 1;
118     }
119 
120     /** count the number of items available from an iterator */
121     public static <T> int iteratorSize( final Iterator<T> itr ) {
122         int result = 0;
123         while ( itr.hasNext() ) { result += 1; itr.next(); }
124         return result;
125     }
126 
127     public static int iteratorSize( final LongIterator itr ) {
128         int result = 0;
129         while ( itr.hasNext() ) { result += 1; itr.next(); }
130         return result;
131     }
132 
133     public static <T> Iterator<T> singletonIterator( final T t ) {
134         return Collections.singletonList(t).iterator();
135     }
136 
137     public static Collection<SVKmer> uniquify(final Collection<SVKmer> coll1, final Collection<SVKmer> coll2) {
138         Utils.nonNull(coll1, "first collection of kmers is null");
139         Utils.nonNull(coll2, "second collection of kmers is null");
140 
141         final HopscotchSet<SVKmer> kmers = new HopscotchSet<>(coll1.size() + coll2.size());
142         kmers.addAll(coll1);
143         kmers.addAll(coll2);
144         return kmers;
145     }
146 
147     /** Concatenate two lists. */
148     public static <T extends Object> List<T> concatenateLists(final List<T> list1, final List<T> list2) {
149         Utils.validateArg(list1.getClass().equals(list2.getClass()), "Lists to be concatenated are of different classes");
150 
151         final List<T> result = new ArrayList<T>(list1.size() + list2.size());
152         result.addAll(list1);
153         result.addAll(list2);
154         return result;
155     }
156 
157     public static class IteratorFilter<T> implements Iterator<T> {
158         private final Iterator<T> itr;
159         private final Predicate<T> predicate;
160         private T obj;
161 
162         public IteratorFilter( final Iterator<T> itr, final Predicate<T> predicate ) {
163             this.itr = itr;
164             this.predicate = predicate;
165             advance();
166         }
167 
168         @Override public boolean hasNext() { return obj != null; }
169 
170         @Override
171         public T next() {
172             if ( !hasNext() ) {
173                 throw new NoSuchElementException("IteratorFilter is exhausted.");
174             }
175             final T result = obj;
176             advance();
177             return result;
178         }
179 
180         private void advance() {
181             obj = null;
182             while ( itr.hasNext() ) {
183                 final T next = itr.next();
184                 if ( predicate.test(next) ) {
185                     obj = next;
186                     break;
187                 }
188             }
189         }
190     }
191 
192     /**
193      * Provides a stream collector that will collect items into an array list with a given initial capacity.
194      */
195     public static <T> Collector<T, ?, ArrayList<T>> arrayListCollector(final int size) {
196         return Collectors.toCollection( () -> new ArrayList<>(size));
197     }
198 
199     public static <T extends Enum<T>> EnumMap<T, Long> getZeroInitializedEnumMap(final Class<T> clazz) {
200         EnumMap<T, Long> map = new EnumMap<>(clazz);
201         for (final T t : clazz.getEnumConstants()) {
202             map.put(t, 0L);
203         }
204         return map;
205     }
206 
207     // =================================================================================================================
208 
209     //Workaround for seed 14695981039346656037 that doesn't fit in a signed long
210     private static final long FNV64_DEFAULT_SEED = new BigInteger("14695981039346656037").longValue();
211 
212     /**
213      * 64-bit FNV-1a hash for long's
214      */
215 
216     public static long fnvLong64( final long toHash ) {
217         return fnvLong64(FNV64_DEFAULT_SEED, toHash);
218     }
219 
220     public static long fnvLong64( long start, final long toHash ) {
221         final long mult = 1099511628211L;
222         start ^= (toHash >> 56) & 0xffL;
223         start *= mult;
224         start ^= (toHash >> 48) & 0xffL;
225         start *= mult;
226         start ^= (toHash >> 40) & 0xffL;
227         start *= mult;
228         start ^= (toHash >> 32) & 0xffL;
229         start *= mult;
230         start ^= (toHash >> 24) & 0xffL;
231         start *= mult;
232         start ^= (toHash >> 16) & 0xffL;
233         start *= mult;
234         start ^= (toHash >> 8) & 0xffL;
235         start *= mult;
236         start ^= toHash & 0xffL;
237         start *= mult;
238         return start;
239     }
240 
241     /**
242      * 64-bit FNV-1a hash for byte arrays
243      */
244     public static long fnvByteArray64(final byte[] toHash) {
245         // TODO: this is a mistake:  the constant should be the FNV64_DEFAULT_SEED, but it's the multiplier instead.
246         return fnvByteArray64(1099511628211L, toHash);
247     }
248 
249     public static long fnvByteArray64(long start, final byte[] toHash) {
250         for (int i = 0; i < toHash.length; i += 8) {
251             long val = 0;
252             for (int j = 0; j < 8 && i + j < toHash.length; j++) {
253                 val = (val << 8) | toHash[i + j];
254             }
255             start = fnvLong64(start, val);
256         }
257         return start;
258     }
259 
260 }
261