1 package org.broadinstitute.hellbender.engine;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import htsjdk.samtools.SAMFileHeader;
5 import htsjdk.samtools.SAMSequenceDictionary;
6 import htsjdk.samtools.util.Locatable;
7 import htsjdk.tribble.Feature;
8 import htsjdk.tribble.FeatureCodec;
9 import htsjdk.variant.vcf.VCFHeader;
10 import org.apache.commons.lang3.tuple.Pair;
11 import org.apache.logging.log4j.LogManager;
12 import org.apache.logging.log4j.Logger;
13 import org.broadinstitute.barclay.argparser.Argument;
14 import org.broadinstitute.barclay.argparser.ArgumentDefinition;
15 import org.broadinstitute.barclay.argparser.ClassFinder;
16 import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
17 import org.broadinstitute.hellbender.exceptions.GATKException;
18 import org.broadinstitute.hellbender.exceptions.UserException;
19 import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
20 import org.broadinstitute.hellbender.utils.SimpleInterval;
21 import org.broadinstitute.hellbender.utils.Utils;
22 import org.broadinstitute.hellbender.utils.config.ConfigFactory;
23 import org.broadinstitute.hellbender.utils.config.GATKConfig;
24 
25 import java.lang.reflect.InvocationTargetException;
26 import java.lang.reflect.ParameterizedType;
27 import java.lang.reflect.Type;
28 import java.nio.file.Files;
29 import java.nio.file.Path;
30 import java.util.*;
31 import java.util.stream.Collectors;
32 
33 
34 /**
35  * Handles discovery of available codecs and Feature arguments, file format detection and codec selection,
36  * and creation/management/querying of FeatureDataSources for each source of Features.
37  *
38  * At startup, walks the packages specified in {@link GATKConfig#codec_packages} in the config file to discover what codecs are available
39  * to decode Feature-containing files.
40  *
41  * Then, given a tool instance, it discovers what FeatureInput argument fields are declared in the
42  * tool's class hierarchy (and associated ArgumentCollections), and for each argument actually specified
43  * by the user on the command line, determines the type of the file and the codec required to decode it,
44  * creates a FeatureDataSource for that file, and adds it to a query-able resource pool.
45  *
46  * Clients can then call {@link #getFeatures(FeatureInput, SimpleInterval)} to query the data source for
47  * a particular FeatureInput over a specific interval.
48  */
49 public final class FeatureManager implements AutoCloseable {
    private static final Logger logger = LogManager.getLogger(FeatureManager.class);

    /**
     * All codecs descend from this class. Deliberately a raw {@code Class<FeatureCodec>}:
     * codecs are discovered reflectively, so their type parameters are not known statically.
     */
    private static final Class<FeatureCodec> CODEC_BASE_CLASS = FeatureCodec.class;

    /**
     * The codec classes we locate when searching codec packages.
     * Populated once by the static initializer below and never modified afterwards
     * (wrapped in an unmodifiable Set).
     */
    private static final Set<Class<?>> DISCOVERED_CODECS;

    /**
     * Feature arguments in tools are of this type
     */
    private static final Class<FeatureInput> FEATURE_ARGUMENT_CLASS = FeatureInput.class;

    /**
     * At startup, walk through the packages in codec packages, and save any (concrete) FeatureCodecs discovered
     * in DISCOVERED_CODECS
     */
    static {

        // Get our configuration (supplies the list of packages to search for codecs):
        final GATKConfig config = ConfigFactory.getInstance().getGATKConfig();

        final ClassFinder finder = new ClassFinder();
        for ( final String codecPackage : config.codec_packages() ) {
            finder.find(codecPackage, CODEC_BASE_CLASS);
        }
        // Exclude abstract classes and interfaces from the list of discovered codec classes
        // (getConcreteClasses() performs that filtering), and freeze the result
        DISCOVERED_CODECS = Collections.unmodifiableSet(finder.getConcreteClasses());
    }

    /**
     * The simple class name of the tool instance containing the FeatureInput argument values that will form the basis of our
     * pool of FeatureDataSources. Used only for error reporting.
     */
    private final String toolInstanceSimpleClassName;

    /**
     * Mapping from FeatureInput argument to query-able FeatureDataSource for that source of Features.
     * A LinkedHashMap (see constructors), so iteration order matches registration order.
     */
    private final Map<FeatureInput<? extends Feature>, FeatureDataSource<? extends Feature>> featureSources;
94 
95     /**
96      * Create a FeatureManager given a CommandLineProgram tool instance, discovering all FeatureInput
97      * arguments in the tool and creating query-able FeatureDataSources for them. Uses the default
98      * caching behavior of {@link FeatureDataSource}.
99      *
100      * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
101      *                     Must have undergone command-line argument parsing and argument value injection already.
102      */
FeatureManager( final CommandLineProgram toolInstance )103     public FeatureManager( final CommandLineProgram toolInstance ) {
104         this(toolInstance, FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES);
105     }
106 
107     /**
108      * Create a FeatureManager given a CommandLineProgram tool instance, discovering all FeatureInput
109      * arguments in the tool and creating query-able FeatureDataSources for them. Allows control over
110      * how much caching is performed by each {@link FeatureDataSource}.
111      *
112      * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
113      *                     Must have undergone command-line argument parsing and argument value injection already.
114      * @param featureQueryLookahead When querying FeatureDataSources, cache this many extra bases of context beyond
115      *                              the end of query intervals in anticipation of future queries (>= 0).
116      */
FeatureManager( final CommandLineProgram toolInstance, final int featureQueryLookahead )117     public FeatureManager( final CommandLineProgram toolInstance, final int featureQueryLookahead ) {
118         this(toolInstance, featureQueryLookahead, 0, 0);
119     }
120 
121 
122     /**
123      * Create a FeatureManager given a CommandLineProgram tool instance, discovering all FeatureInput
124      * arguments in the tool and creating query-able FeatureDataSources for them. Allows control over
125      * how much caching is performed by each {@link FeatureDataSource}.
126      *
127      * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
128      *                     Must have undergone command-line argument parsing and argument value injection already.
129      * @param featureQueryLookahead When querying FeatureDataSources, cache this many extra bases of context beyond
130      *                              the end of query intervals in anticipation of future queries (>= 0).
131      * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
132      * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
133      *
134      */
FeatureManager(final CommandLineProgram toolInstance, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer)135     public FeatureManager(final CommandLineProgram toolInstance, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer) {
136         this(toolInstance, featureQueryLookahead, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, null);
137     }
138 
139     /**
140      * Create a FeatureManager given a CommandLineProgram tool instance, discovering all FeatureInput
141      * arguments in the tool and creating query-able FeatureDataSources for them. Allows control over
142      * how much caching is performed by each {@link FeatureDataSource}.
143      *  @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
144      *                     Must have undergone command-line argument parsing and argument value injection already.
145      * @param featureQueryLookahead When querying FeatureDataSources, cache this many extra bases of context beyond
146      *                              the end of query intervals in anticipation of future queries (>= 0).
147      * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
148      * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
149      * @param gdbOptions settings for GenomicsDB to use when reading from a GenomicsDB workspace
150      *
151      */
FeatureManager(final CommandLineProgram toolInstance, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions gdbOptions)152     public FeatureManager(final CommandLineProgram toolInstance, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions gdbOptions) {
153         this.toolInstanceSimpleClassName = toolInstance.getClass().getSimpleName();
154         this.featureSources = new LinkedHashMap<>();
155 
156         initializeFeatureSources(featureQueryLookahead, toolInstance, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, gdbOptions);
157     }
158 
159     /**
160      * Same as {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}, except used when the
161      *  FeatureInputs (and associated types) are known.
162      *
163      *  This constructor should only be used in test code.
164      *
165      * @param featureInputsToTypeMap {@link Map} of a {@link FeatureInput} to the output type that must extend {@link Feature}.  Never {@code null}
166      * @param toolInstanceName See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
167      * @param featureQueryLookahead See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
168      * @param cloudPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
169      * @param cloudIndexPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
170      * @param reference See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
171      */
172     @VisibleForTesting
FeatureManager(final Map<FeatureInput<? extends Feature>, Class<? extends Feature>> featureInputsToTypeMap, final String toolInstanceName, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final Path reference)173     FeatureManager(final Map<FeatureInput<? extends Feature>, Class<? extends Feature>> featureInputsToTypeMap, final String toolInstanceName, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final Path reference) {
174 
175         Utils.nonNull(featureInputsToTypeMap);
176 
177         this.toolInstanceSimpleClassName = toolInstanceName;
178         this.featureSources = new LinkedHashMap<>();
179         Utils.nonNull(featureInputsToTypeMap);
180         featureInputsToTypeMap.forEach((k,v) -> addToFeatureSources(featureQueryLookahead, k, v, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, reference));
181     }
182 
183     /**
184      * Given our tool instance, discover all argument of type FeatureInput (or Collections thereof), determine
185      * the type of each Feature-containing file, and add a FeatureDataSource for each file to our query pool.
186      *
187      * @param featureQueryLookahead Set up each FeatureDataSource to cache this many extra bases of context beyond
188      *                              the end of query intervals in anticipation of future queries (>= 0).
189      * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
190      *                     Must have undergone command-line argument parsing and argument value injection already.
191      * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
192      * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
193      */
194     @SuppressWarnings({"unchecked", "rawtypes"})
initializeFeatureSources( final int featureQueryLookahead, final CommandLineProgram toolInstance, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions gdbOptions)195     private void initializeFeatureSources( final int featureQueryLookahead, final CommandLineProgram toolInstance, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions gdbOptions) {
196 
197         // Discover all arguments of type FeatureInput (or Collections thereof) in our tool's class hierarchy
198         // (and associated ArgumentCollections). Arguments not specified by the user on the command line will
199         // come back to us with a null FeatureInput.
200         final List<Pair<ArgumentDefinition, FeatureInput>> featureArgumentValues =
201                 toolInstance.getCommandLineParser().gatherArgumentValuesOfType(FEATURE_ARGUMENT_CLASS);
202 
203         for ( final Pair<ArgumentDefinition, FeatureInput> featureArgument : featureArgumentValues ) {
204             final FeatureInput<? extends Feature> featureInput = featureArgument.getValue();
205 
206             // Only create a data source for Feature arguments that were actually specified
207             if ( featureInput != null ) {
208                 final Class<? extends Feature> featureType = getFeatureTypeForFeatureInputArgument(featureArgument.getKey());
209                 addToFeatureSources(featureQueryLookahead, featureInput, featureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer,
210                         gdbOptions);
211             }
212         }
213     }
214 
215     @SuppressWarnings({"unchecked", "rawtypes"})
dumpAllFeatureCacheStats()216     public void dumpAllFeatureCacheStats() {
217         for ( final FeatureDataSource f : featureSources.values() ) {
218             f.printCacheStats();
219         }
220     }
221 
addToFeatureSources(final int featureQueryLookahead, final FeatureInput<? extends Feature> featureInput, final Class<? extends Feature> featureType, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final Path reference)222     void addToFeatureSources(final int featureQueryLookahead, final FeatureInput<? extends Feature> featureInput,
223                              final Class<? extends Feature> featureType, final int cloudPrefetchBuffer,
224                              final int cloudIndexPrefetchBuffer, final Path reference) {
225         // Create a new FeatureDataSource for this file, and add it to our query pool
226         featureSources.put(featureInput, new FeatureDataSource<>(featureInput, featureQueryLookahead, featureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, new GenomicsDBOptions(reference)));
227     }
228 
229     /**
230      * Add the feature data source to the given feature input.
231      *
232      * @param featureQueryLookahead look ahead this many bases during queries that produce cache misses
233      * @param featureInput source of features
234      * @param featureType class of features
235      * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
236      * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
237      * @param genomicsDBOptions options and info for reading from a GenomicsDB
238      *
239      * Note: package-visible to enable access from the core walker classes
240      * (but not actual tools, so it's not protected).
241      */
addToFeatureSources(final int featureQueryLookahead, final FeatureInput<? extends Feature> featureInput, final Class<? extends Feature> featureType, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions genomicsDBOptions)242     void addToFeatureSources(final int featureQueryLookahead, final FeatureInput<? extends Feature> featureInput,
243                              final Class<? extends Feature> featureType, final int cloudPrefetchBuffer,
244                              final int cloudIndexPrefetchBuffer, final GenomicsDBOptions genomicsDBOptions) {
245         // Create a new FeatureDataSource for this file, and add it to our query pool
246         featureSources.put(featureInput, new FeatureDataSource<>(featureInput, featureQueryLookahead, featureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, genomicsDBOptions));
247     }
248 
249     /**
250      * Given a ArgumentDefinition for an argument known to be of type FeatureInput (or a Collection thereof), retrieves the type
251      * parameter for the FeatureInput (eg., for FeatureInput<VariantContext> or List<FeatureInput<VariantContext>>
252      * this would be VariantContext).
253      *
254      * @param argDef an {@code ArgumentDefinition} for an argument known to be of type FeatureInput whose type parameter to retrieve
255      * @return type parameter of the FeatureInput declaration represented by the given ArgumentDefinition
256      */
257     @SuppressWarnings("unchecked")
getFeatureTypeForFeatureInputArgument( final ArgumentDefinition argDef )258     static Class<? extends Feature> getFeatureTypeForFeatureInputArgument( final ArgumentDefinition argDef ) {
259         final Type featureInputType = argDef.isCollection() ?
260                                 getNextTypeParameter((ParameterizedType)(argDef.getUnderlyingField().getGenericType())) :
261                                 argDef.getUnderlyingField().getGenericType();
262 
263         if ( ! (featureInputType instanceof ParameterizedType) ) {
264             throw new GATKException(String.format("FeatureInput declaration for argument --%s lacks an explicit type parameter for the Feature type",
265                                 argDef.getUnderlyingField().getAnnotation(Argument.class).fullName()));
266         }
267 
268         return (Class<? extends Feature>)getNextTypeParameter((ParameterizedType)featureInputType);
269     }
270 
271     /**
272      * Helper method for {@link #getFeatureTypeForFeatureInputArgument(ArgumentDefinition)} that "unpacks" a
273      * parameterized type by one level of parameterization. Eg., given List<FeatureInput<VariantContext>>
274      * would return FeatureInput<VariantContext>.
275      *
276      * @param parameterizedType parameterized type to unpack
277      * @return the type parameter of the given parameterized type
278      */
getNextTypeParameter( final ParameterizedType parameterizedType )279     private static Type getNextTypeParameter( final ParameterizedType parameterizedType ) {
280         final Type[] typeParameters = parameterizedType.getActualTypeArguments();
281         if ( typeParameters.length != 1 ) {
282             throw new GATKException("Found a FeatureInput declaration with multiple type parameters, which is not supported");
283         }
284         return typeParameters[0];
285     }
286 
287     /**
288      * Does this manager have no sources of Features to query?
289      *
290      * @return true if there are no Feature sources available to query, otherwise false
291      */
isEmpty()292     public boolean isEmpty() {
293         return featureSources.isEmpty();
294     }
295 
296 
297     /**
298      * This method finds and returns all of the variant headers from the feature sources.
299      *
300      * @return A list of all variant headers for features.
301      */
getAllVariantHeaders()302     public List<VCFHeader> getAllVariantHeaders() {
303         return featureSources.values().stream()
304                 .map(feature -> feature.getHeader())
305                 .filter(header -> header instanceof VCFHeader)
306                 .map(header -> (VCFHeader)header).collect(Collectors.toList());
307     }
308 
309     /**
310      * Returns the list of sequence dictionaries retrieved from the VCF headers of variant Feature inputs.
311      * Note: this method returns an empty list if the variant inputs
312      * happen not to have sequence dictionaries (since they are optional in the VCF format).
313      */
getVariantSequenceDictionaries()314     public List<SAMSequenceDictionary> getVariantSequenceDictionaries() {
315         return getAllVariantHeaders()
316                 .stream().map(h -> h.getSequenceDictionary())
317                 .filter(dict -> dict != null)
318                 .collect(Collectors.toList());
319     }
320 
321     /**
322      * Returns the sequence dictionaries associated with all feature sources.
323      * This method will return an empty List if none of the feature sources have dictionaries.
324      */
getAllSequenceDictionaries()325     public List<SAMSequenceDictionary> getAllSequenceDictionaries() {
326         return featureSources.values().stream().map(fs -> fs.getSequenceDictionary())
327                 .filter(dict -> dict != null)
328                 .collect(Collectors.toList());
329     }
330 
331     /**
332      * Given a FeatureInput argument field from our tool, queries the data source for that FeatureInput
333      * over the specified interval, and returns a List of the Features overlapping that interval from
334      * that data source.
335      *
336      * Will throw an exception if the provided FeatureInput did not come from the tool that this
337      * FeatureManager was initialized with, or was not an @Argument-annotated field in the tool
338      * (or parent classes).
339      *
340      * @param featureDescriptor FeatureInput argument from our tool representing the Feature source to query
341      * @param interval interval to query over (returned Features will overlap this interval)
342      * @param <T> type of Feature in the source represented by featureDescriptor
343      * @return A List of all Features in the backing data source for the provided FeatureInput that overlap
344      *         the provided interval (may be empty if there are none, but never null)
345      */
getFeatures( final FeatureInput<T> featureDescriptor, final Locatable interval )346     public <T extends Feature> List<T> getFeatures( final FeatureInput<T> featureDescriptor, final Locatable interval ) {
347         final FeatureDataSource<T> dataSource = lookupDataSource(featureDescriptor);
348 
349         // No danger of a ClassCastException here, since we verified that the FeatureDataSource for this
350         // FeatureInput will return Features of the expected type T when we first created the data source
351         // in initializeFeatureSources()
352         return dataSource.queryAndPrefetch(interval);
353     }
354 
355     /**
356      * Given a FeatureInput argument field from our tool, returns an iterator to its features starting
357      * from the first one.
358      * <p><b>Warning!</b>: calling this method a second time on the same {@link FeatureInput}
359      * on the same FeatureManager instance will invalidate (close) the iterator returned from
360      * the first call.
361      * </p>
362      * <p>
363      * An exception will be thrown if the {@link FeatureInput} provided did not come from the tool that this
364      * manager was initialized with, or was not an &#64;Argument-annotated field in the tool
365      * (or parent classes).
366      * </p>
367      *
368      * @param featureDescriptor FeatureInput argument from our tool representing the Feature source to query
369      * @param <T> type of Feature in the source represented by featureDescriptor
370      * @return never {@code null}, a iterator to all the features in the backing data source.
371      * @throws GATKException if the feature-descriptor is not found in the manager or is {@code null}.
372      */
getFeatureIterator(final FeatureInput<T> featureDescriptor)373     public <T extends Feature> Iterator<T> getFeatureIterator(final FeatureInput<T> featureDescriptor) {
374         final FeatureDataSource<T> dataSource = lookupDataSource(featureDescriptor);
375         return dataSource.iterator();
376     }
377 
378     /**
379      * Get the header associated with a particular FeatureInput
380      *
381      * @param featureDescriptor the FeatureInput whose header we want to retrieve
382      * @param <T> type of Feature in our FeatureInput
383      * @return header for the provided FeatureInput
384      */
getHeader( final FeatureInput<T> featureDescriptor )385     public <T extends Feature> Object getHeader( final FeatureInput<T> featureDescriptor ) {
386         final FeatureDataSource<T> dataSource = lookupDataSource(featureDescriptor);
387         return dataSource.getHeader();
388     }
389 
390     /**
391      * Retrieve the data source for a particular FeatureInput. Throws an exception if the provided
392      * FeatureInput is not among our discovered sources of Features.
393      *
394      * @param featureDescriptor FeatureInput whose data source to retrieve
395      * @param <T> type of Feature in our FeatureInput
396      * @return query-able data source for the provided FeatureInput, if it was found
397      */
lookupDataSource( final FeatureInput<T> featureDescriptor )398     private <T extends Feature> FeatureDataSource<T> lookupDataSource( final FeatureInput<T> featureDescriptor ) {
399         @SuppressWarnings("unchecked") final FeatureDataSource<T> dataSource = (FeatureDataSource<T>)featureSources.get(featureDescriptor);
400 
401         // Make sure the provided FeatureInput actually came from our tool as an @Argument-annotated field
402         if ( dataSource == null ) {
403             throw new GATKException(String.format("FeatureInput %s not found in feature manager's database for tool %s. " +
404                                                   "In order to be detected, FeatureInputs must be declared in the tool class " +
405                                                   "itself, a superclass of the tool class, or an @ArgumentCollection declared " +
406                                                   "in the tool class or a superclass. They must also be annotated as an @Argument.",
407                                                   featureDescriptor.getName(), toolInstanceSimpleClassName));
408         }
409 
410         return dataSource;
411     }
412 
413     /**
414      * Utility method that determines the correct codec to use to read Features from the provided file.
415      *
416      * Codecs MUST correctly implement the {@link FeatureCodec#canDecode(String)} method
417      * in order to be considered as candidates for decoding the file.
418      *
419      * Throws an exception if no suitable codecs are found (this is a user error, since the file is of
420      * an unsupported format), or if more than one codec claims to be able to decode the file (this is
421      * a configuration error on the codec authors' part).
422      *
423      * @param featurePath path for which to find the right codec
424      * @return the codec suitable for decoding the provided file
425      */
getCodecForFile( final Path featurePath )426     public static FeatureCodec<? extends Feature, ?> getCodecForFile( final Path featurePath ) {
427         return getCodecForFile(featurePath, null);
428     }
429 
430     /**
431      * Utility method that determines the correct codec to use to read Features from the provided file,
432      * optionally considering only codecs that produce a particular type of Feature.
433      *
434      * Codecs MUST correctly implement the {@link FeatureCodec#canDecode(String)} method
435      * in order to be considered as candidates for decoding the file, and must produce
436      * Features of the specified type if featureType is non-null.
437      *
438      * Throws an exception if no suitable codecs are found (this is a user error, since the file is of
439      * an unsupported format), or if more than one codec claims to be able to decode the file (this is
440      * a configuration error on the codec authors' part).
441      *
442      * @param featurePath Path for which to find the right codec
443      * @param featureType If specified, consider only codecs that produce Features of this type. May be null,
444      *                    in which case all codecs are considered.
445      * @return the codec suitable for decoding the provided file
446      */
getCodecForFile( final Path featurePath, final Class<? extends Feature> featureType )447     public static FeatureCodec<? extends Feature, ?> getCodecForFile( final Path featurePath, final Class<? extends Feature> featureType ) {
448         // Make sure Path exists/is readable
449         if ( ! Files.isReadable(featurePath) ) {
450             throw new UserException.CouldNotReadInputFile(featurePath.toUri().toString());
451         }
452 
453         // Gather all discovered codecs that claim to be able to decode the given file according to their
454         // canDecode() methods
455         final List<FeatureCodec<? extends Feature, ?>> candidateCodecs = getCandidateCodecsForFile(featurePath);
456 
457         // If no codecs can handle the file, it's a user error (the user provided a file in an unsupported format)
458         if ( candidateCodecs.isEmpty() ) {
459             throw new UserException.NoSuitableCodecs(featurePath);
460         }
461 
462         // If featureType was specified, subset to only codecs that produce the requested type of Feature,
463         // and throw an error if there are no such codecs.
464         if ( featureType != null ) {
465             final List<String> discoveredCodecsFeatureTypes = candidateCodecs.stream().map(codec -> codec.getFeatureType().getSimpleName()).collect(Collectors.toList());
466             candidateCodecs.removeIf(codec -> ! featureType.isAssignableFrom(codec.getFeatureType()));
467 
468             if ( candidateCodecs.isEmpty() ) {
469                 throw new UserException.WrongFeatureType(featurePath, featureType, discoveredCodecsFeatureTypes);
470             }
471         }
472 
473         // If we still have multiple candidate codecs, it's a configuration error on the part of the codec authors
474         if ( candidateCodecs.size() > 1 ) {
475             final StringBuilder multiCodecMatches = new StringBuilder();
476             for ( FeatureCodec<? extends Feature, ?> candidateCodec : candidateCodecs ) {
477                 multiCodecMatches.append(candidateCodec.getClass().getCanonicalName());
478                 multiCodecMatches.append(' ');
479             }
480             throw new GATKException("Multiple codecs found able to decode file " + featurePath.toAbsolutePath().toUri() +
481                                     ". This indicates a misconfiguration on the part of the codec authors. " +
482                                     "Matching codecs are: " + multiCodecMatches.toString());
483         }
484 
485         final FeatureCodec<? extends Feature, ?> selectedCodec = candidateCodecs.get(0);
486         logger.info("Using codec " + selectedCodec.getClass().getSimpleName() + " to read file " + featurePath.toAbsolutePath().toUri());
487         return selectedCodec;
488     }
489 
490     /**
491      * Returns a List of all codecs in DISCOVERED_CODECS that claim to be able to decode the specified file
492      * according to their {@link FeatureCodec#canDecode(String)} methods.
493      *
494      * @param featureFile file for which to find potential codecs
495      * @return A List of all codecs in DISCOVERED_CODECS for which {@link FeatureCodec#canDecode(String)} returns true on the specified file
496      */
getCandidateCodecsForFile( final Path featureFile )497     private static List<FeatureCodec<? extends Feature, ?>> getCandidateCodecsForFile( final Path featureFile )  {
498         final List<FeatureCodec<? extends Feature, ?>> candidateCodecs = new ArrayList<>();
499 
500         for ( final Class<?> codecClass : DISCOVERED_CODECS ) {
501             try {
502                 final FeatureCodec<? extends Feature, ?> codec = (FeatureCodec<? extends Feature, ?>)codecClass.getDeclaredConstructor().newInstance();
503                 if ( codec.canDecode(featureFile.toAbsolutePath().toUri().toString()) ) {
504                     candidateCodecs.add(codec);
505                 }
506             }
507             catch (InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e ) {
508                 throw new GATKException("Unable to automatically instantiate codec " + codecClass.getName());
509             }
510         }
511 
512         return candidateCodecs;
513     }
514 
515     /**
516      * @param file file to check
517      * @return True if the file exists and contains Features (ie., we have a FeatureCodec that can decode it), otherwise false
518      */
isFeatureFile( final Path file )519     public static boolean isFeatureFile( final Path file ) {
520         return Files.exists(file) && ! getCandidateCodecsForFile(file).isEmpty();
521     }
522 
523     /**
524      * Permanently closes this manager by closing all backing data sources
525      */
526     @Override
close()527     public void close() {
528         featureSources.values().forEach(ds -> ds.close());
529     }
530 
531 }
532