package org.broadinstitute.hellbender.engine;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentDefinition;
import org.broadinstitute.barclay.argparser.ClassFinder;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.config.ConfigFactory;
import org.broadinstitute.hellbender.utils.config.GATKConfig;

import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;


/**
 * Handles discovery of available codecs and Feature arguments, file format detection and codec selection,
 * and creation/management/querying of FeatureDataSources for each source of Features.
 *
 * At startup, walks the packages specified in {@link GATKConfig#codec_packages} in the config file to discover
 * what codecs are available to decode Feature-containing files.
 *
 * Then, given a tool instance, it discovers what FeatureInput argument fields are declared in the
 * tool's class hierarchy (and associated ArgumentCollections), and for each argument actually specified
 * by the user on the command line, determines the type of the file and the codec required to decode it,
 * creates a FeatureDataSource for that file, and adds it to a query-able resource pool.
 *
 * Clients can then call {@link #getFeatures(FeatureInput, SimpleInterval)} to query the data source for
 * a particular FeatureInput over a specific interval.
 */
public final class FeatureManager implements AutoCloseable {
    private static final Logger logger = LogManager.getLogger(FeatureManager.class);

    /**
     * All codecs descend from this class
     */
    private static final Class<FeatureCodec> CODEC_BASE_CLASS = FeatureCodec.class;

    /**
     * The codec classes we locate when searching codec packages
     */
    private static final Set<Class<?>> DISCOVERED_CODECS;

    /**
     * Feature arguments in tools are of this type
     */
    private static final Class<FeatureInput> FEATURE_ARGUMENT_CLASS = FeatureInput.class;

    /**
     * At startup, walk through the packages in codec packages, and save any (concrete) FeatureCodecs discovered
     * in DISCOVERED_CODECS
     */
    static {
        // Get our configuration:
        final GATKConfig config = ConfigFactory.getInstance().getGATKConfig();

        final ClassFinder finder = new ClassFinder();
        for ( final String codecPackage : config.codec_packages() ) {
            finder.find(codecPackage, CODEC_BASE_CLASS);
        }
        // Exclude abstract classes and interfaces from the list of discovered codec classes
        DISCOVERED_CODECS = Collections.unmodifiableSet(finder.getConcreteClasses());
    }

    /**
     * The simple class name of the tool instance containing the FeatureInput argument values that will form the
     * basis of our pool of FeatureDataSources
     */
    private final String toolInstanceSimpleClassName;

    /**
     * Mapping from FeatureInput argument to query-able FeatureDataSource for that source of Features
     */
    private final Map<FeatureInput<? extends Feature>, FeatureDataSource<? extends Feature>> featureSources;

    /**
     * Create a FeatureManager given a CommandLineProgram tool instance, discovering all FeatureInput
     * arguments in the tool and creating query-able FeatureDataSources for them. Uses the default
     * caching behavior of {@link FeatureDataSource}.
     *
     * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
     *                     Must have undergone command-line argument parsing and argument value injection already.
     */
    public FeatureManager( final CommandLineProgram toolInstance ) {
        this(toolInstance, FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES);
    }

    /**
     * Create a FeatureManager given a CommandLineProgram tool instance, discovering all FeatureInput
     * arguments in the tool and creating query-able FeatureDataSources for them. Allows control over
     * how much caching is performed by each {@link FeatureDataSource}.
     *
     * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
     *                     Must have undergone command-line argument parsing and argument value injection already.
     * @param featureQueryLookahead When querying FeatureDataSources, cache this many extra bases of context beyond
     *                              the end of query intervals in anticipation of future queries (>= 0).
     */
    public FeatureManager( final CommandLineProgram toolInstance, final int featureQueryLookahead ) {
        this(toolInstance, featureQueryLookahead, 0, 0);
    }

    /**
     * Create a FeatureManager given a CommandLineProgram tool instance, discovering all FeatureInput
     * arguments in the tool and creating query-able FeatureDataSources for them. Allows control over
     * how much caching is performed by each {@link FeatureDataSource}.
     *
     * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
     *                     Must have undergone command-line argument parsing and argument value injection already.
     * @param featureQueryLookahead When querying FeatureDataSources, cache this many extra bases of context beyond
     *                              the end of query intervals in anticipation of future queries (>= 0).
     * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
     * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
     */
    public FeatureManager(final CommandLineProgram toolInstance, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer) {
        this(toolInstance, featureQueryLookahead, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, null);
    }

    /**
     * Create a FeatureManager given a CommandLineProgram tool instance, discovering all FeatureInput
     * arguments in the tool and creating query-able FeatureDataSources for them. Allows control over
     * how much caching is performed by each {@link FeatureDataSource}.
     *
     * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
     *                     Must have undergone command-line argument parsing and argument value injection already.
     * @param featureQueryLookahead When querying FeatureDataSources, cache this many extra bases of context beyond
     *                              the end of query intervals in anticipation of future queries (>= 0).
     * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
     * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
     * @param gdbOptions settings for GenomicsDB to use when reading from a GenomicsDB workspace
     */
    public FeatureManager(final CommandLineProgram toolInstance, final int featureQueryLookahead, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions gdbOptions) {
        this.toolInstanceSimpleClassName = toolInstance.getClass().getSimpleName();
        this.featureSources = new LinkedHashMap<>();

        initializeFeatureSources(featureQueryLookahead, toolInstance, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, gdbOptions);
    }

    /**
     * Same as {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}, except used when the
     * FeatureInputs (and associated types) are known.
     *
     * This constructor should only be used in test code.
     *
     * @param featureInputsToTypeMap {@link Map} of a {@link FeatureInput} to the output type that must extend {@link Feature}. Never {@code null}
     * @param toolInstanceName See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
     * @param featureQueryLookahead See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
     * @param cloudPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
     * @param cloudIndexPrefetchBuffer See {@link FeatureManager#FeatureManager(CommandLineProgram, int, int, int, GenomicsDBOptions)}
     * @param reference reference path used to construct a default {@link GenomicsDBOptions} for each data source
     */
    @VisibleForTesting
    FeatureManager(final Map<FeatureInput<? extends Feature>, Class<? extends Feature>> featureInputsToTypeMap,
                   final String toolInstanceName, final int featureQueryLookahead, final int cloudPrefetchBuffer,
                   final int cloudIndexPrefetchBuffer, final Path reference) {
        // Validate once; the original code performed this null check twice.
        Utils.nonNull(featureInputsToTypeMap);

        this.toolInstanceSimpleClassName = toolInstanceName;
        this.featureSources = new LinkedHashMap<>();
        featureInputsToTypeMap.forEach((input, type) ->
                addToFeatureSources(featureQueryLookahead, input, type, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, reference));
    }

    /**
     * Given our tool instance, discover all argument of type FeatureInput (or Collections thereof), determine
     * the type of each Feature-containing file, and add a FeatureDataSource for each file to our query pool.
     *
     * @param featureQueryLookahead Set up each FeatureDataSource to cache this many extra bases of context beyond
     *                              the end of query intervals in anticipation of future queries (>= 0).
     * @param toolInstance Instance of the tool to be run (potentially containing one or more FeatureInput arguments)
     *                     Must have undergone command-line argument parsing and argument value injection already.
     * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
     * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
     * @param gdbOptions settings for GenomicsDB to use when reading from a GenomicsDB workspace
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    private void initializeFeatureSources( final int featureQueryLookahead, final CommandLineProgram toolInstance, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions gdbOptions) {

        // Discover all arguments of type FeatureInput (or Collections thereof) in our tool's class hierarchy
        // (and associated ArgumentCollections). Arguments not specified by the user on the command line will
        // come back to us with a null FeatureInput.
        final List<Pair<ArgumentDefinition, FeatureInput>> featureArgumentValues =
                toolInstance.getCommandLineParser().gatherArgumentValuesOfType(FEATURE_ARGUMENT_CLASS);

        for ( final Pair<ArgumentDefinition, FeatureInput> featureArgument : featureArgumentValues ) {
            final FeatureInput<? extends Feature> featureInput = featureArgument.getValue();

            // Only create a data source for Feature arguments that were actually specified
            if ( featureInput != null ) {
                final Class<? extends Feature> featureType = getFeatureTypeForFeatureInputArgument(featureArgument.getKey());
                addToFeatureSources(featureQueryLookahead, featureInput, featureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer,
                                    gdbOptions);
            }
        }
    }

    /**
     * Print cache usage statistics for every managed FeatureDataSource (diagnostic aid).
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    public void dumpAllFeatureCacheStats() {
        for ( final FeatureDataSource f : featureSources.values() ) {
            f.printCacheStats();
        }
    }

    /**
     * Add a feature data source for the given feature input, constructing default
     * {@link GenomicsDBOptions} from the provided reference path.
     *
     * @param featureQueryLookahead look ahead this many bases during queries that produce cache misses
     * @param featureInput source of features
     * @param featureType class of features
     * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
     * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
     * @param reference reference path used to construct the {@link GenomicsDBOptions}
     */
    void addToFeatureSources(final int featureQueryLookahead, final FeatureInput<? extends Feature> featureInput,
                             final Class<? extends Feature> featureType, final int cloudPrefetchBuffer,
                             final int cloudIndexPrefetchBuffer, final Path reference) {
        // Create a new FeatureDataSource for this file, and add it to our query pool
        featureSources.put(featureInput, new FeatureDataSource<>(featureInput, featureQueryLookahead, featureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, new GenomicsDBOptions(reference)));
    }

    /**
     * Add the feature data source to the given feature input.
     *
     * @param featureQueryLookahead look ahead this many bases during queries that produce cache misses
     * @param featureInput source of features
     * @param featureType class of features
     * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
     * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
     * @param genomicsDBOptions options and info for reading from a GenomicsDB
     *
     * Note: package-visible to enable access from the core walker classes
     * (but not actual tools, so it's not protected).
     */
    void addToFeatureSources(final int featureQueryLookahead, final FeatureInput<? extends Feature> featureInput,
                             final Class<? extends Feature> featureType, final int cloudPrefetchBuffer,
                             final int cloudIndexPrefetchBuffer, final GenomicsDBOptions genomicsDBOptions) {
        // Create a new FeatureDataSource for this file, and add it to our query pool
        featureSources.put(featureInput, new FeatureDataSource<>(featureInput, featureQueryLookahead, featureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, genomicsDBOptions));
    }

    /**
     * Given a ArgumentDefinition for an argument known to be of type FeatureInput (or a Collection thereof), retrieves the type
     * parameter for the FeatureInput (eg., for FeatureInput&lt;VariantContext&gt; or List&lt;FeatureInput&lt;VariantContext&gt;&gt;
     * this would be VariantContext).
     *
     * @param argDef an {@code ArgumentDefinition} for an argument known to be of type FeatureInput whose type parameter to retrieve
     * @return type parameter of the FeatureInput declaration represented by the given ArgumentDefinition
     * @throws GATKException if the declaration lacks an explicit Feature type parameter
     */
    @SuppressWarnings("unchecked")
    static Class<? extends Feature> getFeatureTypeForFeatureInputArgument( final ArgumentDefinition argDef ) {
        // For collection arguments (eg., List<FeatureInput<T>>) we must unwrap one level of
        // parameterization first to get at the FeatureInput itself.
        final Type featureInputType = argDef.isCollection() ?
                                      getNextTypeParameter((ParameterizedType)(argDef.getUnderlyingField().getGenericType())) :
                                      argDef.getUnderlyingField().getGenericType();

        if ( ! (featureInputType instanceof ParameterizedType) ) {
            throw new GATKException(String.format("FeatureInput declaration for argument --%s lacks an explicit type parameter for the Feature type",
                                                  argDef.getUnderlyingField().getAnnotation(Argument.class).fullName()));
        }

        return (Class<? extends Feature>)getNextTypeParameter((ParameterizedType)featureInputType);
    }

    /**
     * Helper method for {@link #getFeatureTypeForFeatureInputArgument(ArgumentDefinition)} that "unpacks" a
     * parameterized type by one level of parameterization. Eg., given List&lt;FeatureInput&lt;VariantContext&gt;&gt;
     * would return FeatureInput&lt;VariantContext&gt;.
     *
     * @param parameterizedType parameterized type to unpack
     * @return the type parameter of the given parameterized type
     * @throws GATKException if the type has more than one type parameter
     */
    private static Type getNextTypeParameter( final ParameterizedType parameterizedType ) {
        final Type[] typeParameters = parameterizedType.getActualTypeArguments();
        if ( typeParameters.length != 1 ) {
            throw new GATKException("Found a FeatureInput declaration with multiple type parameters, which is not supported");
        }
        return typeParameters[0];
    }

    /**
     * Does this manager have no sources of Features to query?
     *
     * @return true if there are no Feature sources available to query, otherwise false
     */
    public boolean isEmpty() {
        return featureSources.isEmpty();
    }

    /**
     * This method finds and returns all of the variant headers from the feature sources.
     *
     * @return A list of all variant headers for features.
     */
    public List<VCFHeader> getAllVariantHeaders() {
        return featureSources.values().stream()
                .map(FeatureDataSource::getHeader)
                .filter(header -> header instanceof VCFHeader)
                .map(header -> (VCFHeader)header)
                .collect(Collectors.toList());
    }

    /**
     * Returns the list of sequence dictionaries retrieved from the VCF headers of variant Feature inputs.
     * Note: this method returns an empty list if the variant inputs
     * happen not to have sequence dictionaries (since they are optional in the VCF format).
     */
    public List<SAMSequenceDictionary> getVariantSequenceDictionaries() {
        return getAllVariantHeaders().stream()
                .map(VCFHeader::getSequenceDictionary)
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
    }

    /**
     * Returns the sequence dictionaries associated with all feature sources.
     * This method will return an empty List if none of the feature sources have dictionaries.
     */
    public List<SAMSequenceDictionary> getAllSequenceDictionaries() {
        return featureSources.values().stream()
                .map(FeatureDataSource::getSequenceDictionary)
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
    }

    /**
     * Given a FeatureInput argument field from our tool, queries the data source for that FeatureInput
     * over the specified interval, and returns a List of the Features overlapping that interval from
     * that data source.
     *
     * Will throw an exception if the provided FeatureInput did not come from the tool that this
     * FeatureManager was initialized with, or was not an @Argument-annotated field in the tool
     * (or parent classes).
     *
     * @param featureDescriptor FeatureInput argument from our tool representing the Feature source to query
     * @param interval interval to query over (returned Features will overlap this interval)
     * @param <T> type of Feature in the source represented by featureDescriptor
     * @return A List of all Features in the backing data source for the provided FeatureInput that overlap
     *         the provided interval (may be empty if there are none, but never null)
     */
    public <T extends Feature> List<T> getFeatures( final FeatureInput<T> featureDescriptor, final Locatable interval ) {
        final FeatureDataSource<T> dataSource = lookupDataSource(featureDescriptor);

        // No danger of a ClassCastException here, since we verified that the FeatureDataSource for this
        // FeatureInput will return Features of the expected type T when we first created the data source
        // in initializeFeatureSources()
        return dataSource.queryAndPrefetch(interval);
    }

    /**
     * Given a FeatureInput argument field from our tool, returns an iterator to its features starting
     * from the first one.
     * <p><b>Warning!</b>: calling this method a second time on the same {@link FeatureInput}
     * on the same FeatureManager instance will invalidate (close) the iterator returned from
     * the first call.
     * </p>
     * <p>
     * An exception will be thrown if the {@link FeatureInput} provided did not come from the tool that this
     * manager was initialized with, or was not an @Argument-annotated field in the tool
     * (or parent classes).
     * </p>
     *
     * @param featureDescriptor FeatureInput argument from our tool representing the Feature source to query
     * @param <T> type of Feature in the source represented by featureDescriptor
     * @return never {@code null}, a iterator to all the features in the backing data source.
     * @throws GATKException if the feature-descriptor is not found in the manager or is {@code null}.
     */
    public <T extends Feature> Iterator<T> getFeatureIterator(final FeatureInput<T> featureDescriptor) {
        final FeatureDataSource<T> dataSource = lookupDataSource(featureDescriptor);
        return dataSource.iterator();
    }

    /**
     * Get the header associated with a particular FeatureInput
     *
     * @param featureDescriptor the FeatureInput whose header we want to retrieve
     * @param <T> type of Feature in our FeatureInput
     * @return header for the provided FeatureInput
     */
    public <T extends Feature> Object getHeader( final FeatureInput<T> featureDescriptor ) {
        final FeatureDataSource<T> dataSource = lookupDataSource(featureDescriptor);
        return dataSource.getHeader();
    }

    /**
     * Retrieve the data source for a particular FeatureInput. Throws an exception if the provided
     * FeatureInput is not among our discovered sources of Features.
     *
     * @param featureDescriptor FeatureInput whose data source to retrieve
     * @param <T> type of Feature in our FeatureInput
     * @return query-able data source for the provided FeatureInput, if it was found
     * @throws GATKException if no data source is registered for the provided FeatureInput
     */
    private <T extends Feature> FeatureDataSource<T> lookupDataSource( final FeatureInput<T> featureDescriptor ) {
        @SuppressWarnings("unchecked") final FeatureDataSource<T> dataSource = (FeatureDataSource<T>)featureSources.get(featureDescriptor);

        // Make sure the provided FeatureInput actually came from our tool as an @Argument-annotated field
        if ( dataSource == null ) {
            throw new GATKException(String.format("FeatureInput %s not found in feature manager's database for tool %s. " +
                                                  "In order to be detected, FeatureInputs must be declared in the tool class " +
                                                  "itself, a superclass of the tool class, or an @ArgumentCollection declared " +
                                                  "in the tool class or a superclass. They must also be annotated as an @Argument.",
                                                  featureDescriptor.getName(), toolInstanceSimpleClassName));
        }

        return dataSource;
    }

    /**
     * Utility method that determines the correct codec to use to read Features from the provided file.
     *
     * Codecs MUST correctly implement the {@link FeatureCodec#canDecode(String)} method
     * in order to be considered as candidates for decoding the file.
     *
     * Throws an exception if no suitable codecs are found (this is a user error, since the file is of
     * an unsupported format), or if more than one codec claims to be able to decode the file (this is
     * a configuration error on the codec authors' part).
     *
     * @param featurePath path for which to find the right codec
     * @return the codec suitable for decoding the provided file
     */
    public static FeatureCodec<? extends Feature, ?> getCodecForFile( final Path featurePath ) {
        return getCodecForFile(featurePath, null);
    }

    /**
     * Utility method that determines the correct codec to use to read Features from the provided file,
     * optionally considering only codecs that produce a particular type of Feature.
     *
     * Codecs MUST correctly implement the {@link FeatureCodec#canDecode(String)} method
     * in order to be considered as candidates for decoding the file, and must produce
     * Features of the specified type if featureType is non-null.
     *
     * Throws an exception if no suitable codecs are found (this is a user error, since the file is of
     * an unsupported format), or if more than one codec claims to be able to decode the file (this is
     * a configuration error on the codec authors' part).
     *
     * @param featurePath Path for which to find the right codec
     * @param featureType If specified, consider only codecs that produce Features of this type. May be null,
     *                    in which case all codecs are considered.
     * @return the codec suitable for decoding the provided file
     */
    public static FeatureCodec<? extends Feature, ?> getCodecForFile( final Path featurePath, final Class<? extends Feature> featureType ) {
        // Make sure Path exists/is readable
        if ( ! Files.isReadable(featurePath) ) {
            throw new UserException.CouldNotReadInputFile(featurePath.toUri().toString());
        }

        // Gather all discovered codecs that claim to be able to decode the given file according to their
        // canDecode() methods
        final List<FeatureCodec<? extends Feature, ?>> candidateCodecs = getCandidateCodecsForFile(featurePath);

        // If no codecs can handle the file, it's a user error (the user provided a file in an unsupported format)
        if ( candidateCodecs.isEmpty() ) {
            throw new UserException.NoSuitableCodecs(featurePath);
        }

        // If featureType was specified, subset to only codecs that produce the requested type of Feature,
        // and throw an error if there are no such codecs. Record the original candidates' Feature types
        // first so that the error message can report everything we considered.
        if ( featureType != null ) {
            final List<String> discoveredCodecsFeatureTypes = candidateCodecs.stream().map(codec -> codec.getFeatureType().getSimpleName()).collect(Collectors.toList());
            candidateCodecs.removeIf(codec -> ! featureType.isAssignableFrom(codec.getFeatureType()));

            if ( candidateCodecs.isEmpty() ) {
                throw new UserException.WrongFeatureType(featurePath, featureType, discoveredCodecsFeatureTypes);
            }
        }

        // If we still have multiple candidate codecs, it's a configuration error on the part of the codec authors
        if ( candidateCodecs.size() > 1 ) {
            final StringBuilder multiCodecMatches = new StringBuilder();
            for ( FeatureCodec<? extends Feature, ?> candidateCodec : candidateCodecs ) {
                multiCodecMatches.append(candidateCodec.getClass().getCanonicalName());
                multiCodecMatches.append(' ');
            }
            throw new GATKException("Multiple codecs found able to decode file " + featurePath.toAbsolutePath().toUri() +
                                    ". This indicates a misconfiguration on the part of the codec authors. " +
                                    "Matching codecs are: " + multiCodecMatches.toString());
        }

        final FeatureCodec<? extends Feature, ?> selectedCodec = candidateCodecs.get(0);
        logger.info("Using codec " + selectedCodec.getClass().getSimpleName() + " to read file " + featurePath.toAbsolutePath().toUri());
        return selectedCodec;
    }

    /**
     * Returns a List of all codecs in DISCOVERED_CODECS that claim to be able to decode the specified file
     * according to their {@link FeatureCodec#canDecode(String)} methods.
     *
     * @param featureFile file for which to find potential codecs
     * @return A List of all codecs in DISCOVERED_CODECS for which {@link FeatureCodec#canDecode(String)} returns true on the specified file
     */
    private static List<FeatureCodec<? extends Feature, ?>> getCandidateCodecsForFile( final Path featureFile ) {
        final List<FeatureCodec<? extends Feature, ?>> candidateCodecs = new ArrayList<>();

        for ( final Class<?> codecClass : DISCOVERED_CODECS ) {
            try {
                final FeatureCodec<? extends Feature, ?> codec = (FeatureCodec<? extends Feature, ?>)codecClass.getDeclaredConstructor().newInstance();
                if ( codec.canDecode(featureFile.toAbsolutePath().toUri().toString()) ) {
                    candidateCodecs.add(codec);
                }
            }
            catch ( InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e ) {
                // Preserve the underlying reflection failure as the cause so the real problem is diagnosable
                // (the original code dropped it).
                throw new GATKException("Unable to automatically instantiate codec " + codecClass.getName(), e);
            }
        }

        return candidateCodecs;
    }

    /**
     * @param file file to check
     * @return True if the file exists and contains Features (ie., we have a FeatureCodec that can decode it), otherwise false
     */
    public static boolean isFeatureFile( final Path file ) {
        return Files.exists(file) && ! getCandidateCodecsForFile(file).isEmpty();
    }

    /**
     * Permanently closes this manager by closing all backing data sources
     */
    @Override
    public void close() {
        featureSources.values().forEach(FeatureDataSource::close);
    }

}