1 package org.broadinstitute.hellbender.tools.funcotator; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import htsjdk.tribble.Feature; 5 import htsjdk.variant.variantcontext.VariantContext; 6 import org.apache.logging.log4j.LogManager; 7 import org.apache.logging.log4j.Logger; 8 import org.broadinstitute.barclay.utils.Utils; 9 import org.broadinstitute.hellbender.engine.FeatureContext; 10 import org.broadinstitute.hellbender.engine.FeatureInput; 11 import org.broadinstitute.hellbender.engine.ReferenceContext; 12 import org.broadinstitute.hellbender.exceptions.GATKException; 13 import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation; 14 import org.broadinstitute.hellbender.utils.SimpleInterval; 15 16 import java.io.Closeable; 17 import java.util.*; 18 19 /** 20 * An abstract class to allow for the creation of a {@link Funcotation} for a given data source. 21 * Created by jonn on 8/30/17. 22 * 23 * Subclasses that support the annotation of segments must override: 24 * getSupportedFuncotationFieldsForSegments() 25 * isSupportingSegmentFuncotation() 26 * createFuncotationsOnSegment(...) 27 * 28 */ 29 public abstract class DataSourceFuncotationFactory implements Closeable { 30 31 //================================================================================================================== 32 33 /** Standard Logger. */ 34 protected static final Logger logger = LogManager.getLogger(DataSourceFuncotationFactory.class); 35 36 /** Default version string for this {@link DataSourceFuncotationFactory}. */ 37 @VisibleForTesting 38 public static final String DEFAULT_VERSION_STRING = "UNKNOWN_VERSION"; 39 40 /** 41 * Version number of this {@link DataSourceFuncotationFactory}. 42 */ 43 protected String version = DEFAULT_VERSION_STRING; 44 45 /** 46 * Map of ANNOTATION_NAME -> OVERRIDE_VALUE. 47 */ 48 protected Map<String, String> annotationOverrideMap; 49 50 /** 51 * Enables b37 data sources to be combined with hg19 data sources and work with the same input variants. 52 * Should only be used in cases where data sources cannot be made / found for hg19 and hg19 annotations are required. 53 * A value of {@code false} ONLY indicates that the data source is NOT b37. 54 * If {@code true}, the backing data behind this {@link DataSourceFuncotationFactory} is based on the b37 reference AND we are using hg19 data. 55 * If {@code false}, the backing data behind this the backing data behind this {@link DataSourceFuncotationFactory} is NOT based on the b37 reference. 56 */ 57 protected boolean dataSourceIsB37 = false; 58 59 /** 60 * The backing data store as a FeatureInput to leverage tribble querying. Can be {@code null} for non-locatable 61 * funcotation factories. 62 */ 63 protected final FeatureInput<? extends Feature> mainSourceFileAsFeatureInput; 64 65 /** 66 * Minimum number of bases for a segment to be considered valid. 67 */ 68 protected int minBasesForValidSegment; 69 70 @VisibleForTesting getMainSourceFileAsFeatureInput()71 public FeatureInput<? extends Feature> getMainSourceFileAsFeatureInput() { 72 return mainSourceFileAsFeatureInput; 73 } 74 75 /** 76 * Constructor to initialize final fields in this class with defaults. 77 * @param minBasesForValidSegment The minimum number of bases for a segment to be considered valid. 78 */ DataSourceFuncotationFactory(final int minBasesForValidSegment)79 protected DataSourceFuncotationFactory(final int minBasesForValidSegment) { 80 this.mainSourceFileAsFeatureInput = null; 81 this.minBasesForValidSegment = minBasesForValidSegment; 82 } 83 84 /** 85 * Constructor to initialize final fields in this class. 86 * @param mainSourceFileAsFeatureInput The backing data store as a FeatureInput to leverage tribble querying. Can be {@code null} for non-locatable funcotation factories. 87 * @param minBasesForValidSegment The minimum number of bases for a segment to be considered valid. 88 */ DataSourceFuncotationFactory(final FeatureInput<? extends Feature> mainSourceFileAsFeatureInput, final int minBasesForValidSegment)89 protected DataSourceFuncotationFactory(final FeatureInput<? extends Feature> mainSourceFileAsFeatureInput, 90 final int minBasesForValidSegment) { 91 this.mainSourceFileAsFeatureInput = mainSourceFileAsFeatureInput; 92 this.minBasesForValidSegment = minBasesForValidSegment; 93 } 94 95 96 /** 97 * Set values in {@link DataSourceFuncotationFactory#annotationOverrideMap} based on the given annotation override values 98 * and whether or not this {@link DataSourceFuncotationFactory} supports those annotations. 99 * @param annotationOverrides The {@link Map} of annotation override key names and values. 100 */ initializeAnnotationOverrides(final LinkedHashMap<String, String> annotationOverrides)101 protected void initializeAnnotationOverrides(final LinkedHashMap<String, String> annotationOverrides) { 102 // Go through the Annotation Maps and check to see if the default/override annotation names are applicable for 103 // this FuncotationFactory: 104 final Set<String> supportedFuncotations = getSupportedFuncotationFields(); 105 this.annotationOverrideMap = new HashMap<>(); 106 for ( final String annotationOverrideKey : annotationOverrides.keySet() ) { 107 if ( supportedFuncotations.contains(annotationOverrideKey) ) { 108 annotationOverrideMap.put( annotationOverrideKey, annotationOverrides.get(annotationOverrideKey) ); 109 } 110 } 111 } 112 113 //================================================================================================================== 114 115 /** 116 * @return A {@link String} containing information about this {@link DataSourceFuncotationFactory}. 117 */ getInfoString()118 public String getInfoString() { 119 return getName() + " " + getVersion(); 120 } 121 122 /** 123 * Perform cleanup tasks for this {@link DataSourceFuncotationFactory}. 124 */ close()125 public void close() {} 126 127 /** 128 * Apply the override values in {@link DataSourceFuncotationFactory#annotationOverrideMap} to every 129 * {@link Funcotation} in the given {@code outputFuncotations}. 130 * @param funcotationList {@link List} of {@link Funcotation} to which to apply override values. 131 */ setOverrideValuesInFuncotations(final List<Funcotation> funcotationList)132 protected void setOverrideValuesInFuncotations(final List<Funcotation> funcotationList) { 133 for ( final Funcotation funcotation : funcotationList ) { 134 funcotation.setFieldSerializationOverrideValues( annotationOverrideMap ); 135 } 136 } 137 138 /** 139 * @return The name of the data source corresponding to this {@link DataSourceFuncotationFactory}. 140 */ getName()141 public abstract String getName(); 142 143 /** 144 * @return The {@link org.broadinstitute.hellbender.tools.funcotator.FuncotatorArgumentDefinitions.DataSourceType} of this {@link DataSourceFuncotationFactory}. 145 */ getType()146 public abstract FuncotatorArgumentDefinitions.DataSourceType getType(); 147 148 /** 149 * @return The version of the data source corresponding to this {@link DataSourceFuncotationFactory}. 150 */ getVersion()151 public String getVersion() { 152 return version; 153 } 154 155 /** 156 * @return {@code True} if this {@link DataSourceFuncotationFactory} requires features to create {@link Funcotation}s. {@code False} otherwise. 157 */ 158 @VisibleForTesting requiresFeatures()159 public boolean requiresFeatures() { return true; } 160 161 /** 162 * @return An ordered {@link LinkedHashSet} of the names of annotations that this Data Source supports. 163 */ getSupportedFuncotationFields()164 public abstract LinkedHashSet<String> getSupportedFuncotationFields(); 165 166 /** 167 * @return An ordered {@link LinkedHashSet} of the names of annotations that this Data Source supports when annotating segments. 168 */ getSupportedFuncotationFieldsForSegments()169 public LinkedHashSet<String> getSupportedFuncotationFieldsForSegments() { 170 return new LinkedHashSet<>(); 171 } 172 173 /** 174 * Creates a {@link List} of {@link Funcotation} for the given {@code variant}, {@code referenceContext}, and {@code featureContext}. 175 * Accounts for override values passed into the constructor as well. 176 * @param variant {@link VariantContext} to annotate. 177 * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}. 178 * @param featureContext {@link FeatureContext} corresponding to the variant. Never {@code null}. 179 * @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}. This should never be empty. 180 */ createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext)181 public List<Funcotation> createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext) { 182 return createFuncotations(variant, referenceContext, featureContext, null); 183 } 184 185 /** 186 * Creates a {@link List} of {@link Funcotation} for the given {@code variant}, {@code referenceContext}, {@code featureContext}, and {@code gencodeFuncotations}. 187 * For some Data Sources knowledge of Gene Name or Transcript ID is required for annotation. 188 * Accounts for override values passed into the constructor as well. 189 * @param variant {@link VariantContext} to annotate. Never {@code null}. 190 * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}. Never {@code null}. 191 * @param featureContext {@link FeatureContext} corresponding to the variant. Never {@code null}. 192 * @param gencodeFuncotations {@link List} of {@link GencodeFuncotation} that have already been created for the given {@code variant}/{@code referenceContext}/{@code featureContext}. 193 * {@code null} is acceptable if there are no corresponding gencode funcotations. 194 * @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}. This should never be empty. 195 */ createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext, final List<GencodeFuncotation> gencodeFuncotations)196 public List<Funcotation> createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext, final List<GencodeFuncotation> gencodeFuncotations) { 197 198 Utils.nonNull(variant); 199 Utils.nonNull(referenceContext); 200 Utils.nonNull(featureContext); 201 202 final List<Funcotation> outputFuncotations; 203 204 // Query this funcotation factory to get the list of overlapping features. 205 // NOTE: This will only get features that are LOCATABLE! 206 // This corresponds to requiresFeatures() returning `True`. 207 final List<Feature> featureList = getFeaturesFromFeatureContext(featureContext); 208 209 // If our featureList is compatible with this DataSourceFuncotationFactory, then we make our funcotations: 210 if ( isFeatureListCompatible(featureList) ) { 211 outputFuncotations = determineFuncotations(variant, referenceContext, featureList, gencodeFuncotations); 212 213 // Set our overrides: 214 setOverrideValuesInFuncotations(outputFuncotations); 215 } 216 else { 217 return createDefaultFuncotationsOnVariant(variant, referenceContext); 218 } 219 220 if ((outputFuncotations == null) || (outputFuncotations.size() == 0)) { 221 return createDefaultFuncotationsOnVariant(variant, referenceContext); 222 } else { 223 return outputFuncotations; 224 } 225 } 226 getFeaturesFromFeatureContext(final FeatureContext featureContext)227 private List<Feature> getFeaturesFromFeatureContext(final FeatureContext featureContext) { 228 return requiresFeatures() ? 229 queryFeaturesFromFeatureContext(featureContext) : 230 Collections.emptyList(); 231 } 232 determineFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final List<Feature> featureList, final List<GencodeFuncotation> gencodeFuncotations)233 private List<Funcotation> determineFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final List<Feature> featureList, final List<GencodeFuncotation> gencodeFuncotations) { 234 235 // Create our funcotations: 236 final List<Funcotation> outputFuncotations; 237 238 if (FuncotatorUtils.isSegmentVariantContext(variant, minBasesForValidSegment) && isSupportingSegmentFuncotation()) { 239 outputFuncotations = createFuncotationsOnSegment(variant, referenceContext, featureList); 240 } else { 241 242 if (gencodeFuncotations == null) { 243 outputFuncotations = createFuncotationsOnVariant(variant, referenceContext, featureList); 244 } else { 245 outputFuncotations = createFuncotationsOnVariant(variant, referenceContext, featureList, gencodeFuncotations); 246 } 247 } 248 return outputFuncotations; 249 } 250 251 /** 252 * Checks to see if the given featureList is compatible with this {@link DataSourceFuncotationFactory}. 253 * Cues off of the feature type in the feature list and whether the given list contains any non-null features. 254 * This method acts as a sanity-check before attempting to do any annotations on features. 255 * If this {@link DataSourceFuncotationFactory} does not require features as per {@link #requiresFeatures()}, then 256 * this method will always return {@code True}. 257 * @param featureList {@link List} of {@link Feature} that might be applicable to this {@link DataSourceFuncotationFactory} for annotation. 258 * @return {@code true} if the given {@code featureList} contains at least one non-null feature of type {@link #getAnnotationFeatureClass()}; {@code false} otherwise. 259 */ isFeatureListCompatible(final List<Feature> featureList)260 private boolean isFeatureListCompatible(final List<Feature> featureList) { 261 // Make sure these features can be annotated by this DataSourceFuncotationFactory. 262 // NOTE: We only check the first non-null element of the list for feature type: 263 264 // The feature list is compatible if we found a compatible feature 265 // OR 266 // if this DataSourceFuncotationFactory does not require features. 267 if ( !requiresFeatures() ) { 268 return true; 269 } 270 271 boolean foundCompatibleFeature = false; 272 for ( final Feature f : featureList ) { 273 if (f != null) { 274 foundCompatibleFeature = getAnnotationFeatureClass().isAssignableFrom(f.getClass()); 275 break; 276 } 277 } 278 return foundCompatibleFeature; 279 } 280 281 /** 282 * Queries the provided FeatureContext for Features from our FeatureInput {@link #mainSourceFileAsFeatureInput}. 283 * The default implementation returns all Features from our FeatureInput that overlap the FeatureContext's 284 * interval, but subclasses may override (for example, to pad the query). 285 * 286 * @param featureContext the FeatureContext to query 287 * @return Features from our FeatureInput {@link #mainSourceFileAsFeatureInput} queried from the FeatureContext 288 */ 289 @SuppressWarnings("unchecked") queryFeaturesFromFeatureContext(final FeatureContext featureContext)290 private List<Feature> queryFeaturesFromFeatureContext(final FeatureContext featureContext) { 291 final List<Feature> features; 292 293 SimpleInterval queryInterval = featureContext.getInterval(); 294 295 // Do we need to do a fuzzy hg19 / b37 conversion for querying our features: 296 if ( dataSourceIsB37 ) { 297 // Create a B37 interval: 298 queryInterval = new SimpleInterval( 299 FuncotatorUtils.convertHG19ContigToB37Contig(queryInterval.getContig()), 300 queryInterval.getStart(), 301 queryInterval.getEnd() 302 ); 303 } 304 305 // Perform extra transformations on the query interval: 306 queryInterval = transformFeatureQueryInterval(queryInterval); 307 308 // If the interval has not changed, we should use the original one: 309 if ( queryInterval.equals(featureContext.getInterval() ) ) { // Get the features: 310 features = (List<Feature>) featureContext.getValues(mainSourceFileAsFeatureInput); 311 } 312 else { 313 // Query as normal: 314 features = (List<Feature>) featureContext.getValues(mainSourceFileAsFeatureInput, queryInterval); 315 } 316 317 return features; 318 } 319 320 /** 321 * A Method to allow {@link DataSourceFuncotationFactory} objects to adjust the query interval further for their 322 * own needs (e.g. for flanking). 323 * @param queryInterval The baseline {@link SimpleInterval} to be modified. 324 * @return A {@link SimpleInterval} that has been modified for this {@link DataSourceFuncotationFactory}'s specific needs. 325 */ transformFeatureQueryInterval(final SimpleInterval queryInterval)326 protected SimpleInterval transformFeatureQueryInterval(final SimpleInterval queryInterval) { 327 return queryInterval; 328 } 329 330 /** 331 * Creates a {@link List} of {@link Funcotation} for the given {@code variant} and {@code referenceContext}. 332 * These will be default funcotations that essentially have empty values. 333 * @param variant {@link VariantContext} to annotate. 334 * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}. 335 * @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}. This should never be empty. 336 */ createDefaultFuncotationsOnVariant( final VariantContext variant, final ReferenceContext referenceContext)337 protected abstract List<Funcotation> createDefaultFuncotationsOnVariant( final VariantContext variant, final ReferenceContext referenceContext); 338 339 /** 340 * Creates a {@link List} of {@link Funcotation} for the given {@code variant}, {@code referenceContext}, and {@code featureContext}. 341 * @param variant {@link VariantContext} to annotate. 342 * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}. 343 * @param featureList {@link List} of {@link Feature} corresponding to the given {@code variant}. 344 * @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}. This should never be empty. 345 */ createFuncotationsOnVariant(final VariantContext variant, final ReferenceContext referenceContext, final List<Feature> featureList)346 protected abstract List<Funcotation> createFuncotationsOnVariant(final VariantContext variant, final ReferenceContext referenceContext, final List<Feature> featureList); 347 348 /** 349 * Creates a {@link List} of {@link Funcotation} for the given {@code variant}, {@code referenceContext}, {@code featureContext}, and {@code gencodeFuncotations}. 350 * For some Data Sources knowledge of Gene Name or Transcript ID is required for annotation. 351 * @param variant {@link VariantContext} to annotate. 352 * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}. 353 * @param featureList {@link List} of {@link Feature} corresponding to the given {@code variant}. 354 * @param gencodeFuncotations {@link List} of {@link GencodeFuncotation} that have already been created for the given {@code variant}/{@code referenceContext}/{@code featureContext}. 355 * @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}. This should never be empty. 356 */ createFuncotationsOnVariant(final VariantContext variant, final ReferenceContext referenceContext, final List<Feature> featureList, final List<GencodeFuncotation> gencodeFuncotations)357 protected abstract List<Funcotation> createFuncotationsOnVariant(final VariantContext variant, 358 final ReferenceContext referenceContext, 359 final List<Feature> featureList, 360 final List<GencodeFuncotation> gencodeFuncotations); 361 362 /** 363 * @return Get the {@link Class} of the feature type that can be used to create annotations by this {@link DataSourceFuncotationFactory}. 364 */ 365 @VisibleForTesting getAnnotationFeatureClass()366 public abstract Class<? extends Feature> getAnnotationFeatureClass(); 367 368 /** 369 * @return Whether this funcotation factory can support creating funcotations from segments. 370 */ isSupportingSegmentFuncotation()371 public boolean isSupportingSegmentFuncotation() { 372 return false; 373 } 374 375 /** 376 * Sublclasses that support annotating segments should override this method to create funcotations for segments. 377 * Additionally, those subclasses should override 378 * {@link DataSourceFuncotationFactory#isSupportingSegmentFuncotation()} to return true. 379 */ createFuncotationsOnSegment(final VariantContext segmentVariantContext, final ReferenceContext referenceContext, final List<Feature> featureList)380 protected List<Funcotation> createFuncotationsOnSegment(final VariantContext segmentVariantContext, 381 final ReferenceContext referenceContext, 382 final List<Feature> featureList) { 383 throw new GATKException.ShouldNeverReachHereException("This funcotation factory does not support the annotation of segments."); 384 } 385 386 387 } 388