1 package org.broadinstitute.hellbender.tools.funcotator;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import htsjdk.tribble.Feature;
5 import htsjdk.variant.variantcontext.VariantContext;
6 import org.apache.logging.log4j.LogManager;
7 import org.apache.logging.log4j.Logger;
8 import org.broadinstitute.barclay.utils.Utils;
9 import org.broadinstitute.hellbender.engine.FeatureContext;
10 import org.broadinstitute.hellbender.engine.FeatureInput;
11 import org.broadinstitute.hellbender.engine.ReferenceContext;
12 import org.broadinstitute.hellbender.exceptions.GATKException;
13 import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;
14 import org.broadinstitute.hellbender.utils.SimpleInterval;
15 
16 import java.io.Closeable;
17 import java.util.*;
18 
19 /**
20  * An abstract class to allow for the creation of a {@link Funcotation} for a given data source.
21  * Created by jonn on 8/30/17.
22  *
23  * Subclasses that support the annotation of segments must override:
24  *  getSupportedFuncotationFieldsForSegments()
25  *  isSupportingSegmentFuncotation()
26  *  createFuncotationsOnSegment(...)
27  *
28  */
29 public abstract class DataSourceFuncotationFactory implements Closeable {
30 
31     //==================================================================================================================
32 
33     /** Standard Logger.  */
34     protected static final Logger logger = LogManager.getLogger(DataSourceFuncotationFactory.class);
35 
36     /** Default version string for this {@link DataSourceFuncotationFactory}. */
37     @VisibleForTesting
38     public static final String DEFAULT_VERSION_STRING = "UNKNOWN_VERSION";
39 
40     /**
41      * Version number of this {@link DataSourceFuncotationFactory}.
42      */
43     protected String version = DEFAULT_VERSION_STRING;
44 
45     /**
46      * Map of ANNOTATION_NAME -> OVERRIDE_VALUE.
47      */
48     protected Map<String, String> annotationOverrideMap;
49 
50     /**
51      * Enables b37 data sources to be combined with hg19 data sources and work with the same input variants.
52      * Should only be used in cases where data sources cannot be made / found for hg19 and hg19 annotations are required.
53      * A value of {@code false} ONLY indicates that the data source is NOT b37.
54      * If {@code true}, the backing data behind this {@link DataSourceFuncotationFactory} is based on the b37 reference AND we are using hg19 data.
55      * If {@code false}, the backing data behind this the backing data behind this {@link DataSourceFuncotationFactory} is NOT based on the b37 reference.
56      */
57     protected boolean dataSourceIsB37 = false;
58 
59     /**
60      * The backing data store as a FeatureInput to leverage tribble querying.  Can be {@code null} for non-locatable
61      * funcotation factories.
62      */
63     protected final FeatureInput<? extends Feature> mainSourceFileAsFeatureInput;
64 
65     /**
66      * Minimum number of bases for a segment to be considered valid.
67      */
68     protected int minBasesForValidSegment;
69 
70     @VisibleForTesting
getMainSourceFileAsFeatureInput()71     public FeatureInput<? extends Feature> getMainSourceFileAsFeatureInput() {
72         return mainSourceFileAsFeatureInput;
73     }
74 
75     /**
76      * Constructor to initialize final fields in this class with defaults.
77      * @param minBasesForValidSegment The minimum number of bases for a segment to be considered valid.
78      */
DataSourceFuncotationFactory(final int minBasesForValidSegment)79     protected DataSourceFuncotationFactory(final int minBasesForValidSegment) {
80         this.mainSourceFileAsFeatureInput = null;
81         this.minBasesForValidSegment = minBasesForValidSegment;
82     }
83 
84     /**
85      * Constructor to initialize final fields in this class.
86      * @param mainSourceFileAsFeatureInput The backing data store as a FeatureInput to leverage tribble querying.  Can be {@code null} for non-locatable funcotation factories.
87      * @param minBasesForValidSegment The minimum number of bases for a segment to be considered valid.
88      */
DataSourceFuncotationFactory(final FeatureInput<? extends Feature> mainSourceFileAsFeatureInput, final int minBasesForValidSegment)89     protected DataSourceFuncotationFactory(final FeatureInput<? extends Feature> mainSourceFileAsFeatureInput,
90                                            final int minBasesForValidSegment) {
91         this.mainSourceFileAsFeatureInput = mainSourceFileAsFeatureInput;
92         this.minBasesForValidSegment = minBasesForValidSegment;
93     }
94 
95 
96     /**
97      * Set values in {@link DataSourceFuncotationFactory#annotationOverrideMap} based on the given annotation override values
98      * and whether or not this {@link DataSourceFuncotationFactory} supports those annotations.
99      * @param annotationOverrides The {@link Map} of annotation override key names and values.
100      */
initializeAnnotationOverrides(final LinkedHashMap<String, String> annotationOverrides)101     protected void initializeAnnotationOverrides(final LinkedHashMap<String, String> annotationOverrides) {
102         // Go through the Annotation Maps and check to see if the default/override annotation names are applicable for
103         // this FuncotationFactory:
104         final Set<String> supportedFuncotations = getSupportedFuncotationFields();
105         this.annotationOverrideMap = new HashMap<>();
106         for ( final String annotationOverrideKey : annotationOverrides.keySet() ) {
107             if ( supportedFuncotations.contains(annotationOverrideKey) ) {
108                 annotationOverrideMap.put( annotationOverrideKey, annotationOverrides.get(annotationOverrideKey) );
109             }
110         }
111     }
112 
113     //==================================================================================================================
114 
115     /**
116      * @return A {@link String} containing information about this {@link DataSourceFuncotationFactory}.
117      */
getInfoString()118     public String getInfoString() {
119         return getName() + " " + getVersion();
120     }
121 
122     /**
123      * Perform cleanup tasks for this {@link DataSourceFuncotationFactory}.
124      */
close()125     public void close() {}
126 
127     /**
128      * Apply the override values in {@link DataSourceFuncotationFactory#annotationOverrideMap} to every
129      * {@link Funcotation} in the given {@code outputFuncotations}.
130      * @param funcotationList {@link List} of {@link Funcotation} to which to apply override values.
131      */
setOverrideValuesInFuncotations(final List<Funcotation> funcotationList)132     protected void setOverrideValuesInFuncotations(final List<Funcotation> funcotationList) {
133         for ( final Funcotation funcotation : funcotationList ) {
134             funcotation.setFieldSerializationOverrideValues( annotationOverrideMap );
135         }
136     }
137 
138     /**
139      * @return The name of the data source corresponding to this {@link DataSourceFuncotationFactory}.
140      */
getName()141     public abstract String getName();
142 
143     /**
144      * @return The {@link org.broadinstitute.hellbender.tools.funcotator.FuncotatorArgumentDefinitions.DataSourceType} of this {@link DataSourceFuncotationFactory}.
145      */
getType()146     public abstract FuncotatorArgumentDefinitions.DataSourceType getType();
147 
148     /**
149      * @return The version of the data source corresponding to this {@link DataSourceFuncotationFactory}.
150      */
getVersion()151     public String getVersion() {
152         return version;
153     }
154 
155     /**
156      * @return {@code True} if this {@link DataSourceFuncotationFactory} requires features to create {@link Funcotation}s.  {@code False} otherwise.
157      */
158     @VisibleForTesting
requiresFeatures()159     public boolean requiresFeatures() { return true; }
160 
161     /**
162      * @return An ordered {@link LinkedHashSet} of the names of annotations that this Data Source supports.
163      */
getSupportedFuncotationFields()164     public abstract LinkedHashSet<String> getSupportedFuncotationFields();
165 
166     /**
167      * @return An ordered {@link LinkedHashSet} of the names of annotations that this Data Source supports when annotating segments.
168      */
getSupportedFuncotationFieldsForSegments()169     public LinkedHashSet<String> getSupportedFuncotationFieldsForSegments() {
170         return new LinkedHashSet<>();
171     }
172 
173     /**
174      * Creates a {@link List} of {@link Funcotation} for the given {@code variant}, {@code referenceContext}, and {@code featureContext}.
175      * Accounts for override values passed into the constructor as well.
176      * @param variant {@link VariantContext} to annotate.
177      * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}.
178      * @param featureContext {@link FeatureContext} corresponding to the variant.  Never {@code null}.
179      * @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}.  This should never be empty.
180      */
createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext)181     public List<Funcotation> createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext) {
182         return createFuncotations(variant, referenceContext, featureContext, null);
183     }
184 
185     /**
186      * Creates a {@link List} of {@link Funcotation} for the given {@code variant}, {@code referenceContext}, {@code featureContext}, and {@code gencodeFuncotations}.
187      * For some Data Sources knowledge of Gene Name or Transcript ID is required for annotation.
188      * Accounts for override values passed into the constructor as well.
189      * @param variant {@link VariantContext} to annotate.  Never {@code null}.
190      * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}.  Never {@code null}.
191      * @param featureContext {@link FeatureContext} corresponding to the variant.  Never {@code null}.
192      * @param gencodeFuncotations {@link List} of {@link GencodeFuncotation} that have already been created for the given {@code variant}/{@code referenceContext}/{@code featureContext}.
193      *   {@code null} is acceptable if there are no corresponding gencode funcotations.
194      * @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}.  This should never be empty.
195      */
createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext, final List<GencodeFuncotation> gencodeFuncotations)196     public List<Funcotation> createFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final FeatureContext featureContext, final List<GencodeFuncotation> gencodeFuncotations) {
197 
198         Utils.nonNull(variant);
199         Utils.nonNull(referenceContext);
200         Utils.nonNull(featureContext);
201 
202         final List<Funcotation> outputFuncotations;
203 
204         // Query this funcotation factory to get the list of overlapping features.
205         // NOTE: This will only get features that are LOCATABLE!
206         //       This corresponds to requiresFeatures() returning `True`.
207         final List<Feature> featureList = getFeaturesFromFeatureContext(featureContext);
208 
209         // If our featureList is compatible with this DataSourceFuncotationFactory, then we make our funcotations:
210         if ( isFeatureListCompatible(featureList) ) {
211             outputFuncotations = determineFuncotations(variant, referenceContext, featureList, gencodeFuncotations);
212 
213             // Set our overrides:
214             setOverrideValuesInFuncotations(outputFuncotations);
215         }
216         else {
217             return createDefaultFuncotationsOnVariant(variant, referenceContext);
218         }
219 
220         if ((outputFuncotations == null) || (outputFuncotations.size() == 0)) {
221             return createDefaultFuncotationsOnVariant(variant, referenceContext);
222         } else {
223             return outputFuncotations;
224         }
225     }
226 
getFeaturesFromFeatureContext(final FeatureContext featureContext)227     private List<Feature> getFeaturesFromFeatureContext(final FeatureContext featureContext) {
228         return requiresFeatures() ?
229                     queryFeaturesFromFeatureContext(featureContext) :
230                     Collections.emptyList();
231     }
232 
determineFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final List<Feature> featureList, final List<GencodeFuncotation> gencodeFuncotations)233     private List<Funcotation> determineFuncotations(final VariantContext variant, final ReferenceContext referenceContext, final List<Feature> featureList, final List<GencodeFuncotation> gencodeFuncotations) {
234 
235         // Create our funcotations:
236         final List<Funcotation> outputFuncotations;
237 
238         if (FuncotatorUtils.isSegmentVariantContext(variant, minBasesForValidSegment) && isSupportingSegmentFuncotation()) {
239             outputFuncotations = createFuncotationsOnSegment(variant, referenceContext, featureList);
240         } else {
241 
242             if (gencodeFuncotations == null) {
243                 outputFuncotations = createFuncotationsOnVariant(variant, referenceContext, featureList);
244             } else {
245                 outputFuncotations = createFuncotationsOnVariant(variant, referenceContext, featureList, gencodeFuncotations);
246             }
247         }
248         return outputFuncotations;
249     }
250 
251     /**
252      * Checks to see if the given featureList is compatible with this {@link DataSourceFuncotationFactory}.
253      * Cues off of the feature type in the feature list and whether the given list contains any non-null features.
254      * This method acts as a sanity-check before attempting to do any annotations on features.
255      * If this {@link DataSourceFuncotationFactory} does not require features as per {@link #requiresFeatures()}, then
256      * this method will always return {@code True}.
257      * @param featureList {@link List} of {@link Feature} that might be applicable to this {@link DataSourceFuncotationFactory} for annotation.
258      * @return {@code true} if the given {@code featureList} contains at least one non-null feature of type {@link #getAnnotationFeatureClass()}; {@code false} otherwise.
259      */
isFeatureListCompatible(final List<Feature> featureList)260     private boolean isFeatureListCompatible(final List<Feature> featureList) {
261         // Make sure these features can be annotated by this DataSourceFuncotationFactory.
262         // NOTE: We only check the first non-null element of the list for feature type:
263 
264         // The feature list is compatible if we found a compatible feature
265         // OR
266         // if this DataSourceFuncotationFactory does not require features.
267         if ( !requiresFeatures() ) {
268             return true;
269         }
270 
271         boolean foundCompatibleFeature = false;
272         for ( final Feature f : featureList ) {
273             if (f != null) {
274                 foundCompatibleFeature = getAnnotationFeatureClass().isAssignableFrom(f.getClass());
275                 break;
276             }
277         }
278         return foundCompatibleFeature;
279     }
280 
281     /**
282      * Queries the provided FeatureContext for Features from our FeatureInput {@link #mainSourceFileAsFeatureInput}.
283      * The default implementation returns all Features from our FeatureInput that overlap the FeatureContext's
284      * interval, but subclasses may override (for example, to pad the query).
285      *
286      * @param featureContext the FeatureContext to query
287      * @return Features from our FeatureInput {@link #mainSourceFileAsFeatureInput} queried from the FeatureContext
288      */
289     @SuppressWarnings("unchecked")
queryFeaturesFromFeatureContext(final FeatureContext featureContext)290     private List<Feature> queryFeaturesFromFeatureContext(final FeatureContext featureContext) {
291         final List<Feature> features;
292 
293         SimpleInterval queryInterval = featureContext.getInterval();
294 
295         // Do we need to do a fuzzy hg19 / b37 conversion for querying our features:
296         if ( dataSourceIsB37 ) {
297             // Create a B37 interval:
298             queryInterval = new SimpleInterval(
299                             FuncotatorUtils.convertHG19ContigToB37Contig(queryInterval.getContig()),
300                             queryInterval.getStart(),
301                             queryInterval.getEnd()
302                     );
303         }
304 
305         // Perform extra transformations on the query interval:
306         queryInterval = transformFeatureQueryInterval(queryInterval);
307 
308         // If the interval has not changed, we should use the original one:
309         if ( queryInterval.equals(featureContext.getInterval() ) ) {    // Get the features:
310             features = (List<Feature>) featureContext.getValues(mainSourceFileAsFeatureInput);
311         }
312         else {
313             // Query as normal:
314             features = (List<Feature>) featureContext.getValues(mainSourceFileAsFeatureInput, queryInterval);
315         }
316 
317         return features;
318     }
319 
320     /**
321      * A Method to allow {@link DataSourceFuncotationFactory} objects to adjust the query interval further for their
322      * own needs (e.g. for flanking).
323      * @param queryInterval The baseline {@link SimpleInterval} to be modified.
324      * @return A {@link SimpleInterval} that has been modified for this {@link DataSourceFuncotationFactory}'s specific needs.
325      */
transformFeatureQueryInterval(final SimpleInterval queryInterval)326     protected SimpleInterval transformFeatureQueryInterval(final SimpleInterval queryInterval) {
327         return queryInterval;
328     }
329 
330     /**
331      * Creates a {@link List} of {@link Funcotation} for the given {@code variant} and {@code referenceContext}.
332      * These will be default funcotations that essentially have empty values.
333      * @param variant {@link VariantContext} to annotate.
334      * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}.
335      * @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}.  This should never be empty.
336      */
createDefaultFuncotationsOnVariant( final VariantContext variant, final ReferenceContext referenceContext)337     protected abstract List<Funcotation> createDefaultFuncotationsOnVariant( final VariantContext variant, final ReferenceContext referenceContext);
338 
339     /**
340      * Creates a {@link List} of {@link Funcotation} for the given {@code variant}, {@code referenceContext}, and {@code featureContext}.
341      * @param variant {@link VariantContext} to annotate.
342      * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}.
343      * @param featureList {@link List} of {@link Feature} corresponding to the given {@code variant}.
344      * @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}.  This should never be empty.
345      */
createFuncotationsOnVariant(final VariantContext variant, final ReferenceContext referenceContext, final List<Feature> featureList)346     protected abstract List<Funcotation> createFuncotationsOnVariant(final VariantContext variant, final ReferenceContext referenceContext, final List<Feature> featureList);
347 
348     /**
349      * Creates a {@link List} of {@link Funcotation} for the given {@code variant}, {@code referenceContext}, {@code featureContext}, and {@code gencodeFuncotations}.
350      * For some Data Sources knowledge of Gene Name or Transcript ID is required for annotation.
351      * @param variant {@link VariantContext} to annotate.
352      * @param referenceContext {@link ReferenceContext} corresponding to the given {@code variant}.
353      * @param featureList {@link List} of {@link Feature} corresponding to the given {@code variant}.
354      * @param gencodeFuncotations {@link List} of {@link GencodeFuncotation} that have already been created for the given {@code variant}/{@code referenceContext}/{@code featureContext}.
355      * @return {@link List} of {@link Funcotation} given the {@code variant}, {@code referenceContext}, and {@code featureContext}.  This should never be empty.
356      */
createFuncotationsOnVariant(final VariantContext variant, final ReferenceContext referenceContext, final List<Feature> featureList, final List<GencodeFuncotation> gencodeFuncotations)357     protected abstract List<Funcotation> createFuncotationsOnVariant(final VariantContext variant,
358                                                                   final ReferenceContext referenceContext,
359                                                                   final List<Feature> featureList,
360                                                                   final List<GencodeFuncotation> gencodeFuncotations);
361 
362     /**
363      * @return Get the {@link Class} of the feature type that can be used to create annotations by this {@link DataSourceFuncotationFactory}.
364      */
365     @VisibleForTesting
getAnnotationFeatureClass()366     public abstract Class<? extends Feature> getAnnotationFeatureClass();
367 
368     /**
369      * @return Whether this funcotation factory can support creating funcotations from segments.
370      */
isSupportingSegmentFuncotation()371     public boolean isSupportingSegmentFuncotation() {
372         return false;
373     }
374 
375     /**
376      * Sublclasses that support annotating segments should override this method to create funcotations for segments.
377      * Additionally, those subclasses should override
378      *  {@link DataSourceFuncotationFactory#isSupportingSegmentFuncotation()} to return true.
379      */
createFuncotationsOnSegment(final VariantContext segmentVariantContext, final ReferenceContext referenceContext, final List<Feature> featureList)380     protected List<Funcotation> createFuncotationsOnSegment(final VariantContext segmentVariantContext,
381                                                          final ReferenceContext referenceContext,
382                                                          final List<Feature> featureList) {
383         throw new GATKException.ShouldNeverReachHereException("This funcotation factory does not support the annotation of segments.");
384     }
385 
386 
387 }
388