1 package org.broadinstitute.hellbender.utils.codecs.gtf;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import htsjdk.samtools.util.Locatable;
5 import htsjdk.tribble.Feature;
6 import htsjdk.tribble.annotation.Strand;
7 import org.apache.commons.lang3.StringUtils;
8 import org.apache.logging.log4j.LogManager;
9 import org.apache.logging.log4j.Logger;
10 import org.broadinstitute.hellbender.exceptions.UserException;
11 import org.broadinstitute.hellbender.utils.SimpleInterval;
12 import org.broadinstitute.hellbender.utils.Utils;
13 
14 import java.util.*;
15 import java.util.regex.Pattern;
16 import java.util.stream.Collectors;
17 
18 /**
19  * A {@link GencodeGtfFeature} represents data in a GENCODE GTF file.
20  *
21  * Features are grouped logically by related data.
22  * While the abstract class {@link GencodeGtfFeature} represents a single line
23  * of a GENCODE GTF File, the concrete instantiations represent at least one line,
24  * and often more than one.
25  *
26  * For example, a {@link GencodeGtfGeneFeature} represents all lines in the given
27  * data file with information on a particular gene.  This includes all transcripts,
28  * exons, coding regions, etc. in that gene.
29  *
30  * Similarly, a {@link GencodeGtfTranscriptFeature} represents all lines in the given
31  * data file with information on a particular transcript.
32  *
33  * However, a {@link GencodeGtfSelenocysteineFeature} represents a particular line
34  * in the given data file that contains information on a specific selenocysteine.
35  *
36  * The specification of a GTF file is defined here:
37  * http://mblab.wustl.edu/GTF22.html
38  *
39  * Currently only supports GENCODE versions 19-26.
40  *
41  * Created by jonn on 7/21/17.
42  */
43 public abstract class GencodeGtfFeature implements Feature, Comparable<GencodeGtfFeature> {
44 
    private static final Logger logger = LogManager.getLogger(GencodeGtfFeature.class);

    // ===========================================================================

    // Annotation source names. These match the constant names of the AnnotationSource enum declared in this class.

    public static final String ANNOTATION_SOURCE_ENSEMBL = "ENSEMBL";
    public static final String ANNOTATION_SOURCE_HAVANA = "HAVANA";
    public static final String ANNOTATION_SOURCE_ENA = "ena";

    // ===========================================================================

    // Metadata fields:

    // Separator used to split a raw GTF line into its columns.
    private static final String FIELD_DELIMITER                 = "\t";

    // Sentinel values meaning "not set".
    // NO_EXON_NUMBER is checked during serialization to decide whether exon_number is written.
    public static final int NO_FEATURE_ORDER                    = -1;
    public static final int NO_EXON_NUMBER                      = -1;

    // Number of columns the String[] constructor requires.
    private static final int NUM_FIELDS                         = 9;

    // Zero-based positions of the columns read by the String[] constructor.
    // Column 5 is never read; serialization always writes "." in that position.
    private static final int CHROMOSOME_NAME_INDEX              = 0;
    private static final int ANNOTATION_SOURCE_INDEX            = 1;
    private static final int FEATURE_TYPE_INDEX                 = 2;
    private static final int START_LOCATION_INDEX               = 3;
    private static final int END_LOCATION_INDEX                 = 4;
    private static final int GENOMIC_STRAND_INDEX               = 6;
    private static final int GENOMIC_PHASE_INDEX                = 7;
    private static final int EXTRA_FIELDS_INDEX                 = 8;

    // Separator between the individual key/value entries of the last column.
    private static final String EXTRA_FIELD_DELIMITER           = ";";

    // NOTE(review): these two indices are not referenced in the part of the class reviewed here - confirm they are still used.
    private static final int EXTRA_FIELD_KEY_INDEX              = 0;
    private static final int EXTRA_FIELD_VALUE_INDEX            = 1;
    // Separator between the key and the value inside a single entry of the last column.
    public static final String EXTRA_FIELD_KEY_VALUE_SPLITTER   = " ";

    // Matches strings made up of one or more digits only.
    // OptionalField#toString uses it to decide whether a value is written without quotes.
    private static final Pattern NUMBER_PATTERN                 = Pattern.compile("\\d\\d*");

    // Null until setUcscGenomeVersion is called.
    private String ucscGenomeVersion =  null;
    // Holder for every value of this feature; all accessors of this class read from it.
    @VisibleForTesting
    final GencodeGtfFeatureBaseData baseData;
85 
86     // ================================================================================================
87 
88     /**
89      * Populate this GencodeGtfFeature with the given data.
90      * @param gtfFields {@link String[]} containing an ordered list of fields to use to populate this {@link GencodeGtfFeature}.
91      * @param gtfFileType A {@link String} containing the file type of the GTF data that created this {@link GencodeGtfFeature}.
92      */
GencodeGtfFeature(final String[] gtfFields, final String gtfFileType)93     protected GencodeGtfFeature(final String[] gtfFields, final String gtfFileType) {
94 
95         Utils.validateArg(gtfFields.length == NUM_FIELDS, "Unexpected number of fields: " + gtfFields.length + " != " + NUM_FIELDS);
96 
97         baseData = new GencodeGtfFeatureBaseData();
98 
99         try {
100             baseData.genomicPosition = new SimpleInterval(
101                     gtfFields[CHROMOSOME_NAME_INDEX],
102                     Integer.valueOf(gtfFields[START_LOCATION_INDEX]),
103                     Integer.valueOf(gtfFields[END_LOCATION_INDEX])
104             );
105         }
106         catch (final NumberFormatException ex) {
107             throw new UserException.MalformedFile("Cannot read integer value for start/end position!");
108         }
109 
110         baseData.gtfSourceFileType       = gtfFileType;
111 
112         baseData.annotationSource        = gtfFields[ANNOTATION_SOURCE_INDEX];
113         baseData.featureType             = GencodeGtfFeature.FeatureType.getEnum( gtfFields[FEATURE_TYPE_INDEX].toLowerCase() );
114         baseData.genomicStrand           = convertStringToStrand( gtfFields[GENOMIC_STRAND_INDEX] );
115         baseData.genomicPhase            = GenomicPhase.getEnum( gtfFields[GENOMIC_PHASE_INDEX] );
116 
117         // Get the extra fields from the last column:
118         final String[] extraFields    = gtfFields[EXTRA_FIELDS_INDEX].split(EXTRA_FIELD_DELIMITER, -1);
119 
120         final StringBuilder anonymousOptionalFieldBuilder = new StringBuilder();
121 
122         // Now there are "optional" fields to go through (some actually required, some actually optional),
123         // But we need to match up the field names to the fields themselves:
124         for ( final String extraField : extraFields ) {
125 
126             final String trimmedExtraField = extraField.trim();
127             if (trimmedExtraField.isEmpty()) {
128                 continue;
129             }
130 
131             final int splitPoint = trimmedExtraField.indexOf(EXTRA_FIELD_KEY_VALUE_SPLITTER);
132             if( splitPoint == -1 ) {
133                 throw new UserException.MalformedFile("Extraneous optional field data - not in a key/value pair: " + extraField);
134             }
135 
136             final String fieldName = trimmedExtraField.substring(0, splitPoint).trim();
137 
138             // The value of the field may be between two quotes.
139             // We remove them here.
140             final String rawFieldValue = trimmedExtraField.substring(splitPoint + 1, trimmedExtraField.length());
141             final String fieldValue = StringUtils.remove(rawFieldValue.trim(), '"');
142 
143             if( fieldValue.contains(EXTRA_FIELD_KEY_VALUE_SPLITTER) ){
144                 throw new UserException("Expected a key/value pair but found several values " + fieldName + "/" + fieldValue);
145             }
146 
147             OptionalField<?> optionalField = null;
148 
149             switch (fieldName) {
150                 // Find the right field to set:
151                 case "gene_id":
152                     baseData.geneId = fieldValue;
153                     break;
154                 case "transcript_id":
155                     baseData.transcriptId = fieldValue;
156                     break;
157                 case "gene_type":
158                     baseData.geneType = GeneTranscriptType.getEnum(fieldValue);
159                     break;
160                 // For ENSEMBL GTF files:
161                 case "gene_biotype":
162                     baseData.geneType = GeneTranscriptType.getEnum(fieldValue);
163                     break;
164                 case "gene_status":
165                     baseData.geneStatus = GeneTranscriptStatus.valueOf(fieldValue);
166                     break;
167                 case "gene_name":
168                     baseData.geneName = fieldValue;
169                     break;
170                 case "transcript_type":
171                     baseData.transcriptType = GeneTranscriptType.getEnum(fieldValue);
172                     break;
173                 case "transcript_biotype":
174                     baseData.transcriptType = GeneTranscriptType.getEnum(fieldValue);
175                     break;
176                 case "transcript_status":
177                     baseData.transcriptStatus = GeneTranscriptStatus.valueOf(fieldValue);
178                     break;
179                 case "transcript_name":
180                     baseData.transcriptName = fieldValue;
181                     break;
182                 case "exon_number":
183                     try {
184                         baseData.exonNumber = Integer.valueOf(fieldValue);
185                     }
186                     catch (final NumberFormatException ex) {
187                         throw new UserException.MalformedFile("Could not convert field value into integer: " + fieldValue);
188                     }
189                     break;
190                 case "exon_id":
191                     baseData.exonId = fieldValue;
192                     break;
193                 case "level":
194                     baseData.locusLevel = LocusLevel.getEnum(fieldValue);
195                     break;
196                 case "tag":
197                     optionalField = new OptionalField<>(fieldName, FeatureTag.getEnum(fieldValue));
198                     break;
199                 case "ccdsid":
200                     optionalField = new OptionalField<>(fieldName, fieldValue);
201                     break;
202                 case "havana_gene":
203                     optionalField = new OptionalField<>(fieldName, fieldValue);
204                     break;
205                 case "havana_transcript":
206                     optionalField = new OptionalField<>(fieldName, fieldValue);
207                     break;
208                 case "protein_id":
209                     optionalField = new OptionalField<>(fieldName, fieldValue);
210                     break;
211                 case "ont":
212                     optionalField = new OptionalField<>(fieldName, fieldValue);
213                     break;
214                 case "transcript_support_level":
215                     optionalField = new OptionalField<>(fieldName, TranscriptSupportLevel.getEnum(fieldValue));
216                     break;
217                 case "remap_status":
218                     optionalField = new OptionalField<>(fieldName, RemapStatus.getEnum(fieldValue));
219                     break;
220                 case "remap_original_id":
221                     optionalField = new OptionalField<>(fieldName, fieldValue);
222                     break;
223                 case "remap_original_location":
224                     try {
225                         optionalField = new OptionalField<>(fieldName, Long.valueOf(fieldValue));
226                     }
227                     catch (final NumberFormatException nfe) {
228                         // We must have gotten a field that has a different format.
229                         // For now, just copy it over:
230                         optionalField = new OptionalField<>(fieldName, fieldValue);
231                     }
232                     break;
233                 case "remap_num_mappings":
234                     optionalField = new OptionalField<>(fieldName, Long.valueOf(fieldValue));
235                     break;
236                 case "remap_target_status":
237                     optionalField = new OptionalField<>(fieldName, RemapTargetStatus.getEnum(fieldValue));
238                     break;
239                 case "remap_substituted_missing_target":
240                     optionalField = new OptionalField<>(fieldName, fieldValue);
241                     break;
242                 default:
243                     anonymousOptionalFieldBuilder.append(extraField);
244                     anonymousOptionalFieldBuilder.append(EXTRA_FIELD_DELIMITER);
245                     break;
246             }
247 
248             // If the optional field was good, we add it:
249             if ( optionalField != null ) {
250                 baseData.optionalFields.add(optionalField);
251             }
252         }
253 
254         // Save our anonymous optional fields:
255         if ( anonymousOptionalFieldBuilder.length() != 0 ) {
256             baseData.anonymousOptionalFields = anonymousOptionalFieldBuilder.toString();
257         }
258     }
259 
260     /**
261      * Converts the given {@link String} into a {@link Strand}.
262      * @param s {@link String} to convert into a {@link Strand}.
263      * @return The {@link Strand} corresponding to {@code s}.
264      */
convertStringToStrand( final String s )265     private static Strand convertStringToStrand( final String s ) {
266         if ( s.equals("+") ) {
267             return Strand.POSITIVE;
268         }
269         else if ( s.equals("-") ) {
270             return Strand.NEGATIVE;
271         }
272         else {
273             throw new IllegalArgumentException("Unexpected value: " + s);
274         }
275     }
276 
    /**
     * Create a {@link GencodeGtfFeature} backed directly by the given {@link GencodeGtfFeatureBaseData}.
     * The object is stored by reference; this constructor neither copies nor validates it.
     * @param baseData {@link GencodeGtfFeatureBaseData} holding the data for this feature.
     */
    protected GencodeGtfFeature(final GencodeGtfFeatureBaseData baseData) {
        this.baseData = baseData;
    }
283 
284     // ================================================================================================
285 
286     /**
287      * Create the appropriate {@link GencodeGtfFeature} object based on the given {@code baseData}
288      * @param baseData A {@link GencodeGtfFeatureBaseData} object containing all data for a single line in a GENCODE GTF File.
289      * @return A {@link GencodeGtfFeature} containing the data in {@code baseData}
290      */
create(final GencodeGtfFeatureBaseData baseData)291     public static GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
292         Utils.nonNull(baseData);
293 
294         // Create our feature:
295         return baseData.featureType.create(baseData);
296     }
297 
298     /**
299      * Create a {@link GencodeGtfFeature} based on a line from a Gencode GTF File.
300      * @param gtfLine A line from a Gencode GTF File to convert into a {@link GencodeGtfFeature} object.
301      * @param gtfFileType A {@link String} containing the file type of the GTF data that created this {@link GencodeGtfFeature}.
302      * @return The {@link GencodeGtfFeature} representing the information in {@code gtfLine}
303      */
create(final String gtfLine, final String gtfFileType)304     public static GencodeGtfFeature create(final String gtfLine, final String gtfFileType) {
305         Utils.nonNull(gtfLine);
306         return create(gtfLine.split(FIELD_DELIMITER), gtfFileType);
307     }
308 
309     /**
310      * Create a {@link GencodeGtfFeature} based on a line from a Gencode GTF File.
311      * @param gtfFields A line from a Gencode GTF File split on the {@link #FIELD_DELIMITER} character.
312      * @param gtfFileType A {@link String} containing the file type of the GTF data that created this {@link GencodeGtfFeature}.
313      * @return The {@link GencodeGtfFeature} representing the information in {@code gtfLine}
314      */
create(final String[] gtfFields, final String gtfFileType)315     public static GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
316         Utils.nonNull(gtfFields);
317 
318         // Ensure that the input data are superficially well-formed:
319         if ( gtfFields.length != GencodeGtfCodec.NUM_COLUMNS ) {
320             throw new UserException.MalformedFile("Invalid number of fields in the given GENCODE line " +
321                     " - Given: " + gtfFields.length + " Expected: " + GencodeGtfCodec.NUM_COLUMNS);
322         }
323 
324         final FeatureType featureType = FeatureType.getEnum( gtfFields[FEATURE_TYPE_INDEX] );
325 
326         // Return our feature:
327         return featureType.create(gtfFields, gtfFileType);
328     }
329 
330     // ================================================================================================
331 
    /**
     * @return the contig of this feature's genomic position.
     */
    @Override
    public String getContig() {
        return baseData.genomicPosition.getContig();
    }
336 
    /**
     * @return the start of this feature's genomic position.
     */
    @Override
    public int getStart() {
        return baseData.genomicPosition.getStart();
    }
341 
    /**
     * @return the end of this feature's genomic position.
     */
    @Override
    public int getEnd() {
        return baseData.genomicPosition.getEnd();
    }
346 
347     // ================================================================================================
348 
349     /**
350      * Get all the features from this {@link GencodeGtfFeature} itself.
351      * This is useful to get any subfeatures included in this {@link GencodeGtfFeature}.
352      * @return A {@link List} of the features represented in this {@link GencodeGtfFeature}.
353      */
354     @VisibleForTesting
getAllFeatures()355     List<GencodeGtfFeature> getAllFeatures() {
356         final List<GencodeGtfFeature> list = new ArrayList<>();
357         list.add(this);
358         return list;
359     }
360 
361     /**
362      * Serializes the base data in {@link GencodeGtfFeature} to a string.
363      * @return a {@link String} representing this {@link GencodeGtfFeature}
364      */
serializeToStringHelper()365     private String serializeToStringHelper() {
366 
367         final StringBuilder stringBuilder = new StringBuilder();
368 
369         stringBuilder.append( baseData.genomicPosition.getContig() );
370         stringBuilder.append( '\t' );
371         stringBuilder.append( baseData.annotationSource );
372         stringBuilder.append( '\t' );
373         stringBuilder.append( baseData.featureType );
374         stringBuilder.append( '\t' );
375         stringBuilder.append( baseData.genomicPosition.getStart() );
376         stringBuilder.append( '\t' );
377         stringBuilder.append( baseData.genomicPosition.getEnd() );
378         stringBuilder.append( "\t.\t" );
379         stringBuilder.append( baseData.genomicStrand );
380         stringBuilder.append( '\t' );
381         stringBuilder.append( baseData.genomicPhase );
382         stringBuilder.append( '\t' );
383 
384         if ( baseData.geneId != null ) {
385             stringBuilder.append("gene_id \"");
386             stringBuilder.append(baseData.geneId);
387             stringBuilder.append( "\"; " );
388         }
389         if ( baseData.transcriptId != null) {
390             stringBuilder.append("transcript_id \"");
391             stringBuilder.append(baseData.transcriptId);
392             stringBuilder.append( "\"; " );
393         }
394         if ( baseData.geneType != null ) {
395             stringBuilder.append("gene_type \"");
396             stringBuilder.append(baseData.geneType);
397             stringBuilder.append( "\"; " );
398         }
399         if ( baseData.geneStatus != null ) {
400             stringBuilder.append("gene_status \"");
401             stringBuilder.append(baseData.geneStatus);
402             stringBuilder.append( "\"; " );
403         }
404         if ( baseData.geneName != null ) {
405             stringBuilder.append("gene_name \"");
406             stringBuilder.append(baseData.geneName);
407             stringBuilder.append( "\"; " );
408         }
409         if ( baseData.transcriptType != null ) {
410             stringBuilder.append("transcript_type \"");
411             stringBuilder.append(baseData.transcriptType);
412             stringBuilder.append( "\"; " );
413         }
414         if ( baseData.transcriptStatus != null ) {
415             stringBuilder.append("transcript_status \"");
416             stringBuilder.append(baseData.transcriptStatus);
417             stringBuilder.append( "\"; " );
418         }
419         if ( baseData.transcriptName != null ) {
420             stringBuilder.append("transcript_name \"");
421             stringBuilder.append(baseData.transcriptName);
422             stringBuilder.append( "\"; " );
423         }
424         if ( baseData.exonNumber != NO_EXON_NUMBER ) {
425             stringBuilder.append("exon_number ");
426             stringBuilder.append(baseData.exonNumber);
427             stringBuilder.append( "; " );
428         }
429         if ( baseData.exonId != null) {
430             stringBuilder.append("exon_id \"");
431             stringBuilder.append(baseData.exonId);
432             stringBuilder.append( "\"; ");
433         }
434         if (baseData.locusLevel != null) {
435             stringBuilder.append("level ");
436             stringBuilder.append(baseData.locusLevel);
437             stringBuilder.append("; ");
438         }
439 
440         // = = = = = = = = = = = = = = = = = = = = = = =
441 
442         // Output our optional fields:
443         stringBuilder.append(
444                 baseData.optionalFields.stream().map(Object::toString).collect(Collectors.joining(" "))
445         );
446 
447         if ( baseData.anonymousOptionalFields != null ) {
448             stringBuilder.append(baseData.anonymousOptionalFields);
449         }
450 
451         return stringBuilder.toString().trim();
452     }
453 
454     /**
455      * Serializes all data in {@link GencodeGtfFeature} to a string.
456      * This includes all subfields of child classes.
457      * @return a {@link String} representing this {@link GencodeGtfFeature}
458      */
serializeToString()459     public String serializeToString() {
460         final StringBuilder stringBuilder = new StringBuilder();
461 
462         final List<GencodeGtfFeature> features = getAllFeatures();
463         Collections.sort( features );
464 
465         for ( final GencodeGtfFeature feature : features ) {
466             stringBuilder.append( feature.serializeToStringHelper() );
467             stringBuilder.append("\n");
468         }
469 
470         return stringBuilder.toString().trim();
471     }
472 
    /**
     * @return the same text as {@link #serializeToString()}.
     */
    @Override
    public String toString() {
        return serializeToString();
    }
477 
478     // ================================================================================================
479 
    /** @return the GTF file type stored in the base data (the {@code gtfFileType} given when this feature was parsed from GTF fields). */
    public String getGtfSourceFileType() { return baseData.gtfSourceFileType; }
481 
    /** @return the UCSC genome version last given to {@link #setUcscGenomeVersion(String)}, or {@code null} if it was never set. */
    public String getUcscGenomeVersion() {
        return ucscGenomeVersion;
    }
485 
    /**
     * Sets the UCSC genome version of this feature.  The value takes part in {@link #equals(Object)}.
     * @param ucscGenomeVersion the version string to store.
     */
    public void setUcscGenomeVersion(final String ucscGenomeVersion) {
        this.ucscGenomeVersion = ucscGenomeVersion;
    }
489 
    /** @return the {@link SimpleInterval} held in the base data as this feature's genomic position (the same object, not a copy). */
    public SimpleInterval getGenomicPosition() { return baseData.genomicPosition; }
491 
    /** @return the feature order number held in the base data; this is the value {@link #compareTo(GencodeGtfFeature)} orders by. */
    public int getFeatureOrderNumber() { return baseData.featureOrderNumber; }
493 
    /** @return the contig of this feature's genomic position; same value as {@link #getContig()}. */
    public String getChromosomeName() {
        return baseData.genomicPosition.getContig();
    }
497 
    /** @return the annotation source held in the base data (the second GTF column when parsed from GTF fields). */
    public String getAnnotationSource() {
        return baseData.annotationSource;
    }
501 
    /** @return the {@link FeatureType} held in the base data. */
    public FeatureType getFeatureType() {
        return baseData.featureType;
    }
505 
    /** @return the start of this feature's genomic position; same value as {@link #getStart()}. */
    public int getGenomicStartLocation() {
        return baseData.genomicPosition.getStart();
    }
509 
    /** @return the end of this feature's genomic position; same value as {@link #getEnd()}. */
    public int getGenomicEndLocation() {
        return baseData.genomicPosition.getEnd();
    }
513 
    /** @return the {@link Strand} held in the base data. */
    public Strand getGenomicStrand() {
        return baseData.genomicStrand;
    }
517 
    /** @return the {@link GenomicPhase} held in the base data. */
    public GenomicPhase getGenomicPhase() {
        return baseData.genomicPhase;
    }
521 
    /** @return the gene ID held in the base data (the {@code gene_id} entry when parsed from GTF fields). */
    public String getGeneId() {
        return baseData.geneId;
    }
525 
    /** @return the transcript ID held in the base data (the {@code transcript_id} entry when parsed from GTF fields). */
    public String getTranscriptId() {
        return baseData.transcriptId;
    }
529 
    /** @return the gene type held in the base data (the {@code gene_type} or {@code gene_biotype} entry when parsed from GTF fields). */
    public GeneTranscriptType getGeneType() {
        return baseData.geneType;
    }
533 
    /** @return the gene name held in the base data (the {@code gene_name} entry when parsed from GTF fields). */
    public String getGeneName() {
        return baseData.geneName;
    }
537 
    /** @return the transcript type held in the base data (the {@code transcript_type} or {@code transcript_biotype} entry when parsed from GTF fields). */
    public GeneTranscriptType getTranscriptType() {
        return baseData.transcriptType;
    }
541 
    /** @return the transcript name held in the base data (the {@code transcript_name} entry when parsed from GTF fields). */
    public String getTranscriptName() {
        return baseData.transcriptName;
    }
545 
    /** @return the gene status held in the base data (the {@code gene_status} entry when parsed from GTF fields). */
    public GeneTranscriptStatus getGeneStatus() {
        return baseData.geneStatus;
    }
549 
    /** @return the transcript status held in the base data (the {@code transcript_status} entry when parsed from GTF fields). */
    public GeneTranscriptStatus getTranscriptStatus() {
        return baseData.transcriptStatus;
    }
553 
    /**
     * @return the exon number held in the base data (the {@code exon_number} entry when parsed from GTF fields).
     *         Serialization treats {@link #NO_EXON_NUMBER} as "no exon number".
     */
    public int getExonNumber() {
        return baseData.exonNumber;
    }
557 
    /** @return the exon ID held in the base data (the {@code exon_id} entry when parsed from GTF fields). */
    public String getExonId() {
        return baseData.exonId;
    }
561 
    /** @return the locus level held in the base data (the {@code level} entry when parsed from GTF fields). */
    public LocusLevel getLocusLevel() {
        return baseData.locusLevel;
    }
565 
    /** @return the list of {@link OptionalField}s held in the base data.  This is the stored list itself, not a copy. */
    public List<OptionalField<?>> getOptionalFields() {
        return baseData.optionalFields;
    }
569 
    /**
     * @return the anonymous optional fields held in the base data.  When parsed from GTF fields this is the
     *         concatenation of every entry with an unrecognised key, each followed by {@code ";"}.
     */
    public String getAnonymousOptionalFields() {
        return baseData.anonymousOptionalFields;
    }
573 
getOptionalField(final String key)574     public OptionalField<?> getOptionalField(final String key) {
575         for (final OptionalField<?> optionalField : baseData.optionalFields) {
576             if ( optionalField.getName().equals(key) ) {
577                 return optionalField;
578             }
579         }
580         return null;
581     }
582 
583     /**
584      * Comparable interface implementation for {@link GencodeGtfFeature}.
585      *
586      * Order is determined by {@link GencodeGtfFeatureBaseData#featureOrderNumber}
587      *
588      * @param other {@link GencodeGtfFeature} to which to compare
589      * @return -1 if this < other; 0 if this == other; 1 if this > other
590      */
591     @Override
compareTo(final GencodeGtfFeature other)592     public int compareTo(final GencodeGtfFeature other) {
593         Utils.nonNull(other);
594         return (baseData.featureOrderNumber - other.baseData.featureOrderNumber);
595     }
596 
597     @Override
equals(final Object that)598     public boolean equals(final Object that) {
599         if (that == null) {
600             return false;
601         }
602         else if ( this == that ) {
603             return true;
604         }
605 
606         boolean isEqual = that instanceof GencodeGtfFeature;
607         if (isEqual) {
608             final GencodeGtfFeature thatFeature = (GencodeGtfFeature) that;
609             isEqual = Objects.equals(baseData, thatFeature.baseData);
610 
611             if ( isEqual ) {
612                 isEqual = ucscGenomeVersion.equals( thatFeature.getUcscGenomeVersion() );
613             }
614         }
615 
616         return isEqual;
617     }
618 
619     @Override
hashCode()620     public int hashCode() {
621         return baseData != null ? baseData.hashCode() : 0;
622     }
623 
    /**
     * Checks if {@code other} is contained within this {@link GencodeGtfFeature}.
     * The check is delegated to {@link SimpleInterval#contains(Locatable)} on {@link GencodeGtfFeatureBaseData#genomicPosition}.
     * @param other {@link Locatable} of which to check the bounds.
     * @return true if {@code other} is contained within the bounds of this {@link GencodeGtfFeature}, false otherwise.
     */
    public boolean contains(final Locatable other) {
        return baseData.genomicPosition.contains(other);
    }
633 
    /**
     * Checks if {@code other} overlaps with this {@link GencodeGtfFeature}.
     * The check is delegated to {@link SimpleInterval#overlaps(Locatable)} on {@link GencodeGtfFeatureBaseData#genomicPosition}.
     * @param other {@link Locatable}-derived class of which to check the bounds.
     * @return true if {@code other} overlaps the bounds of this {@link GencodeGtfFeature}, false otherwise.
     */
    public boolean overlaps(final Locatable other) {
        return baseData.genomicPosition.overlaps(other);
    }
643 
    /**
     * Sets the feature order number in the base data; this is the value {@link #compareTo(GencodeGtfFeature)} orders by.
     * @param featureOrderNumber the order number to store.
     */
    public void setFeatureOrderNumber(final int featureOrderNumber) {
        this.baseData.featureOrderNumber = featureOrderNumber;
    }
647 
648     // ================================================================================================
649 
650     static public class OptionalField<T> {
651 
652         private String name;
653         private T value;
654 
OptionalField(final String name, final T value)655         public OptionalField(final String name, final T value) {
656             this.name = name;
657             this.value = value;
658         }
659 
getName()660         public String getName() {
661             return name;
662         }
663 
setName(final String name)664         public void setName(final String name) {
665             this.name = name;
666         }
667 
getValue()668         public T getValue() {
669             return value;
670         }
671 
setValue(final T value)672         public void setValue(final T value) {
673             this.value = value;
674         }
675 
676         @Override
toString()677         public String toString() {
678 
679             final StringBuilder sb = new StringBuilder();
680 
681             sb.append(name);
682             sb.append(" ");
683 
684             // We need to do some formatting for the numbers / non-numbers in the field:
685             final String valueString = value.toString();
686             if ( NUMBER_PATTERN.matcher(valueString).matches() ) {
687                 sb.append(valueString);
688                 sb.append(";");
689             }
690             else {
691                 sb.append("\"");
692                 sb.append(valueString);
693                 sb.append("\";");
694             }
695 
696             return sb.toString();
697         }
698 
699         @Override
hashCode()700         public int hashCode() {
701             int result = name != null ? name.hashCode() : 0;
702             result = 31 * result + (value != null ? value.hashCode() : 0);
703             return result;
704         }
705 
706         @Override
equals(final Object other)707         public boolean equals(final Object other) {
708 
709             if (other == null) {
710                 return false;
711             }
712             else if ( this == other ) {
713                 return true;
714             }
715 
716             if ( !(other instanceof OptionalField) ) {
717                 return false;
718             }
719 
720             final OptionalField<?> otherOptionalField = (OptionalField<?>) other;
721 
722             return (name.equals(otherOptionalField.name)) &&
723                     (value.equals(otherOptionalField.value));
724         }
725     }
726 
727     // ================================================================================================
728 
729 
730 
731     // ================================================================================================
732 
733     /**
734      * Keyword identifying the source of the feature, like a program
735      * (e.g. Augustus or RepeatMasker) or an organization (like TAIR).
736      *
737      * For more information, see:
738      *     https://www.gencodegenes.org/data_format.html
739      *     https://en.wikipedia.org/wiki/General_feature_format
740      */
    public enum AnnotationSource {
        ENSEMBL,
        HAVANA,
        // NOTE(review): this constant is lowercase, unlike its siblings; presumably it mirrors the
        // raw text found in the source column of the file -- confirm against the parsing code before renaming.
        ena // From ENSEMBL GTFs
    }
746 
747     /**
748      * Type of the feature represented in a single line of a GENCODE GTF File.
749      *
750      * For more information, see:
751      *     https://www.gencodegenes.org/data_format.html
752      *     https://en.wikipedia.org/wiki/General_feature_format
753      */
754     public enum FeatureType {
755         GENE("gene"){
create(final GencodeGtfFeatureBaseData baseData)756             public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
757                 return GencodeGtfGeneFeature.create(baseData);
758             }
create(final String[] gtfFields, final String gtfFileType)759             public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
760                 return GencodeGtfGeneFeature.create(gtfFields, gtfFileType);
761             }
762         },
763         TRANSCRIPT("transcript"){
create(final GencodeGtfFeatureBaseData baseData)764             public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
765                 return GencodeGtfTranscriptFeature.create(baseData);
766             }
create(final String[] gtfFields, final String gtfFileType)767             public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
768                 return GencodeGtfTranscriptFeature.create(gtfFields, gtfFileType);
769             }
770         },
771         SELENOCYSTEINE("Selenocysteine"){
create(final GencodeGtfFeatureBaseData baseData)772             public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
773                 return GencodeGtfSelenocysteineFeature.create(baseData);
774             }
create(final String[] gtfFields, final String gtfFileType)775             public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
776                 return GencodeGtfSelenocysteineFeature.create(gtfFields, gtfFileType);
777             }
778         },
779         EXON("exon"){
create(final GencodeGtfFeatureBaseData baseData)780             public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
781                 return GencodeGtfExonFeature.create(baseData);
782             }
create(final String[] gtfFields, final String gtfFileType)783             public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
784                 return GencodeGtfExonFeature.create(gtfFields, gtfFileType);
785             }
786         },
787         CDS("CDS"){
create(final GencodeGtfFeatureBaseData baseData)788             public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
789                 return GencodeGtfCDSFeature.create(baseData);
790             }
create(final String[] gtfFields, final String gtfFileType)791             public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
792                 return GencodeGtfCDSFeature.create(gtfFields, gtfFileType);
793             }
794         },
795         START_CODON("start_codon"){
create(final GencodeGtfFeatureBaseData baseData)796             public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
797                 return GencodeGtfStartCodonFeature.create(baseData);
798             }
create(final String[] gtfFields, final String gtfFileType)799             public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
800                 return GencodeGtfStartCodonFeature.create(gtfFields, gtfFileType);
801             }
802         },
803         STOP_CODON("stop_codon"){
create(final GencodeGtfFeatureBaseData baseData)804             public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
805                 return GencodeGtfStopCodonFeature.create(baseData);
806             }
create(final String[] gtfFields, final String gtfFileType)807             public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
808                 return GencodeGtfStopCodonFeature.create(gtfFields, gtfFileType);
809             }
810         },
811         UTR("UTR"){
create(final GencodeGtfFeatureBaseData baseData)812             public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) {
813                 return GencodeGtfUTRFeature.create(baseData);
814             }
create(final String[] gtfFields, final String gtfFileType)815             public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) {
816                 return GencodeGtfUTRFeature.create(gtfFields, gtfFileType);
817             }
818         };
819 
820         @SuppressWarnings("unchecked")
821         private static final Map<String, FeatureType> VALUE_MAP =
822                 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v));
823 
824         private final String serialized;
825 
FeatureType(final String serializedValue)826         FeatureType(final String serializedValue) { serialized = serializedValue; }
827 
828         @Override
toString()829         public String toString() { return serialized; }
830 
getEnum(final String s)831         public static FeatureType getEnum(final String s) {
832             final String lowerS = s.toLowerCase();
833             if ( VALUE_MAP.containsKey(lowerS) ){
834                 return VALUE_MAP.get(lowerS);
835             }
836             throw new IllegalArgumentException("Unexpected value: " + s);
837         }
838 
839         /**
840          * Create a {@link GencodeGtfFeature} of this type given {@code baseData}
841          * @param baseData The data to use to create a {@link GencodeGtfFeature}
842          * @return The {@link GencodeGtfFeature} represented by the given {@code baseData}
843          */
create(final GencodeGtfFeatureBaseData baseData)844         abstract public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData);
845 
846         /**
847          * Create a {@link GencodeGtfFeature} of this type given {@code gtfFields}
848          * @param gtfFields The data to use to create a {@link GencodeGtfFeature}
849          * @param gtfFileType A {@link String} containing the file type of the GTF data that created this {@link GencodeGtfFeature}.
850          * @return The {@link GencodeGtfFeature} represented by the given {@code gtfFields}
851          */
create(final String[] gtfFields, final String gtfFileType)852         abstract public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType);
853     }
854 
855     /**
     * Whether the first base of the CDS segment is the first (frame 0), second (frame 1) or third (frame 2)
     * in the codon of the ORF.
858      *
859      * For more information, see:
860      *     https://www.gencodegenes.org/data_format.html
861      *     https://en.wikipedia.org/wiki/General_feature_format
862      */
863     public enum GenomicPhase {
864         ZERO("0"),
865         ONE ("1"),
866         TWO ("2"),
867         DOT (".");
868 
869         @SuppressWarnings("unchecked")
870         private static final Map<String, GenomicPhase> VALUE_MAP =
871                 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v));
872 
873         private final String serialized;
874 
GenomicPhase(final String serializedValue)875         GenomicPhase(final String serializedValue) {
876             serialized = serializedValue;
877         }
878 
879         @Override
toString()880         public String toString() {
881             return serialized;
882         }
883 
getEnum(final String s)884         public static GenomicPhase getEnum(final String s) {
885             final String lowerS = s.toLowerCase();
886             if ( VALUE_MAP.containsKey(lowerS) ){
887                 return VALUE_MAP.get(lowerS);
888             }
889             throw new IllegalArgumentException("Unexpected value: " + s);
890         }
891     }
892 
893     /**
894      * Biotype / transcript type for the transcript or gene represented in a feature.
895      * This is a tag of some biological function associated with a feature.
896      *
897      * For more information, see:
898      *     https://www.gencodegenes.org/data_format.html
899      *     https://en.wikipedia.org/wiki/General_feature_format
900      */
901     public enum GeneTranscriptType {
902         // Immunoglobulin (Ig) variable chain and T-cell receptor (TcR) genes imported or annotated according to the IMGT (http://www.imgt.org/)
903         IG_C_GENE("IG_C_gene"),
904         IG_D_GENE("IG_D_gene"),
905         IG_J_GENE("IG_J_gene"),
906         IG_LV_GENE("IG_LV_gene"),
907         IG_V_GENE("IG_V_gene"),
908         TR_C_GENE("TR_C_gene"),
909         TR_J_GENE("TR_J_gene"),
910         TR_V_GENE("TR_V_gene"),
911         TR_D_GENE("TR_D_gene"),
912 
913         // Inactivated immunoglobulin gene.
914         IG_PSEUDOGENE("IG_pseudogene"),
915         IG_C_PSEUDOGENE("IG_C_pseudogene"),
916         IG_J_PSEUDOGENE("IG_J_pseudogene"),
917         IG_V_PSEUDOGENE("IG_V_pseudogene"),
918         TR_V_PSEUDOGENE("TR_V_pseudogene"),
919         TR_J_PSEUDOGENE("TR_J_pseudogene"),
920 
921         // Non-coding RNA predicted using sequences from Rfam (http://rfam.xfam.org/) and miRBase (http://www.mirbase.org/)
922         MT_RRNA("Mt_rRNA"),
923         MT_TRNA("Mt_tRNA"),
924         MIRNA("miRNA"),
925         MISC_RNA("misc_RNA"),
926         RRNA("rRNA"),
927 
928         SCRNA("scRNA"),
929         SNRNA("snRNA"),
930         SNORNA("snoRNA"),
931         RIBOZYME("ribozyme"),
932         SRNA("sRNA"),
933         SCARNA("scaRNA"),
934 
935         // ENSEMBL-Specific values:
936         TRNA("tRNA"),
937         TMRNA("tmRNA"),
938 
939         // Non-coding RNA predicted to be pseudogene by the Ensembl pipeline
940         MT_TRNA_PSEUDOGENE("Mt_tRNA_pseudogene"),
941         TRNA_PSEUDOGENE("tRNA_pseudogene"),
942         SNORNA_PSEUDOGENE("snoRNA_pseudogene"),
943         SNRNA_PSEUDOGENE("snRNA_pseudogene"),
944         SCRNA_PSEUDOGENE("scRNA_pseudogene"),
945         RRNA_PSEUDOGENE("rRNA_pseudogene"),
946         MISC_RNA_PSEUDOGENE("misc_RNA_pseudogene"),
947         MIRNA_PSEUDOGENE("miRNA_pseudogene"),
948 
949         // To be Experimentally Confirmed. This is used for non-spliced EST clusters that have polyA features. This category has been specifically created for the ENCODE project to highlight regions that could indicate the presence of protein coding genes that require experimental validation, either by 5' RACE or RT-PCR to extend the transcripts, or by confirming expression of the putatively-encoded peptide with specific antibodies.
950         TEC("TEC"),
951 
952         // If the coding sequence (following the appropriate reference) of a transcript finishes >50bp from a downstream splice site then it is tagged as NMD. If the variant does not cover the full reference coding sequence then it is annotated as NMD if NMD is unavoidable i.e. no matter what the exon structure of the missing portion is the transcript will be subject to NMD.
953         NONSENSE_MEDIATED_DECAY("nonsense_mediated_decay"),
954 
955         // Transcript that has polyA features (including signal) without a prior stop codon in the CDS, i.e. a non-genomic polyA tail attached directly to the CDS without 3' UTR. These transcripts are subject to degradation.
956         NON_STOP_DECAY("non_stop_decay"),
957 
958         // Alternatively spliced transcript believed to contain intronic sequence relative to other, coding, variants.
959         RETAINED_INTRON("retained_intron"),
960 
961         // Contains an open reading frame (ORF).
962         PROTEIN_CODING("protein_coding"),
963 
964         // Doesn't contain an ORF.
965         PROCESSED_TRANSCRIPT("processed_transcript"),
966 
967         // Transcript which is known from the literature to not be protein coding.
968         NON_CODING("non_coding"),
969 
970         // Transcript believed to be protein coding, but with more than one possible open reading frame.
971         AMBIGUOUS_ORF("ambiguous_orf"),
972 
973         // Long non-coding transcript in introns of a coding gene that does not overlap any exons.
974         SENSE_INTRONIC("sense_intronic"),
975 
976         // Long non-coding transcript that contains a coding gene in its intron on the same strand.
977         SENSE_OVERLAPPING("sense_overlapping"),
978 
979         // Has transcripts that overlap the genomic span (i.e. exon or introns) of a protein-coding locus on the opposite strand.
980         ANTISENSE("antisense"),
981         ANTISENSE_RNA("antisense_RNA"),
982 
983         KNOWN_NCRNA("known_ncrna"),
984 
985         // Have homology to proteins but generally suffer from a disrupted coding sequence and an active homologous gene can be found at another locus. Sometimes these entries have an intact coding sequence or an open but truncated ORF, in which case there is other evidence used (for example genomic polyA stretches at the 3' end) to classify them as a pseudogene. Can be further classified as one of the following.
986         PSEUDOGENE("pseudogene"),
987 
988         // Pseudogene that lack introns and is thought to arise from reverse transcription of mRNA followed by reinsertion of DNA into the genome.
989         PROCESSED_PSEUDOGENE("processed_pseudogene"),
990 
991         // Pseudogene owing to a SNP/DIP but in other individuals/haplotypes/strains the gene is translated.
992         POLYMORPHIC_PSEUDOGENE("polymorphic_pseudogene"),
993 
994         // Pseudogene owing to a reverse transcribed and re-inserted sequence.
995         RETROTRANSPOSED("retrotransposed"),
996 
997         // Pseudogene where protein homology or genomic structure indicates a pseudogene, but the presence of locus-specific transcripts indicates expression.
998         TRANSCRIBED_PROCESSED_PSEUDOGENE("transcribed_processed_pseudogene"),
999         TRANSCRIBED_UNPROCESSED_PSEUDOGENE("transcribed_unprocessed_pseudogene"),
1000         TRANSCRIBED_UNITARY_PSEUDOGENE("transcribed_unitary_pseudogene"),
1001 
1002         // Pseudogene that has mass spec data suggesting that it is also translated.
1003         TRANSLATED_PROCESSED_PSEUDOGENE("translated_processed_pseudogene"),
1004         TRANSLATED_UNPROCESSED_PSEUDOGENE("translated_unprocessed_pseudogene"),
1005 
1006         // A species specific unprocessed pseudogene without a parent gene, as it has an active orthologue in another species.
1007         UNITARY_PSEUDOGENE("unitary_pseudogene"),
1008 
1009         // Pseudogene that can contain introns since produced by gene duplication.
1010         UNPROCESSED_PSEUDOGENE("unprocessed_pseudogene"),
1011 
1012         // Used to tag mistakes in the public databases (Ensembl/SwissProt/Trembl)
1013         ARTIFACT("artifact"),
1014 
1015         // Long, intervening noncoding (linc) RNA that can be found in evolutionarily conserved, intergenic regions.
1016         LINCRNA("lincRNA"),
1017         LNCRNA("lncRNA"),
1018 
1019         // Unspliced lncRNA that is several kb in size.
1020         MACRO_LNCRNA("macro_lncRNA"),
1021 
1022         // Transcript where ditag and/or published experimental data strongly supports the existence of short non-coding transcripts transcribed from the 3'UTR.
1023         THREE_PRIME_OVERLAPPING_NCRNA("3prime_overlapping_ncRNA"),
1024 
1025         // Otherwise viable coding region omitted from this alternatively spliced transcript because the splice variation affects a region coding for a protein domain.
1026         DISRUPTED_DOMAIN("disrupted_domain"),
1027 
1028         // Short non coding RNA gene that forms part of the vault ribonucleoprotein complex.
1029         VAULTRNA("vaultRNA"),
1030 
1031         // A non-coding locus that originates from within the promoter region of a protein-coding gene, with transcription proceeding in the opposite direction on the other strand.
1032         BIDIRECTIONAL_PROMOTER_LNCRNA("bidirectional_promoter_lncRNA");
1033 
1034         @SuppressWarnings("unchecked")
1035         private static final Map<String, GeneTranscriptType> VALUE_MAP =
1036                 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v));
1037 
1038         private final String serialized;
1039 
GeneTranscriptType(final String serializedValue)1040         GeneTranscriptType(final String serializedValue) {
1041             serialized = serializedValue;
1042         }
1043 
1044         @Override
toString()1045         public String toString() {
1046             return serialized;
1047         }
1048 
1049         private static final Map<String, String> SPECIAL_CASE_STRING_VALUE_MAP = createSpecialCaseMap();
1050 
getEnum(final String s)1051         public static GeneTranscriptType getEnum(final String s) {
1052             String lowerS = s.toLowerCase();
1053 
1054             // Handle special cases:
1055             lowerS = SPECIAL_CASE_STRING_VALUE_MAP.getOrDefault(lowerS, lowerS);
1056 
1057             if ( VALUE_MAP.containsKey(lowerS) ){
1058                 return VALUE_MAP.get(lowerS);
1059             }
1060             throw new IllegalArgumentException("Unexpected value: " + s);
1061         }
1062 
1063         /**
1064          * Create a special case map for alternate field names for known {@link GeneTranscriptType}s.
1065          */
createSpecialCaseMap()1066         private static Map<String, String> createSpecialCaseMap() {
1067             final Map<String, String> map = new HashMap<>();
1068 
1069             // From ENSEMBLE GTF files:
1070             map.put("ncrna", "non_coding");
1071 
1072             return map;
1073         }
1074 
1075     }
1076 
1077     /**
     * Indication of whether a feature is new, tentative, or already known.
1079      *
1080      * This attribute was removed after release 25.
1081      *
1082      * For more information, see:
1083      *     https://www.gencodegenes.org/data_format.html
1084      *     https://en.wikipedia.org/wiki/General_feature_format
1085      */
    public enum GeneTranscriptStatus {
        /** The feature is already known. */
        KNOWN,
        /** The feature is new. */
        NOVEL,
        /** The feature is tentative. */
        PUTATIVE
    }
1091 
1092     /**
1093      * Status of how a position was annotated / verified:
1094      *
1095      *      1 - verified locus
1096      *      2 - manually annotated locus
1097      *      3 - automatically annotated locus
1098      *
1099      * For more information, see:
1100      *     https://www.gencodegenes.org/data_format.html
1101      *     https://en.wikipedia.org/wiki/General_feature_format
1102      */
1103     public enum LocusLevel {
1104         /** Verified locus */
1105         VERIFIED("1"),
1106 
1107         /** Manually annotated locus */
1108         MANUALLY_ANNOTATED("2"),
1109 
1110         /** Automatically annotated locus */
1111         AUTOMATICALLY_ANNOTATED("3");
1112 
1113         @SuppressWarnings("unchecked")
1114         private static final Map<String, LocusLevel> VALUE_MAP =
1115                 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v));
1116 
1117         private final String serialized;
1118 
LocusLevel(final String serializedValue)1119         LocusLevel(final String serializedValue) {
1120             serialized = serializedValue;
1121         }
1122 
1123         @Override
toString()1124         public String toString() {
1125             return serialized;
1126         }
1127 
getEnum(final String s)1128         public static LocusLevel getEnum(final String s) {
1129             final String lowerS = s.toLowerCase();
1130             if ( VALUE_MAP.containsKey(lowerS) ){
1131                 return VALUE_MAP.get(lowerS);
1132             }
1133             throw new IllegalArgumentException("Unexpected value: " + s);
1134         }
1135     }
1136 
1137     /**
1138      * Additional relevant information appended to a feature.
1139      *
1140      * For more information, see:
1141      *     https://www.gencodegenes.org/data_format.html
1142      *     https://en.wikipedia.org/wiki/General_feature_format
1143      *     https://www.gencodegenes.org/pages/tags.html
1144      */
1145     public enum FeatureTag {
1146         /** 3' end extended based on RNA-seq data. */
1147         THREE_PRIME_NESTED_SUPPORTED_EXTENSION("3_nested_supported_extension"),
1148 
1149         /** 3' end extended based on RNA-seq data. */
1150         THREE_PRIME_STANDARD_SUPPORTED_EXTENSION("3_standard_supported_extension"),
1151 
1152         /** annotated based on RNA-seq data. */
1153         FOURFIVEFOUR_RNA_SEQ_SUPPORTED("454_RNA_Seq_supported"),
1154 
1155         /** 5' end extended based on RNA-seq data. */
1156         FIVE_PRIME_NESTED_SUPPORTED_EXTENSION("5_nested_supported_extension"),
1157 
1158         /** 5' end extended based on RNA-seq data. */
1159         FIVE_PRIME_STANDARD_SUPPORTED_EXTENSION("5_standard_supported_extension"),
1160 
1161         /** shares an identical CDS but has alternative 5' UTR with respect to a reference variant. */
1162         ALTERNATIVE_3_UTR("alternative_3_UTR"),
1163 
1164         /** shares an identical CDS but has alternative 3' UTR with respect to a reference variant. */
1165         ALTERNATIVE_5_UTR("alternative_5_UTR"),
1166 
1167         // --------------------------------------------------------------------------------------------------------
1168         // Please note that the ordering of the APPRIS_* tags is also used in sorting here.  Do not re-order!
1169         // --------------------------------------------------------------------------------------------------------
1170         /** Transcript expected to code for the main functional isoform based on a range of protein features (APPRIS pipeline). */
1171         APPRIS_PRINCIPAL("appris_principal"),
1172 
1173         /** (This flag corresponds to the older flag "appris_principal") Where the transcript expected to code for the main */
1174         APPRIS_PRINCIPAL_1("appris_principal_1"),
1175 
1176         /** (This flag corresponds to the older flag "appris_candidate_ccds") Where the APPRIS core modules are unable to choose a */
1177         APPRIS_PRINCIPAL_2("appris_principal_2"),
1178 
1179         /** Where the APPRIS core modules are unable to choose a clear principal variant and there more than one of the variants */
1180         APPRIS_PRINCIPAL_3("appris_principal_3"),
1181 
1182         /** (This flag corresponds to the Ensembl 78 flag "appris_candidate_longest_ccds") Where the APPRIS core modules are unable */
1183         APPRIS_PRINCIPAL_4("appris_principal_4"),
1184 
1185         /** (This flag corresponds to the Ensembl 78 flag "appris_candidate_longest_seq") Where the APPRIS core modules are unable */
1186         APPRIS_PRINCIPAL_5("appris_principal_5"),
1187 
1188         /** Candidate transcript(s) models that are conserved in at least three tested non-primate species. */
1189         APPRIS_ALTERNATIVE_1("appris_alternative_1"),
1190 
1191         /** Candidate transcript(s) models that appear to be conserved in fewer than three tested non-primate species. */
1192         APPRIS_ALTERNATIVE_2("appris_alternative_2"),
1193 
1194         /** where there is no 'appris_principal' variant, the candidate with highest APPRIS score is selected as the primary */
1195         APPRIS_CANDIDATE_HIGHEST_SCORE("appris_candidate_highest_score"),
1196 
1197         /** the "appris_candidate" transcripts where there are several CCDS, in this case APPRIS labels the longest CCDS. */
1198         APPRIS_CANDIDATE_LONGEST_CCDS("appris_candidate_longest_ccds"),
1199 
1200         /** the "appris_candidate" transcript that has an unique CCDS. */
1201         APPRIS_CANDIDATE_CCDS("appris_candidate_ccds"),
1202 
1203         /** where there is no "appris_candidate_ccds" or "appris_candidate_longest_ccds" variant, the longest protein of the */
1204         APPRIS_CANDIDATE_LONGEST_SEQ("appris_candidate_longest_seq"),
1205 
1206         /** where there is no 'appris_principal' variant, the longest of the 'appris_candidate' variants is selected as the primary */
1207         APPRIS_CANDIDATE_LONGEST("appris_candidate_longest"),
1208 
1209         /** where there is no single 'appris_principal' variant the main functional isoform will be translated from one of the */
1210         APPRIS_CANDIDATE("appris_candidate"),
1211 
1212         /** identifies a subset of representative transcripts for each gene; prioritises full-length protein coding transcripts */
1213         BASIC("basic"),
1214 
1215         /** Transcript contains two confidently annotated CDSs. Support may come from eg proteomic data, cross-species conservation */
1216         BICISTRONIC("bicistronic"),
1217 
1218         /** Transcript 5' end overlaps ENCODE or Fantom CAGE cluster. */
1219         CAGE_SUPPORTED_TSS("CAGE_supported_TSS"),
1220 
1221         /** member of the consensus CDS gene set, confirming coding regions between ENSEMBL, UCSC, NCBI and HAVANA. */
1222         CCDS("CCDS"),
1223 
1224         /** The coding region end could not be confirmed. */
1225         CDS_END_NF("cds_end_NF"),
1226 
1227         /** The coding region start could not be confirmed. */
1228         CDS_START_NF("cds_start_NF"),
1229 
1230         /** Transcript QC checked using dotplot to identify features eg splice junctions, end of homology. */
1231         DOTTER_CONFIRMED("dotter_confirmed"),
1232 
1233         /** an upstream ATG is used where a downstream ATG seems more evolutionary conserved. */
1234         DOWNSTREAM_ATG("downstream_ATG"),
1235 
1236         /** Transcript was tested and confirmed experimentally. */
1237         EXP_CONF("exp_conf"),
1238 
1239         /** locus consists of non-overlapping transcript fragments either because of genome assembly issues (i.e., gaps or */
1240         FRAGMENTED_LOCUS("fragmented_locus"),
1241 
1242         /** Transcript model contains all possible in-frame exons supported by homology, experimental evidence or conservation, but */
1243         INFERRED_EXON_COMBINATION("inferred_exon_combination"),
1244 
1245         /** Transcript model is not supported by a single piece of transcript evidence. May be supported by multiple fragments of */
1246         INFERRED_TRANSCRIPT_MODEL("inferred_transcript_model"),
1247 
        /** Transcript supported by transcript evidence that, while mapping best-in-genome, shows regions of poor sequence quality. */
1249         LOW_SEQUENCE_QUALITY("low_sequence_quality"),
1250 
1251         /** the mRNA end could not be confirmed. */
1252         MRNA_END_NF("mRNA_end_NF"),
1253 
1254         /** the mRNA start could not be confirmed. */
1255         MRNA_START_NF("mRNA_start_NF"),
1256 
1257         /** the transcript belongs to the MANE Select data set. The Matched Annotation from NCBI and EMBL-EBI project (MANE) is a collaboration between Ensembl-GENCODE and RefSeq to select a default transcript per human protein coding locus that is representative of biology, well-supported, expressed and conserved. This transcript set matches GRCh38 and is 100% identical between RefSeq and Ensembl-GENCODE for 5' UTR, CDS, splicing and 3' UTR. */
1258         MANE_SELECT("MANE_Select"),
1259 
1260         /** in-frame type of variation where, at the acceptor site, some variants splice after the first AG and others after the */
1261         NAGNAG_SPLICE_SITE("NAGNAG_splice_site"),
1262 
1263         /** the locus is a host for small non-coding RNAs. */
1264         NCRNA_HOST("ncRNA_host"),
1265 
1266         /** annotated based on RNA-seq data. */
1267         NESTED_454_RNA_SEQ_SUPPORTED("nested_454_RNA_Seq_supported"),
1268 
1269         /** the transcript looks like it is subject to NMD but publications, experiments or conservation support the translation of */
1270         NMD_EXCEPTION("NMD_exception"),
1271 
1272         /** codon if the transcript were longer but cannot currently be annotated as NMD as does not fulfil all criteria - most */
1273         NMD_LIKELY_IF_EXTENDED("NMD_likely_if_extended"),
1274 
1275         /** the CDS has a non-ATG start and its validity is supported by publication or conservation. */
1276         NON_ATG_START("non_ATG_start"),
1277 
1278         /** the transcript has a non-canonical splice site conserved in other species. */
1279         NON_CANONICAL_CONSERVED("non_canonical_conserved"),
1280 
1281         /** the transcript has a non-canonical splice site explained by a genomic sequencing error. */
1282         NON_CANONICAL_GENOME_SEQUENCE_ERROR("non_canonical_genome_sequence_error"),
1283 
1284         /** the transcript has a non-canonical splice site explained by other reasons. */
1285         NON_CANONICAL_OTHER("non_canonical_other"),
1286 
1287         /** the transcript has a non-canonical splice site explained by a SNP. */
1288         NON_CANONICAL_POLYMORPHISM("non_canonical_polymorphism"),
1289 
1290         /** the transcript has a non-canonical splice site that needs experimental confirmation. */
1291         NON_CANONICAL_TEC("non_canonical_TEC"),
1292 
1293         /** the transcript has a non-canonical splice site explained by a U12 intron (i.e. AT-AC splice site). */
1294         NON_CANONICAL_U12("non_canonical_U12"),
1295 
1296         /** a splice variant for which supporting evidence has not been submitted to databases, i.e. the model is based on */
1297         NON_SUBMITTED_EVIDENCE("non_submitted_evidence"),
1298 
1299         /** a transcript is supported by evidence from same species paralogous loci. */
1300         NOT_BEST_IN_GENOME_EVIDENCE("not_best_in_genome_evidence"),
1301 
1302         /** evidence from other species was used to build model. */
1303         NOT_ORGANISM_SUPPORTED("not_organism_supported"),
1304 
1305         /** protein-coding locus with no paralogues or orthologs. */
1306         ORPHAN("orphan"),
1307 
1308         /** exon(s) of the locus overlap exon(s) of a readthrough transcript or a transcript belonging to another locus. */
1309         OVERLAPPING_LOCUS("overlapping_locus"),
1310 
        /** a low confidence upstream ATG existing in other coding variant would lead to NMD in this transcript, that uses the high */
1312         OVERLAPPING_UORF("overlapping_uORF"),
1313 
1314         /** annotation in the pseudo-autosomal region, which is duplicated between chromosomes X and Y. */
1315         PAR("PAR"),
1316 
1317         /** member of the pseudogene set predicted by YALE, UCSC and HAVANA. */
1318         PSEUDO_CONSENS("pseudo_consens"),
1319 
1320         /** a transcript that overlaps two or more independent loci but is considered to belong to a third, separate locus. */
1321         READTHROUGH_TRANSCRIPT("readthrough_transcript"),
1322 
1323         /** locus overlaps a sequence error or an assembly error in the reference genome that affects its annotation (e.g., 1 or */
1324         REFERENCE_GENOME_ERROR("reference_genome_error"),
1325 
1326         /** internal intron of CDS portion of transcript is retained. */
1327         RETAINED_INTRON_CDS("retained_intron_CDS"),
1328 
1329         /** final intron of CDS portion of transcript is retained. */
1330         RETAINED_INTRON_FINAL("retained_intron_final"),
1331 
1332         /** first intron of CDS portion of transcript is retained. */
1333         RETAINED_INTRON_FIRST("retained_intron_first"),
1334 
1335         /** protein-coding locus created via retrotransposition. */
1336         RETROGENE("retrogene"),
1337 
1338         /** Transcript supported by RNAseq data and not supported by mRNA or EST evidence. */
1339         RNA_SEQ_SUPPORTED_ONLY("RNA_Seq_supported_only"),
1340 
1341         /** Transcript annotated based on mixture of RNA-seq data and EST/mRNA/protein evidence. */
1342         RNA_SEQ_SUPPORTED_PARTIAL("RNA_Seq_supported_partial"),
1343 
1344         /** Transcript that contains a CDS that has a translation initiation site supported by Ribosomal Profiling data. */
1345         RP_SUPPORTED_TIS("RP_supported_TIS"),
1346 
1347         /** contains a selenocysteine. */
1348         SELENO("seleno"),
1349 
        /**
         * a processed pseudogene with one or more introns still present. These are likely formed through the retrotransposition
         * of a retained intron transcript.
         */
1351         SEMI_PROCESSED("semi_processed"),
1352 
        /**
         * Transcript contains at least 1 non-canonical splice junction that is associated with a known or novel genome sequence
         * error.
         */
1354         SEQUENCE_ERROR("sequence_error"),
1355 
1356         /** Transcript whose coding sequence contains an internal stop codon that does not cause the translation termination. */
1357         STOP_CODON_READTHROUGH("stop_codon_readthrough"),
1358 
1359         /** Transcript created or extended using assembled RNA-seq long reads. */
1360         TAGENE("TAGENE"),
1361 
1362         /** an upstream ATG exists when a downstream ATG is better supported. */
1363         UPSTREAM_ATG("upstream_ATG"),
1364 
        /**
         * a low confidence upstream ATG existing in other coding variant would lead to NMD in this transcript, that uses the
         * high confidence canonical downstream ATG.
         */
1366         UPSTREAM_UORF("upstream_uORF");
1367 
1368         @SuppressWarnings("unchecked")
1369         private static final Map<String, FeatureTag> VALUE_MAP =
1370                 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v));
1371 
1372         private final String serialized;
1373 
FeatureTag(final String serializedValue)1374         FeatureTag(final String serializedValue) {
1375             serialized = serializedValue;
1376         }
1377 
1378         @Override
toString()1379         public String toString() {
1380             return serialized;
1381         }
1382 
getEnum(final String s)1383         public static FeatureTag getEnum(final String s) {
1384             final String lowerS = s.toLowerCase();
1385             if ( VALUE_MAP.containsKey(lowerS) ){
1386                 return VALUE_MAP.get(lowerS);
1387             }
1388             throw new IllegalArgumentException("Unexpected value: " + s);
1389         }
1390     }
1391 
1392     /**
1393      * Transcript score according to how well mRNA and EST alignments match over its full length.
1394      *
1395      * For more information, see:
1396      *     https://www.gencodegenes.org/data_format.html
1397      *     https://en.wikipedia.org/wiki/General_feature_format
1398      */
1399     public enum TranscriptSupportLevel {
1400         /** all splice junctions of the transcript are supported by at least one non-suspect mRNA */
1401         ALL_MRNA_VERIFIED("1"),
1402 
1403         /** the best supporting mRNA is flagged as suspect or the support is from multiple ESTs */
1404         BEST_MRNA_SUSPECT("2"),
1405 
1406         /** the only support is from a single EST */
1407         SINGLE_EST_SUPPORT("3"),
1408 
1409         /** the best supporting EST is flagged as suspect */
1410         BEST_EST_SUSPECT("4"),
1411 
1412         /** no single transcript supports the model structure */
1413         NO_SINGLE_TRANSCRIPT_SUPPORT("5"),
1414 
1415         /** the transcript was not analyzed */
1416         NA("NA");
1417 
1418         @SuppressWarnings("unchecked")
1419         private static final Map<String, TranscriptSupportLevel> VALUE_MAP =
1420                 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v));
1421 
1422         private final String serialized;
1423 
TranscriptSupportLevel(final String serializedValue)1424         TranscriptSupportLevel(final String serializedValue) {
1425             serialized = serializedValue;
1426         }
1427 
1428         @Override
toString()1429         public String toString() {
1430             return serialized;
1431         }
1432 
getEnum(final String s)1433         public static TranscriptSupportLevel getEnum(final String s) {
1434             final String lowerS = s.toLowerCase();
1435             if ( VALUE_MAP.containsKey(lowerS) ){
1436                 return VALUE_MAP.get(lowerS);
1437             }
1438             throw new IllegalArgumentException("Unexpected value: " + s);
1439         }
1440     }
1441 
1442     /**
1443      * Attribute that indicates the status of the mapping.
1444      *
1445      * For more information, see:
1446      *     https://www.gencodegenes.org/data_format.html
1447      *     https://en.wikipedia.org/wiki/General_feature_format
1448      *     http://www.gencodegenes.org/releases/grch37_mapped_releases.html#attrib
1449      */
1450     public enum RemapStatus {
1451         /**
1452          * Gene or transcript completely mapped to the target genome with all features intact.
1453          */
1454         FULL_CONTIG("full_contig"),
1455 
1456         /**
1457          * Gene or transcript completely mapped to the target genome with insertions in some features. These are usually small insertions.
1458          */
1459         FULL_FRAGMENT("full_fragment"),
1460 
1461         /**
1462          * Gene or transcript partially mapped to the target genome.
1463          */
1464         PARTIAL("partial"),
1465 
1466         /**
1467          * Gene or transcript did not map to the target genome.
1468          */
1469         DELETED("deleted"),
1470 
1471         /**
1472          * The source sequence is not in the assembly alignments. This will occur with alt loci genes if the alignments only contain the primary assembly.
1473          */
1474         NO_SEQ_MAP("no_seq_map"),
1475 
1476         /**
1477          * Transcripts in the gene mapped to multiple locations.
1478          */
1479         GENE_CONFLICT("gene_conflict"),
1480 
1481         /**
1482          * Transcripts caused gene length to change by more than 50%. This is to detect mapping to processed pseudogenes and mapping across tandem gene duplications.
1483          */
1484         GENE_SIZE_CHANGE("gene_size_change"),
1485 
1486         /**
1487          * Gene is from a small, automatic (ENSEMBL source) non-coding RNA. Taken from the target annotation.
1488          */
1489         AUTOMATIC_SMALL_NCRNA_GENE("automatic_small_ncrna_gene"),
1490 
1491         /**
1492          * Gene is from an automatic process (ENSEMBL source). Taken from the target annotation.
1493          */
1494         AUTOMATIC_GENE("automatic_gene"),
1495 
1496         /**
1497          * Pseudogene annotations (excluding polymorphic).
1498          */
1499         PSEUDOGENE("pseudogene");
1500 
1501         @SuppressWarnings("unchecked")
1502         private static final Map<String, RemapStatus> VALUE_MAP =
1503                 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v));
1504 
1505         private final String serialized;
1506 
RemapStatus(final String serializedValue)1507         RemapStatus(final String serializedValue) { serialized = serializedValue; }
1508 
1509         @Override
toString()1510         public String toString() {
1511             return serialized;
1512         }
1513 
getEnum(final String s)1514         public static RemapStatus getEnum(final String s) {
1515             final String lowerS = s.toLowerCase();
1516             if ( VALUE_MAP.containsKey(lowerS) ){
1517                 return VALUE_MAP.get(lowerS);
1518             }
1519             throw new IllegalArgumentException("Unexpected value: " + s);
1520         }
1521     }
1522 
1523     /**
1524      * Attribute that compares the mapping to the existing target annotations.
1525      *
1526      * For more information, see:
1527      *     https://www.gencodegenes.org/data_format.html
1528      *     https://en.wikipedia.org/wiki/General_feature_format
1529      *     http://www.gencodegenes.org/releases/grch37_mapped_releases.html#attrib
1530      */
1531     public enum RemapTargetStatus {
1532 
1533         /**
1534          * Gene or transcript was not in target annotations.
1535          */
1536         NEW("new"),
1537 
1538         /**
1539          * Gene or transcript exists in source and target genome, however source was not mapped.
1540          */
1541         LOST("lost"),
1542 
1543         /**
1544          * Gene or transcript overlaps previous version of annotation on target genome.
1545          */
1546         OVERLAP("overlap"),
1547 
1548         /**
1549          * Gene or transcript exists in target, however source mapping is to a different location. This is often mappings to a gene family members or pseudogenes.
1550          */
1551         NONOVERLAP("nonOverlap");
1552 
1553         @SuppressWarnings("unchecked")
1554         private static final Map<String, RemapTargetStatus> VALUE_MAP =
1555                 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v));
1556 
1557         private final String serialized;
1558 
RemapTargetStatus(final String serializedValue)1559         RemapTargetStatus(final String serializedValue) {
1560             serialized = serializedValue;
1561         }
1562 
1563         @Override
toString()1564         public String toString() {
1565             return serialized;
1566         }
1567 
getEnum(final String s)1568         public static RemapTargetStatus getEnum(final String s) {
1569             final String lowerS = s.toLowerCase();
1570             if ( VALUE_MAP.containsKey(lowerS) ){
1571                 return VALUE_MAP.get(lowerS);
1572             }
1573             throw new IllegalArgumentException("Unexpected value: " + s);
1574         }
1575     }
1576 }
1577