1 package org.broadinstitute.hellbender.utils.codecs.gtf; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import htsjdk.samtools.util.Locatable; 5 import htsjdk.tribble.Feature; 6 import htsjdk.tribble.annotation.Strand; 7 import org.apache.commons.lang3.StringUtils; 8 import org.apache.logging.log4j.LogManager; 9 import org.apache.logging.log4j.Logger; 10 import org.broadinstitute.hellbender.exceptions.UserException; 11 import org.broadinstitute.hellbender.utils.SimpleInterval; 12 import org.broadinstitute.hellbender.utils.Utils; 13 14 import java.util.*; 15 import java.util.regex.Pattern; 16 import java.util.stream.Collectors; 17 18 /** 19 * A {@link GencodeGtfFeature} represents data in a GENCODE GTF file. 20 * 21 * Features are grouped logically by related data. 22 * While the abstract class {@link GencodeGtfFeature} represents a single line 23 * of a GENCODE GTF File, the concrete instantiations represent at least one line, 24 * and often more than one. 25 * 26 * For example, a {@link GencodeGtfGeneFeature} represents all lines in the given 27 * data file with information on a particular gene. This includes all transcripts, 28 * exons, coding regions, etc. in that gene. 29 * 30 * Similarly, a {@link GencodeGtfTranscriptFeature} represents all lines in the given 31 * data file with information on a particular transcript. 32 * 33 * However, a {@link GencodeGtfSelenocysteineFeature} represents a particular line 34 * in the given data file that contains information on a specific selenocysteine. 35 * 36 * The specification of a GTF file is defined here: 37 * http://mblab.wustl.edu/GTF22.html 38 * 39 * Currently only supports GENCODE versions 19-26. 40 * 41 * Created by jonn on 7/21/17. 
42 */ 43 public abstract class GencodeGtfFeature implements Feature, Comparable<GencodeGtfFeature> { 44 45 private static final Logger logger = LogManager.getLogger(GencodeGtfFeature.class); 46 47 // =========================================================================== 48 49 50 public static final String ANNOTATION_SOURCE_ENSEMBL = "ENSEMBL"; 51 public static final String ANNOTATION_SOURCE_HAVANA = "HAVANA"; 52 public static final String ANNOTATION_SOURCE_ENA = "ena"; 53 54 // =========================================================================== 55 56 // Metadata fields: 57 58 private static final String FIELD_DELIMITER = "\t"; 59 60 public static final int NO_FEATURE_ORDER = -1; 61 public static final int NO_EXON_NUMBER = -1; 62 63 private static final int NUM_FIELDS = 9; 64 65 private static final int CHROMOSOME_NAME_INDEX = 0; 66 private static final int ANNOTATION_SOURCE_INDEX = 1; 67 private static final int FEATURE_TYPE_INDEX = 2; 68 private static final int START_LOCATION_INDEX = 3; 69 private static final int END_LOCATION_INDEX = 4; 70 private static final int GENOMIC_STRAND_INDEX = 6; 71 private static final int GENOMIC_PHASE_INDEX = 7; 72 private static final int EXTRA_FIELDS_INDEX = 8; 73 74 private static final String EXTRA_FIELD_DELIMITER = ";"; 75 76 private static final int EXTRA_FIELD_KEY_INDEX = 0; 77 private static final int EXTRA_FIELD_VALUE_INDEX = 1; 78 public static final String EXTRA_FIELD_KEY_VALUE_SPLITTER = " "; 79 80 private static final Pattern NUMBER_PATTERN = Pattern.compile("\\d\\d*"); 81 82 private String ucscGenomeVersion = null; 83 @VisibleForTesting 84 final GencodeGtfFeatureBaseData baseData; 85 86 // ================================================================================================ 87 88 /** 89 * Populate this GencodeGtfFeature with the given data. 90 * @param gtfFields {@link String[]} containing an ordered list of fields to use to populate this {@link GencodeGtfFeature}. 
91 * @param gtfFileType A {@link String} containing the file type of the GTF data that created this {@link GencodeGtfFeature}. 92 */ GencodeGtfFeature(final String[] gtfFields, final String gtfFileType)93 protected GencodeGtfFeature(final String[] gtfFields, final String gtfFileType) { 94 95 Utils.validateArg(gtfFields.length == NUM_FIELDS, "Unexpected number of fields: " + gtfFields.length + " != " + NUM_FIELDS); 96 97 baseData = new GencodeGtfFeatureBaseData(); 98 99 try { 100 baseData.genomicPosition = new SimpleInterval( 101 gtfFields[CHROMOSOME_NAME_INDEX], 102 Integer.valueOf(gtfFields[START_LOCATION_INDEX]), 103 Integer.valueOf(gtfFields[END_LOCATION_INDEX]) 104 ); 105 } 106 catch (final NumberFormatException ex) { 107 throw new UserException.MalformedFile("Cannot read integer value for start/end position!"); 108 } 109 110 baseData.gtfSourceFileType = gtfFileType; 111 112 baseData.annotationSource = gtfFields[ANNOTATION_SOURCE_INDEX]; 113 baseData.featureType = GencodeGtfFeature.FeatureType.getEnum( gtfFields[FEATURE_TYPE_INDEX].toLowerCase() ); 114 baseData.genomicStrand = convertStringToStrand( gtfFields[GENOMIC_STRAND_INDEX] ); 115 baseData.genomicPhase = GenomicPhase.getEnum( gtfFields[GENOMIC_PHASE_INDEX] ); 116 117 // Get the extra fields from the last column: 118 final String[] extraFields = gtfFields[EXTRA_FIELDS_INDEX].split(EXTRA_FIELD_DELIMITER, -1); 119 120 final StringBuilder anonymousOptionalFieldBuilder = new StringBuilder(); 121 122 // Now there are "optional" fields to go through (some actually required, some actually optional), 123 // But we need to match up the field names to the fields themselves: 124 for ( final String extraField : extraFields ) { 125 126 final String trimmedExtraField = extraField.trim(); 127 if (trimmedExtraField.isEmpty()) { 128 continue; 129 } 130 131 final int splitPoint = trimmedExtraField.indexOf(EXTRA_FIELD_KEY_VALUE_SPLITTER); 132 if( splitPoint == -1 ) { 133 throw new UserException.MalformedFile("Extraneous 
optional field data - not in a key/value pair: " + extraField); 134 } 135 136 final String fieldName = trimmedExtraField.substring(0, splitPoint).trim(); 137 138 // The value of the field may be between two quotes. 139 // We remove them here. 140 final String rawFieldValue = trimmedExtraField.substring(splitPoint + 1, trimmedExtraField.length()); 141 final String fieldValue = StringUtils.remove(rawFieldValue.trim(), '"'); 142 143 if( fieldValue.contains(EXTRA_FIELD_KEY_VALUE_SPLITTER) ){ 144 throw new UserException("Expected a key/value pair but found several values " + fieldName + "/" + fieldValue); 145 } 146 147 OptionalField<?> optionalField = null; 148 149 switch (fieldName) { 150 // Find the right field to set: 151 case "gene_id": 152 baseData.geneId = fieldValue; 153 break; 154 case "transcript_id": 155 baseData.transcriptId = fieldValue; 156 break; 157 case "gene_type": 158 baseData.geneType = GeneTranscriptType.getEnum(fieldValue); 159 break; 160 // For ENSEMBL GTF files: 161 case "gene_biotype": 162 baseData.geneType = GeneTranscriptType.getEnum(fieldValue); 163 break; 164 case "gene_status": 165 baseData.geneStatus = GeneTranscriptStatus.valueOf(fieldValue); 166 break; 167 case "gene_name": 168 baseData.geneName = fieldValue; 169 break; 170 case "transcript_type": 171 baseData.transcriptType = GeneTranscriptType.getEnum(fieldValue); 172 break; 173 case "transcript_biotype": 174 baseData.transcriptType = GeneTranscriptType.getEnum(fieldValue); 175 break; 176 case "transcript_status": 177 baseData.transcriptStatus = GeneTranscriptStatus.valueOf(fieldValue); 178 break; 179 case "transcript_name": 180 baseData.transcriptName = fieldValue; 181 break; 182 case "exon_number": 183 try { 184 baseData.exonNumber = Integer.valueOf(fieldValue); 185 } 186 catch (final NumberFormatException ex) { 187 throw new UserException.MalformedFile("Could not convert field value into integer: " + fieldValue); 188 } 189 break; 190 case "exon_id": 191 baseData.exonId = fieldValue; 
192 break; 193 case "level": 194 baseData.locusLevel = LocusLevel.getEnum(fieldValue); 195 break; 196 case "tag": 197 optionalField = new OptionalField<>(fieldName, FeatureTag.getEnum(fieldValue)); 198 break; 199 case "ccdsid": 200 optionalField = new OptionalField<>(fieldName, fieldValue); 201 break; 202 case "havana_gene": 203 optionalField = new OptionalField<>(fieldName, fieldValue); 204 break; 205 case "havana_transcript": 206 optionalField = new OptionalField<>(fieldName, fieldValue); 207 break; 208 case "protein_id": 209 optionalField = new OptionalField<>(fieldName, fieldValue); 210 break; 211 case "ont": 212 optionalField = new OptionalField<>(fieldName, fieldValue); 213 break; 214 case "transcript_support_level": 215 optionalField = new OptionalField<>(fieldName, TranscriptSupportLevel.getEnum(fieldValue)); 216 break; 217 case "remap_status": 218 optionalField = new OptionalField<>(fieldName, RemapStatus.getEnum(fieldValue)); 219 break; 220 case "remap_original_id": 221 optionalField = new OptionalField<>(fieldName, fieldValue); 222 break; 223 case "remap_original_location": 224 try { 225 optionalField = new OptionalField<>(fieldName, Long.valueOf(fieldValue)); 226 } 227 catch (final NumberFormatException nfe) { 228 // We must have gotten a field that has a different format. 
229 // For now, just copy it over: 230 optionalField = new OptionalField<>(fieldName, fieldValue); 231 } 232 break; 233 case "remap_num_mappings": 234 optionalField = new OptionalField<>(fieldName, Long.valueOf(fieldValue)); 235 break; 236 case "remap_target_status": 237 optionalField = new OptionalField<>(fieldName, RemapTargetStatus.getEnum(fieldValue)); 238 break; 239 case "remap_substituted_missing_target": 240 optionalField = new OptionalField<>(fieldName, fieldValue); 241 break; 242 default: 243 anonymousOptionalFieldBuilder.append(extraField); 244 anonymousOptionalFieldBuilder.append(EXTRA_FIELD_DELIMITER); 245 break; 246 } 247 248 // If the optional field was good, we add it: 249 if ( optionalField != null ) { 250 baseData.optionalFields.add(optionalField); 251 } 252 } 253 254 // Save our anonymous optional fields: 255 if ( anonymousOptionalFieldBuilder.length() != 0 ) { 256 baseData.anonymousOptionalFields = anonymousOptionalFieldBuilder.toString(); 257 } 258 } 259 260 /** 261 * Converts the given {@link String} into a {@link Strand}. 262 * @param s {@link String} to convert into a {@link Strand}. 263 * @return The {@link Strand} corresponding to {@code s}. 264 */ convertStringToStrand( final String s )265 private static Strand convertStringToStrand( final String s ) { 266 if ( s.equals("+") ) { 267 return Strand.POSITIVE; 268 } 269 else if ( s.equals("-") ) { 270 return Strand.NEGATIVE; 271 } 272 else { 273 throw new IllegalArgumentException("Unexpected value: " + s); 274 } 275 } 276 277 /** 278 * Populate this GencodeGtfFeature with the given data. 
279 */ GencodeGtfFeature(final GencodeGtfFeatureBaseData baseData)280 protected GencodeGtfFeature(final GencodeGtfFeatureBaseData baseData) { 281 this.baseData = baseData; 282 } 283 284 // ================================================================================================ 285 286 /** 287 * Create the appropriate {@link GencodeGtfFeature} object based on the given {@code baseData} 288 * @param baseData A {@link GencodeGtfFeatureBaseData} object containing all data for a single line in a GENCODE GTF File. 289 * @return A {@link GencodeGtfFeature} containing the data in {@code baseData} 290 */ create(final GencodeGtfFeatureBaseData baseData)291 public static GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) { 292 Utils.nonNull(baseData); 293 294 // Create our feature: 295 return baseData.featureType.create(baseData); 296 } 297 298 /** 299 * Create a {@link GencodeGtfFeature} based on a line from a Gencode GTF File. 300 * @param gtfLine A line from a Gencode GTF File to convert into a {@link GencodeGtfFeature} object. 301 * @param gtfFileType A {@link String} containing the file type of the GTF data that created this {@link GencodeGtfFeature}. 302 * @return The {@link GencodeGtfFeature} representing the information in {@code gtfLine} 303 */ create(final String gtfLine, final String gtfFileType)304 public static GencodeGtfFeature create(final String gtfLine, final String gtfFileType) { 305 Utils.nonNull(gtfLine); 306 return create(gtfLine.split(FIELD_DELIMITER), gtfFileType); 307 } 308 309 /** 310 * Create a {@link GencodeGtfFeature} based on a line from a Gencode GTF File. 311 * @param gtfFields A line from a Gencode GTF File split on the {@link #FIELD_DELIMITER} character. 312 * @param gtfFileType A {@link String} containing the file type of the GTF data that created this {@link GencodeGtfFeature}. 
313 * @return The {@link GencodeGtfFeature} representing the information in {@code gtfLine} 314 */ create(final String[] gtfFields, final String gtfFileType)315 public static GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) { 316 Utils.nonNull(gtfFields); 317 318 // Ensure that the input data are superficially well-formed: 319 if ( gtfFields.length != GencodeGtfCodec.NUM_COLUMNS ) { 320 throw new UserException.MalformedFile("Invalid number of fields in the given GENCODE line " + 321 " - Given: " + gtfFields.length + " Expected: " + GencodeGtfCodec.NUM_COLUMNS); 322 } 323 324 final FeatureType featureType = FeatureType.getEnum( gtfFields[FEATURE_TYPE_INDEX] ); 325 326 // Return our feature: 327 return featureType.create(gtfFields, gtfFileType); 328 } 329 330 // ================================================================================================ 331 332 @Override getContig()333 public String getContig() { 334 return baseData.genomicPosition.getContig(); 335 } 336 337 @Override getStart()338 public int getStart() { 339 return baseData.genomicPosition.getStart(); 340 } 341 342 @Override getEnd()343 public int getEnd() { 344 return baseData.genomicPosition.getEnd(); 345 } 346 347 // ================================================================================================ 348 349 /** 350 * Get all the features from this {@link GencodeGtfFeature} itself. 351 * This is useful to get any subfeatures included in this {@link GencodeGtfFeature}. 352 * @return A {@link List} of the features represented in this {@link GencodeGtfFeature}. 353 */ 354 @VisibleForTesting getAllFeatures()355 List<GencodeGtfFeature> getAllFeatures() { 356 final List<GencodeGtfFeature> list = new ArrayList<>(); 357 list.add(this); 358 return list; 359 } 360 361 /** 362 * Serializes the base data in {@link GencodeGtfFeature} to a string. 
363 * @return a {@link String} representing this {@link GencodeGtfFeature} 364 */ serializeToStringHelper()365 private String serializeToStringHelper() { 366 367 final StringBuilder stringBuilder = new StringBuilder(); 368 369 stringBuilder.append( baseData.genomicPosition.getContig() ); 370 stringBuilder.append( '\t' ); 371 stringBuilder.append( baseData.annotationSource ); 372 stringBuilder.append( '\t' ); 373 stringBuilder.append( baseData.featureType ); 374 stringBuilder.append( '\t' ); 375 stringBuilder.append( baseData.genomicPosition.getStart() ); 376 stringBuilder.append( '\t' ); 377 stringBuilder.append( baseData.genomicPosition.getEnd() ); 378 stringBuilder.append( "\t.\t" ); 379 stringBuilder.append( baseData.genomicStrand ); 380 stringBuilder.append( '\t' ); 381 stringBuilder.append( baseData.genomicPhase ); 382 stringBuilder.append( '\t' ); 383 384 if ( baseData.geneId != null ) { 385 stringBuilder.append("gene_id \""); 386 stringBuilder.append(baseData.geneId); 387 stringBuilder.append( "\"; " ); 388 } 389 if ( baseData.transcriptId != null) { 390 stringBuilder.append("transcript_id \""); 391 stringBuilder.append(baseData.transcriptId); 392 stringBuilder.append( "\"; " ); 393 } 394 if ( baseData.geneType != null ) { 395 stringBuilder.append("gene_type \""); 396 stringBuilder.append(baseData.geneType); 397 stringBuilder.append( "\"; " ); 398 } 399 if ( baseData.geneStatus != null ) { 400 stringBuilder.append("gene_status \""); 401 stringBuilder.append(baseData.geneStatus); 402 stringBuilder.append( "\"; " ); 403 } 404 if ( baseData.geneName != null ) { 405 stringBuilder.append("gene_name \""); 406 stringBuilder.append(baseData.geneName); 407 stringBuilder.append( "\"; " ); 408 } 409 if ( baseData.transcriptType != null ) { 410 stringBuilder.append("transcript_type \""); 411 stringBuilder.append(baseData.transcriptType); 412 stringBuilder.append( "\"; " ); 413 } 414 if ( baseData.transcriptStatus != null ) { 415 stringBuilder.append("transcript_status 
\""); 416 stringBuilder.append(baseData.transcriptStatus); 417 stringBuilder.append( "\"; " ); 418 } 419 if ( baseData.transcriptName != null ) { 420 stringBuilder.append("transcript_name \""); 421 stringBuilder.append(baseData.transcriptName); 422 stringBuilder.append( "\"; " ); 423 } 424 if ( baseData.exonNumber != NO_EXON_NUMBER ) { 425 stringBuilder.append("exon_number "); 426 stringBuilder.append(baseData.exonNumber); 427 stringBuilder.append( "; " ); 428 } 429 if ( baseData.exonId != null) { 430 stringBuilder.append("exon_id \""); 431 stringBuilder.append(baseData.exonId); 432 stringBuilder.append( "\"; "); 433 } 434 if (baseData.locusLevel != null) { 435 stringBuilder.append("level "); 436 stringBuilder.append(baseData.locusLevel); 437 stringBuilder.append("; "); 438 } 439 440 // = = = = = = = = = = = = = = = = = = = = = = = 441 442 // Output our optional fields: 443 stringBuilder.append( 444 baseData.optionalFields.stream().map(Object::toString).collect(Collectors.joining(" ")) 445 ); 446 447 if ( baseData.anonymousOptionalFields != null ) { 448 stringBuilder.append(baseData.anonymousOptionalFields); 449 } 450 451 return stringBuilder.toString().trim(); 452 } 453 454 /** 455 * Serializes all data in {@link GencodeGtfFeature} to a string. 456 * This includes all subfields of child classes. 
457 * @return a {@link String} representing this {@link GencodeGtfFeature} 458 */ serializeToString()459 public String serializeToString() { 460 final StringBuilder stringBuilder = new StringBuilder(); 461 462 final List<GencodeGtfFeature> features = getAllFeatures(); 463 Collections.sort( features ); 464 465 for ( final GencodeGtfFeature feature : features ) { 466 stringBuilder.append( feature.serializeToStringHelper() ); 467 stringBuilder.append("\n"); 468 } 469 470 return stringBuilder.toString().trim(); 471 } 472 473 @Override toString()474 public String toString() { 475 return serializeToString(); 476 } 477 478 // ================================================================================================ 479 getGtfSourceFileType()480 public String getGtfSourceFileType() { return baseData.gtfSourceFileType; } 481 getUcscGenomeVersion()482 public String getUcscGenomeVersion() { 483 return ucscGenomeVersion; 484 } 485 setUcscGenomeVersion(final String ucscGenomeVersion)486 public void setUcscGenomeVersion(final String ucscGenomeVersion) { 487 this.ucscGenomeVersion = ucscGenomeVersion; 488 } 489 getGenomicPosition()490 public SimpleInterval getGenomicPosition() { return baseData.genomicPosition; } 491 getFeatureOrderNumber()492 public int getFeatureOrderNumber() { return baseData.featureOrderNumber; } 493 getChromosomeName()494 public String getChromosomeName() { 495 return baseData.genomicPosition.getContig(); 496 } 497 getAnnotationSource()498 public String getAnnotationSource() { 499 return baseData.annotationSource; 500 } 501 getFeatureType()502 public FeatureType getFeatureType() { 503 return baseData.featureType; 504 } 505 getGenomicStartLocation()506 public int getGenomicStartLocation() { 507 return baseData.genomicPosition.getStart(); 508 } 509 getGenomicEndLocation()510 public int getGenomicEndLocation() { 511 return baseData.genomicPosition.getEnd(); 512 } 513 getGenomicStrand()514 public Strand getGenomicStrand() { 515 return 
baseData.genomicStrand; 516 } 517 getGenomicPhase()518 public GenomicPhase getGenomicPhase() { 519 return baseData.genomicPhase; 520 } 521 getGeneId()522 public String getGeneId() { 523 return baseData.geneId; 524 } 525 getTranscriptId()526 public String getTranscriptId() { 527 return baseData.transcriptId; 528 } 529 getGeneType()530 public GeneTranscriptType getGeneType() { 531 return baseData.geneType; 532 } 533 getGeneName()534 public String getGeneName() { 535 return baseData.geneName; 536 } 537 getTranscriptType()538 public GeneTranscriptType getTranscriptType() { 539 return baseData.transcriptType; 540 } 541 getTranscriptName()542 public String getTranscriptName() { 543 return baseData.transcriptName; 544 } 545 getGeneStatus()546 public GeneTranscriptStatus getGeneStatus() { 547 return baseData.geneStatus; 548 } 549 getTranscriptStatus()550 public GeneTranscriptStatus getTranscriptStatus() { 551 return baseData.transcriptStatus; 552 } 553 getExonNumber()554 public int getExonNumber() { 555 return baseData.exonNumber; 556 } 557 getExonId()558 public String getExonId() { 559 return baseData.exonId; 560 } 561 getLocusLevel()562 public LocusLevel getLocusLevel() { 563 return baseData.locusLevel; 564 } 565 getOptionalFields()566 public List<OptionalField<?>> getOptionalFields() { 567 return baseData.optionalFields; 568 } 569 getAnonymousOptionalFields()570 public String getAnonymousOptionalFields() { 571 return baseData.anonymousOptionalFields; 572 } 573 getOptionalField(final String key)574 public OptionalField<?> getOptionalField(final String key) { 575 for (final OptionalField<?> optionalField : baseData.optionalFields) { 576 if ( optionalField.getName().equals(key) ) { 577 return optionalField; 578 } 579 } 580 return null; 581 } 582 583 /** 584 * Comparable interface implementation for {@link GencodeGtfFeature}. 
585 * 586 * Order is determined by {@link GencodeGtfFeatureBaseData#featureOrderNumber} 587 * 588 * @param other {@link GencodeGtfFeature} to which to compare 589 * @return -1 if this < other; 0 if this == other; 1 if this > other 590 */ 591 @Override compareTo(final GencodeGtfFeature other)592 public int compareTo(final GencodeGtfFeature other) { 593 Utils.nonNull(other); 594 return (baseData.featureOrderNumber - other.baseData.featureOrderNumber); 595 } 596 597 @Override equals(final Object that)598 public boolean equals(final Object that) { 599 if (that == null) { 600 return false; 601 } 602 else if ( this == that ) { 603 return true; 604 } 605 606 boolean isEqual = that instanceof GencodeGtfFeature; 607 if (isEqual) { 608 final GencodeGtfFeature thatFeature = (GencodeGtfFeature) that; 609 isEqual = Objects.equals(baseData, thatFeature.baseData); 610 611 if ( isEqual ) { 612 isEqual = ucscGenomeVersion.equals( thatFeature.getUcscGenomeVersion() ); 613 } 614 } 615 616 return isEqual; 617 } 618 619 @Override hashCode()620 public int hashCode() { 621 return baseData != null ? baseData.hashCode() : 0; 622 } 623 624 /** 625 * Checks if {@code other} is contained within this {@link GencodeGtfFeature}. 626 * Comparison is made using {@link SimpleInterval#contains(Locatable)} ala {@link GencodeGtfFeatureBaseData#genomicPosition} 627 * @param other {@link Locatable} of which to check the bounds. 628 * @return true if {@code other} is contained within the bounds of this {@link GencodeGtfFeature}, false otherwise. 629 */ contains(final Locatable other)630 public boolean contains(final Locatable other) { 631 return baseData.genomicPosition.contains(other); 632 } 633 634 /** 635 * Checks if {@code other} overlaps with this {@link GencodeGtfFeature}. 636 * Comparison is made using {@link SimpleInterval#overlaps(Locatable)} ala {@link GencodeGtfFeatureBaseData#genomicPosition} 637 * @param other {@link Locatable}-derived class of which to check the bounds. 
638 * @return true if {@code other} overlaps the bounds of this {@link GencodeGtfFeature}, false otherwise. 639 */ overlaps(final Locatable other)640 public boolean overlaps(final Locatable other) { 641 return baseData.genomicPosition.overlaps(other); 642 } 643 setFeatureOrderNumber(final int featureOrderNumber)644 public void setFeatureOrderNumber(final int featureOrderNumber) { 645 this.baseData.featureOrderNumber = featureOrderNumber; 646 } 647 648 // ================================================================================================ 649 650 static public class OptionalField<T> { 651 652 private String name; 653 private T value; 654 OptionalField(final String name, final T value)655 public OptionalField(final String name, final T value) { 656 this.name = name; 657 this.value = value; 658 } 659 getName()660 public String getName() { 661 return name; 662 } 663 setName(final String name)664 public void setName(final String name) { 665 this.name = name; 666 } 667 getValue()668 public T getValue() { 669 return value; 670 } 671 setValue(final T value)672 public void setValue(final T value) { 673 this.value = value; 674 } 675 676 @Override toString()677 public String toString() { 678 679 final StringBuilder sb = new StringBuilder(); 680 681 sb.append(name); 682 sb.append(" "); 683 684 // We need to do some formatting for the numbers / non-numbers in the field: 685 final String valueString = value.toString(); 686 if ( NUMBER_PATTERN.matcher(valueString).matches() ) { 687 sb.append(valueString); 688 sb.append(";"); 689 } 690 else { 691 sb.append("\""); 692 sb.append(valueString); 693 sb.append("\";"); 694 } 695 696 return sb.toString(); 697 } 698 699 @Override hashCode()700 public int hashCode() { 701 int result = name != null ? name.hashCode() : 0; 702 result = 31 * result + (value != null ? 
value.hashCode() : 0); 703 return result; 704 } 705 706 @Override equals(final Object other)707 public boolean equals(final Object other) { 708 709 if (other == null) { 710 return false; 711 } 712 else if ( this == other ) { 713 return true; 714 } 715 716 if ( !(other instanceof OptionalField) ) { 717 return false; 718 } 719 720 final OptionalField<?> otherOptionalField = (OptionalField<?>) other; 721 722 return (name.equals(otherOptionalField.name)) && 723 (value.equals(otherOptionalField.value)); 724 } 725 } 726 727 // ================================================================================================ 728 729 730 731 // ================================================================================================ 732 733 /** 734 * Keyword identifying the source of the feature, like a program 735 * (e.g. Augustus or RepeatMasker) or an organization (like TAIR). 736 * 737 * For more information, see: 738 * https://www.gencodegenes.org/data_format.html 739 * https://en.wikipedia.org/wiki/General_feature_format 740 */ 741 public enum AnnotationSource { 742 ENSEMBL, 743 HAVANA, 744 ena // From ENSEMBLE GTFs 745 } 746 747 /** 748 * Type of the feature represented in a single line of a GENCODE GTF File. 
749 * 750 * For more information, see: 751 * https://www.gencodegenes.org/data_format.html 752 * https://en.wikipedia.org/wiki/General_feature_format 753 */ 754 public enum FeatureType { 755 GENE("gene"){ create(final GencodeGtfFeatureBaseData baseData)756 public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) { 757 return GencodeGtfGeneFeature.create(baseData); 758 } create(final String[] gtfFields, final String gtfFileType)759 public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) { 760 return GencodeGtfGeneFeature.create(gtfFields, gtfFileType); 761 } 762 }, 763 TRANSCRIPT("transcript"){ create(final GencodeGtfFeatureBaseData baseData)764 public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) { 765 return GencodeGtfTranscriptFeature.create(baseData); 766 } create(final String[] gtfFields, final String gtfFileType)767 public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) { 768 return GencodeGtfTranscriptFeature.create(gtfFields, gtfFileType); 769 } 770 }, 771 SELENOCYSTEINE("Selenocysteine"){ create(final GencodeGtfFeatureBaseData baseData)772 public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) { 773 return GencodeGtfSelenocysteineFeature.create(baseData); 774 } create(final String[] gtfFields, final String gtfFileType)775 public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) { 776 return GencodeGtfSelenocysteineFeature.create(gtfFields, gtfFileType); 777 } 778 }, 779 EXON("exon"){ create(final GencodeGtfFeatureBaseData baseData)780 public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) { 781 return GencodeGtfExonFeature.create(baseData); 782 } create(final String[] gtfFields, final String gtfFileType)783 public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) { 784 return GencodeGtfExonFeature.create(gtfFields, gtfFileType); 785 } 786 }, 787 CDS("CDS"){ create(final 
GencodeGtfFeatureBaseData baseData)788 public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) { 789 return GencodeGtfCDSFeature.create(baseData); 790 } create(final String[] gtfFields, final String gtfFileType)791 public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) { 792 return GencodeGtfCDSFeature.create(gtfFields, gtfFileType); 793 } 794 }, 795 START_CODON("start_codon"){ create(final GencodeGtfFeatureBaseData baseData)796 public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) { 797 return GencodeGtfStartCodonFeature.create(baseData); 798 } create(final String[] gtfFields, final String gtfFileType)799 public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) { 800 return GencodeGtfStartCodonFeature.create(gtfFields, gtfFileType); 801 } 802 }, 803 STOP_CODON("stop_codon"){ create(final GencodeGtfFeatureBaseData baseData)804 public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) { 805 return GencodeGtfStopCodonFeature.create(baseData); 806 } create(final String[] gtfFields, final String gtfFileType)807 public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) { 808 return GencodeGtfStopCodonFeature.create(gtfFields, gtfFileType); 809 } 810 }, 811 UTR("UTR"){ create(final GencodeGtfFeatureBaseData baseData)812 public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData) { 813 return GencodeGtfUTRFeature.create(baseData); 814 } create(final String[] gtfFields, final String gtfFileType)815 public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType) { 816 return GencodeGtfUTRFeature.create(gtfFields, gtfFileType); 817 } 818 }; 819 820 @SuppressWarnings("unchecked") 821 private static final Map<String, FeatureType> VALUE_MAP = 822 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v)); 823 824 private final String serialized; 825 FeatureType(final String 
serializedValue)826 FeatureType(final String serializedValue) { serialized = serializedValue; } 827 828 @Override toString()829 public String toString() { return serialized; } 830 getEnum(final String s)831 public static FeatureType getEnum(final String s) { 832 final String lowerS = s.toLowerCase(); 833 if ( VALUE_MAP.containsKey(lowerS) ){ 834 return VALUE_MAP.get(lowerS); 835 } 836 throw new IllegalArgumentException("Unexpected value: " + s); 837 } 838 839 /** 840 * Create a {@link GencodeGtfFeature} of this type given {@code baseData} 841 * @param baseData The data to use to create a {@link GencodeGtfFeature} 842 * @return The {@link GencodeGtfFeature} represented by the given {@code baseData} 843 */ create(final GencodeGtfFeatureBaseData baseData)844 abstract public GencodeGtfFeature create(final GencodeGtfFeatureBaseData baseData); 845 846 /** 847 * Create a {@link GencodeGtfFeature} of this type given {@code gtfFields} 848 * @param gtfFields The data to use to create a {@link GencodeGtfFeature} 849 * @param gtfFileType A {@link String} containing the file type of the GTF data that created this {@link GencodeGtfFeature}. 850 * @return The {@link GencodeGtfFeature} represented by the given {@code gtfFields} 851 */ create(final String[] gtfFields, final String gtfFileType)852 abstract public GencodeGtfFeature create(final String[] gtfFields, final String gtfFileType); 853 } 854 855 /** 856 * Whether the first base of the CDS segment is the first (frame 0), second (frame 1) or third (frame 2) \ 857 * in the codon of the ORF. 
858 * 859 * For more information, see: 860 * https://www.gencodegenes.org/data_format.html 861 * https://en.wikipedia.org/wiki/General_feature_format 862 */ 863 public enum GenomicPhase { 864 ZERO("0"), 865 ONE ("1"), 866 TWO ("2"), 867 DOT ("."); 868 869 @SuppressWarnings("unchecked") 870 private static final Map<String, GenomicPhase> VALUE_MAP = 871 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v)); 872 873 private final String serialized; 874 GenomicPhase(final String serializedValue)875 GenomicPhase(final String serializedValue) { 876 serialized = serializedValue; 877 } 878 879 @Override toString()880 public String toString() { 881 return serialized; 882 } 883 getEnum(final String s)884 public static GenomicPhase getEnum(final String s) { 885 final String lowerS = s.toLowerCase(); 886 if ( VALUE_MAP.containsKey(lowerS) ){ 887 return VALUE_MAP.get(lowerS); 888 } 889 throw new IllegalArgumentException("Unexpected value: " + s); 890 } 891 } 892 893 /** 894 * Biotype / transcript type for the transcript or gene represented in a feature. 895 * This is a tag of some biological function associated with a feature. 896 * 897 * For more information, see: 898 * https://www.gencodegenes.org/data_format.html 899 * https://en.wikipedia.org/wiki/General_feature_format 900 */ 901 public enum GeneTranscriptType { 902 // Immunoglobulin (Ig) variable chain and T-cell receptor (TcR) genes imported or annotated according to the IMGT (http://www.imgt.org/) 903 IG_C_GENE("IG_C_gene"), 904 IG_D_GENE("IG_D_gene"), 905 IG_J_GENE("IG_J_gene"), 906 IG_LV_GENE("IG_LV_gene"), 907 IG_V_GENE("IG_V_gene"), 908 TR_C_GENE("TR_C_gene"), 909 TR_J_GENE("TR_J_gene"), 910 TR_V_GENE("TR_V_gene"), 911 TR_D_GENE("TR_D_gene"), 912 913 // Inactivated immunoglobulin gene. 
914 IG_PSEUDOGENE("IG_pseudogene"), 915 IG_C_PSEUDOGENE("IG_C_pseudogene"), 916 IG_J_PSEUDOGENE("IG_J_pseudogene"), 917 IG_V_PSEUDOGENE("IG_V_pseudogene"), 918 TR_V_PSEUDOGENE("TR_V_pseudogene"), 919 TR_J_PSEUDOGENE("TR_J_pseudogene"), 920 921 // Non-coding RNA predicted using sequences from Rfam (http://rfam.xfam.org/) and miRBase (http://www.mirbase.org/) 922 MT_RRNA("Mt_rRNA"), 923 MT_TRNA("Mt_tRNA"), 924 MIRNA("miRNA"), 925 MISC_RNA("misc_RNA"), 926 RRNA("rRNA"), 927 928 SCRNA("scRNA"), 929 SNRNA("snRNA"), 930 SNORNA("snoRNA"), 931 RIBOZYME("ribozyme"), 932 SRNA("sRNA"), 933 SCARNA("scaRNA"), 934 935 // ENSEMBL-Specific values: 936 TRNA("tRNA"), 937 TMRNA("tmRNA"), 938 939 // Non-coding RNA predicted to be pseudogene by the Ensembl pipeline 940 MT_TRNA_PSEUDOGENE("Mt_tRNA_pseudogene"), 941 TRNA_PSEUDOGENE("tRNA_pseudogene"), 942 SNORNA_PSEUDOGENE("snoRNA_pseudogene"), 943 SNRNA_PSEUDOGENE("snRNA_pseudogene"), 944 SCRNA_PSEUDOGENE("scRNA_pseudogene"), 945 RRNA_PSEUDOGENE("rRNA_pseudogene"), 946 MISC_RNA_PSEUDOGENE("misc_RNA_pseudogene"), 947 MIRNA_PSEUDOGENE("miRNA_pseudogene"), 948 949 // To be Experimentally Confirmed. This is used for non-spliced EST clusters that have polyA features. This category has been specifically created for the ENCODE project to highlight regions that could indicate the presence of protein coding genes that require experimental validation, either by 5' RACE or RT-PCR to extend the transcripts, or by confirming expression of the putatively-encoded peptide with specific antibodies. 950 TEC("TEC"), 951 952 // If the coding sequence (following the appropriate reference) of a transcript finishes >50bp from a downstream splice site then it is tagged as NMD. If the variant does not cover the full reference coding sequence then it is annotated as NMD if NMD is unavoidable i.e. no matter what the exon structure of the missing portion is the transcript will be subject to NMD. 
953 NONSENSE_MEDIATED_DECAY("nonsense_mediated_decay"), 954 955 // Transcript that has polyA features (including signal) without a prior stop codon in the CDS, i.e. a non-genomic polyA tail attached directly to the CDS without 3' UTR. These transcripts are subject to degradation. 956 NON_STOP_DECAY("non_stop_decay"), 957 958 // Alternatively spliced transcript believed to contain intronic sequence relative to other, coding, variants. 959 RETAINED_INTRON("retained_intron"), 960 961 // Contains an open reading frame (ORF). 962 PROTEIN_CODING("protein_coding"), 963 964 // Doesn't contain an ORF. 965 PROCESSED_TRANSCRIPT("processed_transcript"), 966 967 // Transcript which is known from the literature to not be protein coding. 968 NON_CODING("non_coding"), 969 970 // Transcript believed to be protein coding, but with more than one possible open reading frame. 971 AMBIGUOUS_ORF("ambiguous_orf"), 972 973 // Long non-coding transcript in introns of a coding gene that does not overlap any exons. 974 SENSE_INTRONIC("sense_intronic"), 975 976 // Long non-coding transcript that contains a coding gene in its intron on the same strand. 977 SENSE_OVERLAPPING("sense_overlapping"), 978 979 // Has transcripts that overlap the genomic span (i.e. exon or introns) of a protein-coding locus on the opposite strand. 980 ANTISENSE("antisense"), 981 ANTISENSE_RNA("antisense_RNA"), 982 983 KNOWN_NCRNA("known_ncrna"), 984 985 // Have homology to proteins but generally suffer from a disrupted coding sequence and an active homologous gene can be found at another locus. Sometimes these entries have an intact coding sequence or an open but truncated ORF, in which case there is other evidence used (for example genomic polyA stretches at the 3' end) to classify them as a pseudogene. Can be further classified as one of the following. 
986 PSEUDOGENE("pseudogene"), 987 988 // Pseudogene that lack introns and is thought to arise from reverse transcription of mRNA followed by reinsertion of DNA into the genome. 989 PROCESSED_PSEUDOGENE("processed_pseudogene"), 990 991 // Pseudogene owing to a SNP/DIP but in other individuals/haplotypes/strains the gene is translated. 992 POLYMORPHIC_PSEUDOGENE("polymorphic_pseudogene"), 993 994 // Pseudogene owing to a reverse transcribed and re-inserted sequence. 995 RETROTRANSPOSED("retrotransposed"), 996 997 // Pseudogene where protein homology or genomic structure indicates a pseudogene, but the presence of locus-specific transcripts indicates expression. 998 TRANSCRIBED_PROCESSED_PSEUDOGENE("transcribed_processed_pseudogene"), 999 TRANSCRIBED_UNPROCESSED_PSEUDOGENE("transcribed_unprocessed_pseudogene"), 1000 TRANSCRIBED_UNITARY_PSEUDOGENE("transcribed_unitary_pseudogene"), 1001 1002 // Pseudogene that has mass spec data suggesting that it is also translated. 1003 TRANSLATED_PROCESSED_PSEUDOGENE("translated_processed_pseudogene"), 1004 TRANSLATED_UNPROCESSED_PSEUDOGENE("translated_unprocessed_pseudogene"), 1005 1006 // A species specific unprocessed pseudogene without a parent gene, as it has an active orthologue in another species. 1007 UNITARY_PSEUDOGENE("unitary_pseudogene"), 1008 1009 // Pseudogene that can contain introns since produced by gene duplication. 1010 UNPROCESSED_PSEUDOGENE("unprocessed_pseudogene"), 1011 1012 // Used to tag mistakes in the public databases (Ensembl/SwissProt/Trembl) 1013 ARTIFACT("artifact"), 1014 1015 // Long, intervening noncoding (linc) RNA that can be found in evolutionarily conserved, intergenic regions. 1016 LINCRNA("lincRNA"), 1017 LNCRNA("lncRNA"), 1018 1019 // Unspliced lncRNA that is several kb in size. 1020 MACRO_LNCRNA("macro_lncRNA"), 1021 1022 // Transcript where ditag and/or published experimental data strongly supports the existence of short non-coding transcripts transcribed from the 3'UTR. 
1023 THREE_PRIME_OVERLAPPING_NCRNA("3prime_overlapping_ncRNA"), 1024 1025 // Otherwise viable coding region omitted from this alternatively spliced transcript because the splice variation affects a region coding for a protein domain. 1026 DISRUPTED_DOMAIN("disrupted_domain"), 1027 1028 // Short non coding RNA gene that forms part of the vault ribonucleoprotein complex. 1029 VAULTRNA("vaultRNA"), 1030 1031 // A non-coding locus that originates from within the promoter region of a protein-coding gene, with transcription proceeding in the opposite direction on the other strand. 1032 BIDIRECTIONAL_PROMOTER_LNCRNA("bidirectional_promoter_lncRNA"); 1033 1034 @SuppressWarnings("unchecked") 1035 private static final Map<String, GeneTranscriptType> VALUE_MAP = 1036 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v)); 1037 1038 private final String serialized; 1039 GeneTranscriptType(final String serializedValue)1040 GeneTranscriptType(final String serializedValue) { 1041 serialized = serializedValue; 1042 } 1043 1044 @Override toString()1045 public String toString() { 1046 return serialized; 1047 } 1048 1049 private static final Map<String, String> SPECIAL_CASE_STRING_VALUE_MAP = createSpecialCaseMap(); 1050 getEnum(final String s)1051 public static GeneTranscriptType getEnum(final String s) { 1052 String lowerS = s.toLowerCase(); 1053 1054 // Handle special cases: 1055 lowerS = SPECIAL_CASE_STRING_VALUE_MAP.getOrDefault(lowerS, lowerS); 1056 1057 if ( VALUE_MAP.containsKey(lowerS) ){ 1058 return VALUE_MAP.get(lowerS); 1059 } 1060 throw new IllegalArgumentException("Unexpected value: " + s); 1061 } 1062 1063 /** 1064 * Create a special case map for alternate field names for known {@link GeneTranscriptType}s. 
1065 */ createSpecialCaseMap()1066 private static Map<String, String> createSpecialCaseMap() { 1067 final Map<String, String> map = new HashMap<>(); 1068 1069 // From ENSEMBLE GTF files: 1070 map.put("ncrna", "non_coding"); 1071 1072 return map; 1073 } 1074 1075 } 1076 1077 /** 1078 * Indication of whether a feature is new, tenatative, or already known. 1079 * 1080 * This attribute was removed after release 25. 1081 * 1082 * For more information, see: 1083 * https://www.gencodegenes.org/data_format.html 1084 * https://en.wikipedia.org/wiki/General_feature_format 1085 */ 1086 public enum GeneTranscriptStatus { 1087 KNOWN, 1088 NOVEL, 1089 PUTATIVE 1090 } 1091 1092 /** 1093 * Status of how a position was annotated / verified: 1094 * 1095 * 1 - verified locus 1096 * 2 - manually annotated locus 1097 * 3 - automatically annotated locus 1098 * 1099 * For more information, see: 1100 * https://www.gencodegenes.org/data_format.html 1101 * https://en.wikipedia.org/wiki/General_feature_format 1102 */ 1103 public enum LocusLevel { 1104 /** Verified locus */ 1105 VERIFIED("1"), 1106 1107 /** Manually annotated locus */ 1108 MANUALLY_ANNOTATED("2"), 1109 1110 /** Automatically annotated locus */ 1111 AUTOMATICALLY_ANNOTATED("3"); 1112 1113 @SuppressWarnings("unchecked") 1114 private static final Map<String, LocusLevel> VALUE_MAP = 1115 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v)); 1116 1117 private final String serialized; 1118 LocusLevel(final String serializedValue)1119 LocusLevel(final String serializedValue) { 1120 serialized = serializedValue; 1121 } 1122 1123 @Override toString()1124 public String toString() { 1125 return serialized; 1126 } 1127 getEnum(final String s)1128 public static LocusLevel getEnum(final String s) { 1129 final String lowerS = s.toLowerCase(); 1130 if ( VALUE_MAP.containsKey(lowerS) ){ 1131 return VALUE_MAP.get(lowerS); 1132 } 1133 throw new IllegalArgumentException("Unexpected value: " + s); 1134 } 1135 
} 1136 1137 /** 1138 * Additional relevant information appended to a feature. 1139 * 1140 * For more information, see: 1141 * https://www.gencodegenes.org/data_format.html 1142 * https://en.wikipedia.org/wiki/General_feature_format 1143 * https://www.gencodegenes.org/pages/tags.html 1144 */ 1145 public enum FeatureTag { 1146 /** 3' end extended based on RNA-seq data. */ 1147 THREE_PRIME_NESTED_SUPPORTED_EXTENSION("3_nested_supported_extension"), 1148 1149 /** 3' end extended based on RNA-seq data. */ 1150 THREE_PRIME_STANDARD_SUPPORTED_EXTENSION("3_standard_supported_extension"), 1151 1152 /** annotated based on RNA-seq data. */ 1153 FOURFIVEFOUR_RNA_SEQ_SUPPORTED("454_RNA_Seq_supported"), 1154 1155 /** 5' end extended based on RNA-seq data. */ 1156 FIVE_PRIME_NESTED_SUPPORTED_EXTENSION("5_nested_supported_extension"), 1157 1158 /** 5' end extended based on RNA-seq data. */ 1159 FIVE_PRIME_STANDARD_SUPPORTED_EXTENSION("5_standard_supported_extension"), 1160 1161 /** shares an identical CDS but has alternative 5' UTR with respect to a reference variant. */ 1162 ALTERNATIVE_3_UTR("alternative_3_UTR"), 1163 1164 /** shares an identical CDS but has alternative 3' UTR with respect to a reference variant. */ 1165 ALTERNATIVE_5_UTR("alternative_5_UTR"), 1166 1167 // -------------------------------------------------------------------------------------------------------- 1168 // Please note that the ordering of the APPRIS_* tags is also used in sorting here. Do not re-order! 1169 // -------------------------------------------------------------------------------------------------------- 1170 /** Transcript expected to code for the main functional isoform based on a range of protein features (APPRIS pipeline). 
*/ 1171 APPRIS_PRINCIPAL("appris_principal"), 1172 1173 /** (This flag corresponds to the older flag "appris_principal") Where the transcript expected to code for the main */ 1174 APPRIS_PRINCIPAL_1("appris_principal_1"), 1175 1176 /** (This flag corresponds to the older flag "appris_candidate_ccds") Where the APPRIS core modules are unable to choose a */ 1177 APPRIS_PRINCIPAL_2("appris_principal_2"), 1178 1179 /** Where the APPRIS core modules are unable to choose a clear principal variant and there more than one of the variants */ 1180 APPRIS_PRINCIPAL_3("appris_principal_3"), 1181 1182 /** (This flag corresponds to the Ensembl 78 flag "appris_candidate_longest_ccds") Where the APPRIS core modules are unable */ 1183 APPRIS_PRINCIPAL_4("appris_principal_4"), 1184 1185 /** (This flag corresponds to the Ensembl 78 flag "appris_candidate_longest_seq") Where the APPRIS core modules are unable */ 1186 APPRIS_PRINCIPAL_5("appris_principal_5"), 1187 1188 /** Candidate transcript(s) models that are conserved in at least three tested non-primate species. */ 1189 APPRIS_ALTERNATIVE_1("appris_alternative_1"), 1190 1191 /** Candidate transcript(s) models that appear to be conserved in fewer than three tested non-primate species. */ 1192 APPRIS_ALTERNATIVE_2("appris_alternative_2"), 1193 1194 /** where there is no 'appris_principal' variant, the candidate with highest APPRIS score is selected as the primary */ 1195 APPRIS_CANDIDATE_HIGHEST_SCORE("appris_candidate_highest_score"), 1196 1197 /** the "appris_candidate" transcripts where there are several CCDS, in this case APPRIS labels the longest CCDS. */ 1198 APPRIS_CANDIDATE_LONGEST_CCDS("appris_candidate_longest_ccds"), 1199 1200 /** the "appris_candidate" transcript that has an unique CCDS. 
*/ 1201 APPRIS_CANDIDATE_CCDS("appris_candidate_ccds"), 1202 1203 /** where there is no "appris_candidate_ccds" or "appris_candidate_longest_ccds" variant, the longest protein of the */ 1204 APPRIS_CANDIDATE_LONGEST_SEQ("appris_candidate_longest_seq"), 1205 1206 /** where there is no 'appris_principal' variant, the longest of the 'appris_candidate' variants is selected as the primary */ 1207 APPRIS_CANDIDATE_LONGEST("appris_candidate_longest"), 1208 1209 /** where there is no single 'appris_principal' variant the main functional isoform will be translated from one of the */ 1210 APPRIS_CANDIDATE("appris_candidate"), 1211 1212 /** identifies a subset of representative transcripts for each gene; prioritises full-length protein coding transcripts */ 1213 BASIC("basic"), 1214 1215 /** Transcript contains two confidently annotated CDSs. Support may come from eg proteomic data, cross-species conservation */ 1216 BICISTRONIC("bicistronic"), 1217 1218 /** Transcript 5' end overlaps ENCODE or Fantom CAGE cluster. */ 1219 CAGE_SUPPORTED_TSS("CAGE_supported_TSS"), 1220 1221 /** member of the consensus CDS gene set, confirming coding regions between ENSEMBL, UCSC, NCBI and HAVANA. */ 1222 CCDS("CCDS"), 1223 1224 /** The coding region end could not be confirmed. */ 1225 CDS_END_NF("cds_end_NF"), 1226 1227 /** The coding region start could not be confirmed. */ 1228 CDS_START_NF("cds_start_NF"), 1229 1230 /** Transcript QC checked using dotplot to identify features eg splice junctions, end of homology. */ 1231 DOTTER_CONFIRMED("dotter_confirmed"), 1232 1233 /** an upstream ATG is used where a downstream ATG seems more evolutionary conserved. */ 1234 DOWNSTREAM_ATG("downstream_ATG"), 1235 1236 /** Transcript was tested and confirmed experimentally. 
*/ 1237 EXP_CONF("exp_conf"), 1238 1239 /** locus consists of non-overlapping transcript fragments either because of genome assembly issues (i.e., gaps or */ 1240 FRAGMENTED_LOCUS("fragmented_locus"), 1241 1242 /** Transcript model contains all possible in-frame exons supported by homology, experimental evidence or conservation, but */ 1243 INFERRED_EXON_COMBINATION("inferred_exon_combination"), 1244 1245 /** Transcript model is not supported by a single piece of transcript evidence. May be supported by multiple fragments of */ 1246 INFERRED_TRANSCRIPT_MODEL("inferred_transcript_model"), 1247 1248 /** Transcript supported by transcript evidence that, while ampping best-in-genome, shows regions of poor sequence quality. */ 1249 LOW_SEQUENCE_QUALITY("low_sequence_quality"), 1250 1251 /** the mRNA end could not be confirmed. */ 1252 MRNA_END_NF("mRNA_end_NF"), 1253 1254 /** the mRNA start could not be confirmed. */ 1255 MRNA_START_NF("mRNA_start_NF"), 1256 1257 /** the transcript belongs to the MANE Select data set. The Matched Annotation from NCBI and EMBL-EBI project (MANE) is a collaboration between Ensembl-GENCODE and RefSeq to select a default transcript per human protein coding locus that is representative of biology, well-supported, expressed and conserved. This transcript set matches GRCh38 and is 100% identical between RefSeq and Ensembl-GENCODE for 5' UTR, CDS, splicing and 3' UTR. */ 1258 MANE_SELECT("MANE_Select"), 1259 1260 /** in-frame type of variation where, at the acceptor site, some variants splice after the first AG and others after the */ 1261 NAGNAG_SPLICE_SITE("NAGNAG_splice_site"), 1262 1263 /** the locus is a host for small non-coding RNAs. */ 1264 NCRNA_HOST("ncRNA_host"), 1265 1266 /** annotated based on RNA-seq data. 
*/ 1267 NESTED_454_RNA_SEQ_SUPPORTED("nested_454_RNA_Seq_supported"), 1268 1269 /** the transcript looks like it is subject to NMD but publications, experiments or conservation support the translation of */ 1270 NMD_EXCEPTION("NMD_exception"), 1271 1272 /** codon if the transcript were longer but cannot currently be annotated as NMD as does not fulfil all criteria - most */ 1273 NMD_LIKELY_IF_EXTENDED("NMD_likely_if_extended"), 1274 1275 /** the CDS has a non-ATG start and its validity is supported by publication or conservation. */ 1276 NON_ATG_START("non_ATG_start"), 1277 1278 /** the transcript has a non-canonical splice site conserved in other species. */ 1279 NON_CANONICAL_CONSERVED("non_canonical_conserved"), 1280 1281 /** the transcript has a non-canonical splice site explained by a genomic sequencing error. */ 1282 NON_CANONICAL_GENOME_SEQUENCE_ERROR("non_canonical_genome_sequence_error"), 1283 1284 /** the transcript has a non-canonical splice site explained by other reasons. */ 1285 NON_CANONICAL_OTHER("non_canonical_other"), 1286 1287 /** the transcript has a non-canonical splice site explained by a SNP. */ 1288 NON_CANONICAL_POLYMORPHISM("non_canonical_polymorphism"), 1289 1290 /** the transcript has a non-canonical splice site that needs experimental confirmation. */ 1291 NON_CANONICAL_TEC("non_canonical_TEC"), 1292 1293 /** the transcript has a non-canonical splice site explained by a U12 intron (i.e. AT-AC splice site). */ 1294 NON_CANONICAL_U12("non_canonical_U12"), 1295 1296 /** a splice variant for which supporting evidence has not been submitted to databases, i.e. the model is based on */ 1297 NON_SUBMITTED_EVIDENCE("non_submitted_evidence"), 1298 1299 /** a transcript is supported by evidence from same species paralogous loci. */ 1300 NOT_BEST_IN_GENOME_EVIDENCE("not_best_in_genome_evidence"), 1301 1302 /** evidence from other species was used to build model. 
*/ 1303 NOT_ORGANISM_SUPPORTED("not_organism_supported"), 1304 1305 /** protein-coding locus with no paralogues or orthologs. */ 1306 ORPHAN("orphan"), 1307 1308 /** exon(s) of the locus overlap exon(s) of a readthrough transcript or a transcript belonging to another locus. */ 1309 OVERLAPPING_LOCUS("overlapping_locus"), 1310 1311 /** a low confidence upstream ATG existing in other coding variant would lead to NMD in this trancript, that uses the high */ 1312 OVERLAPPING_UORF("overlapping_uORF"), 1313 1314 /** annotation in the pseudo-autosomal region, which is duplicated between chromosomes X and Y. */ 1315 PAR("PAR"), 1316 1317 /** member of the pseudogene set predicted by YALE, UCSC and HAVANA. */ 1318 PSEUDO_CONSENS("pseudo_consens"), 1319 1320 /** a transcript that overlaps two or more independent loci but is considered to belong to a third, separate locus. */ 1321 READTHROUGH_TRANSCRIPT("readthrough_transcript"), 1322 1323 /** locus overlaps a sequence error or an assembly error in the reference genome that affects its annotation (e.g., 1 or */ 1324 REFERENCE_GENOME_ERROR("reference_genome_error"), 1325 1326 /** internal intron of CDS portion of transcript is retained. */ 1327 RETAINED_INTRON_CDS("retained_intron_CDS"), 1328 1329 /** final intron of CDS portion of transcript is retained. */ 1330 RETAINED_INTRON_FINAL("retained_intron_final"), 1331 1332 /** first intron of CDS portion of transcript is retained. */ 1333 RETAINED_INTRON_FIRST("retained_intron_first"), 1334 1335 /** protein-coding locus created via retrotransposition. */ 1336 RETROGENE("retrogene"), 1337 1338 /** Transcript supported by RNAseq data and not supported by mRNA or EST evidence. */ 1339 RNA_SEQ_SUPPORTED_ONLY("RNA_Seq_supported_only"), 1340 1341 /** Transcript annotated based on mixture of RNA-seq data and EST/mRNA/protein evidence. 
*/ 1342 RNA_SEQ_SUPPORTED_PARTIAL("RNA_Seq_supported_partial"), 1343 1344 /** Transcript that contains a CDS that has a translation initiation site supported by Ribosomal Profiling data. */ 1345 RP_SUPPORTED_TIS("RP_supported_TIS"), 1346 1347 /** contains a selenocysteine. */ 1348 SELENO("seleno"), 1349 1350 /** a processed pseudogene with one or more introns still present. These are likely formed through the retrotransposition */ 1351 SEMI_PROCESSED("semi_processed"), 1352 1353 /** Transcript contains at least 1 non-canonical splice junction that is associated with a known or novel genome sequence */ 1354 SEQUENCE_ERROR("sequence_error"), 1355 1356 /** Transcript whose coding sequence contains an internal stop codon that does not cause the translation termination. */ 1357 STOP_CODON_READTHROUGH("stop_codon_readthrough"), 1358 1359 /** Transcript created or extended using assembled RNA-seq long reads. */ 1360 TAGENE("TAGENE"), 1361 1362 /** an upstream ATG exists when a downstream ATG is better supported. 
*/ 1363 UPSTREAM_ATG("upstream_ATG"), 1364 1365 /** a low confidence upstream ATG existing in other coding variant would lead to NMD in this trancript, that uses the high */ 1366 UPSTREAM_UORF("upstream_uORF"); 1367 1368 @SuppressWarnings("unchecked") 1369 private static final Map<String, FeatureTag> VALUE_MAP = 1370 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v)); 1371 1372 private final String serialized; 1373 FeatureTag(final String serializedValue)1374 FeatureTag(final String serializedValue) { 1375 serialized = serializedValue; 1376 } 1377 1378 @Override toString()1379 public String toString() { 1380 return serialized; 1381 } 1382 getEnum(final String s)1383 public static FeatureTag getEnum(final String s) { 1384 final String lowerS = s.toLowerCase(); 1385 if ( VALUE_MAP.containsKey(lowerS) ){ 1386 return VALUE_MAP.get(lowerS); 1387 } 1388 throw new IllegalArgumentException("Unexpected value: " + s); 1389 } 1390 } 1391 1392 /** 1393 * Transcript score according to how well mRNA and EST alignments match over its full length. 
/**
 * Transcript score according to how well mRNA and EST alignments match over its full length.
 *
 * For more information, see:
 *     https://www.gencodegenes.org/data_format.html
 *     https://en.wikipedia.org/wiki/General_feature_format
 */
public enum TranscriptSupportLevel {
    /** all splice junctions of the transcript are supported by at least one non-suspect mRNA */
    ALL_MRNA_VERIFIED("1"),

    /** the best supporting mRNA is flagged as suspect or the support is from multiple ESTs */
    BEST_MRNA_SUSPECT("2"),

    /** the only support is from a single EST */
    SINGLE_EST_SUPPORT("3"),

    /** the best supporting EST is flagged as suspect */
    BEST_EST_SUSPECT("4"),

    /** no single transcript supports the model structure */
    NO_SINGLE_TRANSCRIPT_SUPPORT("5"),

    /** the transcript was not analyzed */
    NA("NA");

    /** Index of every constant, keyed by its lower-cased serialized text. */
    private static final Map<String, TranscriptSupportLevel> VALUE_MAP = new HashMap<>();

    static {
        // Enum constants are fully constructed before static initializers run,
        // so each constant's serialized text is available here.
        for ( final TranscriptSupportLevel level : values() ) {
            VALUE_MAP.put(level.serialized.toLowerCase(), level);
        }
    }

    /** Text used for this level in the GTF data. */
    private final String serialized;

    TranscriptSupportLevel(final String serializedValue) {
        this.serialized = serializedValue;
    }

    /** @return The text this level was constructed with. */
    @Override
    public String toString() {
        return this.serialized;
    }

    /**
     * Resolve a serialized support level to its constant; the comparison ignores case.
     * @param s Text to resolve.  A {@code null} value is not supported.
     * @return The {@link TranscriptSupportLevel} serialized as {@code s}.
     * @throws IllegalArgumentException when {@code s} does not correspond to any constant.
     */
    public static TranscriptSupportLevel getEnum(final String s) {
        final TranscriptSupportLevel match = VALUE_MAP.get(s.toLowerCase());
        if ( match != null ) {
            return match;
        }
        throw new IllegalArgumentException("Unexpected value: " + s);
    }
}
1444 * 1445 * For more information, see: 1446 * https://www.gencodegenes.org/data_format.html 1447 * https://en.wikipedia.org/wiki/General_feature_format 1448 * http://www.gencodegenes.org/releases/grch37_mapped_releases.html#attrib 1449 */ 1450 public enum RemapStatus { 1451 /** 1452 * Gene or transcript completely mapped to the target genome with all features intact. 1453 */ 1454 FULL_CONTIG("full_contig"), 1455 1456 /** 1457 * Gene or transcript completely mapped to the target genome with insertions in some features. These are usually small insertions. 1458 */ 1459 FULL_FRAGMENT("full_fragment"), 1460 1461 /** 1462 * Gene or transcript partially mapped to the target genome. 1463 */ 1464 PARTIAL("partial"), 1465 1466 /** 1467 * Gene or transcript did not map to the target genome. 1468 */ 1469 DELETED("deleted"), 1470 1471 /** 1472 * The source sequence is not in the assembly alignments. This will occur with alt loci genes if the alignments only contain the primary assembly. 1473 */ 1474 NO_SEQ_MAP("no_seq_map"), 1475 1476 /** 1477 * Transcripts in the gene mapped to multiple locations. 1478 */ 1479 GENE_CONFLICT("gene_conflict"), 1480 1481 /** 1482 * Transcripts caused gene length to change by more than 50%. This is to detect mapping to processed pseudogenes and mapping across tandem gene duplications. 1483 */ 1484 GENE_SIZE_CHANGE("gene_size_change"), 1485 1486 /** 1487 * Gene is from a small, automatic (ENSEMBL source) non-coding RNA. Taken from the target annotation. 1488 */ 1489 AUTOMATIC_SMALL_NCRNA_GENE("automatic_small_ncrna_gene"), 1490 1491 /** 1492 * Gene is from an automatic process (ENSEMBL source). Taken from the target annotation. 1493 */ 1494 AUTOMATIC_GENE("automatic_gene"), 1495 1496 /** 1497 * Pseudogene annotations (excluding polymorphic). 
1498 */ 1499 PSEUDOGENE("pseudogene"); 1500 1501 @SuppressWarnings("unchecked") 1502 private static final Map<String, RemapStatus> VALUE_MAP = 1503 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v)); 1504 1505 private final String serialized; 1506 RemapStatus(final String serializedValue)1507 RemapStatus(final String serializedValue) { serialized = serializedValue; } 1508 1509 @Override toString()1510 public String toString() { 1511 return serialized; 1512 } 1513 getEnum(final String s)1514 public static RemapStatus getEnum(final String s) { 1515 final String lowerS = s.toLowerCase(); 1516 if ( VALUE_MAP.containsKey(lowerS) ){ 1517 return VALUE_MAP.get(lowerS); 1518 } 1519 throw new IllegalArgumentException("Unexpected value: " + s); 1520 } 1521 } 1522 1523 /** 1524 * Attribute that compares the mapping to the existing target annotations. 1525 * 1526 * For more information, see: 1527 * https://www.gencodegenes.org/data_format.html 1528 * https://en.wikipedia.org/wiki/General_feature_format 1529 * http://www.gencodegenes.org/releases/grch37_mapped_releases.html#attrib 1530 */ 1531 public enum RemapTargetStatus { 1532 1533 /** 1534 * Gene or transcript was not in target annotations. 1535 */ 1536 NEW("new"), 1537 1538 /** 1539 * Gene or transcript exists in source and target genome, however source was not mapped. 1540 */ 1541 LOST("lost"), 1542 1543 /** 1544 * Gene or transcript overlaps previous version of annotation on target genome. 1545 */ 1546 OVERLAP("overlap"), 1547 1548 /** 1549 * Gene or transcript exists in target, however source mapping is to a different location. This is often mappings to a gene family members or pseudogenes. 
1550 */ 1551 NONOVERLAP("nonOverlap"); 1552 1553 @SuppressWarnings("unchecked") 1554 private static final Map<String, RemapTargetStatus> VALUE_MAP = 1555 Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v)); 1556 1557 private final String serialized; 1558 RemapTargetStatus(final String serializedValue)1559 RemapTargetStatus(final String serializedValue) { 1560 serialized = serializedValue; 1561 } 1562 1563 @Override toString()1564 public String toString() { 1565 return serialized; 1566 } 1567 getEnum(final String s)1568 public static RemapTargetStatus getEnum(final String s) { 1569 final String lowerS = s.toLowerCase(); 1570 if ( VALUE_MAP.containsKey(lowerS) ){ 1571 return VALUE_MAP.get(lowerS); 1572 } 1573 throw new IllegalArgumentException("Unexpected value: " + s); 1574 } 1575 } 1576 } 1577