1 package org.broadinstitute.hellbender.utils.codecs.gtf; 2 3 import com.google.common.annotations.VisibleForTesting; 4 import htsjdk.samtools.util.IOUtil; 5 import htsjdk.tribble.AbstractFeatureCodec; 6 import htsjdk.tribble.readers.LineIterator; 7 import org.apache.logging.log4j.LogManager; 8 import org.apache.logging.log4j.Logger; 9 import org.broadinstitute.hellbender.exceptions.GATKException; 10 import org.broadinstitute.hellbender.exceptions.UserException; 11 12 import java.io.FileNotFoundException; 13 import java.io.IOException; 14 import java.nio.file.Path; 15 import java.util.*; 16 import java.util.regex.Matcher; 17 import java.util.regex.Pattern; 18 19 /** 20 * {@link htsjdk.tribble.Tribble} Codec to read data from a GENCODE GTF file. 21 * 22 * GENCODE GTF Files are defined here: https://www.gencodegenes.org/pages/data_format.html 23 * 24 * This codec will scan through a GENCODE GTF file and return {@link GencodeGtfFeature} objects. 25 * {@link GencodeGtfFeature} objects contain fields that have sub-features. All features are 26 * grouped by gene (this is the natural formatting of a GENCODE GTF file). 27 * 28 * All fields exist in the Abstract {@link GencodeGtfFeature}. The subclasses contain representations of the logical 29 * data hierarchy that reflect how the data were presented in the feature file itself (to preserve the natural 30 * grouping by gene). 31 * The {@link GencodeGtfFeature} logical data hierarchy (NOT the class hierarchy) is as follows 32 * (with | representing a "has a" relationship) 33 * 34 * +--> {@link GencodeGtfGeneFeature} 35 * | 36 * +--> {@link GencodeGtfTranscriptFeature} 37 * | 38 * +--> {@link GencodeGtfSelenocysteineFeature} 39 * +--> {@link GencodeGtfUTRFeature} 40 * +--> {@link GencodeGtfExonFeature} 41 * | 42 * +--> {@link GencodeGtfCDSFeature} 43 * +--> {@link GencodeGtfStartCodonFeature} 44 * +--> {@link GencodeGtfStopCodonFeature} 45 * 46 * {@link htsjdk.tribble.Tribble} indexing has been tested and works as expected. 47 * Does not support {@link htsjdk.tribble.index.tabix.TabixIndex} indexing. 48 * 49 * Unlike many other {@link htsjdk.tribble.Tribble} codecs, this one scans multiple input file lines to produce 50 * a single feature. This is due to how GENCODE GTF files are structured (essentially grouped by contig and gene). 51 * For this reason, {@link GencodeGtfCodec} inherits from {@link AbstractFeatureCodec}, as opposed to {@link htsjdk.tribble.AsciiFeatureCodec} 52 * (i.e. {@link htsjdk.tribble.AsciiFeatureCodec}s read a single line at a time, and {@link AbstractFeatureCodec} do not have that explicit purpose). 53 * 54 * Created by jonn on 7/21/17. 55 */ 56 final public class GencodeGtfCodec extends AbstractGtfCodec { 57 58 private static final Logger logger = LogManager.getLogger(GencodeGtfCodec.class); 59 60 // ============================================================================================================ 61 62 public static final String GENCODE_GTF_FILE_PREFIX = "gencode"; 63 public static final String GTF_FILE_TYPE_STRING = "GENCODE"; 64 65 // ============================================================================================================ 66 67 private static final int GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE = 19; 68 69 /** 70 * Maximum version of gencode that will not generate a warning. This parser will still attempt to parse versions above this number, but a warning about potential errors will appear. 71 */ 72 private static final int GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE = 34; 73 74 private int currentLineNum = 1; 75 private final List<String> header = new ArrayList<>(); 76 private static final int HEADER_NUM_LINES = 5; 77 78 private static final Pattern VERSION_PATTERN = Pattern.compile("version (\\d+)"); 79 private int versionNumber; 80 81 private static final String commentPrefix = "##"; 82 83 // ============================================================================================================ 84 85 /** 86 * Gets the UCSC version corresponding to the given gencode version. 87 * Version equivalences obtained here: 88 * 89 * https://genome.ucsc.edu/FAQ/FAQreleases.html 90 * https://www.gencodegenes.org/human/releases.html 91 * 92 * @param gencodeVersion The gencode version to convert to UCSC version. 93 * @return The UCSC version in a {@link String} corresponding to the given gencode version. 94 */ getUcscVersionFromGencodeVersion(final int gencodeVersion)95 private static String getUcscVersionFromGencodeVersion(final int gencodeVersion) { 96 if (gencodeVersion < GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE) { 97 throw new GATKException("Gencode version is too far out of date. Cannot decode: " + gencodeVersion); 98 } 99 100 if ( gencodeVersion < 25 ) { 101 return "hg19"; 102 } 103 else { 104 return "hg38"; 105 } 106 } 107 108 // ============================================================================================================ 109 GencodeGtfCodec()110 public GencodeGtfCodec() { 111 super(); 112 } 113 114 // ============================================================================================================ 115 116 @Override getCurrentLineNumber()117 int getCurrentLineNumber() { 118 return currentLineNum; 119 } 120 121 @Override getHeader()122 List<String> getHeader() { 123 return header; 124 } 125 126 @Override readActualHeader(final LineIterator reader)127 List<String> readActualHeader(final LineIterator reader) { 128 129 // Clear our version number too: 130 versionNumber = -1; 131 132 // Read in the header lines: 133 ingestHeaderLines(reader); 134 135 // Validate our header: 136 validateHeader(header, true); 137 138 // Set our version number: 139 setVersionNumber(); 140 141 // Set our line number to be the line of the first actual Feature: 142 currentLineNum = header.size() + 1; 143 144 return header; 145 } 146 147 /** 148 * Sets {@link #versionNumber} to the number corresponding to the value in the header. 149 */ setVersionNumber()150 private void setVersionNumber() { 151 try { 152 final Matcher versionMatcher = VERSION_PATTERN.matcher(header.get(0)); 153 if (!versionMatcher.find() ) { 154 throw new UserException.MalformedFile("Cannot find version number from Gencode GTF header."); 155 } 156 versionNumber = Integer.valueOf(versionMatcher.group(1)); 157 } 158 catch (final NumberFormatException ex) { 159 throw new UserException("Could not read version number from header", ex); 160 } 161 } 162 163 /** 164 * Validates a given {@link GencodeGtfFeature} against a given version of the GENCODE GTF file spec. 165 * This method ensures that all required fields are defined, but does not interrogate their values. 166 * @param feature A {@link GencodeGtfFeature} to validate. MUST NOT BE {@code null}. 167 * @param gtfVersion The GENCODE GTF version against which to validate {@code feature} 168 * @return True if {@code feature} contains all required fields for the given GENCODE GTF version, {@code gtfVersion} 169 */ validateGencodeGtfFeature(final GencodeGtfFeature feature, final int gtfVersion)170 private static boolean validateGencodeGtfFeature(final GencodeGtfFeature feature, final int gtfVersion) { 171 172 if (gtfVersion < GencodeGtfCodec.GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE) { 173 throw new GATKException("Invalid version number for validation: " + gtfVersion + 174 " must be above: " + GencodeGtfCodec.GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE); 175 } 176 177 final GencodeGtfFeature.FeatureType featureType = feature.getFeatureType(); 178 179 if ( gtfVersion < 26 ) { 180 if (feature.getGeneStatus() == null) { 181 return false; 182 } 183 if (feature.getTranscriptStatus() == null) { 184 return false; 185 } 186 } 187 188 if ( (featureType != GencodeGtfFeature.FeatureType.GENE) || 189 (gtfVersion < 21) ) { 190 if (feature.getTranscriptId() == null) { 191 return false; 192 } 193 if (feature.getTranscriptType() == null) { 194 return false; 195 } 196 if (feature.getTranscriptName() == null) { 197 return false; 198 } 199 } 200 201 // Gencode can only have 2 feature types: 202 return feature.getAnnotationSource().equals(GencodeGtfFeature.ANNOTATION_SOURCE_ENSEMBL) || 203 feature.getAnnotationSource().equals(GencodeGtfFeature.ANNOTATION_SOURCE_HAVANA); 204 } 205 206 @Override passesFileNameCheck(final String inputFilePath)207 boolean passesFileNameCheck(final String inputFilePath) { 208 try { 209 final Path p = IOUtil.getPath(inputFilePath); 210 211 return p.getFileName().toString().toLowerCase().startsWith(GENCODE_GTF_FILE_PREFIX) && 212 p.getFileName().toString().toLowerCase().endsWith("." + GTF_FILE_EXTENSION); 213 } 214 catch (final FileNotFoundException ex) { 215 logger.warn("File does not exist! - " + inputFilePath + " - returning name check as failure."); 216 } 217 catch (final IOException ex) { 218 logger.warn("Caught IOException on file: " + inputFilePath + " - returning name check as failure."); 219 } 220 221 return false; 222 } 223 224 @Override getDefaultLineComment()225 String getDefaultLineComment() { 226 return commentPrefix; 227 } 228 229 @Override getAllLineComments()230 Set<String> getAllLineComments() { 231 return Collections.unmodifiableSet(new HashSet<>(Collections.singletonList(commentPrefix))); 232 } 233 234 @Override getGtfFileType()235 String getGtfFileType() { 236 return GTF_FILE_TYPE_STRING; 237 } 238 239 @Override validateFeatureSubtype(final GencodeGtfFeature feature )240 boolean validateFeatureSubtype(final GencodeGtfFeature feature ) { 241 return validateGencodeGtfFeature( feature, versionNumber ); 242 } 243 244 @Override incrementLineNumber()245 void incrementLineNumber() { 246 ++currentLineNum; 247 } 248 249 @Override getUcscVersionNumber()250 String getUcscVersionNumber() { 251 return getUcscVersionFromGencodeVersion(versionNumber); 252 } 253 254 // ============================================================================================================ 255 256 /** 257 * Check if the given header of a tentative GENCODE GTF file is, in fact, the header to such a file. 258 * Will also return true if the file is a general GTF file (i.e. a GTF file that was not created and 259 * maintained by GENCODE). 260 * @param header Header lines to check for conformity to GENCODE GTF specifications. 261 * @param throwIfInvalid If true, will throw a {@link UserException.MalformedFile} if the header is invalid. 262 * @return true if the given {@code header} is that of a GENCODE GTF file; false otherwise. 263 */ 264 @VisibleForTesting validateHeader(final List<String> header, final boolean throwIfInvalid)265 boolean validateHeader(final List<String> header, final boolean throwIfInvalid) { 266 if ( header.size() != HEADER_NUM_LINES) { 267 if ( throwIfInvalid ) { 268 throw new UserException.MalformedFile( 269 "GENCODE GTF Header is of unexpected length: " + 270 header.size() + " != " + HEADER_NUM_LINES); 271 } 272 else { 273 return false; 274 } 275 } 276 277 // Check the normal commented fields: 278 if ( !checkHeaderLineStartsWith(header,0, "description:") ) { 279 return false; 280 } 281 282 if ( !header.get(0).contains("version") ) { 283 if ( throwIfInvalid ) { 284 throw new UserException.MalformedFile( 285 "GENCODE GTF Header line 1 does not contain version specification: " + 286 header.get(0)); 287 } 288 else { 289 return false; 290 } 291 } 292 293 // Grab the version from the file and make sure it's within the acceptable range: 294 final Matcher versionMatcher = VERSION_PATTERN.matcher(header.get(0)); 295 if ( !versionMatcher.find() ) { 296 if ( throwIfInvalid ) { 297 throw new UserException.MalformedFile( 298 "GENCODE GTF Header line 1 does not contain a recognizable version number: " + 299 header.get(0)); 300 } 301 else { 302 return false; 303 } 304 } 305 306 try { 307 final int versionNumber = Integer.valueOf(versionMatcher.group(1)); 308 if (versionNumber < GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE) { 309 final String message = "GENCODE GTF Header line 1 has an out-of-date (< v" + GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE + " version number (" + 310 versionNumber + "): " + header.get(0); 311 if (throwIfInvalid) { 312 throw new UserException.MalformedFile(message); 313 } else { 314 logger.warn(message + " Continuing, but errors may occur."); 315 } 316 } 317 318 if (versionNumber > GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE) { 319 logger.warn("GENCODE GTF Header line 1 has a version number that is above maximum tested version (v " + GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE + ") (given: " + 320 versionNumber + "): " + header.get(0) + " Continuing, but errors may occur."); 321 } 322 } 323 catch (final NumberFormatException ex) { 324 if ( throwIfInvalid ) { 325 throw new UserException("Could not create number value for version: " + versionMatcher.group(1), ex); 326 } 327 else { 328 return false; 329 } 330 } 331 332 return checkHeaderLineStartsWith(header, 1, "provider: GENCODE") && 333 checkHeaderLineStartsWith(header, 2, "contact: gencode") && 334 checkHeaderLineStartsWith(header, 3, "format: gtf") && 335 checkHeaderLineStartsWith(header, 4, "date:"); 336 } 337 } 338