package org.broadinstitute.hellbender.utils.codecs.gtf;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.readers.LineIterator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Path;
import java.util.*;

/**
 * Codec to decode data in GTF format from ENSEMBL.
 * According to ENSEMBL, GTF files downloaded from them conform to GFF version 2 (http://gmod.org/wiki/GFF2).
 */
public final class EnsemblGtfCodec extends AbstractGtfCodec {

    private static final Logger logger = LogManager.getLogger(EnsemblGtfCodec.class);

    //==================================================================================================================
    // Public Static Members:

    /** Identifier reported by {@link #getGtfFileType()} for files handled by this codec. */
    public static final String GTF_FILE_TYPE_STRING = "ENSEMBL";

    //==================================================================================================================
    // Private Static Members:

    /** Name of the header field (following a comment prefix) that carries the version string. */
    private static final String VERSION_FIELD = "genome-version";

    /** Version string used when no header line carries {@link #VERSION_FIELD}. */
    private static final String DEFAULT_VERSION = "ENSEMBL_DEFAULT_VERSION";

    /**
     * Prefixes that mark a line as a comment.
     * Iteration order is insertion order, so the first element ({@code "#!"}) is the default prefix.
     */
    private static final Set<String> COMMENT_PREFIXES = Collections.unmodifiableSet(new LinkedHashSet<>(Arrays.asList("#!", "##")));

    //==================================================================================================================
    // Private Members:

    /** Header lines of the file being decoded. */
    private final List<String> header = new ArrayList<>();

    /** Current line number; starts at 1 and is moved past the header by {@link #readActualHeader(LineIterator)}. */
    private int currentLineNum = 1;

    /** Version string; {@code null} until {@link #readActualHeader(LineIterator)} has run. */
    private String version = null;

    //==================================================================================================================
    // Constructors:

    /** Creates a new codec with an empty header and the line counter at 1. */
    public EnsemblGtfCodec() {
        super();
    }

    //==================================================================================================================
    // Override Methods:

    /**
     * @return {@link #GTF_FILE_TYPE_STRING}.
     */
    @Override
    String getGtfFileType() {
        return GTF_FILE_TYPE_STRING;
    }

    /**
     * @return The first comment prefix in {@link #COMMENT_PREFIXES} (i.e. {@code "#!"}).
     */
    @Override
    String getDefaultLineComment() {
        return COMMENT_PREFIXES.iterator().next();
    }

    /**
     * @return The unmodifiable set of all comment prefixes recognized by this codec.
     */
    @Override
    Set<String> getAllLineComments() {
        return COMMENT_PREFIXES;
    }

    /**
     * @return The current value of this codec's line counter.
     */
    @Override
    int getCurrentLineNumber() {
        return currentLineNum;
    }

    /**
     * @return The live (mutable) list of header lines held by this codec; it is not a copy.
     */
    @Override
    List<String> getHeader() {
        return header;
    }

    /**
     * Checks whether the file name of the given path ends with the GTF file extension (case-insensitively).
     * Any failure to resolve the path is logged as a warning and treated as a failed check.
     *
     * @param inputFilePath Path (as a {@link String}) of the file to check.
     * @return {@code true} if the file name ends with {@code "." + GTF_FILE_EXTENSION}; {@code false} otherwise,
     *         including when the path cannot be resolved or has no file-name component.
     */
    @Override
    boolean passesFileNameCheck(final String inputFilePath) {
        try {
            final Path p = IOUtil.getPath(inputFilePath);

            // Path#getFileName returns null for paths with no name elements (e.g. a filesystem root).
            final Path fileName = p.getFileName();
            if ( fileName == null ) {
                return false;
            }

            // Lower-case with a fixed locale so the comparison does not depend on the JVM's default locale.
            return fileName.toString().toLowerCase(Locale.ROOT).endsWith("." + GTF_FILE_EXTENSION);
        }
        catch (final FileNotFoundException ex) {
            logger.warn("File does not exist! - " + inputFilePath + " - returning name check as failure.");
        }
        catch (final IOException ex) {
            logger.warn("Caught IOException on file: " + inputFilePath + " - returning name check as failure.");
        }

        return false;
    }

    /**
     * Reads and validates the header, then positions the line counter on the first line after the header
     * and determines the version string.
     *
     * @param reader The {@link LineIterator} from which header lines are ingested.
     * @return The header lines held by this codec.
     * @throws UserException.MalformedFile if a header line is not commented.
     */
    @Override
    List<String> readActualHeader(final LineIterator reader) {

        // Read in the header lines:
        ingestHeaderLines(reader);

        // Validate our header:
        validateHeader(header, true);

        // Set our line number to be the line of the first actual Feature:
        currentLineNum = header.size() + 1;

        // Set up our version number:
        populateVersionNumber();

        return header;
    }

    /**
     * @param feature A {@link GencodeGtfFeature} to validate.  MUST NOT BE {@code null}.
     * @return The result of {@link #validateEnsemblGtfFeature(GencodeGtfFeature)} for {@code feature}.
     */
    @Override
    boolean validateFeatureSubtype(final GencodeGtfFeature feature) {
        return validateEnsemblGtfFeature( feature );
    }

    /** Advances the line counter by one. */
    @Override
    void incrementLineNumber() {
        ++currentLineNum;
    }

    /**
     * @return The version string determined by {@link #readActualHeader(LineIterator)};
     *         {@code null} if the header has not been read yet.
     */
    @Override
    String getUcscVersionNumber() {
        return version;
    }

    /**
     * {@inheritDoc}
     *
     * Because ENSEMBL GTF files are strictly a superset of GENCODE GTF files, we need to do some extra checks here to
     * make sure that this file can NOT be decoded by {@link GencodeGtfCodec} but can still be decoded by this
     * {@link EnsemblGtfCodec}.
     */
    @Override
    public boolean canDecode(final String inputFilePath) {

        // Create a GencodeGtfCodec so we can see if it will decode the input file.
        final GencodeGtfCodec gencodeGtfCodec = new GencodeGtfCodec();
        if ( gencodeGtfCodec.canDecode(inputFilePath) ) {
            // Uh oh!  We can decode this as GENCODE.
            // So we should NOT decode this as ENSEMBL.
            return false;
        }

        return super.canDecode(inputFilePath);
    }

    //==================================================================================================================
    // Static Methods:

    /**
     * Validates a given {@link GencodeGtfFeature} as an ENSEMBL GTF feature.
     * This method ensures that all required fields are defined, but does not interrogate their values.
     * Features of type {@link GencodeGtfFeature.FeatureType#GENE} have no required fields here; every other
     * feature type must have a non-null transcript ID, transcript type and transcript name.
     * @param feature A {@link GencodeGtfFeature} to validate.  MUST NOT BE {@code null}.
     * @return True if {@code feature} contains all required fields; false otherwise.
     */
    private static boolean validateEnsemblGtfFeature(final GencodeGtfFeature feature) {

        final GencodeGtfFeature.FeatureType featureType = feature.getFeatureType();

        if ( featureType != GencodeGtfFeature.FeatureType.GENE) {
            if (feature.getTranscriptId() == null) {
                return false;
            }
            if (feature.getTranscriptType() == null) {
                return false;
            }
            if (feature.getTranscriptName() == null) {
                return false;
            }
        }

        return true;
    }

    //==================================================================================================================
    // Instance Methods:

    /**
     * Sets {@link #version} from the header.
     * If one or more header lines start with a comment prefix immediately followed by {@code genome-version},
     * the trimmed remainder of the last such line is used; otherwise {@link #DEFAULT_VERSION} is used.
     */
    private void populateVersionNumber() {

        // If `genome-version` was specified in the header, we should use that.
        // Otherwise we can return a placeholder.

        String ver = DEFAULT_VERSION;

        // Attempt to get the version from the header:
        for ( final String line : header ) {
            for ( final String comment : getAllLineComments() ) {
                final String versionPrefix = comment + VERSION_FIELD;
                if ( line.startsWith(versionPrefix) ) {
                    // Strip the literal prefix (no regex interpretation) and surrounding whitespace.
                    ver = line.substring(versionPrefix.length()).trim();
                }
            }
        }

        version = ver;
    }

    /**
     * Check if the given header of a tentative ENSEMBL GTF file is, in fact, the header to such a file.
     * @param header Header lines to check for conformity to ENSEMBL GTF specifications.
     * @param throwIfInvalid If true, will throw a {@link UserException.MalformedFile} if the header is invalid.
     * @return true if the given {@code header} is that of a ENSEMBL GTF file; false otherwise.
     */
    @VisibleForTesting
    boolean validateHeader(final List<String> header, final boolean throwIfInvalid) {
        // As it turns out, the ENSEMBL GTF header is pretty loosy-goosy.
        // No fields are required, and therefore it could actually be empty.

        // Rather than attempting to validate the file, here we just
        // assert that all header lines begin with a comment (they should already).
        int lineNum = 1;
        for (final String line : header) {

            if ( !isLineCommented(line) ) {
                if ( throwIfInvalid ) {
                    throw new UserException.MalformedFile("ENSEMBL GTF Header line " + lineNum + " is not commented: " + line);
                }
                else {
                    return false;
                }
            }

            ++lineNum;
        }

        return true;
    }

    //==================================================================================================================
    // Helper Data Types:

}