1 package org.broadinstitute.hellbender.utils.codecs.gtf;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import htsjdk.samtools.util.IOUtil;
5 import htsjdk.tribble.readers.LineIterator;
6 import org.apache.logging.log4j.LogManager;
7 import org.apache.logging.log4j.Logger;
8 import org.broadinstitute.hellbender.exceptions.UserException;
9 
10 import java.io.FileNotFoundException;
11 import java.io.IOException;
12 import java.nio.file.Path;
13 import java.util.*;
14 
15 /**
16  * Codec to decode data in GTF format from ENSEMBL.
17  * According to ENSEMBL, GTF files downloaded from them conform to GFF version 2 (http://gmod.org/wiki/GFF2).
18  */
19 final public class EnsemblGtfCodec extends AbstractGtfCodec {
20 
21     private static final Logger logger = LogManager.getLogger(EnsemblGtfCodec.class);
22 
23     //==================================================================================================================
24     // Public Static Members:
25 
26     public static String GTF_FILE_TYPE_STRING = "ENSEMBL";
27 
28     //==================================================================================================================
29     // Private Static Members:
30 
31     private static String            VERSION_FIELD    = "genome-version";
32     private static String            DEFAULT_VERSION  = "ENSEMBL_DEFAULT_VERSION";
33     private static final Set<String> COMMENT_PREFIXES = Collections.unmodifiableSet(new LinkedHashSet<>(Arrays.asList("#!", "##")));
34 
35     //==================================================================================================================
36     // Private Members:
37 
38     private final        List<String> header          = new ArrayList<>();
39     private              int          currentLineNum  = 1;
40     private              String       version         = null;
41 
42     //==================================================================================================================
43     // Constructors:
44 
EnsemblGtfCodec()45     public EnsemblGtfCodec() {
46         super();
47     }
48 
49     //==================================================================================================================
50     // Override Methods:
51 
52     @Override
getGtfFileType()53     String getGtfFileType() {
54         return GTF_FILE_TYPE_STRING;
55     }
56 
57     @Override
getDefaultLineComment()58     String getDefaultLineComment() {
59         return COMMENT_PREFIXES.iterator().next();
60     }
61 
62     @Override
getAllLineComments()63     Set<String> getAllLineComments() {
64         return COMMENT_PREFIXES;
65     }
66 
67     @Override
getCurrentLineNumber()68     int getCurrentLineNumber() {
69         return currentLineNum;
70     }
71 
72     @Override
getHeader()73     List<String> getHeader() {
74         return header;
75     }
76 
77     @Override
passesFileNameCheck(final String inputFilePath)78     boolean passesFileNameCheck(final String inputFilePath) {
79         try {
80             final Path p = IOUtil.getPath(inputFilePath);
81 
82             return p.getFileName().toString().toLowerCase().endsWith("." + GTF_FILE_EXTENSION);
83         }
84         catch (final FileNotFoundException ex) {
85             logger.warn("File does not exist! - " + inputFilePath + " - returning name check as failure.");
86         }
87         catch (final IOException ex) {
88             logger.warn("Caught IOException on file: " + inputFilePath + " - returning name check as failure.");
89         }
90 
91         return false;
92     }
93 
94     @Override
readActualHeader(final LineIterator reader)95     List<String> readActualHeader(final LineIterator reader) {
96 
97         // Read in the header lines:
98         ingestHeaderLines(reader);
99 
100         // Validate our header:
101         validateHeader(header, true);
102 
103         // Set our line number to be the line of the first actual Feature:
104         currentLineNum = header.size() + 1;
105 
106         // Set up our version number:
107         populateVersionNumber();
108 
109         return header;
110     }
111 
112     @Override
validateFeatureSubtype(final GencodeGtfFeature feature)113     boolean validateFeatureSubtype(final GencodeGtfFeature feature) {
114         return validateEnsemblGtfFeature( feature );
115     }
116 
117     @Override
incrementLineNumber()118     void incrementLineNumber() {
119         ++currentLineNum;
120     }
121 
122     @Override
getUcscVersionNumber()123     String getUcscVersionNumber() {
124         return version;
125     }
126 
127     @Override
128     /**
129      * {@inheritDoc}
130      *
131      * Because ENSEMBL GTF files are strictly a superset of GENCODE GTF files, we need to do some extra checks here to
132      * make sure that this file can NOT be decoded by {@link GencodeGtfCodec} but can still be decoded by this
133      * {@link EnsemblGtfCodec}.
134      */
canDecode(final String inputFilePath)135     public boolean canDecode(final String inputFilePath) {
136 
137         // Create a GencodeGtfCodec so we can see if it will decode the input file.
138         final GencodeGtfCodec gencodeGtfCodec = new GencodeGtfCodec();
139         if ( gencodeGtfCodec.canDecode(inputFilePath) ) {
140             // Uh oh!  We can decode this as GENCODE.
141             // So we should NOT decode this as ENSEMBL.
142             return false;
143         }
144 
145         return super.canDecode(inputFilePath);
146     }
147 
148     //==================================================================================================================
149     // Static Methods:
150 
151     /**
152      * Validates a given {@link GencodeGtfFeature} against a given version of the ENSEMBL GTF file spec.
153      * This method ensures that all required fields are defined, but does not interrogate their values.
154      * @param feature A {@link GencodeGtfFeature} to validate.  MUST NOT BE {@code null}.
155      * @return True if {@code feature} contains all required fields for the given GENCODE GTF version, {@code gtfVersion}
156      */
validateEnsemblGtfFeature(final GencodeGtfFeature feature)157     private static boolean validateEnsemblGtfFeature(final GencodeGtfFeature feature) {
158 
159         final GencodeGtfFeature.FeatureType featureType = feature.getFeatureType();
160 
161         if ( featureType != GencodeGtfFeature.FeatureType.GENE) {
162             if (feature.getTranscriptId() == null) {
163                 return false;
164             }
165             if (feature.getTranscriptType() == null) {
166                 return false;
167             }
168             if (feature.getTranscriptName() == null) {
169                 return false;
170             }
171         }
172 
173         return true;
174     }
175 
176     //==================================================================================================================
177     // Instance Methods:
178 
populateVersionNumber()179     private void populateVersionNumber() {
180 
181         // If `genome-version` was specified in the header, we should use that.
182         // Otherwise we can return a placeholder.
183 
184         String ver = DEFAULT_VERSION;
185 
186         // Attempt to get the version from the header:
187         for ( final String line : header ) {
188             for ( final String comment : getAllLineComments() ) {
189                 if ( line.startsWith(comment + VERSION_FIELD) ) {
190                     ver = line.replaceFirst(comment + VERSION_FIELD + "\\s*", "").trim();
191                 }
192             }
193         }
194 
195         version = ver;
196     }
197 
198     /**
199      * Check if the given header of a tentative ENSEMBL GTF file is, in fact, the header to such a file.
200      * @param header Header lines to check for conformity to ENSEMBL GTF specifications.
201      * @param throwIfInvalid If true, will throw a {@link UserException.MalformedFile} if the header is invalid.
202      * @return true if the given {@code header} is that of a ENSEMBL GTF file; false otherwise.
203      */
204     @VisibleForTesting
validateHeader(final List<String> header, final boolean throwIfInvalid)205     boolean validateHeader(final List<String> header, final boolean throwIfInvalid) {
206         // As it turns out, the ENSEMBL GTF header is pretty loosy-goosy.
207         // No fields are required, and therefore it could actually be empty.
208 
209         // Rather than attempting to validate the file, here we just
210         // assert that all header lines begin with a comment (they should already).
211         int lineNum = 1;
212         for (final String line : header) {
213 
214             if ( !isLineCommented(line) ) {
215                 if ( throwIfInvalid ) {
216                     throw new UserException.MalformedFile("ENSEMBL GTF Header line " + lineNum + " is not commented: " + line);
217                 }
218                 else {
219                     return false;
220                 }
221             }
222 
223             ++lineNum;
224         }
225 
226         return true;
227     }
228 
229     //==================================================================================================================
230     // Helper Data Types:
231 
232 }
233