1 package org.broadinstitute.hellbender.utils.codecs.gtf;
2 
3 import com.google.common.annotations.VisibleForTesting;
4 import htsjdk.samtools.util.IOUtil;
5 import htsjdk.tribble.AbstractFeatureCodec;
6 import htsjdk.tribble.readers.LineIterator;
7 import org.apache.logging.log4j.LogManager;
8 import org.apache.logging.log4j.Logger;
9 import org.broadinstitute.hellbender.exceptions.GATKException;
10 import org.broadinstitute.hellbender.exceptions.UserException;
11 
12 import java.io.FileNotFoundException;
13 import java.io.IOException;
14 import java.nio.file.Path;
15 import java.util.*;
16 import java.util.regex.Matcher;
17 import java.util.regex.Pattern;
18 
19 /**
20  * {@link htsjdk.tribble.Tribble} Codec to read data from a GENCODE GTF file.
21  *
22  * GENCODE GTF Files are defined here: https://www.gencodegenes.org/pages/data_format.html
23  *
24  * This codec will scan through a GENCODE GTF file and return {@link GencodeGtfFeature} objects.
25  * {@link GencodeGtfFeature} objects contain fields that have sub-features.  All features are
26  * grouped by gene (this is the natural formatting of a GENCODE GTF file).
27  *
28  * All fields exist in the Abstract {@link GencodeGtfFeature}.  The subclasses contain representations of the logical
29  * data hierarchy that reflect how the data were presented in the feature file itself (to preserve the natural
30  * grouping by gene).
31  * The {@link GencodeGtfFeature} logical data hierarchy (NOT the class hierarchy) is as follows
32  * (with | representing a "has a" relationship)
33  *
34  * +--> {@link GencodeGtfGeneFeature}
35  *    |
36  *    +--> {@link GencodeGtfTranscriptFeature}
37  *       |
38  *       +--> {@link GencodeGtfSelenocysteineFeature}
39  *       +--> {@link GencodeGtfUTRFeature}
40  *       +--> {@link GencodeGtfExonFeature}
41  *          |
42  *          +--> {@link GencodeGtfCDSFeature}
43  *          +--> {@link GencodeGtfStartCodonFeature}
44  *          +--> {@link GencodeGtfStopCodonFeature}
45  *
46  * {@link htsjdk.tribble.Tribble} indexing has been tested and works as expected.
47  * Does not support {@link htsjdk.tribble.index.tabix.TabixIndex} indexing.
48  *
49  * Unlike many other {@link htsjdk.tribble.Tribble} codecs, this one scans multiple input file lines to produce
50  * a single feature.  This is due to how GENCODE GTF files are structured (essentially grouped by contig and gene).
51  * For this reason, {@link GencodeGtfCodec} inherits from {@link AbstractFeatureCodec}, as opposed to {@link htsjdk.tribble.AsciiFeatureCodec}
52  * (i.e. {@link htsjdk.tribble.AsciiFeatureCodec}s read a single line at a time, and {@link AbstractFeatureCodec} do not have that explicit purpose).
53  *
54  * Created by jonn on 7/21/17.
55  */
56 final public class GencodeGtfCodec extends AbstractGtfCodec {
57 
58     private static final Logger logger = LogManager.getLogger(GencodeGtfCodec.class);
59 
60     // ============================================================================================================
61 
62     public static final String GENCODE_GTF_FILE_PREFIX = "gencode";
63     public static final String GTF_FILE_TYPE_STRING = "GENCODE";
64 
65     // ============================================================================================================
66 
67     private static final int GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE = 19;
68 
69     /**
70      * Maximum version of gencode that will not generate a warning.  This parser will still attempt to parse versions above this number, but a warning about potential errors will appear.
71      */
72     private static final int GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE = 34;
73 
74     private int currentLineNum = 1;
75     private final List<String> header = new ArrayList<>();
76     private static final int HEADER_NUM_LINES = 5;
77 
78     private static final Pattern VERSION_PATTERN = Pattern.compile("version (\\d+)");
79     private int versionNumber;
80 
81     private static final String commentPrefix = "##";
82 
83     // ============================================================================================================
84 
85     /**
86      * Gets the UCSC version corresponding to the given gencode version.
87      * Version equivalences obtained here:
88      *
89      *  https://genome.ucsc.edu/FAQ/FAQreleases.html
90      *  https://www.gencodegenes.org/human/releases.html
91      *
92      * @param gencodeVersion The gencode version to convert to UCSC version.
93      * @return The UCSC version in a {@link String} corresponding to the given gencode version.
94      */
getUcscVersionFromGencodeVersion(final int gencodeVersion)95     private static String getUcscVersionFromGencodeVersion(final int gencodeVersion) {
96         if (gencodeVersion < GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE) {
97             throw new GATKException("Gencode version is too far out of date.  Cannot decode: " + gencodeVersion);
98         }
99 
100         if ( gencodeVersion < 25 ) {
101             return "hg19";
102         }
103         else {
104             return "hg38";
105         }
106     }
107 
108     // ============================================================================================================
109 
GencodeGtfCodec()110     public GencodeGtfCodec() {
111         super();
112     }
113 
114     // ============================================================================================================
115 
116     @Override
getCurrentLineNumber()117     int getCurrentLineNumber() {
118         return currentLineNum;
119     }
120 
121     @Override
getHeader()122     List<String> getHeader() {
123         return header;
124     }
125 
126     @Override
readActualHeader(final LineIterator reader)127     List<String> readActualHeader(final LineIterator reader) {
128 
129         // Clear our version number too:
130         versionNumber = -1;
131 
132         // Read in the header lines:
133         ingestHeaderLines(reader);
134 
135         // Validate our header:
136         validateHeader(header, true);
137 
138         // Set our version number:
139         setVersionNumber();
140 
141         // Set our line number to be the line of the first actual Feature:
142         currentLineNum = header.size() + 1;
143 
144         return header;
145     }
146 
147     /**
148      * Sets {@link #versionNumber} to the number corresponding to the value in the header.
149      */
setVersionNumber()150     private void setVersionNumber() {
151         try {
152             final Matcher versionMatcher = VERSION_PATTERN.matcher(header.get(0));
153             if (!versionMatcher.find() ) {
154                 throw new UserException.MalformedFile("Cannot find version number from Gencode GTF header.");
155             }
156             versionNumber = Integer.valueOf(versionMatcher.group(1));
157         }
158         catch (final NumberFormatException ex) {
159             throw new UserException("Could not read version number from header", ex);
160         }
161     }
162 
163     /**
164      * Validates a given {@link GencodeGtfFeature} against a given version of the GENCODE GTF file spec.
165      * This method ensures that all required fields are defined, but does not interrogate their values.
166      * @param feature A {@link GencodeGtfFeature} to validate.  MUST NOT BE {@code null}.
167      * @param gtfVersion The GENCODE GTF version against which to validate {@code feature}
168      * @return True if {@code feature} contains all required fields for the given GENCODE GTF version, {@code gtfVersion}
169      */
validateGencodeGtfFeature(final GencodeGtfFeature feature, final int gtfVersion)170     private static boolean validateGencodeGtfFeature(final GencodeGtfFeature feature, final int gtfVersion) {
171 
172         if (gtfVersion < GencodeGtfCodec.GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE) {
173             throw new GATKException("Invalid version number for validation: " + gtfVersion +
174                     " must be above: " + GencodeGtfCodec.GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE);
175         }
176 
177         final GencodeGtfFeature.FeatureType featureType = feature.getFeatureType();
178 
179         if ( gtfVersion < 26 ) {
180             if (feature.getGeneStatus() == null) {
181                 return false;
182             }
183             if (feature.getTranscriptStatus() == null) {
184                 return false;
185             }
186         }
187 
188         if ( (featureType != GencodeGtfFeature.FeatureType.GENE) ||
189                 (gtfVersion < 21) ) {
190             if (feature.getTranscriptId() == null) {
191                 return false;
192             }
193             if (feature.getTranscriptType() == null) {
194                 return false;
195             }
196             if (feature.getTranscriptName() == null) {
197                 return false;
198             }
199         }
200 
201         // Gencode can only have 2 feature types:
202         return feature.getAnnotationSource().equals(GencodeGtfFeature.ANNOTATION_SOURCE_ENSEMBL) ||
203                 feature.getAnnotationSource().equals(GencodeGtfFeature.ANNOTATION_SOURCE_HAVANA);
204     }
205 
206     @Override
passesFileNameCheck(final String inputFilePath)207     boolean passesFileNameCheck(final String inputFilePath) {
208         try {
209             final Path p = IOUtil.getPath(inputFilePath);
210 
211             return p.getFileName().toString().toLowerCase().startsWith(GENCODE_GTF_FILE_PREFIX) &&
212                     p.getFileName().toString().toLowerCase().endsWith("." + GTF_FILE_EXTENSION);
213         }
214         catch (final FileNotFoundException ex) {
215             logger.warn("File does not exist! - " + inputFilePath + " - returning name check as failure.");
216         }
217         catch (final IOException ex) {
218             logger.warn("Caught IOException on file: " + inputFilePath + " - returning name check as failure.");
219         }
220 
221         return false;
222     }
223 
224     @Override
getDefaultLineComment()225     String getDefaultLineComment() {
226         return commentPrefix;
227     }
228 
229     @Override
getAllLineComments()230     Set<String> getAllLineComments() {
231         return Collections.unmodifiableSet(new HashSet<>(Collections.singletonList(commentPrefix)));
232     }
233 
234     @Override
getGtfFileType()235     String  getGtfFileType() {
236         return GTF_FILE_TYPE_STRING;
237     }
238 
239     @Override
validateFeatureSubtype(final GencodeGtfFeature feature )240     boolean validateFeatureSubtype(final GencodeGtfFeature feature ) {
241         return validateGencodeGtfFeature( feature, versionNumber );
242     }
243 
244     @Override
incrementLineNumber()245     void incrementLineNumber() {
246         ++currentLineNum;
247     }
248 
249     @Override
getUcscVersionNumber()250     String getUcscVersionNumber() {
251         return getUcscVersionFromGencodeVersion(versionNumber);
252     }
253 
254     // ============================================================================================================
255 
256     /**
257      * Check if the given header of a tentative GENCODE GTF file is, in fact, the header to such a file.
258      * Will also return true if the file is a general GTF file (i.e. a GTF file that was not created and
259      * maintained by GENCODE).
260      * @param header Header lines to check for conformity to GENCODE GTF specifications.
261      * @param throwIfInvalid If true, will throw a {@link UserException.MalformedFile} if the header is invalid.
262      * @return true if the given {@code header} is that of a GENCODE GTF file; false otherwise.
263      */
264     @VisibleForTesting
validateHeader(final List<String> header, final boolean throwIfInvalid)265     boolean validateHeader(final List<String> header, final boolean throwIfInvalid) {
266         if ( header.size() != HEADER_NUM_LINES) {
267             if ( throwIfInvalid ) {
268                 throw new UserException.MalformedFile(
269                         "GENCODE GTF Header is of unexpected length: " +
270                                 header.size() + " != " + HEADER_NUM_LINES);
271             }
272             else {
273                 return false;
274             }
275         }
276 
277         // Check the normal commented fields:
278         if ( !checkHeaderLineStartsWith(header,0, "description:") ) {
279             return false;
280         }
281 
282         if ( !header.get(0).contains("version") ) {
283             if ( throwIfInvalid ) {
284                 throw new UserException.MalformedFile(
285                         "GENCODE GTF Header line 1 does not contain version specification: " +
286                                 header.get(0));
287             }
288             else {
289                 return false;
290             }
291         }
292 
293         // Grab the version from the file and make sure it's within the acceptable range:
294         final Matcher versionMatcher = VERSION_PATTERN.matcher(header.get(0));
295         if ( !versionMatcher.find() ) {
296             if ( throwIfInvalid ) {
297                 throw new UserException.MalformedFile(
298                         "GENCODE GTF Header line 1 does not contain a recognizable version number: " +
299                                 header.get(0));
300             }
301             else {
302                 return false;
303             }
304         }
305 
306         try {
307             final int versionNumber = Integer.valueOf(versionMatcher.group(1));
308             if (versionNumber < GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE) {
309                 final String message = "GENCODE GTF Header line 1 has an out-of-date (< v" + GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE + " version number (" +
310                         versionNumber + "): " + header.get(0);
311                 if (throwIfInvalid) {
312                     throw new UserException.MalformedFile(message);
313                 } else {
314                     logger.warn(message + "   Continuing, but errors may occur.");
315                 }
316             }
317 
318             if (versionNumber > GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE) {
319                 logger.warn("GENCODE GTF Header line 1 has a version number that is above maximum tested version (v " + GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE + ") (given: " +
320                         versionNumber + "): " + header.get(0) + "   Continuing, but errors may occur.");
321             }
322         }
323         catch (final NumberFormatException ex) {
324             if ( throwIfInvalid ) {
325                 throw new UserException("Could not create number value for version: " + versionMatcher.group(1), ex);
326             }
327             else {
328                 return false;
329             }
330         }
331 
332         return checkHeaderLineStartsWith(header, 1, "provider: GENCODE") &&
333                 checkHeaderLineStartsWith(header, 2, "contact: gencode") &&
334                 checkHeaderLineStartsWith(header, 3, "format: gtf") &&
335                 checkHeaderLineStartsWith(header, 4, "date:");
336     }
337 }
338