1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Jochen Singer <jochen.singer@fu-berlin.de>
33 // ==========================================================================
34 
35 #ifndef INCLUDE_SEQAN_GFF_IO_GFF_IO_BASE_H_
36 #define INCLUDE_SEQAN_GFF_IO_GFF_IO_BASE_H_
37 
38 namespace seqan {
39 
40 // ============================================================================
41 // Tags, Classes, Enums
42 // ============================================================================
43 
44 // ----------------------------------------------------------------------------
45 // Tag Gff
46 // ----------------------------------------------------------------------------
47 
48 /*!
49  * @tag FileFormats#Gff
50  * @brief Tag for selecting the GFF format.
51  *
52  * Both the GFF and the GTF file format are represented by @link GffRecord @endlink in SeqAn.
53  * Tags and functions in this group can be used for I/O of both formats to and from @link GffRecord @endlink objects.
54  *
55  * @signature typedef Tag<TagGff_> Gff;
56  */
57 struct TagGff_;
58 typedef Tag<TagGff_> Gff;
59 
60 // ----------------------------------------------------------------------------
61 // Tag Gtf
62 // ----------------------------------------------------------------------------
63 
64 /*!
65  * @tag FileFormats#Gtf
66  * @brief Tag for selecting the GTF format.
67  *
68  * @signature typedef Tag<TagGtf_> Gtf;
69  */
70 struct TagGtf_;
71 typedef Tag<TagGtf_> Gtf;
72 
73 // ----------------------------------------------------------------------------
74 // Class MagicHeader
75 // ----------------------------------------------------------------------------
76 
77 template <typename T>
78 struct MagicHeader<Gtf, T> :
79     public MagicHeader<Nothing, T> {};
80 
81 template <typename T>
82 struct MagicHeader<Gff, T> :
83     public MagicHeader<Nothing, T> {};
84 
85 // ----------------------------------------------------------------------------
86 // Class FileExtensions
87 // ----------------------------------------------------------------------------
88 
89 template <typename T>
90 struct FileExtensions<Gff, T>
91 {
92     static char const * VALUE[2];    // default is one extension
93 };
94 
95 template <typename T>
96 char const * FileExtensions<Gff, T>::VALUE[2] =
97 {
98     ".gff",     // default output extension
99     ".gff3"
100 };
101 
102 template <typename T>
103 struct FileExtensions<Gtf, T>
104 {
105     static char const * VALUE[1];    // default is one extension
106 };
107 
108 template <typename T>
109 char const * FileExtensions<Gtf, T>::VALUE[1] =
110 {
111     ".gtf"     // default output extension
112 };
113 
114 // ----------------------------------------------------------------------------
115 // Class GffRecord
116 // ----------------------------------------------------------------------------
117 
118 /*!
119  * @class GffRecord
120  * @implements FormattedFileRecordConcept
121  * @headerfile <seqan/gff_io.h>
122  * @brief Represent a record from a GFF or GTF file.
123  *
124  * @signature class GffRecord;
125  */
126 struct GffRecord
127 {
128     /*!
129      * @var int32_t GffRecord::INVALID_IDX;
130      * @brief Static member with invalid/sentinel rID value.
131      */
132     static int32_t const INVALID_POS = 2147483647;  // TODO(singer): Should be std::numeric_limits<int32_t>::max(), but that is not a constant expression :(
133 
134     /*!
135      * @var CharString GffRecord::ref;
136      * @brief The sequence name of the record.
137      *
138      * The ID of the landmark used to establish the coordinate system for the current feature, most often the
139      * contig/chromosome name.
140      */
141     CharString ref;
142 
143     /*!
144      * @var CharString GffRecord::source;
145      * @brief The source of the record.
146      *
147      * The source is a free text qualifier intended to describe the algorithm or operating procedure that generated this
148      * feature.
149      */
150     CharString source;
151 
152     /*!
153      * @var CharString GffRecord::type;
154      * @brief The type of the record.
155      */
156     CharString type;
157 
158     /*!
159      * @var TCharStringSet GffRecord::tagNames;
160      * @brief The names of the attributes of the record, StringSet of CharString.
161      *
162      * For each value there is a name associated in @link GffRecord::tagNames tagNames @endlink.
163      */
164     StringSet<CharString> tagNames;
165 
166     /*!
167      * @var TCharStringSet GffRecord::tagValues;
168      * @brief The values of the attributes of the record, StringSet of CharString.
169      *
170      * @section Remarks
171      *
172      * For each name there is a value associated in GffRecord::tagValues.
173      */
174     StringSet<CharString> tagValues;
175 
176     /*!
177      * @var int32_t GffRecord::beginPos;
178      * @brief The begin position of the record.
179      */
180     uint32_t beginPos;
181 
182     /*!
183      * @var int32_t GffRecord::endPos;
184      * @brief The end position of the record.
185      *
186      * GFF and GTF use 1-based positions in text, but they are stored as 0-based coordinates.
187      */
188     uint32_t endPos;
189 
190     /*!
191      * @var float GffRecord::score;
192      * @brief The score of the record.
193      */
194     float score;
195 
196     /*!
197      * @var char GffRecord::strand;
198      * @brief The strand the record belongs to.
199      *
200      * The strand of the feature. + for positive strand (relative to the landmark), - for minus strand, and . for
201      * features that are not stranded.
202      */
203     char strand;
204 
205     /*!
206      * @var char GffRecord::phase;
207      * @brief The phase of the record.
208      *
209      * For features of type "CDS", the phase indicates where the feature begins with reference to the reading frame.
210      * The phase is one of the integers 0, 1, or 2, indicating the number of bases that should be removed from the
211      * beginning of this feature to reach the first base of the next codon.
212      */
213     char phase;
214 
215     // TODO(holtgrew): C++11 will have a nan() function, use this instead then.
216     /*!
217      * @fn GffRecord::INVALID_SCORE
218      * @signature static float INVALID_SCORE()
219      * @brief Returns invalid score (NaN float value).
220      *
221      * The term <tt>x != x</tt> (for <tt>float x</tt> is only true if <tt>x</tt> is a NaN.
222      */
223     static float INVALID_SCORE()
224     {
225         union
226         {
227             uint32_t u;
228             float f;
229         } tmp;
230         tmp.u = 0x7F800001;
231         return tmp.f;
232     }
233 
234     GffRecord() :
235         beginPos(-1), endPos(-1), score(INVALID_SCORE()),
236         strand('.'), phase('.')
237     {}
238 };
239 
240 // ============================================================================
241 // Functions
242 // ============================================================================
243 
244 // ----------------------------------------------------------------------------
245 // Function _parseReadGffKeyValue
246 // ----------------------------------------------------------------------------
247 
248 template <typename TForwardIter, typename TKeyString, typename TValueString>
249 inline void
250 _parseReadGffKeyValue(TValueString & outValue, TKeyString & key, TForwardIter & iter)
251 {
252     //TODO(singer): AssertList functor would be need
253     char c = value(iter);
254     if (IsWhitespace()(c) || c == '=')
255         SEQAN_THROW(ParseError("The key field of an attribute is empty!"));
256 
257     for (; !atEnd(iter); goNext(iter))
258     {
259         c = value(iter);
260         if (IsNewline()(c) || c == ' ' || c == '=' || c == ';')
261             break;
262         appendValue(key, c);
263     }
264     if (!atEnd(iter) && value(iter) == ';')
265     {
266         skipOne(iter);
267         return;
268     }
269 
270     if (IsNewline()(value(iter)))
271         return;
272 
273     skipUntil(iter, NotFunctor<IsWhitespace>());
274 
275     if (value(iter) == '=')
276     {
277         skipOne(iter);
278         skipUntil(iter, NotFunctor<IsWhitespace>());
279     }
280 
281     if (value(iter) == '"')
282     {
283         // Handle the case of a string literal.
284         skipOne(iter);
285         skipUntil(iter, NotFunctor<IsWhitespace>());
286         readUntil(outValue, iter, OrFunctor<EqualsChar<'"'>, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >());
287         skipOne(iter);
288 
289         // Go over the trailing semicolon and any trailing space.
290         skipUntil(iter, NotFunctor<OrFunctor<EqualsChar<';'>, EqualsChar<' '> > >());
291     }
292     else
293     {
294         // Read until the first semicolon, return at whitespace.
295         readUntil(outValue, iter, OrFunctor<EqualsChar<';'>, IsNewline>());
296 
297         // Skip semicolon and spaces if any.
298         skipUntil(iter, NotFunctor<OrFunctor<EqualsChar<';'>, EqualsChar<' '> > >());
299     }
300     return;
301 }
302 
303 // ----------------------------------------------------------------------------
304 // Function clear
305 // ----------------------------------------------------------------------------
306 
307 /*!
308  * @fn GffRecord#clear
309  * @brief Reset a @link GffRecord @endlink object.
310  *
311  * @signature void clear(record);
312  *
313  * @param[in,out] record The GffRecord to reset.
314  */
315 inline void clear(GffRecord & record)
316 {
317     record.beginPos = -1;
318     record.endPos = -1;
319     record.score = record.INVALID_SCORE();
320     record.strand = '.';
321     record.phase = '.';
322 
323     clear(record.ref);
324     clear(record.source);
325     clear(record.type);
326     clear(record.tagNames);
327     clear(record.tagValues);
328 }
329 
330 // ----------------------------------------------------------------------------
331 // Function readRecord
332 // ----------------------------------------------------------------------------
333 
334 // NOTE(esiragusa): dox disabled.
335 /*
336  * @fn GffFileIO#readRecord
337  * @brief Read one GFF/GTF record from a SinglePassRecordReader.
338  *
339  * @signature void readRecord(record, context, iter);
340  *
341  * @param[out]    record  The GffRecord to write the results to.
342  * @param[in,out] context A CharString to use for buffers.
343  * @param[in,out] iter    A @link ForwardIteratorConcept forward iterator @endlink to use for reading.
344  *
345  * @throws IOError if something went wrong.
346  */
347 template <typename TFwdIterator>
348 void readRecord(GffRecord & record, CharString & buffer, TFwdIterator & iter)
349 {
350     IsNewline isNewline;
351 
352     // skip commented lines as well as ## directives
353     skipUntil(iter, NotFunctor<IsWhitespace>());  //skip empty lines
354     while (!atEnd(iter) && value(iter) == '#')
355         skipLine(iter);
356     skipUntil(iter, NotFunctor<IsWhitespace>());  //skip empty lines
357 
358     clear(record);
359 
360     // read column 1: seqid
361     readUntil(record.ref, iter, OrFunctor<IsTab, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >());
362     skipOne(iter);
363 
364     // read column 2: source
365     readUntil(record.source, iter, OrFunctor<IsTab, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >());
366 
367     if (record.source == ".")
368         clear(record.source);
369 
370     skipOne(iter);
371 
372     // read column 3: type
373     readUntil(record.type, iter, OrFunctor<IsTab, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >());
374     skipOne(iter);
375 
376     // read column 4: begin position
377     clear(buffer);
378     readUntil(buffer, iter, OrFunctor<IsTab, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >());
379     record.beginPos = lexicalCast<uint32_t>(buffer);
380     --record.beginPos;  // Translate from 1-based to 0-based.
381     skipOne(iter);
382 
383     // read column 5: end position
384     clear(buffer);
385     readUntil(buffer, iter, OrFunctor<IsTab, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >());
386     record.endPos = lexicalCast<uint32_t>(buffer);
387     skipOne(iter);
388 
389     //check if end < begin
390     if (record.endPos < record.beginPos)
391         SEQAN_THROW(ParseError("Begin position of GFF/GTF record is larger than end position!"));
392 
393     // read column 6: score
394     clear(buffer);
395     readUntil(buffer, iter, OrFunctor<IsTab, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >());
396     if (buffer != ".")
397         record.score = lexicalCast<float>(buffer);
398     skipOne(iter, IsTab());
399 
400     // read column 7: strand
401     readOne(record.strand, iter, OrFunctor<OrFunctor<EqualsChar<'-'>, EqualsChar<'+'> >, EqualsChar<'.'> >());
402     skipOne(iter, IsTab());
403 
404     // read column 8: phase
405     readOne(record.phase, iter, OrFunctor<EqualsChar<'.'>, IsInRange<'0', '2'> >());
406 
407     // It's fine if there are no attributes and the line ends here.
408     if (atEnd(iter) || isNewline(value(iter)))
409     {
410         skipLine(iter);
411         return;
412     }
413     skipOne(iter, IsTab());
414     // There is often a space character between phase and attribute columns.
415     // We can safely skip that!
416     skipUntil(iter, NotFunctor<IsSpace>());  //skip empty lines
417 
418     // read column 9: attributes
419     while (!atEnd(iter))
420     {
421 
422         CharString _key;
423         CharString _value;
424         // Read next key/value pair.
425         _parseReadGffKeyValue(_value, _key, iter);
426 
427         appendValue(record.tagNames, _key);
428         appendValue(record.tagValues, _value);
429 
430         clear(_key);
431         clear(_value);
432 
433         // At end of line:  Skip EOL and break.
434         if (!atEnd(iter) && isNewline(value(iter)))
435         {
436             skipOne(iter);
437             break;
438         }
439     }
440 
441     // The last line might be a "### directive" specifically in GFF3
442     // Need to skip it to avoid another call of readRecords
443     skipUntil(iter, NotFunctor<IsWhitespace>());  //skip empty lines
444     while (!atEnd(iter) && value(iter) == '#')
445         skipLine(iter);
446     skipUntil(iter, NotFunctor<IsWhitespace>());  //skip empty lines
447 
448     return;
449 }
450 
// ----------------------------------------------------------------------------
// Functions _writeInQuotes(), _writePossiblyInQuotes()
// ----------------------------------------------------------------------------

// _writeInQuotes() writes a string enclosed in double quotes.
// _writePossiblyInQuotes() checks if the string to be written contains a
// character that requires quoting (e.g. a semicolon). If this is the case then
// quotes are written around the string.
// Neither function returns a value; _writePossiblyInQuotes() reports illegal
// characters by throwing a ParseError.
458 
// Writes `temp` to `target`, enclosed in a pair of double quotes.  The content is written verbatim.
template <typename TTargetStream, typename TString>
inline void
_writeInQuotes(TTargetStream & target, TString & temp)
{
    // TODO(jsinger): What about escaping quote chars '"'?
    char const quote = '"';

    writeValue(target, quote);
    write(target, temp);
    writeValue(target, quote);
}
468 
469 template <typename TTarget, typename TString, typename TMustBeQuotedFunctor>
470 inline void
471 _writePossiblyInQuotes(TTarget& target, TString & source, TMustBeQuotedFunctor const &func)
472 {
473     // TODO(jsinger): What about escaping quote chars '"'?
474     typedef typename Iterator<TString>::Type TIter;
475     TIter itEnd = end(source, Standard());
476     for (TIter it = begin(source, Standard()); it != itEnd; ++it)
477     {
478         // we have a problem if the string contains a '"' or a line break
479         if (value(it) =='\n' || value(it) == '"')
480             SEQAN_THROW(ParseError("Attribute contains illegal character!"));
481 
482         if (func(*it))
483         {
484             _writeInQuotes(target, source);
485             return;
486         }
487     }
488     write(target, source);
489 }
490 
491 // ----------------------------------------------------------------------------
492 // Function writeRecord()
493 // ----------------------------------------------------------------------------
494 
495 // NOTE(esiragusa): dox disabled.
496 /*
497  * @fn GffFileIO#writeRecord
498  * @brief Writes a @link GffRecord @endlink to a stream as GFF or GTF.
499  *
500  * @signature void writeRecord(stream, record, tag);
501  *
502  * @param[in,out] stream  The @link OutputIteratorConcept output iterator @endlink to write to.
503  * @param[in]     record  The @link GffRecord @endlink to write out.
504  * @param[in]     tag     A tag to select the file format, either @link GffFileIO#Gff @endlink or @link GffFileIO#Gtf
505  *                        @endlink.
506  *
507  * @throws IOError if something went wrong.
508  */
509 
// Character predicates that decide, per output format, whether an attribute name (key) or an
// attribute value has to be written in quotes.  Only the specializations for Gff and Gtf below are
// defined; the primary templates are intentionally left without a definition here.
template <typename TFormat>
struct GffRecordKeyMustBeQuoted_;

template <typename TFormat>
struct GffRecordValueMustBeQuoted_;
515 
516 // GFF quotation rules
517 
518 template <>
519 struct GffRecordKeyMustBeQuoted_<Gff>
520 {
521     bool operator() (char c) const
522     {
523         return c == ';' || c == '=';
524     }
525 };
526 
527 template <>
528 struct GffRecordValueMustBeQuoted_<Gff> :
529     GffRecordKeyMustBeQuoted_<Gff> {};
530 
531 // GTF quotation rules
532 
533 template <>
534 struct GffRecordKeyMustBeQuoted_<Gtf>
535 {
536     bool operator() (char c) const
537     {
538         return c == ';' || c == ' ';
539     }
540 };
541 
542 template <>
543 struct GffRecordValueMustBeQuoted_<Gtf>
544 {
545     bool operator() (char c) const
546     {
547 //        return c == ';' || c == ' ' || !isdigit(c);
548         return !isdigit(c);     // is equivalent to the above, quote everything except integral values
549     }
550 };
551 
552 template <typename TTarget>
553 inline void
554 _writeAdditionalSeperator(TTarget const & /*target*/, Gff)
555 {
556     return;
557 }
558 
559 template <typename TTarget>
560 inline void
561 _writeAdditionalSeperator(TTarget & target, Gtf)
562 {
563     writeValue(target, ' ');
564     return;
565 }
566 
567 
568 template <typename TTarget, typename TTag>
569 inline void
570 _writeAttributes(TTarget & target, GffRecord const & record, TTag const & tag)
571 {
572     const char separatorBetweenTagAndValue = (IsSameType<TTag, Gff>::VALUE)? '=' : ' ';
573     for (unsigned i = 0; i < length(record.tagNames); ++i)
574     {
575         if (i != 0)
576         {
577             writeValue(target, ';');
578 
579             // In GTF files a space follows the semicolon
580             _writeAdditionalSeperator(target, tag);
581        }
582 
583         _writePossiblyInQuotes(target, record.tagNames[i], GffRecordKeyMustBeQuoted_<TTag>());
584 
585         if (!empty(record.tagValues[i]))
586         {
587             writeValue(target, separatorBetweenTagAndValue);
588             _writePossiblyInQuotes(target, record.tagValues[i], GffRecordValueMustBeQuoted_<TTag>());
589         }
590     }
591 
592     // In GTF files each (especially the last) attribute must end with a semi-colon
593     if (IsSameType<TTag, Gtf>::VALUE && !empty(record.tagNames))
594         writeValue(target, ';');
595 
596     return;
597 }
598 
599 template <typename TTarget, typename TFormat>
600 inline void
601 writeRecord(TTarget & target, GffRecord const & record, Tag<TFormat> const & tag)
602 {
603     // ignore empty annotations, i.e. annotations that are 'guessed' by implicit information from their children (in GFF)
604     if (empty(record.ref))
605         return;
606 
607     // write column 1: seqid
608     //typename Iterator<TSeqId const, Rooted>::Type itRef = begin(record.ref);
609     write(target, record.ref);
610     writeValue(target, '\t');
611 
612     // write column 2: source
613     if (empty(record.source))
614         writeValue(target, '.');
615     else
616         write(target, record.source);
617     writeValue(target, '\t');
618 
619     // write column 3: type
620     write(target, record.type);
621     writeValue(target, '\t');
622 
623     // write column 4: begin position
624     if (record.beginPos != (unsigned)-1)
625         appendNumber(target, record.beginPos + 1);
626     else
627         SEQAN_THROW(ParseError("No start position!"));
628     writeValue(target, '\t');
629 
630     // write column 5: end position
631     if (record.endPos != (unsigned)-1 && record.beginPos <= record.endPos)
632         appendNumber(target, record.endPos);
633     else
634         SEQAN_THROW(ParseError("No end position!"));
635     writeValue(target, '\t');
636 
637     // write column 6: score
638     if (record.score != record.score)
639         writeValue(target, '.');
640     else
641         appendNumber(target, record.score);
642     writeValue(target, '\t');
643 
644     // write column 7: strand
645     writeValue(target, record.strand);
646     writeValue(target, '\t');
647 
648     // write column 8: phase
649     writeValue(target, record.phase);
650     writeValue(target, '\t');
651 
652     // write column 9: attributes
653     // only until length - 1, because there is no semicolon at the end of the line
654 
655     _writeAttributes(target, record, tag);
656 
657     writeValue(target, '\n');
658     return;
659 }
660 
661 }  // namespace seqan
662 
663 #endif  // INCLUDE_SEQAN_GFF_IO_GFF_IO_BASE_H_
664 
665