1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: Jochen Singer <jochen.singer@fu-berlin.de> 33 // ========================================================================== 34 35 #ifndef INCLUDE_SEQAN_GFF_IO_GFF_IO_BASE_H_ 36 #define INCLUDE_SEQAN_GFF_IO_GFF_IO_BASE_H_ 37 38 namespace seqan { 39 40 // ============================================================================ 41 // Tags, Classes, Enums 42 // ============================================================================ 43 44 // ---------------------------------------------------------------------------- 45 // Tag Gff 46 // ---------------------------------------------------------------------------- 47 48 /*! 49 * @tag FileFormats#Gff 50 * @brief Tag for selecting the GFF format. 51 * 52 * Both the GFF and the GTF file format are represented by @link GffRecord @endlink in SeqAn. 53 * Tags and functions in this group can be used for I/O of both formats to and from @link GffRecord @endlink objects. 54 * 55 * @signature typedef Tag<TagGff_> Gff; 56 */ 57 struct TagGff_; 58 typedef Tag<TagGff_> Gff; 59 60 // ---------------------------------------------------------------------------- 61 // Tag Gtf 62 // ---------------------------------------------------------------------------- 63 64 /*! 65 * @tag FileFormats#Gtf 66 * @brief Tag for selecting the GTF format. 67 * 68 * @signature typedef Tag<TagGtf_> Gtf; 69 */ 70 struct TagGtf_; 71 typedef Tag<TagGtf_> Gtf; 72 73 // ---------------------------------------------------------------------------- 74 // Class MagicHeader 75 // ---------------------------------------------------------------------------- 76 77 template <typename T> 78 struct MagicHeader<Gtf, T> : 79 public MagicHeader<Nothing, T> {}; 80 81 template <typename T> 82 struct MagicHeader<Gff, T> : 83 public MagicHeader<Nothing, T> {}; 84 85 // ---------------------------------------------------------------------------- 86 // Class FileExtensions 87 // ---------------------------------------------------------------------------- 88 89 template <typename T> 90 struct FileExtensions<Gff, T> 91 { 92 static char const * VALUE[2]; // default is one extension 93 }; 94 95 template <typename T> 96 char const * FileExtensions<Gff, T>::VALUE[2] = 97 { 98 ".gff", // default output extension 99 ".gff3" 100 }; 101 102 template <typename T> 103 struct FileExtensions<Gtf, T> 104 { 105 static char const * VALUE[1]; // default is one extension 106 }; 107 108 template <typename T> 109 char const * FileExtensions<Gtf, T>::VALUE[1] = 110 { 111 ".gtf" // default output extension 112 }; 113 114 // ---------------------------------------------------------------------------- 115 // Class GffRecord 116 // ---------------------------------------------------------------------------- 117 118 /*! 119 * @class GffRecord 120 * @implements FormattedFileRecordConcept 121 * @headerfile <seqan/gff_io.h> 122 * @brief Represent a record from a GFF or GTF file. 123 * 124 * @signature class GffRecord; 125 */ 126 struct GffRecord 127 { 128 /*! 129 * @var int32_t GffRecord::INVALID_IDX; 130 * @brief Static member with invalid/sentinel rID value. 131 */ 132 static int32_t const INVALID_POS = 2147483647; // TODO(singer): Should be std::numeric_limits<int32_t>::max(), but that is not a constant expression :( 133 134 /*! 135 * @var CharString GffRecord::ref; 136 * @brief The sequence name of the record. 137 * 138 * The ID of the landmark used to establish the coordinate system for the current feature, most often the 139 * contig/chromosome name. 140 */ 141 CharString ref; 142 143 /*! 144 * @var CharString GffRecord::source; 145 * @brief The source of the record. 146 * 147 * The source is a free text qualifier intended to describe the algorithm or operating procedure that generated this 148 * feature. 149 */ 150 CharString source; 151 152 /*! 153 * @var CharString GffRecord::type; 154 * @brief The type of the record. 155 */ 156 CharString type; 157 158 /*! 159 * @var TCharStringSet GffRecord::tagNames; 160 * @brief The names of the attributes of the record, StringSet of CharString. 161 * 162 * For each value there is a name associated in @link GffRecord::tagNames tagNames @endlink. 163 */ 164 StringSet<CharString> tagNames; 165 166 /*! 167 * @var TCharStringSet GffRecord::tagValues; 168 * @brief The values of the attributes of the record, StringSet of CharString. 169 * 170 * @section Remarks 171 * 172 * For each name there is a value associated in GffRecord::tagValues. 173 */ 174 StringSet<CharString> tagValues; 175 176 /*! 177 * @var int32_t GffRecord::beginPos; 178 * @brief The begin position of the record. 179 */ 180 uint32_t beginPos; 181 182 /*! 183 * @var int32_t GffRecord::endPos; 184 * @brief The end position of the record. 185 * 186 * GFF and GTF use 1-based positions in text, but they are stored as 0-based coordinates. 187 */ 188 uint32_t endPos; 189 190 /*! 191 * @var float GffRecord::score; 192 * @brief The score of the record. 193 */ 194 float score; 195 196 /*! 197 * @var char GffRecord::strand; 198 * @brief The strand the record belongs to. 199 * 200 * The strand of the feature. + for positive strand (relative to the landmark), - for minus strand, and . for 201 * features that are not stranded. 202 */ 203 char strand; 204 205 /*! 206 * @var char GffRecord::phase; 207 * @brief The phase of the record. 208 * 209 * For features of type "CDS", the phase indicates where the feature begins with reference to the reading frame. 210 * The phase is one of the integers 0, 1, or 2, indicating the number of bases that should be removed from the 211 * beginning of this feature to reach the first base of the next codon. 212 */ 213 char phase; 214 215 // TODO(holtgrew): C++11 will have a nan() function, use this instead then. 216 /*! 217 * @fn GffRecord::INVALID_SCORE 218 * @signature static float INVALID_SCORE() 219 * @brief Returns invalid score (NaN float value). 220 * 221 * The term <tt>x != x</tt> (for <tt>float x</tt> is only true if <tt>x</tt> is a NaN. 222 */ 223 static float INVALID_SCORE() 224 { 225 union 226 { 227 uint32_t u; 228 float f; 229 } tmp; 230 tmp.u = 0x7F800001; 231 return tmp.f; 232 } 233 234 GffRecord() : 235 beginPos(-1), endPos(-1), score(INVALID_SCORE()), 236 strand('.'), phase('.') 237 {} 238 }; 239 240 // ============================================================================ 241 // Functions 242 // ============================================================================ 243 244 // ---------------------------------------------------------------------------- 245 // Function _parseReadGffKeyValue 246 // ---------------------------------------------------------------------------- 247 248 template <typename TForwardIter, typename TKeyString, typename TValueString> 249 inline void 250 _parseReadGffKeyValue(TValueString & outValue, TKeyString & key, TForwardIter & iter) 251 { 252 //TODO(singer): AssertList functor would be need 253 char c = value(iter); 254 if (IsWhitespace()(c) || c == '=') 255 SEQAN_THROW(ParseError("The key field of an attribute is empty!")); 256 257 for (; !atEnd(iter); goNext(iter)) 258 { 259 c = value(iter); 260 if (IsNewline()(c) || c == ' ' || c == '=' || c == ';') 261 break; 262 appendValue(key, c); 263 } 264 if (!atEnd(iter) && value(iter) == ';') 265 { 266 skipOne(iter); 267 return; 268 } 269 270 if (IsNewline()(value(iter))) 271 return; 272 273 skipUntil(iter, NotFunctor<IsWhitespace>()); 274 275 if (value(iter) == '=') 276 { 277 skipOne(iter); 278 skipUntil(iter, NotFunctor<IsWhitespace>()); 279 } 280 281 if (value(iter) == '"') 282 { 283 // Handle the case of a string literal. 284 skipOne(iter); 285 skipUntil(iter, NotFunctor<IsWhitespace>()); 286 readUntil(outValue, iter, OrFunctor<EqualsChar<'"'>, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >()); 287 skipOne(iter); 288 289 // Go over the trailing semicolon and any trailing space. 290 skipUntil(iter, NotFunctor<OrFunctor<EqualsChar<';'>, EqualsChar<' '> > >()); 291 } 292 else 293 { 294 // Read until the first semicolon, return at whitespace. 295 readUntil(outValue, iter, OrFunctor<EqualsChar<';'>, IsNewline>()); 296 297 // Skip semicolon and spaces if any. 298 skipUntil(iter, NotFunctor<OrFunctor<EqualsChar<';'>, EqualsChar<' '> > >()); 299 } 300 return; 301 } 302 303 // ---------------------------------------------------------------------------- 304 // Function clear 305 // ---------------------------------------------------------------------------- 306 307 /*! 308 * @fn GffRecord#clear 309 * @brief Reset a @link GffRecord @endlink object. 310 * 311 * @signature void clear(record); 312 * 313 * @param[in,out] record The GffRecord to reset. 314 */ 315 inline void clear(GffRecord & record) 316 { 317 record.beginPos = -1; 318 record.endPos = -1; 319 record.score = record.INVALID_SCORE(); 320 record.strand = '.'; 321 record.phase = '.'; 322 323 clear(record.ref); 324 clear(record.source); 325 clear(record.type); 326 clear(record.tagNames); 327 clear(record.tagValues); 328 } 329 330 // ---------------------------------------------------------------------------- 331 // Function readRecord 332 // ---------------------------------------------------------------------------- 333 334 // NOTE(esiragusa): dox disabled. 335 /* 336 * @fn GffFileIO#readRecord 337 * @brief Read one GFF/GTF record from a SinglePassRecordReader. 338 * 339 * @signature void readRecord(record, context, iter); 340 * 341 * @param[out] record The GffRecord to write the results to. 342 * @param[in,out] context A CharString to use for buffers. 343 * @param[in,out] iter A @link ForwardIteratorConcept forward iterator @endlink to use for reading. 344 * 345 * @throws IOError if something went wrong. 346 */ 347 template <typename TFwdIterator> 348 void readRecord(GffRecord & record, CharString & buffer, TFwdIterator & iter) 349 { 350 IsNewline isNewline; 351 352 // skip commented lines as well as ## directives 353 skipUntil(iter, NotFunctor<IsWhitespace>()); //skip empty lines 354 while (!atEnd(iter) && value(iter) == '#') 355 skipLine(iter); 356 skipUntil(iter, NotFunctor<IsWhitespace>()); //skip empty lines 357 358 clear(record); 359 360 // read column 1: seqid 361 readUntil(record.ref, iter, OrFunctor<IsTab, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >()); 362 skipOne(iter); 363 364 // read column 2: source 365 readUntil(record.source, iter, OrFunctor<IsTab, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >()); 366 367 if (record.source == ".") 368 clear(record.source); 369 370 skipOne(iter); 371 372 // read column 3: type 373 readUntil(record.type, iter, OrFunctor<IsTab, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >()); 374 skipOne(iter); 375 376 // read column 4: begin position 377 clear(buffer); 378 readUntil(buffer, iter, OrFunctor<IsTab, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >()); 379 record.beginPos = lexicalCast<uint32_t>(buffer); 380 --record.beginPos; // Translate from 1-based to 0-based. 381 skipOne(iter); 382 383 // read column 5: end position 384 clear(buffer); 385 readUntil(buffer, iter, OrFunctor<IsTab, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >()); 386 record.endPos = lexicalCast<uint32_t>(buffer); 387 skipOne(iter); 388 389 //check if end < begin 390 if (record.endPos < record.beginPos) 391 SEQAN_THROW(ParseError("Begin position of GFF/GTF record is larger than end position!")); 392 393 // read column 6: score 394 clear(buffer); 395 readUntil(buffer, iter, OrFunctor<IsTab, AssertFunctor<NotFunctor<IsNewline>, ParseError, Gff> >()); 396 if (buffer != ".") 397 record.score = lexicalCast<float>(buffer); 398 skipOne(iter, IsTab()); 399 400 // read column 7: strand 401 readOne(record.strand, iter, OrFunctor<OrFunctor<EqualsChar<'-'>, EqualsChar<'+'> >, EqualsChar<'.'> >()); 402 skipOne(iter, IsTab()); 403 404 // read column 8: phase 405 readOne(record.phase, iter, OrFunctor<EqualsChar<'.'>, IsInRange<'0', '2'> >()); 406 407 // It's fine if there are no attributes and the line ends here. 408 if (atEnd(iter) || isNewline(value(iter))) 409 { 410 skipLine(iter); 411 return; 412 } 413 skipOne(iter, IsTab()); 414 // There is often a space character between phase and attribute columns. 415 // We can safely skip that! 416 skipUntil(iter, NotFunctor<IsSpace>()); //skip empty lines 417 418 // read column 9: attributes 419 while (!atEnd(iter)) 420 { 421 422 CharString _key; 423 CharString _value; 424 // Read next key/value pair. 425 _parseReadGffKeyValue(_value, _key, iter); 426 427 appendValue(record.tagNames, _key); 428 appendValue(record.tagValues, _value); 429 430 clear(_key); 431 clear(_value); 432 433 // At end of line: Skip EOL and break. 434 if (!atEnd(iter) && isNewline(value(iter))) 435 { 436 skipOne(iter); 437 break; 438 } 439 } 440 441 // The last line might be a "### directive" specifically in GFF3 442 // Need to skip it to avoid another call of readRecords 443 skipUntil(iter, NotFunctor<IsWhitespace>()); //skip empty lines 444 while (!atEnd(iter) && value(iter) == '#') 445 skipLine(iter); 446 skipUntil(iter, NotFunctor<IsWhitespace>()); //skip empty lines 447 448 return; 449 } 450 451 // ---------------------------------------------------------------------------- 452 // Function _writeSemicolonSensitive() 453 // ---------------------------------------------------------------------------- 454 455 // This function checks if the string to be written contains a semicolon. If 456 // this is the case then quotes are written around the string. 457 // Returns false on success. 458 459 template <typename TTargetStream, typename TString> 460 inline void 461 _writeInQuotes(TTargetStream & target, TString & temp) 462 { 463 // TODO(jsinger): What about escaping quote chars '"'? 464 writeValue(target, '"'); 465 write(target, temp); 466 writeValue(target, '"'); 467 } 468 469 template <typename TTarget, typename TString, typename TMustBeQuotedFunctor> 470 inline void 471 _writePossiblyInQuotes(TTarget& target, TString & source, TMustBeQuotedFunctor const &func) 472 { 473 // TODO(jsinger): What about escaping quote chars '"'? 474 typedef typename Iterator<TString>::Type TIter; 475 TIter itEnd = end(source, Standard()); 476 for (TIter it = begin(source, Standard()); it != itEnd; ++it) 477 { 478 // we have a problem if the string contains a '"' or a line break 479 if (value(it) =='\n' || value(it) == '"') 480 SEQAN_THROW(ParseError("Attribute contains illegal character!")); 481 482 if (func(*it)) 483 { 484 _writeInQuotes(target, source); 485 return; 486 } 487 } 488 write(target, source); 489 } 490 491 // ---------------------------------------------------------------------------- 492 // Function writeRecord() 493 // ---------------------------------------------------------------------------- 494 495 // NOTE(esiragusa): dox disabled. 496 /* 497 * @fn GffFileIO#writeRecord 498 * @brief Writes a @link GffRecord @endlink to a stream as GFF or GTF. 499 * 500 * @signature void writeRecord(stream, record, tag); 501 * 502 * @param[in,out] stream The @link OutputIteratorConcept output iterator @endlink to write to. 503 * @param[in] record The @link GffRecord @endlink to write out. 504 * @param[in] tag A tag to select the file format, either @link GffFileIO#Gff @endlink or @link GffFileIO#Gtf 505 * @endlink. 506 * 507 * @throws IOError if something went wrong. 508 */ 509 510 template <typename TFormatTag> 511 struct GffRecordKeyMustBeQuoted_; 512 513 template <typename TFormatTag> 514 struct GffRecordValueMustBeQuoted_; 515 516 // GFF quotation rules 517 518 template <> 519 struct GffRecordKeyMustBeQuoted_<Gff> 520 { 521 bool operator() (char c) const 522 { 523 return c == ';' || c == '='; 524 } 525 }; 526 527 template <> 528 struct GffRecordValueMustBeQuoted_<Gff> : 529 GffRecordKeyMustBeQuoted_<Gff> {}; 530 531 // GTF quotation rules 532 533 template <> 534 struct GffRecordKeyMustBeQuoted_<Gtf> 535 { 536 bool operator() (char c) const 537 { 538 return c == ';' || c == ' '; 539 } 540 }; 541 542 template <> 543 struct GffRecordValueMustBeQuoted_<Gtf> 544 { 545 bool operator() (char c) const 546 { 547 // return c == ';' || c == ' ' || !isdigit(c); 548 return !isdigit(c); // is equivalent to the above, quote everything except integral values 549 } 550 }; 551 552 template <typename TTarget> 553 inline void 554 _writeAdditionalSeperator(TTarget const & /*target*/, Gff) 555 { 556 return; 557 } 558 559 template <typename TTarget> 560 inline void 561 _writeAdditionalSeperator(TTarget & target, Gtf) 562 { 563 writeValue(target, ' '); 564 return; 565 } 566 567 568 template <typename TTarget, typename TTag> 569 inline void 570 _writeAttributes(TTarget & target, GffRecord const & record, TTag const & tag) 571 { 572 const char separatorBetweenTagAndValue = (IsSameType<TTag, Gff>::VALUE)? '=' : ' '; 573 for (unsigned i = 0; i < length(record.tagNames); ++i) 574 { 575 if (i != 0) 576 { 577 writeValue(target, ';'); 578 579 // In GTF files a space follows the semicolon 580 _writeAdditionalSeperator(target, tag); 581 } 582 583 _writePossiblyInQuotes(target, record.tagNames[i], GffRecordKeyMustBeQuoted_<TTag>()); 584 585 if (!empty(record.tagValues[i])) 586 { 587 writeValue(target, separatorBetweenTagAndValue); 588 _writePossiblyInQuotes(target, record.tagValues[i], GffRecordValueMustBeQuoted_<TTag>()); 589 } 590 } 591 592 // In GTF files each (especially the last) attribute must end with a semi-colon 593 if (IsSameType<TTag, Gtf>::VALUE && !empty(record.tagNames)) 594 writeValue(target, ';'); 595 596 return; 597 } 598 599 template <typename TTarget, typename TFormat> 600 inline void 601 writeRecord(TTarget & target, GffRecord const & record, Tag<TFormat> const & tag) 602 { 603 // ignore empty annotations, i.e. annotations that are 'guessed' by implicit information from their children (in GFF) 604 if (empty(record.ref)) 605 return; 606 607 // write column 1: seqid 608 //typename Iterator<TSeqId const, Rooted>::Type itRef = begin(record.ref); 609 write(target, record.ref); 610 writeValue(target, '\t'); 611 612 // write column 2: source 613 if (empty(record.source)) 614 writeValue(target, '.'); 615 else 616 write(target, record.source); 617 writeValue(target, '\t'); 618 619 // write column 3: type 620 write(target, record.type); 621 writeValue(target, '\t'); 622 623 // write column 4: begin position 624 if (record.beginPos != (unsigned)-1) 625 appendNumber(target, record.beginPos + 1); 626 else 627 SEQAN_THROW(ParseError("No start position!")); 628 writeValue(target, '\t'); 629 630 // write column 5: end position 631 if (record.endPos != (unsigned)-1 && record.beginPos <= record.endPos) 632 appendNumber(target, record.endPos); 633 else 634 SEQAN_THROW(ParseError("No end position!")); 635 writeValue(target, '\t'); 636 637 // write column 6: score 638 if (record.score != record.score) 639 writeValue(target, '.'); 640 else 641 appendNumber(target, record.score); 642 writeValue(target, '\t'); 643 644 // write column 7: strand 645 writeValue(target, record.strand); 646 writeValue(target, '\t'); 647 648 // write column 8: phase 649 writeValue(target, record.phase); 650 writeValue(target, '\t'); 651 652 // write column 9: attributes 653 // only until length - 1, because there is no semicolon at the end of the line 654 655 _writeAttributes(target, record, tag); 656 657 writeValue(target, '\n'); 658 return; 659 } 660 661 } // namespace seqan 662 663 #endif // INCLUDE_SEQAN_GFF_IO_GFF_IO_BASE_H_ 664 665