1#------------------------------------------------------------------ 2# 3# BioPerl module Bio::Tools::GuessSeqFormat 4# 5# Please direct questions and support issues to <bioperl-l@bioperl.org> 6# 7# Cared for by Andreas Kähäri, andreas.kahari@ebi.ac.uk 8# 9# You may distribute this module under the same terms as perl itself 10#------------------------------------------------------------------ 11 12=encoding utf-8 13 14=head1 NAME 15 16Bio::Tools::GuessSeqFormat - Module for determining the sequence 17format of the contents of a file, a string, or through a 18filehandle. 19 20=head1 SYNOPSIS 21 22 # To guess the format of a flat file, given a filename: 23 my $guesser = Bio::Tools::GuessSeqFormat->new( -file => $filename ); 24 my $format = $guesser->guess; 25 26 # To guess the format from an already open filehandle: 27 my $guesser = Bio::Tools::GuessSeqFormat->new( -fh => $filehandle ); 28 my $format = $guesser->guess; 29 # The filehandle will be returned to its original position. Note that this 30 # filehandle can be STDIN. 31 32 # To guess the format of one or several lines of text (with 33 # embedded newlines): 34 my $guesser = Bio::Tools::GuessSeqFormat->new( -text => $linesoftext ); 35 my $format = $guesser->guess; 36 37 # To create a Bio::Tools::GuessSeqFormat object and set the 38 # filename, filehandle, or line to parse afterwards: 39 my $guesser = Bio::Tools::GuessSeqFormat->new(); 40 $guesser->file($filename); 41 $guesser->fh($filehandle); 42 $guesser->text($linesoftext); 43 44 # To guess in one go, given e.g. a filename: 45 my $format = Bio::Tools::GuessSeqFormat->new( -file => $filename )->guess; 46 47=head1 DESCRIPTION 48 49Bio::Tools::GuessSeqFormat tries to guess the format ("swiss", 50"pir", "fasta" etc.) of the sequence or MSA in a file, in a 51scalar, or through a filehandle. 52 53The guess() method of a Bio::Tools::GuessSeqFormat object will 54examine the data, line by line, until it finds a line to which 55only one format can be assigned. If no conclusive guess can be 56made, undef is returned. 57 58If the Bio::Tools::GuessSeqFormat object is given a filehandle, 59e.g. STDIN, it will be restored to its original position on 60return from the guess() method. 61 62=head2 Formats 63 64Tests are currently implemented for the following formats: 65 66=over 67 68=item * 69 70ACeDB ("ace") 71 72=item * 73 74Blast ("blast") 75 76=item * 77 78ClustalW ("clustalw") 79 80=item * 81 82Codata ("codata") 83 84=item * 85 86EMBL ("embl") 87 88=item * 89 90FastA sequence ("fasta") 91 92=item * 93 94FastQ sequence ("fastq") 95 96=item * 97 98FastXY/FastA alignment ("fastxy") 99 100=item * 101 102Game XML ("game") 103 104=item * 105 106GCG ("gcg") 107 108=item * 109 110GCG Blast ("gcgblast") 111 112=item * 113 114GCG FastA ("gcgfasta") 115 116=item * 117 118GDE ("gde") 119 120=item * 121 122Genbank ("genbank") 123 124=item * 125 126Genscan ("genscan") 127 128=item * 129 130GFF ("gff") 131 132=item * 133 134HMMER ("hmmer") 135 136=item * 137 138PAUP/NEXUS ("nexus") 139 140=item * 141 142Phrap assembly file ("phrap") 143 144=item * 145 146NBRF/PIR ("pir") 147 148=item * 149 150Mase ("mase") 151 152=item * 153 154Mega ("mega") 155 156=item * 157 158GCG/MSF ("msf") 159 160=item * 161 162Pfam ("pfam") 163 164=item * 165 166Phylip ("phylip") 167 168=item * 169 170Prodom ("prodom") 171 172=item * 173 174Raw ("raw") 175 176=item * 177 178RSF ("rsf") 179 180=item * 181 182Selex ("selex") 183 184=item * 185 186Stockholm ("stockholm") 187 188=item * 189 190Swissprot ("swiss") 191 192=item * 193 194Tab ("tab") 195 196=item * 197 198Variant Call Format ("vcf") 199 200=back 201 202=head1 FEEDBACK 203 204=head2 Mailing Lists 205 206User feedback is an integral part of the evolution of this and 207other Bioperl modules. Send your comments and suggestions 208preferably to one of the Bioperl mailing lists. Your 209participation is much appreciated. 210 211 bioperl-l@bioperl.org - General discussion 212 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 213 214=head2 Support 215 216Please direct usage questions or support issues to the mailing list: 217 218I<bioperl-l@bioperl.org> 219 220rather than to the module maintainer directly. Many experienced and 221reponsive experts will be able look at the problem and quickly 222address it. Please include a thorough description of the problem 223with code and data examples if at all possible. 224 225=head2 Reporting Bugs 226 227Report bugs to the Bioperl bug tracking system to help us 228keep track the bugs and their resolution. Bug reports can be 229submitted via the web: 230 231 https://github.com/bioperl/bioperl-live/issues 232 233=head1 AUTHOR 234 235Andreas KE<228>hE<228>ri, andreas.kahari@ebi.ac.uk 236 237=head1 CONTRIBUTORS 238 239Heikki LehvE<228>slaiho, heikki-at-bioperl-dot-org 240Mark A. Jensen, maj-at-fortinbras-dot-us 241 242=cut 243 244 245package Bio::Tools::GuessSeqFormat; 246$Bio::Tools::GuessSeqFormat::VERSION = '1.7.7'; 247use strict; 248use warnings; 249 250 251use base qw(Bio::Root::Root); 252 253=head1 METHODS 254 255Methods available to Bio::Tools::GuessSeqFormat objects 256are described below. Methods with names beginning with an 257underscore are considered to be internal. 258 259=cut 260 261=head2 new 262 263 Title : new 264 Usage : $guesser = Bio::Tools::GuessSeqFormat->new( ... ); 265 Function : Creates a new object. 266 Example : See SYNOPSIS. 267 Returns : A new object. 268 Arguments : -file The filename of the file whose format is to 269 be guessed, e.g. STDIN, or 270 -fh An already opened filehandle from which a text 271 stream may be read, or 272 -text A scalar containing one or several lines of 273 text with embedded newlines. 274 275 If more than one of the above arguments are given, they 276 are tested in the order -text, -file, -fh, and the first 277 available argument will be used. 278 279=cut 280 281sub new 282{ 283 my $class = shift; 284 my @args = @_; 285 286 my $self = $class->SUPER::new(@args); 287 288 my $attr; 289 my $value; 290 291 while (@args) { 292 $attr = shift @args; 293 $attr = lc $attr; 294 $value = shift @args; 295 $self->{$attr} = $value; 296 } 297 298 return $self; 299} 300 301=head2 file 302 303 Title : file 304 Usage : $guesser->file($filename); 305 $filename = $guesser->file; 306 Function : Gets or sets the current filename associated with 307 an object. 308 Returns : The new filename. 309 Arguments : The filename of the file whose format is to be 310 guessed. 311 312 A call to this method will clear the current filehandle and 313 the current lines of text associated with the object. 314 315=cut 316 317sub file 318{ 319 # Sets and/or returns the filename to use. 320 my $self = shift; 321 my $file = shift; 322 323 if (defined $file) { 324 # Set the active filename, and clear the filehandle and 325 # text line, if present. 326 $self->{-file} = $file; 327 $self->{-fh} = $self->{-text} = undef; 328 } 329 330 return $self->{-file}; 331} 332 333=head2 fh 334 335 Title : fh 336 Usage : $guesser->fh($filehandle); 337 $filehandle = $guesser->fh; 338 Function : Gets or sets the current filehandle associated with 339 an object. 340 Returns : The new filehandle. 341 Arguments : An already opened filehandle from which a text 342 stream may be read. 343 344 A call to this method will clear the current filename and 345 the current lines of text associated with the object. 346 347=cut 348 349sub fh 350{ 351 # Sets and/or returns the filehandle to use. 352 my $self = shift; 353 my $fh = shift; 354 355 if (defined $fh) { 356 # Set the active filehandle, and clear the filename and 357 # text line, if present. 358 $self->{-fh} = $fh; 359 $self->{-file} = $self->{-text} = undef; 360 } 361 362 return $self->{-fh}; 363} 364 365 366=head2 text 367 368 Title : text 369 Usage : $guesser->text($linesoftext); 370 $linesofext = $guesser->text; 371 Function : Gets or sets the current text associated with an 372 object. 373 Returns : The new lines of texts. 374 Arguments : A scalar containing one or several lines of text, 375 including embedded newlines. 376 377 A call to this method will clear the current filename and 378 the current filehandle associated with the object. 379 380=cut 381 382sub text 383{ 384 # Sets and/or returns the text lines to use. 385 my $self = shift; 386 my $text = shift; 387 388 if (defined $text) { 389 # Set the active text lines, and clear the filehandle 390 # and filename, if present. 391 $self->{-text} = $text; 392 $self->{-fh} = $self->{-file} = undef; 393 } 394 395 return $self->{-text}; 396} 397 398=head2 guess 399 400 Title : guess 401 Usage : $format = $guesser->guess; 402 @format = $guesser->guess; # if given a line of text 403 Function : Guesses the format of the data accociated with the 404 object. 405 Returns : A format string such as "swiss" or "pir". If a 406 format can not be found, undef is returned. 407 Arguments : None. 408 409 If the object is associated with a filehandle, the position 410 of the filehandle will be returned to its original position 411 before the method returns. 412 413=cut 414 415our %formats = ( 416 ace => { test => \&_possibly_ace }, 417 blast => { test => \&_possibly_blast }, 418 bowtie => { test => \&_possibly_bowtie }, 419 clustalw => { test => \&_possibly_clustalw }, 420 codata => { test => \&_possibly_codata }, 421 embl => { test => \&_possibly_embl }, 422 fasta => { test => \&_possibly_fasta }, 423 fastq => { test => \&_possibly_fastq }, 424 fastxy => { test => \&_possibly_fastxy }, 425 game => { test => \&_possibly_game }, 426 gcg => { test => \&_possibly_gcg }, 427 gcgblast => { test => \&_possibly_gcgblast }, 428 gcgfasta => { test => \&_possibly_gcgfasta }, 429 gde => { test => \&_possibly_gde }, 430 genbank => { test => \&_possibly_genbank }, 431 genscan => { test => \&_possibly_genscan }, 432 gff => { test => \&_possibly_gff }, 433 hmmer => { test => \&_possibly_hmmer }, 434 nexus => { test => \&_possibly_nexus }, 435 mase => { test => \&_possibly_mase }, 436 mega => { test => \&_possibly_mega }, 437 msf => { test => \&_possibly_msf }, 438 pfam => { test => \&_possibly_pfam }, 439 phrap => { test => \&_possibly_phrap }, 440 phylip => { test => \&_possibly_phylip }, 441 pir => { test => \&_possibly_pir }, 442 prodom => { test => \&_possibly_prodom }, 443 raw => { test => \&_possibly_raw }, 444 rsf => { test => \&_possibly_rsf }, 445 selex => { test => \&_possibly_selex }, 446 stockholm => { test => \&_possibly_stockholm }, 447 swiss => { test => \&_possibly_swiss }, 448 tab => { test => \&_possibly_tab }, 449 vcf => { test => \&_possibly_vcf }, 450); 451 452sub guess 453{ 454 my $self = shift; 455 456 while (my ($fmt_key) = each (%formats)) { 457 $formats{$fmt_key}{fmt_string} = $fmt_key; 458 } 459 460 my $fh; 461 my $start_pos; 462 if (defined $self->{-text}) { 463 # Break the text into separate lines. 464 my $text = $self->{-text}; 465 open $fh, '<', \$text or $self->throw("Could not read from string: $!"); 466 467 } elsif (defined $self->{-file}) { 468 # If given a filename, open the file. 469 my $file = $self->{-file}; 470 open $fh, '<', $file or $self->throw("Could not read file '$file': $!"); 471 472 } elsif (defined $self->{-fh}) { 473 # If given a filehandle, get the current position in the stream. 474 $fh = $self->{-fh}; 475 if (not seek $fh, 0, 1) { # seek to current position to determine seekability 476 # Work around non-seekable filehandles if IO::Scalar is available 477 # (adapted from http://www.perlmonks.org/?node_id=33587) 478 # IO::Mark may be an option for very large streams? 479 $self->throw("Need IO::Scalar to guess from unseekable filehandles") 480 if not eval { require IO::Scalar }; 481 my $data; 482 { local $/; $data = <$fh>; $.-- }; # copy raw data from fh 483 tie *$fh, 'IO::Scalar', my $s; # replace fh by scalar-tied fh 484 print $fh $data; # write raw data to tied fh 485 seek $fh, 0, 0; # return to start of tied fh 486 } 487 $start_pos = tell $fh; 488 } 489 490 my $done = 0; 491 my $lineno = 0; 492 my $guess; 493 while (!$done) { 494 my $line; # The next line of the file. 495 my $match = 0; # Number of possible formats of this line. 496 497 last if (!defined($line = <$fh>)); 498 next if ($line =~ /^\s*$/); # Skip white and empty lines. 499 chomp $line; 500 $line =~ s/\r$//; # Fix for DOS files on Unix. 501 ++$lineno; 502 503 while (my ($fmt_key, $fmt) = each (%formats)) { 504 if ($fmt->{test}($line, $lineno)) { 505 ++$match; 506 $guess = $fmt->{fmt_string}; 507 } 508 } 509 510 # We're done if there was only one match. 511 $done = ($match == 1); 512 } 513 514 if (defined $self->{-fh}) { 515 # Go back to original position in filehandle 516 seek $fh, $start_pos, 0 or $self->throw("Could not reset filehandle $fh: $!"); 517 } else { 518 # Close the filehandle we opened 519 close $fh; 520 } 521 return ($done ? $guess : undef); 522} 523 524=head1 HELPER SUBROUTINES 525 526All helper subroutines will, given a line of text and the line 527number of the same line, return 1 if the line possibly is from a 528file of the type that they perform a test of. 529 530A zero return value does not mean that the line is not part 531of a certain type of file, just that the test did not find any 532characteristics of that type of file in the line. 533 534=head2 _possibly_ace 535 536From bioperl test data, and from 537"http://www.isrec.isb-sib.ch/DEA/module8/B_Stevenson/Practicals/transcriptome_recon/transcriptome_recon.html". 538 539=cut 540 541sub _possibly_ace 542{ 543 my ($line, $lineno) = (shift, shift); 544 return ($line =~ /^(?:Sequence|Peptide|DNA|Protein) [":]/); 545} 546 547=head2 _possibly_blast 548 549 From various blast results. 550 551=cut 552 553sub _possibly_blast 554{ 555 my ($line, $lineno) = (shift, shift); 556 return ($lineno == 1 && 557 $line =~ /^[[:upper:]]*BLAST[[:upper:]]*.*\[.*\]$/); 558} 559 560=head2 _possibly_bowtie 561 562Contributed by kortsch. 563 564=cut 565 566sub _possibly_bowtie 567{ 568 my ($line, $lineno) = (shift, shift); 569 return ($line =~ /^[[:graph:]]+\t[-+]\t[[:graph:]]+\t\d+\t([[:alpha:]]+)\t([[:graph:]]+)\t\d+\t[[:graph:]]?/) 570 && length($1)==length($2); 571} 572 573=head2 _possibly_clustalw 574 575From "http://www.ebi.ac.uk/help/formats.html". 576 577=cut 578 579sub _possibly_clustalw 580{ 581 my ($line, $lineno) = (shift, shift); 582 return ($lineno == 1 && $line =~ /CLUSTAL/); 583} 584 585=head2 _possibly_codata 586 587From "http://www.ebi.ac.uk/help/formats.html". 588 589=cut 590 591sub _possibly_codata 592{ 593 my ($line, $lineno) = (shift, shift); 594 return (($lineno == 1 && $line =~ /^ENTRY/) || 595 ($lineno == 2 && $line =~ /^SEQUENCE/) || 596 $line =~ m{^(?:ENTRY|SEQUENCE|///)}); 597} 598 599=head2 _possibly_embl 600 601From 602"http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html#3.3". 603 604=cut 605 606sub _possibly_embl 607{ 608 my ($line, $lineno) = (shift, shift); 609 return ($lineno == 1 && $line =~ /^ID / && $line =~ /BP\.$/); 610} 611 612=head2 _possibly_fasta 613 614From "http://www.ebi.ac.uk/help/formats.html". 615 616=cut 617 618sub _possibly_fasta 619{ 620 my ($line, $lineno) = (shift, shift); 621 return (($lineno != 1 && $line =~ /^[A-IK-NP-Z]+$/i) || 622 $line =~ /^>\s*\w/); 623} 624 625=head2 _possibly_fastq 626 627From bioperl test data. 628 629=cut 630 631sub _possibly_fastq 632{ 633 my ($line, $lineno) = (shift, shift); 634 return ( ($lineno == 1 && $line =~ /^@/) || 635 ($lineno == 3 && $line =~ /^\+/) ); 636} 637 638=head2 _possibly_fastxy 639 640From bioperl test data. 641 642=cut 643 644sub _possibly_fastxy 645{ 646 my ($line, $lineno) = (shift, shift); 647 return (($lineno == 1 && $line =~ /^ FAST(?:XY|A)/) || 648 ($lineno == 2 && $line =~ /^ version \d/)); 649} 650 651=head2 _possibly_game 652 653From bioperl testdata. 654 655=cut 656 657sub _possibly_game 658{ 659 my ($line, $lineno) = (shift, shift); 660 return ($line =~ /^<!DOCTYPE game/); 661} 662 663=head2 _possibly_gcg 664 665From bioperl, Bio::SeqIO::gcg. 666 667=cut 668 669sub _possibly_gcg 670{ 671 my ($line, $lineno) = (shift, shift); 672 return ($line =~ /Length: .*Type: .*Check: .*\.\.$/); 673} 674 675=head2 _possibly_gcgblast 676 677From bioperl testdata. 678 679=cut 680 681sub _possibly_gcgblast 682{ 683 my ($line, $lineno) = (shift, shift); 684 return (($lineno == 1 && $line =~ /^!!SEQUENCE_LIST/) || 685 ($lineno == 2 && 686 $line =~ /^[[:upper:]]*BLAST[[:upper:]]*.*\[.*\]$/)); 687} 688 689=head2 _possibly_gcgfasta 690 691From bioperl testdata. 692 693=cut 694 695sub _possibly_gcgfasta 696{ 697 my ($line, $lineno) = (shift, shift); 698 return (($lineno == 1 && $line =~ /^!!SEQUENCE_LIST/) || 699 ($lineno == 2 && $line =~ /FASTA/)); 700} 701 702=head2 _possibly_gde 703 704From "http://www.ebi.ac.uk/help/formats.html". 705 706=cut 707 708sub _possibly_gde 709{ 710 my ($line, $lineno) = (shift, shift); 711 return ($line =~ /^[{}]$/ || 712 $line =~ /^(?:name|longname|sequence-ID| 713 creation-date|direction|strandedness| 714 type|offset|group-ID|creator|descrip| 715 comment|sequence)/x); 716} 717 718=head2 _possibly_genbank 719 720From "http://www.ebi.ac.uk/help/formats.html". 721Format of [apparantly optional] file header from 722"http://www.umdnj.edu/rcompweb/PA/Notes/GenbankFF.htm". (TODO: dead link) 723 724=cut 725 726sub _possibly_genbank 727{ 728 my ($line, $lineno) = (shift, shift); 729 return (($lineno == 1 && $line =~ /GENETIC SEQUENCE DATA BANK/) || 730 ($lineno == 1 && $line =~ /^LOCUS /) || 731 ($lineno == 2 && $line =~ /^DEFINITION /) || 732 ($lineno == 3 && $line =~ /^ACCESSION /)); 733} 734 735=head2 _possibly_genscan 736 737From bioperl test data. 738 739=cut 740 741sub _possibly_genscan 742{ 743 my ($line, $lineno) = (shift, shift); 744 return (($lineno == 1 && $line =~ /^GENSCAN.*Date.*Time/) || 745 ($line =~ /^(?:Sequence\s+\w+|Parameter matrix|Predicted genes)/)); 746} 747 748=head2 _possibly_gff 749 750From bioperl test data. 751 752=cut 753 754sub _possibly_gff 755{ 756 my ($line, $lineno) = (shift, shift); 757 return (($lineno == 1 && $line =~ /^##gff-version/) || 758 ($lineno == 2 && $line =~ /^##date/)); 759} 760 761=head2 _possibly_hmmer 762 763From bioperl test data. 764 765=cut 766 767sub _possibly_hmmer 768{ 769 my ($line, $lineno) = (shift, shift); 770 return (($lineno == 2 && $line =~ /^HMMER/) || 771 ($lineno == 3 && 772 $line =~ /Washington University School of Medicine/)); 773} 774 775=head2 _possibly_nexus 776 777From "http://paup.csit.fsu.edu/nfiles.html". 778 779=cut 780 781sub _possibly_nexus 782{ 783 my ($line, $lineno) = (shift, shift); 784 return ($lineno == 1 && $line =~ /^#NEXUS/); 785} 786 787=head2 _possibly_mase 788 789From bioperl test data. 790More detail from "http://www.umdnj.edu/rcompweb/PA/Notes/GenbankFF.htm" (TODO: dead link) 791 792=cut 793 794sub _possibly_mase 795{ 796 my ($line, $lineno) = (shift, shift); 797 return (($lineno == 1 && $line =~ /^;;/) || 798 ($lineno > 1 && $line =~ /^;[^;]?/)); 799} 800 801=head2 _possibly_mega 802 803From the ensembl broswer (AlignView data export). 804 805=cut 806 807sub _possibly_mega 808{ 809 my ($line, $lineno) = (shift, shift); 810 return ($lineno == 1 && $line =~ /^#mega$/); 811} 812 813 814=head2 _possibly_msf 815 816From "http://www.ebi.ac.uk/help/formats.html". 817 818=cut 819 820sub _possibly_msf 821{ 822 my ($line, $lineno) = (shift, shift); 823 return ($line =~ m{^//} || 824 $line =~ /MSF:.*Type:.*Check:|Name:.*Len:/); 825} 826 827=head2 _possibly_phrap 828 829From "http://biodata.ccgb.umn.edu/docs/contigimage.html". (TODO: dead link) 830From "http://genetics.gene.cwru.edu/gene508/Lec6.htm". (TODO: dead link) 831From bioperl test data ("*.ace.1" files). 832 833=cut 834 835sub _possibly_phrap 836{ 837 my ($line, $lineno) = (shift, shift); 838 return ($line =~ /^(?:AS\ |CO\ Contig|BQ|AF\ |BS\ |RD\ | 839 QA\ |DS\ |RT\{)/x); 840} 841 842=head2 _possibly_pir 843 844From "http://www.ebi.ac.uk/help/formats.html". 845The ".,()" spotted in bioperl test data. 846 847=cut 848 849sub _possibly_pir # "NBRF/PIR" (?) 850{ 851 my ($line, $lineno) = (shift, shift); 852 return (($lineno != 1 && $line =~ /^[\sA-IK-NP-Z.,()]+\*?$/i) || 853 $line =~ /^>(?:P1|F1|DL|DC|RL|RC|N3|N1);/); 854} 855 856=head2 _possibly_pfam 857 858From bioperl test data. 859 860=cut 861 862sub _possibly_pfam 863{ 864 my ($line, $lineno) = (shift, shift); 865 return ($line =~ m{^\w+/\d+-\d+\s+[A-IK-NP-Z.]+}i); 866} 867 868=head2 _possibly_phylip 869 870From "http://www.ebi.ac.uk/help/formats.html". Initial space 871allowed on first line (spotted in ensembl AlignView exported 872data). 873 874=cut 875 876sub _possibly_phylip 877{ 878 my ($line, $lineno) = (shift, shift); 879 return (($lineno == 1 && $line =~ /^\s*\d+\s\d+/) || 880 ($lineno == 2 && $line =~ /^\w\s+[A-IK-NP-Z\s]+/) || 881 ($lineno == 3 && $line =~ /(?:^\w\s+[A-IK-NP-Z\s]+|\s+[A-IK-NP-Z\s]+)/) 882 ); 883} 884 885=head2 _possibly_prodom 886 887From "http://prodom.prabi.fr/prodom/current/documentation/data.php". 888 889=cut 890 891sub _possibly_prodom 892{ 893 my ($line, $lineno) = (shift, shift); 894 return ($lineno == 1 && $line =~ /^ID / && $line =~ /\d+ seq\.$/); 895} 896 897=head2 _possibly_raw 898 899From "http://www.ebi.ac.uk/help/formats.html". 900 901=cut 902 903sub _possibly_raw 904{ 905 my ($line, $lineno) = (shift, shift); 906 return ($line =~ /^[A-Za-z\s]+$/); 907} 908 909=head2 _possibly_rsf 910 911From "http://www.ebi.ac.uk/help/formats.html". 912 913=cut 914 915sub _possibly_rsf 916{ 917 my ($line, $lineno) = (shift, shift); 918 return (($lineno == 1 && $line =~ /^!!RICH_SEQUENCE/) || 919 $line =~ /^[{}]$/ || 920 $line =~ /^(?:name|type|longname| 921 checksum|creation-date|strand|sequence)/x); 922} 923 924=head2 _possibly_selex 925 926From "http://www.ebc.ee/WWW/hmmer2-html/node27.html". 927 928Assuming presence of Selex file header. Data exported by 929Bioperl on Pfam and Selex formats are identical, but Pfam file 930only holds one alignment. 931 932=cut 933 934sub _possibly_selex 935{ 936 my ($line, $lineno) = (shift, shift); 937 return (($lineno == 1 && $line =~ /^#=ID /) || 938 ($lineno == 2 && $line =~ /^#=AC /) || 939 ($line =~ /^#=SQ /)); 940} 941 942=head2 _possibly_stockholm 943 944From bioperl test data. 945 946=cut 947 948sub _possibly_stockholm 949{ 950 my ($line, $lineno) = (shift, shift); 951 return (($lineno == 1 && $line =~ /^# STOCKHOLM/) || 952 $line =~ /^#=(?:GF|GS) /); 953} 954 955 956 957=head2 _possibly_swiss 958 959From "http://ca.expasy.org/sprot/userman.html#entrystruc". 960 961=cut 962 963sub _possibly_swiss 964{ 965 my ($line, $lineno) = (shift, shift); 966 return ($lineno == 1 && $line =~ /^ID / && $line =~ /AA\.$/); 967} 968 969=head2 _possibly_tab 970 971Contributed by Heikki. 972 973=cut 974 975sub _possibly_tab 976{ 977 my ($line, $lineno) = (shift, shift); 978 return ($lineno == 1 && $line =~ /^[^\t]+\t[^\t]+/) ; 979} 980 981=head2 _possibly_vcf 982 983From "http://www.1000genomes.org/wiki/analysis/vcf4.0". 984 985Assumptions made about sanity - format and date lines are line 1 and 2 986respectively. This is not specified in the format document. 987 988=cut 989 990sub _possibly_vcf 991{ 992 my ($line, $lineno) = (shift, shift); 993 return (($lineno == 1 && $line =~ /##fileformat=VCFv/) || 994 ($lineno == 2 && $line =~ /##fileDate=/)); 995} 996 997 998 9991; 1000