1#------------------------------------------------------------------
2#
3# BioPerl module Bio::Tools::GuessSeqFormat
4#
5# Please direct questions and support issues to <bioperl-l@bioperl.org>
6#
7# Cared for by Andreas Kähäri, andreas.kahari@ebi.ac.uk
8#
9# You may distribute this module under the same terms as perl itself
10#------------------------------------------------------------------
11
12=encoding utf-8
13
14=head1 NAME
15
16Bio::Tools::GuessSeqFormat - Module for determining the sequence
17format of the contents of a file, a string, or through a
18filehandle.
19
20=head1 SYNOPSIS
21
22    # To guess the format of a flat file, given a filename:
23    my $guesser = Bio::Tools::GuessSeqFormat->new( -file => $filename );
24    my $format  = $guesser->guess;
25
26    # To guess the format from an already open filehandle:
27    my $guesser = Bio::Tools::GuessSeqFormat->new( -fh => $filehandle );
28    my $format  = $guesser->guess;
29    # The filehandle will be returned to its original position. Note that this
30    # filehandle can be STDIN.
31
32    # To guess the format of one or several lines of text (with
33    # embedded newlines):
34    my $guesser = Bio::Tools::GuessSeqFormat->new( -text => $linesoftext );
35    my $format = $guesser->guess;
36
37    # To create a Bio::Tools::GuessSeqFormat object and set the
38    # filename, filehandle, or line to parse afterwards:
39    my $guesser = Bio::Tools::GuessSeqFormat->new();
40    $guesser->file($filename);
41    $guesser->fh($filehandle);
42    $guesser->text($linesoftext);
43
44    # To guess in one go, given e.g. a filename:
45    my $format = Bio::Tools::GuessSeqFormat->new( -file => $filename )->guess;
46
47=head1 DESCRIPTION
48
49Bio::Tools::GuessSeqFormat tries to guess the format ("swiss",
50"pir", "fasta" etc.) of the sequence or MSA in a file, in a
51scalar, or through a filehandle.
52
53The guess() method of a Bio::Tools::GuessSeqFormat object will
54examine the data, line by line, until it finds a line to which
55only one format can be assigned.  If no conclusive guess can be
56made, undef is returned.
57
58If the Bio::Tools::GuessSeqFormat object is given a filehandle,
59e.g. STDIN, it will be restored to its original position on
60return from the guess() method.
61
62=head2 Formats
63
64Tests are currently implemented for the following formats:
65
66=over
67
68=item *
69
70ACeDB ("ace")
71
72=item *
73
Blast ("blast")

=item *

Bowtie ("bowtie")
75
76=item *
77
78ClustalW ("clustalw")
79
80=item *
81
82Codata ("codata")
83
84=item *
85
86EMBL ("embl")
87
88=item *
89
90FastA sequence ("fasta")
91
92=item *
93
94FastQ sequence ("fastq")
95
96=item *
97
98FastXY/FastA alignment ("fastxy")
99
100=item *
101
102Game XML ("game")
103
104=item *
105
106GCG ("gcg")
107
108=item *
109
110GCG Blast ("gcgblast")
111
112=item *
113
114GCG FastA ("gcgfasta")
115
116=item *
117
118GDE ("gde")
119
120=item *
121
122Genbank ("genbank")
123
124=item *
125
126Genscan ("genscan")
127
128=item *
129
130GFF ("gff")
131
132=item *
133
134HMMER ("hmmer")
135
136=item *
137
138PAUP/NEXUS ("nexus")
139
140=item *
141
142Phrap assembly file ("phrap")
143
144=item *
145
146NBRF/PIR ("pir")
147
148=item *
149
150Mase ("mase")
151
152=item *
153
154Mega ("mega")
155
156=item *
157
158GCG/MSF ("msf")
159
160=item *
161
162Pfam ("pfam")
163
164=item *
165
166Phylip ("phylip")
167
168=item *
169
170Prodom ("prodom")
171
172=item *
173
174Raw ("raw")
175
176=item *
177
178RSF ("rsf")
179
180=item *
181
182Selex ("selex")
183
184=item *
185
186Stockholm ("stockholm")
187
188=item *
189
190Swissprot ("swiss")
191
192=item *
193
194Tab ("tab")
195
196=item *
197
198Variant Call Format ("vcf")
199
200=back
201
202=head1 FEEDBACK
203
204=head2 Mailing Lists
205
206User feedback is an integral part of the evolution of this and
207other Bioperl modules.  Send your comments and suggestions
208preferably to one of the Bioperl mailing lists.  Your
209participation is much appreciated.
210
211  bioperl-l@bioperl.org                  - General discussion
212  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
213
214=head2 Support
215
216Please direct usage questions or support issues to the mailing list:
217
218I<bioperl-l@bioperl.org>
219
220rather than to the module maintainer directly. Many experienced and
responsive experts will be able to look at the problem and quickly
222address it. Please include a thorough description of the problem
223with code and data examples if at all possible.
224
225=head2 Reporting Bugs
226
227Report bugs to the Bioperl bug tracking system to help us
keep track of the bugs and their resolution.  Bug reports can be
229submitted via the web:
230
231  https://github.com/bioperl/bioperl-live/issues
232
233=head1 AUTHOR
234
235Andreas KE<228>hE<228>ri, andreas.kahari@ebi.ac.uk
236
237=head1 CONTRIBUTORS
238
239Heikki LehvE<228>slaiho, heikki-at-bioperl-dot-org
240Mark A. Jensen, maj-at-fortinbras-dot-us
241
242=cut
243
244
245package Bio::Tools::GuessSeqFormat;
246$Bio::Tools::GuessSeqFormat::VERSION = '1.7.7';
247use strict;
248use warnings;
249
250
251use base qw(Bio::Root::Root);
252
253=head1 METHODS
254
255Methods available to Bio::Tools::GuessSeqFormat objects
256are described below.  Methods with names beginning with an
257underscore are considered to be internal.
258
259=cut
260
261=head2 new
262
263 Title      : new
264 Usage      : $guesser = Bio::Tools::GuessSeqFormat->new( ... );
265 Function   : Creates a new object.
266 Example    : See SYNOPSIS.
267 Returns    : A new object.
 Arguments  : -file The filename of the file whose format is to
                    be guessed, or
              -fh   An already opened filehandle from which a text
                    stream may be read, e.g. STDIN, or
              -text A scalar containing one or several lines of
                    text with embedded newlines.
274
275    If more than one of the above arguments are given, they
276    are tested in the order -text, -file, -fh, and the first
277    available argument will be used.
278
279=cut
280
sub new
{
    # Constructor.  The argument list is passed unchanged to the parent
    # class constructor; every attribute/value pair is then also stored
    # on the object under the lower-cased attribute name (e.g. "-file").
    my ($class, @args) = @_;

    my $self = $class->SUPER::new(@args);

    # Consume the arguments two at a time: attribute name, then value.
    while (@args) {
        my $attr  = lc shift @args;
        my $value = shift @args;
        $self->{$attr} = $value;
    }

    return $self;
}
300
301=head2 file
302
303 Title      : file
304 Usage      : $guesser->file($filename);
305              $filename = $guesser->file;
306 Function   : Gets or sets the current filename associated with
307              an object.
308 Returns    : The new filename.
309 Arguments  : The filename of the file whose format is to be
310              guessed.
311
312    A call to this method will clear the current filehandle and
313    the current lines of text associated with the object.
314
315=cut
316
sub file
{
    # Combined getter/setter for the filename to read from.
    my ($self, $new_file) = @_;

    # Without a defined argument this is a plain getter.
    return $self->{-file} unless defined $new_file;

    # Setting a filename clears any filehandle or text set earlier, so
    # that the filename becomes the only active input source.
    @{$self}{qw(-fh -text)} = (undef, undef);
    $self->{-file} = $new_file;

    return $self->{-file};
}
332
333=head2 fh
334
335 Title      : fh
336 Usage      : $guesser->fh($filehandle);
337              $filehandle = $guesser->fh;
338 Function   : Gets or sets the current filehandle associated with
339              an object.
340 Returns    : The new filehandle.
341 Arguments  : An already opened filehandle from which a text
342              stream may be read.
343
344    A call to this method will clear the current filename and
345    the current lines of text associated with the object.
346
347=cut
348
sub fh
{
    # Combined getter/setter for the filehandle to read from.
    my ($self, $new_fh) = @_;

    # Without a defined argument this is a plain getter.
    return $self->{-fh} unless defined $new_fh;

    # Setting a filehandle clears any filename or text set earlier, so
    # that the filehandle becomes the only active input source.
    @{$self}{qw(-file -text)} = (undef, undef);
    $self->{-fh} = $new_fh;

    return $self->{-fh};
}
364
365
366=head2 text
367
368 Title      : text
369 Usage      : $guesser->text($linesoftext);
              $linesoftext = $guesser->text;
371 Function   : Gets or sets the current text associated with an
372              object.
 Returns    : The new lines of text.
374 Arguments  : A scalar containing one or several lines of text,
375              including embedded newlines.
376
377    A call to this method will clear the current filename and
378    the current filehandle associated with the object.
379
380=cut
381
sub text
{
    # Combined getter/setter for the text (one or more lines, with
    # embedded newlines) to examine.
    my ($self, $new_text) = @_;

    # Without a defined argument this is a plain getter.
    return $self->{-text} unless defined $new_text;

    # Setting the text clears any filehandle or filename set earlier, so
    # that the text becomes the only active input source.
    @{$self}{qw(-fh -file)} = (undef, undef);
    $self->{-text} = $new_text;

    return $self->{-text};
}
397
398=head2 guess
399
400 Title      : guess
401 Usage      : $format = $guesser->guess;
402              @format = $guesser->guess; # if given a line of text
 Function   : Guesses the format of the data associated with the
404              object.
405 Returns    : A format string such as "swiss" or "pir".  If a
406              format can not be found, undef is returned.
407 Arguments  : None.
408
409    If the object is associated with a filehandle, the position
410    of the filehandle will be returned to its original position
411    before the method returns.
412
413=cut
414
# Dispatch table used by guess(): maps each format name that can be
# reported to a hash holding, under the key "test", a reference to the
# helper subroutine for that format.  Every helper is named
# "_possibly_<format name>", so the table is generated from the list of
# names.  (Taking a code reference by name is allowed under strict refs.)
our %formats = map {
    my $format_name = $_;
    ( $format_name => { test => \&{"_possibly_$format_name"} } );
} qw(
    ace         blast       bowtie      clustalw    codata      embl
    fasta       fastq       fastxy      game        gcg         gcgblast
    gcgfasta    gde         genbank     genscan     gff         hmmer
    nexus       mase        mega        msf         pfam        phrap
    phylip      pir         prodom      raw         rsf         selex
    stockholm   swiss       tab         vcf
);
451
sub guess
{
    # Guess the format of the input associated with this object.
    #
    # The input source is chosen in the order -text, -file, -fh.  Lines
    # are read one at a time and offered to every test in %formats; the
    # search stops at the first non-blank line that exactly one test
    # accepts.
    #
    # Arguments : none.
    # Returns   : the name of the guessed format, or undef if the input
    #             ran out before a line matched exactly one format.
    # Throws    : via $self->throw() if no input source has been set, if
    #             the text or file cannot be opened, if an unseekable
    #             filehandle is given and IO::Scalar cannot be loaded,
    #             or if a caller-supplied filehandle cannot be restored
    #             to its starting position.
    my $self = shift;

    # Store each format's name next to its test subroutine.  keys() is
    # used rather than each() so that the loop never depends on where a
    # previous iteration over this package-level hash left off.
    for my $fmt_key (keys %formats) {
        $formats{$fmt_key}{fmt_string} = $fmt_key;
    }

    my $fh;
    my $start_pos;
    my $fh_is_callers = 0;  # True only when reading from the caller's -fh.
    if (defined $self->{-text}) {
        # Read the text line by line through an in-memory filehandle.
        my $text = $self->{-text};
        open $fh, '<', \$text or $self->throw("Could not read from string: $!");

    } elsif (defined $self->{-file}) {
        # If given a filename, open the file.
        my $file = $self->{-file};
        open $fh, '<', $file or $self->throw("Could not read file '$file': $!");

    } elsif (defined $self->{-fh}) {
        # If given a filehandle, get the current position in the stream.
        $fh = $self->{-fh};
        $fh_is_callers = 1;
        if (not seek $fh, 0, 1) { # seek to current position to determine seekability
            # Work around non-seekable filehandles if IO::Scalar is available
            # (adapted from http://www.perlmonks.org/?node_id=33587)
            # IO::Mark may be an option for very large streams?
            $self->throw("Need IO::Scalar to guess from unseekable filehandles")
                if not eval { require IO::Scalar };
            my $data;
            { local $/; $data = <$fh>; $.-- };  # copy raw data from fh
            tie *$fh, 'IO::Scalar', my $s;      # replace fh by scalar-tied fh
            print $fh $data;                    # write raw data to tied fh
            seek $fh, 0, 0;                     # return to start of tied fh
        }
        $start_pos = tell $fh;

    } else {
        # Nothing to read from.  Fail with a clear message instead of
        # going on to read from (and close) an undefined filehandle.
        $self->throw("No text, filename or filehandle to guess the format of");
    }

    my $done  = 0;
    my $lineno = 0;
    my $guess;
    while (!$done) {
        my $line;       # The next line of the file.
        my $match = 0;  # Number of possible formats of this line.

        last if (!defined($line = <$fh>));
        next if ($line =~ /^\s*$/); # Skip white and empty lines.
        chomp $line;
        $line =~ s/\r$//;   # Fix for DOS files on Unix.
        ++$lineno;          # Counts non-blank lines only.

        for my $fmt (values %formats) {
            if ($fmt->{test}($line, $lineno)) {
                ++$match;
                $guess = $fmt->{fmt_string};
            }
        }

        # We're done if there was only one match.
        $done = ($match == 1);
    }

    # Clean up according to the source that was actually read.  Testing
    # $self->{-fh} here would be wrong when the object also carries -text
    # or -file: those take precedence, so the handle in $fh is then our
    # own and must be closed, and $start_pos was never set.
    if ($fh_is_callers) {
        # Go back to original position in filehandle
        seek $fh, $start_pos, 0 or $self->throw("Could not reset filehandle $fh: $!");
    } else {
        # Close the filehandle we opened
        close $fh;
    }
    return ($done ? $guess : undef);
}
523
524=head1 HELPER SUBROUTINES
525
526All helper subroutines will, given a line of text and the line
527number of the same line, return 1 if the line possibly is from a
528file of the type that they perform a test of.
529
530A zero return value does not mean that the line is not part
531of a certain type of file, just that the test did not find any
532characteristics of that type of file in the line.
533
534=head2 _possibly_ace
535
536From bioperl test data, and from
537"http://www.isrec.isb-sib.ch/DEA/module8/B_Stevenson/Practicals/transcriptome_recon/transcriptome_recon.html".
538
539=cut
540
sub _possibly_ace
{
    # True if the line starts with one of the words Sequence, Peptide,
    # DNA or Protein, followed by a space and then a double quote or a
    # colon.  The line number (second argument) is not consulted.
    my ($text) = @_;

    return $text =~ /^(?:Sequence|Peptide|DNA|Protein) [":]/;
}
546
547=head2 _possibly_blast
548
549 From various blast results.
550
551=cut
552
sub _possibly_blast
{
    # True only for line 1, and only if it contains "BLAST" (optionally
    # surrounded by upper-case letters) at its start and ends with a
    # bracketed section.
    my ($text, $n) = @_;

    my $on_first_line = ($n == 1);
    return ($on_first_line &&
        $text =~ /^[[:upper:]]*BLAST[[:upper:]]*.*\[.*\]$/);
}
559
560=head2 _possibly_bowtie
561
562Contributed by kortsch.
563
564=cut
565
sub _possibly_bowtie
{
    # True if the line has the expected tab-separated layout and its two
    # captured fields (the alphabetic field and the field following it)
    # are of equal length.  The line number is not consulted.
    my ($text) = @_;

    my $has_layout = ($text =~ /^[[:graph:]]+\t[-+]\t[[:graph:]]+\t\d+\t([[:alpha:]]+)\t([[:graph:]]+)\t\d+\t[[:graph:]]?/);

    # The captures are only inspected when the match succeeded.
    return $has_layout && length($1)==length($2);
}
572
573=head2 _possibly_clustalw
574
575From "http://www.ebi.ac.uk/help/formats.html".
576
577=cut
578
sub _possibly_clustalw
{
    # True only for line 1, and only if it contains the word "CLUSTAL"
    # anywhere.
    my ($text, $n) = @_;

    my $on_first_line = ($n == 1);
    return ($on_first_line && $text =~ /CLUSTAL/);
}
584
585=head2 _possibly_codata
586
587From "http://www.ebi.ac.uk/help/formats.html".
588
589=cut
590
sub _possibly_codata
{
    # True if the line begins with "ENTRY", "SEQUENCE" or "///".
    #
    # Arguments : the line of text, followed by its line number.  The
    #             line number is accepted so that the call signature
    #             matches the other helpers, but it is not used.
    # Returns   : a true value if the line matches, a false one otherwise.
    #
    # This used to test additionally for ENTRY on line 1 and SEQUENCE on
    # line 2.  Any line passing those tests also matches the pattern
    # below, so the line-number tests could never change the result and
    # have been dropped.
    my ($line) = @_;

    return ($line =~ m{^(?:ENTRY|SEQUENCE|///)});
}
598
599=head2 _possibly_embl
600
601From
602"http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html#3.3".
603
604=cut
605
sub _possibly_embl
{
    # True only for line 1, and only if it starts with "ID" plus three
    # spaces and ends with "BP.".
    my ($text, $n) = @_;

    my $is_id_line = ($n == 1 && $text =~ /^ID   /);
    return ($is_id_line && $text =~ /BP\.$/);
}
611
612=head2 _possibly_fasta
613
614From "http://www.ebi.ac.uk/help/formats.html".
615
616=cut
617
sub _possibly_fasta
{
    # True for a ">" header line (on any line number), or for a line
    # other than line 1 made up solely of the letters A-I, K-N and P-Z
    # in either case.
    my ($text, $n) = @_;

    my $is_letters_line = ($n != 1 && $text =~ /^[A-IK-NP-Z]+$/i);
    return ($is_letters_line ||
            $text =~ /^>\s*\w/);
}
624
625=head2 _possibly_fastq
626
627From bioperl test data.
628
629=cut
630
sub _possibly_fastq
{
    # True if line 1 starts with "@" or line 3 starts with "+".
    my ($text, $n) = @_;

    my $is_at_line = ($n == 1 && $text =~ /^@/);
    return ( $is_at_line ||
             ($n == 3 && $text =~ /^\+/) );
}
637
638=head2 _possibly_fastxy
639
640From bioperl test data.
641
642=cut
643
sub _possibly_fastxy
{
    # True if line 1 starts with " FASTXY" or " FASTA" (note the leading
    # space), or line 2 starts with " version " followed by a digit.
    my ($text, $n) = @_;

    my $is_banner = ($n == 1 && $text =~ /^ FAST(?:XY|A)/);
    return ($is_banner ||
            ($n == 2 && $text =~ /^ version \d/));
}
650
651=head2 _possibly_game
652
653From bioperl testdata.
654
655=cut
656
sub _possibly_game
{
    # True if the line starts with the text "<!DOCTYPE game".  The line
    # number is not consulted.
    my ($text) = @_;

    return $text =~ /^<!DOCTYPE game/;
}
662
663=head2 _possibly_gcg
664
665From bioperl, Bio::SeqIO::gcg.
666
667=cut
668
sub _possibly_gcg
{
    # True if the line contains "Length: ", "Type: " and "Check: " in
    # that order and ends with "..".  The line number is not consulted.
    my ($text) = @_;

    return $text =~ /Length: .*Type: .*Check: .*\.\.$/;
}
674
675=head2 _possibly_gcgblast
676
677From bioperl testdata.
678
679=cut
680
sub _possibly_gcgblast
{
    # True if line 1 starts with "!!SEQUENCE_LIST", or if line 2 looks
    # like a BLAST banner ("BLAST" optionally surrounded by upper-case
    # letters at the start, and a bracketed section at the end).
    my ($text, $n) = @_;

    my $is_list_marker = ($n == 1 && $text =~ /^!!SEQUENCE_LIST/);
    return ($is_list_marker ||
            ($n == 2 &&
             $text =~ /^[[:upper:]]*BLAST[[:upper:]]*.*\[.*\]$/));
}
688
689=head2 _possibly_gcgfasta
690
691From bioperl testdata.
692
693=cut
694
sub _possibly_gcgfasta
{
    # True if line 1 starts with "!!SEQUENCE_LIST", or if line 2
    # contains the word "FASTA" anywhere.
    my ($text, $n) = @_;

    my $is_list_marker = ($n == 1 && $text =~ /^!!SEQUENCE_LIST/);
    return ($is_list_marker ||
            ($n == 2 && $text =~ /FASTA/));
}
701
702=head2 _possibly_gde
703
704From "http://www.ebi.ac.uk/help/formats.html".
705
706=cut
707
sub _possibly_gde
{
    # True if the line consists of a single "{" or "}", or starts with
    # one of the field names listed in the pattern below.  The line
    # number is not consulted.
    my ($text) = @_;

    my $is_lone_brace = ($text =~ /^[{}]$/);
    return ($is_lone_brace ||
            $text =~ /^(?:name|longname|sequence-ID|
                          creation-date|direction|strandedness|
                          type|offset|group-ID|creator|descrip|
                          comment|sequence)/x);
}
717
718=head2 _possibly_genbank
719
720From "http://www.ebi.ac.uk/help/formats.html".
Format of [apparently optional] file header from
722"http://www.umdnj.edu/rcompweb/PA/Notes/GenbankFF.htm". (TODO: dead link)
723
724=cut
725
sub _possibly_genbank
{
    # True if line 1 contains "GENETIC SEQUENCE DATA BANK" or starts with
    # "LOCUS ", if line 2 starts with "DEFINITION ", or if line 3 starts
    # with "ACCESSION ".
    my ($text, $n) = @_;

    my $line1_hit = ($n == 1 && $text =~ /GENETIC SEQUENCE DATA BANK/) ||
                    ($n == 1 && $text =~ /^LOCUS /);
    my $line2_hit = ($n == 2 && $text =~ /^DEFINITION /);

    return ($line1_hit || $line2_hit ||
            ($n == 3 && $text =~ /^ACCESSION /));
}
734
735=head2 _possibly_genscan
736
737From bioperl test data.
738
739=cut
740
sub _possibly_genscan
{
    # True if line 1 starts with "GENSCAN" and goes on to mention "Date"
    # and then "Time", or if any line starts with "Sequence <word>",
    # "Parameter matrix" or "Predicted genes".
    my ($text, $n) = @_;

    my $is_banner = ($n == 1 && $text =~ /^GENSCAN.*Date.*Time/);
    return ($is_banner ||
            ($text =~ /^(?:Sequence\s+\w+|Parameter matrix|Predicted genes)/));
}
747
748=head2 _possibly_gff
749
750From bioperl test data.
751
752=cut
753
sub _possibly_gff
{
    # True if line 1 starts with "##gff-version" or line 2 starts with
    # "##date".
    my ($text, $n) = @_;

    my $is_version_line = ($n == 1 && $text =~ /^##gff-version/);
    return ($is_version_line ||
            ($n == 2 && $text =~ /^##date/));
}
760
761=head2 _possibly_hmmer
762
763From bioperl test data.
764
765=cut
766
sub _possibly_hmmer
{
    # True if line 2 starts with "HMMER", or line 3 contains
    # "Washington University School of Medicine".
    my ($text, $n) = @_;

    my $is_program_line = ($n == 2 && $text =~ /^HMMER/);
    return ($is_program_line ||
            ($n == 3 &&
             $text =~ /Washington University School of Medicine/));
}
774
775=head2 _possibly_nexus
776
777From "http://paup.csit.fsu.edu/nfiles.html".
778
779=cut
780
sub _possibly_nexus
{
    # True only for line 1, and only if it starts with "#NEXUS".
    my ($text, $n) = @_;

    my $on_first_line = ($n == 1);
    return ($on_first_line && $text =~ /^#NEXUS/);
}
786
787=head2 _possibly_mase
788
789From bioperl test data.
790More detail from "http://www.umdnj.edu/rcompweb/PA/Notes/GenbankFF.htm" (TODO: dead link)
791
792=cut
793
sub _possibly_mase
{
    # True if line 1 starts with ";;", or if any later line starts with
    # ";".
    my ($text, $n) = @_;

    my $is_opening_line = ($n == 1 && $text =~ /^;;/);
    return ($is_opening_line ||
            ($n > 1 && $text =~ /^;[^;]?/));
}
800
801=head2 _possibly_mega
802
From the ensembl browser (AlignView data export).
804
805=cut
806
sub _possibly_mega
{
    # True only for line 1, and only if the whole line is "#mega".
    my ($text, $n) = @_;

    my $on_first_line = ($n == 1);
    return ($on_first_line && $text =~ /^#mega$/);
}
812
813
814=head2 _possibly_msf
815
816From "http://www.ebi.ac.uk/help/formats.html".
817
818=cut
819
sub _possibly_msf
{
    # True if the line starts with "//", contains "MSF:", "Type:" and
    # "Check:" in that order, or contains "Name:" followed later by
    # "Len:".  The line number is not consulted.
    my ($text) = @_;

    my $is_double_slash = ($text =~ m{^//});
    return ($is_double_slash ||
            $text =~ /MSF:.*Type:.*Check:|Name:.*Len:/);
}
826
827=head2 _possibly_phrap
828
829From "http://biodata.ccgb.umn.edu/docs/contigimage.html". (TODO: dead link)
830From "http://genetics.gene.cwru.edu/gene508/Lec6.htm".    (TODO: dead link)
831From bioperl test data ("*.ace.1" files).
832
833=cut
834
sub _possibly_phrap
{
    # True if the line starts with one of the tags listed in the pattern
    # below.  The pattern is written with /x, so literal spaces are
    # escaped.  The line number is not consulted.
    my ($text) = @_;

    return $text =~ /^(?:AS\ |CO\ Contig|BQ|AF\ |BS\ |RD\ |
                          QA\ |DS\ |RT\{)/x;
}
841
842=head2 _possibly_pir
843
844From "http://www.ebi.ac.uk/help/formats.html".
845The ".,()" spotted in bioperl test data.
846
847=cut
848
sub _possibly_pir # "NBRF/PIR" (?)
{
    # True for a header line starting with ">" and one of the two-letter
    # codes in the pattern below followed by ";", or for a line other
    # than line 1 made up of whitespace, the letters A-I, K-N and P-Z
    # (either case) and the characters ".,()", optionally ending in "*".
    my ($text, $n) = @_;

    my $is_body_line = ($n != 1 && $text =~ /^[\sA-IK-NP-Z.,()]+\*?$/i);
    return ($is_body_line ||
            $text =~ /^>(?:P1|F1|DL|DC|RL|RC|N3|N1);/);
}
855
856=head2 _possibly_pfam
857
858From bioperl test data.
859
860=cut
861
sub _possibly_pfam
{
    # True if the line starts with "<word>/<digits>-<digits>", then
    # whitespace, then at least one of the letters A-I, K-N, P-Z (either
    # case) or ".".  The line number is not consulted.
    my ($text) = @_;

    return $text =~ m{^\w+/\d+-\d+\s+[A-IK-NP-Z.]+}i;
}
867
868=head2 _possibly_phylip
869
870From "http://www.ebi.ac.uk/help/formats.html".  Initial space
871allowed on first line (spotted in ensembl AlignView exported
872data).
873
874=cut
875
sub _possibly_phylip
{
    # True if line 1 starts (after optional whitespace) with two numbers
    # separated by one whitespace character, or if line 2 or line 3
    # matches the respective pattern below.
    my ($text, $n) = @_;

    my $line1_hit = ($n == 1 && $text =~ /^\s*\d+\s\d+/);
    my $line2_hit = ($n == 2 && $text =~ /^\w\s+[A-IK-NP-Z\s]+/);

    return ($line1_hit || $line2_hit ||
            ($n == 3 && $text =~ /(?:^\w\s+[A-IK-NP-Z\s]+|\s+[A-IK-NP-Z\s]+)/)
           );
}
884
885=head2 _possibly_prodom
886
887From "http://prodom.prabi.fr/prodom/current/documentation/data.php".
888
889=cut
890
sub _possibly_prodom
{
    # True only for line 1, and only if it starts with "ID" plus three
    # spaces and ends with "<digits> seq.".
    my ($text, $n) = @_;

    my $is_id_line = ($n == 1 && $text =~ /^ID   /);
    return ($is_id_line && $text =~ /\d+ seq\.$/);
}
896
897=head2 _possibly_raw
898
899From "http://www.ebi.ac.uk/help/formats.html".
900
901=cut
902
sub _possibly_raw
{
    # True if the line is non-empty and holds nothing but ASCII letters
    # and whitespace.  The line number is not consulted.
    my ($text) = @_;

    return $text =~ /^[A-Za-z\s]+$/;
}
908
909=head2 _possibly_rsf
910
911From "http://www.ebi.ac.uk/help/formats.html".
912
913=cut
914
sub _possibly_rsf
{
    # True if line 1 starts with "!!RICH_SEQUENCE", or if any line
    # consists of a single "{" or "}" or starts with one of the field
    # names listed in the pattern below.
    my ($text, $n) = @_;

    my $is_file_marker = ($n == 1 && $text =~ /^!!RICH_SEQUENCE/);
    my $is_lone_brace  = ($text =~ /^[{}]$/);

    return ($is_file_marker ||
            $is_lone_brace ||
            $text =~ /^(?:name|type|longname|
                          checksum|creation-date|strand|sequence)/x);
}
923
924=head2 _possibly_selex
925
926From "http://www.ebc.ee/WWW/hmmer2-html/node27.html".
927
928Assuming presence of Selex file header.  Data exported by
929Bioperl on Pfam and Selex formats are identical, but Pfam file
930only holds one alignment.
931
932=cut
933
sub _possibly_selex
{
    # True if line 1 starts with "#=ID ", line 2 starts with "#=AC ", or
    # any line starts with "#=SQ ".
    my ($text, $n) = @_;

    my $line1_hit = ($n == 1 && $text =~ /^#=ID /);
    my $line2_hit = ($n == 2 && $text =~ /^#=AC /);

    return ($line1_hit ||
            $line2_hit ||
            ($text =~ /^#=SQ /));
}
941
942=head2 _possibly_stockholm
943
944From bioperl test data.
945
946=cut
947
sub _possibly_stockholm
{
    # True if line 1 starts with "# STOCKHOLM", or if any line starts
    # with "#=GF " or "#=GS ".
    my ($text, $n) = @_;

    my $is_file_marker = ($n == 1 && $text =~ /^# STOCKHOLM/);
    return ($is_file_marker ||
            $text =~ /^#=(?:GF|GS) /);
}
954
955
956
957=head2 _possibly_swiss
958
959From "http://ca.expasy.org/sprot/userman.html#entrystruc".
960
961=cut
962
sub _possibly_swiss
{
    # True only for line 1, and only if it starts with "ID" plus three
    # spaces and ends with "AA.".
    my ($text, $n) = @_;

    my $is_id_line = ($n == 1 && $text =~ /^ID   /);
    return ($is_id_line && $text =~ /AA\.$/);
}
968
969=head2 _possibly_tab
970
971Contributed by Heikki.
972
973=cut
974
sub _possibly_tab
{
    # True only for line 1, and only if it starts with two non-empty
    # fields separated by a tab character.
    my ($text, $n) = @_;

    my $on_first_line = ($n == 1);
    return ($on_first_line && $text =~ /^[^\t]+\t[^\t]+/);
}
980
981=head2 _possibly_vcf
982
983From "http://www.1000genomes.org/wiki/analysis/vcf4.0".
984
985Assumptions made about sanity - format and date lines are line 1 and 2
986respectively. This is not specified in the format document.
987
988=cut
989
sub _possibly_vcf
{
    # True if line 1 contains "##fileformat=VCFv" or line 2 contains
    # "##fileDate=".  Neither pattern is anchored to the start of the
    # line.
    my ($text, $n) = @_;

    my $is_format_line = ($n == 1 && $text =~ /##fileformat=VCFv/);
    return ($is_format_line ||
            ($n == 2 && $text =~ /##fileDate=/));
}
996
997
998
9991;
1000