1#
2# BioPerl module for Bio::AlignIO::arp
3#
4# Copyright Chris Fields
5#
6# You may distribute this module under the same terms as perl itself
7# POD documentation - main docs before the code
8
9=head1 NAME
10
11Bio::AlignIO::arp - ARP MSA Sequence input/output stream
12
13=head1 SYNOPSIS
14
15Do not use this module directly.  Use it via the L<Bio::AlignIO>
16class.
17
18=head1 DESCRIPTION
19
20This object can create L<Bio::SimpleAlign> objects from
21ARP flat files.  These are typically configuration-like data files
22for the program Arlequin.  For more information, see:
23
24  http://lgb.unige.ch/arlequin/
25
26For the moment, this retains the allele sequence data in the DATA section and
27inserts them into SimpleAlign objects. ARP files that contain other data (RFLP,
28etc.) are not expected to parse properly.  Also, if the DNA data is actually SNP
29data, then the LocatableSeq object instantiation will throw an error.
30
31This is now set up as a generic parser (i.e. it parses everything) and
32collects as much data as possible into the SimpleAlign object.  The following
33in a general mapping of where data can be found:
34
35    Tag        SimpleAlign
36               Method
37    ----------------------------------------------------------------------
38    Title      description
39    SampleName id
40    ----------------------------------------------------------------------
41
42    Tag        Bio::Annotation   TagName                    Bio::Annotation
43               Class                                        Parameters
44    ----------------------------------------------------------------------
45     NE        SimpleValue       pfam_family_accession      value
46     NL        SimpleValue       sequence_start_stop        value
47     SS        SimpleValue       sec_structure_source       value
48     BM        SimpleValue       build_model                value
49     RN        Reference         reference                  *
50    ----------------------------------------------------------------------
51  * RN is generated based on the number of Bio::Annotation::Reference objects
52
53In addition, the number of samples found in the alignment is retained in a
54Bio::Annotation::TagTree object in the annotation collection and is accessible
55via:
56
57  ($samples) = $aln->annotation->get_Annotations('Samples');
58  say $samples->display_text;
59  # or use other relevant TagTree methods to retrieve data
60
61=head1 FEEDBACK
62
63=head2 Support
64
65Please direct usage questions or support issues to the mailing list:
66
67I<bioperl-l@bioperl.org>
68
69rather than to the module maintainer directly. Many experienced and
70reponsive experts will be able look at the problem and quickly
71address it. Please include a thorough description of the problem
72with code and data examples if at all possible.
73
74=head2 Reporting Bugs
75
76Report bugs to the Bioperl bug tracking system to help us keep track
77the bugs and their resolution.  Bug reports can be submitted via the
78web:
79
80  https://github.com/bioperl/bioperl-live/issues
81
82=head1 AUTHORS
83
84Chris Fields (cjfields)
85
86=head1 APPENDIX
87
88The rest of the documentation details each of the object
89methods. Internal methods are usually preceded with a _
90
91=cut
92
93# Let the code begin...
94
95package Bio::AlignIO::arp;
96$Bio::AlignIO::arp::VERSION = '1.7.7';
97use strict;
98use base qw(Bio::AlignIO);
99
100use Data::Dumper;
101use Bio::Annotation::AnnotationFactory;
102
103=head2 next_aln
104
105 Title   : next_aln
106 Usage   : $aln = $stream->next_aln
107 Function: returns the next alignment in the stream.
108 Returns : Bio::Align::AlignI object - returns 0 on end of file
109            or on error
110 Args    : -width => optional argument to specify the width sequence
111           will be written (60 chars by default)
112
113See L<Bio::Align::AlignI>
114
115=cut
116
117sub next_aln {
118    my $self = shift;
119    my $aln = Bio::SimpleAlign->new(-source => 'arp');
120    my ($data, $cur_block, $cur_type, $cur_data);
121    SCAN:
122    while (defined ($data = $self->_readline) ) {
123        next if $data =~ m{^\s*$}xms;
124        if ($data =~ m{\[{1,2}(\w+)\]{1,2}}xms) {
125            $self->{state}->{current_block} = $1;
126            next SCAN;
127        }
128        elsif ($data =~ m{^\s*(\w+)=\s?(\S[^\n]*$)}xms) {
129            ($cur_type, $cur_data) = ($1, $2);
130            if ($cur_data =~ m{^\s*\{\s*$}) {
131                $self->throw("Curly block must be embedded in a named Block")
132                    if !exists($self->{state}->{current_block});
133                $self->{state}->{in_curly_block} = 1;
134                next SCAN;
135            }
136            $cur_data =~ s{[\"\']}{}g;
137            $cur_data =~ s{\s*$}{};
138            # per alignment annotation data (i.e. Sample Blocks) or
139            # annotation data retained for each alignment?
140            $self->{state}->{current_block} eq 'Samples' ?
141                push @{$self->{state}->{SampleAnnotation}->{$cur_type}}, $cur_data :
142                push @{$self->{state}->{Annotation}->{$cur_type}}, $cur_data;
143        }
144        elsif ($data =~ m{^\s*\}\s*$}xms) {
145            $self->throw("Unmatched bracket in ARP file:\n$data") if
146                !exists($self->{state}->{in_curly_block});
147            if ($self->{state}->{current_block} eq 'Samples') {;
148                my $ac = $self->_process_annotation($aln);
149                delete $self->{state}->{SampleAnnotation};
150            } else {
151                # process other data at a later point
152            }
153            delete $self->{state}->{blockdata};
154            $self->{state}->{in_curly_block} = 0;
155            last SCAN;
156        }
157        else {
158            # all other data should be in a curly block and have a block title
159            $self->throw("Data found outside of proper block:\n$data") if
160                !exists($self->{state}->{current_block}) && !$self->{state}->{in_curly_block};
161            # bypass commented stuff (but we may want to process it at a later
162            # point, so turn back here)
163            next if $data =~ m{^\s*\#}xms;
164            if ($self->{state}->{current_block} eq 'Samples') {
165                chomp $data;
166                # we have two possible ways to deal with sample number, either
167                # clone the LocatableSeq (in which case we need to deal with ID
168                # duplication), or store as annotation data. I chose the latter
169                # route using a Bio::Annotation::TagTree. YMMV - cjfields 10-15-08
170                my ($ls, $samples) = $self->_process_sequence($data);
171                my $id = $ls->id;
172                push @{ $self->{state}->{SampleAnnotation}->{Samples} }, [$id => $samples];
173                $aln->add_seq($ls);
174            } else {
175                # add elsif's for further processing
176                #$self->debug('Unmatched data in block '.
177                #             $self->{state}->{current_block}.
178                #             ":\n$data\n");
179                $self->{state}->{blockdata} .= $data;
180            }
181        }
182    }
183    # alignments only returned if they contain sequences
184    return $aln if $aln->num_sequences;
185    return;
186}
187
188=head2 write_aln
189
190 Title   : write_aln
191 Usage   : $stream->write_aln(@aln)
192 Function: writes the $aln object into the stream in xmfa format
193 Returns : 1 for success and 0 for error
194 Args    : L<Bio::Align::AlignI> object
195
196See L<Bio::Align::AlignI>
197
198=cut
199
200sub write_aln {
201    my ($self,@aln) = @_;
202    $self->throw_not_implemented;
203}
204
205################ PRIVATE SUBS ################
206
207sub _process_sequence {
208    my ($self, $raw) = @_;
209    return unless defined $raw;
210    $raw =~ s{(?:^\s+|\s+$)}{}g;
211    my ($id, $samples, $seq) = split(' ', $raw);
212    my $ls = Bio::LocatableSeq->new('-seq'        => $seq,
213                                    '-start'      => 1,
214                                    '-display_id' => $id,
215				    '-alphabet'   => $self->alphabet);
216    return($ls, $samples);
217}
218
219sub _process_annotation {
220    my ($self, $aln) = @_;
221    my $coll = Bio::Annotation::Collection->new();
222    my $factory = Bio::Annotation::AnnotationFactory->new(-type => 'Bio::Annotation::SimpleValue');
223    for my $anntype (qw(SampleAnnotation Annotation)) {
224        for my $key (keys %{ $self->{state}->{$anntype} }) {
225            if ($key eq 'Title') {
226                $aln->description($self->{state}->{$anntype}->{$key}[0]);
227            } elsif ($key eq 'Samples') {
228                $factory->type('Bio::Annotation::TagTree');
229                $coll->add_Annotation($key, $factory->create_object(
230                    -value => [$key => $self->{state}->{$anntype}->{$key}]));
231                $factory->type('Bio::Annotation::SimpleValue');
232            } elsif ($key eq 'SampleName') {
233                $aln->id($self->{state}->{$anntype}->{$key}[0]);
234            } else {
235                $self->throw('Expecting an array reference') unless
236                    ref $self->{state}->{$anntype}->{$key} eq 'ARRAY';
237                for my $a (@{ $self->{state}->{$anntype}->{$key} }) {
238                    $coll->add_Annotation($key, $factory->create_object(
239                        -value => $a) );
240                }
241            }
242        }
243    }
244    #$self->debug("Collection:".Dumper($coll)."\n");
245    $aln->annotation($coll);
246}
247
2481;
249