1# 2# BioPerl module for Bio::AlignIO::arp 3# 4# Copyright Chris Fields 5# 6# You may distribute this module under the same terms as perl itself 7# POD documentation - main docs before the code 8 9=head1 NAME 10 11Bio::AlignIO::arp - ARP MSA Sequence input/output stream 12 13=head1 SYNOPSIS 14 15Do not use this module directly. Use it via the L<Bio::AlignIO> 16class. 17 18=head1 DESCRIPTION 19 20This object can create L<Bio::SimpleAlign> objects from 21ARP flat files. These are typically configuration-like data files 22for the program Arlequin. For more information, see: 23 24 http://lgb.unige.ch/arlequin/ 25 26For the moment, this retains the allele sequence data in the DATA section and 27inserts them into SimpleAlign objects. ARP files that contain other data (RFLP, 28etc.) are not expected to parse properly. Also, if the DNA data is actually SNP 29data, then the LocatableSeq object instantiation will throw an error. 30 31This is now set up as a generic parser (i.e. it parses everything) and 32collects as much data as possible into the SimpleAlign object. The following 33in a general mapping of where data can be found: 34 35 Tag SimpleAlign 36 Method 37 ---------------------------------------------------------------------- 38 Title description 39 SampleName id 40 ---------------------------------------------------------------------- 41 42 Tag Bio::Annotation TagName Bio::Annotation 43 Class Parameters 44 ---------------------------------------------------------------------- 45 NE SimpleValue pfam_family_accession value 46 NL SimpleValue sequence_start_stop value 47 SS SimpleValue sec_structure_source value 48 BM SimpleValue build_model value 49 RN Reference reference * 50 ---------------------------------------------------------------------- 51 * RN is generated based on the number of Bio::Annotation::Reference objects 52 53In addition, the number of samples found in the alignment is retained in a 54Bio::Annotation::TagTree object in the annotation collection and is accessible 55via: 56 57 ($samples) = $aln->annotation->get_Annotations('Samples'); 58 say $samples->display_text; 59 # or use other relevant TagTree methods to retrieve data 60 61=head1 FEEDBACK 62 63=head2 Support 64 65Please direct usage questions or support issues to the mailing list: 66 67I<bioperl-l@bioperl.org> 68 69rather than to the module maintainer directly. Many experienced and 70reponsive experts will be able look at the problem and quickly 71address it. Please include a thorough description of the problem 72with code and data examples if at all possible. 73 74=head2 Reporting Bugs 75 76Report bugs to the Bioperl bug tracking system to help us keep track 77the bugs and their resolution. Bug reports can be submitted via the 78web: 79 80 https://github.com/bioperl/bioperl-live/issues 81 82=head1 AUTHORS 83 84Chris Fields (cjfields) 85 86=head1 APPENDIX 87 88The rest of the documentation details each of the object 89methods. Internal methods are usually preceded with a _ 90 91=cut 92 93# Let the code begin... 94 95package Bio::AlignIO::arp; 96$Bio::AlignIO::arp::VERSION = '1.7.7'; 97use strict; 98use base qw(Bio::AlignIO); 99 100use Data::Dumper; 101use Bio::Annotation::AnnotationFactory; 102 103=head2 next_aln 104 105 Title : next_aln 106 Usage : $aln = $stream->next_aln 107 Function: returns the next alignment in the stream. 108 Returns : Bio::Align::AlignI object - returns 0 on end of file 109 or on error 110 Args : -width => optional argument to specify the width sequence 111 will be written (60 chars by default) 112 113See L<Bio::Align::AlignI> 114 115=cut 116 117sub next_aln { 118 my $self = shift; 119 my $aln = Bio::SimpleAlign->new(-source => 'arp'); 120 my ($data, $cur_block, $cur_type, $cur_data); 121 SCAN: 122 while (defined ($data = $self->_readline) ) { 123 next if $data =~ m{^\s*$}xms; 124 if ($data =~ m{\[{1,2}(\w+)\]{1,2}}xms) { 125 $self->{state}->{current_block} = $1; 126 next SCAN; 127 } 128 elsif ($data =~ m{^\s*(\w+)=\s?(\S[^\n]*$)}xms) { 129 ($cur_type, $cur_data) = ($1, $2); 130 if ($cur_data =~ m{^\s*\{\s*$}) { 131 $self->throw("Curly block must be embedded in a named Block") 132 if !exists($self->{state}->{current_block}); 133 $self->{state}->{in_curly_block} = 1; 134 next SCAN; 135 } 136 $cur_data =~ s{[\"\']}{}g; 137 $cur_data =~ s{\s*$}{}; 138 # per alignment annotation data (i.e. Sample Blocks) or 139 # annotation data retained for each alignment? 140 $self->{state}->{current_block} eq 'Samples' ? 141 push @{$self->{state}->{SampleAnnotation}->{$cur_type}}, $cur_data : 142 push @{$self->{state}->{Annotation}->{$cur_type}}, $cur_data; 143 } 144 elsif ($data =~ m{^\s*\}\s*$}xms) { 145 $self->throw("Unmatched bracket in ARP file:\n$data") if 146 !exists($self->{state}->{in_curly_block}); 147 if ($self->{state}->{current_block} eq 'Samples') {; 148 my $ac = $self->_process_annotation($aln); 149 delete $self->{state}->{SampleAnnotation}; 150 } else { 151 # process other data at a later point 152 } 153 delete $self->{state}->{blockdata}; 154 $self->{state}->{in_curly_block} = 0; 155 last SCAN; 156 } 157 else { 158 # all other data should be in a curly block and have a block title 159 $self->throw("Data found outside of proper block:\n$data") if 160 !exists($self->{state}->{current_block}) && !$self->{state}->{in_curly_block}; 161 # bypass commented stuff (but we may want to process it at a later 162 # point, so turn back here) 163 next if $data =~ m{^\s*\#}xms; 164 if ($self->{state}->{current_block} eq 'Samples') { 165 chomp $data; 166 # we have two possible ways to deal with sample number, either 167 # clone the LocatableSeq (in which case we need to deal with ID 168 # duplication), or store as annotation data. I chose the latter 169 # route using a Bio::Annotation::TagTree. YMMV - cjfields 10-15-08 170 my ($ls, $samples) = $self->_process_sequence($data); 171 my $id = $ls->id; 172 push @{ $self->{state}->{SampleAnnotation}->{Samples} }, [$id => $samples]; 173 $aln->add_seq($ls); 174 } else { 175 # add elsif's for further processing 176 #$self->debug('Unmatched data in block '. 177 # $self->{state}->{current_block}. 178 # ":\n$data\n"); 179 $self->{state}->{blockdata} .= $data; 180 } 181 } 182 } 183 # alignments only returned if they contain sequences 184 return $aln if $aln->num_sequences; 185 return; 186} 187 188=head2 write_aln 189 190 Title : write_aln 191 Usage : $stream->write_aln(@aln) 192 Function: writes the $aln object into the stream in xmfa format 193 Returns : 1 for success and 0 for error 194 Args : L<Bio::Align::AlignI> object 195 196See L<Bio::Align::AlignI> 197 198=cut 199 200sub write_aln { 201 my ($self,@aln) = @_; 202 $self->throw_not_implemented; 203} 204 205################ PRIVATE SUBS ################ 206 207sub _process_sequence { 208 my ($self, $raw) = @_; 209 return unless defined $raw; 210 $raw =~ s{(?:^\s+|\s+$)}{}g; 211 my ($id, $samples, $seq) = split(' ', $raw); 212 my $ls = Bio::LocatableSeq->new('-seq' => $seq, 213 '-start' => 1, 214 '-display_id' => $id, 215 '-alphabet' => $self->alphabet); 216 return($ls, $samples); 217} 218 219sub _process_annotation { 220 my ($self, $aln) = @_; 221 my $coll = Bio::Annotation::Collection->new(); 222 my $factory = Bio::Annotation::AnnotationFactory->new(-type => 'Bio::Annotation::SimpleValue'); 223 for my $anntype (qw(SampleAnnotation Annotation)) { 224 for my $key (keys %{ $self->{state}->{$anntype} }) { 225 if ($key eq 'Title') { 226 $aln->description($self->{state}->{$anntype}->{$key}[0]); 227 } elsif ($key eq 'Samples') { 228 $factory->type('Bio::Annotation::TagTree'); 229 $coll->add_Annotation($key, $factory->create_object( 230 -value => [$key => $self->{state}->{$anntype}->{$key}])); 231 $factory->type('Bio::Annotation::SimpleValue'); 232 } elsif ($key eq 'SampleName') { 233 $aln->id($self->{state}->{$anntype}->{$key}[0]); 234 } else { 235 $self->throw('Expecting an array reference') unless 236 ref $self->{state}->{$anntype}->{$key} eq 'ARRAY'; 237 for my $a (@{ $self->{state}->{$anntype}->{$key} }) { 238 $coll->add_Annotation($key, $factory->create_object( 239 -value => $a) ); 240 } 241 } 242 } 243 } 244 #$self->debug("Collection:".Dumper($coll)."\n"); 245 $aln->annotation($coll); 246} 247 2481; 249