1# 2# BioPerl module for Bio::AlignIO::emboss 3# 4# Please direct questions and support issues to <bioperl-l@bioperl.org> 5# 6# Cared for by Jason Stajich <jason@bioperl.org> 7# 8# Copyright Jason Stajich 9# 10# You may distribute this module under the same terms as perl itself 11 12# POD documentation - main docs before the code 13 14=head1 NAME 15 16Bio::AlignIO::emboss - Parse EMBOSS alignment output (from applications water and needle) 17 18=head1 SYNOPSIS 19 20 # do not use the object directly 21 use Bio::AlignIO; 22 # read in an alignment from the EMBOSS program water 23 my $in = Bio::AlignIO->new(-format => 'emboss', 24 -file => 'seq.water'); 25 while( my $aln = $in->next_aln ) { 26 # do something with the alignment 27 } 28 29=head1 DESCRIPTION 30 31This object handles parsing and writing pairwise sequence alignments 32from the EMBOSS suite. 33 34=head1 FEEDBACK 35 36=head2 Mailing Lists 37 38User feedback is an integral part of the evolution of this and other 39Bioperl modules. Send your comments and suggestions preferably to 40the Bioperl mailing list. Your participation is much appreciated. 41 42 bioperl-l@bioperl.org - General discussion 43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 44 45=head2 Support 46 47Please direct usage questions or support issues to the mailing list: 48 49I<bioperl-l@bioperl.org> 50 51rather than to the module maintainer directly. Many experienced and 52reponsive experts will be able look at the problem and quickly 53address it. Please include a thorough description of the problem 54with code and data examples if at all possible. 55 56=head2 Reporting Bugs 57 58Report bugs to the Bioperl bug tracking system to help us keep track 59of the bugs and their resolution. Bug reports can be submitted via the 60web: 61 62 https://github.com/bioperl/bioperl-live/issues 63 64=head1 AUTHOR - Jason Stajich 65 66Email jason@bioperl.org 67 68=head1 APPENDIX 69 70The rest of the documentation details each of the object methods. 71Internal methods are usually preceded with a _ 72 73=cut 74 75 76# Let the code begin... 77 78 79package Bio::AlignIO::emboss; 80$Bio::AlignIO::emboss::VERSION = '1.7.7'; 81use vars qw($EMBOSSTitleLen $EMBOSSLineLen); 82use strict; 83 84use Bio::LocatableSeq; 85 86use base qw(Bio::AlignIO); 87 88BEGIN { 89 $EMBOSSTitleLen = 13; 90 $EMBOSSLineLen = 50; 91} 92 93sub _initialize { 94 my($self,@args) = @_; 95 $self->SUPER::_initialize(@args); 96 $self->{'_type'} = undef; 97} 98 99=head2 next_aln 100 101 Title : next_aln 102 Usage : $aln = $stream->next_aln() 103 Function: returns the next alignment in the stream. 104 Returns : L<Bio::Align::AlignI> object - returns 0 on end of file 105 or on error 106 Args : NONE 107 108=cut 109 110sub next_aln { 111 my ($self) = @_; 112 my $seenbegin = 0; 113 my %data = ( 'seq1' => { 114 'start'=> undef, 115 'end'=> undef, 116 'name' => '', 117 'data' => '' }, 118 'seq2' => { 119 'start'=> undef, 120 'end'=> undef, 121 'name' => '', 122 'data' => '' }, 123 'align' => '', 124 'type' => $self->{'_type'}, # to restore type from 125 # previous aln if possible 126 ); 127 my %names; 128 while( defined($_ = $self->_readline) ) { 129 next if( /^\#?\s+$/ || /^\#+\s*$/ ); 130 if( /^\#(\=|\-)+\s*$/) { 131 last if( $seenbegin); 132 } elsif( /(Local|Global):\s*(\S+)\s+vs\s+(\S+)/ || 133 /^\#\s+Program:\s+(\S+)/ ) 134 { 135 my ($name1,$name2) = ($2,$3); 136 if( ! defined $name1 ) { # Handle EMBOSS 2.2.X 137 $data{'type'} = $1; 138 $name1 = $name2 = ''; 139 } else { 140 $data{'type'} = $1 eq 'Local' ? 'water' : 'needle'; 141 } 142 $data{'seq1'}->{'name'} = $name1; 143 $data{'seq2'}->{'name'} = $name2; 144 145 $self->{'_type'} = $data{'type'}; 146 147 } elsif( /Score:\s+(\S+)/ ) { 148 $data{'score'} = $1; 149 } elsif( /^\#\s+(1|2):\s+(\S+)/ && ! $data{"seq$1"}->{'name'} ) { 150 my $nm = $2; 151 $nm = substr($nm,0,$EMBOSSTitleLen); # emboss has a max seq length 152 if( $names{$nm} ) { 153 $nm .= "-". $names{$nm}; 154 } 155 $names{$nm}++; 156 $data{"seq$1"}->{'name'} = $nm; 157 } elsif( $data{'seq1'}->{'name'} && 158 /^\Q$data{'seq1'}->{'name'}/ ) { 159 my $count = 0; 160 $seenbegin = 1; 161 my @current; 162 while( defined ($_) ) { 163 my $align_other = ''; 164 my $delayed; 165 if($count == 0 || $count == 2 ) { 166 my @l = split; 167 my ($seq,$align,$start,$end); 168 if( $count == 2 && $data{'seq2'}->{'name'} eq '' ) { 169 # weird boundary condition 170 ($start,$align,$end) = @l; 171 } elsif( @l == 3 ) { 172 $align = ''; 173 ($seq,$start,$end) = @l 174 } else { 175 ($seq,$start,$align,$end) = @l; 176 } 177 178 my $seqname = sprintf("seq%d", ($count == 0) ? '1' : '2'); 179 $data{$seqname}->{'data'} .= $align; 180 $data{$seqname}->{'start'} ||= $start; 181 $data{$seqname}->{'end'} = $end; 182 $current[$count] = [ $start,$align || '']; 183 } else { 184 s/^\s+//; 185 s/\s+$//; 186 $data{'align'} .= $_; 187 } 188 189 BOTTOM: 190 last if( $count++ == 2); 191 $_ = $self->_readline(); 192 } 193 194 if( $data{'type'} eq 'needle' ) { 195 # which ever one is shorter we want to bring it up to 196 # length. Man this stinks. 197 my ($s1,$s2) = ($data{'seq1'}, $data{'seq2'}); 198 199 my $d = length($current[0]->[1]) - length($current[2]->[1]); 200 if( $d < 0 ) { # s1 is smaller, need to add some 201 # compare the starting points for this alignment line 202 if( $current[0]->[0] <= 1 ) { 203 $s1->{'data'} = ('-' x abs($d)) . $s1->{'data'}; 204 $data{'align'} = (' 'x abs($d)).$data{'align'}; 205 } else { 206 $s1->{'data'} .= '-' x abs($d); 207 $data{'align'} .= ' 'x abs($d); 208 } 209 } elsif( $d > 0) { # s2 is smaller, need to add some 210 if( $current[2]->[0] <= 1 ) { 211 $s2->{'data'} = ('-' x abs($d)) . $s2->{'data'}; 212 $data{'align'} = (' 'x abs($d)).$data{'align'}; 213 } else { 214 $s2->{'data'} .= '-' x abs($d); 215 $data{'align'} .= ' 'x abs($d); 216 } 217 } 218 } 219 220 } 221 } 222 return unless $seenbegin; 223 my $aln = Bio::SimpleAlign->new(-verbose => $self->verbose(), 224 -score => $data{'score'}, 225 -source => "EMBOSS-".$data{'type'}); 226 227 foreach my $seqname ( qw(seq1 seq2) ) { 228 return unless ( defined $data{$seqname} ); 229 $data{$seqname}->{'name'} ||= $seqname; 230 my $seq = Bio::LocatableSeq->new 231 ('-seq' => $data{$seqname}->{'data'}, 232 '-display_id' => $data{$seqname}->{'name'}, 233 '-start' => $data{$seqname}->{'start'}, 234 '-end' => $data{$seqname}->{'end'}, 235 '-alphabet' => $self->alphabet, 236 ); 237 $aln->add_seq($seq); 238 } 239 return $aln; 240} 241 242=head2 write_aln 243 244 Title : write_aln 245 Usage : $stream->write_aln(@aln) 246 Function: writes the $aln object into the stream in emboss format 247 Returns : 1 for success and 0 for error 248 Args : L<Bio::Align::AlignI> object 249 250 251=cut 252 253sub write_aln { 254 my ($self,@aln) = @_; 255 256 $self->throw("Sorry: writing emboss output is not currently available! \n"); 257} 258 2591; 260