1#
2# BioPerl module for Bio::AlignIO::emboss
3#
4# Please direct questions and support issues to <bioperl-l@bioperl.org>
5#
6# Cared for by Jason Stajich <jason@bioperl.org>
7#
8# Copyright Jason Stajich
9#
10# You may distribute this module under the same terms as perl itself
11
12# POD documentation - main docs before the code
13
14=head1 NAME
15
16Bio::AlignIO::emboss - Parse EMBOSS alignment output (from applications water and needle)
17
18=head1 SYNOPSIS
19
20    # do not use the object directly
21    use Bio::AlignIO;
22    # read in an alignment from the EMBOSS program water
23    my $in = Bio::AlignIO->new(-format => 'emboss',
24                              -file   => 'seq.water');
25    while( my $aln = $in->next_aln ) {
26    # do something with the alignment
27    }
28
29=head1 DESCRIPTION
30
31This object handles parsing and writing pairwise sequence alignments
32from the EMBOSS suite.
33
34=head1 FEEDBACK
35
36=head2 Mailing Lists
37
38User feedback is an integral part of the evolution of this and other
39Bioperl modules. Send your comments and suggestions preferably to
40the Bioperl mailing list.  Your participation is much appreciated.
41
42  bioperl-l@bioperl.org                  - General discussion
43  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
44
45=head2 Support
46
47Please direct usage questions or support issues to the mailing list:
48
49I<bioperl-l@bioperl.org>
50
51rather than to the module maintainer directly. Many experienced and
52reponsive experts will be able look at the problem and quickly
53address it. Please include a thorough description of the problem
54with code and data examples if at all possible.
55
56=head2 Reporting Bugs
57
58Report bugs to the Bioperl bug tracking system to help us keep track
59of the bugs and their resolution. Bug reports can be submitted via the
60web:
61
62  https://github.com/bioperl/bioperl-live/issues
63
64=head1 AUTHOR - Jason Stajich
65
66Email jason@bioperl.org
67
68=head1 APPENDIX
69
70The rest of the documentation details each of the object methods.
71Internal methods are usually preceded with a _
72
73=cut
74
75
76# Let the code begin...
77
78
79package Bio::AlignIO::emboss;
80$Bio::AlignIO::emboss::VERSION = '1.7.7';
81use vars qw($EMBOSSTitleLen $EMBOSSLineLen);
82use strict;
83
84use Bio::LocatableSeq;
85
86use base qw(Bio::AlignIO);
87
88BEGIN {
89    $EMBOSSTitleLen    = 13;
90    $EMBOSSLineLen     = 50;
91}
92
93sub _initialize {
94    my($self,@args) = @_;
95    $self->SUPER::_initialize(@args);
96    $self->{'_type'} = undef;
97}
98
99=head2 next_aln
100
101 Title   : next_aln
102 Usage   : $aln = $stream->next_aln()
103 Function: returns the next alignment in the stream.
104 Returns : L<Bio::Align::AlignI> object - returns 0 on end of file
105	    or on error
106 Args    : NONE
107
108=cut
109
110sub next_aln {
111    my ($self) = @_;
112    my $seenbegin = 0;
113    my %data = ( 'seq1' => {
114		     'start'=> undef,
115		     'end'=> undef,
116		     'name' => '',
117		     'data' => '' },
118		 'seq2' => {
119		     'start'=> undef,
120		     'end'=> undef,
121		     'name' => '',
122		     'data' => '' },
123		 'align' => '',
124		 'type'  => $self->{'_type'},  # to restore type from
125		                                     # previous aln if possible
126		 );
127    my %names;
128    while( defined($_ = $self->_readline) ) {
129	next if( /^\#?\s+$/ || /^\#+\s*$/ );
130	if( /^\#(\=|\-)+\s*$/) {
131	    last if( $seenbegin);
132	} elsif( /(Local|Global):\s*(\S+)\s+vs\s+(\S+)/ ||
133		 /^\#\s+Program:\s+(\S+)/ )
134	{
135	    my ($name1,$name2) = ($2,$3);
136	    if( ! defined $name1 ) { # Handle EMBOSS 2.2.X
137		$data{'type'} = $1;
138		$name1 = $name2 = '';
139	    } else {
140		$data{'type'} = $1 eq 'Local' ? 'water' : 'needle';
141	    }
142	    $data{'seq1'}->{'name'} = $name1;
143	    $data{'seq2'}->{'name'} = $name2;
144
145	    $self->{'_type'} = $data{'type'};
146
147	} elsif( /Score:\s+(\S+)/ ) {
148	    $data{'score'} = $1;
149	} elsif( /^\#\s+(1|2):\s+(\S+)/ && !  $data{"seq$1"}->{'name'} ) {
150	    my $nm = $2;
151	    $nm = substr($nm,0,$EMBOSSTitleLen); # emboss has a max seq length
152	    if( $names{$nm} ) {
153		$nm .= "-". $names{$nm};
154	    }
155	    $names{$nm}++;
156	    $data{"seq$1"}->{'name'} = $nm;
157	} elsif( $data{'seq1'}->{'name'} &&
158		 /^\Q$data{'seq1'}->{'name'}/ ) {
159	    my $count = 0;
160	    $seenbegin = 1;
161	    my @current;
162	    while( defined ($_) ) {
163		my $align_other = '';
164		my $delayed;
165		if($count == 0 || $count == 2 ) {
166		    my @l = split;
167		    my ($seq,$align,$start,$end);
168		    if( $count == 2 && $data{'seq2'}->{'name'} eq '' ) {
169			# weird boundary condition
170			($start,$align,$end) = @l;
171		    } elsif( @l == 3 ) {
172			$align = '';
173			($seq,$start,$end) = @l
174		    } else {
175			($seq,$start,$align,$end) = @l;
176 		    }
177
178		    my $seqname = sprintf("seq%d", ($count == 0) ? '1' : '2');
179		    $data{$seqname}->{'data'} .= $align;
180		    $data{$seqname}->{'start'} ||= $start;
181		    $data{$seqname}->{'end'} = $end;
182		    $current[$count] = [ $start,$align || ''];
183		} else {
184		    s/^\s+//;
185		    s/\s+$//;
186		    $data{'align'} .= $_;
187		}
188
189	      BOTTOM:
190		last if( $count++ == 2);
191		$_ = $self->_readline();
192	    }
193
194	    if( $data{'type'} eq 'needle' ) {
195		# which ever one is shorter we want to bring it up to
196		# length.  Man this stinks.
197		my ($s1,$s2) =  ($data{'seq1'}, $data{'seq2'});
198
199		my $d = length($current[0]->[1]) - length($current[2]->[1]);
200		if( $d < 0 ) { # s1 is smaller, need to add some
201		    # compare the starting points for this alignment line
202		    if( $current[0]->[0] <= 1 ) {
203			$s1->{'data'} = ('-' x abs($d)) . $s1->{'data'};
204			$data{'align'} = (' 'x abs($d)).$data{'align'};
205		    } else {
206			$s1->{'data'} .= '-' x abs($d);
207			$data{'align'} .= ' 'x abs($d);
208		    }
209		} elsif( $d > 0) { # s2 is smaller, need to add some
210		    if( $current[2]->[0] <= 1 ) {
211			$s2->{'data'} = ('-' x abs($d)) . $s2->{'data'};
212			$data{'align'} = (' 'x abs($d)).$data{'align'};
213		    } else {
214			$s2->{'data'} .= '-' x abs($d);
215			$data{'align'} .= ' 'x abs($d);
216		    }
217		}
218	    }
219
220	}
221    }
222    return unless $seenbegin;
223    my $aln =  Bio::SimpleAlign->new(-verbose => $self->verbose(),
224				     -score   => $data{'score'},
225				     -source => "EMBOSS-".$data{'type'});
226
227    foreach my $seqname ( qw(seq1 seq2) ) {
228	return unless ( defined $data{$seqname} );
229	$data{$seqname}->{'name'} ||= $seqname;
230	my $seq = Bio::LocatableSeq->new
231	    ('-seq'         => $data{$seqname}->{'data'},
232	     '-display_id'  => $data{$seqname}->{'name'},
233	     '-start'       => $data{$seqname}->{'start'},
234	     '-end'         => $data{$seqname}->{'end'},
235	     '-alphabet'    => $self->alphabet,
236	     );
237	$aln->add_seq($seq);
238    }
239    return $aln;
240}
241
242=head2 write_aln
243
244 Title   : write_aln
245 Usage   : $stream->write_aln(@aln)
246 Function: writes the $aln object into the stream in emboss format
247 Returns : 1 for success and 0 for error
248 Args    : L<Bio::Align::AlignI> object
249
250
251=cut
252
253sub write_aln {
254    my ($self,@aln) = @_;
255
256    $self->throw("Sorry: writing emboss output is not currently available! \n");
257}
258
2591;
260