1#---------------------------------------------------------
2
3=head1 NAME
4
5Bio::Matrix::PSM::IO::masta - motif fasta format parser
6
7=head1 SYNOPSIS
8
9MASTA is a position frequency matrix format similar to
10fasta. It contains one ID row just like fasta and then the actual
11data, which is tab delimited:
12
13  0.1	0.62	.017	0.11
14  0.22	0.13	0.54	0.11
15
16Or A,C,G and T could be horizontally positioned (positioning is
17automatically detected).  Please note masta will parse only DNA at the
18moment.
19
20It will also convert a set of aligned sequences:
21ACATGCAT
22ACAGGGAT
23ACAGGCAT
24ACCGGCAT
25
26to a PFM (SiteMatrix object). When writing if you supply SEQ it will
27write 10 random instances, which represent correctly the frequency and
28can be used as an input for weblogo creation purposes.
29
30See Bio::Matrix::PSM::IO for detailed documentation on how to use masta parser
31
32=head1 DESCRIPTION
33
34Parser for meme.
35
36=head1 FEEDBACK
37
38=head2 Mailing Lists
39
40User feedback is an integral part of the evolution of this
41and other Bioperl modules. Send your comments and suggestions preferably
42 to one of the Bioperl mailing lists.
43Your participation is much appreciated.
44
45  bioperl-l@bioperl.org                  - General discussion
46  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
47
48=head2 Support
49
50Please direct usage questions or support issues to the mailing list:
51
52I<bioperl-l@bioperl.org>
53
54rather than to the module maintainer directly. Many experienced and
55reponsive experts will be able look at the problem and quickly
56address it. Please include a thorough description of the problem
57with code and data examples if at all possible.
58
59=head2 Reporting Bugs
60
61Report bugs to the Bioperl bug tracking system to help us keep track
62the bugs and their resolution.  Bug reports can be submitted via the
63web:
64
65  https://github.com/bioperl/bioperl-live/issues
66
67=head1 AUTHOR - Stefan Kirov
68
69Email skirov@utk.edu
70
71=head1 APPENDIX
72
73=cut
74
75
76# Let the code begin...
77package Bio::Matrix::PSM::IO::masta;
78$Bio::Matrix::PSM::IO::masta::VERSION = '1.7.7';
79use Bio::Matrix::PSM::SiteMatrix;
80use vars qw(@HEADER);
81use strict;
82
83use base qw(Bio::Matrix::PSM::IO Bio::Root::Root);
84
85
86
87=head2 new
88
89 Title   : new
90 Usage   : my $psmIO =  Bio::Matrix::PSM::IO->new(-format=> 'masta',
91						 -file  => $file,
92                                                 -mtype => 'PWM');
93 Function: Associates a file with the appropriate parser
94 Throws  :
95 Example :
96 Args    : hash
97 Returns : "Bio::Matrix::PSM::$format"->new(@args);
98
99=cut
100
101sub new {
102    my($class, @args)=@_;
103    my $self = $class->SUPER::new(@args);
104    my ($file)=$self->_rearrange(['FILE'], @args);
105    my ($query,$tr1)=split(/\./,$file,2);
106    $self->{file}  = $file;
107    $self->{_end}  = 0;
108    $self->{mtype} = uc($self->_rearrange(['MTYPE'], @args) || "PFM");
109    $self->_initialize_io(@args) || $self->warn("Did you intend to use STDIN?"); #Read only for now
110    return $self;
111}
112
113=head2 write_psm
114
115 Title   : write_psm
116 Usage   :
117 Function: writes a pfm/pwm/raw sequence in a simple masta format
118 Throws  :
119 Example :
120 Args    : SiteMatrix object, type (optional string: PWM, SEQ or PFM)
121 Returns :
122
123=cut
124
125sub write_psm {
126    my ($self,$matrix,$type)=@_;
127    $self->{mtype} = uc($type) if ($type);
128    my $idline=">". $matrix->id . "\n";
129    $self->_print($idline);
130    unless ($self->{mtype} eq 'SEQ') {
131	while (my %h=$matrix->next_pos) {
132	    my $row=$self->{mtype} eq 'PWM' ? join("\t",$h{lA},$h{lC},$h{lG},$h{lT},"\n"):join("\t",$h{pA},$h{pC},$h{pG},$h{pT},"\n");
133	    $self->_print ($row);
134	}
135    } else {
136	my @seq;
137	while (my %h=$matrix->next_pos) {
138	    my ($a,$c,$g,$t)=_freq_to_count(\%h);
139	    $self->throw("Could not convert from frequency to count\n") if (($a+$c+$g+$t) !=10);
140	    for my $i (0..$a-1) {$seq[$i].='A';}
141	    my $m=$a+$c;
142	    for my $i ($a..$m-1) {$seq[$i].='C';}
143	    my $n=$a+$c+$g;
144	    for my $i ($m..$n-1) {$seq[$i].='G';}
145	    for my $i ($n..9) {$seq[$i].='T';}
146	}
147	foreach my $s (@seq) {
148	    $s.="\n";
149	    $self->_print ($s);
150	}
151    }
152}
153
154=head2 next_matrix
155
156  Title   : next_matrix
157  Usage   : my $matrix = $psmio->next_matrix;
158  Function: Alias of next_psm function
159
160=cut
161
162sub next_matrix {
163    shift->next_psm(@_);
164}
165
166=head2 next_psm
167
168 Title   : next_psm
169 Usage   : my $matrix=$psmio->next_psm;
170 Function: returns the next matrix in the stream
171 Throws  : If there is you mix different types, for example weights and
172           frequencies occur in the same entry You can mix weights, but these
173           should be designated by different ID lines
174 Example :
175 Args    :
176 Returns : Bio::Matrix::PSM::SiteMatrix
177
178=cut
179
180sub next_psm {
181    my $self=shift;
182    return if ($self->{_end});
183    my $line=$self->_readline;
184    $self->throw("No ID line- wrong format\n") unless ($line=~/^>/);
185    my ($id,$desc)=split(/[\t\s]+/,$line,2);
186    $id=~s/>//;
187    my ($mtype,$format,@mdata,$len);
188    $self->{_mtype} = 0;
189    while ($line=$self->_readline) {
190	next if $line =~ /^\s+$/;# There should not be empty lines, but just in case...
191	chomp $line;
192	if ($line =~ /^>/) {
193	    $self->_pushback($line);
194	    last;
195	}
196
197	if ($line !~ /[^ACGTacgt]/g) {
198	    # This is a set of aligned sequences
199	    $self->throw("Mixing between types is not allowed or a parsing error occurred\n")
200		if (($self->{_mtype} != 3) && ($mtype)) ;
201	    $self->throw("Bad sequence- different length: $line\n")
202		if (($len) && ($len!=length($line)));
203	    $len=length($line) unless ($len);
204	    push @mdata,$line;
205	    $self->{_mtype}=3;
206	} else {
207		# do not strip 'e's since they are part of number notation for small/big numbers
208	    $line=~s/[a-df-zA-DF-Z]//g; #Well we may wanna do a hash and auto check for letter order if there is a really boring talk...
209	    $line=~s/^[\s\t]+//;
210	    $line=~s/[\s\t]+/\t/g;
211	    my @data=split(/[\s\t]+/,$line);
212	    if ($#data==3) {
213		$self->throw("Mixing between types is not allowed or a parsing error occurred\n") if (($mtype)&&($self->{_mtype} !=1)) ;
214		$self->{_mtype}=1;
215		$mtype=1;
216	    }
217	    else   {
218		$self->throw("Mixing between types is not allowedor a parsing error occurred\n") if (($mtype)&&($self->{_mtype} !=2)) ;
219		$self->{_mtype}=2;
220		$mtype=1;
221	    }
222	    push @mdata,\@data;
223	}
224    }
225    $self->{_end} = 1 if (!defined $line || $line !~ /^>/);
226    return _make_matrix(\@mdata,$self->{_mtype},$id,$desc);
227}
228
229sub _make_matrix {
230    my ($mdata,$type,$id,$desc)=@_;
231    if ($type==1) {
232	my @rearr=_rearrange_matrix($mdata);
233	$mdata=\@rearr;
234    }
235#Auto recognition for what type is this entry (PFM, PWM or simple count)
236#A bit dangerous, I hate too much auto stuff, but I want to be able to mix different
237#types in a single file
238    my $mformat='count';
239    my ($a,$c,$g,$t);
240    if ($type == 3 ) {
241	($a,$c,$g,$t)= &_count_positions($mdata);
242    } else {
243	($a,$c,$g,$t)=@{$mdata};
244	my $k=$a->[0]+$c->[0]+$g->[0]+$t->[0];
245	my $l= ($a->[0]+$c->[0]+$g->[0]+$t->[0]) -
246	    (abs($a->[0])+abs($c->[0])+abs($g->[0])+abs($t->[0]));
247	$mformat='freq' if (($k==1) && ($l==0));
248	$mformat='pwm' if ($l!=0);
249    }
250    my (@fa,@fc,@fg,@ft,%mparam);
251
252    if ($mformat eq 'pwm') {
253	foreach my $i (0..$#{$a}) {
254	    my $ca=exp $a->[$i];
255	    my $cc=exp $c->[$i];
256	    my $cg=exp $g->[$i];
257	    my $ct=exp $t->[$i];
258	    my $all=$ca+$cc+$cg+$ct;
259	    push @fa,($ca/$all)*100;
260	    push @fc,($cc/$all)*100;
261	    push @fg,($cg/$all)*100;
262	    push @ft,($ct/$all)*100;
263	}
264    }
265    $desc.=", source is $mformat";
266    if ($mformat eq 'pwm') {
267	$desc=~s/^pwm//;
268	%mparam=(-pA=>\@fa,-pC=>\@fc,-pG=>\@fg,-pT=>\@ft,-id=>$id,-desc=>$desc,
269		 -lA=>$a,-lC=>$c,-lG=>$g,-lT=>$t);
270    }
271    else {
272	%mparam=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t,-id=>$id,-desc=>$desc);
273    }
274    return new Bio::Matrix::PSM::SiteMatrix(%mparam);
275}
276
277sub _rearrange_matrix {
278    my $mdata=shift;
279    my (@a,@c,@g,@t);
280    foreach my $entry (@{$mdata}) {
281	my ($a,$c,$g,$t)=@$entry;
282	push @a,$a;
283	push @c,$c;
284	push @g,$g;
285	push @t,$t;
286    }
287    return \@a,\@c,\@g,\@t;
288}
289
290
291sub _count_positions {
292    my $seq=shift;
293    my %pos;
294    my $l=length($seq->[0])-1;
295    for( my $i = 0; $i <= $l; $i++ ) {
296	for ( qw(A C G T) ) {
297	    $pos{$_}->[$i] = 0;
298	}
299    }
300    foreach my $sequence (@{$seq}) {
301	my @let= split(//,$sequence);
302	for my $i (0..$#let) {
303	    $pos{uc($let[$i])}->[$i]++;
304	}
305    }
306    return $pos{A},$pos{C},$pos{G},$pos{T};
307}
308
309
310sub _freq_to_count {
311    my $h=shift;
312    my $a=int(10*$h->{pA}+0.5);
313    my $c=int(10*$h->{pC}+0.5);
314    my $g=int(10*$h->{pG}+0.5);
315    my $t=int(10*$h->{pT}+0.5);
316    return ($a,$c,$g,$t);
317}
318
3191;
320