1#--------------------------------------------------------- 2 3=head1 NAME 4 5Bio::Matrix::PSM::IO::masta - motif fasta format parser 6 7=head1 SYNOPSIS 8 9MASTA is a position frequency matrix format similar to 10fasta. It contains one ID row just like fasta and then the actual 11data, which is tab delimited: 12 13 0.1 0.62 .017 0.11 14 0.22 0.13 0.54 0.11 15 16Or A,C,G and T could be horizontally positioned (positioning is 17automatically detected). Please note masta will parse only DNA at the 18moment. 19 20It will also convert a set of aligned sequences: 21ACATGCAT 22ACAGGGAT 23ACAGGCAT 24ACCGGCAT 25 26to a PFM (SiteMatrix object). When writing if you supply SEQ it will 27write 10 random instances, which represent correctly the frequency and 28can be used as an input for weblogo creation purposes. 29 30See Bio::Matrix::PSM::IO for detailed documentation on how to use masta parser 31 32=head1 DESCRIPTION 33 34Parser for meme. 35 36=head1 FEEDBACK 37 38=head2 Mailing Lists 39 40User feedback is an integral part of the evolution of this 41and other Bioperl modules. Send your comments and suggestions preferably 42 to one of the Bioperl mailing lists. 43Your participation is much appreciated. 44 45 bioperl-l@bioperl.org - General discussion 46 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 47 48=head2 Support 49 50Please direct usage questions or support issues to the mailing list: 51 52I<bioperl-l@bioperl.org> 53 54rather than to the module maintainer directly. Many experienced and 55reponsive experts will be able look at the problem and quickly 56address it. Please include a thorough description of the problem 57with code and data examples if at all possible. 58 59=head2 Reporting Bugs 60 61Report bugs to the Bioperl bug tracking system to help us keep track 62the bugs and their resolution. Bug reports can be submitted via the 63web: 64 65 https://github.com/bioperl/bioperl-live/issues 66 67=head1 AUTHOR - Stefan Kirov 68 69Email skirov@utk.edu 70 71=head1 APPENDIX 72 73=cut 74 75 76# Let the code begin... 77package Bio::Matrix::PSM::IO::masta; 78$Bio::Matrix::PSM::IO::masta::VERSION = '1.7.7'; 79use Bio::Matrix::PSM::SiteMatrix; 80use vars qw(@HEADER); 81use strict; 82 83use base qw(Bio::Matrix::PSM::IO Bio::Root::Root); 84 85 86 87=head2 new 88 89 Title : new 90 Usage : my $psmIO = Bio::Matrix::PSM::IO->new(-format=> 'masta', 91 -file => $file, 92 -mtype => 'PWM'); 93 Function: Associates a file with the appropriate parser 94 Throws : 95 Example : 96 Args : hash 97 Returns : "Bio::Matrix::PSM::$format"->new(@args); 98 99=cut 100 101sub new { 102 my($class, @args)=@_; 103 my $self = $class->SUPER::new(@args); 104 my ($file)=$self->_rearrange(['FILE'], @args); 105 my ($query,$tr1)=split(/\./,$file,2); 106 $self->{file} = $file; 107 $self->{_end} = 0; 108 $self->{mtype} = uc($self->_rearrange(['MTYPE'], @args) || "PFM"); 109 $self->_initialize_io(@args) || $self->warn("Did you intend to use STDIN?"); #Read only for now 110 return $self; 111} 112 113=head2 write_psm 114 115 Title : write_psm 116 Usage : 117 Function: writes a pfm/pwm/raw sequence in a simple masta format 118 Throws : 119 Example : 120 Args : SiteMatrix object, type (optional string: PWM, SEQ or PFM) 121 Returns : 122 123=cut 124 125sub write_psm { 126 my ($self,$matrix,$type)=@_; 127 $self->{mtype} = uc($type) if ($type); 128 my $idline=">". $matrix->id . "\n"; 129 $self->_print($idline); 130 unless ($self->{mtype} eq 'SEQ') { 131 while (my %h=$matrix->next_pos) { 132 my $row=$self->{mtype} eq 'PWM' ? join("\t",$h{lA},$h{lC},$h{lG},$h{lT},"\n"):join("\t",$h{pA},$h{pC},$h{pG},$h{pT},"\n"); 133 $self->_print ($row); 134 } 135 } else { 136 my @seq; 137 while (my %h=$matrix->next_pos) { 138 my ($a,$c,$g,$t)=_freq_to_count(\%h); 139 $self->throw("Could not convert from frequency to count\n") if (($a+$c+$g+$t) !=10); 140 for my $i (0..$a-1) {$seq[$i].='A';} 141 my $m=$a+$c; 142 for my $i ($a..$m-1) {$seq[$i].='C';} 143 my $n=$a+$c+$g; 144 for my $i ($m..$n-1) {$seq[$i].='G';} 145 for my $i ($n..9) {$seq[$i].='T';} 146 } 147 foreach my $s (@seq) { 148 $s.="\n"; 149 $self->_print ($s); 150 } 151 } 152} 153 154=head2 next_matrix 155 156 Title : next_matrix 157 Usage : my $matrix = $psmio->next_matrix; 158 Function: Alias of next_psm function 159 160=cut 161 162sub next_matrix { 163 shift->next_psm(@_); 164} 165 166=head2 next_psm 167 168 Title : next_psm 169 Usage : my $matrix=$psmio->next_psm; 170 Function: returns the next matrix in the stream 171 Throws : If there is you mix different types, for example weights and 172 frequencies occur in the same entry You can mix weights, but these 173 should be designated by different ID lines 174 Example : 175 Args : 176 Returns : Bio::Matrix::PSM::SiteMatrix 177 178=cut 179 180sub next_psm { 181 my $self=shift; 182 return if ($self->{_end}); 183 my $line=$self->_readline; 184 $self->throw("No ID line- wrong format\n") unless ($line=~/^>/); 185 my ($id,$desc)=split(/[\t\s]+/,$line,2); 186 $id=~s/>//; 187 my ($mtype,$format,@mdata,$len); 188 $self->{_mtype} = 0; 189 while ($line=$self->_readline) { 190 next if $line =~ /^\s+$/;# There should not be empty lines, but just in case... 191 chomp $line; 192 if ($line =~ /^>/) { 193 $self->_pushback($line); 194 last; 195 } 196 197 if ($line !~ /[^ACGTacgt]/g) { 198 # This is a set of aligned sequences 199 $self->throw("Mixing between types is not allowed or a parsing error occurred\n") 200 if (($self->{_mtype} != 3) && ($mtype)) ; 201 $self->throw("Bad sequence- different length: $line\n") 202 if (($len) && ($len!=length($line))); 203 $len=length($line) unless ($len); 204 push @mdata,$line; 205 $self->{_mtype}=3; 206 } else { 207 # do not strip 'e's since they are part of number notation for small/big numbers 208 $line=~s/[a-df-zA-DF-Z]//g; #Well we may wanna do a hash and auto check for letter order if there is a really boring talk... 209 $line=~s/^[\s\t]+//; 210 $line=~s/[\s\t]+/\t/g; 211 my @data=split(/[\s\t]+/,$line); 212 if ($#data==3) { 213 $self->throw("Mixing between types is not allowed or a parsing error occurred\n") if (($mtype)&&($self->{_mtype} !=1)) ; 214 $self->{_mtype}=1; 215 $mtype=1; 216 } 217 else { 218 $self->throw("Mixing between types is not allowedor a parsing error occurred\n") if (($mtype)&&($self->{_mtype} !=2)) ; 219 $self->{_mtype}=2; 220 $mtype=1; 221 } 222 push @mdata,\@data; 223 } 224 } 225 $self->{_end} = 1 if (!defined $line || $line !~ /^>/); 226 return _make_matrix(\@mdata,$self->{_mtype},$id,$desc); 227} 228 229sub _make_matrix { 230 my ($mdata,$type,$id,$desc)=@_; 231 if ($type==1) { 232 my @rearr=_rearrange_matrix($mdata); 233 $mdata=\@rearr; 234 } 235#Auto recognition for what type is this entry (PFM, PWM or simple count) 236#A bit dangerous, I hate too much auto stuff, but I want to be able to mix different 237#types in a single file 238 my $mformat='count'; 239 my ($a,$c,$g,$t); 240 if ($type == 3 ) { 241 ($a,$c,$g,$t)= &_count_positions($mdata); 242 } else { 243 ($a,$c,$g,$t)=@{$mdata}; 244 my $k=$a->[0]+$c->[0]+$g->[0]+$t->[0]; 245 my $l= ($a->[0]+$c->[0]+$g->[0]+$t->[0]) - 246 (abs($a->[0])+abs($c->[0])+abs($g->[0])+abs($t->[0])); 247 $mformat='freq' if (($k==1) && ($l==0)); 248 $mformat='pwm' if ($l!=0); 249 } 250 my (@fa,@fc,@fg,@ft,%mparam); 251 252 if ($mformat eq 'pwm') { 253 foreach my $i (0..$#{$a}) { 254 my $ca=exp $a->[$i]; 255 my $cc=exp $c->[$i]; 256 my $cg=exp $g->[$i]; 257 my $ct=exp $t->[$i]; 258 my $all=$ca+$cc+$cg+$ct; 259 push @fa,($ca/$all)*100; 260 push @fc,($cc/$all)*100; 261 push @fg,($cg/$all)*100; 262 push @ft,($ct/$all)*100; 263 } 264 } 265 $desc.=", source is $mformat"; 266 if ($mformat eq 'pwm') { 267 $desc=~s/^pwm//; 268 %mparam=(-pA=>\@fa,-pC=>\@fc,-pG=>\@fg,-pT=>\@ft,-id=>$id,-desc=>$desc, 269 -lA=>$a,-lC=>$c,-lG=>$g,-lT=>$t); 270 } 271 else { 272 %mparam=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t,-id=>$id,-desc=>$desc); 273 } 274 return new Bio::Matrix::PSM::SiteMatrix(%mparam); 275} 276 277sub _rearrange_matrix { 278 my $mdata=shift; 279 my (@a,@c,@g,@t); 280 foreach my $entry (@{$mdata}) { 281 my ($a,$c,$g,$t)=@$entry; 282 push @a,$a; 283 push @c,$c; 284 push @g,$g; 285 push @t,$t; 286 } 287 return \@a,\@c,\@g,\@t; 288} 289 290 291sub _count_positions { 292 my $seq=shift; 293 my %pos; 294 my $l=length($seq->[0])-1; 295 for( my $i = 0; $i <= $l; $i++ ) { 296 for ( qw(A C G T) ) { 297 $pos{$_}->[$i] = 0; 298 } 299 } 300 foreach my $sequence (@{$seq}) { 301 my @let= split(//,$sequence); 302 for my $i (0..$#let) { 303 $pos{uc($let[$i])}->[$i]++; 304 } 305 } 306 return $pos{A},$pos{C},$pos{G},$pos{T}; 307} 308 309 310sub _freq_to_count { 311 my $h=shift; 312 my $a=int(10*$h->{pA}+0.5); 313 my $c=int(10*$h->{pC}+0.5); 314 my $g=int(10*$h->{pG}+0.5); 315 my $t=int(10*$h->{pT}+0.5); 316 return ($a,$c,$g,$t); 317} 318 3191; 320