1#--------------------------------------------------------- 2 3=head1 NAME 4 5Bio::Matrix::PSM::SiteMatrix - SiteMatrixI implementation, holds a 6position scoring matrix (or position weight matrix) and log-odds 7 8=head1 SYNOPSIS 9 10 use Bio::Matrix::PSM::SiteMatrix; 11 # Create from memory by supplying probability matrix hash 12 # both as strings or arrays 13 # where the frequencies $a,$c,$g and $t are supplied either as 14 # arrayref or string. Accordingly, lA, lC, lG and lT are the log 15 # odds (only as arrays, no checks done right now) 16 my ($a,$c,$g,$t,$score,$ic, $mid)=@_; 17 #or 18 my ($a,$c,$g,$t,$score,$ic,$mid)=('05a011','110550','400001', 19 '100104',0.001,19.2,'CRE1'); 20 #Where a stands for all (this frequency=1), see explanation below 21 my %param=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t, 22 -lA=>$la, -lC=>$lc,-lG=>$lg,-lT=>$l, 23 -IC=>$ic,-e_val=>$score, -id=>$mid); 24 my $site=Bio::Matrix::PSM::SiteMatrix->new(%param); 25 #Or get it from a file: 26 use Bio::Matrix::PSM::IO; 27 my $psmIO= Bio::Matrix::PSM::IO->new(-file=>$file, -format=>'transfac'); 28 while (my $psm=$psmIO->next_psm) { 29 #Now we have a Bio::Matrix::PSM::Psm object, 30 # see Bio::Matrix::PSM::PsmI for details 31 #This is a Bio::Matrix::PSM::SiteMatrix object now 32 my $matrix=$psm->matrix; 33 } 34 35 # Get a simple consensus, where alphabet is {A,C,G,T,N}, 36 # choosing the character that both satisfies a supplied or default threshold 37 # frequency and is the most frequenct character at each position, or N. 38 # So for the position with A, C, G, T frequencies of 0.5, 0.25, 0.10, 0.15, 39 # the simple consensus character will be 'A', whilst for 0.5, 0.5, 0, 0 it 40 # would be 'N'. 41 my $consensus=$site->consensus; 42 43 # Get the IUPAC ambiguity code representation of the data in the matrix. 44 # Because the frequencies may have been pseudo-count corrected, insignificant 45 # frequences (below 0.05 by default) are ignored. So a position with 46 # A, C, G, T frequencies of 0.5, 0.5, 0.01, 0.01 will get the IUPAC code 'M', 47 # while 0.97, 0.01, 0.01, 0.01 will get the code 'A' and 48 # 0.25, 0.25, 0.25, 0.25 would get 'N'. 49 my $iupac=$site->IUPAC; 50 51 # Getting/using regular expression (a representation of the IUPAC string) 52 my $regexp=$site->regexp; 53 my $count=grep($regexp,$seq); 54 my $count=($seq=~ s/$regexp/$1/eg); 55 print "Motif $mid is present $count times in this sequence\n"; 56 57=head1 DESCRIPTION 58 59SiteMatrix is designed to provide some basic methods when working with position 60scoring (weight) matrices, such as transcription factor binding sites for 61example. A DNA PSM consists of four vectors with frequencies {A,C,G,T}. This is 62the minimum information you should provide to construct a PSM object. The 63vectors can be provided as strings with frequenciesx10 rounded to an int, going 64from {0..a} and 'a' represents the maximum (10). This is like MEME's compressed 65representation of a matrix and it is quite useful when working with relational 66DB. If arrays are provided as an input (references to arrays actually) they can 67be any number, real or integer (frequency or count). 68 69When creating the object you can ask the constructor to make a simple pseudo 70count correction by adding a number (typically 1) to all positions (with the 71-correction option). After adding the number the frequencies will be 72calculated. Only use correction when you supply counts, not frequencies. 73 74Throws an exception if: You mix as an input array and string (for example A 75matrix is given as array, C - as string). The position vector is (0,0,0,0). One 76of the probability vectors is shorter than the rest. 77 78Summary of the methods I use most frequently (details below): 79 80 iupac - return IUPAC compliant consensus as a string 81 score - Returns the score as a real number 82 IC - information content. Returns a real number 83 id - identifier. Returns a string 84 accession - accession number. Returns a string 85 next_pos - return the sequence probably for each letter, IUPAC 86 symbol, IUPAC probability and simple sequence 87 consenus letter for this position. Rewind at the end. Returns a hash. 88 pos - current position get/set. Returns an integer. 89 regexp - construct a regular expression based on IUPAC consensus. 90 For example AGWV will be [Aa][Gg][AaTt][AaCcGg] 91 width - site width 92 get_string - gets the probability vector for a single base as a string. 93 get_array - gets the probability vector for a single base as an array. 94 get_logs_array - gets the log-odds vector for a single base as an array. 95 96New methods, which might be of interest to anyone who wants to store 97PSM in a relational database without creating an entry for each 98position is the ability to compress the PSM vector into a string with 99losing usually less than 1% of the data. this can be done with: 100 101 my $str=$matrix->get_compressed_freq('A'); 102or 103 my $str=$matrix->get_compressed_logs('A'); 104 105Loading from a database should be done with new, but is not yest implemented. 106However you can still uncompress such string with: 107 108 my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM 109or 110 my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds 111 112=head1 FEEDBACK 113 114=head2 Mailing Lists 115 116User feedback is an integral part of the evolution of this and other 117Bioperl modules. Send your comments and suggestions preferably to one 118of the Bioperl mailing lists. Your participation is much appreciated. 119 120 bioperl-l@bioperl.org - General discussion 121 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 122 123=head2 Support 124 125Please direct usage questions or support issues to the mailing list: 126 127I<bioperl-l@bioperl.org> 128 129rather than to the module maintainer directly. Many experienced and 130reponsive experts will be able look at the problem and quickly 131address it. Please include a thorough description of the problem 132with code and data examples if at all possible. 133 134=head2 Reporting Bugs 135 136Report bugs to the Bioperl bug tracking system to help us keep track 137the bugs and their resolution. Bug reports can be submitted via the 138web: 139 140 https://github.com/bioperl/bioperl-live/issues 141 142=head1 AUTHOR - Stefan Kirov 143 144Email skirov@utk.edu 145 146=head1 CONTRIBUTORS 147 148Sendu Bala, bix@sendu.me.uk 149 150=head1 APPENDIX 151 152The rest of the documentation details each of the object methods. 153Internal methods are usually preceded with a _ 154 155=cut 156 157# Let the code begin... 158package Bio::Matrix::PSM::SiteMatrix; 159$Bio::Matrix::PSM::SiteMatrix::VERSION = '1.7.7'; 160use strict; 161 162use base qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI); 163 164=head2 new 165 166 Title : new 167 Usage : my $site=Bio::Matrix::PSM::SiteMatrix->new(-pA=>$a,-pC=>$c, 168 -pG=>$g,-pT=>$t, 169 -IC=>$ic, 170 -e_val=>$score, 171 -id=>$mid); 172 Function: Creates a new Bio::Matrix::PSM::SiteMatrix object from memory 173 Throws : If inconsistent data for all vectors (A,C,G and T) is 174 provided, if you mix input types (string vs array) or if a 175 position freq is 0. 176 Returns : Bio::Matrix::PSM::SiteMatrix object 177 Args : -pA => vector with the frequencies or counts of A 178 -pC => vector for C 179 -pG => vector for G 180 -pt => vector for T 181 -lA => vector for the log of A 182 -lC => vector for the log of C 183 -lG => vector for the log of G 184 -lT => vector for the log of T 185 -IC => real number, the information content of this matrix 186 -e_val => real number, the expect value 187 -id => string, an identifier 188 -width => int, width of the matrix in nucleotides 189 -sites => int, the number of sites that went into this matrix 190 -model => hash ref, background frequencies for A, C, G and T 191 -correction => number, the number to add to all positions to achieve 192 pseudo count correction (default 0: no correction) 193 NB: do not use correction when your input is 194 frequences! 195 -accession_number => string, an accession number 196 197 Vectors can be strings of the frequencies where the frequencies are 198 multiplied by 10 and rounded to the nearest whole number, and where 199 'a' is used to denote the maximal frequency 10. There should be no 200 punctuation (spaces etc.) in the string. For example, 'a0501'. 201 Alternatively frequencies or counts can be represented by an array 202 ref containing the counts, frequencies or logs as any kind of 203 number. 204 205=cut 206 207sub new { 208 my ($class, @args) = @_; 209 my $self = $class->SUPER::new(@args); 210 my $consensus; 211 # Too many things to rearrange, and I am creating simultanuously >500 212 # such objects routinely, so this becomes performance issue 213 my %input; 214 while (@args) { 215 (my $key = shift @args) =~ s/-//g; #deletes all dashes (only dashes)! 216 $input{$key} = shift @args; 217 } 218 $self->{_position} = 0; 219 $self->{IC} = $input{IC}; 220 $self->{e_val} = $input{e_val}; 221 $self->{width} = $input{width}; 222 $self->{logA} = $input{lA}; 223 $self->{logC} = $input{lC}; 224 $self->{logG} = $input{lG}; 225 $self->{logT} = $input{lT}; 226 $self->{sites} = $input{sites}; 227 $self->{id} = $input{id} || 'null'; 228 $self->{correction} = $input{correction} || 0; 229 $self->{accession_number} = $input{accession_number}; 230 return $self unless (defined($input{pA}) && defined($input{pC}) && defined($input{pG}) && defined($input{pT})); 231 232 # This should go to _initialize? 233 # Check for input type- no mixing alllowed, throw ex 234 if (ref($input{pA}) =~ /ARRAY/i ) { 235 $self->throw("Mixing matrix data types not allowed: C is not reference") unless(ref($input{pC})); 236 $self->throw("Mixing matrix data types not allowed: G is not reference") unless (ref($input{pG})); 237 $self->throw("Mixing matrix data types not allowed: T is not reference") unless (ref($input{pT})); 238 $self->{probA} = $input{pA}; 239 $self->{probC} = $input{pC}; 240 $self->{probG} = $input{pG}; 241 $self->{probT} = $input{pT}; 242 } 243 else { 244 $self->throw("Mixing matrix data types not allowed: C is reference") if (ref($input{pC})); 245 $self->throw("Mixing matrix data types not allowed: G is reference") if (ref($input{pG})); 246 $self->throw("Mixing matrix data types not allowed: T is reference") if (ref($input{pT})); 247 $self->{probA} = [split(//,$input{pA})]; 248 $self->{probC} = [split(//,$input{pC})]; 249 $self->{probG} = [split(//,$input{pG})]; 250 $self->{probT} = [split(//,$input{pT})]; 251 for (my $i=0; $i<= @{$self->{probA}}+1; $i++) { 252 # we implictely assume these are MEME-style frequencies x 10, so 253 # 'a' represents the 'maximum', 10. Other positions can actually 254 # add up to over 10 due to rounding, but I don't think that is a 255 # problem? 256 if (${$self->{probA}}[$i] and ${$self->{probA}}[$i] eq 'a') { 257 ${$self->{probA}}[$i]='10'; 258 } 259 if (${$self->{probC}}[$i] and ${$self->{probC}}[$i] eq 'a') { 260 ${$self->{probC}}[$i]='10'; 261 } 262 if (${$self->{probG}}[$i] and ${$self->{probG}}[$i] eq 'a') { 263 ${$self->{probG}}[$i]='10'; 264 } 265 if (${$self->{probT}}[$i] and ${$self->{probT}}[$i] eq 'a') { 266 ${$self->{probT}}[$i]='10'; 267 } 268 } 269 } 270 271 # Check for position with 0 for all bases, throw exception if so 272 for (my $i=0;$i <= $#{$self->{probA}}; $i++) { 273 if ((${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i]) == 0) { 274 $self->throw("Position meaningless-all frequencies are 0"); 275 } 276 277 # apply pseudo-count correction to all values - this will result in 278 # very bad frequencies if the input is already frequences and a 279 # correction value as large as 1 is used! 280 if ($self->{correction}) { 281 ${$self->{probA}}[$i] += $self->{correction}; 282 ${$self->{probC}}[$i] += $self->{correction}; 283 ${$self->{probG}}[$i] += $self->{correction}; 284 ${$self->{probT}}[$i] += $self->{correction}; 285 } 286 287 # (re)calculate frequencies 288 my $div= ${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i]; 289 ${$self->{probA}}[$i]=${$self->{probA}}[$i]/$div; 290 ${$self->{probC}}[$i]=${$self->{probC}}[$i]/$div; 291 ${$self->{probG}}[$i]=${$self->{probG}}[$i]/$div; 292 ${$self->{probT}}[$i]=${$self->{probT}}[$i]/$div; 293 } 294 295 # Calculate the logs 296 if ((!defined($self->{logA})) && ($input{model})) { 297 $self->calc_weight($input{model}); 298 } 299 300 # Make consensus, throw if any one of the vectors is shorter 301 $self->_calculate_consensus; 302 return $self; 303} 304 305=head2 _calculate_consensus 306 307 Title : _calculate_consensus 308 Function: Internal stuff 309 310=cut 311 312sub _calculate_consensus { 313 my $self=shift; 314 my ($lc,$lt,$lg)=($#{$self->{probC}},$#{$self->{probT}},$#{$self->{probG}}); 315 my $len=$#{$self->{probA}}; 316 $self->throw("Probability matrix is damaged for C: $len vs $lc") if ($len != $lc); 317 $self->throw("Probability matrix is damaged for T: $len vs $lt") if ($len != $lt); 318 $self->throw("Probability matrix is damaged for G: $len vs $lg") if ($len != $lg); 319 for (my $i=0; $i<$len+1; $i++) { 320 #*** IUPACp values not actually used (eg. by next_pos) 321 (${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]); 322 (${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]); 323 } 324 return $self; 325} 326 327=head2 calc_weight 328 329 Title : calc_weight 330 Usage : $obj->calc_weight({A=>0.2562, C=>0.2438, G=>0.2432, T=>0.2568}); 331 Function: Recalculates the PSM (or weights) based on the PFM (the frequency 332 matrix) and user supplied background model. 333 Throws : if no model is supplied 334 Returns : n/a 335 Args : reference to a hash with background frequencies for A,C,G and T 336 337=cut 338 339sub calc_weight { 340 my ($self, $model) = @_; 341 my %model; 342 $model{probA}=$model->{A}; 343 $model{probC}=$model->{C}; 344 $model{probG}=$model->{G}; 345 $model{probT}=$model->{T}; 346 foreach my $let (qw(probA probC probG probT)) { 347 my @str; 348 $self->throw('You did not provide valid model\n') unless (($model{$let}>0) && ($model{$let}<1)); 349 foreach my $f (@{$self->{$let}}) { 350 my $w=log($f)-log($model{$let}); 351 push @str,$w; 352 } 353 my $llet=$let; 354 $llet=~s/prob/log/; 355 $self->{$llet}=\@str; 356 } 357 return $self; 358} 359 360=head2 next_pos 361 362 Title : next_pos 363 Usage : 364 Function: Retrieves the next position features: frequencies for A,C,G,T, the 365 main letter (as in consensus) and the probabilty for this letter to 366 occur at this position and the current position 367 Returns : hash (pA,pC,pG,pT,logA,logC,logG,logT,base,prob,rel) 368 Args : none 369 370=cut 371 372sub next_pos { 373 my $self = shift; 374 $self->throw("instance method called on class") unless ref $self; 375 my $len=@{$self->{seq}}; 376 my $pos=$self->{_position}; 377 # End reached? 378 if ($pos<$len) { 379 my $pA=${$self->{probA}}[$pos]; 380 my $pC=${$self->{probC}}[$pos]; 381 my $pG=${$self->{probG}}[$pos]; 382 my $pT=${$self->{probT}}[$pos]; 383 my $lA=${$self->{logA}}[$pos]; 384 my $lC=${$self->{logC}}[$pos]; 385 my $lG=${$self->{logG}}[$pos]; 386 my $lT=${$self->{logT}}[$pos]; 387 my $base=${$self->{seq}}[$pos]; 388 my $prob=${$self->{seqp}}[$pos]; 389 $self->{_position}++; 390 my %seq=(pA=>$pA,pT=>$pT,pC=>$pC,pG=>$pG, lA=>$lA,lT=>$lT,lC=>$lC,lG=>$lG,base=>$base,rel=>$pos, prob=>$prob); 391 return %seq; 392 } 393 else {$self->{_position}=0; return;} 394} 395 396=head2 curpos 397 398 Title : curpos 399 Usage : 400 Function: Gets/sets the current position. Converts to 0 if argument is minus 401 and to width if greater than width 402 Returns : integer 403 Args : integer 404 405=cut 406 407sub curpos { 408 my $self = shift; 409 my $prev = $self->{_position}; 410 if (@_) { $self->{_position} = shift; } 411 return $prev; 412} 413 414=head2 e_val 415 416 Title : e_val 417 Usage : 418 Function: Gets/sets the e-value 419 Returns : real number 420 Args : none to get, real number to set 421 422=cut 423 424sub e_val { 425 my $self = shift; 426 my $prev = $self->{e_val}; 427 if (@_) { $self->{e_val} = shift; } 428 return $prev; 429} 430 431=head2 IC 432 433 Title : IC 434 Usage : 435 Function: Get/set the Information Content 436 Returns : real number 437 Args : none to get, real number to set 438 439=cut 440 441sub IC { 442 my $self = shift; 443 my $prev = $self->{IC}; 444 if (@_) { $self->{IC} = shift; } 445 return $prev; 446} 447 448=head2 accession_number 449 450 Title : accession_number 451 Function: Get/set the accession number, this will be unique id for the 452 SiteMatrix object as well for any other object, inheriting from 453 SiteMatrix 454 Returns : string 455 Args : none to get, string to set 456 457=cut 458 459sub accession_number { 460 my $self = shift; 461 my $prev = $self->{accession_number}; 462 if (@_) { $self->{accession_number} = shift; } 463 return $prev; 464} 465 466=head2 consensus 467 468 Title : consensus 469 Usage : 470 Function: Returns the consensus 471 Returns : string 472 Args : (optional) threshold value 1 to 10, default 5 473 '5' means the returned characters had a 50% or higher presence at 474 their position 475 476=cut 477 478sub consensus { 479 my ($self, $thresh) = @_; 480 if ($thresh) { 481 my $len=$#{$self->{probA}}; 482 for (my $i=0; $i<$len+1; $i++) { 483 (${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh); 484 } 485 } 486 my $consensus=''; 487 foreach my $letter (@{$self->{seq}}) { 488 $consensus .= $letter; 489 } 490 return $consensus; 491} 492 493=head2 width 494 495 Title : width 496 Usage : 497 Function: Returns the length of the sites in used to make this matrix 498 Returns : int 499 Args : none 500 501=cut 502 503sub width { 504 my $self = shift; 505 my $width=@{$self->{probA}}; 506 return $width; 507} 508 509=head2 sites 510 511 Title : sites 512 Usage : 513 Function: Get/set the number of sites that were used to make this matrix 514 Returns : int 515 Args : none to get, int to set 516 517=cut 518 519sub sites { 520 my $self = shift; 521 if (@_) { $self->{sites} = shift } 522 return $self->{sites} || return; 523} 524 525=head2 IUPAC 526 527 Title : IUPAC 528 Usage : 529 Function: Returns IUPAC compliant consensus 530 Returns : string 531 Args : optionally, also supply a whole number (int) of 1 or higher to set 532 the significance level when considering the frequencies. 1 (the 533 default) means a 0.05 significance level: frequencies lower than 534 0.05 will be ignored. 2 Means a 0.005 level, and so on. 535 536=cut 537 538sub IUPAC { 539 my ($self, $thresh) = @_; 540 if ($thresh) { 541 my $len=$#{$self->{probA}}; 542 for (my $i=0; $i<$len+1; $i++) { 543 (${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh); 544 } 545 } 546 my $iu=$self->{IUPAC}; 547 my $iupac=''; 548 foreach my $let (@{$iu}) { 549 $iupac .= $let; 550 } 551 return $iupac; 552} 553 554=head2 _to_IUPAC 555 556 Title : _to_IUPAC 557 Usage : 558 Function: Converts a single position to IUPAC compliant symbol. 559 For rules see the implementation 560 Returns : char, real number 561 Args : real numbers for frequencies of A,C,G,T (positional) 562 563 optionally, also supply a whole number (int) of 1 or higher to set 564 the significance level when considering the frequencies. 1 (the 565 default) means a 0.05 significance level: frequencies lower than 566 0.05 will be ignored. 2 Means a 0.005 level, and so on. 567 568=cut 569 570sub _to_IUPAC { 571 my ($a, $c, $g, $t, $thresh) = @_; 572 $thresh ||= 1; 573 $thresh = int($thresh); 574 $a = sprintf ("%.${thresh}f", $a); 575 $c = sprintf ("%.${thresh}f", $c); 576 $g = sprintf ("%.${thresh}f", $g); 577 $t = sprintf ("%.${thresh}f", $t); 578 579 my $total = $a + $c + $g + $t; 580 581 return 'A' if ($a == $total); 582 return 'G' if ($g == $total); 583 return 'C' if ($c == $total); 584 return 'T' if ($t == $total); 585 my $r=$g+$a; 586 return 'R' if ($r == $total); 587 my $y=$t+$c; 588 return 'Y' if ($y == $total); 589 my $m=$a+$c; 590 return 'M' if ($m == $total); 591 my $k=$g+$t; 592 return 'K' if ($k == $total); 593 my $s=$g+$c; 594 return 'S' if ($s == $total); 595 my $w=$a+$t; 596 return 'W' if ($w == $total); 597 my $d=$r+$t; 598 return 'D' if ($d == $total); 599 my $v=$r+$c; 600 return 'V' if ($v == $total); 601 my $b=$y+$g; 602 return 'B' if ($b == $total); 603 my $h=$y+$a; 604 return 'H' if ($h == $total); 605 return 'N'; 606} 607 608=head2 _to_cons 609 610 Title : _to_cons 611 Usage : 612 Function: Converts a single position to simple consensus character and returns 613 its probability. For rules see the implementation 614 Returns : char, real number 615 Args : real numbers for A,C,G,T (positional), and optional 5th argument of 616 threshold (as a number between 1 and 10, where 5 is default and 617 means the returned character had a 50% or higher presence at this 618 position) 619 620=cut 621 622sub _to_cons { 623 my ($A, $C, $G, $T, $thresh) = @_; 624 $thresh ||= 5; 625 626 # this multiplication by 10 is just to satisfy the thresh range of 1-10 627 my $a = $A * 10; 628 my $c = $C * 10; 629 my $g = $G * 10; 630 my $t = $T * 10; 631 632 return 'N',10 if (($a<$thresh) && ($c<$thresh) && ($g<$thresh) && ($t<$thresh)); 633 return 'N',10 if (($a==$t) && ($a==$c) && ($a==$g)); 634 635 # threshold could be lower than 50%, so must check is not only over 636 # threshold, but also the highest frequency 637 return 'A',$a if (($a>=$thresh) && ($a>$t) && ($a>$c) && ($a>$g)); 638 return 'C',$c if (($c>=$thresh) && ($c>$t) && ($c>$a) && ($c>$g)); 639 return 'G',$g if (($g>=$thresh) && ($g>$t) && ($g>$c) && ($g>$a)); 640 return 'T',$t if (($t>=$thresh) && ($t>$g) && ($t>$c) && ($t>$a)); 641 642 return 'N',10; 643} 644 645=head2 get_string 646 647 Title : get_string 648 Usage : 649 Function: Returns given probability vector as a string. Useful if you want to 650 store things in a rel database, where arrays are not first choice 651 Throws : If the argument is outside {A,C,G,T} 652 Returns : string 653 Args : character {A,C,G,T} 654 655=cut 656 657sub get_string { 658 my $self=shift; 659 my $base=shift; 660 my $string=''; 661 my @prob; 662 663 BASE: { 664 if ($base eq 'A') {@prob= @{$self->{probA}}; last BASE; } 665 if ($base eq 'C') {@prob= @{$self->{probC}}; last BASE; } 666 if ($base eq 'G') {@prob= @{$self->{probG}}; last BASE; } 667 if ($base eq 'T') {@prob= @{$self->{probT}}; last BASE; } 668 $self->throw ("No such base: $base!\n"); 669 } 670 671 foreach my $prob (@prob) { 672 my $corrected = $prob*10; 673 my $next=sprintf("%.0f",$corrected); 674 $next='a' if ($next eq '10'); 675 $string .= $next; 676 } 677 return $string; 678} 679 680=head2 get_array 681 682 Title : get_array 683 Usage : 684 Function: Returns an array with frequencies for a specified base 685 Returns : array 686 Args : char 687 688=cut 689 690sub get_array { 691 my $self=shift; 692 my $base=uc(shift); 693 return @{$self->{probA}} if ($base eq 'A'); 694 return @{$self->{probC}} if ($base eq 'C'); 695 return @{$self->{probG}} if ($base eq 'G'); 696 return @{$self->{probT}} if ($base eq 'T'); 697 $self->throw("No such base: $base!\n"); 698} 699 700=head2 get_logs_array 701 702 Title : get_logs_array 703 Usage : 704 Function: Returns an array with log_odds for a specified base 705 Returns : array 706 Args : char 707 708=cut 709 710sub get_logs_array { 711 my $self=shift; 712 my $base=uc(shift); 713 return @{$self->{logA}} if (($base eq 'A') && ($self->{logA})); 714 return @{$self->{logC}} if (($base eq 'C') && ($self->{logC})); 715 return @{$self->{logG}} if (($base eq 'G') && ($self->{logG})); 716 return @{$self->{logT}} if (($base eq 'T') && ($self->{logT})); 717 $self->throw ("No such base: $base!\n") if (!grep(/$base/,qw(A C G T))); 718 return; 719} 720 721=head2 id 722 723 Title : id 724 Usage : 725 Function: Gets/sets the site id 726 Returns : string 727 Args : string 728 729=cut 730 731sub id { 732 my $self = shift; 733 my $prev = $self->{id}; 734 if (@_) { $self->{id} = shift; } 735 return $prev; 736} 737 738=head2 regexp 739 740 Title : regexp 741 Usage : 742 Function: Returns a regular expression which matches the IUPAC convention. 743 N will match X, N, - and . 744 Returns : string 745 Args : none (works at the threshold last used for making the IUPAC string) 746 747=cut 748 749sub regexp { 750 my $self=shift; 751 my $regexp; 752 foreach my $letter (@{$self->{IUPAC}}) { 753 my $reg; 754 LETTER: { 755 if ($letter eq 'A') { $reg='[Aa]'; last LETTER; } 756 if ($letter eq 'C') { $reg='[Cc]'; last LETTER; } 757 if ($letter eq 'G') { $reg='[Gg]'; last LETTER; } 758 if ($letter eq 'T') { $reg='[Tt]'; last LETTER; } 759 if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; } 760 if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; } 761 if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; } 762 if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; } 763 if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; } 764 if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; } 765 if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; } 766 if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; } 767 if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; } 768 if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; } 769 $reg='\S'; 770 } 771 $regexp .= $reg; 772 } 773 return $regexp; 774} 775 776=head2 regexp_array 777 778 Title : regexp_array 779 Usage : 780 Function: Returns a regular expression which matches the IUPAC convention. 781 N will match X, N, - and . 782 Returns : array 783 Args : none (works at the threshold last used for making the IUPAC string) 784 To do : I have separated regexp and regexp_array, but 785 maybe they can be rewritten as one - just check what should be returned 786 787=cut 788 789sub regexp_array { 790 my $self=shift; 791 my @regexp; 792 foreach my $letter (@{$self->{IUPAC}}) { 793 my $reg; 794 LETTER: { 795 if ($letter eq 'A') { $reg='[Aa]'; last LETTER; } 796 if ($letter eq 'C') { $reg='[Cc]'; last LETTER; } 797 if ($letter eq 'G') { $reg='[Gg]'; last LETTER; } 798 if ($letter eq 'T') { $reg='[Tt]'; last LETTER; } 799 if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; } 800 if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; } 801 if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; } 802 if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; } 803 if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; } 804 if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; } 805 if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; } 806 if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; } 807 if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; } 808 if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; } 809 $reg='\S'; 810 } 811 push @regexp,$reg; 812 } 813 return @regexp; 814} 815 816 817=head2 _compress_array 818 819 Title : _compress_array 820 Usage : 821 Function: Will compress an array of real signed numbers to a string (ie vector 822 of bytes) -127 to +127 for bi-directional(signed) and 0..255 for 823 unsigned 824 Returns : String 825 Args : array reference, followed by an max value and direction (optional, 826 default 1-unsigned),1 unsigned, any other is signed. 827 828=cut 829 830sub _compress_array { 831 my ($array,$lm,$direct)=@_; 832 my $str; 833 return unless(($array) && ($lm)); 834 $direct=1 unless ($direct); 835 my $k1= ($direct==1) ? (255/$lm) : (127/$lm); 836 foreach my $c (@{$array}) { 837 $c=$lm if ($c>$lm); 838 $c=-$lm if (($c<-$lm) && ($direct !=1)); 839 $c=0 if (($c<0) && ($direct ==1)); 840 my $byte=int($k1*$c); 841 $byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits 842 my $char=chr($byte); 843 $str.=$char; 844 } 845 return $str; 846} 847 848=head2 _uncompress_string 849 850 Title : _uncompress_string 851 Usage : 852 Function: Will uncompress a string (vector of bytes) to create an array of 853 real signed numbers (opposite to_compress_array) 854 Returns : string, followed by an max value and 855 direction (optional, default 1-unsigned), 1 unsigned, any other is signed. 856 Args : array 857 858=cut 859 860sub _uncompress_string { 861 my ($str,$lm,$direct)=@_; 862 my @array; 863 return unless(($str) && ($lm)); 864 $direct=1 unless ($direct); 865 my $k1= ($direct==1) ? (255/$lm) : (127/$lm); 866 foreach my $c (split(//,$str)) { 867 my $byte=ord($c); 868 $byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits 869 my $num=$byte/$k1; 870 push @array,$num; 871 } 872 return @array; 873} 874 875=head2 get_compressed_freq 876 877 Title : get_compressed_freq 878 Usage : 879 Function: A method to provide a compressed frequency vector. It uses one byte 880 to code the frequence for one of the probability vectors for one 881 position. Useful for relational database. Improvement of the previous 882 0..a coding. 883 Example : my $strA=$self->get_compressed_freq('A'); 884 Returns : String 885 Args : char 886 887=cut 888 889sub get_compressed_freq { 890 my $self=shift; 891 my $base=shift; 892 my $string=''; 893 my @prob; 894 BASE: { 895 if ($base eq 'A') { 896 @prob= @{$self->{probA}} unless (!defined($self->{probA})); 897 last BASE; 898 } 899 if ($base eq 'G') { 900 @prob= @{$self->{probG}} unless (!defined($self->{probG})); 901 last BASE; 902 } 903 if ($base eq 'C') { 904 @prob= @{$self->{probC}} unless (!defined($self->{probC})); 905 last BASE; 906 } 907 if ($base eq 'T') { 908 @prob= @{$self->{probT}} unless (!defined($self->{probT})); 909 last BASE; 910 } 911 $self->throw ("No such base: $base!\n"); 912 } 913 my $str= _compress_array(\@prob,1,1); 914 return $str; 915} 916 917=head2 get_compressed_logs 918 919 Title : get_compressed_logs 920 Usage : 921 Function: A method to provide a compressed log-odd vector. It uses one byte to 922 code the log value for one of the log-odds vectors for one position. 923 Example : my $strA=$self->get_compressed_logs('A'); 924 Returns : String 925 Args : char 926 927=cut 928 929sub get_compressed_logs { 930 my $self=shift; 931 my $base=shift; 932 my $string=''; 933 my @prob; 934 BASE: { 935 if ($base eq 'A') {@prob= @{$self->{logA}} unless (!defined($self->{logA})); last BASE; } 936 if ($base eq 'C') {@prob= @{$self->{logC}} unless (!defined($self->{logC})); last BASE; } 937 if ($base eq 'G') {@prob= @{$self->{logG}} unless (!defined($self->{logG})); last BASE; } 938 if ($base eq 'T') {@prob= @{$self->{logT}} unless (!defined($self->{logT})); last BASE; } 939 $self->throw ("No such base: $base!\n"); 940 } 941 return _compress_array(\@prob,1000,2); 942} 943 944=head2 sequence_match_weight 945 946 Title : sequence_match_weight 947 Usage : 948 Function: This method will calculate the score of a match, based on the PWM 949 if such is associated with the matrix object. Returns undef if no 950 PWM data is available. 951 Throws : if the length of the sequence is different from the matrix width 952 Example : my $score=$matrix->sequence_match_weight('ACGGATAG'); 953 Returns : Floating point 954 Args : string 955 956=cut 957 958sub sequence_match_weight { 959 my ($self,$seq)=@_; 960 return unless ($self->{logA}); 961 my $width=$self->width; 962 $self->throw ("I can calculate the score only for sequence which are exactly my size for $seq, my width is $width\n") unless (length($seq)==@{$self->{logA}}); 963 $seq = uc($seq); 964 my @seq=split(//,$seq); 965 my $score = 0; 966 my $i=0; 967 foreach my $pos (@seq) { 968 my $tv = 'log'.$pos; 969 $self->warn("Position ".($i+1)." of input sequence has unknown (ambiguity?) character '$pos': scores will be wrong") unless defined $self->{$tv}; 970 $score += defined $self->{$tv} ? $self->{$tv}->[$i] : 0; 971 $i++; 972 } 973 return $score; 974} 975 976=head2 get_all_vectors 977 978 Title : get_all_vectors 979 Usage : 980 Function: returns all possible sequence vectors to satisfy the PFM under 981 a given threshold 982 Throws : If threshold outside of 0..1 (no sense to do that) 983 Example : my @vectors=$self->get_all_vectors(4); 984 Returns : Array of strings 985 Args : (optional) floating 986 987=cut 988 989sub get_all_vectors { 990 my $self=shift; 991 my $thresh=shift; 992 $self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) || ($thresh>1)); 993 my @seq=split(//,$self->consensus($thresh*10)); 994 my @perm; 995 for my $i (0..@{$self->{probA}}) { 996 push @{$perm[$i]},'A' if ($self->{probA}->[$i]>$thresh); 997 push @{$perm[$i]},'C' if ($self->{probC}->[$i]>$thresh); 998 push @{$perm[$i]},'G' if ($self->{probG}->[$i]>$thresh); 999 push @{$perm[$i]},'T' if ($self->{probT}->[$i]>$thresh); 1000 push @{$perm[$i]},'N' if ($seq[$i] eq 'N'); 1001 } 1002 my $fpos=shift @perm; 1003 my @strings=@$fpos; 1004 foreach my $pos (@perm) { 1005 my @newstr; 1006 foreach my $let (@$pos) { 1007 foreach my $string (@strings) { 1008 my $newstring = $string . $let; 1009 push @newstr,$newstring; 1010 } 1011 } 1012 @strings=@newstr; 1013 } 1014 return @strings; 1015} 1016 10171; 1018