1#---------------------------------------------------------
2
3=head1 NAME
4
5Bio::Matrix::PSM::SiteMatrix - SiteMatrixI implementation, holds a
6position scoring matrix (or position weight matrix) and log-odds
7
8=head1 SYNOPSIS
9
10  use Bio::Matrix::PSM::SiteMatrix;
11  # Create from memory by supplying probability matrix hash
12  # both as strings or arrays
13  # where the frequencies  $a,$c,$g and $t are supplied either as
14  # arrayref or string. Accordingly, lA, lC, lG and lT are the log
15  # odds (only as arrays, no checks done right now)
16  my ($a,$c,$g,$t,$score,$ic, $mid)=@_;
17  #or
18  my ($a,$c,$g,$t,$score,$ic,$mid)=('05a011','110550','400001',
19                                    '100104',0.001,19.2,'CRE1');
20  #Where a stands for all (this frequency=1), see explanation below
21  my %param=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t,
22             -lA=>$la, -lC=>$lc,-lG=>$lg,-lT=>$l,
23             -IC=>$ic,-e_val=>$score, -id=>$mid);
24  my $site=Bio::Matrix::PSM::SiteMatrix->new(%param);
25  #Or get it from a file:
26  use Bio::Matrix::PSM::IO;
27  my $psmIO= Bio::Matrix::PSM::IO->new(-file=>$file, -format=>'transfac');
28  while (my $psm=$psmIO->next_psm) {
29    #Now we have a Bio::Matrix::PSM::Psm object,
30    # see Bio::Matrix::PSM::PsmI for details
31    #This is a Bio::Matrix::PSM::SiteMatrix object now
32    my $matrix=$psm->matrix;
33  }
34
35  # Get a simple consensus, where alphabet is {A,C,G,T,N},
36  # choosing the character that both satisfies a supplied or default threshold
37  # frequency and is the most frequenct character at each position, or N.
38  # So for the position with A, C, G, T frequencies of 0.5, 0.25, 0.10, 0.15,
39  # the simple consensus character will be 'A', whilst for 0.5, 0.5, 0, 0 it
40  # would be 'N'.
41  my $consensus=$site->consensus;
42
43  # Get the IUPAC ambiguity code representation of the data in the matrix.
44  # Because the frequencies may have been pseudo-count corrected, insignificant
45  # frequences (below 0.05 by default) are ignored. So a position with
46  # A, C, G, T frequencies of 0.5, 0.5, 0.01, 0.01 will get the IUPAC code 'M',
47  # while 0.97, 0.01, 0.01, 0.01 will get the code 'A' and
48  # 0.25, 0.25, 0.25, 0.25 would get 'N'.
49  my $iupac=$site->IUPAC;
50
51  # Getting/using regular expression (a representation of the IUPAC string)
52  my $regexp=$site->regexp;
53  my $count=grep($regexp,$seq);
54  my $count=($seq=~ s/$regexp/$1/eg);
55  print "Motif $mid is present $count times in this sequence\n";
56
57=head1 DESCRIPTION
58
59SiteMatrix is designed to provide some basic methods when working with position
60scoring (weight) matrices, such as transcription factor binding sites for
61example. A DNA PSM consists of four vectors with frequencies {A,C,G,T}. This is
62the minimum information you should provide to construct a PSM object. The
63vectors can be provided as strings with frequenciesx10 rounded to an int, going
64from {0..a} and 'a' represents the maximum (10). This is like MEME's compressed
65representation of a matrix and it is quite useful when working with relational
66DB. If arrays are provided as an input (references to arrays actually) they can
67be any number, real or integer (frequency or count).
68
69When creating the object you can ask the constructor to make a simple pseudo
70count correction by adding a number (typically 1) to all positions (with the
71-correction option). After adding the number the frequencies will be
72calculated. Only use correction when you supply counts, not frequencies.
73
74Throws an exception if: You mix as an input array and string (for example A
75matrix is given as array, C - as string). The position vector is (0,0,0,0). One
76of the probability vectors is shorter than the rest.
77
78Summary of the methods I use most frequently (details below):
79
80  iupac - return IUPAC compliant consensus as a string
81  score - Returns the score as a real number
82  IC - information content. Returns a real number
83  id - identifier. Returns a string
84  accession - accession number. Returns a string
85  next_pos - return the sequence probably for each letter, IUPAC
86      symbol, IUPAC probability and simple sequence
87  consenus letter for this position. Rewind at the end. Returns a hash.
88  pos - current position get/set. Returns an integer.
89  regexp - construct a regular expression based on IUPAC consensus.
90      For example AGWV will be [Aa][Gg][AaTt][AaCcGg]
91  width - site width
92  get_string - gets the probability vector for a single base as a string.
93  get_array - gets the probability vector for a single base as an array.
94  get_logs_array - gets the log-odds vector for a single base as an array.
95
96New methods, which might be of interest to anyone who wants to store
97PSM in a relational database without creating an entry for each
98position is the ability to compress the PSM vector into a string with
99losing usually less than 1% of the data.  this can be done with:
100
101  my $str=$matrix->get_compressed_freq('A');
102or
103  my $str=$matrix->get_compressed_logs('A');
104
105Loading from a database should be done with new, but is not yest implemented.
106However you can still uncompress such string with:
107
108  my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM
109or
110  my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds
111
112=head1 FEEDBACK
113
114=head2 Mailing Lists
115
116User feedback is an integral part of the evolution of this and other
117Bioperl modules. Send your comments and suggestions preferably to one
118of the Bioperl mailing lists.  Your participation is much appreciated.
119
120  bioperl-l@bioperl.org                  - General discussion
121  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
122
123=head2 Support
124
125Please direct usage questions or support issues to the mailing list:
126
127I<bioperl-l@bioperl.org>
128
129rather than to the module maintainer directly. Many experienced and
130reponsive experts will be able look at the problem and quickly
131address it. Please include a thorough description of the problem
132with code and data examples if at all possible.
133
134=head2 Reporting Bugs
135
136Report bugs to the Bioperl bug tracking system to help us keep track
137the bugs and their resolution.  Bug reports can be submitted via the
138web:
139
140  https://github.com/bioperl/bioperl-live/issues
141
142=head1 AUTHOR - Stefan Kirov
143
144Email skirov@utk.edu
145
146=head1 CONTRIBUTORS
147
148Sendu Bala, bix@sendu.me.uk
149
150=head1 APPENDIX
151
152The rest of the documentation details each of the object methods.
153Internal methods are usually preceded with a _
154
155=cut
156
157# Let the code begin...
158package Bio::Matrix::PSM::SiteMatrix;
159$Bio::Matrix::PSM::SiteMatrix::VERSION = '1.7.7';
160use strict;
161
162use base qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI);
163
164=head2 new
165
166 Title   : new
167 Usage   : my $site=Bio::Matrix::PSM::SiteMatrix->new(-pA=>$a,-pC=>$c,
168						     -pG=>$g,-pT=>$t,
169						     -IC=>$ic,
170						     -e_val=>$score,
171						     -id=>$mid);
172 Function: Creates a new Bio::Matrix::PSM::SiteMatrix object from memory
173 Throws :  If inconsistent data for all vectors (A,C,G and T) is
174           provided, if you mix input types (string vs array) or if a
175           position freq is 0.
176 Returns :  Bio::Matrix::PSM::SiteMatrix object
177 Args    :  -pA    => vector with the frequencies or counts of A
178            -pC    => vector for C
179            -pG    => vector for G
180            -pt    => vector for T
181            -lA    => vector for the log of A
182            -lC    => vector for the log of C
183            -lG    => vector for the log of G
184            -lT    => vector for the log of T
185            -IC    => real number, the information content of this matrix
186            -e_val => real number, the expect value
187            -id    => string, an identifier
188            -width => int, width of the matrix in nucleotides
189            -sites => int, the number of sites that went into this matrix
190            -model => hash ref, background frequencies for A, C, G and T
191            -correction => number, the number to add to all positions to achieve
192                           pseudo count correction (default 0: no correction)
193                           NB: do not use correction when your input is
194                           frequences!
195            -accession_number => string, an accession number
196
197            Vectors can be strings of the frequencies where the frequencies are
198            multiplied by 10 and rounded to the nearest whole number, and where
199            'a' is used to denote the maximal frequency 10. There should be no
200            punctuation (spaces etc.) in the string. For example, 'a0501'.
201            Alternatively frequencies or counts can be represented by an array
202            ref containing the counts, frequencies or logs as any kind of
203            number.
204
205=cut
206
207sub new {
208    my ($class, @args) = @_;
209    my $self = $class->SUPER::new(@args);
210    my $consensus;
211    # Too many things to rearrange, and I am creating simultanuously >500
212    # such objects routinely, so this becomes performance issue
213    my %input;
214    while (@args) {
215        (my $key = shift @args) =~ s/-//g; #deletes all dashes (only dashes)!
216        $input{$key} = shift @args;
217    }
218    $self->{_position}   = 0;
219    $self->{IC}     = $input{IC};
220    $self->{e_val}  = $input{e_val};
221    $self->{width}  = $input{width};
222	$self->{logA}   = $input{lA};
223	$self->{logC}   = $input{lC};
224	$self->{logG}   = $input{lG};
225	$self->{logT}   = $input{lT};
226    $self->{sites}  = $input{sites};
227    $self->{id}     = $input{id} || 'null';
228    $self->{correction} = $input{correction} || 0;
229    $self->{accession_number} = $input{accession_number};
230	return $self unless (defined($input{pA}) && defined($input{pC}) && defined($input{pG}) && defined($input{pT}));
231
232    # This should go to _initialize?
233    # Check for input type- no mixing alllowed, throw ex
234    if (ref($input{pA}) =~ /ARRAY/i ) {
235        $self->throw("Mixing matrix data types not allowed: C is not reference") unless(ref($input{pC}));
236        $self->throw("Mixing matrix data types not allowed: G is not reference") unless (ref($input{pG}));
237        $self->throw("Mixing matrix data types not allowed: T is not reference") unless (ref($input{pT}));
238        $self->{probA} = $input{pA};
239        $self->{probC} = $input{pC};
240        $self->{probG} = $input{pG};
241        $self->{probT} = $input{pT};
242    }
243    else {
244        $self->throw("Mixing matrix data types not allowed: C is reference") if (ref($input{pC}));
245        $self->throw("Mixing matrix data types not allowed: G is reference") if (ref($input{pG}));
246        $self->throw("Mixing matrix data types not allowed: T is reference") if (ref($input{pT}));
247        $self->{probA} = [split(//,$input{pA})];
248        $self->{probC} = [split(//,$input{pC})];
249        $self->{probG} = [split(//,$input{pG})];
250        $self->{probT} = [split(//,$input{pT})];
251        for (my $i=0; $i<= @{$self->{probA}}+1; $i++) {
252            # we implictely assume these are MEME-style frequencies x 10, so
253            # 'a' represents the 'maximum', 10. Other positions can actually
254            # add up to over 10 due to rounding, but I don't think that is a
255            # problem?
256            if (${$self->{probA}}[$i] and ${$self->{probA}}[$i] eq 'a') {
257                ${$self->{probA}}[$i]='10';
258            }
259            if (${$self->{probC}}[$i] and ${$self->{probC}}[$i] eq 'a') {
260                ${$self->{probC}}[$i]='10';
261            }
262            if (${$self->{probG}}[$i] and ${$self->{probG}}[$i] eq 'a') {
263                ${$self->{probG}}[$i]='10';
264            }
265            if (${$self->{probT}}[$i] and ${$self->{probT}}[$i] eq 'a') {
266                ${$self->{probT}}[$i]='10';
267            }
268        }
269    }
270
271    # Check for position with 0 for all bases, throw exception if so
272    for (my $i=0;$i <= $#{$self->{probA}}; $i++) {
273        if ((${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i]) == 0) {
274            $self->throw("Position meaningless-all frequencies are 0");
275        }
276
277        # apply pseudo-count correction to all values - this will result in
278        # very bad frequencies if the input is already frequences and a
279        # correction value as large as 1 is used!
280        if ($self->{correction}) {
281            ${$self->{probA}}[$i] += $self->{correction};
282            ${$self->{probC}}[$i] += $self->{correction};
283            ${$self->{probG}}[$i] += $self->{correction};
284            ${$self->{probT}}[$i] += $self->{correction};
285        }
286
287        # (re)calculate frequencies
288        my $div= ${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i];
289        ${$self->{probA}}[$i]=${$self->{probA}}[$i]/$div;
290        ${$self->{probC}}[$i]=${$self->{probC}}[$i]/$div;
291        ${$self->{probG}}[$i]=${$self->{probG}}[$i]/$div;
292        ${$self->{probT}}[$i]=${$self->{probT}}[$i]/$div;
293    }
294
295    # Calculate the logs
296    if ((!defined($self->{logA})) && ($input{model})) {
297        $self->calc_weight($input{model});
298    }
299
300    # Make consensus, throw if any one of the vectors is shorter
301    $self->_calculate_consensus;
302    return $self;
303}
304
305=head2 _calculate_consensus
306
307 Title   : _calculate_consensus
308 Function: Internal stuff
309
310=cut
311
312sub _calculate_consensus {
313    my $self=shift;
314    my ($lc,$lt,$lg)=($#{$self->{probC}},$#{$self->{probT}},$#{$self->{probG}});
315    my $len=$#{$self->{probA}};
316    $self->throw("Probability matrix is damaged for C: $len vs $lc") if ($len != $lc);
317    $self->throw("Probability matrix is damaged for T: $len vs $lt") if ($len != $lt);
318    $self->throw("Probability matrix is damaged for G: $len vs $lg") if ($len != $lg);
319    for (my $i=0; $i<$len+1; $i++) {
320        #*** IUPACp values not actually used (eg. by next_pos)
321        (${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]);
322        (${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]);
323    }
324    return $self;
325}
326
327=head2 calc_weight
328
329 Title   : calc_weight
330 Usage   : $obj->calc_weight({A=>0.2562, C=>0.2438, G=>0.2432, T=>0.2568});
331 Function: Recalculates the PSM (or weights) based on the PFM (the frequency
332           matrix) and user supplied background model.
333 Throws  : if no model is supplied
334 Returns : n/a
335 Args    : reference to a hash with background frequencies for A,C,G and T
336
337=cut
338
339sub calc_weight {
340    my ($self, $model) = @_;
341    my %model;
342    $model{probA}=$model->{A};
343    $model{probC}=$model->{C};
344    $model{probG}=$model->{G};
345    $model{probT}=$model->{T};
346    foreach my $let (qw(probA probC probG probT)) {
347      my @str;
348      $self->throw('You did not provide valid model\n') unless (($model{$let}>0) && ($model{$let}<1));
349      foreach my $f (@{$self->{$let}}) {
350        my $w=log($f)-log($model{$let});
351        push @str,$w;
352      }
353      my $llet=$let;
354      $llet=~s/prob/log/;
355      $self->{$llet}=\@str;
356    }
357    return $self;
358}
359
360=head2 next_pos
361
362 Title   : next_pos
363 Usage   :
364 Function: Retrieves the next position features: frequencies for A,C,G,T, the
365           main letter (as in consensus) and the probabilty for this letter to
366           occur at this position and the current position
367 Returns : hash (pA,pC,pG,pT,logA,logC,logG,logT,base,prob,rel)
368 Args    : none
369
370=cut
371
372sub next_pos {
373    my $self = shift;
374    $self->throw("instance method called on class") unless ref $self;
375    my $len=@{$self->{seq}};
376    my $pos=$self->{_position};
377    # End reached?
378    if ($pos<$len) {
379	my $pA=${$self->{probA}}[$pos];
380	my $pC=${$self->{probC}}[$pos];
381	my $pG=${$self->{probG}}[$pos];
382	my $pT=${$self->{probT}}[$pos];
383	my $lA=${$self->{logA}}[$pos];
384	my $lC=${$self->{logC}}[$pos];
385	my $lG=${$self->{logG}}[$pos];
386	my $lT=${$self->{logT}}[$pos];
387	my $base=${$self->{seq}}[$pos];
388	my $prob=${$self->{seqp}}[$pos];
389	$self->{_position}++;
390	my %seq=(pA=>$pA,pT=>$pT,pC=>$pC,pG=>$pG, lA=>$lA,lT=>$lT,lC=>$lC,lG=>$lG,base=>$base,rel=>$pos, prob=>$prob);
391	return %seq;
392    }
393    else {$self->{_position}=0; return;}
394}
395
396=head2 curpos
397
398 Title   : curpos
399 Usage   :
400 Function: Gets/sets the current position. Converts to 0 if argument is minus
401           and to width if greater than width
402 Returns : integer
403 Args    : integer
404
405=cut
406
407sub curpos {
408    my $self = shift;
409    my $prev = $self->{_position};
410    if (@_) { $self->{_position} = shift; }
411    return $prev;
412}
413
414=head2 e_val
415
416 Title   : e_val
417 Usage   :
418 Function: Gets/sets the e-value
419 Returns : real number
420 Args    : none to get, real number to set
421
422=cut
423
424sub e_val {
425    my $self = shift;
426    my $prev = $self->{e_val};
427    if (@_) { $self->{e_val} = shift; }
428    return $prev;
429}
430
431=head2 IC
432
433 Title   : IC
434 Usage   :
435 Function: Get/set the Information Content
436 Returns : real number
437 Args    : none to get, real number to set
438
439=cut
440
441sub IC {
442    my $self = shift;
443    my $prev = $self->{IC};
444    if (@_) { $self->{IC} = shift; }
445    return $prev;
446}
447
448=head2 accession_number
449
450 Title   : accession_number
451 Function: Get/set the accession number, this will be unique id for the
452           SiteMatrix object as well for any other object, inheriting from
453           SiteMatrix
454 Returns : string
455 Args    : none to get, string to set
456
457=cut
458
459sub accession_number {
460    my $self = shift;
461    my $prev = $self->{accession_number};
462    if (@_) { $self->{accession_number} = shift; }
463    return $prev;
464}
465
466=head2 consensus
467
468 Title   : consensus
469 Usage   :
470 Function: Returns the consensus
471 Returns : string
472 Args    : (optional) threshold value 1 to 10, default 5
473           '5' means the returned characters had a 50% or higher presence at
474           their position
475
476=cut
477
478sub consensus {
479    my ($self, $thresh) = @_;
480    if ($thresh) {
481        my $len=$#{$self->{probA}};
482        for (my $i=0; $i<$len+1; $i++) {
483            (${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh);
484        }
485    }
486    my $consensus='';
487    foreach my $letter (@{$self->{seq}}) {
488        $consensus .= $letter;
489    }
490    return $consensus;
491}
492
493=head2 width
494
495 Title   : width
496 Usage   :
497 Function: Returns the length of the sites in used to make this matrix
498 Returns : int
499 Args    : none
500
501=cut
502
503sub width {
504    my $self = shift;
505    my $width=@{$self->{probA}};
506    return $width;
507}
508
509=head2 sites
510
511 Title   : sites
512 Usage   :
513 Function: Get/set the number of sites that were used to make this matrix
514 Returns : int
515 Args    : none to get, int to set
516
517=cut
518
519sub sites {
520    my $self = shift;
521    if (@_) { $self->{sites} = shift }
522    return $self->{sites} || return;
523}
524
525=head2 IUPAC
526
527 Title   : IUPAC
528 Usage   :
529 Function: Returns IUPAC compliant consensus
530 Returns : string
531 Args    : optionally, also supply a whole number (int) of 1 or higher to set
532           the significance level when considering the frequencies. 1 (the
533           default) means a 0.05 significance level: frequencies lower than
534           0.05 will be ignored. 2 Means a 0.005 level, and so on.
535
536=cut
537
538sub IUPAC {
539	my ($self, $thresh) = @_;
540    if ($thresh) {
541        my $len=$#{$self->{probA}};
542        for (my $i=0; $i<$len+1; $i++) {
543            (${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh);
544        }
545    }
546	my $iu=$self->{IUPAC};
547	my $iupac='';
548	foreach my $let (@{$iu}) {
549		$iupac .= $let;
550	}
551    return $iupac;
552}
553
554=head2 _to_IUPAC
555
556 Title   : _to_IUPAC
557 Usage   :
558 Function: Converts a single position to IUPAC compliant symbol.
559           For rules see the implementation
560 Returns : char, real number
561 Args    : real numbers for frequencies of A,C,G,T (positional)
562
563           optionally, also supply a whole number (int) of 1 or higher to set
564           the significance level when considering the frequencies. 1 (the
565           default) means a 0.05 significance level: frequencies lower than
566           0.05 will be ignored. 2 Means a 0.005 level, and so on.
567
568=cut
569
570sub _to_IUPAC {
571	my ($a, $c, $g, $t, $thresh) = @_;
572    $thresh ||= 1;
573    $thresh = int($thresh);
574    $a = sprintf ("%.${thresh}f", $a);
575    $c = sprintf ("%.${thresh}f", $c);
576    $g = sprintf ("%.${thresh}f", $g);
577    $t = sprintf ("%.${thresh}f", $t);
578
579    my $total = $a + $c + $g + $t;
580
581	return 'A' if ($a == $total);
582	return 'G' if ($g == $total);
583	return 'C' if ($c == $total);
584	return 'T' if ($t == $total);
585	my $r=$g+$a;
586	return 'R' if ($r == $total);
587	my $y=$t+$c;
588	return 'Y' if ($y == $total);
589	my $m=$a+$c;
590	return 'M' if ($m == $total);
591	my $k=$g+$t;
592	return 'K' if ($k == $total);
593	my $s=$g+$c;
594	return 'S' if ($s == $total);
595	my $w=$a+$t;
596	return 'W' if ($w == $total);
597	my $d=$r+$t;
598	return 'D' if ($d == $total);
599	my $v=$r+$c;
600	return 'V' if ($v == $total);
601	my $b=$y+$g;
602	return 'B' if ($b == $total);
603	my $h=$y+$a;
604	return 'H' if ($h == $total);
605	return 'N';
606}
607
608=head2 _to_cons
609
610 Title   : _to_cons
611 Usage   :
612 Function: Converts a single position to simple consensus character and returns
613           its probability. For rules see the implementation
614 Returns : char, real number
615 Args    : real numbers for A,C,G,T (positional), and optional 5th argument of
616           threshold (as a number between 1 and 10, where 5 is default and
617           means the returned character had a 50% or higher presence at this
618           position)
619
620=cut
621
622sub _to_cons {
623	my ($A, $C, $G, $T, $thresh) = @_;
624    $thresh ||= 5;
625
626    # this multiplication by 10 is just to satisfy the thresh range of 1-10
627	my $a = $A * 10;
628	my $c = $C * 10;
629	my $g = $G * 10;
630	my $t = $T * 10;
631
632    return 'N',10 if (($a<$thresh) && ($c<$thresh) && ($g<$thresh) && ($t<$thresh));
633	return 'N',10 if (($a==$t) && ($a==$c) && ($a==$g));
634
635    # threshold could be lower than 50%, so must check is not only over
636    # threshold, but also the highest frequency
637	return 'A',$a if (($a>=$thresh) && ($a>$t) && ($a>$c) && ($a>$g));
638	return 'C',$c if (($c>=$thresh) && ($c>$t) && ($c>$a) && ($c>$g));
639	return 'G',$g if (($g>=$thresh) && ($g>$t) && ($g>$c) && ($g>$a));
640	return 'T',$t if (($t>=$thresh) && ($t>$g) && ($t>$c) && ($t>$a));
641
642    return 'N',10;
643}
644
645=head2 get_string
646
647 Title   : get_string
648 Usage   :
649 Function: Returns given probability vector as a string. Useful if you want to
650           store things in a rel database, where arrays are not first choice
651 Throws  : If the argument is outside {A,C,G,T}
652 Returns : string
653 Args    : character {A,C,G,T}
654
655=cut
656
657sub get_string {
658	my $self=shift;
659	my $base=shift;
660	my $string='';
661	my @prob;
662
663	BASE: {
664		if ($base eq 'A') {@prob= @{$self->{probA}}; last BASE; }
665		if ($base eq 'C') {@prob= @{$self->{probC}}; last BASE; }
666		if ($base eq 'G') {@prob= @{$self->{probG}}; last BASE; }
667		if ($base eq 'T') {@prob= @{$self->{probT}}; last BASE; }
668		$self->throw ("No such base: $base!\n");
669	}
670
671    foreach  my $prob (@prob) {
672        my $corrected = $prob*10;
673        my $next=sprintf("%.0f",$corrected);
674        $next='a' if ($next eq '10');
675        $string .= $next;
676    }
677    return $string;
678}
679
680=head2 get_array
681
682 Title   : get_array
683 Usage   :
684 Function: Returns an array with frequencies for a specified base
685 Returns : array
686 Args    : char
687
688=cut
689
690sub get_array {
691	my $self=shift;
692	my $base=uc(shift);
693	return  @{$self->{probA}} if ($base eq 'A');
694	return  @{$self->{probC}} if ($base eq 'C');
695	return  @{$self->{probG}} if ($base eq 'G');
696	return  @{$self->{probT}} if ($base eq 'T');
697	$self->throw("No such base: $base!\n");
698}
699
700=head2 get_logs_array
701
702 Title   : get_logs_array
703 Usage   :
704 Function: Returns an array with log_odds for a specified base
705 Returns : array
706 Args    : char
707
708=cut
709
710sub get_logs_array {
711	my $self=shift;
712	my $base=uc(shift);
713	return  @{$self->{logA}} if (($base eq 'A')  && ($self->{logA}));
714	return  @{$self->{logC}} if (($base eq 'C')  && ($self->{logC}));
715	return  @{$self->{logG}} if (($base eq 'G')  && ($self->{logG}));
716	return  @{$self->{logT}} if (($base eq 'T')  && ($self->{logT}));
717	$self->throw ("No such base: $base!\n") if (!grep(/$base/,qw(A C G T)));
718    return;
719}
720
721=head2 id
722
723 Title   : id
724 Usage   :
725 Function: Gets/sets the site id
726 Returns : string
727 Args    : string
728
729=cut
730
731sub id {
732    my $self = shift;
733    my $prev = $self->{id};
734    if (@_) { $self->{id} = shift; }
735    return $prev;
736}
737
738=head2 regexp
739
740 Title   : regexp
741 Usage   :
742 Function: Returns a regular expression which matches the IUPAC convention.
743           N will match X, N, - and .
744 Returns : string
745 Args    : none (works at the threshold last used for making the IUPAC string)
746
747=cut
748
749sub regexp {
750	my $self=shift;
751	my $regexp;
752	foreach my $letter (@{$self->{IUPAC}}) {
753		my $reg;
754		LETTER: {
755			if ($letter eq 'A') { $reg='[Aa]'; last LETTER; }
756			if ($letter eq 'C') { $reg='[Cc]'; last LETTER; }
757			if ($letter eq 'G') { $reg='[Gg]'; last LETTER; }
758			if ($letter eq 'T') { $reg='[Tt]'; last LETTER; }
759			if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; }
760			if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; }
761			if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; }
762			if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; }
763			if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; }
764			if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; }
765			if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; }
766			if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; }
767			if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; }
768			if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; }
769			$reg='\S';
770		}
771		$regexp .= $reg;
772	}
773    return $regexp;
774}
775
776=head2 regexp_array
777
778 Title   : regexp_array
779 Usage   :
780 Function: Returns a regular expression which matches the IUPAC convention.
781           N will match X, N, - and .
782 Returns : array
783 Args    : none (works at the threshold last used for making the IUPAC string)
784 To do   : I have separated regexp and regexp_array, but
785           maybe they can be rewritten as one - just check what should be returned
786
787=cut
788
789sub regexp_array {
790	my $self=shift;
791	my @regexp;
792	foreach my $letter (@{$self->{IUPAC}}) {
793		my $reg;
794		LETTER: {
795			if ($letter eq 'A') { $reg='[Aa]'; last LETTER; }
796			if ($letter eq 'C') { $reg='[Cc]'; last LETTER; }
797			if ($letter eq 'G') { $reg='[Gg]'; last LETTER; }
798			if ($letter eq 'T') { $reg='[Tt]'; last LETTER; }
799			if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; }
800			if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; }
801			if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; }
802			if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; }
803			if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; }
804			if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; }
805			if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; }
806			if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; }
807			if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; }
808			if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; }
809			$reg='\S';
810		}
811		push @regexp,$reg;
812	}
813    return @regexp;
814}
815
816
817=head2 _compress_array
818
819 Title   : _compress_array
820 Usage   :
821 Function: Will compress an array of real signed numbers to a string (ie vector
822           of bytes) -127 to +127 for bi-directional(signed) and 0..255 for
823           unsigned
824 Returns : String
825 Args    : array reference, followed by an max value and direction (optional,
826           default 1-unsigned),1 unsigned, any other is signed.
827
828=cut
829
830sub _compress_array {
831	my ($array,$lm,$direct)=@_;
832	my $str;
833	return  unless(($array) && ($lm));
834	$direct=1 unless ($direct);
835	my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
836	foreach my $c (@{$array}) {
837		$c=$lm if ($c>$lm);
838		$c=-$lm if (($c<-$lm) && ($direct !=1));
839    $c=0 if (($c<0) && ($direct ==1));
840		my $byte=int($k1*$c);
841    $byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits
842    my $char=chr($byte);
843		$str.=$char;
844	}
845	return $str;
846}
847
848=head2 _uncompress_string
849
850 Title   : _uncompress_string
851 Usage   :
852 Function: Will uncompress a string (vector of bytes) to create an array of
853           real signed numbers (opposite to_compress_array)
854 Returns : string, followed by an max value and
855 		   direction (optional, default 1-unsigned), 1 unsigned, any other is signed.
856 Args    : array
857
858=cut
859
860sub _uncompress_string {
861	my ($str,$lm,$direct)=@_;
862	my @array;
863	return unless(($str) && ($lm));
864	$direct=1 unless ($direct);
865	my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
866	foreach my $c (split(//,$str)) {
867		my $byte=ord($c);
868		$byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits
869		my $num=$byte/$k1;
870		push @array,$num;
871	}
872	return @array;
873}
874
875=head2 get_compressed_freq
876
877 Title   : get_compressed_freq
878 Usage   :
879 Function: A method to provide a compressed frequency vector. It uses one byte
880           to code the frequence for one of the probability vectors for one
881           position. Useful for relational database. Improvement of the previous
882           0..a coding.
883 Example :  my $strA=$self->get_compressed_freq('A');
884 Returns :  String
885 Args    :  char
886
887=cut
888
889sub get_compressed_freq {
890	my $self=shift;
891	my $base=shift;
892	my $string='';
893	my @prob;
894	BASE: {
895		if ($base eq 'A') {
896      @prob= @{$self->{probA}} unless (!defined($self->{probA}));
897      last BASE;
898    }
899  		if ($base eq 'G') {
900      @prob= @{$self->{probG}} unless (!defined($self->{probG}));
901      last BASE;
902    }
903  		if ($base eq 'C') {
904      @prob= @{$self->{probC}} unless (!defined($self->{probC}));
905      last BASE;
906    }
907  		if ($base eq 'T') {
908      @prob= @{$self->{probT}} unless (!defined($self->{probT}));
909      last BASE;
910    }
911		$self->throw ("No such base: $base!\n");
912	}
913	my $str= _compress_array(\@prob,1,1);
914    return $str;
915}
916
917=head2 get_compressed_logs
918
919 Title   : get_compressed_logs
920 Usage   :
921 Function: A method to provide a compressed log-odd vector. It uses one byte to
922 		   code the log value for one of the log-odds vectors for one position.
923 Example : my $strA=$self->get_compressed_logs('A');
924 Returns : String
925 Args    : char
926
927=cut
928
929sub get_compressed_logs {
930	my $self=shift;
931	my $base=shift;
932	my $string='';
933	my @prob;
934	BASE: {
935		if ($base eq 'A') {@prob= @{$self->{logA}} unless (!defined($self->{logA})); last BASE; }
936		if ($base eq 'C') {@prob= @{$self->{logC}} unless (!defined($self->{logC})); last BASE; }
937		if ($base eq 'G') {@prob= @{$self->{logG}} unless (!defined($self->{logG})); last BASE; }
938		if ($base eq 'T') {@prob= @{$self->{logT}} unless (!defined($self->{logT})); last BASE; }
939		$self->throw ("No such base: $base!\n");
940	}
941	return _compress_array(\@prob,1000,2);
942}
943
944=head2 sequence_match_weight
945
946 Title   : sequence_match_weight
947 Usage   :
948 Function: This method will calculate the score of a match, based on the PWM
949           if such is associated with the matrix object. Returns undef if no
950           PWM data is available.
951 Throws  : if the length of the sequence is different from the matrix width
952 Example : my $score=$matrix->sequence_match_weight('ACGGATAG');
953 Returns : Floating point
954 Args    : string
955
956=cut
957
958sub sequence_match_weight {
959    my ($self,$seq)=@_;
960    return unless ($self->{logA});
961    my $width=$self->width;
962    $self->throw ("I can calculate the score only for sequence which are exactly my size for $seq, my width is $width\n") unless (length($seq)==@{$self->{logA}});
963    $seq = uc($seq);
964    my @seq=split(//,$seq);
965    my $score = 0;
966    my $i=0;
967    foreach my $pos (@seq) {
968        my $tv = 'log'.$pos;
969        $self->warn("Position ".($i+1)." of input sequence has unknown (ambiguity?) character '$pos': scores will be wrong") unless defined $self->{$tv};
970        $score += defined $self->{$tv} ? $self->{$tv}->[$i] : 0;
971        $i++;
972    }
973    return $score;
974}
975
976=head2 get_all_vectors
977
978 Title   : get_all_vectors
979 Usage   :
980 Function: returns all possible sequence vectors to satisfy the PFM under
981           a given threshold
982 Throws  : If threshold outside of 0..1 (no sense to do that)
983 Example : my @vectors=$self->get_all_vectors(4);
984 Returns : Array of strings
985 Args    : (optional) floating
986
987=cut
988
989sub get_all_vectors {
990	my $self=shift;
991	my $thresh=shift;
992    $self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) || ($thresh>1));
993    my @seq=split(//,$self->consensus($thresh*10));
994    my @perm;
995    for my $i (0..@{$self->{probA}}) {
996        push @{$perm[$i]},'A' if ($self->{probA}->[$i]>$thresh);
997        push @{$perm[$i]},'C' if ($self->{probC}->[$i]>$thresh);
998        push @{$perm[$i]},'G' if ($self->{probG}->[$i]>$thresh);
999        push @{$perm[$i]},'T' if ($self->{probT}->[$i]>$thresh);
1000        push @{$perm[$i]},'N' if  ($seq[$i] eq 'N');
1001    }
1002    my $fpos=shift @perm;
1003    my @strings=@$fpos;
1004    foreach my $pos (@perm) {
1005        my @newstr;
1006        foreach my $let (@$pos) {
1007            foreach my $string (@strings) {
1008                my $newstring = $string . $let;
1009                push @newstr,$newstring;
1010            }
1011        }
1012        @strings=@newstr;
1013    }
1014	return @strings;
1015}
1016
10171;
1018