1package PWM;
2
3use strict;
4use warnings;
5use Carp;
6
7my $PSEUDOCOUNT = 0.1;
8
9srand(1); # reproducibility
10
11###
12sub new {
13    my $packagename = shift;
14
15    my $self = { pos_freqs => [],
16                 pos_probs => [],
17                 _pwm_built_flag => 0,
18    };
19
20    bless($self, $packagename);
21
22    return($self);
23}
24
25
26sub is_pwm_built {
27    my ($self) = @_;
28
29    return($self->{_pwm_built_flag});
30
31}
32
33sub has_pos_freqs {
34    my ($self) = @_;
35
36    if (@{$self->{pos_freqs}}) {
37        return(1);
38    }
39    else {
40        return(0);
41    }
42}
43
44
45
46sub add_feature_seq_to_pwm {
47    my ($self, $feature_seq) = @_;
48
49    $feature_seq = uc $feature_seq;
50
51    my $pwm_len = $self->get_pwm_length();
52    if ($pwm_len && length($feature_seq) != $pwm_len) {
53        confess "Error, pwm_len: $pwm_len and feature_seq len: " . length($feature_seq) . " are unmatched.";
54    }
55    if ($feature_seq =~ /[^GATC]/) {
56        print STDERR "Error, feature_seq: $feature_seq contains non-GATC chars... skipping\n";
57        return;
58    }
59
60    my @chars = split(//, $feature_seq);
61    for (my $i = 0; $i <= $#chars; $i++) {
62        my $char = $chars[$i];
63
64        $self->{pos_freqs}->[$i]->{$char}++;
65    }
66
67    return;
68}
69
70
71sub remove_feature_seq_from_pwm {
72    my ($self, $feature_seq) = @_;
73
74    if (! $self->has_pos_freqs()) {
75        confess "Error, pwm obj doesn't have position frequencies set";
76    }
77
78    $feature_seq = uc $feature_seq;
79
80    my $pwm_len = $self->get_pwm_length();
81    if ($pwm_len && length($feature_seq) != $pwm_len) {
82        confess "Error, pwm_len: $pwm_len and feature_seq len: " . length($feature_seq) . " are unmatched.";
83    }
84    if ($feature_seq =~ /[^GATC]/) {
85        print STDERR "Warning, feature_seq: $feature_seq contains non-GATC chars... skipping.\n";
86        return;
87    }
88
89    my @chars = split(//, $feature_seq);
90    for (my $i = 0; $i <= $#chars; $i++) {
91        my $char = $chars[$i];
92        $self->{pos_freqs}->[$i]->{$char}--;
93    }
94
95    return;
96}
97
98
99
100sub get_pwm_length {
101    my ($self) = @_;
102
103    my $pwm_length;
104
105    if ($self->is_pwm_built()) {
106        $pwm_length = $#{$self->{pos_probs}} + 1;
107    }
108    else {
109        $pwm_length = $#{$self->{pos_freqs}} + 1;
110    }
111
112    return($pwm_length);
113}
114
115
116sub simulate_feature {
117    my ($self) = @_;
118
119    if (! $self->is_pwm_built()) {
120        confess "Error, pwm not built yet";
121    }
122
123    my $probs_aref = $self->{pos_probs};
124
125    my @chars = qw(G A T C);
126    my $feature_seq = "";
127
128    my $pwm_length = $self->get_pwm_length();
129    for (my $i = 0; $i < $pwm_length; $i++) {
130        my $sum_prob = 0;
131        my $selected_flag = 0;
132        #print "rand val: $rand_val\n";
133        my $tot_prob = 0;
134        for (my $j = 0; $j <= $#chars; $j++) {
135            my $char = $chars[$j];
136            my $p = $probs_aref->[$i]->{$char};
137            $tot_prob += $p;
138        }
139        my $rand_val = rand($tot_prob); # tot_prob should be 1 or very close...
140
141
142        for (my $j = 0; $j <= $#chars; $j++) {
143            my $char = $chars[$j];
144            my $p = $probs_aref->[$i]->{$char};
145            #print "char: $char, p: $p\n";
146            $sum_prob += $p;
147            if ($j == $#chars) {
148                $sum_prob = 1;
149            }
150
151            if ($rand_val <= $sum_prob) {
152                # choose char
153                $feature_seq .= $char;
154                $selected_flag = 1;
155                last;
156            }
157        }
158        if (! $selected_flag) {
159            confess "Error, didn't select a random char for feature seq";
160        }
161    }
162
163    return($feature_seq);
164
165}
166
167
168
169####
170sub write_pwm_file {
171    my ($self, $filename) = @_;
172
173    unless ($self->is_pwm_built()) {
174        confess "Error, pwm needs to be built first before writing to file";
175    }
176
177    open (my $ofh, ">$filename") or confess "Error, cannot write to file: $filename";
178
179    my $pos_probs_aref = $self->{pos_probs};
180    my $pwm_length = $self->get_pwm_length();
181
182    my @chars = qw(G A T C);
183    print $ofh join("\t", "pos", @chars) . "\n";
184    for (my $i = 0; $i < $pwm_length; $i++) {
185
186        my @vals = ($i);
187        foreach my $char (@chars) {
188            my $prob = $pos_probs_aref->[$i]->{$char} || 0;
189            push (@vals, $prob);
190        }
191
192        print $ofh join("\t", @vals) . "\n";
193    }
194
195    close $ofh;
196
197    return;
198}
199
200
201
202####
203sub build_pwm {
204    my ($self) = @_;
205
206    my $pos_freqs_aref = $self->{pos_freqs};
207
208    for (my $i = 0; $i <= $#{$pos_freqs_aref}; $i++) {
209        my @chars = keys %{$pos_freqs_aref->[$i]};
210
211        my $sum = 0;
212        foreach my $char (@chars) {
213            my $val = $pos_freqs_aref->[$i]->{$char};
214            $sum += $val;
215        }
216
217        # now convert to relative freqs
218        @chars = qw(G A T C); # the complete set of chars we care about.
219        foreach my $char (@chars) {
220            my $val = $pos_freqs_aref->[$i]->{$char} || 0;
221            my $prob = sprintf("%.6f", ($val + $PSEUDOCOUNT) / ($sum + 4 * $PSEUDOCOUNT) );
222            $self->{pos_probs}->[$i]->{$char} = $prob;
223        }
224
225    }
226
227    $self->{_pwm_built_flag} = 1;
228
229    return;
230}
231
232
233sub score_pwm_using_base_freqs {
234    my ($self, $target_sequence, $base_freqs_href, %options ) = @_;
235
236    ###  Options can include:
237    ###
238    ###   mask => [ coordA, coordB, coordC ],  # ignored pwm positions in scoring
239    ###   pwm_range => [pwm_idx_start, pwm_idx_end], # start and end are inclusive
240    ###
241
242    for my $key (keys %options) {
243        if (! grep {$_ eq $key} ('mask', 'pwm_range')) {
244            confess "Error, option $key is not recognized";
245        }
246    }
247
248    #print STDERR "target_seq: [$target_sequence]\n";
249
250    $target_sequence = uc $target_sequence;
251    unless ($target_sequence =~ /^[GATC]+$/) {
252        # can only score GATC-containing sequences.
253        return("NA");
254    }
255
256    if (! $self->is_pwm_built()) {
257        confess("pwm not built yet!");
258    }
259
260    my $pwm_length = $self->get_pwm_length();
261
262    if (length($target_sequence) != $pwm_length) {
263        confess "Error, len(target_sequence)=" . length($target_sequence) . " and pwm length = $pwm_length";
264    }
265
266    my %mask;
267    if (my $mask_positions_aref = $options{'mask'}) {
268        %mask = map { + $_ => 1 } @$mask_positions_aref;
269    }
270
271    my $motif_score = 0;
272
273    my @seqarray = split(//, $target_sequence);
274
275    my ($pwm_start, $pwm_end) = (0, $pwm_length-1);
276    if (my $pwm_range_aref = $options{'pwm_range'}) {
277        ($pwm_start, $pwm_end) = @$pwm_range_aref;
278    }
279
280    for (my $i = $pwm_start; $i <= $pwm_end; $i++) {
281
282        if ($mask{$i}) {
283            next;
284        }
285
286        my $char = $seqarray[$i];
287        my $prob = $self->{pos_probs}->[$i]->{$char};
288
289        unless ($prob) {
290            return("NA");
291        }
292
293        my $prob_rand = $base_freqs_href->{$char};
294        unless ($prob_rand) {
295            die "Error, no non-zero probability value specified for char [$char] ";
296        }
297
298        my $loglikelihood = log($prob/$prob_rand);
299        $motif_score += $loglikelihood;
300
301    }
302
303    return($motif_score);
304
305}
306
307
308sub score_plus_minus_pwm {
309    my ($self, $target_sequence, $pwm_minus, %options) = @_;
310
311    ###  Options can include:
312    ###
313    ###   mask => [ coordA, coordB, coordC ],  # ignored pwm positions in scoring
314    ###   pwm_range => [pwm_idx_start, pwm_idx_end], # start and end are inclusive
315    ###
316
317    for my $key (keys %options) {
318        if (! grep {$_ eq $key} ('mask', 'pwm_range')) {
319            confess "Error, option $key is not recognized";
320        }
321    }
322
323    #print STDERR "target_seq: [$target_sequence]\n";
324
325    $target_sequence = uc $target_sequence;
326    unless ($target_sequence =~ /^[GATC]+$/) {
327        # can only score GATC-containing sequences.
328        return("NA");
329    }
330
331    if (! $self->is_pwm_built()) {
332        confess("pwm not built yet!");
333    }
334
335    my $pwm_length = $self->get_pwm_length();
336
337    if (length($target_sequence) != $pwm_length) {
338        confess "Error, len(target_sequence)=" . length($target_sequence) . " and pwm length = $pwm_length";
339    }
340
341    my %mask;
342    if (my $mask_positions_aref = $options{'mask'}) {
343        %mask = map { + $_ => 1 } @$mask_positions_aref;
344    }
345
346    my $motif_score = 0;
347
348    my @seqarray = split(//, $target_sequence);
349
350    my ($pwm_start, $pwm_end) = (0, $pwm_length-1);
351    if (my $pwm_range_aref = $options{'pwm_range'}) {
352        ($pwm_start, $pwm_end) = @$pwm_range_aref;
353    }
354
355    for (my $i = $pwm_start; $i <= $pwm_end; $i++) {
356
357        if ($mask{$i}) {
358            #print STDERR "masking $i\n";
359            next;
360        }
361
362        my $char = $seqarray[$i];
363        my $prob = $self->{pos_probs}->[$i]->{$char};
364
365        unless ($prob) {
366            return("NA");
367        }
368
369        my $prob_rand = $pwm_minus->{pos_probs}->[$i]->{$char};
370        unless ($prob_rand) {
371            die "Error, no non-zero probability value specified for char [$char] ";
372        }
373
374        my $loglikelihood = log($prob/$prob_rand);
375        $motif_score += $loglikelihood;
376
377    }
378
379    return($motif_score);
380
381}
382
383####
384sub load_pwm_from_file {
385    my ($self, $pwm_file) = @_;
386
387    open(my $fh, $pwm_file) or confess "Error, cannot open file: $pwm_file";
388
389    my $header = <$fh>;
390    chomp $header;
391    my @chars = split(/\t/, $header);
392    shift @chars;
393
394    while (<$fh>) {
395        chomp;
396        my @x = split(/\t/);
397        my $idx_pos = shift @x;
398        for (my $i = 0; $i <= $#chars; $i++) {
399            my $char = $chars[$i];
400            my $prob = $x[$i];
401
402            $self->{pos_probs}->[$idx_pos]->{$char} = $prob;
403        }
404    }
405    close $fh;
406
407    $self->{_pwm_built_flag} = 1;
408
409    return;
410}
411
412
4131; #EOM
414