1package PWM; 2 3use strict; 4use warnings; 5use Carp; 6 7my $PSEUDOCOUNT = 0.1; 8 9srand(1); # reproducibility 10 11### 12sub new { 13 my $packagename = shift; 14 15 my $self = { pos_freqs => [], 16 pos_probs => [], 17 _pwm_built_flag => 0, 18 }; 19 20 bless($self, $packagename); 21 22 return($self); 23} 24 25 26sub is_pwm_built { 27 my ($self) = @_; 28 29 return($self->{_pwm_built_flag}); 30 31} 32 33sub has_pos_freqs { 34 my ($self) = @_; 35 36 if (@{$self->{pos_freqs}}) { 37 return(1); 38 } 39 else { 40 return(0); 41 } 42} 43 44 45 46sub add_feature_seq_to_pwm { 47 my ($self, $feature_seq) = @_; 48 49 $feature_seq = uc $feature_seq; 50 51 my $pwm_len = $self->get_pwm_length(); 52 if ($pwm_len && length($feature_seq) != $pwm_len) { 53 confess "Error, pwm_len: $pwm_len and feature_seq len: " . length($feature_seq) . " are unmatched."; 54 } 55 if ($feature_seq =~ /[^GATC]/) { 56 print STDERR "Error, feature_seq: $feature_seq contains non-GATC chars... skipping\n"; 57 return; 58 } 59 60 my @chars = split(//, $feature_seq); 61 for (my $i = 0; $i <= $#chars; $i++) { 62 my $char = $chars[$i]; 63 64 $self->{pos_freqs}->[$i]->{$char}++; 65 } 66 67 return; 68} 69 70 71sub remove_feature_seq_from_pwm { 72 my ($self, $feature_seq) = @_; 73 74 if (! $self->has_pos_freqs()) { 75 confess "Error, pwm obj doesn't have position frequencies set"; 76 } 77 78 $feature_seq = uc $feature_seq; 79 80 my $pwm_len = $self->get_pwm_length(); 81 if ($pwm_len && length($feature_seq) != $pwm_len) { 82 confess "Error, pwm_len: $pwm_len and feature_seq len: " . length($feature_seq) . " are unmatched."; 83 } 84 if ($feature_seq =~ /[^GATC]/) { 85 print STDERR "Warning, feature_seq: $feature_seq contains non-GATC chars... skipping.\n"; 86 return; 87 } 88 89 my @chars = split(//, $feature_seq); 90 for (my $i = 0; $i <= $#chars; $i++) { 91 my $char = $chars[$i]; 92 $self->{pos_freqs}->[$i]->{$char}--; 93 } 94 95 return; 96} 97 98 99 100sub get_pwm_length { 101 my ($self) = @_; 102 103 my $pwm_length; 104 105 if ($self->is_pwm_built()) { 106 $pwm_length = $#{$self->{pos_probs}} + 1; 107 } 108 else { 109 $pwm_length = $#{$self->{pos_freqs}} + 1; 110 } 111 112 return($pwm_length); 113} 114 115 116sub simulate_feature { 117 my ($self) = @_; 118 119 if (! $self->is_pwm_built()) { 120 confess "Error, pwm not built yet"; 121 } 122 123 my $probs_aref = $self->{pos_probs}; 124 125 my @chars = qw(G A T C); 126 my $feature_seq = ""; 127 128 my $pwm_length = $self->get_pwm_length(); 129 for (my $i = 0; $i < $pwm_length; $i++) { 130 my $sum_prob = 0; 131 my $selected_flag = 0; 132 #print "rand val: $rand_val\n"; 133 my $tot_prob = 0; 134 for (my $j = 0; $j <= $#chars; $j++) { 135 my $char = $chars[$j]; 136 my $p = $probs_aref->[$i]->{$char}; 137 $tot_prob += $p; 138 } 139 my $rand_val = rand($tot_prob); # tot_prob should be 1 or very close... 140 141 142 for (my $j = 0; $j <= $#chars; $j++) { 143 my $char = $chars[$j]; 144 my $p = $probs_aref->[$i]->{$char}; 145 #print "char: $char, p: $p\n"; 146 $sum_prob += $p; 147 if ($j == $#chars) { 148 $sum_prob = 1; 149 } 150 151 if ($rand_val <= $sum_prob) { 152 # choose char 153 $feature_seq .= $char; 154 $selected_flag = 1; 155 last; 156 } 157 } 158 if (! $selected_flag) { 159 confess "Error, didn't select a random char for feature seq"; 160 } 161 } 162 163 return($feature_seq); 164 165} 166 167 168 169#### 170sub write_pwm_file { 171 my ($self, $filename) = @_; 172 173 unless ($self->is_pwm_built()) { 174 confess "Error, pwm needs to be built first before writing to file"; 175 } 176 177 open (my $ofh, ">$filename") or confess "Error, cannot write to file: $filename"; 178 179 my $pos_probs_aref = $self->{pos_probs}; 180 my $pwm_length = $self->get_pwm_length(); 181 182 my @chars = qw(G A T C); 183 print $ofh join("\t", "pos", @chars) . "\n"; 184 for (my $i = 0; $i < $pwm_length; $i++) { 185 186 my @vals = ($i); 187 foreach my $char (@chars) { 188 my $prob = $pos_probs_aref->[$i]->{$char} || 0; 189 push (@vals, $prob); 190 } 191 192 print $ofh join("\t", @vals) . "\n"; 193 } 194 195 close $ofh; 196 197 return; 198} 199 200 201 202#### 203sub build_pwm { 204 my ($self) = @_; 205 206 my $pos_freqs_aref = $self->{pos_freqs}; 207 208 for (my $i = 0; $i <= $#{$pos_freqs_aref}; $i++) { 209 my @chars = keys %{$pos_freqs_aref->[$i]}; 210 211 my $sum = 0; 212 foreach my $char (@chars) { 213 my $val = $pos_freqs_aref->[$i]->{$char}; 214 $sum += $val; 215 } 216 217 # now convert to relative freqs 218 @chars = qw(G A T C); # the complete set of chars we care about. 219 foreach my $char (@chars) { 220 my $val = $pos_freqs_aref->[$i]->{$char} || 0; 221 my $prob = sprintf("%.6f", ($val + $PSEUDOCOUNT) / ($sum + 4 * $PSEUDOCOUNT) ); 222 $self->{pos_probs}->[$i]->{$char} = $prob; 223 } 224 225 } 226 227 $self->{_pwm_built_flag} = 1; 228 229 return; 230} 231 232 233sub score_pwm_using_base_freqs { 234 my ($self, $target_sequence, $base_freqs_href, %options ) = @_; 235 236 ### Options can include: 237 ### 238 ### mask => [ coordA, coordB, coordC ], # ignored pwm positions in scoring 239 ### pwm_range => [pwm_idx_start, pwm_idx_end], # start and end are inclusive 240 ### 241 242 for my $key (keys %options) { 243 if (! grep {$_ eq $key} ('mask', 'pwm_range')) { 244 confess "Error, option $key is not recognized"; 245 } 246 } 247 248 #print STDERR "target_seq: [$target_sequence]\n"; 249 250 $target_sequence = uc $target_sequence; 251 unless ($target_sequence =~ /^[GATC]+$/) { 252 # can only score GATC-containing sequences. 253 return("NA"); 254 } 255 256 if (! $self->is_pwm_built()) { 257 confess("pwm not built yet!"); 258 } 259 260 my $pwm_length = $self->get_pwm_length(); 261 262 if (length($target_sequence) != $pwm_length) { 263 confess "Error, len(target_sequence)=" . length($target_sequence) . " and pwm length = $pwm_length"; 264 } 265 266 my %mask; 267 if (my $mask_positions_aref = $options{'mask'}) { 268 %mask = map { + $_ => 1 } @$mask_positions_aref; 269 } 270 271 my $motif_score = 0; 272 273 my @seqarray = split(//, $target_sequence); 274 275 my ($pwm_start, $pwm_end) = (0, $pwm_length-1); 276 if (my $pwm_range_aref = $options{'pwm_range'}) { 277 ($pwm_start, $pwm_end) = @$pwm_range_aref; 278 } 279 280 for (my $i = $pwm_start; $i <= $pwm_end; $i++) { 281 282 if ($mask{$i}) { 283 next; 284 } 285 286 my $char = $seqarray[$i]; 287 my $prob = $self->{pos_probs}->[$i]->{$char}; 288 289 unless ($prob) { 290 return("NA"); 291 } 292 293 my $prob_rand = $base_freqs_href->{$char}; 294 unless ($prob_rand) { 295 die "Error, no non-zero probability value specified for char [$char] "; 296 } 297 298 my $loglikelihood = log($prob/$prob_rand); 299 $motif_score += $loglikelihood; 300 301 } 302 303 return($motif_score); 304 305} 306 307 308sub score_plus_minus_pwm { 309 my ($self, $target_sequence, $pwm_minus, %options) = @_; 310 311 ### Options can include: 312 ### 313 ### mask => [ coordA, coordB, coordC ], # ignored pwm positions in scoring 314 ### pwm_range => [pwm_idx_start, pwm_idx_end], # start and end are inclusive 315 ### 316 317 for my $key (keys %options) { 318 if (! grep {$_ eq $key} ('mask', 'pwm_range')) { 319 confess "Error, option $key is not recognized"; 320 } 321 } 322 323 #print STDERR "target_seq: [$target_sequence]\n"; 324 325 $target_sequence = uc $target_sequence; 326 unless ($target_sequence =~ /^[GATC]+$/) { 327 # can only score GATC-containing sequences. 328 return("NA"); 329 } 330 331 if (! $self->is_pwm_built()) { 332 confess("pwm not built yet!"); 333 } 334 335 my $pwm_length = $self->get_pwm_length(); 336 337 if (length($target_sequence) != $pwm_length) { 338 confess "Error, len(target_sequence)=" . length($target_sequence) . " and pwm length = $pwm_length"; 339 } 340 341 my %mask; 342 if (my $mask_positions_aref = $options{'mask'}) { 343 %mask = map { + $_ => 1 } @$mask_positions_aref; 344 } 345 346 my $motif_score = 0; 347 348 my @seqarray = split(//, $target_sequence); 349 350 my ($pwm_start, $pwm_end) = (0, $pwm_length-1); 351 if (my $pwm_range_aref = $options{'pwm_range'}) { 352 ($pwm_start, $pwm_end) = @$pwm_range_aref; 353 } 354 355 for (my $i = $pwm_start; $i <= $pwm_end; $i++) { 356 357 if ($mask{$i}) { 358 #print STDERR "masking $i\n"; 359 next; 360 } 361 362 my $char = $seqarray[$i]; 363 my $prob = $self->{pos_probs}->[$i]->{$char}; 364 365 unless ($prob) { 366 return("NA"); 367 } 368 369 my $prob_rand = $pwm_minus->{pos_probs}->[$i]->{$char}; 370 unless ($prob_rand) { 371 die "Error, no non-zero probability value specified for char [$char] "; 372 } 373 374 my $loglikelihood = log($prob/$prob_rand); 375 $motif_score += $loglikelihood; 376 377 } 378 379 return($motif_score); 380 381} 382 383#### 384sub load_pwm_from_file { 385 my ($self, $pwm_file) = @_; 386 387 open(my $fh, $pwm_file) or confess "Error, cannot open file: $pwm_file"; 388 389 my $header = <$fh>; 390 chomp $header; 391 my @chars = split(/\t/, $header); 392 shift @chars; 393 394 while (<$fh>) { 395 chomp; 396 my @x = split(/\t/); 397 my $idx_pos = shift @x; 398 for (my $i = 0; $i <= $#chars; $i++) { 399 my $char = $chars[$i]; 400 my $prob = $x[$i]; 401 402 $self->{pos_probs}->[$idx_pos]->{$char} = $prob; 403 } 404 } 405 close $fh; 406 407 $self->{_pwm_built_flag} = 1; 408 409 return; 410} 411 412 4131; #EOM 414