1package Bio::Graphics::Wiggle::Loader; 2 3=head1 SYNOPSIS 4 5 my $loader = Bio::Graphics::Wiggle::Loader->new('/base/directory/for/wigfiles','wibfilename'); 6 my $fh = IO::File->new('uploaded_file.txt'); 7 $loader->load($fh); 8 9 my $gff3_file = $loader->featurefile('gff3',$method,$source); 10 my $featurefile = $loader->featurefile('featurefile'); 11 my @features = $loader->features(); 12 13=head1 USAGE 14 15This module loads Bio::Graphics::Wiggle files from source files that 16use Jim Kent's "WIG" format: 17 18http://genome.ucsc.edu/google/goldenPath/help/wiggle.html 19 20Several data sets can be grouped together in a single WIG source 21file. The load() method accepts the path to a WIG source file, and 22will create one or more .wib ("wiggle binary") databases of 23quantitative data in the directory indicated when you created the 24loader. Call the featurefile() method to return a text file in either 25GFF3 or Bio::Graphics::FeatureFile format, suitable for loading into a 26gbrowse database. 27 28=head2 METHODS 29 30=over 4 31 32=item $loader = Bio::Graphics::Wiggle::Loader->new('/base/directory' [,'my_data']) 33 34Create a new loader. The first argument specifies the base directory 35in which the loaded .wib files will be created. The second argument 36specifies the base name for the created .wib files, or "track" if not 37specified. 38 39=item $loader->load($fh) 40 41Load the data from a source WIG file opened on a filehandle. 42 43=item $data = $loader->featurefile($type [,$method,$source]) 44 45Return the data corresponding to a GFF3 or 46Bio::Graphics::FeatureFile. The returned file will have one feature 47per WIG track, and a properly formatted "wigfile" attribute that 48directs Bio::Graphics to the location of the quantitative data. 49 50$type is one of "gff3" or "featurefile". In the case of "gff3", you 51may specify an optional method and source for use in describing each 52feature. In the case of "featurefile", the returned file will contain 53GBrowse stanzas that describe a reasonable starting format to display 54the data. 55 56=item @features = $loader->features 57 58Returns one or more Bio::Graphics::Features objects, which can be used to 59create Bio::Graphics tracks with the wiggle_xyplot (and related) glyphs. 60 61=item $loader->allow_sampling(1) 62 63If allow_sampling() is passed a true value, then very large files 64(more than 5 MB) will undergo a sampling procedure to find their 65minimum and maximum values and standard deviation. Otherwise, file 66will be read in its entirety to generate those statistics. 67 68=back 69 70=head2 EXTENSIONS 71 72Several extensions to the WIG format "track" declaration are recognized. 73 74=over 4 75 76=item transform=<transform> 77 78Specify a transform to be performed on all numeric data within this 79track prior to loading into the binary wig file. Currently, the 80following three declarations are recognized: 81 82 transform=logtransform y' = 0 for y == 0 83 y' = log(y) for y > 0 84 y' = -log(-y) for y < 0 85 86 87 transform=logsquared y' = log(y**2) for y != 0 88 y' = 0 for y == 0 89 90 transform=none y' = y (no transform - the default) 91 92=item trim=<trim> 93 94Specify a trimming function to be performed on the data prior to 95scaling. Currently, the following trim functions are recognized: 96 97 trim=stdev1 trim to plus/minus 1 standard deviation of the mean 98 trim=stdev2 trim to plus/minus 2 standard deviations of the mean (default) 99 trim=stdevN trim to plus/minus N standard deviations of the mean 100 trim=none no trimming 101 102=back 103 104Example entended track declaration: 105 106 track type=wiggle_0 name="example" description="20 degrees, 2 hr" \ 107 trim=stdev2 transform=logsquared 108 109=cut 110 111use strict; 112 113use Carp 'croak'; 114use Statistics::Descriptive; 115use IO::Seekable; 116use File::Spec; 117use Bio::Graphics::Wiggle; 118use Bio::Graphics::FeatureFile; 119use Text::ParseWords(); 120use File::stat; 121use CGI 'escape'; 122 123use vars '%color_name'; 124 125# If a WIG file is very large (> 5 Mb) 126use constant BIG_FILE => 5_000_000; 127use constant BIG_FILE_SAMPLES => 5_000; # number of probes to make 128use constant DEFAULT_METHOD => 'microarray_oligo'; 129use constant DEFAULT_SOURCE => '.'; 130 131sub new { 132 my $class = shift; 133 my $base = shift 134 or croak "Usage: Bio::Graphics::Wiggle::Loader->new('/base/path','trackname')"; 135 my $trackname = shift || 'track'; 136 my $wigclass = shift || 'Bio::Graphics::Wiggle'; 137 -d $base && -w _ or croak "$base is not a writeable directory"; 138 return bless { 139 base => $base, 140 tracks => {}, 141 trackname => $trackname, 142 tracknum => '000', 143 track_options => {}, 144 allow_sampling => 0, 145 wigclass => $wigclass, 146 },ref $class || $class; 147} 148sub allow_sampling { 149 my $self = shift; 150 my $d = $self->{allow_sampling}; 151 $self->{allow_sampling} = shift if @_; 152 $d; 153} 154sub wigclass { 155 my $self = shift; 156 my $d = $self->{wigclass}; 157 $self->{wigclass} = shift if @_; 158 return $d; 159} 160sub basedir { shift->{base} } 161sub wigfiles { shift->{wigfiles} } 162sub conf_stanzas { 163 my $self = shift; 164 my ($method,$source) = @_; 165 $method ||= DEFAULT_METHOD; 166 $source ||= DEFAULT_SOURCE; 167 168 my $tracks = $self->{tracks}; 169 my @lines = (); 170 for my $track (sort keys %$tracks) { 171 172 my $options = $tracks->{$track}{display_options}; 173 my $name = $options->{name} ||= $track; 174 175 $options->{visibility} ||= 'dense'; 176 $options->{color} ||= $options->{visibility} =~ /pack/i ? '255,0,0' : '0,0,0'; 177 $options->{altColor} ||= $options->{visibility} =~ /pack/i ? '0,0,255' : '0,0,0'; 178 179 # stanza 180 push @lines,"[$track]"; 181 if (my $graph_type = $options->{glyph}) { 182 if ($graph_type =~ /box/) { 183 push @lines, "glyph = wiggle_box"; 184 } 185 else { 186 push @lines,"glyph = ". 187 ($graph_type =~/density/ ? 'wiggle_density' : 'wiggle_xyplot'); 188 } 189 } 190 else { 191 push @lines,"glyph = ". 192 ($options->{visibility}=~/pack/ ? 'wiggle_density' : 'wiggle_xyplot'); 193 } 194 push @lines,"key = $options->{name}" 195 if $options->{name}; 196 push @lines,"description = $options->{description}" 197 if $options->{description}; 198 if (my $color = $options->{color}) { 199 push @lines,"bgcolor=".format_color($color); 200 } 201 if (my $color = $options->{altColor}) { 202 push @lines,"fgcolor=" . format_color($color); 203 } 204 if (exists $options->{viewLimits} and my ($low,$hi) = split ':',$options->{viewLimits}) { 205 push @lines,"min_score = $low"; 206 push @lines,"max_score = $hi"; 207 } 208 if (exists $options->{maxHeightPixels} and my ($max,$default,$min) = 209 split ':',$options->{maxHeightPixels}) { 210 push @lines,"height = $default"; 211 } 212 push @lines,"smoothing = $options->{windowingFunction}" 213 if $options->{windowingFunction}; 214 215 my $smoothing_window = $options->{smoothingWindow} || 0; 216 217 push @lines,"smoothing window = $options->{smoothingWindow}" 218 if $options->{smoothingWindow}; 219 push @lines,''; 220 } 221 return join "\n",@lines; 222} 223 224sub featurefile { 225 my $self = shift; 226 my $type = shift; 227 my ($method,$source) = @_; 228 229 $method ||= DEFAULT_METHOD; 230 $source ||= DEFAULT_SOURCE; 231 232 $type ||= 'featurefile'; 233 $type =~ /^(gff3|featurefile)$/i 234 or croak "featurefile type must be one of 'gff3' or 'featurefile'"; 235 236 my @lines; 237 my $tracks = $self->{tracks}; 238 239 if ($type eq 'gff3') { 240 push @lines,"##gff-version 3",""; 241 } 242 else { 243 push @lines,$self->conf_stanzas($method,$source),""; 244 } 245 246 for my $track (sort keys %$tracks) { 247 my $options = $tracks->{$track}{display_options}; 248 my $name = $options->{name} ||= $track; 249 my $seqids = $tracks->{$track}{seqids}; 250 my $note = escape($options->{description}); 251 my @attributes; 252 push @attributes,qq(Name=$name) if defined $name; 253 push @attributes,qq(Note=$note) if defined $note; 254 255 # data, sorted by chromosome 256 my @seqid = sort keys %$seqids; 257 258 for my $seqid (@seqid) { 259 $seqid or next; 260 $tracks->{$track}{seqids}{$seqid}{wig}->write(); 261 my $attributes = join ';',(@attributes,"wigfile=$seqids->{$seqid}{wigpath}"); 262 if ($type eq 'gff3') { 263 push @lines,join "\t",($seqid,$source,$method, 264 $seqids->{$seqid}{start}, 265 $seqids->{$seqid}{end}, 266 '.','.','.', 267 $attributes 268 ); 269 } else { 270 push @lines,''; 271 push @lines,"reference=$seqid"; 272 push @lines,"$track $seqid.data $seqids->{$seqid}{start}..$seqids->{$seqid}{end} $attributes"; 273 } 274 275 } 276 277 } 278 279 return join("\n",@lines)."\n"; 280} 281 282sub features { 283 my $self = shift; 284 my $text = $self->featurefile('featurefile'); 285 my $file = Bio::Graphics::FeatureFile->new(-text=>$text); 286 return $file->features; 287} 288 289 290sub load { 291 my $self = shift; 292 my $infh = shift; 293 my $format = 'none'; 294 295 local $_; 296 LINE: while (<$infh>) { 297 chomp; 298 next if /^#/; 299 next unless /\S/; 300 301 if (/^track/) { 302 $self->process_track_line($_); 303 next; 304 } 305 306 if (/^fixedStep/) { 307 $self->process_fixed_step_declaration($_); 308 $format = 'fixed'; 309 } 310 311 if (/^variableStep/) { 312 $self->process_variable_step_declaration($_); 313 $format = 'variable'; 314 } 315 316 if (/^\S+\s+\d+\s+\d+\s+-?[\dEe.]+/) { 317 $self->process_first_bedline($_); 318 $format = 'bed'; 319 } 320 321 if ($format ne 'none') { 322 # remember where we are, find min and max values, return 323 my $pos = tell($infh); 324 $self->minmax($infh,$format eq 'bed' ? $_ : '') 325 unless $self->{track_options}{chrom} && 326 exists $self->current_track->{seqids}{$self->{track_options}{chrom}}{min}; 327 seek($infh,$pos,0); 328 329 $self->process_bed($infh,$_) if $format eq 'bed'; 330 $self->process_fixedline($infh) if $format eq 'fixed'; 331 $self->process_variableline($infh) if $format eq 'variable'; 332 333 $format = 'none'; 334 } 335 336 redo LINE if defined $_ && /^(track|variableStep|fixedStep)/; 337 } 338 339 return 1; 340} 341 342sub process_track_line { 343 my $self = shift; 344 my $line = shift; 345 my @tokens = shellwords($line); 346 shift @tokens; 347 my %options = map {split '='} @tokens; 348 $options{type} eq 'wiggle_0' or croak "invalid/unknown wiggle track type $options{type}"; 349 delete $options{type}; 350 $self->{tracknum}++; 351 $self->current_track->{display_options} = \%options; 352} 353 354sub process_fixed_step_declaration { 355 my $self = shift; 356 my $line = shift; 357 my @tokens = shellwords($line); 358 shift @tokens; 359 my %options = map {split '='} @tokens; 360 exists $options{chrom} or croak "invalid fixedStep line: need a chrom option"; 361 exists $options{start} or croak "invalid fixedStep line: need a start option"; 362 exists $options{step} or croak "invalid fixedStep line: need a step option"; 363 $self->{track_options} = \%options; 364} 365 366sub process_variable_step_declaration { 367 my $self = shift; 368 my $line = shift; 369 my @tokens = shellwords($line); 370 shift @tokens; 371 my %options = map {split '='} @tokens; 372 exists $options{chrom} or croak "invalid variableStep line: need a chrom option"; 373 $self->{track_options} = \%options; 374} 375 376sub process_first_bedline { 377 my $self = shift; 378 my $line = shift; 379 my @tokens = shellwords($line); 380 $self->{track_options} = {chrom => $tokens[0]}; 381} 382 383sub current_track { 384 my $self = shift; 385 return $self->{tracks}{$self->{tracknum}} ||= {}; 386} 387 388sub minmax { 389 my $self = shift; 390 my ($infh,$bedline) = @_; 391 local $_; 392 393 my $transform = $self->get_transform; 394 395 my $seqids = ($self->current_track->{seqids} ||= {}); 396 my $chrom = $self->{track_options}{chrom}; 397 398 if ($self->allow_sampling && (my $size = stat($infh)->size) > BIG_FILE) { 399 warn "Wiggle file is very large; resorting to genome-wide sample statistics for $chrom.\n"; 400 $self->{FILEWIDE_STATS} ||= $self->sample_file($infh,BIG_FILE_SAMPLES); 401 for (keys %{$self->{FILEWIDE_STATS}}) { 402 $seqids->{$chrom}{$_} = $self->{FILEWIDE_STATS}{$_}; 403 } 404 return; 405 } 406 407 my %stats; 408 if ($bedline) { # left-over BED line 409 my @tokens = split /\s+/,$bedline; 410 my $seqid = $tokens[0]; 411 my $value = $tokens[-1]; 412 $value = $transform->($self,$value) if $transform; 413 $stats{$seqid} ||= Statistics::Descriptive::Sparse->new(); 414 $stats{$seqid}->add_data($value); 415 } 416 417 while (<$infh>) { 418 last if /^track/; 419 last if /chrom=(\S+)/ && $1 ne $chrom; 420 next if /^\#|fixedStep|variableStep/; 421 my @tokens = split(/\s+/,$_) or next; 422 my $seqid = @tokens > 3 ? $tokens[0] : $chrom; 423 my $value = $tokens[-1]; 424 $value = $transform->($self,$value) if $transform; 425 $stats{$seqid} ||= Statistics::Descriptive::Sparse->new(); 426 $stats{$seqid}->add_data($value); 427 } 428 429 for my $seqid (keys %stats) { 430 $seqids->{$seqid}{min} = $stats{$seqid}->min(); 431 $seqids->{$seqid}{max} = $stats{$seqid}->max(); 432 $seqids->{$seqid}{mean} = $stats{$seqid}->mean(); 433 $seqids->{$seqid}{stdev} = $stats{$seqid}->standard_deviation(); 434 } 435} 436 437sub sample_file { 438 my $self = shift; 439 440 my ($fh,$samples) = @_; 441 442 my $transform = $self->get_transform; 443 444 my $stats = Statistics::Descriptive::Sparse->new(); 445 446 my $size = stat($fh)->size; 447 my $count=0; 448 while ($count < $samples) { 449 seek($fh,int(rand $size),0) or die; 450 scalar <$fh>; # toss first line 451 my $line = <$fh>; # next full line 452 $line or next; 453 my @tokens = split /\s+/,$line; 454 my $value = $tokens[-1]; 455 next unless $value =~ /^[\d\seE.+-]+$/; # non-numeric 456 $value = $transform->($self,$value) if $transform; 457 $stats->add_data($value); 458 $count++; 459 } 460 461 return { 462 min => $stats->min, 463 max => $stats->max, 464 mean => $stats->mean, 465 stdev => $stats->standard_deviation, 466 }; 467} 468 469sub get_transform { 470 my $self = shift; 471 my $transform = $self->current_track->{display_options}{transform}; 472 return $self->can($transform) if $transform; 473} 474 475# one and only transform currently defined 476# Natural log of the square of the value. 477# Return 0 if the value is 0 478sub logsquared { 479 my $self = shift; 480 my $value = shift; 481 return 0 if $value == 0; 482 return log($value**2); 483} 484 485sub logtransform { 486 my $self = shift; 487 my $value = shift; 488 return 0 if $value == 0; 489 if ($value < 0) { 490 return -log(-$value); 491 } else { 492 return log($value); 493 } 494} 495 496sub process_bed { 497 my $self = shift; 498 my $infh = shift; 499 my $oops = shift; 500 my $transform = $self->get_transform; 501 $self->process_bedline($oops) if $oops; 502 while (<$infh>) { 503 last if /^track/; 504 next if /^#/; 505 chomp; 506 $self->process_bedline($_); 507 } 508} 509 510sub process_bedline { 511 my $self = shift; 512 my ($line,$transform) = @_; 513 514 my ($seqid,$start,$end,$value) = split /\s+/,$line; 515 $value = $transform->($self,$value) if $transform; 516 $start++; # to 1-based coordinates 517 518 my $wigfile = $self->wigfile($seqid); 519 $wigfile->set_range($start=>$end, $value); 520 521 # update span 522 $self->current_track->{seqids}{$seqid}{start} = $start 523 unless exists $self->current_track->{seqids}{$seqid}{start} 524 and $self->current_track->{seqids}{$seqid}{start} < $start; 525 526 $self->current_track->{seqids}{$seqid}{end} = $end 527 unless exists $self->current_track->{seqids}{$seqid}{end} 528 and $self->current_track->{seqids}{$seqid}{end} > $end; 529} 530 531sub process_fixedline { 532 my $self = shift; 533 my $infh = shift; 534 my $seqid = $self->{track_options}{chrom}; 535 my $wigfile = $self->wigfile($seqid); 536 my $start = $self->{track_options}{start}; 537 my $step = $self->{track_options}{step}; 538 my $span = $wigfile->span; 539 540 # update start and end positions 541 $self->{track_options}{span} ||= $wigfile->span || 1; 542 my $chrom = $self->current_track->{seqids}{$seqid}; 543 $chrom->{start} = $start 544 if !defined $chrom->{start} || $chrom->{start} > $start; 545 my $end = $chrom->{start} + $span - 1; 546 $chrom->{end} = $end 547 if !defined $chrom->{end} || $chrom->{end} < $end; 548 549 my $transform = $self->get_transform; 550 551 # write out data in 500K chunks for efficiency 552 my @buffer; 553 while (<$infh>) { 554 last if /^(track|variableStep|fixedStep)/; 555 next if /^#/; 556 chomp; 557 push @buffer,$_; 558 if (@buffer >= 500_000) { 559 @buffer = map {$transform->($self,$_)} @buffer if $transform; 560 $wigfile->set_values($start=>\@buffer); 561 my $big_step = $step * @buffer; 562 $start += $big_step; 563 $self->current_track->{seqids}{$seqid}{end} = $start + $big_step - 1 + $span; 564 @buffer = (); # reset at the end 565 } 566 567 } 568 @buffer = map {$transform->($self,$_)} @buffer if $transform; 569 $wigfile->set_values($start=>\@buffer) if @buffer; 570 $self->current_track->{seqids}{$seqid}{end} = 571 $start + @buffer*$step - 1 + $span; 572} 573 574sub process_variableline { 575 my $self = shift; 576 my $infh = shift; 577 my $seqid = $self->{track_options}{chrom}; 578 my $chrom = $self->current_track->{seqids}{$seqid}; 579 my $wigfile = $self->wigfile($seqid); 580 my $span = $wigfile->span; 581 my $transform = $self->get_transform; 582 583 while (<$infh>) { 584 last if /^(track|variableStep|fixedStep)/; 585 next if /^#/; 586 chomp; 587 my ($start,$value) = split /\s+/ or next; 588 $value = $transform->($self,$value) if $transform; 589 eval { 590 $wigfile->set_value($start=>$value); 591 1; 592 } or croak "Data error on line $.: $_\nDetails: $@"; 593 594 # update span 595 $chrom->{start} = $start 596 if !defined $chrom->{start} || $chrom->{start} > $start; 597 my $end = $start + $span - 1; 598 $chrom->{end} = $end 599 if !defined $chrom->{end} || $chrom->{end} < $end; 600 601 } 602 603 $self->current_track->{seqids}{$seqid}{end} 604 ||= $self->current_track->{seqids}{$seqid}{start}; 605} 606 607sub wigfile { 608 my $self = shift; 609 my $seqid = shift; 610 my $ts = time(); 611 my $current_track = $self->{tracknum}; 612 my $tname = $self->{trackname} || 'track'; 613 unless (exists $self->current_track->{seqids}{$seqid}{wig}) { 614 my $path = File::Spec->catfile($self->{base},"$tname\_$current_track.$seqid.$ts.wib"); 615 my @stats; 616 foreach (qw(min max mean stdev)) { 617 my $value = $self->current_track->{seqids}{$seqid}{$_} || 618 $self->{FILEWIDE_STATS}{$_} || next; 619 push @stats,($_=>$value); 620 } 621 622 my $step = $self->{track_options}{step} || 1; 623 my $span = $self->{track_options}{span} || 624 $self->{track_options}{step} || 625 1; 626 my $trim = $self->current_track->{display_options}{trim} || 'stdev10'; 627 my $transform = $self->current_track->{display_options}{transform}; 628 my $class = $self->wigclass; 629 unless ($class->can('new')) { 630 warn "loading $class"; 631 eval "require $class"; 632 die $@ if $@; 633 } 634 my $wigfile = $class->new( 635 $path, 636 1, 637 { 638 seqid => $seqid, 639 step => $step, 640 span => $span, 641 trim => $trim, 642 @stats, 643 }, 644 ); 645 $wigfile or croak "Couldn't create wigfile $wigfile: $!"; 646 $self->current_track->{seqids}{$seqid}{wig} = $wigfile; 647 $self->current_track->{seqids}{$seqid}{wigpath} = $path; 648 } 649 return $self->current_track->{seqids}{$seqid}{wig}; 650} 651 652sub format_color { 653 my $rgb = shift; 654 return $rgb unless $rgb =~ /\d+,\d+,\d+/; 655 my ($r,$g,$b) = split ',',$rgb; 656 my $hex = '#'.join '',map {sprintf("%02X",$_)}($r,$g,$b); 657 return translate_color($hex); 658} 659 660# use English names for the most common colors 661sub translate_color { 662 my $clr = shift; 663 unless (%color_name) { 664 while (<DATA>) { 665 chomp; 666 my ($hex,$name) = split or next; 667 $color_name{$hex} = $name; 668 } 669 } 670 return $color_name{$clr} || $clr; 671} 672 673# work around an annoying uninit variable warning from Text::Parsewords 674sub shellwords { 675 my @args = @_; 676 return unless @args; 677 foreach(@args) { 678 s/^\s+//; 679 s/\s+$//; 680 $_ = '' unless defined $_; 681 } 682 my @result = Text::ParseWords::shellwords(@args); 683 return @result; 684} 685 6861; 687 688 689__DATA__ 690#000000 black 691#FFFFFF white 692#0000FF blue 693#00FF00 green 694#FF0000 red 695#FFFF00 yellow 696#00FFFF cyan 697#FF00FF magenta 698#C0C0C0 gray 699 700 701__END__ 702 703=head1 SEE ALSO 704 705L<Bio::Graphics::Wiggle>, 706L<Bio::Graphics::Glyph::wiggle_xyplot>, 707L<Bio::Graphics::Glyph::wiggle_density>, 708L<Bio::Graphics::Panel>, 709L<Bio::Graphics::Glyph>, 710L<Bio::Graphics::Feature>, 711L<Bio::Graphics::FeatureFile> 712 713=head1 AUTHOR 714 715Lincoln Stein E<lt>lstein@cshl.orgE<gt>. 716 717Copyright (c) 2007 Cold Spring Harbor Laboratory 718 719This package and its accompanying libraries is free software; you can 720redistribute it and/or modify it under the terms of the GPL (either 721version 1, or at your option, any later version) or the Artistic 722License 2.0. Refer to LICENSE for the full license text. In addition, 723please see DISCLAIMER.txt for disclaimers of warranty. 724 725=cut 726