1package Bio::Graphics::Wiggle::Loader;
2
3=head1 SYNOPSIS
4
5  my $loader = Bio::Graphics::Wiggle::Loader->new('/base/directory/for/wigfiles','wibfilename');
6  my $fh = IO::File->new('uploaded_file.txt');
7  $loader->load($fh);
8
9  my $gff3_file   = $loader->featurefile('gff3',$method,$source);
10  my $featurefile = $loader->featurefile('featurefile');
11  my @features    = $loader->features();
12
13=head1 USAGE
14
15This module loads Bio::Graphics::Wiggle files from source files that
16use Jim Kent's "WIG" format:
17
18http://genome.ucsc.edu/google/goldenPath/help/wiggle.html
19
20Several data sets can be grouped together in a single WIG source
21file. The load() method accepts the path to a WIG source file, and
22will create one or more .wib ("wiggle binary") databases of
23quantitative data in the directory indicated when you created the
24loader. Call the featurefile() method to return a text file in either
25GFF3 or Bio::Graphics::FeatureFile format, suitable for loading into a
26gbrowse database.
27
28=head2 METHODS
29
30=over 4
31
32=item $loader = Bio::Graphics::Wiggle::Loader->new('/base/directory' [,'my_data'])
33
34Create a new loader. The first argument specifies the base directory
35in which the loaded .wib files will be created. The second argument
36specifies the base name for the created .wib files, or "track" if not
37specified.
38
39=item $loader->load($fh)
40
41Load the data from a source WIG file opened on a filehandle.
42
43=item $data = $loader->featurefile($type [,$method,$source])
44
45Return the data corresponding to a GFF3 or
46Bio::Graphics::FeatureFile. The returned file will have one feature
47per WIG track, and a properly formatted "wigfile" attribute that
48directs Bio::Graphics to the location of the quantitative data.
49
50$type is one of "gff3" or "featurefile". In the case of "gff3", you
51may specify an optional method and source for use in describing each
52feature. In the case of "featurefile", the returned file will contain
53GBrowse stanzas that describe a reasonable starting format to display
54the data.
55
56=item @features = $loader->features
57
58Returns one or more Bio::Graphics::Features objects, which can be used to
59create Bio::Graphics tracks with the wiggle_xyplot (and related) glyphs.
60
61=item $loader->allow_sampling(1)
62
63If allow_sampling() is passed a true value, then very large files
64(more than 5 MB) will undergo a sampling procedure to find their
65minimum and maximum values and standard deviation. Otherwise, file
66will be read in its entirety to generate those statistics.
67
68=back
69
70=head2 EXTENSIONS
71
72Several extensions to the WIG format "track" declaration are recognized.
73
74=over 4
75
76=item transform=<transform>
77
78Specify a transform to be performed on all numeric data within this
79track prior to loading into the binary wig file. Currently, the
80following three declarations are recognized:
81
82 transform=logtransform y' = 0          for  y == 0
83                        y' = log(y)     for  y >  0
84                        y' = -log(-y)   for  y <  0
85
86
87 transform=logsquared  y' = log(y**2) for y != 0
88                       y' = 0         for y == 0
89
90 transform=none        y' = y   (no transform - the default)
91
92=item trim=<trim>
93
94Specify a trimming function to be performed on the data prior to
95scaling. Currently, the following trim functions are recognized:
96
97 trim=stdev1           trim to plus/minus 1 standard deviation of the mean
98 trim=stdev2           trim to plus/minus 2 standard deviations of the mean (default)
99 trim=stdevN           trim to plus/minus N standard deviations of the mean
100 trim=none             no trimming
101
102=back
103
104Example entended track declaration:
105
106 track type=wiggle_0 name="example" description="20 degrees, 2 hr"  \
107       trim=stdev2 transform=logsquared
108
109=cut
110
111use strict;
112
113use Carp 'croak';
114use Statistics::Descriptive;
115use IO::Seekable;
116use File::Spec;
117use Bio::Graphics::Wiggle;
118use Bio::Graphics::FeatureFile;
119use Text::ParseWords();
120use File::stat;
121use CGI 'escape';
122
123use vars '%color_name';
124
125# If a WIG file is very large (> 5 Mb)
126use constant BIG_FILE         => 5_000_000;
127use constant BIG_FILE_SAMPLES => 5_000;     # number of probes to make
128use constant DEFAULT_METHOD   => 'microarray_oligo';
129use constant DEFAULT_SOURCE   => '.';
130
131sub new {
132  my $class = shift;
133  my $base      = shift
134                 or croak "Usage: Bio::Graphics::Wiggle::Loader->new('/base/path','trackname')";
135  my $trackname = shift || 'track';
136  my $wigclass  = shift || 'Bio::Graphics::Wiggle';
137  -d $base && -w _  or croak "$base is not a writeable directory";
138  return bless {
139		base            => $base,
140		tracks          => {},
141		trackname       => $trackname,
142	        tracknum        => '000',
143		track_options   => {},
144		allow_sampling  => 0,
145		wigclass        => $wigclass,
146	       },ref $class || $class;
147}
148sub allow_sampling {
149    my $self = shift;
150    my $d    = $self->{allow_sampling};
151    $self->{allow_sampling} = shift if @_;
152    $d;
153}
154sub wigclass {
155    my $self = shift;
156    my $d    = $self->{wigclass};
157    $self->{wigclass} = shift if @_;
158    return $d;
159}
160sub basedir  { shift->{base}     }
161sub wigfiles { shift->{wigfiles} }
162sub conf_stanzas {
163    my $self = shift;
164    my ($method,$source) = @_;
165    $method ||= DEFAULT_METHOD;
166    $source ||= DEFAULT_SOURCE;
167
168    my $tracks = $self->{tracks};
169    my @lines = ();
170    for my $track (sort keys %$tracks) {
171
172	my $options    = $tracks->{$track}{display_options};
173	my $name       = $options->{name} ||= $track;
174
175	$options->{visibility} ||= 'dense';
176	$options->{color}      ||= $options->{visibility} =~ /pack/i ? '255,0,0' : '0,0,0';
177	$options->{altColor}   ||= $options->{visibility} =~ /pack/i ? '0,0,255' : '0,0,0';
178
179	# stanza
180	push @lines,"[$track]";
181	if (my $graph_type = $options->{glyph}) {
182	    if ($graph_type =~ /box/) {
183		push @lines, "glyph       = wiggle_box";
184	    }
185	    else {
186		push @lines,"glyph       = ".
187		    ($graph_type =~/density/ ? 'wiggle_density' : 'wiggle_xyplot');
188	    }
189	}
190	else {
191	    push @lines,"glyph       = ".
192		($options->{visibility}=~/pack/ ? 'wiggle_density' : 'wiggle_xyplot');
193	}
194	push @lines,"key         = $options->{name}"
195	    if $options->{name};
196	push @lines,"description = $options->{description}"
197	    if $options->{description};
198	if (my $color = $options->{color}) {
199	    push @lines,"bgcolor=".format_color($color);
200	}
201	if (my $color = $options->{altColor}) {
202	    push @lines,"fgcolor=" . format_color($color);
203	}
204	if (exists $options->{viewLimits} and my ($low,$hi) = split ':',$options->{viewLimits}) {
205	    push @lines,"min_score   =  $low";
206	    push @lines,"max_score   =  $hi";
207	}
208	if (exists $options->{maxHeightPixels} and my ($max,$default,$min) =
209	    split ':',$options->{maxHeightPixels}) {
210	    push @lines,"height  = $default";
211	}
212	push @lines,"smoothing        = $options->{windowingFunction}"
213	    if $options->{windowingFunction};
214
215	my $smoothing_window = $options->{smoothingWindow} || 0;
216
217	push @lines,"smoothing window = $options->{smoothingWindow}"
218	    if $options->{smoothingWindow};
219	push @lines,'';
220    }
221    return join "\n",@lines;
222}
223
224sub featurefile {
225  my $self             = shift;
226  my $type             = shift;
227  my ($method,$source) = @_;
228
229  $method ||= DEFAULT_METHOD;
230  $source ||= DEFAULT_SOURCE;
231
232  $type ||= 'featurefile';
233  $type =~ /^(gff3|featurefile)$/i
234    or croak "featurefile type must be one of 'gff3' or 'featurefile'";
235
236  my @lines;
237  my $tracks = $self->{tracks};
238
239  if ($type eq 'gff3') {
240      push @lines,"##gff-version 3","";
241  }
242  else {
243      push @lines,$self->conf_stanzas($method,$source),"";
244  }
245
246  for my $track (sort keys %$tracks) {
247    my $options    = $tracks->{$track}{display_options};
248    my $name       = $options->{name} ||= $track;
249    my $seqids     = $tracks->{$track}{seqids};
250    my $note       = escape($options->{description});
251    my @attributes;
252    push @attributes,qq(Name=$name)        if defined $name;
253    push @attributes,qq(Note=$note)        if defined $note;
254
255    # data, sorted by chromosome
256    my @seqid  = sort keys %$seqids;
257
258    for my $seqid (@seqid) {
259      $seqid or next;
260      $tracks->{$track}{seqids}{$seqid}{wig}->write();
261      my $attributes = join ';',(@attributes,"wigfile=$seqids->{$seqid}{wigpath}");
262      if ($type eq 'gff3') {
263	  push @lines,join "\t",($seqid,$source,$method,
264				 $seqids->{$seqid}{start},
265				 $seqids->{$seqid}{end},
266				 '.','.','.',
267				 $attributes
268	  );
269      } else {
270	  push @lines,'';
271	  push @lines,"reference=$seqid";
272	  push @lines,"$track $seqid.data $seqids->{$seqid}{start}..$seqids->{$seqid}{end} $attributes";
273      }
274
275    }
276
277  }
278
279  return join("\n",@lines)."\n";
280}
281
282sub features {
283    my $self = shift;
284    my $text = $self->featurefile('featurefile');
285    my $file = Bio::Graphics::FeatureFile->new(-text=>$text);
286    return $file->features;
287}
288
289
290sub load {
291  my $self  = shift;
292  my $infh  = shift;
293  my $format = 'none';
294
295  local $_;
296 LINE: while (<$infh>) {
297    chomp;
298    next if /^#/;
299    next unless /\S/;
300
301    if (/^track/) {
302      $self->process_track_line($_);
303      next;
304    }
305
306    if (/^fixedStep/) {
307      $self->process_fixed_step_declaration($_);
308      $format = 'fixed';
309    }
310
311    if (/^variableStep/) {
312      $self->process_variable_step_declaration($_);
313      $format = 'variable';
314    }
315
316    if (/^\S+\s+\d+\s+\d+\s+-?[\dEe.]+/) {
317      $self->process_first_bedline($_);
318      $format    = 'bed';
319    }
320
321    if ($format ne 'none') {
322      # remember where we are, find min and max values, return
323      my $pos = tell($infh);
324      $self->minmax($infh,$format eq 'bed' ? $_ : '')
325	  unless $self->{track_options}{chrom} &&
326	         exists $self->current_track->{seqids}{$self->{track_options}{chrom}}{min};
327      seek($infh,$pos,0);
328
329      $self->process_bed($infh,$_)        if $format eq 'bed';
330      $self->process_fixedline($infh)     if $format eq 'fixed';
331      $self->process_variableline($infh)  if $format eq 'variable';
332
333      $format = 'none';
334    }
335
336    redo LINE if defined $_ && /^(track|variableStep|fixedStep)/;
337  }
338
339  return 1;
340}
341
342sub process_track_line {
343  my $self      = shift;
344  my $line      = shift;
345  my @tokens    = shellwords($line);
346  shift @tokens;
347  my %options = map {split '='} @tokens;
348  $options{type} eq 'wiggle_0' or croak "invalid/unknown wiggle track type $options{type}";
349  delete $options{type};
350  $self->{tracknum}++;
351  $self->current_track->{display_options} = \%options;
352}
353
354sub process_fixed_step_declaration {
355  my $self  = shift;
356  my $line  = shift;
357  my @tokens    = shellwords($line);
358  shift @tokens;
359  my %options = map {split '='} @tokens;
360  exists $options{chrom}        or croak "invalid fixedStep line: need a chrom option";
361  exists $options{start}        or croak "invalid fixedStep line: need a start option";
362  exists $options{step}         or croak "invalid fixedStep line: need a step  option";
363  $self->{track_options} = \%options;
364}
365
366sub process_variable_step_declaration {
367  my $self  = shift;
368  my $line  = shift;
369  my @tokens    = shellwords($line);
370  shift @tokens;
371  my %options = map {split '='} @tokens;
372  exists $options{chrom}        or croak "invalid variableStep line: need a chrom option";
373  $self->{track_options} = \%options;
374}
375
376sub process_first_bedline {
377  my $self  = shift;
378  my $line  = shift;
379  my @tokens    = shellwords($line);
380  $self->{track_options} = {chrom => $tokens[0]};
381}
382
383sub current_track {
384  my $self = shift;
385  return $self->{tracks}{$self->{tracknum}} ||= {};
386}
387
388sub minmax {
389  my $self   = shift;
390  my ($infh,$bedline) = @_;
391  local $_;
392
393  my $transform  = $self->get_transform;
394
395  my $seqids = ($self->current_track->{seqids} ||= {});
396  my $chrom  = $self->{track_options}{chrom};
397
398  if ($self->allow_sampling && (my $size = stat($infh)->size) > BIG_FILE) {
399      warn "Wiggle file is very large; resorting to genome-wide sample statistics for $chrom.\n";
400      $self->{FILEWIDE_STATS} ||= $self->sample_file($infh,BIG_FILE_SAMPLES);
401      for (keys %{$self->{FILEWIDE_STATS}}) {
402	$seqids->{$chrom}{$_} = $self->{FILEWIDE_STATS}{$_};
403      }
404      return;
405  }
406
407  my %stats;
408  if ($bedline) {  # left-over BED line
409      my @tokens = split /\s+/,$bedline;
410      my $seqid  = $tokens[0];
411      my $value  = $tokens[-1];
412      $value = $transform->($self,$value) if $transform;
413      $stats{$seqid} ||= Statistics::Descriptive::Sparse->new();
414      $stats{$seqid}->add_data($value);
415  }
416
417  while (<$infh>) {
418      last if /^track/;
419      last if /chrom=(\S+)/ && $1 ne $chrom;
420      next if /^\#|fixedStep|variableStep/;
421      my @tokens = split(/\s+/,$_) or next;
422      my $seqid  = @tokens > 3 ? $tokens[0] : $chrom;
423      my $value  = $tokens[-1];
424      $value = $transform->($self,$value) if $transform;
425      $stats{$seqid} ||= Statistics::Descriptive::Sparse->new();
426      $stats{$seqid}->add_data($value);
427  }
428
429  for my $seqid (keys %stats) {
430      $seqids->{$seqid}{min}    = $stats{$seqid}->min();
431      $seqids->{$seqid}{max}    = $stats{$seqid}->max();
432      $seqids->{$seqid}{mean}   = $stats{$seqid}->mean();
433      $seqids->{$seqid}{stdev}  = $stats{$seqid}->standard_deviation();
434  }
435}
436
437sub sample_file {
438    my $self = shift;
439
440    my ($fh,$samples) = @_;
441
442    my $transform  = $self->get_transform;
443
444    my $stats = Statistics::Descriptive::Sparse->new();
445
446    my $size = stat($fh)->size;
447    my $count=0;
448    while ($count < $samples) {
449	seek($fh,int(rand $size),0) or die;
450	scalar <$fh>; # toss first line
451	my $line = <$fh>; # next full line
452	$line or next;
453	my @tokens = split /\s+/,$line;
454	my $value  = $tokens[-1];
455	next unless $value =~ /^[\d\seE.+-]+$/; # non-numeric
456	$value = $transform->($self,$value) if $transform;
457	$stats->add_data($value);
458	$count++;
459    }
460
461    return {
462	min   => $stats->min,
463	max   => $stats->max,
464	mean  => $stats->mean,
465	stdev => $stats->standard_deviation,
466    };
467}
468
469sub get_transform {
470    my $self = shift;
471    my $transform  = $self->current_track->{display_options}{transform};
472    return $self->can($transform) if $transform;
473}
474
475# one and only transform currently defined
476# Natural log of the square of the value.
477# Return 0 if the value is 0
478sub logsquared {
479    my $self  = shift;
480    my $value = shift;
481    return 0 if $value == 0;
482    return log($value**2);
483}
484
485sub logtransform {
486  my $self  = shift;
487  my $value = shift;
488  return 0 if $value == 0;
489  if ($value < 0) {
490    return -log(-$value);
491  } else {
492    return log($value);
493  }
494}
495
496sub process_bed {
497  my $self = shift;
498  my $infh = shift;
499  my $oops = shift;
500  my $transform  = $self->get_transform;
501  $self->process_bedline($oops) if $oops;
502  while (<$infh>) {
503    last if /^track/;
504    next if /^#/;
505    chomp;
506    $self->process_bedline($_);
507  }
508}
509
510sub process_bedline {
511  my $self = shift;
512  my ($line,$transform) = @_;
513
514  my ($seqid,$start,$end,$value) = split /\s+/,$line;
515  $value = $transform->($self,$value) if $transform;
516  $start++;   # to 1-based coordinates
517
518  my $wigfile = $self->wigfile($seqid);
519  $wigfile->set_range($start=>$end, $value);
520
521  # update span
522  $self->current_track->{seqids}{$seqid}{start} = $start
523    unless exists $self->current_track->{seqids}{$seqid}{start}
524      and $self->current_track->{seqids}{$seqid}{start} < $start;
525
526  $self->current_track->{seqids}{$seqid}{end}   = $end
527    unless exists $self->current_track->{seqids}{$seqid}{end}
528      and $self->current_track->{seqids}{$seqid}{end} > $end;
529}
530
531sub process_fixedline {
532  my $self  = shift;
533  my $infh  = shift;
534  my $seqid   = $self->{track_options}{chrom};
535  my $wigfile = $self->wigfile($seqid);
536  my $start   = $self->{track_options}{start};
537  my $step    = $self->{track_options}{step};
538  my $span    = $wigfile->span;
539
540  # update start and end positions
541  $self->{track_options}{span} ||= $wigfile->span || 1;
542  my $chrom = $self->current_track->{seqids}{$seqid};
543  $chrom->{start} = $start
544      if !defined $chrom->{start} || $chrom->{start} > $start;
545  my $end = $chrom->{start} + $span - 1;
546  $chrom->{end}   = $end
547      if !defined $chrom->{end} || $chrom->{end} < $end;
548
549  my $transform  = $self->get_transform;
550
551  # write out data in 500K chunks for efficiency
552  my @buffer;
553  while (<$infh>) {
554    last if /^(track|variableStep|fixedStep)/;
555    next if /^#/;
556    chomp;
557    push @buffer,$_;
558    if (@buffer >= 500_000) {
559	@buffer = map {$transform->($self,$_)} @buffer if $transform;
560	$wigfile->set_values($start=>\@buffer);
561	my $big_step = $step * @buffer;
562	$start += $big_step;
563	$self->current_track->{seqids}{$seqid}{end} = $start + $big_step - 1 + $span;
564	@buffer = (); # reset at the end
565    }
566
567  }
568  @buffer = map {$transform->($self,$_)} @buffer if $transform;
569  $wigfile->set_values($start=>\@buffer)         if @buffer;
570  $self->current_track->{seqids}{$seqid}{end} =
571      $start + @buffer*$step - 1 + $span;
572}
573
574sub process_variableline {
575  my $self  = shift;
576  my $infh  = shift;
577  my $seqid   = $self->{track_options}{chrom};
578  my $chrom   = $self->current_track->{seqids}{$seqid};
579  my $wigfile = $self->wigfile($seqid);
580  my $span    = $wigfile->span;
581  my $transform  = $self->get_transform;
582
583  while (<$infh>) {
584    last if /^(track|variableStep|fixedStep)/;
585    next if /^#/;
586    chomp;
587    my ($start,$value) = split /\s+/ or next;
588    $value = $transform->($self,$value) if $transform;
589    eval {
590	$wigfile->set_value($start=>$value);
591	1;
592    } or croak "Data error on line $.: $_\nDetails: $@";
593
594    # update span
595    $chrom->{start} = $start
596	if !defined $chrom->{start} || $chrom->{start} > $start;
597    my $end = $start + $span - 1;
598    $chrom->{end}   = $end
599	if !defined $chrom->{end} || $chrom->{end} < $end;
600
601  }
602
603  $self->current_track->{seqids}{$seqid}{end}
604        ||= $self->current_track->{seqids}{$seqid}{start};
605}
606
607sub wigfile {
608  my $self  = shift;
609  my $seqid = shift;
610  my $ts    = time();
611  my $current_track = $self->{tracknum};
612  my $tname         = $self->{trackname} || 'track';
613  unless (exists $self->current_track->{seqids}{$seqid}{wig}) {
614    my $path    = File::Spec->catfile($self->{base},"$tname\_$current_track.$seqid.$ts.wib");
615    my @stats;
616    foreach (qw(min max mean stdev)) {
617	my $value = $self->current_track->{seqids}{$seqid}{$_} ||
618	    $self->{FILEWIDE_STATS}{$_} || next;
619	push @stats,($_=>$value);
620   }
621
622    my $step = $self->{track_options}{step} || 1;
623    my $span = $self->{track_options}{span} ||
624	$self->{track_options}{step}        ||
625	1;
626    my $trim      = $self->current_track->{display_options}{trim} || 'stdev10';
627    my $transform = $self->current_track->{display_options}{transform};
628    my $class   = $self->wigclass;
629    unless ($class->can('new')) {
630	warn "loading $class";
631	eval "require $class";
632	die $@ if $@;
633    }
634    my $wigfile = $class->new(
635	$path,
636	1,
637	{
638	    seqid => $seqid,
639	    step  => $step,
640	    span  => $span,
641	    trim  => $trim,
642	    @stats,
643	},
644	);
645    $wigfile or croak "Couldn't create wigfile $wigfile: $!";
646    $self->current_track->{seqids}{$seqid}{wig}     = $wigfile;
647    $self->current_track->{seqids}{$seqid}{wigpath} = $path;
648  }
649  return $self->current_track->{seqids}{$seqid}{wig};
650}
651
652sub format_color {
653  my $rgb = shift;
654  return $rgb unless $rgb =~ /\d+,\d+,\d+/;
655  my ($r,$g,$b) = split ',',$rgb;
656  my $hex = '#'.join '',map {sprintf("%02X",$_)}($r,$g,$b);
657  return translate_color($hex);
658}
659
660# use English names for the most common colors
661sub translate_color {
662  my $clr = shift;
663  unless  (%color_name) {
664    while (<DATA>) {
665      chomp;
666      my ($hex,$name) = split or next;
667      $color_name{$hex} = $name;
668    }
669  }
670  return $color_name{$clr} || $clr;
671}
672
673# work around an annoying uninit variable warning from Text::Parsewords
674sub shellwords {
675    my @args = @_;
676    return unless @args;
677    foreach(@args) {
678	s/^\s+//;
679	s/\s+$//;
680	$_ = '' unless defined $_;
681    }
682    my @result = Text::ParseWords::shellwords(@args);
683    return @result;
684}
685
6861;
687
688
689__DATA__
690#000000 black
691#FFFFFF white
692#0000FF blue
693#00FF00 green
694#FF0000 red
695#FFFF00 yellow
696#00FFFF cyan
697#FF00FF magenta
698#C0C0C0 gray
699
700
701__END__
702
703=head1 SEE ALSO
704
705L<Bio::Graphics::Wiggle>,
706L<Bio::Graphics::Glyph::wiggle_xyplot>,
707L<Bio::Graphics::Glyph::wiggle_density>,
708L<Bio::Graphics::Panel>,
709L<Bio::Graphics::Glyph>,
710L<Bio::Graphics::Feature>,
711L<Bio::Graphics::FeatureFile>
712
713=head1 AUTHOR
714
715Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
716
717Copyright (c) 2007 Cold Spring Harbor Laboratory
718
719This package and its accompanying libraries is free software; you can
720redistribute it and/or modify it under the terms of the GPL (either
721version 1, or at your option, any later version) or the Artistic
722License 2.0.  Refer to LICENSE for the full license text. In addition,
723please see DISCLAIMER.txt for disclaimers of warranty.
724
725=cut
726