1package Bio::SCF;
2
3use strict;
4use vars qw($VERSION @ISA);
5
6require DynaLoader;
7use Bio::SCF::Arrays;
8use Carp 'croak';
9
10@ISA = qw(DynaLoader);
11$VERSION = '1.03';
12use constant KEYS => {
13		     index	=> 0,
14		     A		=> 1,
15		     C		=> 2,
16		     G		=> 3,
17		     T		=> 4,
18		     bases	=> 5,
19		     spare1	=> 6,
20		     spare1	=> 7,
21		     spare1	=> 8,
22		     samplesA	=> 11,
23		     samplesC	=> 12,
24		     samplesG	=> 13,
25		     samplesT	=> 14
26		    };
27
28use constant HEADER_FIELDS => {
29			       samples_length => 0,
30			       bases_length   => 1,
31			       version        => 2,
32			       sample_size    => 3,
33			       code_set       => 4,
34};
35
36bootstrap Bio::SCF $VERSION;
37
38sub new {
39  my $class       = shift;
40  my $file_name   = shift;
41  my $sample_hash = shift || 0;
42
43  defined $file_name or die "SCF :: Unable to tie hash to undefined file name\n";
44  my $scf_pointer;
45  if ($sample_hash) {
46    $scf_pointer = $file_name; # file name became scf pointer
47  }
48  else {
49    if ( defined fileno($file_name)){
50      $scf_pointer = get_scf_fpointer($file_name); # file_name here is file handle
51    }else{
52      $scf_pointer = get_scf_pointer($file_name); # actually reads scf file into memory
53    }
54  }
55  my $scf_file = {
56		  file_name     => $file_name,
57		  scf_pointer   => $scf_pointer,
58		  sample_hash   => $sample_hash,
59		  cache         => {}
60		 };
61  return bless $scf_file, $class;
62}
63
64
65sub TIEHASH {
66  shift->new(@_);
67}
68
69sub FETCH {
70  my $self = shift;
71  my $key = shift;
72  my @array;
73
74  if ($self->{sample_hash}) {
75    my $k = "sample_$key";
76    return $self->{cache}{$k} if exists $self->{cache}{$k};
77    tie @array, 'Bio::SCF::Arrays', $self->{scf_pointer}, $k;
78    return $self->{cache}{$k} = \@array;
79  }
80
81  else {
82
83    if (defined( my $header_field = HEADER_FIELDS->{$key})) {
84      return get_from_header($self->{scf_pointer}, $header_field);
85    }
86
87    elsif ($key eq "comments") {
88      return get_comments($self->{scf_pointer});
89    }
90
91    elsif ($key eq 'samples') {
92      return $self->{cache}{$key} if exists $self->{cache}{$key};
93      my %sample;
94      tie %sample, 'Bio::SCF', $self->{scf_pointer}, 1;
95      $self->{cache}{key} = \%sample;
96      return \%sample;
97    }
98
99    elsif (exists KEYS->{$key}) {
100      return $self->{cache}{$key} if exists $self->{cache}{$key};
101      tie @array, 'Bio::SCF::Arrays', $self->{scf_pointer}, $key;
102      $self->{cache}{$key} = \@array;
103      return \@array;
104    }
105
106  }
107}
108
109sub bases_length {
110  my $self = shift;
111  get_from_header($self->{scf_pointer},HEADER_FIELDS->{bases_length});
112}
113
114sub samples_length {
115  my $self = shift;
116  get_from_header($self->{scf_pointer},HEADER_FIELDS->{samples_length});
117}
118
119sub sample_size {
120  my $self = shift;
121  get_from_header($self->{scf_pointer},HEADER_FIELDS->{sample_size});
122}
123
124sub code_set {
125  my $self = shift;
126  get_from_header($self->{scf_pointer},HEADER_FIELDS->{code_set});
127}
128
129sub index {
130  my $self = shift;
131  my $index = shift;
132  my $d = $self->at('index',$index);
133  $self->set('index',$index,shift) if @_;
134  $d;
135}
136
137sub sample {
138  my $self  = shift;
139  my $base  = uc shift;
140  my $index = shift;
141  my $d = $self->at("samples${base}",$index);
142  $self->set("samples${base}",$index,shift) if @_;
143  $d;
144}
145
146sub base {
147  my $self = shift;
148  my $index = shift;
149  my $d = $self->at('bases',$index);
150  $self->set('bases',$index,shift) if @_;
151  $d;
152}
153
154sub base_score {
155  my $self  = shift;
156  my $base  = uc shift;
157  my $index = shift;
158  my $d = $self->at($base,$index);
159  $self->set($base,$index,shift) if @_;
160  $d;
161}
162
163sub score {
164  my $self  = shift;
165  my $index = shift;
166  my $base = uc $self->base($index);
167  my $d = exists KEYS->{$base} ? $self->base_score($base,$index) : 0;
168  $self->set($base,$index,shift) if @_;
169  $d;
170}
171
172sub comments {
173  my $self = shift;
174  get_comments($self->{scf_pointer});
175}
176
177sub at {
178  my $self = shift;
179  # possible keys { bases, A, C, G, T, spare1/2/3, sampleA/C/G/T }
180  my $key   = shift;
181  my $index = shift;
182  return get_at($self->{scf_pointer}, $index, KEYS->{$key});
183}
184
185sub set {
186  my $self = shift;
187  # possible keys { bases, A, C, G, T, spare1/2/3, sampleA/C/G/T }
188  my $key = shift;
189  my $index = shift;
190  my $value = shift or die "Bio::SCF::set(...) value not defined\n";
191  if ( $key eq "bases" ){
192    set_base_at($self->{scf_pointer}, $index, KEYS->{$key}, $value);
193  }else{
194    set_at($self->{scf_pointer}, $index, KEYS->{$key}, $value);
195  }
196}
197
198sub write {
199  my $self = shift;
200  my $file_name = shift || $self->{file_name};
201  return scf_write($self->{scf_pointer}, $file_name);
202}
203
204sub fwrite {
205  my $self = shift;
206  my $file_handle = shift ||
207    die "Bio::SCF::fwrite(...) :  file handle is not defined\n";
208  return scf_fwrite($self->{scf_pointer}, $file_handle);
209}
210
211sub STORE {
212  my $self = shift;
213  my $key = shift;
214  my $value = shift;
215 SWITCH: {
216    $key eq "comments" && do {
217      set_comments($self->{scf_pointer}, $value);
218      last SWITCH;
219    };
220    die "Bio::SCF::STORE field $key doesn't exist or not allowed to be modified\n";
221  }
222}
223
224sub FIRSTKEY {
225  my $self = shift;
226  my $a = keys %{KEYS()};
227  each %{KEYS()}
228}
229
230sub NEXTKEY {
231  my $self = shift;
232  each %{KEYS()};
233}
234
235sub CLEAR {
236  croak "The Bio::SCF module does not support this operation";
237}
238
239sub DELETE {
240  croak "The Bio::SCF module does not support this operation";
241}
242
243sub DESTROY {
244  my $self = shift;
245  Bio::SCF::scf_free($self->{scf_pointer}) unless $self->{sample_hash};
246}
247
248# Autoload methods go after =cut, and are processed by the autosplit program.
249
2501;
251__END__
252
253=head1 NAME
254
255Bio::SCF - Perl extension for reading and writting SCF sequence files
256
257=head1 SYNOPSIS
258
259use Bio::SCF;
260
261# tied interface
262tie %hash,'Bio::SCF','my_scf_file.scf';
263
264my $sequence_length            = $hash{bases_length};
265my $chromatogram_sample_length = $hash{samples_length};
266my $third_base                 = $hash{bases}[2];
267my $quality_score              = $hash{$third_base}[2];
268my $sample_A_at_time_1400      = $hash{samples}{A}[1400];
269
270# change the third base and write out new file
271$hash{bases}[2] = 'C';
272tied (%hash)->write('new.scf');
273
274# object-oriented interface
275my $scf                        = Bio::SCF->new('my_scf_file.scf');
276my $sequence_length            = $scf->bases_length;
277my $chromatogram_sample_length = $scf->samples_length;
278my $third_base                 = $scf->bases(2);
279my $quality_score              = $scf->score(2);
280my $sample_A_at_time_1400      = $scf->sample('A',1400);
281
282# change the third base and write out new file
283$scf->bases(2,'C');
284$scf->write('new.scf');
285
286=head1 DESCRIPTION
287
288This module provides a perl interface to SCF DNA sequencing files. It
289has both tied hash and an object-oriented interfaces. It provides the
290ability to read fields from SCF files and limited ability to modify
291them and write them back.
292
293=head2 Tied Methods
294
295=over 4
296
297=item $obj = tie %hash,'Bio::SCF',$filename_or_handle
298
299Tie the Bio::SCF module to a filename or filehandle. If successful, tie()
300will return the object.
301
302=item $value = $hash{'key'}
303
304Fetch a field from the SCF file. Valid keys are as follows:
305
306  Key             Value
307  ---             -----
308
309  bases_length    Number of called bases in the sequence (read-only)
310
311  samples_length  Number of samples in the file (read-only)
312
313  version         SCF version (read-only)
314
315  code_set        Code set used to code bases (read-only)
316
317  comments        Structured comments (read-only)
318
319  bases           Array reference to a list of the base calls
320
321  index           Array reference to a list of the sample position
322                    for each of the base calls (e.g. the position of
323                    the base calling peak)
324
325  A               An array reference that can be used to determine the
326                    probability that the base in position $i is an "A".
327
328  G               An array reference that can be used to determine the
329                    probability that the base in position $i is a "G".
330
331  C               An array reference that can be used to determine the
332                    probability that the base in position $i is a "C".
333
334  T               An array reference that can be used to determine the
335                    probability that the base in position $i is a "T".
336
337  samples         A hash reference with keys "A", "C", "G" and "T". The
338                    value of each hash is an array reference to the list
339                    of intensity values for each sample.
340
341To get the length of the called sequence:              $scf{bases_length}
342
343To get the value of the called sequence at position 3: $scf{bases}[3]
344
345To get the sample position at which base 3 was called: $scf{index}[3]
346
347To get the value of the "C" curve under base 3:        $scf{samples}{C}[$scf{index}[3]]
348
349To get the probability that base 3 is a "C":           $scf{C}[3]
350
351To print out the chromatogram as a four-column list:
352
353    my $samples = $scf{samples};
354    for (my $i = 0; $i<$scf{samples_length}; $i++) {
355       print join "\t",$samples->{C}[$i],$samples->{G}[$i],
356                       $samples->{A}[$i],$samples->{T}[$i],"\n";
357    }
358
359=item $scf{bases}[$index] = $new_value
360
361The base call probability scores, base call values, base call
362positions, and sample values are all read/write, so that you can
363change them:
364
365   $samples->{C}[500] = 0;
366
367=item each %scf
368
369Will return keys and values for the tied object.
370
371=item delete $scf{$key}
372
373=item %scf = ()
374
375These operations are not supported and will return a run-time error
376
377=back
378
379=head2 Object Methods
380
381=over 4
382
383=item $scf = Bio::SCF->new($scf_file_or_filehandle)
384
385Create a new Bio::SCF object. The single argument is the name of a file or
386a previously-opened filehandle. If successful, new() returns the Bio::SCF
387object.
388
389=item $length = $scf->bases_length
390
391Return the length of the called sequence.
392
393=item $samples = $scf->samples_length
394
395Return the length of the list of chromatogram samples in the
396file. There are four sample series, one for each base.
397
398=item $sample_size = $scf->sample_size
399
400Returns the size of each sample (bytes).
401
402=item $code_set = $scf->code_set
403
404Return the code set used for base calling.
405
406=item $base = $scf->base($base_no [,$new_base])
407
408Get the base call at the indicated position. If a new value is
409provided, will change the base call to the indicated base.
410
411=item $index = $scf->index($base_no [,$new_index])
412
413Translates the indicated base position into the sample index for that
414called base. Here is how to fetch the intensity values at base number 5:
415
416  my $sample_index = $scf->index(5);
417  my ($g,$a,$t,$c) = map { $scf->sample($_,$sample_index) } qw(G A T C);
418
419If you provide a new value for the sample index, it will be updated.
420
421=item $base_score = $scf->base_score($base,$base_no [,$new_score])
422
423Get the probability that the indicated base occurs at position
424$base_no. Here is how to fetch the probabilities for the four bases at
425base position 5:
426
427  my ($g,$a,$t,$c) = map { $scf->base_score($_,5) } qw(G A T C);
428
429If you provide a new value for the base probability score, it will be
430updated.
431
432=item $score = $scf->score($base_no)
433
434Get the quality score for the called base at the indicated position.
435
436=item $intensity = $scf->sample($base,$sample_index [,$new_value])
437
438Get the intensity value for the channel corresponding to the indicated
439base at the indicated sample index. You may update the intensity by
440providing a new value.
441
442=item $scf->write('file_path')
443
444Write the updated SCF file to the indicated file path.
445
446=item $scf->fwrite($file_handle)
447
448Write the updated SCF file to the indicated filehandle. The file must
449previously have been opened for writing. The filehandle is actually
450reopened in append mode, so you can call fwrite() multiple times and
451interperse your own record separators.
452
453=back
454
455=head1 EXAMPLES
456
457Reading information from a preexisting file:
458
459   tie %scf, 'Bio::SCF', "data.scf";
460   print "Base calls:\n";
461   for ( my $i=0; $i<$scf{bases}; $i++ ){
462      print "$scf{base}[$i] ";
463   }
464   print "\n";
465
466   print "Intensity values for the A curve\n";
467   for ( my $i=0; $i<$scf{samples}; $i++ ){
468      print "$scf{sample}{A}[$i];
469   }
470   print "\n";
471
472Another example, where we set all bases to "A", indexes to 10 and write
473the file back:
474
475   my $obj = tie %scf,'Bio::SCF','data.scf';
476   for (0...@{$scf{bases}}-1){
477      $scf{base}[$_] = "A";
478      $obj->set('index', $_, 10);
479   }
480   $obj->write('data.scf');
481
482=head1 AUTHOR
483
484Dmitri Priimak, priimak@cshl.org (1999)
485
486with some cleanups by
487Lincoln Stein, lstein@cshl.edu (2006)
488
489This package and its accompanying libraries is free software; you can
490redistribute it and/or modify it under the terms of the GPL (either
491version 1, or at your option, any later version) or the Artistic
492License 2.0.  Refer to LICENSE for the full license text. In addition,
493please see DISCLAIMER for disclaimers of warranty.
494
495=head1 SEE ALSO
496
497perl(1).
498
499=cut
500