1package Bio::SCF; 2 3use strict; 4use vars qw($VERSION @ISA); 5 6require DynaLoader; 7use Bio::SCF::Arrays; 8use Carp 'croak'; 9 10@ISA = qw(DynaLoader); 11$VERSION = '1.03'; 12use constant KEYS => { 13 index => 0, 14 A => 1, 15 C => 2, 16 G => 3, 17 T => 4, 18 bases => 5, 19 spare1 => 6, 20 spare1 => 7, 21 spare1 => 8, 22 samplesA => 11, 23 samplesC => 12, 24 samplesG => 13, 25 samplesT => 14 26 }; 27 28use constant HEADER_FIELDS => { 29 samples_length => 0, 30 bases_length => 1, 31 version => 2, 32 sample_size => 3, 33 code_set => 4, 34}; 35 36bootstrap Bio::SCF $VERSION; 37 38sub new { 39 my $class = shift; 40 my $file_name = shift; 41 my $sample_hash = shift || 0; 42 43 defined $file_name or die "SCF :: Unable to tie hash to undefined file name\n"; 44 my $scf_pointer; 45 if ($sample_hash) { 46 $scf_pointer = $file_name; # file name became scf pointer 47 } 48 else { 49 if ( defined fileno($file_name)){ 50 $scf_pointer = get_scf_fpointer($file_name); # file_name here is file handle 51 }else{ 52 $scf_pointer = get_scf_pointer($file_name); # actually reads scf file into memory 53 } 54 } 55 my $scf_file = { 56 file_name => $file_name, 57 scf_pointer => $scf_pointer, 58 sample_hash => $sample_hash, 59 cache => {} 60 }; 61 return bless $scf_file, $class; 62} 63 64 65sub TIEHASH { 66 shift->new(@_); 67} 68 69sub FETCH { 70 my $self = shift; 71 my $key = shift; 72 my @array; 73 74 if ($self->{sample_hash}) { 75 my $k = "sample_$key"; 76 return $self->{cache}{$k} if exists $self->{cache}{$k}; 77 tie @array, 'Bio::SCF::Arrays', $self->{scf_pointer}, $k; 78 return $self->{cache}{$k} = \@array; 79 } 80 81 else { 82 83 if (defined( my $header_field = HEADER_FIELDS->{$key})) { 84 return get_from_header($self->{scf_pointer}, $header_field); 85 } 86 87 elsif ($key eq "comments") { 88 return get_comments($self->{scf_pointer}); 89 } 90 91 elsif ($key eq 'samples') { 92 return $self->{cache}{$key} if exists $self->{cache}{$key}; 93 my %sample; 94 tie %sample, 'Bio::SCF', $self->{scf_pointer}, 1; 95 $self->{cache}{key} = \%sample; 96 return \%sample; 97 } 98 99 elsif (exists KEYS->{$key}) { 100 return $self->{cache}{$key} if exists $self->{cache}{$key}; 101 tie @array, 'Bio::SCF::Arrays', $self->{scf_pointer}, $key; 102 $self->{cache}{$key} = \@array; 103 return \@array; 104 } 105 106 } 107} 108 109sub bases_length { 110 my $self = shift; 111 get_from_header($self->{scf_pointer},HEADER_FIELDS->{bases_length}); 112} 113 114sub samples_length { 115 my $self = shift; 116 get_from_header($self->{scf_pointer},HEADER_FIELDS->{samples_length}); 117} 118 119sub sample_size { 120 my $self = shift; 121 get_from_header($self->{scf_pointer},HEADER_FIELDS->{sample_size}); 122} 123 124sub code_set { 125 my $self = shift; 126 get_from_header($self->{scf_pointer},HEADER_FIELDS->{code_set}); 127} 128 129sub index { 130 my $self = shift; 131 my $index = shift; 132 my $d = $self->at('index',$index); 133 $self->set('index',$index,shift) if @_; 134 $d; 135} 136 137sub sample { 138 my $self = shift; 139 my $base = uc shift; 140 my $index = shift; 141 my $d = $self->at("samples${base}",$index); 142 $self->set("samples${base}",$index,shift) if @_; 143 $d; 144} 145 146sub base { 147 my $self = shift; 148 my $index = shift; 149 my $d = $self->at('bases',$index); 150 $self->set('bases',$index,shift) if @_; 151 $d; 152} 153 154sub base_score { 155 my $self = shift; 156 my $base = uc shift; 157 my $index = shift; 158 my $d = $self->at($base,$index); 159 $self->set($base,$index,shift) if @_; 160 $d; 161} 162 163sub score { 164 my $self = shift; 165 my $index = shift; 166 my $base = uc $self->base($index); 167 my $d = exists KEYS->{$base} ? $self->base_score($base,$index) : 0; 168 $self->set($base,$index,shift) if @_; 169 $d; 170} 171 172sub comments { 173 my $self = shift; 174 get_comments($self->{scf_pointer}); 175} 176 177sub at { 178 my $self = shift; 179 # possible keys { bases, A, C, G, T, spare1/2/3, sampleA/C/G/T } 180 my $key = shift; 181 my $index = shift; 182 return get_at($self->{scf_pointer}, $index, KEYS->{$key}); 183} 184 185sub set { 186 my $self = shift; 187 # possible keys { bases, A, C, G, T, spare1/2/3, sampleA/C/G/T } 188 my $key = shift; 189 my $index = shift; 190 my $value = shift or die "Bio::SCF::set(...) value not defined\n"; 191 if ( $key eq "bases" ){ 192 set_base_at($self->{scf_pointer}, $index, KEYS->{$key}, $value); 193 }else{ 194 set_at($self->{scf_pointer}, $index, KEYS->{$key}, $value); 195 } 196} 197 198sub write { 199 my $self = shift; 200 my $file_name = shift || $self->{file_name}; 201 return scf_write($self->{scf_pointer}, $file_name); 202} 203 204sub fwrite { 205 my $self = shift; 206 my $file_handle = shift || 207 die "Bio::SCF::fwrite(...) : file handle is not defined\n"; 208 return scf_fwrite($self->{scf_pointer}, $file_handle); 209} 210 211sub STORE { 212 my $self = shift; 213 my $key = shift; 214 my $value = shift; 215 SWITCH: { 216 $key eq "comments" && do { 217 set_comments($self->{scf_pointer}, $value); 218 last SWITCH; 219 }; 220 die "Bio::SCF::STORE field $key doesn't exist or not allowed to be modified\n"; 221 } 222} 223 224sub FIRSTKEY { 225 my $self = shift; 226 my $a = keys %{KEYS()}; 227 each %{KEYS()} 228} 229 230sub NEXTKEY { 231 my $self = shift; 232 each %{KEYS()}; 233} 234 235sub CLEAR { 236 croak "The Bio::SCF module does not support this operation"; 237} 238 239sub DELETE { 240 croak "The Bio::SCF module does not support this operation"; 241} 242 243sub DESTROY { 244 my $self = shift; 245 Bio::SCF::scf_free($self->{scf_pointer}) unless $self->{sample_hash}; 246} 247 248# Autoload methods go after =cut, and are processed by the autosplit program. 249 2501; 251__END__ 252 253=head1 NAME 254 255Bio::SCF - Perl extension for reading and writting SCF sequence files 256 257=head1 SYNOPSIS 258 259use Bio::SCF; 260 261# tied interface 262tie %hash,'Bio::SCF','my_scf_file.scf'; 263 264my $sequence_length = $hash{bases_length}; 265my $chromatogram_sample_length = $hash{samples_length}; 266my $third_base = $hash{bases}[2]; 267my $quality_score = $hash{$third_base}[2]; 268my $sample_A_at_time_1400 = $hash{samples}{A}[1400]; 269 270# change the third base and write out new file 271$hash{bases}[2] = 'C'; 272tied (%hash)->write('new.scf'); 273 274# object-oriented interface 275my $scf = Bio::SCF->new('my_scf_file.scf'); 276my $sequence_length = $scf->bases_length; 277my $chromatogram_sample_length = $scf->samples_length; 278my $third_base = $scf->bases(2); 279my $quality_score = $scf->score(2); 280my $sample_A_at_time_1400 = $scf->sample('A',1400); 281 282# change the third base and write out new file 283$scf->bases(2,'C'); 284$scf->write('new.scf'); 285 286=head1 DESCRIPTION 287 288This module provides a perl interface to SCF DNA sequencing files. It 289has both tied hash and an object-oriented interfaces. It provides the 290ability to read fields from SCF files and limited ability to modify 291them and write them back. 292 293=head2 Tied Methods 294 295=over 4 296 297=item $obj = tie %hash,'Bio::SCF',$filename_or_handle 298 299Tie the Bio::SCF module to a filename or filehandle. If successful, tie() 300will return the object. 301 302=item $value = $hash{'key'} 303 304Fetch a field from the SCF file. Valid keys are as follows: 305 306 Key Value 307 --- ----- 308 309 bases_length Number of called bases in the sequence (read-only) 310 311 samples_length Number of samples in the file (read-only) 312 313 version SCF version (read-only) 314 315 code_set Code set used to code bases (read-only) 316 317 comments Structured comments (read-only) 318 319 bases Array reference to a list of the base calls 320 321 index Array reference to a list of the sample position 322 for each of the base calls (e.g. the position of 323 the base calling peak) 324 325 A An array reference that can be used to determine the 326 probability that the base in position $i is an "A". 327 328 G An array reference that can be used to determine the 329 probability that the base in position $i is a "G". 330 331 C An array reference that can be used to determine the 332 probability that the base in position $i is a "C". 333 334 T An array reference that can be used to determine the 335 probability that the base in position $i is a "T". 336 337 samples A hash reference with keys "A", "C", "G" and "T". The 338 value of each hash is an array reference to the list 339 of intensity values for each sample. 340 341To get the length of the called sequence: $scf{bases_length} 342 343To get the value of the called sequence at position 3: $scf{bases}[3] 344 345To get the sample position at which base 3 was called: $scf{index}[3] 346 347To get the value of the "C" curve under base 3: $scf{samples}{C}[$scf{index}[3]] 348 349To get the probability that base 3 is a "C": $scf{C}[3] 350 351To print out the chromatogram as a four-column list: 352 353 my $samples = $scf{samples}; 354 for (my $i = 0; $i<$scf{samples_length}; $i++) { 355 print join "\t",$samples->{C}[$i],$samples->{G}[$i], 356 $samples->{A}[$i],$samples->{T}[$i],"\n"; 357 } 358 359=item $scf{bases}[$index] = $new_value 360 361The base call probability scores, base call values, base call 362positions, and sample values are all read/write, so that you can 363change them: 364 365 $samples->{C}[500] = 0; 366 367=item each %scf 368 369Will return keys and values for the tied object. 370 371=item delete $scf{$key} 372 373=item %scf = () 374 375These operations are not supported and will return a run-time error 376 377=back 378 379=head2 Object Methods 380 381=over 4 382 383=item $scf = Bio::SCF->new($scf_file_or_filehandle) 384 385Create a new Bio::SCF object. The single argument is the name of a file or 386a previously-opened filehandle. If successful, new() returns the Bio::SCF 387object. 388 389=item $length = $scf->bases_length 390 391Return the length of the called sequence. 392 393=item $samples = $scf->samples_length 394 395Return the length of the list of chromatogram samples in the 396file. There are four sample series, one for each base. 397 398=item $sample_size = $scf->sample_size 399 400Returns the size of each sample (bytes). 401 402=item $code_set = $scf->code_set 403 404Return the code set used for base calling. 405 406=item $base = $scf->base($base_no [,$new_base]) 407 408Get the base call at the indicated position. If a new value is 409provided, will change the base call to the indicated base. 410 411=item $index = $scf->index($base_no [,$new_index]) 412 413Translates the indicated base position into the sample index for that 414called base. Here is how to fetch the intensity values at base number 5: 415 416 my $sample_index = $scf->index(5); 417 my ($g,$a,$t,$c) = map { $scf->sample($_,$sample_index) } qw(G A T C); 418 419If you provide a new value for the sample index, it will be updated. 420 421=item $base_score = $scf->base_score($base,$base_no [,$new_score]) 422 423Get the probability that the indicated base occurs at position 424$base_no. Here is how to fetch the probabilities for the four bases at 425base position 5: 426 427 my ($g,$a,$t,$c) = map { $scf->base_score($_,5) } qw(G A T C); 428 429If you provide a new value for the base probability score, it will be 430updated. 431 432=item $score = $scf->score($base_no) 433 434Get the quality score for the called base at the indicated position. 435 436=item $intensity = $scf->sample($base,$sample_index [,$new_value]) 437 438Get the intensity value for the channel corresponding to the indicated 439base at the indicated sample index. You may update the intensity by 440providing a new value. 441 442=item $scf->write('file_path') 443 444Write the updated SCF file to the indicated file path. 445 446=item $scf->fwrite($file_handle) 447 448Write the updated SCF file to the indicated filehandle. The file must 449previously have been opened for writing. The filehandle is actually 450reopened in append mode, so you can call fwrite() multiple times and 451interperse your own record separators. 452 453=back 454 455=head1 EXAMPLES 456 457Reading information from a preexisting file: 458 459 tie %scf, 'Bio::SCF', "data.scf"; 460 print "Base calls:\n"; 461 for ( my $i=0; $i<$scf{bases}; $i++ ){ 462 print "$scf{base}[$i] "; 463 } 464 print "\n"; 465 466 print "Intensity values for the A curve\n"; 467 for ( my $i=0; $i<$scf{samples}; $i++ ){ 468 print "$scf{sample}{A}[$i]; 469 } 470 print "\n"; 471 472Another example, where we set all bases to "A", indexes to 10 and write 473the file back: 474 475 my $obj = tie %scf,'Bio::SCF','data.scf'; 476 for (0...@{$scf{bases}}-1){ 477 $scf{base}[$_] = "A"; 478 $obj->set('index', $_, 10); 479 } 480 $obj->write('data.scf'); 481 482=head1 AUTHOR 483 484Dmitri Priimak, priimak@cshl.org (1999) 485 486with some cleanups by 487Lincoln Stein, lstein@cshl.edu (2006) 488 489This package and its accompanying libraries is free software; you can 490redistribute it and/or modify it under the terms of the GPL (either 491version 1, or at your option, any later version) or the Artistic 492License 2.0. Refer to LICENSE for the full license text. In addition, 493please see DISCLAIMER for disclaimers of warranty. 494 495=head1 SEE ALSO 496 497perl(1). 498 499=cut 500