1# 2# BioPerl module for Bio::DB::IndexedBase 3# 4# You may distribute this module under the same terms as perl itself 5# 6 7=head1 NAME 8 9Bio::DB::IndexedBase - Base class for modules using indexed sequence files 10 11=head1 SYNOPSIS 12 13 use Bio::DB::XXX; # a made-up class that uses Bio::IndexedBase 14 15 # 1/ Bio::SeqIO-style access 16 17 # Index some sequence files 18 my $db = Bio::DB::XXX->new('/path/to/file'); # from a single file 19 my $db = Bio::DB::XXX->new(['file1', 'file2']); # from multiple files 20 my $db = Bio::DB::XXX->new('/path/to/files/'); # from a directory 21 22 # Get IDs of all the sequences in the database 23 my @ids = $db->get_all_primary_ids; 24 25 # Get a specific sequence 26 my $seq = $db->get_Seq_by_id('CHROMOSOME_I'); 27 28 # Loop through all sequences 29 my $stream = $db->get_PrimarySeq_stream; 30 while (my $seq = $stream->next_seq) { 31 # Do something... 32 } 33 34 35 # 2/ Access via filehandle 36 my $fh = Bio::DB::XXX->newFh('/path/to/file'); 37 while (my $seq = <$fh>) { 38 # Do something... 39 } 40 41 42 # 3/ Tied-hash access 43 tie %sequences, 'Bio::DB::XXX', '/path/to/file'; 44 print $sequences{'CHROMOSOME_I:1,20000'}; 45 46=head1 DESCRIPTION 47 48Bio::DB::IndexedBase provides a base class for modules that want to index 49and read sequence files and provides persistent, random access to each sequence 50entry, without bringing the entire file into memory. This module is compliant 51with the Bio::SeqI interface and both. Bio::DB::Fasta and Bio::DB::Qual both use 52Bio::DB::IndexedBase. 53 54When you initialize the module, you point it at a single file, several files, or 55a directory of files. The first time it is run, the module generates an index 56of the content of the files using the AnyDBM_File module (BerkeleyDB preferred, 57followed by GDBM_File, NDBM_File, and SDBM_File). Subsequently, it uses the 58index file to find the sequence file and offset for any requested sequence. If 59one of the source files is updated, the module reindexes just that one file. You 60can also force reindexing manually at any time. For improved performance, the 61module keeps a cache of open filehandles, closing less-recently used ones when 62the cache is full. 63 64Entries may have any line length up to 65,536 characters, and different line 65lengths are allowed in the same file. However, within a sequence entry, all 66lines must be the same length except for the last. An error will be thrown if 67this is not the case! 68 69This module was developed for use with the C. elegans and human genomes, and has 70been tested with sequence segments as large as 20 megabases. Indexing the C. 71elegans genome (100 megabases of genomic sequence plus 100,000 ESTs) takes ~5 72minutes on my 300 MHz pentium laptop. On the same system, average access time 73for any 200-mer within the C. elegans genome was E<lt>0.02s. 74 75=head1 DATABASE CREATION AND INDEXING 76 77The two constructors for this class are new() and newFh(). The former creates a 78Bio::DB::IndexedBase object which is accessed via method calls. The latter 79creates a tied filehandle which can be used Bio::SeqIO style to fetch sequence 80objects in a stream fashion. There is also a tied hash interface. 81 82=over 83 84=item $db = Bio::DB::IndexedBase-E<gt>new($path [,%options]) 85 86Create a new Bio::DB::IndexedBase object from the files designated by $path 87$path may be a single file, an arrayref of files, or a directory containing 88such files. 89 90After the database is created, you can use methods like get_all_primary_ids() 91and get_Seq_by_id() to retrieve sequence objects. 92 93=item $fh = Bio::DB::IndexedBase-E<gt>newFh($path [,%options]) 94 95Create a tied filehandle opened on a Bio::DB::IndexedBase object. Reading 96from this filehandle with E<lt>E<gt> will return a stream of sequence objects, 97Bio::SeqIO style. The path and the options should be specified as for new(). 98 99=item $obj = tie %db,'Bio::DB::IndexedBase', '/path/to/file' [,@args] 100 101Create a tied-hash by tieing %db to Bio::DB::IndexedBase using the indicated 102path to the files. The optional @args list is the same set used by new(). If 103successful, tie() returns the tied object, undef otherwise. 104 105Once tied, you can use the hash to retrieve an individual sequence by 106its ID, like this: 107 108 my $seq = $db{CHROMOSOME_I}; 109 110The keys() and values() functions will return the sequence IDs and their 111sequences, respectively. In addition, each() can be used to iterate over the 112entire data set: 113 114 while (my ($id,$sequence) = each %db) { 115 print "$id => $sequence\n"; 116 } 117 118 119When dealing with very large sequences, you can avoid bringing them into memory 120by calling each() in a scalar context. This returns the key only. You can then 121use tied(%db) to recover the Bio::DB::IndexedBase object and call its methods. 122 123 while (my $id = each %db) { 124 print "$id: $db{$sequence:1,100}\n"; 125 print "$id: ".tied(%db)->length($id)."\n"; 126 } 127 128In addition, you may invoke the FIRSTKEY and NEXTKEY tied hash methods directly 129to retrieve the first and next ID in the database, respectively. This allows one to 130write the following iterative loop using just the object-oriented interface: 131 132 my $db = Bio::DB::IndexedBase->new('/path/to/file'); 133 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) { 134 # do something with sequence 135 } 136 137=back 138 139=head1 INDEX CONTENT 140 141Several attributes of each sequence are stored in the index file. Given a 142sequence ID, these attributes can be retrieved using the following methods: 143 144=over 145 146=item offset($id) 147 148Get the offset of the indicated sequence from the beginning of the file in which 149it is located. The offset points to the beginning of the sequence, not the 150beginning of the header line. 151 152=item strlen($id) 153 154Get the number of characters in the sequence string. 155 156=item length($id) 157 158Get the number of residues of the sequence. 159 160=item linelen($id) 161 162Get the length of the line for this sequence. If the sequence is wrapped, then 163linelen() is likely to be much shorter than strlen(). 164 165=item headerlen($id) 166 167Get the length of the header line for the indicated sequence. 168 169=item header_offset 170 171Get the offset of the header line for the indicated sequence from the beginning 172of the file in which it is located. This attribute is not stored. It is 173calculated from offset() and headerlen(). 174 175=item alphabet($id) 176 177Get the molecular type (alphabet) of the indicated sequence. This method handles 178residues according to the IUPAC convention. 179 180=item file($id) 181 182Get the the name of the file in which the indicated sequence can be found. 183 184=back 185 186=head1 INTERFACE COMPLIANCE NOTES 187 188Bio::DB::IndexedBase is compliant with the Bio::DB::SeqI and hence with the 189Bio::RandomAccessI interfaces. 190 191Database do not necessarily provide any meaningful internal primary ID for the 192sequences they store. However, Bio::DB::IndexedBase's internal primary IDs are 193the IDs of the sequences. This means that the same ID passed to get_Seq_by_id() 194and get_Seq_by_primary_id() will return the same sequence. 195 196Since this database index has no notion of sequence version or namespace, the 197get_Seq_by_id(), get_Seq_by_acc() and get_Seq_by_version() are identical. 198 199=head1 BUGS 200 201When a sequence is deleted from one of the files, this deletion is not detected 202by the module and removed from the index. As a result, a "ghost" entry will 203remain in the index and will return garbage results if accessed. 204 205Also, if you are indexing a directory, it is wise to not add or remove files 206from it. 207 208In case you have changed the files in a directory, or the sequences in a file, 209you can to rebuild the entire index, either by deleting it manually, or by 210passing -reindex=E<gt>1 to new() when initializing the module. 211 212=head1 SEE ALSO 213 214L<DB_File> 215 216L<Bio::DB::Fasta> 217 218L<Bio::DB::Qual> 219 220=head1 AUTHOR 221 222Lincoln Stein E<lt>lstein@cshl.orgE<gt>. 223 224Copyright (c) 2001 Cold Spring Harbor Laboratory. 225 226Florent Angly (for the modularization) 227 228This library is free software; you can redistribute it and/or modify 229it under the same terms as Perl itself. See DISCLAIMER.txt for 230disclaimers of warranty. 231 232=head1 APPENDIX 233 234The rest of the documentation details each of the object 235methods. Internal methods are usually preceded with a _ 236 237=cut 238 239 240package Bio::DB::IndexedBase; 241$Bio::DB::IndexedBase::VERSION = '1.7.7'; 242 243BEGIN { 244 @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File) 245 if(!$INC{'AnyDBM_File.pm'}); 246} 247 248use strict; 249use warnings; 250use IO::File; 251use AnyDBM_File; 252use Fcntl; 253use File::Spec; 254use File::Basename qw(basename dirname); 255use Bio::PrimarySeq; 256 257use base qw(Bio::DB::SeqI); 258 259# Store offset, strlen, linelen, headerlen, type and fileno 260use constant STRUCT => 'NNNnnCa*'; # 32-bit file offset and seq length 261use constant STRUCTBIG => 'QQQnnCa*'; # 64-bit 262 263use constant NA => 0; 264use constant DNA => 1; 265use constant RNA => 2; 266use constant PROTEIN => 3; 267 268# You can avoid dying if you want but you may get incorrect results 269use constant DIE_ON_MISSMATCHED_LINES => 1; 270 271=head2 new 272 273 Title : new 274 Usage : my $db = Bio::DB::IndexedBase->new($path, -reindex => 1); 275 Function: Initialize a new database object 276 Returns : A Bio::DB::IndexedBase object 277 Args : A single file, or path to dir, or arrayref of files 278 Optional arguments: 279 280 Option Description Default 281 ----------- ----------- ------- 282 -glob Glob expression to search for files in directories * 283 -makeid A code subroutine for transforming IDs None 284 -maxopen Maximum size of filehandle cache 32 285 -debug Turn on status messages 0 286 -reindex Force the index to be rebuilt 0 287 -dbmargs Additional arguments to pass to the DBM routine None 288 -index_name Name of the file that will hold the indices 289 -clean Remove the index file when finished 0 290 291The -dbmargs option can be used to control the format of the index. For example, 292you can pass $DB_BTREE to this argument so as to force the IDs to be sorted and 293retrieved alphabetically. Note that you must use the same arguments every time 294you open the index! 295 296The -makeid option gives you a chance to modify sequence IDs during indexing. 297For example, you may wish to extract a portion of the gi|gb|abc|xyz nonsense 298that GenBank Fasta files use. The original header line can be recovered later. 299The option value for -makeid should be a code reference that takes a scalar 300argument (the full header line) and returns a scalar or an array of scalars (the 301ID or IDs you want to assign). For example: 302 303 $db = Bio::DB::IndexedBase->new('file.fa', -makeid => \&extract_gi); 304 305 sub extract_gi { 306 # Extract GI from GenBank 307 my $header = shift; 308 my ($id) = ($header =~ /gi\|(\d+)/m); 309 return $id || ''; 310 } 311 312extract_gi() will be called with the full header line, e.g. a Fasta line would 313include the "E<gt>", the ID and the description: 314 315 >gi|352962132|ref|NG_030353.1| Homo sapiens sal-like 3 (Drosophila) (SALL3) 316 317In the database, this sequence can now be retrieved by its GI instead of its 318complete ID: 319 320 my $seq = $db->get_Seq_by_id(352962132); 321 322The -makeid option is ignored after the index is constructed. 323 324=cut 325 326sub new { 327 my ($class, $path, %opts) = @_; 328 329 my $self = bless { 330 debug => $opts{-debug} || 0, 331 makeid => $opts{-makeid}, 332 glob => $opts{-glob} || eval '$'.$class.'::file_glob' || '*', 333 maxopen => $opts{-maxopen} || 32, 334 clean => $opts{-clean} || 0, 335 dbmargs => $opts{-dbmargs} || undef, 336 fhcache => {}, 337 cacheseq => {}, 338 curopen => 0, 339 openseq => 1, 340 dirname => undef, 341 offsets => undef, 342 index_name => $opts{-index_name}, 343 obj_class => eval '$'.$class.'::obj_class', 344 offset_meth => \&{$class.'::_calculate_offsets'}, 345 fileno2path => [], 346 filepath2no => {}, 347 }, $class; 348 349 my ($offsets, $dirname); 350 my $ref = ref $path || ''; 351 if ( $ref eq 'ARRAY' ) { 352 $offsets = $self->index_files($path, $opts{-reindex}); 353 require Cwd; 354 $dirname = Cwd::getcwd(); 355 } else { 356 $self->{index_name} ||= $self->_default_index_name($path); 357 if (-d $path) { 358 # because Win32 glob() is broken with respect to long file names 359 # that contain whitespace. 360 $path = Win32::GetShortPathName($path) 361 if $^O =~ /^MSWin/i && eval 'use Win32; 1'; 362 $offsets = $self->index_dir($path, $opts{-reindex}); 363 $dirname = $path; 364 } elsif (-f _) { 365 $offsets = $self->index_file($path, $opts{-reindex}); 366 $dirname = dirname($path); 367 } else { 368 $self->throw( "No file or directory called '$path'"); 369 } 370 } 371 @{$self}{qw(dirname offsets)} = ($dirname, $offsets); 372 373 return $self; 374} 375 376 377=head2 newFh 378 379 Title : newFh 380 Usage : my $fh = Bio::DB::IndexedBase->newFh('/path/to/files/', %options); 381 Function: Index and get a new Fh for a single file, several files or a directory 382 Returns : Filehandle object 383 Args : Same as new() 384 385=cut 386 387sub newFh { 388 my ($class, @args) = @_; 389 my $self = $class->new(@args); 390 require Symbol; 391 my $fh = Symbol::gensym; 392 tie $$fh, 'Bio::DB::Indexed::Stream', $self 393 or $self->throw("Could not tie filehandle: $!"); 394 return $fh; 395} 396 397 398=head2 dbmargs 399 400 Title : dbmargs 401 Usage : my @args = $db->dbmargs; 402 Function: Get stored dbm arguments 403 Returns : Array 404 Args : None 405 406=cut 407 408sub dbmargs { 409 my $self = shift; 410 my $args = $self->{dbmargs} or return; 411 return ref($args) eq 'ARRAY' ? @$args : $args; 412} 413 414 415=head2 glob 416 417 Title : glob 418 Usage : my $glob = $db->glob; 419 Function: Get the expression used to match files in directories 420 Returns : String 421 Args : None 422 423=cut 424 425sub glob { 426 my $self = shift; 427 return $self->{glob}; 428} 429 430 431=head2 index_dir 432 433 Title : index_dir 434 Usage : $db->index_dir($dir); 435 Function: Index the files that match -glob in the given directory 436 Returns : Hashref of offsets 437 Args : Dirname 438 Boolean to force a reindexing the directory 439 440=cut 441 442sub index_dir { 443 my ($self, $dir, $force_reindex) = @_; 444 my @files = glob( File::Spec->catfile($dir, $self->{glob}) ); 445 return if scalar @files == 0; 446 $self->{index_name} ||= $self->_default_index_name($dir); 447 my $offsets = $self->_index_files(\@files, $force_reindex); 448 return $offsets; 449} 450 451 452=head2 get_all_primary_ids 453 454 Title : get_all_primary_ids, get_all_ids, ids 455 Usage : my @ids = $db->get_all_primary_ids; 456 Function: Get the IDs stored in all indexes. This is a Bio::DB::SeqI method 457 implementation. Note that in this implementation, the internal 458 database primary IDs are also the sequence IDs. 459 Returns : List of ids 460 Args : None 461 462=cut 463 464sub get_all_primary_ids { 465 return keys %{shift->{offsets}}; 466} 467 468{ 469no warnings 'once'; 470*ids = *get_all_ids = \&get_all_primary_ids; 471} 472 473 474=head2 index_file 475 476 Title : index_file 477 Usage : $db->index_file($filename); 478 Function: Index the given file 479 Returns : Hashref of offsets 480 Args : Filename 481 Boolean to force reindexing the file 482 483=cut 484 485sub index_file { 486 my ($self, $file, $force_reindex) = @_; 487 $self->{index_name} ||= $self->_default_index_name($file); 488 my $offsets = $self->_index_files([$file], $force_reindex); 489 return $offsets; 490} 491 492sub _default_index_name { 493 my ($self,$path) = @_; 494 return File::Spec->catfile($path,'directory.index') if -d $path; 495 return "$path.index"; 496} 497 498=head2 index_files 499 500 Title : index_files 501 Usage : $db->index_files(\@files); 502 Function: Index the given files 503 Returns : Hashref of offsets 504 Args : Arrayref of filenames 505 Boolean to force reindexing the files 506 507=cut 508 509sub index_files { 510 my ($self, $files, $force_reindex) = @_; 511 my @paths = map { File::Spec->rel2abs($_) } @$files; 512 require Digest::MD5; 513 my $digest = Digest::MD5::md5_hex( join('', sort @paths) ); 514 $self->{index_name} ||= "fileset_$digest.index"; # unique name for the given files 515 my $offsets = $self->_index_files($files, $force_reindex); 516 return $offsets; 517} 518 519 520=head2 index_name 521 522 Title : index_name 523 Usage : my $indexname = $db->index_name($path); 524 Function: Get the full name of the index file 525 Returns : String 526 Args : None 527 528=cut 529 530sub index_name { 531 return shift->{index_name}; 532} 533 534 535=head2 path 536 537 Title : path 538 Usage : my $path = $db->path($path); 539 Function: When a single file or a directory of files is indexed, this returns 540 the file directory. When indexing an arbitrary list of files, the 541 return value is the path of the current working directory. 542 Returns : String 543 Args : None 544 545=cut 546 547sub path { 548 return shift->{dirname}; 549} 550 551 552=head2 get_PrimarySeq_stream 553 554 Title : get_PrimarySeq_stream 555 Usage : my $stream = $db->get_PrimarySeq_stream(); 556 Function: Get a SeqIO-like stream of sequence objects. The stream supports a 557 single method, next_seq(). Each call to next_seq() returns a new 558 PrimarySeqI compliant sequence object, until no more sequences remain. 559 This is a Bio::DB::SeqI method implementation. 560 Returns : A Bio::DB::Indexed::Stream object 561 Args : None 562 563=cut 564 565sub get_PrimarySeq_stream { 566 my $self = shift; 567 return Bio::DB::Indexed::Stream->new($self); 568} 569 570 571=head2 get_Seq_by_id 572 573 Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id 574 Usage : my $seq = $db->get_Seq_by_id($id); 575 Function: Given an ID, fetch the corresponding sequence from the database. 576 This is a Bio::DB::SeqI and Bio::DB::RandomAccessI method implementation. 577 Returns : A sequence object 578 Args : ID 579 580=cut 581 582sub get_Seq_by_id { 583 my ($self, $id) = @_; 584 $self->throw('Need to provide a sequence ID') if not defined $id; 585 return if not exists $self->{offsets}{$id}; 586 return $self->{obj_class}->new($self, $id); 587} 588 589{ 590no warnings 'once'; 591*get_Seq_by_version = *get_Seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id; 592} 593 594 595=head2 _calculate_offsets 596 597 Title : _calculate_offsets 598 Usage : $db->_calculate_offsets($filename, $offsets); 599 Function: This method calculates the sequence offsets in a file based on ID and 600 should be implemented by classes that use Bio::DB::IndexedBase. 601 Returns : Hash of offsets 602 Args : File to process 603 Hashref of file offsets keyed by IDs. 604 605=cut 606 607sub _calculate_offsets { 608 my $self = shift; 609 $self->throw_not_implemented(); 610} 611 612 613sub _index_files { 614 # Do the indexing of the given files using the index file on record 615 my ($self, $files, $force_reindex) = @_; 616 617 $self->_set_pack_method( @$files ); 618 619 # Get name of index file 620 my $index = $self->index_name; 621 622 # If caller has requested reindexing, unlink the index file. 623 if ($force_reindex) { 624 # Tied-hash in Strawberry Perl creates "$file.index" 625 unlink $index if -e $index; 626 # Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir" 627 unlink "$index.dir" if -e "$index.dir"; 628 unlink "$index.pag" if -e "$index.pag"; 629 } 630 631 # Get the modification time of the index 632 my $indextime = (stat $index)[9] || 0; 633 634 # Register files and find if there has been any update 635 my $modtime = 0; 636 my @updated; 637 for my $file (@$files) { 638 # Register file 639 $self->_path2fileno(basename($file)); 640 # Any update? 641 my $m = (stat $file)[9] || 0; 642 if ($m > $modtime) { 643 $modtime = $m; 644 } 645 if ($m > $indextime) { 646 push @updated, $file; 647 } 648 } 649 650 # Get termination length from first file 651 $self->{termination_length} = $self->_calc_termination_length( $files->[0] ); 652 653 # Reindex contents of changed files if needed 654 my $reindex = $force_reindex || (scalar @updated > 0); 655 $self->{offsets} = $self->_open_index($index, $reindex) or return; 656 if ($reindex) { 657 $self->{indexing} = $index; 658 for my $file (@updated) { 659 my $fileno = $self->_path2fileno(basename($file)); 660 &{$self->{offset_meth}}($self, $fileno, $file, $self->{offsets}); 661 } 662 delete $self->{indexing}; 663 } 664 665 # Closing and reopening might help corrupted index file problem on Windows 666 $self->_close_index($self->{offsets}); 667 668 return $self->{offsets} = $self->_open_index($index); 669} 670 671 672sub _open_index { 673 # Open index file in read-only or write mode 674 my ($self, $index_file, $write) = @_; 675 my %offsets; 676 my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY; 677 my @dbmargs = $self->dbmargs; 678 tie %offsets, 'AnyDBM_File', $index_file, $flags, 0644, @dbmargs 679 or $self->throw( "Could not open index file $index_file: $!"); 680 return \%offsets; 681} 682 683 684sub _close_index { 685 # Close index file 686 my ($self, $index) = @_; 687 untie %$index; 688 return 1; 689} 690 691# Compiling the below regular expression speeds up _parse_compound_id 692my $compound_id = qr/^ (.+?) (?:\:([\d_]+)(?:,|-|\.\.)([\d_]+))? (?:\/(.+))? $/x; 693 694sub _parse_compound_id { 695 # Handle compound IDs: 696 # $db->seq($id) 697 # $db->seq($id, $start, $stop, $strand) 698 # $db->seq("$id:$start,$stop") 699 # $db->seq("$id:$start..$stop") 700 # $db->seq("$id:$start-$stop") 701 # $db->seq("$id:$start,$stop/$strand") 702 # $db->seq("$id:$start..$stop/$strand") 703 # $db->seq("$id:$start-$stop/$strand") 704 # $db->seq("$id/$strand") 705 my ($self, $id, $start, $stop, $strand) = @_; 706 707 if ( (not defined $start ) && 708 (not defined $stop ) && 709 (not defined $strand) && 710 ($id =~ m{$compound_id}) ) { 711 # Start, stop and strand not provided and ID looks like a compound ID 712 ($id, $start, $stop, $strand) = ($1, $2, $3, $4); 713 } 714 715 # Start, stop and strand defaults 716 $stop ||= $self->length($id) || 0; # 0 if sequence not found in database 717 $start ||= ($stop > 0) ? 1 : 0; 718 $strand ||= 1; 719 720 # Convert numbers such as 1_000_000 to 1000000 721 $start =~ s/_//g; 722 $stop =~ s/_//g; 723 724 if ($start > $stop) { 725 # Change the strand 726 ($start, $stop) = ($stop, $start); 727 $strand *= -1; 728 } 729 730 return $id, $start, $stop, $strand; 731} 732 733 734sub _guess_alphabet { 735 # Determine the molecular type of the given sequence string: 736 # 'dna', 'rna', 'protein' or '' (unknown/empty) 737 my ($self, $string) = @_; 738 # Handle IUPAC residues like PrimarySeq does 739 my $alphabet = Bio::PrimarySeq::_guess_alphabet_from_string($self, $string, 1); 740 return $alphabet eq 'dna' ? DNA 741 : $alphabet eq 'rna' ? RNA 742 : $alphabet eq 'protein' ? PROTEIN 743 : NA; 744} 745 746 747sub _makeid { 748 # Process the header line by applying any transformation given in -makeid 749 my ($self, $header_line) = @_; 750 return ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($header_line) : $1; 751} 752 753 754sub _check_linelength { 755 # Check that the line length is valid. Generate an error otherwise. 756 my ($self, $linelength) = @_; 757 return if not defined $linelength; 758 $self->throw( 759 "Each line of the file must be less than 65,536 characters. Line ". 760 "$. is $linelength chars." 761 ) if $linelength > 65535; 762} 763 764 765sub _calc_termination_length { 766 # Try the beginning of the file to determine termination length 767 # Account for crlf-terminated Windows and Mac files 768 my ($self, $file) = @_; 769 my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!"); 770 771 # In Windows, text files have '\r\n' as line separator, but when reading in 772 # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n", 773 # "length $_" will report 4 although the line is 5 bytes in length. 774 # We assume that all lines have the same line separator and only read current line. 775 my $init_pos = tell($fh); 776 my $curr_line = <$fh>; 777 my $pos_diff = tell($fh) - $init_pos; 778 my $correction = $pos_diff - length $curr_line; 779 close $fh; 780 781 $self->{termination_length} = ($curr_line =~ /\r\n$/) ? 2 : 1+$correction; 782 return $self->{termination_length}; 783} 784 785 786sub _calc_offset { 787 # Get the offset of the n-th residue of the sequence with the given ID 788 # and termination length (tl) 789 my ($self, $id, $n) = @_; 790 my $tl = $self->{termination_length}; 791 $n--; 792 my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3]; 793 $n = 0 if $n < 0; 794 $n = $seqlen-1 if $n >= $seqlen; 795 return $offset + $linelen * int($n/($linelen-$tl)) + $n % ($linelen-$tl); 796} 797 798 799sub _fh { 800 # Given a sequence ID, return the filehandle on which to find this sequence 801 my ($self, $id) = @_; 802 $self->throw('Need to provide a sequence ID') if not defined $id; 803 my $file = $self->file($id) or return; 804 return eval { 805 $self->_fhcache( File::Spec->catfile($self->{dirname}, $file)); 806 } || $self->throw( "Can't open file $file" ); 807} 808 809 810sub _fhcache { 811 my ($self, $path) = @_; 812 if (!$self->{fhcache}{$path}) { 813 if ($self->{curopen} >= $self->{maxopen}) { 814 my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};} 815 keys %{$self->{fhcache}}; 816 splice(@lru, $self->{maxopen} / 3); 817 $self->{curopen} -= @lru; 818 for (@lru) { 819 delete $self->{fhcache}{$_}; 820 } 821 } 822 $self->{fhcache}{$path} = IO::File->new($path) || return; 823 binmode $self->{fhcache}{$path}; 824 $self->{curopen}++; 825 } 826 $self->{cacheseq}{$path}++; 827 return $self->{fhcache}{$path}; 828} 829 830 831#------------------------------------------------------------- 832# Methods to store and retrieve data from indexed file 833# 834 835=head2 offset 836 837 Title : offset 838 Usage : my $offset = $db->offset($id); 839 Function: Get the offset of the indicated sequence from the beginning of the 840 file in which it is located. The offset points to the beginning of 841 the sequence, not the beginning of the header line. 842 Returns : String 843 Args : ID of sequence 844 845=cut 846 847sub offset { 848 my ($self, $id) = @_; 849 $self->throw('Need to provide a sequence ID') if not defined $id; 850 my $offset = $self->{offsets}{$id} or return; 851 return (&{$self->{unpackmeth}}($offset))[0]; 852} 853 854 855=head2 strlen 856 857 Title : strlen 858 Usage : my $length = $db->strlen($id); 859 Function: Get the number of characters in the sequence string. 860 Returns : Integer 861 Args : ID of sequence 862 863=cut 864 865sub strlen { 866 my ($self, $id) = @_; 867 $self->throw('Need to provide a sequence ID') if not defined $id; 868 my $offset = $self->{offsets}{$id} or return; 869 return (&{$self->{unpackmeth}}($offset))[1]; 870} 871 872 873=head2 length 874 875 Title : length 876 Usage : my $length = $db->length($id); 877 Function: Get the number of residues of the sequence. 878 Returns : Integer 879 Args : ID of sequence 880 881=cut 882 883sub length { 884 my ($self, $id) = @_; 885 $self->throw('Need to provide a sequence ID') if not defined $id; 886 my $offset = $self->{offsets}{$id} or return; 887 return (&{$self->{unpackmeth}}($offset))[2]; 888} 889 890 891=head2 linelen 892 893 Title : linelen 894 Usage : my $linelen = $db->linelen($id); 895 Function: Get the length of the line for this sequence. 896 Returns : Integer 897 Args : ID of sequence 898 899=cut 900 901sub linelen { 902 my ($self, $id) = @_; 903 $self->throw('Need to provide a sequence ID') if not defined $id; 904 my $offset = $self->{offsets}{$id} or return; 905 return (&{$self->{unpackmeth}}($offset))[3]; 906} 907 908 909=head2 headerlen 910 911 Title : headerlen 912 Usage : my $length = $db->headerlen($id); 913 Function: Get the length of the header line for the indicated sequence. 914 Returns : Integer 915 Args : ID of sequence 916 917=cut 918 919sub headerlen { 920 my ($self, $id) = @_; 921 $self->throw('Need to provide a sequence ID') if not defined $id; 922 my $offset = $self->{offsets}{$id} or return; 923 return (&{$self->{unpackmeth}}($offset))[4]; 924} 925 926 927=head2 header_offset 928 929 Title : header_offset 930 Usage : my $offset = $db->header_offset($id); 931 Function: Get the offset of the header line for the indicated sequence from 932 the beginning of the file in which it is located. 933 Returns : String 934 Args : ID of sequence 935 936=cut 937 938sub header_offset { 939 my ($self, $id) = @_; 940 $self->throw('Need to provide a sequence ID') if not defined $id; 941 return if not $self->{offsets}{$id}; 942 return $self->offset($id) - $self->headerlen($id); 943} 944 945 946=head2 alphabet 947 948 Title : alphabet 949 Usage : my $alphabet = $db->alphabet($id); 950 Function: Get the molecular type of the indicated sequence: dna, rna or protein 951 Returns : String 952 Args : ID of sequence 953 954=cut 955 956sub alphabet { 957 my ($self, $id) = @_; 958 $self->throw('Need to provide a sequence ID') if not defined $id; 959 my $offset = $self->{offsets}{$id} or return; 960 my $alphabet = (&{$self->{unpackmeth}}($offset))[5]; 961 return : $alphabet == Bio::DB::IndexedBase::DNA ? 'dna' 962 : $alphabet == Bio::DB::IndexedBase::RNA ? 'rna' 963 : $alphabet == Bio::DB::IndexedBase::PROTEIN ? 'protein' 964 : ''; 965} 966 967 968=head2 file 969 970 Title : file 971 Usage : my $file = $db->file($id); 972 Function: Get the the name of the file in which the indicated sequence can be 973 found. 974 Returns : String 975 Args : ID of sequence 976 977=cut 978 979sub file { 980 my ($self, $id) = @_; 981 $self->throw('Need to provide a sequence ID') if not defined $id; 982 my $offset = $self->{offsets}{$id} or return; 983 return $self->_fileno2path((&{$self->{unpackmeth}}($offset))[6]); 984} 985 986 987sub _fileno2path { 988 my ($self, $fileno) = @_; 989 return $self->{fileno2path}->[$fileno]; 990} 991 992 993sub _path2fileno { 994 my ($self, $path) = @_; 995 if ( not exists $self->{filepath2no}->{$path} ) { 996 my $fileno = ($self->{filepath2no}->{$path} = 0+ $self->{fileno}++); 997 $self->{fileno2path}->[$fileno] = $path; # Save path 998 } 999 return $self->{filepath2no}->{$path}; 1000 1001} 1002 1003 1004sub _packSmall { 1005 return pack STRUCT, @_; 1006} 1007 1008 1009sub _packBig { 1010 return pack STRUCTBIG, @_; 1011} 1012 1013 1014sub _unpackSmall { 1015 return unpack STRUCT, shift; 1016} 1017 1018 1019sub _unpackBig { 1020 return unpack STRUCTBIG, shift; 1021} 1022 1023 1024sub _set_pack_method { 1025 # Determine whether to use 32 or 64 bit integers for the given files. 1026 my $self = shift; 1027 # Find the maximum file size: 1028 my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_; 1029 my $fourGB = (2 ** 32) - 1; 1030 1031 if ($maxsize > $fourGB) { 1032 # At least one file exceeds 4Gb - we will need to use 64 bit ints 1033 $self->{packmeth} = \&_packBig; 1034 $self->{unpackmeth} = \&_unpackBig; 1035 } else { 1036 $self->{packmeth} = \&_packSmall; 1037 $self->{unpackmeth} = \&_unpackSmall; 1038 } 1039 return 1; 1040} 1041 1042 1043#------------------------------------------------------------- 1044# Tied hash logic 1045# 1046 1047sub TIEHASH { 1048 return shift->new(@_); 1049} 1050 1051 1052sub FETCH { 1053 return shift->subseq(@_); 1054} 1055 1056 1057sub STORE { 1058 shift->throw("Read-only database"); 1059} 1060 1061 1062sub DELETE { 1063 shift->throw("Read-only database"); 1064} 1065 1066 1067sub CLEAR { 1068 shift->throw("Read-only database"); 1069} 1070 1071 1072sub EXISTS { 1073 return defined shift->offset(@_); 1074} 1075 1076 1077sub FIRSTKEY { 1078 return tied(%{shift->{offsets}})->FIRSTKEY(@_); 1079} 1080 1081 1082sub NEXTKEY { 1083 return tied(%{shift->{offsets}})->NEXTKEY(@_); 1084} 1085 1086 1087sub DESTROY { 1088 my $self = shift; 1089 1090 # Close filehandles 1091 while (my ($file, $fh) = each %{ $self->{fhcache} }) { 1092 if (defined $fh) { 1093 $fh->close; 1094 } 1095 } 1096 $self->_close_index($self->{offsets}); 1097 1098 if ( $self->{clean} || $self->{indexing} ) { 1099 # Indexing aborted or cleaning requested. Delete the index file. 1100 my $index = $self->{index_name}; 1101 1102 # Tied-hash in Strawberry Perl creates "$file.index" 1103 unlink $index if -e $index; 1104 # Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir" 1105 unlink "$index.dir" if -e "$index.dir"; 1106 unlink "$index.pag" if -e "$index.pag"; 1107 } 1108 return 1; 1109} 1110 1111 1112#------------------------------------------------------------- 1113# stream-based access to the database 1114# 1115 1116package Bio::DB::Indexed::Stream; 1117$Bio::DB::Indexed::Stream::VERSION = '1.7.7'; 1118use base qw(Tie::Handle Bio::DB::SeqI); 1119 1120 1121sub new { 1122 my ($class, $db) = @_; 1123 my $key = $db->FIRSTKEY; 1124 return bless { 1125 db => $db, 1126 key => $key 1127 }, $class; 1128} 1129 1130sub next_seq { 1131 my $self = shift; 1132 my ($key, $db) = @{$self}{'key', 'db'}; 1133 return if not defined $key; 1134 my $value = $db->get_Seq_by_id($key); 1135 $self->{key} = $db->NEXTKEY($key); 1136 return $value; 1137} 1138 1139sub TIEHANDLE { 1140 my ($class, $db) = @_; 1141 return $class->new($db); 1142} 1143 1144sub READLINE { 1145 my $self = shift; 1146 return $self->next_seq || undef; 1147} 1148 1149 11501; 1151