1#
2# BioPerl module for Bio::DB::IndexedBase
3#
4# You may distribute this module under the same terms as perl itself
5#
6
7=head1 NAME
8
9Bio::DB::IndexedBase - Base class for modules using indexed sequence files
10
11=head1 SYNOPSIS
12
13  use Bio::DB::XXX; # a made-up class that uses Bio::IndexedBase
14
15  # 1/ Bio::SeqIO-style access
16
17  # Index some sequence files
18  my $db = Bio::DB::XXX->new('/path/to/file');    # from a single file
19  my $db = Bio::DB::XXX->new(['file1', 'file2']); # from multiple files
20  my $db = Bio::DB::XXX->new('/path/to/files/');  # from a directory
21
22  # Get IDs of all the sequences in the database
23  my @ids = $db->get_all_primary_ids;
24
25  # Get a specific sequence
26  my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
27
28  # Loop through all sequences
29  my $stream = $db->get_PrimarySeq_stream;
30  while (my $seq = $stream->next_seq) {
31    # Do something...
32  }
33
34
35  # 2/ Access via filehandle
36  my $fh = Bio::DB::XXX->newFh('/path/to/file');
37  while (my $seq = <$fh>) {
38    # Do something...
39  }
40
41
42  # 3/ Tied-hash access
43  tie %sequences, 'Bio::DB::XXX', '/path/to/file';
44  print $sequences{'CHROMOSOME_I:1,20000'};
45
46=head1 DESCRIPTION
47
48Bio::DB::IndexedBase provides a base class for modules that want to index
49and read sequence files and provides persistent, random access to each sequence
50entry, without bringing the entire file into memory. This module is compliant
51with the Bio::SeqI interface and both. Bio::DB::Fasta and Bio::DB::Qual both use
52Bio::DB::IndexedBase.
53
54When you initialize the module, you point it at a single file, several files, or
55a directory of files. The first time it is run, the module generates an index
56of the content of the files using the AnyDBM_File module (BerkeleyDB preferred,
57followed by GDBM_File, NDBM_File, and SDBM_File). Subsequently, it uses the
58index file to find the sequence file and offset for any requested sequence. If
59one of the source files is updated, the module reindexes just that one file. You
60can also force reindexing manually at any time. For improved performance, the
61module keeps a cache of open filehandles, closing less-recently used ones when
62the cache is full.
63
64Entries may have any line length up to 65,536 characters, and different line
65lengths are allowed in the same file.  However, within a sequence entry, all
66lines must be the same length except for the last. An error will be thrown if
67this is not the case!
68
69This module was developed for use with the C. elegans and human genomes, and has
70been tested with sequence segments as large as 20 megabases. Indexing the C.
71elegans genome (100 megabases of genomic sequence plus 100,000 ESTs) takes ~5
72minutes on my 300 MHz pentium laptop. On the same system, average access time
73for any 200-mer within the C. elegans genome was E<lt>0.02s.
74
75=head1 DATABASE CREATION AND INDEXING
76
77The two constructors for this class are new() and newFh(). The former creates a
78Bio::DB::IndexedBase object which is accessed via method calls. The latter
79creates a tied filehandle which can be used Bio::SeqIO style to fetch sequence
80objects in a stream fashion. There is also a tied hash interface.
81
82=over
83
84=item $db = Bio::DB::IndexedBase-E<gt>new($path [,%options])
85
86Create a new Bio::DB::IndexedBase object from the files designated by $path
87$path may be a single file, an arrayref of files, or a directory containing
88such files.
89
90After the database is created, you can use methods like get_all_primary_ids()
91and get_Seq_by_id() to retrieve sequence objects.
92
93=item $fh = Bio::DB::IndexedBase-E<gt>newFh($path [,%options])
94
95Create a tied filehandle opened on a Bio::DB::IndexedBase object. Reading
96from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
97Bio::SeqIO style. The path and the options should be specified as for new().
98
99=item $obj = tie %db,'Bio::DB::IndexedBase', '/path/to/file' [,@args]
100
101Create a tied-hash by tieing %db to Bio::DB::IndexedBase using the indicated
102path to the files. The optional @args list is the same set used by new(). If
103successful, tie() returns the tied object, undef otherwise.
104
105Once tied, you can use the hash to retrieve an individual sequence by
106its ID, like this:
107
108  my $seq = $db{CHROMOSOME_I};
109
110The keys() and values() functions will return the sequence IDs and their
111sequences, respectively.  In addition, each() can be used to iterate over the
112entire data set:
113
114 while (my ($id,$sequence) = each %db) {
115    print "$id => $sequence\n";
116 }
117
118
119When dealing with very large sequences, you can avoid bringing them into memory
120by calling each() in a scalar context.  This returns the key only.  You can then
121use tied(%db) to recover the Bio::DB::IndexedBase object and call its methods.
122
123 while (my $id = each %db) {
124    print "$id: $db{$sequence:1,100}\n";
125    print "$id: ".tied(%db)->length($id)."\n";
126 }
127
128In addition, you may invoke the FIRSTKEY and NEXTKEY tied hash methods directly
129to retrieve the first and next ID in the database, respectively. This allows one to
130write the following iterative loop using just the object-oriented interface:
131
132 my $db = Bio::DB::IndexedBase->new('/path/to/file');
133 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
134    # do something with sequence
135 }
136
137=back
138
139=head1 INDEX CONTENT
140
141Several attributes of each sequence are stored in the index file. Given a
142sequence ID, these attributes can be retrieved using the following methods:
143
144=over
145
146=item offset($id)
147
148Get the offset of the indicated sequence from the beginning of the file in which
149it is located. The offset points to the beginning of the sequence, not the
150beginning of the header line.
151
152=item strlen($id)
153
154Get the number of characters in the sequence string.
155
156=item length($id)
157
158Get the number of residues of the sequence.
159
160=item linelen($id)
161
162Get the length of the line for this sequence. If the sequence is wrapped, then
163linelen() is likely to be much shorter than strlen().
164
165=item headerlen($id)
166
167Get the length of the header line for the indicated sequence.
168
169=item header_offset
170
171Get the offset of the header line for the indicated sequence from the beginning
172of the file in which it is located. This attribute is not stored. It is
173calculated from offset() and headerlen().
174
175=item alphabet($id)
176
177Get the molecular type (alphabet) of the indicated sequence. This method handles
178residues according to the IUPAC convention.
179
180=item file($id)
181
182Get the the name of the file in which the indicated sequence can be found.
183
184=back
185
186=head1 INTERFACE COMPLIANCE NOTES
187
188Bio::DB::IndexedBase is compliant with the Bio::DB::SeqI and hence with the
189Bio::RandomAccessI interfaces.
190
191Database do not necessarily provide any meaningful internal primary ID for the
192sequences they store. However, Bio::DB::IndexedBase's internal primary IDs are
193the IDs of the sequences. This means that the same ID passed to get_Seq_by_id()
194and get_Seq_by_primary_id() will return the same sequence.
195
196Since this database index has no notion of sequence version or namespace, the
197get_Seq_by_id(), get_Seq_by_acc() and get_Seq_by_version() are identical.
198
199=head1 BUGS
200
201When a sequence is deleted from one of the files, this deletion is not detected
202by the module and removed from the index. As a result, a "ghost" entry will
203remain in the index and will return garbage results if accessed.
204
205Also, if you are indexing a directory, it is wise to not add or remove files
206from it.
207
208In case you have changed the files in a directory, or the sequences in a file,
209you can to rebuild the entire index, either by deleting it manually, or by
210passing -reindex=E<gt>1 to new() when initializing the module.
211
212=head1 SEE ALSO
213
214L<DB_File>
215
216L<Bio::DB::Fasta>
217
218L<Bio::DB::Qual>
219
220=head1 AUTHOR
221
222Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
223
224Copyright (c) 2001 Cold Spring Harbor Laboratory.
225
226Florent Angly (for the modularization)
227
228This library is free software; you can redistribute it and/or modify
229it under the same terms as Perl itself.  See DISCLAIMER.txt for
230disclaimers of warranty.
231
232=head1 APPENDIX
233
234The rest of the documentation details each of the object
235methods. Internal methods are usually preceded with a _
236
237=cut
238
239
240package Bio::DB::IndexedBase;
241$Bio::DB::IndexedBase::VERSION = '1.7.7';
242
243BEGIN {
244    @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
245        if(!$INC{'AnyDBM_File.pm'});
246}
247
248use strict;
249use warnings;
250use IO::File;
251use AnyDBM_File;
252use Fcntl;
253use File::Spec;
254use File::Basename qw(basename dirname);
255use Bio::PrimarySeq;
256
257use base qw(Bio::DB::SeqI);
258
259# Store offset, strlen, linelen, headerlen, type and fileno
260use constant STRUCT    => 'NNNnnCa*'; # 32-bit file offset and seq length
261use constant STRUCTBIG => 'QQQnnCa*'; # 64-bit
262
263use constant NA        => 0;
264use constant DNA       => 1;
265use constant RNA       => 2;
266use constant PROTEIN   => 3;
267
268# You can avoid dying if you want but you may get incorrect results
269use constant DIE_ON_MISSMATCHED_LINES => 1;
270
271=head2 new
272
273 Title   : new
274 Usage   : my $db = Bio::DB::IndexedBase->new($path, -reindex => 1);
275 Function: Initialize a new database object
276 Returns : A Bio::DB::IndexedBase object
277 Args    : A single file, or path to dir, or arrayref of files
278           Optional arguments:
279
280 Option        Description                                         Default
281 -----------   -----------                                         -------
282 -glob         Glob expression to search for files in directories  *
283 -makeid       A code subroutine for transforming IDs              None
284 -maxopen      Maximum size of filehandle cache                    32
285 -debug        Turn on status messages                             0
286 -reindex      Force the index to be rebuilt                       0
287 -dbmargs      Additional arguments to pass to the DBM routine     None
288 -index_name   Name of the file that will hold the indices
289 -clean        Remove the index file when finished                 0
290
291The -dbmargs option can be used to control the format of the index. For example,
292you can pass $DB_BTREE to this argument so as to force the IDs to be sorted and
293retrieved alphabetically. Note that you must use the same arguments every time
294you open the index!
295
296The -makeid option gives you a chance to modify sequence IDs during indexing.
297For example, you may wish to extract a portion of the gi|gb|abc|xyz nonsense
298that GenBank Fasta files use. The original header line can be recovered later.
299The option value for -makeid should be a code reference that takes a scalar
300argument (the full header line) and returns a scalar or an array of scalars (the
301ID or IDs you want to assign). For example:
302
303  $db = Bio::DB::IndexedBase->new('file.fa', -makeid => \&extract_gi);
304
305  sub extract_gi {
306      # Extract GI from GenBank
307      my $header = shift;
308      my ($id) = ($header =~ /gi\|(\d+)/m);
309      return $id || '';
310  }
311
312extract_gi() will be called with the full header line, e.g. a Fasta line would
313include the "E<gt>", the ID and the description:
314
315 >gi|352962132|ref|NG_030353.1| Homo sapiens sal-like 3 (Drosophila) (SALL3)
316
317In the database, this sequence can now be retrieved by its GI instead of its
318complete ID:
319
320 my $seq = $db->get_Seq_by_id(352962132);
321
322The -makeid option is ignored after the index is constructed.
323
324=cut
325
326sub new {
327    my ($class, $path, %opts) = @_;
328
329    my $self = bless {
330        debug       => $opts{-debug}   || 0,
331        makeid      => $opts{-makeid},
332        glob        => $opts{-glob}    || eval '$'.$class.'::file_glob' || '*',
333        maxopen     => $opts{-maxopen} || 32,
334        clean       => $opts{-clean}   || 0,
335        dbmargs     => $opts{-dbmargs} || undef,
336        fhcache     => {},
337        cacheseq    => {},
338        curopen     => 0,
339        openseq     => 1,
340        dirname     => undef,
341        offsets     => undef,
342        index_name  => $opts{-index_name},
343        obj_class   => eval '$'.$class.'::obj_class',
344        offset_meth => \&{$class.'::_calculate_offsets'},
345        fileno2path => [],
346        filepath2no => {},
347    }, $class;
348
349    my ($offsets, $dirname);
350    my $ref = ref $path || '';
351    if ( $ref eq 'ARRAY' ) {
352        $offsets = $self->index_files($path, $opts{-reindex});
353        require Cwd;
354        $dirname = Cwd::getcwd();
355    } else {
356  $self->{index_name} ||= $self->_default_index_name($path);
357        if (-d $path) {
358            # because Win32 glob() is broken with respect to long file names
359            # that contain whitespace.
360            $path = Win32::GetShortPathName($path)
361                if $^O =~ /^MSWin/i && eval 'use Win32; 1';
362            $offsets = $self->index_dir($path, $opts{-reindex});
363            $dirname = $path;
364        } elsif (-f _) {
365            $offsets = $self->index_file($path, $opts{-reindex});
366            $dirname = dirname($path);
367        } else {
368            $self->throw( "No file or directory called '$path'");
369        }
370    }
371    @{$self}{qw(dirname offsets)} = ($dirname, $offsets);
372
373    return $self;
374}
375
376
377=head2 newFh
378
379 Title   : newFh
380 Usage   : my $fh = Bio::DB::IndexedBase->newFh('/path/to/files/', %options);
381 Function: Index and get a new Fh for a single file, several files or a directory
382 Returns : Filehandle object
383 Args    : Same as new()
384
385=cut
386
387sub newFh {
388    my ($class, @args) = @_;
389    my $self = $class->new(@args);
390    require Symbol;
391    my $fh = Symbol::gensym;
392    tie $$fh, 'Bio::DB::Indexed::Stream', $self
393        or $self->throw("Could not tie filehandle: $!");
394    return $fh;
395}
396
397
398=head2 dbmargs
399
400 Title   : dbmargs
401 Usage   : my @args = $db->dbmargs;
402 Function: Get stored dbm arguments
403 Returns : Array
404 Args    : None
405
406=cut
407
408sub dbmargs {
409    my $self = shift;
410    my $args = $self->{dbmargs} or return;
411    return ref($args) eq 'ARRAY' ? @$args : $args;
412}
413
414
415=head2 glob
416
417 Title   : glob
418 Usage   : my $glob = $db->glob;
419 Function: Get the expression used to match files in directories
420 Returns : String
421 Args    : None
422
423=cut
424
425sub glob {
426    my $self = shift;
427    return $self->{glob};
428}
429
430
431=head2 index_dir
432
433 Title   : index_dir
434 Usage   : $db->index_dir($dir);
435 Function: Index the files that match -glob in the given directory
436 Returns : Hashref of offsets
437 Args    : Dirname
438           Boolean to force a reindexing the directory
439
440=cut
441
442sub index_dir {
443    my ($self, $dir, $force_reindex) = @_;
444    my @files = glob( File::Spec->catfile($dir, $self->{glob}) );
445    return if scalar @files == 0;
446    $self->{index_name} ||= $self->_default_index_name($dir);
447    my $offsets = $self->_index_files(\@files, $force_reindex);
448    return $offsets;
449}
450
451
452=head2 get_all_primary_ids
453
454 Title   : get_all_primary_ids, get_all_ids, ids
455 Usage   : my @ids = $db->get_all_primary_ids;
456 Function: Get the IDs stored in all indexes. This is a Bio::DB::SeqI method
457           implementation. Note that in this implementation, the internal
458           database primary IDs are also the sequence IDs.
459 Returns : List of ids
460 Args    : None
461
462=cut
463
464sub get_all_primary_ids  {
465    return keys %{shift->{offsets}};
466}
467
468{
469no warnings 'once';
470*ids = *get_all_ids = \&get_all_primary_ids;
471}
472
473
474=head2 index_file
475
476 Title   : index_file
477 Usage   : $db->index_file($filename);
478 Function: Index the given file
479 Returns : Hashref of offsets
480 Args    : Filename
481           Boolean to force reindexing the file
482
483=cut
484
485sub index_file {
486    my ($self, $file, $force_reindex) = @_;
487    $self->{index_name} ||= $self->_default_index_name($file);
488    my $offsets = $self->_index_files([$file], $force_reindex);
489    return $offsets;
490}
491
492sub _default_index_name {
493    my ($self,$path) = @_;
494    return File::Spec->catfile($path,'directory.index') if -d $path;
495    return "$path.index";
496}
497
498=head2 index_files
499
500 Title   : index_files
501 Usage   : $db->index_files(\@files);
502 Function: Index the given files
503 Returns : Hashref of offsets
504 Args    : Arrayref of filenames
505           Boolean to force reindexing the files
506
507=cut
508
509sub index_files {
510    my ($self, $files, $force_reindex) = @_;
511    my @paths = map { File::Spec->rel2abs($_) } @$files;
512    require Digest::MD5;
513    my $digest = Digest::MD5::md5_hex( join('', sort @paths) );
514    $self->{index_name} ||= "fileset_$digest.index"; # unique name for the given files
515    my $offsets = $self->_index_files($files, $force_reindex);
516    return $offsets;
517}
518
519
520=head2 index_name
521
522 Title   : index_name
523 Usage   : my $indexname = $db->index_name($path);
524 Function: Get the full name of the index file
525 Returns : String
526 Args    : None
527
528=cut
529
530sub index_name {
531    return shift->{index_name};
532}
533
534
535=head2 path
536
537 Title   : path
538 Usage   : my $path = $db->path($path);
539 Function: When a single file or a directory of files is indexed, this returns
540           the file directory. When indexing an arbitrary list of files, the
541           return value is the path of the current working directory.
542 Returns : String
543 Args    : None
544
545=cut
546
547sub path {
548    return shift->{dirname};
549}
550
551
552=head2 get_PrimarySeq_stream
553
554 Title   : get_PrimarySeq_stream
555 Usage   : my $stream = $db->get_PrimarySeq_stream();
556 Function: Get a SeqIO-like stream of sequence objects. The stream supports a
557           single method, next_seq(). Each call to next_seq() returns a new
558           PrimarySeqI compliant sequence object, until no more sequences remain.
559           This is a Bio::DB::SeqI method implementation.
560 Returns : A Bio::DB::Indexed::Stream object
561 Args    : None
562
563=cut
564
565sub get_PrimarySeq_stream {
566    my $self = shift;
567    return Bio::DB::Indexed::Stream->new($self);
568}
569
570
571=head2 get_Seq_by_id
572
573 Title   : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id
574 Usage   : my $seq = $db->get_Seq_by_id($id);
575 Function: Given an ID, fetch the corresponding sequence from the database.
576           This is a Bio::DB::SeqI and Bio::DB::RandomAccessI method implementation.
577 Returns : A sequence object
578 Args    : ID
579
580=cut
581
582sub get_Seq_by_id {
583    my ($self, $id) = @_;
584    $self->throw('Need to provide a sequence ID') if not defined $id;
585    return if not exists $self->{offsets}{$id};
586    return $self->{obj_class}->new($self, $id);
587}
588
589{
590no warnings 'once';
591*get_Seq_by_version = *get_Seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
592}
593
594
595=head2 _calculate_offsets
596
597 Title   : _calculate_offsets
598 Usage   : $db->_calculate_offsets($filename, $offsets);
599 Function: This method calculates the sequence offsets in a file based on ID and
600           should be implemented by classes that use Bio::DB::IndexedBase.
601 Returns : Hash of offsets
602 Args    : File to process
603           Hashref of file offsets keyed by IDs.
604
605=cut
606
607sub _calculate_offsets {
608    my $self = shift;
609    $self->throw_not_implemented();
610}
611
612
613sub _index_files {
614    # Do the indexing of the given files using the index file on record
615    my ($self, $files, $force_reindex) = @_;
616
617    $self->_set_pack_method( @$files );
618
619    # Get name of index file
620    my $index = $self->index_name;
621
622    # If caller has requested reindexing, unlink the index file.
623    if ($force_reindex) {
624        # Tied-hash in Strawberry Perl creates "$file.index"
625        unlink $index if -e $index;
626        # Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
627        unlink "$index.dir" if -e "$index.dir";
628        unlink "$index.pag" if -e "$index.pag";
629    }
630
631    # Get the modification time of the index
632    my $indextime = (stat $index)[9] || 0;
633
634    # Register files and find if there has been any update
635    my $modtime = 0;
636    my @updated;
637    for my $file (@$files) {
638        # Register file
639        $self->_path2fileno(basename($file));
640        # Any update?
641        my $m = (stat $file)[9] || 0;
642        if ($m > $modtime) {
643           $modtime = $m;
644        }
645        if ($m > $indextime) {
646           push @updated, $file;
647        }
648    }
649
650    # Get termination length from first file
651    $self->{termination_length} = $self->_calc_termination_length( $files->[0] );
652
653    # Reindex contents of changed files if needed
654    my $reindex      = $force_reindex || (scalar @updated > 0);
655    $self->{offsets} = $self->_open_index($index, $reindex) or return;
656    if ($reindex) {
657        $self->{indexing} = $index;
658        for my $file (@updated) {
659            my $fileno = $self->_path2fileno(basename($file));
660            &{$self->{offset_meth}}($self, $fileno, $file, $self->{offsets});
661        }
662        delete $self->{indexing};
663    }
664
665    # Closing and reopening might help corrupted index file problem on Windows
666    $self->_close_index($self->{offsets});
667
668    return $self->{offsets} = $self->_open_index($index);
669}
670
671
672sub _open_index {
673    # Open index file in read-only or write mode
674    my ($self, $index_file, $write) = @_;
675    my %offsets;
676    my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
677    my @dbmargs = $self->dbmargs;
678    tie %offsets, 'AnyDBM_File', $index_file, $flags, 0644, @dbmargs
679        or $self->throw( "Could not open index file $index_file: $!");
680    return \%offsets;
681}
682
683
684sub _close_index {
685    # Close index file
686    my ($self, $index) = @_;
687    untie %$index;
688    return 1;
689}
690
691# Compiling the below regular expression speeds up _parse_compound_id
692my $compound_id = qr/^ (.+?) (?:\:([\d_]+)(?:,|-|\.\.)([\d_]+))? (?:\/(.+))? $/x;
693
694sub _parse_compound_id {
695    # Handle compound IDs:
696    #     $db->seq($id)
697    #     $db->seq($id, $start, $stop, $strand)
698    #     $db->seq("$id:$start,$stop")
699    #     $db->seq("$id:$start..$stop")
700    #     $db->seq("$id:$start-$stop")
701    #     $db->seq("$id:$start,$stop/$strand")
702    #     $db->seq("$id:$start..$stop/$strand")
703    #     $db->seq("$id:$start-$stop/$strand")
704    #     $db->seq("$id/$strand")
705    my ($self, $id, $start, $stop, $strand) = @_;
706
707    if ( (not defined $start ) &&
708         (not defined $stop  ) &&
709         (not defined $strand) &&
710         ($id =~ m{$compound_id}) ) {
711        # Start, stop and strand not provided and ID looks like a compound ID
712        ($id, $start, $stop, $strand) = ($1, $2, $3, $4);
713    }
714
715    # Start, stop and strand defaults
716    $stop   ||= $self->length($id) || 0; # 0 if sequence not found in database
717    $start  ||= ($stop > 0) ? 1 : 0;
718    $strand ||= 1;
719
720    # Convert numbers such as 1_000_000 to 1000000
721    $start =~ s/_//g;
722    $stop  =~ s/_//g;
723
724    if ($start > $stop) {
725        # Change the strand
726        ($start, $stop) = ($stop, $start);
727        $strand *= -1;
728    }
729
730    return $id, $start, $stop, $strand;
731}
732
733
734sub _guess_alphabet {
735    # Determine the molecular type of the given sequence string:
736    #    'dna', 'rna', 'protein' or '' (unknown/empty)
737    my ($self, $string) = @_;
738    # Handle IUPAC residues like PrimarySeq does
739    my $alphabet = Bio::PrimarySeq::_guess_alphabet_from_string($self, $string, 1);
740    return $alphabet eq 'dna' ? DNA
741           : $alphabet eq 'rna' ? RNA
742           : $alphabet eq 'protein' ? PROTEIN
743           : NA;
744}
745
746
747sub _makeid {
748    # Process the header line by applying any transformation given in -makeid
749    my ($self, $header_line) = @_;
750    return ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($header_line) : $1;
751}
752
753
754sub _check_linelength {
755    # Check that the line length is valid. Generate an error otherwise.
756    my ($self, $linelength) = @_;
757    return if not defined $linelength;
758    $self->throw(
759        "Each line of the file must be less than 65,536 characters. Line ".
760        "$. is $linelength chars."
761    ) if $linelength > 65535;
762}
763
764
765sub _calc_termination_length {
766    # Try the beginning of the file to determine termination length
767    # Account for crlf-terminated Windows and Mac files
768    my ($self, $file) = @_;
769    my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
770
771    # In Windows, text files have '\r\n' as line separator, but when reading in
772    # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
773    # "length $_" will report 4 although the line is 5 bytes in length.
774    # We assume that all lines have the same line separator and only read current line.
775    my $init_pos   = tell($fh);
776    my $curr_line  = <$fh>;
777    my $pos_diff   = tell($fh) - $init_pos;
778    my $correction = $pos_diff - length $curr_line;
779    close $fh;
780
781    $self->{termination_length} = ($curr_line =~ /\r\n$/) ? 2 : 1+$correction;
782    return $self->{termination_length};
783}
784
785
786sub _calc_offset {
787    # Get the offset of the n-th residue of the sequence with the given ID
788    # and termination length (tl)
789    my ($self, $id, $n) = @_;
790    my $tl = $self->{termination_length};
791    $n--;
792    my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3];
793    $n = 0            if $n < 0;
794    $n = $seqlen-1 if $n >= $seqlen;
795    return $offset + $linelen * int($n/($linelen-$tl)) + $n % ($linelen-$tl);
796}
797
798
799sub _fh {
800    # Given a sequence ID, return the filehandle on which to find this sequence
801    my ($self, $id) = @_;
802    $self->throw('Need to provide a sequence ID') if not defined $id;
803    my $file = $self->file($id) or return;
804    return eval {
805      $self->_fhcache( File::Spec->catfile($self->{dirname}, $file));
806    } || $self->throw( "Can't open file $file" );
807}
808
809
810sub _fhcache {
811    my ($self, $path) = @_;
812    if (!$self->{fhcache}{$path}) {
813        if ($self->{curopen} >= $self->{maxopen}) {
814            my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};}
815                keys %{$self->{fhcache}};
816            splice(@lru, $self->{maxopen} / 3);
817            $self->{curopen} -= @lru;
818            for (@lru) {
819                delete $self->{fhcache}{$_};
820            }
821        }
822        $self->{fhcache}{$path} = IO::File->new($path) || return;
823        binmode $self->{fhcache}{$path};
824        $self->{curopen}++;
825    }
826    $self->{cacheseq}{$path}++;
827    return $self->{fhcache}{$path};
828}
829
830
831#-------------------------------------------------------------
832# Methods to store and retrieve data from indexed file
833#
834
835=head2 offset
836
837 Title   : offset
838 Usage   : my $offset = $db->offset($id);
839 Function: Get the offset of the indicated sequence from the beginning of the
840           file in which it is located. The offset points to the beginning of
841           the sequence, not the beginning of the header line.
842 Returns : String
843 Args    : ID of sequence
844
845=cut
846
847sub offset {
848    my ($self, $id) = @_;
849    $self->throw('Need to provide a sequence ID') if not defined $id;
850    my $offset = $self->{offsets}{$id} or return;
851    return (&{$self->{unpackmeth}}($offset))[0];
852}
853
854
855=head2 strlen
856
857 Title   : strlen
858 Usage   : my $length = $db->strlen($id);
859 Function: Get the number of characters in the sequence string.
860 Returns : Integer
861 Args    : ID of sequence
862
863=cut
864
865sub strlen {
866    my ($self, $id) = @_;
867    $self->throw('Need to provide a sequence ID') if not defined $id;
868    my $offset = $self->{offsets}{$id} or return;
869    return (&{$self->{unpackmeth}}($offset))[1];
870}
871
872
873=head2 length
874
875 Title   : length
876 Usage   : my $length = $db->length($id);
877 Function: Get the number of residues of the sequence.
878 Returns : Integer
879 Args    : ID of sequence
880
881=cut
882
883sub length {
884    my ($self, $id) = @_;
885    $self->throw('Need to provide a sequence ID') if not defined $id;
886    my $offset = $self->{offsets}{$id} or return;
887    return (&{$self->{unpackmeth}}($offset))[2];
888}
889
890
891=head2 linelen
892
893 Title   : linelen
894 Usage   : my $linelen = $db->linelen($id);
895 Function: Get the length of the line for this sequence.
896 Returns : Integer
897 Args    : ID of sequence
898
899=cut
900
901sub linelen {
902    my ($self, $id) = @_;
903    $self->throw('Need to provide a sequence ID') if not defined $id;
904    my $offset = $self->{offsets}{$id} or return;
905    return (&{$self->{unpackmeth}}($offset))[3];
906}
907
908
909=head2 headerlen
910
911 Title   : headerlen
912 Usage   : my $length = $db->headerlen($id);
913 Function: Get the length of the header line for the indicated sequence.
914 Returns : Integer
915 Args    : ID of sequence
916
917=cut
918
919sub headerlen {
920    my ($self, $id) = @_;
921    $self->throw('Need to provide a sequence ID') if not defined $id;
922    my $offset = $self->{offsets}{$id} or return;
923    return (&{$self->{unpackmeth}}($offset))[4];
924}
925
926
927=head2 header_offset
928
929 Title   : header_offset
930 Usage   : my $offset = $db->header_offset($id);
931 Function: Get the offset of the header line for the indicated sequence from
932           the beginning of the file in which it is located.
933 Returns : String
934 Args    : ID of sequence
935
936=cut
937
938sub header_offset {
939    my ($self, $id) = @_;
940    $self->throw('Need to provide a sequence ID') if not defined $id;
941    return if not $self->{offsets}{$id};
942    return $self->offset($id) - $self->headerlen($id);
943}
944
945
946=head2 alphabet
947
948 Title   : alphabet
949 Usage   : my $alphabet = $db->alphabet($id);
950 Function: Get the molecular type of the indicated sequence: dna, rna or protein
951 Returns : String
952 Args    : ID of sequence
953
954=cut
955
956sub alphabet {
957    my ($self, $id) = @_;
958    $self->throw('Need to provide a sequence ID') if not defined $id;
959    my $offset = $self->{offsets}{$id} or return;
960    my $alphabet = (&{$self->{unpackmeth}}($offset))[5];
961    return : $alphabet == Bio::DB::IndexedBase::DNA     ? 'dna'
962           : $alphabet == Bio::DB::IndexedBase::RNA     ? 'rna'
963           : $alphabet == Bio::DB::IndexedBase::PROTEIN ? 'protein'
964           : '';
965}
966
967
968=head2 file
969
970 Title   : file
971 Usage   : my $file = $db->file($id);
972 Function: Get the the name of the file in which the indicated sequence can be
973           found.
974 Returns : String
975 Args    : ID of sequence
976
977=cut
978
979sub file {
980    my ($self, $id) = @_;
981    $self->throw('Need to provide a sequence ID') if not defined $id;
982    my $offset = $self->{offsets}{$id} or return;
983    return $self->_fileno2path((&{$self->{unpackmeth}}($offset))[6]);
984}
985
986
987sub _fileno2path {
988    my ($self, $fileno) = @_;
989    return $self->{fileno2path}->[$fileno];
990}
991
992
993sub _path2fileno {
994    my ($self, $path) = @_;
995    if ( not exists $self->{filepath2no}->{$path} ) {
996        my $fileno = ($self->{filepath2no}->{$path} = 0+ $self->{fileno}++);
997        $self->{fileno2path}->[$fileno] = $path; # Save path
998    }
999    return $self->{filepath2no}->{$path};
1000
1001}
1002
1003
1004sub _packSmall {
1005    return pack STRUCT, @_;
1006}
1007
1008
1009sub _packBig {
1010    return pack STRUCTBIG, @_;
1011}
1012
1013
1014sub _unpackSmall {
1015    return unpack STRUCT, shift;
1016}
1017
1018
1019sub _unpackBig {
1020    return unpack STRUCTBIG, shift;
1021}
1022
1023
1024sub _set_pack_method {
1025    # Determine whether to use 32 or 64 bit integers for the given files.
1026    my $self = shift;
1027    # Find the maximum file size:
1028    my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
1029    my $fourGB    = (2 ** 32) - 1;
1030
1031    if ($maxsize > $fourGB) {
1032        # At least one file exceeds 4Gb - we will need to use 64 bit ints
1033        $self->{packmeth}   = \&_packBig;
1034        $self->{unpackmeth} = \&_unpackBig;
1035    } else {
1036        $self->{packmeth}   = \&_packSmall;
1037        $self->{unpackmeth} = \&_unpackSmall;
1038    }
1039    return 1;
1040}
1041
1042
1043#-------------------------------------------------------------
1044# Tied hash logic
1045#
1046
1047sub TIEHASH {
1048    return shift->new(@_);
1049}
1050
1051
1052sub FETCH {
1053    return shift->subseq(@_);
1054}
1055
1056
1057sub STORE {
1058    shift->throw("Read-only database");
1059}
1060
1061
1062sub DELETE {
1063    shift->throw("Read-only database");
1064}
1065
1066
1067sub CLEAR {
1068    shift->throw("Read-only database");
1069}
1070
1071
1072sub EXISTS {
1073    return defined shift->offset(@_);
1074}
1075
1076
1077sub FIRSTKEY {
1078    return tied(%{shift->{offsets}})->FIRSTKEY(@_);
1079}
1080
1081
1082sub NEXTKEY {
1083    return tied(%{shift->{offsets}})->NEXTKEY(@_);
1084}
1085
1086
1087sub DESTROY {
1088    my $self = shift;
1089
1090    # Close filehandles
1091    while (my ($file, $fh) = each %{ $self->{fhcache} }) {
1092        if (defined $fh) {
1093            $fh->close;
1094        }
1095    }
1096    $self->_close_index($self->{offsets});
1097
1098    if ( $self->{clean} || $self->{indexing} ) {
1099        # Indexing aborted or cleaning requested. Delete the index file.
1100        my $index = $self->{index_name};
1101
1102        # Tied-hash in Strawberry Perl creates "$file.index"
1103        unlink $index if -e $index;
1104        # Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
1105        unlink "$index.dir" if -e "$index.dir";
1106        unlink "$index.pag" if -e "$index.pag";
1107    }
1108    return 1;
1109}
1110
1111
1112#-------------------------------------------------------------
1113# stream-based access to the database
1114#
1115
1116package Bio::DB::Indexed::Stream;
1117$Bio::DB::Indexed::Stream::VERSION = '1.7.7';
1118use base qw(Tie::Handle Bio::DB::SeqI);
1119
1120
1121sub new {
1122    my ($class, $db) = @_;
1123    my $key = $db->FIRSTKEY;
1124    return bless {
1125        db  => $db,
1126        key => $key
1127    }, $class;
1128}
1129
1130sub next_seq {
1131    my $self = shift;
1132    my ($key, $db) = @{$self}{'key', 'db'};
1133    return if not defined $key;
1134    my $value = $db->get_Seq_by_id($key);
1135    $self->{key} = $db->NEXTKEY($key);
1136    return $value;
1137}
1138
1139sub TIEHANDLE {
1140    my ($class, $db) = @_;
1141    return $class->new($db);
1142}
1143
1144sub READLINE {
1145    my $self = shift;
1146    return $self->next_seq || undef;
1147}
1148
1149
11501;
1151