1#
2# BioPerl module for Bio::Index::BlastTable
3#
4# Please direct questions and support issues to <bioperl-l@bioperl.org>
5#
6# Cared for by Chris Fields <cjfields@uiuc.edu>
7#
8# Copyright Chris Fields
9#
10# You may distribute this module under the same terms as perl itself
11
12# POD documentation - main docs before the code
13
14=head1 NAME
15
16Bio::Index::BlastTable - Indexes tabular Blast reports (-m 8 or -m 9 format) and
17supports retrieval based on query accession(s)
18
19=head1 SYNOPSIS
20
21    use strict;
22    use Bio::Index::BlastTable;
23    my ($indexfile,$file1,$file2,$query);
24    my $index = Bio::Index::BlastTable->new(-filename => $indexfile,
25				                          -write_flag => 1);
26    $index->make_index($file1,$file2);
27
28    my $data = $index->get_stream($query);
29
30    my $blast_result = $index->fetch_report($query);
31    print "query is ", $blast_result->query_name, "\n";
32    while ( my $hit = $blast_result->next_hit ) {
33        print "Name ", $hit->name,"\n";
34        while ( my $hsp = $hit->next_hsp ) {
35            print "Score ", $hsp->score;
36        }
37        print "\n";
38    }
39
40=head1 DESCRIPTION
41
42This object allows one to build an index on a tabular BLAST file (or files)
43and provide quick access to the blast report for that accession.  This also
44allows for ID parsing using a callback:
45
46   $inx->id_parser(\&get_id);
47   # make the index
48   $inx->make_index($file_name);
49
   # here is where the retrieval key is specified; the callback is
   # handed the query ID (first column of a row), not a FASTA header
   sub get_id {
      my $line = shift;
      $line =~ /gi\|(\d+)/;
54      $1;
55   }
56
57The indexer is capable of indexing based on multiple IDs passed back from the
58callback; this is assuming of course all IDs are unique.
59
60Note: for best results 'use strict'.
61
62=head1 FEEDBACK
63
64=head2 Mailing Lists
65
66User feedback is an integral part of the evolution of this and other
67Bioperl modules. Send your comments and suggestions preferably to
68the Bioperl mailing list.  Your participation is much appreciated.
69
70  bioperl-l@bioperl.org                  - General discussion
71  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
72
73=head2 Support
74
75Please direct usage questions or support issues to the mailing list:
76
77I<bioperl-l@bioperl.org>
78
rather than to the module maintainer directly. Many experienced and
responsive experts will be able to look at the problem and quickly
address it. Please include a thorough description of the problem
with code and data examples if at all possible.
83
84=head2 Reporting Bugs
85
86Report bugs to the Bioperl bug tracking system to help us keep track
87of the bugs and their resolution. Bug reports can be submitted via the
88web:
89
90  https://github.com/bioperl/bioperl-live/issues
91
92=head1 AUTHOR - Jason Stajich
93
94Email jason-at-bioperl-dot-org
95
96=head1 APPENDIX
97
98The rest of the documentation details each of the object methods.
99Internal methods are usually preceded with a _
100
101=cut
102
103# Let the code begin...
104
105package Bio::Index::BlastTable;
106$Bio::Index::BlastTable::VERSION = '1.7.7';
107use strict;
108
109use IO::String;
110use Bio::SearchIO;
111use base qw(Bio::Index::Abstract Bio::Root::Root);
112
sub _version {
    # The package-level $VERSION doubles as the identifier that is
    # reported for indexes built by this module.
    my $version = $Bio::Index::BlastTable::VERSION;
    return $version;
}
116
117=head2 new
118
  Usage   : $index = Bio::Index::BlastTable->new(
120                -filename    => $dbm_file,
121                -write_flag  => 0,
122                -dbm_package => 'DB_File',
123                -verbose     => 0);
124
125  Function: Returns a new index object.  If filename is
126            specified, then open_dbm() is immediately called.
127            Bio::Index::Abstract->new() will usually be called
128            directly only when opening an existing index.
129  Returns : A new index object
130  Args    : -filename    The name of the dbm index file.
131            -write_flag  TRUE if write access to the dbm file is
132                         needed.
133            -dbm_package The Perl dbm module to use for the
134                         index.
135            -verbose     Print debugging output to STDERR if
136                         TRUE.
137
138=cut
139
sub new {
    my ( $class, @ctor_args ) = @_;

    # No BlastTable-specific state is initialised here; construction is
    # delegated unchanged to the parent class and its result handed back.
    my $self = $class->SUPER::new(@ctor_args);
    return $self;
}
144
145=head2 Bio::Index::Blast implemented methods
146
147=cut
148
149=head2 fetch_report
150
151 Title   : fetch_report
152 Usage   : my $blastreport = $idx->fetch_report($id);
153 Function: Returns a Bio::SearchIO report object
154           for a specific blast report
155 Returns : Bio::SearchIO
156 Args    : valid id
157
158=cut
159
sub fetch_report {
    my $self = shift;
    my $id   = shift;

    # get_stream() supplies the handle for the record stored under $id.
    my $stream = $self->get_stream($id);

    # Parse from that handle as tabular BLAST output.
    # NOTE(review): -noclose presumably stops SearchIO from closing the
    # handle obtained from the index -- confirm against Bio::SearchIO.
    my @parser_args = (
        -noclose => 1,
        -format  => 'blasttable',
        -fh      => $stream,
    );
    my $parser = Bio::SearchIO->new(@parser_args);

    # Only the first result read from the stream is returned.
    return $parser->next_result;
}
168
169
=head2 Required methods from Bio::Index::Abstract
171
172=cut
173
=head2 _index_file

  Title   : _index_file
  Usage   : $index->_index_file( $file_name, $i )
  Function: Specialist function to index tabular BLAST report file(s).
            Is provided with a filename and an integer
            by make_index in its SUPER class.  For every run of
            rows sharing a query ID it records the byte offset at
            which that run begins (for reports containing '#'
            comment lines, the offset of the preceding
            '# BLAST...' header line is used instead).
  Example :
  Returns : 1 once the whole file has been read
  Args    : $file_name  name of the report file to index
            $i          index-number of the file being indexed

=cut

sub _index_file {
    my ( $self,
         $file,    # File name
         $i,       # Index-number of file being indexed
       ) = @_;

    open my $BLAST, '<', $file or $self->throw("Could not read file '$file': $!");

    my $indexpoint = 0;     # byte offset stored for the query being indexed
    my $last_query = '';    # query ID seen on the previous data row
    my $is_m9;              # true once any '#' comment line has been seen

    # In Windows, text files have '\r\n' as line separator, but when reading in
    # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
    # "length $_" will report 4 although the line is 5 bytes in length.
    # We assume that all lines have the same line separator and only read current line.
    my $init_pos  = tell($BLAST);
    my $curr_line = <$BLAST>;
    my $pos_diff  = tell($BLAST) - $init_pos;

    # An empty file yields undef above; treat that as a zero-length line.
    my $correction = $pos_diff - ( defined $curr_line ? length $curr_line : 0 );
    seek $BLAST, $init_pos, 0;    # Rewind position to proceed to read the file

    # A lexical line variable is used so the caller's $_ is left untouched.
    while ( defined( my $line = <$BLAST> ) ) {
        if ( $line =~ m{^#} ) {
            $is_m9 ||= 1;

            # The program header line marks where a query's section starts.
            if ( $line =~ m{^#\s+T?BLAST[PNX]}i ) {
                $indexpoint = tell($BLAST) - length($line) - $correction;
            }
            next;
        }

        # Data rows: a query ID followed by at least seven more
        # tab-terminated columns.
        next unless $line =~ /^(?:([^\t]+)\t)(?:[^\t]+\t){7,}/;

        # Copy the capture right away: passing $1 itself would alias the
        # magic variable into the callback's @_, where any regex run by the
        # callback before it reads its argument would change the value.
        my $query = $1;

        # Only the first row of a run of rows for the same query is indexed.
        next if $last_query eq $query;

        # Without comment lines there is no header to point at, so the
        # offset of this first row is used.
        $indexpoint = tell($BLAST) - length($line) - $correction unless $is_m9;

        foreach my $id ( $self->id_parser()->($query) ) {
            $self->debug("id is $id, begin is $indexpoint\n");
            $self->add_record( $id, $i, $indexpoint );
        }
        $last_query = $query;
    }

    close $BLAST;
    return 1;
}
229
230# shamelessly stolen from Bio::Index::Fasta
231
232=head2 id_parser
233
234  Title   : id_parser
235  Usage   : $index->id_parser( CODE )
  Function: Stores or returns the code used by _index_file to
            parse the ID(s) for a record from a string.  Useful
            for (for instance) specifying a different
            parser for different flavours of blast dbs.
            Returns \&default_id_parser (see below) if not
            set. If you supply your own id_parser
            subroutine, then it should expect the query ID
            (the first column of a tabular BLAST row).  An
            entry will be added to the index for each string
            in the list returned.
245  Example : $index->id_parser( \&my_id_parser )
246  Returns : ref to CODE if called without arguments
247  Args    : CODE
248
249=cut
250
sub id_parser {
    my $self = shift;
    my $code = shift;

    # Setter half: a true argument replaces any stored parser.
    $self->{'_id_parser'} = $code if $code;

    # Getter half: hand back the stored parser, or the module default
    # when none has been stored.
    my $stored = $self->{'_id_parser'};
    return $stored ? $stored : \&default_id_parser;
}
259
260=head2 default_id_parser
261
262  Title   : default_id_parser
263  Usage   : $id = default_id_parser( $header )
  Function: The default Blast Query ID parser for Bio::Index::BlastTable.
            Returns $1 from applying the regexp /^\s*(\S+)/
            to $header.
267  Returns : ID string
268  Args    : a header line string
269
270=cut
271
sub default_id_parser {
    # Guard clause: without a run of non-whitespace there is no ID, so
    # return nothing (empty list / undef depending on context).
    return unless $_[0] =~ /^\s*(\S+)/;

    # The ID is the first whitespace-delimited token of the argument.
    return $1;
}
280
281=head2 Bio::Index::Abstract methods
282
283=cut
284
285=head2 filename
286
287 Title   : filename
288 Usage   : $value = $self->filename();
289           $self->filename($value);
290 Function: Gets or sets the name of the dbm index file.
291 Returns : The current value of filename
292 Args    : Value of filename if setting, or none if
293           getting the value.
294
295=head2 write_flag
296
297 Title   : write_flag
298 Usage   : $value = $self->write_flag();
299           $self->write_flag($value);
300 Function: Gets or sets the value of write_flag, which
301           is whether the dbm file should be opened with
302           write access.
303 Returns : The current value of write_flag (default 0)
304 Args    : Value of write_flag if setting, or none if
305           getting the value.
306
307=head2 dbm_package
308
309 Usage   : $value = $self->dbm_package();
310           $self->dbm_package($value);
311
312 Function: Gets or sets the name of the Perl dbm module used.
313           If the value is unset, then it returns the value of
314           the package variable $USE_DBM_TYPE or if that is
315           unset, then it chooses the best available dbm type,
316           choosing 'DB_File' in preference to 'SDBM_File'.
           Bio::Index::Abstract may work with other dbm file
           types.
319
320 Returns : The current value of dbm_package
321 Args    : Value of dbm_package if setting, or none if
322           getting the value.
323
324
325=head2 get_stream
326
327 Title   : get_stream
328 Usage   : $stream = $index->get_stream( $id );
 Function: Returns a file handle with the file pointer
           at the appropriate place
331
332           This provides for a way to get the actual
333           file contents and not an object
334
           WARNING: you must parse the record delimiter
           *yourself*. Abstract won't do this for you
337           So this code
338
339           $fh = $index->get_stream($myid);
340           while( <$fh> ) {
341              # do something
342           }
343           will parse the entire file if you do not put in
344           a last statement in, like
345
346           while( <$fh> ) {
347              /^\/\// && last; # end of record
348              # do something
349           }
350
351 Returns : A filehandle object
352 Args    : string represents the accession number
353 Notes   : This method should not be used without forethought
354
355
356=head2 open_dbm
357
358  Usage   : $index->open_dbm()
  Function: Opens the dbm file associated with the index
            object.  Write access is only given if explicitly
            asked for by calling new(-write_flag => 1) or having set
            the write_flag(1) on the index object.  The type of
            dbm file opened is that returned by dbm_package().
            The name of the file to be opened is obtained by
            calling the filename() method.
366
367  Example : $index->_open_dbm()
368  Returns : 1 on success
369
370
371=head2 _version
372
373  Title   : _version
374  Usage   : $type = $index->_version()
  Function: Returns a string which identifies the version of an
376            index module.  Used to permanently identify an index
377            file as having been created by a particular version
378            of the index module.  Must be provided by the sub class
379  Example :
380  Returns :
381  Args    : none
382
383=head2 _filename
384
385  Title   : _filename
386  Usage   : $index->_filename( FILE INT )
387  Function: Indexes the file
388  Example :
389  Returns :
390  Args    :
391
392=head2 _file_handle
393
394  Title   : _file_handle
395  Usage   : $fh = $index->_file_handle( INT )
396  Function: Returns an open filehandle for the file
397            index INT.  On opening a new filehandle it
398            caches it in the @{$index->_filehandle} array.
399            If the requested filehandle is already open,
400            it simply returns it from the array.
  Example : $first_file_indexed = $index->_file_handle( 0 );
402  Returns : ref to a filehandle
403  Args    : INT
404
405=head2 _file_count
406
407  Title   : _file_count
408  Usage   : $index->_file_count( INT )
409  Function: Used by the index building sub in a sub class to
410            track the number of files indexed.  Sets or gets
411            the number of files indexed when called with or
412            without an argument.
413  Example :
414  Returns : INT
415  Args    : INT
416
417
418=head2 add_record
419
420  Title   : add_record
421  Usage   : $index->add_record( $id, @stuff );
422  Function: Calls pack_record on @stuff, and adds the result
423            of pack_record to the index database under key $id.
424            If $id is a reference to an array, then a new entry
425            is added under a key corresponding to each element
426            of the array.
427  Example : $index->add_record( $id, $fileNumber, $begin, $end )
428  Returns : TRUE on success or FALSE on failure
429  Args    : ID LIST
430
431=head2 pack_record
432
433  Title   : pack_record
434  Usage   : $packed_string = $index->pack_record( LIST )
435  Function: Packs an array of scalars into a single string
436            joined by ASCII 034 (which is unlikely to be used
437            in any of the strings), and returns it.
438  Example : $packed_string = $index->pack_record( $fileNumber, $begin, $end )
439  Returns : STRING or undef
440  Args    : LIST
441
442=head2 unpack_record
443
444  Title   : unpack_record
445  Usage   : $index->unpack_record( STRING )
  Function: Splits the string provided into an array,
447            splitting on ASCII 034.
448  Example : ( $fileNumber, $begin, $end ) = $index->unpack_record( $self->db->{$id} )
449  Returns : A 3 element ARRAY
450  Args    : STRING containing ASCII 034
451
452=head2 DESTROY
453
454 Title   : DESTROY
455 Usage   : Called automatically when index goes out of scope
456 Function: Closes connection to database and handles to
457           sequence files
458 Returns : NEVER
459 Args    : NONE
460
461
462=cut
463
4641;
465