1#
2# BioPerl module for Bio::Index::Blast
3#
4# Please direct questions and support issues to <bioperl-l@bioperl.org>
5#
6# Cared for by Jason Stajich <jason@bioperl.org>
7#
8# Copyright Jason Stajich
9#
10# You may distribute this module under the same terms as perl itself
11
12# POD documentation - main docs before the code
13
14=head1 NAME
15
16Bio::Index::Blast - Indexes Blast reports and supports retrieval
17based on query accession(s)
18
19=head1 SYNOPSIS
20
21  use strict;
22  use Bio::Index::Blast;
23
24  my ($indexfile,$file1,$file2,$query);
  my $index = Bio::Index::Blast->new(-filename   => $indexfile,
                                     -write_flag => 1);
27  $index->make_index($file1,$file2);
28
29  my $fh = $index->get_stream($query);
30
31  my $blast_report = Bio::SearchIO->new(-noclose => 1,
32                                        -format  => 'blast',
33                                        -fh      => $fh);
34  my $result = $blast_report->next_result;
35  print $result->algorithm, "\n";
36  my $hit = $result->next_hit;
37  print $hit->description, "\n";
38  my $hsp = $hit->next_hsp;
39  print $hsp->bits, "\n";
40
41=head1 DESCRIPTION
42
43This object allows one to build an index on a blast file (or files)
44and provide quick access to the blast report for that accession.
45
46This also allows for ID parsing using a callback:
47
48   $inx->id_parser(\&get_id);
49   # make the index
50   $inx->make_index($file_name);
51
52   # here is where the retrieval key is specified
53   sub get_id {
54      my $line = shift;
55      $line =~ /^gi\|(\d+)/;
56      $1;
57   }
58
59The indexer is capable of indexing based on multiple IDs passed back from the
60callback; this is assuming of course all IDs are unique.
61
62Note: for best results 'use strict'.
63
64=head1 FEEDBACK
65
66=head2 Mailing Lists
67
68User feedback is an integral part of the evolution of this and other
69Bioperl modules. Send your comments and suggestions preferably to
70the Bioperl mailing list.  Your participation is much appreciated.
71
72  bioperl-l@bioperl.org                  - General discussion
73  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
74
75=head2 Support
76
77Please direct usage questions or support issues to the mailing list:
78
79I<bioperl-l@bioperl.org>
80
81rather than to the module maintainer directly. Many experienced and
responsive experts will be able to look at the problem and quickly
83address it. Please include a thorough description of the problem
84with code and data examples if at all possible.
85
86=head2 Reporting Bugs
87
88Report bugs to the Bioperl bug tracking system to help us keep track
89of the bugs and their resolution. Bug reports can be submitted via the
90web:
91
92  https://github.com/bioperl/bioperl-live/issues
93
94=head1 AUTHOR - Jason Stajich
95
96Email jason-at-bioperl-dot-org
97
98=head1 APPENDIX
99
100The rest of the documentation details each of the object methods.
101Internal methods are usually preceded with a _
102
103=cut
104
105# Let the code begin...
106
107package Bio::Index::Blast;
108$Bio::Index::Blast::VERSION = '1.7.7';
109use strict;
110
111use IO::String;
112use Bio::SearchIO;
113use base qw(Bio::Index::Abstract Bio::Root::Root);
114
# Return the version string of this index module, i.e. the current value of
# $Bio::Index::Blast::VERSION.
sub _version {
    my $version = $Bio::Index::Blast::VERSION;
    return $version;
}
118
119=head2 new
120
  Usage   : $index = Bio::Index::Blast->new(
122                -filename    => $dbm_file,
123                -write_flag  => 0,
124                -dbm_package => 'DB_File',
125                -verbose     => 0);
126
127  Function: Returns a new index object.  If filename is
128            specified, then open_dbm() is immediately called.
129            Bio::Index::Abstract->new() will usually be called
130            directly only when opening an existing index.
131  Returns : A new index object
132  Args    : -filename    The name of the dbm index file.
133            -write_flag  TRUE if write access to the dbm file is
134                         needed.
135            -dbm_package The Perl dbm module to use for the
136                         index.
            -parser      The SearchIO text parser to use, 'blast'
                         (the default) or 'blast_pull'; see
                         blast_parser_type().
            -verbose     Print debugging output to STDERR if
                         TRUE.
139
140=cut
141
# Constructor.  The object is built by the parent class constructor from the
# full argument list; afterwards the optional PARSER argument is extracted
# and, when true, applied through blast_parser_type().
# Returns the new object.
sub new {
    my ($class, @args) = @_;
    my $self = $class->SUPER::new(@args);

    # Only PARSER is handled here; all arguments were already handed to the
    # parent constructor above.
    my ($parser) = $self->_rearrange(['PARSER'], @args);
    if ($parser) {
        $self->blast_parser_type($parser);
    }

    return $self;
}
149
150=head2 Bio::Index::Blast implemented methods
151
152=cut
153
154=head2 fetch_report
155
156 Title   : fetch_report
157 Usage   : my $blastreport = $idx->fetch_report($id);
 Function: Returns the first result parsed from the blast report
           stored under the given id
 Returns : Bio::Search::Result::ResultI object (the value of
           Bio::SearchIO's next_result)
161 Args    : valid id
162
163=cut
164
# Fetch the report indexed under $id and return its first parsed result.
#
# Args    : $id - an id present in the index
# Returns : whatever next_result() yields on a Bio::SearchIO parser opened on
#           the stream returned by get_stream($id)
sub fetch_report {
	my $self = shift;
	my $id   = shift;

	# Obtain the stream first, before the parser is constructed.
	my $stream = $self->get_stream($id);

	# NOTE(review): -noclose presumably keeps SearchIO from closing the
	# handle obtained from the index -- confirm against Bio::SearchIO.
	my @searchio_args = (
		-noclose => 1,
		-format  => $self->blast_parser_type,
		-fh      => $stream,
	);
	my $parser = Bio::SearchIO->new(@searchio_args);

	return $parser->next_result;
}
173
174=head2 fetch_result
175
176 Title   : fetch_result
177 Usage   : my $blastreport = $idx->fetch_result($id);
 Function: Returns the first result parsed from the blast report
           stored under the given id
 Returns : Bio::Search::Result::ResultI object (the value of
           Bio::SearchIO's next_result)
181 Args    : valid id
182 Note    : alias of fetch_report()
183
184=cut
185
# Install fetch_result as a second name for fetch_report: the glob assignment
# makes both names refer to the same subroutine.
*fetch_result = \&fetch_report;
187
=head2 Required methods from Bio::Index::Abstract
189
190=cut
191
192=head2 _index_file
193
194  Title   : _index_file
195  Usage   : $index->_index_file( $file_name, $i )
196  Function: Specialist function to index BLAST report file(s).
197            Is provided with a filename and an integer
198            by make_index in its SUPER class.
199  Example :
200  Returns :
201  Args    :
202
203=cut
204
# Index one BLAST report file.
#
# Reads $file line by line.  For every "Query=" line, the text after
# "Query=" is handed to the id_parser() callback and each ID it returns is
# stored with add_record($id, $i, $offset).  $offset is the byte position of
# the most recent BLAST header line, except after an "RPS-BLAST" header,
# where it is the position of the "Query=" line itself.
#
# Args    : $file - name of the report file to read
#           $i    - index number of the file being indexed
# Returns : nothing meaningful
# Throws  : through $self->throw() if $file cannot be opened
sub _index_file {
	my( $self,
		 $file, # File name
		 $i,    # Index-number of file being indexed
	  ) = @_;

	open my $BLAST, '<', $file or $self->throw("Could not read file '$file': $!");

	my $indexpoint = 0;
	my $prefix = '';

	# In Windows, text files have '\r\n' as line separator, but when reading in
	# text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
	# "length $_" will report 4 although the line is 5 bytes in length.
	# We assume that all lines have the same line separator and only read current line.
	my $init_pos   = tell($BLAST);
	my $curr_line  = <$BLAST>;
	my $pos_diff   = tell($BLAST) - $init_pos;
	# An empty file gives no first line at all; count it as zero characters
	# instead of taking the length of an undefined value.
	my $line_chars = defined $curr_line ? length $curr_line : 0;
	my $correction = $pos_diff - $line_chars;
	seek $BLAST, $init_pos, 0; # Rewind position to proceed to read the file

	# fencepost problem: we basically just find the top and the query
	while( my $line = <$BLAST> ) {

		# in recent RPS-BLAST output the only delimiter between result
		# sections is '^Query=' - in other BLAST outputs you
		# can use '^(RPS-|T?)BLAST(P?|N?|X?)'

		if ( $line =~ /^(RPS-|T?)BLAST(P?|N?|X?)/ ) {
			$prefix = $1;
			$indexpoint = tell($BLAST) - length($line) - $correction;
		}
		if ( $line =~ /^Query=\s*([^\n]+)$/ ) {

			# Copy the capture before calling out.  Passing $1 itself would
			# alias the callback's $_[0] to the capture variable, so any
			# regex match inside the callback would change what it reads.
			my $query_text = $1;

			$indexpoint = tell($BLAST) - length($line) - $correction if ( $prefix eq 'RPS-' );

			foreach my $id ($self->id_parser()->($query_text)) {
				$self->debug("id is $id, begin is $indexpoint\n");
				$self->add_record($id, $i, $indexpoint);
			}
		}
	}

	close $BLAST;
	return;
}
254
255# shamelessly stolen from Bio::Index::Fasta
256
257=head2 id_parser
258
259  Title   : id_parser
260  Usage   : $index->id_parser( CODE )
  Function: Stores or returns the code used by _index_file to
            parse the ID(s) for a record from a string.  Useful
            for (for instance) specifying a different
            parser for different flavours of blast dbs.
            Returns \&default_id_parser (see below) if not
            set. If you supply your own id_parser
            subroutine, then it should expect the text that
            follows 'Query=' on the query line of a report.
            An entry will be added to the index for each
            string in the list returned.
270  Example : $index->id_parser( \&my_id_parser )
271  Returns : ref to CODE if called without arguments
272  Args    : CODE
273
274=cut
275
# Get or set the callback used to extract index IDs from a query line.
#
# Args    : $code - optional; when true it is stored on the object
# Returns : the stored callback, or a reference to default_id_parser() when
#           none has been stored
sub id_parser {
	my $self = shift;
	my $code = shift;

	$self->{'_id_parser'} = $code if $code;

	return $self->{'_id_parser'} if $self->{'_id_parser'};
	return \&default_id_parser;
}
284
285=head2 default_id_parser
286
287  Title   : default_id_parser
288  Usage   : $id = default_id_parser( $header )
289  Function: The default Blast Query ID parser for Bio::Index::Blast.pm
            Returns $1 from applying the regexp /^\s*(\S+)/
291            to $header.
292  Returns : ID string
293  Args    : a header line string
294
295=cut
296
# Default ID parser: returns the first run of non-whitespace characters in
# the given string, skipping any leading whitespace.  Returns nothing (empty
# list, or undef in scalar context) when the string has no such run.
sub default_id_parser {
	my ($header) = @_;

	# List assignment in scalar context counts the captures, so a failed
	# match falls through to the bare return.
	my ($id) = $header =~ /^\s*(\S+)/
		or return;

	return $id;
}
304
305=head2 blast_parser_type
306
307  Title   : blast_parser_type
  Usage   : $type = $index->blast_parser_type();   # 'blast' unless set
            $index->blast_parser_type('blast_pull');
  Function: Get/Set SearchIO-based text (-m0) BLAST parser. Only values in
            local %VALID_PARSERS hash allowed.
  Returns : String
  Args    : [optional] parser type, 'blast' or 'blast_pull'
313  Note    : This only allows simple text-based parsing options; tabular, XML,
314            or others are not supported (see Bio::Index::BlastTable for tab
315            output).
316
317=cut
318
# Parser names accepted by blast_parser_type().
my %VALID_PARSERS = (
    blast      => 1,
    blast_pull => 1,
);

# Get or set the name of the text parser used for reports.
#
# Args    : $type - optional; when true it must be a key of %VALID_PARSERS
# Returns : the stored parser name, or 'blast' when none has been stored
# Throws  : through $self->throw() when $type is not a supported parser
sub blast_parser_type {
    my $self = shift;
    my $type = shift;

    if ($type) {
        exists $VALID_PARSERS{$type}
            or $self->throw("$type is not a supported BLAST text parser");
        $self->{_blast_parser_type} = $type;
    }

    return $self->{_blast_parser_type} if $self->{_blast_parser_type};
    return 'blast';
}
330
331=head2 Bio::Index::Abstract methods
332
333=cut
334
335=head2 filename
336
337 Title   : filename
338 Usage   : $value = $self->filename();
339           $self->filename($value);
340 Function: Gets or sets the name of the dbm index file.
341 Returns : The current value of filename
342 Args    : Value of filename if setting, or none if
343           getting the value.
344
345=head2 write_flag
346
347 Title   : write_flag
348 Usage   : $value = $self->write_flag();
349           $self->write_flag($value);
350 Function: Gets or sets the value of write_flag, which
351           is whether the dbm file should be opened with
352           write access.
353 Returns : The current value of write_flag (default 0)
354 Args    : Value of write_flag if setting, or none if
355           getting the value.
356
357=head2 dbm_package
358
359 Usage   : $value = $self->dbm_package();
360           $self->dbm_package($value);
361
362 Function: Gets or sets the name of the Perl dbm module used.
363           If the value is unset, then it returns the value of
364           the package variable $USE_DBM_TYPE or if that is
365           unset, then it chooses the best available dbm type,
366           choosing 'DB_File' in preference to 'SDBM_File'.
          Bio::Index::Abstract may work with other dbm file
368           types.
369
370 Returns : The current value of dbm_package
371 Args    : Value of dbm_package if setting, or none if
372           getting the value.
373
374
375=head2 get_stream
376
377 Title   : get_stream
378 Usage   : $stream = $index->get_stream( $id );
379 Function: Returns a file handle with the file pointer
          at the appropriate place
381
382           This provides for a way to get the actual
383           file contents and not an object
384
          WARNING: you must parse the record delimiter
386           *yourself*. Abstract won't do this for you
387           So this code
388
389           $fh = $index->get_stream($myid);
390           while( <$fh> ) {
391              # do something
392           }
393           will parse the entire file if you do not put in
394           a last statement in, like
395
396           while( <$fh> ) {
397              /^\/\// && last; # end of record
398              # do something
399           }
400
401 Returns : A filehandle object
402 Args    : string represents the accession number
403 Notes   : This method should not be used without forethought
404
405
406=head2 open_dbm
407
408  Usage   : $index->open_dbm()
409  Function: Opens the dbm file associated with the index
410            object.  Write access is only given if explicitly
            asked for by calling new(-write_flag => 1) or having set
            the write_flag(1) on the index object.  The type of
            dbm file opened is that returned by dbm_package().
            The name of the file to be opened is obtained by
            calling the filename() method.

  Example : $index->open_dbm()
418  Returns : 1 on success
419
420
421=head2 _version
422
423  Title   : _version
424  Usage   : $type = $index->_version()
  Function: Returns a string which identifies the version of an
426            index module.  Used to permanently identify an index
427            file as having been created by a particular version
428            of the index module.  Must be provided by the sub class
429  Example :
430  Returns :
431  Args    : none
432
433=head2 _filename
434
435  Title   : _filename
436  Usage   : $index->_filename( FILE INT )
437  Function: Indexes the file
438  Example :
439  Returns :
440  Args    :
441
442=head2 _file_handle
443
444  Title   : _file_handle
445  Usage   : $fh = $index->_file_handle( INT )
446  Function: Returns an open filehandle for the file
447            index INT.  On opening a new filehandle it
448            caches it in the @{$index->_filehandle} array.
449            If the requested filehandle is already open,
450            it simply returns it from the array.
  Example : $first_file_indexed = $index->_file_handle( 0 );
452  Returns : ref to a filehandle
453  Args    : INT
454
455=head2 _file_count
456
457  Title   : _file_count
458  Usage   : $index->_file_count( INT )
459  Function: Used by the index building sub in a sub class to
460            track the number of files indexed.  Sets or gets
461            the number of files indexed when called with or
462            without an argument.
463  Example :
464  Returns : INT
465  Args    : INT
466
467
468=head2 add_record
469
470  Title   : add_record
471  Usage   : $index->add_record( $id, @stuff );
472  Function: Calls pack_record on @stuff, and adds the result
473            of pack_record to the index database under key $id.
474            If $id is a reference to an array, then a new entry
475            is added under a key corresponding to each element
476            of the array.
477  Example : $index->add_record( $id, $fileNumber, $begin, $end )
478  Returns : TRUE on success or FALSE on failure
479  Args    : ID LIST
480
481=head2 pack_record
482
483  Title   : pack_record
484  Usage   : $packed_string = $index->pack_record( LIST )
485  Function: Packs an array of scalars into a single string
486            joined by ASCII 034 (which is unlikely to be used
487            in any of the strings), and returns it.
488  Example : $packed_string = $index->pack_record( $fileNumber, $begin, $end )
489  Returns : STRING or undef
490  Args    : LIST
491
492=head2 unpack_record
493
494  Title   : unpack_record
495  Usage   : $index->unpack_record( STRING )
  Function: Splits the string provided into an array,
497            splitting on ASCII 034.
498  Example : ( $fileNumber, $begin, $end ) = $index->unpack_record( $self->db->{$id} )
499  Returns : A 3 element ARRAY
500  Args    : STRING containing ASCII 034
501
502=head2 DESTROY
503
504 Title   : DESTROY
505 Usage   : Called automatically when index goes out of scope
506 Function: Closes connection to database and handles to
507           sequence files
508 Returns : NEVER
509 Args    : NONE
510
511
512=cut
513
5141;
515