#
# BioPerl module for Bio::Index::Blast
#
# Please direct questions and support issues to <bioperl-l@bioperl.org>
#
# Cared for by Jason Stajich <jason@bioperl.org>
#
# Copyright Jason Stajich
#
# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

Bio::Index::Blast - Indexes Blast reports and supports retrieval
based on query accession(s)

=head1 SYNOPSIS

    use strict;
    use Bio::Index::Blast;

    my ($indexfile,$file1,$file2,$query);
    my $index = Bio::Index::Blast->new(-filename => $indexfile,
                                       -write_flag => 1);
    $index->make_index($file1,$file2);

    my $fh = $index->get_stream($query);

    my $blast_report = Bio::SearchIO->new(-noclose => 1,
                                          -format => 'blast',
                                          -fh => $fh);
    my $result = $blast_report->next_result;
    print $result->algorithm, "\n";
    my $hit = $result->next_hit;
    print $hit->description, "\n";
    my $hsp = $hit->next_hsp;
    print $hsp->bits, "\n";

=head1 DESCRIPTION

This object allows one to build an index on a blast file (or files)
and provide quick access to the blast report for that accession.

This also allows for ID parsing using a callback:

   $inx->id_parser(\&get_id);
   # make the index
   $inx->make_index($file_name);

   # here is where the retrieval key is specified
   sub get_id {
      my $line = shift;
      $line =~ /^gi\|(\d+)/;
      $1;
   }

The indexer is capable of indexing based on multiple IDs passed back from the
callback; this is assuming of course all IDs are unique.

Note: for best results 'use strict'.

=head1 FEEDBACK

=head2 Mailing Lists

User feedback is an integral part of the evolution of this and other
Bioperl modules. Send your comments and suggestions preferably to
the Bioperl mailing list. Your participation is much appreciated.
71 72 bioperl-l@bioperl.org - General discussion 73 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 74 75=head2 Support 76 77Please direct usage questions or support issues to the mailing list: 78 79I<bioperl-l@bioperl.org> 80 81rather than to the module maintainer directly. Many experienced and 82reponsive experts will be able look at the problem and quickly 83address it. Please include a thorough description of the problem 84with code and data examples if at all possible. 85 86=head2 Reporting Bugs 87 88Report bugs to the Bioperl bug tracking system to help us keep track 89of the bugs and their resolution. Bug reports can be submitted via the 90web: 91 92 https://github.com/bioperl/bioperl-live/issues 93 94=head1 AUTHOR - Jason Stajich 95 96Email jason-at-bioperl-dot-org 97 98=head1 APPENDIX 99 100The rest of the documentation details each of the object methods. 101Internal methods are usually preceded with a _ 102 103=cut 104 105# Let the code begin... 106 107package Bio::Index::Blast; 108$Bio::Index::Blast::VERSION = '1.7.7'; 109use strict; 110 111use IO::String; 112use Bio::SearchIO; 113use base qw(Bio::Index::Abstract Bio::Root::Root); 114 115sub _version { 116 return $Bio::Index::Blast::VERSION; 117} 118 119=head2 new 120 121 Usage : $index = Bio::Index::Abstract->new( 122 -filename => $dbm_file, 123 -write_flag => 0, 124 -dbm_package => 'DB_File', 125 -verbose => 0); 126 127 Function: Returns a new index object. If filename is 128 specified, then open_dbm() is immediately called. 129 Bio::Index::Abstract->new() will usually be called 130 directly only when opening an existing index. 131 Returns : A new index object 132 Args : -filename The name of the dbm index file. 133 -write_flag TRUE if write access to the dbm file is 134 needed. 135 -dbm_package The Perl dbm module to use for the 136 index. 137 -verbose Print debugging output to STDERR if 138 TRUE. 

=cut

# Constructor: builds the object through the parent class, then applies the
# optional -parser argument via blast_parser_type(), which rejects names it
# does not support.
sub new {
    my ($class, @args) = @_;

    my $self = $class->SUPER::new(@args);

    my ($type) = $self->_rearrange([qw(PARSER)], @args);
    $self->blast_parser_type($type) if $type;

    return $self;
}

=head2 Bio::Index::Blast implemented methods

=cut

=head2 fetch_report

 Title   : fetch_report
 Usage   : my $result = $idx->fetch_report($id);
 Function: Positions a stream at the report indexed under $id,
           parses it with Bio::SearchIO (using the format from
           blast_parser_type()) and returns the first result.
 Returns : whatever Bio::SearchIO->next_result returns for that
           report (normally a Bio::Search::Result::ResultI object)
 Args    : valid id

=cut

sub fetch_report {
    my ($self, $id) = @_;

    my $fh = $self->get_stream($id);

    # -noclose keeps the parser from closing the handle obtained from
    # get_stream().
    my $report = Bio::SearchIO->new(
        -noclose => 1,
        -format  => $self->blast_parser_type,
        -fh      => $fh,
    );

    return $report->next_result;
}

=head2 fetch_result

 Title   : fetch_result
 Usage   : my $result = $idx->fetch_result($id);
 Function: Same as fetch_report()
 Returns : same as fetch_report()
 Args    : valid id
 Note    : alias of fetch_report()

=cut

*fetch_result = \&fetch_report;

=head2 Required methods from Bio::Index::Abstract

=cut

=head2 _index_file

  Title   : _index_file
  Usage   : $index->_index_file( $file_name, $i )
  Function: Specialist function to index BLAST report file(s).
            Is provided with a filename and an integer
            by make_index in its SUPER class.  For every 'Query='
            line, each ID returned by id_parser() is stored with
            add_record() together with $i and the offset of the
            most recent BLAST banner line (or of the 'Query=' line
            itself when the banner started with 'RPS-').
  Example :
  Returns : 1 once the whole file has been scanned
  Args    : file name, index number of the file being indexed
  Throws  : if the file cannot be opened for reading

=cut

sub _index_file {
    my ($self,
        $file,    # File name
        $i,       # Index-number of file being indexed
    ) = @_;

    open my $BLAST, '<', $file
        or $self->throw("Could not read file '$file': $!");

    my $indexpoint = 0;     # offset recorded for the current report
    my $prefix     = '';    # program prefix captured from the last banner

    # In Windows, text files have '\r\n' as line separator, but when reading
    # in text mode Perl will only show the '\n'. This means that for a line
    # "ABC\r\n", "length $_" will report 4 although the line is 5 bytes in
    # length. We assume that all lines have the same line separator and only
    # measure the first line to derive the per-line correction.
    my $init_pos  = tell($BLAST);
    my $curr_line = <$BLAST>;
    my $pos_diff  = tell($BLAST) - $init_pos;

    # An empty file yields no first line; the correction is then simply 0.
    my $correction = defined $curr_line ? $pos_diff - length($curr_line) : 0;

    seek $BLAST, $init_pos, 0;    # rewind so the loop sees the first line too

    # fencepost problem: we basically just find the top and the query
    while ( my $line = <$BLAST> ) {

        # in recent RPS-BLAST output the only delimiter between result
        # sections is '^Query=' - in other BLAST outputs you
        # can use '^(RPS-|T?)BLAST(P?|N?|X?)'
        if ( $line =~ /^(RPS-|T?)BLAST(P?|N?|X?)/ ) {
            $prefix     = $1;
            $indexpoint = tell($BLAST) - length($line) - $correction;
        }

        if ( $line =~ /^Query=\s*([^\n]+)$/ ) {

            # Copy the capture right away so the callback receives an
            # ordinary string rather than an alias to the magical $1.
            my $query = $1;

            $indexpoint = tell($BLAST) - length($line) - $correction
                if $prefix eq 'RPS-';

            foreach my $id ( $self->id_parser()->($query) ) {
                $self->debug("id is $id, begin is $indexpoint\n");
                $self->add_record( $id, $i, $indexpoint );
            }
        }
    }

    close $BLAST;
    return 1;
}

# shamelessly stolen from Bio::Index::Fasta

=head2 id_parser

  Title   : id_parser
  Usage   : $index->id_parser( CODE )
  Function: Stores or returns the code used by record_id to
            parse the ID for record from a string.  Useful
            for (for instance) specifying a different
            parser for different flavours of blast dbs.
            Returns \&default_id_parser (see below) if not
            set. If you supply your own id_parser
            subroutine, then it should expect the text that
            follows 'Query=' on the query line of the report.
An entry will be added to 269 the index for each string in the list returned. 270 Example : $index->id_parser( \&my_id_parser ) 271 Returns : ref to CODE if called without arguments 272 Args : CODE 273 274=cut 275 276sub id_parser { 277 my( $self, $code ) =@_; 278 279 if ($code) { 280 $self->{'_id_parser'} = $code; 281 } 282 return $self->{'_id_parser'} || \&default_id_parser; 283} 284 285=head2 default_id_parser 286 287 Title : default_id_parser 288 Usage : $id = default_id_parser( $header ) 289 Function: The default Blast Query ID parser for Bio::Index::Blast.pm 290 Returns $1 from applying the regexp /^>\s*(\S+)/ 291 to $header. 292 Returns : ID string 293 Args : a header line string 294 295=cut 296 297sub default_id_parser { 298 if ($_[0] =~ /^\s*(\S+)/) { 299 return $1; 300 } else { 301 return; 302 } 303} 304 305=head2 blast_parser_type 306 307 Title : blast_parser_type 308 Usage : $index->blast_parser_type() # returns 309 Function: Get/Set SearchIO-based text (-m0) BLAST parser. Only values in 310 local %VALID_PARSERS hash allowed. 311 Returns : String 312 Args : [optional] 313 Note : This only allows simple text-based parsing options; tabular, XML, 314 or others are not supported (see Bio::Index::BlastTable for tab 315 output). 316 317=cut 318 319my %VALID_PARSERS = map {$_ =>1} qw(blast blast_pull); 320 321sub blast_parser_type { 322 my ($self, $type) = @_; 323 if ($type) { 324 $self->throw("$type is not a supported BLAST text parser") unless 325 exists $VALID_PARSERS{$type}; 326 $self->{_blast_parser_type} = $type; 327 } 328 return $self->{_blast_parser_type} || 'blast'; 329} 330 331=head2 Bio::Index::Abstract methods 332 333=cut 334 335=head2 filename 336 337 Title : filename 338 Usage : $value = $self->filename(); 339 $self->filename($value); 340 Function: Gets or sets the name of the dbm index file. 341 Returns : The current value of filename 342 Args : Value of filename if setting, or none if 343 getting the value. 

=head2 write_flag

 Title   : write_flag
 Usage   : $value = $self->write_flag();
           $self->write_flag($value);
 Function: Gets or sets the value of write_flag, which
           is whether the dbm file should be opened with
           write access.
 Returns : The current value of write_flag (default 0)
 Args    : Value of write_flag if setting, or none if
           getting the value.

=head2 dbm_package

 Usage   : $value = $self->dbm_package();
           $self->dbm_package($value);

 Function: Gets or sets the name of the Perl dbm module used.
           If the value is unset, then it returns the value of
           the package variable $USE_DBM_TYPE or if that is
           unset, then it chooses the best available dbm type,
           choosing 'DB_File' in preference to 'SDBM_File'.
           Bio::Index::Abstract may work with other dbm file
           types.

 Returns : The current value of dbm_package
 Args    : Value of dbm_package if setting, or none if
           getting the value.


=head2 get_stream

 Title   : get_stream
 Usage   : $stream = $index->get_stream( $id );
 Function: Returns a file handle with the file pointer
           at the appropriate place

           This provides for a way to get the actual
           file contents and not an object

           WARNING: you must parse the record delimiter
           *yourself*. Abstract won't do this for you
           So this code

           $fh = $index->get_stream($myid);
           while( <$fh> ) {
              # do something
           }
           will parse the entire file if you do not put
           a last statement in, like

           while( <$fh> ) {
              /^\/\// && last; # end of record
              # do something
           }

 Returns : A filehandle object
 Args    : string represents the accession number
 Notes   : This method should not be used without forethought


=head2 open_dbm

  Usage   : $index->open_dbm()
  Function: Opens the dbm file associated with the index
            object.
            Write access is only given if explicitly
            asked for by calling new(-write_flag => 1) or having set
            the write_flag(1) on the index object.  The type of
            dbm file opened is that returned by dbm_package().
            The name of the file to be opened is obtained by
            calling the filename() method.

  Example : $index->open_dbm()
  Returns : 1 on success


=head2 _version

  Title   : _version
  Usage   : $type = $index->_version()
  Function: Returns a string which identifies the version of an
            index module.  Used to permanently identify an index
            file as having been created by a particular version
            of the index module.  Must be provided by the sub class
  Example :
  Returns :
  Args    : none

=head2 _filename

  Title   : _filename
  Usage   : $index->_filename( FILE INT )
  Function: Indexes the file
  Example :
  Returns :
  Args    :

=head2 _file_handle

  Title   : _file_handle
  Usage   : $fh = $index->_file_handle( INT )
  Function: Returns an open filehandle for the file
            index INT.  On opening a new filehandle it
            caches it in the @{$index->_filehandle} array.
            If the requested filehandle is already open,
            it simply returns it from the array.
  Example : $first_file_indexed = $index->_file_handle( 0 );
  Returns : ref to a filehandle
  Args    : INT

=head2 _file_count

  Title   : _file_count
  Usage   : $index->_file_count( INT )
  Function: Used by the index building sub in a sub class to
            track the number of files indexed.  Sets or gets
            the number of files indexed when called with or
            without an argument.
  Example :
  Returns : INT
  Args    : INT


=head2 add_record

  Title   : add_record
  Usage   : $index->add_record( $id, @stuff );
  Function: Calls pack_record on @stuff, and adds the result
            of pack_record to the index database under key $id.
            If $id is a reference to an array, then a new entry
            is added under a key corresponding to each element
            of the array.
  Example : $index->add_record( $id, $fileNumber, $begin, $end )
  Returns : TRUE on success or FALSE on failure
  Args    : ID LIST

=head2 pack_record

  Title   : pack_record
  Usage   : $packed_string = $index->pack_record( LIST )
  Function: Packs an array of scalars into a single string
            joined by ASCII 034 (which is unlikely to be used
            in any of the strings), and returns it.
  Example : $packed_string = $index->pack_record( $fileNumber, $begin, $end )
  Returns : STRING or undef
  Args    : LIST

=head2 unpack_record

  Title   : unpack_record
  Usage   : $index->unpack_record( STRING )
  Function: Splits the string provided into an array,
            splitting on ASCII 034.
  Example : ( $fileNumber, $begin, $end ) = $index->unpack_record( $self->db->{$id} )
  Returns : A 3 element ARRAY
  Args    : STRING containing ASCII 034

=head2 DESTROY

 Title   : DESTROY
 Usage   : Called automatically when index goes out of scope
 Function: Closes connection to database and handles to
           sequence files
 Returns : NEVER
 Args    : NONE


=cut

1;