1# 2# BioPerl module for Bio::SearchIO::infernal 3# 4# Please direct questions and support issues to <bioperl-l@bioperl.org> 5# 6# Cared for by Chris Fields <cjfields-at-uiuc-dot-edu> 7# 8# Copyright Chris Fields 9# 10# You may distribute this module under the same terms as perl itself 11 12# POD documentation - main docs before the code 13 14=head1 NAME 15 16Bio::SearchIO::infernal - SearchIO-based Infernal parser 17 18=head1 SYNOPSIS 19 20 my $parser = Bio::SearchIO->new(-format => 'infernal', 21 -file => 'purine.inf'); 22 while( my $result = $parser->next_result ) { 23 # general result info, such as model used, Infernal version 24 while( my $hit = $result->next_hit ) { 25 while( my $hsp = $hit->next_hsp ) { 26 # ... 27 } 28 } 29 } 30 31=head1 DESCRIPTION 32 33This is a SearchIO-based parser for Infernal output from the cmsearch program. 34It currently parses cmsearch output for Infernal versions 0.7-1.1; older 35versions may work but will not be supported. 36 37The latest version of Infernal is 1.1. The output has changed substantially 38relative to version 1.0. Versions 1.x are stable releases (and output has 39stabilized) therefore it is highly recommended that users upgrade to using 40the latest Infernal release. Support for the older pre-v.1 developer releases 41will be dropped for future core 1.6 releases. 42 43=head1 FEEDBACK 44 45=head2 Mailing Lists 46 47User feedback is an integral part of the evolution of this and other 48Bioperl modules. Send your comments and suggestions preferably to 49the Bioperl mailing list. Your participation is much appreciated. 50 51 bioperl-l@bioperl.org - General discussion 52 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 53 54=head2 Support 55 56Please direct usage questions or support issues to the mailing list: 57 58I<bioperl-l@bioperl.org> 59 60rather than to the module maintainer directly. Many experienced and 61reponsive experts will be able look at the problem and quickly 62address it. Please include a thorough description of the problem 63with code and data examples if at all possible. 64 65=head2 Reporting Bugs 66 67Report bugs to the Bioperl bug tracking system to help us keep track 68of the bugs and their resolution. Bug reports can be submitted via the 69web: 70 71 https://github.com/bioperl/bioperl-live/issues 72 73=head1 AUTHOR - Chris Fields 74 75Email cjfields-at-uiuc-dot-edu 76 77=head1 CONTRIBUTORS 78 79 Jeffrey Barrick, Michigan State University 80 Paul Cantalupo, University of Pittsburgh 81 82=head1 APPENDIX 83 84The rest of the documentation details each of the object methods. 85Internal methods are usually preceded with a _ 86 87=cut 88 89# Let the code begin... 90 91package Bio::SearchIO::infernal; 92$Bio::SearchIO::infernal::VERSION = '1.7.7'; 93use strict; 94 95use Data::Dumper; 96use base qw(Bio::SearchIO); 97 98our %MODEMAP = ( 99 'Result' => 'result', 100 'Hit' => 'hit', 101 'Hsp' => 'hsp' 102 ); 103 104our %MAPPING = ( 105 'Hsp_bit-score' => 'HSP-bits', 106 'Hsp_score' => 'HSP-score', 107 'Hsp_evalue' => 'HSP-evalue', # evalues only in v0.81, optional 108 'Hsp_pvalue' => 'HSP-pvalue', # pvalues only in v0.81, optional 109 'Hsp_query-from' => 'HSP-query_start', 110 'Hsp_query-to' => 'HSP-query_end', 111 'Hsp_query-strand'=> 'HSP-query_strand', 112 'Hsp_hit-from' => 'HSP-hit_start', 113 'Hsp_hit-to' => 'HSP-hit_end', 114 'Hsp_hit-strand' => 'HSP-hit_strand', 115 'Hsp_gaps' => 'HSP-hsp_gaps', 116 'Hsp_hitgaps' => 'HSP-hit_gaps', 117 'Hsp_querygaps' => 'HSP-query_gaps', 118 'Hsp_qseq' => 'HSP-query_seq', 119 'Hsp_ncline' => 'HSP-nc_seq', 120 'Hsp_hseq' => 'HSP-hit_seq', 121 'Hsp_midline' => 'HSP-homology_seq', 122 'Hsp_pline' => 'HSP-pp_seq', 123 'Hsp_structure' => 'HSP-meta', 124 'Hsp_align-len' => 'HSP-hsp_length', 125 'Hsp_stranded' => 'HSP-stranded', 126 127 'Hit_id' => 'HIT-name', 128 'Hit_len' => 'HIT-length', 129 'Hit_gi' => 'HIT-ncbi_gi', 130 'Hit_accession' => 'HIT-accession', 131 'Hit_desc' => 'HIT-description', 132 'Hit_def' => 'HIT-description', 133 'Hit_signif' => 'HIT-significance', # evalues in v1.1 and v0.81, optional 134 'Hit_p' => 'HIT-p', # pvalues only in 1.0, optional 135 'Hit_score' => 'HIT-score', # best HSP bit score (in v1.1, the only HSP bit score) 136 'Hit_bits' => 'HIT-bits', # best HSP bit score (ditto) 137 138 'Infernal_program' => 'RESULT-algorithm_name', # get/set 139 'Infernal_version' => 'RESULT-algorithm_version', # get/set 140 'Infernal_query-def'=> 'RESULT-query_name', # get/set 141 'Infernal_query-len'=> 'RESULT-query_length', 142 'Infernal_query-acc'=> 'RESULT-query_accession', # get/set 143 'Infernal_querydesc'=> 'RESULT-query_description', # get/set 144 'Infernal_cm' => 'RESULT-cm_name', 145 'Infernal_db' => 'RESULT-database_name', # get/set 146 'Infernal_db-len' => 'RESULT-database_entries', # in v1.1 only 147 'Infernal_db-let' => 'RESULT-database_letters', # in v1.1 only 148 ); 149 150my $MINSCORE = 0; 151my $DEFAULT_ALGORITHM = 'cmsearch'; 152my $DEFAULT_VERSION = '1.1'; 153 154my @VALID_SYMBOLS = qw(5-prime 3-prime single-strand unknown gap); 155my %STRUCTURE_SYMBOLS = ( 156 '5-prime' => '<', 157 '3-prime' => '>', 158 'single-strand' => ':', 159 'unknown' => '?', 160 'gap' => '.' 161 ); 162 163=head2 new 164 165 Title : new 166 Usage : my $obj = Bio::SearchIO::infernal->new(); 167 Function: Builds a new Bio::SearchIO::infernal object 168 Returns : Bio::SearchIO::infernal 169 Args : -fh/-file => cmsearch (infernal) filename 170 -format => 'infernal' 171 -model => query model (Rfam ID) (default undef) 172 -database => database name (default undef) 173 -query_acc => query accession, eg. Rfam accession RF#### 174 -query_desc => query description, eg. Rfam description 175 -hsp_minscore => minimum HSP score cutoff 176 -convert_meta => boolean, set to convert meta string to simple WUSS format 177 -symbols => hash ref of structure symbols to use 178 (default symbols in %STRUCTURE_SYMBOLS hash) 179 180=cut 181 182sub _initialize { 183 my ( $self, @args ) = @_; 184 $self->SUPER::_initialize(@args); 185 my ($model, $database, $convert, $symbols, $cutoff, 186 $desc, $accession, $algorithm, $version) = 187 $self->_rearrange([qw(MODEL 188 DATABASE 189 CONVERT_META 190 SYMBOLS 191 HSP_MINSCORE 192 QUERY_DESC 193 QUERY_ACC 194 ALGORITHM 195 VERSION)],@args); 196 my $handler = $self->_eventHandler; 197 $handler->register_factory( 198 'result', 199 Bio::Factory::ObjectFactory->new( 200 -type => 'Bio::Search::Result::INFERNALResult', 201 -interface => 'Bio::Search::Result::ResultI', 202 -verbose => $self->verbose 203 ) 204 ); 205 206 $handler->register_factory( 207 'hit', 208 Bio::Factory::ObjectFactory->new( 209 -type => 'Bio::Search::Hit::ModelHit', 210 -interface => 'Bio::Search::Hit::HitI', 211 -verbose => $self->verbose 212 ) 213 ); 214 215 $handler->register_factory( 216 'hsp', 217 Bio::Factory::ObjectFactory->new( 218 -type => 'Bio::Search::HSP::ModelHSP', 219 -interface => 'Bio::Search::HSP::HSPI', 220 -verbose => $self->verbose 221 ) 222 ); 223 224 defined $model && $self->model($model); 225 defined $database && $self->database($database); 226 defined $accession && $self->query_accession($accession); 227 defined $convert && $self->convert_meta($convert); 228 defined $desc && $self->query_description($desc); 229 230 $version ||= $DEFAULT_VERSION; 231 $self->version($version); 232 $symbols ||= \%STRUCTURE_SYMBOLS; 233 $self->structure_symbols($symbols); 234 $cutoff ||= $MINSCORE; 235 $self->hsp_minscore($cutoff); 236 $algorithm ||= $DEFAULT_ALGORITHM; 237 $self->algorithm($algorithm); 238} 239 240=head2 next_result 241 242 Title : next_result 243 Usage : my $hit = $searchio->next_result; 244 Function: Returns the next Result from a search 245 Returns : Bio::Search::Result::ResultI object 246 Args : none 247 248=cut 249 250sub next_result { 251 my ($self) = @_; 252 unless (exists $self->{'_handlerset'}) { 253 my $line; 254 while ($line = $self->_readline) { 255 # advance to first line 256 next if $line =~ m{^\s*$}; 257 # newer output starts with model name 258 if ($line =~ m{^\#\s+cmsearch\s}) { 259 my $secondline = $self->_readline; 260 if ($secondline =~ m{INFERNAL 1\.1}) { 261 $self->{'_handlerset'} = '1.1'; 262 } 263 else { 264 $self->{'_handlerset'} = 'latest'; # v1.0 265 } 266 $self->_pushback($secondline); 267 } 268 elsif ($line =~ m{^CM\s\d+:}) { 269 $self->{'_handlerset'} = 'pre-1.0'; 270 } 271 else { 272 $self->{'_handlerset'} ='old'; 273 } 274 last; 275 } 276 $self->_pushback($line); 277 } 278 return ($self->{'_handlerset'} eq '1.1') ? $self->_parse_v1_1 : 279 ($self->{'_handlerset'} eq 'latest') ? $self->_parse_latest : 280 ($self->{'_handlerset'} eq 'pre-1.0') ? $self->_parse_pre : 281 $self->_parse_old; 282} 283 284 285sub _parse_v1_1 { 286 my ($self) = @_; 287 my $seentop = 0; 288 local $/ = "\n"; 289 my ($accession, $description) = ($self->query_accession, $self->query_description); 290 my ($buffer, $last, %modelcounter, @hit_list, %hitindex, 291 @hsp_list, %hspindex); 292 $self->start_document(); 293 $buffer = $self->_readline; 294 if ( !defined $buffer || $buffer =~ m/^\[ok/ ) { # end of report 295 return undef; 296 } 297 else { 298 $self->_pushback($buffer); 299 } 300 301 PARSER: # Parse each line of report 302 while ( defined( $buffer = $self->_readline ) ) { 303 my $hit_counter = 0; 304 my $lineorig = $buffer; 305 chomp $buffer; 306 307 # INFERNAL program name 308 if ( $buffer =~ m/^\#\s(\S+)\s\:\:\s/ ) { 309 $seentop = 1; 310 my $prog = $1; 311 $self->start_element( { 'Name' => 'Result' } ); 312 $self->element_hash( { 'Infernal_program' => uc($prog) } ); 313 } 314 315 # INFERNAL version and release date 316 elsif ( $buffer =~ m/^\#\sINFERNAL\s+(\S+)\s+\((.+)\)/ ) { 317 my $version = $1; 318 my $versiondate = $2; 319 $self->{'_cmidline'} = $buffer; 320 $self->element_hash( { 'Infernal_version' => $version } ); 321 } 322 323 # Query info 324 elsif ( $buffer =~ /^\#\squery (?:\w+ )?file\:\s+(\S+)/ ) { 325 $self->{'_cmfileline'} = $lineorig; 326 $self->element_hash( { 'Infernal_cm' => $1 } ); 327 } 328 329 # Database info 330 elsif ( $buffer =~ m/^\#\starget\s\S+\sdatabase\:\s+(\S+)/ ) { 331 $self->{'_cmseqline'} = $lineorig; 332 $self->element_hash( { 'Infernal_db' => $1 } ); 333 } 334 335 # Query data 336 elsif ( $buffer =~ m/^Query:\s+(\S+)\s+\[CLEN=(\d+)\]$/ ) { 337 $self->element_hash( { 'Infernal_query-def' => $1, 338 'Infernal_query-len' => $2, 339 'Infernal_query-acc' => $accession, 340 'Infernal_querydesc' => $description, 341 } ); 342 } 343 344 # Get query accession 345 elsif ( $buffer =~ s/^Accession:\s+// && ! $accession) { 346 $buffer =~ s/\s+$//; 347 $self->element_hash( { 'Infernal_query-acc' => $buffer } ); 348 } 349 350 # Get query description 351 elsif ( $buffer =~ s/^Description:\s+// && ! $description) { 352 $buffer =~ s/\s+$//; 353 $self->element_hash( { 'Infernal_querydesc' => $buffer } ); 354 } 355 356 # Process hit table - including those below inclusion threshold 357 elsif ( $buffer =~ m/^Hit scores:/) { 358 @hit_list = (); # here is case there are multi-query reports 359 while ( defined( $buffer = $self->_readline ) ) { 360 if ( $buffer =~ m/^Hit alignments:/ ) { 361 $self->_pushback($buffer); 362 last; 363 } 364 elsif ( $buffer =~ m/^\s+rank\s+E-value/ 365 || $buffer =~ m/\-\-\-/ 366 || $buffer =~ m/^$/ 367 || $buffer =~ m/No hits detected/ ) { 368 next; 369 } 370 371 # Process hit 372 $hit_counter++; 373 my ($rank, $threshold, $eval, $score, 374 $bias, $hitid, $start, $end, $strand, 375 $mdl, $truc, $gc, @desc) = split( " ", $buffer ); 376 my $desc = join " ", @desc; 377 $desc = '' if ( !defined($desc) ); 378 379 push @hit_list, [ $hitid, $desc, $eval, $score ]; 380 $hitindex{ $hitid.$hit_counter } = $#hit_list; 381 } 382 } 383 384 # Process hit alignments 385 elsif ( $buffer =~ /^Hit alignments:/ ) { 386 my $hitid; 387 my $align_counter = 0; 388 while ( defined( $buffer = $self->_readline ) ) { 389 if ( $buffer =~ /^Internal CM pipeline statistics summary/ ) { 390 $self->_pushback($buffer); 391 last; 392 } 393 if ( $buffer =~ m/^\>\>\s(\S*)\s+(.*)/ ) { # defline of hit 394 $hitid = $1; 395 my $desc = $2; 396 $align_counter++; 397 my $hitid_alignctr = $hitid.$align_counter; 398 $modelcounter{$hitid_alignctr} = 0; 399 400 # The Hit Description from the Hit table can be truncated if 401 # it is too long, so use the '>>' line description instead 402 $hit_list[ $hitindex{$hitid_alignctr} ][1] = $desc; 403 404 # Process hit information table 405 while ( defined( $buffer = $self->_readline ) ) { 406 if ( $buffer =~ m/^Internal CM pipeline statistics/ 407 || $buffer =~ m/NC$/ 408 || $buffer =~ m/^\>\>/ ) { 409 $self->_pushback($buffer); 410 last; 411 } 412 elsif ( $buffer =~ m/^\s+rank\s+E-value/ 413 || $buffer =~ m/^\s----/ 414 || $buffer =~ m/^$/ 415 || $buffer =~ m/No hits detected/ ) { 416 next; 417 } 418 419 # Get hsp data from table, push into @hsp; 420 my ( $rank, $threshold, $eval, 421 $score, $bias, $model, 422 $cm_start, $cm_stop, $cm_cov, 423 $seq_start, $seq_stop, $seq_strand, $seq_cov, 424 $acc, $trunc, $gc, 425 ) = split( " ", $buffer ); 426 427 # Try to get the Hit Length from the alignment information. 428 # For cmsearch, if sequence coverage ends in ']' it means that the 429 # alignment runs full-length flush to the end of the target. 430 my $hitlength = ( $seq_cov =~ m/\]$/ ) ? $seq_stop : 0; 431 432 my $tmphit = $hit_list[ $hitindex{$hitid_alignctr} ]; 433 if ( !defined $tmphit ) { 434 $self->warn("Incomplete information: can't find HSP $hitid in list of hits\n"); 435 next; 436 } 437 438 push @hsp_list, [ $hitid, 439 $cm_start, $cm_stop, 440 $seq_start, $seq_stop, 441 $score, $eval, 442 $hitlength]; 443 $modelcounter{$hitid_alignctr}++; 444 my $hsp_key = $hitid_alignctr . "_" . $modelcounter{$hitid_alignctr}; 445 $hspindex{$hsp_key} = $#hsp_list; 446 } 447 } 448 elsif ( $buffer =~ m/NC$/ ) { # start of HSP 449 # need CS line to get number of spaces before structure data 450 my $csline = $self->_readline; 451 $csline =~ m/^(\s+)\S+ CS$/; 452 my $offset = length($1); 453 $self->_pushback($csline); 454 $self->_pushback($buffer); # set up for loop 455 456 my ($ct, $strln) = 0; 457 my $hspdata; 458 HSP: 459 my %hspline = ('0' => 'nc', '1' => 'meta', 460 '2' => 'query', '3' => 'midline', 461 '4' => 'hit', '5' => 'pp'); 462 HSP: 463 while (defined ($buffer = $self->_readline)) { 464 chomp $buffer; 465 if ( $buffer =~ /^>>\s/ 466 || $buffer =~ /^Internal CM pipeline statistics/) { 467 $self->_pushback($buffer); 468 last HSP; 469 } 470 elsif ( $ct % 6 == 0 && $buffer =~ /^$/ ) { 471 next; 472 } 473 my $iterator = $ct % 6; 474 # NC line ends with ' NC' so remove these from the strlen count 475 $strln = ( length($buffer) - 3 ) if $iterator == 0; 476 my $data = substr($buffer, $offset, $strln-$offset); 477 $hspdata->{ $hspline{$iterator} } .= $data; 478 479 $ct++; 480 } # 'HSP' while loop 481 482 my $strlen = 0; 483 # catch any insertions and add them into the actual length 484 while ($hspdata->{'query'} =~ m{\*\[\s*(\d+)\s*\]\*}g) { 485 $strlen += $1; 486 } 487 # add on the actual residues 488 $strlen += $hspdata->{'query'} =~ tr{A-Za-z}{A-Za-z}; 489 my $metastr = ($self->convert_meta) ? ($self->simple_meta($hspdata->{'meta'})) : 490 $hspdata->{'meta'}; 491 492 my $hitid_alignctr = $hitid . $align_counter; 493 my $hsp_key = $hitid_alignctr . "_" . $modelcounter{$hitid_alignctr}; 494 my $hsp = $hsp_list[ $hspindex{$hsp_key} ]; 495 push (@$hsp, $hspdata->{'nc'}, $metastr, 496 $hspdata->{'query'}, $hspdata->{'midline'}, 497 $hspdata->{'hit'}, $hspdata->{'pp'}); 498 } 499 } 500 } # end of 'Hit alignments:' section of report 501 502 # Process internal pipeline stats (end of report) 503 elsif ( $buffer =~ m/Internal CM pipeline statistics summary:/ ) { 504 while ( defined( $buffer = $self->_readline ) ) { 505 last if ( $buffer =~ m!^//! ); 506 507 if ( $buffer =~ /^Target sequences:\s+(\d+)\s+\((\d+) residues/ ) { 508 $self->element_hash( { 'Infernal_db-len' => $1, 509 'Infernal_db-let' => $2, } ); 510 } 511 } 512 last; # of the outer while defined $self->readline 513 } 514 515 # Leftovers 516 else { 517 #print STDERR "Missed line: $buffer\n"; 518 $self->debug($buffer); 519 } 520 $last = $buffer; 521 } # PARSER end 522 523 # Final processing of hits and hsps 524 my $hit_counter = 0; 525 foreach my $hit ( @hit_list ) { 526 $hit_counter++; 527 my ($hit_name, $hit_desc, $hit_signif, $hit_score) = @$hit; 528 my $num_hsp = $modelcounter{$hit_name . $hit_counter} || 0; 529 530 $self->start_element( { 'Name' => 'Hit' } ); 531 $self->element_hash( {'Hit_id' => $hit_name, 532 'Hit_desc' => $hit_desc, 533 'Hit_signif'=> $hit_signif, 534 'Hit_score' => $hit_score, 535 'Hit_bits' => $hit_score, } ); 536 for my $i ( 1 .. $num_hsp ) { 537 my $hsp_key = $hit_name . $hit_counter . "_" . $i; 538 my $hsp = $hsp_list[ $hspindex{$hsp_key} ]; 539 if ( defined $hsp ) { 540 my $hspid = shift @$hsp; 541 542 my ($cm_start, $cm_stop, $seq_start, $seq_stop, 543 $score, $eval, $hitlength, $ncline, 544 $csline, $qseq, $midline, $hseq, $pline) = @$hsp; 545 if ( $hitlength != 0 ) { 546 $self->element( 547 { 'Name' => 'Hit_len', 'Data' => $hitlength } 548 ); 549 } 550 551 $self->start_element( { 'Name' => 'Hsp' } ); 552 $self->element_hash( { 'Hsp_stranded' => 'HIT', 553 'Hsp_query-from' => $cm_start, 554 'Hsp_query-to' => $cm_stop, 555 'Hsp_hit-from' => $seq_start, 556 'Hsp_hit-to' => $seq_stop, 557 'Hsp_score' => $score, 558 'Hsp_bit-score' => $score, 559 'Hsp_evalue' => $eval, 560 'Hsp_ncline' => $ncline, 561 'Hsp_structure' => $csline, 562 'Hsp_qseq' => $qseq, 563 'Hsp_midline' => $midline, 564 'Hsp_hseq' => $hseq, 565 'Hsp_pline' => $pline, 566 } ); 567 568 $self->end_element( { 'Name' => 'Hsp' } ); 569 } 570 } 571 $self->end_element( { 'Name' => 'Hit' } ); 572 } 573 574 $self->end_element( { 'Name' => 'Result' } ); 575 my $result = $self->end_document(); 576 return $result; 577} 578 579 580=head2 start_element 581 582 Title : start_element 583 Usage : $eventgenerator->start_element 584 Function: Handles a start element event 585 Returns : none 586 Args : hashref with at least 2 keys 'Data' and 'Name' 587 588 589=cut 590 591sub start_element { 592 my ( $self, $data ) = @_; 593 594 # we currently don't care about attributes 595 my $nm = $data->{'Name'}; 596 my $type = $MODEMAP{$nm}; 597 if ($type) { 598 if ( $self->_eventHandler->will_handle($type) ) { 599 my $func = sprintf( "start_%s", lc $type ); 600 $self->_eventHandler->$func( $data->{'Attributes'} ); 601 } 602 unshift @{ $self->{'_elements'} }, $type; 603 } 604 if ( defined $type 605 && $type eq 'result' ) 606 { 607 $self->{'_values'} = {}; 608 $self->{'_result'} = undef; 609 } 610} 611 612=head2 end_element 613 614 Title : start_element 615 Usage : $eventgenerator->end_element 616 Function: Handles an end element event 617 Returns : none 618 Args : hashref with at least 2 keys, 'Data' and 'Name' 619 620=cut 621 622sub end_element { 623 my ( $self, $data ) = @_; 624 my $nm = $data->{'Name'}; 625 my $type = $MODEMAP{$nm}; 626 my $rc; 627 628 if ($type) { 629 if ( $self->_eventHandler->will_handle($type) ) { 630 my $func = sprintf( "end_%s", lc $type ); 631 $rc = $self->_eventHandler->$func( $self->{'_reporttype'}, 632 $self->{'_values'} ); 633 } 634 my $lastelem = shift @{ $self->{'_elements'} }; 635 636 # Infernal 1.1 allows one to know hit->length in some instances 637 # so remove it so it doesn't carry over to next hit. Tried flushing 638 # all 'type' values from {_values} buffer but it breaks legacy tests 639 if ($type eq 'hit' ) { 640 delete $self->{_values}{'HIT-length'}; 641 delete $self->{_values}{'HSP-hit_length'}; 642 } 643 } 644 elsif ( $MAPPING{$nm} ) { 645 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) { 646 my $key = ( keys %{ $MAPPING{$nm} } )[0]; 647 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } = 648 $self->{'_last_data'}; 649 } 650 else { 651 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'}; 652 } 653 } 654 else { 655 $self->debug("unknown nm $nm, ignoring\n"); 656 } 657 $self->{'_last_data'} = ''; # remove read data if we are at 658 # end of an element 659 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' ); 660 return $rc; 661} 662 663=head2 element 664 665 Title : element 666 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str}); 667 Function: Convenience method that calls start_element, characters, end_element 668 Returns : none 669 Args : Hash ref with the keys 'Name' and 'Data' 670 671=cut 672 673sub element { 674 my ( $self, $data ) = @_; 675 # simple data calls (%MAPPING) do not need start_element 676 $self->characters($data); 677 $self->end_element($data); 678} 679 680=head2 element_hash 681 682 Title : element 683 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start, 684 'Hsp_hit-to' => $end, 685 'Hsp_score' => $lastscore}); 686 Function: Convenience method that takes multiple simple data elements and 687 maps to appropriate parameters 688 Returns : none 689 Args : Hash ref with the mapped key (in %MAPPING) and value 690 691=cut 692 693sub element_hash { 694 my ($self, $data) = @_; 695 $self->throw("Must provide data hash ref") if !$data || !ref($data); 696 for my $nm (sort keys %{$data}) { 697 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o; 698 if ( $MAPPING{$nm} ) { 699 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) { 700 my $key = ( keys %{ $MAPPING{$nm} } )[0]; 701 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } = 702 $data->{$nm}; 703 } 704 else { 705 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm}; 706 } 707 } 708 } 709} 710 711=head2 characters 712 713 Title : characters 714 Usage : $eventgenerator->characters($str) 715 Function: Send a character events 716 Returns : none 717 Args : string 718 719 720=cut 721 722sub characters { 723 my ( $self, $data ) = @_; 724 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o ); 725 $self->{'_last_data'} = $data->{'Data'}; 726} 727 728=head2 within_element 729 730 Title : within_element 731 Usage : if( $eventgenerator->within_element($element) ) {} 732 Function: Test if we are within a particular element 733 This is different than 'in' because within can be tested 734 for a whole block. 735 Returns : boolean 736 Args : string element name 737 738=cut 739 740sub within_element { 741 my ( $self, $name ) = @_; 742 return 0 743 if ( !defined $name 744 || !defined $self->{'_elements'} 745 || scalar @{ $self->{'_elements'} } == 0 ); 746 foreach ( @{ $self->{'_elements'} } ) { 747 return 1 if ( $_ eq $name ); 748 } 749 return 0; 750} 751 752=head2 in_element 753 754 Title : in_element 755 Usage : if( $eventgenerator->in_element($element) ) {} 756 Function: Test if we are in a particular element 757 This is different than 'within' because 'in' only 758 tests its immediate parent. 759 Returns : boolean 760 Args : string element name 761 762=cut 763 764sub in_element { 765 my ( $self, $name ) = @_; 766 return 0 if !defined $self->{'_elements'}->[0]; 767 return ( $self->{'_elements'}->[0] eq $name ); 768} 769 770=head2 start_document 771 772 Title : start_document 773 Usage : $eventgenerator->start_document 774 Function: Handle a start document event 775 Returns : none 776 Args : none 777 778=cut 779 780sub start_document { 781 my ($self) = @_; 782 $self->{'_lasttype'} = ''; 783 $self->{'_values'} = {}; 784 $self->{'_result'} = undef; 785 $self->{'_elements'} = []; 786} 787 788=head2 end_document 789 790 Title : end_document 791 Usage : $eventgenerator->end_document 792 Function: Handles an end document event 793 Returns : Bio::Search::Result::ResultI object 794 Args : none 795 796=cut 797 798sub end_document { 799 my ($self) = @_; 800 return $self->{'_result'}; 801} 802 803=head2 result_count 804 805 Title : result_count 806 Usage : my $count = $searchio->result_count 807 Function: Returns the number of results we have processed 808 Returns : integer 809 Args : none 810 811=cut 812 813sub result_count { 814 my $self = shift; 815 return $self->{'_result_count'}; 816} 817 818=head2 model 819 820 Title : model 821 Usage : my $model = $parser->model(); 822 Function: Get/Set model; Infernal currently does not output 823 the model name (Rfam ID) 824 Returns : String (name of model) 825 Args : [optional] String (name of model) 826 827=cut 828 829sub model { 830 my $self = shift; 831 return $self->{'_model'} = shift if @_; 832 return $self->{'_model'}; 833} 834 835=head2 database 836 837 Title : database 838 Usage : my $database = $parser->database(); 839 Function: Get/Set database; pre-v.1 versions of Infernal do not output 840 the database name 841 Returns : String (database name) 842 Args : [optional] String (database name) 843 844=cut 845 846sub database { 847 my $self = shift; 848 return $self->{'_database'} = shift if @_; 849 return $self->{'_database'}; 850} 851 852=head2 algorithm 853 854 Title : algorithm 855 Usage : my $algorithm = $parser->algorithm(); 856 Function: Get/Set algorithm; pre-v.1 versions of Infernal do not output 857 the algorithm name 858 Returns : String (algorithm name) 859 Args : [optional] String (algorithm name) 860 861=cut 862 863sub algorithm { 864 my $self = shift; 865 return $self->{'_algorithm'} = shift if @_; 866 return $self->{'_algorithm'}; 867} 868 869=head2 query_accession 870 871 Title : query_accession 872 Usage : my $acc = $parser->query_accession(); 873 Function: Get/Set query (model) accession; pre-v1.1 Infernal does not output 874 the accession number (Rfam accession #) 875 Returns : String (accession) 876 Args : [optional] String (accession) 877 878=cut 879 880sub query_accession { 881 my $self = shift; 882 return $self->{'_query_accession'} = shift if @_; 883 return $self->{'_query_accession'}; 884} 885 886=head2 query_description 887 888 Title : query_description 889 Usage : my $acc = $parser->query_description(); 890 Function: Get/Set query (model) description; pre-v1.1 Infernal does not output 891 the Rfam description 892 Returns : String (description) 893 Args : [optional] String (description) 894 895=cut 896 897sub query_description { 898 my $self = shift; 899 return $self->{'_query_description'} = shift if @_; 900 return $self->{'_query_description'}; 901} 902 903=head2 hsp_minscore 904 905 Title : hsp_minscore 906 Usage : my $cutoff = $parser->hsp_minscore(); 907 Function: Get/Set min bit score cutoff (for generating Hits/HSPs) 908 Returns : score (number) 909 Args : [optional] score (number) 910 911=cut 912 913sub hsp_minscore { 914 my $self = shift; 915 return $self->{'_hsp_minscore'} = shift if @_; 916 return $self->{'_hsp_minscore'}; 917} 918 919=head2 convert_meta 920 921 Title : convert_meta 922 Usage : $parser->convert_meta(1); 923 Function: Get/Set boolean flag for converting Infernal WUSS format 924 to a simple bracketed format (simple WUSS by default) 925 Returns : boolean flag (TRUE or FALSE) 926 Args : [optional] boolean (eval's to TRUE or FALSE) 927 928=cut 929 930sub convert_meta { 931 my $self = shift; 932 return $self->{'_convert_meta'} = shift if @_; 933 return $self->{'_convert_meta'}; 934} 935 936=head2 version 937 938 Title : version 939 Usage : $parser->version(); 940 Function: Set the Infernal cmsearch version 941 Returns : version 942 Args : [optional] version 943 944=cut 945 946sub version { 947 my $self = shift; 948 return $self->{'_version'} = shift if @_; 949 return $self->{'_version'}; 950} 951 952=head2 structure_symbols 953 954 Title : structure_symbols 955 Usage : my $hashref = $parser->structure_symbols(); 956 Function: Get/Set RNA structure symbols 957 Returns : Hash ref of delimiters (5' stem, 3' stem, single-strand, etc) 958 : default = < (5-prime) 959 > (3-prime) 960 : (single-strand) 961 ? (unknown) 962 . (gap) 963 Args : Hash ref of substitute delimiters, using above keys. 964 965=cut 966 967sub structure_symbols { 968 my ($self, $delim) = @_; 969 if ($delim) { 970 if (ref($delim) =~ m{HASH}) { 971 my %data = %{ $delim }; 972 for my $d (@VALID_SYMBOLS) { 973 if ( exists $data{$d} ) { 974 $self->{'_delimiter'}->{$d} = $data{$d}; 975 } 976 } 977 } else { 978 $self->throw("Args to helix_delimiters() should be in a hash reference"); 979 } 980 } 981 return $self->{'_delimiter'}; 982} 983 984=head2 simple_meta 985 986 Title : simple_meta 987 Usage : my $string = $parser->simple_meta($str); 988 Function: converts more complex WUSS meta format into simple bracket format 989 using symbols defined in structure_symbols() 990 Returns : converted string 991 Args : [required] string to convert 992 Note : This is a very simple conversion method to get simple bracketed 993 format from Infernal data. If the convert_meta() flag is set, 994 this is the method used to convert the strings. 995 996=cut 997 998sub simple_meta { 999 my ($self, $str) = @_; 1000 $self->throw("No string arg sent!") if !$str; 1001 my $structs = $self->structure_symbols(); 1002 my ($ls, $rs, $ss, $unk, $gap) = ($structs->{'5-prime'}, $structs->{'3-prime'}, 1003 $structs->{'single-strand'}, $structs->{'unknown'}, 1004 $structs->{'gap'}); 1005 $str =~ s{[\(\<\[\{]}{$ls}g; 1006 $str =~ s{[\)\>\]\}]}{$rs}g; 1007 $str =~ s{[:,_-]}{$ss}g; 1008 $str =~ s{\.}{$gap}g; 1009 # unknown not handled yet 1010 return $str; 1011} 1012 1013## private methods 1014 1015# this is a hack which guesses the format and sets the handler for parsing in 1016# an instance; it'll be taken out when infernal 1.0 is released 1017 1018sub _parse_latest { 1019 my ($self) = @_; 1020 my $seentop = 0; 1021 local $/ = "\n"; 1022 my ($accession, $description) = ($self->query_accession, $self->query_description); 1023 my ($maxscore, $mineval, $minpval); 1024 $self->start_document(); 1025 my ($lasthit, $lastscore, $lasteval, $lastpval, $laststart, $lastend); 1026 PARSER: 1027 while (my $line = $self->_readline) { 1028 next if $line =~ m{^\s+$}; 1029 # stats aren't parsed yet... 1030 if ($line =~ m{^\#\s+cmsearch}xms) { 1031 $seentop = 1; 1032 $self->start_element({'Name' => 'Result'}); 1033 $self->element_hash({ 1034 'Infernal_program' => 'CMSEARCH' 1035 }); 1036 } 1037 elsif ($line =~ m{^\#\sINFERNAL\s+(\d+\.\d+)}xms) { 1038 $self->element_hash({ 1039 'Infernal_version' => $1, 1040 }); 1041 } 1042 elsif ($line =~ m{^\#\scommand:.*?\s(\S+)$}xms) { 1043 $self->element_hash({ 1044 'Infernal_db' => $1, 1045 }); 1046 } 1047 elsif ($line =~ m{^\#\s+dbsize\(Mb\):\s+(\d+\.\d+)}xms) { 1048 # store absolute DB length 1049 $self->element_hash({ 1050 'Infernal_db-let' => $1 * 1e6 1051 }); 1052 } 1053 elsif ($line =~ m{^CM(?:\s(\d+))?:\s*(\S+)}xms) { 1054 # not sure, but it's possible single reports may contain multiple 1055 # models; if so, they should be rolled over into a new ResultI 1056 #print STDERR "ACC: $accession\nDESC: $description\n"; 1057 $self->element_hash({ 1058 'Infernal_query-def' => $2, # present in output now 1059 'Infernal_query-acc' => $accession, 1060 'Infernal_querydesc' => $description 1061 }); 1062 } 1063 elsif ($line =~ m{^>\s*(\S+)} ){ 1064 #$self->debug("Start Hit: Found hit:$1\n"); 1065 if ($self->in_element('hit')) { 1066 $self->element_hash({'Hit_score' => $maxscore, 1067 'Hit_bits' => $maxscore}); 1068 ($maxscore, $minpval, $mineval) = undef; 1069 $self->end_element({'Name' => 'Hit'}); 1070 } 1071 $lasthit = $1; 1072 } 1073 elsif ($line =~ m{ 1074 ^\sQuery\s=\s\d+\s-\s\d+,\s # Query start/end 1075 Target\s=\s(\d+)\s-\s(\d+) # Target start/end 1076 }xmso) { 1077 # Query (model) start/end always the same, determined from 1078 # the HSP length 1079 ($laststart, $lastend) = ($1, $2); 1080 #$self->debug("Found hit coords:$laststart - $lastend\n"); 1081 } elsif ($line =~ m{ 1082 ^\sScore\s=\s([\d\.]+),\s # Score = Bitscore (for now) 1083 (?:E\s=\s([\d\.e-]+),\s # E-val optional 1084 P\s=\s([\d\.e-]+),\s)? # P-val optional 1085 GC\s= # GC not captured 1086 }xmso 1087 ) { 1088 ($lastscore, $lasteval, $lastpval) = ($1, $2, $3); 1089 #$self->debug(sprintf("Found hit data:Score:%s,Eval:%s,Pval:%s\n",$lastscore, $lasteval||'', $lastpval||'')); 1090 $maxscore ||= $lastscore; 1091 if ($lasteval && $lastpval) { 1092 $mineval ||= $lasteval; 1093 $minpval ||= $lastpval; 1094 $mineval = ($mineval > $lasteval) ? $lasteval : 1095 $mineval; 1096 $minpval = ($minpval > $lastpval) ? $lastpval : 1097 $minpval; 1098 } 1099 $maxscore = ($maxscore < $lastscore) ? $lastscore : 1100 $maxscore; 1101 if (!$self->within_element('hit')) { 1102 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($lasthit); 1103 $self->start_element({'Name' => 'Hit'}); 1104 $self->element_hash({ 1105 'Hit_id' => $lasthit, 1106 'Hit_accession' => $ver ? "$acc.$ver" : 1107 $acc ? $acc : $lasthit, 1108 'Hit_gi' => $gi 1109 }); 1110 } 1111 if (!$self->in_element('hsp')) { 1112 $self->start_element({'Name' => 'Hsp'}); 1113 } 1114 1115 # hsp is similar to older output 1116 } elsif ($line =~ m{^(\s+)[<>\{\}\(\)\[\]:_,-\.]+}xms) { # start of HSP 1117 $self->_pushback($line); # set up for loop 1118 #$self->debug("Start HSP\n"); 1119 # what is length of the gap to the structure data? 1120 my $offset = length($1); 1121 my ($ct, $strln) = 0; 1122 my $hsp; 1123 HSP: 1124 my %hsp_key = ('0' => 'meta', 1125 '1' => 'query', 1126 '2' => 'midline', 1127 '3' => 'hit'); 1128 HSP: 1129 while (defined ($line = $self->_readline)) { 1130 chomp $line; 1131 next if (!$line); # toss empty lines 1132 # next if $line =~ m{^\s*$}; # toss empty lines 1133 # it is possible to have homology lines consisting 1134 # entirely of spaces if the subject has a large 1135 # insertion where nothing matches the model 1136 1137 # exit loop if at end of file or upon next hit/HSP 1138 if ($line =~ m{^\s{0,2}\S+}) { 1139 $self->_pushback($line); 1140 last HSP; 1141 } 1142 # iterate to keep track of each line (4 lines per hsp block) 1143 my $iterator = $ct % 4; 1144 # strlen set only with structure lines (proper length) 1145 $strln = length($line) if $iterator == 0; 1146 # only grab the data needed (hit start and stop in hit line above) 1147 1148 my $data = substr($line, $offset, $strln-$offset); 1149 $hsp->{ $hsp_key{$iterator} } .= $data; 1150 1151 $ct++; 1152 } 1153 1154 # query start, end are from the actual query length (entire hit is 1155 # mapped to CM data, so all CM data is represented) 1156 # works for now... 1157 if ($self->in_element('hsp')) { 1158 # In some cases with HSPs unaligned residues are present in 1159 # the hit or query (Ex: '*[ 8]*' is 8 unaligned residues). 1160 # This info needs to be passed on unmodifed to the HSP class 1161 # and handled there as it is subjectively changed based on 1162 # use. 1163 my $strlen = 0; 1164 1165 # catch any insertions and add them into the actual length 1166 while ($hsp->{'query'} =~ m{\*\[\s*(\d+)\s*\]\*}g) { 1167 $strlen += $1; 1168 } 1169 # add on the actual residues 1170 $strlen += $hsp->{'query'} =~ tr{A-Za-z}{A-Za-z}; 1171 1172 my $metastr = ($self->convert_meta) ? ($self->simple_meta($hsp->{'meta'})) : 1173 $hsp->{'meta'}; 1174 $self->element_hash( 1175 {'Hsp_stranded' => 'HIT', 1176 'Hsp_qseq' => $hsp->{'query'}, 1177 'Hsp_hseq' => $hsp->{'hit'}, 1178 'Hsp_midline' => $hsp->{'midline'}, 1179 'Hsp_structure' => $metastr, 1180 'Hsp_query-from' => 1, 1181 'Infernal_query-len' => $strlen, 1182 'Hsp_query-to' => $strlen, 1183 'Hsp_hit-from' => $laststart, 1184 'Hsp_hit-to' => $lastend, 1185 'Hsp_score' => $lastscore, 1186 'Hsp_bit-score' => $lastscore, 1187 }); 1188 $self->element_hash( 1189 {'Hsp_evalue' => $lasteval, 1190 'Hsp_pvalue' => $lastpval, 1191 }) if ($lasteval && $lastpval); 1192 $self->end_element({'Name' => 'Hsp'}); 1193 } 1194 # result now ends with // and 'Fin' 1195 } elsif ($line =~ m{^//}xms ) { 1196 if ($self->within_element('result') && $seentop) { 1197 if ($self->in_element('hit')) { 1198 $self->element_hash({'Hit_score' => $maxscore, 1199 'Hit_bits' => $maxscore}); 1200 # don't know where to put minpval yet 1201 $self->element_hash({'Hit_signif' => $mineval}) if $mineval; 1202 $self->element_hash({'Hit_p' => $minpval}) if $minpval; 1203 $self->end_element({'Name' => 'Hit'}); 1204 } 1205 last PARSER; 1206 } 1207 } 1208 } 1209 $self->within_element('hit') && $self->end_element( { 'Name' => 'Hit' } ); 1210 $self->end_element( { 'Name' => 'Result' } ) if $seentop; 1211 return $self->end_document(); 1212} 1213 1214# cmsearch 0.81 (pre-1.0) 1215sub _parse_pre { 1216 my ($self) = @_; 1217 my $seentop = 0; 1218 local $/ = "\n"; 1219 my ($accession, $db, $algorithm, $description, $version) = 1220 ($self->query_accession, $self->database, $self->algorithm, 1221 $self->query_description, '0.81'); 1222 my ($maxscore, $mineval, $minpval); 1223 $self->start_document(); 1224 my ($lasthit, $lastscore, $lasteval, $lastpval, $laststart, $lastend); 1225 PARSER: 1226 while (my $line = $self->_readline) { 1227 next if $line =~ m{^\s+$}; 1228 # stats aren't parsed yet... 1229 if ($line =~ m{CM\s\d+:\s*(\S+)}xms) { 1230 #$self->debug("Start Result: Found model:$1\n"); 1231 if (!$self->within_element('result')) { 1232 $seentop = 1; 1233 $self->start_element({'Name' => 'Result'}); 1234 $self->element_hash({ 1235 'Infernal_program' => $algorithm, 1236 'Infernal_query-def' => $1, # present in output now 1237 'Infernal_query-acc' => $accession, 1238 'Infernal_querydesc' => $description, 1239 'Infernal_db' => $db 1240 }); 1241 } 1242 } elsif ($line =~ m{^>\s*(\S+)} ){ 1243 #$self->debug("Start Hit: Found hit:$1\n"); 1244 if ($self->in_element('hit')) { 1245 $self->element_hash({'Hit_score' => $maxscore, 1246 'Hit_bits' => $maxscore}); 1247 ($maxscore, $minpval, $mineval) = undef; 1248 $self->end_element({'Name' => 'Hit'}); 1249 } 1250 $lasthit = $1; 1251 } 1252 elsif ($line =~ m{ 1253 ^\sQuery\s=\s\d+\s-\s\d+,\s # Query start/end 1254 Target\s=\s(\d+)\s-\s(\d+) # Target start/end 1255 }xmso) { 1256 # Query (model) start/end always the same, determined from 1257 # the HSP length 1258 ($laststart, $lastend) = ($1, $2); 1259 #$self->debug("Found hit coords:$laststart - $lastend\n"); 1260 } elsif ($line =~ m{ 1261 ^\sScore\s=\s([\d\.]+),\s # Score = Bitscore (for now) 1262 (?:E\s=\s([\d\.e-]+),\s # E-val optional 1263 P\s=\s([\d\.e-]+),\s)? # P-val optional 1264 GC\s= # GC not captured 1265 }xmso 1266 ) { 1267 ($lastscore, $lasteval, $lastpval) = ($1, $2, $3); 1268 #$self->debug(sprintf("Found hit data:Score:%s,Eval:%s,Pval:%s\n",$lastscore, $lasteval||'', $lastpval||'')); 1269 $maxscore ||= $lastscore; 1270 if ($lasteval && $lastpval) { 1271 $mineval ||= $lasteval; 1272 $minpval ||= $lastpval; 1273 $mineval = ($mineval > $lasteval) ? $lasteval : 1274 $mineval; 1275 $minpval = ($minpval > $lastpval) ? $lastpval : 1276 $minpval; 1277 } 1278 $maxscore = ($maxscore < $lastscore) ? $lastscore : 1279 $maxscore; 1280 if (!$self->within_element('hit')) { 1281 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($lasthit); 1282 $self->start_element({'Name' => 'Hit'}); 1283 $self->element_hash({ 1284 'Hit_id' => $lasthit, 1285 'Hit_accession' => $ver ? "$acc.$ver" : 1286 $acc ? $acc : $lasthit, 1287 'Hit_gi' => $gi 1288 }); 1289 } 1290 if (!$self->in_element('hsp')) { 1291 $self->start_element({'Name' => 'Hsp'}); 1292 } 1293 1294 # hsp is similar to older output 1295 } elsif ($line =~ m{^(\s+)[<>\{\}\(\)\[\]:_,-\.]+}xms) { # start of HSP 1296 $self->_pushback($line); # set up for loop 1297 #$self->debug("Start HSP\n"); 1298 # what is length of the gap to the structure data? 1299 my $offset = length($1); 1300 my ($ct, $strln) = 0; 1301 my $hsp; 1302 HSP: 1303 my %hsp_key = ('0' => 'meta', 1304 '1' => 'query', 1305 '2' => 'midline', 1306 '3' => 'hit'); 1307 HSP: 1308 while (defined ($line = $self->_readline)) { 1309 chomp $line; 1310 next if (!$line); # toss empty lines 1311 # next if $line =~ m{^\s*$}; # toss empty lines 1312 # it is possible to have homology lines consisting 1313 # entirely of spaces if the subject has a large 1314 # insertion where nothing matches the model 1315 1316 # exit loop if at end of file or upon next hit/HSP 1317 if ($line =~ m{^\s{0,2}\S+}) { 1318 $self->_pushback($line); 1319 last HSP; 1320 } 1321 # iterate to keep track of each line (4 lines per hsp block) 1322 my $iterator = $ct%4; 1323 # strlen set only with structure lines (proper length) 1324 $strln = length($line) if $iterator == 0; 1325 # only grab the data needed (hit start and stop in hit line above) 1326 1327 my $data = substr($line, $offset, $strln-$offset); 1328 $hsp->{ $hsp_key{$iterator} } .= $data; 1329 1330 $ct++; 1331 } 1332 1333 # query start, end are from the actual query length (entire hit is 1334 # mapped to CM data, so all CM data is represented) 1335 # works for now... 1336 if ($self->in_element('hsp')) { 1337 my $strlen = $hsp->{'query'} =~ tr{A-Za-z}{A-Za-z}; 1338 1339 my $metastr; 1340 $metastr = ($self->convert_meta) ? ($self->simple_meta($hsp->{'meta'})) : 1341 ($hsp->{'meta'}); 1342 $self->element_hash( 1343 {'Hsp_stranded' => 'HIT', 1344 'Hsp_qseq' => $hsp->{'query'}, 1345 'Hsp_hseq' => $hsp->{'hit'}, 1346 'Hsp_midline' => $hsp->{'midline'}, 1347 'Hsp_structure' => $metastr, 1348 'Hsp_query-from' => 1, 1349 'Infernal_query-len' => $strlen, 1350 'Hsp_query-to' => $strlen, 1351 'Hsp_hit-from' => $laststart, 1352 'Hsp_hit-to' => $lastend, 1353 'Hsp_score' => $lastscore, 1354 'Hsp_bit-score' => $lastscore, 1355 }); 1356 $self->element_hash( 1357 {'Hsp_evalue' => $lasteval, 1358 'Hsp_pvalue' => $lastpval, 1359 }) if ($lasteval && $lastpval); 1360 $self->end_element({'Name' => 'Hsp'}); 1361 } 1362 # result now ends with // and 'Fin' 1363 } elsif ($line =~ m{^//}xms ) { 1364 if ($self->within_element('result') && $seentop) { 1365 $self->element( 1366 {'Name' => 'Infernal_version', 1367 'Data' => $version} 1368 ); 1369 if ($self->in_element('hit')) { 1370 $self->element_hash({'Hit_score' => $maxscore, 1371 'Hit_bits' => $maxscore}); 1372 # don't know where to put minpval yet 1373 $self->element_hash({'Hit_signif' => $mineval}) if $mineval; 1374 $self->end_element({'Name' => 'Hit'}); 1375 } 1376 last PARSER; 1377 } 1378 } 1379 } 1380 $self->within_element('hit') && $self->end_element( { 'Name' => 'Hit' } ); 1381 $self->end_element( { 'Name' => 'Result' } ) if $seentop; 1382 return $self->end_document(); 1383} 1384 1385# cmsearch 0.72 and below; will likely be dropped when Infernal 1.0 is released 1386sub _parse_old { 1387 my ($self) = @_; 1388 my $seentop = 0; 1389 local $/ = "\n"; 1390 my ($accession, $db, $algorithm, $model, $description, $version) = 1391 ($self->query_accession, $self->database, $self->algorithm, 1392 $self->model, $self->query_description, $self->version); 1393 my $maxscore; 1394 my $cutoff = $self->hsp_minscore; 1395 $self->start_document(); 1396 local ($_); 1397 my $line; 1398 my ($lasthit, $lastscore, $laststart, $lastend); 1399 my $hitline; 1400 PARSER: 1401 while ( defined( $line = $self->_readline ) ) { 1402 next if $line =~ m{^\s+$}; 1403 # bypass this for now... 1404 next if $line =~ m{^HMM\shit}; 1405 # pre-0.81 1406 if ($line =~ m{^sequence:\s+(\S+)} ){ 1407 if (!$self->within_element('result')) { 1408 $seentop = 1; 1409 $self->start_element({'Name' => 'Result'}); 1410 $self->element_hash({ 1411 'Infernal_program' => $algorithm, 1412 'Infernal_query-def' => $model, 1413 'Infernal_query-acc' => $accession, 1414 'Infernal_querydesc' => $description, 1415 'Infernal_db' => $db 1416 }); 1417 } 1418 if ($self->in_element('hit')) { 1419 $self->element_hash({'Hit_score' => $maxscore, 1420 'Hit_bits' => $maxscore}); 1421 $maxscore = undef; 1422 $self->end_element({'Name' => 'Hit'}); 1423 } 1424 $lasthit = $1; 1425 } elsif ($line =~ m{^hit\s+\d+\s+:\s+(\d+)\s+(\d+)\s+(\d+\.\d+)\s+bits}xms) { 1426 ($laststart, $lastend, $lastscore) = ($1, $2, $3); 1427 $maxscore = $lastscore unless $maxscore; 1428 if ($lastscore > $cutoff) { 1429 if (!$self->within_element('hit')) { 1430 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($lasthit); 1431 $self->start_element({'Name' => 'Hit'}); 1432 $self->element_hash({ 1433 'Hit_id' => $lasthit, 1434 'Hit_accession' => $ver ? "$acc.$ver" : 1435 $acc ? $acc : $lasthit, 1436 'Hit_gi' => $gi 1437 }); 1438 } 1439 # necessary as infernal 0.71 has repeated hit line 1440 if (!$self->in_element('hsp')) { 1441 $self->start_element({'Name' => 'Hsp'}); 1442 } 1443 $maxscore = ($maxscore < $lastscore) ? $lastscore : 1444 $maxscore; 1445 } 1446 } elsif ($line =~ m{^(\s+)[<>\{\}\(\)\[\]:_,-\.]+}xms) { # start of HSP 1447 $self->_pushback($line); # set up for loop 1448 # what is length of the gap to the structure data? 1449 my $offset = length($1); 1450 my ($ct, $strln) = 0; 1451 my $hsp; 1452 HSP: 1453 my %hsp_key = ('0' => 'meta', 1454 '1' => 'query', 1455 '2' => 'midline', 1456 '3' => 'hit'); 1457 HSP: 1458 while ($line = $self->_readline) { 1459 next if $line =~ m{^\s*$}; # toss empty lines 1460 chomp $line; 1461 # exit loop if at end of file or upon next hit/HSP 1462 if (!defined($line) || $line =~ m{^\S+}) { 1463 $self->_pushback($line); 1464 last HSP; 1465 } 1466 # iterate to keep track of each line (4 lines per hsp block) 1467 my $iterator = $ct%4; 1468 # strlen set only with structure lines (proper length) 1469 $strln = length($line) if $iterator == 0; 1470 # only grab the data needed (hit start and stop in hit line above) 1471 1472 my $data = substr($line, $offset, $strln-$offset); 1473 $hsp->{ $hsp_key{$iterator} } .= $data; 1474 $ct++; 1475 } 1476 # query start, end are from the actual query length (entire hit is 1477 # mapped to CM data, so all CM data is represented) 1478 # works for now... 1479 if ($self->in_element('hsp')) { 1480 my $strlen = $hsp->{'query'} =~ tr{A-Za-z}{A-Za-z}; 1481 1482 my $metastr; 1483 # Ugh...these should be passed in a hash 1484 $metastr = ($self->convert_meta) ? ($self->simple_meta($hsp->{'meta'})) : 1485 ($hsp->{'meta'}); 1486 $self->element_hash( 1487 {'Hsp_stranded' => 'HIT', 1488 'Hsp_qseq' => $hsp->{'query'}, 1489 'Hsp_hseq' => $hsp->{'hit'}, 1490 'Hsp_midline' => $hsp->{'midline'}, 1491 'Hsp_structure' => $metastr, 1492 'Hsp_query-from' => 1, 1493 'Infernal_query-len' => $strlen, 1494 'Hsp_query-to' => $strlen, 1495 'Hsp_hit-from' => $laststart, 1496 'Hsp_hit-to' => $lastend, 1497 'Hsp_score' => $lastscore, 1498 'Hsp_bit-score' => $lastscore 1499 }); 1500 $self->end_element({'Name' => 'Hsp'}); 1501 } 1502 } elsif ($line =~ m{^memory}xms || $line =~ m{^CYK\smemory}xms ) { 1503 if ($self->within_element('result') && $seentop) { 1504 $self->element( 1505 {'Name' => 'Infernal_version', 1506 'Data' => $version} 1507 ); 1508 if ($self->in_element('hit')) { 1509 $self->element_hash({'Hit_score' => $maxscore, 1510 'Hit_bits' => $maxscore}); 1511 $self->end_element({'Name' => 'Hit'}); 1512 } 1513 last PARSER; 1514 } 1515 } 1516 } 1517 $self->within_element('hit') && $self->end_element( { 'Name' => 'Hit' } ); 1518 $self->end_element( { 'Name' => 'Result' } ) if $seentop; 1519 return $self->end_document(); 1520} 1521 15221; 1523