1#
2# BioPerl module for Bio::SearchIO::infernal
3#
4# Please direct questions and support issues to <bioperl-l@bioperl.org>
5#
6# Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
7#
8# Copyright Chris Fields
9#
10# You may distribute this module under the same terms as perl itself
11
12# POD documentation - main docs before the code
13
14=head1 NAME
15
16Bio::SearchIO::infernal - SearchIO-based Infernal parser
17
18=head1 SYNOPSIS
19
20  my $parser = Bio::SearchIO->new(-format => 'infernal',
21                                  -file => 'purine.inf');
22  while( my $result = $parser->next_result ) {
23        # general result info, such as model used, Infernal version
24        while( my $hit = $result->next_hit ) {
25            while( my $hsp = $hit->next_hsp ) {
26                # ...
27            }
28        }
29  }
30
31=head1 DESCRIPTION
32
33This is a SearchIO-based parser for Infernal output from the cmsearch program.
34It currently parses cmsearch output for Infernal versions 0.7-1.1; older
35versions may work but will not be supported.
36
37The latest version of Infernal is 1.1. The output has changed substantially
38relative to version 1.0. Versions 1.x are stable releases (and output has
39stabilized) therefore it is highly recommended that users upgrade to using
40the latest Infernal release. Support for the older pre-v.1 developer releases
41will be dropped for future core 1.6 releases.
42
43=head1 FEEDBACK
44
45=head2 Mailing Lists
46
47User feedback is an integral part of the evolution of this and other
48Bioperl modules. Send your comments and suggestions preferably to
49the Bioperl mailing list.  Your participation is much appreciated.
50
51  bioperl-l@bioperl.org                  - General discussion
52  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
53
54=head2 Support
55
56Please direct usage questions or support issues to the mailing list:
57
58I<bioperl-l@bioperl.org>
59
60rather than to the module maintainer directly. Many experienced and
61reponsive experts will be able look at the problem and quickly
62address it. Please include a thorough description of the problem
63with code and data examples if at all possible.
64
65=head2 Reporting Bugs
66
67Report bugs to the Bioperl bug tracking system to help us keep track
68of the bugs and their resolution. Bug reports can be submitted via the
69web:
70
71  https://github.com/bioperl/bioperl-live/issues
72
73=head1 AUTHOR - Chris Fields
74
75Email cjfields-at-uiuc-dot-edu
76
77=head1 CONTRIBUTORS
78
79  Jeffrey Barrick, Michigan State University
80  Paul Cantalupo, University of Pittsburgh
81
82=head1 APPENDIX
83
84The rest of the documentation details each of the object methods.
85Internal methods are usually preceded with a _
86
87=cut
88
89# Let the code begin...
90
91package Bio::SearchIO::infernal;
92$Bio::SearchIO::infernal::VERSION = '1.7.7';
93use strict;
94
95use Data::Dumper;
96use base qw(Bio::SearchIO);
97
98our %MODEMAP = (
99	    'Result'             => 'result',
100	    'Hit'                => 'hit',
101	    'Hsp'                => 'hsp'
102	    );
103
104our %MAPPING = (
105        'Hsp_bit-score'   => 'HSP-bits',
106        'Hsp_score'       => 'HSP-score',
107        'Hsp_evalue'      => 'HSP-evalue', # evalues only in v0.81, optional
108        'Hsp_pvalue'      => 'HSP-pvalue', # pvalues only in v0.81, optional
109        'Hsp_query-from'  => 'HSP-query_start',
110        'Hsp_query-to'    => 'HSP-query_end',
111        'Hsp_query-strand'=> 'HSP-query_strand',
112        'Hsp_hit-from'    => 'HSP-hit_start',
113        'Hsp_hit-to'      => 'HSP-hit_end',
114        'Hsp_hit-strand'  => 'HSP-hit_strand',
115        'Hsp_gaps'        => 'HSP-hsp_gaps',
116        'Hsp_hitgaps'     => 'HSP-hit_gaps',
117        'Hsp_querygaps'   => 'HSP-query_gaps',
118        'Hsp_qseq'        => 'HSP-query_seq',
119        'Hsp_ncline'      => 'HSP-nc_seq',
120        'Hsp_hseq'        => 'HSP-hit_seq',
121        'Hsp_midline'     => 'HSP-homology_seq',
122        'Hsp_pline'       => 'HSP-pp_seq',
123        'Hsp_structure'   => 'HSP-meta',
124        'Hsp_align-len'   => 'HSP-hsp_length',
125        'Hsp_stranded'    => 'HSP-stranded',
126
127        'Hit_id'        => 'HIT-name',
128        'Hit_len'       => 'HIT-length',
129        'Hit_gi'        => 'HIT-ncbi_gi',
130        'Hit_accession' => 'HIT-accession',
131        'Hit_desc'      => 'HIT-description',
132        'Hit_def'       => 'HIT-description',
133        'Hit_signif'    => 'HIT-significance', # evalues in v1.1 and v0.81, optional
134        'Hit_p'         => 'HIT-p',            # pvalues only in 1.0, optional
135        'Hit_score'     => 'HIT-score', # best HSP bit score (in v1.1, the only HSP bit score)
136        'Hit_bits'      => 'HIT-bits',  # best HSP bit score (ditto)
137
138        'Infernal_program'  => 'RESULT-algorithm_name', # get/set
139        'Infernal_version'  => 'RESULT-algorithm_version', # get/set
140        'Infernal_query-def'=> 'RESULT-query_name', # get/set
141        'Infernal_query-len'=> 'RESULT-query_length',
142        'Infernal_query-acc'=> 'RESULT-query_accession', # get/set
143        'Infernal_querydesc'=> 'RESULT-query_description', # get/set
144        'Infernal_cm'       => 'RESULT-cm_name',
145        'Infernal_db'       => 'RESULT-database_name',  # get/set
146        'Infernal_db-len'   => 'RESULT-database_entries', # in v1.1 only
147        'Infernal_db-let'   => 'RESULT-database_letters', # in v1.1 only
148	     );
149
150my $MINSCORE = 0;
151my $DEFAULT_ALGORITHM = 'cmsearch';
152my $DEFAULT_VERSION = '1.1';
153
154my @VALID_SYMBOLS = qw(5-prime 3-prime single-strand unknown gap);
155my %STRUCTURE_SYMBOLS = (
156                   '5-prime'        => '<',
157                   '3-prime'        => '>',
158                   'single-strand'  => ':',
159                   'unknown'        => '?',
160                   'gap'            => '.'
161                   );
162
163=head2 new
164
165 Title   : new
166 Usage   : my $obj = Bio::SearchIO::infernal->new();
167 Function: Builds a new Bio::SearchIO::infernal object
168 Returns : Bio::SearchIO::infernal
169 Args    : -fh/-file      => cmsearch (infernal) filename
170           -format        => 'infernal'
171           -model         => query model (Rfam ID) (default undef)
172           -database      => database name (default undef)
173           -query_acc     => query accession, eg. Rfam accession RF####
174           -query_desc    => query description, eg. Rfam description
175           -hsp_minscore  => minimum HSP score cutoff
176           -convert_meta  => boolean, set to convert meta string to simple WUSS format
177           -symbols       => hash ref of structure symbols to use
178                             (default symbols in %STRUCTURE_SYMBOLS hash)
179
180=cut
181
182sub _initialize {
183    my ( $self, @args ) = @_;
184    $self->SUPER::_initialize(@args);
185    my ($model, $database, $convert, $symbols, $cutoff,
186        $desc, $accession, $algorithm, $version) =
187        $self->_rearrange([qw(MODEL
188                          DATABASE
189                          CONVERT_META
190                          SYMBOLS
191                          HSP_MINSCORE
192                          QUERY_DESC
193                          QUERY_ACC
194                          ALGORITHM
195                          VERSION)],@args);
196    my $handler = $self->_eventHandler;
197    $handler->register_factory(
198        'result',
199        Bio::Factory::ObjectFactory->new(
200            -type      => 'Bio::Search::Result::INFERNALResult',
201            -interface => 'Bio::Search::Result::ResultI',
202            -verbose   => $self->verbose
203        )
204    );
205
206    $handler->register_factory(
207        'hit',
208        Bio::Factory::ObjectFactory->new(
209            -type      => 'Bio::Search::Hit::ModelHit',
210            -interface => 'Bio::Search::Hit::HitI',
211            -verbose   => $self->verbose
212        )
213    );
214
215    $handler->register_factory(
216        'hsp',
217        Bio::Factory::ObjectFactory->new(
218            -type      => 'Bio::Search::HSP::ModelHSP',
219            -interface => 'Bio::Search::HSP::HSPI',
220            -verbose   => $self->verbose
221        )
222    );
223
224    defined $model     && $self->model($model);
225    defined $database  && $self->database($database);
226    defined $accession && $self->query_accession($accession);
227    defined $convert   && $self->convert_meta($convert);
228    defined $desc      && $self->query_description($desc);
229
230    $version ||= $DEFAULT_VERSION;
231    $self->version($version);
232    $symbols ||= \%STRUCTURE_SYMBOLS;
233    $self->structure_symbols($symbols);
234    $cutoff ||= $MINSCORE;
235    $self->hsp_minscore($cutoff);
236    $algorithm ||= $DEFAULT_ALGORITHM;
237    $self->algorithm($algorithm);
238}
239
240=head2 next_result
241
242 Title   : next_result
243 Usage   : my $hit = $searchio->next_result;
244 Function: Returns the next Result from a search
245 Returns : Bio::Search::Result::ResultI object
246 Args    : none
247
248=cut
249
250sub next_result {
251    my ($self) = @_;
252    unless (exists $self->{'_handlerset'}) {
253        my $line;
254        while ($line = $self->_readline) {
255            # advance to first line
256            next if $line =~ m{^\s*$};
257            # newer output starts with model name
258            if ($line =~ m{^\#\s+cmsearch\s}) {
259              my $secondline = $self->_readline;
260              if ($secondline =~ m{INFERNAL 1\.1}) {
261                $self->{'_handlerset'} = '1.1';
262              }
263              else {
264                $self->{'_handlerset'} = 'latest';  # v1.0
265              }
266              $self->_pushback($secondline);
267            }
268            elsif ($line =~ m{^CM\s\d+:}) {
269              $self->{'_handlerset'} = 'pre-1.0';
270            }
271            else {
272              $self->{'_handlerset'} ='old';
273            }
274            last;
275        }
276        $self->_pushback($line);
277    }
278    return ($self->{'_handlerset'} eq '1.1')     ? $self->_parse_v1_1 :
279           ($self->{'_handlerset'} eq 'latest')  ? $self->_parse_latest :
280           ($self->{'_handlerset'} eq 'pre-1.0') ? $self->_parse_pre :
281           $self->_parse_old;
282}
283
284
285sub _parse_v1_1 {
286  my ($self) = @_;
287  my $seentop = 0;
288  local $/ = "\n";
289  my ($accession, $description) = ($self->query_accession, $self->query_description);
290  my ($buffer, $last, %modelcounter, @hit_list, %hitindex,
291                                     @hsp_list, %hspindex);
292  $self->start_document();
293  $buffer = $self->_readline;
294  if ( !defined $buffer || $buffer =~ m/^\[ok/ ) {  # end of report
295      return undef;
296  }
297  else {
298      $self->_pushback($buffer);
299  }
300
301  PARSER: # Parse each line of report
302  while ( defined( $buffer = $self->_readline ) ) {
303    my $hit_counter = 0;
304    my $lineorig = $buffer;
305    chomp $buffer;
306
307    # INFERNAL program name
308    if ( $buffer =~ m/^\#\s(\S+)\s\:\:\s/ ) {
309      $seentop = 1;
310      my $prog = $1;
311      $self->start_element( { 'Name' => 'Result' } );
312      $self->element_hash( { 'Infernal_program' => uc($prog) } );
313    }
314
315    # INFERNAL version and release date
316    elsif ( $buffer =~ m/^\#\sINFERNAL\s+(\S+)\s+\((.+)\)/ ) {
317      my $version     = $1;
318      my $versiondate = $2;
319      $self->{'_cmidline'} = $buffer;
320      $self->element_hash( { 'Infernal_version' => $version } );
321    }
322
323    # Query info
324    elsif ( $buffer =~ /^\#\squery (?:\w+ )?file\:\s+(\S+)/ ) {
325      $self->{'_cmfileline'} = $lineorig;
326      $self->element_hash( { 'Infernal_cm' => $1 } );
327    }
328
329    # Database info
330    elsif ( $buffer =~ m/^\#\starget\s\S+\sdatabase\:\s+(\S+)/ ) {
331      $self->{'_cmseqline'} = $lineorig;
332      $self->element_hash( { 'Infernal_db' => $1 } );
333    }
334
335    # Query data
336    elsif ( $buffer =~ m/^Query:\s+(\S+)\s+\[CLEN=(\d+)\]$/ ) {
337      $self->element_hash( { 'Infernal_query-def' => $1,
338                             'Infernal_query-len' => $2,
339                             'Infernal_query-acc' => $accession,
340                             'Infernal_querydesc' => $description,
341                            } );
342    }
343
344    # Get query accession
345    elsif ( $buffer =~ s/^Accession:\s+// && ! $accession) {
346      $buffer =~ s/\s+$//;
347      $self->element_hash( { 'Infernal_query-acc' => $buffer } );
348    }
349
350    # Get query description
351    elsif ( $buffer =~ s/^Description:\s+// && ! $description) {
352      $buffer =~ s/\s+$//;
353      $self->element_hash( { 'Infernal_querydesc' => $buffer } );
354    }
355
356    # Process hit table - including those below inclusion threshold
357    elsif ( $buffer =~ m/^Hit scores:/) {
358      @hit_list = ();   # here is case there are multi-query reports
359      while ( defined( $buffer = $self->_readline ) ) {
360        if ( $buffer =~ m/^Hit alignments:/ ) {
361          $self->_pushback($buffer);
362          last;
363        }
364        elsif (   $buffer =~ m/^\s+rank\s+E-value/
365               || $buffer =~ m/\-\-\-/
366               || $buffer =~ m/^$/
367               || $buffer =~ m/No hits detected/ ) {
368          next;
369        }
370
371        # Process hit
372        $hit_counter++;
373        my ($rank, $threshold, $eval, $score,
374            $bias, $hitid, $start, $end, $strand,
375            $mdl, $truc, $gc, @desc) = split( " ", $buffer );
376        my $desc = join " ", @desc;
377        $desc = '' if ( !defined($desc) );
378
379        push @hit_list, [ $hitid, $desc, $eval, $score ];
380        $hitindex{ $hitid.$hit_counter } = $#hit_list;
381      }
382    }
383
384    # Process hit alignments
385    elsif ( $buffer =~ /^Hit alignments:/ ) {
386      my $hitid;
387      my $align_counter = 0;
388      while ( defined( $buffer = $self->_readline ) ) {
389        if ( $buffer =~ /^Internal CM pipeline statistics summary/ ) {
390          $self->_pushback($buffer);
391          last;
392        }
393        if ( $buffer =~ m/^\>\>\s(\S*)\s+(.*)/ ) {  # defline of hit
394          $hitid    = $1;
395          my $desc = $2;
396          $align_counter++;
397          my $hitid_alignctr = $hitid.$align_counter;
398          $modelcounter{$hitid_alignctr} = 0;
399
400          # The Hit Description from the Hit table can be truncated if
401          # it is too long, so use the '>>' line description instead
402          $hit_list[ $hitindex{$hitid_alignctr} ][1] = $desc;
403
404          # Process hit information table
405          while ( defined( $buffer = $self->_readline ) ) {
406            if (   $buffer =~ m/^Internal CM pipeline statistics/
407                || $buffer =~ m/NC$/
408                || $buffer =~ m/^\>\>/ ) {
409              $self->_pushback($buffer);
410              last;
411            }
412            elsif (   $buffer =~ m/^\s+rank\s+E-value/
413                   || $buffer =~ m/^\s----/
414                   || $buffer =~ m/^$/
415                   || $buffer =~ m/No hits detected/ ) {
416              next;
417            }
418
419            # Get hsp data from table, push into @hsp;
420            my ( $rank,      $threshold, $eval,
421                 $score,     $bias,      $model,
422                 $cm_start,  $cm_stop,   $cm_cov,
423                 $seq_start, $seq_stop,  $seq_strand, $seq_cov,
424                 $acc,       $trunc,     $gc,
425                 ) = split( " ", $buffer );
426
427            # Try to get the Hit Length from the alignment information.
428            # For cmsearch, if sequence coverage ends in ']' it means that the
429            # alignment runs full-length flush to the end of the target.
430            my $hitlength = ( $seq_cov =~ m/\]$/ ) ? $seq_stop : 0;
431
432            my $tmphit = $hit_list[ $hitindex{$hitid_alignctr} ];
433            if ( !defined $tmphit ) {
434              $self->warn("Incomplete information: can't find HSP $hitid in list of hits\n");
435              next;
436            }
437
438            push @hsp_list, [ $hitid,
439                              $cm_start, $cm_stop,
440                              $seq_start, $seq_stop,
441                              $score,     $eval,
442                              $hitlength];
443            $modelcounter{$hitid_alignctr}++;
444            my $hsp_key = $hitid_alignctr . "_" . $modelcounter{$hitid_alignctr};
445            $hspindex{$hsp_key} = $#hsp_list;
446          }
447        }
448        elsif ( $buffer =~ m/NC$/ ) { # start of HSP
449          # need CS line to get number of spaces before structure data
450          my $csline = $self->_readline;
451          $csline =~ m/^(\s+)\S+ CS$/;
452          my $offset = length($1);
453          $self->_pushback($csline);
454          $self->_pushback($buffer); # set up for loop
455
456          my ($ct, $strln) = 0;
457          my $hspdata;
458          HSP:
459          my %hspline = ('0' => 'nc',    '1' => 'meta',
460                         '2' => 'query', '3' => 'midline',
461                         '4' => 'hit',   '5' => 'pp');
462          HSP:
463          while (defined ($buffer = $self->_readline)) {
464            chomp $buffer;
465            if (   $buffer =~ /^>>\s/
466                || $buffer =~ /^Internal CM pipeline statistics/) {
467              $self->_pushback($buffer);
468              last HSP;
469            }
470            elsif ( $ct % 6 == 0 && $buffer =~ /^$/ ) {
471              next;
472            }
473            my $iterator = $ct % 6;
474            # NC line ends with ' NC' so remove these from the strlen count
475            $strln = ( length($buffer) - 3 ) if $iterator == 0;
476            my $data = substr($buffer, $offset, $strln-$offset);
477            $hspdata->{ $hspline{$iterator} } .= $data;
478
479            $ct++;
480          } # 'HSP' while loop
481
482          my $strlen = 0;
483          # catch any insertions and add them into the actual length
484          while ($hspdata->{'query'} =~ m{\*\[\s*(\d+)\s*\]\*}g) {
485            $strlen += $1;
486          }
487          # add on the actual residues
488          $strlen += $hspdata->{'query'} =~ tr{A-Za-z}{A-Za-z};
489          my $metastr = ($self->convert_meta) ? ($self->simple_meta($hspdata->{'meta'})) :
490                              $hspdata->{'meta'};
491
492          my $hitid_alignctr = $hitid . $align_counter;
493          my $hsp_key = $hitid_alignctr . "_" . $modelcounter{$hitid_alignctr};
494          my $hsp = $hsp_list[ $hspindex{$hsp_key} ];
495          push (@$hsp, $hspdata->{'nc'},    $metastr,
496                       $hspdata->{'query'}, $hspdata->{'midline'},
497                       $hspdata->{'hit'},   $hspdata->{'pp'});
498        }
499      }
500    }  # end of 'Hit alignments:' section of report
501
502    # Process internal pipeline stats (end of report)
503    elsif ( $buffer =~ m/Internal CM pipeline statistics summary:/ ) {
504      while ( defined( $buffer = $self->_readline ) ) {
505        last if ( $buffer =~ m!^//! );
506
507        if ( $buffer =~ /^Target sequences:\s+(\d+)\s+\((\d+) residues/ ) {
508          $self->element_hash( { 'Infernal_db-len' => $1,
509                                 'Infernal_db-let' => $2, } );
510        }
511      }
512      last;    # of the outer while defined $self->readline
513    }
514
515    # Leftovers
516    else {
517      #print STDERR "Missed line: $buffer\n";
518      $self->debug($buffer);
519    }
520    $last = $buffer;
521  } # PARSER end
522
523  # Final processing of hits and hsps
524  my $hit_counter = 0;
525  foreach my $hit ( @hit_list ) {
526    $hit_counter++;
527    my ($hit_name, $hit_desc, $hit_signif, $hit_score) = @$hit;
528    my $num_hsp = $modelcounter{$hit_name . $hit_counter} || 0;
529
530    $self->start_element( { 'Name' => 'Hit' } );
531    $self->element_hash( {'Hit_id'    => $hit_name,
532                          'Hit_desc'  => $hit_desc,
533                          'Hit_signif'=> $hit_signif,
534                          'Hit_score' => $hit_score,
535                          'Hit_bits'  => $hit_score, } );
536    for my $i ( 1 .. $num_hsp ) {
537      my $hsp_key = $hit_name . $hit_counter . "_" . $i;
538      my $hsp = $hsp_list[ $hspindex{$hsp_key} ];
539      if ( defined $hsp ) {
540        my $hspid = shift @$hsp;
541
542        my ($cm_start,  $cm_stop, $seq_start, $seq_stop,
543            $score,     $eval,    $hitlength, $ncline,
544            $csline, $qseq, $midline, $hseq, $pline) = @$hsp;
545        if ( $hitlength != 0 ) {
546            $self->element(
547                { 'Name' => 'Hit_len', 'Data' => $hitlength }
548            );
549        }
550
551        $self->start_element( { 'Name' => 'Hsp' } );
552        $self->element_hash( { 'Hsp_stranded'   => 'HIT',
553                               'Hsp_query-from' => $cm_start,
554                               'Hsp_query-to'   => $cm_stop,
555                               'Hsp_hit-from'   => $seq_start,
556                               'Hsp_hit-to'     => $seq_stop,
557                               'Hsp_score'      => $score,
558                               'Hsp_bit-score'  => $score,
559                               'Hsp_evalue'     => $eval,
560                               'Hsp_ncline'     => $ncline,
561                               'Hsp_structure'  => $csline,
562                               'Hsp_qseq'       => $qseq,
563                               'Hsp_midline'    => $midline,
564                               'Hsp_hseq'       => $hseq,
565                               'Hsp_pline'      => $pline,
566                             } );
567
568        $self->end_element( { 'Name' => 'Hsp' } );
569      }
570    }
571    $self->end_element( { 'Name' => 'Hit' } );
572  }
573
574  $self->end_element( { 'Name' => 'Result' } );
575  my $result = $self->end_document();
576  return $result;
577}
578
579
580=head2 start_element
581
582 Title   : start_element
583 Usage   : $eventgenerator->start_element
584 Function: Handles a start element event
585 Returns : none
586 Args    : hashref with at least 2 keys 'Data' and 'Name'
587
588
589=cut
590
591sub start_element {
592    my ( $self, $data ) = @_;
593
594    # we currently don't care about attributes
595    my $nm   = $data->{'Name'};
596    my $type = $MODEMAP{$nm};
597    if ($type) {
598        if ( $self->_eventHandler->will_handle($type) ) {
599            my $func = sprintf( "start_%s", lc $type );
600            $self->_eventHandler->$func( $data->{'Attributes'} );
601        }
602        unshift @{ $self->{'_elements'} }, $type;
603    }
604    if ( defined $type
605        && $type eq 'result' )
606    {
607        $self->{'_values'} = {};
608        $self->{'_result'} = undef;
609    }
610}
611
612=head2 end_element
613
614 Title   : start_element
615 Usage   : $eventgenerator->end_element
616 Function: Handles an end element event
617 Returns : none
618 Args    : hashref with at least 2 keys, 'Data' and 'Name'
619
620=cut
621
622sub end_element {
623    my ( $self, $data ) = @_;
624    my $nm   = $data->{'Name'};
625    my $type = $MODEMAP{$nm};
626    my $rc;
627
628    if ($type) {
629        if ( $self->_eventHandler->will_handle($type) ) {
630            my $func = sprintf( "end_%s", lc $type );
631            $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
632                $self->{'_values'} );
633        }
634        my $lastelem = shift @{ $self->{'_elements'} };
635
636        # Infernal 1.1 allows one to know hit->length in some instances
637        # so remove it so it doesn't carry over to next hit. Tried flushing
638        # all 'type' values from {_values} buffer but it breaks legacy tests
639        if ($type eq 'hit' ) {
640          delete $self->{_values}{'HIT-length'};
641          delete $self->{_values}{'HSP-hit_length'};
642        }
643    }
644    elsif ( $MAPPING{$nm} ) {
645        if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
646            my $key = ( keys %{ $MAPPING{$nm} } )[0];
647            $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
648              $self->{'_last_data'};
649        }
650        else {
651            $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
652        }
653    }
654    else {
655        $self->debug("unknown nm $nm, ignoring\n");
656    }
657    $self->{'_last_data'} = '';    # remove read data if we are at
658                                   # end of an element
659    $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
660    return $rc;
661}
662
663=head2 element
664
665 Title   : element
666 Usage   : $eventhandler->element({'Name' => $name, 'Data' => $str});
667 Function: Convenience method that calls start_element, characters, end_element
668 Returns : none
669 Args    : Hash ref with the keys 'Name' and 'Data'
670
671=cut
672
673sub element {
674    my ( $self, $data ) = @_;
675    # simple data calls (%MAPPING) do not need start_element
676    $self->characters($data);
677    $self->end_element($data);
678}
679
680=head2 element_hash
681
682 Title   : element
683 Usage   : $eventhandler->element_hash({'Hsp_hit-from' => $start,
684                                        'Hsp_hit-to'   => $end,
685                                        'Hsp_score'    => $lastscore});
686 Function: Convenience method that takes multiple simple data elements and
687           maps to appropriate parameters
688 Returns : none
689 Args    : Hash ref with the mapped key (in %MAPPING) and value
690
691=cut
692
693sub element_hash {
694    my ($self, $data) = @_;
695    $self->throw("Must provide data hash ref") if !$data || !ref($data);
696    for my $nm (sort keys %{$data}) {
697        next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
698        if ( $MAPPING{$nm} ) {
699            if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
700                my $key = ( keys %{ $MAPPING{$nm} } )[0];
701                $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
702                  $data->{$nm};
703            }
704            else {
705                $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
706            }
707        }
708    }
709}
710
711=head2 characters
712
713 Title   : characters
714 Usage   : $eventgenerator->characters($str)
715 Function: Send a character events
716 Returns : none
717 Args    : string
718
719
720=cut
721
722sub characters {
723    my ( $self, $data ) = @_;
724    return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
725    $self->{'_last_data'} = $data->{'Data'};
726}
727
728=head2 within_element
729
730 Title   : within_element
731 Usage   : if( $eventgenerator->within_element($element) ) {}
732 Function: Test if we are within a particular element
733           This is different than 'in' because within can be tested
734           for a whole block.
735 Returns : boolean
736 Args    : string element name
737
738=cut
739
740sub within_element {
741    my ( $self, $name ) = @_;
742    return 0
743      if ( !defined $name
744        || !defined $self->{'_elements'}
745        || scalar @{ $self->{'_elements'} } == 0 );
746    foreach ( @{ $self->{'_elements'} } ) {
747        return 1 if ( $_ eq $name );
748    }
749    return 0;
750}
751
752=head2 in_element
753
754 Title   : in_element
755 Usage   : if( $eventgenerator->in_element($element) ) {}
756 Function: Test if we are in a particular element
757           This is different than 'within' because 'in' only
758           tests its immediate parent.
759 Returns : boolean
760 Args    : string element name
761
762=cut
763
764sub in_element {
765    my ( $self, $name ) = @_;
766    return 0 if !defined $self->{'_elements'}->[0];
767    return ( $self->{'_elements'}->[0] eq $name );
768}
769
770=head2 start_document
771
772 Title   : start_document
773 Usage   : $eventgenerator->start_document
774 Function: Handle a start document event
775 Returns : none
776 Args    : none
777
778=cut
779
780sub start_document {
781    my ($self) = @_;
782    $self->{'_lasttype'} = '';
783    $self->{'_values'}   = {};
784    $self->{'_result'}   = undef;
785    $self->{'_elements'} = [];
786}
787
788=head2 end_document
789
790 Title   : end_document
791 Usage   : $eventgenerator->end_document
792 Function: Handles an end document event
793 Returns : Bio::Search::Result::ResultI object
794 Args    : none
795
796=cut
797
798sub end_document {
799    my ($self) = @_;
800    return $self->{'_result'};
801}
802
803=head2 result_count
804
805 Title   : result_count
806 Usage   : my $count = $searchio->result_count
807 Function: Returns the number of results we have processed
808 Returns : integer
809 Args    : none
810
811=cut
812
813sub result_count {
814    my $self = shift;
815    return $self->{'_result_count'};
816}
817
818=head2 model
819
820 Title   : model
821 Usage   : my $model = $parser->model();
822 Function: Get/Set model; Infernal currently does not output
823           the model name (Rfam ID)
824 Returns : String (name of model)
825 Args    : [optional] String (name of model)
826
827=cut
828
829sub model {
830    my $self = shift;
831    return $self->{'_model'} = shift if @_;
832    return $self->{'_model'};
833}
834
835=head2 database
836
837 Title   : database
838 Usage   : my $database = $parser->database();
839 Function: Get/Set database; pre-v.1 versions of Infernal do not output
840           the database name
841 Returns : String (database name)
842 Args    : [optional] String (database name)
843
844=cut
845
846sub database {
847    my $self = shift;
848    return $self->{'_database'} = shift if @_;
849    return $self->{'_database'};
850}
851
852=head2 algorithm
853
854 Title   : algorithm
855 Usage   : my $algorithm = $parser->algorithm();
856 Function: Get/Set algorithm; pre-v.1 versions of Infernal do not output
857           the algorithm name
858 Returns : String (algorithm name)
859 Args    : [optional] String (algorithm name)
860
861=cut
862
863sub algorithm {
864    my $self = shift;
865    return $self->{'_algorithm'} = shift if @_;
866    return $self->{'_algorithm'};
867}
868
869=head2 query_accession
870
871 Title   : query_accession
872 Usage   : my $acc = $parser->query_accession();
873 Function: Get/Set query (model) accession; pre-v1.1 Infernal does not output
874           the accession number (Rfam accession #)
875 Returns : String (accession)
876 Args    : [optional] String (accession)
877
878=cut
879
880sub query_accession {
881    my $self = shift;
882    return $self->{'_query_accession'} = shift if @_;
883    return $self->{'_query_accession'};
884}
885
886=head2 query_description
887
888 Title   : query_description
889 Usage   : my $acc = $parser->query_description();
890 Function: Get/Set query (model) description; pre-v1.1 Infernal does not output
891           the Rfam description
892 Returns : String (description)
893 Args    : [optional] String (description)
894
895=cut
896
897sub query_description {
898    my $self = shift;
899    return $self->{'_query_description'} = shift if @_;
900    return $self->{'_query_description'};
901}
902
903=head2 hsp_minscore
904
905 Title   : hsp_minscore
906 Usage   : my $cutoff = $parser->hsp_minscore();
907 Function: Get/Set min bit score cutoff (for generating Hits/HSPs)
908 Returns : score (number)
909 Args    : [optional] score (number)
910
911=cut
912
913sub hsp_minscore {
914    my $self = shift;
915    return $self->{'_hsp_minscore'} = shift if @_;
916    return $self->{'_hsp_minscore'};
917}
918
919=head2 convert_meta
920
921 Title   : convert_meta
922 Usage   : $parser->convert_meta(1);
923 Function: Get/Set boolean flag for converting Infernal WUSS format
924           to a simple bracketed format (simple WUSS by default)
925 Returns : boolean flag (TRUE or FALSE)
926 Args    : [optional] boolean (eval's to TRUE or FALSE)
927
928=cut
929
930sub convert_meta {
931    my $self = shift;
932    return $self->{'_convert_meta'} = shift if @_;
933    return $self->{'_convert_meta'};
934}
935
936=head2 version
937
938 Title   : version
939 Usage   : $parser->version();
940 Function: Set the Infernal cmsearch version
941 Returns : version
942 Args    : [optional] version
943
944=cut
945
946sub version {
947    my $self = shift;
948    return $self->{'_version'} = shift if @_;
949    return $self->{'_version'};
950}
951
952=head2 structure_symbols
953
954 Title   : structure_symbols
955 Usage   : my $hashref = $parser->structure_symbols();
956 Function: Get/Set RNA structure symbols
957 Returns : Hash ref of delimiters (5' stem, 3' stem, single-strand, etc)
958         : default = < (5-prime)
959                     > (3-prime)
960                     : (single-strand)
961                     ? (unknown)
962                     . (gap)
963 Args    : Hash ref of substitute delimiters, using above keys.
964
965=cut
966
967sub structure_symbols {
968    my ($self, $delim) = @_;
969    if ($delim) {
970        if (ref($delim) =~ m{HASH}) {
971            my %data = %{ $delim };
972            for my $d (@VALID_SYMBOLS) {
973                if ( exists $data{$d} ) {
974                    $self->{'_delimiter'}->{$d} = $data{$d};
975                }
976            }
977        } else {
978            $self->throw("Args to helix_delimiters() should be in a hash reference");
979        }
980    }
981    return $self->{'_delimiter'};
982}
983
984=head2 simple_meta
985
986 Title   : simple_meta
987 Usage   : my $string = $parser->simple_meta($str);
988 Function: converts more complex WUSS meta format into simple bracket format
989           using symbols defined in structure_symbols()
990 Returns : converted string
991 Args    : [required] string to convert
992 Note    : This is a very simple conversion method to get simple bracketed
993           format from Infernal data.  If the convert_meta() flag is set,
994           this is the method used to convert the strings.
995
996=cut
997
998sub simple_meta {
999    my ($self, $str) = @_;
1000    $self->throw("No string arg sent!") if !$str;
1001    my $structs = $self->structure_symbols();
1002    my ($ls, $rs, $ss, $unk, $gap) = ($structs->{'5-prime'}, $structs->{'3-prime'},
1003                                $structs->{'single-strand'}, $structs->{'unknown'},
1004                                $structs->{'gap'});
1005    $str =~ s{[\(\<\[\{]}{$ls}g;
1006    $str =~ s{[\)\>\]\}]}{$rs}g;
1007    $str =~ s{[:,_-]}{$ss}g;
1008    $str =~ s{\.}{$gap}g;
1009    # unknown not handled yet
1010    return $str;
1011}
1012
1013## private methods
1014
1015# this is a hack which guesses the format and sets the handler for parsing in
1016# an instance; it'll be taken out when infernal 1.0 is released
1017
1018sub _parse_latest {
1019    my ($self) = @_;
1020    my $seentop = 0;
1021    local $/ = "\n";
1022    my ($accession, $description) = ($self->query_accession, $self->query_description);
1023    my ($maxscore, $mineval, $minpval);
1024    $self->start_document();
1025    my ($lasthit, $lastscore, $lasteval, $lastpval, $laststart, $lastend);
1026    PARSER:
1027    while (my $line = $self->_readline) {
1028        next if $line =~ m{^\s+$};
1029        # stats aren't parsed yet...
1030		if ($line =~ m{^\#\s+cmsearch}xms) {
1031			$seentop = 1;
1032			$self->start_element({'Name' => 'Result'});
1033			$self->element_hash({
1034					'Infernal_program'   => 'CMSEARCH'
1035				});
1036		}
1037		elsif ($line =~ m{^\#\sINFERNAL\s+(\d+\.\d+)}xms) {
1038			$self->element_hash({
1039					'Infernal_version'   => $1,
1040				});
1041		}
1042		elsif ($line =~ m{^\#\scommand:.*?\s(\S+)$}xms) {
1043			$self->element_hash({
1044					'Infernal_db'   => $1,
1045				});
1046		}
1047		elsif ($line =~ m{^\#\s+dbsize\(Mb\):\s+(\d+\.\d+)}xms) {
1048			# store absolute DB length
1049			$self->element_hash({
1050					'Infernal_db-let'   => $1 * 1e6
1051				});
1052		}
1053		elsif ($line =~ m{^CM(?:\s(\d+))?:\s*(\S+)}xms) {
1054			# not sure, but it's possible single reports may contain multiple
1055			# models; if so, they should be rolled over into a new ResultI
1056			#print STDERR "ACC: $accession\nDESC: $description\n";
1057			$self->element_hash({
1058			        'Infernal_query-def' => $2, # present in output now
1059			        'Infernal_query-acc' => $accession,
1060			        'Infernal_querydesc' => $description
1061			    });
1062        }
1063		elsif ($line =~ m{^>\s*(\S+)} ){
1064            #$self->debug("Start Hit: Found hit:$1\n");
1065            if ($self->in_element('hit')) {
1066                $self->element_hash({'Hit_score' => $maxscore,
1067                                     'Hit_bits'  => $maxscore});
1068                ($maxscore, $minpval, $mineval) = undef;
1069                $self->end_element({'Name' => 'Hit'});
1070            }
1071            $lasthit = $1;
1072        }
1073        elsif ($line =~ m{
1074            ^\sQuery\s=\s\d+\s-\s\d+,\s   # Query start/end
1075            Target\s=\s(\d+)\s-\s(\d+)    # Target start/end
1076            }xmso) {
1077            # Query (model) start/end always the same, determined from
1078            # the HSP length
1079            ($laststart, $lastend) = ($1, $2);
1080            #$self->debug("Found hit coords:$laststart - $lastend\n");
1081        } elsif ($line =~ m{
1082            ^\sScore\s=\s([\d\.]+),\s   # Score = Bitscore (for now)
1083            (?:E\s=\s([\d\.e-]+),\s     # E-val optional
1084             P\s=\s([\d\.e-]+),\s)?     # P-val optional
1085            GC\s=                       # GC not captured
1086            }xmso
1087                 ) {
1088            ($lastscore, $lasteval, $lastpval) = ($1, $2, $3);
1089            #$self->debug(sprintf("Found hit data:Score:%s,Eval:%s,Pval:%s\n",$lastscore, $lasteval||'', $lastpval||''));
1090            $maxscore ||= $lastscore;
1091            if ($lasteval && $lastpval) {
1092                $mineval ||= $lasteval;
1093                $minpval ||= $lastpval;
1094                $mineval = ($mineval > $lasteval)  ? $lasteval :
1095                        $mineval;
1096                $minpval = ($minpval > $lastpval)  ? $lastpval :
1097                        $minpval;
1098            }
1099            $maxscore = ($maxscore < $lastscore)  ? $lastscore :
1100                        $maxscore;
1101            if (!$self->within_element('hit')) {
1102                my ($gi, $acc, $ver) = $self->_get_seq_identifiers($lasthit);
1103                $self->start_element({'Name' => 'Hit'});
1104                $self->element_hash({
1105                    'Hit_id'           => $lasthit,
1106                    'Hit_accession'    => $ver ? "$acc.$ver" :
1107                                           $acc ? $acc : $lasthit,
1108                    'Hit_gi'           => $gi
1109                    });
1110            }
1111            if (!$self->in_element('hsp')) {
1112                $self->start_element({'Name' => 'Hsp'});
1113            }
1114
1115            # hsp is similar to older output
1116        } elsif ($line =~ m{^(\s+)[<>\{\}\(\)\[\]:_,-\.]+}xms) { # start of HSP
1117            $self->_pushback($line); # set up for loop
1118            #$self->debug("Start HSP\n");
1119            # what is length of the gap to the structure data?
1120            my $offset = length($1);
1121            my ($ct, $strln) = 0;
1122            my $hsp;
1123            HSP:
1124            my %hsp_key = ('0' => 'meta',
1125               '1' => 'query',
1126               '2' => 'midline',
1127               '3' => 'hit');
1128            HSP:
1129            while (defined ($line = $self->_readline)) {
1130                chomp $line;
1131          		next if (!$line); # toss empty lines
1132                # next if $line =~ m{^\s*$}; # toss empty lines
1133                # it is possible to have homology lines consisting
1134                # entirely of spaces if the subject has a large
1135                # insertion where nothing matches the model
1136
1137                # exit loop if at end of file or upon next hit/HSP
1138                if ($line =~ m{^\s{0,2}\S+}) {
1139                    $self->_pushback($line);
1140                    last HSP;
1141                }
1142                # iterate to keep track of each line (4 lines per hsp block)
1143                my $iterator = $ct % 4;
1144                # strlen set only with structure lines (proper length)
1145                $strln = length($line) if $iterator == 0;
1146                # only grab the data needed (hit start and stop in hit line above)
1147
1148                my $data = substr($line, $offset, $strln-$offset);
1149                $hsp->{ $hsp_key{$iterator} } .= $data;
1150
1151                $ct++;
1152            }
1153
1154            # query start, end are from the actual query length (entire hit is
1155            # mapped to CM data, so all CM data is represented)
1156            # works for now...
1157            if ($self->in_element('hsp')) {
1158				# In some cases with HSPs unaligned residues are present in
1159				# the hit or query (Ex: '*[ 8]*' is 8 unaligned residues).
1160				# This info needs to be passed on unmodifed to the HSP class
1161				# and handled there as it is subjectively changed based on
1162				# use.
1163                my $strlen = 0;
1164
1165				# catch any insertions and add them into the actual length
1166				while ($hsp->{'query'} =~ m{\*\[\s*(\d+)\s*\]\*}g) {
1167					$strlen += $1;
1168				}
1169				# add on the actual residues
1170				$strlen += $hsp->{'query'} =~ tr{A-Za-z}{A-Za-z};
1171
1172                my $metastr = ($self->convert_meta) ? ($self->simple_meta($hsp->{'meta'})) :
1173                            $hsp->{'meta'};
1174                $self->element_hash(
1175                               {'Hsp_stranded'  => 'HIT',
1176                                'Hsp_qseq'      => $hsp->{'query'},
1177                                'Hsp_hseq'      => $hsp->{'hit'},
1178                                'Hsp_midline'   => $hsp->{'midline'},
1179                                'Hsp_structure' => $metastr,
1180                                'Hsp_query-from' => 1,
1181                                'Infernal_query-len' => $strlen,
1182                                'Hsp_query-to'   => $strlen,
1183                                'Hsp_hit-from'  => $laststart,
1184                                'Hsp_hit-to'    => $lastend,
1185                                'Hsp_score'     => $lastscore,
1186                                'Hsp_bit-score' => $lastscore,
1187                            });
1188                $self->element_hash(
1189                               {'Hsp_evalue'    => $lasteval,
1190                                'Hsp_pvalue'    => $lastpval,
1191                            }) if ($lasteval && $lastpval);
1192                $self->end_element({'Name' => 'Hsp'});
1193            }
1194        # result now ends with // and 'Fin'
1195        } elsif ($line =~ m{^//}xms )  {
1196            if ($self->within_element('result') && $seentop) {
1197                if ($self->in_element('hit')) {
1198                    $self->element_hash({'Hit_score'    => $maxscore,
1199                                         'Hit_bits'     => $maxscore});
1200                    # don't know where to put minpval yet
1201                    $self->element_hash({'Hit_signif'   => $mineval}) if $mineval;
1202                    $self->element_hash({'Hit_p'        => $minpval}) if $minpval;
1203                    $self->end_element({'Name' => 'Hit'});
1204                }
1205                last PARSER;
1206            }
1207        }
1208    }
1209    $self->within_element('hit') && $self->end_element( { 'Name' => 'Hit' } );
1210    $self->end_element( { 'Name' => 'Result' } ) if $seentop;
1211    return $self->end_document();
1212}
1213
1214# cmsearch 0.81 (pre-1.0)
1215sub _parse_pre {
1216    my ($self) = @_;
1217    my $seentop = 0;
1218    local $/ = "\n";
1219    my ($accession, $db, $algorithm, $description, $version) =
1220       ($self->query_accession, $self->database, $self->algorithm,
1221        $self->query_description, '0.81');
1222    my ($maxscore, $mineval, $minpval);
1223    $self->start_document();
1224    my ($lasthit, $lastscore, $lasteval, $lastpval, $laststart, $lastend);
1225    PARSER:
1226    while (my $line = $self->_readline) {
1227        next if $line =~ m{^\s+$};
1228        # stats aren't parsed yet...
1229        if ($line =~ m{CM\s\d+:\s*(\S+)}xms) {
1230            #$self->debug("Start Result: Found model:$1\n");
1231            if (!$self->within_element('result')) {
1232                $seentop = 1;
1233                $self->start_element({'Name' => 'Result'});
1234                $self->element_hash({
1235                        'Infernal_program'   => $algorithm,
1236                        'Infernal_query-def' => $1, # present in output now
1237                        'Infernal_query-acc' => $accession,
1238                        'Infernal_querydesc' => $description,
1239                        'Infernal_db'        => $db
1240                    });
1241            }
1242        } elsif ($line =~ m{^>\s*(\S+)} ){
1243            #$self->debug("Start Hit: Found hit:$1\n");
1244            if ($self->in_element('hit')) {
1245                $self->element_hash({'Hit_score' => $maxscore,
1246                                     'Hit_bits'  => $maxscore});
1247                ($maxscore, $minpval, $mineval) = undef;
1248                $self->end_element({'Name' => 'Hit'});
1249            }
1250            $lasthit = $1;
1251        }
1252        elsif ($line =~ m{
1253            ^\sQuery\s=\s\d+\s-\s\d+,\s   # Query start/end
1254            Target\s=\s(\d+)\s-\s(\d+)    # Target start/end
1255            }xmso) {
1256            # Query (model) start/end always the same, determined from
1257            # the HSP length
1258            ($laststart, $lastend) = ($1, $2);
1259            #$self->debug("Found hit coords:$laststart - $lastend\n");
1260        } elsif ($line =~ m{
1261            ^\sScore\s=\s([\d\.]+),\s   # Score = Bitscore (for now)
1262            (?:E\s=\s([\d\.e-]+),\s     # E-val optional
1263             P\s=\s([\d\.e-]+),\s)?     # P-val optional
1264            GC\s=                       # GC not captured
1265            }xmso
1266                 ) {
1267            ($lastscore, $lasteval, $lastpval) = ($1, $2, $3);
1268            #$self->debug(sprintf("Found hit data:Score:%s,Eval:%s,Pval:%s\n",$lastscore, $lasteval||'', $lastpval||''));
1269            $maxscore ||= $lastscore;
1270            if ($lasteval && $lastpval) {
1271                $mineval ||= $lasteval;
1272                $minpval ||= $lastpval;
1273                $mineval = ($mineval > $lasteval)  ? $lasteval :
1274                        $mineval;
1275                $minpval = ($minpval > $lastpval)  ? $lastpval :
1276                        $minpval;
1277            }
1278            $maxscore = ($maxscore < $lastscore)  ? $lastscore :
1279                        $maxscore;
1280            if (!$self->within_element('hit')) {
1281                my ($gi, $acc, $ver) = $self->_get_seq_identifiers($lasthit);
1282                $self->start_element({'Name' => 'Hit'});
1283                $self->element_hash({
1284                    'Hit_id'           => $lasthit,
1285                    'Hit_accession'    => $ver ? "$acc.$ver" :
1286                                           $acc ? $acc : $lasthit,
1287                    'Hit_gi'           => $gi
1288                    });
1289            }
1290            if (!$self->in_element('hsp')) {
1291                $self->start_element({'Name' => 'Hsp'});
1292            }
1293
1294            # hsp is similar to older output
1295        } elsif ($line =~ m{^(\s+)[<>\{\}\(\)\[\]:_,-\.]+}xms) { # start of HSP
1296            $self->_pushback($line); # set up for loop
1297            #$self->debug("Start HSP\n");
1298            # what is length of the gap to the structure data?
1299            my $offset = length($1);
1300            my ($ct, $strln) = 0;
1301            my $hsp;
1302            HSP:
1303            my %hsp_key = ('0' => 'meta',
1304               '1' => 'query',
1305               '2' => 'midline',
1306               '3' => 'hit');
1307            HSP:
1308            while (defined ($line = $self->_readline)) {
1309                chomp $line;
1310          		next if (!$line); # toss empty lines
1311                # next if $line =~ m{^\s*$}; # toss empty lines
1312                # it is possible to have homology lines consisting
1313                # entirely of spaces if the subject has a large
1314                # insertion where nothing matches the model
1315
1316                # exit loop if at end of file or upon next hit/HSP
1317                if ($line =~ m{^\s{0,2}\S+}) {
1318                    $self->_pushback($line);
1319                    last HSP;
1320                }
1321                # iterate to keep track of each line (4 lines per hsp block)
1322                my $iterator = $ct%4;
1323                # strlen set only with structure lines (proper length)
1324                $strln = length($line) if $iterator == 0;
1325                # only grab the data needed (hit start and stop in hit line above)
1326
1327                my $data = substr($line, $offset, $strln-$offset);
1328                $hsp->{ $hsp_key{$iterator} } .= $data;
1329
1330                $ct++;
1331            }
1332
1333            # query start, end are from the actual query length (entire hit is
1334            # mapped to CM data, so all CM data is represented)
1335            # works for now...
1336            if ($self->in_element('hsp')) {
1337                my $strlen = $hsp->{'query'} =~ tr{A-Za-z}{A-Za-z};
1338
1339                my $metastr;
1340                $metastr = ($self->convert_meta) ? ($self->simple_meta($hsp->{'meta'})) :
1341                            ($hsp->{'meta'});
1342                $self->element_hash(
1343                               {'Hsp_stranded'  => 'HIT',
1344                                'Hsp_qseq'      => $hsp->{'query'},
1345                                'Hsp_hseq'      => $hsp->{'hit'},
1346                                'Hsp_midline'   => $hsp->{'midline'},
1347                                'Hsp_structure' => $metastr,
1348                                'Hsp_query-from' => 1,
1349                                'Infernal_query-len' => $strlen,
1350                                'Hsp_query-to'   => $strlen,
1351                                'Hsp_hit-from'  => $laststart,
1352                                'Hsp_hit-to'    => $lastend,
1353                                'Hsp_score'     => $lastscore,
1354                                'Hsp_bit-score' => $lastscore,
1355                            });
1356                $self->element_hash(
1357                               {'Hsp_evalue'    => $lasteval,
1358                                'Hsp_pvalue'    => $lastpval,
1359                            }) if ($lasteval && $lastpval);
1360                $self->end_element({'Name' => 'Hsp'});
1361            }
1362        # result now ends with // and 'Fin'
1363        } elsif ($line =~ m{^//}xms )  {
1364            if ($self->within_element('result') && $seentop) {
1365                $self->element(
1366                            {'Name' => 'Infernal_version',
1367                             'Data' => $version}
1368                            );
1369                if ($self->in_element('hit')) {
1370                    $self->element_hash({'Hit_score'    => $maxscore,
1371                                         'Hit_bits'     => $maxscore});
1372                    # don't know where to put minpval yet
1373                    $self->element_hash({'Hit_signif'   => $mineval}) if $mineval;
1374                    $self->end_element({'Name' => 'Hit'});
1375                }
1376                last PARSER;
1377            }
1378        }
1379    }
1380    $self->within_element('hit') && $self->end_element( { 'Name' => 'Hit' } );
1381    $self->end_element( { 'Name' => 'Result' } ) if $seentop;
1382    return $self->end_document();
1383}
1384
1385# cmsearch 0.72 and below; will likely be dropped when Infernal 1.0 is released
1386sub _parse_old {
1387    my ($self) = @_;
1388    my $seentop = 0;
1389    local $/ = "\n";
1390    my ($accession, $db, $algorithm, $model, $description, $version) =
1391       ($self->query_accession, $self->database, $self->algorithm,
1392        $self->model, $self->query_description, $self->version);
1393    my $maxscore;
1394    my $cutoff = $self->hsp_minscore;
1395    $self->start_document();
1396    local ($_);
1397    my $line;
1398    my ($lasthit, $lastscore, $laststart, $lastend);
1399    my $hitline;
1400    PARSER:
1401    while ( defined( $line = $self->_readline ) ) {
1402        next if $line =~ m{^\s+$};
1403        # bypass this for now...
1404        next if $line =~ m{^HMM\shit};
1405        # pre-0.81
1406        if ($line =~ m{^sequence:\s+(\S+)} ){
1407            if (!$self->within_element('result')) {
1408                $seentop = 1;
1409                $self->start_element({'Name' => 'Result'});
1410                $self->element_hash({
1411                        'Infernal_program'   => $algorithm,
1412                        'Infernal_query-def' => $model,
1413                        'Infernal_query-acc' => $accession,
1414                        'Infernal_querydesc' => $description,
1415                        'Infernal_db'        => $db
1416                    });
1417            }
1418            if ($self->in_element('hit')) {
1419                $self->element_hash({'Hit_score' => $maxscore,
1420                                     'Hit_bits'  => $maxscore});
1421                $maxscore = undef;
1422                $self->end_element({'Name' => 'Hit'});
1423            }
1424            $lasthit = $1;
1425        } elsif ($line =~ m{^hit\s+\d+\s+:\s+(\d+)\s+(\d+)\s+(\d+\.\d+)\s+bits}xms) {
1426            ($laststart, $lastend, $lastscore) = ($1, $2, $3);
1427            $maxscore = $lastscore unless $maxscore;
1428            if ($lastscore > $cutoff) {
1429                if (!$self->within_element('hit')) {
1430                    my ($gi, $acc, $ver) = $self->_get_seq_identifiers($lasthit);
1431                    $self->start_element({'Name' => 'Hit'});
1432                    $self->element_hash({
1433                        'Hit_id'           => $lasthit,
1434                        'Hit_accession'    => $ver ? "$acc.$ver" :
1435                                               $acc ? $acc : $lasthit,
1436                        'Hit_gi'           => $gi
1437                        });
1438                }
1439                # necessary as infernal 0.71 has repeated hit line
1440                if (!$self->in_element('hsp')) {
1441                    $self->start_element({'Name' => 'Hsp'});
1442                }
1443                $maxscore = ($maxscore < $lastscore)  ? $lastscore :
1444                            $maxscore;
1445            }
1446        } elsif ($line =~ m{^(\s+)[<>\{\}\(\)\[\]:_,-\.]+}xms) { # start of HSP
1447            $self->_pushback($line); # set up for loop
1448            # what is length of the gap to the structure data?
1449            my $offset = length($1);
1450            my ($ct, $strln) = 0;
1451            my $hsp;
1452            HSP:
1453            my %hsp_key = ('0' => 'meta',
1454               '1' => 'query',
1455               '2' => 'midline',
1456               '3' => 'hit');
1457            HSP:
1458            while ($line = $self->_readline) {
1459                next if $line =~ m{^\s*$}; # toss empty lines
1460                chomp $line;
1461                # exit loop if at end of file or upon next hit/HSP
1462                if (!defined($line) || $line =~ m{^\S+}) {
1463                    $self->_pushback($line);
1464                    last HSP;
1465                }
1466                # iterate to keep track of each line (4 lines per hsp block)
1467                my $iterator = $ct%4;
1468                # strlen set only with structure lines (proper length)
1469                $strln = length($line) if $iterator == 0;
1470                # only grab the data needed (hit start and stop in hit line above)
1471
1472                my $data = substr($line, $offset, $strln-$offset);
1473                $hsp->{ $hsp_key{$iterator} } .= $data;
1474                $ct++;
1475            }
1476            # query start, end are from the actual query length (entire hit is
1477            # mapped to CM data, so all CM data is represented)
1478            # works for now...
1479            if ($self->in_element('hsp')) {
1480                my $strlen = $hsp->{'query'} =~ tr{A-Za-z}{A-Za-z};
1481
1482		my $metastr;
1483                # Ugh...these should be passed in a hash
1484                $metastr = ($self->convert_meta) ? ($self->simple_meta($hsp->{'meta'})) :
1485                            ($hsp->{'meta'});
1486                $self->element_hash(
1487                               {'Hsp_stranded'  => 'HIT',
1488                                'Hsp_qseq'      => $hsp->{'query'},
1489                                'Hsp_hseq'      => $hsp->{'hit'},
1490                                'Hsp_midline'   => $hsp->{'midline'},
1491                                'Hsp_structure' => $metastr,
1492                                'Hsp_query-from' => 1,
1493                                'Infernal_query-len' => $strlen,
1494                                'Hsp_query-to'   => $strlen,
1495                                'Hsp_hit-from'  => $laststart,
1496                                'Hsp_hit-to'    => $lastend,
1497                                'Hsp_score'     => $lastscore,
1498                                'Hsp_bit-score' => $lastscore
1499                            });
1500                $self->end_element({'Name' => 'Hsp'});
1501            }
1502        } elsif ($line =~ m{^memory}xms || $line =~ m{^CYK\smemory}xms )  {
1503            if ($self->within_element('result') && $seentop) {
1504                $self->element(
1505                            {'Name' => 'Infernal_version',
1506                             'Data' => $version}
1507                            );
1508                if ($self->in_element('hit')) {
1509                    $self->element_hash({'Hit_score'    => $maxscore,
1510                                         'Hit_bits'     => $maxscore});
1511                    $self->end_element({'Name' => 'Hit'});
1512                }
1513                last PARSER;
1514            }
1515        }
1516    }
1517    $self->within_element('hit') && $self->end_element( { 'Name' => 'Hit' } );
1518    $self->end_element( { 'Name' => 'Result' } ) if $seentop;
1519    return $self->end_document();
1520}
1521
15221;
1523