1#
2# BioPerl module for Bio::Tools::QRNA
3#
4# Please direct questions and support issues to <bioperl-l@bioperl.org>
5#
6# Cared for by Jason Stajich <jason-at-bioperl-dot-org>
7#
8# Copyright Jason Stajich
9#
10# You may distribute this module under the same terms as perl itself
11
12# POD documentation - main docs before the code
13
14=head1 NAME
15
16Bio::Tools::QRNA - A Parser for qrna output
17
18=head1 SYNOPSIS
19
20  use Bio::Tools::QRNA;
21  my $parser = Bio::Tools::QRNA->new(-file => $qrnaoutput);
22  while( my $feature = $parser->next_feature ) {
23    # do something here
24  }
25
26=head1 DESCRIPTION
27
28Parses QRNA output (E.Rivas:
29http://selab.janelia.org/software.html
30ftp://selab.janelia.org/pub/software/qrna/).
31
32This module is not complete, but currently it packs information from
33each QRNA alignment into a single Bio::SeqFeature::Generic object.
34
35Not all options for QRNA output have been tested or tried.  It has
36been tested on sliding window output (-w -x) and shuffled output (-b
37or -B).
38
39See t/QRNA.t for example usage.
40
41At some point we may have more complicated feature object which will
42support this data rather than forcing most of the information into
43tag/value pairs in a SeqFeature::Generic.
44
45Running with -verbose =E<gt> 1 will store extra data in the feature.  The
46entire unparsed entry for a particular feature will be stored as a
47string in the tag 'entry' it is accessible via:
48
49  my ($entry) = $f->each_tag_value('entry');
50
51The winning model for any given alignment test will be the name stored
52in the primary_tag field of feature.  The bit score will stored in the
53score field.  The logoddpost is available via the a tag/value pair.
54This example code will show how to print out the score and log odds
55post for each model.
56
57  # assuming you got a feature already
58  print "model score logoddspost\n";
59  foreach my $model ( qw(OTH COD RNA) ) {
60    my ($score)       = $f->get_tag_values("$model\_score");
61    my ($logoddspost) = $f->get_tag_values("$model\_logoddspost");
62    print "$model $score $logoddspost\n";
63  }
64
65The start and end of the alignment for both the query and hit sequence
66are available through the L<Bio::SeqFeature::FeaturePair> interface,
67specifically L<Bio::SeqFeature::FeaturePair::feature1> and
68L<Bio::SeqFeature::FeaturePair::feature2>.  Additionally if you have
69run QRNA with an input file which has the location of the alignment
70stored in the FASTA filename as in (ID/START-END) which is the default
71output format from L<Bio::AlignIO::fasta> produced alignment output,
72this module will re-number start/end for the two sequences so they are
73in the actual coordinates of the sequence rather than the relative
74coordinates of the alignment.  You may find the bioperl utillity
75script search2alnblocks useful in creating your input files for QRNA.
76
77Some other words of warning, QRNA uses a 0 based numbering system for
78sequence locations, Bioperl uses a 1 based system.  You'll notice that
79locations will be +1 they are reported in the raw QRNA output.
80
81=head1 FEEDBACK
82
83=head2 Mailing Lists
84
85User feedback is an integral part of the evolution of this and other
86Bioperl modules. Send your comments and suggestions preferably to
87the Bioperl mailing list.  Your participation is much appreciated.
88
89  bioperl-l@bioperl.org                  - General discussion
90  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
91
92=head2 Support
93
94Please direct usage questions or support issues to the mailing list:
95
96I<bioperl-l@bioperl.org>
97
98rather than to the module maintainer directly. Many experienced and
99reponsive experts will be able look at the problem and quickly
100address it. Please include a thorough description of the problem
101with code and data examples if at all possible.
102
103=head2 Reporting Bugs
104
105Report bugs to the Bioperl bug tracking system to help us keep track
106of the bugs and their resolution. Bug reports can be submitted via
107the web:
108
109  https://github.com/bioperl/bioperl-live/issues
110
111=head1 AUTHOR - Jason Stajich
112
113Email jason-at-bioperl-dot-org
114
115=head1 APPENDIX
116
117The rest of the documentation details each of the object methods.
118Internal methods are usually preceded with a _
119
120=cut
121
122
123# Let the code begin...
124
125
126package Bio::Tools::QRNA;
127$Bio::Tools::QRNA::VERSION = '1.7.7';
128use vars qw(@Models);
129use strict;
130
131use Bio::SeqFeature::Generic;
132use Bio::SeqFeature::FeaturePair;
133
134use base qw(Bio::Root::IO Bio::SeqAnalysisParserI);
135@Models = qw(OTH COD RNA);
136
137=head2 new
138
139 Title   : new
140 Usage   : my $obj = Bio::Tools::QRNA->new();
141 Function: Builds a new Bio::Tools::QRNA object
142 Returns : an instance of Bio::Tools::QRNA
143 Args    : -fh/-file filehandle/filename standard input for
144                     Bio::Root:IO objects
145
146=cut
147
148=head2 next_feature
149
150 Title   : next_feature
151 Usage   : my $feature = $parser->next_feature
152 Function: Get the next QRNA feature
153 Returns :
154 Args    :
155
156
157=cut
158
159sub next_feature {
160    my ($self) = @_;
161    my $f = shift @{$self->{'_parsed_features'} || []};
162    if( ! defined $f && $self->_parse_pair ) {
163	$f = shift @{$self->{'_parsed_features'} || []};
164    }
165    return $f;
166}
167
168sub _parse_pair {
169   my ($self,@args) = @_;
170   my (@features,%data);
171   my $seenstart = 0;
172   while( defined( $_ = $self->_readline) ) {
173       next if( /^\#\-\-/o );
174       if( /^\#\s+(qrna)\s+(\S+)\s+\(([^\)]+)\)/o ) {
175	   $self->program_name($1);
176	   $self->program_version($2);
177	   $self->program_date($3);
178       } elsif( /^\#\s+(PAM model)\s+\=\s+(.+)\s+$/o ) {
179	   $self->PAM_model($2);
180       } elsif( /^\#\s+(RNA model)\s+\=\s+(\S+)/o ) {
181	   $self->RNA_model($2);
182       } elsif( /^\#\s+(seq file)\s+\=\s+(.+)\s+$/o ) {
183	   $self->seq_file($2);
184       } elsif( /^\#\s+(\d+)\s+\[([\-+])\s+strand\]/o ) {
185	   if( $seenstart ) {
186	       if( $data{'alignment_len'} ) {
187		   push @features, $self->_make_feature(\%data);
188	       }
189	       $self->_pushback($_);
190	       last;
191	   }
192	   $seenstart = 1;
193       } elsif( /^\#/ ) {
194	   next;
195       } elsif( />(\S+)\s+\((\d+)\)/ ) {
196	   if( @{$data{'seqs'} || []} == 2 ) {
197	       $self->warn( "already seen seqs ".join(' ', ,map { $_->[0] }
198						      @{$data{'seqs'}}). "\n");
199	   } else {
200	       push @{$data{'seqs'}}, [$1,$2];
201	   }
202       } elsif( /^length alignment:\s+(\d+)\s+\(id\=(\d+(\.\d+)?)\)/o ) {
203
204	   if( $data{'alignment_len'} ) {
205	       push @features, $self->_make_feature(\%data);
206	       # reset all the data but the 'seqs' field
207	       %data  = ( 'seqs' => $data{'seqs'} );
208	   }
209
210	   if( /\(((sre_)?shuffled)\)/ ) {
211	       $data{'shuffled'} = $1;
212	   }
213	   $data{'alignment_len'} = $1;
214	   $data{'alignment_pid'} = $2;
215       } elsif ( /^pos([XY]):\s+(\d+)\-(\d+)\s+\[(\d+)\-(\d+)\]\((\d+)\)\s+
216		 \-\-\s+\((\S+\s+\S+\s+\S+\s+\S+)\)/ox ) {
217	   $data{"seq\_$1"}->{'aln'} = [ $2,$3, $4,$5, $6];
218	   @{$data{"seq\_$1"}->{'base_comp'}} = split(/\s+/,$7);
219       } elsif( /^winner\s+\=\s+(\S{3})/ ) {
220	   $data{'winning_model'} = $1;
221       } elsif( /^(\S{3})\s+ends\s+\=\s+(\-?\d+)\s+(\-?\d+)/ ) {
222	   # QRNA is 0-based
223	   # Bioperl is 1 based
224	   $data{'model_location'}->{$1} = [ $2,$3 ];
225       }  elsif( /^\s+(logoddspost)?OTH\s+\=\s+/ox ) {
226	   while( /(\S+)\s+\=\s+(\-?\d+(\.\d+))/g ) {
227	       my ($model,$score)= ($1,$2);
228	       if( $model =~ s/^logoddspost// ) {
229		   $data{'model_scores'}->{'logoddspost'}->{$model} = $score;
230	       } else {
231		   $data{'model_scores'}->{'bits'}->{$model} = $score;
232	       }
233	   }
234       }
235       $data{'entry'} .= $_;
236   }
237   if( @features ) {
238       push @{$self->{'_parsed_features'}}, @features;
239       return scalar @features;
240   }
241   return 0;
242}
243
244=head2 PAM_model
245
246 Title   : PAM_model
247 Usage   : $obj->PAM_model($newval)
248 Function:
249 Example :
250 Returns : value of PAM_model (a scalar)
251 Args    : on set, new value (a scalar or undef, optional)
252
253
254=cut
255
256sub PAM_model{
257    my $self = shift;
258    return $self->{'PAM_model'} = shift if @_;
259    return $self->{'PAM_model'};
260}
261
262=head2 RNA_model
263
264 Title   : RNA_model
265 Usage   : $obj->RNA_model($newval)
266 Function:
267 Example :
268 Returns : value of RNA_model (a scalar)
269 Args    : on set, new value (a scalar or undef, optional)
270
271
272=cut
273
274sub RNA_model{
275    my $self = shift;
276
277    return $self->{'RNA_model'} = shift if @_;
278    return $self->{'RNA_model'};
279}
280
281=head2 seq_file
282
283 Title   : seq_file
284 Usage   : $obj->seq_file($newval)
285 Function:
286 Example :
287 Returns : value of seq_file (a scalar)
288 Args    : on set, new value (a scalar or undef, optional)
289
290
291=cut
292
293sub seq_file{
294    my $self = shift;
295
296    return $self->{'seq_file'} = shift if @_;
297    return $self->{'seq_file'};
298}
299
300
301=head2 program_name
302
303 Title   : program_name
304 Usage   : $obj->program_name($newval)
305 Function:
306 Example :
307 Returns : value of program_name (a scalar)
308 Args    : on set, new value (a scalar or undef, optional)
309
310
311=cut
312
313sub program_name{
314    my $self = shift;
315
316    return $self->{'program_name'} = shift if @_;
317    return $self->{'program_name'} || 'qrna';
318}
319
320=head2 program_version
321
322 Title   : program_version
323 Usage   : $obj->program_version($newval)
324 Function:
325 Example :
326 Returns : value of program_version (a scalar)
327 Args    : on set, new value (a scalar or undef, optional)
328
329
330=cut
331
332sub program_version{
333    my $self = shift;
334
335    return $self->{'program_version'} = shift if @_;
336    return $self->{'program_version'};
337}
338
339=head2 program_date
340
341 Title   : program_date
342 Usage   : $obj->program_date($newval)
343 Function:
344 Example :
345 Returns : value of program_date (a scalar)
346 Args    : on set, new value (a scalar or undef, optional)
347
348
349=cut
350
351sub program_date{
352    my $self = shift;
353    return $self->{'program_date'} = shift if @_;
354    return $self->{'program_date'};
355}
356
357sub _make_feature {
358    my ($self,$data) = @_;
359    my ($qoffset,$hoffset) = (1,1);
360    # when you run qrna and have produced ID/START-END
361    # formatted input strings we can remap the location
362    # to the original
363
364    # name is stored as the first entry in the seq array ref
365    my ($qid,$hid) = ( $data->{'seqs'}->[0]->[0],
366		       $data->{'seqs'}->[1]->[0]);
367    if( $qid =~ /(\S+)\/(\d+)\-(\d+)/ ) {
368	($qid,$qoffset) = ($1,$2);
369    }
370    if( $hid =~ /(\S+)\/(\d+)\-(\d+)/ ) {
371	($hid,$hoffset) = ($1,$2);
372    }
373
374    my $f = Bio::SeqFeature::FeaturePair->new();
375
376    my ($s,$e) = @{$data->{'model_location'}->{$data->{'winning_model'}}};
377    my $qf = Bio::SeqFeature::Generic->new
378	( -primary_tag => $data->{'winning_model'},
379	  -source_tag  => $self->program_name,
380	  -score       => $data->{'model_scores'}->{'bits'}->{$data->{'winning_model'}},
381	  -start       => $s+$qoffset,
382	  -end         => $e+$qoffset,
383	  -seq_id      => $qid,
384	  -strand      => ($s < $e ) ? 1 : -1,
385	  );
386
387    my $hf = Bio::SeqFeature::Generic->new
388	( -primary_tag => $qf->primary_tag,
389	  -source_tag  => $qf->source_tag,
390	  -score       => $qf->score,
391	  -seq_id      => $hid,
392	  -start       => $s + $hoffset,
393	  -end         => $e + $hoffset,
394	  -strand      => $qf->strand,
395	  );
396    $f->feature1($qf);
397    $f->feature2($hf);
398    $f->add_tag_value('alignment_len', $data->{'alignment_len'});
399    $f->add_tag_value('alignment_pid', $data->{'alignment_pid'});
400    # store the other model scores and data
401    foreach my $model ( @Models ) {
402	$f->add_tag_value("$model\_score", $data->{'model_scores'}->{'bits'}->{$model});
403	$f->add_tag_value("$model\_logoddspost", $data->{'model_scores'}->{'logoddspost'}->{$model});
404	if( ! $data->{'model_location'}->{$model} ) {
405	    if( $self->verbose > 0 ) {
406		$self->debug( $data->{'entry'} );
407	    }
408	    $self->throw("no location parsed for $model in ",
409	    (map { @$_ } @{$data->{'seqs'}}), " ", $f->start, " ", $f->end);
410	} else {
411	    $f->add_tag_value("$model\_positions",
412			      join("..",@{$data->{'model_location'}->{$model} }));
413	}
414    }
415    # probably a better way to store this - as
416    # a seq object perhaps
417    $f->add_tag_value('seq1', @{$data->{'seqs'}->[0]});
418    $f->add_tag_value('seq2', @{$data->{'seqs'}->[1]});
419    $f->add_tag_value('entry', $data->{'entry'}) if $self->verbose > 0;
420    if( $data->{'shuffled'} ) {
421	$f->add_tag_value('shuffled', $data->{'shuffled'});
422    }
423    return $f;
424}
4251;
426