1# $Id: gmap_f9.pm 15987 2009-08-18 21:08:55Z lstein $
2#
3# BioPerl module for Bio::SearchIO::gmap_f9
4#
5# Cared for by George Hartzell <hartzell@alerce.com>
6#
7# Copyright George Hartzell
8#
9# You may distribute this module under the same terms as perl itself
10
11# POD documentation - main docs before the code
12
13=head1 NAME
14
15Bio::SearchIO::gmap_f9 - Event generator for parsing gmap reports (Z format)
16
17=head1 SYNOPSIS
18
19   # Do not use this object directly - it is used as part of the
20   # Bio::SearchIO system.
21
22    use Bio::SearchIO;
23    my $searchio = Bio::SearchIO->new(-format => 'gmap',
24                                      -file   => 't/data/her2.gmapz');
25    while( my $result = $searchio->next_result ) {
26        while( my $hit = $result->next_hit ) {
27            while( my $hsp = $hit->next_hsp ) {
28                # ...
29            }
30        }
31    }
32
33
34=head1 DESCRIPTION
35
36This object encapsulated the necessary methods for generating events
37suitable for building Bio::Search objects from a GMAP "compressed"
38report (from gmap run with -Z flag) Read the L<Bio::SearchIO> for more
39information about how to use this.
40
41=head2 REVERSE STRAND AND BIOPERL COORDINATES
42
43I believe that I'm doing the correct thing when reporting hits on the
44negative strand of the genome.  In particular, I've compared the
45"exons" this code generates with the set returned by ncbi's megablast
46web service.  NCBI's hsp's are ordered differently and have a
47different genomic location (off by ~18,000,000 bases, padding?) but
48the starts, ends, and lengths were similar and my strand handling
49matches theirs.  E.g.
50
51   CDNA                            GENOME
52 start  end    strand   start           end             strand
53
54blast
55  1913	2989	1	86236731	86237808	-1
56  1	475	1	86260509	86260983	-1
57  1510	1727	1	86240259	86240476	-1
58  841	989	1	86243034	86243182	-1
59  1381	1514	1	86240630	86240763	-1
60  989	1122	1	86242457	86242590	-1
61  599	729	1	86247470	86247600	-1
62  473	608	1	86259972	86260107	-1
63  1255	1382	1	86240837	86240964	-1
64  730	842	1	86244040	86244152	-1
65  1813	1921	1	86238123	86238231	-1
66  1725	1814	1	86239747	86239836	-1
67  1167	1256	1	86241294	86241383	-1
68  1120	1188	1	86242319	86242387	-1
69
70gmap
71  1	475	1	104330509	104330983	-1
72  476	600	1	104329980	104330104	-1
73  601	729	1	104317470	104317598	-1
74  730	841	1	104314041	104314152	-1
75  842	989	1	104313034	104313181	-1
76  990	1121	1	104312458	104312589	-1
77  1122	1187	1	104312320	104312385	-1
78  1188	1256	1	104311294	104311362	-1
79  1257	1382	1	104310837	104310962	-1
80  1383	1511	1	104310633	104310761	-1
81  1512	1726	1	104310260	104310474	-1
82  1727	1814	1	104309747	104309834	-1
83  1815	1917	1	104308127	104308229	-1
84  1918	2989	1	104306731	104307802	-1
85
86=head1 FEEDBACK
87
88=head2 Mailing Lists
89
90User feedback is an integral part of the evolution of this and other
91Bioperl modules. Send your comments and suggestions preferably to
92the Bioperl mailing list.  Your participation is much appreciated.
93
94  bioperl-l@bioperl.org                  - General discussion
95  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
96
97=head2 Reporting Bugs
98
99Report bugs to the Bioperl bug tracking system to help us keep track
100of the bugs and their resolution. Bug reports can be submitted via
101the web:
102
103  https://github.com/bioperl/bioperl-live/issues
104
105=head1 AUTHOR - George Hartzell
106
107Email hartzell@alerce.com
108
109=head1 CONTRIBUTORS
110
111Additional contributors names and emails here
112
113=head1 APPENDIX
114
115The rest of the documentation details each of the object methods.
116Internal methods are usually preceded with an underscore (_).
117
118=cut
119
120
121# Let the code begin...
122
123
124package Bio::SearchIO::gmap_f9;
125$Bio::SearchIO::gmap_f9::VERSION = '1.7.7';
126use strict;
127use warnings;
128
129use Bio::Search::Hit::GenericHit;
130use Bio::Search::HSP::GenericHSP;
131
132use base qw(Bio::SearchIO );
133
134use Data::Dumper;
135
136=head2 next_result
137
138 Title   : next_result
139 Usage   : $result = stream->next_result
140 Function: Reads the next ResultI object from the stream and returns it.
141 Returns : A Bio::Search::Result::ResultI object
142 Args    : n/a
143
144=cut
145
146sub next_result {
147  my $self = shift;
148
149  my $info = [];
150  my $result;
151  my $hit;
152  my @hsp_info;
153  my $previous_hit_pos;
154
155  while ( $_ = $self->_readline ) {
156      if ( $_ =~ /^>/ ) {	# looking at the start of a result
157	  if ($result) {	#    and done if there's one in progress
158	      $self->_pushback($_);
159	      goto DONE;
160	  }
161	  else {		#    otherwise start a new one.
162	      my ($id, $desc, $md5) = m|>([^ ]*)\s*(.*)\s*(?:md5:(.*))?|;
163
164	      $result = Bio::Search::Result::GenericResult->new();
165	      $result->algorithm('gmap');
166	      $result->query_name($id);
167	      $result->query_accession($id);
168	      $result->query_description($desc);
169	      #$self->warn("Take care of MD5!\n");
170
171	      $hit ||= Bio::Search::Hit::GenericHit->new( -name =>
172							  "NONE_SPECIFIED");
173	  }
174      }
175      else {			# add another position to the hit/hsp
176	  # 468 H	1956 C	-14:104307764 2298317517 C	H
177	  # 468 	1957 A	-14:104307763 2298317516 A
178	  my $c;		# info about a column
179	  ($c->{query_aa_pos}, $c->{query_aa}, $c->{query_pos},
180	   $c->{query_base},
181           $c->{hit_strand}, $c->{hit_chromo}, $c->{hit_pos},
182	   $c->{hit_concat_pos}, $c->{hit_base}, $c->{hit_aa})
183	      = ($_ =~
184		 m|
185                   (\d+)[ ]?(.)?[\t]
186                   (\d+)[ ]?(.)?[\t]
187                   # TODO chromosome isn't a number... X, Y, MT....
188                   (\+\|\-)([\dxXyY]+\|MT):(\d+)[ ](\d+)[ ](.)
189                   [\t]?(.)?
190                  |xo
191		);
192
193	  if ($previous_hit_pos &&
194	      (abs($c->{hit_pos} - $previous_hit_pos) > 1)) {
195	      $hit ||= Bio::Search::Hit::GenericHit->new( -name =>
196							  "NONE_SPECIFIED",
197							);
198	      $hit->add_hsp( $self->_hsp_from_info(\@hsp_info) );
199	      @hsp_info = ();
200	  }
201	  push @hsp_info, $c;
202	  $previous_hit_pos = $c->{hit_pos};
203      }
204  }
205
206 DONE:
207  if ($result) {
208      $hit->add_hsp( $self->_hsp_from_info(\@hsp_info) ) if (@hsp_info);
209
210      my ($hit_length,$query_length);
211      for my $hsp ($hit->hsps) {
212	  $hit_length += $hsp->length();
213      $query_length += $hsp->length('query');
214      }
215      $hit->length($hit_length);
216      $hit->query_length($query_length);
217      # update this now that we actually know something useful.q
218      $hit->name($hsp_info[0]->{hit_chromo});
219
220      $result->add_hit($hit) if ($hit);
221  }
222
223  return($result);
224}
225
226
227
228sub _hsp_from_info {
229    my $self = shift;
230    my $info = shift;
231    my $a = {};			# args w/ which we'll create hsp
232    my $hsp;
233    my $identical;
234
235    $a->{-algorithm} = 'GMAP';
236
237    for my $c (@{$info}) {
238	$a->{-query_seq} .= $c->{query_base};
239	$a->{-hit_seq} .= $c->{hit_base};
240    $a->{-homology_seq} .= $c->{query_base} eq $c->{hit_base} ? $c->{hit_base} : ' ';
241	$identical++ if ( $c->{query_base} eq $c->{hit_base} );
242    }
243
244    $a->{-query_seq} =~ s| |\-|g;		# switch to bioperl gaps.
245    $a->{-hit_seq} =~ s| |\-|g;
246
247    $a->{-conserved} = $a->{-identical} = $identical;
248
249    # use the coordinates from from gmap's -f 9 output to
250    # determine whether gmap revcomped the query sequence
251    # to generate the alignment.  Note that this is not
252    # the same as the cDNA's sense/anti-sense-ness.
253    $a->{-stranded} = 'both';
254
255    $a->{-query_start} = $info->[0]->{query_pos};
256    $a->{-query_end} = $info->[-1]->{query_pos};
257    $a->{-query_length} = $a->{-query_end} - $a->{-query_start} + 1;
258
259    # hit can be either strand, -f 9 output tells us which.
260    # we don't have to worry about it here, but telling the generichsp code
261    # that this hit is 'stranded', it compares the start and end positions
262    # sets it for us.
263    $a->{-hit_start} = $info->[0]->{hit_pos};
264    $a->{-hit_end} = $info->[-1]->{hit_pos};
265
266    $a->{-hit_length} = abs($a->{-hit_end} - $a->{-hit_start}) + 1;
267
268    $a->{-hsp_length} =
269	$a->{-query_length} > $a->{-hit_length} ?
270	    $a->{-query_length} : $a->{-hit_length};
271
272    $hsp = Bio::Search::HSP::GenericHSP->new( %$a );
273
274    return $hsp;
275}
276
277# TODO (adjust regexp to swallow lines w/out md5 sig's.
278sub _parse_path_header {
279    my $self = shift;
280    my $path_line = shift;
281    my $path = {};
282
283    (
284     $path->{query},
285     $path->{db},
286     $path->{path_num},
287     $path->{path_total_num},
288     $path->{query_length},
289     $path->{exon_count},
290     $path->{trimmed_coverage},
291     $path->{percent_identity},
292     $path->{query_start},
293     $path->{query_end},
294     $path->{whole_genome_start},
295     $path->{whole_genome_end},
296     $path->{chromosome},
297     $path->{chromo_start},
298     $path->{chromo_end},
299     $path->{strand},
300     $path->{sense},
301     $path->{md5},
302    ) =
303	($_ =~ qr|
304                >
305                ([^ ]*)[ ]	# the query id}, followed by a space
306	        ([^ ]*)[ ]      # the genome database, followed by a space
307                (\d+)/(\d+)[ ]	# path_num/path_total_num (e.g. 3/12)
308	        (\d+)[ ]	# query length, followed by a space
309	        (\d+)[ ]	# hsp/exon count, followed by a space
310                (\d+\.\d*)[ ]	# trimmed coverage
311                (\d+\.\d*)[ ]	# percent identity
312                (\d+)\.\.(\d+)[ ] # query start .. query end, followed by space
313                (\d+)\.\.(\d+)[ ] # whole genome s..e, followed by space
314                (\d+):		# chromosome number
315                (\d+)\.\.(\d+)[ ] # chromo s..e, followed by a space
316                ([+-])[ ]	# strand, followed by a space
317                dir:(.*) # dir:sense or dir:antisense
318                [ ]md5:([\dabcdefg]+) # md5 signature
319	 |x
320	);
321
322    $path->{query} or $self->throw("query was not found in path line.");
323    $path->{db} or $self->throw("db was not found in path line.");
324    $path->{path_num} or $self->throw("path_num was not found in path line.");
325    $path->{path_total_num} or
326	$self->throw("path_total_num was not found in path line.");
327    $path->{query_length} or
328	$self->throw("query_length was not found in path line.");
329    $path->{exon_count} or
330	$self->throw("exon_count was not found in path line.");
331    $path->{trimmed_coverage} or
332	$self->throw("trimmed_coverage was not found in path line.");
333    $path->{percent_identity} or
334	$self->throw("percent_identity was not found in path line.");
335    $path->{query_start} or
336	$self->throw("query_start was not found in path line.");
337    $path->{query_end} or
338	$self->throw("query_end was not found in path line.");
339    $path->{whole_genome_start} or
340	$self->throw("whole_genome_start was not found in path line.");
341    $path->{whole_genome_end} or
342	$self->throw("whole_genome_end was not found in path line.");
343    $path->{chromosome} or
344	$self->throw("chromosome was not found in path line.");
345    $path->{chromo_start} or
346	$self->throw("chromo_start was not found in path line.");
347    $path->{chromo_end} or
348	$self->throw("chromo_end was not found in path line.");
349    $path->{strand} or $self->throw("strand was not found in path line.");
350    $path->{sense} or $self->throw("sense was not found in path line.");
351
352    return $path;
353}
354
355sub _parse_alignment_line {
356    my $self = shift;
357    my $a_line = shift;
358    my $align = {};
359
360    (
361     $align->{chromo_start},
362     $align->{chromo_end},
363     $align->{query_start},
364     $align->{query_end},
365     $align->{percent_identity},
366     $align->{align_length},
367     $align->{intron_length},
368    ) =
369	($_ =~ qr|
370                [\t]
371                ([\d]+)[ ]	# start in chromosome coord.
372                ([\d]+)[ ]	# end in chromosome coord.
373                ([\d]+)[ ]	# start in query coord.
374                ([\d]+)[ ]	# end in query coord.
375                ([\d]+) 	# percent identity (as integer)
376                [\t].*[\t]	# skip the edit script
377                ([\d]+) 	# length of alignment block.
378                [\t]*([\d]+)* 	# length of following intron.
379	 |x
380	);
381
382     $align->{chromo_start}
383 	or $self->throw("chromo_start missing in alignment line.");
384     $align->{chromo_end},
385 	or $self->throw("chromo_end was missing in alignment line.");
386     $align->{query_start},
387 	or $self->throw("query_start was missing in alignment line.");
388     $align->{query_end},
389 	or $self->throw("query_end was missing in alignment line.");
390     $align->{percent_identity},
391 	or $self->throw("percent_identity was missing in alignment line.");
392     $align->{align_length},
393 	or $self->throw("align_length was missing in alignment line.");
394
395    return $align;
396}
397
3981;
399