1# $Id: gmap_f9.pm 15987 2009-08-18 21:08:55Z lstein $ 2# 3# BioPerl module for Bio::SearchIO::gmap_f9 4# 5# Cared for by George Hartzell <hartzell@alerce.com> 6# 7# Copyright George Hartzell 8# 9# You may distribute this module under the same terms as perl itself 10 11# POD documentation - main docs before the code 12 13=head1 NAME 14 15Bio::SearchIO::gmap_f9 - Event generator for parsing gmap reports (Z format) 16 17=head1 SYNOPSIS 18 19 # Do not use this object directly - it is used as part of the 20 # Bio::SearchIO system. 21 22 use Bio::SearchIO; 23 my $searchio = Bio::SearchIO->new(-format => 'gmap', 24 -file => 't/data/her2.gmapz'); 25 while( my $result = $searchio->next_result ) { 26 while( my $hit = $result->next_hit ) { 27 while( my $hsp = $hit->next_hsp ) { 28 # ... 29 } 30 } 31 } 32 33 34=head1 DESCRIPTION 35 36This object encapsulated the necessary methods for generating events 37suitable for building Bio::Search objects from a GMAP "compressed" 38report (from gmap run with -Z flag) Read the L<Bio::SearchIO> for more 39information about how to use this. 40 41=head2 REVERSE STRAND AND BIOPERL COORDINATES 42 43I believe that I'm doing the correct thing when reporting hits on the 44negative strand of the genome. In particular, I've compared the 45"exons" this code generates with the set returned by ncbi's megablast 46web service. NCBI's hsp's are ordered differently and have a 47different genomic location (off by ~18,000,000 bases, padding?) but 48the starts, ends, and lengths were similar and my strand handling 49matches theirs. E.g. 50 51 CDNA GENOME 52 start end strand start end strand 53 54blast 55 1913 2989 1 86236731 86237808 -1 56 1 475 1 86260509 86260983 -1 57 1510 1727 1 86240259 86240476 -1 58 841 989 1 86243034 86243182 -1 59 1381 1514 1 86240630 86240763 -1 60 989 1122 1 86242457 86242590 -1 61 599 729 1 86247470 86247600 -1 62 473 608 1 86259972 86260107 -1 63 1255 1382 1 86240837 86240964 -1 64 730 842 1 86244040 86244152 -1 65 1813 1921 1 86238123 86238231 -1 66 1725 1814 1 86239747 86239836 -1 67 1167 1256 1 86241294 86241383 -1 68 1120 1188 1 86242319 86242387 -1 69 70gmap 71 1 475 1 104330509 104330983 -1 72 476 600 1 104329980 104330104 -1 73 601 729 1 104317470 104317598 -1 74 730 841 1 104314041 104314152 -1 75 842 989 1 104313034 104313181 -1 76 990 1121 1 104312458 104312589 -1 77 1122 1187 1 104312320 104312385 -1 78 1188 1256 1 104311294 104311362 -1 79 1257 1382 1 104310837 104310962 -1 80 1383 1511 1 104310633 104310761 -1 81 1512 1726 1 104310260 104310474 -1 82 1727 1814 1 104309747 104309834 -1 83 1815 1917 1 104308127 104308229 -1 84 1918 2989 1 104306731 104307802 -1 85 86=head1 FEEDBACK 87 88=head2 Mailing Lists 89 90User feedback is an integral part of the evolution of this and other 91Bioperl modules. Send your comments and suggestions preferably to 92the Bioperl mailing list. Your participation is much appreciated. 93 94 bioperl-l@bioperl.org - General discussion 95 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 96 97=head2 Reporting Bugs 98 99Report bugs to the Bioperl bug tracking system to help us keep track 100of the bugs and their resolution. Bug reports can be submitted via 101the web: 102 103 https://github.com/bioperl/bioperl-live/issues 104 105=head1 AUTHOR - George Hartzell 106 107Email hartzell@alerce.com 108 109=head1 CONTRIBUTORS 110 111Additional contributors names and emails here 112 113=head1 APPENDIX 114 115The rest of the documentation details each of the object methods. 116Internal methods are usually preceded with an underscore (_). 117 118=cut 119 120 121# Let the code begin... 122 123 124package Bio::SearchIO::gmap_f9; 125$Bio::SearchIO::gmap_f9::VERSION = '1.7.7'; 126use strict; 127use warnings; 128 129use Bio::Search::Hit::GenericHit; 130use Bio::Search::HSP::GenericHSP; 131 132use base qw(Bio::SearchIO ); 133 134use Data::Dumper; 135 136=head2 next_result 137 138 Title : next_result 139 Usage : $result = stream->next_result 140 Function: Reads the next ResultI object from the stream and returns it. 141 Returns : A Bio::Search::Result::ResultI object 142 Args : n/a 143 144=cut 145 146sub next_result { 147 my $self = shift; 148 149 my $info = []; 150 my $result; 151 my $hit; 152 my @hsp_info; 153 my $previous_hit_pos; 154 155 while ( $_ = $self->_readline ) { 156 if ( $_ =~ /^>/ ) { # looking at the start of a result 157 if ($result) { # and done if there's one in progress 158 $self->_pushback($_); 159 goto DONE; 160 } 161 else { # otherwise start a new one. 162 my ($id, $desc, $md5) = m|>([^ ]*)\s*(.*)\s*(?:md5:(.*))?|; 163 164 $result = Bio::Search::Result::GenericResult->new(); 165 $result->algorithm('gmap'); 166 $result->query_name($id); 167 $result->query_accession($id); 168 $result->query_description($desc); 169 #$self->warn("Take care of MD5!\n"); 170 171 $hit ||= Bio::Search::Hit::GenericHit->new( -name => 172 "NONE_SPECIFIED"); 173 } 174 } 175 else { # add another position to the hit/hsp 176 # 468 H 1956 C -14:104307764 2298317517 C H 177 # 468 1957 A -14:104307763 2298317516 A 178 my $c; # info about a column 179 ($c->{query_aa_pos}, $c->{query_aa}, $c->{query_pos}, 180 $c->{query_base}, 181 $c->{hit_strand}, $c->{hit_chromo}, $c->{hit_pos}, 182 $c->{hit_concat_pos}, $c->{hit_base}, $c->{hit_aa}) 183 = ($_ =~ 184 m| 185 (\d+)[ ]?(.)?[\t] 186 (\d+)[ ]?(.)?[\t] 187 # TODO chromosome isn't a number... X, Y, MT.... 188 (\+\|\-)([\dxXyY]+\|MT):(\d+)[ ](\d+)[ ](.) 189 [\t]?(.)? 190 |xo 191 ); 192 193 if ($previous_hit_pos && 194 (abs($c->{hit_pos} - $previous_hit_pos) > 1)) { 195 $hit ||= Bio::Search::Hit::GenericHit->new( -name => 196 "NONE_SPECIFIED", 197 ); 198 $hit->add_hsp( $self->_hsp_from_info(\@hsp_info) ); 199 @hsp_info = (); 200 } 201 push @hsp_info, $c; 202 $previous_hit_pos = $c->{hit_pos}; 203 } 204 } 205 206 DONE: 207 if ($result) { 208 $hit->add_hsp( $self->_hsp_from_info(\@hsp_info) ) if (@hsp_info); 209 210 my ($hit_length,$query_length); 211 for my $hsp ($hit->hsps) { 212 $hit_length += $hsp->length(); 213 $query_length += $hsp->length('query'); 214 } 215 $hit->length($hit_length); 216 $hit->query_length($query_length); 217 # update this now that we actually know something useful.q 218 $hit->name($hsp_info[0]->{hit_chromo}); 219 220 $result->add_hit($hit) if ($hit); 221 } 222 223 return($result); 224} 225 226 227 228sub _hsp_from_info { 229 my $self = shift; 230 my $info = shift; 231 my $a = {}; # args w/ which we'll create hsp 232 my $hsp; 233 my $identical; 234 235 $a->{-algorithm} = 'GMAP'; 236 237 for my $c (@{$info}) { 238 $a->{-query_seq} .= $c->{query_base}; 239 $a->{-hit_seq} .= $c->{hit_base}; 240 $a->{-homology_seq} .= $c->{query_base} eq $c->{hit_base} ? $c->{hit_base} : ' '; 241 $identical++ if ( $c->{query_base} eq $c->{hit_base} ); 242 } 243 244 $a->{-query_seq} =~ s| |\-|g; # switch to bioperl gaps. 245 $a->{-hit_seq} =~ s| |\-|g; 246 247 $a->{-conserved} = $a->{-identical} = $identical; 248 249 # use the coordinates from from gmap's -f 9 output to 250 # determine whether gmap revcomped the query sequence 251 # to generate the alignment. Note that this is not 252 # the same as the cDNA's sense/anti-sense-ness. 253 $a->{-stranded} = 'both'; 254 255 $a->{-query_start} = $info->[0]->{query_pos}; 256 $a->{-query_end} = $info->[-1]->{query_pos}; 257 $a->{-query_length} = $a->{-query_end} - $a->{-query_start} + 1; 258 259 # hit can be either strand, -f 9 output tells us which. 260 # we don't have to worry about it here, but telling the generichsp code 261 # that this hit is 'stranded', it compares the start and end positions 262 # sets it for us. 263 $a->{-hit_start} = $info->[0]->{hit_pos}; 264 $a->{-hit_end} = $info->[-1]->{hit_pos}; 265 266 $a->{-hit_length} = abs($a->{-hit_end} - $a->{-hit_start}) + 1; 267 268 $a->{-hsp_length} = 269 $a->{-query_length} > $a->{-hit_length} ? 270 $a->{-query_length} : $a->{-hit_length}; 271 272 $hsp = Bio::Search::HSP::GenericHSP->new( %$a ); 273 274 return $hsp; 275} 276 277# TODO (adjust regexp to swallow lines w/out md5 sig's. 278sub _parse_path_header { 279 my $self = shift; 280 my $path_line = shift; 281 my $path = {}; 282 283 ( 284 $path->{query}, 285 $path->{db}, 286 $path->{path_num}, 287 $path->{path_total_num}, 288 $path->{query_length}, 289 $path->{exon_count}, 290 $path->{trimmed_coverage}, 291 $path->{percent_identity}, 292 $path->{query_start}, 293 $path->{query_end}, 294 $path->{whole_genome_start}, 295 $path->{whole_genome_end}, 296 $path->{chromosome}, 297 $path->{chromo_start}, 298 $path->{chromo_end}, 299 $path->{strand}, 300 $path->{sense}, 301 $path->{md5}, 302 ) = 303 ($_ =~ qr| 304 > 305 ([^ ]*)[ ] # the query id}, followed by a space 306 ([^ ]*)[ ] # the genome database, followed by a space 307 (\d+)/(\d+)[ ] # path_num/path_total_num (e.g. 3/12) 308 (\d+)[ ] # query length, followed by a space 309 (\d+)[ ] # hsp/exon count, followed by a space 310 (\d+\.\d*)[ ] # trimmed coverage 311 (\d+\.\d*)[ ] # percent identity 312 (\d+)\.\.(\d+)[ ] # query start .. query end, followed by space 313 (\d+)\.\.(\d+)[ ] # whole genome s..e, followed by space 314 (\d+): # chromosome number 315 (\d+)\.\.(\d+)[ ] # chromo s..e, followed by a space 316 ([+-])[ ] # strand, followed by a space 317 dir:(.*) # dir:sense or dir:antisense 318 [ ]md5:([\dabcdefg]+) # md5 signature 319 |x 320 ); 321 322 $path->{query} or $self->throw("query was not found in path line."); 323 $path->{db} or $self->throw("db was not found in path line."); 324 $path->{path_num} or $self->throw("path_num was not found in path line."); 325 $path->{path_total_num} or 326 $self->throw("path_total_num was not found in path line."); 327 $path->{query_length} or 328 $self->throw("query_length was not found in path line."); 329 $path->{exon_count} or 330 $self->throw("exon_count was not found in path line."); 331 $path->{trimmed_coverage} or 332 $self->throw("trimmed_coverage was not found in path line."); 333 $path->{percent_identity} or 334 $self->throw("percent_identity was not found in path line."); 335 $path->{query_start} or 336 $self->throw("query_start was not found in path line."); 337 $path->{query_end} or 338 $self->throw("query_end was not found in path line."); 339 $path->{whole_genome_start} or 340 $self->throw("whole_genome_start was not found in path line."); 341 $path->{whole_genome_end} or 342 $self->throw("whole_genome_end was not found in path line."); 343 $path->{chromosome} or 344 $self->throw("chromosome was not found in path line."); 345 $path->{chromo_start} or 346 $self->throw("chromo_start was not found in path line."); 347 $path->{chromo_end} or 348 $self->throw("chromo_end was not found in path line."); 349 $path->{strand} or $self->throw("strand was not found in path line."); 350 $path->{sense} or $self->throw("sense was not found in path line."); 351 352 return $path; 353} 354 355sub _parse_alignment_line { 356 my $self = shift; 357 my $a_line = shift; 358 my $align = {}; 359 360 ( 361 $align->{chromo_start}, 362 $align->{chromo_end}, 363 $align->{query_start}, 364 $align->{query_end}, 365 $align->{percent_identity}, 366 $align->{align_length}, 367 $align->{intron_length}, 368 ) = 369 ($_ =~ qr| 370 [\t] 371 ([\d]+)[ ] # start in chromosome coord. 372 ([\d]+)[ ] # end in chromosome coord. 373 ([\d]+)[ ] # start in query coord. 374 ([\d]+)[ ] # end in query coord. 375 ([\d]+) # percent identity (as integer) 376 [\t].*[\t] # skip the edit script 377 ([\d]+) # length of alignment block. 378 [\t]*([\d]+)* # length of following intron. 379 |x 380 ); 381 382 $align->{chromo_start} 383 or $self->throw("chromo_start missing in alignment line."); 384 $align->{chromo_end}, 385 or $self->throw("chromo_end was missing in alignment line."); 386 $align->{query_start}, 387 or $self->throw("query_start was missing in alignment line."); 388 $align->{query_end}, 389 or $self->throw("query_end was missing in alignment line."); 390 $align->{percent_identity}, 391 or $self->throw("percent_identity was missing in alignment line."); 392 $align->{align_length}, 393 or $self->throw("align_length was missing in alignment line."); 394 395 return $align; 396} 397 3981; 399