1# POD documentation - main docs before the code 2 3=head1 NAME 4 5Bio::SeqIO::msout - input stream for output by Hudson's ms 6 7=head1 SYNOPSIS 8 9Do not use this module directly. Use it via the Bio::SeqIO class. 10 11=head1 DESCRIPTION 12 13ms ( Hudson, R. R. (2002) Generating samples under a Wright-Fisher neutral 14model. Bioinformatics 18:337-8 ) can be found at 15http://home.uchicago.edu/~rhudson1/source/mksamples.html. 16 17Currently, this object can be used to read output from ms into seq objects. 18However, because bioperl has no support for haplotypes created using an infinite 19sites model (where '1' identifies a derived allele and '0' identifies an 20ancestral allele), the sequences returned by msout are coded using A, T, C and 21G. To decode the bases, use the sequence conversion table (a hash) returned by 22get_base_conversion_table(). In the table, 4 and 5 are used when the ancestry is 23unclear. This should not ever happen when creating files with ms, but it will be 24used when creating msOUT files from a collection of seq objects ( To be added 25later ). Alternatively, use get_next_hap() to get a string with 1's and 0's 26instead of a seq object. 27 28=head2 Mapping to Finite Sites 29 30This object can now also be used to map haplotypes created using an infinite sites 31model to sequences of arbitrary finite length. See set_n_sites() for more detail. 32Thanks to Filipe G. Vieira <fgvieira@berkeley.edu> for the idea and code. 33 34=head1 FEEDBACK 35 36=head2 Mailing Lists 37 38User feedback is an integral part of the evolution of this and other 39Bioperl modules. Send your comments and suggestions preferably to the 40Bioperl mailing list. Your participation is much appreciated. 41 42 bioperl-l@bioperl.org - General discussion 43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 44 45=head2 Reporting Bugs 46 47Report bugs to the Bioperl bug tracking system to help us keep track 48of the bugs and their resolution. Bug reports can be submitted via the 49web: 50 51 https://github.com/bioperl/bioperl-live/issues 52 53=head1 AUTHOR - Warren Kretzschmar 54 55This module was written by Warren Kretzschmar 56 57email: wkretzsch@gmail.com 58 59This module grew out of a parser written by Aida Andres. 60 61=head1 COPYRIGHT 62 63=head2 Public Domain Notice 64 65This software/database is ``United States Government Work'' under the 66terms of the United States Copyright Act. It was written as part of 67the authors' official duties for the United States Government and thus 68cannot be copyrighted. This software/database is freely available to 69the public for use without a copyright notice. Restrictions cannot 70be placed on its present or future use. 71 72Although all reasonable efforts have been taken to ensure the accuracy 73and reliability of the software and data, the National Human Genome 74Research Institute (NHGRI) and the U.S. Government does not and cannot 75warrant the performance or results that may be obtained by using this 76software or data. NHGRI and the U.S. Government disclaims all 77warranties as to performance, merchantability or fitness for any 78particular purpose. 79 80=head1 METHODS 81 82=cut 83 84package Bio::SeqIO::msout; 85$Bio::SeqIO::msout::VERSION = '1.7.7'; 86use version; 87our $API_VERSION = qv('1.1.8'); 88 89use strict; 90use base qw(Bio::SeqIO); # This ISA Bio::SeqIO object 91use Bio::Seq::SeqFactory; 92 93=head2 Methods for Internal Use 94 95=head3 _initialize 96 97Title : _initialize 98Usage : $stream = Bio::SeqIO::msOUT->new($infile) 99Function: extracts basic information about the file. 100Returns : Bio::SeqIO object 101Args : no_og, gunzip, gzip, n_sites 102Details : 103 - include 'no_og' flag if the last population of an msout file contains 104 only one haplotype and you don't want the last haplotype to be 105 treated as the outgroup ( suggested when reading data created by ms ). 106 - including 'n_sites' (positive integer) causes all output haplotypes to be 107 mapped to a sequence of length 'n_sites'. See set_n_sites() for more details. 108 109=cut 110 111sub _initialize { 112 my ( $self, @args ) = @_; 113 $self->SUPER::_initialize(@args); 114 115 unless ( defined $self->sequence_factory ) { 116 $self->sequence_factory( Bio::Seq::SeqFactory->new() ); 117 } 118 my ($no_og) = $self->_rearrange( [qw(NO_OG)], @args ); 119 my ($n_sites) = $self->_rearrange( [qw(N_SITES)], @args ); 120 121 my %initial_values = ( 122 RUNS => undef, 123 N_SITES => undef, 124 SEGSITES => undef, 125 SEEDS => [], 126 MS_INFO_LINE => undef, 127 TOT_RUN_HAPS => undef, 128 POPS => [], 129 NEXT_RUN_NUM => undef, # What run is the next hap from? undef = EOF 130 LAST_READ_HAP_NUM => undef, # What did we just read from 131 LAST_HAPS_RUN_NUM => undef, 132 LAST_READ_POSITIONS => [], 133 LAST_READ_SEGSITES => undef, 134 BUFFER_HAP => undef, 135 NO_OUTGROUP => $no_og, 136 BASE_CONVERSION_TABLE_HASH_REF => { 137 'A' => 0, 138 'T' => 1, 139 'C' => 4, 140 'G' => 5, 141 }, 142 ); 143 foreach my $key ( keys %initial_values ) { 144 $self->{$key} = $initial_values{$key}; 145 } 146 $self->set_n_sites($n_sites); 147 148 # If the filehandle is defined open it and read a few lines 149 if ( ref( $self->{_filehandle} ) eq 'GLOB' ) { 150 $self->_read_start(); 151 return $self; 152 } 153 154 # Otherwise throw a warning 155 else { 156 $self->throw( 157"No filehandle defined. Please define a file handle through -file when calling msout with Bio::SeqIO" 158 ); 159 } 160} 161 162=head3 _read_start 163 164Title : _read_start 165Usage : $stream->_read_start() 166Function: reads from the filehandle $stream->{_filehandle} all information up to the first haplotype (sequence). Closes the filehandle if all lines have been read. 167Returns : void 168Args : none 169 170=cut 171 172sub _read_start { 173 my $self = shift; 174 175 my $fh_IN = $self->{_filehandle}; 176 177 # get the first five lines and parse for important info 178 my ( $ms_info_line, $seeds ) = $self->_get_next_clean_hap( $fh_IN, 2, 1 ); 179 180 my @ms_info_line = split( /\s+/, $ms_info_line ); 181 182 my ( $tot_pops, @pop_haplos ); 183 184 # Parsing the ms header line 185 shift @ms_info_line; 186 my $tot_run_haps = shift @ms_info_line; 187 my $runs = shift @ms_info_line; 188 my $segsites; 189 190 foreach my $word ( 0 .. $#ms_info_line ) { 191 if ( $ms_info_line[$word] eq '-I' ) { 192 $tot_pops = $ms_info_line[ $word + 1 ]; 193 for my $pop_num ( 1 .. $tot_pops ) { 194 push @pop_haplos, $ms_info_line[ $word + 1 + $pop_num ]; 195 } 196 197# if @pop_haplos contains a non-digit, then there is an error in the msinfo line. 198 if ( !defined $pop_haplos[-1] || $pop_haplos[-1] =~ /\D/ ) { 199 $self->throw( 200"Incorrect number of populations in the ms info line (after the -I specifier)" 201 ); 202 } 203 } 204 elsif ( $ms_info_line[$word] eq '-s' ) { 205 $segsites = $ms_info_line[ $word + 1 ]; 206 } 207 else { next; } 208 } 209 210 unless (@pop_haplos) { @pop_haplos = ($tot_run_haps) } 211 212 my @seeds = split( /\s+/, $seeds ); 213 214 # Save ms info data 215 $self->{RUNS} = $runs; 216 $self->{SEGSITES} = $segsites; 217 $self->{SEEDS} = \@seeds; 218 $self->{MS_INFO_LINE} = $ms_info_line; 219 $self->{TOT_RUN_HAPS} = $tot_run_haps; 220 $self->{POPS} = [@pop_haplos]; 221 222 return; 223} 224 225=head2 Methods to Access Data 226 227=head3 get_segsites 228 229Title : get_segsites 230Usage : $segsites = $stream->get_segsites() 231Function: returns the number of segsites in the msOUT file (according to the msOUT header line's -s option), or the current run's segsites if -s was not specified in the command line (in this case the number of segsites varies from run to run). 232Returns : scalar 233Args : NONE 234 235=cut 236 237sub get_segsites { 238 my $self = shift; 239 if ( defined $self->{SEGSITES} ) { 240 return $self->{SEGSITES}; 241 } 242 else { 243 return $self->get_current_run_segsites; 244 } 245} 246 247=head3 get_current_run_segsites 248 249Title : get_current_run_segsites 250Usage : $segsites = $stream->get_current_run_segsites() 251Function: returns the number of segsites in the run of the last read 252 haplotype (sequence). 253Returns : scalar 254Args : NONE 255 256=cut 257 258sub get_current_run_segsites { 259 my $self = shift; 260 return $self->{LAST_READ_SEGSITES}; 261} 262 263=head3 get_n_sites 264 265Title : get_n_sites 266Usage : $n_sites = $stream->get_n_sites() 267Function: Gets the number of total sites (variable or not) to be output. 268Returns : scalar if n_sites option is defined at call time of new() 269Args : NONE 270Note : 271 WARNING: Final sequence length might not be equal to n_sites if n_sites is 272 too close to number of segregating sites in the msout file. 273 274=cut 275 276sub get_n_sites { 277 my ($self) = @_; 278 279 return $self->{N_SITES}; 280} 281 282=head3 set_n_sites 283 284Title : set_n_sites 285Usage : $n_sites = $stream->set_n_sites($value) 286Function: Sets the number of total sites (variable or not) to be output. 287Returns : 1 on success; throws an error if $value is not a positive integer or undef 288Args : positive integer 289Note : 290 WARNING: Final sequence length might not be equal to n_sites if it is 291 too close to number of segregating sites. 292 - n_sites needs to be at least as large as the number of segsites of 293 the next haplotype returned 294 - n_sites may also be set to undef, in which case haplotypes are returned 295 under the infinite sites model assumptions. 296 297=cut 298 299sub set_n_sites { 300 my ( $self, $value ) = @_; 301 302 # make sure $value is a positive integer if it is defined 303 if ( defined $value ) { 304 $self->throw( 305"first argument needs to be a positive integer. argument supplied: $value" 306 ) unless ( $value =~ m/^\d+$/ && $value > 0 ); 307 } 308 $self->{N_SITES} = $value; 309 310 return 1; 311} 312 313=head3 get_runs 314 315Title : get_runs 316Usage : $runs = $stream->get_runs() 317Function: returns the number of runs in the msOUT file (according to the 318 msinfo line) 319Returns : scalar 320Args : NONE 321 322=cut 323 324sub get_runs { 325 my $self = shift; 326 return $self->{RUNS}; 327} 328 329=head3 get_Seeds 330 331Title : get_Seeds 332Usage : @seeds = $stream->get_Seeds() 333Function: returns an array of the seeds used in the creation of the msOUT file. 334Returns : array 335Args : NONE 336Details : In older versions, ms used three seeds. Newer versions of ms seem to 337 use only one (longer) seed. This function will return all the seeds 338 found. 339 340=cut 341 342sub get_Seeds { 343 my $self = shift; 344 return @{ $self->{SEEDS} }; 345} 346 347=head3 get_Positions 348 349Title : get_Positions 350Usage : @positions = $stream->get_Positions() 351Function: returns an array of the names of each segsite of the run of the last 352 read hap. 353Returns : array 354Args : NONE 355Details : The Positions may or may not vary from run to run depending on the 356 options used with ms. 357 358=cut 359 360sub get_Positions { 361 my $self = shift; 362 return @{ $self->{LAST_READ_POSITIONS} }; 363} 364 365=head3 get_tot_run_haps 366 367Title : get_tot_run_haps 368Usage : $number_of_haps_per_run = $stream->get_tot_run_haps() 369Function: returns the number of haplotypes (sequences) in each run of the msOUT 370 file ( according to the msinfo line ). 371Returns : scalar >= 0 372Args : NONE 373Details : This number should not vary from run to run. 374 375=cut 376 377sub get_tot_run_haps { 378 my $self = shift; 379 return $self->{TOT_RUN_HAPS}; 380} 381 382=head3 get_ms_info_line 383 384Title : get_ms_info_line 385Usage : $ms_info_line = $stream->get_ms_info_line() 386Function: returns the header line of the msOUT file. 387Returns : scalar 388Args : NONE 389 390=cut 391 392sub get_ms_info_line { 393 my $self = shift; 394 return $self->{MS_INFO_LINE}; 395} 396 397=head3 tot_haps 398 399Title : tot_haps 400Usage : $number_of_haplotypes_in_file = $stream->tot_haps() 401Function: returns the number of haplotypes (sequences) in the msOUT file. 402 Information gathered from msOUT header line. 403Returns : scalar 404Args : NONE 405 406=cut 407 408sub get_tot_haps { 409 my $self = shift; 410 return ( $self->{TOT_RUN_HAPS} * $self->{RUNS} ); 411} 412 413=head3 get_Pops 414 415Title : get_Pops 416Usage : @pops = $stream->pops() 417Function: returns an array of population sizes (order taken from the -I flag in 418 the msOUT header line). This array will include the last hap even if 419 it looks like an outgroup. 420Returns : array of scalars > 0 421Args : NONE 422 423=cut 424 425sub get_Pops { 426 my $self = shift; 427 return @{ $self->{POPS} }; 428} 429 430=head3 get_next_run_num 431 432Title : get_next_run_num 433Usage : $next_run_number = $stream->next_run_num() 434Function: returns the number of the ms run that the next haplotype (sequence) 435 will be taken from (starting at 1). Returns undef if the complete 436 file has been read. 437Returns : scalar > 0 or undef 438Args : NONE 439 440=cut 441 442sub get_next_run_num { 443 my $self = shift; 444 return $self->{NEXT_RUN_NUM}; 445} 446 447=head3 get_last_haps_run_num 448 449Title : get_last_haps_run_num 450Usage : $last_haps_run_number = $stream->get_last_haps_run_num() 451Function: returns the number of the ms run that the last haplotype (sequence) 452 was taken from (starting at 1). Returns undef if no hap has been 453 read yet. 454Returns : scalar > 0 or undef 455Args : NONE 456 457=cut 458 459sub get_last_haps_run_num { 460 my $self = shift; 461 return $self->{LAST_HAPS_RUN_NUM}; 462} 463 464=head3 get_last_read_hap_num 465 466Title : get_last_read_hap_num 467Usage : $last_read_hap_num = $stream->get_last_read_hap_num() 468Function: returns the number (starting with 1) of the last haplotype read from 469 the ms file 470Returns : scalar >= 0 471Args : NONE 472Details : 0 means that no haplotype has been read yet. Is reset to 0 every run. 473 474=cut 475 476sub get_last_read_hap_num { 477 my $self = shift; 478 return $self->{LAST_READ_HAP_NUM}; 479} 480 481=head3 outgroup 482 483Title : outgroup 484Usage : $outgroup = $stream->outgroup() 485Function: returns '1' if the msOUT stream has an outgroup. Returns '0' 486 otherwise. 487Returns : '1' or '0' 488Args : NONE 489Details : This method will return '1' only if the last population in the msOUT 490 file contains only one haplotype. If the last population is not an 491 outgroup then create the msOUT object using 'no_og' as input flag. 492 Also, return 0, if the run has only one population. 493 494=cut 495 496sub outgroup { 497 my $self = shift; 498 my @pops = $self->get_Pops; 499 if ( $pops[$#pops] == 1 && !defined $self->{NO_OUTGROUP} && @pops > 1 ) { 500 return 1; 501 } 502 else { return 0; } 503} 504 505=head3 get_next_haps_pop_num 506 507Title : get_next_haps_pop_num 508Usage : ($next_haps_pop_num, $num_haps_left_in_pop) = $stream->get_next_haps_pop_num() 509Function: First return value is the population number (starting with 1) the 510 next hap will come from. The second return value is the number of haps 511 left to read in the population from which the next hap will come. 512Returns : (scalar > 0, scalar > 0) 513Args : NONE 514 515=cut 516 517sub get_next_haps_pop_num { 518 my $self = shift; 519 my $last_read_hap = $self->get_last_read_hap_num; 520 my @pops = $self->get_Pops; 521 522 foreach my $pop_num ( 0 .. $#pops ) { 523 if ( $last_read_hap < $pops[$pop_num] ) { 524 return ( $pop_num + 1, $pops[$pop_num] - $last_read_hap ); 525 } 526 else { $last_read_hap -= $pops[$pop_num] } 527 } 528 529 # In this case we're at the beginning of the next run 530 return ( 1, $pops[0] ); 531} 532 533=head3 get_next_seq 534 535Title : get_next_seq 536Usage : $seq = $stream->get_next_seq() 537Function: reads and returns the next sequence (haplotype) in the stream 538Returns : Bio::Seq object or void if end of file 539Args : NONE 540Note : This function is included only to conform to convention. The 541 returned Bio::Seq object holds a halpotype in coded form. Use the hash 542 returned by get_base_conversion_table() to convert 'A', 'T', 'C', 'G' 543 back into 1,2,4 and 5. Use get_next_hap() to retrieve the halptoype as 544 a string of 1,2,4 and 5s instead. 545 546=cut 547 548sub get_next_seq { 549 my $self = shift; 550 my $seqstring = $self->get_next_hap; 551 552 return unless ($seqstring); 553 554 # Used to create unique ID; 555 my $run = $self->get_last_haps_run_num; 556 557 # Converting numbers to letters so that the haplotypes can be stored as a 558 # seq object 559 my $rh_base_conversion_table = $self->get_base_conversion_table; 560 foreach my $base ( keys %{$rh_base_conversion_table} ) { 561 $seqstring =~ s/($rh_base_conversion_table->{$base})/$base/g; 562 } 563 564 # Fill in non-variable positions 565 my $segsites = $self->get_current_run_segsites; 566 my $n_sites = $self->get_n_sites; 567 if ( defined($n_sites) ) { 568 569 # make sure that n_sites is at least as large 570 # as segsites for each run. Throw an exception otherwise. 571 $self->throw( "n_sites:\t$n_sites" 572 . "\nsegsites:\t$segsites" 573 . "\nrun:\t$run" 574 . "\nn_sites needs to be at least the number of segsites of every run" 575 ) unless $segsites <= $n_sites; 576 577 my $seq_len = 0; 578 my @seq; 579 my @pos = $self->get_Positions; 580 for ( my $i = 0 ; $i <= $#pos ; $i++ ) { 581 $pos[$i] *= $n_sites; 582 my $rpt = $pos[$i] - 1 - $seq_len; 583 $rpt = $rpt >= 1 ? $rpt : 0; 584 push( @seq, "A" x ( $rpt ) ); 585 $seq_len += length( $seq[-1] ); 586 push( @seq, substr( $seqstring, $i, 1 ) ); 587 $seq_len += length( $seq[-1] ); 588 } 589 push( @seq, "A" x ( $n_sites - $seq_len ) ); 590 $seqstring = join( "", @seq ); 591 } 592 593 my $last_read_hap = $self->get_last_read_hap_num; 594 my $id = 'Hap_' . $last_read_hap . '_Run_' . $run; 595 my $description = 596 "Segsites $segsites;" 597 . " Positions " 598 . ( defined $n_sites ? $n_sites : $segsites ) . ";" 599 . " Haplotype $last_read_hap;" 600 . " Run $run;"; 601 my $seq = $self->sequence_factory->create( 602 -seq => $seqstring, 603 -id => $id, 604 -desc => $description, 605 -alphabet => q(dna), 606 -direct => 1, 607 ); 608 609 return $seq 610 611} 612 613=head3 next_seq 614 615Title : next_seq 616Usage : $seq = $stream->next_seq() 617Function: Alias to get_next_seq() 618Returns : Bio::Seq object or void if end of file 619Args : NONE 620Note : This function is only included for convention. It calls get_next_seq(). 621 See get_next_seq() for details. 622 623=cut 624 625sub next_seq { 626 my $self = shift; 627 return $self->get_next_seq(); 628} 629 630=head3 get_next_hap 631 632Title : get_next_hap 633Usage : $hap = $stream->next_hap() 634Function: reads and returns the next sequence (haplotype) in the stream. 635 Returns undef if all sequences in stream have been read. 636Returns : Haplotype string (e.g. '110110000101101045454000101' 637Args : NONE 638Note : Use get_next_seq() if you want the halpotype returned as a 639 Bio::Seq object. 640 641=cut 642 643sub get_next_hap { 644 my $self = shift; 645 646 # Let's figure out how many haps to read from the input file so that 647 # we get back to the beginning of the next run. 648 649 my $end_run = 0; 650 if ( $self->{TOT_RUN_HAPS} == $self->{LAST_READ_HAP_NUM} + 1 ) { 651 $end_run = 1; 652 } 653 654 # Setting last_haps_run_num 655 $self->{LAST_HAPS_RUN_NUM} = $self->get_next_run_num; 656 657 my $last_read_hap = $self->get_last_read_hap_num; 658 my ($seqstring) = 659 $self->_get_next_clean_hap( $self->{_filehandle}, 1, $end_run ); 660 if ( !defined $seqstring && $last_read_hap < $self->get_tot_haps ) { 661 $self->throw( 662"msout file has only $last_read_hap hap(s), which is less than indicated in msinfo line ( " 663 . $self->get_tot_haps 664 . " )" ); 665 } 666 667 return $seqstring; 668} 669 670=head3 get_next_pop 671 672Title : get_next_pop 673Usage : @seqs = $stream->next_pop() 674Function: reads and returns all the remaining sequences (haplotypes) in the 675 population of the next sequence. Returns an empty list if no more 676 haps remain to be read in the stream 677Returns : array of Bio::Seq objects 678Args : NONE 679 680=cut 681 682sub get_next_pop { 683 my $self = shift; 684 685 # Let's figure out how many haps to read from the input file so that 686 # we get back to the beginning of the next run. 687 688 my @pops = $self->get_Pops; 689 my @seqs; # holds Bio::Seq objects to return 690 691 # Determine number of the pop that the next hap will be taken from 692 my ( $next_haps_pop_num, $haps_to_pull ) = $self->get_next_haps_pop_num; 693 694 # If $haps_to_pull == 0, then we need to pull the whole population 695 if ( $haps_to_pull == 0 ) { 696 $haps_to_pull = $pops[ $next_haps_pop_num - 1 ]; 697 } 698 699 for ( 1 .. $haps_to_pull ) { 700 my $seq = $self->get_next_seq; 701 next unless defined $seq; 702 703 # Add Population number information to description 704 $seq->display_id(" Population number $next_haps_pop_num;"); 705 push @seqs, $seq; 706 } 707 708 return @seqs; 709} 710 711=head3 next_run 712 713Title : next_run 714Usage : @seqs = $stream->next_run() 715Function: reads and returns all the remaining sequences (haplotypes) in the ms 716 run of the next sequence. Returns an empty list if all haps have been 717 read from the stream. 718Returns : array of Bio::Seq objects 719Args : NONE 720 721=cut 722 723sub get_next_run { 724 my $self = shift; 725 726 # Let's figure out how many haps to read from the input file so that 727 # we get back to the beginning of the next run. 728 729 my ( $next_haps_pop_num, $haps_to_pull ) = $self->get_next_haps_pop_num; 730 my @seqs; 731 732 my @pops = $self->get_Pops; 733 734 foreach ( $next_haps_pop_num .. $#pops ) { 735 $haps_to_pull += $pops[$_]; 736 } 737 738 # Read those haps from the input file 739 # Next hap read will be the first hap of the first pop of the next run. 740 741 for ( 1 .. $haps_to_pull ) { 742 my $seq = $self->get_next_seq; 743 next unless defined $seq; 744 745 push @seqs, $seq; 746 } 747 748 return @seqs; 749} 750 751=head2 Methods to Retrieve Constants 752 753=head3 base_conversion_table 754 755Title : get_base_conversion_table 756Usage : $table_hash_ref = $stream->get_base_conversion_table() 757Function: returns a reference to a hash. The keys of the hash are the letters ' 758 A','T','G','C'. The values associated with each key are the value that 759 each letter in the sequence of a seq object returned by a 760 Bio::SeqIO::msout stream should be translated to. 761Returns : reference to a hash 762Args : NONE 763Synopsis: 764 765 # retrieve the Bio::Seq object's sequence 766 my $haplotype = $seq->seq; 767 768 # need to convert all letters to their corresponding numbers. 769 foreach my $base (keys %{$rh_base_conversion_table}){ 770 $haplotype =~ s/($base)/$rh_base_conversion_table->{$base}/g; 771 } 772 773 # $haplotype is now an ms style haplotype. (e.g. '100101101455') 774 775=cut 776 777sub get_base_conversion_table { 778 my $self = shift; 779 return $self->{BASE_CONVERSION_TABLE_HASH_REF}; 780} 781 782############################################################################## 783## subs for internal use only 784############################################################################## 785 786sub _get_next_clean_hap { 787 788 #By Warren Kretzschmar 789 790 # return the next non-empty line from file handle (chomped line) 791 # skipps to the next run if '//' is encountered 792 my ( $self, $fh, $times, $end_run ) = @_; 793 my @data; 794 795 unless ( ref($fh) eq q(GLOB) ) { return; } 796 797 unless ( defined $times && $times > 0 ) { 798 $times = 1; 799 } 800 801 if ( defined $self->{BUFFER_HAP} ) { 802 push @data, $self->{BUFFER_HAP}; 803 $self->{BUFFER_HAP} = undef; 804 $self->{LAST_READ_HAP_NUM}++; 805 $times--; 806 } 807 808 while ( 1 <= $times-- ) { 809 810 # Find next clean line 811 my $data = <$fh>; 812 last if !defined($data); 813 chomp $data; 814 while ( $data !~ /./ ) { 815 $data = <$fh>; 816 chomp $data; 817 } 818 819 # If the next run is encountered here, then we have a programming 820 # or format error 821 if ( $data eq '//' ) { $self->throw("'//' found when not expected\n") } 822 823 $self->{LAST_READ_HAP_NUM}++; 824 push @data, $data; 825 } 826 827 if ($end_run) { 828 $self->_load_run_info($fh); 829 } 830 831 return (@data); 832} 833 834sub _load_run_info { 835 836 my ( $self, $fh ) = @_; 837 838 my $data = <$fh>; 839 840 # getting rid of excess newlines 841 while ( defined($data) && $data !~ /./ ) { 842 $data = <$fh>; 843 } 844 845 # In this case we are at EOF 846 if ( !defined($data) ) { $self->{NEXT_RUN_NUM} = undef; return; } 847 848 while ( $data !~ /./ ) { 849 $data = <$fh>; 850 chomp $data; 851 } 852 853 chomp $data; 854 855 # If the next run is encountered, then skip to the next hap and save it in 856 # the buffer. 857 if ( $data eq '//' ) { 858 $self->{NEXT_RUN_NUM}++; 859 $self->{LAST_READ_HAP_NUM} = 0; 860 for ( 1 .. 3 ) { 861 $data = <$fh>; 862 while ( $data !~ /./ ) { 863 $data = <$fh>; 864 chomp $data; 865 } 866 chomp $data; 867 868 if ( $_ eq '1' ) { 869 my @sites = split( /\s+/, $data ); 870 $self->{LAST_READ_SEGSITES} = $sites[1]; 871 } 872 elsif ( $_ eq '2' ) { 873 my @positions = split( /\s+/, $data ); 874 shift @positions; 875 $self->{LAST_READ_POSITIONS} = \@positions; 876 } 877 else { 878 if ( !defined($data) ) { 879 $self->throw("run $self->{NEXT_RUN_NUM} has no haps./n"); 880 } 881 $self->{BUFFER_HAP} = $data; 882 } 883 } 884 } 885 else { 886 $self->throw( 887"'//' not encountered when expected. There are more haplos in one of the msOUT runs than advertised in the msinfo line." 888 ); 889 } 890 891} 8921; 893