1# POD documentation - main docs before the code
2
3=head1 NAME
4
5Bio::SeqIO::msout - input stream for output by Hudson's ms
6
7=head1 SYNOPSIS
8
9Do not use this module directly.  Use it via the Bio::SeqIO class.
10
11=head1 DESCRIPTION
12
13ms ( Hudson, R. R. (2002) Generating samples under a Wright-Fisher neutral
14model. Bioinformatics 18:337-8 ) can be found at
15http://home.uchicago.edu/~rhudson1/source/mksamples.html.
16
17Currently, this object can be used to read output from ms into seq objects.
18However, because bioperl has no support for haplotypes created using an infinite
19sites model (where '1' identifies a derived allele and '0' identifies an
20ancestral allele), the sequences returned by msout are coded using A, T, C and
21G. To decode the bases, use the sequence conversion table (a hash) returned by
22get_base_conversion_table(). In the table, 4 and 5 are used when the ancestry is
23unclear. This should not ever happen when creating files with ms, but it will be
24used when creating msOUT files from a collection of seq objects ( To be added
25later ). Alternatively, use get_next_hap() to get a string with 1's and 0's
26instead of a seq object.
27
28=head2 Mapping to Finite Sites
29
30This object can now also be used to map haplotypes created using an infinite sites
31model to sequences of arbitrary finite length.  See set_n_sites() for more detail.
32Thanks to Filipe G. Vieira <fgvieira@berkeley.edu> for the idea and code.
33
34=head1 FEEDBACK
35
36=head2 Mailing Lists
37
38User feedback is an integral part of the evolution of this and other
39Bioperl modules. Send your comments and suggestions preferably to the
40Bioperl mailing list. Your participation is much appreciated.
41
42  bioperl-l@bioperl.org                  - General discussion
43  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
44
45=head2 Reporting Bugs
46
47Report bugs to the Bioperl bug tracking system to help us keep track
48of the bugs and their resolution. Bug reports can be submitted via the
49web:
50
51  https://github.com/bioperl/bioperl-live/issues
52
53=head1 AUTHOR - Warren Kretzschmar
54
55This module was written by Warren Kretzschmar
56
57email: wkretzsch@gmail.com
58
59This module grew out of a parser written by Aida Andres.
60
61=head1 COPYRIGHT
62
63=head2 Public Domain Notice
64
65This software/database is ``United States Government Work'' under the
66terms of the United States Copyright Act. It was written as part of
67the authors' official duties for the United States Government and thus
68cannot be copyrighted. This software/database is freely available to
69the public for use without a copyright notice. Restrictions cannot
70be placed on its present or future use.
71
72Although all reasonable efforts have been taken to ensure the accuracy
73and reliability of the software and data, the National Human Genome
74Research Institute (NHGRI) and the U.S. Government does not and cannot
75warrant the performance or results that may be obtained by using this
76software or data.  NHGRI and the U.S. Government disclaims all
77warranties as to performance, merchantability or fitness for any
78particular purpose.
79
80=head1 METHODS
81
82=cut
83
84package Bio::SeqIO::msout;
85$Bio::SeqIO::msout::VERSION = '1.7.7';
86use version;
87our $API_VERSION = qv('1.1.8');
88
89use strict;
90use base qw(Bio::SeqIO);    # This ISA Bio::SeqIO object
91use Bio::Seq::SeqFactory;
92
93=head2 Methods for Internal Use
94
95=head3 _initialize
96
97Title   : _initialize
98Usage   : $stream = Bio::SeqIO::msOUT->new($infile)
99Function: extracts basic information about the file.
100Returns : Bio::SeqIO object
101Args    : no_og, gunzip, gzip, n_sites
102Details	:
103    - include 'no_og' flag if the last population of an msout file contains
104      only one haplotype and you don't want the last haplotype to be
105      treated as the outgroup ( suggested when reading data created by ms ).
106    - including 'n_sites' (positive integer) causes all output haplotypes to be
107      mapped to a sequence of length 'n_sites'. See set_n_sites() for more details.
108
109=cut
110
111sub _initialize {
112    my ( $self, @args ) = @_;
113    $self->SUPER::_initialize(@args);
114
115    unless ( defined $self->sequence_factory ) {
116        $self->sequence_factory( Bio::Seq::SeqFactory->new() );
117    }
118    my ($no_og)   = $self->_rearrange( [qw(NO_OG)],   @args );
119    my ($n_sites) = $self->_rearrange( [qw(N_SITES)], @args );
120
121    my %initial_values = (
122        RUNS              => undef,
123        N_SITES           => undef,
124        SEGSITES          => undef,
125        SEEDS             => [],
126        MS_INFO_LINE      => undef,
127        TOT_RUN_HAPS      => undef,
128        POPS              => [],
129        NEXT_RUN_NUM      => undef, # What run is the next hap from? undef = EOF
130        LAST_READ_HAP_NUM => undef, # What did we just read from
131        LAST_HAPS_RUN_NUM => undef,
132        LAST_READ_POSITIONS            => [],
133        LAST_READ_SEGSITES             => undef,
134        BUFFER_HAP                     => undef,
135        NO_OUTGROUP                    => $no_og,
136        BASE_CONVERSION_TABLE_HASH_REF => {
137            'A' => 0,
138            'T' => 1,
139            'C' => 4,
140            'G' => 5,
141        },
142    );
143    foreach my $key ( keys %initial_values ) {
144        $self->{$key} = $initial_values{$key};
145    }
146    $self->set_n_sites($n_sites);
147
148    # If the filehandle is defined open it and read a few lines
149    if ( ref( $self->{_filehandle} ) eq 'GLOB' ) {
150        $self->_read_start();
151        return $self;
152    }
153
154    # Otherwise throw a warning
155    else {
156        $self->throw(
157"No filehandle defined.  Please define a file handle through -file when calling msout with Bio::SeqIO"
158        );
159    }
160}
161
162=head3 _read_start
163
164Title   : _read_start
165Usage   : $stream->_read_start()
166Function: reads from the filehandle $stream->{_filehandle} all information up to the first haplotype (sequence).  Closes the filehandle if all lines have been read.
167Returns : void
168Args    : none
169
170=cut
171
172sub _read_start {
173    my $self = shift;
174
175    my $fh_IN = $self->{_filehandle};
176
177    # get the first five lines and parse for important info
178    my ( $ms_info_line, $seeds ) = $self->_get_next_clean_hap( $fh_IN, 2, 1 );
179
180    my @ms_info_line = split( /\s+/, $ms_info_line );
181
182    my ( $tot_pops, @pop_haplos );
183
184    # Parsing the ms header line
185    shift @ms_info_line;
186    my $tot_run_haps = shift @ms_info_line;
187    my $runs         = shift @ms_info_line;
188    my $segsites;
189
190    foreach my $word ( 0 .. $#ms_info_line ) {
191        if ( $ms_info_line[$word] eq '-I' ) {
192            $tot_pops = $ms_info_line[ $word + 1 ];
193            for my $pop_num ( 1 .. $tot_pops ) {
194                push @pop_haplos, $ms_info_line[ $word + 1 + $pop_num ];
195            }
196
197# if @pop_haplos contains a non-digit, then there is an error in the msinfo line.
198            if ( !defined $pop_haplos[-1] || $pop_haplos[-1] =~ /\D/ ) {
199                $self->throw(
200"Incorrect number of populations in the ms info line (after the -I specifier)"
201                );
202            }
203        }
204        elsif ( $ms_info_line[$word] eq '-s' ) {
205            $segsites = $ms_info_line[ $word + 1 ];
206        }
207        else { next; }
208    }
209
210    unless (@pop_haplos) { @pop_haplos = ($tot_run_haps) }
211
212    my @seeds = split( /\s+/, $seeds );
213
214    # Save ms info data
215    $self->{RUNS}         = $runs;
216    $self->{SEGSITES}     = $segsites;
217    $self->{SEEDS}        = \@seeds;
218    $self->{MS_INFO_LINE} = $ms_info_line;
219    $self->{TOT_RUN_HAPS} = $tot_run_haps;
220    $self->{POPS}         = [@pop_haplos];
221
222    return;
223}
224
225=head2 Methods to Access Data
226
227=head3 get_segsites
228
229Title   : get_segsites
230Usage   : $segsites = $stream->get_segsites()
231Function: returns the number of segsites in the msOUT file (according to the msOUT header line's -s option), or the current run's segsites if -s was not specified in the command line (in this case the number of segsites varies from run to run).
232Returns : scalar
233Args    : NONE
234
235=cut
236
237sub get_segsites {
238    my $self = shift;
239    if ( defined $self->{SEGSITES} ) {
240        return $self->{SEGSITES};
241    }
242    else {
243        return $self->get_current_run_segsites;
244    }
245}
246
247=head3 get_current_run_segsites
248
249Title   : get_current_run_segsites
250Usage   : $segsites = $stream->get_current_run_segsites()
251Function: returns the number of segsites in the run of the last read
252          haplotype (sequence).
253Returns : scalar
254Args    : NONE
255
256=cut
257
258sub get_current_run_segsites {
259    my $self = shift;
260    return $self->{LAST_READ_SEGSITES};
261}
262
263=head3 get_n_sites
264
265Title   : get_n_sites
266Usage   : $n_sites = $stream->get_n_sites()
267Function: Gets the number of total sites (variable or not) to be output.
268Returns : scalar if n_sites option is defined at call time of new()
269Args    : NONE
270Note    :
271          WARNING: Final sequence length might not be equal to n_sites if n_sites is
272                   too close to number of segregating sites in the msout file.
273
274=cut
275
276sub get_n_sites {
277    my ($self) = @_;
278
279    return $self->{N_SITES};
280}
281
282=head3 set_n_sites
283
284Title   : set_n_sites
285Usage   : $n_sites = $stream->set_n_sites($value)
286Function: Sets the number of total sites (variable or not) to be output.
287Returns : 1 on success; throws an error if $value is not a positive integer or undef
288Args    : positive integer
289Note    :
290          WARNING: Final sequence length might not be equal to n_sites if it is
291                   too close to number of segregating sites.
292          - n_sites needs to be at least as large as the number of segsites of
293            the next haplotype returned
294          - n_sites may also be set to undef, in which case haplotypes are returned
295            under the infinite sites model assumptions.
296
297=cut
298
299sub set_n_sites {
300    my ( $self, $value ) = @_;
301
302    # make sure $value is a positive integer if it is defined
303    if ( defined $value ) {
304        $self->throw(
305"first argument needs to be a positive integer. argument supplied: $value"
306        ) unless ( $value =~ m/^\d+$/ && $value > 0 );
307    }
308    $self->{N_SITES} = $value;
309
310    return 1;
311}
312
313=head3 get_runs
314
315Title   : get_runs
316Usage   : $runs = $stream->get_runs()
317Function: returns the number of runs in the msOUT file (according to the
318          msinfo line)
319Returns : scalar
320Args    : NONE
321
322=cut
323
324sub get_runs {
325    my $self = shift;
326    return $self->{RUNS};
327}
328
329=head3 get_Seeds
330
331Title   : get_Seeds
332Usage   : @seeds = $stream->get_Seeds()
333Function: returns an array of the seeds used in the creation of the msOUT file.
334Returns : array
335Args    : NONE
336Details : In older versions, ms used three seeds.  Newer versions of ms seem to
337          use only one (longer) seed.  This function will return all the seeds
338          found.
339
340=cut
341
342sub get_Seeds {
343    my $self = shift;
344    return @{ $self->{SEEDS} };
345}
346
347=head3 get_Positions
348
349Title   : get_Positions
350Usage   : @positions = $stream->get_Positions()
351Function: returns an array of the names of each segsite of the run of the last
352          read hap.
353Returns : array
354Args    : NONE
355Details : The Positions may or may not vary from run to run depending on the
356          options used with ms.
357
358=cut
359
360sub get_Positions {
361    my $self = shift;
362    return @{ $self->{LAST_READ_POSITIONS} };
363}
364
365=head3 get_tot_run_haps
366
367Title   : get_tot_run_haps
368Usage   : $number_of_haps_per_run = $stream->get_tot_run_haps()
369Function: returns the number of haplotypes (sequences) in each run of the msOUT
370          file ( according to the msinfo line ).
371Returns : scalar >= 0
372Args    : NONE
373Details : This number should not vary from run to run.
374
375=cut
376
377sub get_tot_run_haps {
378    my $self = shift;
379    return $self->{TOT_RUN_HAPS};
380}
381
382=head3 get_ms_info_line
383
384Title   : get_ms_info_line
385Usage   : $ms_info_line = $stream->get_ms_info_line()
386Function: returns the header line of the msOUT file.
387Returns : scalar
388Args    : NONE
389
390=cut
391
392sub get_ms_info_line {
393    my $self = shift;
394    return $self->{MS_INFO_LINE};
395}
396
397=head3 tot_haps
398
399Title   : tot_haps
400Usage   : $number_of_haplotypes_in_file = $stream->tot_haps()
401Function: returns the number of haplotypes (sequences) in the msOUT file.
402          Information gathered from msOUT header line.
403Returns : scalar
404Args    : NONE
405
406=cut
407
408sub get_tot_haps {
409    my $self = shift;
410    return ( $self->{TOT_RUN_HAPS} * $self->{RUNS} );
411}
412
413=head3 get_Pops
414
415Title   : get_Pops
416Usage   : @pops = $stream->pops()
417Function: returns an array of population sizes (order taken from the -I flag in
418          the msOUT header line).  This array will include the last hap even if
419          it looks like an outgroup.
420Returns : array of scalars > 0
421Args    : NONE
422
423=cut
424
425sub get_Pops {
426    my $self = shift;
427    return @{ $self->{POPS} };
428}
429
430=head3 get_next_run_num
431
432Title   : get_next_run_num
433Usage   : $next_run_number = $stream->next_run_num()
434Function: returns the number of the ms run that the next haplotype (sequence)
435          will be taken from (starting at 1).  Returns undef if the complete
436          file has been read.
437Returns : scalar > 0 or undef
438Args    : NONE
439
440=cut
441
442sub get_next_run_num {
443    my $self = shift;
444    return $self->{NEXT_RUN_NUM};
445}
446
447=head3 get_last_haps_run_num
448
449Title   : get_last_haps_run_num
450Usage   : $last_haps_run_number = $stream->get_last_haps_run_num()
451Function: returns the number of the ms run that the last haplotype (sequence)
452          was taken from (starting at 1).  Returns undef if no hap has been
453          read yet.
454Returns : scalar > 0 or undef
455Args    : NONE
456
457=cut
458
459sub get_last_haps_run_num {
460    my $self = shift;
461    return $self->{LAST_HAPS_RUN_NUM};
462}
463
464=head3 get_last_read_hap_num
465
466Title   : get_last_read_hap_num
467Usage   : $last_read_hap_num = $stream->get_last_read_hap_num()
468Function: returns the number (starting with 1) of the last haplotype read from
469          the ms file
470Returns : scalar >= 0
471Args    : NONE
472Details	: 0 means that no haplotype has been read yet.  Is reset to 0 every run.
473
474=cut
475
476sub get_last_read_hap_num {
477    my $self = shift;
478    return $self->{LAST_READ_HAP_NUM};
479}
480
481=head3 outgroup
482
483Title   : outgroup
484Usage   : $outgroup = $stream->outgroup()
485Function: returns '1' if the msOUT stream has an outgroup.  Returns '0'
486          otherwise.
487Returns : '1' or '0'
488Args    : NONE
489Details	: This method will return '1' only if the last population in the msOUT
490          file contains only one haplotype.  If the last population is not an
491          outgroup then create the msOUT object using 'no_og' as input flag.
492          Also, return 0, if the run has only one population.
493
494=cut
495
496sub outgroup {
497    my $self = shift;
498    my @pops = $self->get_Pops;
499    if ( $pops[$#pops] == 1 && !defined $self->{NO_OUTGROUP} && @pops > 1 ) {
500        return 1;
501    }
502    else { return 0; }
503}
504
505=head3 get_next_haps_pop_num
506
507Title   : get_next_haps_pop_num
508Usage   : ($next_haps_pop_num, $num_haps_left_in_pop) = $stream->get_next_haps_pop_num()
509Function: First return value is the population number (starting with 1) the
510          next hap will come from. The second return value is the number of haps
511          left to read in the population from which the next hap will come.
512Returns : (scalar > 0, scalar > 0)
513Args    : NONE
514
515=cut
516
517sub get_next_haps_pop_num {
518    my $self          = shift;
519    my $last_read_hap = $self->get_last_read_hap_num;
520    my @pops          = $self->get_Pops;
521
522    foreach my $pop_num ( 0 .. $#pops ) {
523        if ( $last_read_hap < $pops[$pop_num] ) {
524            return ( $pop_num + 1, $pops[$pop_num] - $last_read_hap );
525        }
526        else { $last_read_hap -= $pops[$pop_num] }
527    }
528
529    # In this case we're at the beginning of the next run
530    return ( 1, $pops[0] );
531}
532
533=head3 get_next_seq
534
535Title   : get_next_seq
536Usage   : $seq = $stream->get_next_seq()
537Function: reads and returns the next sequence (haplotype) in the stream
538Returns : Bio::Seq object or void if end of file
539Args    : NONE
540Note	: This function is included only to conform to convention.  The
541          returned Bio::Seq object holds a halpotype in coded form. Use the hash
542          returned by get_base_conversion_table() to convert 'A', 'T', 'C', 'G'
543          back into 1,2,4 and 5. Use get_next_hap() to retrieve the halptoype as
544          a string of 1,2,4 and 5s instead.
545
546=cut
547
548sub get_next_seq {
549    my $self      = shift;
550    my $seqstring = $self->get_next_hap;
551
552    return unless ($seqstring);
553
554    # Used to create unique ID;
555    my $run = $self->get_last_haps_run_num;
556
557    # Converting numbers to letters so that the haplotypes can be stored as a
558    # seq object
559    my $rh_base_conversion_table = $self->get_base_conversion_table;
560    foreach my $base ( keys %{$rh_base_conversion_table} ) {
561        $seqstring =~ s/($rh_base_conversion_table->{$base})/$base/g;
562    }
563
564    # Fill in non-variable positions
565    my $segsites = $self->get_current_run_segsites;
566    my $n_sites  = $self->get_n_sites;
567    if ( defined($n_sites) ) {
568
569        # make sure that n_sites is at least as large
570        # as segsites for each run. Throw an exception otherwise.
571        $self->throw( "n_sites:\t$n_sites"
572              . "\nsegsites:\t$segsites"
573              . "\nrun:\t$run"
574              . "\nn_sites needs to be at least the number of segsites of every run"
575        ) unless $segsites <= $n_sites;
576
577        my $seq_len = 0;
578        my @seq;
579        my @pos = $self->get_Positions;
580        for ( my $i = 0 ; $i <= $#pos ; $i++ ) {
581            $pos[$i] *= $n_sites;
582            my $rpt = $pos[$i] - 1 - $seq_len;
583            $rpt = $rpt >= 1 ? $rpt : 0;
584            push( @seq, "A" x ( $rpt ) );
585            $seq_len += length( $seq[-1] );
586            push( @seq, substr( $seqstring, $i, 1 ) );
587            $seq_len += length( $seq[-1] );
588        }
589        push( @seq, "A" x ( $n_sites - $seq_len ) );
590        $seqstring = join( "", @seq );
591    }
592
593    my $last_read_hap = $self->get_last_read_hap_num;
594    my $id            = 'Hap_' . $last_read_hap . '_Run_' . $run;
595    my $description =
596        "Segsites $segsites;"
597      . " Positions "
598      . ( defined $n_sites ? $n_sites : $segsites ) . ";"
599      . " Haplotype $last_read_hap;"
600      . " Run $run;";
601    my $seq = $self->sequence_factory->create(
602        -seq      => $seqstring,
603        -id       => $id,
604        -desc     => $description,
605        -alphabet => q(dna),
606        -direct   => 1,
607    );
608
609    return $seq
610
611}
612
613=head3 next_seq
614
615Title   : next_seq
616Usage   : $seq = $stream->next_seq()
617Function: Alias to get_next_seq()
618Returns : Bio::Seq object or void if end of file
619Args    : NONE
620Note    : This function is only included for convention.  It calls get_next_seq().
621          See get_next_seq() for details.
622
623=cut
624
625sub next_seq {
626    my $self = shift;
627    return $self->get_next_seq();
628}
629
630=head3 get_next_hap
631
632Title   : get_next_hap
633Usage   : $hap = $stream->next_hap()
634Function: reads and returns the next sequence (haplotype) in the stream.
635          Returns undef if all sequences in stream have been read.
636Returns : Haplotype string (e.g. '110110000101101045454000101'
637Args    : NONE
638Note	: Use get_next_seq() if you want the halpotype returned as a
639          Bio::Seq object.
640
641=cut
642
643sub get_next_hap {
644    my $self = shift;
645
646    # Let's figure out how many haps to read from the input file so that
647    # we get back to the beginning of the next run.
648
649    my $end_run = 0;
650    if ( $self->{TOT_RUN_HAPS} == $self->{LAST_READ_HAP_NUM} + 1 ) {
651        $end_run = 1;
652    }
653
654    # Setting last_haps_run_num
655    $self->{LAST_HAPS_RUN_NUM} = $self->get_next_run_num;
656
657    my $last_read_hap = $self->get_last_read_hap_num;
658    my ($seqstring) =
659      $self->_get_next_clean_hap( $self->{_filehandle}, 1, $end_run );
660    if ( !defined $seqstring && $last_read_hap < $self->get_tot_haps ) {
661        $self->throw(
662"msout file has only $last_read_hap hap(s), which is less than indicated in msinfo line ( "
663              . $self->get_tot_haps
664              . " )" );
665    }
666
667    return $seqstring;
668}
669
670=head3 get_next_pop
671
672Title   : get_next_pop
673Usage   : @seqs = $stream->next_pop()
674Function: reads and returns all the remaining sequences (haplotypes) in the
675          population of the next sequence.  Returns an empty list if no more
676          haps remain to be read in the stream
677Returns : array of Bio::Seq objects
678Args    : NONE
679
680=cut
681
682sub get_next_pop {
683    my $self = shift;
684
685    # Let's figure out how many haps to read from the input file so that
686    # we get back to the beginning of the next run.
687
688    my @pops = $self->get_Pops;
689    my @seqs;    # holds Bio::Seq objects to return
690
691    # Determine number of the pop that the next hap will be taken from
692    my ( $next_haps_pop_num, $haps_to_pull ) = $self->get_next_haps_pop_num;
693
694    # If $haps_to_pull == 0, then we need to pull the whole population
695    if ( $haps_to_pull == 0 ) {
696        $haps_to_pull = $pops[ $next_haps_pop_num - 1 ];
697    }
698
699    for ( 1 .. $haps_to_pull ) {
700        my $seq = $self->get_next_seq;
701        next unless defined $seq;
702
703        # Add Population number information to description
704        $seq->display_id(" Population number $next_haps_pop_num;");
705        push @seqs, $seq;
706    }
707
708    return @seqs;
709}
710
711=head3 next_run
712
713Title   : next_run
714Usage   : @seqs = $stream->next_run()
715Function: reads and returns all the remaining sequences (haplotypes) in the ms
716          run of the next sequence.  Returns an empty list if all haps have been
717          read from the stream.
718Returns : array of Bio::Seq objects
719Args    : NONE
720
721=cut
722
723sub get_next_run {
724    my $self = shift;
725
726    # Let's figure out how many haps to read from the input file so that
727    # we get back to the beginning of the next run.
728
729    my ( $next_haps_pop_num, $haps_to_pull ) = $self->get_next_haps_pop_num;
730    my @seqs;
731
732    my @pops = $self->get_Pops;
733
734    foreach ( $next_haps_pop_num .. $#pops ) {
735        $haps_to_pull += $pops[$_];
736    }
737
738    # Read those haps from the input file
739    # Next hap read will be the first hap of the first pop of the next run.
740
741    for ( 1 .. $haps_to_pull ) {
742        my $seq = $self->get_next_seq;
743        next unless defined $seq;
744
745        push @seqs, $seq;
746    }
747
748    return @seqs;
749}
750
751=head2 Methods to Retrieve Constants
752
753=head3 base_conversion_table
754
755Title   : get_base_conversion_table
756Usage   : $table_hash_ref = $stream->get_base_conversion_table()
757Function: returns a reference to a hash.  The keys of the hash are the letters '
758          A','T','G','C'. The values associated with each key are the value that
759          each letter in the sequence of a seq object returned by a
760          Bio::SeqIO::msout stream should be translated to.
761Returns : reference to a hash
762Args    : NONE
763Synopsis:
764
765	# retrieve the Bio::Seq object's sequence
766	my $haplotype = $seq->seq;
767
768	# need to convert all letters to their corresponding numbers.
769	foreach my $base (keys %{$rh_base_conversion_table}){
770		$haplotype =~ s/($base)/$rh_base_conversion_table->{$base}/g;
771	}
772
773	# $haplotype is now an ms style haplotype. (e.g. '100101101455')
774
775=cut
776
777sub get_base_conversion_table {
778    my $self = shift;
779    return $self->{BASE_CONVERSION_TABLE_HASH_REF};
780}
781
782##############################################################################
783## subs for internal use only
784##############################################################################
785
786sub _get_next_clean_hap {
787
788    #By Warren Kretzschmar
789
790    # return the next non-empty line from file handle (chomped line)
791    # skipps to the next run if '//' is encountered
792    my ( $self, $fh, $times, $end_run ) = @_;
793    my @data;
794
795    unless ( ref($fh) eq q(GLOB) ) { return; }
796
797    unless ( defined $times && $times > 0 ) {
798        $times = 1;
799    }
800
801    if ( defined $self->{BUFFER_HAP} ) {
802        push @data, $self->{BUFFER_HAP};
803        $self->{BUFFER_HAP} = undef;
804        $self->{LAST_READ_HAP_NUM}++;
805        $times--;
806    }
807
808    while ( 1 <= $times-- ) {
809
810        # Find next clean line
811        my $data = <$fh>;
812        last if !defined($data);
813        chomp $data;
814        while ( $data !~ /./ ) {
815            $data = <$fh>;
816            chomp $data;
817        }
818
819        # If the next run is encountered here, then we have a programming
820        # or format error
821        if ( $data eq '//' ) { $self->throw("'//' found when not expected\n") }
822
823        $self->{LAST_READ_HAP_NUM}++;
824        push @data, $data;
825    }
826
827    if ($end_run) {
828        $self->_load_run_info($fh);
829    }
830
831    return (@data);
832}
833
834sub _load_run_info {
835
836    my ( $self, $fh ) = @_;
837
838    my $data = <$fh>;
839
840    # getting rid of excess newlines
841    while ( defined($data) && $data !~ /./ ) {
842        $data = <$fh>;
843    }
844
845    # In this case we are at EOF
846    if ( !defined($data) ) { $self->{NEXT_RUN_NUM} = undef; return; }
847
848    while ( $data !~ /./ ) {
849        $data = <$fh>;
850        chomp $data;
851    }
852
853    chomp $data;
854
855    # If the next run is encountered, then skip to the next hap and save it in
856    # the buffer.
857    if ( $data eq '//' ) {
858        $self->{NEXT_RUN_NUM}++;
859        $self->{LAST_READ_HAP_NUM} = 0;
860        for ( 1 .. 3 ) {
861            $data = <$fh>;
862            while ( $data !~ /./ ) {
863                $data = <$fh>;
864                chomp $data;
865            }
866            chomp $data;
867
868            if ( $_ eq '1' ) {
869                my @sites = split( /\s+/, $data );
870                $self->{LAST_READ_SEGSITES} = $sites[1];
871            }
872            elsif ( $_ eq '2' ) {
873                my @positions = split( /\s+/, $data );
874                shift @positions;
875                $self->{LAST_READ_POSITIONS} = \@positions;
876            }
877            else {
878                if ( !defined($data) ) {
879                    $self->throw("run $self->{NEXT_RUN_NUM} has no haps./n");
880                }
881                $self->{BUFFER_HAP} = $data;
882            }
883        }
884    }
885    else {
886        $self->throw(
887"'//' not encountered when expected. There are more haplos in one of the msOUT runs than advertised in the msinfo line."
888        );
889    }
890
891}
8921;
893