1package Statistics::Descriptive::Full;
2$Statistics::Descriptive::Full::VERSION = '3.0800';
3use strict;
4use warnings;
5
6use Carp;
7use POSIX ();
8use Statistics::Descriptive::Smoother;
9
10use vars qw($a $b %fields);
11
12use parent qw(Statistics::Descriptive::Sparse);
13
14use List::MoreUtils ();
15use List::Util      ();
16
17## no critic (ProhibitExplicitReturnUndef)
##Create a list of fields not to remove when data is updated.
##Only the keys matter: _clear_fields() hands a reference to this hash to
##the _reserved accessor, and _delete_all_cached_keys() spares every key
##found in it when it purges cached results.
%fields = (
    _permitted => undef,    ##Place holder for the inherited key hash
    data       => undef,    ##Our data
    samples    => undef,    ##Number of samples for each value of the data set
    presorted  => undef,    ##Flag to indicate the data is already sorted
    _reserved  => undef,    ##Place holder for this lookup hash
);

##Accessors for the stored data/samples and for the cached results of the
##statistics methods below.
##NOTE(review): the generator is not defined in this file.  The methods
##below call these as _data(), _median(), _mode(), ... so it presumably
##prefixes each name with an underscore - confirm against the parent class.
__PACKAGE__->_make_private_accessors(
    [
        qw(data samples frequency geometric_mean harmonic_mean
            least_squares_fit median mode
            skewness kurtosis median_absolute_deviation
            )
    ]
);
##Accessors called in this file under exactly these names: presorted(),
##_reserved() and _trimmed_mean_cache().
__PACKAGE__->_make_accessors( [qw(presorted _reserved _trimmed_mean_cache)] );
36
##Reset the storage owned by this class: empty data and samples arrays,
##the reserved-field lookup, a cleared presorted flag and an empty
##trimmed-mean cache.
sub _clear_fields
{
    my ($self) = @_;

    # Each container gets its own fresh, empty array to be filled later.
    for my $setter (qw(_data _samples))
    {
        $self->$setter( [] );
    }

    $self->_reserved( \%fields );
    $self->presorted(0);
    $self->_trimmed_mean_cache( +{} );

    return;
}
50
##Constructor.  Overrides the base method so the object also gets the
##storage fields this class adds (see _clear_fields).  Works both as
##Class->new and as $object->new.
sub new
{
    my ($proto) = @_;

    # An existing object stands in for its own class.
    my $class = ref($proto) || $proto;

    # Let the parent build the object, then re-bless it into $class.
    my $object = $class->SUPER::new();
    bless $object, $class;

    $object->_clear_fields();

    return $object;
}
64
##True when $field is one of the keys of the reserved-field lookup hash.
sub _is_reserved
{
    my ( $self, $field ) = @_;

    return exists $self->_reserved->{$field};
}
72
##Remove every cached result from the object.  Keys listed in the
##reserved and permitted lookups are kept; the trimmed-mean cache is kept
##as a key but replaced with an empty hash.
sub _delete_all_cached_keys
{
    my ($self) = @_;

    # Start with every key currently stored in the object ...
    my %stale = map { $_ => 1 } keys %{$self};

    # ... and spare the ones that are not out-of-date cached results.
    delete @stale{ keys %{ $self->_reserved } };
    delete @stale{ keys %{ $self->_permitted } };
    delete $stale{_trimmed_mean_cache};

    delete @{$self}{ keys %stale };

    #  just reset this one
    $self->{_trimmed_mean_cache} = {};

    return;
}
92
##Empty the object so it can be reused.  More efficient than destroying
##the object and calling new.  Does nothing when the object holds no data.
sub clear
{
    my ($self) = @_;

    return if !$self->count();

    $self->_delete_all_cached_keys();
    $self->SUPER::clear();

    return $self->_clear_fields();
}
109
##Append values to the data set and refresh the running statistics
##(min, max, sample range, sum, sum of squares, mean, count).
##Accepts either a flat list of values or a single array reference.
##Always returns 1; a call without any values changes nothing.
sub add_data
{
    my $self = shift;

    # Values arrive either as one array reference or as a flat list.
    my $incoming = ( ref( $_[0] ) eq 'ARRAY' ) ? $_[0] : \@_;

    ##If we were given no data, we do nothing.
    return 1 unless @{$incoming};

    # Non-zero when we are appending to data that is already there.
    my $existing = $self->count;

    # Seed the running figures from the first new value, or from the
    # current state when appending to an existing data set.
    my ( $min, $max, $sum, $sumsq ) =
        ( $incoming->[0], $incoming->[0], 0, 0 );
    if ($existing)
    {
        $min   = $self->min();
        $max   = $self->max();
        $sum   = $self->sum();
        $sumsq = $self->sumsq();
    }

    $sum   += List::Util::sum( @{$incoming} );
    $sumsq += List::Util::sum( map { $_**2 } @{$incoming} );
    $max = List::Util::max( $max, @{$incoming} );
    $min = List::Util::min( $min, @{$incoming} );

    my $count = $existing + scalar @{$incoming};

    $self->min($min);
    $self->max($max);
    $self->sample_range( $max - $min );
    $self->sum($sum);
    $self->sumsq($sumsq);
    $self->mean( $sum / $count );
    $self->count($count);

    ##Variance isn't used commonly enough to recompute on every add,
    ##so just clear its cache.
    $self->_variance(undef);

    push @{ $self->_data() }, @{$incoming};

    #  A newly populated object is not purged: profiling showed the purge
    #  is slow when creating and populating many stats objects.
    if ($existing)
    {
        ##Clear the presorted flag
        $self->presorted(0);
        $self->_delete_all_cached_keys();
    }

    return 1;
}
185
##Add data together with the number of samples each value was built from.
##$aref_values is an array reference of hash references; the hash keys
##are the data values and the hash values are the sample counts.
##Always returns 1; an empty array changes nothing.
sub add_data_with_samples
{
    my $self        = shift;
    my $aref_values = shift;

    return 1 unless @{$aref_values};

    # Split every { datum => samples } entry into the two parallel lists.
    my ( @data, @samples );
    for my $entry ( @{$aref_values} )
    {
        push @data,    keys %{$entry};
        push @samples, values %{$entry};
    }

    $self->add_data( \@data );
    push @{ $self->_samples() }, @samples;

    return 1;
}
200
##Return the stored values as a list (a copy of the data array).
sub get_data
{
    my ($self) = @_;

    my $stored = $self->_data();

    return @{$stored};
}
206
##Return a copy of the data with the most extreme value removed, when the
##user supplied filter (see set_outlier_filter) flags that value as an
##outlier.  Warns and returns nothing when there are fewer than
##$Statistics::Descriptive::Min_samples_number values or when no filter
##has been set.
sub get_data_without_outliers
{
    my ($self) = @_;

    if ( $self->count() < $Statistics::Descriptive::Min_samples_number )
    {
        carp(
"Need at least $Statistics::Descriptive::Min_samples_number samples\n"
        );
        return;
    }

    my $filter = $self->{_outlier_filter};
    if ( !defined $filter )
    {
        carp("Outliers filter not defined\n");
        return;
    }

    # Only the value furthest from the mean is put to the filter.
    my $suspect_index = $self->_outlier_candidate_index;
    my $suspect       = ( $self->_data() )->[$suspect_index];

    # Not an outlier: hand back the data untouched.
    return $self->get_data if !$filter->( $self, $suspect );

    # Removing the outlier from the dataset
    my @kept = $self->get_data;
    splice @kept, $suspect_index, 1;

    return @kept;
}
239
##Install the subroutine used by get_data_without_outliers to decide
##whether the candidate value is an outlier.  Returns 1 on success; warns
##and returns nothing when $code_ref is not a code reference.
sub set_outlier_filter
{
    my $self     = shift;
    my $code_ref = shift;

    my $is_code = $code_ref && ref($code_ref) eq "CODE";
    unless ($is_code)
    {
        carp("Need to pass a code reference");
        return;
    }

    $self->{_outlier_filter} = $code_ref;

    return 1;
}
253
##Return the index of the value lying furthest from the mean (absolute
##difference).  On ties the earliest such index is returned.
sub _outlier_candidate_index
{
    my ($self) = @_;

    my $mean   = $self->mean();
    my $values = $self->_data();
    my $last   = $self->count() - 1;

    my $candidate = 0;
    my $furthest  = abs( $values->[0] - $mean );

    for my $idx ( 1 .. $last )
    {
        my $distance = abs( $values->[$idx] - $mean );

        # Only a strictly larger distance displaces the candidate.
        next unless $furthest < $distance;

        $candidate = $idx;
        $furthest  = $distance;
    }

    return $candidate;
}
272
##Create the smoother object used by get_smoothed_data.  The data and
##samples arrays of this object are stored in the caller's $args hash
##(keys 'data' and 'samples') before it is handed to
##Statistics::Descriptive::Smoother->instantiate.  Returns the smoother.
sub set_smoother
{
    my $self = shift;
    my $args = shift;

    # The smoother works on this object's own data and sample counts.
    $args->{data}    = $self->_data();
    $args->{samples} = $self->_samples();

    return $self->{_smoother} =
        Statistics::Descriptive::Smoother->instantiate($args);
}
282
##Return whatever the smoother's get_smoothed_data method yields.  Warns
##and returns nothing when no smoother has been set (see set_smoother).
sub get_smoothed_data
{
    my ($self) = @_;

    my $smoother = $self->{_smoother};
    unless ( defined $smoother )
    {
        carp("Smoother object not defined\n");
        return;
    }

    return $smoother->get_smoothed_data();
}
294
##Return the index of the maximum value, or undef when there is no data.
##For presorted data this is the last index; otherwise the first position
##holding the maximum (-1 if no element compares equal to it).  The
##result is also stored in $self->{maxdex}.
sub maxdex
{
    my ($self) = @_;

    return undef if !$self->count;

    my $maxdex;

    if ( $self->presorted )
    {
        # Sorted ascending, so the maximum sits at the end.
        $maxdex = $self->count - 1;
    }
    else
    {
        my $max    = $self->max;
        my @values = $self->get_data;
        my $hit = List::Util::first { $values[$_] == $max } 0 .. $#values;
        $maxdex = defined $hit ? $hit : -1;
    }

    $self->{maxdex} = $maxdex;

    return $maxdex;
}
316
##Return the index of the minimum value, or undef when there is no data.
##For presorted data this is index 0; otherwise the first position
##holding the minimum (-1 if no element compares equal to it).  The
##result is also stored in $self->{mindex}.
sub mindex
{
    my ($self) = @_;

    return undef if !$self->count;

    my $mindex;

    if ( $self->presorted )
    {
        # Sorted ascending, so the minimum sits at the front.
        $mindex = 0;
    }
    else
    {
        my $min    = $self->min;
        my @values = $self->get_data;
        my $hit = List::Util::first { $values[$_] == $min } 0 .. $#values;
        $mindex = defined $hit ? $hit : -1;
    }

    $self->{mindex} = $mindex;

    return $mindex;
}
341
##Sort the stored data numerically, smallest value first, and set the
##presorted flag.  Does nothing when the flag is already set.
##Always returns 1.
sub sort_data
{
    my ($self) = @_;

    # Already in order: nothing to do.
    return 1 if $self->presorted();

    ##Sort the data in ascending order
    my @ordered = sort { $a <=> $b } @{ $self->_data() };
    $self->_data( \@ordered );
    $self->presorted(1);

    # mindex() and maxdex() look the positions up themselves, so no index
    # bookkeeping is needed here.
    return 1;
}
358
##Sort the data and return the value at the requested percentile
##(0 when omitted).  In list context the index of that value is returned
##as well.  Returns nothing when there is no data or when the percentile
##is smaller than the "percentile bin size" 100 / count (see the
##description of RFC 2330 in the POD below).
##The result is a single value, so it is not cached.
sub percentile
{
    my $self       = shift;
    my $percentile = shift || 0;

    my $count = $self->count();

    #  bare returns allow for both scalar and list context
    return if !$count;
    return if $percentile < 100 / $count;

    $self->sort_data();

    my $index = POSIX::ceil( $count * $percentile / 100 ) - 1;
    my $value = $self->_data->[$index];

    return wantarray ? ( $value, $index ) : $value;
}
384
##Compute the median of the stored data, which must already be sorted:
##the middle value for an odd count, the mean of the two middle values
##for an even count.
sub _calc_new_median
{
    my ($self) = @_;

    my $count  = $self->count();
    my $sorted = $self->_data;

    # For an odd count this is the middle index, for an even count the
    # upper one of the two middle indices.
    my $middle = int( $count / 2 );

    return $sorted->[$middle] if $count % 2;

    return ( $sorted->[$middle] + $sorted->[ $middle - 1 ] ) / 2;
}
405
##Return the median of the data, or undef when there is no data.  The
##data is sorted on the first call and the result is cached.
sub median
{
    my ($self) = @_;

    return undef if !$self->count;

    ##Cached?
    my $cached = $self->_median();
    return $cached if defined $cached;

    $self->sort_data();
    $self->_median( $self->_calc_new_median() );

    return $self->_median();
}
420
##Return the requested quartile of the data, using linear interpolation
##between the two nearest order statistics.
##
##  $QuantileNumber - 0 (minimum), 1, 2 (median), 3 or 4 (maximum)
##
##Returns the quartile value.  Warns and returns nothing when
##$QuantileNumber is not one of 0, 1, 2, 3, 4; returns undef when there
##is no data.  The data is sorted as a side effect.
sub quantile
{
    my ( $self, $QuantileNumber ) = @_;

    # The whole argument has to be a single digit 0-4.  An alternation
    # such as /^0|1|2|3|4$/ is anchored only on its first and last
    # branch, so it would also accept 10, 41 or "x2y" and then index
    # outside of the data.
    unless ( defined $QuantileNumber and $QuantileNumber =~ m/\A[0-4]\z/ )
    {
        carp("Bad quartile type, must be 0, 1, 2, 3 or 4\n");
        return;
    }

    #  check data count after the args are checked - should help debugging
    return undef if !$self->count;

    $self->sort_data();

    return $self->_data->[0] if ( $QuantileNumber == 0 );

    my $count = $self->count();

    return $self->_data->[ $count - 1 ] if ( $QuantileNumber == 4 );

    # 1-based, possibly fractional position of the quartile in the
    # sorted data; split into its integer part (K) and fraction (F).
    my $K_quantile = ( ( $QuantileNumber / 4 ) * ( $count - 1 ) + 1 );
    my $F_quantile = $K_quantile - POSIX::floor($K_quantile);
    $K_quantile = POSIX::floor($K_quantile);

    # interpolation
    my $aK_quantile = $self->_data->[ $K_quantile - 1 ];

    # Exactly on a data point: nothing to interpolate.
    return $aK_quantile if ( $F_quantile == 0 );
    my $aKPlus_quantile = $self->_data->[$K_quantile];

    # Compute the quantile
    my $quantile =
        $aK_quantile + ( $F_quantile * ( $aKPlus_quantile - $aK_quantile ) );

    return $quantile;
}
457
##Compute the mean of the sorted data after dropping the fraction $lower
##of the values from the low end and the fraction $upper from the high
##end (both counts are truncated to whole values).  The mean is built up
##incrementally, one value at a time.
sub _real_calc_trimmed_mean
{
    my ( $self, $lower, $upper ) = @_;

    my $first     = int( $self->count() * $lower );
    my $skip_high = int( $self->count() * $upper );

    $self->sort_data();

    my $last   = $self->count() - $skip_high - 1;
    my $sorted = $self->_data();

    my ( $mean, $seen ) = ( 0, 0 );
    for my $idx ( $first .. $last )
    {
        ++$seen;

        # Running mean: move towards the new value by 1/$seen of the gap.
        $mean += ( $sorted->[$idx] - $mean ) / $seen;
    }

    return $mean;
}
481
##Return the trimmed mean for the given lower and upper trim fractions;
##with a single argument the same fraction is used at both ends.
##Returns undef when there is no data.  Results are cached per
##"lower:upper" pair.
sub trimmed_mean
{
    my $self = shift;

    #upper bound is in arg list or is same as lower
    my $lower = $_[0];
    my $upper = ( @_ == 1 ) ? $_[0] : $_[1];

    #  check data count after the args
    return undef if !$self->count;

    ##Cache
    my $cache_key = join ':', $lower, $upper;
    my $cache     = $self->_trimmed_mean_cache();

    $cache->{$cache_key} = $self->_real_calc_trimmed_mean( $lower, $upper )
        unless exists $cache->{$cache_key};

    return $cache->{$cache_key};
}
510
##True when the absolute value of $val does not exceed
##$Statistics::Descriptive::Tolerance, i.e. when $val is too close to
##zero to be used as a divisor.
sub _test_for_too_small_val
{
    my ( $self, $val ) = @_;

    my $magnitude = abs($val);

    return ( $magnitude <= $Statistics::Descriptive::Tolerance );
}
518
##Compute the harmonic mean: count divided by the sum of reciprocals.
##Returns nothing when a value, or the sum of the reciprocals, is too
##close to zero (see _test_for_too_small_val) to divide by.
sub _calc_harmonic_mean
{
    my ($self) = @_;

    my $reciprocal_sum = 0;

    for my $value ( @{ $self->_data() } )
    {
        ##Guarantee that there are no divide by zeros
        return if $self->_test_for_too_small_val($value);

        $reciprocal_sum += 1 / $value;
    }

    return if $self->_test_for_too_small_val($reciprocal_sum);

    return $self->count() / $reciprocal_sum;
}
543
##Return the harmonic mean of the data, computing and caching it on
##first use.  Undef when it cannot be computed (see _calc_harmonic_mean).
sub harmonic_mean
{
    my ($self) = @_;

    my $cached = $self->_harmonic_mean();
    return $cached if defined $cached;

    # scalar() turns the "nothing" of a failed calculation into undef.
    $self->_harmonic_mean( scalar( $self->_calc_harmonic_mean() ) );

    return $self->_harmonic_mean();
}
555
##Return the mode of the data: the value that occurs most often.  When
##several values share the highest count, the one that reaches that
##count first in data order wins.  Returns undef when every value occurs
##only once, or when there is no data.  The result is cached.
sub mode
{
    my $self = shift;

    # Do not cache a "no mode" answer for an empty object.  add_data()
    # skips the cache purge when it populates an empty object, so such an
    # entry would survive the first add_data() and hide the real mode.
    return undef if !$self->count;

    if ( !defined( $self->_mode() ) )
    {
        my $mode       = 0;
        my $occurances = 0;

        my %count;

        foreach my $item ( @{ $self->_data() } )
        {
            my $count = ++$count{$item};
            if ( $count > $occurances )
            {
                $mode       = $item;
                $occurances = $count;
            }
        }

        # The cache entry is always a hash ref, so that "no mode" is a
        # defined value too and is not recomputed on every call.
        $self->_mode(
              ( $occurances > 1 )
            ? { exists => 1, mode => $mode }
            : { exists => 0, }
        );
    }

    my $m = $self->_mode;

    return $m->{'exists'} ? $m->{mode} : undef;
}
588
##Return the geometric mean of the data: the product of every value
##raised to the power 1/count.  Returns undef when there is no data or
##when a value is negative.  The result is cached.
sub geometric_mean
{
    my ($self) = @_;

    return undef if !$self->count;

    my $cached = $self->_geometric_mean();
    return $cached if defined $cached;

    my $product = 1;
    my $root    = 1 / $self->count();

    for my $value ( @{ $self->_data() } )
    {
        return undef if $value < 0;

        # Take the root of each factor rather than of the full product.
        $product *= $value**$root;
    }

    $self->_geometric_mean($product);

    return $self->_geometric_mean();
}
614
##Return the skewness of the data: the sum of the cubed standardised
##deviations from the mean, scaled by n / ((n - 1)(n - 2)).  Undef when
##there are fewer than three values or the standard deviation is zero or
##undefined.  A defined result is cached.
sub skewness
{
    my ($self) = @_;

    my $cached = $self->_skewness();
    return $cached if defined $cached;

    my $n  = $self->count();
    my $sd = $self->standard_deviation();

    my $skew;

    #  skip if insufficient records
    if ( $sd && $n > 2 )
    {
        my $mean = $self->mean();

        my $sum_of_cubes;
        $sum_of_cubes += ( ( $_ - $mean ) / $sd )**3 for $self->get_data;

        my $correction = $n / ( ( $n - 1 ) * ( $n - 2 ) );

        $skew = $correction * $sum_of_cubes;
    }

    $self->_skewness($skew);

    return $self->_skewness();
}
648
##Return the kurtosis of the data, computed from the sum of the
##standardised deviations from the mean raised to the fourth power, with
##the two sample-size corrections below.  Undef when there are fewer than
##four values or the standard deviation is zero or undefined.  A defined
##result is cached.
sub kurtosis
{
    my ($self) = @_;

    my $cached = $self->_kurtosis();
    return $cached if defined $cached;

    my $kurt;

    my $n  = $self->count();
    my $sd = $self->standard_deviation();

    #  skip if insufficient records
    if ( $sd && $n > 3 )
    {
        my $mean = $self->mean();

        my $sum_of_fourths;
        $sum_of_fourths += ( ( $_ - $mean ) / $sd )**4 for $self->get_data;

        my $correction1 =
            ( $n * ( $n + 1 ) ) / ( ( $n - 1 ) * ( $n - 2 ) * ( $n - 3 ) );
        my $correction2 =
            ( 3 * ( $n - 1 )**2 ) / ( ( $n - 2 ) * ( $n - 3 ) );

        $kurt = ( $correction1 * $sum_of_fourths ) - $correction2;
    }

    $self->_kurtosis($kurt);

    return $self->_kurtosis();
}
684
##Build a frequency distribution and return it as a hash reference that
##maps each bin's upper limit to the number of values falling into it.
##
##The argument is either an array reference of non-decreasing bin limits
##or a number of equally wide partitions (>= 1) spanning min .. max.
##A value is counted in the first bin whose limit is >= the value; values
##above the last limit are not counted.  Without arguments the
##distribution cached by an earlier call is returned, if there is one.
##
##Returns undef when there are fewer than two values, when the bin array
##is empty or decreasing (the latter also warns), or when the partition
##count is below 1.
sub frequency_distribution_ref
{
    my $self = shift;

    # Must have at least two elements
    return undef if $self->count() < 2;

    # No arguments: reuse the distribution of an earlier call.
    if ( !@_ )
    {
        my $cached = $self->_frequency();
        return $cached if defined $cached;
    }

    my $partitions = shift;
    my ( @limits, %bins );

    if ( ref($partitions) eq 'ARRAY' )
    {
        @limits = @{$partitions};
        return undef unless @limits;    ##Empty array

        ##Check for monotonicity
        for my $pos ( 1 .. $#limits )
        {
            next unless $limits[ $pos - 1 ] > $limits[$pos];

            carp
"Non monotonic array cannot be used as frequency bins!\n";
            return undef;
        }

        @bins{@limits} = (0) x @limits;
    }
    else
    {
        return undef unless $partitions >= 1;

        # Inner limits are spaced evenly; the last limit is the maximum
        # itself, and only that bin is pre-set to zero.
        my $width = $self->sample_range() / $partitions;
        @limits =
            map { $self->min() + $_ * $width } 1 .. ( $partitions - 1 );

        my $top = $self->max();
        $bins{$top} = 0;
        push @limits, $top;
    }

    for my $value ( @{ $self->_data() } )
    {
        for my $limit (@limits)
        {
            next unless $value <= $limit;

            # Count the value in the first bin that can hold it.
            $bins{$limit}++;
            last;
        }
    }

    return $self->_frequency( \%bins );
}
754
##Same as frequency_distribution_ref, but returns the distribution as a
##hash (a flat key/value list) instead of a reference.  Returns undef
##when no distribution could be built.
sub frequency_distribution
{
    my $self = shift;

    my $distribution = $self->frequency_distribution_ref(@_);

    return undef unless defined $distribution;

    return %{$distribution};
}
770
##Fit the line y = q + m * x through the data by least squares.
##
##The stored data are the y values.  The x values are the arguments, or
##1 .. count when the second argument is not defined.
##
##Returns the list ( $q, $m, $r, $rms ): intercept, slope, correlation
##coefficient and root mean square error.  Returns an empty list when
##there are fewer than two values, when the number of x values differs
##from the number of data values (this also warns), or when a
##denominator does not exceed $Statistics::Descriptive::Tolerance.
sub least_squares_fit
{
    my $self = shift;

    return () if $self->count() < 2;

    my $count  = $self->count();
    my $sigmay = $self->sum;

    ##Domain: the caller's x values, or simply 1 .. count
    my @x;
    if ( defined $_[1] )
    {
        @x = @_;
        if ( $self->count() != scalar @x )
        {
            carp "Range and domain are of unequal length.";
            return ();
        }
    }
    else
    {
        @x = 1 .. $self->count();
    }

    ##Sigma sums
    my ( $sigmaxy, $sigmax, $sigmaxx, $sigmayy ) = ( 0, 0, 0, 0 );
    for my $pos ( 0 .. $#x )
    {
        my $x_val = $x[$pos];
        my $y     = $self->_data->[$pos];

        $sigmayy += $y * $y;
        $sigmaxx += $x_val * $x_val;
        $sigmaxy += $x_val * $y;
        $sigmax  += $x_val;
    }

    # A (near) zero denominator means the x values do not vary.
    my $denom = $count * $sigmaxx - $sigmax * $sigmax;
    return ()
        unless abs($denom) > $Statistics::Descriptive::Tolerance;

    ##Slope and intercept
    my $m = ( $count * $sigmaxy - $sigmax * $sigmay ) / $denom;
    my $q = ( $sigmaxx * $sigmay - $sigmax * $sigmaxy ) / $denom;

    ##Correlation coefficient
    my $xvar = $sigmaxx - $sigmax * $sigmax / $count;
    my $yvar = $sigmayy - $sigmay * $sigmay / $count;

    $denom = sqrt( $xvar * $yvar );
    return () unless ( abs($denom) > $Statistics::Descriptive::Tolerance );
    my $r = ( $sigmaxy - $sigmax * $sigmay / $count ) / $denom;

    ##Root mean square of the residuals
    my $rms = 0.0;
    for my $pos ( 0 .. $#x )
    {
        ##Error = Real y - calculated y
        my $err = $self->_data->[$pos] - ( $m * $x[$pos] + $q );
        $rms += $err * $err;
    }
    $rms = sqrt( $rms / $count );

    $self->_least_squares_fit( [ $q, $m, $r, $rms ] );

    return @{ $self->_least_squares_fit() };
}
841
##Return the median absolute deviation: the median of the absolute
##differences between each value and the median of the data.  A defined
##result is cached.
sub median_absolute_deviation
{
    my ($self) = @_;

    my $cached = $self->_median_absolute_deviation();
    return $cached if defined $cached;

    my @values = $self->get_data;
    my $centre = $self->median;

    # A scratch object of the same class provides the second median.
    my $deviations = $self->new;
    $deviations->add_data( map { abs( $_ - $centre ) } @values );

    $self->_median_absolute_deviation( $deviations->median );

    return $self->_median_absolute_deviation();
}
855
##Return a multi-line text summary of the data: minimum, maximum, mean,
##median and the first and third quartiles, one per line, each printed
##in exponential notation with five decimals.
sub summary
{
    my ($self) = @_;

    my $FMT = '%.5e';

    my $template = "Min: $FMT\nMax: $FMT\nMean: $FMT\nMedian: $FMT\n"
        . "1st quantile: $FMT\n3rd quantile: $FMT\n";

    # Gathered in the order in which the template prints them.
    my @figures = (
        $self->min, $self->max, $self->mean, $self->median,
        $self->quantile(1), $self->quantile(3),
    );

    return sprintf( $template, @figures );
}
8691;
870
871__END__
872
873=pod
874
875=encoding UTF-8
876
877=head1 NAME
878
879Statistics::Descriptive - Module of basic descriptive statistical functions.
880
881=head1 VERSION
882
883version 3.0800
884
885=head1 SYNOPSIS
886
887    use Statistics::Descriptive;
888    my $stat = Statistics::Descriptive::Full->new();
889    $stat->add_data(1,2,3,4);
890    my $mean = $stat->mean();
891    my $var = $stat->variance();
892    my $tm = $stat->trimmed_mean(.25);
893    $Statistics::Descriptive::Tolerance = 1e-10;
894
895=head1 DESCRIPTION
896
897This module provides basic functions used in descriptive statistics.
898It has an object oriented design and supports two different types of
899data storage and calculation objects: sparse and full. With the sparse
900method, none of the data is stored and only a few statistical measures
901are available. Using the full method, the entire data set is retained
902and additional functions are available.
903
904Whenever a division by zero may occur, the denominator is checked to be
905greater than the value C<$Statistics::Descriptive::Tolerance>, which
906defaults to 0.0. You may want to change this value to some small
907positive value such as 1e-24 in order to obtain error messages in case
908of very small denominators.
909
910Many of the methods (both Sparse and Full) cache values so that subsequent
911calls with the same arguments are faster.
912
913=head1 METHODS
914
915=head2 Sparse Methods
916
917=over 5
918
919=item $stat = Statistics::Descriptive::Sparse->new();
920
921Create a new sparse statistics object.
922
923=item $stat->clear();
924
925Effectively the same as
926
927  my $class = ref($stat);
928  undef $stat;
929  $stat = new $class;
930
931except more efficient.
932
933=item $stat->add_data(1,2,3);
934
935Adds data to the statistics variable. The cached statistical values are
936updated automatically.
937
938=item $stat->count();
939
940Returns the number of data items.
941
942=item $stat->mean();
943
944Returns the mean of the data.
945
946=item $stat->sum();
947
948Returns the sum of the data.
949
950=item $stat->variance();
951
952Returns the variance of the data.  Division by n-1 is used.
953
954=item $stat->standard_deviation();
955
956Returns the standard deviation of the data. Division by n-1 is used.
957
958=item $stat->min();
959
960Returns the minimum value of the data set.
961
962=item $stat->mindex();
963
964Returns the index of the minimum value of the data set.
965
966=item $stat->max();
967
968Returns the maximum value of the data set.
969
970=item $stat->maxdex();
971
972Returns the index of the maximum value of the data set.
973
974=item $stat->sample_range();
975
976Returns the sample range (max - min) of the data set.
977
978=back
979
980=head2 Full Methods
981
982Similar to the Sparse Methods above, any Full Method that is called caches
983the current result so that it doesn't have to be recalculated.  In some
984cases, several values can be cached at the same time.
985
986=over 5
987
988=item $stat = Statistics::Descriptive::Full->new();
989
990Create a new statistics object that inherits from
991Statistics::Descriptive::Sparse so that it contains all the methods
992described above.
993
994=item $stat->add_data(1,2,4,5);
995
996Adds data to the statistics variable.  All of the sparse statistical
997values are updated and cached.  Cached values from Full methods are
998deleted since they are no longer valid.
999
I<Note:  Calling add_data with an empty array (or with no values at all)
does nothing: neither the Full method cached values nor the cached values
for the sparse methods are changed.>
1003
1004=item $stat->add_data_with_samples([{1 => 10}, {2 => 20}, {3 => 30},]);
1005
1006Add data to the statistics variable and set the number of samples each value
1007has been built with. The data is the key of each element of the input array
ref, while the value is the number of samples: [{data1 => samples1}, {data2 =>
samples2}, ...].
1010
1011B<NOTE:> The number of samples is only used by the smoothing function and is
1012ignored otherwise. It is not equivalent to repeat count. In order to repeat
1013a certain datum more than one time call add_data() like this:
1014
1015    my $value = 5;
1016    my $repeat_count = 10;
1017    $stat->add_data(
1018        [ ($value) x $repeat_count ]
1019    );
1020
1021=item $stat->get_data();
1022
1023Returns a copy of the data array.
1024
1025=item $stat->get_data_without_outliers();
1026
Returns a copy of the data array without outliers. The minimum number of
samples required to apply the outlier filtering is C<$Statistics::Descriptive::Min_samples_number>,
4 by default.
1030
A function to detect outliers needs to be defined (see C<set_outlier_filter>),
otherwise the function will return an undef value.
1033
1034The filtering will act only on the most extreme value of the data set
1035(i.e.: value with the highest absolute standard deviation from the mean).
1036
If more than one outlier needs to be removed, the filtering
needs to be re-run for the next most extreme value with the initial outlier removed.
1039
This is not always needed since the test (for example Grubbs' test) usually can only detect
the most extreme value. If there is more than one extreme case in a set,
then the standard deviation will be high enough to make neither case an outlier.
1043
1044=item $stat->set_outlier_filter($code_ref);
1045
1046Set the function to filter out the outlier.
1047
1048C<$code_ref> is the reference to the subroutine implementing the filtering
1049function.
1050
1051Returns C<undef> for invalid values of C<$code_ref> (i.e.: not defined or not a
1052code reference), C<1> otherwise.
1053
1054=over 4
1055
1056=item
1057
1058Example #1: Undefined code reference
1059
1060    my $stat = Statistics::Descriptive::Full->new();
1061    $stat->add_data(1, 2, 3, 4, 5);
1062
1063    print $stat->set_outlier_filter(); # => undef
1064
1065=item
1066
1067Example #2: Valid code reference
1068
1069    sub outlier_filter { return $_[1] > 1; }
1070
1071    my $stat = Statistics::Descriptive::Full->new();
1072    $stat->add_data( 1, 1, 1, 100, 1, );
1073
1074    print $stat->set_outlier_filter( \&outlier_filter ); # => 1
1075    my @filtered_data = $stat->get_data_without_outliers();
1076    # @filtered_data is (1, 1, 1, 1)
1077
1078In this example the series is really simple and the outlier filter function as well.
1079For more complex series the outlier filter function might be more complex
1080(see Grubbs' test for outliers).
1081
The outlier filter function will receive the Statistics::Descriptive::Full object as its first parameter
and the value of the candidate outlier as its second. Having the object in the function
might be useful for complex filters where statistical properties are needed (again see Grubbs' test for outliers).
1085
1086=back
1087
1088=item $stat->set_smoother({ method => 'exponential', coeff => 0, });
1089
1090Set the method used to smooth the data and the smoothing coefficient.
See C<Statistics::Descriptive::Smoother> for more details.
1092
1093=item $stat->get_smoothed_data();
1094
1095Returns a copy of the smoothed data array.
1096
1097The smoothing method and coefficient need to be defined (see C<set_smoother>),
1098otherwise the function will return an undef value.
1099
1100=item $stat->sort_data();
1101
1102Sort the stored data and update the mindex and maxdex methods.  This
1103method uses perl's internal sort.
1104
1105=item $stat->presorted(1);
1106
1107=item $stat->presorted();
1108
1109If called with a non-zero argument, this method sets a flag that says
1110the data is already sorted and need not be sorted again.  Since some of
1111the methods in this class require sorted data, this saves some time.
1112If you supply sorted data to the object, call this method to prevent
1113the data from being sorted again. The flag is cleared whenever add_data
1114is called.  Calling the method without an argument returns the value of
1115the flag.
1116
1117=item $stat->skewness();
1118
1119Returns the skewness of the data.
1120A value of zero is no skew, negative is a left skewed tail,
1121positive is a right skewed tail.
1122This is consistent with Excel.
1123
1124=item $stat->kurtosis();
1125
1126Returns the kurtosis of the data.
1127Positive is peaked, negative is flattened.
1128
1129=item $x = $stat->percentile(25);
1130
1131=item ($x, $index) = $stat->percentile(25);
1132
1133Sorts the data and returns the value that corresponds to the
1134percentile as defined in RFC2330:
1135
1136=over 4
1137
1138=item
1139
1140For example, given the 6 measurements:
1141
1142-2, 7, 7, 4, 18, -5
1143
1144Then F(-8) = 0, F(-5) = 1/6, F(-5.0001) = 0, F(-4.999) = 1/6, F(7) =
11455/6, F(18) = 1, F(239) = 1.
1146
1147Note that we can recover the different measured values and how many
1148times each occurred from F(x) -- no information regarding the range
1149in values is lost.  Summarizing measurements using histograms, on the
1150other hand, in general loses information about the different values
1151observed, so the EDF is preferred.
1152
1153Using either the EDF or a histogram, however, we do lose information
1154regarding the order in which the values were observed.  Whether this
1155loss is potentially significant will depend on the metric being
1156measured.
1157
1158We will use the term "percentile" to refer to the smallest value of x
1159for which F(x) >= a given percentage.  So the 50th percentile of the
1160example above is 4, since F(4) = 3/6 = 50%; the 25th percentile is
1161-2, since F(-5) = 1/6 < 25%, and F(-2) = 2/6 >= 25%; the 100th
1162percentile is 18; and the 0th percentile is -infinity, as is the 15th
1163percentile, which for ease of handling and backward compatibility is returned
1164as undef() by the function.
1165
1166Care must be taken when using percentiles to summarize a sample,
1167because they can lend an unwarranted appearance of more precision
1168than is really available.  Any such summary must include the sample
1169size N, because any percentile difference finer than 1/N is below the
1170resolution of the sample.
1171
1172=back
1173
1174(Taken from:
1175I<RFC2330 - Framework for IP Performance Metrics>,
1176Section 11.3.  Defining Statistical Distributions.
1177RFC2330 is available from:
1178L<http://www.ietf.org/rfc/rfc2330.txt> .)
1179
1180If the percentile method is called in a list context then it will
1181also return the index of the percentile.
1182
1183=item $x = $stat->quantile($Type);
1184
1185Sorts the data and returns estimates of underlying distribution quantiles based on one
1186or two order statistics from the supplied elements.
1187
1188This method uses the same algorithm as Excel and the R language (quantile B<type 7>).
1189
1190The generic function quantile produces sample quantiles corresponding to the given probabilities.
1191
1192B<$Type> is an integer value between 0 and 4:
1193
1194  0 => zero quartile (Q0) : minimal value
1195  1 => first quartile (Q1) : lower quartile = lowest cut off (25%) of data = 25th percentile
1196  2 => second quartile (Q2) : median = it cuts data set in half = 50th percentile
1197  3 => third quartile (Q3) : upper quartile = highest cut off (25%) of data, or lowest 75% = 75th percentile
1198  4 => fourth quartile (Q4) : maximal value
1199
1200Example :
1201
1202  my @data = (1..10);
1203  my $stat = Statistics::Descriptive::Full->new();
1204  $stat->add_data(@data);
1205  print $stat->quantile(0); # => 1
1206  print $stat->quantile(1); # => 3.25
1207  print $stat->quantile(2); # => 5.5
1208  print $stat->quantile(3); # => 7.75
1209  print $stat->quantile(4); # => 10
1210
1211=item $stat->median();
1212
1213Sorts the data and returns the median value of the data.
1214
1215=item $stat->harmonic_mean();
1216
1217Returns the harmonic mean of the data.  Since the mean is undefined
1218if any of the data are zero or if the sum of the reciprocals is zero,
1219it will return undef for both of those cases.
1220
1221=item $stat->geometric_mean();
1222
1223Returns the geometric mean of the data.
1224
1225=item my $mode = $stat->mode();
1226
1227Returns the mode of the data. The mode is the most commonly occurring datum.
1228See L<http://en.wikipedia.org/wiki/Mode_%28statistics%29> . If all values
1229occur only once, then mode() will return undef.
1230
1231=item $stat->trimmed_mean(ltrim[,utrim]);
1232
1233C<trimmed_mean(ltrim)> returns the mean with a fraction C<ltrim>
1234of entries at each end dropped. C<trimmed_mean(ltrim,utrim)>
1235returns the mean after a fraction C<ltrim> has been removed from the
1236lower end of the data and a fraction C<utrim> has been removed from the
1237upper end of the data.  This method sorts the data before beginning
1238to analyze it.
1239
1240All calls to trimmed_mean() are cached so that they don't have to be
1241calculated a second time.
1242
1243=item $stat->frequency_distribution_ref($partitions);
1244
1245=item $stat->frequency_distribution_ref(\@bins);
1246
1247=item $stat->frequency_distribution_ref();
1248
1249C<frequency_distribution_ref($partitions)> slices the data into
1250C<$partitions> sets (where C<$partitions> is greater than 1) and counts the
1251number of items that fall into each partition. It returns a reference to
1252a hash where the keys are the numerical values of the
1253partitions used. The minimum value of the data set is not a key and the
1254maximum value of the data set is always a key. The number of entries
1255for a particular partition key are the number of items which are
1256greater than the previous partition key and less than or equal to the
1257current partition key. As an example,
1258
1259   $stat->add_data(1,1.5,2,2.5,3,3.5,4);
1260   $f = $stat->frequency_distribution_ref(2);
1261   for (sort {$a <=> $b} keys %$f) {
1262      print "key = $_, count = $f->{$_}\n";
1263   }
1264
1265prints
1266
1267   key = 2.5, count = 4
1268   key = 4, count = 3
1269
1270since there are four items less than or equal to 2.5, and 3 items
1271greater than 2.5 and less than or equal to 4.
1272
1273C<frequency_distribution_ref(\@bins)> provides the bins that are to be used
1274for the distribution.  This allows for non-uniform distributions as
1275well as trimmed or sample distributions to be found.  C<@bins> must
1276be monotonic and contain at least one element.  Note that unless the
1277set of bins covers the full range of the data, the total counts returned
1278will be less than the sample size.
1279
1280Calling C<frequency_distribution_ref()> with no arguments returns the last
1281distribution calculated, if such exists.
1282
1283=item my %hash = $stat->frequency_distribution($partitions);
1284
1285=item my %hash = $stat->frequency_distribution(\@bins);
1286
1287=item my %hash = $stat->frequency_distribution();
1288
1289Same as C<frequency_distribution_ref()> except that it returns the hash flattened
1290into the return list. Kept for compatibility reasons with previous
1291versions of Statistics::Descriptive and using it is discouraged.
1292
1293=item $stat->median_absolute_deviation()
1294
1295The median absolute deviation.
1296
1297=item $stat->summary()
1298
1299Returns a textual summary of the distribution - min, max, median, mean and
1300quantiles.
1301
1302(New in version 3.0700 .)
1303
1304=item $stat->least_squares_fit();
1305
1306=item $stat->least_squares_fit(@x);
1307
1308C<least_squares_fit()> performs a least squares fit on the data,
1309assuming a domain of C<@x> or a default of 1..$stat->count().  It
1310returns an array of four elements C<($q, $m, $r, $rms)> where
1311
1312=over 4
1313
1314=item C<$q and $m>
1315
1316satisfy the equation C<$y = $m*$x + $q>.
1317
1318=item C<$r>
1319
1320is the Pearson linear correlation coefficient.
1321
1322=item C<$rms>
1323
1324is the root-mean-square error.
1325
1326=back
1327
1328In case of error or division by zero, the empty list is returned.
1329
1330The array that is returned can be "coerced" into a hash structure
1331by doing the following:
1332
1333  my %hash = ();
1334  @hash{'q', 'm', 'r', 'err'} = $stat->least_squares_fit();
1335
1336Because calling C<least_squares_fit()> with no arguments defaults
1337to using the current range, there is no caching of the results.
1338
1339=back
1340
1341=head1 REPORTING ERRORS
1342
1343I read my email frequently, but since adopting this module I've added 2
1344children and 1 dog to my family, so please be patient about my response
1345times.  When reporting errors, please include the following to help
1346me out:
1347
1348=over 4
1349
1350=item *
1351
1352Your version of perl.  This can be obtained by typing C<perl -v> at
1353the command line.
1354
1355=item *
1356
1357Which version of Statistics::Descriptive you're using.  As you can
1358see below, I do make mistakes.  Unfortunately for me, right now
1359there are thousands of CD's with the version of this module with
1360the bugs in it.  Fortunately for you, I'm a very patient module
1361maintainer.
1362
1363=item *
1364
1365Details about what the error is.  Try to narrow down the scope
1366of the problem and send me code that I can run to verify and
1367track it down.
1368
1369=back
1370
1371=head1 AUTHOR
1372
1373Current maintainer:
1374
1375Shlomi Fish, L<http://www.shlomifish.org/> , C<shlomif@cpan.org>
1376
1377Previously:
1378
1379Colin Kuskie
1380
1381My email address can be found at http://www.perl.com under Who's Who
1382or at: https://metacpan.org/author/COLINK .
1383
1384=head1 CONTRIBUTORS
1385
1386Fabio Ponciroli & Adzuna Ltd. team (outliers handling)
1387
1388=head1 REFERENCES
1389
1390RFC2330, Framework for IP Performance Metrics
1391
1392The Art of Computer Programming, Volume 2, Donald Knuth.
1393
1394Handbook of Mathematical Functions, Milton Abramowitz and Irene Stegun.
1395
1396Probability and Statistics for Engineering and the Sciences, Jay Devore.
1397
1398=head1 COPYRIGHT
1399
1400Copyright (c) 1997,1998 Colin Kuskie. All rights reserved.  This
1401program is free software; you can redistribute it and/or modify it
1402under the same terms as Perl itself.
1403
1404Copyright (c) 1998 Andrea Spinelli. All rights reserved.  This program
1405is free software; you can redistribute it and/or modify it under the
1406same terms as Perl itself.
1407
1408Copyright (c) 1994,1995 Jason Kastner. All rights
1409reserved.  This program is free software; you can redistribute it
1410and/or modify it under the same terms as Perl itself.
1411
1412=head1 LICENSE
1413
1414This program is free software; you can redistribute it and/or modify it
1415under the same terms as Perl itself.
1416
1417=for :stopwords cpan testmatrix url bugtracker rt cpants kwalitee diff irc mailto metadata placeholders metacpan
1418
1419=head1 SUPPORT
1420
1421=head2 Websites
1422
1423The following websites have more information about this module, and may be of help to you. As always,
1424in addition to those websites please use your favorite search engine to discover more resources.
1425
1426=over 4
1427
1428=item *
1429
1430MetaCPAN
1431
1432A modern, open-source CPAN search engine, useful to view POD in HTML format.
1433
1434L<https://metacpan.org/release/Statistics-Descriptive>
1435
1436=item *
1437
1438RT: CPAN's Bug Tracker
1439
1440The RT ( Request Tracker ) website is the default bug/issue tracking system for CPAN.
1441
1442L<https://rt.cpan.org/Public/Dist/Display.html?Name=Statistics-Descriptive>
1443
1444=item *
1445
1446CPANTS
1447
1448The CPANTS is a website that analyzes the Kwalitee ( code metrics ) of a distribution.
1449
1450L<http://cpants.cpanauthors.org/dist/Statistics-Descriptive>
1451
1452=item *
1453
1454CPAN Testers
1455
1456The CPAN Testers is a network of smoke testers who run automated tests on uploaded CPAN distributions.
1457
1458L<http://www.cpantesters.org/distro/S/Statistics-Descriptive>
1459
1460=item *
1461
1462CPAN Testers Matrix
1463
1464The CPAN Testers Matrix is a website that provides a visual overview of the test results for a distribution on various Perls/platforms.
1465
1466L<http://matrix.cpantesters.org/?dist=Statistics-Descriptive>
1467
1468=item *
1469
1470CPAN Testers Dependencies
1471
1472The CPAN Testers Dependencies is a website that shows a chart of the test results of all dependencies for a distribution.
1473
1474L<http://deps.cpantesters.org/?module=Statistics::Descriptive>
1475
1476=back
1477
1478=head2 Bugs / Feature Requests
1479
1480Please report any bugs or feature requests by email to C<bug-statistics-descriptive at rt.cpan.org>, or through
1481the web interface at L<https://rt.cpan.org/Public/Bug/Report.html?Queue=Statistics-Descriptive>. You will be automatically notified of any
1482progress on the request by the system.
1483
1484=head2 Source Code
1485
1486The code is open to the world, and available for you to hack on. Please feel free to browse it and play
1487with it, or whatever. If you want to contribute patches, please send me a diff or prod me to pull
1488from your repository :)
1489
1490L<https://github.com/shlomif/perl-Statistics-Descriptive>
1491
1492  git clone git://github.com/shlomif/perl-Statistics-Descriptive.git
1493
1494=head1 AUTHOR
1495
1496Shlomi Fish <shlomif@cpan.org>
1497
1498=head1 BUGS
1499
1500Please report any bugs or feature requests on the bugtracker website
1501L<https://github.com/shlomif/perl-Statistics-Descriptive/issues>
1502
1503When submitting a bug or request, please include a test-file or a
1504patch to an existing test-file that illustrates the bug or desired
1505feature.
1506
1507=head1 COPYRIGHT AND LICENSE
1508
1509This software is copyright (c) 1997 by Jason Kastner, Andrea Spinelli, Colin Kuskie, and others.
1510
1511This is free software; you can redistribute it and/or modify it under
1512the same terms as the Perl 5 programming language system itself.
1513
1514=cut
1515