package Statistics::Descriptive::Full;
$Statistics::Descriptive::Full::VERSION = '3.0800';
use strict;
use warnings;

use Carp;
use POSIX ();
use Statistics::Descriptive::Smoother;

use vars qw($a $b %fields);

use parent qw(Statistics::Descriptive::Sparse);

use List::MoreUtils ();
use List::Util      ();

## no critic (ProhibitExplicitReturnUndef)
##Create a list of fields not to remove when data is updated
%fields = (
    _permitted => undef,    ##Place holder for the inherited key hash
    data       => undef,    ##Our data
    samples    => undef,    ##Number of samples for each value of the data set
    presorted  => undef,    ##Flag to indicate the data is already sorted
    _reserved  => undef,    ##Place holder for this lookup hash
);

##Accessor generators are not defined in this file; presumably they are
##inherited from Statistics::Descriptive::Sparse. The "private" variant is
##used throughout this file with a leading underscore (e.g. _data, _median).
__PACKAGE__->_make_private_accessors(
    [
        qw(data samples frequency geometric_mean harmonic_mean
            least_squares_fit median mode
            skewness kurtosis median_absolute_deviation
        )
    ]
);
__PACKAGE__->_make_accessors( [qw(presorted _reserved _trimmed_mean_cache)] );

##Reset the per-object storage: empty data and samples arrays, the reserved
##key lookup, a cleared presorted flag and an empty trimmed-mean cache.
sub _clear_fields
{
    my $self = shift;

    # Empty array ref for holding data later!
    $self->_data(    [] );
    $self->_samples( [] );
    $self->_reserved( \%fields );
    $self->presorted(0);
    $self->_trimmed_mean_cache( +{} );

    return;
}

##Have to override the base method to add the data to the object
##The proxy method from above is still valid
sub new
{
    my $proto = shift;
    my $class = ref($proto) || $proto;

    # Create my self re SUPER
    my $self = $class->SUPER::new();
    bless( $self, $class );    #Re-anneal the object
    $self->_clear_fields();
    return $self;
}

##True if $field is one of the keys of the reserved-field lookup.
sub _is_reserved
{
    my $self  = shift;
    my $field = shift;

    return exists( $self->_reserved->{$field} );
}

##Delete every key of the object that is neither reserved by this class nor
##listed in the inherited _permitted hash, then reset the trimmed-mean cache
##to an empty hash.
sub _delete_all_cached_keys
{
    my $self = shift;

    my %stale = %{$self};

    # Remove reserved keys for this class from the deletion list
    delete @stale{ keys %{ $self->_reserved } };
    delete @stale{ keys %{ $self->_permitted } };
    delete $stale{_trimmed_mean_cache};

    # Whatever is left is an out of date cached value
    delete @{$self}{ keys %stale };

    $self->{_trimmed_mean_cache} = {};    # just reset this one
    return;
}

##Clear a stat.  More efficient than destroying an object and calling
##new.  Does nothing when the object holds no data.
sub clear
{
    my $self = shift;    ##Myself

    if ( !$self->count() )
    {
        return;
    }

    $self->_delete_all_cached_keys();
    $self->SUPER::clear();
    $self->_clear_fields();

    return;
}

##Append values to the data set and update min, max, sample_range, sum,
##sumsq, mean and count.  Accepts either a list of values or a single array
##reference.  Always returns 1; an empty input is a no-op.
sub add_data
{
    my $self = shift;    ##Myself

    my $aref;

    if ( ref $_[0] eq 'ARRAY' )
    {
        $aref = $_[0];
    }
    else
    {
        $aref = \@_;
    }

    ##If we were given no data, we do nothing.
    return 1 if ( !@{$aref} );

    my ( $min, $max, $sum, $sumsq );
    my $count = $self->count;

    # $count is modified lower down, but we need this flag after that
    my $has_existing_data = $count;

    # Take care of appending to an existing data set
    if ($has_existing_data)
    {
        $min   = $self->min();
        $max   = $self->max();
        $sum   = $self->sum();
        $sumsq = $self->sumsq();
    }
    else
    {
        $min   = $aref->[0];
        $max   = $aref->[0];
        $sum   = 0;
        $sumsq = 0;
    }

    # need to allow for already having data
    $sum   += List::Util::sum(@$aref);
    $sumsq += List::Util::sum( map { $_**2 } @$aref );
    $max = List::Util::max( $max, @$aref );
    $min = List::Util::min( $min, @$aref );
    $count += scalar @$aref;
    my $mean = $sum / $count;

    $self->min($min);
    $self->max($max);
    $self->sample_range( $max - $min );
    $self->sum($sum);
    $self->sumsq($sumsq);
    $self->mean($mean);
    $self->count($count);

    ##Variance isn't commonly enough
    ##used to recompute every single data add, so just clear its cache.
    $self->_variance(undef);

    push @{ $self->_data() }, @{$aref};

    # no need to clear keys if we are a newly populated object,
    # and profiling shows it takes a long time when creating
    # and populating many stats objects
    if ($has_existing_data)
    {
        ##Clear the presorted flag
        $self->presorted(0);
        $self->_delete_all_cached_keys();
    }

    return 1;
}

##Add data given as an array ref of hash refs ({ value => samples }).
##The keys are passed to add_data() and the hash values are appended to the
##samples array.  Always returns 1; an empty array ref is a no-op.
sub add_data_with_samples
{
    my ( $self, $aref_values ) = @_;

    return 1 if ( !@{$aref_values} );

    my $aref_data    = [ map { keys %$_ } @{$aref_values} ];
    my $aref_samples = [ map { values %$_ } @{$aref_values} ];

    $self->add_data($aref_data);
    push @{ $self->_samples() }, @{$aref_samples};

    return 1;
}

##Return the stored data as a list.
sub get_data
{
    my $self = shift;
    return @{ $self->_data() };
}

##Return the data with the single most extreme value removed, provided the
##user-supplied outlier filter flags that value as an outlier; otherwise
##return all of the data.  Warns and returns nothing when there are fewer
##than $Statistics::Descriptive::Min_samples_number values or when no filter
##has been set.
sub get_data_without_outliers
{
    my $self = shift;

    if ( $self->count() < $Statistics::Descriptive::Min_samples_number )
    {
        carp(
"Need at least $Statistics::Descriptive::Min_samples_number samples\n"
        );
        return;
    }

    if ( !defined $self->{_outlier_filter} )
    {
        carp("Outliers filter not defined\n");
        return;
    }

    my $outlier_candidate_index = $self->_outlier_candidate_index;
    my $possible_outlier = ( $self->_data() )->[$outlier_candidate_index];
    my $is_outlier = $self->{_outlier_filter}->( $self, $possible_outlier );

    return $self->get_data unless $is_outlier;

    # Removing the outlier from the dataset
    my @good_indexes =
        grep { $_ != $outlier_candidate_index } ( 0 .. $self->count() - 1 );

    my @data          = $self->get_data;
    my @filtered_data = @data[@good_indexes];
    return @filtered_data;
}

##Register the code reference used by get_data_without_outliers().
##Returns 1 on success; warns and returns nothing if the argument is not a
##code reference.
sub set_outlier_filter
{
    my ( $self, $code_ref ) = @_;

    if ( !$code_ref || ref($code_ref) ne "CODE" )
    {
        carp("Need to pass a code reference");
        return;
    }

    $self->{_outlier_filter} = $code_ref;
    return 1;
}

##Index of the value with the largest absolute distance from the mean.
##On ties the lowest index wins (the comparison is strict).
sub _outlier_candidate_index
{
    my $self = shift;

    my $mean                    = $self->mean();
    my $outlier_candidate_index = 0;
    my $max_std_deviation       = abs( ( $self->_data() )->[0] - $mean );
    foreach my $idx ( 1 .. ( $self->count() - 1 ) )
    {
        my $curr_value = ( $self->_data() )->[$idx];
        if ( $max_std_deviation < abs( $curr_value - $mean ) )
        {
            $outlier_candidate_index = $idx;
            $max_std_deviation       = abs( $curr_value - $mean );
        }
    }
    return $outlier_candidate_index;
}

##Create the smoother object from the caller's argument hash.  Note that
##the 'data' and 'samples' keys are written into the hash that was passed in.
sub set_smoother
{
    my ( $self, $args ) = @_;

    $args->{data}    = $self->_data();
    $args->{samples} = $self->_samples();

    $self->{_smoother} = Statistics::Descriptive::Smoother->instantiate($args);
}

##Delegate to the smoother object; warns and returns nothing if
##set_smoother() has not been called.
sub get_smoothed_data
{
    my ( $self, $args ) = @_;

    if ( !defined $self->{_smoother} )
    {
        carp("Smoother object not defined\n");
        return;
    }
    $self->{_smoother}->get_smoothed_data();
}

##Index of the maximum value (the last index when the data is flagged as
##sorted, otherwise the first index whose value equals max).  Returns undef
##for an empty data set.
sub maxdex
{
    my $self = shift;

    return undef if !$self->count;
    my $maxdex;

    if ( $self->presorted )
    {
        $maxdex = $self->count - 1;
    }
    else
    {
        my $max = $self->max;
        $maxdex = List::MoreUtils::first_index { $_ == $max } $self->get_data;
    }

    $self->{maxdex} = $maxdex;

    return $maxdex;
}

##Index of the minimum value (0 when the data is flagged as sorted,
##otherwise the first index whose value equals min).  Returns undef for an
##empty data set.
sub mindex
{
    my $self = shift;

    return undef if !$self->count;

    my $mindex;

    if ( $self->presorted )
    {
        $mindex = 0;
    }
    else
    {
        my $min = $self->min;
        $mindex = List::MoreUtils::first_index { $_ == $min } $self->get_data;
    }

    $self->{mindex} = $mindex;

    return $mindex;
}

##Sort the stored data numerically and set the presorted flag.  Does
##nothing if the flag is already set.  Always returns 1.
sub sort_data
{
    my $self = shift;

    if ( !$self->presorted() )
    {
        ##Sort the data in ascending order
        $self->_data( [ sort { $a <=> $b } @{ $self->_data() } ] );
        $self->presorted(1);

        ##The maxima and minima indices need no fixing here: mindex() and
        ##maxdex() consult the presorted flag themselves.
    }

    return 1;
}

##Return the value at the requested percentile (and, in list context, its
##index in the sorted data).  Returns nothing when there is no data or when
##the percentile is smaller than 100/count.
sub percentile
{
    my $self       = shift;
    my $percentile = shift || 0;
    ##Since we're returning a single value there's no real need
    ##to cache this.

    ##If the requested percentile is less than the "percentile bin
    ##size" then return undef.  Check description of RFC 2330 in the
    ##POD below.
    my $count = $self->count();

    if ( ( !$count ) || ( $percentile < 100 / $count ) )
    {
        return;    # allow for both scalar and list context
    }

    $self->sort_data();
    my $num   = $count * $percentile / 100;
    my $index = POSIX::ceil($num) - 1;
    my $val   = $self->_data->[$index];
    return wantarray
        ? ( $val, $index )
        : $val;
}

##Median of the stored data, which the caller must already have sorted:
##the middle element for an odd count, the mean of the two middle elements
##for an even count.
sub _calc_new_median
{
    my $self  = shift;
    my $count = $self->count();

    ##Even or odd
    if ( $count % 2 )
    {
        return $self->_data->[ ( $count - 1 ) / 2 ];
    }
    else
    {
        return (
            (
                $self->_data->[ ($count) / 2 ] +
                    $self->_data->[ ( $count - 2 ) / 2 ]
            ) / 2
        );
    }
}

##Sort the data and return its median (cached).  Returns undef for an empty
##data set.
sub median
{
    my $self = shift;

    return undef if !$self->count;

    ##Cached?
    if ( !defined( $self->_median() ) )
    {
        $self->sort_data();
        $self->_median( $self->_calc_new_median() );
    }
    return $self->_median();
}

##Return quartile number 0..4 of the data, interpolating linearly between
##the two neighbouring order statistics.  Warns and returns nothing for any
##other argument; returns undef for an empty data set.
sub quantile
{
    my ( $self, $QuantileNumber ) = @_;

    # The whole argument must be exactly one of the digits 0-4.  (An
    # unanchored alternation such as /^0|1|2|3|4$/ would also let values
    # like "10" or "0.5" through.)
    unless ( defined $QuantileNumber and $QuantileNumber =~ m/\A[0-4]\z/ )
    {
        carp("Bad quartile type, must be 0, 1, 2, 3 or 4\n");
        return;
    }

    #  check data count after the args are checked - should help debugging
    return undef if !$self->count;

    $self->sort_data();

    return $self->_data->[0] if ( $QuantileNumber == 0 );

    my $count = $self->count();

    return $self->_data->[ $count - 1 ] if ( $QuantileNumber == 4 );

    # 1-based (fractional) position of the quantile in the sorted data
    my $K_quantile = ( ( $QuantileNumber / 4 ) * ( $count - 1 ) + 1 );
    my $F_quantile = $K_quantile - POSIX::floor($K_quantile);
    $K_quantile = POSIX::floor($K_quantile);

    # interpolation
    my $aK_quantile = $self->_data->[ $K_quantile - 1 ];
    return $aK_quantile if ( $F_quantile == 0 );
    my $aKPlus_quantile = $self->_data->[$K_quantile];

    # Compute the quantile
    my $quantile =
        $aK_quantile + ( $F_quantile * ( $aKPlus_quantile - $aK_quantile ) );

    return $quantile;
}

##Mean of the sorted data after dropping int(count * $lower) values from the
##low end and int(count * $upper) values from the high end.  Uses a running
##mean; returns 0 if nothing is left after trimming.
sub _real_calc_trimmed_mean
{
    my $self  = shift;
    my $lower = shift;
    my $upper = shift;

    my $lower_trim = int( $self->count() * $lower );
    my $upper_trim = int( $self->count() * $upper );
    my ( $tm_count, $tm_mean ) = ( 0, 0 );

    $self->sort_data();

    my $last_index = $self->count() - $upper_trim - 1;
    for ( my $index = $lower_trim ; $index <= $last_index ; ++$index )
    {
        my $val = $self->_data()->[$index];
        ++$tm_count;
        $tm_mean += ( $val - $tm_mean ) / $tm_count;
    }

    return $tm_mean;
}

##Trimmed mean, cached per (lower, upper) pair.  With a single argument the
##same fraction is trimmed from both ends.  Returns undef for an empty data
##set.
sub trimmed_mean
{
    my $self = shift;
    my ( $lower, $upper );

    #upper bound is in arg list or is same as lower
    if ( @_ == 1 )
    {
        ( $lower, $upper ) = ( $_[0], $_[0] );
    }
    else
    {
        ( $lower, $upper ) = ( $_[0], $_[1] );
    }

    #  check data count after the args
    return undef if !$self->count;

    ##Cache
    my $thistm = join ':', $lower, $upper;
    my $cache  = $self->_trimmed_mean_cache();
    if ( !exists( $cache->{$thistm} ) )
    {
        $cache->{$thistm} = $self->_real_calc_trimmed_mean( $lower, $upper );
    }

    return $cache->{$thistm};
}

##True if |$val| does not exceed $Statistics::Descriptive::Tolerance.
sub _test_for_too_small_val
{
    my $self = shift;
    my $val  = shift;

    return ( abs($val) <= $Statistics::Descriptive::Tolerance );
}

##count / sum(1/x).  Returns nothing if any datum, or the sum of the
##reciprocals, is within the tolerance of zero.
sub _calc_harmonic_mean
{
    my $self = shift;

    my $hs = 0;

    foreach my $item ( @{ $self->_data() } )
    {
        ##Guarantee that there are no divide by zeros
        if ( $self->_test_for_too_small_val($item) )
        {
            return;
        }

        $hs += 1 / $item;
    }

    if ( $self->_test_for_too_small_val($hs) )
    {
        return;
    }

    return $self->count() / $hs;
}

##Harmonic mean of the data (cached when defined); undef when it cannot be
##computed.
sub harmonic_mean
{
    my $self = shift;

    if ( !defined( $self->_harmonic_mean() ) )
    {
        $self->_harmonic_mean( scalar( $self->_calc_harmonic_mean() ) );
    }

    return $self->_harmonic_mean();
}

##Most frequently occurring datum (cached).  On ties, the value that first
##reaches the highest count in data order wins.  Returns undef when no value
##occurs more than once.
sub mode
{
    my $self = shift;

    if ( !defined( $self->_mode() ) )
    {
        my $mode       = 0;
        my $occurances = 0;

        my %count;

        foreach my $item ( @{ $self->_data() } )
        {
            my $count = ++$count{$item};
            if ( $count > $occurances )
            {
                $mode       = $item;
                $occurances = $count;
            }
        }

        $self->_mode(
            ( $occurances > 1 )
            ? { exists => 1, mode => $mode }
            : { exists => 0, }
        );
    }

    my $m = $self->_mode;

    return $m->{'exists'} ? $m->{mode} : undef;
}

##Geometric mean of the data (cached).  Returns undef for an empty data set
##or if any datum is negative.
sub geometric_mean
{
    my $self = shift;

    return undef if !$self->count;

    if ( !defined( $self->_geometric_mean() ) )
    {
        my $gm       = 1;
        my $exponent = 1 / $self->count();

        for my $val ( @{ $self->_data() } )
        {
            if ( $val < 0 )
            {
                return undef;
            }
            $gm *= $val**$exponent;
        }

        $self->_geometric_mean($gm);
    }

    return $self->_geometric_mean();
}

##Sample skewness: n / ((n-1)(n-2)) * sum(((x - mean) / sd)**3).
##Undef when the standard deviation is false or there are fewer than three
##values.
sub skewness
{
    my $self = shift;

    if ( !defined( $self->_skewness() ) )
    {
        my $n  = $self->count();
        my $sd = $self->standard_deviation();

        my $skew;

        #  skip if insufficient records
        if ( $sd && $n > 2 )
        {

            my $mean = $self->mean();

            my $sum_pow3 = 0;
            foreach my $rec ( $self->get_data )
            {
                $sum_pow3 += ( ( $rec - $mean ) / $sd )**3;
            }

            my $correction = $n / ( ( $n - 1 ) * ( $n - 2 ) );

            $skew = $correction * $sum_pow3;
        }

        $self->_skewness($skew);
    }

    return $self->_skewness();
}

##Sample excess kurtosis with the small-sample corrections shown below.
##Undef when the standard deviation is false or there are fewer than four
##values.
sub kurtosis
{
    my $self = shift;

    if ( !defined( $self->_kurtosis() ) )
    {
        my $kurt;

        my $n  = $self->count();
        my $sd = $self->standard_deviation();

        if ( $sd && $n > 3 )
        {

            my $mean = $self->mean();

            my $sum_pow4 = 0;
            foreach my $rec ( $self->get_data )
            {
                $sum_pow4 += ( ( $rec - $mean ) / $sd )**4;
            }

            my $correction1 =
                ( $n * ( $n + 1 ) ) / ( ( $n - 1 ) * ( $n - 2 ) * ( $n - 3 ) );
            my $correction2 =
                ( 3 * ( $n - 1 )**2 ) / ( ( $n - 2 ) * ( $n - 3 ) );

            $kurt = ( $correction1 * $sum_pow4 ) - $correction2;
        }

        $self->_kurtosis($kurt);
    }

    return $self->_kurtosis();
}

##Count the data into bins and return a reference to a hash mapping each
##bin's upper limit to its count.  The argument is either a number of
##equal-width partitions or an array ref of monotonically non-decreasing bin
##limits.  Without arguments the previously computed distribution is
##returned if there is one.  Returns undef when there are fewer than two
##data values or the argument is unusable.
sub frequency_distribution_ref
{
    my $self = shift;
    my @k    = ();

    # Must have at least two elements
    if ( $self->count() < 2 )
    {
        return undef;
    }

    if ( ( !@_ ) && ( defined $self->_frequency() ) )
    {
        return $self->_frequency();
    }

    my %bins;
    my $partitions = shift;

    if ( ref($partitions) eq 'ARRAY' )
    {
        @k = @{$partitions};
        return undef unless @k;    ##Empty array
        if ( @k > 1 )
        {
            ##Check for monotonicity
            my $element = $k[0];
            for my $next_elem ( @k[ 1 .. $#k ] )
            {
                if ( $element > $next_elem )
                {
                    carp
"Non monotonic array cannot be used as frequency bins!\n";
                    return undef;
                }
                $element = $next_elem;
            }
        }
        %bins = map { $_ => 0 } @k;
    }
    else
    {
        # $partitions is undef when called with no arguments and nothing
        # cached; test definedness first to avoid an "uninitialized" warning.
        return undef unless defined $partitions && $partitions >= 1;
        my $interval = $self->sample_range() / $partitions;
        foreach my $idx ( 1 .. ( $partitions - 1 ) )
        {
            push @k, ( $self->min() + $idx * $interval );
        }

        # Only the top bin is pre-seeded; the other bins appear in the
        # result once a datum falls into them.
        $bins{ $self->max() } = 0;

        push @k, $self->max();
    }

ELEMENT:
    foreach my $element ( @{ $self->_data() } )
    {
        foreach my $limit (@k)
        {
            if ( $element <= $limit )
            {
                $bins{$limit}++;
                next ELEMENT;
            }
        }
    }

    return $self->_frequency( \%bins );
}

##Same as frequency_distribution_ref(), but returns the hash flattened into
##a list (or undef on failure).
sub frequency_distribution
{
    my $self = shift;

    my $ret = $self->frequency_distribution_ref(@_);

    if ( !defined($ret) )
    {
        return undef;
    }
    else
    {
        return %$ret;
    }
}

##Least squares fit of the data (as y) against @x.  Returns the list
##($q, $m, $r, $rms): intercept, slope, Pearson correlation coefficient and
##root-mean-square error.  Returns the empty list when there are fewer than
##two data values, when the lengths differ, or when a denominator is within
##the tolerance of zero.
sub least_squares_fit
{
    my $self = shift;
    return () if $self->count() < 2;

    ##Sigma sums
    my ( $sigmaxy, $sigmax, $sigmaxx, $sigmayy, $sigmay ) =
        ( 0, 0, 0, 0, $self->sum );
    my ( $xvar, $yvar, $err );

    ##Work variables
    my ( $iter, $y, $denom ) = ( 0, 0, 0 );
    my $count = $self->count();
    my @x;

    ##Outputs
    my ( $m, $q, $r, $rms );

    # The default domain 1..count is used unless a second x value was given.
    if ( !defined $_[1] )
    {
        @x = 1 .. $self->count();
    }
    else
    {
        @x = @_;
        if ( $self->count() != scalar @x )
        {
            carp "Range and domain are of unequal length.";
            return ();
        }
    }
    foreach my $x_val (@x)
    {
        $y = $self->_data->[$iter];
        $sigmayy += $y * $y;
        $sigmaxx += $x_val * $x_val;
        $sigmaxy += $x_val * $y;
        $sigmax  += $x_val;
        ++$iter;
    }
    $denom = $count * $sigmaxx - $sigmax * $sigmax;
    return ()
        unless abs($denom) > $Statistics::Descriptive::Tolerance;

    $m = ( $count * $sigmaxy - $sigmax * $sigmay ) / $denom;
    $q = ( $sigmaxx * $sigmay - $sigmax * $sigmaxy ) / $denom;

    $xvar = $sigmaxx - $sigmax * $sigmax / $count;
    $yvar = $sigmayy - $sigmay * $sigmay / $count;

    $denom = sqrt( $xvar * $yvar );
    return () unless ( abs($denom) > $Statistics::Descriptive::Tolerance );
    $r = ( $sigmaxy - $sigmax * $sigmay / $count ) / $denom;

    $iter = 0;
    $rms  = 0.0;
    foreach my $x_val (@x)
    {
        ##Error = Real y - calculated y
        $err = $self->_data->[$iter] - ( $m * $x_val + $q );
        $rms += $err * $err;
        ++$iter;
    }

    $rms = sqrt( $rms / $count );

    $self->_least_squares_fit( [ $q, $m, $r, $rms ] );

    return @{ $self->_least_squares_fit() };
}

##Median of the absolute deviations of the data from its median (cached).
sub median_absolute_deviation
{
    my ($self) = @_;

    if ( !defined( $self->_median_absolute_deviation() ) )
    {
        # The median does not change while mapping, so fetch it only once.
        my $median = $self->median;

        my $stat = $self->new;
        $stat->add_data( map { abs( $_ - $median ) } $self->get_data );
        $self->_median_absolute_deviation( $stat->median );
    }

    return $self->_median_absolute_deviation();
}

##Textual summary: min, max, mean, median and the first and third quartiles,
##each formatted with '%.5e'.
sub summary
{
    my ($self) = @_;

    my $FMT = '%.5e';

    return
        sprintf( "Min: $FMT\nMax: $FMT\nMean: $FMT\nMedian: $FMT\n"
            . "1st quantile: $FMT\n3rd quantile: $FMT\n",
        $self->min, $self->max, $self->mean, $self->median, $self->quantile(1),
        $self->quantile(3), );

}
1;

__END__

=pod

=encoding UTF-8

=head1 NAME

Statistics::Descriptive - Module of basic descriptive statistical functions.

=head1 VERSION

version 3.0800

=head1 SYNOPSIS

    use Statistics::Descriptive;
    my $stat = Statistics::Descriptive::Full->new();
    $stat->add_data(1,2,3,4);
    my $mean = $stat->mean();
    my $var = $stat->variance();
    my $tm = $stat->trimmed_mean(.25);
    $Statistics::Descriptive::Tolerance = 1e-10;

=head1 DESCRIPTION

This module provides basic functions used in descriptive statistics.
It has an object oriented design and supports two different types of
data storage and calculation objects: sparse and full. With the sparse
method, none of the data is stored and only a few statistical measures
are available. Using the full method, the entire data set is retained
and additional functions are available.

Whenever a division by zero may occur, the denominator is checked to be
greater than the value C<$Statistics::Descriptive::Tolerance>, which
defaults to 0.0. You may want to change this value to some small
positive value such as 1e-24 in order to obtain error messages in case
of very small denominators.

Many of the methods (both Sparse and Full) cache values so that subsequent
calls with the same arguments are faster.

=head1 METHODS

=head2 Sparse Methods

=over 5

=item $stat = Statistics::Descriptive::Sparse->new();

Create a new sparse statistics object.

=item $stat->clear();

Effectively the same as

    my $class = ref($stat);
    undef $stat;
    $stat = $class->new;

except more efficient.

=item $stat->add_data(1,2,3);

Adds data to the statistics variable.
The cached statistical values are 936updated automatically. 937 938=item $stat->count(); 939 940Returns the number of data items. 941 942=item $stat->mean(); 943 944Returns the mean of the data. 945 946=item $stat->sum(); 947 948Returns the sum of the data. 949 950=item $stat->variance(); 951 952Returns the variance of the data. Division by n-1 is used. 953 954=item $stat->standard_deviation(); 955 956Returns the standard deviation of the data. Division by n-1 is used. 957 958=item $stat->min(); 959 960Returns the minimum value of the data set. 961 962=item $stat->mindex(); 963 964Returns the index of the minimum value of the data set. 965 966=item $stat->max(); 967 968Returns the maximum value of the data set. 969 970=item $stat->maxdex(); 971 972Returns the index of the maximum value of the data set. 973 974=item $stat->sample_range(); 975 976Returns the sample range (max - min) of the data set. 977 978=back 979 980=head2 Full Methods 981 982Similar to the Sparse Methods above, any Full Method that is called caches 983the current result so that it doesn't have to be recalculated. In some 984cases, several values can be cached at the same time. 985 986=over 5 987 988=item $stat = Statistics::Descriptive::Full->new(); 989 990Create a new statistics object that inherits from 991Statistics::Descriptive::Sparse so that it contains all the methods 992described above. 993 994=item $stat->add_data(1,2,4,5); 995 996Adds data to the statistics variable. All of the sparse statistical 997values are updated and cached. Cached values from Full methods are 998deleted since they are no longer valid. 999 1000I<Note: Calling add_data with an empty array will delete all of your 1001Full method cached values! Cached values for the sparse methods are 1002not changed> 1003 1004=item $stat->add_data_with_samples([{1 => 10}, {2 => 20}, {3 => 30},]); 1005 1006Add data to the statistics variable and set the number of samples each value 1007has been built with. 
The data is the key of each element of the input array 1008ref, while the value is the number of samples: [{data1 => smaples1}, {data2 => 1009samples2}, ...]. 1010 1011B<NOTE:> The number of samples is only used by the smoothing function and is 1012ignored otherwise. It is not equivalent to repeat count. In order to repeat 1013a certain datum more than one time call add_data() like this: 1014 1015 my $value = 5; 1016 my $repeat_count = 10; 1017 $stat->add_data( 1018 [ ($value) x $repeat_count ] 1019 ); 1020 1021=item $stat->get_data(); 1022 1023Returns a copy of the data array. 1024 1025=item $stat->get_data_without_outliers(); 1026 1027Returns a copy of the data array without outliers. The number minimum of 1028samples to apply the outlier filtering is C<$Statistics::Descriptive::Min_samples_number>, 10294 by default. 1030 1031A function to detect outliers need to be defined (see C<set_outlier_filter>), 1032otherwise the function will return an undef value. 1033 1034The filtering will act only on the most extreme value of the data set 1035(i.e.: value with the highest absolute standard deviation from the mean). 1036 1037If there is the need to remove more than one outlier, the filtering 1038need to be re-run for the next most extreme value with the initial outlier removed. 1039 1040This is not always needed since the test (for example Grubb's test) usually can only detect 1041the most exreme value. If there is more than one extreme case in a set, 1042then the standard deviation will be high enough to make neither case an outlier. 1043 1044=item $stat->set_outlier_filter($code_ref); 1045 1046Set the function to filter out the outlier. 1047 1048C<$code_ref> is the reference to the subroutine implementing the filtering 1049function. 1050 1051Returns C<undef> for invalid values of C<$code_ref> (i.e.: not defined or not a 1052code reference), C<1> otherwise. 
1053 1054=over 4 1055 1056=item 1057 1058Example #1: Undefined code reference 1059 1060 my $stat = Statistics::Descriptive::Full->new(); 1061 $stat->add_data(1, 2, 3, 4, 5); 1062 1063 print $stat->set_outlier_filter(); # => undef 1064 1065=item 1066 1067Example #2: Valid code reference 1068 1069 sub outlier_filter { return $_[1] > 1; } 1070 1071 my $stat = Statistics::Descriptive::Full->new(); 1072 $stat->add_data( 1, 1, 1, 100, 1, ); 1073 1074 print $stat->set_outlier_filter( \&outlier_filter ); # => 1 1075 my @filtered_data = $stat->get_data_without_outliers(); 1076 # @filtered_data is (1, 1, 1, 1) 1077 1078In this example the series is really simple and the outlier filter function as well. 1079For more complex series the outlier filter function might be more complex 1080(see Grubbs' test for outliers). 1081 1082The outlier filter function will receive as first parameter the Statistics::Descriptive::Full object, 1083as second the value of the candidate outlier. Having the object in the function 1084might be useful for complex filters where statistics property are needed (again see Grubbs' test for outlier). 1085 1086=back 1087 1088=item $stat->set_smoother({ method => 'exponential', coeff => 0, }); 1089 1090Set the method used to smooth the data and the smoothing coefficient. 1091See C<Statistics::Smoother> for more details. 1092 1093=item $stat->get_smoothed_data(); 1094 1095Returns a copy of the smoothed data array. 1096 1097The smoothing method and coefficient need to be defined (see C<set_smoother>), 1098otherwise the function will return an undef value. 1099 1100=item $stat->sort_data(); 1101 1102Sort the stored data and update the mindex and maxdex methods. This 1103method uses perl's internal sort. 1104 1105=item $stat->presorted(1); 1106 1107=item $stat->presorted(); 1108 1109If called with a non-zero argument, this method sets a flag that says 1110the data is already sorted and need not be sorted again. 
Since some of 1111the methods in this class require sorted data, this saves some time. 1112If you supply sorted data to the object, call this method to prevent 1113the data from being sorted again. The flag is cleared whenever add_data 1114is called. Calling the method without an argument returns the value of 1115the flag. 1116 1117=item $stat->skewness(); 1118 1119Returns the skewness of the data. 1120A value of zero is no skew, negative is a left skewed tail, 1121positive is a right skewed tail. 1122This is consistent with Excel. 1123 1124=item $stat->kurtosis(); 1125 1126Returns the kurtosis of the data. 1127Positive is peaked, negative is flattened. 1128 1129=item $x = $stat->percentile(25); 1130 1131=item ($x, $index) = $stat->percentile(25); 1132 1133Sorts the data and returns the value that corresponds to the 1134percentile as defined in RFC2330: 1135 1136=over 4 1137 1138=item 1139 1140For example, given the 6 measurements: 1141 1142-2, 7, 7, 4, 18, -5 1143 1144Then F(-8) = 0, F(-5) = 1/6, F(-5.0001) = 0, F(-4.999) = 1/6, F(7) = 11455/6, F(18) = 1, F(239) = 1. 1146 1147Note that we can recover the different measured values and how many 1148times each occurred from F(x) -- no information regarding the range 1149in values is lost. Summarizing measurements using histograms, on the 1150other hand, in general loses information about the different values 1151observed, so the EDF is preferred. 1152 1153Using either the EDF or a histogram, however, we do lose information 1154regarding the order in which the values were observed. Whether this 1155loss is potentially significant will depend on the metric being 1156measured. 1157 1158We will use the term "percentile" to refer to the smallest value of x 1159for which F(x) >= a given percentage. 
So the 50th percentile of the 1160example above is 4, since F(4) = 3/6 = 50%; the 25th percentile is 1161-2, since F(-5) = 1/6 < 25%, and F(-2) = 2/6 >= 25%; the 100th 1162percentile is 18; and the 0th percentile is -infinity, as is the 15th 1163percentile, which for ease of handling and backward compatibility is returned 1164as undef() by the function. 1165 1166Care must be taken when using percentiles to summarize a sample, 1167because they can lend an unwarranted appearance of more precision 1168than is really available. Any such summary must include the sample 1169size N, because any percentile difference finer than 1/N is below the 1170resolution of the sample. 1171 1172=back 1173 1174(Taken from: 1175I<RFC2330 - Framework for IP Performance Metrics>, 1176Section 11.3. Defining Statistical Distributions. 1177RFC2330 is available from: 1178L<http://www.ietf.org/rfc/rfc2330.txt> .) 1179 1180If the percentile method is called in a list context then it will 1181also return the index of the percentile. 1182 1183=item $x = $stat->quantile($Type); 1184 1185Sorts the data and returns estimates of underlying distribution quantiles based on one 1186or two order statistics from the supplied elements. 1187 1188This method use the same algorithm as Excel and R language (quantile B<type 7>). 1189 1190The generic function quantile produces sample quantiles corresponding to the given probabilities. 
1191 1192B<$Type> is an integer value between 0 to 4 : 1193 1194 0 => zero quartile (Q0) : minimal value 1195 1 => first quartile (Q1) : lower quartile = lowest cut off (25%) of data = 25th percentile 1196 2 => second quartile (Q2) : median = it cuts data set in half = 50th percentile 1197 3 => third quartile (Q3) : upper quartile = highest cut off (25%) of data, or lowest 75% = 75th percentile 1198 4 => fourth quartile (Q4) : maximal value 1199 1200Example : 1201 1202 my @data = (1..10); 1203 my $stat = Statistics::Descriptive::Full->new(); 1204 $stat->add_data(@data); 1205 print $stat->quantile(0); # => 1 1206 print $stat->quantile(1); # => 3.25 1207 print $stat->quantile(2); # => 5.5 1208 print $stat->quantile(3); # => 7.75 1209 print $stat->quantile(4); # => 10 1210 1211=item $stat->median(); 1212 1213Sorts the data and returns the median value of the data. 1214 1215=item $stat->harmonic_mean(); 1216 1217Returns the harmonic mean of the data. Since the mean is undefined 1218if any of the data are zero or if the sum of the reciprocals is zero, 1219it will return undef for both of those cases. 1220 1221=item $stat->geometric_mean(); 1222 1223Returns the geometric mean of the data. 1224 1225=item my $mode = $stat->mode(); 1226 1227Returns the mode of the data. The mode is the most commonly occurring datum. 1228See L<http://en.wikipedia.org/wiki/Mode_%28statistics%29> . If all values 1229occur only once, then mode() will return undef. 1230 1231=item $stat->trimmed_mean(ltrim[,utrim]); 1232 1233C<trimmed_mean(ltrim)> returns the mean with a fraction C<ltrim> 1234of entries at each end dropped. C<trimmed_mean(ltrim,utrim)> 1235returns the mean after a fraction C<ltrim> has been removed from the 1236lower end of the data and a fraction C<utrim> has been removed from the 1237upper end of the data. This method sorts the data before beginning 1238to analyze it. 1239 1240All calls to trimmed_mean() are cached so that they don't have to be 1241calculated a second time. 
1242 1243=item $stat->frequency_distribution_ref($partitions); 1244 1245=item $stat->frequency_distribution_ref(\@bins); 1246 1247=item $stat->frequency_distribution_ref(); 1248 1249C<frequency_distribution_ref($partitions)> slices the data into 1250C<$partition> sets (where $partition is greater than 1) and counts the 1251number of items that fall into each partition. It returns a reference to 1252a hash where the keys are the numerical values of the 1253partitions used. The minimum value of the data set is not a key and the 1254maximum value of the data set is always a key. The number of entries 1255for a particular partition key are the number of items which are 1256greater than the previous partition key and less then or equal to the 1257current partition key. As an example, 1258 1259 $stat->add_data(1,1.5,2,2.5,3,3.5,4); 1260 $f = $stat->frequency_distribution_ref(2); 1261 for (sort {$a <=> $b} keys %$f) { 1262 print "key = $_, count = $f->{$_}\n"; 1263 } 1264 1265prints 1266 1267 key = 2.5, count = 4 1268 key = 4, count = 3 1269 1270since there are four items less than or equal to 2.5, and 3 items 1271greater than 2.5 and less than 4. 1272 1273C<frequency_distribution_refs(\@bins)> provides the bins that are to be used 1274for the distribution. This allows for non-uniform distributions as 1275well as trimmed or sample distributions to be found. C<@bins> must 1276be monotonic and contain at least one element. Note that unless the 1277set of bins contains the range that the total counts returned will 1278be less than the sample size. 1279 1280Calling C<frequency_distribution_ref()> with no arguments returns the last 1281distribution calculated, if such exists. 
1282 1283=item my %hash = $stat->frequency_distribution($partitions); 1284 1285=item my %hash = $stat->frequency_distribution(\@bins); 1286 1287=item my %hash = $stat->frequency_distribution(); 1288 1289Same as C<frequency_distribution_ref()> except that it returns the hash flattened 1290into the return list. Kept for compatibility reasons with previous 1291versions of Statistics::Descriptive and using it is discouraged. 1292 1293=item $stat->median_absolute_deviation() 1294 1295The median absolute deviation. 1296 1297=item $stat->summary() 1298 1299Returns a textual summary of the distribution - min, max, median, mean and 1300quantiles. 1301 1302(New in version 3.0700.) 1303 1304=item $stat->least_squares_fit(); 1305 1306=item $stat->least_squares_fit(@x); 1307 1308C<least_squares_fit()> performs a least squares fit on the data, 1309assuming a domain of C<@x> or a default of 1..$stat->count(). It 1310returns an array of four elements C<($q, $m, $r, $rms)> where 1311 1312=over 4 1313 1314=item C<$q> and C<$m> 1315 1316satisfy the equation C<$y = $m*$x + $q>. 1317 1318=item C<$r> 1319 1320is the Pearson linear correlation coefficient. 1321 1322=item C<$rms> 1323 1324is the root-mean-square error. 1325 1326=back 1327 1328In case of error or division by zero, the empty list is returned. 1329 1330The array that is returned can be "coerced" into a hash structure 1331by doing the following: 1332 1333 my %hash = (); 1334 @hash{'q', 'm', 'r', 'err'} = $stat->least_squares_fit(); 1335 1336Because calling C<least_squares_fit()> with no arguments defaults 1337to using the current range, there is no caching of the results. 1338 1339=back 1340 1341=head1 REPORTING ERRORS 1342 1343I read my email frequently, but since adopting this module I've added 2 1344children and 1 dog to my family, so please be patient about my response 1345times. When reporting errors, please include the following to help 1346me out: 1347 1348=over 4 1349 1350=item * 1351 1352Your version of perl.
This can be obtained by typing C<perl -v> at 1353the command line. 1354 1355=item * 1356 1357Which version of Statistics::Descriptive you're using. As you can 1358see below, I do make mistakes. Unfortunately for me, right now 1359there are thousands of CDs with the version of this module with 1360the bugs in it. Fortunately for you, I'm a very patient module 1361maintainer. 1362 1363=item * 1364 1365Details about what the error is. Try to narrow down the scope 1366of the problem and send me code that I can run to verify and 1367track it down. 1368 1369=back 1370 1371=head1 AUTHOR 1372 1373Current maintainer: 1374 1375Shlomi Fish, L<http://www.shlomifish.org/> , C<shlomif@cpan.org> 1376 1377Previously: 1378 1379Colin Kuskie 1380 1381My email address can be found at http://www.perl.com under Who's Who 1382or at: https://metacpan.org/author/COLINK . 1383 1384=head1 CONTRIBUTORS 1385 1386Fabio Ponciroli & Adzuna Ltd. team (outliers handling) 1387 1388=head1 REFERENCES 1389 1390RFC2330, Framework for IP Performance Metrics 1391 1392The Art of Computer Programming, Volume 2, Donald Knuth. 1393 1394Handbook of Mathematical Functions, Milton Abramowitz and Irene Stegun. 1395 1396Probability and Statistics for Engineering and the Sciences, Jay Devore. 1397 1398=head1 COPYRIGHT 1399 1400Copyright (c) 1997,1998 Colin Kuskie. All rights reserved. This 1401program is free software; you can redistribute it and/or modify it 1402under the same terms as Perl itself. 1403 1404Copyright (c) 1998 Andrea Spinelli. All rights reserved. This program 1405is free software; you can redistribute it and/or modify it under the 1406same terms as Perl itself. 1407 1408Copyright (c) 1994,1995 Jason Kastner. All rights 1409reserved. This program is free software; you can redistribute it 1410and/or modify it under the same terms as Perl itself. 1411 1412=head1 LICENSE 1413 1414This program is free software; you can redistribute it and/or modify it 1415under the same terms as Perl itself.
1416 1417=for :stopwords cpan testmatrix url bugtracker rt cpants kwalitee diff irc mailto metadata placeholders metacpan 1418 1419=head1 SUPPORT 1420 1421=head2 Websites 1422 1423The following websites have more information about this module, and may be of help to you. As always, 1424in addition to those websites please use your favorite search engine to discover more resources. 1425 1426=over 4 1427 1428=item * 1429 1430MetaCPAN 1431 1432A modern, open-source CPAN search engine, useful to view POD in HTML format. 1433 1434L<https://metacpan.org/release/Statistics-Descriptive> 1435 1436=item * 1437 1438RT: CPAN's Bug Tracker 1439 1440The RT ( Request Tracker ) website is the default bug/issue tracking system for CPAN. 1441 1442L<https://rt.cpan.org/Public/Dist/Display.html?Name=Statistics-Descriptive> 1443 1444=item * 1445 1446CPANTS 1447 1448The CPANTS is a website that analyzes the Kwalitee ( code metrics ) of a distribution. 1449 1450L<http://cpants.cpanauthors.org/dist/Statistics-Descriptive> 1451 1452=item * 1453 1454CPAN Testers 1455 1456The CPAN Testers is a network of smoke testers who run automated tests on uploaded CPAN distributions. 1457 1458L<http://www.cpantesters.org/distro/S/Statistics-Descriptive> 1459 1460=item * 1461 1462CPAN Testers Matrix 1463 1464The CPAN Testers Matrix is a website that provides a visual overview of the test results for a distribution on various Perls/platforms. 1465 1466L<http://matrix.cpantesters.org/?dist=Statistics-Descriptive> 1467 1468=item * 1469 1470CPAN Testers Dependencies 1471 1472The CPAN Testers Dependencies is a website that shows a chart of the test results of all dependencies for a distribution. 
1473 1474L<http://deps.cpantesters.org/?module=Statistics::Descriptive> 1475 1476=back 1477 1478=head2 Bugs / Feature Requests 1479 1480Please report any bugs or feature requests by email to C<bug-statistics-descriptive at rt.cpan.org>, or through 1481the web interface at L<https://rt.cpan.org/Public/Bug/Report.html?Queue=Statistics-Descriptive>. You will be automatically notified of any 1482progress on the request by the system. 1483 1484=head2 Source Code 1485 1486The code is open to the world, and available for you to hack on. Please feel free to browse it and play 1487with it, or whatever. If you want to contribute patches, please send me a diff or prod me to pull 1488from your repository :) 1489 1490L<https://github.com/shlomif/perl-Statistics-Descriptive> 1491 1492 git clone git://github.com/shlomif/perl-Statistics-Descriptive.git 1493 1494=head1 AUTHOR 1495 1496Shlomi Fish <shlomif@cpan.org> 1497 1498=head1 BUGS 1499 1500Please report any bugs or feature requests on the bugtracker website 1501L<https://github.com/shlomif/perl-Statistics-Descriptive/issues> 1502 1503When submitting a bug or request, please include a test-file or a 1504patch to an existing test-file that illustrates the bug or desired 1505feature. 1506 1507=head1 COPYRIGHT AND LICENSE 1508 1509This software is copyright (c) 1997 by Jason Kastner, Andrea Spinelli, Colin Kuskie, and others. 1510 1511This is free software; you can redistribute it and/or modify it under 1512the same terms as the Perl 5 programming language system itself. 1513 1514=cut 1515