1#!/usr/local/bin/perl 2package Statistics::TTest; 3use strict; 4use Carp; 5use POSIX; 6use vars qw($VERSION $AUTOLOAD); 7use Statistics::Distributions qw (tdistr fdistr tprob fprob); 8use Statistics::PointEstimation; 9$VERSION='1.1'; 10my %fields= 11( 12 's1' =>undef, #sample 1 13 's2' =>undef, #sample 2 14 'significance' =>undef, 15 'alpha' =>undef, 16 'mean_difference' =>undef, 17 'variance' =>undef, 18 'standard_deviation' =>undef, 19 'standard_error' =>undef, 20 'standard_error_equal' =>undef, 21 'standard_error_unequal' =>undef, 22 'f_cutoff' =>undef, 23 'f_statistic' =>undef, 24 'df1' =>undef, 25 'df2' =>undef, 26 'df' => undef, 27 'df_equal' =>undef, 28 'df_unequal' =>undef, 29 'equal_variance'=>undef, 30 't_value' =>undef, 31 't_statistic' =>undef, 32 't_statistic_equal' =>undef, 33 't_statistic_unequal' =>undef, 34 't_prob' =>undef, 35 'null_hypothesis' =>undef, 36 'delta' =>undef, 37 'lower_clm' =>undef, 38 'upper_clm' =>undef, 39 'valid' =>undef 40); 41 42 43sub new 44{ 45 my $proto = shift; 46 my $class = ref($proto) || $proto; 47 my $self= {%fields}; 48 bless($self,$class); 49 return $self; 50} 51 52sub load_data 53{ 54 my $self=shift; 55 undef $self->{valid}; 56 my ($sample1,$sample2)=@_; 57 if((ref $sample1 ne 'ARRAY')||(ref $sample2 ne 'ARRAY')) 58 { 59 croak "Invalid input type for load_data() function.\n the 2 inputs must be array references."; 60 } 61 62 my $s1= new Statistics::PointEstimation; 63 my $s2= new Statistics::PointEstimation; 64 65 if($self->significance()) 66 { 67 $s1->set_significance($self->significance()); 68 $s2->set_significance($self->significance()); 69 } 70 71 $s1->add_data($sample1); 72 $s2->add_data($sample2); 73 74 croak "Invalid sample size. sample size for the 2 samples are",$s1->count," and ",$s2->count() , 75 ". the sample size must be greater than 1" if(($s1->count() <=1)||($s2->count() <=1 )); 76 $self->{'s1'}=$s1; 77 $self->{'s2'}=$s2; 78 79 return $self->perform_t_test(); 80 81 82 83} 84 85sub perform_t_test 86{ 87 my $self=shift; 88 my $s1=$self->s1(); 89 my $s2=$self->s2(); 90 $self->{valid}=0; 91 $self->{significance}=95 if (!defined($self->{significance})); 92 $self->{alpha}=(100-$self->{significance})/2; 93 $self->{alpha}/=100; 94 $self->{mean_difference}=$s1->{mean}-$s2->{mean}; 95 $self->{variance}=($s1->{df}*$s1->{variance}+$s2->{df}*$s2->{variance})/($s1->{df}+$s2->{df}); 96 $self->{standard_deviation}=sqrt($self->{variance}); 97 $self->{standard_error_equal}=sqrt(1/$s1->{count}+1/$s2->{count})*$self->{standard_deviation}; 98 $self->{standard_error_unequal}=sqrt(($s1->{standard_error})**2+($s2->{standard_error})**2); 99 100 $self->{df_equal}=$s1->{df}+$s2->{df}; 101 $self->{df_unequal}= (($s1->{standard_error})**4/$s1->{df}+($s2->{standard_error})**4/$s2->{df})? 102 ((($s1->{standard_error})**2+($s2->{standard_error})**2)**2 103 / 104 (($s1->{standard_error})**4/$s1->{df}+($s2->{standard_error})**4/$s2->{df})) : 105 ($self->{df_equal}); 106 107 $self->{t_statistic_equal}=($self->{standard_error_equal})? 108 (abs ($self->{mean_difference}/$self->{standard_error_equal})):99; 109 $self->{t_statistic_unequal}=($self->{standard_error_unequal})? 110 (abs ($self->{mean_difference}/$self->{standard_error_unequal})):99; 111 my ($df1,$df2); 112 if($s1->{variance}>=$s2->{variance}) 113 { 114 $df1=$s1->{df}; 115 $df2=$s2->{df}; 116 $self->{f_statistic}=($s2->{variance})?($s1->{variance}/$s2->{variance}):99; 117 118 } 119 else 120 { 121 122 $df1=$s2->{df}; 123 $df2=$s1->{df}; 124 $self->{f_statistic}=($s1->{variance})?($s2->{variance}/$s1->{variance}):99; 125 } 126 ($self->{df1},$self->{df2})=($df1,$df2); 127 $self->{f_cutoff}=fdistr($df1,$df2,$self->{alpha}); 128 129 if($self->{f_statistic}<=$self->{f_cutoff}) 130 { $self->{equal_variance}=1; } 131 else 132 { $self->{equal_variance}=0; } 133 134 135 if($self->{equal_variance}) 136 { 137 $self->{standard_error}=$self->{standard_error_equal}; 138 $self->{t_statistic}=$self->{t_statistic_equal}; 139 $self->{df}=$self->{df_equal}; 140 141 } 142 else 143 { 144 $self->{standard_error}=$self->{standard_error_unequal}; 145 $self->{t_statistic}=$self->{t_statistic_unequal}; 146 $self->{df}=$self->{df_unequal}; 147 148 } 149 $self->{t_prob}=1- abs (tprob(floor($self->{df}),-1*$self->{t_statistic})-tprob(floor($self->{df}),$self->{t_statistic})); 150 if($self->{t_prob}<$self->{alpha}*2) 151 { 152 $self->{null_hypothesis}='rejected'; 153 } 154 else 155 { 156 $self->{null_hypothesis}='not rejected'; 157 } 158 159 $self->{t_value}=tdistr(floor($self->df()),$self->alpha()); 160 $self->{delta}=$self->t_value()*$self->standard_error(); 161 $self->{lower_clm}=$self->{mean_difference}-$self->{delta}; 162 $self->{upper_clm}=$self->{mean_difference}+$self->{delta}; 163 $self->{valid}=1 if (($s1->{variance})&&($s2->{variance})); 164 return !($self->{valid}); 165 166} 167 168sub print_t_test 169{ 170 my $self=shift; 171 foreach my $k ( keys %$self) 172 { 173 next if ($k eq 's1') || ($k eq 's2'); 174 print "$k: $self->{$k} \n"; 175 } 176 return 1; 177 178} 179sub output_t_test 180{ 181 my $self=shift; 182 my $s1=$self->{s1}; 183 my $s2=$self->{s2}; 184 croak "no data. s1 or s2 is empty or the variance =0.\n" if ((!defined($s1))||(!defined($s2))||($self->valid!=1)); 185 print "*****************************************************\n\n"; 186 $s1->output_confidence_interval('1'); 187 print "*****************************************************\n\n"; 188 $s2->output_confidence_interval('2'); 189 print "*****************************************************\n\n"; 190 191 print "Comparison of these 2 independent samples.\n"; 192 print "\t F-statistic=",$self->f_statistic()," , cutoff F-statistic=",$self->f_cutoff(), 193 " with alpha level=",$self->alpha*2," and df =(",$self->df1,",",$self->df2,")\n"; 194 if($self->{equal_variance}) 195 { print "\tequal variance assumption is accepted(not rejected) since F-statistic < cutoff F-statistic\n";} 196 else 197 { print "\tequal variance assumption is rejected since F-statistic > cutoff F-statistic\n";} 198 199 print "\tdegree of freedom=",$self->df," , t-statistic=T=",$self->t_statistic," Prob >|T|=",$self->{t_prob},"\n"; 200 print "\tthe null hypothesis (the 2 samples have the same mean) is ",$self->null_hypothesis(), 201 " since the alpha level is ",$self->alpha()*2,"\n"; 202 print "\tdifference of the mean=",$self->mean_difference(),", standard error=",$self->standard_error(),"\n"; 203 print "\t the estimate of the difference of the mean is ", $self->mean_difference()," +/- ",$self->delta(),"\n\t", 204 " or (",$self->lower_clm()," to ",$self->upper_clm," ) with ",$self->significance," % of confidence\n"; 205 206 return 1; 207} 208sub set_significance 209{ 210 my $self=shift; 211 my $significance=shift; 212 croak "Invalid Significance. $significance should be 0-100 usually 90,95,99\n" 213 unless (($significance>0)&&($significance<100)); 214 $self->{significance}=$significance; 215 if($self->{s1}&&$self->{s2}) 216 { 217 $self->{s1}->set_significance($significance); 218 $self->{s2}->set_significance($significance); 219 $self->perform_t_test(); 220 221 } 222 return 1; 223 224} 225 226sub AUTOLOAD 227{ 228 my $self = shift; 229 my $type = ref($self) 230 or croak "$self is not an object"; 231 my $name = $AUTOLOAD; 232 $name =~ s/.*://; ##Strip fully qualified-package portion 233 return if $name eq "DESTROY"; 234 unless (exists $self->{$name} ) 235 { 236 croak "Can't access `$name' field in class $type"; 237 } 238 ##Read only method 239 return $self->{$name}; 240} 241 2421; 243 244#perform t-test using sufficient statistics 245package Statistics::TTest::Sufficient; 246use strict; 247use Carp; 248use vars qw($VERSION @ISA $AUTOLOAD); 249use Statistics::PointEstimation; 250use Statistics::TTest; 251use POSIX; 252@ISA= qw (Statistics::TTest); 253$VERSION = '1.1'; 254 255 256sub load_data{ 257 my $self=shift; 258 undef $self->{valid}; 259 my ($sample1,$sample2)=@_; 260 261 262 my $s1= new Statistics::PointEstimation::Sufficient; 263 my $s2= new Statistics::PointEstimation::Sufficient; 264 265 if($self->significance()) 266 { 267 $s1->set_significance($self->significance()); 268 $s2->set_significance($self->significance()); 269 } 270 271 $s1->load_data($sample1->{count},$sample1->{mean},$sample1->{variance}); 272 $s2->load_data($sample2->{count},$sample2->{mean},$sample2->{variance}); 273 274 croak "Invalid sample size. sample size for the 2 samples are",$s1->count," and ",$s2->count() , 275 ". the sample size must be greater than 1" if(($s1->count() <=1)||($s2->count() <=1 )); 276 $self->{'s1'}=$s1; 277 $self->{'s2'}=$s2; 278 279 return $self->perform_t_test(); 280 281 282} 283 2841; 285__END__ 286 287=head1 NAME 288 289 Statistics::TTest - Perl module to perform T-test on 2 independent samples 290 Statistics::TTest::Sufficient - Perl module to perfrom T-Test on 2 indepdent samples using sufficient statistics 291 292=head1 SYNOPSIS 293 294 #example for Statistics::TTest 295 use Statistics::PointEstimation; 296 use Statistics::TTest; 297 my @r1=(); 298 my @r2=(); 299 my $rand; 300 301 for($i=1;$i<=32;$i++) #generate a uniformly distributed sample with mean=5 302 { 303 304 $rand=rand(10); 305 push @r1,$rand; 306 $rand=rand(10)-2; 307 push @r2,$rand; 308 } 309 310 311 my $ttest = new Statistics::TTest; 312 $ttest->set_significance(90); 313 $ttest->load_data(\@r1,\@r2); 314 $ttest->output_t_test(); 315 $ttest->set_significance(99); 316 $ttest->print_t_test(); #list out t-test related data 317 318 #the following thes same as calling output_t_test() (you can check if $ttest->{valid}==1 to check if the data is valid.) 319 my $s1=$ttest->{s1}; #sample 1 a Statistics::PointEstimation object 320 my $s2=$ttest->{s2}; #sample 2 a Statistics::PointEstimation object 321 print "*****************************************************\n\n"; 322 $s1->output_confidence_interval('1'); 323 print "*****************************************************\n\n"; 324 $s2->output_confidence_interval('2'); 325 print "*****************************************************\n\n"; 326 327 print "Comparison of these 2 independent samples.\n"; 328 print "\t F-statistic=",$ttest->f_statistic()," , cutoff F-statistic=",$ttest->f_cutoff(), 329 " with alpha level=",$ttest->alpha*2," and df =(",$ttest->df1,",",$ttest->df2,")\n"; 330 if($ttest->{equal_variance}) 331 { print "\tequal variance assumption is accepted(not rejected) since F-statistic < cutoff F-statistic\n";} 332 else 333 { print "\tequal variance assumption is rejected since F-statistic > cutoff F-statistic\n";} 334 335 print "\tdegree of freedom=",$ttest->df," , t-statistic=T=",$ttest->t_statistic," Prob >|T|=",$ttest->{t_prob},"\n"; 336 print "\tthe null hypothesis (the 2 samples have the same mean) is ",$ttest->null_hypothesis(), 337 " since the alpha level is ",$ttest->alpha()*2,"\n"; 338 print "\tdifference of the mean=",$ttest->mean_difference(),", standard error=",$ttest->standard_error(),"\n"; 339 print "\t the estimate of the difference of the mean is ", $ttest->mean_difference()," +/- ",$ttest->delta(),"\n\t", 340 " or (",$ttest->lower_clm()," to ",$ttest->upper_clm," ) with ",$ttest->significance," % of confidence\n"; 341 342 #example for Statistics::TTest::Sufficient 343 use Statistics::PointEstimation; 344 use Statistics::TTest; 345 346 my %sample1=( 347 'count' =>30, 348 'mean' =>3.98, 349 'variance' =>2.63 350 ); 351 352 my %sample2=( 353 'count'=>30, 354 'mean'=>3.67, 355 'variance'=>1.12 356 ); 357 358 359 my $ttest = new Statistics::TTest::Sufficient; 360 $ttest->set_significance(90); 361 $ttest->load_data(\%sample1,\%sample2); 362 $ttest->output_t_test(); 363 #$ttest->s1->print_confidence_interval(); 364 $ttest->set_significance(99); 365 $ttest->output_t_test(); 366 #$ttest->s1->print_confidence_interval(); 367 368 369=head1 DESCRIPTION 370 371=head2 Statistics::TTest 372 373 This is the Statistical T-Test module to compare 2 independent samples. It takes 2 array of point measures, 374 compute the confidence intervals using the PointEstimation module (which is also included in this package) 375 and use the T-statistic to test the null hypothesis. If the null hypothesis is rejected, the difference 376 will be given as the lower_clm and upper_clm of the TTest object. 377 378=head2 Statistics::TTest::Sufficient 379 380 This module is a subclass of Statistics::TTest. Instead of taking the real data points as the input, 381 it will compute the confidence intervals based on the sufficient statistics and the sample size inputted. 382 To use this module, you need to pass the sample size, the sample mean , and the sample variance into the load_data() 383 function. The output will be exactly the same as the Statistics::TTest Module. 384 385 386 387=head1 AUTHOR 388 389Yun-Fang Juan , Yahoo! Inc. (yunfang@yahoo-inc.com) 390 391=head1 SEE ALSO 392 393Statistics::Descriptive Statistics::Distributions Statistics::PointEstimation 394 395=cut 396 397