1#!/usr/local/bin/perl
2package Statistics::TTest;
3use strict;
4use Carp;
5use POSIX;
6use vars qw($VERSION $AUTOLOAD);
7use Statistics::Distributions qw (tdistr fdistr tprob fprob);
8use Statistics::PointEstimation;
9$VERSION='1.1';
10my %fields=
11(
12	's1'          =>undef,     #sample 1
13	's2'          =>undef,     #sample 2
14	'significance' =>undef,
15	'alpha' =>undef,
16	'mean_difference' =>undef,
17	'variance' =>undef,
18	'standard_deviation' =>undef,
19	'standard_error' =>undef,
20	'standard_error_equal' =>undef,
21	'standard_error_unequal' =>undef,
22	'f_cutoff' =>undef,
23	'f_statistic' =>undef,
24	'df1' =>undef,
25	'df2' =>undef,
26	'df' => undef,
27	'df_equal' =>undef,
28	'df_unequal' =>undef,
29	'equal_variance'=>undef,
30	't_value' =>undef,
31	't_statistic' =>undef,
32	't_statistic_equal' =>undef,
33	't_statistic_unequal' =>undef,
34	't_prob' =>undef,
35	'null_hypothesis' =>undef,
36	'delta' =>undef,
37	'lower_clm' =>undef,
38	'upper_clm' =>undef,
39	'valid' =>undef
40);
41
42
43sub new
44{
45	my $proto = shift;
46        my $class = ref($proto) || $proto;
47	my $self= {%fields};
48	bless($self,$class);
49	return $self;
50}
51
52sub load_data
53{
54	my $self=shift;
55	undef $self->{valid};
56	my ($sample1,$sample2)=@_;
57	if((ref $sample1 ne 'ARRAY')||(ref $sample2 ne 'ARRAY'))
58	{
59		croak "Invalid input type for load_data() function.\n the 2 inputs must be array references.";
60	}
61
62	my $s1= new Statistics::PointEstimation;
63	my $s2= new Statistics::PointEstimation;
64
65	if($self->significance())
66	{
67		$s1->set_significance($self->significance());
68		$s2->set_significance($self->significance());
69	}
70
71	$s1->add_data($sample1);
72	$s2->add_data($sample2);
73
74	croak "Invalid sample size.  sample size for the 2 samples are",$s1->count," and ",$s2->count() ,
75	      ". the sample size must be greater than 1" if(($s1->count() <=1)||($s2->count() <=1 ));
76	$self->{'s1'}=$s1;
77	$self->{'s2'}=$s2;
78
79	return $self->perform_t_test();
80
81
82
83}
84
85sub perform_t_test
86{
87	my $self=shift;
88	my $s1=$self->s1();
89	my $s2=$self->s2();
90	$self->{valid}=0;
91	$self->{significance}=95 if (!defined($self->{significance}));
92	$self->{alpha}=(100-$self->{significance})/2;
93        $self->{alpha}/=100;
94	$self->{mean_difference}=$s1->{mean}-$s2->{mean};
95	$self->{variance}=($s1->{df}*$s1->{variance}+$s2->{df}*$s2->{variance})/($s1->{df}+$s2->{df});
96	$self->{standard_deviation}=sqrt($self->{variance});
97	$self->{standard_error_equal}=sqrt(1/$s1->{count}+1/$s2->{count})*$self->{standard_deviation};
98	$self->{standard_error_unequal}=sqrt(($s1->{standard_error})**2+($s2->{standard_error})**2);
99
100	$self->{df_equal}=$s1->{df}+$s2->{df};
101	$self->{df_unequal}= (($s1->{standard_error})**4/$s1->{df}+($s2->{standard_error})**4/$s2->{df})?
102				 ((($s1->{standard_error})**2+($s2->{standard_error})**2)**2
103				   /
104			           (($s1->{standard_error})**4/$s1->{df}+($s2->{standard_error})**4/$s2->{df})) :
105				 ($self->{df_equal});
106
107	$self->{t_statistic_equal}=($self->{standard_error_equal})?
108				   (abs ($self->{mean_difference}/$self->{standard_error_equal})):99;
109	$self->{t_statistic_unequal}=($self->{standard_error_unequal})?
110				   (abs ($self->{mean_difference}/$self->{standard_error_unequal})):99;
111	my ($df1,$df2);
112	if($s1->{variance}>=$s2->{variance})
113	{
114		$df1=$s1->{df};
115		$df2=$s2->{df};
116		$self->{f_statistic}=($s2->{variance})?($s1->{variance}/$s2->{variance}):99;
117
118	}
119	else
120	{
121
122		$df1=$s2->{df};
123		$df2=$s1->{df};
124		$self->{f_statistic}=($s1->{variance})?($s2->{variance}/$s1->{variance}):99;
125	}
126	($self->{df1},$self->{df2})=($df1,$df2);
127	$self->{f_cutoff}=fdistr($df1,$df2,$self->{alpha});
128
129	if($self->{f_statistic}<=$self->{f_cutoff})
130	{ $self->{equal_variance}=1; }
131	else
132	{ $self->{equal_variance}=0; }
133
134
135	if($self->{equal_variance})
136	{
137		$self->{standard_error}=$self->{standard_error_equal};
138		$self->{t_statistic}=$self->{t_statistic_equal};
139		$self->{df}=$self->{df_equal};
140
141	}
142	else
143	{
144		$self->{standard_error}=$self->{standard_error_unequal};
145		$self->{t_statistic}=$self->{t_statistic_unequal};
146		$self->{df}=$self->{df_unequal};
147
148	}
149	$self->{t_prob}=1- abs (tprob(floor($self->{df}),-1*$self->{t_statistic})-tprob(floor($self->{df}),$self->{t_statistic}));
150	if($self->{t_prob}<$self->{alpha}*2)
151	{
152		$self->{null_hypothesis}='rejected';
153	}
154	else
155	{
156		$self->{null_hypothesis}='not rejected';
157	}
158
159	$self->{t_value}=tdistr(floor($self->df()),$self->alpha());
160	$self->{delta}=$self->t_value()*$self->standard_error();
161	$self->{lower_clm}=$self->{mean_difference}-$self->{delta};
162	$self->{upper_clm}=$self->{mean_difference}+$self->{delta};
163	$self->{valid}=1 if (($s1->{variance})&&($s2->{variance}));
164	return !($self->{valid});
165
166}
167
168sub print_t_test
169{
170	my $self=shift;
171	foreach my $k ( keys %$self)
172        {
173		next if ($k eq 's1') || ($k eq 's2');
174                print "$k: $self->{$k} \n";
175        }
176        return 1;
177
178}
179sub output_t_test
180{
181	my $self=shift;
182	my $s1=$self->{s1};
183	my $s2=$self->{s2};
184	croak "no data. s1 or s2 is empty or the variance =0.\n" if ((!defined($s1))||(!defined($s2))||($self->valid!=1));
185	print "*****************************************************\n\n";
186	$s1->output_confidence_interval('1');
187	print "*****************************************************\n\n";
188	$s2->output_confidence_interval('2');
189	print "*****************************************************\n\n";
190
191        print "Comparison of these 2 independent samples.\n";
192        print "\t F-statistic=",$self->f_statistic()," , cutoff F-statistic=",$self->f_cutoff(),
193		" with alpha level=",$self->alpha*2," and  df =(",$self->df1,",",$self->df2,")\n";
194	if($self->{equal_variance})
195	{ print "\tequal variance assumption is accepted(not rejected) since F-statistic < cutoff F-statistic\n";}
196	else
197	{ print "\tequal variance assumption is  rejected since F-statistic > cutoff F-statistic\n";}
198
199	print "\tdegree of freedom=",$self->df," , t-statistic=T=",$self->t_statistic," Prob >|T|=",$self->{t_prob},"\n";
200	print "\tthe null hypothesis (the 2 samples have the same mean) is ",$self->null_hypothesis(),
201		 " since the alpha level is ",$self->alpha()*2,"\n";
202	print "\tdifference of the mean=",$self->mean_difference(),", standard error=",$self->standard_error(),"\n";
203	print "\t the estimate of the difference of the mean is ", $self->mean_difference()," +/- ",$self->delta(),"\n\t",
204                " or (",$self->lower_clm()," to ",$self->upper_clm," ) with ",$self->significance," % of confidence\n";
205
206	return 1;
207}
208sub set_significance
209{
210	my $self=shift;
211        my $significance=shift;
212        croak "Invalid Significance. $significance should be 0-100 usually 90,95,99\n"
213		 unless (($significance>0)&&($significance<100));
214	$self->{significance}=$significance;
215        if($self->{s1}&&$self->{s2})
216	{
217		$self->{s1}->set_significance($significance);
218		$self->{s2}->set_significance($significance);
219		$self->perform_t_test();
220
221	}
222        return 1;
223
224}
225
226sub AUTOLOAD
227{
228	my $self = shift;
229  	my $type = ref($self)
230    	or croak "$self is not an object";
231  	my $name = $AUTOLOAD;
232  	$name =~ s/.*://;     ##Strip fully qualified-package portion
233  	return if $name eq "DESTROY";
234  	unless (exists $self->{$name} )
235	{
236    		croak "Can't access `$name' field in class $type";
237  	}
238	 ##Read only method
239	 return $self->{$name};
240}
241
2421;
243
244#perform t-test using sufficient statistics
245package Statistics::TTest::Sufficient;
246use strict;
247use Carp;
248use vars qw($VERSION @ISA $AUTOLOAD);
249use Statistics::PointEstimation;
250use Statistics::TTest;
251use POSIX;
252@ISA= qw (Statistics::TTest);
253$VERSION = '1.1';
254
255
256sub load_data{
257	my $self=shift;
258        undef $self->{valid};
259        my ($sample1,$sample2)=@_;
260
261
262        my $s1= new Statistics::PointEstimation::Sufficient;
263        my $s2= new Statistics::PointEstimation::Sufficient;
264
265        if($self->significance())
266        {
267                $s1->set_significance($self->significance());
268                $s2->set_significance($self->significance());
269        }
270
271        $s1->load_data($sample1->{count},$sample1->{mean},$sample1->{variance});
272        $s2->load_data($sample2->{count},$sample2->{mean},$sample2->{variance});
273
274        croak "Invalid sample size.  sample size for the 2 samples are",$s1->count," and ",$s2->count() ,
275              ". the sample size must be greater than 1" if(($s1->count() <=1)||($s2->count() <=1 ));
276        $self->{'s1'}=$s1;
277        $self->{'s2'}=$s2;
278
279        return $self->perform_t_test();
280
281
282}
283
2841;
285__END__
286
287=head1 NAME
288
289 Statistics::TTest - Perl module to perform T-test on 2 independent samples
290 Statistics::TTest::Sufficient - Perl module to perfrom T-Test on 2 indepdent samples using sufficient statistics
291
292=head1 SYNOPSIS
293
294 #example for Statistics::TTest
295 use Statistics::PointEstimation;
296 use Statistics::TTest;
297 my @r1=();
298 my @r2=();
299 my $rand;
300
301  for($i=1;$i<=32;$i++) #generate a uniformly distributed sample with mean=5
302  {
303
304          $rand=rand(10);
305          push @r1,$rand;
306          $rand=rand(10)-2;
307          push @r2,$rand;
308  }
309
310
311 my $ttest = new Statistics::TTest;
312 $ttest->set_significance(90);
313 $ttest->load_data(\@r1,\@r2);
314 $ttest->output_t_test();
315 $ttest->set_significance(99);
316 $ttest->print_t_test();  #list out t-test related data
317
318 #the following thes same as calling output_t_test() (you can check if $ttest->{valid}==1 to check if the data is valid.)
319 my $s1=$ttest->{s1};  #sample 1  a Statistics::PointEstimation object
320 my $s2=$ttest->{s2};  #sample 2  a Statistics::PointEstimation object
321 print "*****************************************************\n\n";
322 $s1->output_confidence_interval('1');
323 print "*****************************************************\n\n";
324 $s2->output_confidence_interval('2');
325 print "*****************************************************\n\n";
326
327 print "Comparison of these 2 independent samples.\n";
328 print "\t F-statistic=",$ttest->f_statistic()," , cutoff F-statistic=",$ttest->f_cutoff(),
329 	" with alpha level=",$ttest->alpha*2," and  df =(",$ttest->df1,",",$ttest->df2,")\n";
330 if($ttest->{equal_variance})
331 { print "\tequal variance assumption is accepted(not rejected) since F-statistic < cutoff F-statistic\n";}
332 else
333 { print "\tequal variance assumption is  rejected since F-statistic > cutoff F-statistic\n";}
334
335 print "\tdegree of freedom=",$ttest->df," , t-statistic=T=",$ttest->t_statistic," Prob >|T|=",$ttest->{t_prob},"\n";
336 print "\tthe null hypothesis (the 2 samples have the same mean) is ",$ttest->null_hypothesis(),
337 	 " since the alpha level is ",$ttest->alpha()*2,"\n";
338 print "\tdifference of the mean=",$ttest->mean_difference(),", standard error=",$ttest->standard_error(),"\n";
339 print "\t the estimate of the difference of the mean is ", $ttest->mean_difference()," +/- ",$ttest->delta(),"\n\t",
340 	" or (",$ttest->lower_clm()," to ",$ttest->upper_clm," ) with ",$ttest->significance," % of confidence\n";
341
342 #example for Statistics::TTest::Sufficient
343 use Statistics::PointEstimation;
344 use Statistics::TTest;
345
346 my %sample1=(
347        'count' =>30,
348        'mean' =>3.98,
349        'variance' =>2.63
350                );
351
352 my %sample2=(
353        'count'=>30,
354        'mean'=>3.67,
355        'variance'=>1.12
356        );
357
358
359 my $ttest = new Statistics::TTest::Sufficient;
360 $ttest->set_significance(90);
361 $ttest->load_data(\%sample1,\%sample2);
362 $ttest->output_t_test();
363 #$ttest->s1->print_confidence_interval();
364 $ttest->set_significance(99);
365 $ttest->output_t_test();
366 #$ttest->s1->print_confidence_interval();
367
368
369=head1 DESCRIPTION
370
371=head2 Statistics::TTest
372
373 This is the Statistical T-Test module to compare 2 independent samples. It takes 2 array of point measures,
374 compute the confidence intervals using the PointEstimation module (which is also included in this package)
375  and use the T-statistic to test the null hypothesis. If the null hypothesis is rejected, the difference
376  will be given as the lower_clm and upper_clm of the TTest object.
377
378=head2 Statistics::TTest::Sufficient
379
380 This module is a subclass of Statistics::TTest. Instead of taking the real data points as the input,
381 it will compute the confidence intervals based on the sufficient statistics and the sample size inputted.
382 To use this module, you need to pass the sample size, the sample mean , and the sample variance into the load_data()
383 function. The output will be exactly the same as the Statistics::TTest Module.
384
385
386
387=head1 AUTHOR
388
389Yun-Fang Juan , Yahoo! Inc.  (yunfang@yahoo-inc.com)
390
391=head1 SEE ALSO
392
393Statistics::Descriptive Statistics::Distributions Statistics::PointEstimation
394
395=cut 
396
397