1import ("stats");
2
3% This file contains the following public functions:
4%
5%   ks_test          One sample Kolmogorov test
6%   ad_test          Anderson-Darling test
7%   ks_test2         Two sample Smirnov test
8%   mw_test	     Two sample Mann-Whitney-Wilcoxon test
9%   chisqr_test	     Chisqr-test
10%   t_test           Student t test
11%   t_test2          Two-sample Student t test
12%   welch_t_test
13%   spearman_r       Two-sample Spearman rank test
14%   kendall_tau      Kendall tau correlation test
15%   mann_kendall     Mann-Kendall trend test
16%   pearson_r        Pearson's r correlation test
17%   correlation      2 sample correlation
18%   z_test
19%   f_test2          2 sample F test
20%   skewness
21%   kurtosis
22%
23autoload ("ad_ktest", "statslib/ad_test");
24autoload ("ad_test", "statslib/ad_test");
25autoload ("ks_test", "statslib/ks_test");
26autoload ("ks_test2", "statslib/ks_test");
27autoload ("kuiper_test", "statslib/kuiper");
28autoload ("kuiper_test2", "statslib/kuiper");
29
30define normal_cdf ()
31{
32   variable m, s, a;
33   variable nargs = _NARGS;
34
35   switch (nargs)
36     {
37      case 1:
38	m = NULL, s = NULL;
39     }
40     {
41      case 3:
42	(m, s) = ();
43     }
44     {
45	_pop_n (nargs);
46	usage ("cdf = normal_cdf (A [, mean, stddev])");
47     }
48   a = ();
49
50   if (nargs != 1)
51     a = (a-m)/double(s);
52
53   if (typeof (a) == Array_Type)
54     return array_map (Double_Type, &_normal_cdf, a);
55
56   return _normal_cdf (a);
57}
58
59define poisson_cdf ()
60{
61   variable lam, n;
62   if (_NARGS != 2)
63     {
64	_pop_n (_NARGS);
65	usage ("cdf = poisson_cdf (lambda, n)");
66     }
67   (lam, n) = ();
68
69   if ((typeof (n) == Array_Type) or (typeof (lam) == Array_Type))
70     return array_map (Double_Type, &_poisson_cdf, lam, n);
71
72   return _poisson_cdf (lam, n);
73}
74
75define sample_mean ()
76{
77   variable args = __pop_args (_NARGS);
78   return mean (__push_args(args));
79}
80
81% These functions return the biased stddev
82define sample_stddev ()
83{
84   variable x = ();
85   variable n = 1.0*length (x);
86   return stddev(x) * sqrt((n-1.0)/n);
87}
88
89private define get_mean_stddev (x)
90{
91   variable m = mean(x);
92   variable n = 1.0*length (x);
93   variable s = stddev(x) * sqrt((n-1.0)/n);
94   return m, s, n;
95}
96
97define skewness ()
98{
99   if (_NARGS != 1)
100     usage ("s = %s(A);", _function_name ());
101   variable x = ();
102   variable m, s, n;
103   (m, s, n) = get_mean_stddev (x);
104
105   x = sum (((x - m)/s)^3)/n;
106
107   if ((s == 0.0) && isnan (x))
108     x = 0.0;
109
110   return x;
111}
112
113define kurtosis ()
114{
115   if (_NARGS != 1)
116     usage ("s = %s(A);", _function_name ());
117   variable x = ();
118   variable m, s, n;
119   (m, s, n) = get_mean_stddev (x);
120
121   x = sum (((x - m)/s)^4)/n - 3.0;
122
123   if ((s == 0.0) && isnan (x))
124     x = 0.0;
125
126   return x;
127}
128
129define covariance ()
130{
131   variable n = _NARGS;
132   if (n == 0)
133     usage ("Sigma = covariance (X1, X2, ..., Xn [;qualifiers])\n" +
134	    "Qualifiers:\n" +
135	    " mu=[mu1,mu2,..,muN]  (expected values E(Xi))"
136	   );
137
138   variable Xs = __pop_list (n);
139   variable i, m = length (Xs[0]);
140   _for i (0, n-1, 1)
141     {
142	if (length (Xs[i]) != m)
143	  throw InvalidParmError, "Arrays must be of the same size";
144     }
145   variable mus = qualifier ("mu");
146   variable norm = 1.0;
147   if (mus == NULL)
148     {
149	mus = Double_Type[n];
150	_for i (0, n-1, 1)
151	  mus[i] = mean (Xs[i]);
152	norm = m/(m-1.0);
153     }
154   if (length (mus) != n)
155     throw InvalidParmError, "The value mu qualifier has the wrong length";
156
157   variable cov = Double_Type[n,n];
158   _for i (0, n-1, 1)
159     {
160	variable j;
161	variable dx_i = Xs[i]-mus[i];
162	_for j (i, n-1, 1)
163	  {
164	     variable c = norm * mean (dx_i*(Xs[j] - mus[j]));
165	     cov[i,j] = c;
166	     cov[j,i] = c;
167	  }
168     }
169   return cov;
170}
171
172% This function assumes the distribution is symmetric
173private define map_cdf_to_pval (cdf)
174{
175   variable side = qualifier ("side", NULL);
176
177   variable pval = cdf;		       %  side="<"
178   if (side == ">")
179     pval = 1.0 - cdf;
180   else if (side != "<")	       %  double-sided
181     pval = 2.0 * _min (1.0-pval, pval);
182
183   return pval;
184}
185
186define chisqr_test ()
187{
188   variable t_ref = NULL;
189   variable nr = _NARGS;
190   if (nr > 1)
191     {
192	t_ref = ();
193	if (typeof (t_ref) == Ref_Type)
194	  nr--;
195	else
196	  {
197	     t_ref;		       %  push it back
198	     t_ref = NULL;
199	  }
200     }
201
202   if (nr < 2)
203     {
204	usage ("p=%s(X,Y,...,Z [,&T])", _function_name);
205     }
206   variable args = __pop_args (nr);
207   variable datasets = Array_Type[nr];
208   variable nc = length (args[0].value);
209   variable c = Double_Type[nc];
210
211   _for (0, nr-1, 1)
212     {
213	variable i = ();
214	variable d = args[i].value;
215	if (length (d) != nc)
216	  verror ("The chisqr test requires datasets to be of the same length");
217	datasets[i] = d;
218	c += d;
219     }
220   variable N = sum (c);
221   variable t = 0.0;
222   _for (0, nr-1, 1)
223     {
224        i = ();
225	d = datasets[i];
226	variable e = sum (d)/N * c;
227	t += sum((d-e)^2/e);
228     }
229
230   if (t_ref != NULL)
231     @t_ref = t;
232
233   return 1.0 - chisqr_cdf ((nr-1)*(nc-1), t);
234}
235
236% Usage: r = compute_rank (X, [&tie_fun [,&tied_groups]])
237% Here, if tied_groups is non-NULL, it will be an array whose length
238% represents the number of tied groups, and each element being the number
239% within the kth group.
240private define compute_rank ()
241{
242   variable x, tie_fun = &mean, group_ties_ref = NULL;
243   if (_NARGS == 3)
244     group_ties_ref = ();
245   if (_NARGS >= 2)
246     tie_fun = ();
247   x = ();
248   if (tie_fun == NULL)
249     tie_fun = &mean;
250
251   variable indx = array_sort (x);
252   x = x[indx];
253   variable n = length (x);
254   variable r = double([1:n]);
255
256   % Worry about ties
257   variable ties;
258   () = wherediff (x, &ties);
259   % Here, ties is an array of indices {j} where x[j-1]==x[j].
260   % We want those where x[j] == x[j+1].
261   ties -= 1;
262
263   variable m = length (ties);
264   variable group_ties = Int_Type[0];
265   if (m)
266     {
267	variable i = 0;
268	variable g = 0;
269	group_ties = Int_Type[m];
270	while (i < m)
271	  {
272	     variable ties_i = ties[i];
273	     variable j = i;
274	     j++;
275	     variable dties = ties_i - i;
276	     while ((j < m) && (dties + j == ties[j]))
277	       j++;
278
279	     variable dn = j - i;
280	     i = [ties_i:ties_i+dn];
281	     r[i] = (@tie_fun)(r[i]);
282	     group_ties[g] = dn+1;
283	     i = j;
284	     g++;
285	  }
286	group_ties = group_ties[[0:g-1]];
287     }
288
289   if (group_ties_ref != NULL)
290     @group_ties_ref = group_ties;
291
292   % Now put r back in the order of x before it was sorted.
293   return r[array_sort(indx)];
294}
295
296% Min sum:  1+2+...+n = n*(n+1)/2
297% Max sum:  (m+1) + (m+2) + ... (m+n) = n*m + n*(n+1)/2
298% Average: (n*(n+1) + n*m)/2 = n*(n+m+1)/2
299define mw_test ()
300{
301   variable w_ref = NULL;
302   if (_NARGS == 3)
303     w_ref = ();
304   else if (_NARGS != 2)
305     {
306	usage ("p = %s (X1, Y1 [,&w]);  %% Two-Sample Mann-Whitney",
307	       _function_name ());
308     }
309   variable x, y;
310   (x, y) = ();
311   variable side = qualifier ("side", NULL);
312
313   variable n = length (x), m = length (y);
314   variable N = m+n;
315   variable mn = m*n;
316
317   variable gties;
318   variable r = compute_rank ([x,y], &mean, &gties);
319   variable w = sum (r[[0:n-1]]);
320
321   variable has_ties = length (gties);
322#iffalse
323   if (has_ties)
324     vmessage ("*** Warning: mw_test: ties found--- using asymptotic cdf");
325#endif
326
327   variable p;
328
329   if (has_ties || ((m > 50) && (n > 50)))
330     {
331	% Asymptotic
332	variable wstar = w - 0.5*n*(N+1);
333	variable vw = (mn/12.0)*(N+1 - sum((gties-1)*gties*(gties+1))/(N*(N-1)));
334
335	p = normal_cdf (wstar/sqrt(vw));
336
337	if (side == ">")
338	  p = 1.0 - p;
339	else if (side != "<")
340	  p = 2 * _min (p, 1.0-p);
341     }
342   else
343     {
344	% exact
345	if (side == ">")
346	  p = 1.0 - mann_whitney_cdf (n, m, w);
347	else if (side == "<")
348	  p = mann_whitney_cdf (n, m, w);
349	else
350	  {
351	     p = mann_whitney_cdf (n, m, w);
352	     p = 2 * _min (p, 1-p);
353	  }
354     }
355
356   if (w_ref != NULL)
357     @w_ref = w;
358
359   return p;
360}
361
362define t_test ()
363{
364   variable x, mu;
365   variable tref = NULL;
366
367   if (_NARGS == 2)
368     (x,mu) = ();
369   else if (_NARGS == 3)
370     (x,mu,tref) = ();
371   else
372     {
373	usage ("p = t_test (X, mu [,&t] [; qualifiers]);  %% Student's t-test\n"
374	       + "Qualifiers:\n"
375	       + " side=\"<\" | \">\""
376	      );
377     }
378
379   variable n = length (x);
380   variable stat = sqrt(n)*((mean(x) - mu)/stddev(x));
381   if (tref != NULL) @tref = stat;
382
383   return map_cdf_to_pval (student_t_cdf(stat, n-1) ;; __qualifiers);
384}
385
386define t_test2 ()
387{
388   variable x, y;
389   variable tref = NULL;
390
391   if (_NARGS == 2)
392     (x,y) = ();
393   else if (_NARGS == 3)
394     (x,y,tref) = ();
395   else
396     {
397	usage ("p = t_test2 (X, Y [,&t] [; qualifiers]);  %% Student's 2 sample (unpaired) t-test\n"
398	       + "Qualifiers:\n"
399	       + " side=\"<\" | \">\""
400	      );
401     }
402   variable side = qualifier ("side", NULL);
403
404   variable nx = length (x), mx = mean(x), sx = stddev (x);
405   variable ny = length (y), my = mean(y), sy = stddev (y);
406   variable df = nx+ny-2;
407   variable stat
408     = (mx-my)/sqrt((((nx-1)*sx*sx+(ny-1)*sy*sy)*(nx+ny))/(nx*ny*df));
409
410   if (tref != NULL) @tref = stat;
411
412   return map_cdf_to_pval (student_t_cdf(stat, df) ;; __qualifiers);
413}
414
415define welch_t_test ()
416{
417   variable x, y;
418   variable tref = NULL;
419
420   if (_NARGS == 2)
421     (x,y) = ();
422   else if (_NARGS == 3)
423     (x,y,tref) = ();
424   else
425     {
426	usage ("p = welch_t_test2 (X, Y [,&t] [; qualifiers]);  %% Welch's 2 sample t-test\n"
427	       + "Qualifiers:\n"
428	       + " side=\"<\" | \">\""
429	      );
430     }
431   variable side = qualifier ("side", NULL);
432
433   variable nx = length (x), mx = mean(x), sx = stddev (x), vx = sx*sx/nx;
434   variable ny = length (y), my = mean(y), sy = stddev (y), vy = sy*sy/ny;
435   variable vxvy = vx+vy;
436   variable stat = (mx-my)/sqrt(vxvy);
437   variable df = (vxvy*vxvy)/((vx*vx)/(nx-1) + (vy*vy)/(ny-1));
438
439   if (tref != NULL) @tref = stat;
440
441   return map_cdf_to_pval (student_t_cdf(stat, df) ;; __qualifiers);
442}
443
444define z_test ()
445{
446   variable x, mu, sigma;
447   variable tref = NULL;
448
449   if (_NARGS == 4)
450     tref = ();
451   else if (_NARGS != 3)
452     {
453	usage ("p = z_test (X, mu, sigma [,&stat] [; qualifiers]);\n"
454	       + "Qualifiers:\n"
455	       + " side=\"<\" | \">\""
456	      );
457     }
458   (x, mu, sigma) = ();
459   variable side = qualifier ("side", NULL);
460
461   variable n = length (x);
462   variable stat = (mean(x)-mu)/(sigma/sqrt(n));
463   if (tref != NULL) @tref = stat;
464
465   return map_cdf_to_pval (normal_cdf(stat) ;; __qualifiers);
466}
467
468define f_test2 ()
469{
470   variable x, y;
471   variable tref = NULL;
472
473   if (_NARGS == 2)
474     (x,y) = ();
475   else if (_NARGS == 3)
476     (x,y,tref) = ();
477   else
478     {
479	usage ("p = f_test2 (X, Y [,&t] [; qualifiers]);  %% 2 sample F-test\n"
480	       + "Qualifiers:\n"
481	       + " side=\"<\" | \">\""
482	      );
483     }
484   variable side = qualifier ("side", NULL);
485
486   variable v1 = stddev(x)^2;
487   variable v2 = stddev(y)^2;
488   variable n1 = length(x)-1;
489   variable n2 = length(y)-1;
490   variable swap = 0;
491   if (v1 < v2)
492     {
493	swap = 1;
494	(v1, v2) = (v2, v1);
495	(n1, n2) = (n2, n1);
496     }
497   variable stat = (v1/v2);
498
499   variable pval = f_cdf (stat, n1, n2);
500   if (side == ">")
501     {
502	if (swap)
503	  pval = 1.0 - pval;
504     }
505   else if (side == "<")
506     {
507	ifnot (swap)
508	  pval = 1.0 - pval;
509     }
510   else
511     pval = 2.0 * _min (1.0-pval, pval);
512
513   if (tref != NULL) @tref = stat;
514   return pval;
515}
516
517define spearman_r ()
518{
519   variable w_ref = NULL;
520   if (_NARGS == 3)
521     w_ref = ();
522   else if (_NARGS != 2)
523     {
524	usage ("p = %s (X1, Y1 [,&r]);  %% Spearman's rank correlation",
525	       _function_name ());
526     }
527   variable x, y;
528   (x, y) = ();
529   variable n = length (y), m = length (x);
530
531   variable gties_x, gties_y;
532   variable rx = compute_rank (x, &mean, &gties_x);
533   variable ry = compute_rank (y, &mean, &gties_y);
534
535   variable d = sum ((rx-ry)^2);
536   variable cx = sum(gties_x*(gties_x*gties_x-1.0));
537   variable cy = sum(gties_y*(gties_y*gties_y-1.0));
538
539   variable den = double(n) * (n+1.0) * (n-1.0);
540
541   variable r = (1.0 - 6.0*(d+(cx+cy)/12.0)/den)
542     / sqrt((1.0-cx/den)*(1.0-cy/den));
543   if (w_ref != NULL)
544     @w_ref = r;
545
546   variable t = r * sqrt ((n-2)/(1-r*r));
547
548   return map_cdf_to_pval (student_t_cdf(t,n-2) ;; __qualifiers);
549}
550
551% This function is assumed to always pass back a new array.
552private define compute_integer_rank (x, is_sorted)
553{
554   variable n = length (x);
555   variable indx = NULL, rev_indx = NULL;
556   ifnot (is_sorted)
557     {
558	indx = array_sort (x);
559	x = x[indx];
560	% Create a reverse-permutation to restore the array order
561	% upon return.
562	rev_indx = [0:n-1];
563	rev_indx[indx] = @rev_indx;
564     }
565
566   variable r = [1:n];
567
568   % Account for ties
569   variable ties;
570   () = wherediff (x, &ties);
571   % Here, ties is an array of indices {j} where x[j-1]==x[j].
572   % We want those where x[j] == x[j+1].
573   ties -= 1;
574
575   variable m = length (ties);
576   variable i = 0, j;
577   while (i < m)
578     {
579	variable ties_i = ties[i];
580	j = i;
581	j++;
582	variable dties = ties_i - i;
583	while ((j < m) && (dties + j == ties[j]))
584	  j++;
585
586	variable dn = j - i;
587	i = [ties_i:ties_i+dn];
588	r[i] = r[ties_i];
589	i = j;
590     }
591
592   if (indx == NULL)
593     return r;
594
595   return r[rev_indx];
596}
597
598define kendall_tau ()
599{
600   variable w_ref = NULL;
601   if (_NARGS == 3)
602     w_ref = ();
603   else if (_NARGS != 2)
604     {
605	usage ("p = %s (X1, Y1 [,&r]);  %% Kendall's tau correlation",
606	       _function_name ());
607     }
608
609   variable x, y;
610   (x, y) = ();
611   variable n = length (x);
612   if (n != length (y))
613     throw InvalidParmError, "Arrays must be the same length for kendall_tau";
614
615   % _kendall_tau will modify the contents of the arrays.  Be sure to
616   % pass new instances to it.  The sort operation below will achieve
617   % that.
618   variable i = array_sort (x);
619   x = compute_integer_rank (x[i], 1);
620   y = compute_integer_rank (y[i], 0);
621
622   variable tau, z;
623
624   (tau, z) = _kendall_tau (x, y);
625
626   if (w_ref != NULL)
627     @w_ref = tau;
628
629   return map_cdf_to_pval (z ;; __qualifiers);
630}
631
632define mann_kendall ()
633{
634   variable w_ref = NULL;
635   if (_NARGS == 2)
636     w_ref = ();
637   else if (_NARGS != 1)
638     {
639	usage ("p = %s (X [,&r]);  %% Mann-Kendall trend test",
640	       _function_name ());
641     }
642
643   variable x;
644   x = ();
645   variable n = length (x);
646   variable i = [0:n-1];
647
648   % _kendall_tau will modify the contents of the arrays.  Be sure to
649   % pass new instances to it.  compute_integer_rank will create a new
650   % instance.
651   x = compute_integer_rank (x, 0);
652   variable tau, z;
653   (tau, z) = _kendall_tau (i, x);
654
655   if (w_ref != NULL)
656     @w_ref = tau;
657
658   return map_cdf_to_pval (z ;; __qualifiers);
659}
660
661define pearson_r ()
662{
663   variable w_ref = NULL;
664   if (_NARGS == 3)
665     w_ref = ();
666   else if (_NARGS != 2)
667     {
668	usage ("p = %s (X1, Y1 [,&r] [; qualifiers]);  %% Pearson's r correlation\n", +
669	       "Qualifiers:\n" +
670	       " side=\"<\" | \">\"",
671	       _function_name ());
672     }
673
674   variable x, y;
675   (x, y) = ();
676   variable n = length(x);
677   % Note: covariance handles the 1/(N-1) normalization factor
678   variable r = covariance (x, y)[0,1]/(stddev(x)*stddev(y));
679   if (w_ref != NULL)
680     @w_ref = r;
681
682   % This is meaningful only for gaussian distributions
683   variable df = length(x)-2;
684   r = sqrt(df)*r/sqrt(1-r*r);
685   return map_cdf_to_pval (student_t_cdf (r, df) ;; __qualifiers);
686}
687
688define correlation ()
689{
690   if (_NARGS != 2)
691     usage ("c = correlation (X, Y);");
692   variable x, y; (x,y) = ();
693   variable n = length(x);
694   if (n != length(y))
695     throw InvalidParmError, "Arrays must be the same length";
696   variable mx = mean(x), sx = stddev(x), my = mean(y), sy = stddev(y);
697   return sum ((x-mx)*(y-my))/((n-1)*sx*sy);
698}
699
700provide ("stats");
701