1import ("stats"); 2 3% This file contains the following public functions: 4% 5% ks_test One sample Kolmogorov test 6% ad_test Anderson-Darling test 7% ks_test2 Two sample Smirnov test 8% mw_test Two sample Mann-Whitney-Wilcoxon test 9% chisqr_test Chisqr-test 10% t_test Student t test 11% t_test2 Two-sample Student t test 12% welch_t_test 13% spearman_r Two-sample Spearman rank test 14% kendall_tau Kendall tau correlation test 15% mann_kendall Mann-Kendall trend test 16% pearson_r Pearson's r correlation test 17% correlation 2 sample correlation 18% z_test 19% f_test2 2 sample F test 20% skewness 21% kurtosis 22% 23autoload ("ad_ktest", "statslib/ad_test"); 24autoload ("ad_test", "statslib/ad_test"); 25autoload ("ks_test", "statslib/ks_test"); 26autoload ("ks_test2", "statslib/ks_test"); 27autoload ("kuiper_test", "statslib/kuiper"); 28autoload ("kuiper_test2", "statslib/kuiper"); 29 30define normal_cdf () 31{ 32 variable m, s, a; 33 variable nargs = _NARGS; 34 35 switch (nargs) 36 { 37 case 1: 38 m = NULL, s = NULL; 39 } 40 { 41 case 3: 42 (m, s) = (); 43 } 44 { 45 _pop_n (nargs); 46 usage ("cdf = normal_cdf (A [, mean, stddev])"); 47 } 48 a = (); 49 50 if (nargs != 1) 51 a = (a-m)/double(s); 52 53 if (typeof (a) == Array_Type) 54 return array_map (Double_Type, &_normal_cdf, a); 55 56 return _normal_cdf (a); 57} 58 59define poisson_cdf () 60{ 61 variable lam, n; 62 if (_NARGS != 2) 63 { 64 _pop_n (_NARGS); 65 usage ("cdf = poisson_cdf (lambda, n)"); 66 } 67 (lam, n) = (); 68 69 if ((typeof (n) == Array_Type) or (typeof (lam) == Array_Type)) 70 return array_map (Double_Type, &_poisson_cdf, lam, n); 71 72 return _poisson_cdf (lam, n); 73} 74 75define sample_mean () 76{ 77 variable args = __pop_args (_NARGS); 78 return mean (__push_args(args)); 79} 80 81% These functions return the biased stddev 82define sample_stddev () 83{ 84 variable x = (); 85 variable n = 1.0*length (x); 86 return stddev(x) * sqrt((n-1.0)/n); 87} 88 89private define get_mean_stddev (x) 90{ 91 variable m = mean(x); 92 variable n = 1.0*length (x); 93 variable s = stddev(x) * sqrt((n-1.0)/n); 94 return m, s, n; 95} 96 97define skewness () 98{ 99 if (_NARGS != 1) 100 usage ("s = %s(A);", _function_name ()); 101 variable x = (); 102 variable m, s, n; 103 (m, s, n) = get_mean_stddev (x); 104 105 x = sum (((x - m)/s)^3)/n; 106 107 if ((s == 0.0) && isnan (x)) 108 x = 0.0; 109 110 return x; 111} 112 113define kurtosis () 114{ 115 if (_NARGS != 1) 116 usage ("s = %s(A);", _function_name ()); 117 variable x = (); 118 variable m, s, n; 119 (m, s, n) = get_mean_stddev (x); 120 121 x = sum (((x - m)/s)^4)/n - 3.0; 122 123 if ((s == 0.0) && isnan (x)) 124 x = 0.0; 125 126 return x; 127} 128 129define covariance () 130{ 131 variable n = _NARGS; 132 if (n == 0) 133 usage ("Sigma = covariance (X1, X2, ..., Xn [;qualifiers])\n" + 134 "Qualifiers:\n" + 135 " mu=[mu1,mu2,..,muN] (expected values E(Xi))" 136 ); 137 138 variable Xs = __pop_list (n); 139 variable i, m = length (Xs[0]); 140 _for i (0, n-1, 1) 141 { 142 if (length (Xs[i]) != m) 143 throw InvalidParmError, "Arrays must be of the same size"; 144 } 145 variable mus = qualifier ("mu"); 146 variable norm = 1.0; 147 if (mus == NULL) 148 { 149 mus = Double_Type[n]; 150 _for i (0, n-1, 1) 151 mus[i] = mean (Xs[i]); 152 norm = m/(m-1.0); 153 } 154 if (length (mus) != n) 155 throw InvalidParmError, "The value mu qualifier has the wrong length"; 156 157 variable cov = Double_Type[n,n]; 158 _for i (0, n-1, 1) 159 { 160 variable j; 161 variable dx_i = Xs[i]-mus[i]; 162 _for j (i, n-1, 1) 163 { 164 variable c = norm * mean (dx_i*(Xs[j] - mus[j])); 165 cov[i,j] = c; 166 cov[j,i] = c; 167 } 168 } 169 return cov; 170} 171 172% This function assumes the distribution is symmetric 173private define map_cdf_to_pval (cdf) 174{ 175 variable side = qualifier ("side", NULL); 176 177 variable pval = cdf; % side="<" 178 if (side == ">") 179 pval = 1.0 - cdf; 180 else if (side != "<") % double-sided 181 pval = 2.0 * _min (1.0-pval, pval); 182 183 return pval; 184} 185 186define chisqr_test () 187{ 188 variable t_ref = NULL; 189 variable nr = _NARGS; 190 if (nr > 1) 191 { 192 t_ref = (); 193 if (typeof (t_ref) == Ref_Type) 194 nr--; 195 else 196 { 197 t_ref; % push it back 198 t_ref = NULL; 199 } 200 } 201 202 if (nr < 2) 203 { 204 usage ("p=%s(X,Y,...,Z [,&T])", _function_name); 205 } 206 variable args = __pop_args (nr); 207 variable datasets = Array_Type[nr]; 208 variable nc = length (args[0].value); 209 variable c = Double_Type[nc]; 210 211 _for (0, nr-1, 1) 212 { 213 variable i = (); 214 variable d = args[i].value; 215 if (length (d) != nc) 216 verror ("The chisqr test requires datasets to be of the same length"); 217 datasets[i] = d; 218 c += d; 219 } 220 variable N = sum (c); 221 variable t = 0.0; 222 _for (0, nr-1, 1) 223 { 224 i = (); 225 d = datasets[i]; 226 variable e = sum (d)/N * c; 227 t += sum((d-e)^2/e); 228 } 229 230 if (t_ref != NULL) 231 @t_ref = t; 232 233 return 1.0 - chisqr_cdf ((nr-1)*(nc-1), t); 234} 235 236% Usage: r = compute_rank (X, [&tie_fun [,&tied_groups]]) 237% Here, if tied_groups is non-NULL, it will be an array whose length 238% represents the number of tied groups, and each element being the number 239% within the kth group. 240private define compute_rank () 241{ 242 variable x, tie_fun = &mean, group_ties_ref = NULL; 243 if (_NARGS == 3) 244 group_ties_ref = (); 245 if (_NARGS >= 2) 246 tie_fun = (); 247 x = (); 248 if (tie_fun == NULL) 249 tie_fun = &mean; 250 251 variable indx = array_sort (x); 252 x = x[indx]; 253 variable n = length (x); 254 variable r = double([1:n]); 255 256 % Worry about ties 257 variable ties; 258 () = wherediff (x, &ties); 259 % Here, ties is an array of indices {j} where x[j-1]==x[j]. 260 % We want those where x[j] == x[j+1]. 261 ties -= 1; 262 263 variable m = length (ties); 264 variable group_ties = Int_Type[0]; 265 if (m) 266 { 267 variable i = 0; 268 variable g = 0; 269 group_ties = Int_Type[m]; 270 while (i < m) 271 { 272 variable ties_i = ties[i]; 273 variable j = i; 274 j++; 275 variable dties = ties_i - i; 276 while ((j < m) && (dties + j == ties[j])) 277 j++; 278 279 variable dn = j - i; 280 i = [ties_i:ties_i+dn]; 281 r[i] = (@tie_fun)(r[i]); 282 group_ties[g] = dn+1; 283 i = j; 284 g++; 285 } 286 group_ties = group_ties[[0:g-1]]; 287 } 288 289 if (group_ties_ref != NULL) 290 @group_ties_ref = group_ties; 291 292 % Now put r back in the order of x before it was sorted. 293 return r[array_sort(indx)]; 294} 295 296% Min sum: 1+2+...+n = n*(n+1)/2 297% Max sum: (m+1) + (m+2) + ... (m+n) = n*m + n*(n+1)/2 298% Average: (n*(n+1) + n*m)/2 = n*(n+m+1)/2 299define mw_test () 300{ 301 variable w_ref = NULL; 302 if (_NARGS == 3) 303 w_ref = (); 304 else if (_NARGS != 2) 305 { 306 usage ("p = %s (X1, Y1 [,&w]); %% Two-Sample Mann-Whitney", 307 _function_name ()); 308 } 309 variable x, y; 310 (x, y) = (); 311 variable side = qualifier ("side", NULL); 312 313 variable n = length (x), m = length (y); 314 variable N = m+n; 315 variable mn = m*n; 316 317 variable gties; 318 variable r = compute_rank ([x,y], &mean, >ies); 319 variable w = sum (r[[0:n-1]]); 320 321 variable has_ties = length (gties); 322#iffalse 323 if (has_ties) 324 vmessage ("*** Warning: mw_test: ties found--- using asymptotic cdf"); 325#endif 326 327 variable p; 328 329 if (has_ties || ((m > 50) && (n > 50))) 330 { 331 % Asymptotic 332 variable wstar = w - 0.5*n*(N+1); 333 variable vw = (mn/12.0)*(N+1 - sum((gties-1)*gties*(gties+1))/(N*(N-1))); 334 335 p = normal_cdf (wstar/sqrt(vw)); 336 337 if (side == ">") 338 p = 1.0 - p; 339 else if (side != "<") 340 p = 2 * _min (p, 1.0-p); 341 } 342 else 343 { 344 % exact 345 if (side == ">") 346 p = 1.0 - mann_whitney_cdf (n, m, w); 347 else if (side == "<") 348 p = mann_whitney_cdf (n, m, w); 349 else 350 { 351 p = mann_whitney_cdf (n, m, w); 352 p = 2 * _min (p, 1-p); 353 } 354 } 355 356 if (w_ref != NULL) 357 @w_ref = w; 358 359 return p; 360} 361 362define t_test () 363{ 364 variable x, mu; 365 variable tref = NULL; 366 367 if (_NARGS == 2) 368 (x,mu) = (); 369 else if (_NARGS == 3) 370 (x,mu,tref) = (); 371 else 372 { 373 usage ("p = t_test (X, mu [,&t] [; qualifiers]); %% Student's t-test\n" 374 + "Qualifiers:\n" 375 + " side=\"<\" | \">\"" 376 ); 377 } 378 379 variable n = length (x); 380 variable stat = sqrt(n)*((mean(x) - mu)/stddev(x)); 381 if (tref != NULL) @tref = stat; 382 383 return map_cdf_to_pval (student_t_cdf(stat, n-1) ;; __qualifiers); 384} 385 386define t_test2 () 387{ 388 variable x, y; 389 variable tref = NULL; 390 391 if (_NARGS == 2) 392 (x,y) = (); 393 else if (_NARGS == 3) 394 (x,y,tref) = (); 395 else 396 { 397 usage ("p = t_test2 (X, Y [,&t] [; qualifiers]); %% Student's 2 sample (unpaired) t-test\n" 398 + "Qualifiers:\n" 399 + " side=\"<\" | \">\"" 400 ); 401 } 402 variable side = qualifier ("side", NULL); 403 404 variable nx = length (x), mx = mean(x), sx = stddev (x); 405 variable ny = length (y), my = mean(y), sy = stddev (y); 406 variable df = nx+ny-2; 407 variable stat 408 = (mx-my)/sqrt((((nx-1)*sx*sx+(ny-1)*sy*sy)*(nx+ny))/(nx*ny*df)); 409 410 if (tref != NULL) @tref = stat; 411 412 return map_cdf_to_pval (student_t_cdf(stat, df) ;; __qualifiers); 413} 414 415define welch_t_test () 416{ 417 variable x, y; 418 variable tref = NULL; 419 420 if (_NARGS == 2) 421 (x,y) = (); 422 else if (_NARGS == 3) 423 (x,y,tref) = (); 424 else 425 { 426 usage ("p = welch_t_test2 (X, Y [,&t] [; qualifiers]); %% Welch's 2 sample t-test\n" 427 + "Qualifiers:\n" 428 + " side=\"<\" | \">\"" 429 ); 430 } 431 variable side = qualifier ("side", NULL); 432 433 variable nx = length (x), mx = mean(x), sx = stddev (x), vx = sx*sx/nx; 434 variable ny = length (y), my = mean(y), sy = stddev (y), vy = sy*sy/ny; 435 variable vxvy = vx+vy; 436 variable stat = (mx-my)/sqrt(vxvy); 437 variable df = (vxvy*vxvy)/((vx*vx)/(nx-1) + (vy*vy)/(ny-1)); 438 439 if (tref != NULL) @tref = stat; 440 441 return map_cdf_to_pval (student_t_cdf(stat, df) ;; __qualifiers); 442} 443 444define z_test () 445{ 446 variable x, mu, sigma; 447 variable tref = NULL; 448 449 if (_NARGS == 4) 450 tref = (); 451 else if (_NARGS != 3) 452 { 453 usage ("p = z_test (X, mu, sigma [,&stat] [; qualifiers]);\n" 454 + "Qualifiers:\n" 455 + " side=\"<\" | \">\"" 456 ); 457 } 458 (x, mu, sigma) = (); 459 variable side = qualifier ("side", NULL); 460 461 variable n = length (x); 462 variable stat = (mean(x)-mu)/(sigma/sqrt(n)); 463 if (tref != NULL) @tref = stat; 464 465 return map_cdf_to_pval (normal_cdf(stat) ;; __qualifiers); 466} 467 468define f_test2 () 469{ 470 variable x, y; 471 variable tref = NULL; 472 473 if (_NARGS == 2) 474 (x,y) = (); 475 else if (_NARGS == 3) 476 (x,y,tref) = (); 477 else 478 { 479 usage ("p = f_test2 (X, Y [,&t] [; qualifiers]); %% 2 sample F-test\n" 480 + "Qualifiers:\n" 481 + " side=\"<\" | \">\"" 482 ); 483 } 484 variable side = qualifier ("side", NULL); 485 486 variable v1 = stddev(x)^2; 487 variable v2 = stddev(y)^2; 488 variable n1 = length(x)-1; 489 variable n2 = length(y)-1; 490 variable swap = 0; 491 if (v1 < v2) 492 { 493 swap = 1; 494 (v1, v2) = (v2, v1); 495 (n1, n2) = (n2, n1); 496 } 497 variable stat = (v1/v2); 498 499 variable pval = f_cdf (stat, n1, n2); 500 if (side == ">") 501 { 502 if (swap) 503 pval = 1.0 - pval; 504 } 505 else if (side == "<") 506 { 507 ifnot (swap) 508 pval = 1.0 - pval; 509 } 510 else 511 pval = 2.0 * _min (1.0-pval, pval); 512 513 if (tref != NULL) @tref = stat; 514 return pval; 515} 516 517define spearman_r () 518{ 519 variable w_ref = NULL; 520 if (_NARGS == 3) 521 w_ref = (); 522 else if (_NARGS != 2) 523 { 524 usage ("p = %s (X1, Y1 [,&r]); %% Spearman's rank correlation", 525 _function_name ()); 526 } 527 variable x, y; 528 (x, y) = (); 529 variable n = length (y), m = length (x); 530 531 variable gties_x, gties_y; 532 variable rx = compute_rank (x, &mean, >ies_x); 533 variable ry = compute_rank (y, &mean, >ies_y); 534 535 variable d = sum ((rx-ry)^2); 536 variable cx = sum(gties_x*(gties_x*gties_x-1.0)); 537 variable cy = sum(gties_y*(gties_y*gties_y-1.0)); 538 539 variable den = double(n) * (n+1.0) * (n-1.0); 540 541 variable r = (1.0 - 6.0*(d+(cx+cy)/12.0)/den) 542 / sqrt((1.0-cx/den)*(1.0-cy/den)); 543 if (w_ref != NULL) 544 @w_ref = r; 545 546 variable t = r * sqrt ((n-2)/(1-r*r)); 547 548 return map_cdf_to_pval (student_t_cdf(t,n-2) ;; __qualifiers); 549} 550 551% This function is assumed to always pass back a new array. 552private define compute_integer_rank (x, is_sorted) 553{ 554 variable n = length (x); 555 variable indx = NULL, rev_indx = NULL; 556 ifnot (is_sorted) 557 { 558 indx = array_sort (x); 559 x = x[indx]; 560 % Create a reverse-permutation to restore the array order 561 % upon return. 562 rev_indx = [0:n-1]; 563 rev_indx[indx] = @rev_indx; 564 } 565 566 variable r = [1:n]; 567 568 % Account for ties 569 variable ties; 570 () = wherediff (x, &ties); 571 % Here, ties is an array of indices {j} where x[j-1]==x[j]. 572 % We want those where x[j] == x[j+1]. 573 ties -= 1; 574 575 variable m = length (ties); 576 variable i = 0, j; 577 while (i < m) 578 { 579 variable ties_i = ties[i]; 580 j = i; 581 j++; 582 variable dties = ties_i - i; 583 while ((j < m) && (dties + j == ties[j])) 584 j++; 585 586 variable dn = j - i; 587 i = [ties_i:ties_i+dn]; 588 r[i] = r[ties_i]; 589 i = j; 590 } 591 592 if (indx == NULL) 593 return r; 594 595 return r[rev_indx]; 596} 597 598define kendall_tau () 599{ 600 variable w_ref = NULL; 601 if (_NARGS == 3) 602 w_ref = (); 603 else if (_NARGS != 2) 604 { 605 usage ("p = %s (X1, Y1 [,&r]); %% Kendall's tau correlation", 606 _function_name ()); 607 } 608 609 variable x, y; 610 (x, y) = (); 611 variable n = length (x); 612 if (n != length (y)) 613 throw InvalidParmError, "Arrays must be the same length for kendall_tau"; 614 615 % _kendall_tau will modify the contents of the arrays. Be sure to 616 % pass new instances to it. The sort operation below will achieve 617 % that. 618 variable i = array_sort (x); 619 x = compute_integer_rank (x[i], 1); 620 y = compute_integer_rank (y[i], 0); 621 622 variable tau, z; 623 624 (tau, z) = _kendall_tau (x, y); 625 626 if (w_ref != NULL) 627 @w_ref = tau; 628 629 return map_cdf_to_pval (z ;; __qualifiers); 630} 631 632define mann_kendall () 633{ 634 variable w_ref = NULL; 635 if (_NARGS == 2) 636 w_ref = (); 637 else if (_NARGS != 1) 638 { 639 usage ("p = %s (X [,&r]); %% Mann-Kendall trend test", 640 _function_name ()); 641 } 642 643 variable x; 644 x = (); 645 variable n = length (x); 646 variable i = [0:n-1]; 647 648 % _kendall_tau will modify the contents of the arrays. Be sure to 649 % pass new instances to it. compute_integer_rank will create a new 650 % instance. 651 x = compute_integer_rank (x, 0); 652 variable tau, z; 653 (tau, z) = _kendall_tau (i, x); 654 655 if (w_ref != NULL) 656 @w_ref = tau; 657 658 return map_cdf_to_pval (z ;; __qualifiers); 659} 660 661define pearson_r () 662{ 663 variable w_ref = NULL; 664 if (_NARGS == 3) 665 w_ref = (); 666 else if (_NARGS != 2) 667 { 668 usage ("p = %s (X1, Y1 [,&r] [; qualifiers]); %% Pearson's r correlation\n", + 669 "Qualifiers:\n" + 670 " side=\"<\" | \">\"", 671 _function_name ()); 672 } 673 674 variable x, y; 675 (x, y) = (); 676 variable n = length(x); 677 % Note: covariance handles the 1/(N-1) normalization factor 678 variable r = covariance (x, y)[0,1]/(stddev(x)*stddev(y)); 679 if (w_ref != NULL) 680 @w_ref = r; 681 682 % This is meaningful only for gaussian distributions 683 variable df = length(x)-2; 684 r = sqrt(df)*r/sqrt(1-r*r); 685 return map_cdf_to_pval (student_t_cdf (r, df) ;; __qualifiers); 686} 687 688define correlation () 689{ 690 if (_NARGS != 2) 691 usage ("c = correlation (X, Y);"); 692 variable x, y; (x,y) = (); 693 variable n = length(x); 694 if (n != length(y)) 695 throw InvalidParmError, "Arrays must be the same length"; 696 variable mx = mean(x), sx = stddev(x), my = mean(y), sy = stddev(y); 697 return sum ((x-mx)*(y-my))/((n-1)*sx*sy); 698} 699 700provide ("stats"); 701