1 /*
2 * gretl -- Gnu Regression, Econometrics and Time-series Library
3 * Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 */
19
20 #include "libgretl.h"
21
22 /**
23 * SECTION:nonparam
24 * @short_description: selected nonparametric routines
25 * @title: Nonparametric
26 * @include: libgretl.h
27 *
28 * Includes various nonparametric tests: rank correlation,
29 * runs (randomness), and differences between series. Also
30 * "loess" regression a la William Cleveland.
31 */
32
33 /* Spearman, one-tailed alpha = .005, .025, .05 */
34
35 static double rhocrit[18][3] = {
36 { 0.8929, 0.7450, 0.6786 }, /* n = 7 */
37 { 0.8571, 0.7143, 0.6190 }, /* 8 */
38 { 0.8167, 0.6833, 0.5833 }, /* 9 */
39 { 0.7818, 0.6364, 0.5515 }, /* 10 */
40 { 0.7545, 0.6091, 0.5273 }, /* 11 */
41 { 0.7273, 0.5804, 0.4965 }, /* 12 */
42 { 0.6978, 0.5549, 0.4780 }, /* 13 */
43 { 0.6747, 0.5341, 0.4593 }, /* 14 */
44 { 0.6536, 0.5179, 0.4429 }, /* 15 */
45 { 0.6324, 0.5000, 0.4265 }, /* 16 */
46 { 0.6152, 0.4853, 0.4118 }, /* 17 */
47 { 0.5975, 0.4716, 0.3994 }, /* 18 */
48 { 0.5825, 0.4579, 0.3895 }, /* 19 */
49 { 0.5684, 0.4451, 0.3789 }, /* 20 */
50 { 0.5545, 0.4351, 0.3688 }, /* 21 */
51 { 0.5426, 0.4241, 0.3597 }, /* 22 */
52 { 0.5306, 0.4150, 0.3518 }, /* 23 */
53 { 0.5200, 0.4061, 0.3435 } /* 24 */
54 };
55
56 /* The values above are from Daniel and Terrell,
57 Business Statistics, 4e (Houghton Mifflin, 1986)
58 */
59
spearman_signif(double rho,int n)60 static double spearman_signif (double rho, int n)
61 {
62 const double *x;
63
64 if (n < 7) {
65 return 1.0;
66 }
67
68 x = rhocrit[n - 7];
69
70 if (rho > x[0]) return .01;
71 if (rho > x[1]) return .05;
72 if (rho > x[2]) return .10;
73
74 return 1.0;
75 }
76
77 enum {
78 RANK_X,
79 RANK_Y
80 };
81
82 /* m = number of sorted values (sz); n = number of raw values (either
83 x or y); in case of missing values in x and/or y we may have m < n.
84 */
85
make_ranking(const double * sz,int m,const double * x,const double * y,int n,double * rz,int * ties,int which)86 static void make_ranking (const double *sz, int m,
87 const double *x, const double *y,
88 int n, double *rz, int *ties,
89 int which)
90 {
91 const double *z;
92 int cases, k, i, j;
93 double avg, r = 1;
94
95 z = (which == RANK_X)? x : y;
96
97 for (i=0; i<m; i++) {
98 /* scan sorted z */
99 cases = k = 0;
100
101 if (i > 0 && sz[i] == sz[i-1]) {
102 continue;
103 }
104
105 for (j=0; j<n; j++) {
106 /* scan raw z for matches */
107 if (!na(x[j]) && !na(y[j])) {
108 if (z[j] == sz[i]) {
109 rz[k] = r;
110 cases++;
111 }
112 k++;
113 }
114 }
115
116 if (cases > 1) {
117 avg = (r + r + cases - 1.0) / 2.0;
118 for (j=0; j<m; j++) {
119 if (rz[j] == r) {
120 rz[j] = avg;
121 }
122 }
123 if (ties != NULL) {
124 *ties = 1;
125 }
126 }
127
128 r += cases;
129 }
130 }
131
rankcorr_get_rankings(const double * x,const double * y,int n,double ** rxout,double ** ryout,int * pm,int * ties)132 static int rankcorr_get_rankings (const double *x, const double *y, int n,
133 double **rxout, double **ryout,
134 int *pm, int *ties)
135 {
136 double *sx = NULL, *sy = NULL;
137 double *rx = NULL, *ry = NULL;
138 int i, m = 0;
139
140 /* count non-missing pairs */
141 for (i=0; i<n; i++) {
142 if (!na(x[i]) && !na(y[i])) {
143 m++;
144 }
145 }
146
147 if (m < 2) {
148 return E_DATA;
149 }
150
151 sx = malloc(m * sizeof *sx);
152 sy = malloc(m * sizeof *sy);
153 rx = malloc(m * sizeof *rx);
154 ry = malloc(m * sizeof *ry);
155
156 if (sx == NULL || sy == NULL ||
157 rx == NULL || ry == NULL) {
158 free(sx);
159 free(sy);
160 free(rx);
161 free(ry);
162 return E_ALLOC;
163 }
164
165 /* copy non-missing x and y into sx, sy */
166 m = 0;
167 for (i=0; i<n; i++) {
168 if (!na(x[i]) && !na(y[i])) {
169 sx[m] = x[i];
170 sy[m] = y[i];
171 m++;
172 }
173 }
174
175 /* get sorted series */
176 qsort(sx, m, sizeof *sx, gretl_inverse_compare_doubles);
177 qsort(sy, m, sizeof *sy, gretl_inverse_compare_doubles);
178
179 for (i=0; i<m; i++) {
180 rx[i] = ry[i] = 0.0;
181 }
182
183 /* make rankings by comparing "raw" x, y with sorted */
184 make_ranking(sx, m, x, y, n, rx, ties, RANK_X);
185 make_ranking(sy, m, x, y, n, ry, ties, RANK_Y);
186
187 /* save the ranks */
188 *rxout = rx;
189 *ryout = ry;
190
191 if (pm != NULL) {
192 *pm = m;
193 }
194
195 free(sx);
196 free(sy);
197
198 return 0;
199 }
200
real_spearman_rho(const double * x,const double * y,int n,double * rho,double * zval,double ** rxout,double ** ryout,int * pm)201 static int real_spearman_rho (const double *x, const double *y, int n,
202 double *rho, double *zval,
203 double **rxout, double **ryout,
204 int *pm)
205 {
206 double *rx = NULL, *ry = NULL;
207 int m, ties = 0;
208 int err = 0;
209
210 *rho = *zval = NADBL;
211
212 if (n < 2) {
213 return E_TOOFEW;
214 }
215
216 err = rankcorr_get_rankings(x, y, n, &rx, &ry, &m, &ties);
217 if (err) {
218 return err;
219 }
220
221 /* Pearson correlation in ranks */
222 *rho = gretl_corr(0, m - 1, rx, ry, NULL);
223
224 /* save the ranks, if wanted */
225 if (rxout != NULL && ry != NULL) {
226 *rxout = rx;
227 *ryout = ry;
228 } else {
229 free(rx);
230 free(ry);
231 }
232
233 *pm = m;
234
235 return err;
236 }
237
spearman_rho_func(const double * x,const double * y,int n,int * err)238 gretl_matrix *spearman_rho_func (const double *x,
239 const double *y,
240 int n, int *err)
241 {
242 gretl_matrix *ret = NULL;
243 double z, rho = NADBL;
244 int m = 0;
245
246 *err = real_spearman_rho(x, y, n, &rho, &z,
247 NULL, NULL, &m);
248
249 if (!*err) {
250 ret = gretl_matrix_alloc(1, 3);
251 if (ret == NULL) {
252 *err = E_ALLOC;
253 } else {
254 ret->val[0] = rho;
255 if (!na(z)) {
256 ret->val[1] = z;
257 ret->val[2] = normal_pvalue_2(z);
258 } else if (m > 24) {
259 ret->val[1] = rho * sqrt((m - 2) / (1 - rho*rho));
260 ret->val[2] = student_pvalue_2(m - 2, ret->val[1]);
261 } else {
262 ret->val[1] = ret->val[2] = NADBL;
263 }
264 }
265 }
266
267 return ret;
268 }
269
print_raw_and_ranked(int vx,int vy,const double * x,const double * y,const double * rx,const double * ry,const DATASET * dset,PRN * prn)270 static void print_raw_and_ranked (int vx, int vy,
271 const double *x, const double *y,
272 const double *rx, const double *ry,
273 const DATASET *dset,
274 PRN *prn)
275 {
276 int T = dset->t2 - dset->t1 + 1;
277 int obslen = max_obs_marker_length(dset);
278 int n1 = strlen(dset->varname[vx]);
279 int n2 = strlen(dset->varname[vy]);
280 int t, i = 0;
281
282 n1 = (n2 > n1)? n2 : n1;
283 n1 = (n1 < 15)? 15 : n1;
284 pputc(prn, '\n');
285 bufspace(obslen + 1, prn);
286 pprintf(prn, "%*s%8s%*s%8s\n\n", n1, dset->varname[vx], _("rank"),
287 n1, dset->varname[vy], _("rank"));
288
289 for (t=0; t<T; t++) {
290 print_obs_marker(t + dset->t1, dset, obslen, prn);
291 if (!(na(x[t])) && !(na(y[t]))) {
292 pprintf(prn, "%#*g", n1, x[t]);
293 pprintf(prn, "%8g", rx[i]);
294 pprintf(prn, "%#*g", n1, y[t]);
295 pprintf(prn, "%8g", ry[i]);
296 i++;
297 }
298 pputc(prn, '\n');
299 }
300 }
301
302 /**
303 * spearman_rho:
304 * @list: list of (two) variables to process.
305 * @dset: dataset struct.
306 * @opt: if includes %OPT_V, print both the "raw" and the ranked data.
307 * @prn: gretl printing struct.
308 *
309 * Calculates and prints Spearman's rank correlation coefficient for the two
310 * variables specified in the @list.
311 *
312 * Returns: 0 on successful completion, 1 on error.
313 */
314
spearman_rho(const int * list,const DATASET * dset,gretlopt opt,PRN * prn)315 int spearman_rho (const int *list, const DATASET *dset,
316 gretlopt opt, PRN *prn)
317 {
318 int T = dset->t2 - dset->t1 + 1;
319 double *rx = NULL, *ry = NULL;
320 const double *x, *y;
321 double zval, rho = 0;
322 int vx, vy, m;
323 int err;
324
325 if (list == NULL || list[0] != 2) {
326 pputs(prn, _("This command requires two variables\n"));
327 return 1;
328 }
329
330 vx = list[1];
331 vy = list[2];
332
333 x = dset->Z[vx] + dset->t1;
334 y = dset->Z[vy] + dset->t1;
335
336 err = real_spearman_rho(x, y, T, &rho, &zval,
337 (opt & OPT_V)? &rx : NULL,
338 (opt & OPT_V)? &ry : NULL,
339 &m);
340 if (err) {
341 return err;
342 }
343
344 pprintf(prn, _("\nFor the variables '%s' and '%s'\n"), dset->varname[vx],
345 dset->varname[vy]);
346
347 if (na(rho)) {
348 pputs(prn, _("Spearman's rank correlation is undefined\n"));
349 return err;
350 }
351
352 pprintf(prn, _("Spearman's rank correlation coefficient (rho) = %.8f\n"), rho);
353
354 if (rho == 0.0) {
355 goto skipit;
356 }
357
358 if (!na(zval)) {
359 pputs(prn, _("Under the null hypothesis of no correlation:\n "));
360 pprintf(prn, _("z-score = %g, with two-tailed p-value %.4f\n"), zval,
361 normal_pvalue_2(zval));
362 } else if (m > 24) {
363 double tval = rho * sqrt((m - 2) / (1 - rho*rho));
364
365 pputs(prn, _("Under the null hypothesis of no correlation:\n "));
366 pprintf(prn, _("t(%d) = %g, with two-tailed p-value %.4f\n"), m - 2,
367 tval, student_pvalue_2(m - 2, tval));
368 } else if (m >= 7) {
369 double pval = spearman_signif(fabs(rho), m);
370
371 if (pval < 1.0) {
372 pprintf(prn, _("significant at the %g%% level (two-tailed)\n"),
373 100.0 * pval);
374 } else {
375 /* xgettext:no-c-format */
376 pputs(prn, _("not significant at the 10% level\n"));
377 }
378 } else {
379 pputs(prn, _("Sample is too small to calculate a p-value based on "
380 "the normal distribution\n"));
381 }
382
383 skipit:
384
385 if (rx != NULL && ry != NULL) {
386 print_raw_and_ranked(vx, vy, x, y, rx, ry, dset, prn);
387 free(rx);
388 free(ry);
389 }
390
391 pputc(prn, '\n');
392
393 return 0;
394 }
395
396 struct xy_pair {
397 double x;
398 double y;
399 };
400
compare_pairs_x(const void * a,const void * b)401 static int compare_pairs_x (const void *a, const void *b)
402 {
403 const struct xy_pair *pa = a;
404 const struct xy_pair *pb = b;
405 int ret;
406
407 ret = (pa->x > pb->x) - (pa->x < pb->x);
408 if (ret == 0) {
409 ret = (pa->y > pb->y) - (pa->y < pb->y);
410 }
411
412 return ret;
413 }
414
compare_pairs_y(const void * a,const void * b)415 static int compare_pairs_y (const void *a, const void *b)
416 {
417 const struct xy_pair *pa = a;
418 const struct xy_pair *pb = b;
419
420 return (pa->y > pb->y) - (pa->y < pb->y);
421 }
422
real_kendall_tau(const double * x,const double * y,int n,struct xy_pair * xy,int nn,double * ptau,double * pz)423 static int real_kendall_tau (const double *x, const double *y,
424 int n, struct xy_pair *xy, int nn,
425 double *ptau, double *pz)
426 {
427 double tau, nn1, s2, z;
428 int tt1, tx = 0, ty = 0;
429 int Tx = 0, Ty = 0;
430 int Tx2 = 0, Ty2 = 0;
431 int Tx25 = 0, Ty25 = 0;
432 int tie, N0, N1, S;
433 int i, j;
434
435 /* populate sorter */
436 j = 0;
437 for (i=0; i<n; i++) {
438 if (!na(x[i]) && !na(y[i])) {
439 xy[j].x = x[i];
440 xy[j].y = y[i];
441 j++;
442 }
443 }
444
445 /* sort pairs by x */
446 qsort(xy, nn, sizeof *xy, compare_pairs_x);
447
448 /* make order counts */
449 N0 = N1 = 0;
450 for (i=0; i<nn; i++) {
451 for (j=i+1; j<nn; j++) {
452 if (xy[j].y == xy[i].y) {
453 Ty++;
454 } else if (xy[j].x != xy[i].x) {
455 if (xy[j].y > xy[i].y) {
456 N0++;
457 } else if (xy[j].y < xy[i].y) {
458 N1++;
459 }
460 }
461 }
462 if (i > 0) {
463 /* account for ties in x */
464 tie = (xy[i].x == xy[i-1].x);
465 if (tie) {
466 tx++;
467 }
468 if (tx > 0 && (!tie || i == nn - 1)) {
469 tx++;
470 tt1 = tx * (tx - 1);
471 Tx += tt1;
472 Tx2 += tt1 * (tx - 2);
473 Tx25 += tt1 * (2 * tx + 5);
474 tx = 0;
475 }
476 }
477 }
478
479 if (Ty > 0) {
480 /* account for ties in y */
481 Ty = 0;
482 qsort(xy, nn, sizeof *xy, compare_pairs_y);
483 for (i=1; i<nn; i++) {
484 tie = (xy[i].y == xy[i-1].y);
485 if (tie) {
486 ty++;
487 }
488 if (ty > 0 && (!tie || i == nn - 1)) {
489 ty++;
490 tt1 = ty * (ty - 1);
491 Ty += tt1;
492 Ty2 += tt1 * (ty - 2);
493 Ty25 += tt1 * (2 * ty + 5);
494 ty = 0;
495 }
496 }
497 }
498
499 S = N0 - N1;
500
501 #if 0
502 fprintf(stderr, "N0 = %d, N1 = %d, S = %d\n", N0, N1, S);
503 fprintf(stderr, "Tx = %d, Ty = %d\n", Tx, Ty);
504 #endif
505
506 nn1 = nn * (nn - 1.0);
507
508 /* normal approximation as in Shapiro and Chen,
509 Journal of Quality Technology, 2001 */
510
511 if (Tx == 0 && Ty == 0) {
512 tau = 2 * S / nn1;
513 s2 = (1.0/18) * nn1 * (2 * nn + 5);
514 } else {
515 double den = (nn1 - Tx) * (nn1 - Ty);
516
517 tau = 2 * S / sqrt(den);
518 s2 = (1.0/18) * (nn1 * (2 * nn + 5) - Tx25 - Ty25);
519 if (Tx2 != 0 && Ty2 != 0) {
520 s2 += (1.0/(9*nn1*(nn-2))) * Tx2 * Ty2;
521 }
522 if (Tx != 0 && Ty != 0) {
523 s2 += (1.0/(2*nn1)) * Tx * Ty;
524 }
525 }
526
527 z = (S - 1) / sqrt(s2);
528
529 if (ptau != NULL) {
530 *ptau = tau;
531 }
532
533 if (pz != NULL) {
534 *pz = z;
535 }
536
537 return 0;
538 }
539
kendall_tau_func(const double * x,const double * y,int n,int * err)540 gretl_matrix *kendall_tau_func (const double *x,
541 const double *y,
542 int n, int *err)
543 {
544 gretl_matrix *ret = NULL;
545 struct xy_pair *xy;
546 double z, tau = NADBL;
547 int i, nn = 0;
548
549 /* count valid pairs */
550 for (i=0; i<n; i++) {
551 if (!na(x[i]) && !na(y[i])) {
552 nn++;
553 }
554 }
555
556 if (nn < 2) {
557 *err = E_TOOFEW;
558 return NULL;
559 }
560
561 xy = malloc(nn * sizeof *xy);
562
563 if (xy == NULL) {
564 *err = E_ALLOC;
565 } else {
566 *err = real_kendall_tau(x, y, n, xy, nn, &tau, &z);
567 }
568
569 free(xy);
570
571 if (!*err) {
572 ret = gretl_matrix_alloc(1, 3);
573 if (ret == NULL) {
574 *err = E_ALLOC;
575 } else {
576 ret->val[0] = tau;
577 ret->val[1] = z;
578 ret->val[2] = normal_pvalue_2(z);
579 }
580 }
581
582 return ret;
583 }
584
585 /**
586 * kendall_tau:
587 * @list: list of (two) variables to process.
588 * @dset: dataset struct.
589 * @opt: if includes %OPT_V, print both the "raw" and the ranked data.
590 * @prn: gretl printing struct.
591 *
592 * Calculates and prints Kendall's rank correlation tau statistic for
593 * the two variables specified in @list.
594 *
595 * Returns: 0 on successful completion, 1 on error.
596 */
597
kendall_tau(const int * list,const DATASET * dset,gretlopt opt,PRN * prn)598 int kendall_tau (const int *list, const DATASET *dset,
599 gretlopt opt, PRN *prn)
600 {
601 struct xy_pair *xy = NULL;
602 int T = dset->t2 - dset->t1 + 1;
603 const double *x, *y;
604 double tau, z;
605 int vx, vy;
606 int t, nn = 0;
607 int err = 0;
608
609 if (list == NULL || list[0] != 2) {
610 pputs(prn, _("This command requires two variables\n"));
611 return 1;
612 }
613
614 vx = list[1];
615 vy = list[2];
616
617 x = dset->Z[vx] + dset->t1;
618 y = dset->Z[vy] + dset->t1;
619
620 /* count valid pairs */
621 for (t=0; t<T; t++) {
622 if (!na(x[t]) && !na(y[t])) {
623 nn++;
624 }
625 }
626
627 if (nn < 2) {
628 return E_TOOFEW;
629 }
630
631 /* allocate */
632 xy = malloc(nn * sizeof *xy);
633 if (xy == NULL) {
634 return E_ALLOC;
635 }
636
637 /* calculate */
638 err = real_kendall_tau(x, y, T, xy, nn, &tau, &z);
639
640 if (!err) {
641 pprintf(prn, _("\nFor the variables '%s' and '%s'\n"), dset->varname[vx],
642 dset->varname[vy]);
643 pprintf(prn, "%s = %.8f\n", _("Kendall's tau"), tau);
644 pputs(prn, _("Under the null hypothesis of no correlation:\n "));
645 pprintf(prn, _("z-score = %g, with two-tailed p-value %.4f\n"), z,
646 normal_pvalue_2(z));
647 }
648
649 if (opt & OPT_V) {
650 double *rx = NULL, *ry = NULL;
651
652 rankcorr_get_rankings(x, y, T, &rx, &ry, NULL, NULL);
653
654 if (rx != NULL && ry != NULL) {
655 print_raw_and_ranked(vx, vy, x, y, rx, ry, dset, prn);
656 free(rx);
657 free(ry);
658 }
659 }
660
661 pputc(prn, '\n');
662
663 free(xy);
664
665 return err;
666 }
667
668 #define LOCKE_DEBUG 0
669
randomize_doubles(const void * a,const void * b)670 static int randomize_doubles (const void *a, const void *b)
671 {
672 return gretl_rand_int_max(8096) - 4097;
673 }
674
675 /* check for negative values, screen out missing values,
676 and load x into an array for randomization */
677
locke_shuffle_init(const double * x,int * n,double ** psx)678 static int locke_shuffle_init (const double *x,
679 int *n, double **psx)
680 {
681 double *sx = NULL;
682 int i, m = 0;
683
684 for (i=0; i<*n; i++) {
685 if (x[i] < 0.0) {
686 return E_DATA;
687 }
688 if (!na(x[i])) {
689 m++;
690 }
691 }
692
693 if (m < 4) {
694 return E_DATA;
695 }
696
697 sx = malloc(m * sizeof *sx);
698 if (sx == NULL) {
699 return E_ALLOC;
700 }
701
702 *psx = sx;
703
704 m = 0;
705 for (i=0; i<*n; i++) {
706 if (!na(x[i])) {
707 sx[m++] = x[i];
708 }
709 }
710
711 m = 2 * m / 2;
712 *n = m;
713
714 return 0;
715 }
716
717 #define NREPEAT 100
718
719 /**
720 * lockes_test:
721 * @x: data series.
722 * @t1: start of sample range.
723 * @t2: end of sample range.
724 *
725 * Performs Charles Locke's nonparametric test for whether an
726 * empirical distribution (namely, that of @x over the range
727 * @t1 to @t2) is gamma. See C. Locke, "A Test for the Composite
728 * Hypothesis that a Population has a Gamma Distribution,"
729 * Commun. Statis.-Theor. Meth. A5(4), 351-364 (1976). Also
730 * see Shapiro and Chen, Journal of Quality Technology 33(1),
731 * Jan 2001.
732 *
733 * Returns: the z value for test, or #NADBL on error.
734 */
735
lockes_test(const double * x,int t1,int t2)736 double lockes_test (const double *x, int t1, int t2)
737 {
738 struct xy_pair *uv = NULL;
739 double *sx = NULL, *u = NULL, *v = NULL;
740 double zj, z;
741 int m = t2 - t1 + 1;
742 int i, j, t;
743 int err;
744
745 err = locke_shuffle_init(x + t1, &m, &sx);
746 if (err) {
747 return NADBL;
748 }
749
750 m /= 2;
751 u = malloc(m * sizeof *u);
752 v = malloc(m * sizeof *v);
753 uv = malloc(m * sizeof *uv);
754
755 if (u == NULL || v == NULL || uv == NULL) {
756 z = NADBL;
757 goto bailout;
758 }
759
760 z = 0.0;
761
762 /* repeat the shuffling of the series NREPEAT times, since the
763 test statistic is sensitive to the ordering under the null
764 */
765
766 for (j=0; j<NREPEAT; j++) {
767 #if LOCKE_DEBUG
768 fprintf(stderr, "locke's test: j = %d\n", j);
769 #endif
770 qsort(sx, 2 * m, sizeof *sx, randomize_doubles);
771 t = 0;
772 for (i=0; i<m; i++) {
773 u[i] = sx[t] + sx[t+1];
774 v[i] = sx[t] / sx[t+1];
775 if (sx[t+1] / sx[t] > v[i]) {
776 v[i] = sx[t+1] / sx[t];
777 }
778 t += 2;
779 }
780 real_kendall_tau(u, v, m, uv, m, NULL, &zj);
781 z += zj;
782 #if LOCKE_DEBUG
783 fprintf(stderr, "z[%d] = %g\n", j, zj);
784 #endif
785 }
786
787 z /= (double) NREPEAT;
788
789 #if LOCKE_DEBUG
790 fprintf(stderr, "Kendall's tau: average z = %g\n", z);
791 #endif
792
793 bailout:
794
795 free(u);
796 free(v);
797 free(uv);
798 free(sx);
799
800 return z;
801 }
802
803 /**
804 * runs_test:
805 * @v: ID number of the variable to process.
806 * @dset: dataset struct.
807 * @opt: %OPT_D to use first difference of variable, %OPT_E
808 * to assume positive and negative are equiprobable.
809 * @prn: gretl printing struct.
810 *
811 * Performs, and prints the results of, the runs test for randomness
812 * for the variable specified by @v. The normal approximation
813 * is that given in Gary Smith, Statistical Reasoning, 2e, p. 674.
814 *
815 * Returns: 0 on successful completion, non-zero on error.
816 */
817
runs_test(int v,const DATASET * dset,gretlopt opt,PRN * prn)818 int runs_test (int v, const DATASET *dset,
819 gretlopt opt, PRN *prn)
820 {
821 double xt, *x, mu, s2, sigma;
822 double N2, z, pval;
823 int Np, Nm;
824 int t, n, runs = 1;
825
826 n = dset->t2 - dset->t1 + 1;
827
828 x = malloc(n * sizeof *x);
829 if (x == NULL) {
830 return E_ALLOC;
831 }
832
833 n = 0;
834
835 if (opt & OPT_D) {
836 double xt1;
837
838 for (t=dset->t1 + 1; t<=dset->t2; t++) {
839 xt = dset->Z[v][t];
840 xt1 = dset->Z[v][t-1];
841 if (na(xt) || na(xt1)) {
842 continue;
843 }
844 x[n++] = xt - xt1;
845 }
846 } else {
847 for (t=dset->t1; t<=dset->t2; t++) {
848 xt = dset->Z[v][t];
849 if (na(xt)) {
850 continue;
851 }
852 x[n++] = xt;
853 }
854 }
855
856 if (n <= 1) {
857 free(x);
858 return E_TOOFEW;
859 }
860
861 Np = (x[0] > 0);
862 Nm = 1 - Np;
863
864 for (t=1; t<n; t++) {
865 if (x[t] > 0) {
866 Np++;
867 } else {
868 Nm++;
869 }
870 if ((x[t] > 0 && x[t-1] <= 0) || (x[t] <= 0 && x[t-1] > 0)) {
871 runs++;
872 }
873 }
874
875 if (opt & OPT_E) {
876 mu = 1.0 + n / 2.0;
877 s2 = ((double) n - 1) / 4.0;
878 } else {
879 /* don't assume that + and - are equiprobable */
880 N2 = 2.0 * Np * Nm;
881 mu = 1.0 + N2 / n;
882 s2 = (N2 * (N2 - n)) / (n*n * (n-1));
883 }
884
885 if (s2 <= 0) {
886 sigma = 0;
887 z = pval = NADBL;
888 } else {
889 sigma = sqrt(s2);
890 z = (runs - mu) / sigma;
891 pval = normal_pvalue_2(z);
892 }
893
894 if (opt & OPT_D) {
895 pprintf(prn, "\n%s\n", _("Runs test (first difference)"));
896 } else {
897 pprintf(prn, "\n%s\n", _("Runs test (level)"));
898 }
899
900 pprintf(prn, _("\nNumber of runs (R) in the variable '%s' = %d\n"),
901 dset->varname[v], runs);
902
903 if (na(z)) {
904 pprintf(prn, _("Test statistic cannot be computed: try "
905 "the deviation from the median?\n"));
906 } else {
907 if (opt & OPT_E) {
908 pprintf(prn, _("Under the null hypothesis of independence and equal "
909 "probability of positive\nand negative values, R "
910 "follows N(%g, %g)\n"), mu, sigma);
911 } else {
912 pprintf(prn, _("Under the null hypothesis of independence, R "
913 "follows N(%g, %g)\n"), mu, sigma);
914 }
915 pprintf(prn, _("z-score = %g, with two-tailed p-value %g\n"), z, pval);
916 }
917
918 pputc(prn, '\n');
919
920 record_test_result(z, pval);
921
922 free(x);
923
924 return 0;
925 }
926
print_z_prob(double z,PRN * prn)927 static double print_z_prob (double z, PRN *prn)
928 {
929 double pv = NADBL;
930
931 if (z > 0) {
932 pv = normal_pvalue_1(z);
933 if (!na(pv)) {
934 pprintf(prn, " P(Z > %g) = %g\n", z, pv);
935 }
936 } else if (z < 0) {
937 pv = normal_cdf(z);
938 if (!na(pv)) {
939 pprintf(prn, " P(Z < %g) = %g\n", z, pv);
940 }
941 }
942
943 if (!na(pv) && pv > 0) {
944 pprintf(prn, " %s = %g\n", _("Two-tailed p-value"), 2 * pv);
945 }
946
947 return pv;
948 }
949
diff_test_header(int v1,int v2,const DATASET * dset,PRN * prn)950 static void diff_test_header (int v1, int v2, const DATASET *dset,
951 PRN *prn)
952 {
953 pputc(prn, '\n');
954 pprintf(prn, _("Test for difference between %s and %s"),
955 dset->varname[v1], dset->varname[v2]);
956 }
957
958 static const int rank5[3][2] = {
959 { 0, 2 }, /* n = 6 */
960 { 2, 3 }, /* n = 7 */
961 { 3, 5 } /* n = 8 */
962 };
963
964 struct ranker {
965 double val;
966 double rank;
967 char c;
968 };
969
970 /* Wilcoxon signed-rank test, with handling of zero-differences and
971 non-zero ties, plus continuity correction, as in E. Cureton, "The
972 Normal Approximation to the Signed-Rank Sampling Distribution when
973 Zero Differences are Present", JASA(62), 1967, 1068-1069.
974 */
975
976 static int
signed_rank_test(const double * x,const double * y,int v1,int v2,const DATASET * dset,double * result,gretlopt opt,PRN * prn)977 signed_rank_test (const double *x, const double *y,
978 int v1, int v2, const DATASET *dset,
979 double *result, gretlopt opt, PRN *prn)
980 {
981 struct ranker *r;
982 double d, T, wp, wm;
983 double z = NADBL, pval = NADBL;
984 int quiet = (opt & OPT_Q);
985 int Z = 0, N = 0;
986 int i, k, t, n = 0;
987
988 for (t=dset->t1; t<=dset->t2; t++) {
989 if (!na(x[t]) && !na(y[t])) {
990 if (x[t] != y[t]) {
991 n++;
992 }
993 N++;
994 }
995 }
996
997 if (N == 0) {
998 return E_MISSDATA;
999 }
1000
1001 Z = N - n; /* number of zero-differences */
1002
1003 r = malloc(n * sizeof *r);
1004 if (r == NULL) {
1005 return E_ALLOC;
1006 }
1007
1008 i = 0;
1009 for (t=dset->t1; t<=dset->t2; t++) {
1010 if (!na(x[t]) && !na(y[t]) && x[t] != y[t]) {
1011 d = x[t] - y[t];
1012 r[i].val = fabs(d);
1013 r[i].c = (d > 0)? '+' : '-';
1014 i++;
1015 }
1016 }
1017
1018 qsort(r, n, sizeof *r, gretl_compare_doubles);
1019
1020 T = 0.0; /* non-zero ties correction */
1021 k = 0; /* number of non-zero ties */
1022
1023 for (i=0; i<n; i++) {
1024 int m = 0;
1025
1026 for (t=i+1; t<n && r[t].val == r[i].val; t++) {
1027 m++;
1028 }
1029
1030 if (m == 0) {
1031 r[i].rank = Z + i + 1;
1032 } else {
1033 double avg = (Z + i + 1 + Z + i + m + 1) / 2.0;
1034
1035 for (t=0; t<=m; t++) {
1036 r[i+t].rank = avg;
1037 }
1038 i += m;
1039 T += pow((double) m, 3) - m;
1040 k++;
1041 }
1042 }
1043
1044 if (!quiet) {
1045 diff_test_header(v1, v2, dset, prn);
1046 pprintf(prn, "\n%s\n", _("Wilcoxon Signed-Rank Test"));
1047 pprintf(prn, "%s: %s\n\n", _("Null hypothesis"),
1048 _("the median difference is zero"));
1049 if (opt & OPT_V) {
1050 pprintf(prn, "%16s %8s %16s\n\n", "difference", "rank",
1051 "signed rank");
1052 }
1053 }
1054
1055 wp = wm = 0.0;
1056
1057 for (i=0; i<n; i++) {
1058 d = r[i].rank;
1059 if (r[i].c == '-') {
1060 wm += d;
1061 d = -d;
1062 r[i].val = -r[i].val;
1063 } else {
1064 wp += d;
1065 }
1066 if (opt & OPT_V) {
1067 pprintf(prn, "%16g %8g %16g\n", r[i].val, r[i].rank, d);
1068 }
1069 }
1070
1071 if (opt & OPT_V) {
1072 pputc(prn, '\n');
1073 }
1074
1075 if (!quiet) {
1076 pprintf(prn, " n = %d\n", n);
1077 pprintf(prn, " W+ = %g, W- = %g\n", wp, wm);
1078 pprintf(prn, " (%s: %d, %s: %d)\n", _("zero differences"),
1079 Z, _("non-zero ties"), k);
1080 if (N > 0 && n == 0) {
1081 pprintf(prn, " %s\n", _("Fail to reject H0"));
1082 }
1083 }
1084
1085 if (n > 8) {
1086 double s2, x, num;
1087
1088 x = (N*(N+1) - Z*(Z+1)) / 4.0;
1089 s2 = (N*(N+1)*(2*N+1) - Z*(Z+1)*(2*Z+1) - T/2) / 24.0;
1090 if (!quiet) {
1091 pprintf(prn, " %s = %g\n", _("Expected value"), x);
1092 pprintf(prn, " %s = %g\n", _("Variance"), s2);
1093 }
1094 num = wp - x;
1095 if (num > 0.25) {
1096 num -= .5;
1097 } else {
1098 num += .5;
1099 }
1100 z = num / sqrt(s2);
1101 if (quiet) {
1102 pval = print_z_prob(z, NULL);
1103 } else {
1104 pprintf(prn, " z = %g\n", z);
1105 pval = print_z_prob(z, prn);
1106 }
1107 } else if (n > 5) {
1108 pprintf(prn, " 5%% critical values: %d (two-tailed), %d (one-tailed)\n",
1109 rank5[n-6][0], rank5[n-6][1]);
1110 } else {
1111 pprintf(prn, " %s\n",
1112 _("Sample too small for statistical significance"));
1113 }
1114
1115 if (!quiet) {
1116 pputc(prn, '\n');
1117 }
1118
1119 result[0] = z;
1120 result[1] = pval;
1121
1122 free(r);
1123
1124 return 0;
1125 }
1126
rank_sum_test(const double * x,const double * y,int v1,int v2,const DATASET * dset,double * result,gretlopt opt,PRN * prn)1127 static int rank_sum_test (const double *x, const double *y,
1128 int v1, int v2, const DATASET *dset,
1129 double *result, gretlopt opt, PRN *prn)
1130 {
1131 struct ranker *r;
1132 double wa, z = NADBL, pval = NADBL;
1133 char xc = 'a', yc = 'b';
1134 int na = 0, nb = 0;
1135 int quiet = (opt & OPT_Q);
1136 int i, t, n = 0;
1137
1138 for (t=dset->t1; t<=dset->t2; t++) {
1139 if (!na(x[t])) na++;
1140 if (!na(y[t])) nb++;
1141 }
1142
1143 if (na == 0 || nb == 0) {
1144 return E_MISSDATA;
1145 }
1146
1147 n = na + nb;
1148 r = malloc(n * sizeof *r);
1149 if (r == NULL) {
1150 return E_ALLOC;
1151 }
1152
1153 if (na > nb) {
1154 /* Make 'a' the smaller group */
1155 int tmp = na;
1156
1157 na = nb;
1158 nb = tmp;
1159 xc = 'b';
1160 yc = 'a';
1161 }
1162
1163 i = 0;
1164 for (t=dset->t1; t<=dset->t2; t++) {
1165 if (!na(x[t])) {
1166 r[i].val = x[t];
1167 r[i].c = xc;
1168 i++;
1169 }
1170 if (!na(y[t])) {
1171 r[i].val = y[t];
1172 r[i].c = yc;
1173 i++;
1174 }
1175 }
1176
1177 qsort(r, n, sizeof *r, gretl_compare_doubles);
1178
1179 for (i=0; i<n; i++) {
1180 int m = 1;
1181
1182 for (t=i+1; t<n && r[t].val == r[i].val; t++) {
1183 m++;
1184 }
1185
1186 if (m == 1) {
1187 r[i].rank = i + 1;
1188 } else {
1189 for (t=0; t<m; t++) {
1190 r[i+t].rank = (i + 1 + i + m) / 2.0;
1191 }
1192 i += m - 1;
1193 }
1194 }
1195
1196 if (!quiet) {
1197 diff_test_header(v1, v2, dset, prn);
1198 pprintf(prn, "\n%s\n", _("Wilcoxon Rank-Sum Test"));
1199 pprintf(prn, "%s: %s\n\n", _("Null hypothesis"),
1200 _("the two medians are equal"));
1201 }
1202
1203 wa = 0.0;
1204 for (i=0; i<n; i++) {
1205 if (r[i].c == 'a') {
1206 wa += r[i].rank;
1207 }
1208 }
1209
1210 if (opt & OPT_V) {
1211 pprintf(prn, "%10s %7s %8s\n\n", "value", "rank", "group");
1212 for (i=0; i<n; i++) {
1213 pprintf(prn, "%10g %7g %8c\n", r[i].val, r[i].rank, r[i].c);
1214 }
1215 pputc(prn, '\n');
1216 }
1217
1218 if (!quiet) {
1219 pprintf(prn, " n1 = %d, n2 = %d\n", na, nb);
1220 pprintf(prn, " w (%s) = %g\n", _("sum of ranks, sample 1"), wa);
1221 }
1222
1223 if (na >= 10 && nb >= 10) {
1224 double m, s;
1225
1226 m = na * (na + nb + 1) / 2.0;
1227 s = sqrt(na * nb * (na + nb + 1) / 12.0);
1228 z = (wa - m) / s;
1229 if (quiet) {
1230 pval = print_z_prob(z, NULL);
1231 } else {
1232 pprintf(prn, " z = (%g - %g) / %g = %g\n", wa, m, s, z);
1233 pval = print_z_prob(z, prn);
1234 }
1235 } else if (na >= 4 && nb >= 4 && nb <= 12) {
1236 void (*cv) (int, int, PRN *);
1237
1238 cv = get_plugin_function("rank_sum_lookup");
1239 if (cv != NULL) {
1240 (*cv)(na, nb, prn);
1241 }
1242 } else {
1243 pprintf(prn, " %s\n",
1244 _("Sample too small for statistical significance"));
1245 }
1246
1247 if (!quiet) {
1248 pputc(prn, '\n');
1249 }
1250
1251 result[0] = z;
1252 result[1] = pval;
1253
1254 free(r);
1255
1256 return 0;
1257 }
1258
sign_test(const double * x,const double * y,int v1,int v2,const DATASET * dset,double * result,gretlopt opt,PRN * prn)1259 static int sign_test (const double *x, const double *y,
1260 int v1, int v2, const DATASET *dset,
1261 double *result, gretlopt opt, PRN *prn)
1262 {
1263 double pv;
1264 int n, w, t;
1265
1266 n = w = 0;
1267 for (t=dset->t1; t<=dset->t2; t++) {
1268 if (!na(x[t]) && !na(y[t]) && x[t] != y[t]) {
1269 w += (x[t] > y[t]);
1270 n++;
1271 }
1272 }
1273
1274 if (n == 0) {
1275 return E_MISSDATA;
1276 }
1277
1278 if (w == 0) {
1279 pv = 1.0;
1280 } else {
1281 pv = binomial_cdf_comp(0.5, n, w - 1);
1282 }
1283
1284 if (!(opt & OPT_Q)) {
1285 diff_test_header(v1, v2, dset, prn);
1286 pprintf(prn, "\n%s\n\n", _("Sign Test"));
1287 pprintf(prn, _("Number of differences: n = %d\n"), n);
1288 pputs(prn, " ");
1289 pprintf(prn, _("Number of cases with %s > %s: w = %d (%.2f%%)\n"),
1290 dset->varname[v1], dset->varname[v2],
1291 w, 100.0 * w / n);
1292 pputs(prn, " ");
1293 pprintf(prn, _("Under the null hypothesis of no difference, W "
1294 "follows B(%d, %.1f)\n"), n, 0.5);
1295 pprintf(prn, " %s(W <= %d) = %g\n", _("Prob"), w,
1296 binomial_cdf(0.5, n, w));
1297 pprintf(prn, " %s(W >= %d) = %g\n\n", _("Prob"), w, pv);
1298 }
1299
1300 result[0] = w;
1301 result[1] = pv;
1302
1303 return 0;
1304 }
1305
1306 /**
1307 * diff_test:
1308 * @list: list containing 2 variables.
1309 * @dset: dataset struct.
1310 * @opt: %OPT_G, sign test; %OPT_R, rank sum; %OPT_I,
1311 * signed rank; %OPT_V, verbose (for rank tests).
1312 * @prn: gretl printing struct.
1313 *
1314 * Performs, and prints the results of, a non-parametric
1315 * test for a difference between two variables or groups.
1316 * The specific test performed depends on @opt.
1317 *
1318 * Returns: 0 on successful completion, non-zero on error.
1319 */
1320
diff_test(const int * list,const DATASET * dset,gretlopt opt,PRN * prn)1321 int diff_test (const int *list, const DATASET *dset,
1322 gretlopt opt, PRN *prn)
1323 {
1324 const double *x, *y;
1325 double result[2] = {0};
1326 int v1, v2;
1327 int err = 0;
1328
1329 if (list[0] != 2) {
1330 pputs(prn, _("This command requires two variables\n"));
1331 return E_DATA;
1332 }
1333
1334 v1 = list[1];
1335 v2 = list[2];
1336
1337 x = dset->Z[v1];
1338 y = dset->Z[v2];
1339
1340 if (opt == OPT_NONE || opt == OPT_V) {
1341 opt = OPT_G;
1342 }
1343
1344 if (opt & OPT_G) {
1345 err = sign_test(x, y, v1, v2, dset, result, opt, prn);
1346 } else if (opt & OPT_R) {
1347 err = rank_sum_test(x, y, v1, v2, dset, result, opt, prn);
1348 } else if (opt & OPT_I) {
1349 err = signed_rank_test(x, y, v1, v2, dset, result, opt, prn);
1350 }
1351
1352 record_test_result(result[0], result[1]);
1353
1354 return err;
1355 }
1356
1357 struct pair_sorter {
1358 double xi;
1359 double yi;
1360 int i;
1361 char *s;
1362 };
1363
1364 /**
1365 * sort_pairs_by_x:
1366 * @x: data vector by which to sort.
1367 * @y: data vector.
1368 * @order: location to receive sort order, or %NULL.
1369 * @labels: array of strings to be sorted along with
1370 * the data, or %NULL.
1371 *
1372 * Orders the elements of @x and @y by increasing value
1373 * of @x. Optionally, returns in @order an array of
1374 * integers representing the order in which the original
1375 * observations appear in the sorted vectors. Also
1376 * optionally sorts an accomanying array of observation
1377 * labels.
1378 *
1379 * Returns: 0 on successful completion, non-zero on error.
1380 */
1381
sort_pairs_by_x(gretl_matrix * x,gretl_matrix * y,int ** order,char ** labels)1382 int sort_pairs_by_x (gretl_matrix *x, gretl_matrix *y, int **order,
1383 char **labels)
1384 {
1385 struct pair_sorter *s = NULL;
1386 int t, T = gretl_vector_get_length(x);
1387 int err = 0;
1388
1389 if (T == 0 || gretl_vector_get_length(y) != T) {
1390 return E_NONCONF;
1391 }
1392
1393 s = malloc(T * sizeof *s);
1394 if (s == NULL) {
1395 return E_ALLOC;
1396 }
1397
1398 for (t=0; t<T; t++) {
1399 s[t].xi = x->val[t];
1400 s[t].yi = y->val[t];
1401 s[t].i = t;
1402 if (labels != NULL) {
1403 s[t].s = labels[t];
1404 } else {
1405 s[t].s = NULL;
1406 }
1407 }
1408
1409 qsort(s, T, sizeof *s, gretl_compare_doubles);
1410
1411 for (t=0; t<T; t++) {
1412 x->val[t] = s[t].xi;
1413 y->val[t] = s[t].yi;
1414 if (labels != NULL) {
1415 labels[t] = s[t].s;
1416 }
1417 }
1418
1419 if (order != NULL) {
1420 int *idx = malloc(T * sizeof *idx);
1421
1422 if (idx == NULL) {
1423 err = E_ALLOC;
1424 } else {
1425 for (t=0; t<T; t++) {
1426 idx[t] = s[t].i;
1427 }
1428 *order = idx;
1429 }
1430 }
1431
1432 free(s);
1433
1434 return err;
1435 }
1436
data_pre_sorted(const gretl_matrix * x)1437 static int data_pre_sorted (const gretl_matrix *x)
1438 {
1439 int t, T = gretl_vector_get_length(x);
1440
1441 for (t=1; t<T; t++) {
1442 if (x->val[t] < x->val[t-1]) {
1443 return 0;
1444 }
1445 }
1446
1447 return 1;
1448 }
1449
1450 #define LDEBUG 0
1451
1452 /* convenience struct for passing loess data */
1453
1454 struct loess_info {
1455 const gretl_matrix *y;
1456 const gretl_matrix *x;
1457 gretl_matrix *Xi;
1458 gretl_matrix *yi;
1459 gretl_matrix *wt;
1460 int d;
1461 int n;
1462 int N;
1463 int n_ok;
1464 int loo;
1465 };
1466
1467 /* Compute robustness weights for loess, if wanted: on input @rw
1468 contains the N residuals; on return it contains the robustness
1469 weights as a function of the residuals. For observations
1470 where y is missing, the weight is NADBL.
1471 */
1472
make_robustness_weights(gretl_matrix * rw,int N)1473 static int make_robustness_weights (gretl_matrix *rw, int N)
1474 {
1475 double *tmp = malloc(N * sizeof *tmp);
1476 double s, eis;
1477 int i, k, err = 0;
1478
1479 if (tmp == NULL) {
1480 return E_ALLOC;
1481 }
1482
1483 /* pack the absolute values of the non-missing residuals
1484 into tmp */
1485 k = 0;
1486 for (i=0; i<N; i++) {
1487 if (!na(rw->val[i])) {
1488 tmp[k++] = fabs(rw->val[i]);
1489 }
1490 }
1491
1492 /* find the median absolute residual */
1493 s = gretl_median(0, k-1, tmp);
1494 free(tmp);
1495
1496 if (na(s)) {
1497 err = E_DATA;
1498 } else {
1499 s *= 6;
1500 for (i=0; i<N && !err; i++) {
1501 if (!na(rw->val[i])) {
1502 eis = rw->val[i] / s;
1503 if (fabs(eis) < 1.0) {
1504 /* apply bisquare */
1505 rw->val[i] = (1 - eis * eis) * (1 - eis * eis);
1506 } else {
1507 rw->val[i] = 0.0;
1508 }
1509 }
1510 }
1511 }
1512
1513 return err;
1514 }
1515
1516 /* Multiply the robustness weights, @rw, into the regular
1517 weights, @wt, taking care to register the two vectors
1518 (rw is full-length while wt is local), and to skip any
1519 missing values in rw.
1520 */
1521
adjust_weights(const gretl_matrix * rw,gretl_matrix * wt,int a)1522 static void adjust_weights (const gretl_matrix *rw,
1523 gretl_matrix *wt, int a)
1524 {
1525 int k, n = gretl_vector_get_length(wt);
1526 int t = a;
1527
1528 for (k=0; k<n; k++) {
1529 wt->val[k] *= rw->val[t];
1530 if (k < n - 1) {
1531 t++;
1532 while (na(rw->val[t])) t++;
1533 }
1534 }
1535 }
1536
1537 /* Apply the weights in lo->wt to the local data matrices
1538 lo->Xi and lo->yi. This is straightforward since all
1539 the matrices have lo->n rows.
1540 */
1541
weight_local_data(struct loess_info * lo)1542 static void weight_local_data (struct loess_info *lo)
1543 {
1544 double xkj, wk;
1545 int k, j;
1546
1547 for (k=0; k<lo->n; k++) {
1548 wk = sqrt(lo->wt->val[k]);
1549 for (j=0; j<lo->Xi->cols; j++) {
1550 xkj = gretl_matrix_get(lo->Xi, k, j);
1551 gretl_matrix_set(lo->Xi, k, j, xkj * wk);
1552 }
1553 lo->yi->val[k] *= wk;
1554 }
1555
1556 #if LDEBUG > 1
1557 gretl_matrix_print(lo->Xi, "Xi, weighted");
1558 gretl_matrix_print(lo->yi, "yi, weighted");
1559 #endif
1560 }
1561
next_ok_obs(const double * y,int i)1562 static int next_ok_obs (const double *y, int i)
1563 {
1564 i++;
1565 while (na(y[i])) i++;
1566 return i;
1567 }
1568
loess_get_local_data(int i,int * pa,struct loess_info * lo,int * xconst)1569 static int loess_get_local_data (int i, int *pa,
1570 struct loess_info *lo,
1571 int *xconst)
1572 {
1573 const double *x = lo->x->val;
1574 const double *y = lo->y->val;
1575 double xk, xk1, xds, h = 0;
1576 int n = lo->n, n_ok = lo->n_ok;
1577 int k_skip = -1;
1578 int a, b, k, m, t;
1579 int err = 0;
1580
1581 gretl_matrix_zero(lo->Xi);
1582 gretl_matrix_zero(lo->yi);
1583 gretl_matrix_zero(lo->wt);
1584
1585 a = *pa;
1586
1587 #if LDEBUG
1588 fprintf(stderr, "\ni=%d, a=%d, n_ok=%d\n", i, a, n_ok);
1589 #endif
1590
1591 /* First determine where we should start reading the
1592 neighbors of xi: search rightward from a, so far as
1593 this is feasible.
1594 */
1595
1596 while (n_ok > n) {
1597 /* find b, the next point that would be included if
1598 we move the neighbor set one place to the right:
1599 start at a+1 and proceed until we have n points
1600 with valid y-values
1601 */
1602 for (m=0, b=a+1; ; b++) {
1603 if (!na(y[b]) && ++m == n) {
1604 break;
1605 }
1606 }
1607 if (fabs(x[i] - x[a]) < fabs(x[i] - x[b])) {
1608 /* the max distance increased: get out */
1609 break;
1610 }
1611 /* shift one (valid) place to the right */
1612 a = next_ok_obs(y, a);
1613 n_ok--;
1614 }
1615
1616 /* record status for next round */
1617 *pa = a;
1618 lo->n_ok = n_ok;
1619
1620 /* Having found the starting index for the n nearest neighbors
1621 of xi, transcribe the relevant data into Xi and yi. As we
1622 go, check whether x is constant in this sub-sample.
1623 */
1624
1625 *xconst = 1;
1626 xk1 = 0;
1627 t = a;
1628
1629 for (k=0; k<n; k++) {
1630 if (lo->loo && t == i) {
1631 /* leave-one-out: mark this observation */
1632 k_skip = k;
1633 }
1634 lo->yi->val[k] = y[t];
1635 xk = x[t];
1636 gretl_matrix_set(lo->Xi, k, 0, 1.0);
1637 gretl_matrix_set(lo->Xi, k, 1, xk);
1638 if (lo->Xi->cols > 2) {
1639 gretl_matrix_set(lo->Xi, k, 2, xk * xk);
1640 }
1641 if (*xconst && k > 0 && xk > xk1) {
1642 *xconst = 0;
1643 }
1644 xk1 = xk;
1645 if (k < n - 1) {
1646 t = next_ok_obs(y, t);
1647 }
1648 }
1649
1650 #if LDEBUG
1651 fprintf(stderr, " -> a=%d, n_ok=%d (xconst = %d)\n",
1652 a, n_ok, *xconst);
1653 #endif
1654
1655 if (!err) {
1656 /* find the max(abs) distance from xi */
1657 double h0 = fabs(x[i] - gretl_matrix_get(lo->Xi, 0, 1));
1658 double hn = fabs(x[i] - gretl_matrix_get(lo->Xi, n-1, 1));
1659
1660 h = (h0 > hn)? h0 : hn;
1661 }
1662
1663 /* compute scaled distances and tricube weights */
1664 for (k=0; k<n; k++) {
1665 if (k == k_skip) {
1666 /* exclude this obs via a zero weight? */
1667 lo->wt->val[k] = 0.0;
1668 continue;
1669 }
1670 xk = gretl_matrix_get(lo->Xi, k, 1);
1671 if (h == 0.0) {
1672 lo->wt->val[k] = 1.0;
1673 } else {
1674 xds = fabs(x[i] - xk) / h;
1675 if (xds < 1.0) {
1676 lo->wt->val[k] = pow(1.0 - pow(xds, 3.0), 3.0);
1677 }
1678 }
1679 #if LDEBUG > 1
1680 fprintf(stderr, "y=%10g, x=%10g, dist=%10g\n",
1681 lo->yi->val[k], xk, fabs(x[i] - xk));
1682 #endif
1683 }
1684
1685 #if LDEBUG > 1
1686 gretl_matrix_print(lo->Xi, "Xi");
1687 gretl_matrix_print(lo->yi, "yi");
1688 gretl_matrix_print(lo->wt, "wt");
1689 #endif
1690
1691 return err;
1692 }
1693
loess_count_usable_obs(const gretl_matrix * y,int N,int * amin)1694 static int loess_count_usable_obs (const gretl_matrix *y,
1695 int N, int *amin)
1696 {
1697 int i, n_ok = 0;
1698
1699 for (i=0; i<N; i++) {
1700 if (!na(y->val[i])) {
1701 if (n_ok == 0) {
1702 /* record the first usable obs index */
1703 *amin = i;
1704 }
1705 n_ok++;
1706 }
1707 }
1708
1709 return n_ok;
1710 }
1711
1712 /**
1713 * loess_fit:
1714 * @x: x-axis variable (must be pre-sorted).
1715 * @y: response variable.
1716 * @d: order for polynomial fit (0 <= d <= 2).
1717 * @q: bandwidth (0 < q <= 1).
1718 * @opt: give %OPT_R for robust variant (with re-weighting based on
1719 * the first-stage residuals).
1720 * @err: location to receive error code.
1721 *
1722 * Computes loess estimates based on William Cleveland, "Robust Locally
1723 * Weighted Regression and Smoothing Scatterplots", Journal of the
1724 * American Statistical Association, Vol. 74 (1979), pp. 829-836.
1725 * Typically one expects that @d = 1 and @q is in the neighborhood
1726 * of 0.5.
1727 *
1728 * The x,y pairs must be pre-sorted by increasing value of @x; an
1729 * error is flagged if this is not the case. See also
1730 * sort_pairs_by_x().
1731 *
1732 * Returns: allocated vector containing the loess fitted values, or
1733 * %NULL on failure.
1734 */
1735
loess_fit(const gretl_matrix * x,const gretl_matrix * y,int d,double q,gretlopt opt,int * err)1736 gretl_matrix *loess_fit (const gretl_matrix *x, const gretl_matrix *y,
1737 int d, double q, gretlopt opt, int *err)
1738 {
1739 struct loess_info lo;
1740 gretl_matrix_block *B;
1741 gretl_matrix *Xi, *yi, *wt, *b;
1742 gretl_matrix *yh = NULL;
1743 gretl_matrix *rw = NULL;
1744 int N = gretl_vector_get_length(y);
1745 int k, iters, Xic, amin = 0;
1746 int n_ok, robust = 0, loo = 0;
1747 int i, n;
1748
1749 if (d < 0 || d > 2 || q > 1.0) {
1750 *err = E_DATA;
1751 return NULL;
1752 }
1753
1754 if (!data_pre_sorted(x)) {
1755 gretl_errmsg_set("loess: the data must be sorted by x");
1756 *err = E_DATA;
1757 return NULL;
1758 }
1759
1760 /* check for usable data points */
1761 n_ok = loess_count_usable_obs(y, N, &amin);
1762 if (n_ok < 4) {
1763 *err = E_TOOFEW;
1764 return NULL;
1765 }
1766
1767 if (opt & OPT_O) {
1768 /* leave one out: experimental */
1769 loo = 1;
1770 }
1771
1772 /* check for q too small */
1773 if (q < (d + 1.0) / n_ok) {
1774 q = (d + 1.0) / n_ok;
1775 }
1776
1777 /* set the local sub-sample size */
1778 n = (int) ceil(q * n_ok);
1779
1780 /* we need a minimum of two columns in Xi, in order
1781 to compute the x-distance based weights */
1782 Xic = (d == 0)? 2 : d + 1;
1783
1784 B = gretl_matrix_block_new(&Xi, n, Xic,
1785 &yi, n, 1,
1786 &wt, n, 1,
1787 &b, d+1, 1,
1788 NULL);
1789 if (B == NULL) {
1790 *err = E_ALLOC;
1791 return NULL;
1792 }
1793
1794 /* vector to hold the fitted values */
1795 yh = gretl_column_vector_alloc(N);
1796 if (yh == NULL) {
1797 *err = E_ALLOC;
1798 goto bailout;
1799 }
1800
1801 if (opt & OPT_R) {
1802 /* extra storage for residuals/robustness weights */
1803 rw = gretl_column_vector_alloc(N);
1804 if (rw == NULL) {
1805 *err = E_ALLOC;
1806 goto bailout;
1807 }
1808 iters = 2; /* maybe 3? */
1809 robust = 1;
1810 } else {
1811 iters = 1;
1812 }
1813
1814 /* fill out the convenience struct */
1815 lo.y = y;
1816 lo.x = x;
1817 lo.Xi = Xi;
1818 lo.yi = yi;
1819 lo.wt = wt;
1820 lo.d = d;
1821 lo.n = n;
1822 lo.N = N;
1823 lo.loo = loo;
1824
1825 for (k=0; k<iters && !*err; k++) {
1826 /* iterations for robustness, if wanted */
1827 int a = amin;
1828
1829 lo.n_ok = n_ok;
1830
1831 for (i=0; i<N && !*err; i++) {
1832 /* iterate across points in full sample */
1833 double xi = x->val[i];
1834 int xconst = 0;
1835
1836 /* a is the leftmost possible index for starting to
1837 read the n nearest neighbors of xi.
1838 */
1839 *err = loess_get_local_data(i, &a, &lo, &xconst);
1840 if (*err) {
1841 break;
1842 }
1843
1844 if (k > 0) {
1845 /* We have robustness weights, rw, based on the residuals
1846 from the last round, which should be used to adjust
1847 the wt as computed in loess_get_local_data().
1848 */
1849 adjust_weights(rw, wt, a);
1850 }
1851
1852 /* apply weights to the local data */
1853 weight_local_data(&lo);
1854
1855 if (d == 0 || xconst) {
1856 /* not using x in the regressions: mask the x column(s) */
1857 gretl_matrix_reuse(Xi, -1, 1);
1858 if (b->rows > 1) {
1859 gretl_matrix_reuse(b, 1, 1);
1860 }
1861 }
1862
1863 /* run local WLS */
1864 *err = gretl_matrix_SVD_ols(yi, Xi, b, NULL, NULL, NULL);
1865
1866 if (!*err) {
1867 /* yh: evaluate the polynomial at xi */
1868 yh->val[i] = b->val[0];
1869 if (b->rows > 1) {
1870 yh->val[i] += b->val[1] * xi;
1871 if (b->rows == 3) {
1872 yh->val[i] += b->val[2] * xi * xi;
1873 }
1874 }
1875 if (robust && k < iters - 1) {
1876 /* save residual for robustness weights */
1877 if (na(y->val[i])) {
1878 rw->val[i] = NADBL;
1879 } else {
1880 rw->val[i] = y->val[i] - yh->val[i];
1881 }
1882 }
1883 }
1884
1885 /* ensure matrices are at full size */
1886 gretl_matrix_reuse(Xi, -1, Xic);
1887 gretl_matrix_reuse(b, d+1, -1);
1888
1889 } /* end loop over sample */
1890
1891 if (!*err && robust && k < iters - 1) {
1892 *err = make_robustness_weights(rw, N);
1893 }
1894 } /* end robustness iterations */
1895
1896 bailout:
1897
1898 gretl_matrix_block_destroy(B);
1899 gretl_matrix_free(rw);
1900
1901 if (*err) {
1902 gretl_matrix_free(yh);
1903 yh = NULL;
1904 }
1905
1906 return yh;
1907 }
1908
quartiles(const double * x,int n,double * q1,double * q3)1909 static double quartiles (const double *x, int n,
1910 double *q1, double *q3)
1911 {
1912 int n2 = n / 2;
1913 double xx;
1914
1915 xx = (n % 2)? x[n2] : 0.5 * (x[n2 - 1] + x[n2]);
1916
1917 if (q1 != NULL && q3 != NULL) {
1918 if (n % 2) {
1919 *q1 = quartiles(x, n2 + 1, NULL, NULL);
1920 *q3 = quartiles(x + n2, n2 + 1, NULL, NULL);
1921 } else {
1922 *q1 = quartiles(x, n2, NULL, NULL);
1923 *q3 = quartiles(x + n2, n2, NULL, NULL);
1924 }
1925 }
1926
1927 return xx;
1928 }
1929
kernel_bandwidth(const double * x,int n)1930 double kernel_bandwidth (const double *x, int n)
1931 {
1932 double s, w, q1, q3, r;
1933 double n5 = pow((double) n, -0.20);
1934
1935 s = gretl_stddev(0, n - 1, x);
1936 quartiles(x, n, &q1, &q3);
1937 r = (q3 - q1) / 1.349;
1938 w = (r > 0 && r < s)? r : s;
1939
1940 #if 0
1941 fprintf(stderr, "Silverman bandwidth: s=%g, q1=%g, q3=%g, IQR=%g, w=%g\n",
1942 s, q1, q3, q3 - q1, w);
1943 #endif
1944
1945 /* Silverman bandwidth times scale factor */
1946 return 0.9 * w * n5;
1947 }
1948