1 /*
2  *  gretl -- Gnu Regression, Econometrics and Time-series Library
3  *  Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 #include "libgretl.h"
21 
22 /**
23  * SECTION:nonparam
24  * @short_description: selected nonparametric routines
25  * @title: Nonparametric
26  * @include: libgretl.h
27  *
28  * Includes various nonparametric tests: rank correlation,
29  * runs (randomness), and differences between series.  Also
30  * "loess" regression a la William Cleveland.
31  */
32 
33 /* Spearman, one-tailed alpha = .005, .025, .05 */
34 
35 static double rhocrit[18][3] = {
36     { 0.8929, 0.7450, 0.6786 }, /*  n = 7 */
37     { 0.8571, 0.7143, 0.6190 }, /*  8 */
38     { 0.8167, 0.6833, 0.5833 }, /*  9 */
39     { 0.7818, 0.6364, 0.5515 }, /* 10 */
40     { 0.7545, 0.6091, 0.5273 }, /* 11 */
41     { 0.7273, 0.5804, 0.4965 }, /* 12 */
42     { 0.6978, 0.5549, 0.4780 }, /* 13 */
43     { 0.6747, 0.5341, 0.4593 }, /* 14 */
44     { 0.6536, 0.5179, 0.4429 }, /* 15 */
45     { 0.6324, 0.5000, 0.4265 }, /* 16 */
46     { 0.6152, 0.4853, 0.4118 }, /* 17 */
47     { 0.5975, 0.4716, 0.3994 }, /* 18 */
48     { 0.5825, 0.4579, 0.3895 }, /* 19 */
49     { 0.5684, 0.4451, 0.3789 }, /* 20 */
50     { 0.5545, 0.4351, 0.3688 }, /* 21 */
51     { 0.5426, 0.4241, 0.3597 }, /* 22 */
52     { 0.5306, 0.4150, 0.3518 }, /* 23 */
53     { 0.5200, 0.4061, 0.3435 }  /* 24 */
54 };
55 
56 /* The values above are from Daniel and Terrell,
57    Business Statistics, 4e (Houghton Mifflin, 1986)
58 */
59 
spearman_signif(double rho,int n)60 static double spearman_signif (double rho, int n)
61 {
62     const double *x;
63 
64     if (n < 7) {
65 	return 1.0;
66     }
67 
68     x = rhocrit[n - 7];
69 
70     if (rho > x[0]) return .01;
71     if (rho > x[1]) return .05;
72     if (rho > x[2]) return .10;
73 
74     return 1.0;
75 }
76 
77 enum {
78     RANK_X,
79     RANK_Y
80 };
81 
82 /* m = number of sorted values (sz); n = number of raw values (either
83    x or y); in case of missing values in x and/or y we may have m < n.
84 */
85 
make_ranking(const double * sz,int m,const double * x,const double * y,int n,double * rz,int * ties,int which)86 static void make_ranking (const double *sz, int m,
87 			  const double *x, const double *y,
88 			  int n, double *rz, int *ties,
89 			  int which)
90 {
91     const double *z;
92     int cases, k, i, j;
93     double avg, r = 1;
94 
95     z = (which == RANK_X)? x : y;
96 
97     for (i=0; i<m; i++) {
98 	/* scan sorted z */
99 	cases = k = 0;
100 
101 	if (i > 0 && sz[i] == sz[i-1]) {
102 	    continue;
103 	}
104 
105 	for (j=0; j<n; j++) {
106 	    /* scan raw z for matches */
107 	    if (!na(x[j]) && !na(y[j])) {
108 		if (z[j] == sz[i]) {
109 		    rz[k] = r;
110 		    cases++;
111 		}
112 		k++;
113 	    }
114 	}
115 
116 	if (cases > 1) {
117 	    avg = (r + r + cases - 1.0) / 2.0;
118 	    for (j=0; j<m; j++) {
119 		if (rz[j] == r) {
120 		    rz[j] = avg;
121 		}
122 	    }
123 	    if (ties != NULL) {
124 		*ties = 1;
125 	    }
126 	}
127 
128 	r += cases;
129     }
130 }
131 
rankcorr_get_rankings(const double * x,const double * y,int n,double ** rxout,double ** ryout,int * pm,int * ties)132 static int rankcorr_get_rankings (const double *x, const double *y, int n,
133 				  double **rxout, double **ryout,
134 				  int *pm, int *ties)
135 {
136     double *sx = NULL, *sy = NULL;
137     double *rx = NULL, *ry = NULL;
138     int i, m = 0;
139 
140     /* count non-missing pairs */
141     for (i=0; i<n; i++) {
142 	if (!na(x[i]) && !na(y[i])) {
143 	    m++;
144 	}
145     }
146 
147     if (m < 2) {
148 	return E_DATA;
149     }
150 
151     sx = malloc(m * sizeof *sx);
152     sy = malloc(m * sizeof *sy);
153     rx = malloc(m * sizeof *rx);
154     ry = malloc(m * sizeof *ry);
155 
156     if (sx == NULL || sy == NULL ||
157 	rx == NULL || ry == NULL) {
158 	free(sx);
159 	free(sy);
160 	free(rx);
161 	free(ry);
162 	return E_ALLOC;
163     }
164 
165     /* copy non-missing x and y into sx, sy */
166     m = 0;
167     for (i=0; i<n; i++) {
168 	if (!na(x[i]) && !na(y[i])) {
169 	    sx[m] = x[i];
170 	    sy[m] = y[i];
171 	    m++;
172 	}
173     }
174 
175     /* get sorted series */
176     qsort(sx, m, sizeof *sx, gretl_inverse_compare_doubles);
177     qsort(sy, m, sizeof *sy, gretl_inverse_compare_doubles);
178 
179     for (i=0; i<m; i++) {
180 	rx[i] = ry[i] = 0.0;
181     }
182 
183     /* make rankings by comparing "raw" x, y with sorted */
184     make_ranking(sx, m, x, y, n, rx, ties, RANK_X);
185     make_ranking(sy, m, x, y, n, ry, ties, RANK_Y);
186 
187     /* save the ranks */
188     *rxout = rx;
189     *ryout = ry;
190 
191     if (pm != NULL) {
192 	*pm = m;
193     }
194 
195     free(sx);
196     free(sy);
197 
198     return 0;
199 }
200 
real_spearman_rho(const double * x,const double * y,int n,double * rho,double * zval,double ** rxout,double ** ryout,int * pm)201 static int real_spearman_rho (const double *x, const double *y, int n,
202 			      double *rho, double *zval,
203 			      double **rxout, double **ryout,
204 			      int *pm)
205 {
206     double *rx = NULL, *ry = NULL;
207     int m, ties = 0;
208     int err = 0;
209 
210     *rho = *zval = NADBL;
211 
212     if (n < 2) {
213 	return E_TOOFEW;
214     }
215 
216     err = rankcorr_get_rankings(x, y, n, &rx, &ry, &m, &ties);
217     if (err) {
218 	return err;
219     }
220 
221     /* Pearson correlation in ranks */
222     *rho = gretl_corr(0, m - 1, rx, ry, NULL);
223 
224     /* save the ranks, if wanted */
225     if (rxout != NULL && ry != NULL) {
226 	*rxout = rx;
227 	*ryout = ry;
228     } else {
229 	free(rx);
230 	free(ry);
231     }
232 
233     *pm = m;
234 
235     return err;
236 }
237 
spearman_rho_func(const double * x,const double * y,int n,int * err)238 gretl_matrix *spearman_rho_func (const double *x,
239 				 const double *y,
240 				 int n, int *err)
241 {
242     gretl_matrix *ret = NULL;
243     double z, rho = NADBL;
244     int m = 0;
245 
246     *err = real_spearman_rho(x, y, n, &rho, &z,
247 			     NULL, NULL, &m);
248 
249     if (!*err) {
250 	ret = gretl_matrix_alloc(1, 3);
251 	if (ret == NULL) {
252 	    *err = E_ALLOC;
253 	} else {
254 	    ret->val[0] = rho;
255 	    if (!na(z)) {
256 		ret->val[1] = z;
257 		ret->val[2] = normal_pvalue_2(z);
258 	    } else if (m > 24) {
259 		ret->val[1] = rho * sqrt((m - 2) / (1 - rho*rho));
260 		ret->val[2] = student_pvalue_2(m - 2, ret->val[1]);
261 	    } else {
262 		ret->val[1] = ret->val[2] = NADBL;
263 	    }
264 	}
265     }
266 
267     return ret;
268 }
269 
print_raw_and_ranked(int vx,int vy,const double * x,const double * y,const double * rx,const double * ry,const DATASET * dset,PRN * prn)270 static void print_raw_and_ranked (int vx, int vy,
271 				  const double *x, const double *y,
272 				  const double *rx, const double *ry,
273 				  const DATASET *dset,
274 				  PRN *prn)
275 {
276     int T = dset->t2 - dset->t1 + 1;
277     int obslen = max_obs_marker_length(dset);
278     int n1 = strlen(dset->varname[vx]);
279     int n2 = strlen(dset->varname[vy]);
280     int t, i = 0;
281 
282     n1 = (n2 > n1)? n2 : n1;
283     n1 = (n1 < 15)? 15 : n1;
284     pputc(prn, '\n');
285     bufspace(obslen + 1, prn);
286     pprintf(prn, "%*s%8s%*s%8s\n\n", n1, dset->varname[vx], _("rank"),
287 	    n1, dset->varname[vy], _("rank"));
288 
289     for (t=0; t<T; t++) {
290 	print_obs_marker(t + dset->t1, dset, obslen, prn);
291 	if (!(na(x[t])) && !(na(y[t]))) {
292 	    pprintf(prn, "%#*g", n1, x[t]);
293 	    pprintf(prn, "%8g", rx[i]);
294 	    pprintf(prn, "%#*g", n1, y[t]);
295 	    pprintf(prn, "%8g", ry[i]);
296 	    i++;
297 	}
298 	pputc(prn, '\n');
299     }
300 }
301 
302 /**
303  * spearman_rho:
304  * @list: list of (two) variables to process.
305  * @dset: dataset struct.
306  * @opt: if includes %OPT_V, print both the "raw" and the ranked data.
307  * @prn: gretl printing struct.
308  *
309  * Calculates and prints Spearman's rank correlation coefficient for the two
310  * variables specified in the @list.
311  *
312  * Returns: 0 on successful completion, 1 on error.
313  */
314 
spearman_rho(const int * list,const DATASET * dset,gretlopt opt,PRN * prn)315 int spearman_rho (const int *list, const DATASET *dset,
316 		  gretlopt opt, PRN *prn)
317 {
318     int T = dset->t2 - dset->t1 + 1;
319     double *rx = NULL, *ry = NULL;
320     const double *x, *y;
321     double zval, rho = 0;
322     int vx, vy, m;
323     int err;
324 
325     if (list == NULL || list[0] != 2) {
326 	pputs(prn, _("This command requires two variables\n"));
327 	return 1;
328     }
329 
330     vx = list[1];
331     vy = list[2];
332 
333     x = dset->Z[vx] + dset->t1;
334     y = dset->Z[vy] + dset->t1;
335 
336     err = real_spearman_rho(x, y, T, &rho, &zval,
337 			    (opt & OPT_V)? &rx : NULL,
338 			    (opt & OPT_V)? &ry : NULL,
339 			    &m);
340     if (err) {
341 	return err;
342     }
343 
344     pprintf(prn, _("\nFor the variables '%s' and '%s'\n"), dset->varname[vx],
345 	    dset->varname[vy]);
346 
347     if (na(rho)) {
348 	pputs(prn, _("Spearman's rank correlation is undefined\n"));
349 	return err;
350     }
351 
352     pprintf(prn, _("Spearman's rank correlation coefficient (rho) = %.8f\n"), rho);
353 
354     if (rho == 0.0) {
355 	goto skipit;
356     }
357 
358     if (!na(zval)) {
359 	pputs(prn, _("Under the null hypothesis of no correlation:\n "));
360 	pprintf(prn, _("z-score = %g, with two-tailed p-value %.4f\n"), zval,
361 		normal_pvalue_2(zval));
362     } else if (m > 24) {
363 	double tval = rho * sqrt((m - 2) / (1 - rho*rho));
364 
365 	pputs(prn, _("Under the null hypothesis of no correlation:\n "));
366 	pprintf(prn, _("t(%d) = %g, with two-tailed p-value %.4f\n"), m - 2,
367 		tval, student_pvalue_2(m - 2, tval));
368     } else if (m >= 7) {
369 	double pval = spearman_signif(fabs(rho), m);
370 
371 	if (pval < 1.0) {
372 	    pprintf(prn, _("significant at the %g%% level (two-tailed)\n"),
373 		    100.0 * pval);
374 	} else {
375 	    /* xgettext:no-c-format */
376 	    pputs(prn, _("not significant at the 10% level\n"));
377 	}
378     } else {
379 	pputs(prn, _("Sample is too small to calculate a p-value based on "
380 		     "the normal distribution\n"));
381     }
382 
383  skipit:
384 
385     if (rx != NULL && ry != NULL) {
386 	print_raw_and_ranked(vx, vy, x, y, rx, ry, dset, prn);
387 	free(rx);
388 	free(ry);
389     }
390 
391     pputc(prn, '\n');
392 
393     return 0;
394 }
395 
396 struct xy_pair {
397     double x;
398     double y;
399 };
400 
compare_pairs_x(const void * a,const void * b)401 static int compare_pairs_x (const void *a, const void *b)
402 {
403     const struct xy_pair *pa = a;
404     const struct xy_pair *pb = b;
405     int ret;
406 
407     ret = (pa->x > pb->x) - (pa->x < pb->x);
408     if (ret == 0) {
409 	ret = (pa->y > pb->y) - (pa->y < pb->y);
410     }
411 
412     return ret;
413 }
414 
compare_pairs_y(const void * a,const void * b)415 static int compare_pairs_y (const void *a, const void *b)
416 {
417     const struct xy_pair *pa = a;
418     const struct xy_pair *pb = b;
419 
420     return (pa->y > pb->y) - (pa->y < pb->y);
421 }
422 
real_kendall_tau(const double * x,const double * y,int n,struct xy_pair * xy,int nn,double * ptau,double * pz)423 static int real_kendall_tau (const double *x, const double *y,
424 			     int n, struct xy_pair *xy, int nn,
425 			     double *ptau, double *pz)
426 {
427     double tau, nn1, s2, z;
428     int tt1, tx = 0, ty = 0;
429     int Tx = 0, Ty = 0;
430     int Tx2 = 0, Ty2 = 0;
431     int Tx25 = 0, Ty25 = 0;
432     int tie, N0, N1, S;
433     int i, j;
434 
435     /* populate sorter */
436     j = 0;
437     for (i=0; i<n; i++) {
438 	if (!na(x[i]) && !na(y[i])) {
439 	    xy[j].x = x[i];
440 	    xy[j].y = y[i];
441 	    j++;
442 	}
443     }
444 
445     /* sort pairs by x */
446     qsort(xy, nn, sizeof *xy, compare_pairs_x);
447 
448     /* make order counts */
449     N0 = N1 = 0;
450     for (i=0; i<nn; i++) {
451 	for (j=i+1; j<nn; j++) {
452 	    if (xy[j].y == xy[i].y) {
453 		Ty++;
454 	    } else if (xy[j].x != xy[i].x) {
455 		if (xy[j].y > xy[i].y) {
456 		    N0++;
457 		} else if (xy[j].y < xy[i].y) {
458 		    N1++;
459 		}
460 	    }
461 	}
462 	if (i > 0) {
463 	    /* account for ties in x */
464 	    tie = (xy[i].x == xy[i-1].x);
465 	    if (tie) {
466 		tx++;
467 	    }
468 	    if (tx > 0 && (!tie || i == nn - 1)) {
469 		tx++;
470 		tt1 = tx * (tx - 1);
471 		Tx += tt1;
472 		Tx2 += tt1 * (tx - 2);
473 		Tx25 += tt1 * (2 * tx + 5);
474 		tx = 0;
475 	    }
476 	}
477     }
478 
479     if (Ty > 0) {
480 	/* account for ties in y */
481 	Ty = 0;
482 	qsort(xy, nn, sizeof *xy, compare_pairs_y);
483 	for (i=1; i<nn; i++) {
484 	    tie = (xy[i].y == xy[i-1].y);
485 	    if (tie) {
486 		ty++;
487 	    }
488 	    if (ty > 0 && (!tie || i == nn - 1)) {
489 		ty++;
490 		tt1 = ty * (ty - 1);
491 		Ty += tt1;
492 		Ty2 += tt1 * (ty - 2);
493 		Ty25 += tt1 * (2 * ty + 5);
494 		ty = 0;
495 	    }
496 	}
497     }
498 
499     S = N0 - N1;
500 
501 #if 0
502     fprintf(stderr, "N0 = %d, N1 = %d, S = %d\n", N0, N1, S);
503     fprintf(stderr, "Tx = %d, Ty = %d\n", Tx, Ty);
504 #endif
505 
506     nn1 = nn * (nn - 1.0);
507 
508     /* normal approximation as in Shapiro and Chen,
509        Journal of Quality Technology, 2001 */
510 
511     if (Tx == 0 && Ty == 0) {
512 	tau = 2 * S / nn1;
513 	s2 = (1.0/18) * nn1 * (2 * nn + 5);
514     } else {
515 	double den = (nn1 - Tx) * (nn1 - Ty);
516 
517 	tau = 2 * S / sqrt(den);
518 	s2 = (1.0/18) * (nn1 * (2 * nn + 5) - Tx25 - Ty25);
519 	if (Tx2 != 0 && Ty2 != 0) {
520 	    s2 += (1.0/(9*nn1*(nn-2))) * Tx2 * Ty2;
521 	}
522 	if (Tx != 0 && Ty != 0) {
523 	    s2 += (1.0/(2*nn1)) * Tx * Ty;
524 	}
525     }
526 
527     z = (S - 1) / sqrt(s2);
528 
529     if (ptau != NULL) {
530 	*ptau = tau;
531     }
532 
533     if (pz != NULL) {
534 	*pz = z;
535     }
536 
537     return 0;
538 }
539 
kendall_tau_func(const double * x,const double * y,int n,int * err)540 gretl_matrix *kendall_tau_func (const double *x,
541 				const double *y,
542 				int n, int *err)
543 {
544     gretl_matrix *ret = NULL;
545     struct xy_pair *xy;
546     double z, tau = NADBL;
547     int i, nn = 0;
548 
549     /* count valid pairs */
550     for (i=0; i<n; i++) {
551 	if (!na(x[i]) && !na(y[i])) {
552 	    nn++;
553 	}
554     }
555 
556     if (nn < 2) {
557 	*err = E_TOOFEW;
558 	return NULL;
559     }
560 
561     xy = malloc(nn * sizeof *xy);
562 
563     if (xy == NULL) {
564 	*err = E_ALLOC;
565     } else {
566 	*err = real_kendall_tau(x, y, n, xy, nn, &tau, &z);
567     }
568 
569     free(xy);
570 
571     if (!*err) {
572 	ret = gretl_matrix_alloc(1, 3);
573 	if (ret == NULL) {
574 	    *err = E_ALLOC;
575 	} else {
576 	    ret->val[0] = tau;
577 	    ret->val[1] = z;
578 	    ret->val[2] = normal_pvalue_2(z);
579 	}
580     }
581 
582     return ret;
583 }
584 
585 /**
586  * kendall_tau:
587  * @list: list of (two) variables to process.
588  * @dset: dataset struct.
589  * @opt: if includes %OPT_V, print both the "raw" and the ranked data.
590  * @prn: gretl printing struct.
591  *
592  * Calculates and prints Kendall's rank correlation tau statistic for
593  * the two variables specified in @list.
594  *
595  * Returns: 0 on successful completion, 1 on error.
596  */
597 
kendall_tau(const int * list,const DATASET * dset,gretlopt opt,PRN * prn)598 int kendall_tau (const int *list, const DATASET *dset,
599 		 gretlopt opt, PRN *prn)
600 {
601     struct xy_pair *xy = NULL;
602     int T = dset->t2 - dset->t1 + 1;
603     const double *x, *y;
604     double tau, z;
605     int vx, vy;
606     int t, nn = 0;
607     int err = 0;
608 
609     if (list == NULL || list[0] != 2) {
610 	pputs(prn, _("This command requires two variables\n"));
611 	return 1;
612     }
613 
614     vx = list[1];
615     vy = list[2];
616 
617     x = dset->Z[vx] + dset->t1;
618     y = dset->Z[vy] + dset->t1;
619 
620     /* count valid pairs */
621     for (t=0; t<T; t++) {
622 	if (!na(x[t]) && !na(y[t])) {
623 	    nn++;
624 	}
625     }
626 
627     if (nn < 2) {
628 	return E_TOOFEW;
629     }
630 
631     /* allocate */
632     xy = malloc(nn * sizeof *xy);
633     if (xy == NULL) {
634 	return E_ALLOC;
635     }
636 
637     /* calculate */
638     err = real_kendall_tau(x, y, T, xy, nn, &tau, &z);
639 
640     if (!err) {
641 	pprintf(prn, _("\nFor the variables '%s' and '%s'\n"), dset->varname[vx],
642 		dset->varname[vy]);
643 	pprintf(prn, "%s = %.8f\n", _("Kendall's tau"), tau);
644 	pputs(prn, _("Under the null hypothesis of no correlation:\n "));
645 	pprintf(prn, _("z-score = %g, with two-tailed p-value %.4f\n"), z,
646 		normal_pvalue_2(z));
647     }
648 
649     if (opt & OPT_V) {
650 	double *rx = NULL, *ry = NULL;
651 
652 	rankcorr_get_rankings(x, y, T, &rx, &ry, NULL, NULL);
653 
654 	if (rx != NULL && ry != NULL) {
655 	    print_raw_and_ranked(vx, vy, x, y, rx, ry, dset, prn);
656 	    free(rx);
657 	    free(ry);
658 	}
659     }
660 
661     pputc(prn, '\n');
662 
663     free(xy);
664 
665     return err;
666 }
667 
668 #define LOCKE_DEBUG 0
669 
randomize_doubles(const void * a,const void * b)670 static int randomize_doubles (const void *a, const void *b)
671 {
672     return gretl_rand_int_max(8096) - 4097;
673 }
674 
675 /* check for negative values, screen out missing values,
676    and load x into an array for randomization */
677 
locke_shuffle_init(const double * x,int * n,double ** psx)678 static int locke_shuffle_init (const double *x,
679 			       int *n, double **psx)
680 {
681     double *sx = NULL;
682     int i, m = 0;
683 
684     for (i=0; i<*n; i++) {
685 	if (x[i] < 0.0) {
686 	    return E_DATA;
687 	}
688 	if (!na(x[i])) {
689 	    m++;
690 	}
691     }
692 
693     if (m < 4) {
694 	return E_DATA;
695     }
696 
697     sx = malloc(m * sizeof *sx);
698     if (sx == NULL) {
699 	return E_ALLOC;
700     }
701 
702     *psx = sx;
703 
704     m = 0;
705     for (i=0; i<*n; i++) {
706 	if (!na(x[i])) {
707 	    sx[m++] = x[i];
708 	}
709     }
710 
711     m = 2 * m / 2;
712     *n = m;
713 
714     return 0;
715 }
716 
717 #define NREPEAT 100
718 
719 /**
720  * lockes_test:
721  * @x: data series.
722  * @t1: start of sample range.
723  * @t2: end of sample range.
724  *
725  * Performs Charles Locke's nonparametric test for whether an
726  * empirical distribution (namely, that of @x over the range
727  * @t1 to @t2) is gamma.  See C. Locke, "A Test for the Composite
728  * Hypothesis that a Population has a Gamma Distribution,"
729  * Commun. Statis.-Theor. Meth. A5(4), 351-364 (1976).  Also
730  * see Shapiro and Chen, Journal of Quality Technology 33(1),
731  * Jan 2001.
732  *
733  * Returns: the z value for test, or #NADBL on error.
734  */
735 
lockes_test(const double * x,int t1,int t2)736 double lockes_test (const double *x, int t1, int t2)
737 {
738     struct xy_pair *uv = NULL;
739     double *sx = NULL, *u = NULL, *v = NULL;
740     double zj, z;
741     int m = t2 - t1 + 1;
742     int i, j, t;
743     int err;
744 
745     err = locke_shuffle_init(x + t1, &m, &sx);
746     if (err) {
747 	return NADBL;
748     }
749 
750     m /= 2;
751     u = malloc(m * sizeof *u);
752     v = malloc(m * sizeof *v);
753     uv = malloc(m * sizeof *uv);
754 
755     if (u == NULL || v == NULL || uv == NULL) {
756 	z = NADBL;
757 	goto bailout;
758     }
759 
760     z = 0.0;
761 
762     /* repeat the shuffling of the series NREPEAT times, since the
763        test statistic is sensitive to the ordering under the null
764     */
765 
766     for (j=0; j<NREPEAT; j++) {
767 #if LOCKE_DEBUG
768 	fprintf(stderr, "locke's test: j = %d\n", j);
769 #endif
770 	qsort(sx, 2 * m, sizeof *sx, randomize_doubles);
771 	t = 0;
772 	for (i=0; i<m; i++) {
773 	    u[i] = sx[t] + sx[t+1];
774 	    v[i] = sx[t] / sx[t+1];
775 	    if (sx[t+1] / sx[t] > v[i]) {
776 		v[i] = sx[t+1] / sx[t];
777 	    }
778 	    t += 2;
779 	}
780 	real_kendall_tau(u, v, m, uv, m, NULL, &zj);
781 	z += zj;
782 #if LOCKE_DEBUG
783 	fprintf(stderr, "z[%d] = %g\n", j, zj);
784 #endif
785     }
786 
787     z /= (double) NREPEAT;
788 
789 #if LOCKE_DEBUG
790     fprintf(stderr, "Kendall's tau: average z = %g\n", z);
791 #endif
792 
793  bailout:
794 
795     free(u);
796     free(v);
797     free(uv);
798     free(sx);
799 
800     return z;
801 }
802 
803 /**
804  * runs_test:
805  * @v: ID number of the variable to process.
806  * @dset: dataset struct.
807  * @opt: %OPT_D to use first difference of variable, %OPT_E
808  * to assume positive and negative are equiprobable.
809  * @prn: gretl printing struct.
810  *
811  * Performs, and prints the results of, the runs test for randomness
812  * for the variable specified by @v.  The normal approximation
813  * is that given in Gary Smith, Statistical Reasoning, 2e, p. 674.
814  *
815  * Returns: 0 on successful completion, non-zero on error.
816  */
817 
runs_test(int v,const DATASET * dset,gretlopt opt,PRN * prn)818 int runs_test (int v, const DATASET *dset,
819 	       gretlopt opt, PRN *prn)
820 {
821     double xt, *x, mu, s2, sigma;
822     double N2, z, pval;
823     int Np, Nm;
824     int t, n, runs = 1;
825 
826     n = dset->t2 - dset->t1 + 1;
827 
828     x = malloc(n * sizeof *x);
829     if (x == NULL) {
830 	return E_ALLOC;
831     }
832 
833     n = 0;
834 
835     if (opt & OPT_D) {
836 	double xt1;
837 
838 	for (t=dset->t1 + 1; t<=dset->t2; t++) {
839 	    xt = dset->Z[v][t];
840 	    xt1 = dset->Z[v][t-1];
841 	    if (na(xt) || na(xt1)) {
842 		continue;
843 	    }
844 	    x[n++] = xt - xt1;
845 	}
846     } else {
847 	for (t=dset->t1; t<=dset->t2; t++) {
848 	    xt = dset->Z[v][t];
849 	    if (na(xt)) {
850 		continue;
851 	    }
852 	    x[n++] = xt;
853 	}
854     }
855 
856     if (n <= 1) {
857 	free(x);
858 	return E_TOOFEW;
859     }
860 
861     Np = (x[0] > 0);
862     Nm = 1 - Np;
863 
864     for (t=1; t<n; t++) {
865 	if (x[t] > 0) {
866 	    Np++;
867 	} else {
868 	    Nm++;
869 	}
870 	if ((x[t] > 0 && x[t-1] <= 0) || (x[t] <= 0 && x[t-1] > 0)) {
871 	    runs++;
872 	}
873     }
874 
875     if (opt & OPT_E) {
876 	mu = 1.0 + n / 2.0;
877 	s2 = ((double) n - 1) / 4.0;
878     } else {
879 	/* don't assume that + and - are equiprobable */
880 	N2 = 2.0 * Np * Nm;
881 	mu = 1.0 + N2 / n;
882 	s2 = (N2 * (N2 - n)) / (n*n * (n-1));
883     }
884 
885     if (s2 <= 0) {
886 	sigma = 0;
887 	z = pval = NADBL;
888     } else {
889 	sigma = sqrt(s2);
890 	z = (runs - mu) / sigma;
891 	pval = normal_pvalue_2(z);
892     }
893 
894     if (opt & OPT_D) {
895 	pprintf(prn, "\n%s\n", _("Runs test (first difference)"));
896     } else {
897 	pprintf(prn, "\n%s\n", _("Runs test (level)"));
898     }
899 
900     pprintf(prn, _("\nNumber of runs (R) in the variable '%s' = %d\n"),
901 	    dset->varname[v], runs);
902 
903     if (na(z)) {
904 	pprintf(prn, _("Test statistic cannot be computed: try "
905 		       "the deviation from the median?\n"));
906     } else {
907 	if (opt & OPT_E) {
908 	    pprintf(prn, _("Under the null hypothesis of independence and equal "
909 			   "probability of positive\nand negative values, R "
910 			   "follows N(%g, %g)\n"), mu, sigma);
911 	} else {
912 	    pprintf(prn, _("Under the null hypothesis of independence, R "
913 			   "follows N(%g, %g)\n"), mu, sigma);
914 	}
915 	pprintf(prn, _("z-score = %g, with two-tailed p-value %g\n"), z, pval);
916     }
917 
918     pputc(prn, '\n');
919 
920     record_test_result(z, pval);
921 
922     free(x);
923 
924     return 0;
925 }
926 
print_z_prob(double z,PRN * prn)927 static double print_z_prob (double z, PRN *prn)
928 {
929     double pv = NADBL;
930 
931     if (z > 0) {
932 	pv = normal_pvalue_1(z);
933 	if (!na(pv)) {
934 	    pprintf(prn, "  P(Z > %g) = %g\n", z, pv);
935 	}
936     } else if (z < 0) {
937 	pv = normal_cdf(z);
938 	if (!na(pv)) {
939 	    pprintf(prn, "  P(Z < %g) = %g\n", z, pv);
940 	}
941     }
942 
943     if (!na(pv) && pv > 0) {
944 	pprintf(prn, "  %s = %g\n", _("Two-tailed p-value"), 2 * pv);
945     }
946 
947     return pv;
948 }
949 
diff_test_header(int v1,int v2,const DATASET * dset,PRN * prn)950 static void diff_test_header (int v1, int v2, const DATASET *dset,
951 			      PRN *prn)
952 {
953     pputc(prn, '\n');
954     pprintf(prn, _("Test for difference between %s and %s"),
955 	    dset->varname[v1], dset->varname[v2]);
956 }
957 
958 static const int rank5[3][2] = {
959     { 0, 2 }, /* n = 6 */
960     { 2, 3 }, /* n = 7 */
961     { 3, 5 }  /* n = 8 */
962 };
963 
964 struct ranker {
965     double val;
966     double rank;
967     char c;
968 };
969 
970 /* Wilcoxon signed-rank test, with handling of zero-differences and
971    non-zero ties, plus continuity correction, as in E. Cureton, "The
972    Normal Approximation to the Signed-Rank Sampling Distribution when
973    Zero Differences are Present", JASA(62), 1967, 1068-1069.
974 */
975 
976 static int
signed_rank_test(const double * x,const double * y,int v1,int v2,const DATASET * dset,double * result,gretlopt opt,PRN * prn)977 signed_rank_test (const double *x, const double *y,
978 		  int v1, int v2, const DATASET *dset,
979 		  double *result, gretlopt opt, PRN *prn)
980 {
981     struct ranker *r;
982     double d, T, wp, wm;
983     double z = NADBL, pval = NADBL;
984     int quiet = (opt & OPT_Q);
985     int Z = 0, N = 0;
986     int i, k, t, n = 0;
987 
988     for (t=dset->t1; t<=dset->t2; t++) {
989 	if (!na(x[t]) && !na(y[t])) {
990 	    if (x[t] != y[t]) {
991 		n++;
992 	    }
993 	    N++;
994 	}
995     }
996 
997     if (N == 0) {
998 	return E_MISSDATA;
999     }
1000 
1001     Z = N - n; /* number of zero-differences */
1002 
1003     r = malloc(n * sizeof *r);
1004     if (r == NULL) {
1005 	return E_ALLOC;
1006     }
1007 
1008     i = 0;
1009     for (t=dset->t1; t<=dset->t2; t++) {
1010 	if (!na(x[t]) && !na(y[t]) && x[t] != y[t]) {
1011 	    d = x[t] - y[t];
1012 	    r[i].val = fabs(d);
1013 	    r[i].c = (d > 0)? '+' : '-';
1014 	    i++;
1015 	}
1016     }
1017 
1018     qsort(r, n, sizeof *r, gretl_compare_doubles);
1019 
1020     T = 0.0; /* non-zero ties correction */
1021     k = 0;   /* number of non-zero ties */
1022 
1023     for (i=0; i<n; i++) {
1024 	int m = 0;
1025 
1026 	for (t=i+1; t<n && r[t].val == r[i].val; t++) {
1027 	    m++;
1028 	}
1029 
1030 	if (m == 0) {
1031 	    r[i].rank = Z + i + 1;
1032 	} else {
1033 	    double avg = (Z + i + 1 + Z + i + m + 1) / 2.0;
1034 
1035 	    for (t=0; t<=m; t++) {
1036 		r[i+t].rank = avg;
1037 	    }
1038 	    i += m;
1039 	    T += pow((double) m, 3) - m;
1040 	    k++;
1041 	}
1042     }
1043 
1044     if (!quiet) {
1045 	diff_test_header(v1, v2, dset, prn);
1046 	pprintf(prn, "\n%s\n", _("Wilcoxon Signed-Rank Test"));
1047 	pprintf(prn, "%s: %s\n\n", _("Null hypothesis"),
1048 		_("the median difference is zero"));
1049 	if (opt & OPT_V) {
1050 	    pprintf(prn, "%16s %8s %16s\n\n", "difference", "rank",
1051 		    "signed rank");
1052 	}
1053     }
1054 
1055     wp = wm = 0.0;
1056 
1057     for (i=0; i<n; i++) {
1058 	d = r[i].rank;
1059 	if (r[i].c == '-') {
1060 	    wm += d;
1061 	    d = -d;
1062 	    r[i].val = -r[i].val;
1063 	} else {
1064 	    wp += d;
1065 	}
1066 	if (opt & OPT_V) {
1067 	    pprintf(prn, "%16g %8g %16g\n", r[i].val, r[i].rank, d);
1068 	}
1069     }
1070 
1071     if (opt & OPT_V) {
1072 	pputc(prn, '\n');
1073     }
1074 
1075     if (!quiet) {
1076 	pprintf(prn, "  n = %d\n", n);
1077 	pprintf(prn, "  W+ = %g, W- = %g\n", wp, wm);
1078 	pprintf(prn, "  (%s: %d, %s: %d)\n", _("zero differences"),
1079 		Z, _("non-zero ties"), k);
1080 	if (N > 0 && n == 0) {
1081 	    pprintf(prn, "  %s\n", _("Fail to reject H0"));
1082 	}
1083     }
1084 
1085     if (n > 8) {
1086 	double s2, x, num;
1087 
1088 	x = (N*(N+1) - Z*(Z+1)) / 4.0;
1089 	s2 = (N*(N+1)*(2*N+1) - Z*(Z+1)*(2*Z+1) - T/2) / 24.0;
1090 	if (!quiet) {
1091 	    pprintf(prn, "  %s = %g\n", _("Expected value"), x);
1092 	    pprintf(prn, "  %s = %g\n", _("Variance"), s2);
1093 	}
1094 	num = wp - x;
1095 	if (num > 0.25) {
1096 	    num -= .5;
1097 	} else {
1098 	    num += .5;
1099 	}
1100 	z = num / sqrt(s2);
1101 	if (quiet) {
1102 	    pval = print_z_prob(z, NULL);
1103 	} else {
1104 	    pprintf(prn, "  z = %g\n", z);
1105 	    pval = print_z_prob(z, prn);
1106 	}
1107     } else if (n > 5) {
1108 	pprintf(prn, "  5%% critical values: %d (two-tailed), %d (one-tailed)\n",
1109 		rank5[n-6][0], rank5[n-6][1]);
1110     } else {
1111 	pprintf(prn, "  %s\n",
1112 		_("Sample too small for statistical significance"));
1113     }
1114 
1115     if (!quiet) {
1116 	pputc(prn, '\n');
1117     }
1118 
1119     result[0] = z;
1120     result[1] = pval;
1121 
1122     free(r);
1123 
1124     return 0;
1125 }
1126 
rank_sum_test(const double * x,const double * y,int v1,int v2,const DATASET * dset,double * result,gretlopt opt,PRN * prn)1127 static int rank_sum_test (const double *x, const double *y,
1128 			  int v1, int v2, const DATASET *dset,
1129 			  double *result, gretlopt opt, PRN *prn)
1130 {
1131     struct ranker *r;
1132     double wa, z = NADBL, pval = NADBL;
1133     char xc = 'a', yc = 'b';
1134     int na = 0, nb = 0;
1135     int quiet = (opt & OPT_Q);
1136     int i, t, n = 0;
1137 
1138     for (t=dset->t1; t<=dset->t2; t++) {
1139 	if (!na(x[t])) na++;
1140 	if (!na(y[t])) nb++;
1141     }
1142 
1143     if (na == 0 || nb == 0) {
1144 	return E_MISSDATA;
1145     }
1146 
1147     n = na + nb;
1148     r = malloc(n * sizeof *r);
1149     if (r == NULL) {
1150 	return E_ALLOC;
1151     }
1152 
1153     if (na > nb) {
1154 	/* Make 'a' the smaller group */
1155 	int tmp = na;
1156 
1157 	na = nb;
1158 	nb = tmp;
1159 	xc = 'b';
1160 	yc = 'a';
1161     }
1162 
1163     i = 0;
1164     for (t=dset->t1; t<=dset->t2; t++) {
1165 	if (!na(x[t])) {
1166 	    r[i].val = x[t];
1167 	    r[i].c = xc;
1168 	    i++;
1169 	}
1170 	if (!na(y[t])) {
1171 	    r[i].val = y[t];
1172 	    r[i].c = yc;
1173 	    i++;
1174 	}
1175     }
1176 
1177     qsort(r, n, sizeof *r, gretl_compare_doubles);
1178 
1179     for (i=0; i<n; i++) {
1180 	int m = 1;
1181 
1182 	for (t=i+1; t<n && r[t].val == r[i].val; t++) {
1183 	    m++;
1184 	}
1185 
1186 	if (m == 1) {
1187 	    r[i].rank = i + 1;
1188 	} else {
1189 	    for (t=0; t<m; t++) {
1190 		r[i+t].rank = (i + 1 + i + m) / 2.0;
1191 	    }
1192 	    i += m - 1;
1193 	}
1194     }
1195 
1196     if (!quiet) {
1197 	diff_test_header(v1, v2, dset, prn);
1198 	pprintf(prn, "\n%s\n", _("Wilcoxon Rank-Sum Test"));
1199 	pprintf(prn, "%s: %s\n\n", _("Null hypothesis"),
1200 		_("the two medians are equal"));
1201     }
1202 
1203     wa = 0.0;
1204     for (i=0; i<n; i++) {
1205 	if (r[i].c == 'a') {
1206 	    wa += r[i].rank;
1207 	}
1208     }
1209 
1210     if (opt & OPT_V) {
1211 	pprintf(prn, "%10s %7s %8s\n\n", "value", "rank", "group");
1212 	for (i=0; i<n; i++) {
1213 	    pprintf(prn, "%10g %7g %8c\n", r[i].val, r[i].rank, r[i].c);
1214 	}
1215  	pputc(prn, '\n');
1216     }
1217 
1218     if (!quiet) {
1219 	pprintf(prn, "  n1 = %d, n2 = %d\n", na, nb);
1220 	pprintf(prn, "  w (%s) = %g\n", _("sum of ranks, sample 1"), wa);
1221     }
1222 
1223     if (na >= 10 && nb >= 10) {
1224 	double m, s;
1225 
1226 	m = na * (na + nb + 1) / 2.0;
1227 	s = sqrt(na * nb * (na + nb + 1) / 12.0);
1228 	z = (wa - m) / s;
1229 	if (quiet) {
1230 	    pval = print_z_prob(z, NULL);
1231 	} else {
1232 	    pprintf(prn, "  z = (%g - %g) / %g = %g\n", wa, m, s, z);
1233 	    pval = print_z_prob(z, prn);
1234 	}
1235     } else if (na >= 4 && nb >= 4 && nb <= 12) {
1236 	void (*cv) (int, int, PRN *);
1237 
1238 	cv = get_plugin_function("rank_sum_lookup");
1239 	if (cv != NULL) {
1240 	    (*cv)(na, nb, prn);
1241 	}
1242     } else {
1243 	pprintf(prn, "  %s\n",
1244 		_("Sample too small for statistical significance"));
1245     }
1246 
1247     if (!quiet) {
1248 	pputc(prn, '\n');
1249     }
1250 
1251     result[0] = z;
1252     result[1] = pval;
1253 
1254     free(r);
1255 
1256     return 0;
1257 }
1258 
sign_test(const double * x,const double * y,int v1,int v2,const DATASET * dset,double * result,gretlopt opt,PRN * prn)1259 static int sign_test (const double *x, const double *y,
1260 		      int v1, int v2, const DATASET *dset,
1261 		      double *result, gretlopt opt, PRN *prn)
1262 {
1263     double pv;
1264     int n, w, t;
1265 
1266     n = w = 0;
1267     for (t=dset->t1; t<=dset->t2; t++) {
1268 	if (!na(x[t]) && !na(y[t]) && x[t] != y[t]) {
1269 	    w += (x[t] > y[t]);
1270 	    n++;
1271 	}
1272     }
1273 
1274     if (n == 0) {
1275 	return E_MISSDATA;
1276     }
1277 
1278     if (w == 0) {
1279 	pv = 1.0;
1280     } else {
1281 	pv = binomial_cdf_comp(0.5, n, w - 1);
1282     }
1283 
1284     if (!(opt & OPT_Q)) {
1285 	diff_test_header(v1, v2, dset, prn);
1286 	pprintf(prn, "\n%s\n\n", _("Sign Test"));
1287 	pprintf(prn, _("Number of differences: n = %d\n"), n);
1288 	pputs(prn, "  ");
1289 	pprintf(prn, _("Number of cases with %s > %s: w = %d (%.2f%%)\n"),
1290 		dset->varname[v1], dset->varname[v2],
1291 		w, 100.0 * w / n);
1292 	pputs(prn, "  ");
1293 	pprintf(prn, _("Under the null hypothesis of no difference, W "
1294 		       "follows B(%d, %.1f)\n"), n, 0.5);
1295 	pprintf(prn, "  %s(W <= %d) = %g\n", _("Prob"), w,
1296 		binomial_cdf(0.5, n, w));
1297 	pprintf(prn, "  %s(W >= %d) = %g\n\n", _("Prob"), w, pv);
1298     }
1299 
1300     result[0] = w;
1301     result[1] = pv;
1302 
1303     return 0;
1304 }
1305 
1306 /**
1307  * diff_test:
1308  * @list: list containing 2 variables.
1309  * @dset: dataset struct.
1310  * @opt: %OPT_G, sign test; %OPT_R, rank sum; %OPT_I,
1311  * signed rank; %OPT_V, verbose (for rank tests).
1312  * @prn: gretl printing struct.
1313  *
1314  * Performs, and prints the results of, a non-parametric
1315  * test for a difference between two variables or groups.
1316  * The specific test performed depends on @opt.
1317  *
1318  * Returns: 0 on successful completion, non-zero on error.
1319  */
1320 
diff_test(const int * list,const DATASET * dset,gretlopt opt,PRN * prn)1321 int diff_test (const int *list, const DATASET *dset,
1322 	       gretlopt opt, PRN *prn)
1323 {
1324     const double *x, *y;
1325     double result[2] = {0};
1326     int v1, v2;
1327     int err = 0;
1328 
1329     if (list[0] != 2) {
1330 	pputs(prn, _("This command requires two variables\n"));
1331 	return E_DATA;
1332     }
1333 
1334     v1 = list[1];
1335     v2 = list[2];
1336 
1337     x = dset->Z[v1];
1338     y = dset->Z[v2];
1339 
1340     if (opt == OPT_NONE || opt == OPT_V) {
1341 	opt = OPT_G;
1342     }
1343 
1344     if (opt & OPT_G) {
1345 	err = sign_test(x, y, v1, v2, dset, result, opt, prn);
1346     } else if (opt & OPT_R) {
1347 	err = rank_sum_test(x, y, v1, v2, dset, result, opt, prn);
1348     } else if (opt & OPT_I) {
1349 	err = signed_rank_test(x, y, v1, v2, dset, result, opt, prn);
1350     }
1351 
1352     record_test_result(result[0], result[1]);
1353 
1354     return err;
1355 }
1356 
1357 struct pair_sorter {
1358     double xi;
1359     double yi;
1360     int i;
1361     char *s;
1362 };
1363 
1364 /**
1365  * sort_pairs_by_x:
1366  * @x: data vector by which to sort.
1367  * @y: data vector.
1368  * @order: location to receive sort order, or %NULL.
1369  * @labels: array of strings to be sorted along with
1370  * the data, or %NULL.
1371  *
1372  * Orders the elements of @x and @y by increasing value
1373  * of @x.  Optionally, returns in @order an array of
1374  * integers representing the order in which the original
1375  * observations appear in the sorted vectors.  Also
1376  * optionally sorts an accomanying array of observation
1377  * labels.
1378  *
1379  * Returns: 0 on successful completion, non-zero on error.
1380  */
1381 
sort_pairs_by_x(gretl_matrix * x,gretl_matrix * y,int ** order,char ** labels)1382 int sort_pairs_by_x (gretl_matrix *x, gretl_matrix *y, int **order,
1383 		     char **labels)
1384 {
1385     struct pair_sorter *s = NULL;
1386     int t, T = gretl_vector_get_length(x);
1387     int err = 0;
1388 
1389     if (T == 0 || gretl_vector_get_length(y) != T) {
1390 	return E_NONCONF;
1391     }
1392 
1393     s = malloc(T * sizeof *s);
1394     if (s == NULL) {
1395 	return E_ALLOC;
1396     }
1397 
1398     for (t=0; t<T; t++) {
1399 	s[t].xi = x->val[t];
1400 	s[t].yi = y->val[t];
1401 	s[t].i = t;
1402 	if (labels != NULL) {
1403 	    s[t].s = labels[t];
1404 	} else {
1405 	    s[t].s = NULL;
1406 	}
1407     }
1408 
1409     qsort(s, T, sizeof *s, gretl_compare_doubles);
1410 
1411     for (t=0; t<T; t++) {
1412 	x->val[t] = s[t].xi;
1413 	y->val[t] = s[t].yi;
1414 	if (labels != NULL) {
1415 	    labels[t] = s[t].s;
1416 	}
1417     }
1418 
1419     if (order != NULL) {
1420 	int *idx = malloc(T * sizeof *idx);
1421 
1422 	if (idx == NULL) {
1423 	    err = E_ALLOC;
1424 	} else {
1425 	    for (t=0; t<T; t++) {
1426 		idx[t] = s[t].i;
1427 	    }
1428 	    *order = idx;
1429 	}
1430     }
1431 
1432     free(s);
1433 
1434     return err;
1435 }
1436 
data_pre_sorted(const gretl_matrix * x)1437 static int data_pre_sorted (const gretl_matrix *x)
1438 {
1439     int t, T = gretl_vector_get_length(x);
1440 
1441     for (t=1; t<T; t++) {
1442 	if (x->val[t] < x->val[t-1]) {
1443 	    return 0;
1444 	}
1445     }
1446 
1447     return 1;
1448 }
1449 
1450 #define LDEBUG 0
1451 
1452 /* convenience struct for passing loess data */
1453 
1454 struct loess_info {
1455     const gretl_matrix *y;
1456     const gretl_matrix *x;
1457     gretl_matrix *Xi;
1458     gretl_matrix *yi;
1459     gretl_matrix *wt;
1460     int d;
1461     int n;
1462     int N;
1463     int n_ok;
1464     int loo;
1465 };
1466 
1467 /* Compute robustness weights for loess, if wanted: on input @rw
1468    contains the N residuals; on return it contains the robustness
1469    weights as a function of the residuals. For observations
1470    where y is missing, the weight is NADBL.
1471 */
1472 
make_robustness_weights(gretl_matrix * rw,int N)1473 static int make_robustness_weights (gretl_matrix *rw, int N)
1474 {
1475     double *tmp = malloc(N * sizeof *tmp);
1476     double s, eis;
1477     int i, k, err = 0;
1478 
1479     if (tmp == NULL) {
1480 	return E_ALLOC;
1481     }
1482 
1483     /* pack the absolute values of the non-missing residuals
1484        into tmp */
1485     k = 0;
1486     for (i=0; i<N; i++) {
1487 	if (!na(rw->val[i])) {
1488 	    tmp[k++] = fabs(rw->val[i]);
1489 	}
1490     }
1491 
1492     /* find the median absolute residual */
1493     s = gretl_median(0, k-1, tmp);
1494     free(tmp);
1495 
1496     if (na(s)) {
1497 	err = E_DATA;
1498     } else {
1499 	s *= 6;
1500 	for (i=0; i<N && !err; i++) {
1501 	    if (!na(rw->val[i])) {
1502 		eis = rw->val[i] / s;
1503 		if (fabs(eis) < 1.0) {
1504 		    /* apply bisquare */
1505 		    rw->val[i] = (1 - eis * eis) * (1 - eis * eis);
1506 		} else {
1507 		    rw->val[i] = 0.0;
1508 		}
1509 	    }
1510 	}
1511     }
1512 
1513     return err;
1514 }
1515 
1516 /* Multiply the robustness weights, @rw, into the regular
1517    weights, @wt, taking care to register the two vectors
1518    (rw is full-length while wt is local), and to skip any
1519    missing values in rw.
1520 */
1521 
adjust_weights(const gretl_matrix * rw,gretl_matrix * wt,int a)1522 static void adjust_weights (const gretl_matrix *rw,
1523 			    gretl_matrix *wt, int a)
1524 {
1525     int k, n = gretl_vector_get_length(wt);
1526     int t = a;
1527 
1528     for (k=0; k<n; k++) {
1529 	wt->val[k] *= rw->val[t];
1530 	if (k < n - 1) {
1531 	    t++;
1532 	    while (na(rw->val[t])) t++;
1533 	}
1534     }
1535 }
1536 
1537 /* Apply the weights in lo->wt to the local data matrices
1538    lo->Xi and lo->yi. This is straightforward since all
1539    the matrices have lo->n rows.
1540 */
1541 
weight_local_data(struct loess_info * lo)1542 static void weight_local_data (struct loess_info *lo)
1543 {
1544     double xkj, wk;
1545     int k, j;
1546 
1547     for (k=0; k<lo->n; k++) {
1548 	wk = sqrt(lo->wt->val[k]);
1549 	for (j=0; j<lo->Xi->cols; j++) {
1550 	    xkj = gretl_matrix_get(lo->Xi, k, j);
1551 	    gretl_matrix_set(lo->Xi, k, j, xkj * wk);
1552 	}
1553 	lo->yi->val[k] *= wk;
1554     }
1555 
1556 #if LDEBUG > 1
1557     gretl_matrix_print(lo->Xi, "Xi, weighted");
1558     gretl_matrix_print(lo->yi, "yi, weighted");
1559 #endif
1560 }
1561 
next_ok_obs(const double * y,int i)1562 static int next_ok_obs (const double *y, int i)
1563 {
1564     i++;
1565     while (na(y[i])) i++;
1566     return i;
1567 }
1568 
loess_get_local_data(int i,int * pa,struct loess_info * lo,int * xconst)1569 static int loess_get_local_data (int i, int *pa,
1570 				 struct loess_info *lo,
1571 				 int *xconst)
1572 {
1573     const double *x = lo->x->val;
1574     const double *y = lo->y->val;
1575     double xk, xk1, xds, h = 0;
1576     int n = lo->n, n_ok = lo->n_ok;
1577     int k_skip = -1;
1578     int a, b, k, m, t;
1579     int err = 0;
1580 
1581     gretl_matrix_zero(lo->Xi);
1582     gretl_matrix_zero(lo->yi);
1583     gretl_matrix_zero(lo->wt);
1584 
1585     a = *pa;
1586 
1587 #if LDEBUG
1588     fprintf(stderr, "\ni=%d, a=%d, n_ok=%d\n", i, a, n_ok);
1589 #endif
1590 
1591     /* First determine where we should start reading the
1592        neighbors of xi: search rightward from a, so far as
1593        this is feasible.
1594     */
1595 
1596     while (n_ok > n) {
1597 	/* find b, the next point that would be included if
1598 	   we move the neighbor set one place to the right:
1599 	   start at a+1 and proceed until we have n points
1600 	   with valid y-values
1601 	*/
1602 	for (m=0, b=a+1; ; b++) {
1603 	    if (!na(y[b]) && ++m == n) {
1604 		break;
1605 	    }
1606 	}
1607 	if (fabs(x[i] - x[a]) < fabs(x[i] - x[b])) {
1608 	    /* the max distance increased: get out */
1609 	    break;
1610 	}
1611 	/* shift one (valid) place to the right */
1612 	a = next_ok_obs(y, a);
1613 	n_ok--;
1614     }
1615 
1616     /* record status for next round */
1617     *pa = a;
1618     lo->n_ok = n_ok;
1619 
1620     /* Having found the starting index for the n nearest neighbors
1621        of xi, transcribe the relevant data into Xi and yi. As we
1622        go, check whether x is constant in this sub-sample.
1623     */
1624 
1625     *xconst = 1;
1626     xk1 = 0;
1627     t = a;
1628 
1629     for (k=0; k<n; k++) {
1630 	if (lo->loo && t == i) {
1631 	    /* leave-one-out: mark this observation */
1632 	    k_skip = k;
1633 	}
1634 	lo->yi->val[k] = y[t];
1635 	xk = x[t];
1636 	gretl_matrix_set(lo->Xi, k, 0, 1.0);
1637 	gretl_matrix_set(lo->Xi, k, 1, xk);
1638 	if (lo->Xi->cols > 2) {
1639 	    gretl_matrix_set(lo->Xi, k, 2, xk * xk);
1640 	}
1641 	if (*xconst && k > 0 && xk > xk1) {
1642 	    *xconst = 0;
1643 	}
1644 	xk1 = xk;
1645 	if (k < n - 1) {
1646 	    t = next_ok_obs(y, t);
1647 	}
1648     }
1649 
1650 #if LDEBUG
1651     fprintf(stderr, " -> a=%d, n_ok=%d (xconst = %d)\n",
1652 	    a, n_ok, *xconst);
1653 #endif
1654 
1655     if (!err) {
1656 	/* find the max(abs) distance from xi */
1657 	double h0 = fabs(x[i] - gretl_matrix_get(lo->Xi, 0, 1));
1658 	double hn = fabs(x[i] - gretl_matrix_get(lo->Xi, n-1, 1));
1659 
1660 	h = (h0 > hn)? h0 : hn;
1661     }
1662 
1663     /* compute scaled distances and tricube weights */
1664     for (k=0; k<n; k++) {
1665 	if (k == k_skip) {
1666 	    /* exclude this obs via a zero weight? */
1667 	    lo->wt->val[k] = 0.0;
1668 	    continue;
1669 	}
1670 	xk = gretl_matrix_get(lo->Xi, k, 1);
1671 	if (h == 0.0) {
1672 	    lo->wt->val[k] = 1.0;
1673 	} else {
1674 	    xds = fabs(x[i] - xk) / h;
1675 	    if (xds < 1.0) {
1676 		lo->wt->val[k] = pow(1.0 - pow(xds, 3.0), 3.0);
1677 	    }
1678 	}
1679 #if LDEBUG > 1
1680 	fprintf(stderr, "y=%10g, x=%10g, dist=%10g\n",
1681 		lo->yi->val[k], xk, fabs(x[i] - xk));
1682 #endif
1683     }
1684 
1685 #if LDEBUG > 1
1686     gretl_matrix_print(lo->Xi, "Xi");
1687     gretl_matrix_print(lo->yi, "yi");
1688     gretl_matrix_print(lo->wt, "wt");
1689 #endif
1690 
1691     return err;
1692 }
1693 
loess_count_usable_obs(const gretl_matrix * y,int N,int * amin)1694 static int loess_count_usable_obs (const gretl_matrix *y,
1695 				   int N, int *amin)
1696 {
1697     int i, n_ok = 0;
1698 
1699     for (i=0; i<N; i++) {
1700 	if (!na(y->val[i])) {
1701 	    if (n_ok == 0) {
1702 		/* record the first usable obs index */
1703 		*amin = i;
1704 	    }
1705 	    n_ok++;
1706 	}
1707     }
1708 
1709     return n_ok;
1710 }
1711 
1712 /**
1713  * loess_fit:
1714  * @x: x-axis variable (must be pre-sorted).
1715  * @y: response variable.
1716  * @d: order for polynomial fit (0 <= d <= 2).
1717  * @q: bandwidth (0 < q <= 1).
1718  * @opt: give %OPT_R for robust variant (with re-weighting based on
1719  * the first-stage residuals).
1720  * @err: location to receive error code.
1721  *
1722  * Computes loess estimates based on William Cleveland, "Robust Locally
1723  * Weighted Regression and Smoothing Scatterplots", Journal of the
1724  * American Statistical Association, Vol. 74 (1979), pp. 829-836.
1725  * Typically one expects that @d = 1 and @q is in the neighborhood
1726  * of 0.5.
1727  *
1728  * The x,y pairs must be pre-sorted by increasing value of @x; an
1729  * error is flagged if this is not the case.  See also
1730  * sort_pairs_by_x().
1731  *
1732  * Returns: allocated vector containing the loess fitted values, or
1733  * %NULL on failure.
1734  */
1735 
loess_fit(const gretl_matrix * x,const gretl_matrix * y,int d,double q,gretlopt opt,int * err)1736 gretl_matrix *loess_fit (const gretl_matrix *x, const gretl_matrix *y,
1737 			 int d, double q, gretlopt opt, int *err)
1738 {
1739     struct loess_info lo;
1740     gretl_matrix_block *B;
1741     gretl_matrix *Xi, *yi, *wt, *b;
1742     gretl_matrix *yh = NULL;
1743     gretl_matrix *rw = NULL;
1744     int N = gretl_vector_get_length(y);
1745     int k, iters, Xic, amin = 0;
1746     int n_ok, robust = 0, loo = 0;
1747     int i, n;
1748 
1749     if (d < 0 || d > 2 || q > 1.0) {
1750 	*err = E_DATA;
1751 	return NULL;
1752     }
1753 
1754     if (!data_pre_sorted(x)) {
1755 	gretl_errmsg_set("loess: the data must be sorted by x");
1756 	*err = E_DATA;
1757 	return NULL;
1758     }
1759 
1760     /* check for usable data points */
1761     n_ok = loess_count_usable_obs(y, N, &amin);
1762     if (n_ok < 4) {
1763 	*err = E_TOOFEW;
1764 	return NULL;
1765     }
1766 
1767     if (opt & OPT_O) {
1768 	/* leave one out: experimental */
1769 	loo = 1;
1770     }
1771 
1772     /* check for q too small */
1773     if (q < (d + 1.0) / n_ok) {
1774 	q = (d + 1.0) / n_ok;
1775     }
1776 
1777     /* set the local sub-sample size */
1778     n = (int) ceil(q * n_ok);
1779 
1780     /* we need a minimum of two columns in Xi, in order
1781        to compute the x-distance based weights */
1782     Xic = (d == 0)? 2 : d + 1;
1783 
1784     B = gretl_matrix_block_new(&Xi, n, Xic,
1785 			       &yi, n, 1,
1786 			       &wt, n, 1,
1787 			       &b, d+1, 1,
1788 			       NULL);
1789     if (B == NULL) {
1790 	*err = E_ALLOC;
1791 	return NULL;
1792     }
1793 
1794     /* vector to hold the fitted values */
1795     yh = gretl_column_vector_alloc(N);
1796     if (yh == NULL) {
1797 	*err = E_ALLOC;
1798 	goto bailout;
1799     }
1800 
1801     if (opt & OPT_R) {
1802 	/* extra storage for residuals/robustness weights */
1803 	rw = gretl_column_vector_alloc(N);
1804 	if (rw == NULL) {
1805 	    *err = E_ALLOC;
1806 	    goto bailout;
1807 	}
1808 	iters = 2; /* maybe 3? */
1809 	robust = 1;
1810     } else {
1811 	iters = 1;
1812     }
1813 
1814     /* fill out the convenience struct */
1815     lo.y = y;
1816     lo.x = x;
1817     lo.Xi = Xi;
1818     lo.yi = yi;
1819     lo.wt = wt;
1820     lo.d = d;
1821     lo.n = n;
1822     lo.N = N;
1823     lo.loo = loo;
1824 
1825     for (k=0; k<iters && !*err; k++) {
1826 	/* iterations for robustness, if wanted */
1827 	int a = amin;
1828 
1829 	lo.n_ok = n_ok;
1830 
1831 	for (i=0; i<N && !*err; i++) {
1832 	    /* iterate across points in full sample */
1833 	    double xi = x->val[i];
1834 	    int xconst = 0;
1835 
1836 	    /* a is the leftmost possible index for starting to
1837 	       read the n nearest neighbors of xi.
1838 	    */
1839 	    *err = loess_get_local_data(i, &a, &lo, &xconst);
1840 	    if (*err) {
1841 		break;
1842 	    }
1843 
1844 	    if (k > 0) {
1845 		/* We have robustness weights, rw, based on the residuals
1846 		   from the last round, which should be used to adjust
1847 		   the wt as computed in loess_get_local_data().
1848 		*/
1849 		adjust_weights(rw, wt, a);
1850 	    }
1851 
1852 	    /* apply weights to the local data */
1853 	    weight_local_data(&lo);
1854 
1855 	    if (d == 0 || xconst) {
1856 		/* not using x in the regressions: mask the x column(s) */
1857 		gretl_matrix_reuse(Xi, -1, 1);
1858 		if (b->rows > 1) {
1859 		    gretl_matrix_reuse(b, 1, 1);
1860 		}
1861 	    }
1862 
1863 	    /* run local WLS */
1864 	    *err = gretl_matrix_SVD_ols(yi, Xi, b, NULL, NULL, NULL);
1865 
1866 	    if (!*err) {
1867 		/* yh: evaluate the polynomial at xi */
1868 		yh->val[i] = b->val[0];
1869 		if (b->rows > 1) {
1870 		    yh->val[i] += b->val[1] * xi;
1871 		    if (b->rows == 3) {
1872 			yh->val[i] += b->val[2] * xi * xi;
1873 		    }
1874 		}
1875 		if (robust && k < iters - 1) {
1876 		    /* save residual for robustness weights */
1877 		    if (na(y->val[i])) {
1878 			rw->val[i] = NADBL;
1879 		    } else {
1880 			rw->val[i] = y->val[i] - yh->val[i];
1881 		    }
1882 		}
1883 	    }
1884 
1885 	    /* ensure matrices are at full size */
1886 	    gretl_matrix_reuse(Xi, -1, Xic);
1887 	    gretl_matrix_reuse(b, d+1, -1);
1888 
1889 	} /* end loop over sample */
1890 
1891 	if (!*err && robust && k < iters - 1) {
1892 	    *err = make_robustness_weights(rw, N);
1893 	}
1894     } /* end robustness iterations */
1895 
1896  bailout:
1897 
1898     gretl_matrix_block_destroy(B);
1899     gretl_matrix_free(rw);
1900 
1901     if (*err) {
1902 	gretl_matrix_free(yh);
1903 	yh = NULL;
1904     }
1905 
1906     return yh;
1907 }
1908 
quartiles(const double * x,int n,double * q1,double * q3)1909 static double quartiles (const double *x, int n,
1910 			 double *q1, double *q3)
1911 {
1912     int n2 = n / 2;
1913     double xx;
1914 
1915     xx = (n % 2)? x[n2] : 0.5 * (x[n2 - 1] + x[n2]);
1916 
1917     if (q1 != NULL && q3 != NULL) {
1918         if (n % 2) {
1919             *q1 = quartiles(x, n2 + 1, NULL, NULL);
1920             *q3 = quartiles(x + n2, n2 + 1, NULL, NULL);
1921         } else {
1922             *q1 = quartiles(x, n2, NULL, NULL);
1923             *q3 = quartiles(x + n2, n2, NULL, NULL);
1924         }
1925     }
1926 
1927     return xx;
1928 }
1929 
kernel_bandwidth(const double * x,int n)1930 double kernel_bandwidth (const double *x, int n)
1931 {
1932     double s, w, q1, q3, r;
1933     double n5 = pow((double) n, -0.20);
1934 
1935     s = gretl_stddev(0, n - 1, x);
1936     quartiles(x, n, &q1, &q3);
1937     r = (q3 - q1) / 1.349;
1938     w = (r > 0 && r < s)? r : s;
1939 
1940 #if 0
1941     fprintf(stderr, "Silverman bandwidth: s=%g, q1=%g, q3=%g, IQR=%g, w=%g\n",
1942 	    s, q1, q3, q3 - q1, w);
1943 #endif
1944 
1945     /* Silverman bandwidth times scale factor */
1946     return 0.9 * w * n5;
1947 }
1948