1 /*
2  *  gretl -- Gnu Regression, Econometrics and Time-series Library
3  *  Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 #include "libgretl.h"
21 #include "gretl_matrix.h"
22 #include "gretl_cmatrix.h"
23 #include "matrix_extra.h"
24 #include "gretl_panel.h"
25 #include "gretl_string_table.h"
26 #include "libset.h"
27 #include "compat.h"
28 #include "plotspec.h"
29 #include "usermat.h"
30 
31 #include <unistd.h>
32 
33 #ifdef WIN32
34 # include <windows.h>
35 #endif
36 
37 /**
38  * SECTION:describe
39  * @short_description: descriptive statistics plus some tests
40  * @title: Descriptive statistics
41  * @include: libgretl.h
42  *
43  * Computation and printing of numerous descriptive statistics
44  * along with some hypothesis tests, for example regarding the
45  * normality of a data series.
46  */
47 
48 /* return the number of "good" (non-missing) observations in series x;
49    also (optionally) write to x0 the first non-missing value */
50 
good_obs(const double * x,int n,double * x0)51 static int good_obs (const double *x, int n, double *x0)
52 {
53     int t, good = n;
54 
55     if (x0 != NULL) {
56 	*x0 = NADBL;
57     }
58 
59     for (t=0; t<n; t++) {
60 	if (na(x[t])) {
61 	    good--;
62 	} else if (x0 != NULL && na(*x0)) {
63 	    *x0 = x[t];
64 	}
65     }
66 
67     return good;
68 }
69 
70 /**
71  * gretl_minmax:
72  * @t1: starting observation.
73  * @t2: ending observation.
74  * @x: data series.
75  * @min: location to receive minimum value.
76  * @max: location to receive maximum value.
77  *
78  * Puts the minimum and maximum values of the series @x,
79  * from obs @t1 to obs @t2, into the variables @min and @max.
80  *
81  * Returns: the number of valid observations in the given
82  * data range.
83  */
84 
gretl_minmax(int t1,int t2,const double * x,double * min,double * max)85 int gretl_minmax (int t1, int t2, const double *x,
86 		  double *min, double *max)
87 {
88     int t, n = 0;
89 
90     *max = *min = NADBL;
91 
92     for (t=t1; t<=t2; t++) {
93 	if (!(na(x[t]))) {
94 	    if (n == 0) {
95 		*max = *min = x[t];
96 	    } else {
97 		if (x[t] > *max) *max = x[t];
98 		if (x[t] < *min) *min = x[t];
99 	    }
100 	    n++;
101 	}
102     }
103 
104     return n;
105 }
106 
107 /**
108  * gretl_min:
109  * @t1: starting observation.
110  * @t2: ending observation.
111  * @x: data series.
112  *
113  * Returns: the minimum value of @x over the given range,
114  * or #NADBL if no valid vaues are found.
115  */
116 
gretl_min(int t1,int t2,const double * x)117 double gretl_min (int t1, int t2, const double *x)
118 {
119     double min, max;
120 
121     gretl_minmax(t1, t2, x, &min, &max);
122     return min;
123 }
124 
125 /**
126  * gretl_max:
127  * @t1: starting observation.
128  * @t2: ending observation.
129  * @x: data series.
130  *
131  * Returns: the maximum value of @x over the given range,
132  * or #NADBL if no valid vaues are found.
133  */
134 
gretl_max(int t1,int t2,const double * x)135 double gretl_max (int t1, int t2, const double *x)
136 {
137     double min, max;
138 
139     gretl_minmax(t1, t2, x, &min, &max);
140     return max;
141 }
142 
143 /**
144  * gretl_sum:
145  * @t1: starting observation.
146  * @t2: ending observation.
147  * @x: data series.
148  *
149  * Returns: the sum of the series @x from obs
150  * @t1 to obs @t2, skipping any missing values, or #NADBL
151  * in case there are no valid observations.
152  */
153 
gretl_sum(int t1,int t2,const double * x)154 double gretl_sum (int t1, int t2, const double *x)
155 {
156     double sum = 0.0;
157     int t, n = 0;
158 
159     for (t=t1; t<=t2; t++) {
160 	if (!(na(x[t]))) {
161 	    sum += x[t];
162 	    n++;
163 	}
164     }
165 
166     return (n == 0)? NADBL : sum;
167 }
168 
169 /**
170  * gretl_mean:
171  * @t1: starting observation.
172  * @t2: ending observation.
173  * @x: data series.
174  *
175  * Returns: the arithmetic mean of the series @x from obs
176  * @t1 to obs @t2, skipping any missing values, or #NADBL
177  * in case there are no valid observations.
178  */
179 
gretl_mean(int t1,int t2,const double * x)180 double gretl_mean (int t1, int t2, const double *x)
181 {
182     double xbar, sum = 0.0;
183     int t, n = 0;
184 
185     for (t=t1; t<=t2; t++) {
186 	if (!(na(x[t]))) {
187 	    sum += x[t];
188 	    n++;
189 	}
190     }
191 
192     if (n == 0) {
193 	return NADBL;
194     }
195 
196     xbar = sum / n;
197     sum = 0.0;
198 
199     for (t=t1; t<=t2; t++) {
200 	if (!(na(x[t]))) {
201 	    sum += (x[t] - xbar);
202 	}
203     }
204 
205     return xbar + sum / n;
206 }
207 
208 /**
209  * eval_ytest:
210  * @y: reference numerical value.
211  * @op: operator.
212  * @test: numerical test value.
213  *
214  * Returns: 1 if the expression @y @yop @test (for example
215  * "y = 2" or "y <= 45") evaluates as true, else 0.
216  */
217 
eval_ytest(double y,GretlOp op,double test)218 int eval_ytest (double y, GretlOp op, double test)
219 {
220     int ret = 0;
221 
222     switch (op) {
223     case OP_EQ:
224 	if (na(test)) {
225 	    ret = na(y);
226 	} else {
227 	    ret = (y == test);
228 	}
229 	break;
230     case OP_GT:
231 	ret = (y > test);
232 	break;
233     case OP_LT:
234 	ret = (y < test);
235 	break;
236     case OP_NEQ:
237 	if (na(test)) {
238 	    ret = !na(y);
239 	} else {
240 	    ret = (y != test);
241 	}
242 	break;
243     case OP_GTE:
244 	ret = (y >= test);
245 	break;
246     case OP_LTE:
247 	ret = (y <= test);
248 	break;
249     default:
250 	break;
251     }
252 
253     return ret;
254 }
255 
256 /**
257  * gretl_restricted_mean:
258  * @t1: starting observation.
259  * @t2: ending observation.
260  * @x: data series.
261  * @y: criterion series.
262  * @yop: criterion operator.
263  * @yval: criterion value.
264  *
265  * Returns: the arithmetic mean of the series @x in the
266  * range @t1 to @t2 (inclusive), but including only
267  * observations where the criterion variable @y bears the
268  * relationship @yop to the value @yval -- or #NADBL in case
269  * there are no observations that satisfy the restriction.
270  */
271 
gretl_restricted_mean(int t1,int t2,const double * x,const double * y,GretlOp yop,double yval)272 double gretl_restricted_mean (int t1, int t2, const double *x,
273 			      const double *y, GretlOp yop,
274 			      double yval)
275 {
276     double xbar, sum = 0.0;
277     int t, n = 0;
278 
279     for (t=t1; t<=t2; t++) {
280 	if (!na(x[t]) && eval_ytest(y[t], yop, yval)) {
281 	    sum += x[t];
282 	    n++;
283 	}
284     }
285 
286     if (n == 0) {
287 	return NADBL;
288     }
289 
290     xbar = sum / n;
291     sum = 0.0;
292 
293     for (t=t1; t<=t2; t++) {
294 	if (!na(x[t]) && eval_ytest(y[t], yop, yval)) {
295 	    sum += (x[t] - xbar);
296 	}
297     }
298 
299     return xbar + sum / n;
300 }
301 
302 /*
303    For an array a with n elements, find the element which would be a[h]
304    if the array were sorted from smallest to largest (without the need
305    to do a full sort).  The algorithm is due to C. A. R. Hoare; the C
306    implementation is shamelessly copied, with minor adaptations, from
307    http://www.astro.northwestern.edu/~ato/um/programming/sorting.html
308    (Atakan G\"urkan: link now dead).
309 */
310 
find_hoare(double * a,int n,int h)311 static double find_hoare (double *a, int n, int h)
312 {
313     double w, x;
314     int l = 0;
315     int r = n - 1;
316     int i, j;
317 
318     while (l < r) {
319 	x = a[h];
320 	i = l;
321 	j = r;
322 	while (j >= i) {
323 	    while (a[i] < x) i++;
324 	    while (x < a[j]) j--;
325 	    if (i <= j) {
326 		w = a[i];
327 		a[i++] = a[j];
328 		a[j--] = w;
329 	    }
330 	}
331 	if (j < h) l = i;
332 	if (h < i) r = j;
333     }
334 
335     return a[h];
336 }
337 
find_hoare_inexact(double * a,double p,double xmin,double xmax,double frac,int n,int hf,int hc)338 static double find_hoare_inexact (double *a, double p,
339 				  double xmin, double xmax,
340 				  double frac, int n,
341 				  int hf, int hc)
342 {
343     double tmp, high, low;
344     int i;
345 
346     if (p < 0.5) {
347 	high = find_hoare(a, n, hc);
348 	tmp = xmin;
349 	for (i=0; i<hc; i++) {
350 	    if (a[i] > tmp) {
351 		tmp = a[i];
352 	    }
353 	}
354 	low = tmp;
355     } else {
356 	low = find_hoare(a, n, hf);
357 	tmp = xmax;
358 	for (i=hc; i<n; i++) {
359 	    if (a[i] < tmp) {
360 		tmp = a[i];
361 	    }
362 	}
363 	high = tmp;
364     }
365 
366     return low + frac * (high - low);
367 }
368 
369 /* See https://en.wikipedia.org/wiki/Quantile, also
370    Hyndman and Fan, "Sample Quantiles in Statistical
371    Packages" (The American Statistician, 1996).
372    Return the "index" of the @p-quantile for a sample
373    of size @n (which may not be an integer).
374 */
375 
quantile_index(int n,double p)376 static double quantile_index (int n, double p)
377 {
378     int Qtype = libset_get_int(QUANTILE_TYPE);
379     double h;
380 
381     if (Qtype == 0) {
382 	/* \hat{Q}_6 */
383 	h = (n + 1) * p;
384     } else if (Qtype == 1) {
385 	/* \hat{Q}_7 */
386 	h = (n - 1) * p + 1;
387     } else {
388 	/* \hat{Q}_8 */
389 	h = (n + 1.0/3) * p + 1.0/3;
390     }
391 
392     return h - 1; /* 0-based */
393 }
394 
395 /**
396  * gretl_quantile:
397  * @t1: starting observation.
398  * @t2: ending observation.
399  * @x: data series.
400  * @p: probability.
401  * @opt: may include OPT_Q to hush warning when sample is
402  * too small.
403  * @err: location to receive error code.
404  *
405  * Returns: the @p quantile of the series @x from obs
406  * @t1 to obs @t2, skipping any missing values, or #NADBL
407  * on failure.
408  */
409 
gretl_quantile(int t1,int t2,const double * x,double p,gretlopt opt,int * err)410 double gretl_quantile (int t1, int t2, const double *x, double p,
411 		       gretlopt opt, int *err)
412 {
413     double *a = NULL;
414     double xmin, xmax;
415     double h, ret;
416     int hf, hc;
417     int t, n = 0;
418 
419     if (*err) {
420 	/* don't compound a prior error */
421 	return NADBL;
422     }
423 
424     if (p <= 0 || p >= 1) {
425 	/* sanity check */
426 	*err = E_DATA;
427     } else {
428 	n = gretl_minmax(t1, t2, x, &xmin, &xmax);
429 	if (n == 0) {
430 	    *err = E_DATA;
431 	}
432     }
433 
434     if (!*err) {
435 	h = quantile_index(n, p);
436 	hf = floor(h);
437 	hc = ceil(h);
438 
439 	if (hc == 0 || hc == n) {
440 	    /* too few usable observations for the specified
441 	       quantile; don't treat as fatal error */
442 	    return NADBL;
443 	} else {
444 	    a = malloc(n * sizeof *a);
445 	    if (a == NULL) {
446 		*err = E_ALLOC;
447 	    }
448 	}
449     }
450 
451     if (*err) {
452 	return NADBL;
453     }
454 
455     n = 0;
456     for (t=t1; t<=t2; t++) {
457 	if (!na(x[t])) {
458 	    a[n++] = x[t];
459 	}
460     }
461 
462     if (hf == hc) {
463 	/* "exact" */
464 	ret = find_hoare(a, n, hf);
465     } else {
466 	ret = find_hoare_inexact(a, p, xmin, xmax, h - hf,
467 				 n, hf, hc);
468     }
469 
470     free(a);
471 
472     return ret;
473 }
474 
475 /**
476  * gretl_array_quantiles:
477  * @a: data array (this gets re-ordered).
478  * @n: length of array.
479  * @p: array of probabilities (over-written by quantiles).
480  * @k: number of probabilities.
481  *
482  * Computes @k quantiles (given by the elements of @p) for the
483  * first n elements of the array @a, which is re-ordered in
484  * the process. On successful exit, @p contains the quantiles.
485  *
486  * Returns: 0 on success, non-zero code on error.
487  */
488 
gretl_array_quantiles(double * a,int n,double * p,int k)489 int gretl_array_quantiles (double *a, int n, double *p, int k)
490 {
491     double h, xmin = 0, xmax = NADBL;
492     int hf, hc, i;
493     int err = 0;
494 
495     if (n <= 0 || k <= 0) {
496 	return E_DATA;
497     }
498 
499     for (i=0; i<k; i++) {
500 	if (p[i] <= 0.0 || p[i] >= 1.0) {
501 	    p[i] = NADBL;
502 	    err = E_INVARG;
503 	    break;
504 	}
505 
506 	h = quantile_index(n, p[i]);
507 	hf = floor(h);
508 	hc = ceil(h);
509 
510 	if (hc == 0 || hc == n) {
511 	    p[i] = NADBL;
512 	} else if (hf == hc) {
513 	    p[i] = find_hoare(a, n, hf);
514 	} else {
515 	    if (na(xmax)) {
516 		gretl_minmax(0, n-1, a, &xmin, &xmax);
517 	    }
518 	    p[i] = find_hoare_inexact(a, p[i], xmin, xmax, h - hf,
519 				      n, hf, hc);
520 	}
521     }
522 
523     return err;
524 }
525 
526 /**
527  * gretl_array_quantile:
528  * @a: array on which to operate.
529  * @n: number of elements in @a.
530  * @p: probability.
531  *
532  * Returns: the @p quantile of the first @n elements in @a,
533  * which is re-ordered in the process, or #NADBL on
534  * failure.
535  */
536 
gretl_array_quantile(double * a,int n,double p)537 double gretl_array_quantile (double *a, int n, double p)
538 {
539     gretl_array_quantiles(a, n, &p, 1);
540     return p;
541 }
542 
543 /**
544  * gretl_median:
545  * @t1: starting observation.
546  * @t2: ending observation.
547  * @x: data series.
548  *
549  * Returns: the median value of the series @x from obs
550  * @t1 to obs @t2, skipping any missing values, or #NADBL
551  * on failure.
552  */
553 
gretl_median(int t1,int t2,const double * x)554 double gretl_median (int t1, int t2, const double *x)
555 {
556     int err = 0;
557 
558     return gretl_quantile(t1, t2, x, 0.5, OPT_NONE, &err);
559 }
560 
561 /**
562  * gretl_sst:
563  * @t1: starting observation.
564  * @t2: ending observation.
565  * @x: data series.
566  *
567  * Returns: the sum of squared deviations from the mean for
568  * the series @x from obs @t1 to obs @t2, skipping any missing
569  * values, or #NADBL on failure.
570  */
571 
gretl_sst(int t1,int t2,const double * x)572 double gretl_sst (int t1, int t2, const double *x)
573 {
574     int t, n = t2 - t1 + 1;
575     double sumsq, xx, xbar;
576 
577     if (n == 0) {
578 	return NADBL;
579     }
580 
581     xbar = gretl_mean(t1, t2, x);
582     if (na(xbar)) {
583 	return NADBL;
584     }
585 
586     sumsq = 0.0;
587 
588     for (t=t1; t<=t2; t++) {
589 	if (!na(x[t])) {
590 	    xx = x[t] - xbar;
591 	    sumsq += xx * xx;
592 	}
593     }
594 
595     return sumsq;
596 }
597 
598 /* allows for computing MLE rather than using df correction,
599    via the @asy argument */
600 
real_gretl_variance(int t1,int t2,const double * x,int asy)601 double real_gretl_variance (int t1, int t2, const double *x,
602 			    int asy)
603 {
604     int t, n = t2 - t1 + 1;
605     double v, dx, xbar;
606 
607     if (n == 0) {
608 	/* null sample */
609 	return NADBL;
610     }
611 
612     xbar = gretl_mean(t1, t2, x);
613     if (na(xbar)) {
614 	return NADBL;
615     }
616 
617     v = 0.0;
618     n = 0;
619 
620     for (t=t1; t<=t2; t++) {
621 	if (!na(x[t])) {
622 	    dx = x[t] - xbar;
623 	    v += dx * dx;
624 	    n++;
625 	}
626     }
627 
628     if (asy && n > 0) {
629 	v /= n;
630     } else if (n > 1) {
631 	v /= (n - 1);
632     } else {
633 	v = 0.0;
634     }
635 
636     return (v >= 0)? v : NADBL;
637 }
638 
639 /**
640  * gretl_variance:
641  * @t1: starting observation.
642  * @t2: ending observation.
643  * @x: data series.
644  *
645  * Returns: the variance of the series @x from obs
646  * @t1 to obs @t2, skipping any missing values, or #NADBL
647  * on failure.
648  */
649 
gretl_variance(int t1,int t2,const double * x)650 double gretl_variance (int t1, int t2, const double *x)
651 {
652     return real_gretl_variance(t1, t2, x, 0);
653 }
654 
655 /**
656  * gretl_restricted_variance:
657  * @t1: starting observation.
658  * @t2: ending observation.
659  * @x: data series.
660  * @y: criterion series.
661  * @yop: criterion operator.
662  * @yval: criterion value.
663  *
664  * Returns: the variance of the series @x from obs
665  * @t1 to obs @t2, skipping any missing values and
666  * observations where the series @y does not bear the
667  * relationship @yop to the value @yval, or #NADBL on
668  * failure.
669  */
670 
gretl_restricted_variance(int t1,int t2,const double * x,const double * y,GretlOp yop,double yval)671 double gretl_restricted_variance (int t1, int t2, const double *x,
672 				  const double *y, GretlOp yop,
673 				  double yval)
674 {
675     double sumsq, xx, xbar;
676     int t, n = 0;
677 
678     xbar = gretl_restricted_mean(t1, t2, x, y, yop, yval);
679     if (na(xbar)) {
680 	return NADBL;
681     }
682 
683     sumsq = 0.0;
684 
685     for (t=t1; t<=t2; t++) {
686 	if (!na(x[t]) && eval_ytest(y[t], yop, yval)) {
687 	    xx = x[t] - xbar;
688 	    sumsq += xx * xx;
689 	    n++;
690 	}
691     }
692 
693     sumsq = (n > 1)? sumsq / (n - 1) : 0.0;
694 
695     return (sumsq >= 0)? sumsq : NADBL;
696 }
697 
698 /**
699  * gretl_stddev:
700  * @t1: starting observation.
701  * @t2: ending observation.
702  * @x: data series.
703  *
704  * Returns: the standard deviation of the series @x from obs
705  * @t1 to obs @t2, skipping any missing values, or #NADBL
706  * on failure.
707  */
708 
gretl_stddev(int t1,int t2,const double * x)709 double gretl_stddev (int t1, int t2, const double *x)
710 {
711     double v = gretl_variance(t1, t2, x);
712 
713     return (na(v))? v : sqrt(v);
714 }
715 
716 /**
717  * gretl_restricted_stddev:
718  * @t1: starting observation.
719  * @t2: ending observation.
720  * @x: data series.
721  * @y: criterion series.
722  * @yop: criterion operator.
723  * @yval: criterion value.
724  *
725  * Returns: the standard deviation of the series @x from obs
726  * @t1 to obs @t2, skipping any missing values and observations
727  * where the series @y does not bear the relationship @yop to
728  * the value @yval, or #NADBL on failure.
729  */
730 
gretl_restricted_stddev(int t1,int t2,const double * x,const double * y,GretlOp yop,double yval)731 double gretl_restricted_stddev (int t1, int t2, const double *x,
732 				const double *y, GretlOp yop,
733 				double yval)
734 {
735     double xx = gretl_restricted_variance(t1, t2, x, y, yop, yval);
736 
737     return (na(xx))? xx : sqrt(xx);
738 }
739 
740 /**
741 * gretl_long_run_variance:
742 * @t1: starting observation.
743 * @t2: ending observation.
744 * @x: data series.
745 * @m: bandwidth (<= 0 for automatic).
746 * @mu: mean (or NADBL to use sample mean).
747 *
748 * Returns: the long-run variance of the series @x from obs
749 * @t1 to obs @t2, using Bartlett kernel weights, or #NADBL
750 * on failure (which includes encountering missing values).
751 */
752 
gretl_long_run_variance(int t1,int t2,const double * x,int m,double mu)753 double gretl_long_run_variance (int t1, int t2, const double *x,
754 				int m, double mu)
755 {
756     double zt, wi, xbar, s2 = 0.0;
757     int use_mu = !na(mu);
758     int i, t, n, order;
759 
760     if (series_adjust_sample(x, &t1, &t2) != 0) {
761 	return NADBL;
762     }
763 
764     n = t2 - t1 + 1;
765 
766     if (n < 2) {
767 	return NADBL;
768     }
769 
770     if (m < 0) {
771 	order = (int) exp(log(n) / 3.0);
772     } else {
773 	order = m;
774     }
775 
776     if (use_mu) {
777 	xbar = mu;
778     }
779 
780     if (use_mu && mu == 0.0) {
781 	for (t=t1; t<=t2; t++) {
782 	    s2 += x[t] * x[t];
783 	}
784     } else {
785 	if (!use_mu) {
786 	    xbar = 0.0;
787 	    for (t=t1; t<=t2; t++) {
788 		xbar += x[t];
789 	    }
790 	    xbar /= n;
791 	}
792 	for (t=t1; t<=t2; t++) {
793 	    zt = x[t] - xbar;
794 	    s2 += zt * zt;
795 	}
796     }
797 
798     for (i=1; i<=order; i++) {
799 	wi = 1.0 - i /((double) order + 1);
800 	for (t=t1+i; t<=t2; t++) {
801 	    zt = x[t] - xbar;
802 	    s2 += 2 * wi * zt * (x[t-i] - xbar);
803 	}
804     }
805 
806     return s2 / n;
807 }
808 
809 /**
810  * gretl_covar:
811  * @t1: starting observation.
812  * @t2: ending observation.
813  * @x: data series.
814  * @y: data series.
815  * @missing: location to receive information on the number
816  * of missing observations that were skipped, or %NULL.
817  *
818  * Returns: the covariance of the series @x and @y from obs
819  * @t1 to obs @t2, skipping any missing values, or #NADBL
820  * on failure.
821  */
822 
gretl_covar(int t1,int t2,const double * x,const double * y,int * missing)823 double gretl_covar (int t1, int t2, const double *x, const double *y,
824 		    int *missing)
825 {
826     double sx, sy, sxy, xbar, ybar;
827     int t, nn, n = t2 - t1 + 1;
828 
829     if (n == 0) {
830 	/* void sample */
831 	return NADBL;
832     }
833 
834     nn = 0;
835     sx = sy = 0.0;
836 
837     for (t=t1; t<=t2; t++) {
838         if (!na(x[t]) && !na(y[t])) {
839 	    sx += x[t];
840 	    sy += y[t];
841 	    nn++;
842 	}
843     }
844 
845     if (nn < 2) {
846 	return NADBL;
847     }
848 
849     xbar = sx / nn;
850     ybar = sy / nn;
851     sxy = 0.0;
852 
853     for (t=t1; t<=t2; t++) {
854         if (!na(x[t]) && !na(y[t])) {
855 	    sx = x[t] - xbar;
856 	    sy = y[t] - ybar;
857 	    sxy = sxy + (sx * sy);
858 	}
859     }
860 
861     if (missing != NULL) {
862 	/* record number of missing obs */
863 	*missing = n - nn;
864     }
865 
866     return sxy / (nn - 1);
867 }
868 
869 /**
870  * gretl_corr:
871  * @t1: starting observation.
872  * @t2: ending observation.
873  * @x: data series.
874  * @y: data series.
875  * @missing: location to receive information on the number
876  * of missing observations that were skipped, or %NULL.
877  *
878  * Returns: the correlation coefficient for the series @x and @y
879  * from obs @t1 to obs @t2, skipping any missing values, or #NADBL
880  * on failure.
881  */
882 
gretl_corr(int t1,int t2,const double * x,const double * y,int * missing)883 double gretl_corr (int t1, int t2, const double *x, const double *y,
884 		   int *missing)
885 {
886     int t, nn, n = t2 - t1 + 1;
887     double sx, sy, sxx, syy, sxy, den, xbar, ybar;
888     double cval = 0.0;
889 
890     if (n == 0) {
891 	/* void sample */
892 	return NADBL;
893     }
894 
895     if (gretl_isconst(t1, t2, x) || gretl_isconst(t1, t2, y)) {
896 	/* correlation between any x and a constant is
897 	   undefined */
898 	return NADBL;
899     }
900 
901     nn = 0;
902     sx = sy = 0.0;
903 
904     for (t=t1; t<=t2; t++) {
905         if (!na(x[t]) && !na(y[t])) {
906   	    sx += x[t];
907 	    sy += y[t];
908 	    nn++;
909 	}
910     }
911 
912     if (nn < 2) {
913 	/* zero or 1 observations: no go! */
914 	return NADBL;
915     }
916 
917     xbar = sx / nn;
918     ybar = sy / nn;
919     sxx = syy = sxy = 0.0;
920 
921     for (t=t1; t<=t2; t++) {
922         if (!na(x[t]) && !na(y[t])) {
923 	    sx = x[t] - xbar;
924 	    sy = y[t] - ybar;
925 	    sxx += sx * sx;
926 	    syy += sy * sy;
927 	    sxy += sx * sy;
928 	}
929     }
930 
931     if (sxy != 0.0) {
932         den = sxx * syy;
933         if (den > 0.0) {
934 	    cval = sxy / sqrt(den);
935         } else {
936 	    cval = NADBL;
937 	}
938     }
939 
940     if (missing != NULL) {
941 	/* record number of missing observations */
942 	*missing = n - nn;
943     }
944 
945     return cval;
946 }
947 
948 /**
949  * gretl_corr_rsq:
950  * @t1: starting observation.
951  * @t2: ending observation.
952  * @x: data series.
953  * @y: data series.
954  *
955  * Returns: the square of the correlation coefficient for the series
956  * @x and @y from obs @t1 to obs @t2, skipping any missing values,
957  * or #NADBL on failure.  Used as alternative value for R^2 in a
958  * regression without an intercept.
959  */
960 
gretl_corr_rsq(int t1,int t2,const double * x,const double * y)961 double gretl_corr_rsq (int t1, int t2, const double *x, const double *y)
962 {
963     double r = gretl_corr(t1, t2, x, y, NULL);
964 
965     return (na(r))? r : r * r;
966 }
967 
968 /* we're supposing a variance smaller than this is just noise */
969 #define TINYVAR 1.0e-36
970 
gretl_skewness(int t1,int t2,const double * x)971 double gretl_skewness (int t1, int t2, const double *x)
972 {
973     double s, sd, xx, xbar, ret;
974     int t, n = 0;
975 
976     xbar = gretl_mean(t1, t2, x);
977     if (na(xbar)) {
978 	return NADBL;
979     }
980 
981     s = 0.0;
982 
983     for (t=t1; t<=t2; t++) {
984 	if (!na(x[t])) {
985 	    xx = x[t] - xbar;
986 	    s += xx * xx;
987 	    n++;
988 	}
989     }
990 
991     s /= n;
992 
993     if (s <= TINYVAR) {
994 	ret = NADBL;
995     } else {
996 	sd = sqrt(s);
997 	s = 0.0;
998 
999 	for (t=t1; t<=t2; t++) {
1000 	    if (!na(x[t])) {
1001 		xx = (x[t] - xbar) / sd;
1002 		s += xx * xx * xx;
1003 	    }
1004 	}
1005 
1006 	ret = s/n;
1007     }
1008 
1009     return ret;
1010 }
1011 
1012 /* note: the following computes EXCESS kurtosis */
1013 
gretl_kurtosis(int t1,int t2,const double * x)1014 double gretl_kurtosis (int t1, int t2, const double *x)
1015 {
1016     double s, sd, xx, xbar, ret;
1017     int t, n = 0;
1018 
1019     xbar = gretl_mean(t1, t2, x);
1020     if (na(xbar)) {
1021 	return NADBL;
1022     }
1023 
1024     s = 0.0;
1025 
1026     for (t=t1; t<=t2; t++) {
1027 	if (!na(x[t])) {
1028 	    xx = x[t] - xbar;
1029 	    s += xx * xx;
1030 	    n++;
1031 	}
1032     }
1033 
1034     s /= n;
1035 
1036     if (s <= TINYVAR) {
1037 	ret = NADBL;
1038     } else {
1039 	sd = sqrt(s);
1040 	s = 0.0;
1041 
1042 	for (t=t1; t<=t2; t++) {
1043 	    if (!na(x[t])) {
1044 		xx = (x[t] - xbar) / sd;
1045 		s += xx * xx * xx * xx;
1046 	    }
1047 	}
1048 
1049 	ret = s/n - 3.0;
1050     }
1051 
1052     return ret;
1053 }
1054 
1055 /**
1056  * gretl_moments:
1057  * @t1: starting observation.
1058  * @t2: ending observation.
1059  * @x: data series.
1060  * @wts: weights (may be NULL)
1061  * @xbar: pointer to receive mean.
1062  * @sd: pointer to receive standard deviation.
1063  * @skew: pointer to receive skewness.
1064  * @kurt: pointer to receive excess kurtosis.
1065  * @k: degrees of freedom loss (generally 1).
1066  *
1067  * Calculates sample moments for series @x from obs @t1 to obs
1068  * @t2.
1069  *
1070  * Returns: 0 on success, 1 on error.
1071  */
1072 
gretl_moments(int t1,int t2,const double * x,const double * wts,double * xbar,double * std,double * skew,double * kurt,int k)1073 int gretl_moments (int t1, int t2, const double *x,
1074 		   const double *wts,
1075 		   double *xbar, double *std,
1076 		   double *skew, double *kurt, int k)
1077 {
1078     int t, n;
1079     double dev, var;
1080     double s, s2, s3, s4;
1081     int allstats = 1;
1082     int weighted = (wts != NULL);
1083     double wt, wn = 0;
1084 
1085     if (skew == NULL && kurt == NULL) {
1086 	allstats = 0;
1087     }
1088 
1089     while (na(x[t1]) && t1 <= t2) {
1090 	/* skip missing values at start of sample */
1091 	t1++;
1092     }
1093 
1094     if (gretl_isconst(t1, t2, x)) {
1095 	/* no variation in x */
1096 	*xbar = x[t1];
1097 	*std = 0.0;
1098 	if (allstats) {
1099 	    *skew = *kurt = NADBL;
1100 	}
1101 	return 1;
1102     }
1103 
1104     /* get number and sum of valid observations */
1105     s = 0.0;
1106     n = 0;
1107     for (t=t1; t<=t2; t++) {
1108 	if (!na(x[t])) {
1109 	    if (weighted) {
1110 		wt = wts[t];
1111 		if (!na(wt) && wt != 0.0) {
1112 		    s += wt * x[t];
1113 		    wn += wt;
1114 		    n++;
1115 		}
1116 	    } else {
1117 		s += x[t];
1118 		n++;
1119 	    }
1120 	}
1121     }
1122 
1123     if (n == 0) {
1124 	/* no valid data */
1125 	*xbar = *std = NADBL;
1126 	if (allstats) {
1127 	    *skew = *kurt = 0.0;
1128 	}
1129 	return 1;
1130     }
1131 
1132     if (!weighted) {
1133 	wn = n;
1134     }
1135 
1136     /* calculate mean and initialize other stats */
1137     *xbar = s / wn;
1138     var = 0.0;
1139     if (allstats) {
1140 	*skew = *kurt = 0.0;
1141     }
1142 
1143     /* compute quantities needed for higher moments */
1144     s2 = s3 = s4 = 0.0;
1145     for (t=t1; t<=t2; t++) {
1146 	if (!na(x[t])) {
1147 	    dev = x[t] - *xbar;
1148 	    if (weighted) {
1149 		wt = wts[t];
1150 		if (!na(wt) && wt != 0.0) {
1151 		    s2 += wt * dev * dev;
1152 		    if (allstats) {
1153 			s3 += wt * pow(dev, 3);
1154 			s4 += wt * pow(dev, 4);
1155 		    }
1156 		}
1157 	    } else {
1158 		s2 += dev * dev;
1159 		if (allstats) {
1160 		    s3 += pow(dev, 3);
1161 		    s4 += pow(dev, 4);
1162 		}
1163 	    }
1164 	}
1165     }
1166 
1167 
1168     /* variance */
1169     if (weighted) {
1170 	/* as per Stata's -summary- with aweights */
1171 	var = n * s2 / (wn * (n-1));
1172     } else {
1173 	var = s2 / (wn - k);
1174     }
1175 
1176     if (var < 0.0) {
1177 	/* in principle impossible, but this is a computer */
1178 	*std = NADBL;
1179 	if (allstats) {
1180 	    *skew = *kurt = NADBL;
1181 	}
1182 	return 1;
1183     }
1184 
1185     if (var > TINYVAR) {
1186 	*std = sqrt(var);
1187     } else {
1188 	/* screen out minuscule variance */
1189 	*std = 0.0;
1190     }
1191 
1192     if (allstats) {
1193 	if (var > TINYVAR) {
1194 	    *skew = (s3 / wn) / pow(s2 / wn, 1.5);
1195 	    *kurt = ((s4 / wn) / pow(s2 / wn, 2)) - 3.0; /* excess kurtosis */
1196 	} else {
1197 	    /* if variance is effectively zero, these should be undef'd */
1198 	    *skew = *kurt = NADBL;
1199 	}
1200     }
1201 
1202     return 0;
1203 }
1204 
1205 /**
1206  * gretl_sorted_series:
1207  * @v: ID number of input series.
1208  * @dset: dataset struct.
1209  * @opt: may include OPT_M to flag an error in case
1210  * missing values are found.
1211  * @n: on input, the minimum acceptable number of
1212  * non-missing observations; on output, the number
1213  * of valid observations.
1214  * @err: location to receive error code.
1215  *
1216  * Returns: an array containing the valid values of the
1217  * input series over the sample range given in @dset,
1218  * sorted from smallest to largest, or NULL on error.
1219  * An error is flagged if the number of valid observations
1220  * is less than that given in @n on input, or if OPT_M
1221  * is given and the input contains missing values.
1222  */
1223 
gretl_sorted_series(int v,const DATASET * dset,gretlopt opt,int * n,int * err)1224 double *gretl_sorted_series (int v, const DATASET *dset,
1225 			     gretlopt opt, int *n,
1226 			     int *err)
1227 {
1228     double *y = NULL;
1229     int t, k = 0;
1230 
1231     if (*n < 1) {
1232 	*n = 1;
1233     }
1234 
1235     for (t=dset->t1; t<=dset->t2; t++) {
1236 	if (!na(dset->Z[v][t])) {
1237 	    k++;
1238 	} else if (opt & OPT_M) {
1239 	    *err = E_MISSDATA;
1240 	    return NULL;
1241 	}
1242     }
1243 
1244     if (k < *n) {
1245 	gretl_errmsg_set(_("Insufficient data"));
1246 	*err = E_DATA;
1247 	return NULL;
1248     }
1249 
1250     y = malloc(k * sizeof *y);
1251     if (y == NULL) {
1252 	*err = E_ALLOC;
1253 	return NULL;
1254     }
1255 
1256     *n = k;
1257 
1258     k = 0;
1259     for (t=dset->t1; t<=dset->t2; t++) {
1260 	if (!na(dset->Z[v][t])) {
1261 	    y[k++] = dset->Z[v][t];
1262 	}
1263     }
1264 
1265     qsort(y, k, sizeof *y, gretl_compare_doubles);
1266 
1267     return y;
1268 }
1269 
1270 /**
1271  * free_freq:
1272  * @freq: pointer to gretl frequency distribution struct
1273  *
1274  * Frees all resources associated with @freq, and the
1275  * pointer itself.
1276  */
1277 
free_freq(FreqDist * freq)1278 void free_freq (FreqDist *freq)
1279 {
1280     if (freq == NULL) {
1281 	return;
1282     }
1283 
1284     free(freq->midpt);
1285     free(freq->endpt);
1286     free(freq->f);
1287 
1288     if (freq->S != NULL) {
1289 	strings_array_free(freq->S, freq->numbins);
1290     }
1291 
1292     free(freq);
1293 }
1294 
freq_new(const char * vname)1295 static FreqDist *freq_new (const char *vname)
1296 {
1297     FreqDist *freq;
1298 
1299     freq = malloc(sizeof *freq);
1300     if (freq == NULL) return NULL;
1301 
1302     strcpy(freq->varname, vname);
1303 
1304     freq->midpt = NULL;
1305     freq->endpt = NULL;
1306     freq->S = NULL;
1307     freq->f = NULL;
1308 
1309     freq->dist = 0;
1310     freq->discrete = 0;
1311     freq->strvals = 0;
1312 
1313     freq->xbar = NADBL;
1314     freq->sdx = NADBL;
1315     freq->test = NADBL;
1316 
1317     return freq;
1318 }
1319 
1320 /*
1321  * dh_root_b1_to_z1:
1322  * @rb1: square root b1, skewness.
1323  * @n: number of observations.
1324  *
1325  * Performs the transformation from skewness, root b1, to the
1326  * normal score, z1, as set out in Doornik and Hansen, "An Omnibus
1327  * Test for Normality", 1994.  The transformation is originally
1328  * due to D'Agostino (Biometrika, 1970).
1329  *
1330  * Returns: the z1 value.
1331  */
1332 
dh_root_b1_to_z1(double rb1,double n)1333 static double dh_root_b1_to_z1 (double rb1, double n)
1334 {
1335     double b, w2, d, y, z1;
1336 
1337     b = 3.0 * (n*n + 27*n - 70) * (n+1) * (n+3) /
1338 	((n-2) * (n+5) * (n+7) * (n+9));
1339 
1340     w2 = -1.0 + sqrt(2 * (b-1));
1341     d = 1.0 / sqrt(log(sqrt(w2)));
1342     y = rb1 * sqrt(((w2-1.0)/2.0) * ((n+1.0)*(n+3.0))/(6.0*(n-2)));
1343     z1 = d * log(y + sqrt(y*y + 1));
1344 
1345     return z1;
1346 }
1347 
1348 /*
1349  * dh_b2_to_z2:
1350  * @b1: skewness.
1351  * @b2: kurtosis.
1352  * @n: number of observations.
1353  *
1354  * Performs the transformation from kurtosis, b2, to the
1355  * normal score, z2, as set out in Doornik and Hansen, "An Omnibus
1356  * Test for Normality", 1994.
1357  *
1358  * Returns: the z2 value, or #NADBL on failure.
1359  */
1360 
dh_b2_to_z2(double b1,double b2,double n)1361 static double dh_b2_to_z2 (double b1, double b2, double n)
1362 {
1363     double d, a, c, k, alpha, chi, z2;
1364     double n2 = n * n;
1365 
1366     d = (n-3) * (n+1) * (n2 + 15*n - 4.0);
1367     a = ((n-2) * (n+5) * (n+7) * (n2 + 27*n - 70.0)) / (6.0 * d);
1368     c = ((n-7) * (n+5) * (n+7) * (n2 + 2*n - 5.0)) / (6.0 * d);
1369     k = ((n+5) * (n+7) * (n2*n + 37*n2 + 11*n - 313.0)) / (12.0 * d);
1370 
1371     alpha = a + b1 * c;
1372     z2 =  (1.0 / (9.0 * alpha)) - 1.0;
1373     chi = (b2 - 1.0 - b1) * 2.0 * k;
1374 
1375     if (chi > 0.0) {
1376        z2 += pow(chi/(2*alpha), 1.0 / 3.0);
1377     }
1378 
1379     z2 *= sqrt(9.0*alpha);
1380 
1381     return z2;
1382 }
1383 
1384 /**
1385  * doornik_chisq:
1386  * @skew: skewness.
1387  * @xkurt: excess kurtosis.
1388  * @n: number of observations.
1389  *
1390  * Calculates the Chi-square test for normality as set out by
1391  * Doornik and Hansen, "An Omnibus Test for Normality", 1994.
1392  * This is a modified version of the test proposed by Bowman and
1393  * Shenton (Biometrika, 1975).
1394  *
1395  * Returns: the Chi-square value, which has 2 degrees of freedom.
1396  */
1397 
doornik_chisq(double skew,double xkurt,int n)1398 double doornik_chisq (double skew, double xkurt, int n)
1399 {
1400     double z1, z2, x2 = NADBL;
1401 
1402     z1 = dh_root_b1_to_z1(skew, (double) n);
1403     z2 = dh_b2_to_z2(skew * skew, xkurt + 3.0, (double) n);
1404 
1405     if (!na(z2)) {
1406 	x2 = z1*z1 + z2*z2;
1407     }
1408 
1409     return x2;
1410 }
1411 
1412 static int
series_get_moments(int t1,int t2,const double * x,double * skew,double * xkurt,int * pn)1413 series_get_moments (int t1, int t2, const double *x,
1414 		    double *skew, double *xkurt,
1415 		    int *pn)
1416 {
1417     double dev, s[4] = {0.0};
1418     int t, n = 0;
1419     int err = 0;
1420 
1421     for (t=t1; t<=t2; t++) {
1422 	if (!na(x[t])) {
1423 	    s[0] += x[t];
1424 	    n++;
1425 	}
1426     }
1427 
1428     *pn = n;
1429     if (n < 3) {
1430 	return E_DATA;
1431     }
1432 
1433     s[0] /= n;
1434 
1435     for (t=t1; t<=t2; t++) {
1436 	if (!na(x[t])) {
1437 	    dev = x[t] - s[0];
1438 	    s[1] += dev * dev;
1439 	    s[2] += pow(dev, 3);
1440 	    s[3] += pow(dev, 4);
1441 	}
1442     }
1443 
1444     s[1] /= n;
1445 
1446     if (s[1] > 0.0) {
1447 	*skew = (s[2] / n) / pow(s[1], 1.5);
1448 	*xkurt = ((s[3] / n) / pow(s[1], 2)) - 3.0;
1449     } else {
1450 	/* if variance is effectively zero, these should be undef'd */
1451 	*skew = *xkurt = NADBL;
1452 	err = 1;
1453     }
1454 
1455     return err;
1456 }
1457 
1458 static int
get_moments(const gretl_matrix * M,int row,double * skew,double * kurt)1459 get_moments (const gretl_matrix *M, int row, double *skew, double *kurt)
1460 {
1461     int j, n = gretl_matrix_cols(M);
1462     double dev, s[4] = {0.0};
1463     int err = 0;
1464 
1465     for (j=0; j<n; j++) {
1466 	s[0] += gretl_matrix_get(M, row, j);
1467     }
1468 
1469     s[0] /= n;
1470 
1471     for (j=0; j<n; j++) {
1472 	dev = gretl_matrix_get(M, row, j) - s[0];
1473 	s[1] += dev * dev;
1474 	s[2] += pow(dev, 3);
1475 	s[3] += pow(dev, 4);
1476     }
1477 
1478     s[1] /= n;
1479 
1480     if (s[1] > 0.0) {
1481 	*skew = (s[2] / n) / pow(s[1], 1.5);
1482 	*kurt = ((s[3] / n) / pow(s[1], 2));
1483     } else {
1484 	/* if variance is effectively zero, these should be undef'd */
1485 	*skew = *kurt = NADBL;
1486 	err = 1;
1487     }
1488 
1489     return err;
1490 }
1491 
1492 int
multivariate_normality_test(const gretl_matrix * E,const gretl_matrix * Sigma,gretlopt opt,PRN * prn)1493 multivariate_normality_test (const gretl_matrix *E,
1494 			     const gretl_matrix *Sigma,
1495 			     gretlopt opt, PRN *prn)
1496 {
1497     gretl_matrix *S = NULL;
1498     gretl_matrix *V = NULL;
1499     gretl_matrix *C = NULL;
1500     gretl_matrix *X = NULL;
1501     gretl_matrix *R = NULL;
1502     gretl_matrix *evals = NULL;
1503     gretl_matrix *tmp = NULL;
1504     /* convenience pointers: do not free! */
1505     gretl_matrix *H;
1506     gretl_vector *Z1;
1507     gretl_vector *Z2;
1508     double x, skew, kurt;
1509     double X2 = NADBL;
1510     int n, p;
1511     int i, j;
1512     int err = 0;
1513 
1514     if (E == NULL || Sigma == NULL) {
1515 	err = E_DATA;
1516 	goto bailout;
1517     }
1518 
1519     p = gretl_matrix_cols(E);
1520     n = gretl_matrix_rows(E);
1521 
1522     clear_gretl_matrix_err();
1523 
1524     S = gretl_matrix_copy(Sigma);
1525     V = gretl_vector_alloc(p);
1526     C = gretl_matrix_alloc(p, p);
1527     X = gretl_matrix_copy_transpose(E);
1528     R = gretl_matrix_alloc(p, n);
1529     tmp = gretl_matrix_alloc(p, p);
1530 
1531     err = get_gretl_matrix_err();
1532     if (err) {
1533 	goto bailout;
1534     }
1535 
1536     for (i=0; i<p; i++) {
1537 	x = gretl_matrix_get(S, i, i);
1538 	gretl_vector_set(V, i, 1.0 / sqrt(x));
1539     }
1540 
1541     err = gretl_matrix_diagonal_sandwich(V, S, C);
1542     if (err) {
1543 	goto bailout;
1544     }
1545 
1546     if (!(opt & OPT_Q)) {
1547 	pputc(prn, '\n');
1548 	gretl_matrix_print_to_prn(C, _("Residual correlation matrix, C"), prn);
1549     }
1550 
1551     evals = gretl_symmetric_matrix_eigenvals(C, 1, &err);
1552     if (err) {
1553 	goto bailout;
1554     }
1555 
1556     if (!(opt & OPT_Q)) {
1557 	pprintf(prn, "%s\n\n", _("Eigenvalues of C"));
1558 	for (i=0; i<p; i++) {
1559 	    pprintf(prn, " %10g\n", evals->val[i]);
1560 	}
1561 	pputc(prn, '\n');
1562     }
1563 
1564     /* C should now contain eigenvectors of the original C:
1565        relabel as 'H' for perspicuity */
1566     H = C;
1567 #if 0
1568     gretl_matrix_print_to_prn(H, "Eigenvectors, H", prn);
1569 #endif
1570     gretl_matrix_copy_values(tmp, H);
1571 
1572     /* make "tmp" into $H \Lambda^{-1/2}$ */
1573     for (i=0; i<p; i++) {
1574 	for (j=0; j<p; j++) {
1575 	    x = gretl_matrix_get(tmp, i, j);
1576 	    x *= 1.0 / sqrt(evals->val[j]);
1577 	    gretl_matrix_set(tmp, i, j, x);
1578 	}
1579     }
1580 
1581     /* make S into $H \Lambda^{-1/2} H'$ */
1582     gretl_matrix_multiply_mod(tmp, GRETL_MOD_NONE,
1583 			      H, GRETL_MOD_TRANSPOSE,
1584 			      S, GRETL_MOD_NONE);
1585 
1586 #if 1
1587     gretl_matrix_demean_by_row(X);
1588 #endif
1589 
1590     /* compute VX', in X (don't need to subtract means, because these
1591        are OLS residuals)
1592     */
1593     for (i=0; i<p; i++) {
1594 	for (j=0; j<n; j++) {
1595 	    x = gretl_matrix_get(X, i, j);
1596 	    x *= gretl_vector_get(V, i);
1597 	    gretl_matrix_set(X, i, j, x);
1598 	}
1599     }
1600 
1601     /* finally, compute $R' = H  \Lambda^{-1/2} H' V X'$
1602        Doornik and Hansen, 1994, section 3 */
1603     gretl_matrix_multiply(S, X, R);
1604 
1605     /* Z_1 and Z_2 are p-vectors: use existing storage */
1606     Z1 = V;
1607     Z2 = gretl_matrix_reuse(tmp, p, 1);
1608 
1609     for (i=0; i<p && !err; i++) {
1610 	get_moments(R, i, &skew, &kurt);
1611 	if (na(skew) || na(kurt)) {
1612 	    err = 1;
1613 	} else {
1614 	    double z1i = dh_root_b1_to_z1(skew, n);
1615 	    double z2i = dh_b2_to_z2(skew * skew, kurt, n);
1616 
1617 	    if (na(z2i)) {
1618 		X2 = NADBL;
1619 		err = E_NAN;
1620 	    } else {
1621 		gretl_vector_set(Z1, i, z1i);
1622 		gretl_vector_set(Z2, i, z2i);
1623 	    }
1624 	}
1625     }
1626 
1627     if (!err) {
1628 	X2 = gretl_vector_dot_product(Z1, Z1, &err);
1629 	X2 += gretl_vector_dot_product(Z2, Z2, &err);
1630     }
1631 
1632     if (na(X2)) {
1633 	pputs(prn, "Calculation of test statistic failed\n");
1634     } else {
1635 	/* print and record result */
1636 	double pv = chisq_cdf_comp(2 * p, X2);
1637 
1638 	if (!(opt & OPT_I)) {
1639 	    pputs(prn, _("Doornik-Hansen test"));
1640 	    pprintf(prn, "\n %s(%d) = %g [%.4f]\n\n", _("Chi-square"), 2 * p,
1641 		    X2, pv);
1642 	}
1643 	record_test_result(X2, pv);
1644     }
1645 
1646  bailout:
1647 
1648     gretl_matrix_free(S);
1649     gretl_matrix_free(V);
1650     gretl_matrix_free(C);
1651     gretl_matrix_free(X);
1652     gretl_matrix_free(R);
1653     gretl_matrix_free(evals);
1654     gretl_matrix_free(tmp);
1655 
1656     return err;
1657 }
1658 
freq_add_arrays(FreqDist * freq,int n)1659 static int freq_add_arrays (FreqDist *freq, int n)
1660 {
1661     int err = 0;
1662 
1663     if (!freq->discrete) {
1664 	freq->endpt = malloc((n + 1) * sizeof *freq->endpt);
1665 	if (freq->endpt == NULL) {
1666 	    err = E_ALLOC;
1667 	}
1668     }
1669 
1670     if (!err) {
1671 	if (freq->strvals) {
1672 	    freq->S = strings_array_new(n);
1673 	    if (freq->S == NULL) {
1674 		err = E_ALLOC;
1675 	    }
1676 	} else {
1677 	    freq->midpt = malloc(n * sizeof *freq->midpt);
1678 	    if (freq->midpt == NULL) {
1679 		err = E_ALLOC;
1680 	    }
1681 	}
1682     }
1683 
1684     if (!err) {
1685 	freq->f = malloc(n * sizeof *freq->f);
1686 	if (freq->f == NULL) {
1687 	    err = E_ALLOC;
1688 	}
1689     }
1690 
1691     if (!err) {
1692 	freq->numbins = n;
1693     }
1694 
1695     return err;
1696 }
1697 
freq_setup(int v,const DATASET * dset,int * pn,double * pxmax,double * pxmin,int * nbins,double * binwidth)1698 int freq_setup (int v, const DATASET *dset, int *pn,
1699 		double *pxmax, double *pxmin, int *nbins,
1700 		double *binwidth)
1701 {
1702     const double *x = dset->Z[v];
1703     double xrange, xmin = 0, xmax = 0;
1704     int t, k = 0, n = 0;
1705 
1706     for (t=dset->t1; t<=dset->t2; t++) {
1707 	if (!na(x[t])) {
1708 	    if (n == 0) {
1709 		xmax = xmin = x[t];
1710 	    } else {
1711 		if (x[t] > xmax) xmax = x[t];
1712 		if (x[t] < xmin) xmin = x[t];
1713 	    }
1714 	    n++;
1715 	}
1716     }
1717 
1718     if (n < 8) {
1719 	gretl_errmsg_sprintf(_("Insufficient data to build frequency "
1720 			       "distribution for variable %s"),
1721 			     dset->varname[v]);
1722 	return E_TOOFEW;
1723     }
1724 
1725     xrange = xmax - xmin;
1726     if (xrange == 0) {
1727 	gretl_errmsg_sprintf(_("%s is a constant"), dset->varname[v]);
1728 	return E_DATA;
1729     }
1730 
1731     if (*nbins > 0) {
1732 	k = *nbins;
1733     } else if (n < 16) {
1734 	k = 5;
1735     } else if (n < 50) {
1736 	k = 7;
1737     } else if (n > 850) {
1738 	k = 29;
1739     } else {
1740 	k = (int) sqrt((double) n);
1741 	if (k > 99) {
1742 	    k = 99;
1743 	}
1744     }
1745 
1746     if (k % 2 == 0) {
1747 	k++;
1748     }
1749 
1750     *pn = n;
1751     *pxmax = xmax;
1752     *pxmin = xmin;
1753     *nbins = k;
1754     *binwidth = xrange / (k - 1);
1755 
1756     return 0;
1757 }
1758 
1759 /* calculate test stat for distribution, if the sample
1760    is big enough */
1761 
1762 static void
freq_dist_stat(FreqDist * freq,const double * x,gretlopt opt,int k)1763 freq_dist_stat (FreqDist *freq, const double *x, gretlopt opt, int k)
1764 {
1765     double skew, kurt;
1766 
1767     freq->test = NADBL;
1768     freq->dist = 0;
1769 
1770     gretl_moments(freq->t1, freq->t2, x, NULL,
1771 		  &freq->xbar, &freq->sdx,
1772 		  &skew, &kurt, k);
1773 
1774     if (freq->n > 7) {
1775 	if (opt & OPT_O) {
1776 	    freq->test = lockes_test(x, freq->t1, freq->t2);
1777 	    freq->dist = D_GAMMA;
1778 	} else if (opt & OPT_Z) {
1779 	    freq->test = doornik_chisq(skew, kurt, freq->n);
1780 	    freq->dist = D_NORMAL;
1781 	}
1782     }
1783 }
1784 
1785 struct strval_sorter {
1786     char *s;
1787     int n;
1788 };
1789 
compare_strvals(const void * a,const void * b)1790 int compare_strvals (const void *a, const void *b)
1791 {
1792     const struct strval_sorter *sa = a;
1793     const struct strval_sorter *sb = b;
1794 
1795     return strcmp(sa->s, sb->s);
1796 }
1797 
get_string_freq(int v,const DATASET * dset,int * err)1798 FreqDist *get_string_freq (int v, const DATASET *dset,
1799 			   int *err)
1800 {
1801     FreqDist *freq;
1802     struct strval_sorter *ss;
1803     const double *x = dset->Z[v];
1804     series_table *st;
1805     char **S;
1806     int i, t, n, ns;
1807 
1808     freq = freq_new(dset->varname[v]);
1809     if (freq == NULL) {
1810 	*err = E_ALLOC;
1811 	return NULL;
1812     }
1813 
1814     st = series_get_string_table(dset, v);
1815     S = series_table_get_strings(st, &ns);
1816 
1817     ss = malloc(ns * sizeof *ss);
1818     if (ss == NULL) {
1819 	*err = E_ALLOC;
1820 	free(freq);
1821 	return NULL;
1822     }
1823 
1824     for (i=0; i<ns; i++) {
1825 	ss[i].s = S[i];
1826 	ss[i].n = 0;
1827     }
1828 
1829     n = 0;
1830     for (t=dset->t1; t<=dset->t2; t++) {
1831 	if (!na(x[t])) {
1832 	    i = x[t] - 1;
1833 	    ss[i].n += 1;
1834 	    n++;
1835 	}
1836     }
1837 
1838     qsort(ss, ns, sizeof *ss, compare_strvals);
1839 
1840     freq->t1 = dset->t1;
1841     freq->t2 = dset->t2;
1842     freq->n = n;
1843     freq->discrete = 1;
1844     freq->strvals = 1;
1845 
1846     if (freq_add_arrays(freq, ns)) {
1847 	*err = E_ALLOC;
1848     } else {
1849 	for (i=0; i<ns; i++) {
1850 	    freq->S[i] = gretl_strdup(ss[i].s);
1851 	    freq->f[i] = ss[i].n;
1852 	}
1853     }
1854 
1855     free(ss);
1856 
1857     if (*err && freq != NULL) {
1858 	free_freq(freq);
1859 	freq = NULL;
1860     }
1861 
1862     return freq;
1863 }
1864 
get_discrete_freq(int v,const DATASET * dset,gretlopt opt,int * err)1865 FreqDist *get_discrete_freq (int v, const DATASET *dset,
1866 			     gretlopt opt, int *err)
1867 {
1868     FreqDist *freq;
1869     const double *x = dset->Z[v];
1870     int *ifreq = NULL;
1871     double *ivals = NULL;
1872     double *sorted = NULL;
1873     double last;
1874     int i, t, nv;
1875 
1876     freq = freq_new(dset->varname[v]);
1877     if (freq == NULL) {
1878 	*err = E_ALLOC;
1879 	return NULL;
1880     }
1881 
1882     freq->t1 = dset->t1;
1883     freq->t2 = dset->t2;
1884 
1885     freq->n = 0;
1886     for (t=freq->t1; t<=freq->t2; t++) {
1887 	if (!na(x[t])) {
1888 	    freq->n += 1;
1889 	}
1890     }
1891 
1892     if (freq->n < 3) {
1893 	gretl_errmsg_sprintf(_("Insufficient data to build frequency "
1894 			       "distribution for variable %s"),
1895 			     dset->varname[v]);
1896 	*err = E_TOOFEW;
1897 	goto bailout;
1898     }
1899 
1900     freq->discrete = 1;
1901     freq->test = NADBL;
1902     freq->dist = 0;
1903 
1904     sorted = malloc(freq->n * sizeof *sorted);
1905     if (sorted == NULL) {
1906 	*err = E_ALLOC;
1907 	goto bailout;
1908     }
1909 
1910     i = 0;
1911     for (t=freq->t1; t<=freq->t2; t++) {
1912 	if (!na(x[t])) {
1913 	    sorted[i++] = x[t];
1914 	}
1915     }
1916 
1917     qsort(sorted, freq->n, sizeof *sorted, gretl_compare_doubles);
1918     nv = count_distinct_values(sorted, freq->n);
1919 
1920     if (nv >= 10 && !(opt & OPT_X)) {
1921 	freq_dist_stat(freq, x, opt, 1);
1922     } else if (opt & (OPT_Z | OPT_O)) {
1923 	freq->xbar = gretl_mean(freq->t1, freq->t2, x);
1924 	freq->sdx = gretl_stddev(freq->t1, freq->t2, x);
1925     }
1926 
1927     ifreq = malloc(nv * sizeof *ifreq);
1928     ivals = malloc(nv * sizeof *ivals);
1929     if (ifreq == NULL || ivals == NULL) {
1930 	*err = E_ALLOC;
1931 	goto bailout;
1932     }
1933 
1934     ivals[0] = last = sorted[0];
1935     ifreq[0] = i = 1;
1936 
1937     for (t=1; t<freq->n; t++) {
1938 	if (sorted[t] != last) {
1939 	    last = sorted[t];
1940 	    ifreq[i] = 1;
1941 	    ivals[i++] = last;
1942 	} else {
1943 	    ifreq[i-1] += 1;
1944 	}
1945     }
1946 
1947     if (freq_add_arrays(freq, nv)) {
1948 	*err = E_ALLOC;
1949     } else {
1950 	int allints = 1;
1951 
1952 	for (i=0; i<nv; i++) {
1953 	    if (allints && ivals[i] != floor(ivals[i])) {
1954 		allints = 0;
1955 	    }
1956 	    freq->midpt[i] = ivals[i];
1957 	    freq->f[i] = ifreq[i];
1958 	}
1959 
1960 	if (allints) {
1961 	    freq->discrete = 2;
1962 	}
1963     }
1964 
1965  bailout:
1966 
1967     free(sorted);
1968     free(ivals);
1969     free(ifreq);
1970 
1971     if (*err && freq != NULL) {
1972 	free_freq(freq);
1973 	freq = NULL;
1974     }
1975 
1976     return freq;
1977 }
1978 
1979 /**
1980  * get_freq:
1981  * @varno: ID number of variable to process.
1982  * @dset: dataset struct.
1983  * @fmin: lower limit of left-most bin (or #NADBL for automatic).
1984  * @fwid: bin width (or #NADBL for automatic).
1985  * @nbins: number of bins to use (or 0 for automatic).
1986  * @params: degrees of freedom loss (generally = 1 unless we're dealing
1987  * with the residual from a regression).
1988  * @opt: if includes %OPT_Z, set up for comparison with normal dist;
1989  * if includes %OPT_O, compare with gamma distribution;
1990  * if includes %OPT_S we're not printing results; if includes %OPT_D,
1991  * treat the variable as discrete; %OPT_X indicates that this function
1992  * is called as part of a cross-tabulation; if includes %OPT_G,
1993  * add the arrays that are needed for a plot even when we're not
1994  * printing (%OPT_S).
1995  * @err: location to receive error code.
1996  *
1997  * Calculates the frequency distribution for the specified variable.
1998  *
1999  * Returns: pointer to struct containing the distribution.
2000  */
2001 
get_freq(int varno,const DATASET * dset,double fmin,double fwid,int nbins,int params,gretlopt opt,int * err)2002 FreqDist *get_freq (int varno, const DATASET *dset,
2003 		    double fmin, double fwid,
2004 		    int nbins, int params,
2005 		    gretlopt opt, int *err)
2006 {
2007     FreqDist *freq;
2008     const double *x;
2009     double xx, xmin, xmax;
2010     double binwidth = fwid;
2011     int t, k, n;
2012 
2013     if (is_string_valued(dset, varno)) {
2014 	return get_string_freq(varno, dset, err);
2015     }
2016 
2017     if (series_is_discrete(dset, varno) || (opt & OPT_D)) {
2018 	return get_discrete_freq(varno, dset, opt, err);
2019     }
2020 
2021     if (gretl_isdiscrete(dset->t1, dset->t2, dset->Z[varno]) > 1) {
2022 	return get_discrete_freq(varno, dset, opt, err);
2023     }
2024 
2025     freq = freq_new(dset->varname[varno]);
2026     if (freq == NULL) {
2027 	*err = E_ALLOC;
2028 	return NULL;
2029     }
2030 
2031     *err = freq_setup(varno, dset, &n, &xmax, &xmin, &nbins, &binwidth);
2032 
2033     if (*err) {
2034 	goto bailout;
2035     }
2036 
2037     if (!na(fmin) && !na(fwid)) {
2038 	/* endogenous implied number of bins */
2039 	nbins = (int) ceil((xmax - fmin) / fwid);
2040 	if (nbins <= 0 || nbins > 5000) {
2041 	    *err = E_INVARG;
2042 	    goto bailout;
2043 	} else {
2044 	    binwidth = fwid;
2045 	}
2046     }
2047 
2048     freq->t1 = dset->t1;
2049     freq->t2 = dset->t2;
2050     freq->n = n;
2051 
2052     x = dset->Z[varno];
2053     freq_dist_stat(freq, x, opt, params);
2054 
2055     if (freq_add_arrays(freq, nbins)) {
2056 	*err = E_ALLOC;
2057 	goto bailout;
2058     }
2059 
2060     if (na(fmin) || na(fwid)) {
2061 	freq->endpt[0] = xmin - .5 * binwidth;
2062 
2063 	if (xmin > 0.0 && freq->endpt[0] < 0.0) {
2064 	    double rshift;
2065 
2066 	    freq->endpt[0] = 0.0;
2067 	    rshift = 1.0 - xmin / binwidth;
2068 	    freq->endpt[freq->numbins] = xmax + rshift * binwidth;
2069 	} else {
2070 	    freq->endpt[freq->numbins] = xmax + .5 * binwidth;
2071 	}
2072     } else {
2073 	freq->endpt[0] = fmin;
2074 	freq->endpt[freq->numbins] = fmin + nbins * fwid;
2075     }
2076 
2077     for (k=0; k<freq->numbins; k++) {
2078 	freq->f[k] = 0;
2079 	if (k > 0) {
2080 	    freq->endpt[k] = freq->endpt[k-1] + binwidth;
2081 	}
2082 	freq->midpt[k] = freq->endpt[k] + .5 * binwidth;
2083     }
2084 
2085     for (t=dset->t1; t<=dset->t2; t++) {
2086 	xx = x[t];
2087 	if (na(xx)) {
2088 	    continue;
2089 	}
2090 	if (xx < freq->endpt[1]) {
2091 	    freq->f[0] += 1;
2092 	} else if (xx >= freq->endpt[freq->numbins]) {
2093 	    freq->f[freq->numbins - 1] += 1;
2094 	    continue;
2095 	} else {
2096 	    for (k=1; k<freq->numbins; k++) {
2097 		if (freq->endpt[k] <= xx && xx < freq->endpt[k+1]) {
2098 		    freq->f[k] += 1;
2099 		    break;
2100 		}
2101 	    }
2102 	}
2103     }
2104 
2105  bailout:
2106 
2107     if (*err && freq != NULL) {
2108 	free_freq(freq);
2109 	freq = NULL;
2110     }
2111 
2112     return freq;
2113 }
2114 
record_freq_test(const FreqDist * freq)2115 static void record_freq_test (const FreqDist *freq)
2116 {
2117     double pval = NADBL;
2118 
2119     if (freq->dist == D_NORMAL) {
2120 	pval = chisq_cdf_comp(2, freq->test);
2121     } else if (freq->dist == D_GAMMA) {
2122 	pval = normal_pvalue_2(freq->test);
2123     }
2124 
2125     if (!na(pval)) {
2126 	record_test_result(freq->test, pval);
2127     }
2128 }
2129 
check_freq_opts(gretlopt opt,int * n_bins,double * fmin,double * fwid)2130 static int check_freq_opts (gretlopt opt, int *n_bins,
2131 			    double *fmin, double *fwid)
2132 {
2133     double x;
2134     int err = 0;
2135 
2136     if (opt & OPT_N) {
2137 	/* number of bins given */
2138 	if (opt & (OPT_M | OPT_W)) {
2139 	    /* can't give min, width spec */
2140 	    return E_BADOPT;
2141 	} else {
2142 	    int n = get_optval_int(FREQ, OPT_N, &err);
2143 
2144 	    if (!err) {
2145 		if (n < 2 || n > 10000) {
2146 		    err = E_INVARG;
2147 		} else {
2148 		    *n_bins = n;
2149 		}
2150 	    }
2151 	    if (err) {
2152 		return err;
2153 	    }
2154 	}
2155     }
2156 
2157     if (opt & OPT_M) {
2158 	/* minimum specified */
2159 	if (!(opt & OPT_W)) {
2160 	    /* but no width given */
2161 	    return E_ARGS;
2162 	} else {
2163 	    x = get_optval_double(FREQ, OPT_M, &err);
2164 	    if (err) {
2165 		return err;
2166 	    } else if (na(x)) {
2167 		return E_ARGS;
2168 	    } else {
2169 		*fmin = x;
2170 	    }
2171 	}
2172     }
2173 
2174     if (opt & OPT_W) {
2175 	/* width specified */
2176 	if (!(opt & OPT_M)) {
2177 	    /* but no min given */
2178 	    return E_ARGS;
2179 	} else {
2180 	    x = get_optval_double(FREQ, OPT_W, &err);
2181 	    if (err) {
2182 		return err;
2183 	    } else if (na(x)) {
2184 		return E_ARGS;
2185 	    } else if (x <= 0) {
2186 		return E_INVARG;
2187 	    } else {
2188 		*fwid = x;
2189 	    }
2190 	}
2191     }
2192 
2193     return 0;
2194 }
2195 
record_freq_matrix(FreqDist * fd)2196 static void record_freq_matrix (FreqDist *fd)
2197 {
2198     gretl_matrix *m = NULL;
2199     char **Sr = NULL;
2200     int i, n = fd->numbins;
2201 
2202     if (fd->S != NULL) {
2203 	m = gretl_matrix_alloc(n, 1);
2204 	Sr = strings_array_dup(fd->S, n);
2205     } else {
2206 	m = gretl_matrix_alloc(n, 2);
2207     }
2208 
2209     if (m != NULL) {
2210 	char **Sc = NULL;
2211 
2212 	if (fd->S == NULL) {
2213 	    Sc = strings_array_new(2);
2214 	    if (Sc != NULL) {
2215 		Sc[0] = gretl_strdup("midpoint");
2216 		Sc[1] = gretl_strdup("count");
2217 	    }
2218 	}
2219 	for (i=0; i<n; i++) {
2220 	    if (fd->S != NULL) {
2221 		gretl_matrix_set(m, i, 0, fd->f[i]);
2222 	    } else {
2223 		gretl_matrix_set(m, i, 0, fd->midpt[i]);
2224 		gretl_matrix_set(m, i, 1, fd->f[i]);
2225 	    }
2226 	}
2227 	if (Sc != NULL) {
2228 	    gretl_matrix_set_colnames(m, Sc);
2229 	}
2230 	if (Sr != NULL) {
2231 	    gretl_matrix_set_rownames(m, Sr);
2232 	}
2233 	set_last_result_data(m, GRETL_TYPE_MATRIX);
2234     }
2235 }
2236 
2237 /* Wrapper function: get the distribution, print it if
2238    wanted, graph it if wanted, then free stuff.
2239 */
2240 
freqdist(int varno,const DATASET * dset,gretlopt opt,PRN * prn)2241 int freqdist (int varno, const DATASET *dset,
2242 	      gretlopt opt, PRN *prn)
2243 {
2244     FreqDist *freq = NULL;
2245     DistCode dist = D_NONE;
2246     PlotType ptype = 0;
2247     double fmin = NADBL;
2248     double fwid = NADBL;
2249     int n_bins = 0;
2250     int do_graph = 0;
2251     int err;
2252 
2253     if (opt & OPT_O) {
2254 	dist = D_GAMMA;
2255 	ptype = PLOT_FREQ_GAMMA;
2256     } else if (opt & OPT_Z) {
2257 	dist = D_NORMAL;
2258 	ptype = PLOT_FREQ_NORMAL;
2259     } else {
2260 	ptype = PLOT_FREQ_SIMPLE;
2261     }
2262 
2263     if (!(opt & OPT_Q)) {
2264 	if (opt & OPT_G) {
2265 	    /* it's a legacy thing */
2266 	    opt |= OPT_U;
2267 	    opt &= ~OPT_G;
2268 	    set_optval_string(FREQ, OPT_U, "display");
2269 	}
2270 	do_graph = gnuplot_graph_wanted(ptype, opt);
2271     }
2272 
2273     err = check_freq_opts(opt, &n_bins, &fmin, &fwid);
2274 
2275     if (!err) {
2276 	gretlopt fopt = opt;
2277 
2278 	if (do_graph) {
2279 	    fopt |= OPT_G;
2280 	}
2281 	freq = get_freq(varno, dset, fmin, fwid, n_bins, 1, fopt, &err);
2282     }
2283 
2284     if (!err) {
2285 	if (!(opt & OPT_S)) {
2286 	    print_freq(freq, varno, dset, prn);
2287 	} else if (dist) {
2288 	    record_freq_test(freq);
2289 	}
2290 	record_freq_matrix(freq);
2291 
2292 	if (do_graph && freq->numbins < 2) {
2293 	    do_graph = 0;
2294 	}
2295 	if (do_graph) {
2296 	    int gerr = plot_freq(freq, dist, opt);
2297 
2298 	    if (gerr) {
2299 		pputs(prn, _("gnuplot command failed\n"));
2300 		do_graph = 0;
2301 	    }
2302 	}
2303 	free_freq(freq);
2304     }
2305 
2306     return err;
2307 }
2308 
2309 /* Wrapper function: get the frequency distribution and write it into
2310    a gretl_matrix: the first column holds the mid-points of the bins
2311    and the second holds the frequencies.
2312 */
2313 
freqdist_matrix(const double * x,int t1,int t2,int * err)2314 gretl_matrix *freqdist_matrix (const double *x, int t1, int t2, int *err)
2315 {
2316     DATASET *dset = NULL;
2317     FreqDist *freq = NULL;
2318     gretl_matrix *m = NULL;
2319     int i, t, T = t2 - t1 + 1;
2320 
2321     dset = create_auxiliary_dataset(1, T, 0);
2322     if (dset == NULL) {
2323 	*err = E_ALLOC;
2324     }
2325 
2326     if (!*err) {
2327 	i = 0;
2328 	for (t=t1; t<=t2; t++) {
2329 	    dset->Z[0][i++] = x[t];
2330 	}
2331 	freq = get_freq(0, dset, NADBL, NADBL, 0, 1, OPT_NONE, err);
2332     }
2333 
2334     if (!*err) {
2335 	m = gretl_matrix_alloc(freq->numbins, 2);
2336 	if (m == NULL) {
2337 	    *err = E_ALLOC;
2338 	}
2339     }
2340 
2341     if (!*err) {
2342 	for (i=0; i<freq->numbins; i++) {
2343 	    gretl_matrix_set(m, i, 0, freq->midpt[i]);
2344 	    gretl_matrix_set(m, i, 1, freq->f[i]);
2345 	}
2346     }
2347 
2348     destroy_dataset(dset);
2349     free_freq(freq);
2350 
2351     return m;
2352 }
2353 
xtab_to_matrix(const Xtab * tab)2354 gretl_matrix *xtab_to_matrix (const Xtab *tab)
2355 {
2356     gretl_matrix *m;
2357     double x;
2358     int i, j;
2359 
2360     if (tab == NULL) {
2361 	return NULL;
2362     }
2363 
2364     m = gretl_matrix_alloc(tab->rows, tab->cols);
2365     if (m == NULL) {
2366 	return NULL;
2367     }
2368 
2369     for (j=0; j<tab->cols; j++) {
2370 	for (i=0; i<tab->rows; i++) {
2371 	    x = (double) tab->f[i][j];
2372 	    gretl_matrix_set(m, i, j, x);
2373 	}
2374     }
2375 
2376     return m;
2377 }
2378 
2379 static void *last_result;
2380 static GretlType last_result_type;
2381 
get_last_result_data(GretlType * type,int * err)2382 void *get_last_result_data (GretlType *type, int *err)
2383 {
2384     void *ret = NULL;
2385 
2386     if (last_result == NULL) {
2387 	*type = GRETL_TYPE_NONE;
2388 	*err = E_BADSTAT;
2389     } else {
2390 	*type = last_result_type;
2391 	if (*type == GRETL_TYPE_MATRIX) {
2392 	    ret = gretl_matrix_copy(last_result);
2393 	} else {
2394 	    ret = gretl_bundle_copy(last_result, err);
2395 	}
2396     }
2397 
2398     return ret;
2399 }
2400 
set_last_result_data(void * data,GretlType type)2401 void set_last_result_data (void *data, GretlType type)
2402 {
2403     if (last_result != NULL) {
2404 	if (last_result_type == GRETL_TYPE_MATRIX) {
2405 	    gretl_matrix_free(last_result);
2406 	} else if (last_result_type == GRETL_TYPE_BUNDLE) {
2407 	    gretl_bundle_destroy(last_result);
2408 	}
2409     }
2410 
2411     last_result = data;
2412     last_result_type = type;
2413 }
2414 
last_result_cleanup(void)2415 void last_result_cleanup (void)
2416 {
2417     if (last_result != NULL) {
2418 	if (last_result_type == GRETL_TYPE_MATRIX) {
2419 	    gretl_matrix_free(last_result);
2420 	} else if (last_result_type == GRETL_TYPE_BUNDLE) {
2421 	    gretl_bundle_destroy(last_result);
2422 	}
2423     }
2424 
2425     last_result = NULL;
2426     last_result_type = 0;
2427 }
2428 
2429 /**
2430  * free_xtab:
2431  * @tab: pointer to gretl crosstab struct.
2432  *
2433  * Frees all resources associated with @tab, and the
2434  * pointer itself.
2435  */
2436 
free_xtab(Xtab * tab)2437 void free_xtab (Xtab *tab)
2438 {
2439     int i;
2440 
2441     if (tab == NULL) {
2442 	return;
2443     }
2444 
2445     free(tab->rtotal);
2446     free(tab->ctotal);
2447     free(tab->rval);
2448     free(tab->cval);
2449 
2450     if (tab->f != NULL) {
2451 	for (i=0; i<tab->rows; i++) {
2452 	    free(tab->f[i]);
2453 	}
2454 	free(tab->f);
2455     }
2456 
2457     if (tab->Sr != NULL) {
2458 	strings_array_free(tab->Sr, tab->rows);
2459     }
2460     if (tab->Sc != NULL) {
2461 	strings_array_free(tab->Sc, tab->cols);
2462     }
2463 
2464     free(tab);
2465 }
2466 
xtab_new(int n,int t1,int t2)2467 static Xtab *xtab_new (int n, int t1, int t2)
2468 {
2469     Xtab *tab = malloc(sizeof *tab);
2470 
2471     if (tab == NULL) return NULL;
2472 
2473     tab->rtotal = NULL;
2474     tab->ctotal = NULL;
2475     tab->rval = NULL;
2476     tab->cval = NULL;
2477     tab->f = NULL;
2478 
2479     tab->n = n;
2480     tab->t1 = t1;
2481     tab->t2 = t2;
2482     tab->missing = 0;
2483 
2484     *tab->rvarname = '\0';
2485     *tab->cvarname = '\0';
2486     tab->Sr = NULL;
2487     tab->Sc = NULL;
2488     tab->rstrs = 0;
2489     tab->cstrs = 0;
2490 
2491     return tab;
2492 }
2493 
2494 /* allocate the required arrays (apart from S, which must be
2495    handled separately) and initialize counters to zero
2496 */
2497 
xtab_allocate_arrays(Xtab * tab)2498 static int xtab_allocate_arrays (Xtab *tab)
2499 {
2500     int i, j, err = 0;
2501 
2502     if (!tab->rstrs && tab->rval == NULL) {
2503 	tab->rval = malloc(tab->rows * sizeof *tab->rval);
2504 	if (tab->rval == NULL) {
2505 	    err = E_ALLOC;
2506 	}
2507     }
2508 
2509     if (!err && !tab->cstrs && tab->cval == NULL) {
2510 	tab->cval = malloc(tab->cols * sizeof *tab->cval);
2511 	if (tab->cval == NULL) {
2512 	    err = E_ALLOC;
2513 	}
2514     }
2515 
2516     if (!err) {
2517 	tab->rtotal = malloc(tab->rows * sizeof *tab->rtotal);
2518 	tab->ctotal = malloc(tab->cols * sizeof *tab->ctotal);
2519 	if (tab->rtotal == NULL || tab->ctotal == NULL) {
2520 	    err = E_ALLOC;
2521 	} else {
2522 	    for (i=0; i<tab->rows; i++) {
2523 		tab->rtotal[i] = 0;
2524 	    }
2525 	    for (j=0; j<tab->cols; j++) {
2526 		tab->ctotal[j] = 0;
2527 	    }
2528 	}
2529     }
2530 
2531     if (!err) {
2532 	tab->f = malloc(tab->rows * sizeof *tab->f);
2533 	if (tab->f == NULL) {
2534 	    err = E_ALLOC;
2535 	} else {
2536 	    for (i=0; i<tab->rows; i++) {
2537 		tab->f[i] = NULL;
2538 	    }
2539 	}
2540     }
2541 
2542     for (i=0; i<tab->rows && !err; i++) {
2543 	tab->f[i] = malloc(tab->cols * sizeof *tab->f[i]);
2544 	if (tab->f[i] == NULL) {
2545 	    err = E_ALLOC;
2546 	} else {
2547 	    for (j=0; j<tab->cols; j++) {
2548 		tab->f[i][j] = 0;
2549 	    }
2550 	}
2551     }
2552 
2553     return err;
2554 }
2555 
2556 /* also used in gretl_matrix.c */
2557 
compare_xtab_rows(const void * a,const void * b)2558 int compare_xtab_rows (const void *a, const void *b)
2559 {
2560     const double **da = (const double **) a;
2561     const double **db = (const double **) b;
2562     int ret;
2563 
2564     ret = da[0][0] - db[0][0];
2565 
2566     if (ret == 0) {
2567 	ret = da[0][1] - db[0][1];
2568     }
2569 
2570     return ret;
2571 }
2572 
xtab_get_data(Xtab * tab,int v,int j,const DATASET * dset,series_table ** pst)2573 static int xtab_get_data (Xtab *tab, int v, int j,
2574 			  const DATASET *dset,
2575 			  series_table **pst)
2576 {
2577     double **xtarg = (j == 1)? &tab->cval : &tab->rval;
2578     char ***Starg = (j == 1)? &tab->Sc : &tab->Sr;
2579     int *itarg = (j == 1)? &tab->cols : &tab->rows;
2580     int *ttarg = (j == 1)? &tab->cstrs : &tab->rstrs;
2581     double *x = dset->Z[v] + dset->t1;
2582     int n = sample_size(dset);
2583     gretl_matrix *u;
2584     int err = 0;
2585 
2586     u = gretl_matrix_values(x, n, OPT_S, &err);
2587 
2588     if (!err && is_string_valued(dset, v)) {
2589 	series_table *st;
2590 	int ns, nv = u->rows;
2591 	char **S0, **S = NULL;
2592 
2593 	*pst = st = series_get_string_table(dset, v);
2594 	S0 = series_table_get_strings(st, &ns);
2595 
2596 	if (ns > nv) {
2597 	    int i, k;
2598 
2599 	    S = strings_array_new(nv);
2600 	    if (S != NULL) {
2601 		for (i=0; i<nv; i++) {
2602 		    k = u->val[i] - 1;
2603 		    S[i] = gretl_strdup(S0[k]);
2604 		}
2605 	    }
2606 	} else {
2607 	    S = strings_array_dup(S0, ns);
2608 	}
2609 	if (S == NULL) {
2610 	    err = E_ALLOC;
2611 	} else {
2612 	    *ttarg = 1;
2613 	    *itarg = nv;
2614 	    *Starg = S;
2615 	    strings_array_sort(Starg, &nv, OPT_NONE);
2616 	}
2617     } else if (!err) {
2618 	*itarg = u->rows;
2619 	*xtarg = u->val;
2620 	u->val = NULL;
2621     }
2622 
2623     gretl_matrix_free(u);
2624 
2625     return err;
2626 }
2627 
xtab_row_match(Xtab * tab,int i,const char * s,double x)2628 static int xtab_row_match (Xtab *tab, int i, const char *s, double x)
2629 {
2630     if (tab->rstrs) {
2631 	return strcmp(s, tab->Sr[i]) == 0;
2632     } else {
2633 	return x == tab->rval[i];
2634     }
2635 }
2636 
xtab_col_match(Xtab * tab,int j,const char * s,double x)2637 static int xtab_col_match (Xtab *tab, int j, const char *s, double x)
2638 {
2639     if (tab->cstrs) {
2640 	return strcmp(s, tab->Sc[j]) == 0;
2641     } else {
2642 	return x == tab->cval[j];
2643     }
2644 }
2645 
2646 #define complete_obs(x,y,t) (!na(x[t]) && !na(y[t]))
2647 
2648 /* crosstab struct creation functions */
2649 
get_new_xtab(int rv,int cv,const DATASET * dset,int * err)2650 static Xtab *get_new_xtab (int rv, int cv, const DATASET *dset,
2651 			   int *err)
2652 {
2653     series_table *sti = NULL;
2654     series_table *stj = NULL;
2655     const char *s1 = NULL;
2656     const char *s2 = NULL;
2657     double x1 = 0, x2 = 0;
2658     Xtab *tab = NULL;
2659     int imatch, jmatch;
2660     int i, j, t, n = 0;
2661 
2662     for (t=dset->t1; t<=dset->t2; t++) {
2663 	if (complete_obs(dset->Z[rv], dset->Z[cv], t)) {
2664 	    n++;
2665 	}
2666     }
2667 
2668     if (n == 0) {
2669 	*err = E_MISSDATA;
2670     } else {
2671 	tab = xtab_new(n, dset->t1, dset->t2);
2672 	if (tab == NULL) {
2673 	    *err = E_ALLOC;
2674 	}
2675     }
2676 
2677     if (!*err) {
2678 	/* assemble row data */
2679 	*err = xtab_get_data(tab, rv, 0, dset, &sti);
2680     }
2681     if (!*err) {
2682 	/* assemble column data */
2683 	*err = xtab_get_data(tab, cv, 1, dset, &stj);
2684     }
2685     if (!*err) {
2686 	*err = xtab_allocate_arrays(tab);
2687     }
2688 
2689     if (*err) goto bailout;
2690 
2691     tab->missing = (dset->t2 - dset->t1 + 1) - n;
2692     strcpy(tab->rvarname, dset->varname[rv]);
2693     strcpy(tab->cvarname, dset->varname[cv]);
2694 
2695     /* The following could be made more efficient by substituting
2696        sorted arrays for dset->Z[rv] and dset->Z[cv] but I'm not
2697        sure if the fixed cost would be recouped.
2698     */
2699 
2700     for (t=dset->t1; t<=dset->t2; t++) {
2701 	if (!complete_obs(dset->Z[rv], dset->Z[cv], t)) {
2702 	    continue;
2703 	}
2704 	x1 = dset->Z[rv][t];
2705 	if (tab->rstrs) {
2706 	    s1 = series_table_get_string(sti, x1);
2707 	}
2708 	x2 = dset->Z[cv][t];
2709 	if (tab->cstrs) {
2710 	    s2 = series_table_get_string(stj, x2);
2711 	}
2712 	for (i=0; i<tab->rows; i++) {
2713 	    imatch = xtab_row_match(tab, i, s1, x1);
2714 	    if (imatch) {
2715 		jmatch = 0;
2716 		for (j=0; j<tab->cols && !jmatch; j++) {
2717 		    jmatch = xtab_col_match(tab, j, s2, x2);
2718 		    if (jmatch) {
2719 			tab->f[i][j] += 1;
2720 			tab->rtotal[i] += 1;
2721 			tab->ctotal[j] += 1;
2722 		    }
2723 		}
2724 		break;
2725 	    }
2726 	}
2727     }
2728 
2729  bailout:
2730 
2731     if (*err) {
2732 	free_xtab(tab);
2733 	tab = NULL;
2734     }
2735 
2736     return tab;
2737 }
2738 
record_xtab(const Xtab * tab,const DATASET * dset,gretlopt opt)2739 static void record_xtab (const Xtab *tab, const DATASET *dset,
2740 			 gretlopt opt)
2741 {
2742     gretl_matrix *X = NULL;
2743     char **Sc = NULL, **Sr = NULL;
2744     double xij, cj, ri;
2745     int totals, rows, cols;
2746     int i, j;
2747 
2748     totals = (opt & OPT_N)? 0 : 1;
2749     rows = tab->rows + totals;
2750     cols = tab->cols + totals;
2751 
2752     X = gretl_zero_matrix_new(rows, cols);
2753     if (X == NULL) {
2754 	return;
2755     }
2756 
2757     /* column labels */
2758     Sc = strings_array_new(cols);
2759     if (Sc != NULL) {
2760 	for (j=0; j<tab->cols; j++) {
2761 	    if (tab->cstrs) {
2762 		Sc[j] = gretl_strdup(tab->Sc[j]);
2763 	    } else {
2764 		cj = tab->cval[j];
2765 		Sc[j] = gretl_strdup_printf("%4g", cj);
2766 	    }
2767 	}
2768 	if (totals) {
2769 	    Sc[cols-1] = gretl_strdup("TOTAL");
2770 	}
2771     }
2772 
2773     /* row labels */
2774     Sr = strings_array_new(rows);
2775     if (Sr != NULL) {
2776 	for (i=0; i<tab->rows; i++) {
2777 	    if (tab->rstrs) {
2778 		Sr[i] = gretl_strdup(tab->Sr[i]);
2779 	    } else {
2780 		ri = tab->rval[i];
2781 		Sr[i] = gretl_strdup_printf("%4g", ri);
2782 	    }
2783 	}
2784 	if (totals) {
2785 	    Sr[rows-1] = gretl_strdup("TOTAL");
2786 	}
2787     }
2788 
2789     /* body of table */
2790 
2791     for (i=0; i<tab->rows; i++) {
2792 	if (tab->rtotal[i] > 0) {
2793 	    /* row counts */
2794 	    for (j=0; j<tab->cols; j++) {
2795 		if (tab->ctotal[j] > 0) {
2796 		    if (opt & (OPT_C | OPT_R)) {
2797 			if (opt & OPT_C) {
2798 			    xij = 100.0 * tab->f[i][j] / tab->ctotal[j];
2799 			} else {
2800 			    xij = 100.0 * tab->f[i][j] / tab->rtotal[i];
2801 			}
2802 		    } else {
2803 			xij = tab->f[i][j];
2804 		    }
2805 		    gretl_matrix_set(X, i, j, xij);
2806 		}
2807 	    }
2808 	    if (totals) {
2809 		/* row totals */
2810 		if (opt & OPT_C) {
2811 		    xij = 100.0 * tab->rtotal[i] / tab->n;
2812 		} else {
2813 		    xij = tab->rtotal[i];
2814 		}
2815 		gretl_matrix_set(X, i, cols-1, xij);
2816 	    }
2817 	}
2818     }
2819 
2820     if (totals) {
2821 	/* column totals */
2822 	for (j=0; j<tab->cols; j++) {
2823 	    if (opt & OPT_R) {
2824 		xij = 100.0 * tab->ctotal[j] / tab->n;
2825 	    } else {
2826 		xij = tab->ctotal[j];
2827 	    }
2828 	    gretl_matrix_set(X, rows-1, j, xij);
2829 	}
2830 	gretl_matrix_set(X, rows-1, cols-1, tab->n);
2831     }
2832 
2833     gretl_matrix_set_colnames(X, Sc);
2834     gretl_matrix_set_rownames(X, Sr);
2835     set_last_result_data(X, GRETL_TYPE_MATRIX);
2836 }
2837 
2838 /* For use in the context of "xtab" with --quiet option:
2839    compute and record the Pearson chi-square value and its
2840    p-value.
2841 */
2842 
xtab_do_pearson(const Xtab * tab)2843 static int xtab_do_pearson (const Xtab *tab)
2844 {
2845     double x, y, ymin = 1.0e-7;
2846     double pearson = 0.0;
2847     int i, j, err = 0;
2848 
2849     for (i=0; i<tab->rows && !err; i++) {
2850 	if (tab->rtotal[i] > 0) {
2851 	    for (j=0; j<tab->cols && !err; j++) {
2852 		y = ((double) tab->rtotal[i] * tab->ctotal[j]) / tab->n;
2853 		if (y < ymin) {
2854 		    err = E_DATA;
2855 		} else {
2856 		    x = (double) tab->f[i][j] - y;
2857 		    pearson += x * x / y;
2858 		}
2859 	    }
2860 	}
2861     }
2862 
2863     if (!err) {
2864 	int df = (tab->rows - 1) * (tab->cols - 1);
2865 	double pval = chisq_cdf_comp(df, pearson);
2866 
2867 	if (!na(pval)) {
2868 	    record_test_result(pearson, pval);
2869 	} else {
2870 	    err = E_DATA;
2871 	}
2872     }
2873 
2874     if (err) {
2875 	record_test_result(NADBL, NADBL);
2876     }
2877 
2878     return err;
2879 }
2880 
crosstab_from_matrix(gretlopt opt,PRN * prn)2881 int crosstab_from_matrix (gretlopt opt, PRN *prn)
2882 {
2883     const char *mname;
2884     const gretl_matrix *m;
2885     Xtab *tab;
2886     int i, j, nvals, n = 0;
2887     double x;
2888     int err = 0;
2889 
2890     mname = get_optval_string(XTAB, OPT_X);
2891     if (mname == NULL) {
2892 	return E_DATA;
2893     }
2894 
2895     m = get_matrix_by_name(mname);
2896     if (m == NULL) {
2897 	return E_UNKVAR;
2898     }
2899 
2900     if (m->rows < 2 || m->cols < 2) {
2901 	err = E_DATA;
2902     }
2903 
2904     nvals = m->rows * m->cols;
2905 
2906     for (i=0; i<nvals && !err; i++) {
2907 	x = m->val[i];
2908 	if (x < 0 || x != floor(x) || x > INT_MAX) {
2909 	    err = E_DATA;
2910 	}
2911 	n += x;
2912     }
2913 
2914     if (err) {
2915 	gretl_errmsg_sprintf(_("Matrix %s does not represent a "
2916 			       "contingency table"), mname);
2917 	return err;
2918     }
2919 
2920     tab = xtab_new(n, 0, 0);
2921     if (tab == NULL) {
2922 	return E_ALLOC;
2923     }
2924 
2925     tab->rows = m->rows;
2926     tab->cols = m->cols;
2927 
2928     if (xtab_allocate_arrays(tab)) {
2929 	free_xtab(tab);
2930 	return E_ALLOC;
2931     }
2932 
2933     for (i=0; i<m->rows; i++) {
2934 	tab->rval[i] = i + 1;
2935 	tab->rtotal[i] = 0.0;
2936 	for (j=0; j<m->cols; j++) {
2937 	    tab->f[i][j] = gretl_matrix_get(m, i, j);
2938 	    tab->rtotal[i] += tab->f[i][j];
2939 	}
2940     }
2941 
2942     for (j=0; j<m->cols; j++) {
2943 	tab->cval[j] = j + 1;
2944 	tab->ctotal[j] = 0.0;
2945 	for (i=0; i<m->rows; i++) {
2946 	    tab->ctotal[j] += tab->f[i][j];
2947 	}
2948     }
2949 
2950     if (opt & OPT_Q) {
2951 	xtab_do_pearson(tab);
2952     } else {
2953 	print_xtab(tab, NULL, opt | OPT_S, prn);
2954     }
2955 
2956     free_xtab(tab);
2957 
2958     return err;
2959 }
2960 
crosstab(const int * list,const DATASET * dset,gretlopt opt,PRN * prn)2961 int crosstab (const int *list, const DATASET *dset,
2962 	      gretlopt opt, PRN *prn)
2963 {
2964     Xtab *tab;
2965     int *rowvar = NULL;
2966     int *colvar = NULL;
2967     int i, j, vi, vj, k;
2968     int pos, onelist;
2969     int nrv, ncv;
2970     int err = 0;
2971 
2972     pos = gretl_list_separator_position(list);
2973     onelist = (pos == 0);
2974 
2975     if (pos == 0) {
2976 	/* single list case */
2977 	nrv = list[0];
2978 	ncv = nrv - 1;
2979     } else {
2980 	/* double list case */
2981 	nrv = pos - 1;
2982 	ncv = list[0] - pos;
2983     }
2984 
2985     if (nrv == 0 || ncv == 0) {
2986 	return E_PARSE;
2987     }
2988 
2989     rowvar = gretl_list_new(nrv);
2990     if (rowvar == NULL) {
2991 	return E_ALLOC;
2992     }
2993 
2994     j = 1;
2995     for (i=1; i<=nrv; i++) {
2996 	k = list[i];
2997 	if (accept_as_discrete(dset, k, 0)) {
2998 	    rowvar[j++] = k;
2999 	} else {
3000 	    pprintf(prn, _("dropping %s: not a discrete variable\n"),
3001 		    dset->varname[k]);
3002 	    rowvar[0] -= 1;
3003 	}
3004     }
3005 
3006     if (rowvar[0] == 0 || (onelist && rowvar[0] == 1)) {
3007 	gretl_errmsg_set("xtab: variables must be discrete");
3008 	free(rowvar);
3009 	return E_TYPES;
3010     }
3011 
3012     if (onelist && rowvar[0] == 2) {
3013 	/* the bivariate case */
3014 	tab = get_new_xtab(rowvar[1], rowvar[2], dset, &err);
3015 	if (!err) {
3016 	    /* make $result matrix available */
3017 	    record_xtab(tab, dset, opt);
3018 	    if (opt & OPT_Q) {
3019 		/* quiet: run and record the Pearson test */
3020 		xtab_do_pearson(tab);
3021 	    } else {
3022 		/* print, and record Pearson test */
3023 		print_xtab(tab, dset, opt | OPT_S, prn);
3024 	    }
3025 	    free_xtab(tab);
3026 	}
3027 	goto finish;
3028     }
3029 
3030     if (!onelist) {
3031 	/* construct the second list */
3032 	colvar = gretl_list_new(ncv);
3033 	if (colvar == NULL) {
3034 	    err = E_ALLOC;
3035 	} else {
3036 	    j = 1;
3037 	    for (i=1; i<=ncv; i++) {
3038 		k = list[pos+i];
3039 		if (accept_as_discrete(dset, k, 0)) {
3040 		    colvar[j++] = k;
3041 		} else {
3042 		    colvar[0] -= 1;
3043 		}
3044 	    }
3045 	    if (colvar[0] == 0) {
3046 		err = E_TYPES;
3047 	    }
3048 	}
3049     }
3050 
3051     for (i=1; i<=rowvar[0] && !err; i++) {
3052 	vi = rowvar[i];
3053 	if (onelist) {
3054 	    /* single list case */
3055 	    for (j=1; j<i && !err; j++) {
3056 		vj = rowvar[j];
3057 		tab = get_new_xtab(vj, vi, dset, &err);
3058 		if (!err) {
3059 		    print_xtab(tab, dset, opt, prn);
3060 		    free_xtab(tab);
3061 		}
3062 	    }
3063 	} else {
3064 	    /* double list case */
3065 	    for (j=1; j<=colvar[0] && !err; j++) {
3066 		vj = colvar[j];
3067 		tab = get_new_xtab(vi, vj, dset, &err);
3068 		if (!err) {
3069 		    print_xtab(tab, dset, opt, prn);
3070 		    free_xtab(tab);
3071 		}
3072 	    }
3073 	}
3074     }
3075 
3076  finish:
3077 
3078     free(rowvar);
3079     free(colvar);
3080 
3081     return err;
3082 }
3083 
single_crosstab(const int * list,const DATASET * dset,gretlopt opt,PRN * prn,int * err)3084 Xtab *single_crosstab (const int *list, const DATASET *dset,
3085 		       gretlopt opt, PRN *prn, int *err)
3086 {
3087     Xtab *tab = NULL;
3088     int rv, cv;
3089 
3090     if (list[0] != 2) {
3091 	*err = E_DATA;
3092 	return NULL;
3093     }
3094 
3095     rv = list[1];
3096     cv = list[2];
3097 
3098     if (accept_as_discrete(dset, rv, 0) &&
3099 	accept_as_discrete(dset, cv, 0)) {
3100 	tab = get_new_xtab(rv, cv, dset, err);
3101     } else {
3102 	*err = E_TYPES;
3103     }
3104 
3105     if (!*err) {
3106 	print_xtab(tab, dset, opt, prn);
3107     }
3108 
3109     return tab;
3110 }
3111 
just_record_freq_test(const FreqDist * freq)3112 static int just_record_freq_test (const FreqDist *freq)
3113 {
3114     double pval = NADBL;
3115 
3116     if (freq->dist == D_NORMAL) {
3117 	pval = chisq_cdf_comp(2, freq->test);
3118     } else if (freq->dist == D_GAMMA) {
3119 	pval = normal_pvalue_2(freq->test);
3120     }
3121 
3122     if (na(pval)) {
3123 	return E_NAN;
3124     } else {
3125 	record_test_result(freq->test, pval);
3126 	return 0;
3127     }
3128 }
3129 
model_error_dist(const MODEL * pmod,DATASET * dset,gretlopt opt,PRN * prn)3130 int model_error_dist (const MODEL *pmod, DATASET *dset,
3131 		      gretlopt opt, PRN *prn)
3132 {
3133     FreqDist *freq = NULL;
3134     gretlopt fopt = OPT_Z; /* show normality test */
3135     int save_t1 = dset->t1;
3136     int save_t2 = dset->t2;
3137     int err = 0;
3138 
3139     if (pmod == NULL || pmod->uhat == NULL) {
3140 	return E_DATA;
3141     }
3142 
3143     err = gretl_model_get_normality_test(pmod, prn);
3144 
3145     if (!err) {
3146 	return 0;
3147     } else if (LIMDEP(pmod->ci)) {
3148 	return err;
3149     } else {
3150 	err = 0;
3151     }
3152 
3153     if (exact_fit_check(pmod, prn)) {
3154 	return 0;
3155     }
3156 
3157     if (genr_fit_resid(pmod, dset, M_UHAT)) {
3158 	return E_ALLOC;
3159     }
3160 
3161     if (!err) {
3162 	dset->t1 = pmod->t1;
3163 	dset->t2 = pmod->t2;
3164 	freq = get_freq(dset->v - 1, dset, NADBL, NADBL, 0,
3165 			pmod->ncoeff, fopt, &err);
3166     }
3167 
3168     if (!err) {
3169 	if (opt & OPT_I) {
3170 	    err = just_record_freq_test(freq);
3171 	} else if (opt & OPT_Q) {
3172 	    print_freq_test(freq, prn);
3173 	} else {
3174 	    print_freq(freq, 0, NULL, prn);
3175 	}
3176 	free_freq(freq);
3177     }
3178 
3179     dataset_drop_last_variables(dset, 1);
3180 
3181     dset->t1 = save_t1;
3182     dset->t2 = save_t2;
3183 
3184     return err;
3185 }
3186 
3187 /* PACF via Durbin-Levinson algorithm */
3188 
get_pacf(gretl_matrix * A)3189 static int get_pacf (gretl_matrix *A)
3190 {
3191     int m = gretl_matrix_rows(A);
3192     gretl_matrix *phi;
3193     double *acf = A->val;
3194     double *pacf = acf + m;
3195     double x, num, den;
3196     int i, j;
3197 
3198     phi = gretl_matrix_alloc(m, m);
3199     if (phi == NULL) {
3200 	return E_ALLOC;
3201     }
3202 
3203     gretl_matrix_set(A, 0, 1, acf[0]);
3204     gretl_matrix_set(phi, 0, 0, acf[0]);
3205 
3206     for (i=1; i<m; i++) {
3207 	num = acf[i];
3208 	for (j=0; j<i; j++) {
3209 	    num -= gretl_matrix_get(phi, i-1, j) * acf[i-j-1];
3210 	}
3211 	den = 1.0;
3212 	for (j=0; j<i; j++) {
3213 	    den -= gretl_matrix_get(phi, i-1, j) * acf[j];
3214 	}
3215 	pacf[i] = num / den;
3216 	gretl_matrix_set(phi, i, i, pacf[i]);
3217 	for (j=0; j<i; j++) {
3218 	    x = gretl_matrix_get(phi, i-1, j);
3219 	    x -= pacf[i] * gretl_matrix_get(phi, i-1, i-j-1);
3220 	    gretl_matrix_set(phi, i, j, x);
3221 	}
3222     }
3223 
3224     gretl_matrix_free(phi);
3225 
3226     return 0;
3227 }
3228 
3229 /* also used in GUI, library.c */
3230 
auto_acf_order(int T)3231 int auto_acf_order (int T)
3232 {
3233     int p = 10 * log10(T);
3234 
3235     if (p > T / 5) {
3236 	/* restrict to 20 percent of data (Tadeusz) */
3237 	p = T / 5;
3238     }
3239 
3240     return p;
3241 }
3242 
3243 /**
3244  * gretl_acf:
3245  * @k: lag order.
3246  * @t1: starting observation.
3247  * @t2: ending observation.
3248  * @y: data series.
3249  * @ybar: mean of @y over range @t1 to @t2.
3250  *
3251  * Returns: the autocorrelation at lag @k for the series @y over
3252  * the range @t1 to @t2, or #NADBL on failure.
3253  */
3254 
gretl_acf(int k,int t1,int t2,const double * y,double ybar)3255 static double gretl_acf (int k, int t1, int t2, const double *y,
3256 			 double ybar)
3257 {
3258     double z, num = 0, den = 0;
3259     int t, n = 0;
3260 
3261     for (t=t1; t<=t2; t++) {
3262 	if (na(y[t])) {
3263 	    return NADBL;
3264 	}
3265 	z = y[t] - ybar;
3266 	den += z * z;
3267 	if (t - k >= t1) {
3268 	    if (na(y[t-k])) {
3269 		return NADBL;
3270 	    }
3271 	    num += z * (y[t-k] - ybar);
3272 	    n++;
3273 	}
3274     }
3275 
3276     if (n == 0) {
3277 	return NADBL;
3278     }
3279 
3280     return num / den;
3281 }
3282 
3283 /**
3284  * ljung_box:
3285  * @m: maximum lag.
3286  * @t1: starting observation.
3287  * @t2: ending observation.
3288  * @y: data series.
3289  * @err: location to receive error code.
3290  *
3291  * Returns: the Ljung-Box statistic for lag order @m for
3292  * the series @y over the sample @t1 to @t2, or #NADBL
3293  * on failure.
3294  */
3295 
ljung_box(int m,int t1,int t2,const double * y,int * err)3296 double ljung_box (int m, int t1, int t2, const double *y, int *err)
3297 {
3298     double acf, ybar = 0.0, LB = 0.0;
3299     int k, n = t2 - t1 + 1;
3300 
3301     *err = 0;
3302 
3303     if (n == 0 || gretl_isconst(t1, t2, y)) {
3304 	*err = E_DATA;
3305     } else if (m <= 0) {
3306 	gretl_errmsg_sprintf(_("Invalid lag order %d"), m);
3307 	*err = E_DATA;
3308     } else {
3309 	ybar = gretl_mean(t1, t2, y);
3310 	if (na(ybar)) {
3311 	    *err = E_DATA;
3312 	}
3313     }
3314 
3315     if (*err) {
3316 	return NADBL;
3317     }
3318 
3319     /* calculate acf up to lag m, cumulating LB */
3320     for (k=1; k<=m; k++) {
3321 	acf = gretl_acf(k, t1, t2, y, ybar);
3322 	if (na(acf)) {
3323 	    *err = E_MISSDATA;
3324 	    break;
3325 	}
3326 	LB += acf * acf / (n - k);
3327     }
3328 
3329     if (*err) {
3330 	LB = NADBL;
3331     } else {
3332 	LB *= n * (n + 2.0);
3333     }
3334 
3335     return LB;
3336 }
3337 
3338 /**
3339  * gretl_xcf:
3340  * @k: lag order (or lead order if < 0).
3341  * @t1: starting observation.
3342  * @t2: ending observation.
3343  * @x: first data series.
3344  * @y: second data series.
3345  * @xbar: mean of first series.
3346  * @ybar: mean of second series.
3347  *
3348  * Returns: the cross-correlation at lag (or lead) @k for the
3349  * series @x and @y over the range @t1 to @t2, or #NADBL on failure.
3350  */
3351 
3352 static double
gretl_xcf(int k,int t1,int t2,const double * x,const double * y,double xbar,double ybar)3353 gretl_xcf (int k, int t1, int t2, const double *x, const double *y,
3354 	   double xbar, double ybar)
3355 {
3356     double num = 0, den1 = 0, den2 = 0;
3357     double zx, zy;
3358     int t;
3359 
3360     for (t=t1; t<=t2; t++) {
3361 	if (na(x[t]) || na(y[t])) {
3362 	    return NADBL;
3363 	}
3364 	zx = x[t] - xbar;
3365 	zy = y[t] - ybar;
3366 	den1 += zx * zx;
3367 	den2 += zy * zy;
3368 	if ((k >= 0 && t - k >= t1) || (k < 0 && t - k <= t2)) {
3369 	    num += zx * (y[t-k] - ybar);
3370 	}
3371     }
3372 
3373     return num / sqrt(den1 * den2);
3374 }
3375 
corrgm_ascii_plot(const char * vname,const gretl_matrix * A,PRN * prn)3376 static int corrgm_ascii_plot (const char *vname,
3377 			      const gretl_matrix *A,
3378 			      PRN *prn)
3379 {
3380     int k, m = gretl_matrix_rows(A);
3381     double *xk = malloc(m * sizeof *xk);
3382 
3383     if (xk == NULL) {
3384 	return E_ALLOC;
3385     }
3386 
3387     for (k=0; k<m; k++) {
3388 	xk[k] = k + 1.0;
3389     }
3390 
3391     pprintf(prn, "\n\n%s\n\n", _("Correlogram"));
3392     graphyx(A->val, xk, m, vname, _("lag"), prn);
3393 
3394     free(xk);
3395 
3396     return 0;
3397 }
3398 
handle_corrgm_plot_options(int ci,gretlopt opt,int * ascii,int * gp)3399 static void handle_corrgm_plot_options (int ci,
3400 					gretlopt opt,
3401 					int *ascii,
3402 					int *gp)
3403 {
3404     if (opt & OPT_Q) {
3405 	/* backward compat: --quiet = no plot */
3406 	*ascii = *gp = 0;
3407 	return;
3408     }
3409 
3410     if (opt & OPT_U) {
3411 	/* --plot=whatever */
3412 	const char *s = get_optval_string(ci, OPT_U);
3413 
3414 	if (s != NULL) {
3415 	    if (!strcmp(s, "none")) {
3416 		/* no plot */
3417 		*ascii = *gp = 0;
3418 		return;
3419 	    } else if (!strcmp(s, "ascii")) {
3420 		*ascii = 1;
3421 		*gp = 0;
3422 		return;
3423 	    } else {
3424 		/* implies use of gnuplot */
3425 		*gp = 1;
3426 		return;
3427 	    }
3428 	}
3429     }
3430 
3431     /* the defaults */
3432     if (gretl_in_batch_mode()) {
3433 	*ascii = 1;
3434     } else {
3435 	*gp = 1;
3436     }
3437 }
3438 
do_acf_bartlett(gretl_matrix * PM,int i,int T,double ssr,const double * z)3439 static void do_acf_bartlett (gretl_matrix *PM, int i,
3440 			     int T, double ssr,
3441 			     const double *z)
3442 {
3443     double b;
3444     int j;
3445 
3446     for (j=0; j<PM->cols; j++) {
3447 	b = z[j] * sqrt((1.0/T) * (1 + 2*ssr));
3448 	gretl_matrix_set(PM, i, j, b);
3449     }
3450 }
3451 
3452 /**
3453  * corrgram:
3454  * @varno: ID number of variable to process.
3455  * @order: integer order for autocorrelation function.
3456  * @nparam: number of estimated parameters (e.g. for the
3457  * case of ARMA), used to correct the degrees of freedom
3458  * for Q test.
3459  * @dset: dataset struct.
3460  * @opt: if includes OPT_R, variable in question is a model
3461  * residual generated "on the fly"; OPT_U can be used to
3462  * specify a plot option.
3463  * @prn: gretl printing struct.
3464  *
3465  * Computes the autocorrelation function and plots the correlogram for
3466  * the variable specified by @varno.
3467  *
3468  * Returns: 0 on successful completion, error code on error.
3469  */
3470 
corrgram(int varno,int order,int nparam,DATASET * dset,gretlopt opt,PRN * prn)3471 int corrgram (int varno, int order, int nparam, DATASET *dset,
3472 	      gretlopt opt, PRN *prn)
3473 {
3474     const double z[] = {1.65, 1.96, 2.58};
3475     gretl_matrix *PM = NULL;
3476     gretl_matrix *A = NULL;
3477     double ybar, ssr, lbox;
3478     double pval = NADBL;
3479     double pm[3];
3480     double *acf, *pacf;
3481     const char *vname;
3482     int i, k, m, T, dfQ;
3483     int ascii_plot = 0;
3484     int use_gnuplot = 0;
3485     int t1 = dset->t1, t2 = dset->t2;
3486     int err = 0, pacf_err = 0;
3487 
3488     gretl_error_clear();
3489 
3490     if (order < 0) {
3491 	gretl_errmsg_sprintf(_("Invalid lag order %d"), order);
3492 	return E_DATA;
3493     }
3494 
3495     err = series_adjust_sample(dset->Z[varno], &t1, &t2);
3496     if (err) {
3497 	return err;
3498     }
3499 
3500     if ((T = t2 - t1 + 1) < 4) {
3501 	return E_TOOFEW;
3502     }
3503 
3504     if (gretl_isconst(t1, t2, dset->Z[varno])) {
3505 	gretl_errmsg_sprintf(_("%s is a constant"), dset->varname[varno]);
3506 	return E_DATA;
3507     }
3508 
3509     ybar = gretl_mean(t1, t2, dset->Z[varno]);
3510     if (na(ybar)) {
3511 	return E_DATA;
3512     }
3513 
3514     vname = series_get_graph_name(dset, varno);
3515 
3516     /* lag order for acf */
3517     m = order;
3518     if (m == 0) {
3519 	m = auto_acf_order(T);
3520     } else if (m > T - dset->pd) {
3521 	int mmax = T - 1;
3522 
3523 	if (m > mmax) {
3524 	    m = mmax;
3525 	}
3526     }
3527 
3528     A = gretl_matrix_alloc(m, 2);
3529     if (A == NULL) {
3530 	err = E_ALLOC;
3531 	goto bailout;
3532     }
3533 
3534     if (opt & OPT_B) {
3535 	PM = gretl_matrix_alloc(m, 3);
3536 	if (PM == NULL) {
3537 	    err = E_ALLOC;
3538 	    goto bailout;
3539 	}
3540     }
3541 
3542     acf = A->val;
3543     pacf = acf + m;
3544 
3545     /* calculate acf up to order acf_m */
3546     for (k=1; k<=m; k++) {
3547 	acf[k-1] = gretl_acf(k, t1, t2, dset->Z[varno], ybar);
3548     }
3549 
3550     /* graphing? */
3551     handle_corrgm_plot_options(CORRGM, opt, &ascii_plot, &use_gnuplot);
3552 
3553     if (ascii_plot) {
3554 	corrgm_ascii_plot(vname, A, prn);
3555     }
3556 
3557     if (opt & OPT_R) {
3558 	pprintf(prn, "\n%s\n", _("Residual autocorrelation function"));
3559     } else {
3560 	pputc(prn, '\n');
3561 	pprintf(prn, _("Autocorrelation function for %s"), vname);
3562 	pputc(prn, '\n');
3563     }
3564 
3565     pputs(prn, _("***, **, * indicate significance at the 1%, 5%, 10% levels\n"));
3566 
3567     if (opt & OPT_B) {
3568 	pputs(prn, _("using Bartlett standard errors for ACF"));
3569     } else {
3570 	pprintf(prn, _("using standard error 1/T^%.1f"), 0.5);
3571     }
3572 
3573     pputs(prn, "\n\n");
3574 
3575     /* for confidence bands */
3576     for (i=0; i<3; i++) {
3577 	pm[i] = z[i] / sqrt((double) T);
3578 	if (pm[i] > 0.5) {
3579 	    pm[i] = 0.5;
3580 	}
3581     }
3582 
3583     /* generate (and if not in batch mode) plot partial
3584        autocorrelation function */
3585 
3586     err = pacf_err = get_pacf(A);
3587 
3588     pputs(prn, _("  LAG      ACF          PACF         Q-stat. [p-value]"));
3589     pputs(prn, "\n\n");
3590 
3591     lbox = ssr = 0.0;
3592     dfQ = 1;
3593 
3594     for (k=0; k<m; k++) {
3595 	double pm0, pm1, pm2;
3596 
3597 	if (na(acf[k])) {
3598 	    pprintf(prn, "%5d\n", k + 1);
3599 	    continue;
3600 	}
3601 
3602 	/* ACF */
3603 	pprintf(prn, "%5d%9.4f ", k + 1, acf[k]);
3604 	if (PM != NULL) {
3605 	    if (k > 0 && !na(acf[k-1])) {
3606 		ssr += acf[k-1] * acf[k-1];
3607 	    }
3608 	    do_acf_bartlett(PM, k, T, ssr, z);
3609 	    pm0 = gretl_matrix_get(PM, k, 0);
3610 	    pm1 = gretl_matrix_get(PM, k, 1);
3611 	    pm2 = gretl_matrix_get(PM, k, 2);
3612 	} else {
3613 	    pm0 = pm[0];
3614 	    pm1 = pm[1];
3615 	    pm2 = pm[2];
3616 	}
3617 	if (fabs(acf[k]) > pm2) {
3618 	    pputs(prn, " ***");
3619 	} else if (fabs(acf[k]) > pm1) {
3620 	    pputs(prn, " ** ");
3621 	} else if (fabs(acf[k]) > pm0) {
3622 	    pputs(prn, " *  ");
3623 	} else {
3624 	    pputs(prn, "    ");
3625 	}
3626 
3627 	if (na(pacf[k])) {
3628 	    bufspace(13, prn);
3629 	} else {
3630 	    /* PACF */
3631 	    pprintf(prn, "%9.4f", pacf[k]);
3632 	    if (fabs(pacf[k]) > pm[2]) {
3633 		pputs(prn, " ***");
3634 	    } else if (fabs(pacf[k]) > pm[1]) {
3635 		pputs(prn, " ** ");
3636 	    } else if (fabs(pacf[k]) > pm[0]) {
3637 		pputs(prn, " *  ");
3638 	    } else {
3639 		pputs(prn, "    ");
3640 	    }
3641 	}
3642 
3643 	/* Ljung-Box Q */
3644 	lbox += (T * (T + 2.0)) * acf[k] * acf[k] / (T - (k + 1));
3645 
3646 	if (k >= nparam) {
3647 	    /* i.e., if the real df is > 0 */
3648 	    pprintf(prn, "%12.4f", lbox);
3649 	    pval = chisq_cdf_comp(dfQ++, lbox);
3650 	    pprintf(prn, "  [%5.3f]", pval);
3651 	}
3652 
3653 	pputc(prn, '\n');
3654     }
3655     pputc(prn, '\n');
3656 
3657     if (lbox > 0 && !na(pval)) {
3658 	record_test_result(lbox, pval);
3659     }
3660 
3661     if (use_gnuplot) {
3662 	err = correlogram_plot(vname, acf, (pacf_err)? NULL : pacf,
3663 			       PM, m, pm[1], opt);
3664     }
3665 
3666  bailout:
3667 
3668     gretl_matrix_free(A);
3669     gretl_matrix_free(PM);
3670 
3671     return err;
3672 }
3673 
3674 /**
3675  * acf_matrix:
3676  * @x: series to analyse.
3677  * @order: maximum lag for autocorrelation function.
3678  * @dset: information on the data set, or %NULL.
3679  * @n: length of series (required if @dset is %NULL).
3680  * @err: location to receive error code.
3681  *
3682  * Computes the autocorrelation function for series @x with
3683  * maximum lag @order.
3684  *
3685  * Returns: two-column matrix containing the values of the
3686  * ACF and PACF at the successive lags, or %NULL on error.
3687  */
3688 
acf_matrix(const double * x,int order,const DATASET * dset,int n,int * err)3689 gretl_matrix *acf_matrix (const double *x, int order,
3690 			  const DATASET *dset, int n,
3691 			  int *err)
3692 {
3693     gretl_matrix *A = NULL;
3694     double xbar;
3695     int m, k, t, T;
3696     int t1, t2;
3697 
3698     if (dset != NULL) {
3699 	t1 = dset->t1;
3700 	t2 = dset->t2;
3701 
3702 	while (na(x[t1])) t1++;
3703 	while (na(x[t2])) t2--;
3704 
3705 	T = t2 - t1 + 1;
3706     } else {
3707 	t1 = 0;
3708 	t2 = n - 1;
3709 	T = n;
3710     }
3711 
3712     if (T < 4) {
3713 	*err = E_TOOFEW;
3714 	return NULL;
3715     }
3716 
3717     for (t=t1; t<=t2; t++) {
3718 	if (na(x[t])) {
3719 	    *err = E_MISSDATA;
3720 	    return NULL;
3721 	}
3722     }
3723 
3724     if (gretl_isconst(t1, t2, x)) {
3725 	gretl_errmsg_set(_("Argument is a constant"));
3726 	*err = E_DATA;
3727 	return NULL;
3728     }
3729 
3730     xbar = gretl_mean(t1, t2, x);
3731     if (na(xbar)) {
3732 	*err = E_DATA;
3733 	return NULL;
3734     }
3735 
3736     m = order;
3737 
3738     if (dset == NULL) {
3739 	if (m < 1 || m > T) {
3740 	    *err = E_DATA;
3741 	    return NULL;
3742 	}
3743     } else {
3744 	if (m == 0) {
3745 	    m = auto_acf_order(T);
3746 	} else if (m > T - dset->pd) {
3747 	    int mmax = T - 1;
3748 
3749 	    if (m > mmax) {
3750 		m = mmax;
3751 	    }
3752 	}
3753     }
3754 
3755     A = gretl_matrix_alloc(m, 2);
3756 
3757     if (A == NULL) {
3758 	*err = E_ALLOC;
3759 	return NULL;
3760     }
3761 
3762     /* calculate ACF up to order m */
3763     for (k=0; k<m && !*err; k++) {
3764 	A->val[k] = gretl_acf(k+1, t1, t2, x, xbar);
3765 	if (na(A->val[k])) {
3766 	    *err = E_DATA;
3767 	}
3768     }
3769 
3770     /* add PACF */
3771     if (!*err) {
3772 	*err = get_pacf(A);
3773     }
3774 
3775     if (*err) {
3776 	gretl_matrix_free(A);
3777 	A = NULL;
3778     }
3779 
3780     return A;
3781 }
3782 
xcorrgm_graph(const char * xname,const char * yname,double * xcf,int m,double * pm,int allpos)3783 static int xcorrgm_graph (const char *xname, const char *yname,
3784 			  double *xcf, int m, double *pm,
3785 			  int allpos)
3786 {
3787     char crit_string[16];
3788     gchar *title;
3789     FILE *fp;
3790     int k, err = 0;
3791 
3792     fp = open_plot_input_file(PLOT_XCORRELOGRAM, 0, &err);
3793     if (err) {
3794 	return err;
3795     }
3796 
3797     sprintf(crit_string, "%.2f/T^%.1f", 1.96, 0.5);
3798 
3799     gretl_push_c_numeric_locale();
3800 
3801     fputs("set xzeroaxis\n", fp);
3802     fputs("set yzeroaxis\n", fp);
3803     print_keypos_string(GP_KEY_RIGHT_TOP, fp);
3804     fprintf(fp, "set xlabel '%s'\n", _("lag"));
3805     if (allpos) {
3806 	fputs("set yrange [-0.1:1.1]\n", fp);
3807     } else {
3808 	fputs("set yrange [-1.1:1.1]\n", fp);
3809     }
3810     title = g_strdup_printf(_("Correlations of %s and lagged %s"),
3811 			    xname, yname);
3812     fprintf(fp, "set title '%s'\n", title);
3813     g_free(title);
3814     fprintf(fp, "set xrange [%d:%d]\n", -(m + 1), m + 1);
3815     if (allpos) {
3816 	fprintf(fp, "plot \\\n"
3817 		"'-' using 1:2 notitle w impulses lw 5, \\\n"
3818 		"%g title '%s' lt 2\n", pm[1], crit_string);
3819     } else {
3820 	fprintf(fp, "plot \\\n"
3821 		"'-' using 1:2 notitle w impulses lw 5, \\\n"
3822 		"%g title '+- %s' lt 2, \\\n"
3823 		"%g notitle lt 2\n", pm[1], crit_string, -pm[1]);
3824     }
3825 
3826     for (k=-m; k<=m; k++) {
3827 	fprintf(fp, "%d %g\n", k, xcf[k+m]);
3828     }
3829     fputs("e\n", fp);
3830 
3831     gretl_pop_c_numeric_locale();
3832 
3833     return finalize_plot_input_file(fp);
3834 }
3835 
xcorrgm_ascii_plot(double * xcf,int xcf_m,PRN * prn)3836 static int xcorrgm_ascii_plot (double *xcf, int xcf_m, PRN *prn)
3837 {
3838     double *xk = malloc((xcf_m * 2 + 1) * sizeof *xk);
3839     int k;
3840 
3841     if (xk == NULL) {
3842 	return E_ALLOC;
3843     }
3844 
3845     for (k=-xcf_m; k<=xcf_m; k++) {
3846 	xk[k+xcf_m] = k;
3847     }
3848 
3849     pprintf(prn, "\n\n%s\n\n", _("Cross-correlogram"));
3850     graphyx(xcf, xk, 2 * xcf_m + 1, "", _("lag"), prn);
3851 
3852     free(xk);
3853 
3854     return 0;
3855 }
3856 
3857 /* We assume here that all data issues have already been
3858    assessed (lag length, missing values etc.) and we just
3859    get on with the job.
3860 */
3861 
real_xcf_vec(const double * x,const double * y,int p,int T,int * err)3862 static gretl_matrix *real_xcf_vec (const double *x, const double *y,
3863 				   int p, int T, int *err)
3864 {
3865     gretl_matrix *xcf;
3866     double xbar, ybar;
3867     int i;
3868 
3869     xbar = gretl_mean(0, T-1, x);
3870     if (na(xbar)) {
3871 	*err = E_DATA;
3872 	return NULL;
3873     }
3874 
3875     ybar = gretl_mean(0, T-1, y);
3876     if (na(ybar)) {
3877 	*err = E_DATA;
3878 	return NULL;
3879     }
3880 
3881     xcf = gretl_column_vector_alloc(p * 2 + 1);
3882     if (xcf == NULL) {
3883 	*err = E_ALLOC;
3884 	return NULL;
3885     }
3886 
3887     for (i=-p; i<=p; i++) {
3888 	xcf->val[i+p] = gretl_xcf(i, 0, T - 1, x, y, xbar, ybar);
3889     }
3890 
3891     return xcf;
3892 }
3893 
3894 /* for arrays @x and @y, check that there are no missing values
3895    and that neither series has a constant value
3896 */
3897 
xcf_data_check(const double * x,const double * y,int T,int * badvar)3898 static int xcf_data_check (const double *x, const double *y, int T,
3899 			   int *badvar)
3900 {
3901     int xconst = 1, yconst = 1;
3902     int t;
3903 
3904     if (T < 5) {
3905 	return E_TOOFEW;
3906     }
3907 
3908     for (t=0; t<T; t++) {
3909 	if (na(x[t]) || na(y[t])) {
3910 	    return E_MISSDATA;
3911 	}
3912 	if (t > 0 && x[t] != x[0]) {
3913 	    xconst = 0;
3914 	}
3915 	if (t > 0 && y[t] != y[0]) {
3916 	    yconst = 0;
3917 	}
3918     }
3919 
3920     if (xconst) {
3921 	*badvar = 1;
3922 	return E_DATA;
3923     } else if (yconst) {
3924 	*badvar = 2;
3925 	return E_DATA;
3926     }
3927 
3928     return 0;
3929 }
3930 
3931 /**
3932  * xcf_vec:
3933  * @x: first series.
3934  * @y: second series.
3935  * @p: maximum lag for cross-correlation function.
3936  * @dset: information on the data set, or NULL.
3937  * @n: length of series (required only if @dset is NULL).
3938  * @err: location to receive error code.
3939  *
3940  * Computes the cross-correlation function for series @x with
3941  * series @y up to maximum lag @order.
3942  *
3943  * Returns: column vector containing the values of the
3944  * cross-correlation function, or NULL on error.
3945  */
3946 
xcf_vec(const double * x,const double * y,int p,const DATASET * dset,int n,int * err)3947 gretl_matrix *xcf_vec (const double *x, const double *y,
3948 		       int p, const DATASET *dset,
3949 		       int n, int *err)
3950 {
3951     gretl_matrix *xcf = NULL;
3952     int t1, t2;
3953     int T, badvar = 0;
3954 
3955     if (p <= 0) {
3956 	*err = E_DATA;
3957 	return NULL;
3958     }
3959 
3960     if (dset != NULL) {
3961 	int yt1, yt2;
3962 
3963 	t1 = yt1 = dset->t1;
3964 	t2 = yt2 = dset->t2;
3965 
3966 	while (na(x[t1])) t1++;
3967 	while (na(y[yt1])) yt1++;
3968 
3969 	while (na(x[t2])) t2--;
3970 	while (na(y[yt2])) yt2--;
3971 
3972 	t1 = (yt1 > t1)? yt1 : t1;
3973 	t2 = (yt2 < t2)? yt2 : t2;
3974 
3975 	T = t2 - t1 + 1;
3976     } else {
3977 	t1 = 0;
3978 	t2 = n - 1;
3979 	T = n;
3980     }
3981 
3982 #if 0
3983     fprintf(stderr, "t1=%d, t2=%d, T=%d\n", t1, t2, T);
3984     fprintf(stderr, "x[t1]=%g, y[t1]=%g\n\n", x[t1], y[t1]);
3985 #endif
3986 
3987     if (dset != NULL) {
3988 	if (2 * p > T - dset->pd) { /* ?? */
3989 	    *err = E_DATA;
3990 	}
3991     } else if (2 * p > T) {
3992 	*err = E_DATA;
3993     }
3994 
3995     if (!*err) {
3996 	*err = xcf_data_check(x + t1, y + t1, T, &badvar);
3997 	if (badvar) {
3998 	    gretl_errmsg_sprintf(_("Argument %d is a constant"),
3999 				 badvar);
4000 	}
4001     }
4002 
4003     if (!*err) {
4004 	xcf = real_xcf_vec(x + t1, y + t1, p, T, err);
4005     }
4006 
4007     return xcf;
4008 }
4009 
4010 /**
4011  * xcorrgram:
4012  * @list: should contain ID numbers of two variables.
4013  * @order: integer order for autocorrelation function.
4014  * @dset: dataset struct.
4015  * @opt: may include OPT_U for plot options.
4016  * @prn: gretl printing struct.
4017  *
4018  * Computes the cross-correlation function and plots the
4019  * cross-correlogram for the specified variables.
4020  *
4021  * Returns: 0 on successful completion, error code on error.
4022  */
4023 
xcorrgram(const int * list,int order,DATASET * dset,gretlopt opt,PRN * prn)4024 int xcorrgram (const int *list, int order, DATASET *dset,
4025 	       gretlopt opt, PRN *prn)
4026 {
4027     gretl_matrix *xcf = NULL;
4028     double pm[3];
4029     const char *xname, *yname;
4030     const double *x, *y;
4031     int t1 = dset->t1, t2 = dset->t2;
4032     int ascii_plot = 0;
4033     int use_gnuplot = 0;
4034     int k, p, badvar = 0;
4035     int T, err = 0;
4036 
4037     gretl_error_clear();
4038 
4039     if (order < 0) {
4040 	gretl_errmsg_sprintf(_("Invalid lag order %d"), order);
4041 	return E_DATA;
4042     }
4043 
4044     if (list[0] != 2) {
4045 	return E_DATA;
4046     }
4047 
4048     x = dset->Z[list[1]];
4049     y = dset->Z[list[2]];
4050 
4051     err = list_adjust_sample(list, &t1, &t2, dset, NULL);
4052 
4053     if (!err) {
4054 	T = t2 - t1 + 1;
4055 	err = xcf_data_check(x + t1, y + t1, T, &badvar);
4056     }
4057 
4058     if (err) {
4059 	if (badvar) {
4060 	    gretl_errmsg_sprintf(_("%s is a constant"),
4061 				 dset->varname[list[badvar]]);
4062 	}
4063 	return err;
4064     }
4065 
4066     xname = dset->varname[list[1]];
4067     yname = dset->varname[list[2]];
4068 
4069     p = order;
4070     if (p == 0) {
4071 	p = auto_acf_order(T) / 2;
4072     } else if (2 * p > T - dset->pd) {
4073 	p = (T - 1) / 2; /* ?? */
4074     }
4075 
4076     xcf = real_xcf_vec(x + t1, y + t1, p, T, &err);
4077     if (err) {
4078 	return err;
4079     }
4080 
4081     /* graphing? */
4082     handle_corrgm_plot_options(XCORRGM, opt, &ascii_plot, &use_gnuplot);
4083 
4084     if (ascii_plot) {
4085 	xcorrgm_ascii_plot(xcf->val, p, prn);
4086     }
4087 
4088     /* for confidence bands */
4089     pm[0] = 1.65 / sqrt((double) T);
4090     pm[1] = 1.96 / sqrt((double) T);
4091     pm[2] = 2.58 / sqrt((double) T);
4092 
4093     pputc(prn, '\n');
4094     pprintf(prn, _("Cross-correlation function for %s and %s"),
4095 	    xname, yname);
4096     pputs(prn, "\n\n");
4097     pputs(prn, _("  LAG      XCF"));
4098     pputs(prn, "\n\n");
4099 
4100     for (k=-p; k<=p; k++) {
4101 	double x = xcf->val[k + p];
4102 
4103 	pprintf(prn, "%5d%9.4f", k, x);
4104 	if (fabs(x) > pm[2]) {
4105 	    pputs(prn, " ***");
4106 	} else if (fabs(x) > pm[1]) {
4107 	    pputs(prn, " **");
4108 	} else if (fabs(x) > pm[0]) {
4109 	    pputs(prn, " *");
4110 	}
4111 	pputc(prn, '\n');
4112     }
4113     pputc(prn, '\n');
4114 
4115     if (use_gnuplot) {
4116 	int allpos = 1;
4117 
4118 	for (k=-p; k<=p; k++) {
4119 	    if (xcf->val[k+p] < 0) {
4120 		allpos = 0;
4121 		break;
4122 	    }
4123 	}
4124 	err = xcorrgm_graph(xname, yname, xcf->val, p, pm, allpos);
4125     }
4126 
4127     gretl_matrix_free(xcf);
4128 
4129     return err;
4130 }
4131 
4132 struct fractint_test {
4133     double d;    /* estimated degree of integration */
4134     double se;   /* standard error of the above */
4135 };
4136 
fract_int_GPH(int T,int m,const double * xvec,struct fractint_test * ft)4137 static int fract_int_GPH (int T, int m, const double *xvec,
4138 			  struct fractint_test *ft)
4139 {
4140     gretl_matrix *y = NULL;
4141     gretl_matrix *X = NULL;
4142     gretl_matrix *b = NULL;
4143     gretl_matrix *V = NULL;
4144     double x, w;
4145     int t, err = 0;
4146 
4147     ft->d = ft->se = NADBL;
4148 
4149     y = gretl_column_vector_alloc(m);
4150     X = gretl_unit_matrix_new(m, 2);
4151     b = gretl_column_vector_alloc(2);
4152     V = gretl_matrix_alloc(2, 2);
4153 
4154     if (y == NULL || X == NULL || b == NULL || V == NULL) {
4155 	err = E_ALLOC;
4156 	goto bailout;
4157     }
4158 
4159     /* Test from Geweke and Porter-Hudak, as set out in
4160        Greene, Econometric Analysis 4e, p. 787 */
4161 
4162     for (t=0; t<m; t++) {
4163 	y->val[t] = log(xvec[t+1]);
4164 	w = M_2PI * (t + 1) / (double) T;
4165 	x = sin(w / 2);
4166 	gretl_matrix_set(X, t, 1, log(4 * x * x));
4167     }
4168 
4169     err = gretl_matrix_ols(y, X, b, V, NULL, &x);
4170 
4171     if (!err) {
4172 	ft->d = -b->val[1];
4173 	ft->se = sqrt(gretl_matrix_get(V, 1, 1));
4174     }
4175 
4176  bailout:
4177 
4178     gretl_matrix_free(y);
4179     gretl_matrix_free(X);
4180     gretl_matrix_free(b);
4181     gretl_matrix_free(V);
4182 
4183     return err;
4184 }
4185 
gretl_matrix_pergm(const gretl_matrix * x,int m,int * err)4186 static gretl_matrix *gretl_matrix_pergm (const gretl_matrix *x, int m,
4187 					 int *err)
4188 {
4189     gretl_matrix *p = NULL;
4190     gretl_matrix *f = NULL;
4191 
4192     f = gretl_matrix_fft(x, 0, err);
4193     if (*err) {
4194 	return NULL;
4195     }
4196 
4197     p = gretl_column_vector_alloc(m);
4198 
4199     if (p == NULL) {
4200 	*err = E_ALLOC;
4201     } else {
4202 	int T = gretl_vector_get_length(x);
4203 	double re, im, scale = M_2PI * T;
4204 	int i;
4205 
4206 	for (i=0; i<m; i++) {
4207 	    re = gretl_matrix_get(f, i+1, 0);
4208 	    im = gretl_matrix_get(f, i+1, 1);
4209 	    p->val[i] = (re*re + im*im) / scale;
4210 	}
4211     }
4212 
4213     gretl_matrix_free(f);
4214 
4215     return p;
4216 }
4217 
4218 struct LWE_helper {
4219     gretl_matrix *lambda;
4220     gretl_matrix *lpow;
4221     gretl_matrix *I1;
4222     gretl_matrix *I2;
4223     double lcm;
4224 };
4225 
LWE_free(struct LWE_helper * L)4226 static void LWE_free (struct LWE_helper *L)
4227 {
4228     gretl_matrix_free(L->lambda);
4229     gretl_matrix_free(L->lpow);
4230     gretl_matrix_free(L->I1);
4231     gretl_matrix_free(L->I2);
4232 }
4233 
LWE_obj_func(struct LWE_helper * L,double d)4234 static double LWE_obj_func (struct LWE_helper *L, double d)
4235 {
4236     double dd = 2.0 * d;
4237     int i;
4238 
4239     gretl_matrix_copy_values(L->lpow, L->lambda);
4240     gretl_matrix_raise(L->lpow, dd);
4241 
4242     for (i=0; i<L->I1->rows; i++) {
4243 	L->I2->val[i] = L->I1->val[i] * L->lpow->val[i];
4244     }
4245 
4246     return -(log(gretl_vector_mean(L->I2)) - dd * L->lcm);
4247 }
4248 
LWE_lambda(const gretl_matrix * I1,int n)4249 static gretl_matrix *LWE_lambda (const gretl_matrix *I1, int n)
4250 {
4251     gretl_matrix *lambda;
4252     int i, m = gretl_vector_get_length(I1);
4253 
4254     lambda = gretl_column_vector_alloc(m);
4255 
4256     if (lambda != NULL) {
4257 	for (i=0; i<m; i++) {
4258 	    gretl_vector_set(lambda, i, (M_2PI / n) * (i + 1));
4259 	}
4260     }
4261 
4262     return lambda;
4263 }
4264 
4265 static int
LWE_init(struct LWE_helper * L,const gretl_matrix * X,int m)4266 LWE_init (struct LWE_helper *L, const gretl_matrix *X, int m)
4267 {
4268     int i, err = 0;
4269 
4270     L->I2 = L->lpow = NULL;
4271 
4272     L->I1 = gretl_matrix_pergm(X, m, &err);
4273     if (err) {
4274 	return err;
4275     }
4276 
4277     L->lambda = LWE_lambda(L->I1, X->rows);
4278     if (L->lambda == NULL) {
4279 	gretl_matrix_free(L->I1);
4280 	return E_ALLOC;
4281     }
4282 
4283     L->lpow = gretl_matrix_copy(L->lambda);
4284     L->I2 = gretl_matrix_copy(L->I1);
4285 
4286     if (L->lpow == NULL || L->I2 == NULL) {
4287 	err = E_ALLOC;
4288 	LWE_free(L);
4289     } else {
4290 	L->lcm = 0.0;
4291 	for (i=0; i<m; i++) {
4292 	    L->lcm += log(L->lambda->val[i]);
4293 	}
4294 	L->lcm /= m;
4295     }
4296 
4297     return err;
4298 }
4299 
4300 #define LWE_MAXITER 100
4301 
LWE_calc(const gretl_matrix * X,int m,int * err)4302 static double LWE_calc (const gretl_matrix *X, int m, int *err)
4303 {
4304     struct LWE_helper L = {0};
4305     double d = 0, dd = 1.0;
4306     double eps = 1.0e-05;
4307     double f, incr, incl, deriv, h;
4308     int iter = 0;
4309 
4310     *err = LWE_init(&L, X, m);
4311     if (*err) {
4312 	return NADBL;
4313     }
4314 
4315     while (fabs(dd) > 1.0e-06 && iter++ < LWE_MAXITER) {
4316 	f = LWE_obj_func(&L, d);
4317 	incr = LWE_obj_func(&L, d + eps) / eps;
4318 	incl = LWE_obj_func(&L, d - eps) / eps;
4319 
4320 	deriv = (incr - incl) / 2.0;
4321 	h = (0.5 * (incr + incl) - f / eps) / eps;
4322 	dd = (h < 0)? (-deriv / h) : deriv;
4323 
4324 	if (fabs(dd) > 1) {
4325 	    dd = (dd > 0)? 1 : -1;
4326 	}
4327 
4328 	d += 0.5 * dd;
4329     }
4330 
4331     if (*err) {
4332 	d = NADBL;
4333     } else if (iter == LWE_MAXITER) {
4334 	fprintf(stderr, "LWE: max iterations reached\n");
4335 	d = NADBL;
4336     }
4337 
4338     LWE_free(&L);
4339 
4340     return d;
4341 }
4342 
auto_spectrum_order(int T,gretlopt opt)4343 int auto_spectrum_order (int T, gretlopt opt)
4344 {
4345     int m;
4346 
4347     if (opt & OPT_O) {
4348 	/* Bartlett */
4349 	m = (int) 2.0 * sqrt((double) T);
4350     } else {
4351 	/* fractional integration test */
4352 	double m1 = floor((double) T / 2.0);
4353 	double m2 = floor(pow((double) T, 0.6));
4354 
4355 	m = (m1 < m2)? m1 : m2;
4356 	m--;
4357     }
4358 
4359     return m;
4360 }
4361 
fract_int_LWE(const double * x,int m,int t1,int t2,struct fractint_test * ft)4362 static int fract_int_LWE (const double *x, int m, int t1, int t2,
4363 			  struct fractint_test *ft)
4364 {
4365     gretl_matrix *X;
4366     int err = 0;
4367 
4368     X = gretl_vector_from_series(x, t1, t2);
4369 
4370     if (X == NULL) {
4371 	err = E_ALLOC;
4372     } else {
4373 	ft->d = LWE_calc(X, m, &err);
4374 	if (!err) {
4375 	    ft->se = 1.0 / (2 * sqrt((double) m));
4376 	}
4377 	gretl_matrix_free(X);
4378     }
4379 
4380     return err;
4381 }
4382 
pergm_do_plot(gretlopt opt)4383 static int pergm_do_plot (gretlopt opt)
4384 {
4385     if (opt & OPT_U) {
4386 	/* --plot=whatever */
4387 	const char *s = get_optval_string(PERGM, OPT_U);
4388 
4389 	if (s != NULL) {
4390 	    if (!strcmp(s, "none")) {
4391 		return 0;
4392 	    } else {
4393 		/* implies use of gnuplot */
4394 		return 1;
4395 	    }
4396 	}
4397     }
4398 
4399     /* the default */
4400     return !gretl_in_batch_mode();
4401 }
4402 
pergm_print(const char * vname,const double * d,int T,int L,gretlopt opt,PRN * prn)4403 static void pergm_print (const char *vname, const double *d,
4404 			 int T, int L, gretlopt opt, PRN *prn)
4405 {
4406     char xstr[32];
4407     double dt, yt;
4408     int t;
4409 
4410     if (vname == NULL) {
4411 	pprintf(prn, "\n%s\n", _("Residual periodogram"));
4412     } else {
4413 	pprintf(prn, _("\nPeriodogram for %s\n"), vname);
4414     }
4415 
4416     pprintf(prn, _("Number of observations = %d\n"), T);
4417     if (opt & OPT_O) {
4418 	pprintf(prn, _("Using Bartlett lag window, length %d\n\n"), L);
4419     } else {
4420 	pputc(prn, '\n');
4421     }
4422 
4423     if (opt & OPT_L) {
4424 	pputs(prn, _(" omega  scaled frequency  periods  log spectral density\n\n"));
4425     } else {
4426 	pputs(prn, _(" omega  scaled frequency  periods  spectral density\n\n"));
4427     }
4428 
4429     for (t=1; t<=T/2; t++) {
4430 	dt = (opt & OPT_L)? log(d[t]) : d[t];
4431 	if (opt & OPT_R) {
4432 	    yt = 2 * M_PI * (double) t / T;
4433 	    pprintf(prn, " %.5f%8d%16.2f", yt, t, (double) T / t);
4434 	} else if (opt & OPT_D) {
4435 	    yt = 360 * t / (double) T;
4436 	    pprintf(prn, " %7.3f%8d%16.2f", yt, t, (double) T / t);
4437 	} else {
4438 	    yt = M_2PI * t / (double) T;
4439 	    pprintf(prn, " %.5f%8d%16.2f", yt, t, (double) T / t);
4440 	}
4441 	dt = (opt & OPT_L)? log(d[t]) : d[t];
4442 	sprintf(xstr, "%#.5g", dt);
4443 	gretl_fix_exponent(xstr);
4444 	pprintf(prn, "%16s\n", xstr);
4445     }
4446 
4447     pputc(prn, '\n');
4448 
4449     if (pergm_do_plot(opt)) {
4450 	periodogram_plot(vname, T, L, d, opt);
4451     }
4452 }
4453 
finalize_fractint(const double * x,const double * dens,int t1,int t2,int width,const char * vname,gretlopt opt,PRN * prn)4454 static int finalize_fractint (const double *x,
4455 			      const double *dens,
4456 			      int t1, int t2, int width,
4457 			      const char *vname,
4458 			      gretlopt opt,
4459 			      PRN *prn)
4460 {
4461     struct fractint_test ft;
4462     gretl_matrix *result = NULL;
4463     int do_GPH = opt & (OPT_G | OPT_A);
4464     int do_LWE = !(opt & OPT_G);
4465     int do_print = !(opt & OPT_Q);
4466     int T = t2 - t1 + 1;
4467     int m, err = 0;
4468 
4469     /* order for test */
4470     if (width <= 0) {
4471 	m = auto_spectrum_order(T, OPT_NONE);
4472     } else {
4473 	m = (width > T / 2)? T / 2 : width;
4474     }
4475 
4476     if (do_print && vname != NULL) {
4477 	pprintf(prn, "\n%s, T = %d\n\n", vname, T);
4478     }
4479 
4480     if (do_GPH && do_LWE) {
4481 	result = gretl_matrix_alloc(2, 2);
4482     } else {
4483 	result = gretl_matrix_alloc(1, 2);
4484     }
4485 
4486     if (do_LWE) {
4487 	/* do Local Whittle if wanted */
4488 	err = fract_int_LWE(x, m, t1, t2, &ft);
4489 	if (!err) {
4490 	    double z = ft.d / ft.se;
4491 	    double pv = normal_pvalue_2(z);
4492 
4493 	    record_test_result(z, pv);
4494 	    gretl_matrix_set(result, 0, 0, ft.d);
4495 	    gretl_matrix_set(result, 0, 1, ft.se);
4496 
4497 	    if (do_print) {
4498 		pprintf(prn, "%s (m = %d)\n"
4499 			"  %s = %g (%g)\n"
4500 			"  %s: z = %g, %s %.4f\n\n",
4501 			_("Local Whittle Estimator"), m,
4502 			_("Estimated degree of integration"), ft.d, ft.se,
4503 			_("test statistic"), z,
4504 			_("with p-value"), pv);
4505 	    }
4506 	}
4507     }
4508 
4509     if (!err && do_GPH) {
4510 	/* do GPH if wanted (options --all or --gph) */
4511 	int row = do_LWE ? 1 : 0;
4512 
4513 	err = fract_int_GPH(T, m, dens, &ft);
4514 	gretl_matrix_set(result, row, 0, ft.d);
4515 	gretl_matrix_set(result, row, 1, ft.se);
4516 
4517 	if (!err && (!do_LWE || do_print)) {
4518 	    double tval = ft.d / ft.se;
4519 	    int df = m - 2;
4520 	    double pv = student_pvalue_2(df, tval);
4521 
4522 	    if (!do_LWE) {
4523 		record_test_result(tval, pv);
4524 	    }
4525 	    if (do_print) {
4526 		pprintf(prn, "%s (m = %d)\n"
4527 			"  %s = %g (%g)\n"
4528 			"  %s: t(%d) = %g, %s %.4f\n\n",
4529 			_("GPH test"), m,
4530 			_("Estimated degree of integration"), ft.d, ft.se,
4531 			_("test statistic"), df, tval,
4532 			_("with p-value"), pv);
4533 	    }
4534 	}
4535     }
4536 
4537     if (err) {
4538 	gretl_matrix_free(result);
4539     } else {
4540 	char **colnames = strings_array_new(2);
4541 
4542 	colnames[0] = gretl_strdup("d");
4543 	colnames[1] = gretl_strdup("se");
4544 	gretl_matrix_set_colnames(result, colnames);
4545 	set_last_result_data(result, GRETL_TYPE_MATRIX);
4546     }
4547 
4548     return err;
4549 }
4550 
pergm_compute_density(const double * x,int t1,int t2,int L,int bartlett,int * err)4551 static double *pergm_compute_density (const double *x, int t1, int t2,
4552 				      int L, int bartlett, int *err)
4553 {
4554     double *sdy, *acov, *dens;
4555     double xx, yy, vx, sx, w;
4556     int k, t, T = t2 - t1 + 1;
4557 
4558     sdy = malloc(T * sizeof *sdy);
4559     acov = malloc((L + 1) * sizeof *acov);
4560     dens = malloc((1 + T/2) * sizeof *dens);
4561 
4562     if (sdy == NULL || acov == NULL || dens == NULL) {
4563 	*err = E_ALLOC;
4564 	free(sdy);
4565 	free(acov);
4566 	free(dens);
4567 	return NULL;
4568     }
4569 
4570     xx = gretl_mean(t1, t2, x);
4571     vx = real_gretl_variance(t1, t2, x, 1);
4572     sx = sqrt(vx);
4573 
4574     for (t=t1; t<=t2; t++) {
4575 	sdy[t-t1] = (x[t] - xx) / sx;
4576     }
4577 
4578     /* autocovariances */
4579     for (k=1; k<=L; k++) {
4580 	acov[k] = 0.0;
4581 	for (t=k; t<T; t++) {
4582 	    acov[k] += sdy[t] * sdy[t-k];
4583 	}
4584 	acov[k] /= T;
4585     }
4586 
4587     vx /= M_2PI;
4588 
4589     for (t=1; t<=T/2; t++) {
4590 	yy = M_2PI * t / (double) T;
4591 	xx = 1.0;
4592 	for (k=1; k<=L; k++) {
4593 	    if (bartlett) {
4594 		w = 1.0 - (double) k/(L + 1);
4595 		xx += 2.0 * w * acov[k] * cos(yy * k);
4596 	    } else {
4597 		xx += 2.0 * acov[k] * cos(yy * k);
4598 	    }
4599 	}
4600 	dens[t] = xx * vx;
4601     }
4602 
4603     free(sdy);
4604     free(acov);
4605 
4606     return dens;
4607 }
4608 
4609 enum {
4610     PERGM_CMD,
4611     PERGM_FUNC,
4612     FRACTINT_CMD
4613 };
4614 
4615 /* The following is a little complicated because we're supporting
4616    three use cases:
4617 
4618    1) Implementing the "pergm" command, with or without the
4619    option of using a Bartlett window (OPT_O); in this case the
4620    final matrix-location argument will be NULL.
4621 
4622    2) Implementing the user-space pergm function: in this case
4623    the final matrix-location argument will be non-NULL, and the
4624    @vname argument will be NULL.
4625 
4626    3) Implementing the fractint command, computing the Local
4627    Whittle Estimator and/or the GPH test -- this is flagged by
4628    opt & OPT_F. If we're doing Whittle only, we don't need to compute
4629    the spectral density via the autocovariances approach; that is
4630    handled via FFT in the LWE functions above.
4631 */
4632 
4633 static int
pergm_or_fractint(int usage,const double * x,int t1,int t2,int width,const char * vname,gretlopt opt,PRN * prn,gretl_matrix ** pmat)4634 pergm_or_fractint (int usage, const double *x, int t1, int t2,
4635 		   int width, const char *vname, gretlopt opt,
4636 		   PRN *prn, gretl_matrix **pmat)
4637 {
4638     double *dens = NULL;
4639     int bartlett = (opt & OPT_O);
4640     int whittle_only = 0;
4641     int t, T, L = 0;
4642     int err = 0;
4643 
4644     gretl_error_clear();
4645 
4646     /* common to all uses: check for data problems */
4647 
4648     err = series_adjust_sample(x, &t1, &t2);
4649 
4650     if (!err && (T = t2 - t1 + 1) < 12) {
4651 	err = E_TOOFEW;
4652     }
4653 
4654     if (!err && gretl_isconst(t1, t2, x)) {
4655 	if (vname != NULL) {
4656 	    gretl_errmsg_sprintf(_("%s is a constant"), vname);
4657 	}
4658 	err = E_DATA;
4659     }
4660 
4661     if (err) {
4662 	return err;
4663     }
4664 
4665     if (usage == FRACTINT_CMD) {
4666 	/* doing fractint */
4667 	if (!(opt & (OPT_A | OPT_G))) {
4668 	    /* not --all, not --gph */
4669 	    whittle_only = 1;
4670 	}
4671     }
4672 
4673     if (!whittle_only) {
4674 	/* Chatfield (1996); William Greene, 4e, p. 772 */
4675 	if (bartlett) {
4676 	    if (width <= 0) {
4677 		L = auto_spectrum_order(T, opt);
4678 	    } else {
4679 		L = (width > T / 2)? T / 2 : width;
4680 	    }
4681 	} else {
4682 	    /* use full sample */
4683 	    L = T - 1;
4684 	}
4685 
4686 	dens = pergm_compute_density(x, t1, t2, L, bartlett, &err);
4687 	if (err) {
4688 	    return err;
4689 	}
4690     }
4691 
4692     if (usage == PERGM_FUNC) {
4693 	/* make matrix for pergm() function */
4694 	int T2 = T / 2;
4695 	gretl_matrix *pm;
4696 
4697 	*pmat = pm = gretl_matrix_alloc(T2, 2);
4698 
4699 	if (pm == NULL) {
4700 	    err = E_ALLOC;
4701 	} else {
4702 	    for (t=1; t<=T2; t++) {
4703 		gretl_matrix_set(pm, t-1, 0, M_2PI * t / (double) T);
4704 		gretl_matrix_set(pm, t-1, 1, dens[t]);
4705 	    }
4706 	}
4707     } else if (usage == FRACTINT_CMD) {
4708 	/* supporting "fractint" command */
4709 	err = finalize_fractint(x, dens, t1, t2, width,
4710 				vname, opt, prn);
4711     } else {
4712 	/* supporting "pergm" command */
4713 	pergm_print(vname, dens, T, L, opt, prn);
4714     }
4715 
4716     free(dens);
4717 
4718     return err;
4719 }
4720 
4721 /**
4722  * periodogram:
4723  * @varno: ID number of variable to process.
4724  * @width: width of window.
4725  * @dset: dataset struct.
4726  * @opt: if includes OPT_O, use Bartlett lag window for periodogram;
4727  * OPT_L, use log scale.
4728  * @prn: gretl printing struct.
4729  *
4730  * Computes and displays the periodogram for the series specified
4731  * by @varno.
4732  *
4733  * Returns: 0 on successful completion, error code on error.
4734  */
4735 
periodogram(int varno,int width,const DATASET * dset,gretlopt opt,PRN * prn)4736 int periodogram (int varno, int width, const DATASET *dset,
4737 		 gretlopt opt, PRN *prn)
4738 {
4739     return pergm_or_fractint(PERGM_CMD, dset->Z[varno],
4740 			     dset->t1, dset->t2,
4741 			     width, dset->varname[varno],
4742 			     opt, prn, NULL);
4743 }
4744 
4745 /**
4746  * residual_periodogram:
4747  * @x: series to process.
4748  * @width: width of window.
4749  * @dset: dataset struct.
4750  * @opt: if includes OPT_O, use Bartlett lag window for periodogram;
4751  * OPT_L, use log scale.
4752  * @prn: gretl printing struct.
4753  *
4754  * Computes and displays the periodogram for @x, which is presumed to
4755  * be a model residual series.
4756  *
4757  * Returns: 0 on successful completion, error code on error.
4758  */
4759 
residual_periodogram(const double * x,int width,const DATASET * dset,gretlopt opt,PRN * prn)4760 int residual_periodogram (const double *x, int width, const DATASET *dset,
4761 			  gretlopt opt, PRN *prn)
4762 {
4763     return pergm_or_fractint(PERGM_CMD, x,
4764 			     dset->t1, dset->t2,
4765 			     width, NULL,
4766 			     opt, prn, NULL);
4767 }
4768 
4769 /**
4770  * periodogram_matrix:
4771  * @x: the series to process.
4772  * @t1: starting observation in @x.
4773  * @t2: ending observation in @x.
4774  * @width: width of Bartlett window, or -1 for plain sample
4775  * periodogram.
4776  * @err: location to receive error code.
4777  *
4778  * Implements the userspace gretl pergm function, which can
4779  * be used on either a series from the dataset or a gretl
4780  * vector.
4781  *
4782  * Returns: allocated matrix on success, NULL on failure.
4783  */
4784 
periodogram_matrix(const double * x,int t1,int t2,int width,int * err)4785 gretl_matrix *periodogram_matrix (const double *x, int t1, int t2,
4786 				  int width, int *err)
4787 {
4788     gretlopt opt = (width < 0)? OPT_NONE : OPT_O;
4789     gretl_matrix *m = NULL;
4790 
4791     *err = pergm_or_fractint(PERGM_FUNC, x, t1, t2, width,
4792 			     NULL, opt, NULL, &m);
4793 
4794     return m;
4795 }
4796 
4797 /**
4798  * fractint:
4799  * @varno: ID number of variable to process.
4800  * @order: lag order / window size.
4801  * @dset: dataset struct.
4802  * @opt: option flags.
4803  * @prn: gretl printing struct.
4804  *
4805  * Computes and prints a test for fractional integration of the
4806  * series specified by @varno. By default the test uses the
4807  * Local Whittle Estimator but if @opt includes OPT_G then
4808  * the Geweke and Porter-Hudak test is done instead, or if
4809  * OPT_A then both tests are shown. If OPT_Q is given the
4810  * test results are not printed, just recorded (with
4811  * preference given to the LWE in case of OPT_A).
4812  *
4813  * Returns: 0 on successful completion, error code on error.
4814  */
4815 
fractint(int varno,int order,const DATASET * dset,gretlopt opt,PRN * prn)4816 int fractint (int varno, int order, const DATASET *dset,
4817 	      gretlopt opt, PRN *prn)
4818 {
4819     int err = incompatible_options(opt, (OPT_G | OPT_A));
4820 
4821     if (!err) {
4822 	err = pergm_or_fractint(FRACTINT_CMD, dset->Z[varno],
4823 				dset->t1, dset->t2,
4824 				order,
4825 				dset->varname[varno],
4826 				opt, prn, NULL);
4827     }
4828 
4829     return err;
4830 }
4831 
printf15(double x,int d,PRN * prn)4832 static void printf15 (double x, int d, PRN *prn)
4833 {
4834     pputc(prn, ' ');
4835     if (na(x)) {
4836 	pprintf(prn, "%*s", UTF_WIDTH(_("NA"), 14), _("NA"));
4837     } else if (x > 999 && x < 100000) {
4838 	int p = 1 + floor(log10(x));
4839 
4840 	d -= p;
4841 	if (d < 0) d = 0;
4842 	pprintf(prn, "%14.*f", d, x);
4843     } else {
4844 	pprintf(prn, "%#14.*g", d, x);
4845     }
4846 }
4847 
printf11(double x,int d,PRN * prn)4848 static void printf11 (double x, int d, PRN *prn)
4849 {
4850     pputc(prn, ' ');
4851     if (na(x)) {
4852 	pprintf(prn, "%*s", UTF_WIDTH(_("NA"), 10), _("NA"));
4853     } else if (x > 999 && x < 100000) {
4854 	int p = 1 + floor(log10(x));
4855 
4856 	d -= p;
4857 	if (d < 0) d = 0;
4858 	pprintf(prn, "%10.*f", d, x);
4859     } else {
4860 	pprintf(prn, "%#10.*g", d, x);
4861     }
4862 }
4863 
output_line(char * str,PRN * prn,int dblspc)4864 static void output_line (char *str, PRN *prn, int dblspc)
4865 {
4866     pputs(prn, str);
4867     if (dblspc) {
4868 	pputs(prn, "\n\n");
4869     } else {
4870 	pputc(prn, '\n');
4871     }
4872 }
4873 
prhdr(const char * str,const DATASET * dset,int missing,PRN * prn)4874 static void prhdr (const char *str, const DATASET *dset,
4875 		   int missing, PRN *prn)
4876 {
4877     char date1[OBSLEN], date2[OBSLEN];
4878     gchar *tmp;
4879 
4880     ntolabel(date1, dset->t1, dset);
4881     ntolabel(date2, dset->t2, dset);
4882 
4883     pputc(prn, '\n');
4884 
4885     tmp = g_strdup_printf(_("%s, using the observations %s - %s"),
4886 			  str, date1, date2);
4887     output_line(tmp, prn, 0);
4888     g_free(tmp);
4889 
4890     if (missing) {
4891 	output_line(_("(missing values were skipped)"), prn, 1);
4892     }
4893 }
4894 
summary_print_val(double x,int digits,int places,PRN * prn)4895 static void summary_print_val (double x, int digits, int places,
4896 			       PRN *prn)
4897 {
4898     pputc(prn, ' ');
4899 
4900     if (na(x)) {
4901 	pprintf(prn, "%*s", UTF_WIDTH(_("NA"), 14), _("NA"));
4902     } else if (digits < 0) {
4903 	pprintf(prn, "%14d", (int) x);
4904     } else if (digits > 0 || places > 0) {
4905 	int prec = (digits > 0)? digits : places;
4906 	int len = prec + 9;
4907 	char *s = (digits > 0)? "#" : "";
4908 	char t = (digits > 0)? 'g' : 'f';
4909 	char fmt[32];
4910 
4911 	sprintf(fmt, "%%%s%d.%d%c", s, len, prec, t);
4912 	pprintf(prn, fmt, x);
4913     } else {
4914 	/* the default */
4915 	pprintf(prn, "%#14.5g", x);
4916     }
4917 }
4918 
4919 #define NSUMM 12
4920 
print_summary_single(const Summary * s,int digits,int places,const DATASET * dset,PRN * prn)4921 void print_summary_single (const Summary *s,
4922 			   int digits, int places,
4923 			   const DATASET *dset,
4924 			   PRN *prn)
4925 {
4926     const char *labels[NSUMM] = {
4927 	N_("Mean"),
4928 	N_("Median"),
4929 	N_("Minimum"),
4930 	N_("Maximum"),
4931 	N_("Standard deviation"),
4932 	N_("C.V."),
4933 	N_("Skewness"),
4934 	N_("Ex. kurtosis"),
4935 	/* xgettext:no-c-format */
4936 	N_("5% percentile"),
4937 	/* xgettext:no-c-format */
4938 	N_("95% percentile"),
4939 	N_("Interquartile range"),
4940 	N_("Missing obs.")
4941     };
4942     const char *wstr = N_("Within s.d.");
4943     const char *bstr = N_("Between s.d.");
4944     double vals[NSUMM];
4945     int simple_skip[NSUMM] = {0,1,0,0,0,1,1,1,1,1,1,0};
4946     int skip0595 = 0;
4947     int offset = 2;
4948     int slen = 0, i = 0;
4949 
4950     if (s->opt & OPT_B) {
4951 	offset = 4;
4952     } else {
4953 	const char *vname = dset->varname[s->list[1]];
4954 	char obs1[OBSLEN], obs2[OBSLEN];
4955 	gchar *tmp = NULL;
4956 
4957 	ntolabel(obs1, dset->t1, dset);
4958 	ntolabel(obs2, dset->t2, dset);
4959 
4960 	prhdr(_("Summary statistics"), dset, 0, prn);
4961 
4962 	if (isdigit(*vname)) {
4963 	    const char *mname = dataset_get_matrix_name(dset);
4964 
4965 	    if (mname != NULL) {
4966 		tmp = g_strdup_printf(_("for column %d of %s (%d valid observations)"),
4967 				      atoi(vname), mname, s->n);
4968 	    } else {
4969 		tmp = g_strdup_printf(_("for column %d (%d valid observations)"),
4970 				      atoi(vname), s->n);
4971 	    }
4972 	} else {
4973 	    tmp = g_strdup_printf(_("for the variable '%s' (%d valid observations)"),
4974 				  dset->varname[s->list[1]], s->n);
4975 	}
4976 	output_line(tmp, prn, 1);
4977 	g_free(tmp);
4978     }
4979 
4980     vals[0]  = s->mean[0];
4981     vals[1]  = s->median[0];
4982     vals[2]  = s->low[0];
4983     vals[3]  = s->high[0];
4984     vals[4]  = s->sd[0];
4985     vals[5]  = s->cv[0];
4986     vals[6]  = s->skew[0];
4987     vals[7]  = s->xkurt[0];
4988     vals[8]  = s->perc05[0];
4989     vals[9]  = s->perc95[0];
4990     vals[10] = s->iqr[0];
4991     vals[11] = s->misscount[0];
4992 
4993     if (na(vals[8]) && na(vals[9])) {
4994 	skip0595 = 1;
4995     }
4996 
4997     for (i=0; i<NSUMM; i++) {
4998 	if ((s->opt & OPT_S) && simple_skip[i]) {
4999 	    continue;
5000 	} else if (skip0595 && (i == 8 || i == 9)) {
5001 	    continue;
5002 	}
5003 	if (strlen(_(labels[i])) > slen) {
5004 	    slen = g_utf8_strlen(_(labels[i]), -1);
5005 	}
5006     }
5007     slen++;
5008 
5009     for (i=0; i<NSUMM; i++) {
5010 	if ((s->opt & OPT_S) && simple_skip[i]) {
5011 	    continue;
5012 	} else if (skip0595 && (i == 8 || i == 9)) {
5013 	    continue;
5014 	}
5015 	bufspace(offset, prn);
5016 	if (i == 8 || i == 9) {
5017 	    /* the strings contain '%' */
5018 	    gchar *pcstr = g_strdup(_(labels[i]));
5019 	    int n = slen - g_utf8_strlen(pcstr, -1);
5020 
5021 	    pputs(prn, pcstr);
5022 	    if (n > 0) {
5023 		bufspace(n, prn);
5024 	    }
5025 	    g_free(pcstr);
5026 	} else {
5027 	    pprintf(prn, "%-*s", UTF_WIDTH(_(labels[i]), slen), _(labels[i]));
5028 	}
5029 	if (i == NSUMM - 1) {
5030 	    summary_print_val(vals[i], -1, places, prn);
5031 	} else {
5032 	    summary_print_val(vals[i], digits, places, prn);
5033 	}
5034 	pputc(prn, '\n');
5035     }
5036 
5037     if (!na(s->sw) && !na(s->sb)) {
5038 	pputc(prn, '\n');
5039 	bufspace(offset, prn);
5040 	pprintf(prn, "%-*s", UTF_WIDTH(_(wstr), slen), _(wstr));
5041 	summary_print_val(s->sw, digits, places, prn);
5042 	pputc(prn, '\n');
5043 	bufspace(offset, prn);
5044 	pprintf(prn, "%-*s", UTF_WIDTH(_(bstr), slen), _(bstr));
5045 	summary_print_val(s->sb, digits, places, prn);
5046     }
5047 
5048     pputc(prn, '\n');
5049 }
5050 
summary_print_varname(const char * src,int len,PRN * prn)5051 static void summary_print_varname (const char *src,
5052 				   int len, PRN *prn)
5053 {
5054     char vname[NAMETRUNC];
5055 
5056     maybe_trim_varname(vname, src);
5057     pprintf(prn, "%-*s", len, vname);
5058 }
5059 
5060 /**
5061  * print_summary:
5062  * @summ: pointer to gretl summary statistics struct.
5063  * @dset: information on the data set.
5064  * @prn: gretl printing struct.
5065  *
5066  * Prints the summary statistics for a given variable.
5067  */
5068 
print_summary(const Summary * summ,const DATASET * dset,PRN * prn)5069 void print_summary (const Summary *summ,
5070 		    const DATASET *dset,
5071 		    PRN *prn)
5072 {
5073     int dmax, d = get_gretl_digits();
5074     int len, maxlen = 0;
5075     int i, vi;
5076 
5077     if (summ->list == NULL || summ->list[0] == 0) {
5078 	return;
5079     }
5080 
5081     if (summ->weight_var > 0) {
5082 	pputc(prn, '\n');
5083 	pprintf(prn, _("Weighting variable: %s\n"),
5084 		       dset->varname[summ->weight_var]);
5085     }
5086 
5087     if (summ->list[0] == 1) {
5088 	print_summary_single(summ, 0, 0, dset, prn);
5089 	return;
5090     }
5091 
5092     /* number of significant figures to use */
5093     dmax = (summ->opt & OPT_S)? 4 : 5;
5094     d = d > dmax ? dmax : d;
5095 
5096     maxlen = max_namelen_in_list(summ->list, dset);
5097     len = maxlen <= 8 ? 10 : (maxlen + 1);
5098 
5099 #if 0
5100     if (!(summ->opt & OPT_B)) {
5101 	int missing = summary_has_missing_values(summ);
5102 
5103 	prhdr(_("Summary statistics"), dset, missing, prn);
5104     }
5105 #endif
5106 
5107     pputc(prn, '\n');
5108 
5109     if (summ->opt & OPT_S) {
5110 	/* the "simple" option */
5111 	const char *h[] = {
5112 	    N_("Mean"),
5113 	    N_("Median"),
5114 	    /* TRANSLATORS: 'S.D.' means Standard Deviation */
5115 	    N_("S.D."),
5116 	    N_("Min"),
5117 	    N_("Max"),
5118 	};
5119 
5120 	pprintf(prn, "%*s%*s%*s%*s%*s%*s\n", len, " ",
5121 		UTF_WIDTH(_(h[0]), 11), _(h[0]),
5122 		UTF_WIDTH(_(h[1]), 11), _(h[1]),
5123 		UTF_WIDTH(_(h[2]), 11), _(h[2]),
5124 		UTF_WIDTH(_(h[3]), 11), _(h[3]),
5125 		UTF_WIDTH(_(h[4]), 11), _(h[4]));
5126 
5127 	for (i=0; i<summ->list[0]; i++) {
5128 	    vi = summ->list[i+1];
5129 	    summary_print_varname(dset->varname[vi], len, prn);
5130 	    printf11(summ->mean[i], d, prn);
5131 	    printf11(summ->median[i], d, prn);
5132 	    printf11(summ->sd[i], d, prn);
5133 	    printf11(summ->low[i], d, prn);
5134 	    printf11(summ->high[i], d, prn);
5135 	    pputc(prn, '\n');
5136 	}
5137 	pputc(prn, '\n');
5138     } else {
5139 	/* print all available stats */
5140 	const char *ha[] = {
5141 	    N_("Mean"),
5142 	    N_("Median"),
5143 	    N_("Minimum"),
5144 	    N_("Maximum"),
5145 	};
5146 	const char *hb[] = {
5147 	    N_("Std. Dev."),
5148 	    N_("C.V."),
5149 	    N_("Skewness"),
5150 	    N_("Ex. kurtosis")
5151 	};
5152 	const char *hc[] = {
5153 	    /* xgettext:no-c-format */
5154 	    N_("5% perc."),
5155 	    /* xgettext:no-c-format */
5156 	    N_("95% perc."),
5157 	    N_("IQ range"),
5158 	    N_("Missing obs.")
5159 	};
5160 	/* cases where 0.05 and 0.95 quantiles are OK */
5161 	int npct = 0;
5162 
5163 	pprintf(prn, "%*s%*s%*s%*s%*s\n", len, " ",
5164 		UTF_WIDTH(_(ha[0]), 15), _(ha[0]),
5165 		UTF_WIDTH(_(ha[1]), 15), _(ha[1]),
5166 		UTF_WIDTH(_(ha[2]), 15), _(ha[2]),
5167 		UTF_WIDTH(_(ha[3]), 15), _(ha[3]));
5168 
5169 	for (i=0; i<summ->list[0]; i++) {
5170 	    vi = summ->list[i+1];
5171 	    summary_print_varname(dset->varname[vi], len, prn);
5172 	    printf15(summ->mean[i], d, prn);
5173 	    printf15(summ->median[i], d, prn);
5174 	    printf15(summ->low[i], d, prn);
5175 	    printf15(summ->high[i], d, prn);
5176 	    pputc(prn, '\n');
5177 	    /* while we're at it, register cases where we can
5178 	       show the 0.05 and 0.95 quantiles
5179 	    */
5180 	    if (!na(summ->perc05[i]) && !na(summ->perc95[i])) {
5181 		npct++;
5182 	    }
5183 	}
5184 	pputc(prn, '\n');
5185 
5186 	pprintf(prn, "%*s%*s%*s%*s%*s\n", len, " ",
5187 		UTF_WIDTH(_(hb[0]), 15), _(hb[0]),
5188 		UTF_WIDTH(_(hb[1]), 15), _(hb[1]),
5189 		UTF_WIDTH(_(hb[2]), 15), _(hb[2]),
5190 		UTF_WIDTH(_(hb[3]), 15), _(hb[3]));
5191 
5192 	for (i=0; i<summ->list[0]; i++) {
5193 	    double cv;
5194 
5195 	    vi = summ->list[i+1];
5196 	    summary_print_varname(dset->varname[vi], len, prn);
5197 
5198 	    if (floateq(summ->mean[i], 0.0)) {
5199 		cv = NADBL;
5200 	    } else if (floateq(summ->sd[i], 0.0)) {
5201 		cv = 0.0;
5202 	    } else {
5203 		cv = fabs(summ->sd[i] / summ->mean[i]);
5204 	    }
5205 
5206 	    printf15(summ->sd[i], d, prn);
5207 	    printf15(cv, d, prn);
5208 	    printf15(summ->skew[i], d, prn);
5209 	    printf15(summ->xkurt[i], d, prn);
5210 	    pputc(prn, '\n');
5211 	}
5212 	pputc(prn, '\n');
5213 
5214 	if (npct > 0) {
5215 	    /* note: use pputs for strings containing literal '%' */
5216 	    gchar *hc0 = g_strdup(_(hc[0]));
5217 	    gchar *hc1 = g_strdup(_(hc[1]));
5218 	    int n;
5219 
5220 	    pprintf(prn, "%*s", len, " ");
5221 	    n = 15 - g_utf8_strlen(hc0, -1);
5222 	    if (n > 0) bufspace(n, prn);
5223 	    pputs(prn, hc0);
5224 	    n = 15 - g_utf8_strlen(hc1, -1);
5225 	    if (n > 0) bufspace(n, prn);
5226 	    pputs(prn, hc1);
5227 	    g_free(hc0); g_free(hc1);
5228 
5229 	    pprintf(prn, "%*s%*s\n",
5230 		    UTF_WIDTH(_(ha[2]), 15), _(hc[2]),
5231 		    UTF_WIDTH(_(ha[3]), 15), _(hc[3]));
5232 	} else {
5233 	    /* not showing any 0.05, 0.95 quantiles */
5234 	    pprintf(prn, "%*s%*s%*s\n", len, " ",
5235 		    UTF_WIDTH(_(ha[2]), 15), _(hc[2]),
5236 		    UTF_WIDTH(_(ha[3]), 15), _(hc[3]));
5237 	}
5238 
5239 	for (i=0; i<summ->list[0]; i++) {
5240 	    vi = summ->list[i+1];
5241 	    summary_print_varname(dset->varname[vi], len, prn);
5242 	    if (!na(summ->perc05[i]) && !na(summ->perc95[i])) {
5243 		printf15(summ->perc05[i], d, prn);
5244 		printf15(summ->perc95[i], d, prn);
5245 	    } else if (npct > 0) {
5246 		pprintf(prn, "%*s", 15, "NA");
5247 		pprintf(prn, "%*s", 15, "NA");
5248 	    }
5249 	    printf15(summ->iqr[i], d, prn);
5250 	    pprintf(prn, "%15d", (int) summ->misscount[i]);
5251 	    pputc(prn, '\n');
5252 	}
5253 	pputc(prn, '\n');
5254     }
5255 }
5256 
5257 /**
5258  * free_summary:
5259  * @summ: pointer to gretl summary statistics struct
5260  *
5261  * Frees all resources associated with @summ, and
5262  * the pointer itself.
5263  */
5264 
free_summary(Summary * summ)5265 void free_summary (Summary *summ)
5266 {
5267     free(summ->list);
5268     free(summ->misscount);
5269     free(summ->stats);
5270 
5271     free(summ);
5272 }
5273 
5274 /***
5275  * summary_new:
5276  * @list: list of variables we want summary statistics for
5277  * @wv: weighting variable (0=no weights)
5278  * @opt: options
5279  *
5280  * Allocates a new "Summary" struct and initializes a few
5281  * things inside it
5282  */
5283 
summary_new(const int * list,int wv,gretlopt opt,int * err)5284 static Summary *summary_new (const int *list, int wv,
5285 			     gretlopt opt, int *err)
5286 {
5287     Summary *s;
5288     int nv;
5289 
5290     if (list == NULL) {
5291 	fprintf(stderr, "summary_new: list is NULL\n");
5292 	*err = E_DATA;
5293 	return NULL;
5294     }
5295 
5296     nv = list[0];
5297 
5298     s = malloc(sizeof *s);
5299     if (s == NULL) {
5300 	*err = E_ALLOC;
5301 	return NULL;
5302     }
5303 
5304     s->list = gretl_list_copy(list);
5305     if (s->list == NULL) {
5306 	*err = E_ALLOC;
5307 	free(s);
5308 	return NULL;
5309     }
5310 
5311     s->weight_var = wv;
5312     s->opt = opt;
5313     s->n = 0;
5314     s->misscount = malloc(nv * sizeof *s->misscount);
5315 
5316     s->stats = malloc(11 * nv * sizeof *s->stats);
5317     if (s->stats == NULL) {
5318 	*err = E_ALLOC;
5319 	free_summary(s);
5320 	return NULL;
5321     }
5322 
5323     s->mean   = s->stats;
5324     s->median = s->mean + nv;
5325     s->sd     = s->median + nv;
5326     s->skew   = s->sd + nv;
5327     s->xkurt  = s->skew + nv;
5328     s->low    = s->xkurt + nv;
5329     s->high   = s->low + nv;
5330     s->cv     = s->high + nv;
5331     s->perc05 = s->cv + nv;
5332     s->perc95 = s->perc05 + nv;
5333     s->iqr    = s->perc95 + nv;
5334 
5335     s->sb = s->sw = NADBL;
5336 
5337     return s;
5338 }
5339 
summary_has_missing_values(const Summary * summ)5340 int summary_has_missing_values (const Summary *summ)
5341 {
5342     if (summ->misscount != NULL) {
5343 	int i, nv = summ->list[0];
5344 
5345 	for (i=0; i<nv; i++) {
5346 	    if (summ->misscount[i] > 0) {
5347 		return 1;
5348 	    }
5349 	}
5350     }
5351 
5352     return 0;
5353 }
5354 
compare_wgtord_rows(const void * a,const void * b)5355 static int compare_wgtord_rows (const void *a, const void *b)
5356 {
5357     const double **da = (const double **) a;
5358     const double **db = (const double **) b;
5359 
5360     return (da[0][0] > db[0][0]) - (da[0][0] < db[0][0]);
5361 }
5362 
weighted_order_stats(const double * y,const double * w,double * ostats,int n,int k)5363 static int weighted_order_stats (const double *y, const double *w,
5364 				 double *ostats, int n, int k)
5365 {
5366     /* on input "ostats" contains the quantiles to compute;
5367        on output it holds the results
5368     */
5369     double p, q, wsum = 0.0;
5370     double **X;
5371     int t, i, err = 0;
5372 
5373     X = doubles_array_new(n, 2);
5374     if (X == NULL) {
5375 	return E_ALLOC;
5376     }
5377 
5378     i = 0;
5379     for (t=0; t<n; t++) {
5380 	if (na(y[t]) || na(w[t]) || w[t] == 0.0) {
5381 	    continue;
5382 	} else {
5383 	    X[i][0] = y[t];
5384 	    X[i][1] = w[t];
5385 	    wsum += w[t];
5386 	    i++;
5387 	}
5388     }
5389 
5390     qsort(X, i, sizeof *X, compare_wgtord_rows);
5391 
5392     for (i=0; i<k; i++) {
5393 	p = ostats[i] * wsum;
5394 	q = X[0][1];
5395 	t = 0;
5396 	while (q <= p) {
5397 	    q += X[t][1];
5398 	    if (q < p) t++;
5399 	}
5400 
5401 	if (t == 0 || t >= n - 1) {
5402 	    ostats[i] = NADBL;
5403 	    continue;
5404 	}
5405 
5406 	if (X[t-1][0] == X[t][0]) {
5407 	    ostats[i] = X[t][0];
5408 	} else {
5409 	    double a = (q - p) / X[t][1];
5410 
5411 	    ostats[i] = a * X[t+1][0] + (1-a) * X[t][0];
5412 	}
5413     }
5414 
5415     doubles_array_free(X, n);
5416 
5417     return err;
5418 }
5419 
5420 /**
5421  * get_summary_weighted:
5422  * @list: list of variables to process.
5423  * @dset: dataset struct.
5424  * @wgt: series to use as weights.
5425  * @opt: may include OPT_S for "simple" version.
5426  * @prn: gretl printing struct.
5427  * @err: location to receive error code.
5428  *
5429  * Calculates descriptive summary statistics for the specified
5430  * variables, weighting the observations by @rv. The series @rv must
5431  * be of full length (dset->n).
5432  *
5433  * Returns: #Summary object containing the summary statistics, or NULL
5434  * on failure.
5435  */
5436 
get_summary_weighted(const int * list,const DATASET * dset,int wtvar,gretlopt opt,PRN * prn,int * err)5437 Summary *get_summary_weighted (const int *list, const DATASET *dset,
5438 			       int wtvar, gretlopt opt,
5439 			       PRN *prn, int *err)
5440 {
5441     const double *wts;
5442     double *x, ostats[5];
5443     int t1 = dset->t1;
5444     int t2 = dset->t2;
5445     Summary *s;
5446     int i, t;
5447 
5448     s = summary_new(list, wtvar, opt, err);
5449     if (s == NULL) {
5450 	return NULL;
5451     }
5452 
5453     x = malloc(dset->n * sizeof *x);
5454     if (x == NULL) {
5455 	*err = E_ALLOC;
5456 	free_summary(s);
5457 	return NULL;
5458     }
5459 
5460     wts = dset->Z[wtvar];
5461 
5462     for (i=0; i<list[0]; i++)  {
5463 	double *pskew = NULL, *pkurt = NULL;
5464 	int vi = s->list[i+1];
5465 	int ni = 0;
5466         int ntot = 0;
5467         double wt;
5468 
5469 	/* prepare the series for weighting: substitute NAs for values
5470 	   at which the weights are invalid or zero
5471 	*/
5472 	for (t=t1; t<=t2; t++) {
5473 	    wt = wts[t];
5474 	    if (!na(wt) && wt != 0.0) {
5475                 ntot++;
5476 		x[t] = dset->Z[vi][t];
5477 		if (!na(x[t])) {
5478 		    ni++;
5479 		}
5480 	    } else {
5481 		x[t] = NADBL;
5482 	    }
5483 	}
5484 
5485 	s->misscount[i] = ntot - ni;
5486 
5487 	if (ni > s->n) {
5488 	    s->n = ni;
5489 	}
5490 
5491 	if (ni == 0) {
5492 	    pprintf(prn, _("Dropping %s: sample range contains no valid "
5493 			   "observations\n"), dset->varname[vi]);
5494 	    gretl_list_delete_at_pos(s->list, i + 1);
5495 	    if (s->list[0] == 0) {
5496 		return s;
5497 	    } else {
5498 		i--;
5499 		continue;
5500 	    }
5501 	}
5502 
5503 	if (opt & OPT_S) {
5504 	    s->skew[i] = NADBL;
5505 	    s->xkurt[i] = NADBL;
5506 	    s->cv[i] = NADBL;
5507 	    s->median[i] = NADBL;
5508 	} else {
5509 	    pskew = &s->skew[i];
5510 	    pkurt = &s->xkurt[i];
5511 	}
5512 
5513 	gretl_minmax(t1, t2, x, &s->low[i], &s->high[i]);
5514 	gretl_moments(t1, t2, x, wts, &s->mean[i], &s->sd[i], pskew, pkurt, 1);
5515 
5516 	if (!(opt & OPT_S)) {
5517 	    int err;
5518 
5519 	    if (floateq(s->mean[i], 0.0)) {
5520 		s->cv[i] = NADBL;
5521 	    } else if (floateq(s->sd[i], 0.0)) {
5522 		s->cv[i] = 0.0;
5523 	    } else {
5524 		s->cv[i] = fabs(s->sd[i] / s->mean[i]);
5525 	    }
5526 
5527 	    ostats[0] = 0.05;
5528 	    ostats[1] = 0.25;
5529 	    ostats[2] = 0.5;
5530 	    ostats[3] = 0.75;
5531 	    ostats[4] = 0.95;
5532 
5533 	    err = weighted_order_stats(x, wts, ostats, ni, 5);
5534 
5535 	    if (!err) {
5536 		s->median[i] = ostats[2];
5537 		s->perc05[i] = ostats[0];
5538 		s->perc95[i] = ostats[4];
5539 		if (!na(ostats[1]) && !na(ostats[3])) {
5540 		    s->iqr[i] = ostats[3] - ostats[1];
5541 		}
5542 	    }
5543 	}
5544 #if 0
5545 	if (dataset_is_panel(dset) && list[0] == 1) {
5546 	    panel_variance_info(x, dset, s->mean[0], &s->sw, &s->sb);
5547 	}
5548 #endif
5549     }
5550 
5551     free(x);
5552 
5553     return s;
5554 }
5555 
5556 /* Get the additional statistics wanted if the --simple
5557    flag was not given to the summary command.
5558 */
5559 
get_extra_stats(Summary * s,int i,int t1,int t2,const double * x)5560 static int get_extra_stats (Summary *s, int i,
5561 			    int t1, int t2,
5562 			    const double *x)
5563 {
5564     int err = 0;
5565 
5566     if (floateq(s->mean[i], 0.0)) {
5567 	s->cv[i] = NADBL;
5568     } else if (floateq(s->sd[i], 0.0)) {
5569 	s->cv[i] = 0.0;
5570     } else {
5571 	s->cv[i] = fabs(s->sd[i] / s->mean[i]);
5572     }
5573 
5574     s->perc05[i] = gretl_quantile(t1, t2, x, 0.05, OPT_Q, &err);
5575     s->perc95[i] = gretl_quantile(t1, t2, x, 0.95, OPT_Q, &err);
5576     s->iqr[i] = gretl_quantile(t1, t2, x, 0.75, OPT_NONE, &err);
5577     if (!na(s->iqr[i])) {
5578 	s->iqr[i] -= gretl_quantile(t1, t2, x, 0.25, OPT_NONE, &err);
5579     }
5580 
5581     return err;
5582 }
5583 
5584 /**
5585  * get_summary_restricted:
5586  * @list: list of variables to process.
5587  * @dset: dataset struct.
5588  * @rv: series to use as restriction dummy.
5589  * @opt: may include OPT_S for "simple" version.
5590  * @prn: gretl printing struct.
5591  * @err: location to receive error code.
5592  *
5593  * Calculates descriptive summary statistics for the specified variables,
5594  * with the observations restricted to those for which @rv has a non-zero
5595  * (and non-missing) value. The series @rv must be of full length (dset->n).
5596  *
5597  * Returns: #Summary object containing the summary statistics, or NULL
5598  * on failure.
5599  */
5600 
get_summary_restricted(const int * list,const DATASET * dset,const double * rv,gretlopt opt,PRN * prn,int * err)5601 Summary *get_summary_restricted (const int *list, const DATASET *dset,
5602 				 const double *rv, gretlopt opt,
5603 				 PRN *prn, int *err)
5604 {
5605     int t1 = dset->t1;
5606     int t2 = dset->t2;
5607     Summary *s;
5608     double *x;
5609     int i, t;
5610 
5611     s = summary_new(list, 0, opt, err);
5612     if (s == NULL) {
5613 	return NULL;
5614     }
5615 
5616     x = malloc(dset->n * sizeof *x);
5617     if (x == NULL) {
5618 	*err = E_ALLOC;
5619 	free_summary(s);
5620 	return NULL;
5621     }
5622 
5623     for (i=0; i<s->list[0]; i++)  {
5624 	double *pskew = NULL, *pkurt = NULL;
5625 	int vi = s->list[i+1];
5626 	int strvals;
5627 	int ni = 0;
5628         int ntot = 0;
5629 
5630 	strvals = is_string_valued(dset, vi);
5631 
5632 	if (!strvals) {
5633 	    /* create the restricted series: substitute NAs
5634 	       for values at which the restriction dummy is
5635 	       invalid or zero
5636 	    */
5637 	    for (t=t1; t<=t2; t++) {
5638 		if (!na(rv[t]) && rv[t] != 0.0) {
5639 		    ntot++;
5640 		    x[t] = dset->Z[vi][t];
5641 		    if (!na(x[t])) {
5642 			ni++;
5643 		    }
5644 		} else {
5645 		    x[t] = NADBL;
5646 		}
5647 	    }
5648 	    s->misscount[i] = ntot - ni;
5649 	    if (ni > s->n) {
5650 		s->n = ni;
5651 	    }
5652 	}
5653 
5654 	if (ni == 0) {
5655 	    if (strvals) {
5656 		pprintf(prn, _("Dropping %s: string-valued series\n"),
5657 			dset->varname[vi]);
5658 	    } else {
5659 		pprintf(prn, _("Dropping %s: sample range contains no valid "
5660 			       "observations\n"), dset->varname[vi]);
5661 	    }
5662 	    gretl_list_delete_at_pos(s->list, i + 1);
5663 	    if (s->list[0] == 0) {
5664 		return s;
5665 	    } else {
5666 		i--;
5667 		continue;
5668 	    }
5669 	}
5670 
5671 	if (opt & OPT_S) {
5672 	    s->skew[i] = NADBL;
5673 	    s->xkurt[i] = NADBL;
5674 	    s->cv[i] = NADBL;
5675 	} else {
5676 	    pskew = &s->skew[i];
5677 	    pkurt = &s->xkurt[i];
5678 	}
5679 
5680 	gretl_minmax(t1, t2, x, &s->low[i], &s->high[i]);
5681 	gretl_moments(t1, t2, x, NULL, &s->mean[i], &s->sd[i], pskew, pkurt, 1);
5682 	s->median[i] = gretl_median(t1, t2, x);
5683 
5684 	if (!(opt & OPT_S)) {
5685 	    *err = get_extra_stats(s, i, t1, t2, x);
5686 	}
5687 
5688 	if (dataset_is_panel(dset) && list[0] == 1) {
5689 	    panel_variance_info(x, dset, s->mean[0], &s->sw, &s->sb);
5690 	}
5691     }
5692 
5693     free(x);
5694 
5695     return s;
5696 }
5697 
5698 /**
5699  * get_summary:
5700  * @list: list of variables to process.
5701  * @dset: dataset struct.
5702  * @opt: may include OPT_S for "simple" version.
5703  * @prn: gretl printing struct.
5704  * @err: location to receive error code.
5705  *
5706  * Calculates descriptive summary statistics for the specified variables.
5707  *
5708  * Returns: #Summary object containing the summary statistics, or NULL
5709  * on failure.
5710  */
5711 
get_summary(const int * list,const DATASET * dset,gretlopt opt,PRN * prn,int * err)5712 Summary *get_summary (const int *list, const DATASET *dset,
5713 		      gretlopt opt, PRN *prn, int *err)
5714 {
5715     int t1 = dset->t1;
5716     int t2 = dset->t2;
5717     Summary *s;
5718     int i, nmax;
5719 
5720     s = summary_new(list, 0, opt, err);
5721     if (s == NULL) {
5722 	return NULL;
5723     }
5724 
5725     nmax = sample_size(dset);
5726 
5727     for (i=0; i<s->list[0]; i++)  {
5728 	double *pskew = NULL, *pkurt = NULL;
5729 	const double *x;
5730 	double x0;
5731 	int vi = s->list[i+1];
5732 	int strvals;
5733 	int ni = 0;
5734 
5735 	strvals = is_string_valued(dset, vi);
5736 	if (!strvals) {
5737 	    x = dset->Z[vi];
5738 	    ni = good_obs(x + t1, nmax, &x0);
5739 	    s->misscount[i] = nmax - ni;
5740 	    if (ni > s->n) {
5741 		s->n = ni;
5742 	    }
5743 	}
5744 	if (ni == 0) {
5745 	    if (strvals) {
5746 		pprintf(prn, _("Dropping %s: string-valued series\n"),
5747 			dset->varname[vi]);
5748 	    } else {
5749 		pprintf(prn, _("Dropping %s: sample range contains no valid "
5750 			       "observations\n"), dset->varname[vi]);
5751 	    }
5752 	    gretl_list_delete_at_pos(s->list, i + 1);
5753 	    if (s->list[0] == 0) {
5754 		return s;
5755 	    } else {
5756 		i--;
5757 		continue;
5758 	    }
5759 	}
5760 
5761 	if (opt & OPT_S) {
5762 	    s->skew[i] = NADBL;
5763 	    s->xkurt[i] = NADBL;
5764 	    s->cv[i] = NADBL;
5765 	    s->median[i] = NADBL;
5766 	} else {
5767 	    pskew = &s->skew[i];
5768 	    pkurt = &s->xkurt[i];
5769 	}
5770 
5771 	gretl_minmax(t1, t2, x, &s->low[i], &s->high[i]);
5772 
5773 	if (s->weight_var == 0) {
5774 	    gretl_moments(t1, t2, x, NULL, &s->mean[i], &s->sd[i],
5775 			  pskew, pkurt, 1);
5776 	} else {
5777 	    gretl_moments(t1, t2, x, dset->Z[s->weight_var], &s->mean[i], &s->sd[i],
5778 			  pskew, pkurt, 0);
5779 	}
5780 
5781 	/* included in both simple and full variants */
5782 	s->median[i] = gretl_median(t1, t2, x);
5783 
5784 	if (!(opt & OPT_S)) {
5785 	    *err = get_extra_stats(s, i, t1, t2, x);
5786 	}
5787 
5788 	if (dataset_is_panel(dset) && list[0] == 1) {
5789 	    panel_variance_info(x, dset, s->mean[0], &s->sw, &s->sb);
5790 	}
5791     }
5792 
5793     return s;
5794 }
5795 
record_summary(Summary * summ,const DATASET * dset)5796 static void record_summary (Summary *summ, const DATASET *dset)
5797 {
5798     gretl_matrix *m = NULL;
5799     char **Sc, **Sr;
5800     int nv = summ->list[0];
5801     int i, vi, cols;
5802 
5803     cols = (summ->opt & OPT_S)? 5 : 12;
5804     m = gretl_matrix_alloc(nv, cols);
5805     if (m == NULL) {
5806 	return;
5807     }
5808 
5809     Sc = strings_array_new(cols);
5810     Sr = strings_array_new(nv);
5811 
5812     if (summ->opt & OPT_S) {
5813 	/* the "simple" option */
5814 	const char *h[] = {
5815 	    N_("Mean"),
5816 	    N_("Median"),
5817 	    N_("S.D."),
5818 	    N_("Min"),
5819 	    N_("Max"),
5820 	};
5821 
5822 	if (Sc != NULL) {
5823 	    for (i=0; i<cols; i++) {
5824 		Sc[i] = gretl_strdup(_(h[i]));
5825 	    }
5826 	}
5827 	for (i=0; i<nv; i++) {
5828 	    if (Sr != NULL) {
5829 		vi = summ->list[i+1];
5830 		Sr[i] = gretl_strdup(dset->varname[vi]);
5831 	    }
5832 	    gretl_matrix_set(m, i, 0, summ->mean[i]);
5833 	    gretl_matrix_set(m, i, 1, summ->median[i]);
5834 	    gretl_matrix_set(m, i, 2, summ->sd[i]);
5835 	    gretl_matrix_set(m, i, 3, summ->low[i]);
5836 	    gretl_matrix_set(m, i, 4, summ->high[i]);
5837 	}
5838     } else {
5839 	/* record all available stats */
5840 	const char *h[] = {
5841 	    N_("Mean"),
5842 	    N_("Median"),
5843 	    N_("Minimum"),
5844 	    N_("Maximum"),
5845 	    N_("Std. Dev."),
5846 	    N_("C.V."),
5847 	    N_("Skewness"),
5848 	    N_("Ex. kurtosis"),
5849 	    /* xgettext:no-c-format */
5850 	    N_("5% perc."),
5851 	    /* xgettext:no-c-format */
5852 	    N_("95% perc."),
5853 	    N_("IQ range"),
5854 	    N_("Missing obs.")
5855 	};
5856 	double cv;
5857 
5858 	if (Sc != NULL) {
5859 	    for (i=0; i<cols; i++) {
5860 		Sc[i] = gretl_strdup(_(h[i]));
5861 	    }
5862 	}
5863 	for (i=0; i<nv; i++) {
5864 	    if (Sr != NULL) {
5865 		vi = summ->list[i+1];
5866 		Sr[i] = gretl_strdup(dset->varname[vi]);
5867 	    }
5868 	    gretl_matrix_set(m, i, 0, summ->mean[i]);
5869 	    gretl_matrix_set(m, i, 1, summ->median[i]);
5870 	    gretl_matrix_set(m, i, 2, summ->low[i]);
5871 	    gretl_matrix_set(m, i, 3, summ->high[i]);
5872 	    gretl_matrix_set(m, i, 4, summ->sd[i]);
5873 	    if (floateq(summ->mean[i], 0.0)) {
5874 		cv = NADBL;
5875 	    } else if (floateq(summ->sd[i], 0.0)) {
5876 		cv = 0.0;
5877 	    } else {
5878 		cv = fabs(summ->sd[i] / summ->mean[i]);
5879 	    }
5880 	    gretl_matrix_set(m, i,  5, cv);
5881 	    gretl_matrix_set(m, i,  6, summ->skew[i]);
5882 	    gretl_matrix_set(m, i,  7, summ->xkurt[i]);
5883 	    gretl_matrix_set(m, i,  8, summ->perc05[i]);
5884 	    gretl_matrix_set(m, i,  9, summ->perc95[i]);
5885 	    gretl_matrix_set(m, i, 10, summ->iqr[i]);
5886 	    gretl_matrix_set(m, i, 11, summ->misscount[i]);
5887 	}
5888     }
5889 
5890     gretl_matrix_set_colnames(m, Sc);
5891     gretl_matrix_set_rownames(m, Sr);
5892     set_last_result_data(m, GRETL_TYPE_MATRIX);
5893 }
5894 
5895 /**
5896  * list_summary:
5897  * @list: list of series to process.
5898  * @dset: dataset struct.
5899  * @opt: may include %OPT_S for "simple" version.
5900  * @prn: gretl printing struct.
5901  *
5902  * Prints descriptive statistics for the listed variables.
5903  *
5904  * Returns: 0 on success, non-zero code on error.
5905  */
5906 
list_summary(const int * list,int wgtvar,const DATASET * dset,gretlopt opt,PRN * prn)5907 int list_summary (const int *list, int wgtvar,
5908 		  const DATASET *dset,
5909 		  gretlopt opt, PRN *prn)
5910 {
5911     Summary *summ = NULL;
5912     int err = 0;
5913 
5914     if (list != NULL) {
5915 	if (list[0] == 0) {
5916 	    return 0;
5917 	}
5918 	if (wgtvar == 0) {
5919 	    /* no weights */
5920 	    summ = get_summary(list, dset, opt, prn, &err);
5921 	} else {
5922 	    summ = get_summary_weighted(list, dset, wgtvar,
5923 					opt, prn, &err);
5924 	}
5925     } else {
5926 	int *tmplist = full_var_list(dset, NULL);
5927 
5928 	if (tmplist != NULL) {
5929 	    if (wgtvar == 0) {
5930 		/* no weights */
5931 		summ = get_summary(tmplist, dset, opt, prn, &err);
5932 	    } else {
5933 		summ = get_summary_weighted(tmplist, dset, wgtvar,
5934 					    opt, prn, &err);
5935 	    }
5936 	    free(tmplist);
5937 	}
5938     }
5939 
5940     if (summ != NULL) {
5941 	if (!(opt & OPT_Q)) {
5942 	    print_summary(summ, dset, prn);
5943 	}
5944 	record_summary(summ, dset);
5945 	free_summary(summ);
5946     }
5947 
5948     return err;
5949 }
5950 
5951 /**
5952  * vmatrix_new:
5953  *
5954  * Returns: an allocated and initialized #VMatrix, or
5955  * %NULL on failure.
5956  */
5957 
vmatrix_new(void)5958 VMatrix *vmatrix_new (void)
5959 {
5960     VMatrix *v = malloc(sizeof *v);
5961 
5962     if (v != NULL) {
5963 	v->vec = NULL;
5964 	v->xbar = NULL;
5965 	v->ssx = NULL;
5966 	v->list = NULL;
5967 	v->names = NULL;
5968 	v->ci = 0;
5969 	v->dim = 0;
5970 	v->t1 = 0;
5971 	v->t2 = 0;
5972 	v->n = 0;
5973 	v->missing = 0;
5974     }
5975 
5976     return v;
5977 }
5978 
5979 /**
5980  * free_vmatrix:
5981  * @vmat: pointer to gretl correlation matrix struct
5982  *
5983  * Frees all resources associated with @vmat, and
5984  * the pointer itself.
5985  */
5986 
free_vmatrix(VMatrix * vmat)5987 void free_vmatrix (VMatrix *vmat)
5988 {
5989     if (vmat != NULL) {
5990 	if (vmat->names != NULL) {
5991 	    strings_array_free(vmat->names, vmat->dim);
5992 	}
5993 	if (vmat->vec != NULL) {
5994 	    free(vmat->vec);
5995 	}
5996 	if (vmat->xbar != NULL) {
5997 	    free(vmat->xbar);
5998 	}
5999 	if (vmat->ssx != NULL) {
6000 	    free(vmat->ssx);
6001 	}
6002 	if (vmat->list != NULL) {
6003 	    free(vmat->list);
6004 	}
6005 	free(vmat);
6006     }
6007 }
6008 
6009 enum {
6010     CORRMAT = 0,
6011     COVMAT
6012 };
6013 
6014 /* Compute correlation or covariance matrix, using the maximum
6015    available sample for each coefficient.
6016 */
6017 
max_corrcov_matrix(VMatrix * v,const DATASET * dset,int flag)6018 static int max_corrcov_matrix (VMatrix *v, const DATASET *dset,
6019 			       int flag)
6020 {
6021     double **Z = dset->Z;
6022     int i, j, vi, vj, nij;
6023     int nmin, nmax = 0;
6024     int m = v->dim;
6025     int nmiss;
6026 
6027     nmin = v->n = v->t2 - v->t1 + 1;
6028 
6029     for (i=0; i<m; i++) {
6030 	vi = v->list[i+1];
6031 	for (j=i; j<m; j++)  {
6032 	    vj = v->list[j+1];
6033 	    nij = ijton(i, j, m);
6034 	    if (i == j && flag == CORRMAT) {
6035 		v->vec[nij] = 1.0;
6036 	    } else {
6037 		nmiss = 0;
6038 		if (flag == COVMAT) {
6039 		    v->vec[nij] = gretl_covar(v->t1, v->t2, Z[vi], Z[vj],
6040 					      &nmiss);
6041 		} else {
6042 		    v->vec[nij] = gretl_corr(v->t1, v->t2, Z[vi], Z[vj],
6043 					     &nmiss);
6044 		}
6045 		if (nmiss > 0) {
6046 		    int n = v->n - nmiss;
6047 
6048 		    if (n < nmin && n > 0) {
6049 			nmin = n;
6050 		    }
6051 		    if (n > nmax) {
6052 			nmax = n;
6053 		    }
6054 		    v->missing = 1;
6055 		} else {
6056 		    nmax = v->n;
6057 		}
6058 	    }
6059 	}
6060     }
6061 
6062     /* We'll record an "n" value if there's something resembling
6063        a common number of observations across the coefficients.
6064     */
6065 
6066     if (v->missing) {
6067 	v->n = 0;
6068 	if (nmax > 0) {
6069 	    double d = (nmax - nmin) / (double) nmax;
6070 
6071 	    if (d < 0.10) {
6072 		v->n = nmin;
6073 	    }
6074 	}
6075     }
6076 
6077     return 0;
6078 }
6079 
6080 /* Compute correlation or covariance matrix, ensuring we use the same
6081    sample for all coefficients. We may be doing this in the context of
6082    the "corr" command or in the context of "pca". In the latter case
6083    we want to save the "xbar" and "ssx" values for later use when
6084    calculating the component series.
6085 */
6086 
uniform_corrcov_matrix(VMatrix * v,const DATASET * dset,int ci,int flag)6087 static int uniform_corrcov_matrix (VMatrix *v, const DATASET *dset,
6088 				   int ci, int flag)
6089 {
6090     double **Z = dset->Z;
6091     double *xbar = NULL, *ssx = NULL;
6092     double *x;
6093     int m = v->dim;
6094     int mm = (m * (m + 1)) / 2;
6095     double d1, d2;
6096     int i, j, jmin, nij, t;
6097     int miss, n = 0;
6098 
6099     xbar = malloc(m * sizeof *xbar);
6100     if (xbar == NULL) {
6101 	return E_ALLOC;
6102     }
6103 
6104     if (ci == PCA || flag == CORRMAT) {
6105 	ssx = malloc(m * sizeof *ssx);
6106 	if (ssx == NULL) {
6107 	    free(xbar);
6108 	    return E_ALLOC;
6109 	}
6110     }
6111 
6112     for (i=0; i<m; i++) {
6113 	xbar[i] = 0.0;
6114     }
6115 
6116     if (ssx != NULL) {
6117 	for (i=0; i<m; i++) {
6118 	    ssx[i] = 0.0;
6119 	}
6120     }
6121 
6122     v->missing = 0;
6123 
6124     /* first pass: get sample size and sums */
6125 
6126     for (t=v->t1; t<=v->t2; t++) {
6127 	miss = 0;
6128 	for (i=0; i<m; i++) {
6129 	    if (na(Z[v->list[i+1]][t])) {
6130 		miss = 1;
6131 		v->missing += 1;
6132 		break;
6133 	    }
6134 	}
6135 	if (!miss) {
6136 	    n++;
6137 	    for (i=0; i<m; i++) {
6138 		xbar[i] += Z[v->list[i+1]][t];
6139 	    }
6140 	}
6141     }
6142 
6143     if (n < 2) {
6144 	free(xbar);
6145 	free(ssx);
6146 	return E_MISSDATA;
6147     }
6148 
6149     for (i=0; i<m; i++) {
6150 	xbar[i] /= n;
6151     }
6152 
6153     for (i=0; i<mm; i++) {
6154 	v->vec[i] = 0.0;
6155     }
6156 
6157     /* second pass: get deviations from means and cumulate */
6158 
6159     for (t=v->t1; t<=v->t2; t++) {
6160 	miss = 0;
6161 	for (i=0; i<m; i++) {
6162 	    x = Z[v->list[i+1]];
6163 	    if (na(x[t])) {
6164 		miss = 1;
6165 		break;
6166 	    }
6167 	}
6168 	if (!miss) {
6169 	    for (i=0; i<m; i++) {
6170 		x = Z[v->list[i+1]];
6171 		d1 = x[t] - xbar[i];
6172 		if (ssx != NULL) {
6173 		    ssx[i] += d1 * d1;
6174 		}
6175 		jmin = (flag == COVMAT)? i : i + 1;
6176 		for (j=jmin; j<m; j++) {
6177 		    x = Z[v->list[j+1]];
6178 		    nij = ijton(i, j, m);
6179 		    d2 = x[t] - xbar[j];
6180 		    v->vec[nij] += d1 * d2;
6181 		}
6182 	    }
6183 	}
6184     }
6185 
6186     /* finalize: compute correlations or covariances */
6187 
6188     if (flag == CORRMAT) {
6189 	/* correlations */
6190 	for (i=0; i<m; i++) {
6191 	    for (j=i; j<m; j++)  {
6192 		nij = ijton(i, j, m);
6193 		if (i == j) {
6194 		    v->vec[nij] = 1.0;
6195 		} else if (ssx[i] == 0.0 || ssx[j] == 0.0) {
6196 		    v->vec[nij] = NADBL;
6197 		} else {
6198 		    v->vec[nij] /= sqrt(ssx[i] * ssx[j]);
6199 		}
6200 	    }
6201 	}
6202     } else {
6203 	/* covariances */
6204 	for (i=0; i<mm; i++) {
6205 	    v->vec[i] /= (n - 1);
6206 	}
6207     }
6208 
6209     v->n = n;
6210 
6211     if (ci == PCA) {
6212 	v->xbar = xbar;
6213 	v->ssx = ssx;
6214     } else {
6215 	free(xbar);
6216 	free(ssx);
6217     }
6218 
6219     return 0;
6220 }
6221 
6222 /**
6223  * corrlist:
6224  * @ci: command index.
6225  * @list: list of variables to process, by ID number.
6226  * @dset: dataset struct.
6227  * @opt: option flags.
6228  * @err: location to receive error code.
6229  *
6230  * Computes pairwise correlation coefficients for the variables
6231  * specified in @list, skipping any constants.  If the option
6232  * flags contain OPT_N, a uniform sample is ensured: only those
6233  * observations for which all the listed variables have valid
6234  * values are used.  If OPT_C is included, we actually calculate
6235  * covariances rather than correlations.
6236  *
6237  * Returns: gretl correlation matrix struct, or NULL on failure.
6238  */
6239 
corrlist(int ci,int * list,const DATASET * dset,gretlopt opt,int * err)6240 VMatrix *corrlist (int ci, int *list, const DATASET *dset,
6241 		   gretlopt opt, int *err)
6242 {
6243     int flag = (opt & OPT_C)? COVMAT : CORRMAT;
6244     VMatrix *v;
6245     int i, m, mm;
6246     int nmiss = 0;
6247 
6248     if (list == NULL) {
6249 	fprintf(stderr, "corrlist: list is NULL\n");
6250 	*err = E_DATA;
6251 	return NULL;
6252     }
6253 
6254     v = vmatrix_new();
6255     if (v == NULL) {
6256 	*err = E_ALLOC;
6257 	return NULL;
6258     }
6259 
6260     /* drop any constants from list */
6261     for (i=list[0]; i>0; i--) {
6262 	if (gretl_isconst(dset->t1, dset->t2, dset->Z[list[i]])) {
6263 	    gretl_list_delete_at_pos(list, i);
6264 	}
6265     }
6266 
6267     if (list[0] < 2) {
6268 	gretl_errmsg_set(_("corr: needs at least two non-constant arguments"));
6269 	*err = E_DATA;
6270 	goto bailout;
6271     }
6272 
6273     v->t1 = dset->t1;
6274     v->t2 = dset->t2;
6275 
6276     /* adjust sample range if need be */
6277     list_adjust_sample(list, &v->t1, &v->t2, dset, &nmiss);
6278 
6279     if (v->t2 - v->t1 + 1 < 3) {
6280 	*err = E_TOOFEW;
6281 	goto bailout;
6282     }
6283 
6284     v->dim = m = list[0];
6285     mm = (m * (m + 1)) / 2;
6286 
6287     v->names = strings_array_new(m);
6288     v->vec = malloc(mm * sizeof *v->vec);
6289     v->list = gretl_list_copy(list);
6290 
6291     if (v->names == NULL || v->vec == NULL || v->list == NULL) {
6292 	*err = E_ALLOC;
6293 	goto bailout;
6294     }
6295 
6296     if (ci == PCA) {
6297 	opt |= OPT_N;
6298     }
6299 
6300     if (opt & OPT_N) {
6301 	/* impose uniform sample size */
6302 	*err = uniform_corrcov_matrix(v, dset, ci, flag);
6303     } else {
6304 	/* sample sizes may differ */
6305 	*err = max_corrcov_matrix(v, dset, flag);
6306     }
6307 
6308     if (!*err) {
6309 	for (i=0; i<m; i++) {
6310 	    v->names[i] = gretl_strdup(dset->varname[list[i+1]]);
6311 	    if (v->names[i] == NULL) {
6312 		*err = E_ALLOC;
6313 		goto bailout;
6314 	    }
6315 	}
6316     }
6317 
6318     v->ci = (flag == CORRMAT)? CORR : 0; /* FIXME? */
6319 
6320  bailout:
6321 
6322     if (*err) {
6323 	free_vmatrix(v);
6324 	v = NULL;
6325     }
6326 
6327     return v;
6328 }
6329 
printcorr(const VMatrix * v,PRN * prn)6330 static void printcorr (const VMatrix *v, PRN *prn)
6331 {
6332     double r = v->vec[1];
6333 
6334     pprintf(prn, "\ncorr(%s, %s)", v->names[0], v->names[1]);
6335 
6336     if (na(r)) {
6337 	pprintf(prn, ": %s\n\n", _("undefined"));
6338     } else {
6339 	pprintf(prn, " = %.8f\n", r);
6340 	if (fabs(r) < 1.0) {
6341 	    int n2 = v->n - 2;
6342 	    double tval = r * sqrt(n2 / (1 - r*r));
6343 
6344 	    pputs(prn, _("Under the null hypothesis of no correlation:\n "));
6345 	    pprintf(prn, _("t(%d) = %g, with two-tailed p-value %.4f\n"), n2,
6346 		    tval, student_pvalue_2(n2, tval));
6347 	    pputc(prn, '\n');
6348 	} else {
6349 	    pprintf(prn, _("5%% critical value (two-tailed) = "
6350 		       "%.4f for n = %d"), rhocrit95(v->n), v->n);
6351 	    pputs(prn, "\n\n");
6352 	}
6353     }
6354 }
6355 
6356 /**
6357  * print_corrmat:
6358  * @corr: gretl correlation matrix.
6359  * @dset: dataset information.
6360  * @prn: gretl printing struct.
6361  *
6362  * Prints a gretl correlation matrix to @prn.
6363  */
6364 
print_corrmat(VMatrix * corr,const DATASET * dset,PRN * prn)6365 void print_corrmat (VMatrix *corr, const DATASET *dset, PRN *prn)
6366 {
6367     if (corr->dim == 2) {
6368 	printcorr(corr, prn);
6369     } else {
6370 	char date1[OBSLEN], date2[OBSLEN];
6371 	gchar *tmp = NULL;
6372 
6373 	ntolabel(date1, corr->t1, dset);
6374 	ntolabel(date2, corr->t2, dset);
6375 
6376 	pputc(prn, '\n');
6377 
6378 	tmp = g_strdup_printf(_("%s, using the observations %s - %s"),
6379 			      _("Correlation Coefficients"), date1, date2);
6380 	output_line(tmp, prn, 0);
6381 	g_free(tmp);
6382 
6383 	if (corr->missing) {
6384 	    output_line(_("(missing values were skipped)"), prn, 1);
6385 	}
6386 
6387 	if (corr->n > 0) {
6388 	    tmp = g_strdup_printf(_("5%% critical value (two-tailed) = "
6389 				    "%.4f for n = %d"), rhocrit95(corr->n),
6390 				  corr->n);
6391 	    output_line(tmp, prn, 1);
6392 	    g_free(tmp);
6393 	}
6394 
6395 	text_print_vmatrix(corr, prn);
6396     }
6397 }
6398 
record_corr_matrix(VMatrix * c)6399 static void record_corr_matrix (VMatrix *c)
6400 {
6401     gretl_matrix *m = NULL;
6402     int i, j, k, n = c->dim;
6403     double cij;
6404 
6405     m = gretl_matrix_alloc(n, n);
6406 
6407     if (m != NULL) {
6408 	k = 0;
6409 	for (i=0; i<n; i++) {
6410 	    for (j=i; j<n; j++) {
6411 		cij = c->vec[k];
6412 		gretl_matrix_set(m, i, j, cij);
6413 		gretl_matrix_set(m, j, i, cij);
6414 		k++;
6415 	    }
6416 	}
6417 
6418 	if (c->names) {
6419 	    char **rlab = strings_array_new(n);
6420 	    char **clab = strings_array_new(n);
6421 
6422 	    if (rlab != NULL && clab != NULL) {
6423 		for (i=0; i<n; i++) {
6424 		    rlab[i] = gretl_strdup(c->names[i]);
6425 		    clab[i] = gretl_strdup(c->names[i]);
6426 		}
6427 		gretl_matrix_set_rownames(m, rlab);
6428 		gretl_matrix_set_colnames(m, clab);
6429 	    }
6430 	}
6431 
6432 	set_last_result_data(m, GRETL_TYPE_MATRIX);
6433     }
6434 }
6435 
6436 /**
6437  * gretl_corrmx:
6438  * @list: gives the ID numbers of the variables to process.
6439  * @dset: dataset struct.
6440  * @opt: option flags: OPT_N means use uniform sample size,
6441  * OPT_U controls plotting options.
6442  * @prn: gretl printing struct.
6443  *
6444  * Computes and prints the correlation matrix for the specified list
6445  * of variables.
6446  *
6447  * Returns: 0 on successful completion, 1 on error.
6448  */
6449 
gretl_corrmx(int * list,const DATASET * dset,gretlopt opt,PRN * prn)6450 int gretl_corrmx (int *list, const DATASET *dset,
6451 		  gretlopt opt, PRN *prn)
6452 {
6453     VMatrix *corr = NULL;
6454     int err = 0;
6455 
6456     if (list != NULL) {
6457 	if (list[0] == 0) {
6458 	    return 0;
6459 	} else {
6460 	    corr = corrlist(CORR, list, dset, opt, &err);
6461 	}
6462     } else {
6463 	int *tmplist = full_var_list(dset, NULL);
6464 
6465 	if (tmplist != NULL) {
6466 	    corr = corrlist(CORR, tmplist, dset, opt, &err);
6467 	    free(tmplist);
6468 	}
6469     }
6470 
6471     if (corr != NULL) {
6472 	if (!(opt & OPT_Q)) {
6473 	    print_corrmat(corr, dset, prn);
6474 	}
6475 	if (corr->dim > 2 && gnuplot_graph_wanted(PLOT_HEATMAP, opt)) {
6476 	    err = plot_corrmat(corr, opt);
6477 	}
6478 
6479 	record_corr_matrix(corr);
6480 	free_vmatrix(corr);
6481     }
6482 
6483     return err;
6484 }
6485 
6486 /*
6487   Satterthwaite, F.E. (1946). "An Approximate Distribution of Estimates
6488   of Variance Components". Biometrics Bulletin, 2, 6, pp. 110–114.
6489 
6490   v = (s^2_1/n_1 + s^2_2/n_2)^2 /
6491         [(s^_1/n1)^2 / (n_1 - 1) + (s^2_2/n_2)^2 / (n_2 - 1)]
6492 */
6493 
satterthwaite_df(double v1,int n1,double v2,int n2)6494 int satterthwaite_df (double v1, int n1,
6495 		      double v2, int n2)
6496 {
6497     double rtnum = v1 / n1 + v2 / n2;
6498     double d1 = (v1/n1) * (v1/n1) / (n1 - 1);
6499     double d2 = (v2/n2) * (v2/n2) / (n2 - 1);
6500     double v = rtnum * rtnum / (d1 + d2);
6501 
6502     return (int) floor(v);
6503 }
6504 
6505 /**
6506  * means_test:
6507  * @list: gives the ID numbers of the variables to compare.
6508  * @dset: dataset struct.
6509  * @opt: if OPT_O, assume population variances are different.
6510  * @prn: gretl printing struct.
6511  *
6512  * Carries out test of the null hypothesis that the means of two
6513  * variables are equal.
6514  *
6515  * Returns: 0 on successful completion, error code on error.
6516  */
6517 
means_test(const int * list,const DATASET * dset,gretlopt opt,PRN * prn)6518 int means_test (const int *list, const DATASET *dset,
6519 		gretlopt opt, PRN *prn)
6520 {
6521     double m1, m2, s1, s2, skew, kurt, se, mdiff, t, pval;
6522     double v1, v2;
6523     double *x = NULL, *y = NULL;
6524     int vardiff = (opt & OPT_O);
6525     int df, n1, n2, n = dset->n;
6526 
6527     if (list[0] < 2) {
6528 	return E_ARGS;
6529     }
6530 
6531     x = malloc(n * sizeof *x);
6532     if (x == NULL) {
6533 	return E_ALLOC;
6534     }
6535 
6536     y = malloc(n * sizeof *y);
6537     if (y == NULL) {
6538 	free(x);
6539 	return E_ALLOC;
6540     }
6541 
6542     n1 = transcribe_array(x, dset->Z[list[1]], dset);
6543     n2 = transcribe_array(y, dset->Z[list[2]], dset);
6544 
6545     if (n1 == 0 || n2 == 0) {
6546 	pputs(prn, _("Sample range has no valid observations."));
6547 	free(x); free(y);
6548 	return 1;
6549     }
6550 
6551     if (n1 == 1 || n2 == 1) {
6552 	pputs(prn, _("Sample range has only one observation."));
6553 	free(x); free(y);
6554 	return 1;
6555     }
6556 
6557     df = n1 + n2 - 2;
6558 
6559     gretl_moments(0, n1-1, x, NULL, &m1, &s1, &skew, &kurt, 1);
6560     gretl_moments(0, n2-1, y, NULL, &m2, &s2, &skew, &kurt, 1);
6561     mdiff = m1 - m2;
6562 
6563     v1 = s1 * s1;
6564     v2 = s2 * s2;
6565 
6566     if (vardiff) {
6567 	/* do not assume the variances are equal */
6568 	se = sqrt((v1 / n1) + (v2 / n2));
6569 	df = satterthwaite_df(v1, n1, v2, n2);
6570     } else {
6571 	/* form pooled estimate of variance */
6572 	double s2p;
6573 
6574 	s2p = ((n1-1) * v1 + (n2-1) * v2) / df;
6575 	se = sqrt(s2p / n1 + s2p / n2);
6576 	df = n1 + n2 - 2;
6577     }
6578 
6579     t = mdiff / se;
6580     pval = student_pvalue_2(df, t);
6581 
6582     pprintf(prn, _("\nEquality of means test "
6583 	    "(assuming %s variances)\n\n"), (vardiff)? _("unequal") : _("equal"));
6584     pprintf(prn, "   %s: ", dset->varname[list[1]]);
6585     pprintf(prn, _("Number of observations = %d\n"), n1);
6586     pprintf(prn, "   %s: ", dset->varname[list[2]]);
6587     pprintf(prn, _("Number of observations = %d\n"), n2);
6588     pprintf(prn, _("   Difference between sample means = %g - %g = %g\n"),
6589 	    m1, m2, mdiff);
6590     pputs(prn, _("   Null hypothesis: The two population means are the same.\n"));
6591     pprintf(prn, _("   Estimated standard error = %g\n"), se);
6592     pprintf(prn, _("   Test statistic: t(%d) = %g\n"), df, t);
6593     pprintf(prn, _("   p-value (two-tailed) = %g\n\n"), pval);
6594     if (pval > .10)
6595 	pputs(prn, _("   The difference is not statistically significant.\n\n"));
6596 
6597     record_test_result(t, pval);
6598 
6599     free(x);
6600     free(y);
6601 
6602     return 0;
6603 }
6604 
6605 /**
6606  * vars_test:
6607  * @list: gives the ID numbers of the variables to compare.
6608  * @dset: dataset struct.
6609  * @prn: gretl printing struct.
6610  *
6611  * Carries out test of the null hypothesis that the variances of two
6612  * variables are equal.
6613  *
6614  * Returns: 0 on successful completion, error code on error.
6615  */
6616 
vars_test(const int * list,const DATASET * dset,PRN * prn)6617 int vars_test (const int *list, const DATASET *dset, PRN *prn)
6618 {
6619     double m, s1, s2, var1, var2;
6620     double F, pval;
6621     double *x = NULL, *y = NULL;
6622     int dfn, dfd, n1, n2, n = dset->n;
6623 
6624     if (list[0] < 2) return E_ARGS;
6625 
6626     x = malloc(n * sizeof *x);
6627     y = malloc(n * sizeof *y);
6628 
6629     if (x == NULL || y == NULL) {
6630 	free(x);
6631 	free(y);
6632 	return E_ALLOC;
6633     }
6634 
6635     n1 = transcribe_array(x, dset->Z[list[1]], dset);
6636     n2 = transcribe_array(y, dset->Z[list[2]], dset);
6637 
6638     if (n1 == 0 || n2 == 0) {
6639 	pputs(prn, _("Sample range has no valid observations."));
6640 	free(x); free(y);
6641 	return 1;
6642     }
6643 
6644     if (n1 == 1 || n2 == 1) {
6645 	pputs(prn, _("Sample range has only one observation."));
6646 	free(x); free(y);
6647 	return 1;
6648     }
6649 
6650     gretl_moments(0, n1-1, x, NULL, &m, &s1, NULL, NULL, 1);
6651     gretl_moments(0, n2-1, y, NULL, &m, &s2, NULL, NULL, 1);
6652 
6653     var1 = s1*s1;
6654     var2 = s2*s2;
6655     if (var1 > var2) {
6656 	F = var1 / var2;
6657 	dfn = n1 - 1;
6658 	dfd = n2 - 1;
6659     } else {
6660 	F = var2 / var1;
6661 	dfn = n2 - 1;
6662 	dfd = n1 - 1;
6663     }
6664 
6665     pval = snedecor_cdf_comp(dfn, dfd, F);
6666 
6667     pputs(prn, _("\nEquality of variances test\n\n"));
6668     pprintf(prn, "   %s: ", dset->varname[list[1]]);
6669     pprintf(prn, _("Number of observations = %d\n"), n1);
6670     pprintf(prn, "   %s: ", dset->varname[list[2]]);
6671     pprintf(prn, _("Number of observations = %d\n"), n2);
6672     pprintf(prn, _("   Ratio of sample variances = %g\n"), F);
6673     pprintf(prn, "   %s: %s\n", _("Null hypothesis"),
6674 	    _("The two population variances are equal"));
6675     pprintf(prn, "   %s: F(%d,%d) = %g\n", _("Test statistic"), dfn, dfd, F);
6676     pprintf(prn, _("   p-value (two-tailed) = %g\n\n"), pval);
6677     if (snedecor_cdf_comp(dfn, dfd, F) > .10)
6678 	pputs(prn, _("   The difference is not statistically significant.\n\n"));
6679 
6680     record_test_result(F, pval);
6681 
6682     free(x);
6683     free(y);
6684 
6685     return 0;
6686 }
6687 
6688 struct MahalDist_ {
6689     int *list;
6690     int n;
6691     double *d;
6692 };
6693 
mahal_dist_get_distances(const MahalDist * md)6694 const double *mahal_dist_get_distances (const MahalDist *md)
6695 {
6696     return md->d;
6697 }
6698 
mahal_dist_get_n(const MahalDist * md)6699 int mahal_dist_get_n (const MahalDist *md)
6700 {
6701     return md->n;
6702 }
6703 
mahal_dist_get_varlist(const MahalDist * md)6704 const int *mahal_dist_get_varlist(const MahalDist *md)
6705 {
6706     return md->list;
6707 }
6708 
free_mahal_dist(MahalDist * md)6709 void free_mahal_dist (MahalDist *md)
6710 {
6711     free(md->list);
6712     free(md->d);
6713     free(md);
6714 }
6715 
mahal_dist_new(const int * list,int n)6716 static MahalDist *mahal_dist_new (const int *list, int n)
6717 {
6718     MahalDist *md = malloc(sizeof *md);
6719 
6720     if (md != NULL) {
6721 	md->d = malloc(n * sizeof *md->d);
6722 	if (md->d == NULL) {
6723 	    free(md);
6724 	    md = NULL;
6725 	} else {
6726 	    md->list = gretl_list_copy(list);
6727 	    if (md->list == NULL) {
6728 		free(md->d);
6729 		free(md);
6730 		md = NULL;
6731 	    } else {
6732 		md->n = n;
6733 	    }
6734 	}
6735     }
6736 
6737     if (md != NULL) {
6738 	int t;
6739 
6740 	for (t=0; t<n; t++) {
6741 	    md->d[t] = NADBL;
6742 	}
6743     }
6744 
6745     return md;
6746 }
6747 
mdist_saver(DATASET * dset)6748 static int mdist_saver (DATASET *dset)
6749 {
6750     int t, v, err;
6751 
6752     err = dataset_add_series(dset, 1);
6753 
6754     if (err) {
6755 	v = 0;
6756     } else {
6757 	v = dset->v - 1;
6758 
6759 	for (t=0; t<dset->n; t++) {
6760 	    dset->Z[v][t] = NADBL;
6761 	}
6762 
6763 	strcpy(dset->varname[v], "mdist");
6764 	make_varname_unique(dset->varname[v], v, dset);
6765 	series_set_label(dset, v, _("Mahalanobis distances"));
6766     }
6767 
6768     return v;
6769 }
6770 
6771 static int
real_mahalanobis_distance(const int * list,DATASET * dset,gretlopt opt,MahalDist * md,PRN * prn)6772 real_mahalanobis_distance (const int *list, DATASET *dset,
6773 			   gretlopt opt, MahalDist *md,
6774 			   PRN *prn)
6775 {
6776     gretl_matrix *S = NULL;
6777     gretl_vector *means = NULL;
6778     gretl_vector *xdiff;
6779     int orig_t1 = dset->t1;
6780     int orig_t2 = dset->t2;
6781     int n, err = 0;
6782 
6783     list_adjust_sample(list, &dset->t1, &dset->t2, dset, NULL);
6784 
6785     n = sample_size(dset);
6786     if (n < 2) {
6787 	err = E_DATA;
6788 	goto bailout;
6789     }
6790 
6791     xdiff = gretl_column_vector_alloc(list[0]);
6792     if (xdiff == NULL) {
6793 	err = E_ALLOC;
6794 	goto bailout;
6795     }
6796 
6797     S = gretl_covariance_matrix_from_varlist(list,
6798 					     dset,
6799 					     &means,
6800 					     &err);
6801 
6802     if (!err) {
6803 	if (opt & OPT_V) {
6804 	    gretl_matrix_print_to_prn(S, _("Covariance matrix"), prn);
6805 	}
6806 	err = gretl_invert_symmetric_matrix(S);
6807 	if (err) {
6808 	    fprintf(stderr, "error inverting covariance matrix\n");
6809 	} else if (opt & OPT_V) {
6810 	    gretl_matrix_print_to_prn(S, _("Inverse of covariance matrix"), prn);
6811 	}
6812     }
6813 
6814     if (!err) {
6815 	int k = gretl_vector_get_length(means);
6816 	int miss, obslen = 0, savevar = 0;
6817 	double m, x, xbar;
6818 	int i, t, vi;
6819 
6820 	if (opt & OPT_S) {
6821 	    /* save the results to a data series */
6822 	    savevar = mdist_saver(dset);
6823 	}
6824 
6825 	if (prn != NULL) {
6826 	    pprintf(prn, "%s\n", _("Mahalanobis distances from the centroid"));
6827 	    pprintf(prn, "%s\n", _("using the variables:"));
6828 	    for (i=1; i<=list[0]; i++) {
6829 		pprintf(prn, " %s\n", dset->varname[list[i]]);
6830 	    }
6831 	    pputc(prn, '\n');
6832 
6833 	    obslen = max_obs_marker_length(dset);
6834 	    if (obslen < 8) {
6835 		obslen = 8;
6836 	    }
6837 	}
6838 
6839 	for (t=dset->t1; t<=dset->t2; t++) {
6840 	    miss = 0;
6841 
6842 	    /* write vector of deviations from centroid for
6843 	       observation t */
6844 	    for (i=0; i<k; i++) {
6845 		vi = list[i+1];
6846 		xbar = gretl_vector_get(means, i);
6847 		x = dset->Z[vi][t];
6848 		miss += na(x);
6849 		if (!miss) {
6850 		    gretl_vector_set(xdiff, i,  x - xbar);
6851 		}
6852 	    }
6853 
6854 	    if (miss) {
6855 		m = NADBL;
6856 	    } else {
6857 		m = gretl_scalar_qform(xdiff, S, &err);
6858 		if (!err) {
6859 		    m = sqrt(m);
6860 		}
6861 	    }
6862 
6863 	    if (prn != NULL) {
6864 		print_obs_marker(t, dset, obslen, prn);
6865 		if (err || miss) {
6866 		    pprintf(prn, "NA\n");
6867 		} else {
6868 		    pprintf(prn, "%9.6f\n", m);
6869 		}
6870 	    }
6871 
6872 	    if (savevar > 0) {
6873 		dset->Z[savevar][t] = m;
6874 	    } else if (md != NULL) {
6875 		md->d[t] = m;
6876 	    }
6877 	}
6878 
6879 	if (savevar > 0 && prn != NULL) {
6880 	    pputc(prn, '\n');
6881 	    pprintf(prn, _("Distances saved as '%s'"),
6882 		    dset->varname[savevar]);
6883 	    pputc(prn, '\n');
6884 	}
6885 
6886 	if (prn != NULL) {
6887 	    pputc(prn, '\n');
6888 	}
6889     }
6890 
6891     gretl_matrix_free(xdiff);
6892     gretl_matrix_free(means);
6893     gretl_matrix_free(S);
6894 
6895  bailout:
6896 
6897     dset->t1 = orig_t1;
6898     dset->t2 = orig_t2;
6899 
6900     return err;
6901 }
6902 
mahalanobis_distance(const int * list,DATASET * dset,gretlopt opt,PRN * prn)6903 int mahalanobis_distance (const int *list, DATASET *dset,
6904 			  gretlopt opt, PRN *prn)
6905 {
6906     return real_mahalanobis_distance(list, dset, opt, NULL,
6907 				     (opt & OPT_Q) ? NULL : prn);
6908 }
6909 
get_mahal_distances(const int * list,DATASET * dset,gretlopt opt,PRN * prn,int * err)6910 MahalDist *get_mahal_distances (const int *list, DATASET *dset,
6911 				gretlopt opt, PRN *prn, int *err)
6912 {
6913     MahalDist *md = mahal_dist_new(list, dset->n);
6914 
6915     if (md == NULL) {
6916 	*err = E_ALLOC;
6917     } else {
6918 	*err = real_mahalanobis_distance(list, dset, opt,
6919 					 md, prn);
6920 	if (*err) {
6921 	    free_mahal_dist(md);
6922 	    md = NULL;
6923 	}
6924     }
6925 
6926     return md;
6927 }
6928 
6929 /*
6930    G = [(2 * \sum_i^n (i * x_i)) / (n * \sum_i^n x_i )] - [(n + 1)/n]
6931 
6932    where x is sorted: x_i <= x_{i+1}
6933 */
6934 
gini_coeff(const double * x,int t1,int t2,double ** plz,int * pn,int * err)6935 static double gini_coeff (const double *x, int t1, int t2, double **plz,
6936 			  int *pn, int *err)
6937 {
6938     int m = t2 - t1 + 1;
6939     double *sx = NULL;
6940     double csx = 0.0, sumx = 0.0, sisx = 0.0;
6941     double G;
6942     int t, n = 0;
6943 
6944     gretl_error_clear();
6945 
6946     sx = malloc(m * sizeof *sx);
6947 
6948     if (sx == NULL) {
6949 	*err = E_ALLOC;
6950 	return NADBL;
6951     }
6952 
6953     n = 0;
6954     for (t=t1; t<=t2; t++) {
6955 	if (na(x[t])) {
6956 	    continue;
6957 	} else if (x[t] < 0.0) {
6958 	    gretl_errmsg_set(_("illegal negative value"));
6959 	    *err = E_DATA;
6960 	    break;
6961 	} else {
6962 	    sx[n++] = x[t];
6963 	    sumx += x[t];
6964 	}
6965     }
6966 
6967     if (!*err && (n == 0 || sumx == 0.0)) {
6968 	*err = E_DATA;
6969     }
6970 
6971     if (*err) {
6972 	free(sx);
6973 	return NADBL;
6974     }
6975 
6976     qsort(sx, n, sizeof *sx, gretl_compare_doubles);
6977 
6978 #if 0
6979     if (1) {
6980 	/* just testing alternative calculation for equivalence */
6981 	double num, S[2];
6982 	int s, fyi;
6983 
6984 	num = S[0] = S[1] = 0.0;
6985 
6986 	for (t=0; t<n; t++) {
6987 	    fyi = 0;
6988 	    s = t;
6989 	    while (s < n && sx[s] == sx[t]) {
6990 		fyi++;
6991 		s++;
6992 	    }
6993 	    S[1] += (double) fyi/n * sx[t];
6994 	    num += (double) fyi/n * (S[0] + S[1]);
6995 	    t = s - 1;
6996 	    S[0] = S[1];
6997 	}
6998 
6999 	G = 1.0 - num / S[1];
7000 	fprintf(stderr, "alt G = %g\n", G);
7001     }
7002 #endif
7003 
7004     for (t=0; t<n; t++) {
7005 	csx += sx[t];
7006 	sisx += (t + 1) * sx[t];
7007 	sx[t] = csx / sumx; /* sx now = Lorenz curve */
7008     }
7009 
7010     G = 2.0 * sisx / (n * sumx) - ((double) n + 1) / n;
7011 
7012     if (plz != NULL) {
7013 	*plz = sx;
7014 	*pn = n;
7015     } else {
7016 	free(sx);
7017     }
7018 
7019     return G;
7020 }
7021 
7022 /**
7023  * gretl_gini:
7024  * @t1: starting observation.
7025  * @t2: ending observation.
7026  * @x: data series.
7027  *
7028  * Returns: the Gini coefficient for the series @x from obs
7029  * @t1 to obs @t2, skipping any missing values, or #NADBL
7030  * on failure.
7031  */
7032 
gretl_gini(int t1,int t2,const double * x)7033 double gretl_gini (int t1, int t2, const double *x)
7034 {
7035     double g;
7036     int err = 0;
7037 
7038     g = gini_coeff(x, t1, t2, NULL, NULL, &err);
7039 
7040     if (err) {
7041 	g = NADBL;
7042     }
7043 
7044     return g;
7045 }
7046 
lorenz_graph(const char * vname,double * lz,int n)7047 static int lorenz_graph (const char *vname, double *lz, int n)
7048 {
7049     FILE *fp;
7050     double idx;
7051     int downsample = 0;
7052     int t, err = 0;
7053 
7054     fp = open_plot_input_file(PLOT_REGULAR, 0, &err);
7055     if (err) {
7056 	return err;
7057     }
7058 
7059     print_keypos_string(GP_KEY_LEFT_TOP, fp);
7060 
7061     fprintf(fp, "set title '%s'\n", vname);
7062     fprintf(fp, "plot \\\n"
7063 	    "'-' using 1:2 title '%s' w lines, \\\n"
7064 	    "'-' using 1:2 notitle w lines\n",
7065 	    _("Lorenz curve"));
7066 
7067     gretl_push_c_numeric_locale();
7068 
7069     if (n > 4000) {
7070 	downsample = (int) (n / 1000.0);
7071     }
7072 
7073     for (t=0; t<n; t++) {
7074 	if (downsample && t % downsample) {
7075 	    continue;
7076 	}
7077 	idx = t + 1;
7078 	fprintf(fp, "%g %g\n", idx / n, lz[t]);
7079     }
7080     fputs("e\n", fp);
7081 
7082     for (t=0; t<n; t++) {
7083 	if (downsample && t % downsample) {
7084 	    continue;
7085 	}
7086 	idx = ((double) t + 1) / n;
7087 	fprintf(fp, "%g %g\n", idx, idx);
7088     }
7089     fputs("e\n", fp);
7090 
7091     gretl_pop_c_numeric_locale();
7092 
7093     return finalize_plot_input_file(fp);
7094 }
7095 
7096 /**
7097  * gini:
7098  * @varno: ID number of variable to examine.
7099  * @dset: dataset struct.
7100  * @opt: unused at present.
7101  * @prn: gretl printing struct.
7102  *
7103  * Graphs the Lorenz curve for variable @vnum and prints the
7104  * Gini coefficient.
7105  *
7106  * Returns: 0 on successful completion, error code on error.
7107  */
7108 
gini(int varno,DATASET * dset,gretlopt opt,PRN * prn)7109 int gini (int varno, DATASET *dset, gretlopt opt, PRN *prn)
7110 {
7111     double *lz = NULL;
7112     double gini;
7113     int n, fulln;
7114     int err = 0;
7115 
7116     gini = gini_coeff(dset->Z[varno], dset->t1, dset->t2,
7117 		      &lz, &n, &err);
7118 
7119     if (err) {
7120 	return err;
7121     }
7122 
7123     fulln = dset->t2 - dset->t1 - 1;
7124     pprintf(prn, "%s\n", dset->varname[varno], n);
7125     pprintf(prn, _("Number of observations = %d\n"), n);
7126 
7127     if (n < fulln) {
7128 	pputs(prn, _("Warning: there were missing values\n"));
7129     }
7130 
7131     pputc(prn, '\n');
7132 
7133     pprintf(prn, "%s = %g\n", _("Sample Gini coefficient"), gini);
7134     pprintf(prn, "%s = %g\n", _("Estimate of population value"),
7135 	    gini * (double) n / (n - 1));
7136 
7137     err = lorenz_graph(dset->varname[varno], lz, n);
7138 
7139     free(lz);
7140 
7141     return err;
7142 }
7143 
7144 /* Following: apparatus for Shapiro-Wilk normality test.
7145    Thanks to Marcin Blazejowski for the contribution.
7146 */
7147 
7148 #ifndef min
7149 # define min(a, b) ((a) > (b) ? (b) : (a))
7150 #endif
7151 
sign(int a)7152 static int sign (int a)
7153 {
7154     return (a == 0)? 0 : ((a < 0)? -1 : 1);
7155 }
7156 
compare_floats(const void * a,const void * b)7157 static int compare_floats (const void *a, const void *b)
7158 {
7159     const float *fa = (const float *) a;
7160     const float *fb = (const float *) b;
7161 
7162     return (*fa > *fb) - (*fa < *fb);
7163 }
7164 
7165 /* Algorithm AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2
7166    Calculates the algebraic polynomial of order n-1 with
7167    array of coefficients cc.  Zero-order coefficient is
7168    cc(1) = cc[0]
7169 */
7170 
poly(const float * cc,int n,float x)7171 static double poly (const float *cc, int n, float x) {
7172     double p, ret = cc[0]; /* preserve precision! */
7173     int j;
7174 
7175     if (n > 1) {
7176 	p = x * cc[n-1];
7177 	for (j=n-2; j>0; j--) {
7178 	    p = (p + cc[j]) * x;
7179 	}
7180 	ret += p;
7181     }
7182 
7183     return ret;
7184 }
7185 
7186 /* Calculate coefficients for the Shapiro-Wilk test */
7187 
sw_coeff(int n,int n2,float * a)7188 static void sw_coeff (int n, int n2, float *a)
7189 {
7190     const float c1[6] = { 0.f,.221157f,-.147981f,-2.07119f, 4.434685f, -2.706056f };
7191     const float c2[6] = { 0.f,.042981f,-.293762f,-1.752461f,5.682633f, -3.582633f };
7192     float summ2, ssumm2;
7193     float a0, a1, an = (float) n;
7194     float fac, an25, rsn;
7195     int i, i1, nn2 = n / 2;
7196 
7197     if (n == 3) {
7198 	a[0] = sqrt(0.5);
7199     } else {
7200 	an25 = an + .25;
7201 	summ2 = 0.0;
7202 	for (i=0; i<n2; i++) {
7203 	    a[i] = (float) normal_cdf_inverse((i + 1 - .375f) / an25);
7204 	    summ2 += a[i] * a[i];
7205 	}
7206 	summ2 *= 2.0;
7207 	ssumm2 = sqrt(summ2);
7208 	rsn = 1.0 / sqrt(an);
7209 	a0 = poly(c1, 6, rsn) - a[0] / ssumm2;
7210 
7211 	/* Normalize array a */
7212 	if (n > 5) {
7213 	    i1 = 2;
7214 	    a1 = -a[1] / ssumm2 + poly(c2, 6, rsn);
7215 	    fac = sqrt((summ2 - 2.0 * (a[0] * a[0]) - 2.0 * (a[1] * a[1])) /
7216 		       (1.0 - 2.0 * (a0 * a0) - 2.0 * (a1 * a1)));
7217 	    a[1] = a1;
7218 	} else {
7219 	    i1 = 1;
7220 	    fac = sqrt((summ2 - 2.0 * (a[0] * a[0])) / (1.0 - 2.0 * (a0 * a0)));
7221 	}
7222 	a[0] = a0;
7223 	for (i=i1; i<nn2; i++) {
7224 	    a[i] /= -fac;
7225 	}
7226     }
7227 }
7228 
7229 /* ALGORITHM AS R94 APPL. STATIST. (1995) vol. 44, no. 4, 547-551.
7230    Calculates the Shapiro-Wilk W test and its significance level,
7231    Rendered into C by Marcin Blazejowski, May 2008.
7232 */
7233 
sw_w(float * x,int n,int n1,float * a,double * W,double * pval)7234 static int sw_w (float *x, int n, int n1, float *a,
7235 		 double *W, double *pval)
7236 {
7237     const float z90 = 1.2816f;
7238     const float z95 = 1.6449f;
7239     const float z99 = 2.3263f;
7240     const float zm = 1.7509f;
7241     const float zss = .56268f;
7242     const float bf1 = .8378f;
7243     const double xx90 = .556;
7244     const double xx95 = .622;
7245     const float little = 1e-19f; /* "small" is reserved on win32 */
7246     const float pi6 = 1.909859f;
7247     const float stqr = 1.047198f;
7248 
7249     /* polynomial coefficients */
7250     const float  g[2] = { -2.273f, .459f };
7251     const float c3[4] = { .544f, -.39978f, .025054f, -6.714e-4f };
7252     const float c4[4] = { 1.3822f, -.77857f, .062767f, -.0020322f };
7253     const float c5[4] = { -1.5861f, -.31082f, -.083751f, .0038915f };
7254     const float c6[3] = { -.4803f, -.082676f, .0030302f };
7255     const float c7[2] = { .164f, .533f };
7256     const float c8[2] = { .1736f, .315f };
7257     const float c9[2] = { .256f, -.00635f };
7258 
7259     float r1, zbar, ssassx, gamma, range;
7260     float bf, ld, m, s, sa, xi, sx, xx, y, w1;
7261     float asa, ssa, z90f, sax, zfm, z95f, zsd, z99f, ssx, xsx;
7262     float an = (float) n;
7263     int i, j, ncens = n - n1;
7264 
7265     /* Check for zero range */
7266     range = x[n1 - 1] - x[0];
7267     if (range < little) {
7268 	fprintf(stderr, "sw_w: range is too small\n");
7269 	return 1;
7270     }
7271 
7272     /* Check for correct sort order on range-scaled X */
7273     /* In fact we don't need do this, since we use qsort() from libc
7274        and we can suppose, that qsort() can sort... */
7275     xx = x[0] / range;
7276     sx = xx;
7277     sa = -a[0];
7278     j = n - 1;
7279     for (i = 1; i < n1; --j) {
7280 	xi = x[i] / range;
7281 	sx += xi;
7282 	++i;
7283 	if (i != j) {
7284 	    sa += sign(i - j) * a[min(i,j) - 1];
7285 	}
7286 	xx = xi;
7287     }
7288 
7289     /* Calculate W statistic as squared correlation between data and
7290        coefficients */
7291     sa /= n1;
7292     sx /= n1;
7293     ssa = ssx = sax = 0.0;
7294     j = n - 1;
7295     for (i=0; i<n1; i++, j--) {
7296 	if (i != j) {
7297 	    asa = sign(i - j) * a[min(i,j)] - sa;
7298 	} else {
7299 	    asa = -sa;
7300 	}
7301 	xsx = x[i] / range - sx;
7302 	ssa += asa * asa;
7303 	ssx += xsx * xsx;
7304 	sax += asa * xsx;
7305     }
7306 
7307     /* W1 equals (1-W) calculated to avoid excessive rounding error
7308        for W very near 1 (a potential problem in very large samples)
7309     */
7310     ssassx = sqrt(ssa * ssx);
7311     w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx);
7312     *W = 1 - w1;
7313 
7314     /* Calculate significance level for W */
7315     if (n == 3) {
7316 	/* exact P-value */
7317 	*pval = pi6 * (asin(sqrt(*W)) - stqr);
7318 	return 0;
7319     }
7320 
7321     y = log(w1);
7322     xx = log(an);
7323 
7324     if (n <= 11) {
7325 	gamma = poly(g, 2, an);
7326 	if (y >= gamma) {
7327 	    /* FIXME: rather use an even smaller value, or NA? */
7328 	    *pval = little;
7329 	    return 0;
7330 	}
7331 	y = -log(gamma - y);
7332 	m = poly(c3, 4, an);
7333 	s = exp(poly(c4, 4, an));
7334     } else {
7335 	/* n >= 12 */
7336 	m = poly(c5, 4, xx);
7337 	s = exp(poly(c6, 3, xx));
7338     }
7339 
7340     if (ncens > 0) {
7341 	/* Censoring by proportion ncens/n.
7342 	   Calculate mean and sd of normal equivalent deviate of W
7343 	*/
7344 	float delta = (float) ncens / an;
7345 
7346 	ld = -log(delta);
7347 	bf = 1.0 + xx * bf1;
7348 	r1 = pow(xx90, (double) xx);
7349 	z90f = z90 + bf * pow(poly(c7, 2, r1), (double) ld);
7350 	r1 = pow(xx95, (double) xx);
7351 	z95f = z95 + bf * pow(poly(c8, 2, r1), (double) ld);
7352 	z99f = z99 + bf * pow(poly(c9, 2, xx), (double) ld);
7353 
7354 	/* Regress Z90F,...,Z99F on normal deviates Z90,...,Z99 to get
7355 	   pseudo-mean and pseudo-sd of z as the slope and intercept
7356 	*/
7357 	zfm = (z90f + z95f + z99f) / 3.0;
7358 	zsd = (z90 * (z90f - zfm) + z95 * (z95f - zfm) + z99 * (z99f - zfm)) / zss;
7359 	zbar = zfm - zsd * zm;
7360 	m += zbar * s;
7361 	s *= zsd;
7362     }
7363 
7364     *pval = normal_cdf_comp(((double) y - (double) m) / (double) s);
7365 
7366     return 0;
7367 }
7368 
sw_sample_check(int n,int n1)7369 static int sw_sample_check (int n, int n1)
7370 {
7371     int ncens = n - n1;
7372     float delta, an = n;
7373 
7374     if (n < 3 || n1 < 3) {
7375 	fprintf(stderr, "There is no way to run SW test for less then 3 obs\n");
7376 	return E_DATA;
7377     }
7378 
7379     if (ncens < 0 || (ncens > 0 && n < 20)) {
7380 	fprintf(stderr, "sw_w: not enough uncensored obserations\n");
7381 	return E_DATA;
7382     }
7383 
7384     delta = (float) ncens / an;
7385     if (delta > .8f) {
7386 	fprintf(stderr, "sw_w: too many censored obserations\n");
7387 	return E_DATA;
7388     }
7389 
7390     return 0;
7391 }
7392 
7393 /**
7394  * shapiro_wilk:
7395  * @x: data array.
7396  * @t1: starting observation.
7397  * @t2: ending observation.
7398  * @W: location to receive test statistic.
7399  * @pval: location to receive p-value.
7400  *
7401  * Computes the Shapiro-Wilk W statistic as a test for
7402  * normality of the data @x, and also the p-value for
7403  * the test. These are written into the pointer
7404  * arguments @W and @pval.
7405  *
7406  * Returns: 0 on success, non-zero on failure.
7407 */
7408 
shapiro_wilk(const double * x,int t1,int t2,double * W,double * pval)7409 int shapiro_wilk (const double *x, int t1, int t2, double *W, double *pval)
7410 {
7411     int n1 = 0;         /* number of uncensored obs */
7412     float *a = NULL;	/* array of coefficients */
7413     float *xf = NULL;   /* copy of x in float format */
7414     int i, t, n2, n = 0;
7415     int err = 0;
7416 
7417     *W = *pval = NADBL;
7418 
7419     for (t=t1; t<=t2; t++) {
7420 	if (!na(x[t])) n++;
7421     }
7422 
7423     /* for now we assume all obs are uncensored */
7424     n1 = n;
7425 
7426     err = sw_sample_check(n, n1);
7427     if (err) {
7428 	return err;
7429     }
7430 
7431     /* How many coeffs should be computed? */
7432     n2 = fmod(n, 2.0);
7433     n2 = (n2 == 0)?  n / 2 : (n - 1) / 2;
7434 
7435     xf = malloc(n * sizeof *xf);
7436     a = malloc(n2 * sizeof *a);
7437 
7438     if (xf == NULL || a == NULL) {
7439 	err = E_ALLOC;
7440     } else {
7441 	i = 0;
7442 	for (t=t1; t<=t2; t++) {
7443 	    if (!na(x[t])) {
7444 		xf[i++] = x[t];
7445 	    }
7446 	}
7447 	qsort(xf, n, sizeof *xf, compare_floats);
7448 	/* Main job: compute W stat and its p-value */
7449 	sw_coeff(n, n2, a);
7450 	err = sw_w(xf, n, n1, a, W, pval);
7451     }
7452 
7453     free(a);
7454     free(xf);
7455 
7456     return err;
7457 }
7458 
round2(double x)7459 static double round2 (double x)
7460 {
7461     x *= 100.0;
7462     x = (x - floor(x) < 0.5)? floor(x) : ceil(x);
7463     return x / 100;
7464 }
7465 
7466 /* Polynomial approximation to the p-value for the Lilliefors test,
7467    said to be good to 2 decimal places.  See Abdi and Molin,
7468    "Lilliefors/Van Soest's test of normality", in Salkind (ed.)
7469    Encyclopedia of Measurement and Statistics, Sage, 2007; also
7470    http://www.utd.edu/~herve/Abdi-Lillie2007-pretty.pdf
7471  */
7472 
lilliefors_pval(double L,int N)7473 static double lilliefors_pval (double L, int N)
7474 {
7475     double b0 = 0.37872256037043;
7476     double b1 = 1.30748185078790;
7477     double b2 = 0.08861783849346;
7478     double b1pN = b1 + N;
7479     double Lm2 = 1.0 / (L * L);
7480     double A, pv;
7481 
7482     A = (-b1pN + sqrt(b1pN * b1pN - 4 * b2 * (b0 - Lm2))) / (2 * b2);
7483 
7484     pv = -0.37782822932809 + 1.67819837908004*A
7485 	- 3.02959249450445*A*A + 2.80015798142101*pow(A, 3.)
7486 	- 1.39874347510845*pow(A, 4.) + 0.40466213484419*pow(A, 5.)
7487 	- 0.06353440854207*pow(A, 6.) + 0.00287462087623*pow(A, 7.)
7488 	+ 0.00069650013110*pow(A, 8.) - 0.00011872227037*pow(A, 9.)
7489 	+ 0.00000575586834*pow(A, 10.);
7490 
7491     if (pv < 0) {
7492 	pv = 0;
7493     } else if (pv > 1) {
7494 	pv = 1;
7495     } else {
7496 	/* round to 2 digits */
7497 	pv = round2(pv);
7498     }
7499 
7500     return pv;
7501 }
7502 
lilliefors_test(const double * x,int t1,int t2,double * L,double * pval)7503 static int lilliefors_test (const double *x, int t1, int t2,
7504 			    double *L, double *pval)
7505 {
7506     double *zx;   /* copy of x for sorting, z-scoring */
7507     int i, t, n = 0;
7508     int err = 0;
7509 
7510     *L = *pval = NADBL;
7511 
7512     for (t=t1; t<=t2; t++) {
7513 	if (!na(x[t])) n++;
7514     }
7515 
7516     if (n < 5) {
7517 	/* we need more than 4 data points */
7518 	return E_DATA;
7519     }
7520 
7521     zx = malloc(n * sizeof *zx);
7522 
7523     if (zx == NULL) {
7524 	err = E_ALLOC;
7525     } else {
7526 	double dx, mx = 0.0, sx = 0.0;
7527 	double Phi, Dp = 0.0, Dm = 0.0;
7528 	double Dmax = 0.0;
7529 
7530 	/* compute sample mean and standard deviation plus
7531 	   sorted z-scores */
7532 
7533 	i = 0;
7534 	for (t=t1; t<=t2; t++) {
7535 	    if (!na(x[t])) {
7536 		zx[i++] = x[t];
7537 		mx += x[t];
7538 	    }
7539 	}
7540 	mx /= n;
7541 
7542 	for (t=t1; t<=t2; t++) {
7543 	    if (!na(x[t])) {
7544 		dx = x[t] - mx;
7545 		sx += dx * dx;
7546 	    }
7547 	}
7548 	sx = sqrt(sx / (n - 1));
7549 
7550 	qsort(zx, n, sizeof *zx, gretl_compare_doubles);
7551 
7552 	for (i=0; i<n; i++) {
7553 	    zx[i] = (zx[i] - mx) / sx;
7554 	}
7555 
7556 	/* The following is based on the formula given in the
7557 	   "nortest" package for GNU R, function lillie.test (written
7558 	   by Juergen Gross).  It differs from the algorithm given by
7559 	   Abdi and Molin (whose p-value approximation is used above).
7560 	   The A & M account of the statistic itself seems to be
7561 	   wrong: it consistently over-rejects (i.e. around 8 percent
7562 	   of the time for a nominal 5 percent significance level).
7563 	*/
7564 
7565 	for (i=0; i<n; i++) {
7566 	    Phi = normal_cdf(zx[i]);
7567 	    Dp = (double) (i+1) / n - Phi;
7568 	    Dm = Phi - (double) i / n;
7569 	    if (Dp > Dmax) Dmax = Dp;
7570 	    if (Dm > Dmax) Dmax = Dm;
7571 	}
7572 
7573 	*L = Dmax;
7574 	*pval = lilliefors_pval(Dmax, n);
7575     }
7576 
7577     free(zx);
7578 
7579     return err;
7580 }
7581 
skew_kurt_test(const double * x,int t1,int t2,double * test,double * pval,gretlopt opt)7582 static int skew_kurt_test (const double *x, int t1, int t2,
7583 			   double *test, double *pval,
7584 			   gretlopt opt)
7585 {
7586     double skew, xkurt;
7587     int n, err = 0;
7588 
7589     err = series_get_moments(t1, t2, x, &skew, &xkurt, &n);
7590 
7591     if (!err) {
7592 	if (opt & OPT_J) {
7593 	    /* Jarque-Bera */
7594 	    *test = (n / 6.0) * (skew * skew + xkurt * xkurt/ 4.0);
7595 	} else {
7596 	    /* Doornik-Hansen */
7597 	    *test = doornik_chisq(skew, xkurt, n);
7598 	}
7599     }
7600 
7601     if (!err && na(*test)) {
7602 	err = E_NAN;
7603     }
7604 
7605     if (!na(*test)) {
7606 	*pval = chisq_cdf_comp(2, *test);
7607     }
7608 
7609     return err;
7610 }
7611 
print_normality_stat(double test,double pval,gretlopt opt,PRN * prn)7612 static void print_normality_stat (double test, double pval,
7613 				  gretlopt opt, PRN *prn)
7614 {
7615     const char *tstrs[] = {
7616 	N_("Shapiro-Wilk W"),
7617 	N_("Jarque-Bera test"),
7618 	N_("Lilliefors test"),
7619 	N_("Doornik-Hansen test")
7620     };
7621     int i = (opt & OPT_W)? 0 : (opt & OPT_J)? 1 :
7622 	(opt & OPT_L)? 2 : 3;
7623 
7624     if (na(test) || na(pval)) {
7625 	return;
7626     }
7627 
7628     if (opt & OPT_L) {
7629 	pprintf(prn, " %s = %g, %s ~= %g\n\n",
7630 		_(tstrs[i]), test, _("with p-value"), pval);
7631     } else {
7632 	pprintf(prn, " %s = %g, %s %g\n\n",
7633 		_(tstrs[i]), test, _("with p-value"), pval);
7634     }
7635 }
7636 
7637 /**
7638  * gretl_normality_test:
7639  * @varno: ID number of the variable to process.
7640  * @dset: dataset struct.
7641  * @opt: OPT_A: all tests; OPT_D: Doornik-Hansen; OPT_W: Shapiro-Wilk;
7642  * OPT_J: Jarque-Bera; OPT_L: Lilliefors; default is Doornik-Hansen.
7643  * Also use OPT_Q for quiet.
7644  * @prn: gretl printing struct.
7645  *
7646  * Performs, and prints the results of, the specified test(s) randomness
7647  * for the variable specified by @v.
7648  *
7649  * Returns: 0 on successful completion, non-zero on error.
7650  */
7651 
gretl_normality_test(int varno,const DATASET * dset,gretlopt opt,PRN * prn)7652 int gretl_normality_test (int varno, const DATASET *dset,
7653 			  gretlopt opt, PRN *prn)
7654 {
7655     gretlopt alltests = OPT_D | OPT_W | OPT_J | OPT_L;
7656     double test = NADBL;
7657     double pval = NADBL;
7658     double trec = NADBL;
7659     double pvrec = NADBL;
7660     int err;
7661 
7662     if (varno < 0 || varno >= dset->v) {
7663 	return E_DATA;
7664     }
7665 
7666     err = incompatible_options(opt, OPT_A | alltests);
7667     if (err) {
7668 	return err;
7669     }
7670 
7671     if (opt & OPT_A) {
7672 	/* show all tests */
7673 	opt |= alltests;
7674     }
7675 
7676     if (!(opt & alltests)) {
7677 	/* no method selected: use Doornik-Hansen */
7678 	opt |= OPT_D;
7679     }
7680 
7681     if (!(opt & OPT_Q)) {
7682 	pprintf(prn, _("Test for normality of %s:"), dset->varname[varno]);
7683 	if (opt & OPT_A) {
7684 	    pputs(prn, "\n\n");
7685 	} else {
7686 	    pputc(prn, '\n');
7687 	}
7688     }
7689 
7690     if (opt & OPT_D) {
7691 	err = skew_kurt_test(dset->Z[varno], dset->t1, dset->t2,
7692 			     &test, &pval, OPT_D);
7693 	if (!err && !(opt & OPT_Q)) {
7694 	    print_normality_stat(test, pval, OPT_D, prn);
7695 	}
7696 	if (!err) {
7697 	    /* record Doornik-Hansen result by default */
7698 	    trec = test;
7699 	    pvrec = pval;
7700 	}
7701     }
7702 
7703     if (opt & OPT_W) {
7704 	err = shapiro_wilk(dset->Z[varno], dset->t1, dset->t2,
7705 			   &test, &pval);
7706 	if (!err && !(opt & OPT_Q)) {
7707 	    print_normality_stat(test, pval, OPT_W, prn);
7708 	}
7709     }
7710 
7711     if (opt & OPT_L) {
7712 	err = lilliefors_test(dset->Z[varno], dset->t1, dset->t2,
7713 			      &test, &pval);
7714 	if (!err && !(opt & OPT_Q)) {
7715 	    print_normality_stat(test, pval, OPT_L, prn);
7716 	}
7717     }
7718 
7719     if (opt & OPT_J) {
7720 	err = skew_kurt_test(dset->Z[varno], dset->t1, dset->t2,
7721 			     &test, &pval, OPT_J);
7722 	if (!err && !(opt & OPT_Q)) {
7723 	    print_normality_stat(test, pval, OPT_J, prn);
7724 	}
7725     }
7726 
7727     if (na(trec) && !na(test)) {
7728 	trec = test;
7729     }
7730 
7731     if (na(pvrec) && !na(pval)) {
7732 	pvrec = pval;
7733     }
7734 
7735     if (!na(trec) && !na(pvrec)) {
7736 	record_test_result(trec, pvrec);
7737     }
7738 
7739     return err;
7740 }
7741 
gretl_normtest_matrix(const double * y,int t1,int t2,gretlopt opt,int * err)7742 gretl_matrix *gretl_normtest_matrix (const double *y,
7743 				     int t1, int t2,
7744 				     gretlopt opt,
7745 				     int *err)
7746 {
7747     gretl_matrix *ret = NULL;
7748     double test = NADBL;
7749     double pval = NADBL;
7750 
7751     if (opt & OPT_J) {
7752 	/* Jarque-Bera */
7753 	*err = skew_kurt_test(y, t1, t2, &test, &pval, opt);
7754     } else if (opt & OPT_W) {
7755 	/* Shapiro-Wilk */
7756 	*err = shapiro_wilk(y, t1, t2, &test, &pval);
7757     } else if (opt & OPT_L) {
7758 	/* Lilliefors */
7759 	*err = lilliefors_test(y, t1, t2, &test, &pval);
7760     } else {
7761 	/* Doornik-Hansen */
7762 	*err = skew_kurt_test(y, t1, t2, &test, &pval, opt);
7763     }
7764 
7765     if (!*err) {
7766 	ret = gretl_matrix_alloc(1, 2);
7767 	if (ret == NULL) {
7768 	    *err = E_ALLOC;
7769 	} else {
7770 	    ret->val[0] = test;
7771 	    ret->val[1] = pval;
7772 	}
7773     }
7774 
7775     return ret;
7776 }
7777