1 /*
2 * gretl -- Gnu Regression, Econometrics and Time-series Library
3 * Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 */
19
20 #include "libgretl.h"
21 #include "gretl_matrix.h"
22 #include "gretl_cmatrix.h"
23 #include "matrix_extra.h"
24 #include "gretl_panel.h"
25 #include "gretl_string_table.h"
26 #include "libset.h"
27 #include "compat.h"
28 #include "plotspec.h"
29 #include "usermat.h"
30
31 #include <unistd.h>
32
33 #ifdef WIN32
34 # include <windows.h>
35 #endif
36
37 /**
38 * SECTION:describe
39 * @short_description: descriptive statistics plus some tests
40 * @title: Descriptive statistics
41 * @include: libgretl.h
42 *
43 * Computation and printing of numerous descriptive statistics
44 * along with some hypothesis tests, for example regarding the
45 * normality of a data series.
46 */
47
48 /* return the number of "good" (non-missing) observations in series x;
49 also (optionally) write to x0 the first non-missing value */
50
good_obs(const double * x,int n,double * x0)51 static int good_obs (const double *x, int n, double *x0)
52 {
53 int t, good = n;
54
55 if (x0 != NULL) {
56 *x0 = NADBL;
57 }
58
59 for (t=0; t<n; t++) {
60 if (na(x[t])) {
61 good--;
62 } else if (x0 != NULL && na(*x0)) {
63 *x0 = x[t];
64 }
65 }
66
67 return good;
68 }
69
70 /**
71 * gretl_minmax:
72 * @t1: starting observation.
73 * @t2: ending observation.
74 * @x: data series.
75 * @min: location to receive minimum value.
76 * @max: location to receive maximum value.
77 *
78 * Puts the minimum and maximum values of the series @x,
79 * from obs @t1 to obs @t2, into the variables @min and @max.
80 *
81 * Returns: the number of valid observations in the given
82 * data range.
83 */
84
gretl_minmax(int t1,int t2,const double * x,double * min,double * max)85 int gretl_minmax (int t1, int t2, const double *x,
86 double *min, double *max)
87 {
88 int t, n = 0;
89
90 *max = *min = NADBL;
91
92 for (t=t1; t<=t2; t++) {
93 if (!(na(x[t]))) {
94 if (n == 0) {
95 *max = *min = x[t];
96 } else {
97 if (x[t] > *max) *max = x[t];
98 if (x[t] < *min) *min = x[t];
99 }
100 n++;
101 }
102 }
103
104 return n;
105 }
106
107 /**
108 * gretl_min:
109 * @t1: starting observation.
110 * @t2: ending observation.
111 * @x: data series.
112 *
113 * Returns: the minimum value of @x over the given range,
114 * or #NADBL if no valid vaues are found.
115 */
116
gretl_min(int t1,int t2,const double * x)117 double gretl_min (int t1, int t2, const double *x)
118 {
119 double min, max;
120
121 gretl_minmax(t1, t2, x, &min, &max);
122 return min;
123 }
124
125 /**
126 * gretl_max:
127 * @t1: starting observation.
128 * @t2: ending observation.
129 * @x: data series.
130 *
131 * Returns: the maximum value of @x over the given range,
132 * or #NADBL if no valid vaues are found.
133 */
134
gretl_max(int t1,int t2,const double * x)135 double gretl_max (int t1, int t2, const double *x)
136 {
137 double min, max;
138
139 gretl_minmax(t1, t2, x, &min, &max);
140 return max;
141 }
142
143 /**
144 * gretl_sum:
145 * @t1: starting observation.
146 * @t2: ending observation.
147 * @x: data series.
148 *
149 * Returns: the sum of the series @x from obs
150 * @t1 to obs @t2, skipping any missing values, or #NADBL
151 * in case there are no valid observations.
152 */
153
gretl_sum(int t1,int t2,const double * x)154 double gretl_sum (int t1, int t2, const double *x)
155 {
156 double sum = 0.0;
157 int t, n = 0;
158
159 for (t=t1; t<=t2; t++) {
160 if (!(na(x[t]))) {
161 sum += x[t];
162 n++;
163 }
164 }
165
166 return (n == 0)? NADBL : sum;
167 }
168
169 /**
170 * gretl_mean:
171 * @t1: starting observation.
172 * @t2: ending observation.
173 * @x: data series.
174 *
175 * Returns: the arithmetic mean of the series @x from obs
176 * @t1 to obs @t2, skipping any missing values, or #NADBL
177 * in case there are no valid observations.
178 */
179
gretl_mean(int t1,int t2,const double * x)180 double gretl_mean (int t1, int t2, const double *x)
181 {
182 double xbar, sum = 0.0;
183 int t, n = 0;
184
185 for (t=t1; t<=t2; t++) {
186 if (!(na(x[t]))) {
187 sum += x[t];
188 n++;
189 }
190 }
191
192 if (n == 0) {
193 return NADBL;
194 }
195
196 xbar = sum / n;
197 sum = 0.0;
198
199 for (t=t1; t<=t2; t++) {
200 if (!(na(x[t]))) {
201 sum += (x[t] - xbar);
202 }
203 }
204
205 return xbar + sum / n;
206 }
207
208 /**
209 * eval_ytest:
210 * @y: reference numerical value.
211 * @op: operator.
212 * @test: numerical test value.
213 *
214 * Returns: 1 if the expression @y @yop @test (for example
215 * "y = 2" or "y <= 45") evaluates as true, else 0.
216 */
217
eval_ytest(double y,GretlOp op,double test)218 int eval_ytest (double y, GretlOp op, double test)
219 {
220 int ret = 0;
221
222 switch (op) {
223 case OP_EQ:
224 if (na(test)) {
225 ret = na(y);
226 } else {
227 ret = (y == test);
228 }
229 break;
230 case OP_GT:
231 ret = (y > test);
232 break;
233 case OP_LT:
234 ret = (y < test);
235 break;
236 case OP_NEQ:
237 if (na(test)) {
238 ret = !na(y);
239 } else {
240 ret = (y != test);
241 }
242 break;
243 case OP_GTE:
244 ret = (y >= test);
245 break;
246 case OP_LTE:
247 ret = (y <= test);
248 break;
249 default:
250 break;
251 }
252
253 return ret;
254 }
255
256 /**
257 * gretl_restricted_mean:
258 * @t1: starting observation.
259 * @t2: ending observation.
260 * @x: data series.
261 * @y: criterion series.
262 * @yop: criterion operator.
263 * @yval: criterion value.
264 *
265 * Returns: the arithmetic mean of the series @x in the
266 * range @t1 to @t2 (inclusive), but including only
267 * observations where the criterion variable @y bears the
268 * relationship @yop to the value @yval -- or #NADBL in case
269 * there are no observations that satisfy the restriction.
270 */
271
gretl_restricted_mean(int t1,int t2,const double * x,const double * y,GretlOp yop,double yval)272 double gretl_restricted_mean (int t1, int t2, const double *x,
273 const double *y, GretlOp yop,
274 double yval)
275 {
276 double xbar, sum = 0.0;
277 int t, n = 0;
278
279 for (t=t1; t<=t2; t++) {
280 if (!na(x[t]) && eval_ytest(y[t], yop, yval)) {
281 sum += x[t];
282 n++;
283 }
284 }
285
286 if (n == 0) {
287 return NADBL;
288 }
289
290 xbar = sum / n;
291 sum = 0.0;
292
293 for (t=t1; t<=t2; t++) {
294 if (!na(x[t]) && eval_ytest(y[t], yop, yval)) {
295 sum += (x[t] - xbar);
296 }
297 }
298
299 return xbar + sum / n;
300 }
301
302 /*
303 For an array a with n elements, find the element which would be a[h]
304 if the array were sorted from smallest to largest (without the need
305 to do a full sort). The algorithm is due to C. A. R. Hoare; the C
306 implementation is shamelessly copied, with minor adaptations, from
307 http://www.astro.northwestern.edu/~ato/um/programming/sorting.html
308 (Atakan G\"urkan: link now dead).
309 */
310
find_hoare(double * a,int n,int h)311 static double find_hoare (double *a, int n, int h)
312 {
313 double w, x;
314 int l = 0;
315 int r = n - 1;
316 int i, j;
317
318 while (l < r) {
319 x = a[h];
320 i = l;
321 j = r;
322 while (j >= i) {
323 while (a[i] < x) i++;
324 while (x < a[j]) j--;
325 if (i <= j) {
326 w = a[i];
327 a[i++] = a[j];
328 a[j--] = w;
329 }
330 }
331 if (j < h) l = i;
332 if (h < i) r = j;
333 }
334
335 return a[h];
336 }
337
find_hoare_inexact(double * a,double p,double xmin,double xmax,double frac,int n,int hf,int hc)338 static double find_hoare_inexact (double *a, double p,
339 double xmin, double xmax,
340 double frac, int n,
341 int hf, int hc)
342 {
343 double tmp, high, low;
344 int i;
345
346 if (p < 0.5) {
347 high = find_hoare(a, n, hc);
348 tmp = xmin;
349 for (i=0; i<hc; i++) {
350 if (a[i] > tmp) {
351 tmp = a[i];
352 }
353 }
354 low = tmp;
355 } else {
356 low = find_hoare(a, n, hf);
357 tmp = xmax;
358 for (i=hc; i<n; i++) {
359 if (a[i] < tmp) {
360 tmp = a[i];
361 }
362 }
363 high = tmp;
364 }
365
366 return low + frac * (high - low);
367 }
368
369 /* See https://en.wikipedia.org/wiki/Quantile, also
370 Hyndman and Fan, "Sample Quantiles in Statistical
371 Packages" (The American Statistician, 1996).
372 Return the "index" of the @p-quantile for a sample
373 of size @n (which may not be an integer).
374 */
375
quantile_index(int n,double p)376 static double quantile_index (int n, double p)
377 {
378 int Qtype = libset_get_int(QUANTILE_TYPE);
379 double h;
380
381 if (Qtype == 0) {
382 /* \hat{Q}_6 */
383 h = (n + 1) * p;
384 } else if (Qtype == 1) {
385 /* \hat{Q}_7 */
386 h = (n - 1) * p + 1;
387 } else {
388 /* \hat{Q}_8 */
389 h = (n + 1.0/3) * p + 1.0/3;
390 }
391
392 return h - 1; /* 0-based */
393 }
394
395 /**
396 * gretl_quantile:
397 * @t1: starting observation.
398 * @t2: ending observation.
399 * @x: data series.
400 * @p: probability.
401 * @opt: may include OPT_Q to hush warning when sample is
402 * too small.
403 * @err: location to receive error code.
404 *
405 * Returns: the @p quantile of the series @x from obs
406 * @t1 to obs @t2, skipping any missing values, or #NADBL
407 * on failure.
408 */
409
gretl_quantile(int t1,int t2,const double * x,double p,gretlopt opt,int * err)410 double gretl_quantile (int t1, int t2, const double *x, double p,
411 gretlopt opt, int *err)
412 {
413 double *a = NULL;
414 double xmin, xmax;
415 double h, ret;
416 int hf, hc;
417 int t, n = 0;
418
419 if (*err) {
420 /* don't compound a prior error */
421 return NADBL;
422 }
423
424 if (p <= 0 || p >= 1) {
425 /* sanity check */
426 *err = E_DATA;
427 } else {
428 n = gretl_minmax(t1, t2, x, &xmin, &xmax);
429 if (n == 0) {
430 *err = E_DATA;
431 }
432 }
433
434 if (!*err) {
435 h = quantile_index(n, p);
436 hf = floor(h);
437 hc = ceil(h);
438
439 if (hc == 0 || hc == n) {
440 /* too few usable observations for the specified
441 quantile; don't treat as fatal error */
442 return NADBL;
443 } else {
444 a = malloc(n * sizeof *a);
445 if (a == NULL) {
446 *err = E_ALLOC;
447 }
448 }
449 }
450
451 if (*err) {
452 return NADBL;
453 }
454
455 n = 0;
456 for (t=t1; t<=t2; t++) {
457 if (!na(x[t])) {
458 a[n++] = x[t];
459 }
460 }
461
462 if (hf == hc) {
463 /* "exact" */
464 ret = find_hoare(a, n, hf);
465 } else {
466 ret = find_hoare_inexact(a, p, xmin, xmax, h - hf,
467 n, hf, hc);
468 }
469
470 free(a);
471
472 return ret;
473 }
474
475 /**
476 * gretl_array_quantiles:
477 * @a: data array (this gets re-ordered).
478 * @n: length of array.
479 * @p: array of probabilities (over-written by quantiles).
480 * @k: number of probabilities.
481 *
482 * Computes @k quantiles (given by the elements of @p) for the
483 * first n elements of the array @a, which is re-ordered in
484 * the process. On successful exit, @p contains the quantiles.
485 *
486 * Returns: 0 on success, non-zero code on error.
487 */
488
gretl_array_quantiles(double * a,int n,double * p,int k)489 int gretl_array_quantiles (double *a, int n, double *p, int k)
490 {
491 double h, xmin = 0, xmax = NADBL;
492 int hf, hc, i;
493 int err = 0;
494
495 if (n <= 0 || k <= 0) {
496 return E_DATA;
497 }
498
499 for (i=0; i<k; i++) {
500 if (p[i] <= 0.0 || p[i] >= 1.0) {
501 p[i] = NADBL;
502 err = E_INVARG;
503 break;
504 }
505
506 h = quantile_index(n, p[i]);
507 hf = floor(h);
508 hc = ceil(h);
509
510 if (hc == 0 || hc == n) {
511 p[i] = NADBL;
512 } else if (hf == hc) {
513 p[i] = find_hoare(a, n, hf);
514 } else {
515 if (na(xmax)) {
516 gretl_minmax(0, n-1, a, &xmin, &xmax);
517 }
518 p[i] = find_hoare_inexact(a, p[i], xmin, xmax, h - hf,
519 n, hf, hc);
520 }
521 }
522
523 return err;
524 }
525
526 /**
527 * gretl_array_quantile:
528 * @a: array on which to operate.
529 * @n: number of elements in @a.
530 * @p: probability.
531 *
532 * Returns: the @p quantile of the first @n elements in @a,
533 * which is re-ordered in the process, or #NADBL on
534 * failure.
535 */
536
gretl_array_quantile(double * a,int n,double p)537 double gretl_array_quantile (double *a, int n, double p)
538 {
539 gretl_array_quantiles(a, n, &p, 1);
540 return p;
541 }
542
543 /**
544 * gretl_median:
545 * @t1: starting observation.
546 * @t2: ending observation.
547 * @x: data series.
548 *
549 * Returns: the median value of the series @x from obs
550 * @t1 to obs @t2, skipping any missing values, or #NADBL
551 * on failure.
552 */
553
gretl_median(int t1,int t2,const double * x)554 double gretl_median (int t1, int t2, const double *x)
555 {
556 int err = 0;
557
558 return gretl_quantile(t1, t2, x, 0.5, OPT_NONE, &err);
559 }
560
561 /**
562 * gretl_sst:
563 * @t1: starting observation.
564 * @t2: ending observation.
565 * @x: data series.
566 *
567 * Returns: the sum of squared deviations from the mean for
568 * the series @x from obs @t1 to obs @t2, skipping any missing
569 * values, or #NADBL on failure.
570 */
571
gretl_sst(int t1,int t2,const double * x)572 double gretl_sst (int t1, int t2, const double *x)
573 {
574 int t, n = t2 - t1 + 1;
575 double sumsq, xx, xbar;
576
577 if (n == 0) {
578 return NADBL;
579 }
580
581 xbar = gretl_mean(t1, t2, x);
582 if (na(xbar)) {
583 return NADBL;
584 }
585
586 sumsq = 0.0;
587
588 for (t=t1; t<=t2; t++) {
589 if (!na(x[t])) {
590 xx = x[t] - xbar;
591 sumsq += xx * xx;
592 }
593 }
594
595 return sumsq;
596 }
597
598 /* allows for computing MLE rather than using df correction,
599 via the @asy argument */
600
real_gretl_variance(int t1,int t2,const double * x,int asy)601 double real_gretl_variance (int t1, int t2, const double *x,
602 int asy)
603 {
604 int t, n = t2 - t1 + 1;
605 double v, dx, xbar;
606
607 if (n == 0) {
608 /* null sample */
609 return NADBL;
610 }
611
612 xbar = gretl_mean(t1, t2, x);
613 if (na(xbar)) {
614 return NADBL;
615 }
616
617 v = 0.0;
618 n = 0;
619
620 for (t=t1; t<=t2; t++) {
621 if (!na(x[t])) {
622 dx = x[t] - xbar;
623 v += dx * dx;
624 n++;
625 }
626 }
627
628 if (asy && n > 0) {
629 v /= n;
630 } else if (n > 1) {
631 v /= (n - 1);
632 } else {
633 v = 0.0;
634 }
635
636 return (v >= 0)? v : NADBL;
637 }
638
639 /**
640 * gretl_variance:
641 * @t1: starting observation.
642 * @t2: ending observation.
643 * @x: data series.
644 *
645 * Returns: the variance of the series @x from obs
646 * @t1 to obs @t2, skipping any missing values, or #NADBL
647 * on failure.
648 */
649
gretl_variance(int t1,int t2,const double * x)650 double gretl_variance (int t1, int t2, const double *x)
651 {
652 return real_gretl_variance(t1, t2, x, 0);
653 }
654
655 /**
656 * gretl_restricted_variance:
657 * @t1: starting observation.
658 * @t2: ending observation.
659 * @x: data series.
660 * @y: criterion series.
661 * @yop: criterion operator.
662 * @yval: criterion value.
663 *
664 * Returns: the variance of the series @x from obs
665 * @t1 to obs @t2, skipping any missing values and
666 * observations where the series @y does not bear the
667 * relationship @yop to the value @yval, or #NADBL on
668 * failure.
669 */
670
gretl_restricted_variance(int t1,int t2,const double * x,const double * y,GretlOp yop,double yval)671 double gretl_restricted_variance (int t1, int t2, const double *x,
672 const double *y, GretlOp yop,
673 double yval)
674 {
675 double sumsq, xx, xbar;
676 int t, n = 0;
677
678 xbar = gretl_restricted_mean(t1, t2, x, y, yop, yval);
679 if (na(xbar)) {
680 return NADBL;
681 }
682
683 sumsq = 0.0;
684
685 for (t=t1; t<=t2; t++) {
686 if (!na(x[t]) && eval_ytest(y[t], yop, yval)) {
687 xx = x[t] - xbar;
688 sumsq += xx * xx;
689 n++;
690 }
691 }
692
693 sumsq = (n > 1)? sumsq / (n - 1) : 0.0;
694
695 return (sumsq >= 0)? sumsq : NADBL;
696 }
697
698 /**
699 * gretl_stddev:
700 * @t1: starting observation.
701 * @t2: ending observation.
702 * @x: data series.
703 *
704 * Returns: the standard deviation of the series @x from obs
705 * @t1 to obs @t2, skipping any missing values, or #NADBL
706 * on failure.
707 */
708
gretl_stddev(int t1,int t2,const double * x)709 double gretl_stddev (int t1, int t2, const double *x)
710 {
711 double v = gretl_variance(t1, t2, x);
712
713 return (na(v))? v : sqrt(v);
714 }
715
716 /**
717 * gretl_restricted_stddev:
718 * @t1: starting observation.
719 * @t2: ending observation.
720 * @x: data series.
721 * @y: criterion series.
722 * @yop: criterion operator.
723 * @yval: criterion value.
724 *
725 * Returns: the standard deviation of the series @x from obs
726 * @t1 to obs @t2, skipping any missing values and observations
727 * where the series @y does not bear the relationship @yop to
728 * the value @yval, or #NADBL on failure.
729 */
730
gretl_restricted_stddev(int t1,int t2,const double * x,const double * y,GretlOp yop,double yval)731 double gretl_restricted_stddev (int t1, int t2, const double *x,
732 const double *y, GretlOp yop,
733 double yval)
734 {
735 double xx = gretl_restricted_variance(t1, t2, x, y, yop, yval);
736
737 return (na(xx))? xx : sqrt(xx);
738 }
739
740 /**
741 * gretl_long_run_variance:
742 * @t1: starting observation.
743 * @t2: ending observation.
744 * @x: data series.
745 * @m: bandwidth (<= 0 for automatic).
746 * @mu: mean (or NADBL to use sample mean).
747 *
748 * Returns: the long-run variance of the series @x from obs
749 * @t1 to obs @t2, using Bartlett kernel weights, or #NADBL
750 * on failure (which includes encountering missing values).
751 */
752
gretl_long_run_variance(int t1,int t2,const double * x,int m,double mu)753 double gretl_long_run_variance (int t1, int t2, const double *x,
754 int m, double mu)
755 {
756 double zt, wi, xbar, s2 = 0.0;
757 int use_mu = !na(mu);
758 int i, t, n, order;
759
760 if (series_adjust_sample(x, &t1, &t2) != 0) {
761 return NADBL;
762 }
763
764 n = t2 - t1 + 1;
765
766 if (n < 2) {
767 return NADBL;
768 }
769
770 if (m < 0) {
771 order = (int) exp(log(n) / 3.0);
772 } else {
773 order = m;
774 }
775
776 if (use_mu) {
777 xbar = mu;
778 }
779
780 if (use_mu && mu == 0.0) {
781 for (t=t1; t<=t2; t++) {
782 s2 += x[t] * x[t];
783 }
784 } else {
785 if (!use_mu) {
786 xbar = 0.0;
787 for (t=t1; t<=t2; t++) {
788 xbar += x[t];
789 }
790 xbar /= n;
791 }
792 for (t=t1; t<=t2; t++) {
793 zt = x[t] - xbar;
794 s2 += zt * zt;
795 }
796 }
797
798 for (i=1; i<=order; i++) {
799 wi = 1.0 - i /((double) order + 1);
800 for (t=t1+i; t<=t2; t++) {
801 zt = x[t] - xbar;
802 s2 += 2 * wi * zt * (x[t-i] - xbar);
803 }
804 }
805
806 return s2 / n;
807 }
808
809 /**
810 * gretl_covar:
811 * @t1: starting observation.
812 * @t2: ending observation.
813 * @x: data series.
814 * @y: data series.
815 * @missing: location to receive information on the number
816 * of missing observations that were skipped, or %NULL.
817 *
818 * Returns: the covariance of the series @x and @y from obs
819 * @t1 to obs @t2, skipping any missing values, or #NADBL
820 * on failure.
821 */
822
gretl_covar(int t1,int t2,const double * x,const double * y,int * missing)823 double gretl_covar (int t1, int t2, const double *x, const double *y,
824 int *missing)
825 {
826 double sx, sy, sxy, xbar, ybar;
827 int t, nn, n = t2 - t1 + 1;
828
829 if (n == 0) {
830 /* void sample */
831 return NADBL;
832 }
833
834 nn = 0;
835 sx = sy = 0.0;
836
837 for (t=t1; t<=t2; t++) {
838 if (!na(x[t]) && !na(y[t])) {
839 sx += x[t];
840 sy += y[t];
841 nn++;
842 }
843 }
844
845 if (nn < 2) {
846 return NADBL;
847 }
848
849 xbar = sx / nn;
850 ybar = sy / nn;
851 sxy = 0.0;
852
853 for (t=t1; t<=t2; t++) {
854 if (!na(x[t]) && !na(y[t])) {
855 sx = x[t] - xbar;
856 sy = y[t] - ybar;
857 sxy = sxy + (sx * sy);
858 }
859 }
860
861 if (missing != NULL) {
862 /* record number of missing obs */
863 *missing = n - nn;
864 }
865
866 return sxy / (nn - 1);
867 }
868
869 /**
870 * gretl_corr:
871 * @t1: starting observation.
872 * @t2: ending observation.
873 * @x: data series.
874 * @y: data series.
875 * @missing: location to receive information on the number
876 * of missing observations that were skipped, or %NULL.
877 *
878 * Returns: the correlation coefficient for the series @x and @y
879 * from obs @t1 to obs @t2, skipping any missing values, or #NADBL
880 * on failure.
881 */
882
gretl_corr(int t1,int t2,const double * x,const double * y,int * missing)883 double gretl_corr (int t1, int t2, const double *x, const double *y,
884 int *missing)
885 {
886 int t, nn, n = t2 - t1 + 1;
887 double sx, sy, sxx, syy, sxy, den, xbar, ybar;
888 double cval = 0.0;
889
890 if (n == 0) {
891 /* void sample */
892 return NADBL;
893 }
894
895 if (gretl_isconst(t1, t2, x) || gretl_isconst(t1, t2, y)) {
896 /* correlation between any x and a constant is
897 undefined */
898 return NADBL;
899 }
900
901 nn = 0;
902 sx = sy = 0.0;
903
904 for (t=t1; t<=t2; t++) {
905 if (!na(x[t]) && !na(y[t])) {
906 sx += x[t];
907 sy += y[t];
908 nn++;
909 }
910 }
911
912 if (nn < 2) {
913 /* zero or 1 observations: no go! */
914 return NADBL;
915 }
916
917 xbar = sx / nn;
918 ybar = sy / nn;
919 sxx = syy = sxy = 0.0;
920
921 for (t=t1; t<=t2; t++) {
922 if (!na(x[t]) && !na(y[t])) {
923 sx = x[t] - xbar;
924 sy = y[t] - ybar;
925 sxx += sx * sx;
926 syy += sy * sy;
927 sxy += sx * sy;
928 }
929 }
930
931 if (sxy != 0.0) {
932 den = sxx * syy;
933 if (den > 0.0) {
934 cval = sxy / sqrt(den);
935 } else {
936 cval = NADBL;
937 }
938 }
939
940 if (missing != NULL) {
941 /* record number of missing observations */
942 *missing = n - nn;
943 }
944
945 return cval;
946 }
947
948 /**
949 * gretl_corr_rsq:
950 * @t1: starting observation.
951 * @t2: ending observation.
952 * @x: data series.
953 * @y: data series.
954 *
955 * Returns: the square of the correlation coefficient for the series
956 * @x and @y from obs @t1 to obs @t2, skipping any missing values,
957 * or #NADBL on failure. Used as alternative value for R^2 in a
958 * regression without an intercept.
959 */
960
gretl_corr_rsq(int t1,int t2,const double * x,const double * y)961 double gretl_corr_rsq (int t1, int t2, const double *x, const double *y)
962 {
963 double r = gretl_corr(t1, t2, x, y, NULL);
964
965 return (na(r))? r : r * r;
966 }
967
968 /* we're supposing a variance smaller than this is just noise */
969 #define TINYVAR 1.0e-36
970
gretl_skewness(int t1,int t2,const double * x)971 double gretl_skewness (int t1, int t2, const double *x)
972 {
973 double s, sd, xx, xbar, ret;
974 int t, n = 0;
975
976 xbar = gretl_mean(t1, t2, x);
977 if (na(xbar)) {
978 return NADBL;
979 }
980
981 s = 0.0;
982
983 for (t=t1; t<=t2; t++) {
984 if (!na(x[t])) {
985 xx = x[t] - xbar;
986 s += xx * xx;
987 n++;
988 }
989 }
990
991 s /= n;
992
993 if (s <= TINYVAR) {
994 ret = NADBL;
995 } else {
996 sd = sqrt(s);
997 s = 0.0;
998
999 for (t=t1; t<=t2; t++) {
1000 if (!na(x[t])) {
1001 xx = (x[t] - xbar) / sd;
1002 s += xx * xx * xx;
1003 }
1004 }
1005
1006 ret = s/n;
1007 }
1008
1009 return ret;
1010 }
1011
1012 /* note: the following computes EXCESS kurtosis */
1013
gretl_kurtosis(int t1,int t2,const double * x)1014 double gretl_kurtosis (int t1, int t2, const double *x)
1015 {
1016 double s, sd, xx, xbar, ret;
1017 int t, n = 0;
1018
1019 xbar = gretl_mean(t1, t2, x);
1020 if (na(xbar)) {
1021 return NADBL;
1022 }
1023
1024 s = 0.0;
1025
1026 for (t=t1; t<=t2; t++) {
1027 if (!na(x[t])) {
1028 xx = x[t] - xbar;
1029 s += xx * xx;
1030 n++;
1031 }
1032 }
1033
1034 s /= n;
1035
1036 if (s <= TINYVAR) {
1037 ret = NADBL;
1038 } else {
1039 sd = sqrt(s);
1040 s = 0.0;
1041
1042 for (t=t1; t<=t2; t++) {
1043 if (!na(x[t])) {
1044 xx = (x[t] - xbar) / sd;
1045 s += xx * xx * xx * xx;
1046 }
1047 }
1048
1049 ret = s/n - 3.0;
1050 }
1051
1052 return ret;
1053 }
1054
1055 /**
1056 * gretl_moments:
1057 * @t1: starting observation.
1058 * @t2: ending observation.
1059 * @x: data series.
1060 * @wts: weights (may be NULL)
1061 * @xbar: pointer to receive mean.
1062 * @sd: pointer to receive standard deviation.
1063 * @skew: pointer to receive skewness.
1064 * @kurt: pointer to receive excess kurtosis.
1065 * @k: degrees of freedom loss (generally 1).
1066 *
1067 * Calculates sample moments for series @x from obs @t1 to obs
1068 * @t2.
1069 *
1070 * Returns: 0 on success, 1 on error.
1071 */
1072
gretl_moments(int t1,int t2,const double * x,const double * wts,double * xbar,double * std,double * skew,double * kurt,int k)1073 int gretl_moments (int t1, int t2, const double *x,
1074 const double *wts,
1075 double *xbar, double *std,
1076 double *skew, double *kurt, int k)
1077 {
1078 int t, n;
1079 double dev, var;
1080 double s, s2, s3, s4;
1081 int allstats = 1;
1082 int weighted = (wts != NULL);
1083 double wt, wn = 0;
1084
1085 if (skew == NULL && kurt == NULL) {
1086 allstats = 0;
1087 }
1088
1089 while (na(x[t1]) && t1 <= t2) {
1090 /* skip missing values at start of sample */
1091 t1++;
1092 }
1093
1094 if (gretl_isconst(t1, t2, x)) {
1095 /* no variation in x */
1096 *xbar = x[t1];
1097 *std = 0.0;
1098 if (allstats) {
1099 *skew = *kurt = NADBL;
1100 }
1101 return 1;
1102 }
1103
1104 /* get number and sum of valid observations */
1105 s = 0.0;
1106 n = 0;
1107 for (t=t1; t<=t2; t++) {
1108 if (!na(x[t])) {
1109 if (weighted) {
1110 wt = wts[t];
1111 if (!na(wt) && wt != 0.0) {
1112 s += wt * x[t];
1113 wn += wt;
1114 n++;
1115 }
1116 } else {
1117 s += x[t];
1118 n++;
1119 }
1120 }
1121 }
1122
1123 if (n == 0) {
1124 /* no valid data */
1125 *xbar = *std = NADBL;
1126 if (allstats) {
1127 *skew = *kurt = 0.0;
1128 }
1129 return 1;
1130 }
1131
1132 if (!weighted) {
1133 wn = n;
1134 }
1135
1136 /* calculate mean and initialize other stats */
1137 *xbar = s / wn;
1138 var = 0.0;
1139 if (allstats) {
1140 *skew = *kurt = 0.0;
1141 }
1142
1143 /* compute quantities needed for higher moments */
1144 s2 = s3 = s4 = 0.0;
1145 for (t=t1; t<=t2; t++) {
1146 if (!na(x[t])) {
1147 dev = x[t] - *xbar;
1148 if (weighted) {
1149 wt = wts[t];
1150 if (!na(wt) && wt != 0.0) {
1151 s2 += wt * dev * dev;
1152 if (allstats) {
1153 s3 += wt * pow(dev, 3);
1154 s4 += wt * pow(dev, 4);
1155 }
1156 }
1157 } else {
1158 s2 += dev * dev;
1159 if (allstats) {
1160 s3 += pow(dev, 3);
1161 s4 += pow(dev, 4);
1162 }
1163 }
1164 }
1165 }
1166
1167
1168 /* variance */
1169 if (weighted) {
1170 /* as per Stata's -summary- with aweights */
1171 var = n * s2 / (wn * (n-1));
1172 } else {
1173 var = s2 / (wn - k);
1174 }
1175
1176 if (var < 0.0) {
1177 /* in principle impossible, but this is a computer */
1178 *std = NADBL;
1179 if (allstats) {
1180 *skew = *kurt = NADBL;
1181 }
1182 return 1;
1183 }
1184
1185 if (var > TINYVAR) {
1186 *std = sqrt(var);
1187 } else {
1188 /* screen out minuscule variance */
1189 *std = 0.0;
1190 }
1191
1192 if (allstats) {
1193 if (var > TINYVAR) {
1194 *skew = (s3 / wn) / pow(s2 / wn, 1.5);
1195 *kurt = ((s4 / wn) / pow(s2 / wn, 2)) - 3.0; /* excess kurtosis */
1196 } else {
1197 /* if variance is effectively zero, these should be undef'd */
1198 *skew = *kurt = NADBL;
1199 }
1200 }
1201
1202 return 0;
1203 }
1204
1205 /**
1206 * gretl_sorted_series:
1207 * @v: ID number of input series.
1208 * @dset: dataset struct.
1209 * @opt: may include OPT_M to flag an error in case
1210 * missing values are found.
1211 * @n: on input, the minimum acceptable number of
1212 * non-missing observations; on output, the number
1213 * of valid observations.
1214 * @err: location to receive error code.
1215 *
1216 * Returns: an array containing the valid values of the
1217 * input series over the sample range given in @dset,
1218 * sorted from smallest to largest, or NULL on error.
1219 * An error is flagged if the number of valid observations
1220 * is less than that given in @n on input, or if OPT_M
1221 * is given and the input contains missing values.
1222 */
1223
gretl_sorted_series(int v,const DATASET * dset,gretlopt opt,int * n,int * err)1224 double *gretl_sorted_series (int v, const DATASET *dset,
1225 gretlopt opt, int *n,
1226 int *err)
1227 {
1228 double *y = NULL;
1229 int t, k = 0;
1230
1231 if (*n < 1) {
1232 *n = 1;
1233 }
1234
1235 for (t=dset->t1; t<=dset->t2; t++) {
1236 if (!na(dset->Z[v][t])) {
1237 k++;
1238 } else if (opt & OPT_M) {
1239 *err = E_MISSDATA;
1240 return NULL;
1241 }
1242 }
1243
1244 if (k < *n) {
1245 gretl_errmsg_set(_("Insufficient data"));
1246 *err = E_DATA;
1247 return NULL;
1248 }
1249
1250 y = malloc(k * sizeof *y);
1251 if (y == NULL) {
1252 *err = E_ALLOC;
1253 return NULL;
1254 }
1255
1256 *n = k;
1257
1258 k = 0;
1259 for (t=dset->t1; t<=dset->t2; t++) {
1260 if (!na(dset->Z[v][t])) {
1261 y[k++] = dset->Z[v][t];
1262 }
1263 }
1264
1265 qsort(y, k, sizeof *y, gretl_compare_doubles);
1266
1267 return y;
1268 }
1269
1270 /**
1271 * free_freq:
1272 * @freq: pointer to gretl frequency distribution struct
1273 *
1274 * Frees all resources associated with @freq, and the
1275 * pointer itself.
1276 */
1277
free_freq(FreqDist * freq)1278 void free_freq (FreqDist *freq)
1279 {
1280 if (freq == NULL) {
1281 return;
1282 }
1283
1284 free(freq->midpt);
1285 free(freq->endpt);
1286 free(freq->f);
1287
1288 if (freq->S != NULL) {
1289 strings_array_free(freq->S, freq->numbins);
1290 }
1291
1292 free(freq);
1293 }
1294
freq_new(const char * vname)1295 static FreqDist *freq_new (const char *vname)
1296 {
1297 FreqDist *freq;
1298
1299 freq = malloc(sizeof *freq);
1300 if (freq == NULL) return NULL;
1301
1302 strcpy(freq->varname, vname);
1303
1304 freq->midpt = NULL;
1305 freq->endpt = NULL;
1306 freq->S = NULL;
1307 freq->f = NULL;
1308
1309 freq->dist = 0;
1310 freq->discrete = 0;
1311 freq->strvals = 0;
1312
1313 freq->xbar = NADBL;
1314 freq->sdx = NADBL;
1315 freq->test = NADBL;
1316
1317 return freq;
1318 }
1319
1320 /*
1321 * dh_root_b1_to_z1:
1322 * @rb1: square root b1, skewness.
1323 * @n: number of observations.
1324 *
1325 * Performs the transformation from skewness, root b1, to the
1326 * normal score, z1, as set out in Doornik and Hansen, "An Omnibus
1327 * Test for Normality", 1994. The transformation is originally
1328 * due to D'Agostino (Biometrika, 1970).
1329 *
1330 * Returns: the z1 value.
1331 */
1332
dh_root_b1_to_z1(double rb1,double n)1333 static double dh_root_b1_to_z1 (double rb1, double n)
1334 {
1335 double b, w2, d, y, z1;
1336
1337 b = 3.0 * (n*n + 27*n - 70) * (n+1) * (n+3) /
1338 ((n-2) * (n+5) * (n+7) * (n+9));
1339
1340 w2 = -1.0 + sqrt(2 * (b-1));
1341 d = 1.0 / sqrt(log(sqrt(w2)));
1342 y = rb1 * sqrt(((w2-1.0)/2.0) * ((n+1.0)*(n+3.0))/(6.0*(n-2)));
1343 z1 = d * log(y + sqrt(y*y + 1));
1344
1345 return z1;
1346 }
1347
1348 /*
1349 * dh_b2_to_z2:
1350 * @b1: skewness.
1351 * @b2: kurtosis.
1352 * @n: number of observations.
1353 *
1354 * Performs the transformation from kurtosis, b2, to the
1355 * normal score, z2, as set out in Doornik and Hansen, "An Omnibus
1356 * Test for Normality", 1994.
1357 *
1358 * Returns: the z2 value, or #NADBL on failure.
1359 */
1360
dh_b2_to_z2(double b1,double b2,double n)1361 static double dh_b2_to_z2 (double b1, double b2, double n)
1362 {
1363 double d, a, c, k, alpha, chi, z2;
1364 double n2 = n * n;
1365
1366 d = (n-3) * (n+1) * (n2 + 15*n - 4.0);
1367 a = ((n-2) * (n+5) * (n+7) * (n2 + 27*n - 70.0)) / (6.0 * d);
1368 c = ((n-7) * (n+5) * (n+7) * (n2 + 2*n - 5.0)) / (6.0 * d);
1369 k = ((n+5) * (n+7) * (n2*n + 37*n2 + 11*n - 313.0)) / (12.0 * d);
1370
1371 alpha = a + b1 * c;
1372 z2 = (1.0 / (9.0 * alpha)) - 1.0;
1373 chi = (b2 - 1.0 - b1) * 2.0 * k;
1374
1375 if (chi > 0.0) {
1376 z2 += pow(chi/(2*alpha), 1.0 / 3.0);
1377 }
1378
1379 z2 *= sqrt(9.0*alpha);
1380
1381 return z2;
1382 }
1383
1384 /**
1385 * doornik_chisq:
1386 * @skew: skewness.
1387 * @xkurt: excess kurtosis.
1388 * @n: number of observations.
1389 *
1390 * Calculates the Chi-square test for normality as set out by
1391 * Doornik and Hansen, "An Omnibus Test for Normality", 1994.
1392 * This is a modified version of the test proposed by Bowman and
1393 * Shenton (Biometrika, 1975).
1394 *
1395 * Returns: the Chi-square value, which has 2 degrees of freedom.
1396 */
1397
doornik_chisq(double skew,double xkurt,int n)1398 double doornik_chisq (double skew, double xkurt, int n)
1399 {
1400 double z1, z2, x2 = NADBL;
1401
1402 z1 = dh_root_b1_to_z1(skew, (double) n);
1403 z2 = dh_b2_to_z2(skew * skew, xkurt + 3.0, (double) n);
1404
1405 if (!na(z2)) {
1406 x2 = z1*z1 + z2*z2;
1407 }
1408
1409 return x2;
1410 }
1411
1412 static int
series_get_moments(int t1,int t2,const double * x,double * skew,double * xkurt,int * pn)1413 series_get_moments (int t1, int t2, const double *x,
1414 double *skew, double *xkurt,
1415 int *pn)
1416 {
1417 double dev, s[4] = {0.0};
1418 int t, n = 0;
1419 int err = 0;
1420
1421 for (t=t1; t<=t2; t++) {
1422 if (!na(x[t])) {
1423 s[0] += x[t];
1424 n++;
1425 }
1426 }
1427
1428 *pn = n;
1429 if (n < 3) {
1430 return E_DATA;
1431 }
1432
1433 s[0] /= n;
1434
1435 for (t=t1; t<=t2; t++) {
1436 if (!na(x[t])) {
1437 dev = x[t] - s[0];
1438 s[1] += dev * dev;
1439 s[2] += pow(dev, 3);
1440 s[3] += pow(dev, 4);
1441 }
1442 }
1443
1444 s[1] /= n;
1445
1446 if (s[1] > 0.0) {
1447 *skew = (s[2] / n) / pow(s[1], 1.5);
1448 *xkurt = ((s[3] / n) / pow(s[1], 2)) - 3.0;
1449 } else {
1450 /* if variance is effectively zero, these should be undef'd */
1451 *skew = *xkurt = NADBL;
1452 err = 1;
1453 }
1454
1455 return err;
1456 }
1457
1458 static int
get_moments(const gretl_matrix * M,int row,double * skew,double * kurt)1459 get_moments (const gretl_matrix *M, int row, double *skew, double *kurt)
1460 {
1461 int j, n = gretl_matrix_cols(M);
1462 double dev, s[4] = {0.0};
1463 int err = 0;
1464
1465 for (j=0; j<n; j++) {
1466 s[0] += gretl_matrix_get(M, row, j);
1467 }
1468
1469 s[0] /= n;
1470
1471 for (j=0; j<n; j++) {
1472 dev = gretl_matrix_get(M, row, j) - s[0];
1473 s[1] += dev * dev;
1474 s[2] += pow(dev, 3);
1475 s[3] += pow(dev, 4);
1476 }
1477
1478 s[1] /= n;
1479
1480 if (s[1] > 0.0) {
1481 *skew = (s[2] / n) / pow(s[1], 1.5);
1482 *kurt = ((s[3] / n) / pow(s[1], 2));
1483 } else {
1484 /* if variance is effectively zero, these should be undef'd */
1485 *skew = *kurt = NADBL;
1486 err = 1;
1487 }
1488
1489 return err;
1490 }
1491
1492 int
multivariate_normality_test(const gretl_matrix * E,const gretl_matrix * Sigma,gretlopt opt,PRN * prn)1493 multivariate_normality_test (const gretl_matrix *E,
1494 const gretl_matrix *Sigma,
1495 gretlopt opt, PRN *prn)
1496 {
1497 gretl_matrix *S = NULL;
1498 gretl_matrix *V = NULL;
1499 gretl_matrix *C = NULL;
1500 gretl_matrix *X = NULL;
1501 gretl_matrix *R = NULL;
1502 gretl_matrix *evals = NULL;
1503 gretl_matrix *tmp = NULL;
1504 /* convenience pointers: do not free! */
1505 gretl_matrix *H;
1506 gretl_vector *Z1;
1507 gretl_vector *Z2;
1508 double x, skew, kurt;
1509 double X2 = NADBL;
1510 int n, p;
1511 int i, j;
1512 int err = 0;
1513
1514 if (E == NULL || Sigma == NULL) {
1515 err = E_DATA;
1516 goto bailout;
1517 }
1518
1519 p = gretl_matrix_cols(E);
1520 n = gretl_matrix_rows(E);
1521
1522 clear_gretl_matrix_err();
1523
1524 S = gretl_matrix_copy(Sigma);
1525 V = gretl_vector_alloc(p);
1526 C = gretl_matrix_alloc(p, p);
1527 X = gretl_matrix_copy_transpose(E);
1528 R = gretl_matrix_alloc(p, n);
1529 tmp = gretl_matrix_alloc(p, p);
1530
1531 err = get_gretl_matrix_err();
1532 if (err) {
1533 goto bailout;
1534 }
1535
1536 for (i=0; i<p; i++) {
1537 x = gretl_matrix_get(S, i, i);
1538 gretl_vector_set(V, i, 1.0 / sqrt(x));
1539 }
1540
1541 err = gretl_matrix_diagonal_sandwich(V, S, C);
1542 if (err) {
1543 goto bailout;
1544 }
1545
1546 if (!(opt & OPT_Q)) {
1547 pputc(prn, '\n');
1548 gretl_matrix_print_to_prn(C, _("Residual correlation matrix, C"), prn);
1549 }
1550
1551 evals = gretl_symmetric_matrix_eigenvals(C, 1, &err);
1552 if (err) {
1553 goto bailout;
1554 }
1555
1556 if (!(opt & OPT_Q)) {
1557 pprintf(prn, "%s\n\n", _("Eigenvalues of C"));
1558 for (i=0; i<p; i++) {
1559 pprintf(prn, " %10g\n", evals->val[i]);
1560 }
1561 pputc(prn, '\n');
1562 }
1563
1564 /* C should now contain eigenvectors of the original C:
1565 relabel as 'H' for perspicuity */
1566 H = C;
1567 #if 0
1568 gretl_matrix_print_to_prn(H, "Eigenvectors, H", prn);
1569 #endif
1570 gretl_matrix_copy_values(tmp, H);
1571
1572 /* make "tmp" into $H \Lambda^{-1/2}$ */
1573 for (i=0; i<p; i++) {
1574 for (j=0; j<p; j++) {
1575 x = gretl_matrix_get(tmp, i, j);
1576 x *= 1.0 / sqrt(evals->val[j]);
1577 gretl_matrix_set(tmp, i, j, x);
1578 }
1579 }
1580
1581 /* make S into $H \Lambda^{-1/2} H'$ */
1582 gretl_matrix_multiply_mod(tmp, GRETL_MOD_NONE,
1583 H, GRETL_MOD_TRANSPOSE,
1584 S, GRETL_MOD_NONE);
1585
1586 #if 1
1587 gretl_matrix_demean_by_row(X);
1588 #endif
1589
1590 /* compute VX', in X (don't need to subtract means, because these
1591 are OLS residuals)
1592 */
1593 for (i=0; i<p; i++) {
1594 for (j=0; j<n; j++) {
1595 x = gretl_matrix_get(X, i, j);
1596 x *= gretl_vector_get(V, i);
1597 gretl_matrix_set(X, i, j, x);
1598 }
1599 }
1600
1601 /* finally, compute $R' = H \Lambda^{-1/2} H' V X'$
1602 Doornik and Hansen, 1994, section 3 */
1603 gretl_matrix_multiply(S, X, R);
1604
1605 /* Z_1 and Z_2 are p-vectors: use existing storage */
1606 Z1 = V;
1607 Z2 = gretl_matrix_reuse(tmp, p, 1);
1608
1609 for (i=0; i<p && !err; i++) {
1610 get_moments(R, i, &skew, &kurt);
1611 if (na(skew) || na(kurt)) {
1612 err = 1;
1613 } else {
1614 double z1i = dh_root_b1_to_z1(skew, n);
1615 double z2i = dh_b2_to_z2(skew * skew, kurt, n);
1616
1617 if (na(z2i)) {
1618 X2 = NADBL;
1619 err = E_NAN;
1620 } else {
1621 gretl_vector_set(Z1, i, z1i);
1622 gretl_vector_set(Z2, i, z2i);
1623 }
1624 }
1625 }
1626
1627 if (!err) {
1628 X2 = gretl_vector_dot_product(Z1, Z1, &err);
1629 X2 += gretl_vector_dot_product(Z2, Z2, &err);
1630 }
1631
1632 if (na(X2)) {
1633 pputs(prn, "Calculation of test statistic failed\n");
1634 } else {
1635 /* print and record result */
1636 double pv = chisq_cdf_comp(2 * p, X2);
1637
1638 if (!(opt & OPT_I)) {
1639 pputs(prn, _("Doornik-Hansen test"));
1640 pprintf(prn, "\n %s(%d) = %g [%.4f]\n\n", _("Chi-square"), 2 * p,
1641 X2, pv);
1642 }
1643 record_test_result(X2, pv);
1644 }
1645
1646 bailout:
1647
1648 gretl_matrix_free(S);
1649 gretl_matrix_free(V);
1650 gretl_matrix_free(C);
1651 gretl_matrix_free(X);
1652 gretl_matrix_free(R);
1653 gretl_matrix_free(evals);
1654 gretl_matrix_free(tmp);
1655
1656 return err;
1657 }
1658
freq_add_arrays(FreqDist * freq,int n)1659 static int freq_add_arrays (FreqDist *freq, int n)
1660 {
1661 int err = 0;
1662
1663 if (!freq->discrete) {
1664 freq->endpt = malloc((n + 1) * sizeof *freq->endpt);
1665 if (freq->endpt == NULL) {
1666 err = E_ALLOC;
1667 }
1668 }
1669
1670 if (!err) {
1671 if (freq->strvals) {
1672 freq->S = strings_array_new(n);
1673 if (freq->S == NULL) {
1674 err = E_ALLOC;
1675 }
1676 } else {
1677 freq->midpt = malloc(n * sizeof *freq->midpt);
1678 if (freq->midpt == NULL) {
1679 err = E_ALLOC;
1680 }
1681 }
1682 }
1683
1684 if (!err) {
1685 freq->f = malloc(n * sizeof *freq->f);
1686 if (freq->f == NULL) {
1687 err = E_ALLOC;
1688 }
1689 }
1690
1691 if (!err) {
1692 freq->numbins = n;
1693 }
1694
1695 return err;
1696 }
1697
freq_setup(int v,const DATASET * dset,int * pn,double * pxmax,double * pxmin,int * nbins,double * binwidth)1698 int freq_setup (int v, const DATASET *dset, int *pn,
1699 double *pxmax, double *pxmin, int *nbins,
1700 double *binwidth)
1701 {
1702 const double *x = dset->Z[v];
1703 double xrange, xmin = 0, xmax = 0;
1704 int t, k = 0, n = 0;
1705
1706 for (t=dset->t1; t<=dset->t2; t++) {
1707 if (!na(x[t])) {
1708 if (n == 0) {
1709 xmax = xmin = x[t];
1710 } else {
1711 if (x[t] > xmax) xmax = x[t];
1712 if (x[t] < xmin) xmin = x[t];
1713 }
1714 n++;
1715 }
1716 }
1717
1718 if (n < 8) {
1719 gretl_errmsg_sprintf(_("Insufficient data to build frequency "
1720 "distribution for variable %s"),
1721 dset->varname[v]);
1722 return E_TOOFEW;
1723 }
1724
1725 xrange = xmax - xmin;
1726 if (xrange == 0) {
1727 gretl_errmsg_sprintf(_("%s is a constant"), dset->varname[v]);
1728 return E_DATA;
1729 }
1730
1731 if (*nbins > 0) {
1732 k = *nbins;
1733 } else if (n < 16) {
1734 k = 5;
1735 } else if (n < 50) {
1736 k = 7;
1737 } else if (n > 850) {
1738 k = 29;
1739 } else {
1740 k = (int) sqrt((double) n);
1741 if (k > 99) {
1742 k = 99;
1743 }
1744 }
1745
1746 if (k % 2 == 0) {
1747 k++;
1748 }
1749
1750 *pn = n;
1751 *pxmax = xmax;
1752 *pxmin = xmin;
1753 *nbins = k;
1754 *binwidth = xrange / (k - 1);
1755
1756 return 0;
1757 }
1758
1759 /* calculate test stat for distribution, if the sample
1760 is big enough */
1761
1762 static void
freq_dist_stat(FreqDist * freq,const double * x,gretlopt opt,int k)1763 freq_dist_stat (FreqDist *freq, const double *x, gretlopt opt, int k)
1764 {
1765 double skew, kurt;
1766
1767 freq->test = NADBL;
1768 freq->dist = 0;
1769
1770 gretl_moments(freq->t1, freq->t2, x, NULL,
1771 &freq->xbar, &freq->sdx,
1772 &skew, &kurt, k);
1773
1774 if (freq->n > 7) {
1775 if (opt & OPT_O) {
1776 freq->test = lockes_test(x, freq->t1, freq->t2);
1777 freq->dist = D_GAMMA;
1778 } else if (opt & OPT_Z) {
1779 freq->test = doornik_chisq(skew, kurt, freq->n);
1780 freq->dist = D_NORMAL;
1781 }
1782 }
1783 }
1784
1785 struct strval_sorter {
1786 char *s;
1787 int n;
1788 };
1789
compare_strvals(const void * a,const void * b)1790 int compare_strvals (const void *a, const void *b)
1791 {
1792 const struct strval_sorter *sa = a;
1793 const struct strval_sorter *sb = b;
1794
1795 return strcmp(sa->s, sb->s);
1796 }
1797
get_string_freq(int v,const DATASET * dset,int * err)1798 FreqDist *get_string_freq (int v, const DATASET *dset,
1799 int *err)
1800 {
1801 FreqDist *freq;
1802 struct strval_sorter *ss;
1803 const double *x = dset->Z[v];
1804 series_table *st;
1805 char **S;
1806 int i, t, n, ns;
1807
1808 freq = freq_new(dset->varname[v]);
1809 if (freq == NULL) {
1810 *err = E_ALLOC;
1811 return NULL;
1812 }
1813
1814 st = series_get_string_table(dset, v);
1815 S = series_table_get_strings(st, &ns);
1816
1817 ss = malloc(ns * sizeof *ss);
1818 if (ss == NULL) {
1819 *err = E_ALLOC;
1820 free(freq);
1821 return NULL;
1822 }
1823
1824 for (i=0; i<ns; i++) {
1825 ss[i].s = S[i];
1826 ss[i].n = 0;
1827 }
1828
1829 n = 0;
1830 for (t=dset->t1; t<=dset->t2; t++) {
1831 if (!na(x[t])) {
1832 i = x[t] - 1;
1833 ss[i].n += 1;
1834 n++;
1835 }
1836 }
1837
1838 qsort(ss, ns, sizeof *ss, compare_strvals);
1839
1840 freq->t1 = dset->t1;
1841 freq->t2 = dset->t2;
1842 freq->n = n;
1843 freq->discrete = 1;
1844 freq->strvals = 1;
1845
1846 if (freq_add_arrays(freq, ns)) {
1847 *err = E_ALLOC;
1848 } else {
1849 for (i=0; i<ns; i++) {
1850 freq->S[i] = gretl_strdup(ss[i].s);
1851 freq->f[i] = ss[i].n;
1852 }
1853 }
1854
1855 free(ss);
1856
1857 if (*err && freq != NULL) {
1858 free_freq(freq);
1859 freq = NULL;
1860 }
1861
1862 return freq;
1863 }
1864
get_discrete_freq(int v,const DATASET * dset,gretlopt opt,int * err)1865 FreqDist *get_discrete_freq (int v, const DATASET *dset,
1866 gretlopt opt, int *err)
1867 {
1868 FreqDist *freq;
1869 const double *x = dset->Z[v];
1870 int *ifreq = NULL;
1871 double *ivals = NULL;
1872 double *sorted = NULL;
1873 double last;
1874 int i, t, nv;
1875
1876 freq = freq_new(dset->varname[v]);
1877 if (freq == NULL) {
1878 *err = E_ALLOC;
1879 return NULL;
1880 }
1881
1882 freq->t1 = dset->t1;
1883 freq->t2 = dset->t2;
1884
1885 freq->n = 0;
1886 for (t=freq->t1; t<=freq->t2; t++) {
1887 if (!na(x[t])) {
1888 freq->n += 1;
1889 }
1890 }
1891
1892 if (freq->n < 3) {
1893 gretl_errmsg_sprintf(_("Insufficient data to build frequency "
1894 "distribution for variable %s"),
1895 dset->varname[v]);
1896 *err = E_TOOFEW;
1897 goto bailout;
1898 }
1899
1900 freq->discrete = 1;
1901 freq->test = NADBL;
1902 freq->dist = 0;
1903
1904 sorted = malloc(freq->n * sizeof *sorted);
1905 if (sorted == NULL) {
1906 *err = E_ALLOC;
1907 goto bailout;
1908 }
1909
1910 i = 0;
1911 for (t=freq->t1; t<=freq->t2; t++) {
1912 if (!na(x[t])) {
1913 sorted[i++] = x[t];
1914 }
1915 }
1916
1917 qsort(sorted, freq->n, sizeof *sorted, gretl_compare_doubles);
1918 nv = count_distinct_values(sorted, freq->n);
1919
1920 if (nv >= 10 && !(opt & OPT_X)) {
1921 freq_dist_stat(freq, x, opt, 1);
1922 } else if (opt & (OPT_Z | OPT_O)) {
1923 freq->xbar = gretl_mean(freq->t1, freq->t2, x);
1924 freq->sdx = gretl_stddev(freq->t1, freq->t2, x);
1925 }
1926
1927 ifreq = malloc(nv * sizeof *ifreq);
1928 ivals = malloc(nv * sizeof *ivals);
1929 if (ifreq == NULL || ivals == NULL) {
1930 *err = E_ALLOC;
1931 goto bailout;
1932 }
1933
1934 ivals[0] = last = sorted[0];
1935 ifreq[0] = i = 1;
1936
1937 for (t=1; t<freq->n; t++) {
1938 if (sorted[t] != last) {
1939 last = sorted[t];
1940 ifreq[i] = 1;
1941 ivals[i++] = last;
1942 } else {
1943 ifreq[i-1] += 1;
1944 }
1945 }
1946
1947 if (freq_add_arrays(freq, nv)) {
1948 *err = E_ALLOC;
1949 } else {
1950 int allints = 1;
1951
1952 for (i=0; i<nv; i++) {
1953 if (allints && ivals[i] != floor(ivals[i])) {
1954 allints = 0;
1955 }
1956 freq->midpt[i] = ivals[i];
1957 freq->f[i] = ifreq[i];
1958 }
1959
1960 if (allints) {
1961 freq->discrete = 2;
1962 }
1963 }
1964
1965 bailout:
1966
1967 free(sorted);
1968 free(ivals);
1969 free(ifreq);
1970
1971 if (*err && freq != NULL) {
1972 free_freq(freq);
1973 freq = NULL;
1974 }
1975
1976 return freq;
1977 }
1978
1979 /**
1980 * get_freq:
1981 * @varno: ID number of variable to process.
1982 * @dset: dataset struct.
1983 * @fmin: lower limit of left-most bin (or #NADBL for automatic).
1984 * @fwid: bin width (or #NADBL for automatic).
1985 * @nbins: number of bins to use (or 0 for automatic).
1986 * @params: degrees of freedom loss (generally = 1 unless we're dealing
1987 * with the residual from a regression).
1988 * @opt: if includes %OPT_Z, set up for comparison with normal dist;
1989 * if includes %OPT_O, compare with gamma distribution;
1990 * if includes %OPT_S we're not printing results; if includes %OPT_D,
1991 * treat the variable as discrete; %OPT_X indicates that this function
1992 * is called as part of a cross-tabulation; if includes %OPT_G,
1993 * add the arrays that are needed for a plot even when we're not
1994 * printing (%OPT_S).
1995 * @err: location to receive error code.
1996 *
1997 * Calculates the frequency distribution for the specified variable.
1998 *
1999 * Returns: pointer to struct containing the distribution.
2000 */
2001
get_freq(int varno,const DATASET * dset,double fmin,double fwid,int nbins,int params,gretlopt opt,int * err)2002 FreqDist *get_freq (int varno, const DATASET *dset,
2003 double fmin, double fwid,
2004 int nbins, int params,
2005 gretlopt opt, int *err)
2006 {
2007 FreqDist *freq;
2008 const double *x;
2009 double xx, xmin, xmax;
2010 double binwidth = fwid;
2011 int t, k, n;
2012
2013 if (is_string_valued(dset, varno)) {
2014 return get_string_freq(varno, dset, err);
2015 }
2016
2017 if (series_is_discrete(dset, varno) || (opt & OPT_D)) {
2018 return get_discrete_freq(varno, dset, opt, err);
2019 }
2020
2021 if (gretl_isdiscrete(dset->t1, dset->t2, dset->Z[varno]) > 1) {
2022 return get_discrete_freq(varno, dset, opt, err);
2023 }
2024
2025 freq = freq_new(dset->varname[varno]);
2026 if (freq == NULL) {
2027 *err = E_ALLOC;
2028 return NULL;
2029 }
2030
2031 *err = freq_setup(varno, dset, &n, &xmax, &xmin, &nbins, &binwidth);
2032
2033 if (*err) {
2034 goto bailout;
2035 }
2036
2037 if (!na(fmin) && !na(fwid)) {
2038 /* endogenous implied number of bins */
2039 nbins = (int) ceil((xmax - fmin) / fwid);
2040 if (nbins <= 0 || nbins > 5000) {
2041 *err = E_INVARG;
2042 goto bailout;
2043 } else {
2044 binwidth = fwid;
2045 }
2046 }
2047
2048 freq->t1 = dset->t1;
2049 freq->t2 = dset->t2;
2050 freq->n = n;
2051
2052 x = dset->Z[varno];
2053 freq_dist_stat(freq, x, opt, params);
2054
2055 if (freq_add_arrays(freq, nbins)) {
2056 *err = E_ALLOC;
2057 goto bailout;
2058 }
2059
2060 if (na(fmin) || na(fwid)) {
2061 freq->endpt[0] = xmin - .5 * binwidth;
2062
2063 if (xmin > 0.0 && freq->endpt[0] < 0.0) {
2064 double rshift;
2065
2066 freq->endpt[0] = 0.0;
2067 rshift = 1.0 - xmin / binwidth;
2068 freq->endpt[freq->numbins] = xmax + rshift * binwidth;
2069 } else {
2070 freq->endpt[freq->numbins] = xmax + .5 * binwidth;
2071 }
2072 } else {
2073 freq->endpt[0] = fmin;
2074 freq->endpt[freq->numbins] = fmin + nbins * fwid;
2075 }
2076
2077 for (k=0; k<freq->numbins; k++) {
2078 freq->f[k] = 0;
2079 if (k > 0) {
2080 freq->endpt[k] = freq->endpt[k-1] + binwidth;
2081 }
2082 freq->midpt[k] = freq->endpt[k] + .5 * binwidth;
2083 }
2084
2085 for (t=dset->t1; t<=dset->t2; t++) {
2086 xx = x[t];
2087 if (na(xx)) {
2088 continue;
2089 }
2090 if (xx < freq->endpt[1]) {
2091 freq->f[0] += 1;
2092 } else if (xx >= freq->endpt[freq->numbins]) {
2093 freq->f[freq->numbins - 1] += 1;
2094 continue;
2095 } else {
2096 for (k=1; k<freq->numbins; k++) {
2097 if (freq->endpt[k] <= xx && xx < freq->endpt[k+1]) {
2098 freq->f[k] += 1;
2099 break;
2100 }
2101 }
2102 }
2103 }
2104
2105 bailout:
2106
2107 if (*err && freq != NULL) {
2108 free_freq(freq);
2109 freq = NULL;
2110 }
2111
2112 return freq;
2113 }
2114
record_freq_test(const FreqDist * freq)2115 static void record_freq_test (const FreqDist *freq)
2116 {
2117 double pval = NADBL;
2118
2119 if (freq->dist == D_NORMAL) {
2120 pval = chisq_cdf_comp(2, freq->test);
2121 } else if (freq->dist == D_GAMMA) {
2122 pval = normal_pvalue_2(freq->test);
2123 }
2124
2125 if (!na(pval)) {
2126 record_test_result(freq->test, pval);
2127 }
2128 }
2129
check_freq_opts(gretlopt opt,int * n_bins,double * fmin,double * fwid)2130 static int check_freq_opts (gretlopt opt, int *n_bins,
2131 double *fmin, double *fwid)
2132 {
2133 double x;
2134 int err = 0;
2135
2136 if (opt & OPT_N) {
2137 /* number of bins given */
2138 if (opt & (OPT_M | OPT_W)) {
2139 /* can't give min, width spec */
2140 return E_BADOPT;
2141 } else {
2142 int n = get_optval_int(FREQ, OPT_N, &err);
2143
2144 if (!err) {
2145 if (n < 2 || n > 10000) {
2146 err = E_INVARG;
2147 } else {
2148 *n_bins = n;
2149 }
2150 }
2151 if (err) {
2152 return err;
2153 }
2154 }
2155 }
2156
2157 if (opt & OPT_M) {
2158 /* minimum specified */
2159 if (!(opt & OPT_W)) {
2160 /* but no width given */
2161 return E_ARGS;
2162 } else {
2163 x = get_optval_double(FREQ, OPT_M, &err);
2164 if (err) {
2165 return err;
2166 } else if (na(x)) {
2167 return E_ARGS;
2168 } else {
2169 *fmin = x;
2170 }
2171 }
2172 }
2173
2174 if (opt & OPT_W) {
2175 /* width specified */
2176 if (!(opt & OPT_M)) {
2177 /* but no min given */
2178 return E_ARGS;
2179 } else {
2180 x = get_optval_double(FREQ, OPT_W, &err);
2181 if (err) {
2182 return err;
2183 } else if (na(x)) {
2184 return E_ARGS;
2185 } else if (x <= 0) {
2186 return E_INVARG;
2187 } else {
2188 *fwid = x;
2189 }
2190 }
2191 }
2192
2193 return 0;
2194 }
2195
record_freq_matrix(FreqDist * fd)2196 static void record_freq_matrix (FreqDist *fd)
2197 {
2198 gretl_matrix *m = NULL;
2199 char **Sr = NULL;
2200 int i, n = fd->numbins;
2201
2202 if (fd->S != NULL) {
2203 m = gretl_matrix_alloc(n, 1);
2204 Sr = strings_array_dup(fd->S, n);
2205 } else {
2206 m = gretl_matrix_alloc(n, 2);
2207 }
2208
2209 if (m != NULL) {
2210 char **Sc = NULL;
2211
2212 if (fd->S == NULL) {
2213 Sc = strings_array_new(2);
2214 if (Sc != NULL) {
2215 Sc[0] = gretl_strdup("midpoint");
2216 Sc[1] = gretl_strdup("count");
2217 }
2218 }
2219 for (i=0; i<n; i++) {
2220 if (fd->S != NULL) {
2221 gretl_matrix_set(m, i, 0, fd->f[i]);
2222 } else {
2223 gretl_matrix_set(m, i, 0, fd->midpt[i]);
2224 gretl_matrix_set(m, i, 1, fd->f[i]);
2225 }
2226 }
2227 if (Sc != NULL) {
2228 gretl_matrix_set_colnames(m, Sc);
2229 }
2230 if (Sr != NULL) {
2231 gretl_matrix_set_rownames(m, Sr);
2232 }
2233 set_last_result_data(m, GRETL_TYPE_MATRIX);
2234 }
2235 }
2236
2237 /* Wrapper function: get the distribution, print it if
2238 wanted, graph it if wanted, then free stuff.
2239 */
2240
freqdist(int varno,const DATASET * dset,gretlopt opt,PRN * prn)2241 int freqdist (int varno, const DATASET *dset,
2242 gretlopt opt, PRN *prn)
2243 {
2244 FreqDist *freq = NULL;
2245 DistCode dist = D_NONE;
2246 PlotType ptype = 0;
2247 double fmin = NADBL;
2248 double fwid = NADBL;
2249 int n_bins = 0;
2250 int do_graph = 0;
2251 int err;
2252
2253 if (opt & OPT_O) {
2254 dist = D_GAMMA;
2255 ptype = PLOT_FREQ_GAMMA;
2256 } else if (opt & OPT_Z) {
2257 dist = D_NORMAL;
2258 ptype = PLOT_FREQ_NORMAL;
2259 } else {
2260 ptype = PLOT_FREQ_SIMPLE;
2261 }
2262
2263 if (!(opt & OPT_Q)) {
2264 if (opt & OPT_G) {
2265 /* it's a legacy thing */
2266 opt |= OPT_U;
2267 opt &= ~OPT_G;
2268 set_optval_string(FREQ, OPT_U, "display");
2269 }
2270 do_graph = gnuplot_graph_wanted(ptype, opt);
2271 }
2272
2273 err = check_freq_opts(opt, &n_bins, &fmin, &fwid);
2274
2275 if (!err) {
2276 gretlopt fopt = opt;
2277
2278 if (do_graph) {
2279 fopt |= OPT_G;
2280 }
2281 freq = get_freq(varno, dset, fmin, fwid, n_bins, 1, fopt, &err);
2282 }
2283
2284 if (!err) {
2285 if (!(opt & OPT_S)) {
2286 print_freq(freq, varno, dset, prn);
2287 } else if (dist) {
2288 record_freq_test(freq);
2289 }
2290 record_freq_matrix(freq);
2291
2292 if (do_graph && freq->numbins < 2) {
2293 do_graph = 0;
2294 }
2295 if (do_graph) {
2296 int gerr = plot_freq(freq, dist, opt);
2297
2298 if (gerr) {
2299 pputs(prn, _("gnuplot command failed\n"));
2300 do_graph = 0;
2301 }
2302 }
2303 free_freq(freq);
2304 }
2305
2306 return err;
2307 }
2308
2309 /* Wrapper function: get the frequency distribution and write it into
2310 a gretl_matrix: the first column holds the mid-points of the bins
2311 and the second holds the frequencies.
2312 */
2313
freqdist_matrix(const double * x,int t1,int t2,int * err)2314 gretl_matrix *freqdist_matrix (const double *x, int t1, int t2, int *err)
2315 {
2316 DATASET *dset = NULL;
2317 FreqDist *freq = NULL;
2318 gretl_matrix *m = NULL;
2319 int i, t, T = t2 - t1 + 1;
2320
2321 dset = create_auxiliary_dataset(1, T, 0);
2322 if (dset == NULL) {
2323 *err = E_ALLOC;
2324 }
2325
2326 if (!*err) {
2327 i = 0;
2328 for (t=t1; t<=t2; t++) {
2329 dset->Z[0][i++] = x[t];
2330 }
2331 freq = get_freq(0, dset, NADBL, NADBL, 0, 1, OPT_NONE, err);
2332 }
2333
2334 if (!*err) {
2335 m = gretl_matrix_alloc(freq->numbins, 2);
2336 if (m == NULL) {
2337 *err = E_ALLOC;
2338 }
2339 }
2340
2341 if (!*err) {
2342 for (i=0; i<freq->numbins; i++) {
2343 gretl_matrix_set(m, i, 0, freq->midpt[i]);
2344 gretl_matrix_set(m, i, 1, freq->f[i]);
2345 }
2346 }
2347
2348 destroy_dataset(dset);
2349 free_freq(freq);
2350
2351 return m;
2352 }
2353
xtab_to_matrix(const Xtab * tab)2354 gretl_matrix *xtab_to_matrix (const Xtab *tab)
2355 {
2356 gretl_matrix *m;
2357 double x;
2358 int i, j;
2359
2360 if (tab == NULL) {
2361 return NULL;
2362 }
2363
2364 m = gretl_matrix_alloc(tab->rows, tab->cols);
2365 if (m == NULL) {
2366 return NULL;
2367 }
2368
2369 for (j=0; j<tab->cols; j++) {
2370 for (i=0; i<tab->rows; i++) {
2371 x = (double) tab->f[i][j];
2372 gretl_matrix_set(m, i, j, x);
2373 }
2374 }
2375
2376 return m;
2377 }
2378
2379 static void *last_result;
2380 static GretlType last_result_type;
2381
get_last_result_data(GretlType * type,int * err)2382 void *get_last_result_data (GretlType *type, int *err)
2383 {
2384 void *ret = NULL;
2385
2386 if (last_result == NULL) {
2387 *type = GRETL_TYPE_NONE;
2388 *err = E_BADSTAT;
2389 } else {
2390 *type = last_result_type;
2391 if (*type == GRETL_TYPE_MATRIX) {
2392 ret = gretl_matrix_copy(last_result);
2393 } else {
2394 ret = gretl_bundle_copy(last_result, err);
2395 }
2396 }
2397
2398 return ret;
2399 }
2400
set_last_result_data(void * data,GretlType type)2401 void set_last_result_data (void *data, GretlType type)
2402 {
2403 if (last_result != NULL) {
2404 if (last_result_type == GRETL_TYPE_MATRIX) {
2405 gretl_matrix_free(last_result);
2406 } else if (last_result_type == GRETL_TYPE_BUNDLE) {
2407 gretl_bundle_destroy(last_result);
2408 }
2409 }
2410
2411 last_result = data;
2412 last_result_type = type;
2413 }
2414
last_result_cleanup(void)2415 void last_result_cleanup (void)
2416 {
2417 if (last_result != NULL) {
2418 if (last_result_type == GRETL_TYPE_MATRIX) {
2419 gretl_matrix_free(last_result);
2420 } else if (last_result_type == GRETL_TYPE_BUNDLE) {
2421 gretl_bundle_destroy(last_result);
2422 }
2423 }
2424
2425 last_result = NULL;
2426 last_result_type = 0;
2427 }
2428
2429 /**
2430 * free_xtab:
2431 * @tab: pointer to gretl crosstab struct.
2432 *
2433 * Frees all resources associated with @tab, and the
2434 * pointer itself.
2435 */
2436
free_xtab(Xtab * tab)2437 void free_xtab (Xtab *tab)
2438 {
2439 int i;
2440
2441 if (tab == NULL) {
2442 return;
2443 }
2444
2445 free(tab->rtotal);
2446 free(tab->ctotal);
2447 free(tab->rval);
2448 free(tab->cval);
2449
2450 if (tab->f != NULL) {
2451 for (i=0; i<tab->rows; i++) {
2452 free(tab->f[i]);
2453 }
2454 free(tab->f);
2455 }
2456
2457 if (tab->Sr != NULL) {
2458 strings_array_free(tab->Sr, tab->rows);
2459 }
2460 if (tab->Sc != NULL) {
2461 strings_array_free(tab->Sc, tab->cols);
2462 }
2463
2464 free(tab);
2465 }
2466
xtab_new(int n,int t1,int t2)2467 static Xtab *xtab_new (int n, int t1, int t2)
2468 {
2469 Xtab *tab = malloc(sizeof *tab);
2470
2471 if (tab == NULL) return NULL;
2472
2473 tab->rtotal = NULL;
2474 tab->ctotal = NULL;
2475 tab->rval = NULL;
2476 tab->cval = NULL;
2477 tab->f = NULL;
2478
2479 tab->n = n;
2480 tab->t1 = t1;
2481 tab->t2 = t2;
2482 tab->missing = 0;
2483
2484 *tab->rvarname = '\0';
2485 *tab->cvarname = '\0';
2486 tab->Sr = NULL;
2487 tab->Sc = NULL;
2488 tab->rstrs = 0;
2489 tab->cstrs = 0;
2490
2491 return tab;
2492 }
2493
2494 /* allocate the required arrays (apart from S, which must be
2495 handled separately) and initialize counters to zero
2496 */
2497
xtab_allocate_arrays(Xtab * tab)2498 static int xtab_allocate_arrays (Xtab *tab)
2499 {
2500 int i, j, err = 0;
2501
2502 if (!tab->rstrs && tab->rval == NULL) {
2503 tab->rval = malloc(tab->rows * sizeof *tab->rval);
2504 if (tab->rval == NULL) {
2505 err = E_ALLOC;
2506 }
2507 }
2508
2509 if (!err && !tab->cstrs && tab->cval == NULL) {
2510 tab->cval = malloc(tab->cols * sizeof *tab->cval);
2511 if (tab->cval == NULL) {
2512 err = E_ALLOC;
2513 }
2514 }
2515
2516 if (!err) {
2517 tab->rtotal = malloc(tab->rows * sizeof *tab->rtotal);
2518 tab->ctotal = malloc(tab->cols * sizeof *tab->ctotal);
2519 if (tab->rtotal == NULL || tab->ctotal == NULL) {
2520 err = E_ALLOC;
2521 } else {
2522 for (i=0; i<tab->rows; i++) {
2523 tab->rtotal[i] = 0;
2524 }
2525 for (j=0; j<tab->cols; j++) {
2526 tab->ctotal[j] = 0;
2527 }
2528 }
2529 }
2530
2531 if (!err) {
2532 tab->f = malloc(tab->rows * sizeof *tab->f);
2533 if (tab->f == NULL) {
2534 err = E_ALLOC;
2535 } else {
2536 for (i=0; i<tab->rows; i++) {
2537 tab->f[i] = NULL;
2538 }
2539 }
2540 }
2541
2542 for (i=0; i<tab->rows && !err; i++) {
2543 tab->f[i] = malloc(tab->cols * sizeof *tab->f[i]);
2544 if (tab->f[i] == NULL) {
2545 err = E_ALLOC;
2546 } else {
2547 for (j=0; j<tab->cols; j++) {
2548 tab->f[i][j] = 0;
2549 }
2550 }
2551 }
2552
2553 return err;
2554 }
2555
2556 /* also used in gretl_matrix.c */
2557
compare_xtab_rows(const void * a,const void * b)2558 int compare_xtab_rows (const void *a, const void *b)
2559 {
2560 const double **da = (const double **) a;
2561 const double **db = (const double **) b;
2562 int ret;
2563
2564 ret = da[0][0] - db[0][0];
2565
2566 if (ret == 0) {
2567 ret = da[0][1] - db[0][1];
2568 }
2569
2570 return ret;
2571 }
2572
xtab_get_data(Xtab * tab,int v,int j,const DATASET * dset,series_table ** pst)2573 static int xtab_get_data (Xtab *tab, int v, int j,
2574 const DATASET *dset,
2575 series_table **pst)
2576 {
2577 double **xtarg = (j == 1)? &tab->cval : &tab->rval;
2578 char ***Starg = (j == 1)? &tab->Sc : &tab->Sr;
2579 int *itarg = (j == 1)? &tab->cols : &tab->rows;
2580 int *ttarg = (j == 1)? &tab->cstrs : &tab->rstrs;
2581 double *x = dset->Z[v] + dset->t1;
2582 int n = sample_size(dset);
2583 gretl_matrix *u;
2584 int err = 0;
2585
2586 u = gretl_matrix_values(x, n, OPT_S, &err);
2587
2588 if (!err && is_string_valued(dset, v)) {
2589 series_table *st;
2590 int ns, nv = u->rows;
2591 char **S0, **S = NULL;
2592
2593 *pst = st = series_get_string_table(dset, v);
2594 S0 = series_table_get_strings(st, &ns);
2595
2596 if (ns > nv) {
2597 int i, k;
2598
2599 S = strings_array_new(nv);
2600 if (S != NULL) {
2601 for (i=0; i<nv; i++) {
2602 k = u->val[i] - 1;
2603 S[i] = gretl_strdup(S0[k]);
2604 }
2605 }
2606 } else {
2607 S = strings_array_dup(S0, ns);
2608 }
2609 if (S == NULL) {
2610 err = E_ALLOC;
2611 } else {
2612 *ttarg = 1;
2613 *itarg = nv;
2614 *Starg = S;
2615 strings_array_sort(Starg, &nv, OPT_NONE);
2616 }
2617 } else if (!err) {
2618 *itarg = u->rows;
2619 *xtarg = u->val;
2620 u->val = NULL;
2621 }
2622
2623 gretl_matrix_free(u);
2624
2625 return err;
2626 }
2627
xtab_row_match(Xtab * tab,int i,const char * s,double x)2628 static int xtab_row_match (Xtab *tab, int i, const char *s, double x)
2629 {
2630 if (tab->rstrs) {
2631 return strcmp(s, tab->Sr[i]) == 0;
2632 } else {
2633 return x == tab->rval[i];
2634 }
2635 }
2636
xtab_col_match(Xtab * tab,int j,const char * s,double x)2637 static int xtab_col_match (Xtab *tab, int j, const char *s, double x)
2638 {
2639 if (tab->cstrs) {
2640 return strcmp(s, tab->Sc[j]) == 0;
2641 } else {
2642 return x == tab->cval[j];
2643 }
2644 }
2645
2646 #define complete_obs(x,y,t) (!na(x[t]) && !na(y[t]))
2647
2648 /* crosstab struct creation functions */
2649
get_new_xtab(int rv,int cv,const DATASET * dset,int * err)2650 static Xtab *get_new_xtab (int rv, int cv, const DATASET *dset,
2651 int *err)
2652 {
2653 series_table *sti = NULL;
2654 series_table *stj = NULL;
2655 const char *s1 = NULL;
2656 const char *s2 = NULL;
2657 double x1 = 0, x2 = 0;
2658 Xtab *tab = NULL;
2659 int imatch, jmatch;
2660 int i, j, t, n = 0;
2661
2662 for (t=dset->t1; t<=dset->t2; t++) {
2663 if (complete_obs(dset->Z[rv], dset->Z[cv], t)) {
2664 n++;
2665 }
2666 }
2667
2668 if (n == 0) {
2669 *err = E_MISSDATA;
2670 } else {
2671 tab = xtab_new(n, dset->t1, dset->t2);
2672 if (tab == NULL) {
2673 *err = E_ALLOC;
2674 }
2675 }
2676
2677 if (!*err) {
2678 /* assemble row data */
2679 *err = xtab_get_data(tab, rv, 0, dset, &sti);
2680 }
2681 if (!*err) {
2682 /* assemble column data */
2683 *err = xtab_get_data(tab, cv, 1, dset, &stj);
2684 }
2685 if (!*err) {
2686 *err = xtab_allocate_arrays(tab);
2687 }
2688
2689 if (*err) goto bailout;
2690
2691 tab->missing = (dset->t2 - dset->t1 + 1) - n;
2692 strcpy(tab->rvarname, dset->varname[rv]);
2693 strcpy(tab->cvarname, dset->varname[cv]);
2694
2695 /* The following could be made more efficient by substituting
2696 sorted arrays for dset->Z[rv] and dset->Z[cv] but I'm not
2697 sure if the fixed cost would be recouped.
2698 */
2699
2700 for (t=dset->t1; t<=dset->t2; t++) {
2701 if (!complete_obs(dset->Z[rv], dset->Z[cv], t)) {
2702 continue;
2703 }
2704 x1 = dset->Z[rv][t];
2705 if (tab->rstrs) {
2706 s1 = series_table_get_string(sti, x1);
2707 }
2708 x2 = dset->Z[cv][t];
2709 if (tab->cstrs) {
2710 s2 = series_table_get_string(stj, x2);
2711 }
2712 for (i=0; i<tab->rows; i++) {
2713 imatch = xtab_row_match(tab, i, s1, x1);
2714 if (imatch) {
2715 jmatch = 0;
2716 for (j=0; j<tab->cols && !jmatch; j++) {
2717 jmatch = xtab_col_match(tab, j, s2, x2);
2718 if (jmatch) {
2719 tab->f[i][j] += 1;
2720 tab->rtotal[i] += 1;
2721 tab->ctotal[j] += 1;
2722 }
2723 }
2724 break;
2725 }
2726 }
2727 }
2728
2729 bailout:
2730
2731 if (*err) {
2732 free_xtab(tab);
2733 tab = NULL;
2734 }
2735
2736 return tab;
2737 }
2738
record_xtab(const Xtab * tab,const DATASET * dset,gretlopt opt)2739 static void record_xtab (const Xtab *tab, const DATASET *dset,
2740 gretlopt opt)
2741 {
2742 gretl_matrix *X = NULL;
2743 char **Sc = NULL, **Sr = NULL;
2744 double xij, cj, ri;
2745 int totals, rows, cols;
2746 int i, j;
2747
2748 totals = (opt & OPT_N)? 0 : 1;
2749 rows = tab->rows + totals;
2750 cols = tab->cols + totals;
2751
2752 X = gretl_zero_matrix_new(rows, cols);
2753 if (X == NULL) {
2754 return;
2755 }
2756
2757 /* column labels */
2758 Sc = strings_array_new(cols);
2759 if (Sc != NULL) {
2760 for (j=0; j<tab->cols; j++) {
2761 if (tab->cstrs) {
2762 Sc[j] = gretl_strdup(tab->Sc[j]);
2763 } else {
2764 cj = tab->cval[j];
2765 Sc[j] = gretl_strdup_printf("%4g", cj);
2766 }
2767 }
2768 if (totals) {
2769 Sc[cols-1] = gretl_strdup("TOTAL");
2770 }
2771 }
2772
2773 /* row labels */
2774 Sr = strings_array_new(rows);
2775 if (Sr != NULL) {
2776 for (i=0; i<tab->rows; i++) {
2777 if (tab->rstrs) {
2778 Sr[i] = gretl_strdup(tab->Sr[i]);
2779 } else {
2780 ri = tab->rval[i];
2781 Sr[i] = gretl_strdup_printf("%4g", ri);
2782 }
2783 }
2784 if (totals) {
2785 Sr[rows-1] = gretl_strdup("TOTAL");
2786 }
2787 }
2788
2789 /* body of table */
2790
2791 for (i=0; i<tab->rows; i++) {
2792 if (tab->rtotal[i] > 0) {
2793 /* row counts */
2794 for (j=0; j<tab->cols; j++) {
2795 if (tab->ctotal[j] > 0) {
2796 if (opt & (OPT_C | OPT_R)) {
2797 if (opt & OPT_C) {
2798 xij = 100.0 * tab->f[i][j] / tab->ctotal[j];
2799 } else {
2800 xij = 100.0 * tab->f[i][j] / tab->rtotal[i];
2801 }
2802 } else {
2803 xij = tab->f[i][j];
2804 }
2805 gretl_matrix_set(X, i, j, xij);
2806 }
2807 }
2808 if (totals) {
2809 /* row totals */
2810 if (opt & OPT_C) {
2811 xij = 100.0 * tab->rtotal[i] / tab->n;
2812 } else {
2813 xij = tab->rtotal[i];
2814 }
2815 gretl_matrix_set(X, i, cols-1, xij);
2816 }
2817 }
2818 }
2819
2820 if (totals) {
2821 /* column totals */
2822 for (j=0; j<tab->cols; j++) {
2823 if (opt & OPT_R) {
2824 xij = 100.0 * tab->ctotal[j] / tab->n;
2825 } else {
2826 xij = tab->ctotal[j];
2827 }
2828 gretl_matrix_set(X, rows-1, j, xij);
2829 }
2830 gretl_matrix_set(X, rows-1, cols-1, tab->n);
2831 }
2832
2833 gretl_matrix_set_colnames(X, Sc);
2834 gretl_matrix_set_rownames(X, Sr);
2835 set_last_result_data(X, GRETL_TYPE_MATRIX);
2836 }
2837
2838 /* For use in the context of "xtab" with --quiet option:
2839 compute and record the Pearson chi-square value and its
2840 p-value.
2841 */
2842
xtab_do_pearson(const Xtab * tab)2843 static int xtab_do_pearson (const Xtab *tab)
2844 {
2845 double x, y, ymin = 1.0e-7;
2846 double pearson = 0.0;
2847 int i, j, err = 0;
2848
2849 for (i=0; i<tab->rows && !err; i++) {
2850 if (tab->rtotal[i] > 0) {
2851 for (j=0; j<tab->cols && !err; j++) {
2852 y = ((double) tab->rtotal[i] * tab->ctotal[j]) / tab->n;
2853 if (y < ymin) {
2854 err = E_DATA;
2855 } else {
2856 x = (double) tab->f[i][j] - y;
2857 pearson += x * x / y;
2858 }
2859 }
2860 }
2861 }
2862
2863 if (!err) {
2864 int df = (tab->rows - 1) * (tab->cols - 1);
2865 double pval = chisq_cdf_comp(df, pearson);
2866
2867 if (!na(pval)) {
2868 record_test_result(pearson, pval);
2869 } else {
2870 err = E_DATA;
2871 }
2872 }
2873
2874 if (err) {
2875 record_test_result(NADBL, NADBL);
2876 }
2877
2878 return err;
2879 }
2880
crosstab_from_matrix(gretlopt opt,PRN * prn)2881 int crosstab_from_matrix (gretlopt opt, PRN *prn)
2882 {
2883 const char *mname;
2884 const gretl_matrix *m;
2885 Xtab *tab;
2886 int i, j, nvals, n = 0;
2887 double x;
2888 int err = 0;
2889
2890 mname = get_optval_string(XTAB, OPT_X);
2891 if (mname == NULL) {
2892 return E_DATA;
2893 }
2894
2895 m = get_matrix_by_name(mname);
2896 if (m == NULL) {
2897 return E_UNKVAR;
2898 }
2899
2900 if (m->rows < 2 || m->cols < 2) {
2901 err = E_DATA;
2902 }
2903
2904 nvals = m->rows * m->cols;
2905
2906 for (i=0; i<nvals && !err; i++) {
2907 x = m->val[i];
2908 if (x < 0 || x != floor(x) || x > INT_MAX) {
2909 err = E_DATA;
2910 }
2911 n += x;
2912 }
2913
2914 if (err) {
2915 gretl_errmsg_sprintf(_("Matrix %s does not represent a "
2916 "contingency table"), mname);
2917 return err;
2918 }
2919
2920 tab = xtab_new(n, 0, 0);
2921 if (tab == NULL) {
2922 return E_ALLOC;
2923 }
2924
2925 tab->rows = m->rows;
2926 tab->cols = m->cols;
2927
2928 if (xtab_allocate_arrays(tab)) {
2929 free_xtab(tab);
2930 return E_ALLOC;
2931 }
2932
2933 for (i=0; i<m->rows; i++) {
2934 tab->rval[i] = i + 1;
2935 tab->rtotal[i] = 0.0;
2936 for (j=0; j<m->cols; j++) {
2937 tab->f[i][j] = gretl_matrix_get(m, i, j);
2938 tab->rtotal[i] += tab->f[i][j];
2939 }
2940 }
2941
2942 for (j=0; j<m->cols; j++) {
2943 tab->cval[j] = j + 1;
2944 tab->ctotal[j] = 0.0;
2945 for (i=0; i<m->rows; i++) {
2946 tab->ctotal[j] += tab->f[i][j];
2947 }
2948 }
2949
2950 if (opt & OPT_Q) {
2951 xtab_do_pearson(tab);
2952 } else {
2953 print_xtab(tab, NULL, opt | OPT_S, prn);
2954 }
2955
2956 free_xtab(tab);
2957
2958 return err;
2959 }
2960
crosstab(const int * list,const DATASET * dset,gretlopt opt,PRN * prn)2961 int crosstab (const int *list, const DATASET *dset,
2962 gretlopt opt, PRN *prn)
2963 {
2964 Xtab *tab;
2965 int *rowvar = NULL;
2966 int *colvar = NULL;
2967 int i, j, vi, vj, k;
2968 int pos, onelist;
2969 int nrv, ncv;
2970 int err = 0;
2971
2972 pos = gretl_list_separator_position(list);
2973 onelist = (pos == 0);
2974
2975 if (pos == 0) {
2976 /* single list case */
2977 nrv = list[0];
2978 ncv = nrv - 1;
2979 } else {
2980 /* double list case */
2981 nrv = pos - 1;
2982 ncv = list[0] - pos;
2983 }
2984
2985 if (nrv == 0 || ncv == 0) {
2986 return E_PARSE;
2987 }
2988
2989 rowvar = gretl_list_new(nrv);
2990 if (rowvar == NULL) {
2991 return E_ALLOC;
2992 }
2993
2994 j = 1;
2995 for (i=1; i<=nrv; i++) {
2996 k = list[i];
2997 if (accept_as_discrete(dset, k, 0)) {
2998 rowvar[j++] = k;
2999 } else {
3000 pprintf(prn, _("dropping %s: not a discrete variable\n"),
3001 dset->varname[k]);
3002 rowvar[0] -= 1;
3003 }
3004 }
3005
3006 if (rowvar[0] == 0 || (onelist && rowvar[0] == 1)) {
3007 gretl_errmsg_set("xtab: variables must be discrete");
3008 free(rowvar);
3009 return E_TYPES;
3010 }
3011
3012 if (onelist && rowvar[0] == 2) {
3013 /* the bivariate case */
3014 tab = get_new_xtab(rowvar[1], rowvar[2], dset, &err);
3015 if (!err) {
3016 /* make $result matrix available */
3017 record_xtab(tab, dset, opt);
3018 if (opt & OPT_Q) {
3019 /* quiet: run and record the Pearson test */
3020 xtab_do_pearson(tab);
3021 } else {
3022 /* print, and record Pearson test */
3023 print_xtab(tab, dset, opt | OPT_S, prn);
3024 }
3025 free_xtab(tab);
3026 }
3027 goto finish;
3028 }
3029
3030 if (!onelist) {
3031 /* construct the second list */
3032 colvar = gretl_list_new(ncv);
3033 if (colvar == NULL) {
3034 err = E_ALLOC;
3035 } else {
3036 j = 1;
3037 for (i=1; i<=ncv; i++) {
3038 k = list[pos+i];
3039 if (accept_as_discrete(dset, k, 0)) {
3040 colvar[j++] = k;
3041 } else {
3042 colvar[0] -= 1;
3043 }
3044 }
3045 if (colvar[0] == 0) {
3046 err = E_TYPES;
3047 }
3048 }
3049 }
3050
3051 for (i=1; i<=rowvar[0] && !err; i++) {
3052 vi = rowvar[i];
3053 if (onelist) {
3054 /* single list case */
3055 for (j=1; j<i && !err; j++) {
3056 vj = rowvar[j];
3057 tab = get_new_xtab(vj, vi, dset, &err);
3058 if (!err) {
3059 print_xtab(tab, dset, opt, prn);
3060 free_xtab(tab);
3061 }
3062 }
3063 } else {
3064 /* double list case */
3065 for (j=1; j<=colvar[0] && !err; j++) {
3066 vj = colvar[j];
3067 tab = get_new_xtab(vi, vj, dset, &err);
3068 if (!err) {
3069 print_xtab(tab, dset, opt, prn);
3070 free_xtab(tab);
3071 }
3072 }
3073 }
3074 }
3075
3076 finish:
3077
3078 free(rowvar);
3079 free(colvar);
3080
3081 return err;
3082 }
3083
single_crosstab(const int * list,const DATASET * dset,gretlopt opt,PRN * prn,int * err)3084 Xtab *single_crosstab (const int *list, const DATASET *dset,
3085 gretlopt opt, PRN *prn, int *err)
3086 {
3087 Xtab *tab = NULL;
3088 int rv, cv;
3089
3090 if (list[0] != 2) {
3091 *err = E_DATA;
3092 return NULL;
3093 }
3094
3095 rv = list[1];
3096 cv = list[2];
3097
3098 if (accept_as_discrete(dset, rv, 0) &&
3099 accept_as_discrete(dset, cv, 0)) {
3100 tab = get_new_xtab(rv, cv, dset, err);
3101 } else {
3102 *err = E_TYPES;
3103 }
3104
3105 if (!*err) {
3106 print_xtab(tab, dset, opt, prn);
3107 }
3108
3109 return tab;
3110 }
3111
just_record_freq_test(const FreqDist * freq)3112 static int just_record_freq_test (const FreqDist *freq)
3113 {
3114 double pval = NADBL;
3115
3116 if (freq->dist == D_NORMAL) {
3117 pval = chisq_cdf_comp(2, freq->test);
3118 } else if (freq->dist == D_GAMMA) {
3119 pval = normal_pvalue_2(freq->test);
3120 }
3121
3122 if (na(pval)) {
3123 return E_NAN;
3124 } else {
3125 record_test_result(freq->test, pval);
3126 return 0;
3127 }
3128 }
3129
model_error_dist(const MODEL * pmod,DATASET * dset,gretlopt opt,PRN * prn)3130 int model_error_dist (const MODEL *pmod, DATASET *dset,
3131 gretlopt opt, PRN *prn)
3132 {
3133 FreqDist *freq = NULL;
3134 gretlopt fopt = OPT_Z; /* show normality test */
3135 int save_t1 = dset->t1;
3136 int save_t2 = dset->t2;
3137 int err = 0;
3138
3139 if (pmod == NULL || pmod->uhat == NULL) {
3140 return E_DATA;
3141 }
3142
3143 err = gretl_model_get_normality_test(pmod, prn);
3144
3145 if (!err) {
3146 return 0;
3147 } else if (LIMDEP(pmod->ci)) {
3148 return err;
3149 } else {
3150 err = 0;
3151 }
3152
3153 if (exact_fit_check(pmod, prn)) {
3154 return 0;
3155 }
3156
3157 if (genr_fit_resid(pmod, dset, M_UHAT)) {
3158 return E_ALLOC;
3159 }
3160
3161 if (!err) {
3162 dset->t1 = pmod->t1;
3163 dset->t2 = pmod->t2;
3164 freq = get_freq(dset->v - 1, dset, NADBL, NADBL, 0,
3165 pmod->ncoeff, fopt, &err);
3166 }
3167
3168 if (!err) {
3169 if (opt & OPT_I) {
3170 err = just_record_freq_test(freq);
3171 } else if (opt & OPT_Q) {
3172 print_freq_test(freq, prn);
3173 } else {
3174 print_freq(freq, 0, NULL, prn);
3175 }
3176 free_freq(freq);
3177 }
3178
3179 dataset_drop_last_variables(dset, 1);
3180
3181 dset->t1 = save_t1;
3182 dset->t2 = save_t2;
3183
3184 return err;
3185 }
3186
3187 /* PACF via Durbin-Levinson algorithm */
3188
get_pacf(gretl_matrix * A)3189 static int get_pacf (gretl_matrix *A)
3190 {
3191 int m = gretl_matrix_rows(A);
3192 gretl_matrix *phi;
3193 double *acf = A->val;
3194 double *pacf = acf + m;
3195 double x, num, den;
3196 int i, j;
3197
3198 phi = gretl_matrix_alloc(m, m);
3199 if (phi == NULL) {
3200 return E_ALLOC;
3201 }
3202
3203 gretl_matrix_set(A, 0, 1, acf[0]);
3204 gretl_matrix_set(phi, 0, 0, acf[0]);
3205
3206 for (i=1; i<m; i++) {
3207 num = acf[i];
3208 for (j=0; j<i; j++) {
3209 num -= gretl_matrix_get(phi, i-1, j) * acf[i-j-1];
3210 }
3211 den = 1.0;
3212 for (j=0; j<i; j++) {
3213 den -= gretl_matrix_get(phi, i-1, j) * acf[j];
3214 }
3215 pacf[i] = num / den;
3216 gretl_matrix_set(phi, i, i, pacf[i]);
3217 for (j=0; j<i; j++) {
3218 x = gretl_matrix_get(phi, i-1, j);
3219 x -= pacf[i] * gretl_matrix_get(phi, i-1, i-j-1);
3220 gretl_matrix_set(phi, i, j, x);
3221 }
3222 }
3223
3224 gretl_matrix_free(phi);
3225
3226 return 0;
3227 }
3228
3229 /* also used in GUI, library.c */
3230
auto_acf_order(int T)3231 int auto_acf_order (int T)
3232 {
3233 int p = 10 * log10(T);
3234
3235 if (p > T / 5) {
3236 /* restrict to 20 percent of data (Tadeusz) */
3237 p = T / 5;
3238 }
3239
3240 return p;
3241 }
3242
3243 /**
3244 * gretl_acf:
3245 * @k: lag order.
3246 * @t1: starting observation.
3247 * @t2: ending observation.
3248 * @y: data series.
3249 * @ybar: mean of @y over range @t1 to @t2.
3250 *
3251 * Returns: the autocorrelation at lag @k for the series @y over
3252 * the range @t1 to @t2, or #NADBL on failure.
3253 */
3254
gretl_acf(int k,int t1,int t2,const double * y,double ybar)3255 static double gretl_acf (int k, int t1, int t2, const double *y,
3256 double ybar)
3257 {
3258 double z, num = 0, den = 0;
3259 int t, n = 0;
3260
3261 for (t=t1; t<=t2; t++) {
3262 if (na(y[t])) {
3263 return NADBL;
3264 }
3265 z = y[t] - ybar;
3266 den += z * z;
3267 if (t - k >= t1) {
3268 if (na(y[t-k])) {
3269 return NADBL;
3270 }
3271 num += z * (y[t-k] - ybar);
3272 n++;
3273 }
3274 }
3275
3276 if (n == 0) {
3277 return NADBL;
3278 }
3279
3280 return num / den;
3281 }
3282
3283 /**
3284 * ljung_box:
3285 * @m: maximum lag.
3286 * @t1: starting observation.
3287 * @t2: ending observation.
3288 * @y: data series.
3289 * @err: location to receive error code.
3290 *
3291 * Returns: the Ljung-Box statistic for lag order @m for
3292 * the series @y over the sample @t1 to @t2, or #NADBL
3293 * on failure.
3294 */
3295
ljung_box(int m,int t1,int t2,const double * y,int * err)3296 double ljung_box (int m, int t1, int t2, const double *y, int *err)
3297 {
3298 double acf, ybar = 0.0, LB = 0.0;
3299 int k, n = t2 - t1 + 1;
3300
3301 *err = 0;
3302
3303 if (n == 0 || gretl_isconst(t1, t2, y)) {
3304 *err = E_DATA;
3305 } else if (m <= 0) {
3306 gretl_errmsg_sprintf(_("Invalid lag order %d"), m);
3307 *err = E_DATA;
3308 } else {
3309 ybar = gretl_mean(t1, t2, y);
3310 if (na(ybar)) {
3311 *err = E_DATA;
3312 }
3313 }
3314
3315 if (*err) {
3316 return NADBL;
3317 }
3318
3319 /* calculate acf up to lag m, cumulating LB */
3320 for (k=1; k<=m; k++) {
3321 acf = gretl_acf(k, t1, t2, y, ybar);
3322 if (na(acf)) {
3323 *err = E_MISSDATA;
3324 break;
3325 }
3326 LB += acf * acf / (n - k);
3327 }
3328
3329 if (*err) {
3330 LB = NADBL;
3331 } else {
3332 LB *= n * (n + 2.0);
3333 }
3334
3335 return LB;
3336 }
3337
3338 /**
3339 * gretl_xcf:
3340 * @k: lag order (or lead order if < 0).
3341 * @t1: starting observation.
3342 * @t2: ending observation.
3343 * @x: first data series.
3344 * @y: second data series.
3345 * @xbar: mean of first series.
3346 * @ybar: mean of second series.
3347 *
3348 * Returns: the cross-correlation at lag (or lead) @k for the
3349 * series @x and @y over the range @t1 to @t2, or #NADBL on failure.
3350 */
3351
3352 static double
gretl_xcf(int k,int t1,int t2,const double * x,const double * y,double xbar,double ybar)3353 gretl_xcf (int k, int t1, int t2, const double *x, const double *y,
3354 double xbar, double ybar)
3355 {
3356 double num = 0, den1 = 0, den2 = 0;
3357 double zx, zy;
3358 int t;
3359
3360 for (t=t1; t<=t2; t++) {
3361 if (na(x[t]) || na(y[t])) {
3362 return NADBL;
3363 }
3364 zx = x[t] - xbar;
3365 zy = y[t] - ybar;
3366 den1 += zx * zx;
3367 den2 += zy * zy;
3368 if ((k >= 0 && t - k >= t1) || (k < 0 && t - k <= t2)) {
3369 num += zx * (y[t-k] - ybar);
3370 }
3371 }
3372
3373 return num / sqrt(den1 * den2);
3374 }
3375
corrgm_ascii_plot(const char * vname,const gretl_matrix * A,PRN * prn)3376 static int corrgm_ascii_plot (const char *vname,
3377 const gretl_matrix *A,
3378 PRN *prn)
3379 {
3380 int k, m = gretl_matrix_rows(A);
3381 double *xk = malloc(m * sizeof *xk);
3382
3383 if (xk == NULL) {
3384 return E_ALLOC;
3385 }
3386
3387 for (k=0; k<m; k++) {
3388 xk[k] = k + 1.0;
3389 }
3390
3391 pprintf(prn, "\n\n%s\n\n", _("Correlogram"));
3392 graphyx(A->val, xk, m, vname, _("lag"), prn);
3393
3394 free(xk);
3395
3396 return 0;
3397 }
3398
handle_corrgm_plot_options(int ci,gretlopt opt,int * ascii,int * gp)3399 static void handle_corrgm_plot_options (int ci,
3400 gretlopt opt,
3401 int *ascii,
3402 int *gp)
3403 {
3404 if (opt & OPT_Q) {
3405 /* backward compat: --quiet = no plot */
3406 *ascii = *gp = 0;
3407 return;
3408 }
3409
3410 if (opt & OPT_U) {
3411 /* --plot=whatever */
3412 const char *s = get_optval_string(ci, OPT_U);
3413
3414 if (s != NULL) {
3415 if (!strcmp(s, "none")) {
3416 /* no plot */
3417 *ascii = *gp = 0;
3418 return;
3419 } else if (!strcmp(s, "ascii")) {
3420 *ascii = 1;
3421 *gp = 0;
3422 return;
3423 } else {
3424 /* implies use of gnuplot */
3425 *gp = 1;
3426 return;
3427 }
3428 }
3429 }
3430
3431 /* the defaults */
3432 if (gretl_in_batch_mode()) {
3433 *ascii = 1;
3434 } else {
3435 *gp = 1;
3436 }
3437 }
3438
do_acf_bartlett(gretl_matrix * PM,int i,int T,double ssr,const double * z)3439 static void do_acf_bartlett (gretl_matrix *PM, int i,
3440 int T, double ssr,
3441 const double *z)
3442 {
3443 double b;
3444 int j;
3445
3446 for (j=0; j<PM->cols; j++) {
3447 b = z[j] * sqrt((1.0/T) * (1 + 2*ssr));
3448 gretl_matrix_set(PM, i, j, b);
3449 }
3450 }
3451
3452 /**
3453 * corrgram:
3454 * @varno: ID number of variable to process.
3455 * @order: integer order for autocorrelation function.
3456 * @nparam: number of estimated parameters (e.g. for the
3457 * case of ARMA), used to correct the degrees of freedom
3458 * for Q test.
3459 * @dset: dataset struct.
3460 * @opt: if includes OPT_R, variable in question is a model
3461 * residual generated "on the fly"; OPT_U can be used to
3462 * specify a plot option.
3463 * @prn: gretl printing struct.
3464 *
3465 * Computes the autocorrelation function and plots the correlogram for
3466 * the variable specified by @varno.
3467 *
3468 * Returns: 0 on successful completion, error code on error.
3469 */
3470
corrgram(int varno,int order,int nparam,DATASET * dset,gretlopt opt,PRN * prn)3471 int corrgram (int varno, int order, int nparam, DATASET *dset,
3472 gretlopt opt, PRN *prn)
3473 {
3474 const double z[] = {1.65, 1.96, 2.58};
3475 gretl_matrix *PM = NULL;
3476 gretl_matrix *A = NULL;
3477 double ybar, ssr, lbox;
3478 double pval = NADBL;
3479 double pm[3];
3480 double *acf, *pacf;
3481 const char *vname;
3482 int i, k, m, T, dfQ;
3483 int ascii_plot = 0;
3484 int use_gnuplot = 0;
3485 int t1 = dset->t1, t2 = dset->t2;
3486 int err = 0, pacf_err = 0;
3487
3488 gretl_error_clear();
3489
3490 if (order < 0) {
3491 gretl_errmsg_sprintf(_("Invalid lag order %d"), order);
3492 return E_DATA;
3493 }
3494
3495 err = series_adjust_sample(dset->Z[varno], &t1, &t2);
3496 if (err) {
3497 return err;
3498 }
3499
3500 if ((T = t2 - t1 + 1) < 4) {
3501 return E_TOOFEW;
3502 }
3503
3504 if (gretl_isconst(t1, t2, dset->Z[varno])) {
3505 gretl_errmsg_sprintf(_("%s is a constant"), dset->varname[varno]);
3506 return E_DATA;
3507 }
3508
3509 ybar = gretl_mean(t1, t2, dset->Z[varno]);
3510 if (na(ybar)) {
3511 return E_DATA;
3512 }
3513
3514 vname = series_get_graph_name(dset, varno);
3515
3516 /* lag order for acf */
3517 m = order;
3518 if (m == 0) {
3519 m = auto_acf_order(T);
3520 } else if (m > T - dset->pd) {
3521 int mmax = T - 1;
3522
3523 if (m > mmax) {
3524 m = mmax;
3525 }
3526 }
3527
3528 A = gretl_matrix_alloc(m, 2);
3529 if (A == NULL) {
3530 err = E_ALLOC;
3531 goto bailout;
3532 }
3533
3534 if (opt & OPT_B) {
3535 PM = gretl_matrix_alloc(m, 3);
3536 if (PM == NULL) {
3537 err = E_ALLOC;
3538 goto bailout;
3539 }
3540 }
3541
3542 acf = A->val;
3543 pacf = acf + m;
3544
3545 /* calculate acf up to order acf_m */
3546 for (k=1; k<=m; k++) {
3547 acf[k-1] = gretl_acf(k, t1, t2, dset->Z[varno], ybar);
3548 }
3549
3550 /* graphing? */
3551 handle_corrgm_plot_options(CORRGM, opt, &ascii_plot, &use_gnuplot);
3552
3553 if (ascii_plot) {
3554 corrgm_ascii_plot(vname, A, prn);
3555 }
3556
3557 if (opt & OPT_R) {
3558 pprintf(prn, "\n%s\n", _("Residual autocorrelation function"));
3559 } else {
3560 pputc(prn, '\n');
3561 pprintf(prn, _("Autocorrelation function for %s"), vname);
3562 pputc(prn, '\n');
3563 }
3564
3565 pputs(prn, _("***, **, * indicate significance at the 1%, 5%, 10% levels\n"));
3566
3567 if (opt & OPT_B) {
3568 pputs(prn, _("using Bartlett standard errors for ACF"));
3569 } else {
3570 pprintf(prn, _("using standard error 1/T^%.1f"), 0.5);
3571 }
3572
3573 pputs(prn, "\n\n");
3574
3575 /* for confidence bands */
3576 for (i=0; i<3; i++) {
3577 pm[i] = z[i] / sqrt((double) T);
3578 if (pm[i] > 0.5) {
3579 pm[i] = 0.5;
3580 }
3581 }
3582
3583 /* generate (and if not in batch mode) plot partial
3584 autocorrelation function */
3585
3586 err = pacf_err = get_pacf(A);
3587
3588 pputs(prn, _(" LAG ACF PACF Q-stat. [p-value]"));
3589 pputs(prn, "\n\n");
3590
3591 lbox = ssr = 0.0;
3592 dfQ = 1;
3593
3594 for (k=0; k<m; k++) {
3595 double pm0, pm1, pm2;
3596
3597 if (na(acf[k])) {
3598 pprintf(prn, "%5d\n", k + 1);
3599 continue;
3600 }
3601
3602 /* ACF */
3603 pprintf(prn, "%5d%9.4f ", k + 1, acf[k]);
3604 if (PM != NULL) {
3605 if (k > 0 && !na(acf[k-1])) {
3606 ssr += acf[k-1] * acf[k-1];
3607 }
3608 do_acf_bartlett(PM, k, T, ssr, z);
3609 pm0 = gretl_matrix_get(PM, k, 0);
3610 pm1 = gretl_matrix_get(PM, k, 1);
3611 pm2 = gretl_matrix_get(PM, k, 2);
3612 } else {
3613 pm0 = pm[0];
3614 pm1 = pm[1];
3615 pm2 = pm[2];
3616 }
3617 if (fabs(acf[k]) > pm2) {
3618 pputs(prn, " ***");
3619 } else if (fabs(acf[k]) > pm1) {
3620 pputs(prn, " ** ");
3621 } else if (fabs(acf[k]) > pm0) {
3622 pputs(prn, " * ");
3623 } else {
3624 pputs(prn, " ");
3625 }
3626
3627 if (na(pacf[k])) {
3628 bufspace(13, prn);
3629 } else {
3630 /* PACF */
3631 pprintf(prn, "%9.4f", pacf[k]);
3632 if (fabs(pacf[k]) > pm[2]) {
3633 pputs(prn, " ***");
3634 } else if (fabs(pacf[k]) > pm[1]) {
3635 pputs(prn, " ** ");
3636 } else if (fabs(pacf[k]) > pm[0]) {
3637 pputs(prn, " * ");
3638 } else {
3639 pputs(prn, " ");
3640 }
3641 }
3642
3643 /* Ljung-Box Q */
3644 lbox += (T * (T + 2.0)) * acf[k] * acf[k] / (T - (k + 1));
3645
3646 if (k >= nparam) {
3647 /* i.e., if the real df is > 0 */
3648 pprintf(prn, "%12.4f", lbox);
3649 pval = chisq_cdf_comp(dfQ++, lbox);
3650 pprintf(prn, " [%5.3f]", pval);
3651 }
3652
3653 pputc(prn, '\n');
3654 }
3655 pputc(prn, '\n');
3656
3657 if (lbox > 0 && !na(pval)) {
3658 record_test_result(lbox, pval);
3659 }
3660
3661 if (use_gnuplot) {
3662 err = correlogram_plot(vname, acf, (pacf_err)? NULL : pacf,
3663 PM, m, pm[1], opt);
3664 }
3665
3666 bailout:
3667
3668 gretl_matrix_free(A);
3669 gretl_matrix_free(PM);
3670
3671 return err;
3672 }
3673
3674 /**
3675 * acf_matrix:
3676 * @x: series to analyse.
3677 * @order: maximum lag for autocorrelation function.
3678 * @dset: information on the data set, or %NULL.
3679 * @n: length of series (required if @dset is %NULL).
3680 * @err: location to receive error code.
3681 *
3682 * Computes the autocorrelation function for series @x with
3683 * maximum lag @order.
3684 *
3685 * Returns: two-column matrix containing the values of the
3686 * ACF and PACF at the successive lags, or %NULL on error.
3687 */
3688
acf_matrix(const double * x,int order,const DATASET * dset,int n,int * err)3689 gretl_matrix *acf_matrix (const double *x, int order,
3690 const DATASET *dset, int n,
3691 int *err)
3692 {
3693 gretl_matrix *A = NULL;
3694 double xbar;
3695 int m, k, t, T;
3696 int t1, t2;
3697
3698 if (dset != NULL) {
3699 t1 = dset->t1;
3700 t2 = dset->t2;
3701
3702 while (na(x[t1])) t1++;
3703 while (na(x[t2])) t2--;
3704
3705 T = t2 - t1 + 1;
3706 } else {
3707 t1 = 0;
3708 t2 = n - 1;
3709 T = n;
3710 }
3711
3712 if (T < 4) {
3713 *err = E_TOOFEW;
3714 return NULL;
3715 }
3716
3717 for (t=t1; t<=t2; t++) {
3718 if (na(x[t])) {
3719 *err = E_MISSDATA;
3720 return NULL;
3721 }
3722 }
3723
3724 if (gretl_isconst(t1, t2, x)) {
3725 gretl_errmsg_set(_("Argument is a constant"));
3726 *err = E_DATA;
3727 return NULL;
3728 }
3729
3730 xbar = gretl_mean(t1, t2, x);
3731 if (na(xbar)) {
3732 *err = E_DATA;
3733 return NULL;
3734 }
3735
3736 m = order;
3737
3738 if (dset == NULL) {
3739 if (m < 1 || m > T) {
3740 *err = E_DATA;
3741 return NULL;
3742 }
3743 } else {
3744 if (m == 0) {
3745 m = auto_acf_order(T);
3746 } else if (m > T - dset->pd) {
3747 int mmax = T - 1;
3748
3749 if (m > mmax) {
3750 m = mmax;
3751 }
3752 }
3753 }
3754
3755 A = gretl_matrix_alloc(m, 2);
3756
3757 if (A == NULL) {
3758 *err = E_ALLOC;
3759 return NULL;
3760 }
3761
3762 /* calculate ACF up to order m */
3763 for (k=0; k<m && !*err; k++) {
3764 A->val[k] = gretl_acf(k+1, t1, t2, x, xbar);
3765 if (na(A->val[k])) {
3766 *err = E_DATA;
3767 }
3768 }
3769
3770 /* add PACF */
3771 if (!*err) {
3772 *err = get_pacf(A);
3773 }
3774
3775 if (*err) {
3776 gretl_matrix_free(A);
3777 A = NULL;
3778 }
3779
3780 return A;
3781 }
3782
xcorrgm_graph(const char * xname,const char * yname,double * xcf,int m,double * pm,int allpos)3783 static int xcorrgm_graph (const char *xname, const char *yname,
3784 double *xcf, int m, double *pm,
3785 int allpos)
3786 {
3787 char crit_string[16];
3788 gchar *title;
3789 FILE *fp;
3790 int k, err = 0;
3791
3792 fp = open_plot_input_file(PLOT_XCORRELOGRAM, 0, &err);
3793 if (err) {
3794 return err;
3795 }
3796
3797 sprintf(crit_string, "%.2f/T^%.1f", 1.96, 0.5);
3798
3799 gretl_push_c_numeric_locale();
3800
3801 fputs("set xzeroaxis\n", fp);
3802 fputs("set yzeroaxis\n", fp);
3803 print_keypos_string(GP_KEY_RIGHT_TOP, fp);
3804 fprintf(fp, "set xlabel '%s'\n", _("lag"));
3805 if (allpos) {
3806 fputs("set yrange [-0.1:1.1]\n", fp);
3807 } else {
3808 fputs("set yrange [-1.1:1.1]\n", fp);
3809 }
3810 title = g_strdup_printf(_("Correlations of %s and lagged %s"),
3811 xname, yname);
3812 fprintf(fp, "set title '%s'\n", title);
3813 g_free(title);
3814 fprintf(fp, "set xrange [%d:%d]\n", -(m + 1), m + 1);
3815 if (allpos) {
3816 fprintf(fp, "plot \\\n"
3817 "'-' using 1:2 notitle w impulses lw 5, \\\n"
3818 "%g title '%s' lt 2\n", pm[1], crit_string);
3819 } else {
3820 fprintf(fp, "plot \\\n"
3821 "'-' using 1:2 notitle w impulses lw 5, \\\n"
3822 "%g title '+- %s' lt 2, \\\n"
3823 "%g notitle lt 2\n", pm[1], crit_string, -pm[1]);
3824 }
3825
3826 for (k=-m; k<=m; k++) {
3827 fprintf(fp, "%d %g\n", k, xcf[k+m]);
3828 }
3829 fputs("e\n", fp);
3830
3831 gretl_pop_c_numeric_locale();
3832
3833 return finalize_plot_input_file(fp);
3834 }
3835
xcorrgm_ascii_plot(double * xcf,int xcf_m,PRN * prn)3836 static int xcorrgm_ascii_plot (double *xcf, int xcf_m, PRN *prn)
3837 {
3838 double *xk = malloc((xcf_m * 2 + 1) * sizeof *xk);
3839 int k;
3840
3841 if (xk == NULL) {
3842 return E_ALLOC;
3843 }
3844
3845 for (k=-xcf_m; k<=xcf_m; k++) {
3846 xk[k+xcf_m] = k;
3847 }
3848
3849 pprintf(prn, "\n\n%s\n\n", _("Cross-correlogram"));
3850 graphyx(xcf, xk, 2 * xcf_m + 1, "", _("lag"), prn);
3851
3852 free(xk);
3853
3854 return 0;
3855 }
3856
3857 /* We assume here that all data issues have already been
3858 assessed (lag length, missing values etc.) and we just
3859 get on with the job.
3860 */
3861
real_xcf_vec(const double * x,const double * y,int p,int T,int * err)3862 static gretl_matrix *real_xcf_vec (const double *x, const double *y,
3863 int p, int T, int *err)
3864 {
3865 gretl_matrix *xcf;
3866 double xbar, ybar;
3867 int i;
3868
3869 xbar = gretl_mean(0, T-1, x);
3870 if (na(xbar)) {
3871 *err = E_DATA;
3872 return NULL;
3873 }
3874
3875 ybar = gretl_mean(0, T-1, y);
3876 if (na(ybar)) {
3877 *err = E_DATA;
3878 return NULL;
3879 }
3880
3881 xcf = gretl_column_vector_alloc(p * 2 + 1);
3882 if (xcf == NULL) {
3883 *err = E_ALLOC;
3884 return NULL;
3885 }
3886
3887 for (i=-p; i<=p; i++) {
3888 xcf->val[i+p] = gretl_xcf(i, 0, T - 1, x, y, xbar, ybar);
3889 }
3890
3891 return xcf;
3892 }
3893
3894 /* for arrays @x and @y, check that there are no missing values
3895 and that neither series has a constant value
3896 */
3897
xcf_data_check(const double * x,const double * y,int T,int * badvar)3898 static int xcf_data_check (const double *x, const double *y, int T,
3899 int *badvar)
3900 {
3901 int xconst = 1, yconst = 1;
3902 int t;
3903
3904 if (T < 5) {
3905 return E_TOOFEW;
3906 }
3907
3908 for (t=0; t<T; t++) {
3909 if (na(x[t]) || na(y[t])) {
3910 return E_MISSDATA;
3911 }
3912 if (t > 0 && x[t] != x[0]) {
3913 xconst = 0;
3914 }
3915 if (t > 0 && y[t] != y[0]) {
3916 yconst = 0;
3917 }
3918 }
3919
3920 if (xconst) {
3921 *badvar = 1;
3922 return E_DATA;
3923 } else if (yconst) {
3924 *badvar = 2;
3925 return E_DATA;
3926 }
3927
3928 return 0;
3929 }
3930
3931 /**
3932 * xcf_vec:
3933 * @x: first series.
3934 * @y: second series.
3935 * @p: maximum lag for cross-correlation function.
3936 * @dset: information on the data set, or NULL.
3937 * @n: length of series (required only if @dset is NULL).
3938 * @err: location to receive error code.
3939 *
3940 * Computes the cross-correlation function for series @x with
3941 * series @y up to maximum lag @order.
3942 *
3943 * Returns: column vector containing the values of the
3944 * cross-correlation function, or NULL on error.
3945 */
3946
xcf_vec(const double * x,const double * y,int p,const DATASET * dset,int n,int * err)3947 gretl_matrix *xcf_vec (const double *x, const double *y,
3948 int p, const DATASET *dset,
3949 int n, int *err)
3950 {
3951 gretl_matrix *xcf = NULL;
3952 int t1, t2;
3953 int T, badvar = 0;
3954
3955 if (p <= 0) {
3956 *err = E_DATA;
3957 return NULL;
3958 }
3959
3960 if (dset != NULL) {
3961 int yt1, yt2;
3962
3963 t1 = yt1 = dset->t1;
3964 t2 = yt2 = dset->t2;
3965
3966 while (na(x[t1])) t1++;
3967 while (na(y[yt1])) yt1++;
3968
3969 while (na(x[t2])) t2--;
3970 while (na(y[yt2])) yt2--;
3971
3972 t1 = (yt1 > t1)? yt1 : t1;
3973 t2 = (yt2 < t2)? yt2 : t2;
3974
3975 T = t2 - t1 + 1;
3976 } else {
3977 t1 = 0;
3978 t2 = n - 1;
3979 T = n;
3980 }
3981
3982 #if 0
3983 fprintf(stderr, "t1=%d, t2=%d, T=%d\n", t1, t2, T);
3984 fprintf(stderr, "x[t1]=%g, y[t1]=%g\n\n", x[t1], y[t1]);
3985 #endif
3986
3987 if (dset != NULL) {
3988 if (2 * p > T - dset->pd) { /* ?? */
3989 *err = E_DATA;
3990 }
3991 } else if (2 * p > T) {
3992 *err = E_DATA;
3993 }
3994
3995 if (!*err) {
3996 *err = xcf_data_check(x + t1, y + t1, T, &badvar);
3997 if (badvar) {
3998 gretl_errmsg_sprintf(_("Argument %d is a constant"),
3999 badvar);
4000 }
4001 }
4002
4003 if (!*err) {
4004 xcf = real_xcf_vec(x + t1, y + t1, p, T, err);
4005 }
4006
4007 return xcf;
4008 }
4009
4010 /**
4011 * xcorrgram:
4012 * @list: should contain ID numbers of two variables.
4013 * @order: integer order for autocorrelation function.
4014 * @dset: dataset struct.
4015 * @opt: may include OPT_U for plot options.
4016 * @prn: gretl printing struct.
4017 *
4018 * Computes the cross-correlation function and plots the
4019 * cross-correlogram for the specified variables.
4020 *
4021 * Returns: 0 on successful completion, error code on error.
4022 */
4023
xcorrgram(const int * list,int order,DATASET * dset,gretlopt opt,PRN * prn)4024 int xcorrgram (const int *list, int order, DATASET *dset,
4025 gretlopt opt, PRN *prn)
4026 {
4027 gretl_matrix *xcf = NULL;
4028 double pm[3];
4029 const char *xname, *yname;
4030 const double *x, *y;
4031 int t1 = dset->t1, t2 = dset->t2;
4032 int ascii_plot = 0;
4033 int use_gnuplot = 0;
4034 int k, p, badvar = 0;
4035 int T, err = 0;
4036
4037 gretl_error_clear();
4038
4039 if (order < 0) {
4040 gretl_errmsg_sprintf(_("Invalid lag order %d"), order);
4041 return E_DATA;
4042 }
4043
4044 if (list[0] != 2) {
4045 return E_DATA;
4046 }
4047
4048 x = dset->Z[list[1]];
4049 y = dset->Z[list[2]];
4050
4051 err = list_adjust_sample(list, &t1, &t2, dset, NULL);
4052
4053 if (!err) {
4054 T = t2 - t1 + 1;
4055 err = xcf_data_check(x + t1, y + t1, T, &badvar);
4056 }
4057
4058 if (err) {
4059 if (badvar) {
4060 gretl_errmsg_sprintf(_("%s is a constant"),
4061 dset->varname[list[badvar]]);
4062 }
4063 return err;
4064 }
4065
4066 xname = dset->varname[list[1]];
4067 yname = dset->varname[list[2]];
4068
4069 p = order;
4070 if (p == 0) {
4071 p = auto_acf_order(T) / 2;
4072 } else if (2 * p > T - dset->pd) {
4073 p = (T - 1) / 2; /* ?? */
4074 }
4075
4076 xcf = real_xcf_vec(x + t1, y + t1, p, T, &err);
4077 if (err) {
4078 return err;
4079 }
4080
4081 /* graphing? */
4082 handle_corrgm_plot_options(XCORRGM, opt, &ascii_plot, &use_gnuplot);
4083
4084 if (ascii_plot) {
4085 xcorrgm_ascii_plot(xcf->val, p, prn);
4086 }
4087
4088 /* for confidence bands */
4089 pm[0] = 1.65 / sqrt((double) T);
4090 pm[1] = 1.96 / sqrt((double) T);
4091 pm[2] = 2.58 / sqrt((double) T);
4092
4093 pputc(prn, '\n');
4094 pprintf(prn, _("Cross-correlation function for %s and %s"),
4095 xname, yname);
4096 pputs(prn, "\n\n");
4097 pputs(prn, _(" LAG XCF"));
4098 pputs(prn, "\n\n");
4099
4100 for (k=-p; k<=p; k++) {
4101 double x = xcf->val[k + p];
4102
4103 pprintf(prn, "%5d%9.4f", k, x);
4104 if (fabs(x) > pm[2]) {
4105 pputs(prn, " ***");
4106 } else if (fabs(x) > pm[1]) {
4107 pputs(prn, " **");
4108 } else if (fabs(x) > pm[0]) {
4109 pputs(prn, " *");
4110 }
4111 pputc(prn, '\n');
4112 }
4113 pputc(prn, '\n');
4114
4115 if (use_gnuplot) {
4116 int allpos = 1;
4117
4118 for (k=-p; k<=p; k++) {
4119 if (xcf->val[k+p] < 0) {
4120 allpos = 0;
4121 break;
4122 }
4123 }
4124 err = xcorrgm_graph(xname, yname, xcf->val, p, pm, allpos);
4125 }
4126
4127 gretl_matrix_free(xcf);
4128
4129 return err;
4130 }
4131
4132 struct fractint_test {
4133 double d; /* estimated degree of integration */
4134 double se; /* standard error of the above */
4135 };
4136
fract_int_GPH(int T,int m,const double * xvec,struct fractint_test * ft)4137 static int fract_int_GPH (int T, int m, const double *xvec,
4138 struct fractint_test *ft)
4139 {
4140 gretl_matrix *y = NULL;
4141 gretl_matrix *X = NULL;
4142 gretl_matrix *b = NULL;
4143 gretl_matrix *V = NULL;
4144 double x, w;
4145 int t, err = 0;
4146
4147 ft->d = ft->se = NADBL;
4148
4149 y = gretl_column_vector_alloc(m);
4150 X = gretl_unit_matrix_new(m, 2);
4151 b = gretl_column_vector_alloc(2);
4152 V = gretl_matrix_alloc(2, 2);
4153
4154 if (y == NULL || X == NULL || b == NULL || V == NULL) {
4155 err = E_ALLOC;
4156 goto bailout;
4157 }
4158
4159 /* Test from Geweke and Porter-Hudak, as set out in
4160 Greene, Econometric Analysis 4e, p. 787 */
4161
4162 for (t=0; t<m; t++) {
4163 y->val[t] = log(xvec[t+1]);
4164 w = M_2PI * (t + 1) / (double) T;
4165 x = sin(w / 2);
4166 gretl_matrix_set(X, t, 1, log(4 * x * x));
4167 }
4168
4169 err = gretl_matrix_ols(y, X, b, V, NULL, &x);
4170
4171 if (!err) {
4172 ft->d = -b->val[1];
4173 ft->se = sqrt(gretl_matrix_get(V, 1, 1));
4174 }
4175
4176 bailout:
4177
4178 gretl_matrix_free(y);
4179 gretl_matrix_free(X);
4180 gretl_matrix_free(b);
4181 gretl_matrix_free(V);
4182
4183 return err;
4184 }
4185
gretl_matrix_pergm(const gretl_matrix * x,int m,int * err)4186 static gretl_matrix *gretl_matrix_pergm (const gretl_matrix *x, int m,
4187 int *err)
4188 {
4189 gretl_matrix *p = NULL;
4190 gretl_matrix *f = NULL;
4191
4192 f = gretl_matrix_fft(x, 0, err);
4193 if (*err) {
4194 return NULL;
4195 }
4196
4197 p = gretl_column_vector_alloc(m);
4198
4199 if (p == NULL) {
4200 *err = E_ALLOC;
4201 } else {
4202 int T = gretl_vector_get_length(x);
4203 double re, im, scale = M_2PI * T;
4204 int i;
4205
4206 for (i=0; i<m; i++) {
4207 re = gretl_matrix_get(f, i+1, 0);
4208 im = gretl_matrix_get(f, i+1, 1);
4209 p->val[i] = (re*re + im*im) / scale;
4210 }
4211 }
4212
4213 gretl_matrix_free(f);
4214
4215 return p;
4216 }
4217
4218 struct LWE_helper {
4219 gretl_matrix *lambda;
4220 gretl_matrix *lpow;
4221 gretl_matrix *I1;
4222 gretl_matrix *I2;
4223 double lcm;
4224 };
4225
LWE_free(struct LWE_helper * L)4226 static void LWE_free (struct LWE_helper *L)
4227 {
4228 gretl_matrix_free(L->lambda);
4229 gretl_matrix_free(L->lpow);
4230 gretl_matrix_free(L->I1);
4231 gretl_matrix_free(L->I2);
4232 }
4233
LWE_obj_func(struct LWE_helper * L,double d)4234 static double LWE_obj_func (struct LWE_helper *L, double d)
4235 {
4236 double dd = 2.0 * d;
4237 int i;
4238
4239 gretl_matrix_copy_values(L->lpow, L->lambda);
4240 gretl_matrix_raise(L->lpow, dd);
4241
4242 for (i=0; i<L->I1->rows; i++) {
4243 L->I2->val[i] = L->I1->val[i] * L->lpow->val[i];
4244 }
4245
4246 return -(log(gretl_vector_mean(L->I2)) - dd * L->lcm);
4247 }
4248
LWE_lambda(const gretl_matrix * I1,int n)4249 static gretl_matrix *LWE_lambda (const gretl_matrix *I1, int n)
4250 {
4251 gretl_matrix *lambda;
4252 int i, m = gretl_vector_get_length(I1);
4253
4254 lambda = gretl_column_vector_alloc(m);
4255
4256 if (lambda != NULL) {
4257 for (i=0; i<m; i++) {
4258 gretl_vector_set(lambda, i, (M_2PI / n) * (i + 1));
4259 }
4260 }
4261
4262 return lambda;
4263 }
4264
4265 static int
LWE_init(struct LWE_helper * L,const gretl_matrix * X,int m)4266 LWE_init (struct LWE_helper *L, const gretl_matrix *X, int m)
4267 {
4268 int i, err = 0;
4269
4270 L->I2 = L->lpow = NULL;
4271
4272 L->I1 = gretl_matrix_pergm(X, m, &err);
4273 if (err) {
4274 return err;
4275 }
4276
4277 L->lambda = LWE_lambda(L->I1, X->rows);
4278 if (L->lambda == NULL) {
4279 gretl_matrix_free(L->I1);
4280 return E_ALLOC;
4281 }
4282
4283 L->lpow = gretl_matrix_copy(L->lambda);
4284 L->I2 = gretl_matrix_copy(L->I1);
4285
4286 if (L->lpow == NULL || L->I2 == NULL) {
4287 err = E_ALLOC;
4288 LWE_free(L);
4289 } else {
4290 L->lcm = 0.0;
4291 for (i=0; i<m; i++) {
4292 L->lcm += log(L->lambda->val[i]);
4293 }
4294 L->lcm /= m;
4295 }
4296
4297 return err;
4298 }
4299
4300 #define LWE_MAXITER 100
4301
LWE_calc(const gretl_matrix * X,int m,int * err)4302 static double LWE_calc (const gretl_matrix *X, int m, int *err)
4303 {
4304 struct LWE_helper L = {0};
4305 double d = 0, dd = 1.0;
4306 double eps = 1.0e-05;
4307 double f, incr, incl, deriv, h;
4308 int iter = 0;
4309
4310 *err = LWE_init(&L, X, m);
4311 if (*err) {
4312 return NADBL;
4313 }
4314
4315 while (fabs(dd) > 1.0e-06 && iter++ < LWE_MAXITER) {
4316 f = LWE_obj_func(&L, d);
4317 incr = LWE_obj_func(&L, d + eps) / eps;
4318 incl = LWE_obj_func(&L, d - eps) / eps;
4319
4320 deriv = (incr - incl) / 2.0;
4321 h = (0.5 * (incr + incl) - f / eps) / eps;
4322 dd = (h < 0)? (-deriv / h) : deriv;
4323
4324 if (fabs(dd) > 1) {
4325 dd = (dd > 0)? 1 : -1;
4326 }
4327
4328 d += 0.5 * dd;
4329 }
4330
4331 if (*err) {
4332 d = NADBL;
4333 } else if (iter == LWE_MAXITER) {
4334 fprintf(stderr, "LWE: max iterations reached\n");
4335 d = NADBL;
4336 }
4337
4338 LWE_free(&L);
4339
4340 return d;
4341 }
4342
auto_spectrum_order(int T,gretlopt opt)4343 int auto_spectrum_order (int T, gretlopt opt)
4344 {
4345 int m;
4346
4347 if (opt & OPT_O) {
4348 /* Bartlett */
4349 m = (int) 2.0 * sqrt((double) T);
4350 } else {
4351 /* fractional integration test */
4352 double m1 = floor((double) T / 2.0);
4353 double m2 = floor(pow((double) T, 0.6));
4354
4355 m = (m1 < m2)? m1 : m2;
4356 m--;
4357 }
4358
4359 return m;
4360 }
4361
fract_int_LWE(const double * x,int m,int t1,int t2,struct fractint_test * ft)4362 static int fract_int_LWE (const double *x, int m, int t1, int t2,
4363 struct fractint_test *ft)
4364 {
4365 gretl_matrix *X;
4366 int err = 0;
4367
4368 X = gretl_vector_from_series(x, t1, t2);
4369
4370 if (X == NULL) {
4371 err = E_ALLOC;
4372 } else {
4373 ft->d = LWE_calc(X, m, &err);
4374 if (!err) {
4375 ft->se = 1.0 / (2 * sqrt((double) m));
4376 }
4377 gretl_matrix_free(X);
4378 }
4379
4380 return err;
4381 }
4382
pergm_do_plot(gretlopt opt)4383 static int pergm_do_plot (gretlopt opt)
4384 {
4385 if (opt & OPT_U) {
4386 /* --plot=whatever */
4387 const char *s = get_optval_string(PERGM, OPT_U);
4388
4389 if (s != NULL) {
4390 if (!strcmp(s, "none")) {
4391 return 0;
4392 } else {
4393 /* implies use of gnuplot */
4394 return 1;
4395 }
4396 }
4397 }
4398
4399 /* the default */
4400 return !gretl_in_batch_mode();
4401 }
4402
pergm_print(const char * vname,const double * d,int T,int L,gretlopt opt,PRN * prn)4403 static void pergm_print (const char *vname, const double *d,
4404 int T, int L, gretlopt opt, PRN *prn)
4405 {
4406 char xstr[32];
4407 double dt, yt;
4408 int t;
4409
4410 if (vname == NULL) {
4411 pprintf(prn, "\n%s\n", _("Residual periodogram"));
4412 } else {
4413 pprintf(prn, _("\nPeriodogram for %s\n"), vname);
4414 }
4415
4416 pprintf(prn, _("Number of observations = %d\n"), T);
4417 if (opt & OPT_O) {
4418 pprintf(prn, _("Using Bartlett lag window, length %d\n\n"), L);
4419 } else {
4420 pputc(prn, '\n');
4421 }
4422
4423 if (opt & OPT_L) {
4424 pputs(prn, _(" omega scaled frequency periods log spectral density\n\n"));
4425 } else {
4426 pputs(prn, _(" omega scaled frequency periods spectral density\n\n"));
4427 }
4428
4429 for (t=1; t<=T/2; t++) {
4430 dt = (opt & OPT_L)? log(d[t]) : d[t];
4431 if (opt & OPT_R) {
4432 yt = 2 * M_PI * (double) t / T;
4433 pprintf(prn, " %.5f%8d%16.2f", yt, t, (double) T / t);
4434 } else if (opt & OPT_D) {
4435 yt = 360 * t / (double) T;
4436 pprintf(prn, " %7.3f%8d%16.2f", yt, t, (double) T / t);
4437 } else {
4438 yt = M_2PI * t / (double) T;
4439 pprintf(prn, " %.5f%8d%16.2f", yt, t, (double) T / t);
4440 }
4441 dt = (opt & OPT_L)? log(d[t]) : d[t];
4442 sprintf(xstr, "%#.5g", dt);
4443 gretl_fix_exponent(xstr);
4444 pprintf(prn, "%16s\n", xstr);
4445 }
4446
4447 pputc(prn, '\n');
4448
4449 if (pergm_do_plot(opt)) {
4450 periodogram_plot(vname, T, L, d, opt);
4451 }
4452 }
4453
finalize_fractint(const double * x,const double * dens,int t1,int t2,int width,const char * vname,gretlopt opt,PRN * prn)4454 static int finalize_fractint (const double *x,
4455 const double *dens,
4456 int t1, int t2, int width,
4457 const char *vname,
4458 gretlopt opt,
4459 PRN *prn)
4460 {
4461 struct fractint_test ft;
4462 gretl_matrix *result = NULL;
4463 int do_GPH = opt & (OPT_G | OPT_A);
4464 int do_LWE = !(opt & OPT_G);
4465 int do_print = !(opt & OPT_Q);
4466 int T = t2 - t1 + 1;
4467 int m, err = 0;
4468
4469 /* order for test */
4470 if (width <= 0) {
4471 m = auto_spectrum_order(T, OPT_NONE);
4472 } else {
4473 m = (width > T / 2)? T / 2 : width;
4474 }
4475
4476 if (do_print && vname != NULL) {
4477 pprintf(prn, "\n%s, T = %d\n\n", vname, T);
4478 }
4479
4480 if (do_GPH && do_LWE) {
4481 result = gretl_matrix_alloc(2, 2);
4482 } else {
4483 result = gretl_matrix_alloc(1, 2);
4484 }
4485
4486 if (do_LWE) {
4487 /* do Local Whittle if wanted */
4488 err = fract_int_LWE(x, m, t1, t2, &ft);
4489 if (!err) {
4490 double z = ft.d / ft.se;
4491 double pv = normal_pvalue_2(z);
4492
4493 record_test_result(z, pv);
4494 gretl_matrix_set(result, 0, 0, ft.d);
4495 gretl_matrix_set(result, 0, 1, ft.se);
4496
4497 if (do_print) {
4498 pprintf(prn, "%s (m = %d)\n"
4499 " %s = %g (%g)\n"
4500 " %s: z = %g, %s %.4f\n\n",
4501 _("Local Whittle Estimator"), m,
4502 _("Estimated degree of integration"), ft.d, ft.se,
4503 _("test statistic"), z,
4504 _("with p-value"), pv);
4505 }
4506 }
4507 }
4508
4509 if (!err && do_GPH) {
4510 /* do GPH if wanted (options --all or --gph) */
4511 int row = do_LWE ? 1 : 0;
4512
4513 err = fract_int_GPH(T, m, dens, &ft);
4514 gretl_matrix_set(result, row, 0, ft.d);
4515 gretl_matrix_set(result, row, 1, ft.se);
4516
4517 if (!err && (!do_LWE || do_print)) {
4518 double tval = ft.d / ft.se;
4519 int df = m - 2;
4520 double pv = student_pvalue_2(df, tval);
4521
4522 if (!do_LWE) {
4523 record_test_result(tval, pv);
4524 }
4525 if (do_print) {
4526 pprintf(prn, "%s (m = %d)\n"
4527 " %s = %g (%g)\n"
4528 " %s: t(%d) = %g, %s %.4f\n\n",
4529 _("GPH test"), m,
4530 _("Estimated degree of integration"), ft.d, ft.se,
4531 _("test statistic"), df, tval,
4532 _("with p-value"), pv);
4533 }
4534 }
4535 }
4536
4537 if (err) {
4538 gretl_matrix_free(result);
4539 } else {
4540 char **colnames = strings_array_new(2);
4541
4542 colnames[0] = gretl_strdup("d");
4543 colnames[1] = gretl_strdup("se");
4544 gretl_matrix_set_colnames(result, colnames);
4545 set_last_result_data(result, GRETL_TYPE_MATRIX);
4546 }
4547
4548 return err;
4549 }
4550
pergm_compute_density(const double * x,int t1,int t2,int L,int bartlett,int * err)4551 static double *pergm_compute_density (const double *x, int t1, int t2,
4552 int L, int bartlett, int *err)
4553 {
4554 double *sdy, *acov, *dens;
4555 double xx, yy, vx, sx, w;
4556 int k, t, T = t2 - t1 + 1;
4557
4558 sdy = malloc(T * sizeof *sdy);
4559 acov = malloc((L + 1) * sizeof *acov);
4560 dens = malloc((1 + T/2) * sizeof *dens);
4561
4562 if (sdy == NULL || acov == NULL || dens == NULL) {
4563 *err = E_ALLOC;
4564 free(sdy);
4565 free(acov);
4566 free(dens);
4567 return NULL;
4568 }
4569
4570 xx = gretl_mean(t1, t2, x);
4571 vx = real_gretl_variance(t1, t2, x, 1);
4572 sx = sqrt(vx);
4573
4574 for (t=t1; t<=t2; t++) {
4575 sdy[t-t1] = (x[t] - xx) / sx;
4576 }
4577
4578 /* autocovariances */
4579 for (k=1; k<=L; k++) {
4580 acov[k] = 0.0;
4581 for (t=k; t<T; t++) {
4582 acov[k] += sdy[t] * sdy[t-k];
4583 }
4584 acov[k] /= T;
4585 }
4586
4587 vx /= M_2PI;
4588
4589 for (t=1; t<=T/2; t++) {
4590 yy = M_2PI * t / (double) T;
4591 xx = 1.0;
4592 for (k=1; k<=L; k++) {
4593 if (bartlett) {
4594 w = 1.0 - (double) k/(L + 1);
4595 xx += 2.0 * w * acov[k] * cos(yy * k);
4596 } else {
4597 xx += 2.0 * acov[k] * cos(yy * k);
4598 }
4599 }
4600 dens[t] = xx * vx;
4601 }
4602
4603 free(sdy);
4604 free(acov);
4605
4606 return dens;
4607 }
4608
4609 enum {
4610 PERGM_CMD,
4611 PERGM_FUNC,
4612 FRACTINT_CMD
4613 };
4614
4615 /* The following is a little complicated because we're supporting
4616 three use cases:
4617
4618 1) Implementing the "pergm" command, with or without the
4619 option of using a Bartlett window (OPT_O); in this case the
4620 final matrix-location argument will be NULL.
4621
4622 2) Implementing the user-space pergm function: in this case
4623 the final matrix-location argument will be non-NULL, and the
4624 @vname argument will be NULL.
4625
4626 3) Implementing the fractint command, computing the Local
4627 Whittle Estimator and/or the GPH test -- this is flagged by
4628 opt & OPT_F. If we're doing Whittle only, we don't need to compute
4629 the spectral density via the autocovariances approach; that is
4630 handled via FFT in the LWE functions above.
4631 */
4632
4633 static int
pergm_or_fractint(int usage,const double * x,int t1,int t2,int width,const char * vname,gretlopt opt,PRN * prn,gretl_matrix ** pmat)4634 pergm_or_fractint (int usage, const double *x, int t1, int t2,
4635 int width, const char *vname, gretlopt opt,
4636 PRN *prn, gretl_matrix **pmat)
4637 {
4638 double *dens = NULL;
4639 int bartlett = (opt & OPT_O);
4640 int whittle_only = 0;
4641 int t, T, L = 0;
4642 int err = 0;
4643
4644 gretl_error_clear();
4645
4646 /* common to all uses: check for data problems */
4647
4648 err = series_adjust_sample(x, &t1, &t2);
4649
4650 if (!err && (T = t2 - t1 + 1) < 12) {
4651 err = E_TOOFEW;
4652 }
4653
4654 if (!err && gretl_isconst(t1, t2, x)) {
4655 if (vname != NULL) {
4656 gretl_errmsg_sprintf(_("%s is a constant"), vname);
4657 }
4658 err = E_DATA;
4659 }
4660
4661 if (err) {
4662 return err;
4663 }
4664
4665 if (usage == FRACTINT_CMD) {
4666 /* doing fractint */
4667 if (!(opt & (OPT_A | OPT_G))) {
4668 /* not --all, not --gph */
4669 whittle_only = 1;
4670 }
4671 }
4672
4673 if (!whittle_only) {
4674 /* Chatfield (1996); William Greene, 4e, p. 772 */
4675 if (bartlett) {
4676 if (width <= 0) {
4677 L = auto_spectrum_order(T, opt);
4678 } else {
4679 L = (width > T / 2)? T / 2 : width;
4680 }
4681 } else {
4682 /* use full sample */
4683 L = T - 1;
4684 }
4685
4686 dens = pergm_compute_density(x, t1, t2, L, bartlett, &err);
4687 if (err) {
4688 return err;
4689 }
4690 }
4691
4692 if (usage == PERGM_FUNC) {
4693 /* make matrix for pergm() function */
4694 int T2 = T / 2;
4695 gretl_matrix *pm;
4696
4697 *pmat = pm = gretl_matrix_alloc(T2, 2);
4698
4699 if (pm == NULL) {
4700 err = E_ALLOC;
4701 } else {
4702 for (t=1; t<=T2; t++) {
4703 gretl_matrix_set(pm, t-1, 0, M_2PI * t / (double) T);
4704 gretl_matrix_set(pm, t-1, 1, dens[t]);
4705 }
4706 }
4707 } else if (usage == FRACTINT_CMD) {
4708 /* supporting "fractint" command */
4709 err = finalize_fractint(x, dens, t1, t2, width,
4710 vname, opt, prn);
4711 } else {
4712 /* supporting "pergm" command */
4713 pergm_print(vname, dens, T, L, opt, prn);
4714 }
4715
4716 free(dens);
4717
4718 return err;
4719 }
4720
4721 /**
4722 * periodogram:
4723 * @varno: ID number of variable to process.
4724 * @width: width of window.
4725 * @dset: dataset struct.
4726 * @opt: if includes OPT_O, use Bartlett lag window for periodogram;
4727 * OPT_L, use log scale.
4728 * @prn: gretl printing struct.
4729 *
4730 * Computes and displays the periodogram for the series specified
4731 * by @varno.
4732 *
4733 * Returns: 0 on successful completion, error code on error.
4734 */
4735
periodogram(int varno,int width,const DATASET * dset,gretlopt opt,PRN * prn)4736 int periodogram (int varno, int width, const DATASET *dset,
4737 gretlopt opt, PRN *prn)
4738 {
4739 return pergm_or_fractint(PERGM_CMD, dset->Z[varno],
4740 dset->t1, dset->t2,
4741 width, dset->varname[varno],
4742 opt, prn, NULL);
4743 }
4744
4745 /**
4746 * residual_periodogram:
4747 * @x: series to process.
4748 * @width: width of window.
4749 * @dset: dataset struct.
4750 * @opt: if includes OPT_O, use Bartlett lag window for periodogram;
4751 * OPT_L, use log scale.
4752 * @prn: gretl printing struct.
4753 *
4754 * Computes and displays the periodogram for @x, which is presumed to
4755 * be a model residual series.
4756 *
4757 * Returns: 0 on successful completion, error code on error.
4758 */
4759
residual_periodogram(const double * x,int width,const DATASET * dset,gretlopt opt,PRN * prn)4760 int residual_periodogram (const double *x, int width, const DATASET *dset,
4761 gretlopt opt, PRN *prn)
4762 {
4763 return pergm_or_fractint(PERGM_CMD, x,
4764 dset->t1, dset->t2,
4765 width, NULL,
4766 opt, prn, NULL);
4767 }
4768
4769 /**
4770 * periodogram_matrix:
4771 * @x: the series to process.
4772 * @t1: starting observation in @x.
4773 * @t2: ending observation in @x.
4774 * @width: width of Bartlett window, or -1 for plain sample
4775 * periodogram.
4776 * @err: location to receive error code.
4777 *
4778 * Implements the userspace gretl pergm function, which can
4779 * be used on either a series from the dataset or a gretl
4780 * vector.
4781 *
4782 * Returns: allocated matrix on success, NULL on failure.
4783 */
4784
periodogram_matrix(const double * x,int t1,int t2,int width,int * err)4785 gretl_matrix *periodogram_matrix (const double *x, int t1, int t2,
4786 int width, int *err)
4787 {
4788 gretlopt opt = (width < 0)? OPT_NONE : OPT_O;
4789 gretl_matrix *m = NULL;
4790
4791 *err = pergm_or_fractint(PERGM_FUNC, x, t1, t2, width,
4792 NULL, opt, NULL, &m);
4793
4794 return m;
4795 }
4796
4797 /**
4798 * fractint:
4799 * @varno: ID number of variable to process.
4800 * @order: lag order / window size.
4801 * @dset: dataset struct.
4802 * @opt: option flags.
4803 * @prn: gretl printing struct.
4804 *
4805 * Computes and prints a test for fractional integration of the
4806 * series specified by @varno. By default the test uses the
4807 * Local Whittle Estimator but if @opt includes OPT_G then
4808 * the Geweke and Porter-Hudak test is done instead, or if
4809 * OPT_A then both tests are shown. If OPT_Q is given the
4810 * test results are not printed, just recorded (with
4811 * preference given to the LWE in case of OPT_A).
4812 *
4813 * Returns: 0 on successful completion, error code on error.
4814 */
4815
fractint(int varno,int order,const DATASET * dset,gretlopt opt,PRN * prn)4816 int fractint (int varno, int order, const DATASET *dset,
4817 gretlopt opt, PRN *prn)
4818 {
4819 int err = incompatible_options(opt, (OPT_G | OPT_A));
4820
4821 if (!err) {
4822 err = pergm_or_fractint(FRACTINT_CMD, dset->Z[varno],
4823 dset->t1, dset->t2,
4824 order,
4825 dset->varname[varno],
4826 opt, prn, NULL);
4827 }
4828
4829 return err;
4830 }
4831
printf15(double x,int d,PRN * prn)4832 static void printf15 (double x, int d, PRN *prn)
4833 {
4834 pputc(prn, ' ');
4835 if (na(x)) {
4836 pprintf(prn, "%*s", UTF_WIDTH(_("NA"), 14), _("NA"));
4837 } else if (x > 999 && x < 100000) {
4838 int p = 1 + floor(log10(x));
4839
4840 d -= p;
4841 if (d < 0) d = 0;
4842 pprintf(prn, "%14.*f", d, x);
4843 } else {
4844 pprintf(prn, "%#14.*g", d, x);
4845 }
4846 }
4847
printf11(double x,int d,PRN * prn)4848 static void printf11 (double x, int d, PRN *prn)
4849 {
4850 pputc(prn, ' ');
4851 if (na(x)) {
4852 pprintf(prn, "%*s", UTF_WIDTH(_("NA"), 10), _("NA"));
4853 } else if (x > 999 && x < 100000) {
4854 int p = 1 + floor(log10(x));
4855
4856 d -= p;
4857 if (d < 0) d = 0;
4858 pprintf(prn, "%10.*f", d, x);
4859 } else {
4860 pprintf(prn, "%#10.*g", d, x);
4861 }
4862 }
4863
output_line(char * str,PRN * prn,int dblspc)4864 static void output_line (char *str, PRN *prn, int dblspc)
4865 {
4866 pputs(prn, str);
4867 if (dblspc) {
4868 pputs(prn, "\n\n");
4869 } else {
4870 pputc(prn, '\n');
4871 }
4872 }
4873
prhdr(const char * str,const DATASET * dset,int missing,PRN * prn)4874 static void prhdr (const char *str, const DATASET *dset,
4875 int missing, PRN *prn)
4876 {
4877 char date1[OBSLEN], date2[OBSLEN];
4878 gchar *tmp;
4879
4880 ntolabel(date1, dset->t1, dset);
4881 ntolabel(date2, dset->t2, dset);
4882
4883 pputc(prn, '\n');
4884
4885 tmp = g_strdup_printf(_("%s, using the observations %s - %s"),
4886 str, date1, date2);
4887 output_line(tmp, prn, 0);
4888 g_free(tmp);
4889
4890 if (missing) {
4891 output_line(_("(missing values were skipped)"), prn, 1);
4892 }
4893 }
4894
summary_print_val(double x,int digits,int places,PRN * prn)4895 static void summary_print_val (double x, int digits, int places,
4896 PRN *prn)
4897 {
4898 pputc(prn, ' ');
4899
4900 if (na(x)) {
4901 pprintf(prn, "%*s", UTF_WIDTH(_("NA"), 14), _("NA"));
4902 } else if (digits < 0) {
4903 pprintf(prn, "%14d", (int) x);
4904 } else if (digits > 0 || places > 0) {
4905 int prec = (digits > 0)? digits : places;
4906 int len = prec + 9;
4907 char *s = (digits > 0)? "#" : "";
4908 char t = (digits > 0)? 'g' : 'f';
4909 char fmt[32];
4910
4911 sprintf(fmt, "%%%s%d.%d%c", s, len, prec, t);
4912 pprintf(prn, fmt, x);
4913 } else {
4914 /* the default */
4915 pprintf(prn, "%#14.5g", x);
4916 }
4917 }
4918
4919 #define NSUMM 12
4920
print_summary_single(const Summary * s,int digits,int places,const DATASET * dset,PRN * prn)4921 void print_summary_single (const Summary *s,
4922 int digits, int places,
4923 const DATASET *dset,
4924 PRN *prn)
4925 {
4926 const char *labels[NSUMM] = {
4927 N_("Mean"),
4928 N_("Median"),
4929 N_("Minimum"),
4930 N_("Maximum"),
4931 N_("Standard deviation"),
4932 N_("C.V."),
4933 N_("Skewness"),
4934 N_("Ex. kurtosis"),
4935 /* xgettext:no-c-format */
4936 N_("5% percentile"),
4937 /* xgettext:no-c-format */
4938 N_("95% percentile"),
4939 N_("Interquartile range"),
4940 N_("Missing obs.")
4941 };
4942 const char *wstr = N_("Within s.d.");
4943 const char *bstr = N_("Between s.d.");
4944 double vals[NSUMM];
4945 int simple_skip[NSUMM] = {0,1,0,0,0,1,1,1,1,1,1,0};
4946 int skip0595 = 0;
4947 int offset = 2;
4948 int slen = 0, i = 0;
4949
4950 if (s->opt & OPT_B) {
4951 offset = 4;
4952 } else {
4953 const char *vname = dset->varname[s->list[1]];
4954 char obs1[OBSLEN], obs2[OBSLEN];
4955 gchar *tmp = NULL;
4956
4957 ntolabel(obs1, dset->t1, dset);
4958 ntolabel(obs2, dset->t2, dset);
4959
4960 prhdr(_("Summary statistics"), dset, 0, prn);
4961
4962 if (isdigit(*vname)) {
4963 const char *mname = dataset_get_matrix_name(dset);
4964
4965 if (mname != NULL) {
4966 tmp = g_strdup_printf(_("for column %d of %s (%d valid observations)"),
4967 atoi(vname), mname, s->n);
4968 } else {
4969 tmp = g_strdup_printf(_("for column %d (%d valid observations)"),
4970 atoi(vname), s->n);
4971 }
4972 } else {
4973 tmp = g_strdup_printf(_("for the variable '%s' (%d valid observations)"),
4974 dset->varname[s->list[1]], s->n);
4975 }
4976 output_line(tmp, prn, 1);
4977 g_free(tmp);
4978 }
4979
4980 vals[0] = s->mean[0];
4981 vals[1] = s->median[0];
4982 vals[2] = s->low[0];
4983 vals[3] = s->high[0];
4984 vals[4] = s->sd[0];
4985 vals[5] = s->cv[0];
4986 vals[6] = s->skew[0];
4987 vals[7] = s->xkurt[0];
4988 vals[8] = s->perc05[0];
4989 vals[9] = s->perc95[0];
4990 vals[10] = s->iqr[0];
4991 vals[11] = s->misscount[0];
4992
4993 if (na(vals[8]) && na(vals[9])) {
4994 skip0595 = 1;
4995 }
4996
4997 for (i=0; i<NSUMM; i++) {
4998 if ((s->opt & OPT_S) && simple_skip[i]) {
4999 continue;
5000 } else if (skip0595 && (i == 8 || i == 9)) {
5001 continue;
5002 }
5003 if (strlen(_(labels[i])) > slen) {
5004 slen = g_utf8_strlen(_(labels[i]), -1);
5005 }
5006 }
5007 slen++;
5008
5009 for (i=0; i<NSUMM; i++) {
5010 if ((s->opt & OPT_S) && simple_skip[i]) {
5011 continue;
5012 } else if (skip0595 && (i == 8 || i == 9)) {
5013 continue;
5014 }
5015 bufspace(offset, prn);
5016 if (i == 8 || i == 9) {
5017 /* the strings contain '%' */
5018 gchar *pcstr = g_strdup(_(labels[i]));
5019 int n = slen - g_utf8_strlen(pcstr, -1);
5020
5021 pputs(prn, pcstr);
5022 if (n > 0) {
5023 bufspace(n, prn);
5024 }
5025 g_free(pcstr);
5026 } else {
5027 pprintf(prn, "%-*s", UTF_WIDTH(_(labels[i]), slen), _(labels[i]));
5028 }
5029 if (i == NSUMM - 1) {
5030 summary_print_val(vals[i], -1, places, prn);
5031 } else {
5032 summary_print_val(vals[i], digits, places, prn);
5033 }
5034 pputc(prn, '\n');
5035 }
5036
5037 if (!na(s->sw) && !na(s->sb)) {
5038 pputc(prn, '\n');
5039 bufspace(offset, prn);
5040 pprintf(prn, "%-*s", UTF_WIDTH(_(wstr), slen), _(wstr));
5041 summary_print_val(s->sw, digits, places, prn);
5042 pputc(prn, '\n');
5043 bufspace(offset, prn);
5044 pprintf(prn, "%-*s", UTF_WIDTH(_(bstr), slen), _(bstr));
5045 summary_print_val(s->sb, digits, places, prn);
5046 }
5047
5048 pputc(prn, '\n');
5049 }
5050
summary_print_varname(const char * src,int len,PRN * prn)5051 static void summary_print_varname (const char *src,
5052 int len, PRN *prn)
5053 {
5054 char vname[NAMETRUNC];
5055
5056 maybe_trim_varname(vname, src);
5057 pprintf(prn, "%-*s", len, vname);
5058 }
5059
5060 /**
5061 * print_summary:
5062 * @summ: pointer to gretl summary statistics struct.
5063 * @dset: information on the data set.
5064 * @prn: gretl printing struct.
5065 *
5066 * Prints the summary statistics for a given variable.
5067 */
5068
print_summary(const Summary * summ,const DATASET * dset,PRN * prn)5069 void print_summary (const Summary *summ,
5070 const DATASET *dset,
5071 PRN *prn)
5072 {
5073 int dmax, d = get_gretl_digits();
5074 int len, maxlen = 0;
5075 int i, vi;
5076
5077 if (summ->list == NULL || summ->list[0] == 0) {
5078 return;
5079 }
5080
5081 if (summ->weight_var > 0) {
5082 pputc(prn, '\n');
5083 pprintf(prn, _("Weighting variable: %s\n"),
5084 dset->varname[summ->weight_var]);
5085 }
5086
5087 if (summ->list[0] == 1) {
5088 print_summary_single(summ, 0, 0, dset, prn);
5089 return;
5090 }
5091
5092 /* number of significant figures to use */
5093 dmax = (summ->opt & OPT_S)? 4 : 5;
5094 d = d > dmax ? dmax : d;
5095
5096 maxlen = max_namelen_in_list(summ->list, dset);
5097 len = maxlen <= 8 ? 10 : (maxlen + 1);
5098
5099 #if 0
5100 if (!(summ->opt & OPT_B)) {
5101 int missing = summary_has_missing_values(summ);
5102
5103 prhdr(_("Summary statistics"), dset, missing, prn);
5104 }
5105 #endif
5106
5107 pputc(prn, '\n');
5108
5109 if (summ->opt & OPT_S) {
5110 /* the "simple" option */
5111 const char *h[] = {
5112 N_("Mean"),
5113 N_("Median"),
5114 /* TRANSLATORS: 'S.D.' means Standard Deviation */
5115 N_("S.D."),
5116 N_("Min"),
5117 N_("Max"),
5118 };
5119
5120 pprintf(prn, "%*s%*s%*s%*s%*s%*s\n", len, " ",
5121 UTF_WIDTH(_(h[0]), 11), _(h[0]),
5122 UTF_WIDTH(_(h[1]), 11), _(h[1]),
5123 UTF_WIDTH(_(h[2]), 11), _(h[2]),
5124 UTF_WIDTH(_(h[3]), 11), _(h[3]),
5125 UTF_WIDTH(_(h[4]), 11), _(h[4]));
5126
5127 for (i=0; i<summ->list[0]; i++) {
5128 vi = summ->list[i+1];
5129 summary_print_varname(dset->varname[vi], len, prn);
5130 printf11(summ->mean[i], d, prn);
5131 printf11(summ->median[i], d, prn);
5132 printf11(summ->sd[i], d, prn);
5133 printf11(summ->low[i], d, prn);
5134 printf11(summ->high[i], d, prn);
5135 pputc(prn, '\n');
5136 }
5137 pputc(prn, '\n');
5138 } else {
5139 /* print all available stats */
5140 const char *ha[] = {
5141 N_("Mean"),
5142 N_("Median"),
5143 N_("Minimum"),
5144 N_("Maximum"),
5145 };
5146 const char *hb[] = {
5147 N_("Std. Dev."),
5148 N_("C.V."),
5149 N_("Skewness"),
5150 N_("Ex. kurtosis")
5151 };
5152 const char *hc[] = {
5153 /* xgettext:no-c-format */
5154 N_("5% perc."),
5155 /* xgettext:no-c-format */
5156 N_("95% perc."),
5157 N_("IQ range"),
5158 N_("Missing obs.")
5159 };
5160 /* cases where 0.05 and 0.95 quantiles are OK */
5161 int npct = 0;
5162
5163 pprintf(prn, "%*s%*s%*s%*s%*s\n", len, " ",
5164 UTF_WIDTH(_(ha[0]), 15), _(ha[0]),
5165 UTF_WIDTH(_(ha[1]), 15), _(ha[1]),
5166 UTF_WIDTH(_(ha[2]), 15), _(ha[2]),
5167 UTF_WIDTH(_(ha[3]), 15), _(ha[3]));
5168
5169 for (i=0; i<summ->list[0]; i++) {
5170 vi = summ->list[i+1];
5171 summary_print_varname(dset->varname[vi], len, prn);
5172 printf15(summ->mean[i], d, prn);
5173 printf15(summ->median[i], d, prn);
5174 printf15(summ->low[i], d, prn);
5175 printf15(summ->high[i], d, prn);
5176 pputc(prn, '\n');
5177 /* while we're at it, register cases where we can
5178 show the 0.05 and 0.95 quantiles
5179 */
5180 if (!na(summ->perc05[i]) && !na(summ->perc95[i])) {
5181 npct++;
5182 }
5183 }
5184 pputc(prn, '\n');
5185
5186 pprintf(prn, "%*s%*s%*s%*s%*s\n", len, " ",
5187 UTF_WIDTH(_(hb[0]), 15), _(hb[0]),
5188 UTF_WIDTH(_(hb[1]), 15), _(hb[1]),
5189 UTF_WIDTH(_(hb[2]), 15), _(hb[2]),
5190 UTF_WIDTH(_(hb[3]), 15), _(hb[3]));
5191
5192 for (i=0; i<summ->list[0]; i++) {
5193 double cv;
5194
5195 vi = summ->list[i+1];
5196 summary_print_varname(dset->varname[vi], len, prn);
5197
5198 if (floateq(summ->mean[i], 0.0)) {
5199 cv = NADBL;
5200 } else if (floateq(summ->sd[i], 0.0)) {
5201 cv = 0.0;
5202 } else {
5203 cv = fabs(summ->sd[i] / summ->mean[i]);
5204 }
5205
5206 printf15(summ->sd[i], d, prn);
5207 printf15(cv, d, prn);
5208 printf15(summ->skew[i], d, prn);
5209 printf15(summ->xkurt[i], d, prn);
5210 pputc(prn, '\n');
5211 }
5212 pputc(prn, '\n');
5213
5214 if (npct > 0) {
5215 /* note: use pputs for strings containing literal '%' */
5216 gchar *hc0 = g_strdup(_(hc[0]));
5217 gchar *hc1 = g_strdup(_(hc[1]));
5218 int n;
5219
5220 pprintf(prn, "%*s", len, " ");
5221 n = 15 - g_utf8_strlen(hc0, -1);
5222 if (n > 0) bufspace(n, prn);
5223 pputs(prn, hc0);
5224 n = 15 - g_utf8_strlen(hc1, -1);
5225 if (n > 0) bufspace(n, prn);
5226 pputs(prn, hc1);
5227 g_free(hc0); g_free(hc1);
5228
5229 pprintf(prn, "%*s%*s\n",
5230 UTF_WIDTH(_(ha[2]), 15), _(hc[2]),
5231 UTF_WIDTH(_(ha[3]), 15), _(hc[3]));
5232 } else {
5233 /* not showing any 0.05, 0.95 quantiles */
5234 pprintf(prn, "%*s%*s%*s\n", len, " ",
5235 UTF_WIDTH(_(ha[2]), 15), _(hc[2]),
5236 UTF_WIDTH(_(ha[3]), 15), _(hc[3]));
5237 }
5238
5239 for (i=0; i<summ->list[0]; i++) {
5240 vi = summ->list[i+1];
5241 summary_print_varname(dset->varname[vi], len, prn);
5242 if (!na(summ->perc05[i]) && !na(summ->perc95[i])) {
5243 printf15(summ->perc05[i], d, prn);
5244 printf15(summ->perc95[i], d, prn);
5245 } else if (npct > 0) {
5246 pprintf(prn, "%*s", 15, "NA");
5247 pprintf(prn, "%*s", 15, "NA");
5248 }
5249 printf15(summ->iqr[i], d, prn);
5250 pprintf(prn, "%15d", (int) summ->misscount[i]);
5251 pputc(prn, '\n');
5252 }
5253 pputc(prn, '\n');
5254 }
5255 }
5256
5257 /**
5258 * free_summary:
5259 * @summ: pointer to gretl summary statistics struct
5260 *
5261 * Frees all resources associated with @summ, and
5262 * the pointer itself.
5263 */
5264
free_summary(Summary * summ)5265 void free_summary (Summary *summ)
5266 {
5267 free(summ->list);
5268 free(summ->misscount);
5269 free(summ->stats);
5270
5271 free(summ);
5272 }
5273
5274 /***
5275 * summary_new:
5276 * @list: list of variables we want summary statistics for
5277 * @wv: weighting variable (0=no weights)
5278 * @opt: options
5279 *
5280 * Allocates a new "Summary" struct and initializes a few
5281 * things inside it
5282 */
5283
summary_new(const int * list,int wv,gretlopt opt,int * err)5284 static Summary *summary_new (const int *list, int wv,
5285 gretlopt opt, int *err)
5286 {
5287 Summary *s;
5288 int nv;
5289
5290 if (list == NULL) {
5291 fprintf(stderr, "summary_new: list is NULL\n");
5292 *err = E_DATA;
5293 return NULL;
5294 }
5295
5296 nv = list[0];
5297
5298 s = malloc(sizeof *s);
5299 if (s == NULL) {
5300 *err = E_ALLOC;
5301 return NULL;
5302 }
5303
5304 s->list = gretl_list_copy(list);
5305 if (s->list == NULL) {
5306 *err = E_ALLOC;
5307 free(s);
5308 return NULL;
5309 }
5310
5311 s->weight_var = wv;
5312 s->opt = opt;
5313 s->n = 0;
5314 s->misscount = malloc(nv * sizeof *s->misscount);
5315
5316 s->stats = malloc(11 * nv * sizeof *s->stats);
5317 if (s->stats == NULL) {
5318 *err = E_ALLOC;
5319 free_summary(s);
5320 return NULL;
5321 }
5322
5323 s->mean = s->stats;
5324 s->median = s->mean + nv;
5325 s->sd = s->median + nv;
5326 s->skew = s->sd + nv;
5327 s->xkurt = s->skew + nv;
5328 s->low = s->xkurt + nv;
5329 s->high = s->low + nv;
5330 s->cv = s->high + nv;
5331 s->perc05 = s->cv + nv;
5332 s->perc95 = s->perc05 + nv;
5333 s->iqr = s->perc95 + nv;
5334
5335 s->sb = s->sw = NADBL;
5336
5337 return s;
5338 }
5339
summary_has_missing_values(const Summary * summ)5340 int summary_has_missing_values (const Summary *summ)
5341 {
5342 if (summ->misscount != NULL) {
5343 int i, nv = summ->list[0];
5344
5345 for (i=0; i<nv; i++) {
5346 if (summ->misscount[i] > 0) {
5347 return 1;
5348 }
5349 }
5350 }
5351
5352 return 0;
5353 }
5354
compare_wgtord_rows(const void * a,const void * b)5355 static int compare_wgtord_rows (const void *a, const void *b)
5356 {
5357 const double **da = (const double **) a;
5358 const double **db = (const double **) b;
5359
5360 return (da[0][0] > db[0][0]) - (da[0][0] < db[0][0]);
5361 }
5362
weighted_order_stats(const double * y,const double * w,double * ostats,int n,int k)5363 static int weighted_order_stats (const double *y, const double *w,
5364 double *ostats, int n, int k)
5365 {
5366 /* on input "ostats" contains the quantiles to compute;
5367 on output it holds the results
5368 */
5369 double p, q, wsum = 0.0;
5370 double **X;
5371 int t, i, err = 0;
5372
5373 X = doubles_array_new(n, 2);
5374 if (X == NULL) {
5375 return E_ALLOC;
5376 }
5377
5378 i = 0;
5379 for (t=0; t<n; t++) {
5380 if (na(y[t]) || na(w[t]) || w[t] == 0.0) {
5381 continue;
5382 } else {
5383 X[i][0] = y[t];
5384 X[i][1] = w[t];
5385 wsum += w[t];
5386 i++;
5387 }
5388 }
5389
5390 qsort(X, i, sizeof *X, compare_wgtord_rows);
5391
5392 for (i=0; i<k; i++) {
5393 p = ostats[i] * wsum;
5394 q = X[0][1];
5395 t = 0;
5396 while (q <= p) {
5397 q += X[t][1];
5398 if (q < p) t++;
5399 }
5400
5401 if (t == 0 || t >= n - 1) {
5402 ostats[i] = NADBL;
5403 continue;
5404 }
5405
5406 if (X[t-1][0] == X[t][0]) {
5407 ostats[i] = X[t][0];
5408 } else {
5409 double a = (q - p) / X[t][1];
5410
5411 ostats[i] = a * X[t+1][0] + (1-a) * X[t][0];
5412 }
5413 }
5414
5415 doubles_array_free(X, n);
5416
5417 return err;
5418 }
5419
5420 /**
5421 * get_summary_weighted:
5422 * @list: list of variables to process.
5423 * @dset: dataset struct.
5424 * @wgt: series to use as weights.
5425 * @opt: may include OPT_S for "simple" version.
5426 * @prn: gretl printing struct.
5427 * @err: location to receive error code.
5428 *
5429 * Calculates descriptive summary statistics for the specified
5430 * variables, weighting the observations by @rv. The series @rv must
5431 * be of full length (dset->n).
5432 *
5433 * Returns: #Summary object containing the summary statistics, or NULL
5434 * on failure.
5435 */
5436
get_summary_weighted(const int * list,const DATASET * dset,int wtvar,gretlopt opt,PRN * prn,int * err)5437 Summary *get_summary_weighted (const int *list, const DATASET *dset,
5438 int wtvar, gretlopt opt,
5439 PRN *prn, int *err)
5440 {
5441 const double *wts;
5442 double *x, ostats[5];
5443 int t1 = dset->t1;
5444 int t2 = dset->t2;
5445 Summary *s;
5446 int i, t;
5447
5448 s = summary_new(list, wtvar, opt, err);
5449 if (s == NULL) {
5450 return NULL;
5451 }
5452
5453 x = malloc(dset->n * sizeof *x);
5454 if (x == NULL) {
5455 *err = E_ALLOC;
5456 free_summary(s);
5457 return NULL;
5458 }
5459
5460 wts = dset->Z[wtvar];
5461
5462 for (i=0; i<list[0]; i++) {
5463 double *pskew = NULL, *pkurt = NULL;
5464 int vi = s->list[i+1];
5465 int ni = 0;
5466 int ntot = 0;
5467 double wt;
5468
5469 /* prepare the series for weighting: substitute NAs for values
5470 at which the weights are invalid or zero
5471 */
5472 for (t=t1; t<=t2; t++) {
5473 wt = wts[t];
5474 if (!na(wt) && wt != 0.0) {
5475 ntot++;
5476 x[t] = dset->Z[vi][t];
5477 if (!na(x[t])) {
5478 ni++;
5479 }
5480 } else {
5481 x[t] = NADBL;
5482 }
5483 }
5484
5485 s->misscount[i] = ntot - ni;
5486
5487 if (ni > s->n) {
5488 s->n = ni;
5489 }
5490
5491 if (ni == 0) {
5492 pprintf(prn, _("Dropping %s: sample range contains no valid "
5493 "observations\n"), dset->varname[vi]);
5494 gretl_list_delete_at_pos(s->list, i + 1);
5495 if (s->list[0] == 0) {
5496 return s;
5497 } else {
5498 i--;
5499 continue;
5500 }
5501 }
5502
5503 if (opt & OPT_S) {
5504 s->skew[i] = NADBL;
5505 s->xkurt[i] = NADBL;
5506 s->cv[i] = NADBL;
5507 s->median[i] = NADBL;
5508 } else {
5509 pskew = &s->skew[i];
5510 pkurt = &s->xkurt[i];
5511 }
5512
5513 gretl_minmax(t1, t2, x, &s->low[i], &s->high[i]);
5514 gretl_moments(t1, t2, x, wts, &s->mean[i], &s->sd[i], pskew, pkurt, 1);
5515
5516 if (!(opt & OPT_S)) {
5517 int err;
5518
5519 if (floateq(s->mean[i], 0.0)) {
5520 s->cv[i] = NADBL;
5521 } else if (floateq(s->sd[i], 0.0)) {
5522 s->cv[i] = 0.0;
5523 } else {
5524 s->cv[i] = fabs(s->sd[i] / s->mean[i]);
5525 }
5526
5527 ostats[0] = 0.05;
5528 ostats[1] = 0.25;
5529 ostats[2] = 0.5;
5530 ostats[3] = 0.75;
5531 ostats[4] = 0.95;
5532
5533 err = weighted_order_stats(x, wts, ostats, ni, 5);
5534
5535 if (!err) {
5536 s->median[i] = ostats[2];
5537 s->perc05[i] = ostats[0];
5538 s->perc95[i] = ostats[4];
5539 if (!na(ostats[1]) && !na(ostats[3])) {
5540 s->iqr[i] = ostats[3] - ostats[1];
5541 }
5542 }
5543 }
5544 #if 0
5545 if (dataset_is_panel(dset) && list[0] == 1) {
5546 panel_variance_info(x, dset, s->mean[0], &s->sw, &s->sb);
5547 }
5548 #endif
5549 }
5550
5551 free(x);
5552
5553 return s;
5554 }
5555
5556 /* Get the additional statistics wanted if the --simple
5557 flag was not given to the summary command.
5558 */
5559
get_extra_stats(Summary * s,int i,int t1,int t2,const double * x)5560 static int get_extra_stats (Summary *s, int i,
5561 int t1, int t2,
5562 const double *x)
5563 {
5564 int err = 0;
5565
5566 if (floateq(s->mean[i], 0.0)) {
5567 s->cv[i] = NADBL;
5568 } else if (floateq(s->sd[i], 0.0)) {
5569 s->cv[i] = 0.0;
5570 } else {
5571 s->cv[i] = fabs(s->sd[i] / s->mean[i]);
5572 }
5573
5574 s->perc05[i] = gretl_quantile(t1, t2, x, 0.05, OPT_Q, &err);
5575 s->perc95[i] = gretl_quantile(t1, t2, x, 0.95, OPT_Q, &err);
5576 s->iqr[i] = gretl_quantile(t1, t2, x, 0.75, OPT_NONE, &err);
5577 if (!na(s->iqr[i])) {
5578 s->iqr[i] -= gretl_quantile(t1, t2, x, 0.25, OPT_NONE, &err);
5579 }
5580
5581 return err;
5582 }
5583
5584 /**
5585 * get_summary_restricted:
5586 * @list: list of variables to process.
5587 * @dset: dataset struct.
5588 * @rv: series to use as restriction dummy.
5589 * @opt: may include OPT_S for "simple" version.
5590 * @prn: gretl printing struct.
5591 * @err: location to receive error code.
5592 *
5593 * Calculates descriptive summary statistics for the specified variables,
5594 * with the observations restricted to those for which @rv has a non-zero
5595 * (and non-missing) value. The series @rv must be of full length (dset->n).
5596 *
5597 * Returns: #Summary object containing the summary statistics, or NULL
5598 * on failure.
5599 */
5600
get_summary_restricted(const int * list,const DATASET * dset,const double * rv,gretlopt opt,PRN * prn,int * err)5601 Summary *get_summary_restricted (const int *list, const DATASET *dset,
5602 const double *rv, gretlopt opt,
5603 PRN *prn, int *err)
5604 {
5605 int t1 = dset->t1;
5606 int t2 = dset->t2;
5607 Summary *s;
5608 double *x;
5609 int i, t;
5610
5611 s = summary_new(list, 0, opt, err);
5612 if (s == NULL) {
5613 return NULL;
5614 }
5615
5616 x = malloc(dset->n * sizeof *x);
5617 if (x == NULL) {
5618 *err = E_ALLOC;
5619 free_summary(s);
5620 return NULL;
5621 }
5622
5623 for (i=0; i<s->list[0]; i++) {
5624 double *pskew = NULL, *pkurt = NULL;
5625 int vi = s->list[i+1];
5626 int strvals;
5627 int ni = 0;
5628 int ntot = 0;
5629
5630 strvals = is_string_valued(dset, vi);
5631
5632 if (!strvals) {
5633 /* create the restricted series: substitute NAs
5634 for values at which the restriction dummy is
5635 invalid or zero
5636 */
5637 for (t=t1; t<=t2; t++) {
5638 if (!na(rv[t]) && rv[t] != 0.0) {
5639 ntot++;
5640 x[t] = dset->Z[vi][t];
5641 if (!na(x[t])) {
5642 ni++;
5643 }
5644 } else {
5645 x[t] = NADBL;
5646 }
5647 }
5648 s->misscount[i] = ntot - ni;
5649 if (ni > s->n) {
5650 s->n = ni;
5651 }
5652 }
5653
5654 if (ni == 0) {
5655 if (strvals) {
5656 pprintf(prn, _("Dropping %s: string-valued series\n"),
5657 dset->varname[vi]);
5658 } else {
5659 pprintf(prn, _("Dropping %s: sample range contains no valid "
5660 "observations\n"), dset->varname[vi]);
5661 }
5662 gretl_list_delete_at_pos(s->list, i + 1);
5663 if (s->list[0] == 0) {
5664 return s;
5665 } else {
5666 i--;
5667 continue;
5668 }
5669 }
5670
5671 if (opt & OPT_S) {
5672 s->skew[i] = NADBL;
5673 s->xkurt[i] = NADBL;
5674 s->cv[i] = NADBL;
5675 } else {
5676 pskew = &s->skew[i];
5677 pkurt = &s->xkurt[i];
5678 }
5679
5680 gretl_minmax(t1, t2, x, &s->low[i], &s->high[i]);
5681 gretl_moments(t1, t2, x, NULL, &s->mean[i], &s->sd[i], pskew, pkurt, 1);
5682 s->median[i] = gretl_median(t1, t2, x);
5683
5684 if (!(opt & OPT_S)) {
5685 *err = get_extra_stats(s, i, t1, t2, x);
5686 }
5687
5688 if (dataset_is_panel(dset) && list[0] == 1) {
5689 panel_variance_info(x, dset, s->mean[0], &s->sw, &s->sb);
5690 }
5691 }
5692
5693 free(x);
5694
5695 return s;
5696 }
5697
5698 /**
5699 * get_summary:
5700 * @list: list of variables to process.
5701 * @dset: dataset struct.
5702 * @opt: may include OPT_S for "simple" version.
5703 * @prn: gretl printing struct.
5704 * @err: location to receive error code.
5705 *
5706 * Calculates descriptive summary statistics for the specified variables.
5707 *
5708 * Returns: #Summary object containing the summary statistics, or NULL
5709 * on failure.
5710 */
5711
get_summary(const int * list,const DATASET * dset,gretlopt opt,PRN * prn,int * err)5712 Summary *get_summary (const int *list, const DATASET *dset,
5713 gretlopt opt, PRN *prn, int *err)
5714 {
5715 int t1 = dset->t1;
5716 int t2 = dset->t2;
5717 Summary *s;
5718 int i, nmax;
5719
5720 s = summary_new(list, 0, opt, err);
5721 if (s == NULL) {
5722 return NULL;
5723 }
5724
5725 nmax = sample_size(dset);
5726
5727 for (i=0; i<s->list[0]; i++) {
5728 double *pskew = NULL, *pkurt = NULL;
5729 const double *x;
5730 double x0;
5731 int vi = s->list[i+1];
5732 int strvals;
5733 int ni = 0;
5734
5735 strvals = is_string_valued(dset, vi);
5736 if (!strvals) {
5737 x = dset->Z[vi];
5738 ni = good_obs(x + t1, nmax, &x0);
5739 s->misscount[i] = nmax - ni;
5740 if (ni > s->n) {
5741 s->n = ni;
5742 }
5743 }
5744 if (ni == 0) {
5745 if (strvals) {
5746 pprintf(prn, _("Dropping %s: string-valued series\n"),
5747 dset->varname[vi]);
5748 } else {
5749 pprintf(prn, _("Dropping %s: sample range contains no valid "
5750 "observations\n"), dset->varname[vi]);
5751 }
5752 gretl_list_delete_at_pos(s->list, i + 1);
5753 if (s->list[0] == 0) {
5754 return s;
5755 } else {
5756 i--;
5757 continue;
5758 }
5759 }
5760
5761 if (opt & OPT_S) {
5762 s->skew[i] = NADBL;
5763 s->xkurt[i] = NADBL;
5764 s->cv[i] = NADBL;
5765 s->median[i] = NADBL;
5766 } else {
5767 pskew = &s->skew[i];
5768 pkurt = &s->xkurt[i];
5769 }
5770
5771 gretl_minmax(t1, t2, x, &s->low[i], &s->high[i]);
5772
5773 if (s->weight_var == 0) {
5774 gretl_moments(t1, t2, x, NULL, &s->mean[i], &s->sd[i],
5775 pskew, pkurt, 1);
5776 } else {
5777 gretl_moments(t1, t2, x, dset->Z[s->weight_var], &s->mean[i], &s->sd[i],
5778 pskew, pkurt, 0);
5779 }
5780
5781 /* included in both simple and full variants */
5782 s->median[i] = gretl_median(t1, t2, x);
5783
5784 if (!(opt & OPT_S)) {
5785 *err = get_extra_stats(s, i, t1, t2, x);
5786 }
5787
5788 if (dataset_is_panel(dset) && list[0] == 1) {
5789 panel_variance_info(x, dset, s->mean[0], &s->sw, &s->sb);
5790 }
5791 }
5792
5793 return s;
5794 }
5795
record_summary(Summary * summ,const DATASET * dset)5796 static void record_summary (Summary *summ, const DATASET *dset)
5797 {
5798 gretl_matrix *m = NULL;
5799 char **Sc, **Sr;
5800 int nv = summ->list[0];
5801 int i, vi, cols;
5802
5803 cols = (summ->opt & OPT_S)? 5 : 12;
5804 m = gretl_matrix_alloc(nv, cols);
5805 if (m == NULL) {
5806 return;
5807 }
5808
5809 Sc = strings_array_new(cols);
5810 Sr = strings_array_new(nv);
5811
5812 if (summ->opt & OPT_S) {
5813 /* the "simple" option */
5814 const char *h[] = {
5815 N_("Mean"),
5816 N_("Median"),
5817 N_("S.D."),
5818 N_("Min"),
5819 N_("Max"),
5820 };
5821
5822 if (Sc != NULL) {
5823 for (i=0; i<cols; i++) {
5824 Sc[i] = gretl_strdup(_(h[i]));
5825 }
5826 }
5827 for (i=0; i<nv; i++) {
5828 if (Sr != NULL) {
5829 vi = summ->list[i+1];
5830 Sr[i] = gretl_strdup(dset->varname[vi]);
5831 }
5832 gretl_matrix_set(m, i, 0, summ->mean[i]);
5833 gretl_matrix_set(m, i, 1, summ->median[i]);
5834 gretl_matrix_set(m, i, 2, summ->sd[i]);
5835 gretl_matrix_set(m, i, 3, summ->low[i]);
5836 gretl_matrix_set(m, i, 4, summ->high[i]);
5837 }
5838 } else {
5839 /* record all available stats */
5840 const char *h[] = {
5841 N_("Mean"),
5842 N_("Median"),
5843 N_("Minimum"),
5844 N_("Maximum"),
5845 N_("Std. Dev."),
5846 N_("C.V."),
5847 N_("Skewness"),
5848 N_("Ex. kurtosis"),
5849 /* xgettext:no-c-format */
5850 N_("5% perc."),
5851 /* xgettext:no-c-format */
5852 N_("95% perc."),
5853 N_("IQ range"),
5854 N_("Missing obs.")
5855 };
5856 double cv;
5857
5858 if (Sc != NULL) {
5859 for (i=0; i<cols; i++) {
5860 Sc[i] = gretl_strdup(_(h[i]));
5861 }
5862 }
5863 for (i=0; i<nv; i++) {
5864 if (Sr != NULL) {
5865 vi = summ->list[i+1];
5866 Sr[i] = gretl_strdup(dset->varname[vi]);
5867 }
5868 gretl_matrix_set(m, i, 0, summ->mean[i]);
5869 gretl_matrix_set(m, i, 1, summ->median[i]);
5870 gretl_matrix_set(m, i, 2, summ->low[i]);
5871 gretl_matrix_set(m, i, 3, summ->high[i]);
5872 gretl_matrix_set(m, i, 4, summ->sd[i]);
5873 if (floateq(summ->mean[i], 0.0)) {
5874 cv = NADBL;
5875 } else if (floateq(summ->sd[i], 0.0)) {
5876 cv = 0.0;
5877 } else {
5878 cv = fabs(summ->sd[i] / summ->mean[i]);
5879 }
5880 gretl_matrix_set(m, i, 5, cv);
5881 gretl_matrix_set(m, i, 6, summ->skew[i]);
5882 gretl_matrix_set(m, i, 7, summ->xkurt[i]);
5883 gretl_matrix_set(m, i, 8, summ->perc05[i]);
5884 gretl_matrix_set(m, i, 9, summ->perc95[i]);
5885 gretl_matrix_set(m, i, 10, summ->iqr[i]);
5886 gretl_matrix_set(m, i, 11, summ->misscount[i]);
5887 }
5888 }
5889
5890 gretl_matrix_set_colnames(m, Sc);
5891 gretl_matrix_set_rownames(m, Sr);
5892 set_last_result_data(m, GRETL_TYPE_MATRIX);
5893 }
5894
5895 /**
5896 * list_summary:
5897 * @list: list of series to process.
5898 * @dset: dataset struct.
5899 * @opt: may include %OPT_S for "simple" version.
5900 * @prn: gretl printing struct.
5901 *
5902 * Prints descriptive statistics for the listed variables.
5903 *
5904 * Returns: 0 on success, non-zero code on error.
5905 */
5906
list_summary(const int * list,int wgtvar,const DATASET * dset,gretlopt opt,PRN * prn)5907 int list_summary (const int *list, int wgtvar,
5908 const DATASET *dset,
5909 gretlopt opt, PRN *prn)
5910 {
5911 Summary *summ = NULL;
5912 int err = 0;
5913
5914 if (list != NULL) {
5915 if (list[0] == 0) {
5916 return 0;
5917 }
5918 if (wgtvar == 0) {
5919 /* no weights */
5920 summ = get_summary(list, dset, opt, prn, &err);
5921 } else {
5922 summ = get_summary_weighted(list, dset, wgtvar,
5923 opt, prn, &err);
5924 }
5925 } else {
5926 int *tmplist = full_var_list(dset, NULL);
5927
5928 if (tmplist != NULL) {
5929 if (wgtvar == 0) {
5930 /* no weights */
5931 summ = get_summary(tmplist, dset, opt, prn, &err);
5932 } else {
5933 summ = get_summary_weighted(tmplist, dset, wgtvar,
5934 opt, prn, &err);
5935 }
5936 free(tmplist);
5937 }
5938 }
5939
5940 if (summ != NULL) {
5941 if (!(opt & OPT_Q)) {
5942 print_summary(summ, dset, prn);
5943 }
5944 record_summary(summ, dset);
5945 free_summary(summ);
5946 }
5947
5948 return err;
5949 }
5950
5951 /**
5952 * vmatrix_new:
5953 *
5954 * Returns: an allocated and initialized #VMatrix, or
5955 * %NULL on failure.
5956 */
5957
vmatrix_new(void)5958 VMatrix *vmatrix_new (void)
5959 {
5960 VMatrix *v = malloc(sizeof *v);
5961
5962 if (v != NULL) {
5963 v->vec = NULL;
5964 v->xbar = NULL;
5965 v->ssx = NULL;
5966 v->list = NULL;
5967 v->names = NULL;
5968 v->ci = 0;
5969 v->dim = 0;
5970 v->t1 = 0;
5971 v->t2 = 0;
5972 v->n = 0;
5973 v->missing = 0;
5974 }
5975
5976 return v;
5977 }
5978
5979 /**
5980 * free_vmatrix:
5981 * @vmat: pointer to gretl correlation matrix struct
5982 *
5983 * Frees all resources associated with @vmat, and
5984 * the pointer itself.
5985 */
5986
free_vmatrix(VMatrix * vmat)5987 void free_vmatrix (VMatrix *vmat)
5988 {
5989 if (vmat != NULL) {
5990 if (vmat->names != NULL) {
5991 strings_array_free(vmat->names, vmat->dim);
5992 }
5993 if (vmat->vec != NULL) {
5994 free(vmat->vec);
5995 }
5996 if (vmat->xbar != NULL) {
5997 free(vmat->xbar);
5998 }
5999 if (vmat->ssx != NULL) {
6000 free(vmat->ssx);
6001 }
6002 if (vmat->list != NULL) {
6003 free(vmat->list);
6004 }
6005 free(vmat);
6006 }
6007 }
6008
6009 enum {
6010 CORRMAT = 0,
6011 COVMAT
6012 };
6013
6014 /* Compute correlation or covariance matrix, using the maximum
6015 available sample for each coefficient.
6016 */
6017
max_corrcov_matrix(VMatrix * v,const DATASET * dset,int flag)6018 static int max_corrcov_matrix (VMatrix *v, const DATASET *dset,
6019 int flag)
6020 {
6021 double **Z = dset->Z;
6022 int i, j, vi, vj, nij;
6023 int nmin, nmax = 0;
6024 int m = v->dim;
6025 int nmiss;
6026
6027 nmin = v->n = v->t2 - v->t1 + 1;
6028
6029 for (i=0; i<m; i++) {
6030 vi = v->list[i+1];
6031 for (j=i; j<m; j++) {
6032 vj = v->list[j+1];
6033 nij = ijton(i, j, m);
6034 if (i == j && flag == CORRMAT) {
6035 v->vec[nij] = 1.0;
6036 } else {
6037 nmiss = 0;
6038 if (flag == COVMAT) {
6039 v->vec[nij] = gretl_covar(v->t1, v->t2, Z[vi], Z[vj],
6040 &nmiss);
6041 } else {
6042 v->vec[nij] = gretl_corr(v->t1, v->t2, Z[vi], Z[vj],
6043 &nmiss);
6044 }
6045 if (nmiss > 0) {
6046 int n = v->n - nmiss;
6047
6048 if (n < nmin && n > 0) {
6049 nmin = n;
6050 }
6051 if (n > nmax) {
6052 nmax = n;
6053 }
6054 v->missing = 1;
6055 } else {
6056 nmax = v->n;
6057 }
6058 }
6059 }
6060 }
6061
6062 /* We'll record an "n" value if there's something resembling
6063 a common number of observations across the coefficients.
6064 */
6065
6066 if (v->missing) {
6067 v->n = 0;
6068 if (nmax > 0) {
6069 double d = (nmax - nmin) / (double) nmax;
6070
6071 if (d < 0.10) {
6072 v->n = nmin;
6073 }
6074 }
6075 }
6076
6077 return 0;
6078 }
6079
6080 /* Compute correlation or covariance matrix, ensuring we use the same
6081 sample for all coefficients. We may be doing this in the context of
6082 the "corr" command or in the context of "pca". In the latter case
6083 we want to save the "xbar" and "ssx" values for later use when
6084 calculating the component series.
6085 */
6086
uniform_corrcov_matrix(VMatrix * v,const DATASET * dset,int ci,int flag)6087 static int uniform_corrcov_matrix (VMatrix *v, const DATASET *dset,
6088 int ci, int flag)
6089 {
6090 double **Z = dset->Z;
6091 double *xbar = NULL, *ssx = NULL;
6092 double *x;
6093 int m = v->dim;
6094 int mm = (m * (m + 1)) / 2;
6095 double d1, d2;
6096 int i, j, jmin, nij, t;
6097 int miss, n = 0;
6098
6099 xbar = malloc(m * sizeof *xbar);
6100 if (xbar == NULL) {
6101 return E_ALLOC;
6102 }
6103
6104 if (ci == PCA || flag == CORRMAT) {
6105 ssx = malloc(m * sizeof *ssx);
6106 if (ssx == NULL) {
6107 free(xbar);
6108 return E_ALLOC;
6109 }
6110 }
6111
6112 for (i=0; i<m; i++) {
6113 xbar[i] = 0.0;
6114 }
6115
6116 if (ssx != NULL) {
6117 for (i=0; i<m; i++) {
6118 ssx[i] = 0.0;
6119 }
6120 }
6121
6122 v->missing = 0;
6123
6124 /* first pass: get sample size and sums */
6125
6126 for (t=v->t1; t<=v->t2; t++) {
6127 miss = 0;
6128 for (i=0; i<m; i++) {
6129 if (na(Z[v->list[i+1]][t])) {
6130 miss = 1;
6131 v->missing += 1;
6132 break;
6133 }
6134 }
6135 if (!miss) {
6136 n++;
6137 for (i=0; i<m; i++) {
6138 xbar[i] += Z[v->list[i+1]][t];
6139 }
6140 }
6141 }
6142
6143 if (n < 2) {
6144 free(xbar);
6145 free(ssx);
6146 return E_MISSDATA;
6147 }
6148
6149 for (i=0; i<m; i++) {
6150 xbar[i] /= n;
6151 }
6152
6153 for (i=0; i<mm; i++) {
6154 v->vec[i] = 0.0;
6155 }
6156
6157 /* second pass: get deviations from means and cumulate */
6158
6159 for (t=v->t1; t<=v->t2; t++) {
6160 miss = 0;
6161 for (i=0; i<m; i++) {
6162 x = Z[v->list[i+1]];
6163 if (na(x[t])) {
6164 miss = 1;
6165 break;
6166 }
6167 }
6168 if (!miss) {
6169 for (i=0; i<m; i++) {
6170 x = Z[v->list[i+1]];
6171 d1 = x[t] - xbar[i];
6172 if (ssx != NULL) {
6173 ssx[i] += d1 * d1;
6174 }
6175 jmin = (flag == COVMAT)? i : i + 1;
6176 for (j=jmin; j<m; j++) {
6177 x = Z[v->list[j+1]];
6178 nij = ijton(i, j, m);
6179 d2 = x[t] - xbar[j];
6180 v->vec[nij] += d1 * d2;
6181 }
6182 }
6183 }
6184 }
6185
6186 /* finalize: compute correlations or covariances */
6187
6188 if (flag == CORRMAT) {
6189 /* correlations */
6190 for (i=0; i<m; i++) {
6191 for (j=i; j<m; j++) {
6192 nij = ijton(i, j, m);
6193 if (i == j) {
6194 v->vec[nij] = 1.0;
6195 } else if (ssx[i] == 0.0 || ssx[j] == 0.0) {
6196 v->vec[nij] = NADBL;
6197 } else {
6198 v->vec[nij] /= sqrt(ssx[i] * ssx[j]);
6199 }
6200 }
6201 }
6202 } else {
6203 /* covariances */
6204 for (i=0; i<mm; i++) {
6205 v->vec[i] /= (n - 1);
6206 }
6207 }
6208
6209 v->n = n;
6210
6211 if (ci == PCA) {
6212 v->xbar = xbar;
6213 v->ssx = ssx;
6214 } else {
6215 free(xbar);
6216 free(ssx);
6217 }
6218
6219 return 0;
6220 }
6221
6222 /**
6223 * corrlist:
6224 * @ci: command index.
6225 * @list: list of variables to process, by ID number.
6226 * @dset: dataset struct.
6227 * @opt: option flags.
6228 * @err: location to receive error code.
6229 *
6230 * Computes pairwise correlation coefficients for the variables
6231 * specified in @list, skipping any constants. If the option
6232 * flags contain OPT_N, a uniform sample is ensured: only those
6233 * observations for which all the listed variables have valid
6234 * values are used. If OPT_C is included, we actually calculate
6235 * covariances rather than correlations.
6236 *
6237 * Returns: gretl correlation matrix struct, or NULL on failure.
6238 */
6239
corrlist(int ci,int * list,const DATASET * dset,gretlopt opt,int * err)6240 VMatrix *corrlist (int ci, int *list, const DATASET *dset,
6241 gretlopt opt, int *err)
6242 {
6243 int flag = (opt & OPT_C)? COVMAT : CORRMAT;
6244 VMatrix *v;
6245 int i, m, mm;
6246 int nmiss = 0;
6247
6248 if (list == NULL) {
6249 fprintf(stderr, "corrlist: list is NULL\n");
6250 *err = E_DATA;
6251 return NULL;
6252 }
6253
6254 v = vmatrix_new();
6255 if (v == NULL) {
6256 *err = E_ALLOC;
6257 return NULL;
6258 }
6259
6260 /* drop any constants from list */
6261 for (i=list[0]; i>0; i--) {
6262 if (gretl_isconst(dset->t1, dset->t2, dset->Z[list[i]])) {
6263 gretl_list_delete_at_pos(list, i);
6264 }
6265 }
6266
6267 if (list[0] < 2) {
6268 gretl_errmsg_set(_("corr: needs at least two non-constant arguments"));
6269 *err = E_DATA;
6270 goto bailout;
6271 }
6272
6273 v->t1 = dset->t1;
6274 v->t2 = dset->t2;
6275
6276 /* adjust sample range if need be */
6277 list_adjust_sample(list, &v->t1, &v->t2, dset, &nmiss);
6278
6279 if (v->t2 - v->t1 + 1 < 3) {
6280 *err = E_TOOFEW;
6281 goto bailout;
6282 }
6283
6284 v->dim = m = list[0];
6285 mm = (m * (m + 1)) / 2;
6286
6287 v->names = strings_array_new(m);
6288 v->vec = malloc(mm * sizeof *v->vec);
6289 v->list = gretl_list_copy(list);
6290
6291 if (v->names == NULL || v->vec == NULL || v->list == NULL) {
6292 *err = E_ALLOC;
6293 goto bailout;
6294 }
6295
6296 if (ci == PCA) {
6297 opt |= OPT_N;
6298 }
6299
6300 if (opt & OPT_N) {
6301 /* impose uniform sample size */
6302 *err = uniform_corrcov_matrix(v, dset, ci, flag);
6303 } else {
6304 /* sample sizes may differ */
6305 *err = max_corrcov_matrix(v, dset, flag);
6306 }
6307
6308 if (!*err) {
6309 for (i=0; i<m; i++) {
6310 v->names[i] = gretl_strdup(dset->varname[list[i+1]]);
6311 if (v->names[i] == NULL) {
6312 *err = E_ALLOC;
6313 goto bailout;
6314 }
6315 }
6316 }
6317
6318 v->ci = (flag == CORRMAT)? CORR : 0; /* FIXME? */
6319
6320 bailout:
6321
6322 if (*err) {
6323 free_vmatrix(v);
6324 v = NULL;
6325 }
6326
6327 return v;
6328 }
6329
printcorr(const VMatrix * v,PRN * prn)6330 static void printcorr (const VMatrix *v, PRN *prn)
6331 {
6332 double r = v->vec[1];
6333
6334 pprintf(prn, "\ncorr(%s, %s)", v->names[0], v->names[1]);
6335
6336 if (na(r)) {
6337 pprintf(prn, ": %s\n\n", _("undefined"));
6338 } else {
6339 pprintf(prn, " = %.8f\n", r);
6340 if (fabs(r) < 1.0) {
6341 int n2 = v->n - 2;
6342 double tval = r * sqrt(n2 / (1 - r*r));
6343
6344 pputs(prn, _("Under the null hypothesis of no correlation:\n "));
6345 pprintf(prn, _("t(%d) = %g, with two-tailed p-value %.4f\n"), n2,
6346 tval, student_pvalue_2(n2, tval));
6347 pputc(prn, '\n');
6348 } else {
6349 pprintf(prn, _("5%% critical value (two-tailed) = "
6350 "%.4f for n = %d"), rhocrit95(v->n), v->n);
6351 pputs(prn, "\n\n");
6352 }
6353 }
6354 }
6355
6356 /**
6357 * print_corrmat:
6358 * @corr: gretl correlation matrix.
6359 * @dset: dataset information.
6360 * @prn: gretl printing struct.
6361 *
6362 * Prints a gretl correlation matrix to @prn.
6363 */
6364
print_corrmat(VMatrix * corr,const DATASET * dset,PRN * prn)6365 void print_corrmat (VMatrix *corr, const DATASET *dset, PRN *prn)
6366 {
6367 if (corr->dim == 2) {
6368 printcorr(corr, prn);
6369 } else {
6370 char date1[OBSLEN], date2[OBSLEN];
6371 gchar *tmp = NULL;
6372
6373 ntolabel(date1, corr->t1, dset);
6374 ntolabel(date2, corr->t2, dset);
6375
6376 pputc(prn, '\n');
6377
6378 tmp = g_strdup_printf(_("%s, using the observations %s - %s"),
6379 _("Correlation Coefficients"), date1, date2);
6380 output_line(tmp, prn, 0);
6381 g_free(tmp);
6382
6383 if (corr->missing) {
6384 output_line(_("(missing values were skipped)"), prn, 1);
6385 }
6386
6387 if (corr->n > 0) {
6388 tmp = g_strdup_printf(_("5%% critical value (two-tailed) = "
6389 "%.4f for n = %d"), rhocrit95(corr->n),
6390 corr->n);
6391 output_line(tmp, prn, 1);
6392 g_free(tmp);
6393 }
6394
6395 text_print_vmatrix(corr, prn);
6396 }
6397 }
6398
record_corr_matrix(VMatrix * c)6399 static void record_corr_matrix (VMatrix *c)
6400 {
6401 gretl_matrix *m = NULL;
6402 int i, j, k, n = c->dim;
6403 double cij;
6404
6405 m = gretl_matrix_alloc(n, n);
6406
6407 if (m != NULL) {
6408 k = 0;
6409 for (i=0; i<n; i++) {
6410 for (j=i; j<n; j++) {
6411 cij = c->vec[k];
6412 gretl_matrix_set(m, i, j, cij);
6413 gretl_matrix_set(m, j, i, cij);
6414 k++;
6415 }
6416 }
6417
6418 if (c->names) {
6419 char **rlab = strings_array_new(n);
6420 char **clab = strings_array_new(n);
6421
6422 if (rlab != NULL && clab != NULL) {
6423 for (i=0; i<n; i++) {
6424 rlab[i] = gretl_strdup(c->names[i]);
6425 clab[i] = gretl_strdup(c->names[i]);
6426 }
6427 gretl_matrix_set_rownames(m, rlab);
6428 gretl_matrix_set_colnames(m, clab);
6429 }
6430 }
6431
6432 set_last_result_data(m, GRETL_TYPE_MATRIX);
6433 }
6434 }
6435
6436 /**
6437 * gretl_corrmx:
6438 * @list: gives the ID numbers of the variables to process.
6439 * @dset: dataset struct.
6440 * @opt: option flags: OPT_N means use uniform sample size,
6441 * OPT_U controls plotting options.
6442 * @prn: gretl printing struct.
6443 *
6444 * Computes and prints the correlation matrix for the specified list
6445 * of variables.
6446 *
6447 * Returns: 0 on successful completion, 1 on error.
6448 */
6449
gretl_corrmx(int * list,const DATASET * dset,gretlopt opt,PRN * prn)6450 int gretl_corrmx (int *list, const DATASET *dset,
6451 gretlopt opt, PRN *prn)
6452 {
6453 VMatrix *corr = NULL;
6454 int err = 0;
6455
6456 if (list != NULL) {
6457 if (list[0] == 0) {
6458 return 0;
6459 } else {
6460 corr = corrlist(CORR, list, dset, opt, &err);
6461 }
6462 } else {
6463 int *tmplist = full_var_list(dset, NULL);
6464
6465 if (tmplist != NULL) {
6466 corr = corrlist(CORR, tmplist, dset, opt, &err);
6467 free(tmplist);
6468 }
6469 }
6470
6471 if (corr != NULL) {
6472 if (!(opt & OPT_Q)) {
6473 print_corrmat(corr, dset, prn);
6474 }
6475 if (corr->dim > 2 && gnuplot_graph_wanted(PLOT_HEATMAP, opt)) {
6476 err = plot_corrmat(corr, opt);
6477 }
6478
6479 record_corr_matrix(corr);
6480 free_vmatrix(corr);
6481 }
6482
6483 return err;
6484 }
6485
6486 /*
6487 Satterthwaite, F.E. (1946). "An Approximate Distribution of Estimates
6488 of Variance Components". Biometrics Bulletin, 2, 6, pp. 110–114.
6489
6490 v = (s^2_1/n_1 + s^2_2/n_2)^2 /
6491 [(s^_1/n1)^2 / (n_1 - 1) + (s^2_2/n_2)^2 / (n_2 - 1)]
6492 */
6493
satterthwaite_df(double v1,int n1,double v2,int n2)6494 int satterthwaite_df (double v1, int n1,
6495 double v2, int n2)
6496 {
6497 double rtnum = v1 / n1 + v2 / n2;
6498 double d1 = (v1/n1) * (v1/n1) / (n1 - 1);
6499 double d2 = (v2/n2) * (v2/n2) / (n2 - 1);
6500 double v = rtnum * rtnum / (d1 + d2);
6501
6502 return (int) floor(v);
6503 }
6504
6505 /**
6506 * means_test:
6507 * @list: gives the ID numbers of the variables to compare.
6508 * @dset: dataset struct.
6509 * @opt: if OPT_O, assume population variances are different.
6510 * @prn: gretl printing struct.
6511 *
6512 * Carries out test of the null hypothesis that the means of two
6513 * variables are equal.
6514 *
6515 * Returns: 0 on successful completion, error code on error.
6516 */
6517
means_test(const int * list,const DATASET * dset,gretlopt opt,PRN * prn)6518 int means_test (const int *list, const DATASET *dset,
6519 gretlopt opt, PRN *prn)
6520 {
6521 double m1, m2, s1, s2, skew, kurt, se, mdiff, t, pval;
6522 double v1, v2;
6523 double *x = NULL, *y = NULL;
6524 int vardiff = (opt & OPT_O);
6525 int df, n1, n2, n = dset->n;
6526
6527 if (list[0] < 2) {
6528 return E_ARGS;
6529 }
6530
6531 x = malloc(n * sizeof *x);
6532 if (x == NULL) {
6533 return E_ALLOC;
6534 }
6535
6536 y = malloc(n * sizeof *y);
6537 if (y == NULL) {
6538 free(x);
6539 return E_ALLOC;
6540 }
6541
6542 n1 = transcribe_array(x, dset->Z[list[1]], dset);
6543 n2 = transcribe_array(y, dset->Z[list[2]], dset);
6544
6545 if (n1 == 0 || n2 == 0) {
6546 pputs(prn, _("Sample range has no valid observations."));
6547 free(x); free(y);
6548 return 1;
6549 }
6550
6551 if (n1 == 1 || n2 == 1) {
6552 pputs(prn, _("Sample range has only one observation."));
6553 free(x); free(y);
6554 return 1;
6555 }
6556
6557 df = n1 + n2 - 2;
6558
6559 gretl_moments(0, n1-1, x, NULL, &m1, &s1, &skew, &kurt, 1);
6560 gretl_moments(0, n2-1, y, NULL, &m2, &s2, &skew, &kurt, 1);
6561 mdiff = m1 - m2;
6562
6563 v1 = s1 * s1;
6564 v2 = s2 * s2;
6565
6566 if (vardiff) {
6567 /* do not assume the variances are equal */
6568 se = sqrt((v1 / n1) + (v2 / n2));
6569 df = satterthwaite_df(v1, n1, v2, n2);
6570 } else {
6571 /* form pooled estimate of variance */
6572 double s2p;
6573
6574 s2p = ((n1-1) * v1 + (n2-1) * v2) / df;
6575 se = sqrt(s2p / n1 + s2p / n2);
6576 df = n1 + n2 - 2;
6577 }
6578
6579 t = mdiff / se;
6580 pval = student_pvalue_2(df, t);
6581
6582 pprintf(prn, _("\nEquality of means test "
6583 "(assuming %s variances)\n\n"), (vardiff)? _("unequal") : _("equal"));
6584 pprintf(prn, " %s: ", dset->varname[list[1]]);
6585 pprintf(prn, _("Number of observations = %d\n"), n1);
6586 pprintf(prn, " %s: ", dset->varname[list[2]]);
6587 pprintf(prn, _("Number of observations = %d\n"), n2);
6588 pprintf(prn, _(" Difference between sample means = %g - %g = %g\n"),
6589 m1, m2, mdiff);
6590 pputs(prn, _(" Null hypothesis: The two population means are the same.\n"));
6591 pprintf(prn, _(" Estimated standard error = %g\n"), se);
6592 pprintf(prn, _(" Test statistic: t(%d) = %g\n"), df, t);
6593 pprintf(prn, _(" p-value (two-tailed) = %g\n\n"), pval);
6594 if (pval > .10)
6595 pputs(prn, _(" The difference is not statistically significant.\n\n"));
6596
6597 record_test_result(t, pval);
6598
6599 free(x);
6600 free(y);
6601
6602 return 0;
6603 }
6604
6605 /**
6606 * vars_test:
6607 * @list: gives the ID numbers of the variables to compare.
6608 * @dset: dataset struct.
6609 * @prn: gretl printing struct.
6610 *
6611 * Carries out test of the null hypothesis that the variances of two
6612 * variables are equal.
6613 *
6614 * Returns: 0 on successful completion, error code on error.
6615 */
6616
vars_test(const int * list,const DATASET * dset,PRN * prn)6617 int vars_test (const int *list, const DATASET *dset, PRN *prn)
6618 {
6619 double m, s1, s2, var1, var2;
6620 double F, pval;
6621 double *x = NULL, *y = NULL;
6622 int dfn, dfd, n1, n2, n = dset->n;
6623
6624 if (list[0] < 2) return E_ARGS;
6625
6626 x = malloc(n * sizeof *x);
6627 y = malloc(n * sizeof *y);
6628
6629 if (x == NULL || y == NULL) {
6630 free(x);
6631 free(y);
6632 return E_ALLOC;
6633 }
6634
6635 n1 = transcribe_array(x, dset->Z[list[1]], dset);
6636 n2 = transcribe_array(y, dset->Z[list[2]], dset);
6637
6638 if (n1 == 0 || n2 == 0) {
6639 pputs(prn, _("Sample range has no valid observations."));
6640 free(x); free(y);
6641 return 1;
6642 }
6643
6644 if (n1 == 1 || n2 == 1) {
6645 pputs(prn, _("Sample range has only one observation."));
6646 free(x); free(y);
6647 return 1;
6648 }
6649
6650 gretl_moments(0, n1-1, x, NULL, &m, &s1, NULL, NULL, 1);
6651 gretl_moments(0, n2-1, y, NULL, &m, &s2, NULL, NULL, 1);
6652
6653 var1 = s1*s1;
6654 var2 = s2*s2;
6655 if (var1 > var2) {
6656 F = var1 / var2;
6657 dfn = n1 - 1;
6658 dfd = n2 - 1;
6659 } else {
6660 F = var2 / var1;
6661 dfn = n2 - 1;
6662 dfd = n1 - 1;
6663 }
6664
6665 pval = snedecor_cdf_comp(dfn, dfd, F);
6666
6667 pputs(prn, _("\nEquality of variances test\n\n"));
6668 pprintf(prn, " %s: ", dset->varname[list[1]]);
6669 pprintf(prn, _("Number of observations = %d\n"), n1);
6670 pprintf(prn, " %s: ", dset->varname[list[2]]);
6671 pprintf(prn, _("Number of observations = %d\n"), n2);
6672 pprintf(prn, _(" Ratio of sample variances = %g\n"), F);
6673 pprintf(prn, " %s: %s\n", _("Null hypothesis"),
6674 _("The two population variances are equal"));
6675 pprintf(prn, " %s: F(%d,%d) = %g\n", _("Test statistic"), dfn, dfd, F);
6676 pprintf(prn, _(" p-value (two-tailed) = %g\n\n"), pval);
6677 if (snedecor_cdf_comp(dfn, dfd, F) > .10)
6678 pputs(prn, _(" The difference is not statistically significant.\n\n"));
6679
6680 record_test_result(F, pval);
6681
6682 free(x);
6683 free(y);
6684
6685 return 0;
6686 }
6687
6688 struct MahalDist_ {
6689 int *list;
6690 int n;
6691 double *d;
6692 };
6693
mahal_dist_get_distances(const MahalDist * md)6694 const double *mahal_dist_get_distances (const MahalDist *md)
6695 {
6696 return md->d;
6697 }
6698
mahal_dist_get_n(const MahalDist * md)6699 int mahal_dist_get_n (const MahalDist *md)
6700 {
6701 return md->n;
6702 }
6703
mahal_dist_get_varlist(const MahalDist * md)6704 const int *mahal_dist_get_varlist(const MahalDist *md)
6705 {
6706 return md->list;
6707 }
6708
free_mahal_dist(MahalDist * md)6709 void free_mahal_dist (MahalDist *md)
6710 {
6711 free(md->list);
6712 free(md->d);
6713 free(md);
6714 }
6715
mahal_dist_new(const int * list,int n)6716 static MahalDist *mahal_dist_new (const int *list, int n)
6717 {
6718 MahalDist *md = malloc(sizeof *md);
6719
6720 if (md != NULL) {
6721 md->d = malloc(n * sizeof *md->d);
6722 if (md->d == NULL) {
6723 free(md);
6724 md = NULL;
6725 } else {
6726 md->list = gretl_list_copy(list);
6727 if (md->list == NULL) {
6728 free(md->d);
6729 free(md);
6730 md = NULL;
6731 } else {
6732 md->n = n;
6733 }
6734 }
6735 }
6736
6737 if (md != NULL) {
6738 int t;
6739
6740 for (t=0; t<n; t++) {
6741 md->d[t] = NADBL;
6742 }
6743 }
6744
6745 return md;
6746 }
6747
mdist_saver(DATASET * dset)6748 static int mdist_saver (DATASET *dset)
6749 {
6750 int t, v, err;
6751
6752 err = dataset_add_series(dset, 1);
6753
6754 if (err) {
6755 v = 0;
6756 } else {
6757 v = dset->v - 1;
6758
6759 for (t=0; t<dset->n; t++) {
6760 dset->Z[v][t] = NADBL;
6761 }
6762
6763 strcpy(dset->varname[v], "mdist");
6764 make_varname_unique(dset->varname[v], v, dset);
6765 series_set_label(dset, v, _("Mahalanobis distances"));
6766 }
6767
6768 return v;
6769 }
6770
6771 static int
real_mahalanobis_distance(const int * list,DATASET * dset,gretlopt opt,MahalDist * md,PRN * prn)6772 real_mahalanobis_distance (const int *list, DATASET *dset,
6773 gretlopt opt, MahalDist *md,
6774 PRN *prn)
6775 {
6776 gretl_matrix *S = NULL;
6777 gretl_vector *means = NULL;
6778 gretl_vector *xdiff;
6779 int orig_t1 = dset->t1;
6780 int orig_t2 = dset->t2;
6781 int n, err = 0;
6782
6783 list_adjust_sample(list, &dset->t1, &dset->t2, dset, NULL);
6784
6785 n = sample_size(dset);
6786 if (n < 2) {
6787 err = E_DATA;
6788 goto bailout;
6789 }
6790
6791 xdiff = gretl_column_vector_alloc(list[0]);
6792 if (xdiff == NULL) {
6793 err = E_ALLOC;
6794 goto bailout;
6795 }
6796
6797 S = gretl_covariance_matrix_from_varlist(list,
6798 dset,
6799 &means,
6800 &err);
6801
6802 if (!err) {
6803 if (opt & OPT_V) {
6804 gretl_matrix_print_to_prn(S, _("Covariance matrix"), prn);
6805 }
6806 err = gretl_invert_symmetric_matrix(S);
6807 if (err) {
6808 fprintf(stderr, "error inverting covariance matrix\n");
6809 } else if (opt & OPT_V) {
6810 gretl_matrix_print_to_prn(S, _("Inverse of covariance matrix"), prn);
6811 }
6812 }
6813
6814 if (!err) {
6815 int k = gretl_vector_get_length(means);
6816 int miss, obslen = 0, savevar = 0;
6817 double m, x, xbar;
6818 int i, t, vi;
6819
6820 if (opt & OPT_S) {
6821 /* save the results to a data series */
6822 savevar = mdist_saver(dset);
6823 }
6824
6825 if (prn != NULL) {
6826 pprintf(prn, "%s\n", _("Mahalanobis distances from the centroid"));
6827 pprintf(prn, "%s\n", _("using the variables:"));
6828 for (i=1; i<=list[0]; i++) {
6829 pprintf(prn, " %s\n", dset->varname[list[i]]);
6830 }
6831 pputc(prn, '\n');
6832
6833 obslen = max_obs_marker_length(dset);
6834 if (obslen < 8) {
6835 obslen = 8;
6836 }
6837 }
6838
6839 for (t=dset->t1; t<=dset->t2; t++) {
6840 miss = 0;
6841
6842 /* write vector of deviations from centroid for
6843 observation t */
6844 for (i=0; i<k; i++) {
6845 vi = list[i+1];
6846 xbar = gretl_vector_get(means, i);
6847 x = dset->Z[vi][t];
6848 miss += na(x);
6849 if (!miss) {
6850 gretl_vector_set(xdiff, i, x - xbar);
6851 }
6852 }
6853
6854 if (miss) {
6855 m = NADBL;
6856 } else {
6857 m = gretl_scalar_qform(xdiff, S, &err);
6858 if (!err) {
6859 m = sqrt(m);
6860 }
6861 }
6862
6863 if (prn != NULL) {
6864 print_obs_marker(t, dset, obslen, prn);
6865 if (err || miss) {
6866 pprintf(prn, "NA\n");
6867 } else {
6868 pprintf(prn, "%9.6f\n", m);
6869 }
6870 }
6871
6872 if (savevar > 0) {
6873 dset->Z[savevar][t] = m;
6874 } else if (md != NULL) {
6875 md->d[t] = m;
6876 }
6877 }
6878
6879 if (savevar > 0 && prn != NULL) {
6880 pputc(prn, '\n');
6881 pprintf(prn, _("Distances saved as '%s'"),
6882 dset->varname[savevar]);
6883 pputc(prn, '\n');
6884 }
6885
6886 if (prn != NULL) {
6887 pputc(prn, '\n');
6888 }
6889 }
6890
6891 gretl_matrix_free(xdiff);
6892 gretl_matrix_free(means);
6893 gretl_matrix_free(S);
6894
6895 bailout:
6896
6897 dset->t1 = orig_t1;
6898 dset->t2 = orig_t2;
6899
6900 return err;
6901 }
6902
mahalanobis_distance(const int * list,DATASET * dset,gretlopt opt,PRN * prn)6903 int mahalanobis_distance (const int *list, DATASET *dset,
6904 gretlopt opt, PRN *prn)
6905 {
6906 return real_mahalanobis_distance(list, dset, opt, NULL,
6907 (opt & OPT_Q) ? NULL : prn);
6908 }
6909
get_mahal_distances(const int * list,DATASET * dset,gretlopt opt,PRN * prn,int * err)6910 MahalDist *get_mahal_distances (const int *list, DATASET *dset,
6911 gretlopt opt, PRN *prn, int *err)
6912 {
6913 MahalDist *md = mahal_dist_new(list, dset->n);
6914
6915 if (md == NULL) {
6916 *err = E_ALLOC;
6917 } else {
6918 *err = real_mahalanobis_distance(list, dset, opt,
6919 md, prn);
6920 if (*err) {
6921 free_mahal_dist(md);
6922 md = NULL;
6923 }
6924 }
6925
6926 return md;
6927 }
6928
6929 /*
6930 G = [(2 * \sum_i^n (i * x_i)) / (n * \sum_i^n x_i )] - [(n + 1)/n]
6931
6932 where x is sorted: x_i <= x_{i+1}
6933 */
6934
gini_coeff(const double * x,int t1,int t2,double ** plz,int * pn,int * err)6935 static double gini_coeff (const double *x, int t1, int t2, double **plz,
6936 int *pn, int *err)
6937 {
6938 int m = t2 - t1 + 1;
6939 double *sx = NULL;
6940 double csx = 0.0, sumx = 0.0, sisx = 0.0;
6941 double G;
6942 int t, n = 0;
6943
6944 gretl_error_clear();
6945
6946 sx = malloc(m * sizeof *sx);
6947
6948 if (sx == NULL) {
6949 *err = E_ALLOC;
6950 return NADBL;
6951 }
6952
6953 n = 0;
6954 for (t=t1; t<=t2; t++) {
6955 if (na(x[t])) {
6956 continue;
6957 } else if (x[t] < 0.0) {
6958 gretl_errmsg_set(_("illegal negative value"));
6959 *err = E_DATA;
6960 break;
6961 } else {
6962 sx[n++] = x[t];
6963 sumx += x[t];
6964 }
6965 }
6966
6967 if (!*err && (n == 0 || sumx == 0.0)) {
6968 *err = E_DATA;
6969 }
6970
6971 if (*err) {
6972 free(sx);
6973 return NADBL;
6974 }
6975
6976 qsort(sx, n, sizeof *sx, gretl_compare_doubles);
6977
6978 #if 0
6979 if (1) {
6980 /* just testing alternative calculation for equivalence */
6981 double num, S[2];
6982 int s, fyi;
6983
6984 num = S[0] = S[1] = 0.0;
6985
6986 for (t=0; t<n; t++) {
6987 fyi = 0;
6988 s = t;
6989 while (s < n && sx[s] == sx[t]) {
6990 fyi++;
6991 s++;
6992 }
6993 S[1] += (double) fyi/n * sx[t];
6994 num += (double) fyi/n * (S[0] + S[1]);
6995 t = s - 1;
6996 S[0] = S[1];
6997 }
6998
6999 G = 1.0 - num / S[1];
7000 fprintf(stderr, "alt G = %g\n", G);
7001 }
7002 #endif
7003
7004 for (t=0; t<n; t++) {
7005 csx += sx[t];
7006 sisx += (t + 1) * sx[t];
7007 sx[t] = csx / sumx; /* sx now = Lorenz curve */
7008 }
7009
7010 G = 2.0 * sisx / (n * sumx) - ((double) n + 1) / n;
7011
7012 if (plz != NULL) {
7013 *plz = sx;
7014 *pn = n;
7015 } else {
7016 free(sx);
7017 }
7018
7019 return G;
7020 }
7021
7022 /**
7023 * gretl_gini:
7024 * @t1: starting observation.
7025 * @t2: ending observation.
7026 * @x: data series.
7027 *
7028 * Returns: the Gini coefficient for the series @x from obs
7029 * @t1 to obs @t2, skipping any missing values, or #NADBL
7030 * on failure.
7031 */
7032
gretl_gini(int t1,int t2,const double * x)7033 double gretl_gini (int t1, int t2, const double *x)
7034 {
7035 double g;
7036 int err = 0;
7037
7038 g = gini_coeff(x, t1, t2, NULL, NULL, &err);
7039
7040 if (err) {
7041 g = NADBL;
7042 }
7043
7044 return g;
7045 }
7046
lorenz_graph(const char * vname,double * lz,int n)7047 static int lorenz_graph (const char *vname, double *lz, int n)
7048 {
7049 FILE *fp;
7050 double idx;
7051 int downsample = 0;
7052 int t, err = 0;
7053
7054 fp = open_plot_input_file(PLOT_REGULAR, 0, &err);
7055 if (err) {
7056 return err;
7057 }
7058
7059 print_keypos_string(GP_KEY_LEFT_TOP, fp);
7060
7061 fprintf(fp, "set title '%s'\n", vname);
7062 fprintf(fp, "plot \\\n"
7063 "'-' using 1:2 title '%s' w lines, \\\n"
7064 "'-' using 1:2 notitle w lines\n",
7065 _("Lorenz curve"));
7066
7067 gretl_push_c_numeric_locale();
7068
7069 if (n > 4000) {
7070 downsample = (int) (n / 1000.0);
7071 }
7072
7073 for (t=0; t<n; t++) {
7074 if (downsample && t % downsample) {
7075 continue;
7076 }
7077 idx = t + 1;
7078 fprintf(fp, "%g %g\n", idx / n, lz[t]);
7079 }
7080 fputs("e\n", fp);
7081
7082 for (t=0; t<n; t++) {
7083 if (downsample && t % downsample) {
7084 continue;
7085 }
7086 idx = ((double) t + 1) / n;
7087 fprintf(fp, "%g %g\n", idx, idx);
7088 }
7089 fputs("e\n", fp);
7090
7091 gretl_pop_c_numeric_locale();
7092
7093 return finalize_plot_input_file(fp);
7094 }
7095
7096 /**
7097 * gini:
7098 * @varno: ID number of variable to examine.
7099 * @dset: dataset struct.
7100 * @opt: unused at present.
7101 * @prn: gretl printing struct.
7102 *
7103 * Graphs the Lorenz curve for variable @vnum and prints the
7104 * Gini coefficient.
7105 *
7106 * Returns: 0 on successful completion, error code on error.
7107 */
7108
gini(int varno,DATASET * dset,gretlopt opt,PRN * prn)7109 int gini (int varno, DATASET *dset, gretlopt opt, PRN *prn)
7110 {
7111 double *lz = NULL;
7112 double gini;
7113 int n, fulln;
7114 int err = 0;
7115
7116 gini = gini_coeff(dset->Z[varno], dset->t1, dset->t2,
7117 &lz, &n, &err);
7118
7119 if (err) {
7120 return err;
7121 }
7122
7123 fulln = dset->t2 - dset->t1 - 1;
7124 pprintf(prn, "%s\n", dset->varname[varno], n);
7125 pprintf(prn, _("Number of observations = %d\n"), n);
7126
7127 if (n < fulln) {
7128 pputs(prn, _("Warning: there were missing values\n"));
7129 }
7130
7131 pputc(prn, '\n');
7132
7133 pprintf(prn, "%s = %g\n", _("Sample Gini coefficient"), gini);
7134 pprintf(prn, "%s = %g\n", _("Estimate of population value"),
7135 gini * (double) n / (n - 1));
7136
7137 err = lorenz_graph(dset->varname[varno], lz, n);
7138
7139 free(lz);
7140
7141 return err;
7142 }
7143
7144 /* Following: apparatus for Shapiro-Wilk normality test.
7145 Thanks to Marcin Blazejowski for the contribution.
7146 */
7147
7148 #ifndef min
7149 # define min(a, b) ((a) > (b) ? (b) : (a))
7150 #endif
7151
sign(int a)7152 static int sign (int a)
7153 {
7154 return (a == 0)? 0 : ((a < 0)? -1 : 1);
7155 }
7156
compare_floats(const void * a,const void * b)7157 static int compare_floats (const void *a, const void *b)
7158 {
7159 const float *fa = (const float *) a;
7160 const float *fb = (const float *) b;
7161
7162 return (*fa > *fb) - (*fa < *fb);
7163 }
7164
7165 /* Algorithm AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2
7166 Calculates the algebraic polynomial of order n-1 with
7167 array of coefficients cc. Zero-order coefficient is
7168 cc(1) = cc[0]
7169 */
7170
poly(const float * cc,int n,float x)7171 static double poly (const float *cc, int n, float x) {
7172 double p, ret = cc[0]; /* preserve precision! */
7173 int j;
7174
7175 if (n > 1) {
7176 p = x * cc[n-1];
7177 for (j=n-2; j>0; j--) {
7178 p = (p + cc[j]) * x;
7179 }
7180 ret += p;
7181 }
7182
7183 return ret;
7184 }
7185
7186 /* Calculate coefficients for the Shapiro-Wilk test */
7187
sw_coeff(int n,int n2,float * a)7188 static void sw_coeff (int n, int n2, float *a)
7189 {
7190 const float c1[6] = { 0.f,.221157f,-.147981f,-2.07119f, 4.434685f, -2.706056f };
7191 const float c2[6] = { 0.f,.042981f,-.293762f,-1.752461f,5.682633f, -3.582633f };
7192 float summ2, ssumm2;
7193 float a0, a1, an = (float) n;
7194 float fac, an25, rsn;
7195 int i, i1, nn2 = n / 2;
7196
7197 if (n == 3) {
7198 a[0] = sqrt(0.5);
7199 } else {
7200 an25 = an + .25;
7201 summ2 = 0.0;
7202 for (i=0; i<n2; i++) {
7203 a[i] = (float) normal_cdf_inverse((i + 1 - .375f) / an25);
7204 summ2 += a[i] * a[i];
7205 }
7206 summ2 *= 2.0;
7207 ssumm2 = sqrt(summ2);
7208 rsn = 1.0 / sqrt(an);
7209 a0 = poly(c1, 6, rsn) - a[0] / ssumm2;
7210
7211 /* Normalize array a */
7212 if (n > 5) {
7213 i1 = 2;
7214 a1 = -a[1] / ssumm2 + poly(c2, 6, rsn);
7215 fac = sqrt((summ2 - 2.0 * (a[0] * a[0]) - 2.0 * (a[1] * a[1])) /
7216 (1.0 - 2.0 * (a0 * a0) - 2.0 * (a1 * a1)));
7217 a[1] = a1;
7218 } else {
7219 i1 = 1;
7220 fac = sqrt((summ2 - 2.0 * (a[0] * a[0])) / (1.0 - 2.0 * (a0 * a0)));
7221 }
7222 a[0] = a0;
7223 for (i=i1; i<nn2; i++) {
7224 a[i] /= -fac;
7225 }
7226 }
7227 }
7228
7229 /* ALGORITHM AS R94 APPL. STATIST. (1995) vol. 44, no. 4, 547-551.
7230 Calculates the Shapiro-Wilk W test and its significance level,
7231 Rendered into C by Marcin Blazejowski, May 2008.
7232 */
7233
sw_w(float * x,int n,int n1,float * a,double * W,double * pval)7234 static int sw_w (float *x, int n, int n1, float *a,
7235 double *W, double *pval)
7236 {
7237 const float z90 = 1.2816f;
7238 const float z95 = 1.6449f;
7239 const float z99 = 2.3263f;
7240 const float zm = 1.7509f;
7241 const float zss = .56268f;
7242 const float bf1 = .8378f;
7243 const double xx90 = .556;
7244 const double xx95 = .622;
7245 const float little = 1e-19f; /* "small" is reserved on win32 */
7246 const float pi6 = 1.909859f;
7247 const float stqr = 1.047198f;
7248
7249 /* polynomial coefficients */
7250 const float g[2] = { -2.273f, .459f };
7251 const float c3[4] = { .544f, -.39978f, .025054f, -6.714e-4f };
7252 const float c4[4] = { 1.3822f, -.77857f, .062767f, -.0020322f };
7253 const float c5[4] = { -1.5861f, -.31082f, -.083751f, .0038915f };
7254 const float c6[3] = { -.4803f, -.082676f, .0030302f };
7255 const float c7[2] = { .164f, .533f };
7256 const float c8[2] = { .1736f, .315f };
7257 const float c9[2] = { .256f, -.00635f };
7258
7259 float r1, zbar, ssassx, gamma, range;
7260 float bf, ld, m, s, sa, xi, sx, xx, y, w1;
7261 float asa, ssa, z90f, sax, zfm, z95f, zsd, z99f, ssx, xsx;
7262 float an = (float) n;
7263 int i, j, ncens = n - n1;
7264
7265 /* Check for zero range */
7266 range = x[n1 - 1] - x[0];
7267 if (range < little) {
7268 fprintf(stderr, "sw_w: range is too small\n");
7269 return 1;
7270 }
7271
7272 /* Check for correct sort order on range-scaled X */
7273 /* In fact we don't need do this, since we use qsort() from libc
7274 and we can suppose, that qsort() can sort... */
7275 xx = x[0] / range;
7276 sx = xx;
7277 sa = -a[0];
7278 j = n - 1;
7279 for (i = 1; i < n1; --j) {
7280 xi = x[i] / range;
7281 sx += xi;
7282 ++i;
7283 if (i != j) {
7284 sa += sign(i - j) * a[min(i,j) - 1];
7285 }
7286 xx = xi;
7287 }
7288
7289 /* Calculate W statistic as squared correlation between data and
7290 coefficients */
7291 sa /= n1;
7292 sx /= n1;
7293 ssa = ssx = sax = 0.0;
7294 j = n - 1;
7295 for (i=0; i<n1; i++, j--) {
7296 if (i != j) {
7297 asa = sign(i - j) * a[min(i,j)] - sa;
7298 } else {
7299 asa = -sa;
7300 }
7301 xsx = x[i] / range - sx;
7302 ssa += asa * asa;
7303 ssx += xsx * xsx;
7304 sax += asa * xsx;
7305 }
7306
7307 /* W1 equals (1-W) calculated to avoid excessive rounding error
7308 for W very near 1 (a potential problem in very large samples)
7309 */
7310 ssassx = sqrt(ssa * ssx);
7311 w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx);
7312 *W = 1 - w1;
7313
7314 /* Calculate significance level for W */
7315 if (n == 3) {
7316 /* exact P-value */
7317 *pval = pi6 * (asin(sqrt(*W)) - stqr);
7318 return 0;
7319 }
7320
7321 y = log(w1);
7322 xx = log(an);
7323
7324 if (n <= 11) {
7325 gamma = poly(g, 2, an);
7326 if (y >= gamma) {
7327 /* FIXME: rather use an even smaller value, or NA? */
7328 *pval = little;
7329 return 0;
7330 }
7331 y = -log(gamma - y);
7332 m = poly(c3, 4, an);
7333 s = exp(poly(c4, 4, an));
7334 } else {
7335 /* n >= 12 */
7336 m = poly(c5, 4, xx);
7337 s = exp(poly(c6, 3, xx));
7338 }
7339
7340 if (ncens > 0) {
7341 /* Censoring by proportion ncens/n.
7342 Calculate mean and sd of normal equivalent deviate of W
7343 */
7344 float delta = (float) ncens / an;
7345
7346 ld = -log(delta);
7347 bf = 1.0 + xx * bf1;
7348 r1 = pow(xx90, (double) xx);
7349 z90f = z90 + bf * pow(poly(c7, 2, r1), (double) ld);
7350 r1 = pow(xx95, (double) xx);
7351 z95f = z95 + bf * pow(poly(c8, 2, r1), (double) ld);
7352 z99f = z99 + bf * pow(poly(c9, 2, xx), (double) ld);
7353
7354 /* Regress Z90F,...,Z99F on normal deviates Z90,...,Z99 to get
7355 pseudo-mean and pseudo-sd of z as the slope and intercept
7356 */
7357 zfm = (z90f + z95f + z99f) / 3.0;
7358 zsd = (z90 * (z90f - zfm) + z95 * (z95f - zfm) + z99 * (z99f - zfm)) / zss;
7359 zbar = zfm - zsd * zm;
7360 m += zbar * s;
7361 s *= zsd;
7362 }
7363
7364 *pval = normal_cdf_comp(((double) y - (double) m) / (double) s);
7365
7366 return 0;
7367 }
7368
sw_sample_check(int n,int n1)7369 static int sw_sample_check (int n, int n1)
7370 {
7371 int ncens = n - n1;
7372 float delta, an = n;
7373
7374 if (n < 3 || n1 < 3) {
7375 fprintf(stderr, "There is no way to run SW test for less then 3 obs\n");
7376 return E_DATA;
7377 }
7378
7379 if (ncens < 0 || (ncens > 0 && n < 20)) {
7380 fprintf(stderr, "sw_w: not enough uncensored obserations\n");
7381 return E_DATA;
7382 }
7383
7384 delta = (float) ncens / an;
7385 if (delta > .8f) {
7386 fprintf(stderr, "sw_w: too many censored obserations\n");
7387 return E_DATA;
7388 }
7389
7390 return 0;
7391 }
7392
7393 /**
7394 * shapiro_wilk:
7395 * @x: data array.
7396 * @t1: starting observation.
7397 * @t2: ending observation.
7398 * @W: location to receive test statistic.
7399 * @pval: location to receive p-value.
7400 *
7401 * Computes the Shapiro-Wilk W statistic as a test for
7402 * normality of the data @x, and also the p-value for
7403 * the test. These are written into the pointer
7404 * arguments @W and @pval.
7405 *
7406 * Returns: 0 on success, non-zero on failure.
7407 */
7408
shapiro_wilk(const double * x,int t1,int t2,double * W,double * pval)7409 int shapiro_wilk (const double *x, int t1, int t2, double *W, double *pval)
7410 {
7411 int n1 = 0; /* number of uncensored obs */
7412 float *a = NULL; /* array of coefficients */
7413 float *xf = NULL; /* copy of x in float format */
7414 int i, t, n2, n = 0;
7415 int err = 0;
7416
7417 *W = *pval = NADBL;
7418
7419 for (t=t1; t<=t2; t++) {
7420 if (!na(x[t])) n++;
7421 }
7422
7423 /* for now we assume all obs are uncensored */
7424 n1 = n;
7425
7426 err = sw_sample_check(n, n1);
7427 if (err) {
7428 return err;
7429 }
7430
7431 /* How many coeffs should be computed? */
7432 n2 = fmod(n, 2.0);
7433 n2 = (n2 == 0)? n / 2 : (n - 1) / 2;
7434
7435 xf = malloc(n * sizeof *xf);
7436 a = malloc(n2 * sizeof *a);
7437
7438 if (xf == NULL || a == NULL) {
7439 err = E_ALLOC;
7440 } else {
7441 i = 0;
7442 for (t=t1; t<=t2; t++) {
7443 if (!na(x[t])) {
7444 xf[i++] = x[t];
7445 }
7446 }
7447 qsort(xf, n, sizeof *xf, compare_floats);
7448 /* Main job: compute W stat and its p-value */
7449 sw_coeff(n, n2, a);
7450 err = sw_w(xf, n, n1, a, W, pval);
7451 }
7452
7453 free(a);
7454 free(xf);
7455
7456 return err;
7457 }
7458
round2(double x)7459 static double round2 (double x)
7460 {
7461 x *= 100.0;
7462 x = (x - floor(x) < 0.5)? floor(x) : ceil(x);
7463 return x / 100;
7464 }
7465
7466 /* Polynomial approximation to the p-value for the Lilliefors test,
7467 said to be good to 2 decimal places. See Abdi and Molin,
7468 "Lilliefors/Van Soest's test of normality", in Salkind (ed.)
7469 Encyclopedia of Measurement and Statistics, Sage, 2007; also
7470 http://www.utd.edu/~herve/Abdi-Lillie2007-pretty.pdf
7471 */
7472
lilliefors_pval(double L,int N)7473 static double lilliefors_pval (double L, int N)
7474 {
7475 double b0 = 0.37872256037043;
7476 double b1 = 1.30748185078790;
7477 double b2 = 0.08861783849346;
7478 double b1pN = b1 + N;
7479 double Lm2 = 1.0 / (L * L);
7480 double A, pv;
7481
7482 A = (-b1pN + sqrt(b1pN * b1pN - 4 * b2 * (b0 - Lm2))) / (2 * b2);
7483
7484 pv = -0.37782822932809 + 1.67819837908004*A
7485 - 3.02959249450445*A*A + 2.80015798142101*pow(A, 3.)
7486 - 1.39874347510845*pow(A, 4.) + 0.40466213484419*pow(A, 5.)
7487 - 0.06353440854207*pow(A, 6.) + 0.00287462087623*pow(A, 7.)
7488 + 0.00069650013110*pow(A, 8.) - 0.00011872227037*pow(A, 9.)
7489 + 0.00000575586834*pow(A, 10.);
7490
7491 if (pv < 0) {
7492 pv = 0;
7493 } else if (pv > 1) {
7494 pv = 1;
7495 } else {
7496 /* round to 2 digits */
7497 pv = round2(pv);
7498 }
7499
7500 return pv;
7501 }
7502
lilliefors_test(const double * x,int t1,int t2,double * L,double * pval)7503 static int lilliefors_test (const double *x, int t1, int t2,
7504 double *L, double *pval)
7505 {
7506 double *zx; /* copy of x for sorting, z-scoring */
7507 int i, t, n = 0;
7508 int err = 0;
7509
7510 *L = *pval = NADBL;
7511
7512 for (t=t1; t<=t2; t++) {
7513 if (!na(x[t])) n++;
7514 }
7515
7516 if (n < 5) {
7517 /* we need more than 4 data points */
7518 return E_DATA;
7519 }
7520
7521 zx = malloc(n * sizeof *zx);
7522
7523 if (zx == NULL) {
7524 err = E_ALLOC;
7525 } else {
7526 double dx, mx = 0.0, sx = 0.0;
7527 double Phi, Dp = 0.0, Dm = 0.0;
7528 double Dmax = 0.0;
7529
7530 /* compute sample mean and standard deviation plus
7531 sorted z-scores */
7532
7533 i = 0;
7534 for (t=t1; t<=t2; t++) {
7535 if (!na(x[t])) {
7536 zx[i++] = x[t];
7537 mx += x[t];
7538 }
7539 }
7540 mx /= n;
7541
7542 for (t=t1; t<=t2; t++) {
7543 if (!na(x[t])) {
7544 dx = x[t] - mx;
7545 sx += dx * dx;
7546 }
7547 }
7548 sx = sqrt(sx / (n - 1));
7549
7550 qsort(zx, n, sizeof *zx, gretl_compare_doubles);
7551
7552 for (i=0; i<n; i++) {
7553 zx[i] = (zx[i] - mx) / sx;
7554 }
7555
7556 /* The following is based on the formula given in the
7557 "nortest" package for GNU R, function lillie.test (written
7558 by Juergen Gross). It differs from the algorithm given by
7559 Abdi and Molin (whose p-value approximation is used above).
7560 The A & M account of the statistic itself seems to be
7561 wrong: it consistently over-rejects (i.e. around 8 percent
7562 of the time for a nominal 5 percent significance level).
7563 */
7564
7565 for (i=0; i<n; i++) {
7566 Phi = normal_cdf(zx[i]);
7567 Dp = (double) (i+1) / n - Phi;
7568 Dm = Phi - (double) i / n;
7569 if (Dp > Dmax) Dmax = Dp;
7570 if (Dm > Dmax) Dmax = Dm;
7571 }
7572
7573 *L = Dmax;
7574 *pval = lilliefors_pval(Dmax, n);
7575 }
7576
7577 free(zx);
7578
7579 return err;
7580 }
7581
skew_kurt_test(const double * x,int t1,int t2,double * test,double * pval,gretlopt opt)7582 static int skew_kurt_test (const double *x, int t1, int t2,
7583 double *test, double *pval,
7584 gretlopt opt)
7585 {
7586 double skew, xkurt;
7587 int n, err = 0;
7588
7589 err = series_get_moments(t1, t2, x, &skew, &xkurt, &n);
7590
7591 if (!err) {
7592 if (opt & OPT_J) {
7593 /* Jarque-Bera */
7594 *test = (n / 6.0) * (skew * skew + xkurt * xkurt/ 4.0);
7595 } else {
7596 /* Doornik-Hansen */
7597 *test = doornik_chisq(skew, xkurt, n);
7598 }
7599 }
7600
7601 if (!err && na(*test)) {
7602 err = E_NAN;
7603 }
7604
7605 if (!na(*test)) {
7606 *pval = chisq_cdf_comp(2, *test);
7607 }
7608
7609 return err;
7610 }
7611
print_normality_stat(double test,double pval,gretlopt opt,PRN * prn)7612 static void print_normality_stat (double test, double pval,
7613 gretlopt opt, PRN *prn)
7614 {
7615 const char *tstrs[] = {
7616 N_("Shapiro-Wilk W"),
7617 N_("Jarque-Bera test"),
7618 N_("Lilliefors test"),
7619 N_("Doornik-Hansen test")
7620 };
7621 int i = (opt & OPT_W)? 0 : (opt & OPT_J)? 1 :
7622 (opt & OPT_L)? 2 : 3;
7623
7624 if (na(test) || na(pval)) {
7625 return;
7626 }
7627
7628 if (opt & OPT_L) {
7629 pprintf(prn, " %s = %g, %s ~= %g\n\n",
7630 _(tstrs[i]), test, _("with p-value"), pval);
7631 } else {
7632 pprintf(prn, " %s = %g, %s %g\n\n",
7633 _(tstrs[i]), test, _("with p-value"), pval);
7634 }
7635 }
7636
7637 /**
7638 * gretl_normality_test:
7639 * @varno: ID number of the variable to process.
7640 * @dset: dataset struct.
7641 * @opt: OPT_A: all tests; OPT_D: Doornik-Hansen; OPT_W: Shapiro-Wilk;
7642 * OPT_J: Jarque-Bera; OPT_L: Lilliefors; default is Doornik-Hansen.
7643 * Also use OPT_Q for quiet.
7644 * @prn: gretl printing struct.
7645 *
7646 * Performs, and prints the results of, the specified test(s) randomness
7647 * for the variable specified by @v.
7648 *
7649 * Returns: 0 on successful completion, non-zero on error.
7650 */
7651
gretl_normality_test(int varno,const DATASET * dset,gretlopt opt,PRN * prn)7652 int gretl_normality_test (int varno, const DATASET *dset,
7653 gretlopt opt, PRN *prn)
7654 {
7655 gretlopt alltests = OPT_D | OPT_W | OPT_J | OPT_L;
7656 double test = NADBL;
7657 double pval = NADBL;
7658 double trec = NADBL;
7659 double pvrec = NADBL;
7660 int err;
7661
7662 if (varno < 0 || varno >= dset->v) {
7663 return E_DATA;
7664 }
7665
7666 err = incompatible_options(opt, OPT_A | alltests);
7667 if (err) {
7668 return err;
7669 }
7670
7671 if (opt & OPT_A) {
7672 /* show all tests */
7673 opt |= alltests;
7674 }
7675
7676 if (!(opt & alltests)) {
7677 /* no method selected: use Doornik-Hansen */
7678 opt |= OPT_D;
7679 }
7680
7681 if (!(opt & OPT_Q)) {
7682 pprintf(prn, _("Test for normality of %s:"), dset->varname[varno]);
7683 if (opt & OPT_A) {
7684 pputs(prn, "\n\n");
7685 } else {
7686 pputc(prn, '\n');
7687 }
7688 }
7689
7690 if (opt & OPT_D) {
7691 err = skew_kurt_test(dset->Z[varno], dset->t1, dset->t2,
7692 &test, &pval, OPT_D);
7693 if (!err && !(opt & OPT_Q)) {
7694 print_normality_stat(test, pval, OPT_D, prn);
7695 }
7696 if (!err) {
7697 /* record Doornik-Hansen result by default */
7698 trec = test;
7699 pvrec = pval;
7700 }
7701 }
7702
7703 if (opt & OPT_W) {
7704 err = shapiro_wilk(dset->Z[varno], dset->t1, dset->t2,
7705 &test, &pval);
7706 if (!err && !(opt & OPT_Q)) {
7707 print_normality_stat(test, pval, OPT_W, prn);
7708 }
7709 }
7710
7711 if (opt & OPT_L) {
7712 err = lilliefors_test(dset->Z[varno], dset->t1, dset->t2,
7713 &test, &pval);
7714 if (!err && !(opt & OPT_Q)) {
7715 print_normality_stat(test, pval, OPT_L, prn);
7716 }
7717 }
7718
7719 if (opt & OPT_J) {
7720 err = skew_kurt_test(dset->Z[varno], dset->t1, dset->t2,
7721 &test, &pval, OPT_J);
7722 if (!err && !(opt & OPT_Q)) {
7723 print_normality_stat(test, pval, OPT_J, prn);
7724 }
7725 }
7726
7727 if (na(trec) && !na(test)) {
7728 trec = test;
7729 }
7730
7731 if (na(pvrec) && !na(pval)) {
7732 pvrec = pval;
7733 }
7734
7735 if (!na(trec) && !na(pvrec)) {
7736 record_test_result(trec, pvrec);
7737 }
7738
7739 return err;
7740 }
7741
gretl_normtest_matrix(const double * y,int t1,int t2,gretlopt opt,int * err)7742 gretl_matrix *gretl_normtest_matrix (const double *y,
7743 int t1, int t2,
7744 gretlopt opt,
7745 int *err)
7746 {
7747 gretl_matrix *ret = NULL;
7748 double test = NADBL;
7749 double pval = NADBL;
7750
7751 if (opt & OPT_J) {
7752 /* Jarque-Bera */
7753 *err = skew_kurt_test(y, t1, t2, &test, &pval, opt);
7754 } else if (opt & OPT_W) {
7755 /* Shapiro-Wilk */
7756 *err = shapiro_wilk(y, t1, t2, &test, &pval);
7757 } else if (opt & OPT_L) {
7758 /* Lilliefors */
7759 *err = lilliefors_test(y, t1, t2, &test, &pval);
7760 } else {
7761 /* Doornik-Hansen */
7762 *err = skew_kurt_test(y, t1, t2, &test, &pval, opt);
7763 }
7764
7765 if (!*err) {
7766 ret = gretl_matrix_alloc(1, 2);
7767 if (ret == NULL) {
7768 *err = E_ALLOC;
7769 } else {
7770 ret->val[0] = test;
7771 ret->val[1] = pval;
7772 }
7773 }
7774
7775 return ret;
7776 }
7777