/* * gretl -- Gnu Regression, Econometrics and Time-series Library * Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * */ #include "libgretl.h" #include "gretl_matrix.h" #include "gretl_cmatrix.h" #include "matrix_extra.h" #include "gretl_panel.h" #include "gretl_string_table.h" #include "libset.h" #include "compat.h" #include "plotspec.h" #include "usermat.h" #include #ifdef WIN32 # include #endif /** * SECTION:describe * @short_description: descriptive statistics plus some tests * @title: Descriptive statistics * @include: libgretl.h * * Computation and printing of numerous descriptive statistics * along with some hypothesis tests, for example regarding the * normality of a data series. */ /* return the number of "good" (non-missing) observations in series x; also (optionally) write to x0 the first non-missing value */ static int good_obs (const double *x, int n, double *x0) { int t, good = n; if (x0 != NULL) { *x0 = NADBL; } for (t=0; t *max) *max = x[t]; if (x[t] < *min) *min = x[t]; } n++; } } return n; } /** * gretl_min: * @t1: starting observation. * @t2: ending observation. * @x: data series. * * Returns: the minimum value of @x over the given range, * or #NADBL if no valid vaues are found. */ double gretl_min (int t1, int t2, const double *x) { double min, max; gretl_minmax(t1, t2, x, &min, &max); return min; } /** * gretl_max: * @t1: starting observation. * @t2: ending observation. * @x: data series. * * Returns: the maximum value of @x over the given range, * or #NADBL if no valid vaues are found. */ double gretl_max (int t1, int t2, const double *x) { double min, max; gretl_minmax(t1, t2, x, &min, &max); return max; } /** * gretl_sum: * @t1: starting observation. * @t2: ending observation. * @x: data series. * * Returns: the sum of the series @x from obs * @t1 to obs @t2, skipping any missing values, or #NADBL * in case there are no valid observations. */ double gretl_sum (int t1, int t2, const double *x) { double sum = 0.0; int t, n = 0; for (t=t1; t<=t2; t++) { if (!(na(x[t]))) { sum += x[t]; n++; } } return (n == 0)? NADBL : sum; } /** * gretl_mean: * @t1: starting observation. * @t2: ending observation. * @x: data series. * * Returns: the arithmetic mean of the series @x from obs * @t1 to obs @t2, skipping any missing values, or #NADBL * in case there are no valid observations. */ double gretl_mean (int t1, int t2, const double *x) { double xbar, sum = 0.0; int t, n = 0; for (t=t1; t<=t2; t++) { if (!(na(x[t]))) { sum += x[t]; n++; } } if (n == 0) { return NADBL; } xbar = sum / n; sum = 0.0; for (t=t1; t<=t2; t++) { if (!(na(x[t]))) { sum += (x[t] - xbar); } } return xbar + sum / n; } /** * eval_ytest: * @y: reference numerical value. * @op: operator. * @test: numerical test value. * * Returns: 1 if the expression @y @yop @test (for example * "y = 2" or "y <= 45") evaluates as true, else 0. */ int eval_ytest (double y, GretlOp op, double test) { int ret = 0; switch (op) { case OP_EQ: if (na(test)) { ret = na(y); } else { ret = (y == test); } break; case OP_GT: ret = (y > test); break; case OP_LT: ret = (y < test); break; case OP_NEQ: if (na(test)) { ret = !na(y); } else { ret = (y != test); } break; case OP_GTE: ret = (y >= test); break; case OP_LTE: ret = (y <= test); break; default: break; } return ret; } /** * gretl_restricted_mean: * @t1: starting observation. * @t2: ending observation. * @x: data series. * @y: criterion series. * @yop: criterion operator. * @yval: criterion value. * * Returns: the arithmetic mean of the series @x in the * range @t1 to @t2 (inclusive), but including only * observations where the criterion variable @y bears the * relationship @yop to the value @yval -- or #NADBL in case * there are no observations that satisfy the restriction. */ double gretl_restricted_mean (int t1, int t2, const double *x, const double *y, GretlOp yop, double yval) { double xbar, sum = 0.0; int t, n = 0; for (t=t1; t<=t2; t++) { if (!na(x[t]) && eval_ytest(y[t], yop, yval)) { sum += x[t]; n++; } } if (n == 0) { return NADBL; } xbar = sum / n; sum = 0.0; for (t=t1; t<=t2; t++) { if (!na(x[t]) && eval_ytest(y[t], yop, yval)) { sum += (x[t] - xbar); } } return xbar + sum / n; } /* For an array a with n elements, find the element which would be a[h] if the array were sorted from smallest to largest (without the need to do a full sort). The algorithm is due to C. A. R. Hoare; the C implementation is shamelessly copied, with minor adaptations, from http://www.astro.northwestern.edu/~ato/um/programming/sorting.html (Atakan G\"urkan: link now dead). */ static double find_hoare (double *a, int n, int h) { double w, x; int l = 0; int r = n - 1; int i, j; while (l < r) { x = a[h]; i = l; j = r; while (j >= i) { while (a[i] < x) i++; while (x < a[j]) j--; if (i <= j) { w = a[i]; a[i++] = a[j]; a[j--] = w; } } if (j < h) l = i; if (h < i) r = j; } return a[h]; } static double find_hoare_inexact (double *a, double p, double xmin, double xmax, double frac, int n, int hf, int hc) { double tmp, high, low; int i; if (p < 0.5) { high = find_hoare(a, n, hc); tmp = xmin; for (i=0; i tmp) { tmp = a[i]; } } low = tmp; } else { low = find_hoare(a, n, hf); tmp = xmax; for (i=hc; i= 1) { /* sanity check */ *err = E_DATA; } else { n = gretl_minmax(t1, t2, x, &xmin, &xmax); if (n == 0) { *err = E_DATA; } } if (!*err) { h = quantile_index(n, p); hf = floor(h); hc = ceil(h); if (hc == 0 || hc == n) { /* too few usable observations for the specified quantile; don't treat as fatal error */ return NADBL; } else { a = malloc(n * sizeof *a); if (a == NULL) { *err = E_ALLOC; } } } if (*err) { return NADBL; } n = 0; for (t=t1; t<=t2; t++) { if (!na(x[t])) { a[n++] = x[t]; } } if (hf == hc) { /* "exact" */ ret = find_hoare(a, n, hf); } else { ret = find_hoare_inexact(a, p, xmin, xmax, h - hf, n, hf, hc); } free(a); return ret; } /** * gretl_array_quantiles: * @a: data array (this gets re-ordered). * @n: length of array. * @p: array of probabilities (over-written by quantiles). * @k: number of probabilities. * * Computes @k quantiles (given by the elements of @p) for the * first n elements of the array @a, which is re-ordered in * the process. On successful exit, @p contains the quantiles. * * Returns: 0 on success, non-zero code on error. */ int gretl_array_quantiles (double *a, int n, double *p, int k) { double h, xmin = 0, xmax = NADBL; int hf, hc, i; int err = 0; if (n <= 0 || k <= 0) { return E_DATA; } for (i=0; i= 1.0) { p[i] = NADBL; err = E_INVARG; break; } h = quantile_index(n, p[i]); hf = floor(h); hc = ceil(h); if (hc == 0 || hc == n) { p[i] = NADBL; } else if (hf == hc) { p[i] = find_hoare(a, n, hf); } else { if (na(xmax)) { gretl_minmax(0, n-1, a, &xmin, &xmax); } p[i] = find_hoare_inexact(a, p[i], xmin, xmax, h - hf, n, hf, hc); } } return err; } /** * gretl_array_quantile: * @a: array on which to operate. * @n: number of elements in @a. * @p: probability. * * Returns: the @p quantile of the first @n elements in @a, * which is re-ordered in the process, or #NADBL on * failure. */ double gretl_array_quantile (double *a, int n, double p) { gretl_array_quantiles(a, n, &p, 1); return p; } /** * gretl_median: * @t1: starting observation. * @t2: ending observation. * @x: data series. * * Returns: the median value of the series @x from obs * @t1 to obs @t2, skipping any missing values, or #NADBL * on failure. */ double gretl_median (int t1, int t2, const double *x) { int err = 0; return gretl_quantile(t1, t2, x, 0.5, OPT_NONE, &err); } /** * gretl_sst: * @t1: starting observation. * @t2: ending observation. * @x: data series. * * Returns: the sum of squared deviations from the mean for * the series @x from obs @t1 to obs @t2, skipping any missing * values, or #NADBL on failure. */ double gretl_sst (int t1, int t2, const double *x) { int t, n = t2 - t1 + 1; double sumsq, xx, xbar; if (n == 0) { return NADBL; } xbar = gretl_mean(t1, t2, x); if (na(xbar)) { return NADBL; } sumsq = 0.0; for (t=t1; t<=t2; t++) { if (!na(x[t])) { xx = x[t] - xbar; sumsq += xx * xx; } } return sumsq; } /* allows for computing MLE rather than using df correction, via the @asy argument */ double real_gretl_variance (int t1, int t2, const double *x, int asy) { int t, n = t2 - t1 + 1; double v, dx, xbar; if (n == 0) { /* null sample */ return NADBL; } xbar = gretl_mean(t1, t2, x); if (na(xbar)) { return NADBL; } v = 0.0; n = 0; for (t=t1; t<=t2; t++) { if (!na(x[t])) { dx = x[t] - xbar; v += dx * dx; n++; } } if (asy && n > 0) { v /= n; } else if (n > 1) { v /= (n - 1); } else { v = 0.0; } return (v >= 0)? v : NADBL; } /** * gretl_variance: * @t1: starting observation. * @t2: ending observation. * @x: data series. * * Returns: the variance of the series @x from obs * @t1 to obs @t2, skipping any missing values, or #NADBL * on failure. */ double gretl_variance (int t1, int t2, const double *x) { return real_gretl_variance(t1, t2, x, 0); } /** * gretl_restricted_variance: * @t1: starting observation. * @t2: ending observation. * @x: data series. * @y: criterion series. * @yop: criterion operator. * @yval: criterion value. * * Returns: the variance of the series @x from obs * @t1 to obs @t2, skipping any missing values and * observations where the series @y does not bear the * relationship @yop to the value @yval, or #NADBL on * failure. */ double gretl_restricted_variance (int t1, int t2, const double *x, const double *y, GretlOp yop, double yval) { double sumsq, xx, xbar; int t, n = 0; xbar = gretl_restricted_mean(t1, t2, x, y, yop, yval); if (na(xbar)) { return NADBL; } sumsq = 0.0; for (t=t1; t<=t2; t++) { if (!na(x[t]) && eval_ytest(y[t], yop, yval)) { xx = x[t] - xbar; sumsq += xx * xx; n++; } } sumsq = (n > 1)? sumsq / (n - 1) : 0.0; return (sumsq >= 0)? sumsq : NADBL; } /** * gretl_stddev: * @t1: starting observation. * @t2: ending observation. * @x: data series. * * Returns: the standard deviation of the series @x from obs * @t1 to obs @t2, skipping any missing values, or #NADBL * on failure. */ double gretl_stddev (int t1, int t2, const double *x) { double v = gretl_variance(t1, t2, x); return (na(v))? v : sqrt(v); } /** * gretl_restricted_stddev: * @t1: starting observation. * @t2: ending observation. * @x: data series. * @y: criterion series. * @yop: criterion operator. * @yval: criterion value. * * Returns: the standard deviation of the series @x from obs * @t1 to obs @t2, skipping any missing values and observations * where the series @y does not bear the relationship @yop to * the value @yval, or #NADBL on failure. */ double gretl_restricted_stddev (int t1, int t2, const double *x, const double *y, GretlOp yop, double yval) { double xx = gretl_restricted_variance(t1, t2, x, y, yop, yval); return (na(xx))? xx : sqrt(xx); } /** * gretl_long_run_variance: * @t1: starting observation. * @t2: ending observation. * @x: data series. * @m: bandwidth (<= 0 for automatic). * @mu: mean (or NADBL to use sample mean). * * Returns: the long-run variance of the series @x from obs * @t1 to obs @t2, using Bartlett kernel weights, or #NADBL * on failure (which includes encountering missing values). */ double gretl_long_run_variance (int t1, int t2, const double *x, int m, double mu) { double zt, wi, xbar, s2 = 0.0; int use_mu = !na(mu); int i, t, n, order; if (series_adjust_sample(x, &t1, &t2) != 0) { return NADBL; } n = t2 - t1 + 1; if (n < 2) { return NADBL; } if (m < 0) { order = (int) exp(log(n) / 3.0); } else { order = m; } if (use_mu) { xbar = mu; } if (use_mu && mu == 0.0) { for (t=t1; t<=t2; t++) { s2 += x[t] * x[t]; } } else { if (!use_mu) { xbar = 0.0; for (t=t1; t<=t2; t++) { xbar += x[t]; } xbar /= n; } for (t=t1; t<=t2; t++) { zt = x[t] - xbar; s2 += zt * zt; } } for (i=1; i<=order; i++) { wi = 1.0 - i /((double) order + 1); for (t=t1+i; t<=t2; t++) { zt = x[t] - xbar; s2 += 2 * wi * zt * (x[t-i] - xbar); } } return s2 / n; } /** * gretl_covar: * @t1: starting observation. * @t2: ending observation. * @x: data series. * @y: data series. * @missing: location to receive information on the number * of missing observations that were skipped, or %NULL. * * Returns: the covariance of the series @x and @y from obs * @t1 to obs @t2, skipping any missing values, or #NADBL * on failure. */ double gretl_covar (int t1, int t2, const double *x, const double *y, int *missing) { double sx, sy, sxy, xbar, ybar; int t, nn, n = t2 - t1 + 1; if (n == 0) { /* void sample */ return NADBL; } nn = 0; sx = sy = 0.0; for (t=t1; t<=t2; t++) { if (!na(x[t]) && !na(y[t])) { sx += x[t]; sy += y[t]; nn++; } } if (nn < 2) { return NADBL; } xbar = sx / nn; ybar = sy / nn; sxy = 0.0; for (t=t1; t<=t2; t++) { if (!na(x[t]) && !na(y[t])) { sx = x[t] - xbar; sy = y[t] - ybar; sxy = sxy + (sx * sy); } } if (missing != NULL) { /* record number of missing obs */ *missing = n - nn; } return sxy / (nn - 1); } /** * gretl_corr: * @t1: starting observation. * @t2: ending observation. * @x: data series. * @y: data series. * @missing: location to receive information on the number * of missing observations that were skipped, or %NULL. * * Returns: the correlation coefficient for the series @x and @y * from obs @t1 to obs @t2, skipping any missing values, or #NADBL * on failure. */ double gretl_corr (int t1, int t2, const double *x, const double *y, int *missing) { int t, nn, n = t2 - t1 + 1; double sx, sy, sxx, syy, sxy, den, xbar, ybar; double cval = 0.0; if (n == 0) { /* void sample */ return NADBL; } if (gretl_isconst(t1, t2, x) || gretl_isconst(t1, t2, y)) { /* correlation between any x and a constant is undefined */ return NADBL; } nn = 0; sx = sy = 0.0; for (t=t1; t<=t2; t++) { if (!na(x[t]) && !na(y[t])) { sx += x[t]; sy += y[t]; nn++; } } if (nn < 2) { /* zero or 1 observations: no go! */ return NADBL; } xbar = sx / nn; ybar = sy / nn; sxx = syy = sxy = 0.0; for (t=t1; t<=t2; t++) { if (!na(x[t]) && !na(y[t])) { sx = x[t] - xbar; sy = y[t] - ybar; sxx += sx * sx; syy += sy * sy; sxy += sx * sy; } } if (sxy != 0.0) { den = sxx * syy; if (den > 0.0) { cval = sxy / sqrt(den); } else { cval = NADBL; } } if (missing != NULL) { /* record number of missing observations */ *missing = n - nn; } return cval; } /** * gretl_corr_rsq: * @t1: starting observation. * @t2: ending observation. * @x: data series. * @y: data series. * * Returns: the square of the correlation coefficient for the series * @x and @y from obs @t1 to obs @t2, skipping any missing values, * or #NADBL on failure. Used as alternative value for R^2 in a * regression without an intercept. */ double gretl_corr_rsq (int t1, int t2, const double *x, const double *y) { double r = gretl_corr(t1, t2, x, y, NULL); return (na(r))? r : r * r; } /* we're supposing a variance smaller than this is just noise */ #define TINYVAR 1.0e-36 double gretl_skewness (int t1, int t2, const double *x) { double s, sd, xx, xbar, ret; int t, n = 0; xbar = gretl_mean(t1, t2, x); if (na(xbar)) { return NADBL; } s = 0.0; for (t=t1; t<=t2; t++) { if (!na(x[t])) { xx = x[t] - xbar; s += xx * xx; n++; } } s /= n; if (s <= TINYVAR) { ret = NADBL; } else { sd = sqrt(s); s = 0.0; for (t=t1; t<=t2; t++) { if (!na(x[t])) { xx = (x[t] - xbar) / sd; s += xx * xx * xx; } } ret = s/n; } return ret; } /* note: the following computes EXCESS kurtosis */ double gretl_kurtosis (int t1, int t2, const double *x) { double s, sd, xx, xbar, ret; int t, n = 0; xbar = gretl_mean(t1, t2, x); if (na(xbar)) { return NADBL; } s = 0.0; for (t=t1; t<=t2; t++) { if (!na(x[t])) { xx = x[t] - xbar; s += xx * xx; n++; } } s /= n; if (s <= TINYVAR) { ret = NADBL; } else { sd = sqrt(s); s = 0.0; for (t=t1; t<=t2; t++) { if (!na(x[t])) { xx = (x[t] - xbar) / sd; s += xx * xx * xx * xx; } } ret = s/n - 3.0; } return ret; } /** * gretl_moments: * @t1: starting observation. * @t2: ending observation. * @x: data series. * @wts: weights (may be NULL) * @xbar: pointer to receive mean. * @sd: pointer to receive standard deviation. * @skew: pointer to receive skewness. * @kurt: pointer to receive excess kurtosis. * @k: degrees of freedom loss (generally 1). * * Calculates sample moments for series @x from obs @t1 to obs * @t2. * * Returns: 0 on success, 1 on error. */ int gretl_moments (int t1, int t2, const double *x, const double *wts, double *xbar, double *std, double *skew, double *kurt, int k) { int t, n; double dev, var; double s, s2, s3, s4; int allstats = 1; int weighted = (wts != NULL); double wt, wn = 0; if (skew == NULL && kurt == NULL) { allstats = 0; } while (na(x[t1]) && t1 <= t2) { /* skip missing values at start of sample */ t1++; } if (gretl_isconst(t1, t2, x)) { /* no variation in x */ *xbar = x[t1]; *std = 0.0; if (allstats) { *skew = *kurt = NADBL; } return 1; } /* get number and sum of valid observations */ s = 0.0; n = 0; for (t=t1; t<=t2; t++) { if (!na(x[t])) { if (weighted) { wt = wts[t]; if (!na(wt) && wt != 0.0) { s += wt * x[t]; wn += wt; n++; } } else { s += x[t]; n++; } } } if (n == 0) { /* no valid data */ *xbar = *std = NADBL; if (allstats) { *skew = *kurt = 0.0; } return 1; } if (!weighted) { wn = n; } /* calculate mean and initialize other stats */ *xbar = s / wn; var = 0.0; if (allstats) { *skew = *kurt = 0.0; } /* compute quantities needed for higher moments */ s2 = s3 = s4 = 0.0; for (t=t1; t<=t2; t++) { if (!na(x[t])) { dev = x[t] - *xbar; if (weighted) { wt = wts[t]; if (!na(wt) && wt != 0.0) { s2 += wt * dev * dev; if (allstats) { s3 += wt * pow(dev, 3); s4 += wt * pow(dev, 4); } } } else { s2 += dev * dev; if (allstats) { s3 += pow(dev, 3); s4 += pow(dev, 4); } } } } /* variance */ if (weighted) { /* as per Stata's -summary- with aweights */ var = n * s2 / (wn * (n-1)); } else { var = s2 / (wn - k); } if (var < 0.0) { /* in principle impossible, but this is a computer */ *std = NADBL; if (allstats) { *skew = *kurt = NADBL; } return 1; } if (var > TINYVAR) { *std = sqrt(var); } else { /* screen out minuscule variance */ *std = 0.0; } if (allstats) { if (var > TINYVAR) { *skew = (s3 / wn) / pow(s2 / wn, 1.5); *kurt = ((s4 / wn) / pow(s2 / wn, 2)) - 3.0; /* excess kurtosis */ } else { /* if variance is effectively zero, these should be undef'd */ *skew = *kurt = NADBL; } } return 0; } /** * gretl_sorted_series: * @v: ID number of input series. * @dset: dataset struct. * @opt: may include OPT_M to flag an error in case * missing values are found. * @n: on input, the minimum acceptable number of * non-missing observations; on output, the number * of valid observations. * @err: location to receive error code. * * Returns: an array containing the valid values of the * input series over the sample range given in @dset, * sorted from smallest to largest, or NULL on error. * An error is flagged if the number of valid observations * is less than that given in @n on input, or if OPT_M * is given and the input contains missing values. */ double *gretl_sorted_series (int v, const DATASET *dset, gretlopt opt, int *n, int *err) { double *y = NULL; int t, k = 0; if (*n < 1) { *n = 1; } for (t=dset->t1; t<=dset->t2; t++) { if (!na(dset->Z[v][t])) { k++; } else if (opt & OPT_M) { *err = E_MISSDATA; return NULL; } } if (k < *n) { gretl_errmsg_set(_("Insufficient data")); *err = E_DATA; return NULL; } y = malloc(k * sizeof *y); if (y == NULL) { *err = E_ALLOC; return NULL; } *n = k; k = 0; for (t=dset->t1; t<=dset->t2; t++) { if (!na(dset->Z[v][t])) { y[k++] = dset->Z[v][t]; } } qsort(y, k, sizeof *y, gretl_compare_doubles); return y; } /** * free_freq: * @freq: pointer to gretl frequency distribution struct * * Frees all resources associated with @freq, and the * pointer itself. */ void free_freq (FreqDist *freq) { if (freq == NULL) { return; } free(freq->midpt); free(freq->endpt); free(freq->f); if (freq->S != NULL) { strings_array_free(freq->S, freq->numbins); } free(freq); } static FreqDist *freq_new (const char *vname) { FreqDist *freq; freq = malloc(sizeof *freq); if (freq == NULL) return NULL; strcpy(freq->varname, vname); freq->midpt = NULL; freq->endpt = NULL; freq->S = NULL; freq->f = NULL; freq->dist = 0; freq->discrete = 0; freq->strvals = 0; freq->xbar = NADBL; freq->sdx = NADBL; freq->test = NADBL; return freq; } /* * dh_root_b1_to_z1: * @rb1: square root b1, skewness. * @n: number of observations. * * Performs the transformation from skewness, root b1, to the * normal score, z1, as set out in Doornik and Hansen, "An Omnibus * Test for Normality", 1994. The transformation is originally * due to D'Agostino (Biometrika, 1970). * * Returns: the z1 value. */ static double dh_root_b1_to_z1 (double rb1, double n) { double b, w2, d, y, z1; b = 3.0 * (n*n + 27*n - 70) * (n+1) * (n+3) / ((n-2) * (n+5) * (n+7) * (n+9)); w2 = -1.0 + sqrt(2 * (b-1)); d = 1.0 / sqrt(log(sqrt(w2))); y = rb1 * sqrt(((w2-1.0)/2.0) * ((n+1.0)*(n+3.0))/(6.0*(n-2))); z1 = d * log(y + sqrt(y*y + 1)); return z1; } /* * dh_b2_to_z2: * @b1: skewness. * @b2: kurtosis. * @n: number of observations. * * Performs the transformation from kurtosis, b2, to the * normal score, z2, as set out in Doornik and Hansen, "An Omnibus * Test for Normality", 1994. * * Returns: the z2 value, or #NADBL on failure. */ static double dh_b2_to_z2 (double b1, double b2, double n) { double d, a, c, k, alpha, chi, z2; double n2 = n * n; d = (n-3) * (n+1) * (n2 + 15*n - 4.0); a = ((n-2) * (n+5) * (n+7) * (n2 + 27*n - 70.0)) / (6.0 * d); c = ((n-7) * (n+5) * (n+7) * (n2 + 2*n - 5.0)) / (6.0 * d); k = ((n+5) * (n+7) * (n2*n + 37*n2 + 11*n - 313.0)) / (12.0 * d); alpha = a + b1 * c; z2 = (1.0 / (9.0 * alpha)) - 1.0; chi = (b2 - 1.0 - b1) * 2.0 * k; if (chi > 0.0) { z2 += pow(chi/(2*alpha), 1.0 / 3.0); } z2 *= sqrt(9.0*alpha); return z2; } /** * doornik_chisq: * @skew: skewness. * @xkurt: excess kurtosis. * @n: number of observations. * * Calculates the Chi-square test for normality as set out by * Doornik and Hansen, "An Omnibus Test for Normality", 1994. * This is a modified version of the test proposed by Bowman and * Shenton (Biometrika, 1975). * * Returns: the Chi-square value, which has 2 degrees of freedom. */ double doornik_chisq (double skew, double xkurt, int n) { double z1, z2, x2 = NADBL; z1 = dh_root_b1_to_z1(skew, (double) n); z2 = dh_b2_to_z2(skew * skew, xkurt + 3.0, (double) n); if (!na(z2)) { x2 = z1*z1 + z2*z2; } return x2; } static int series_get_moments (int t1, int t2, const double *x, double *skew, double *xkurt, int *pn) { double dev, s[4] = {0.0}; int t, n = 0; int err = 0; for (t=t1; t<=t2; t++) { if (!na(x[t])) { s[0] += x[t]; n++; } } *pn = n; if (n < 3) { return E_DATA; } s[0] /= n; for (t=t1; t<=t2; t++) { if (!na(x[t])) { dev = x[t] - s[0]; s[1] += dev * dev; s[2] += pow(dev, 3); s[3] += pow(dev, 4); } } s[1] /= n; if (s[1] > 0.0) { *skew = (s[2] / n) / pow(s[1], 1.5); *xkurt = ((s[3] / n) / pow(s[1], 2)) - 3.0; } else { /* if variance is effectively zero, these should be undef'd */ *skew = *xkurt = NADBL; err = 1; } return err; } static int get_moments (const gretl_matrix *M, int row, double *skew, double *kurt) { int j, n = gretl_matrix_cols(M); double dev, s[4] = {0.0}; int err = 0; for (j=0; j 0.0) { *skew = (s[2] / n) / pow(s[1], 1.5); *kurt = ((s[3] / n) / pow(s[1], 2)); } else { /* if variance is effectively zero, these should be undef'd */ *skew = *kurt = NADBL; err = 1; } return err; } int multivariate_normality_test (const gretl_matrix *E, const gretl_matrix *Sigma, gretlopt opt, PRN *prn) { gretl_matrix *S = NULL; gretl_matrix *V = NULL; gretl_matrix *C = NULL; gretl_matrix *X = NULL; gretl_matrix *R = NULL; gretl_matrix *evals = NULL; gretl_matrix *tmp = NULL; /* convenience pointers: do not free! */ gretl_matrix *H; gretl_vector *Z1; gretl_vector *Z2; double x, skew, kurt; double X2 = NADBL; int n, p; int i, j; int err = 0; if (E == NULL || Sigma == NULL) { err = E_DATA; goto bailout; } p = gretl_matrix_cols(E); n = gretl_matrix_rows(E); clear_gretl_matrix_err(); S = gretl_matrix_copy(Sigma); V = gretl_vector_alloc(p); C = gretl_matrix_alloc(p, p); X = gretl_matrix_copy_transpose(E); R = gretl_matrix_alloc(p, n); tmp = gretl_matrix_alloc(p, p); err = get_gretl_matrix_err(); if (err) { goto bailout; } for (i=0; ival[i]); } pputc(prn, '\n'); } /* C should now contain eigenvectors of the original C: relabel as 'H' for perspicuity */ H = C; #if 0 gretl_matrix_print_to_prn(H, "Eigenvectors, H", prn); #endif gretl_matrix_copy_values(tmp, H); /* make "tmp" into $H \Lambda^{-1/2}$ */ for (i=0; ival[j]); gretl_matrix_set(tmp, i, j, x); } } /* make S into $H \Lambda^{-1/2} H'$ */ gretl_matrix_multiply_mod(tmp, GRETL_MOD_NONE, H, GRETL_MOD_TRANSPOSE, S, GRETL_MOD_NONE); #if 1 gretl_matrix_demean_by_row(X); #endif /* compute VX', in X (don't need to subtract means, because these are OLS residuals) */ for (i=0; idiscrete) { freq->endpt = malloc((n + 1) * sizeof *freq->endpt); if (freq->endpt == NULL) { err = E_ALLOC; } } if (!err) { if (freq->strvals) { freq->S = strings_array_new(n); if (freq->S == NULL) { err = E_ALLOC; } } else { freq->midpt = malloc(n * sizeof *freq->midpt); if (freq->midpt == NULL) { err = E_ALLOC; } } } if (!err) { freq->f = malloc(n * sizeof *freq->f); if (freq->f == NULL) { err = E_ALLOC; } } if (!err) { freq->numbins = n; } return err; } int freq_setup (int v, const DATASET *dset, int *pn, double *pxmax, double *pxmin, int *nbins, double *binwidth) { const double *x = dset->Z[v]; double xrange, xmin = 0, xmax = 0; int t, k = 0, n = 0; for (t=dset->t1; t<=dset->t2; t++) { if (!na(x[t])) { if (n == 0) { xmax = xmin = x[t]; } else { if (x[t] > xmax) xmax = x[t]; if (x[t] < xmin) xmin = x[t]; } n++; } } if (n < 8) { gretl_errmsg_sprintf(_("Insufficient data to build frequency " "distribution for variable %s"), dset->varname[v]); return E_TOOFEW; } xrange = xmax - xmin; if (xrange == 0) { gretl_errmsg_sprintf(_("%s is a constant"), dset->varname[v]); return E_DATA; } if (*nbins > 0) { k = *nbins; } else if (n < 16) { k = 5; } else if (n < 50) { k = 7; } else if (n > 850) { k = 29; } else { k = (int) sqrt((double) n); if (k > 99) { k = 99; } } if (k % 2 == 0) { k++; } *pn = n; *pxmax = xmax; *pxmin = xmin; *nbins = k; *binwidth = xrange / (k - 1); return 0; } /* calculate test stat for distribution, if the sample is big enough */ static void freq_dist_stat (FreqDist *freq, const double *x, gretlopt opt, int k) { double skew, kurt; freq->test = NADBL; freq->dist = 0; gretl_moments(freq->t1, freq->t2, x, NULL, &freq->xbar, &freq->sdx, &skew, &kurt, k); if (freq->n > 7) { if (opt & OPT_O) { freq->test = lockes_test(x, freq->t1, freq->t2); freq->dist = D_GAMMA; } else if (opt & OPT_Z) { freq->test = doornik_chisq(skew, kurt, freq->n); freq->dist = D_NORMAL; } } } struct strval_sorter { char *s; int n; }; int compare_strvals (const void *a, const void *b) { const struct strval_sorter *sa = a; const struct strval_sorter *sb = b; return strcmp(sa->s, sb->s); } FreqDist *get_string_freq (int v, const DATASET *dset, int *err) { FreqDist *freq; struct strval_sorter *ss; const double *x = dset->Z[v]; series_table *st; char **S; int i, t, n, ns; freq = freq_new(dset->varname[v]); if (freq == NULL) { *err = E_ALLOC; return NULL; } st = series_get_string_table(dset, v); S = series_table_get_strings(st, &ns); ss = malloc(ns * sizeof *ss); if (ss == NULL) { *err = E_ALLOC; free(freq); return NULL; } for (i=0; it1; t<=dset->t2; t++) { if (!na(x[t])) { i = x[t] - 1; ss[i].n += 1; n++; } } qsort(ss, ns, sizeof *ss, compare_strvals); freq->t1 = dset->t1; freq->t2 = dset->t2; freq->n = n; freq->discrete = 1; freq->strvals = 1; if (freq_add_arrays(freq, ns)) { *err = E_ALLOC; } else { for (i=0; iS[i] = gretl_strdup(ss[i].s); freq->f[i] = ss[i].n; } } free(ss); if (*err && freq != NULL) { free_freq(freq); freq = NULL; } return freq; } FreqDist *get_discrete_freq (int v, const DATASET *dset, gretlopt opt, int *err) { FreqDist *freq; const double *x = dset->Z[v]; int *ifreq = NULL; double *ivals = NULL; double *sorted = NULL; double last; int i, t, nv; freq = freq_new(dset->varname[v]); if (freq == NULL) { *err = E_ALLOC; return NULL; } freq->t1 = dset->t1; freq->t2 = dset->t2; freq->n = 0; for (t=freq->t1; t<=freq->t2; t++) { if (!na(x[t])) { freq->n += 1; } } if (freq->n < 3) { gretl_errmsg_sprintf(_("Insufficient data to build frequency " "distribution for variable %s"), dset->varname[v]); *err = E_TOOFEW; goto bailout; } freq->discrete = 1; freq->test = NADBL; freq->dist = 0; sorted = malloc(freq->n * sizeof *sorted); if (sorted == NULL) { *err = E_ALLOC; goto bailout; } i = 0; for (t=freq->t1; t<=freq->t2; t++) { if (!na(x[t])) { sorted[i++] = x[t]; } } qsort(sorted, freq->n, sizeof *sorted, gretl_compare_doubles); nv = count_distinct_values(sorted, freq->n); if (nv >= 10 && !(opt & OPT_X)) { freq_dist_stat(freq, x, opt, 1); } else if (opt & (OPT_Z | OPT_O)) { freq->xbar = gretl_mean(freq->t1, freq->t2, x); freq->sdx = gretl_stddev(freq->t1, freq->t2, x); } ifreq = malloc(nv * sizeof *ifreq); ivals = malloc(nv * sizeof *ivals); if (ifreq == NULL || ivals == NULL) { *err = E_ALLOC; goto bailout; } ivals[0] = last = sorted[0]; ifreq[0] = i = 1; for (t=1; tn; t++) { if (sorted[t] != last) { last = sorted[t]; ifreq[i] = 1; ivals[i++] = last; } else { ifreq[i-1] += 1; } } if (freq_add_arrays(freq, nv)) { *err = E_ALLOC; } else { int allints = 1; for (i=0; imidpt[i] = ivals[i]; freq->f[i] = ifreq[i]; } if (allints) { freq->discrete = 2; } } bailout: free(sorted); free(ivals); free(ifreq); if (*err && freq != NULL) { free_freq(freq); freq = NULL; } return freq; } /** * get_freq: * @varno: ID number of variable to process. * @dset: dataset struct. * @fmin: lower limit of left-most bin (or #NADBL for automatic). * @fwid: bin width (or #NADBL for automatic). * @nbins: number of bins to use (or 0 for automatic). * @params: degrees of freedom loss (generally = 1 unless we're dealing * with the residual from a regression). * @opt: if includes %OPT_Z, set up for comparison with normal dist; * if includes %OPT_O, compare with gamma distribution; * if includes %OPT_S we're not printing results; if includes %OPT_D, * treat the variable as discrete; %OPT_X indicates that this function * is called as part of a cross-tabulation; if includes %OPT_G, * add the arrays that are needed for a plot even when we're not * printing (%OPT_S). * @err: location to receive error code. * * Calculates the frequency distribution for the specified variable. * * Returns: pointer to struct containing the distribution. */ FreqDist *get_freq (int varno, const DATASET *dset, double fmin, double fwid, int nbins, int params, gretlopt opt, int *err) { FreqDist *freq; const double *x; double xx, xmin, xmax; double binwidth = fwid; int t, k, n; if (is_string_valued(dset, varno)) { return get_string_freq(varno, dset, err); } if (series_is_discrete(dset, varno) || (opt & OPT_D)) { return get_discrete_freq(varno, dset, opt, err); } if (gretl_isdiscrete(dset->t1, dset->t2, dset->Z[varno]) > 1) { return get_discrete_freq(varno, dset, opt, err); } freq = freq_new(dset->varname[varno]); if (freq == NULL) { *err = E_ALLOC; return NULL; } *err = freq_setup(varno, dset, &n, &xmax, &xmin, &nbins, &binwidth); if (*err) { goto bailout; } if (!na(fmin) && !na(fwid)) { /* endogenous implied number of bins */ nbins = (int) ceil((xmax - fmin) / fwid); if (nbins <= 0 || nbins > 5000) { *err = E_INVARG; goto bailout; } else { binwidth = fwid; } } freq->t1 = dset->t1; freq->t2 = dset->t2; freq->n = n; x = dset->Z[varno]; freq_dist_stat(freq, x, opt, params); if (freq_add_arrays(freq, nbins)) { *err = E_ALLOC; goto bailout; } if (na(fmin) || na(fwid)) { freq->endpt[0] = xmin - .5 * binwidth; if (xmin > 0.0 && freq->endpt[0] < 0.0) { double rshift; freq->endpt[0] = 0.0; rshift = 1.0 - xmin / binwidth; freq->endpt[freq->numbins] = xmax + rshift * binwidth; } else { freq->endpt[freq->numbins] = xmax + .5 * binwidth; } } else { freq->endpt[0] = fmin; freq->endpt[freq->numbins] = fmin + nbins * fwid; } for (k=0; knumbins; k++) { freq->f[k] = 0; if (k > 0) { freq->endpt[k] = freq->endpt[k-1] + binwidth; } freq->midpt[k] = freq->endpt[k] + .5 * binwidth; } for (t=dset->t1; t<=dset->t2; t++) { xx = x[t]; if (na(xx)) { continue; } if (xx < freq->endpt[1]) { freq->f[0] += 1; } else if (xx >= freq->endpt[freq->numbins]) { freq->f[freq->numbins - 1] += 1; continue; } else { for (k=1; knumbins; k++) { if (freq->endpt[k] <= xx && xx < freq->endpt[k+1]) { freq->f[k] += 1; break; } } } } bailout: if (*err && freq != NULL) { free_freq(freq); freq = NULL; } return freq; } static void record_freq_test (const FreqDist *freq) { double pval = NADBL; if (freq->dist == D_NORMAL) { pval = chisq_cdf_comp(2, freq->test); } else if (freq->dist == D_GAMMA) { pval = normal_pvalue_2(freq->test); } if (!na(pval)) { record_test_result(freq->test, pval); } } static int check_freq_opts (gretlopt opt, int *n_bins, double *fmin, double *fwid) { double x; int err = 0; if (opt & OPT_N) { /* number of bins given */ if (opt & (OPT_M | OPT_W)) { /* can't give min, width spec */ return E_BADOPT; } else { int n = get_optval_int(FREQ, OPT_N, &err); if (!err) { if (n < 2 || n > 10000) { err = E_INVARG; } else { *n_bins = n; } } if (err) { return err; } } } if (opt & OPT_M) { /* minimum specified */ if (!(opt & OPT_W)) { /* but no width given */ return E_ARGS; } else { x = get_optval_double(FREQ, OPT_M, &err); if (err) { return err; } else if (na(x)) { return E_ARGS; } else { *fmin = x; } } } if (opt & OPT_W) { /* width specified */ if (!(opt & OPT_M)) { /* but no min given */ return E_ARGS; } else { x = get_optval_double(FREQ, OPT_W, &err); if (err) { return err; } else if (na(x)) { return E_ARGS; } else if (x <= 0) { return E_INVARG; } else { *fwid = x; } } } return 0; } static void record_freq_matrix (FreqDist *fd) { gretl_matrix *m = NULL; char **Sr = NULL; int i, n = fd->numbins; if (fd->S != NULL) { m = gretl_matrix_alloc(n, 1); Sr = strings_array_dup(fd->S, n); } else { m = gretl_matrix_alloc(n, 2); } if (m != NULL) { char **Sc = NULL; if (fd->S == NULL) { Sc = strings_array_new(2); if (Sc != NULL) { Sc[0] = gretl_strdup("midpoint"); Sc[1] = gretl_strdup("count"); } } for (i=0; iS != NULL) { gretl_matrix_set(m, i, 0, fd->f[i]); } else { gretl_matrix_set(m, i, 0, fd->midpt[i]); gretl_matrix_set(m, i, 1, fd->f[i]); } } if (Sc != NULL) { gretl_matrix_set_colnames(m, Sc); } if (Sr != NULL) { gretl_matrix_set_rownames(m, Sr); } set_last_result_data(m, GRETL_TYPE_MATRIX); } } /* Wrapper function: get the distribution, print it if wanted, graph it if wanted, then free stuff. */ int freqdist (int varno, const DATASET *dset, gretlopt opt, PRN *prn) { FreqDist *freq = NULL; DistCode dist = D_NONE; PlotType ptype = 0; double fmin = NADBL; double fwid = NADBL; int n_bins = 0; int do_graph = 0; int err; if (opt & OPT_O) { dist = D_GAMMA; ptype = PLOT_FREQ_GAMMA; } else if (opt & OPT_Z) { dist = D_NORMAL; ptype = PLOT_FREQ_NORMAL; } else { ptype = PLOT_FREQ_SIMPLE; } if (!(opt & OPT_Q)) { if (opt & OPT_G) { /* it's a legacy thing */ opt |= OPT_U; opt &= ~OPT_G; set_optval_string(FREQ, OPT_U, "display"); } do_graph = gnuplot_graph_wanted(ptype, opt); } err = check_freq_opts(opt, &n_bins, &fmin, &fwid); if (!err) { gretlopt fopt = opt; if (do_graph) { fopt |= OPT_G; } freq = get_freq(varno, dset, fmin, fwid, n_bins, 1, fopt, &err); } if (!err) { if (!(opt & OPT_S)) { print_freq(freq, varno, dset, prn); } else if (dist) { record_freq_test(freq); } record_freq_matrix(freq); if (do_graph && freq->numbins < 2) { do_graph = 0; } if (do_graph) { int gerr = plot_freq(freq, dist, opt); if (gerr) { pputs(prn, _("gnuplot command failed\n")); do_graph = 0; } } free_freq(freq); } return err; } /* Wrapper function: get the frequency distribution and write it into a gretl_matrix: the first column holds the mid-points of the bins and the second holds the frequencies. */ gretl_matrix *freqdist_matrix (const double *x, int t1, int t2, int *err) { DATASET *dset = NULL; FreqDist *freq = NULL; gretl_matrix *m = NULL; int i, t, T = t2 - t1 + 1; dset = create_auxiliary_dataset(1, T, 0); if (dset == NULL) { *err = E_ALLOC; } if (!*err) { i = 0; for (t=t1; t<=t2; t++) { dset->Z[0][i++] = x[t]; } freq = get_freq(0, dset, NADBL, NADBL, 0, 1, OPT_NONE, err); } if (!*err) { m = gretl_matrix_alloc(freq->numbins, 2); if (m == NULL) { *err = E_ALLOC; } } if (!*err) { for (i=0; inumbins; i++) { gretl_matrix_set(m, i, 0, freq->midpt[i]); gretl_matrix_set(m, i, 1, freq->f[i]); } } destroy_dataset(dset); free_freq(freq); return m; } gretl_matrix *xtab_to_matrix (const Xtab *tab) { gretl_matrix *m; double x; int i, j; if (tab == NULL) { return NULL; } m = gretl_matrix_alloc(tab->rows, tab->cols); if (m == NULL) { return NULL; } for (j=0; jcols; j++) { for (i=0; irows; i++) { x = (double) tab->f[i][j]; gretl_matrix_set(m, i, j, x); } } return m; } static void *last_result; static GretlType last_result_type; void *get_last_result_data (GretlType *type, int *err) { void *ret = NULL; if (last_result == NULL) { *type = GRETL_TYPE_NONE; *err = E_BADSTAT; } else { *type = last_result_type; if (*type == GRETL_TYPE_MATRIX) { ret = gretl_matrix_copy(last_result); } else { ret = gretl_bundle_copy(last_result, err); } } return ret; } void set_last_result_data (void *data, GretlType type) { if (last_result != NULL) { if (last_result_type == GRETL_TYPE_MATRIX) { gretl_matrix_free(last_result); } else if (last_result_type == GRETL_TYPE_BUNDLE) { gretl_bundle_destroy(last_result); } } last_result = data; last_result_type = type; } void last_result_cleanup (void) { if (last_result != NULL) { if (last_result_type == GRETL_TYPE_MATRIX) { gretl_matrix_free(last_result); } else if (last_result_type == GRETL_TYPE_BUNDLE) { gretl_bundle_destroy(last_result); } } last_result = NULL; last_result_type = 0; } /** * free_xtab: * @tab: pointer to gretl crosstab struct. * * Frees all resources associated with @tab, and the * pointer itself. */ void free_xtab (Xtab *tab) { int i; if (tab == NULL) { return; } free(tab->rtotal); free(tab->ctotal); free(tab->rval); free(tab->cval); if (tab->f != NULL) { for (i=0; irows; i++) { free(tab->f[i]); } free(tab->f); } if (tab->Sr != NULL) { strings_array_free(tab->Sr, tab->rows); } if (tab->Sc != NULL) { strings_array_free(tab->Sc, tab->cols); } free(tab); } static Xtab *xtab_new (int n, int t1, int t2) { Xtab *tab = malloc(sizeof *tab); if (tab == NULL) return NULL; tab->rtotal = NULL; tab->ctotal = NULL; tab->rval = NULL; tab->cval = NULL; tab->f = NULL; tab->n = n; tab->t1 = t1; tab->t2 = t2; tab->missing = 0; *tab->rvarname = '\0'; *tab->cvarname = '\0'; tab->Sr = NULL; tab->Sc = NULL; tab->rstrs = 0; tab->cstrs = 0; return tab; } /* allocate the required arrays (apart from S, which must be handled separately) and initialize counters to zero */ static int xtab_allocate_arrays (Xtab *tab) { int i, j, err = 0; if (!tab->rstrs && tab->rval == NULL) { tab->rval = malloc(tab->rows * sizeof *tab->rval); if (tab->rval == NULL) { err = E_ALLOC; } } if (!err && !tab->cstrs && tab->cval == NULL) { tab->cval = malloc(tab->cols * sizeof *tab->cval); if (tab->cval == NULL) { err = E_ALLOC; } } if (!err) { tab->rtotal = malloc(tab->rows * sizeof *tab->rtotal); tab->ctotal = malloc(tab->cols * sizeof *tab->ctotal); if (tab->rtotal == NULL || tab->ctotal == NULL) { err = E_ALLOC; } else { for (i=0; irows; i++) { tab->rtotal[i] = 0; } for (j=0; jcols; j++) { tab->ctotal[j] = 0; } } } if (!err) { tab->f = malloc(tab->rows * sizeof *tab->f); if (tab->f == NULL) { err = E_ALLOC; } else { for (i=0; irows; i++) { tab->f[i] = NULL; } } } for (i=0; irows && !err; i++) { tab->f[i] = malloc(tab->cols * sizeof *tab->f[i]); if (tab->f[i] == NULL) { err = E_ALLOC; } else { for (j=0; jcols; j++) { tab->f[i][j] = 0; } } } return err; } /* also used in gretl_matrix.c */ int compare_xtab_rows (const void *a, const void *b) { const double **da = (const double **) a; const double **db = (const double **) b; int ret; ret = da[0][0] - db[0][0]; if (ret == 0) { ret = da[0][1] - db[0][1]; } return ret; } static int xtab_get_data (Xtab *tab, int v, int j, const DATASET *dset, series_table **pst) { double **xtarg = (j == 1)? &tab->cval : &tab->rval; char ***Starg = (j == 1)? &tab->Sc : &tab->Sr; int *itarg = (j == 1)? &tab->cols : &tab->rows; int *ttarg = (j == 1)? &tab->cstrs : &tab->rstrs; double *x = dset->Z[v] + dset->t1; int n = sample_size(dset); gretl_matrix *u; int err = 0; u = gretl_matrix_values(x, n, OPT_S, &err); if (!err && is_string_valued(dset, v)) { series_table *st; int ns, nv = u->rows; char **S0, **S = NULL; *pst = st = series_get_string_table(dset, v); S0 = series_table_get_strings(st, &ns); if (ns > nv) { int i, k; S = strings_array_new(nv); if (S != NULL) { for (i=0; ival[i] - 1; S[i] = gretl_strdup(S0[k]); } } } else { S = strings_array_dup(S0, ns); } if (S == NULL) { err = E_ALLOC; } else { *ttarg = 1; *itarg = nv; *Starg = S; strings_array_sort(Starg, &nv, OPT_NONE); } } else if (!err) { *itarg = u->rows; *xtarg = u->val; u->val = NULL; } gretl_matrix_free(u); return err; } static int xtab_row_match (Xtab *tab, int i, const char *s, double x) { if (tab->rstrs) { return strcmp(s, tab->Sr[i]) == 0; } else { return x == tab->rval[i]; } } static int xtab_col_match (Xtab *tab, int j, const char *s, double x) { if (tab->cstrs) { return strcmp(s, tab->Sc[j]) == 0; } else { return x == tab->cval[j]; } } #define complete_obs(x,y,t) (!na(x[t]) && !na(y[t])) /* crosstab struct creation functions */ static Xtab *get_new_xtab (int rv, int cv, const DATASET *dset, int *err) { series_table *sti = NULL; series_table *stj = NULL; const char *s1 = NULL; const char *s2 = NULL; double x1 = 0, x2 = 0; Xtab *tab = NULL; int imatch, jmatch; int i, j, t, n = 0; for (t=dset->t1; t<=dset->t2; t++) { if (complete_obs(dset->Z[rv], dset->Z[cv], t)) { n++; } } if (n == 0) { *err = E_MISSDATA; } else { tab = xtab_new(n, dset->t1, dset->t2); if (tab == NULL) { *err = E_ALLOC; } } if (!*err) { /* assemble row data */ *err = xtab_get_data(tab, rv, 0, dset, &sti); } if (!*err) { /* assemble column data */ *err = xtab_get_data(tab, cv, 1, dset, &stj); } if (!*err) { *err = xtab_allocate_arrays(tab); } if (*err) goto bailout; tab->missing = (dset->t2 - dset->t1 + 1) - n; strcpy(tab->rvarname, dset->varname[rv]); strcpy(tab->cvarname, dset->varname[cv]); /* The following could be made more efficient by substituting sorted arrays for dset->Z[rv] and dset->Z[cv] but I'm not sure if the fixed cost would be recouped. */ for (t=dset->t1; t<=dset->t2; t++) { if (!complete_obs(dset->Z[rv], dset->Z[cv], t)) { continue; } x1 = dset->Z[rv][t]; if (tab->rstrs) { s1 = series_table_get_string(sti, x1); } x2 = dset->Z[cv][t]; if (tab->cstrs) { s2 = series_table_get_string(stj, x2); } for (i=0; irows; i++) { imatch = xtab_row_match(tab, i, s1, x1); if (imatch) { jmatch = 0; for (j=0; jcols && !jmatch; j++) { jmatch = xtab_col_match(tab, j, s2, x2); if (jmatch) { tab->f[i][j] += 1; tab->rtotal[i] += 1; tab->ctotal[j] += 1; } } break; } } } bailout: if (*err) { free_xtab(tab); tab = NULL; } return tab; } static void record_xtab (const Xtab *tab, const DATASET *dset, gretlopt opt) { gretl_matrix *X = NULL; char **Sc = NULL, **Sr = NULL; double xij, cj, ri; int totals, rows, cols; int i, j; totals = (opt & OPT_N)? 0 : 1; rows = tab->rows + totals; cols = tab->cols + totals; X = gretl_zero_matrix_new(rows, cols); if (X == NULL) { return; } /* column labels */ Sc = strings_array_new(cols); if (Sc != NULL) { for (j=0; jcols; j++) { if (tab->cstrs) { Sc[j] = gretl_strdup(tab->Sc[j]); } else { cj = tab->cval[j]; Sc[j] = gretl_strdup_printf("%4g", cj); } } if (totals) { Sc[cols-1] = gretl_strdup("TOTAL"); } } /* row labels */ Sr = strings_array_new(rows); if (Sr != NULL) { for (i=0; irows; i++) { if (tab->rstrs) { Sr[i] = gretl_strdup(tab->Sr[i]); } else { ri = tab->rval[i]; Sr[i] = gretl_strdup_printf("%4g", ri); } } if (totals) { Sr[rows-1] = gretl_strdup("TOTAL"); } } /* body of table */ for (i=0; irows; i++) { if (tab->rtotal[i] > 0) { /* row counts */ for (j=0; jcols; j++) { if (tab->ctotal[j] > 0) { if (opt & (OPT_C | OPT_R)) { if (opt & OPT_C) { xij = 100.0 * tab->f[i][j] / tab->ctotal[j]; } else { xij = 100.0 * tab->f[i][j] / tab->rtotal[i]; } } else { xij = tab->f[i][j]; } gretl_matrix_set(X, i, j, xij); } } if (totals) { /* row totals */ if (opt & OPT_C) { xij = 100.0 * tab->rtotal[i] / tab->n; } else { xij = tab->rtotal[i]; } gretl_matrix_set(X, i, cols-1, xij); } } } if (totals) { /* column totals */ for (j=0; jcols; j++) { if (opt & OPT_R) { xij = 100.0 * tab->ctotal[j] / tab->n; } else { xij = tab->ctotal[j]; } gretl_matrix_set(X, rows-1, j, xij); } gretl_matrix_set(X, rows-1, cols-1, tab->n); } gretl_matrix_set_colnames(X, Sc); gretl_matrix_set_rownames(X, Sr); set_last_result_data(X, GRETL_TYPE_MATRIX); } /* For use in the context of "xtab" with --quiet option: compute and record the Pearson chi-square value and its p-value. */ static int xtab_do_pearson (const Xtab *tab) { double x, y, ymin = 1.0e-7; double pearson = 0.0; int i, j, err = 0; for (i=0; irows && !err; i++) { if (tab->rtotal[i] > 0) { for (j=0; jcols && !err; j++) { y = ((double) tab->rtotal[i] * tab->ctotal[j]) / tab->n; if (y < ymin) { err = E_DATA; } else { x = (double) tab->f[i][j] - y; pearson += x * x / y; } } } } if (!err) { int df = (tab->rows - 1) * (tab->cols - 1); double pval = chisq_cdf_comp(df, pearson); if (!na(pval)) { record_test_result(pearson, pval); } else { err = E_DATA; } } if (err) { record_test_result(NADBL, NADBL); } return err; } int crosstab_from_matrix (gretlopt opt, PRN *prn) { const char *mname; const gretl_matrix *m; Xtab *tab; int i, j, nvals, n = 0; double x; int err = 0; mname = get_optval_string(XTAB, OPT_X); if (mname == NULL) { return E_DATA; } m = get_matrix_by_name(mname); if (m == NULL) { return E_UNKVAR; } if (m->rows < 2 || m->cols < 2) { err = E_DATA; } nvals = m->rows * m->cols; for (i=0; ival[i]; if (x < 0 || x != floor(x) || x > INT_MAX) { err = E_DATA; } n += x; } if (err) { gretl_errmsg_sprintf(_("Matrix %s does not represent a " "contingency table"), mname); return err; } tab = xtab_new(n, 0, 0); if (tab == NULL) { return E_ALLOC; } tab->rows = m->rows; tab->cols = m->cols; if (xtab_allocate_arrays(tab)) { free_xtab(tab); return E_ALLOC; } for (i=0; irows; i++) { tab->rval[i] = i + 1; tab->rtotal[i] = 0.0; for (j=0; jcols; j++) { tab->f[i][j] = gretl_matrix_get(m, i, j); tab->rtotal[i] += tab->f[i][j]; } } for (j=0; jcols; j++) { tab->cval[j] = j + 1; tab->ctotal[j] = 0.0; for (i=0; irows; i++) { tab->ctotal[j] += tab->f[i][j]; } } if (opt & OPT_Q) { xtab_do_pearson(tab); } else { print_xtab(tab, NULL, opt | OPT_S, prn); } free_xtab(tab); return err; } int crosstab (const int *list, const DATASET *dset, gretlopt opt, PRN *prn) { Xtab *tab; int *rowvar = NULL; int *colvar = NULL; int i, j, vi, vj, k; int pos, onelist; int nrv, ncv; int err = 0; pos = gretl_list_separator_position(list); onelist = (pos == 0); if (pos == 0) { /* single list case */ nrv = list[0]; ncv = nrv - 1; } else { /* double list case */ nrv = pos - 1; ncv = list[0] - pos; } if (nrv == 0 || ncv == 0) { return E_PARSE; } rowvar = gretl_list_new(nrv); if (rowvar == NULL) { return E_ALLOC; } j = 1; for (i=1; i<=nrv; i++) { k = list[i]; if (accept_as_discrete(dset, k, 0)) { rowvar[j++] = k; } else { pprintf(prn, _("dropping %s: not a discrete variable\n"), dset->varname[k]); rowvar[0] -= 1; } } if (rowvar[0] == 0 || (onelist && rowvar[0] == 1)) { gretl_errmsg_set("xtab: variables must be discrete"); free(rowvar); return E_TYPES; } if (onelist && rowvar[0] == 2) { /* the bivariate case */ tab = get_new_xtab(rowvar[1], rowvar[2], dset, &err); if (!err) { /* make $result matrix available */ record_xtab(tab, dset, opt); if (opt & OPT_Q) { /* quiet: run and record the Pearson test */ xtab_do_pearson(tab); } else { /* print, and record Pearson test */ print_xtab(tab, dset, opt | OPT_S, prn); } free_xtab(tab); } goto finish; } if (!onelist) { /* construct the second list */ colvar = gretl_list_new(ncv); if (colvar == NULL) { err = E_ALLOC; } else { j = 1; for (i=1; i<=ncv; i++) { k = list[pos+i]; if (accept_as_discrete(dset, k, 0)) { colvar[j++] = k; } else { colvar[0] -= 1; } } if (colvar[0] == 0) { err = E_TYPES; } } } for (i=1; i<=rowvar[0] && !err; i++) { vi = rowvar[i]; if (onelist) { /* single list case */ for (j=1; jdist == D_NORMAL) { pval = chisq_cdf_comp(2, freq->test); } else if (freq->dist == D_GAMMA) { pval = normal_pvalue_2(freq->test); } if (na(pval)) { return E_NAN; } else { record_test_result(freq->test, pval); return 0; } } int model_error_dist (const MODEL *pmod, DATASET *dset, gretlopt opt, PRN *prn) { FreqDist *freq = NULL; gretlopt fopt = OPT_Z; /* show normality test */ int save_t1 = dset->t1; int save_t2 = dset->t2; int err = 0; if (pmod == NULL || pmod->uhat == NULL) { return E_DATA; } err = gretl_model_get_normality_test(pmod, prn); if (!err) { return 0; } else if (LIMDEP(pmod->ci)) { return err; } else { err = 0; } if (exact_fit_check(pmod, prn)) { return 0; } if (genr_fit_resid(pmod, dset, M_UHAT)) { return E_ALLOC; } if (!err) { dset->t1 = pmod->t1; dset->t2 = pmod->t2; freq = get_freq(dset->v - 1, dset, NADBL, NADBL, 0, pmod->ncoeff, fopt, &err); } if (!err) { if (opt & OPT_I) { err = just_record_freq_test(freq); } else if (opt & OPT_Q) { print_freq_test(freq, prn); } else { print_freq(freq, 0, NULL, prn); } free_freq(freq); } dataset_drop_last_variables(dset, 1); dset->t1 = save_t1; dset->t2 = save_t2; return err; } /* PACF via Durbin-Levinson algorithm */ static int get_pacf (gretl_matrix *A) { int m = gretl_matrix_rows(A); gretl_matrix *phi; double *acf = A->val; double *pacf = acf + m; double x, num, den; int i, j; phi = gretl_matrix_alloc(m, m); if (phi == NULL) { return E_ALLOC; } gretl_matrix_set(A, 0, 1, acf[0]); gretl_matrix_set(phi, 0, 0, acf[0]); for (i=1; i T / 5) { /* restrict to 20 percent of data (Tadeusz) */ p = T / 5; } return p; } /** * gretl_acf: * @k: lag order. * @t1: starting observation. * @t2: ending observation. * @y: data series. * @ybar: mean of @y over range @t1 to @t2. * * Returns: the autocorrelation at lag @k for the series @y over * the range @t1 to @t2, or #NADBL on failure. */ static double gretl_acf (int k, int t1, int t2, const double *y, double ybar) { double z, num = 0, den = 0; int t, n = 0; for (t=t1; t<=t2; t++) { if (na(y[t])) { return NADBL; } z = y[t] - ybar; den += z * z; if (t - k >= t1) { if (na(y[t-k])) { return NADBL; } num += z * (y[t-k] - ybar); n++; } } if (n == 0) { return NADBL; } return num / den; } /** * ljung_box: * @m: maximum lag. * @t1: starting observation. * @t2: ending observation. * @y: data series. * @err: location to receive error code. * * Returns: the Ljung-Box statistic for lag order @m for * the series @y over the sample @t1 to @t2, or #NADBL * on failure. */ double ljung_box (int m, int t1, int t2, const double *y, int *err) { double acf, ybar = 0.0, LB = 0.0; int k, n = t2 - t1 + 1; *err = 0; if (n == 0 || gretl_isconst(t1, t2, y)) { *err = E_DATA; } else if (m <= 0) { gretl_errmsg_sprintf(_("Invalid lag order %d"), m); *err = E_DATA; } else { ybar = gretl_mean(t1, t2, y); if (na(ybar)) { *err = E_DATA; } } if (*err) { return NADBL; } /* calculate acf up to lag m, cumulating LB */ for (k=1; k<=m; k++) { acf = gretl_acf(k, t1, t2, y, ybar); if (na(acf)) { *err = E_MISSDATA; break; } LB += acf * acf / (n - k); } if (*err) { LB = NADBL; } else { LB *= n * (n + 2.0); } return LB; } /** * gretl_xcf: * @k: lag order (or lead order if < 0). * @t1: starting observation. * @t2: ending observation. * @x: first data series. * @y: second data series. * @xbar: mean of first series. * @ybar: mean of second series. * * Returns: the cross-correlation at lag (or lead) @k for the * series @x and @y over the range @t1 to @t2, or #NADBL on failure. */ static double gretl_xcf (int k, int t1, int t2, const double *x, const double *y, double xbar, double ybar) { double num = 0, den1 = 0, den2 = 0; double zx, zy; int t; for (t=t1; t<=t2; t++) { if (na(x[t]) || na(y[t])) { return NADBL; } zx = x[t] - xbar; zy = y[t] - ybar; den1 += zx * zx; den2 += zy * zy; if ((k >= 0 && t - k >= t1) || (k < 0 && t - k <= t2)) { num += zx * (y[t-k] - ybar); } } return num / sqrt(den1 * den2); } static int corrgm_ascii_plot (const char *vname, const gretl_matrix *A, PRN *prn) { int k, m = gretl_matrix_rows(A); double *xk = malloc(m * sizeof *xk); if (xk == NULL) { return E_ALLOC; } for (k=0; kval, xk, m, vname, _("lag"), prn); free(xk); return 0; } static void handle_corrgm_plot_options (int ci, gretlopt opt, int *ascii, int *gp) { if (opt & OPT_Q) { /* backward compat: --quiet = no plot */ *ascii = *gp = 0; return; } if (opt & OPT_U) { /* --plot=whatever */ const char *s = get_optval_string(ci, OPT_U); if (s != NULL) { if (!strcmp(s, "none")) { /* no plot */ *ascii = *gp = 0; return; } else if (!strcmp(s, "ascii")) { *ascii = 1; *gp = 0; return; } else { /* implies use of gnuplot */ *gp = 1; return; } } } /* the defaults */ if (gretl_in_batch_mode()) { *ascii = 1; } else { *gp = 1; } } static void do_acf_bartlett (gretl_matrix *PM, int i, int T, double ssr, const double *z) { double b; int j; for (j=0; jcols; j++) { b = z[j] * sqrt((1.0/T) * (1 + 2*ssr)); gretl_matrix_set(PM, i, j, b); } } /** * corrgram: * @varno: ID number of variable to process. * @order: integer order for autocorrelation function. * @nparam: number of estimated parameters (e.g. for the * case of ARMA), used to correct the degrees of freedom * for Q test. * @dset: dataset struct. * @opt: if includes OPT_R, variable in question is a model * residual generated "on the fly"; OPT_U can be used to * specify a plot option. * @prn: gretl printing struct. * * Computes the autocorrelation function and plots the correlogram for * the variable specified by @varno. * * Returns: 0 on successful completion, error code on error. */ int corrgram (int varno, int order, int nparam, DATASET *dset, gretlopt opt, PRN *prn) { const double z[] = {1.65, 1.96, 2.58}; gretl_matrix *PM = NULL; gretl_matrix *A = NULL; double ybar, ssr, lbox; double pval = NADBL; double pm[3]; double *acf, *pacf; const char *vname; int i, k, m, T, dfQ; int ascii_plot = 0; int use_gnuplot = 0; int t1 = dset->t1, t2 = dset->t2; int err = 0, pacf_err = 0; gretl_error_clear(); if (order < 0) { gretl_errmsg_sprintf(_("Invalid lag order %d"), order); return E_DATA; } err = series_adjust_sample(dset->Z[varno], &t1, &t2); if (err) { return err; } if ((T = t2 - t1 + 1) < 4) { return E_TOOFEW; } if (gretl_isconst(t1, t2, dset->Z[varno])) { gretl_errmsg_sprintf(_("%s is a constant"), dset->varname[varno]); return E_DATA; } ybar = gretl_mean(t1, t2, dset->Z[varno]); if (na(ybar)) { return E_DATA; } vname = series_get_graph_name(dset, varno); /* lag order for acf */ m = order; if (m == 0) { m = auto_acf_order(T); } else if (m > T - dset->pd) { int mmax = T - 1; if (m > mmax) { m = mmax; } } A = gretl_matrix_alloc(m, 2); if (A == NULL) { err = E_ALLOC; goto bailout; } if (opt & OPT_B) { PM = gretl_matrix_alloc(m, 3); if (PM == NULL) { err = E_ALLOC; goto bailout; } } acf = A->val; pacf = acf + m; /* calculate acf up to order acf_m */ for (k=1; k<=m; k++) { acf[k-1] = gretl_acf(k, t1, t2, dset->Z[varno], ybar); } /* graphing? */ handle_corrgm_plot_options(CORRGM, opt, &ascii_plot, &use_gnuplot); if (ascii_plot) { corrgm_ascii_plot(vname, A, prn); } if (opt & OPT_R) { pprintf(prn, "\n%s\n", _("Residual autocorrelation function")); } else { pputc(prn, '\n'); pprintf(prn, _("Autocorrelation function for %s"), vname); pputc(prn, '\n'); } pputs(prn, _("***, **, * indicate significance at the 1%, 5%, 10% levels\n")); if (opt & OPT_B) { pputs(prn, _("using Bartlett standard errors for ACF")); } else { pprintf(prn, _("using standard error 1/T^%.1f"), 0.5); } pputs(prn, "\n\n"); /* for confidence bands */ for (i=0; i<3; i++) { pm[i] = z[i] / sqrt((double) T); if (pm[i] > 0.5) { pm[i] = 0.5; } } /* generate (and if not in batch mode) plot partial autocorrelation function */ err = pacf_err = get_pacf(A); pputs(prn, _(" LAG ACF PACF Q-stat. [p-value]")); pputs(prn, "\n\n"); lbox = ssr = 0.0; dfQ = 1; for (k=0; k 0 && !na(acf[k-1])) { ssr += acf[k-1] * acf[k-1]; } do_acf_bartlett(PM, k, T, ssr, z); pm0 = gretl_matrix_get(PM, k, 0); pm1 = gretl_matrix_get(PM, k, 1); pm2 = gretl_matrix_get(PM, k, 2); } else { pm0 = pm[0]; pm1 = pm[1]; pm2 = pm[2]; } if (fabs(acf[k]) > pm2) { pputs(prn, " ***"); } else if (fabs(acf[k]) > pm1) { pputs(prn, " ** "); } else if (fabs(acf[k]) > pm0) { pputs(prn, " * "); } else { pputs(prn, " "); } if (na(pacf[k])) { bufspace(13, prn); } else { /* PACF */ pprintf(prn, "%9.4f", pacf[k]); if (fabs(pacf[k]) > pm[2]) { pputs(prn, " ***"); } else if (fabs(pacf[k]) > pm[1]) { pputs(prn, " ** "); } else if (fabs(pacf[k]) > pm[0]) { pputs(prn, " * "); } else { pputs(prn, " "); } } /* Ljung-Box Q */ lbox += (T * (T + 2.0)) * acf[k] * acf[k] / (T - (k + 1)); if (k >= nparam) { /* i.e., if the real df is > 0 */ pprintf(prn, "%12.4f", lbox); pval = chisq_cdf_comp(dfQ++, lbox); pprintf(prn, " [%5.3f]", pval); } pputc(prn, '\n'); } pputc(prn, '\n'); if (lbox > 0 && !na(pval)) { record_test_result(lbox, pval); } if (use_gnuplot) { err = correlogram_plot(vname, acf, (pacf_err)? NULL : pacf, PM, m, pm[1], opt); } bailout: gretl_matrix_free(A); gretl_matrix_free(PM); return err; } /** * acf_matrix: * @x: series to analyse. * @order: maximum lag for autocorrelation function. * @dset: information on the data set, or %NULL. * @n: length of series (required if @dset is %NULL). * @err: location to receive error code. * * Computes the autocorrelation function for series @x with * maximum lag @order. * * Returns: two-column matrix containing the values of the * ACF and PACF at the successive lags, or %NULL on error. */ gretl_matrix *acf_matrix (const double *x, int order, const DATASET *dset, int n, int *err) { gretl_matrix *A = NULL; double xbar; int m, k, t, T; int t1, t2; if (dset != NULL) { t1 = dset->t1; t2 = dset->t2; while (na(x[t1])) t1++; while (na(x[t2])) t2--; T = t2 - t1 + 1; } else { t1 = 0; t2 = n - 1; T = n; } if (T < 4) { *err = E_TOOFEW; return NULL; } for (t=t1; t<=t2; t++) { if (na(x[t])) { *err = E_MISSDATA; return NULL; } } if (gretl_isconst(t1, t2, x)) { gretl_errmsg_set(_("Argument is a constant")); *err = E_DATA; return NULL; } xbar = gretl_mean(t1, t2, x); if (na(xbar)) { *err = E_DATA; return NULL; } m = order; if (dset == NULL) { if (m < 1 || m > T) { *err = E_DATA; return NULL; } } else { if (m == 0) { m = auto_acf_order(T); } else if (m > T - dset->pd) { int mmax = T - 1; if (m > mmax) { m = mmax; } } } A = gretl_matrix_alloc(m, 2); if (A == NULL) { *err = E_ALLOC; return NULL; } /* calculate ACF up to order m */ for (k=0; kval[k] = gretl_acf(k+1, t1, t2, x, xbar); if (na(A->val[k])) { *err = E_DATA; } } /* add PACF */ if (!*err) { *err = get_pacf(A); } if (*err) { gretl_matrix_free(A); A = NULL; } return A; } static int xcorrgm_graph (const char *xname, const char *yname, double *xcf, int m, double *pm, int allpos) { char crit_string[16]; gchar *title; FILE *fp; int k, err = 0; fp = open_plot_input_file(PLOT_XCORRELOGRAM, 0, &err); if (err) { return err; } sprintf(crit_string, "%.2f/T^%.1f", 1.96, 0.5); gretl_push_c_numeric_locale(); fputs("set xzeroaxis\n", fp); fputs("set yzeroaxis\n", fp); print_keypos_string(GP_KEY_RIGHT_TOP, fp); fprintf(fp, "set xlabel '%s'\n", _("lag")); if (allpos) { fputs("set yrange [-0.1:1.1]\n", fp); } else { fputs("set yrange [-1.1:1.1]\n", fp); } title = g_strdup_printf(_("Correlations of %s and lagged %s"), xname, yname); fprintf(fp, "set title '%s'\n", title); g_free(title); fprintf(fp, "set xrange [%d:%d]\n", -(m + 1), m + 1); if (allpos) { fprintf(fp, "plot \\\n" "'-' using 1:2 notitle w impulses lw 5, \\\n" "%g title '%s' lt 2\n", pm[1], crit_string); } else { fprintf(fp, "plot \\\n" "'-' using 1:2 notitle w impulses lw 5, \\\n" "%g title '+- %s' lt 2, \\\n" "%g notitle lt 2\n", pm[1], crit_string, -pm[1]); } for (k=-m; k<=m; k++) { fprintf(fp, "%d %g\n", k, xcf[k+m]); } fputs("e\n", fp); gretl_pop_c_numeric_locale(); return finalize_plot_input_file(fp); } static int xcorrgm_ascii_plot (double *xcf, int xcf_m, PRN *prn) { double *xk = malloc((xcf_m * 2 + 1) * sizeof *xk); int k; if (xk == NULL) { return E_ALLOC; } for (k=-xcf_m; k<=xcf_m; k++) { xk[k+xcf_m] = k; } pprintf(prn, "\n\n%s\n\n", _("Cross-correlogram")); graphyx(xcf, xk, 2 * xcf_m + 1, "", _("lag"), prn); free(xk); return 0; } /* We assume here that all data issues have already been assessed (lag length, missing values etc.) and we just get on with the job. */ static gretl_matrix *real_xcf_vec (const double *x, const double *y, int p, int T, int *err) { gretl_matrix *xcf; double xbar, ybar; int i; xbar = gretl_mean(0, T-1, x); if (na(xbar)) { *err = E_DATA; return NULL; } ybar = gretl_mean(0, T-1, y); if (na(ybar)) { *err = E_DATA; return NULL; } xcf = gretl_column_vector_alloc(p * 2 + 1); if (xcf == NULL) { *err = E_ALLOC; return NULL; } for (i=-p; i<=p; i++) { xcf->val[i+p] = gretl_xcf(i, 0, T - 1, x, y, xbar, ybar); } return xcf; } /* for arrays @x and @y, check that there are no missing values and that neither series has a constant value */ static int xcf_data_check (const double *x, const double *y, int T, int *badvar) { int xconst = 1, yconst = 1; int t; if (T < 5) { return E_TOOFEW; } for (t=0; t 0 && x[t] != x[0]) { xconst = 0; } if (t > 0 && y[t] != y[0]) { yconst = 0; } } if (xconst) { *badvar = 1; return E_DATA; } else if (yconst) { *badvar = 2; return E_DATA; } return 0; } /** * xcf_vec: * @x: first series. * @y: second series. * @p: maximum lag for cross-correlation function. * @dset: information on the data set, or NULL. * @n: length of series (required only if @dset is NULL). * @err: location to receive error code. * * Computes the cross-correlation function for series @x with * series @y up to maximum lag @order. * * Returns: column vector containing the values of the * cross-correlation function, or NULL on error. */ gretl_matrix *xcf_vec (const double *x, const double *y, int p, const DATASET *dset, int n, int *err) { gretl_matrix *xcf = NULL; int t1, t2; int T, badvar = 0; if (p <= 0) { *err = E_DATA; return NULL; } if (dset != NULL) { int yt1, yt2; t1 = yt1 = dset->t1; t2 = yt2 = dset->t2; while (na(x[t1])) t1++; while (na(y[yt1])) yt1++; while (na(x[t2])) t2--; while (na(y[yt2])) yt2--; t1 = (yt1 > t1)? yt1 : t1; t2 = (yt2 < t2)? yt2 : t2; T = t2 - t1 + 1; } else { t1 = 0; t2 = n - 1; T = n; } #if 0 fprintf(stderr, "t1=%d, t2=%d, T=%d\n", t1, t2, T); fprintf(stderr, "x[t1]=%g, y[t1]=%g\n\n", x[t1], y[t1]); #endif if (dset != NULL) { if (2 * p > T - dset->pd) { /* ?? */ *err = E_DATA; } } else if (2 * p > T) { *err = E_DATA; } if (!*err) { *err = xcf_data_check(x + t1, y + t1, T, &badvar); if (badvar) { gretl_errmsg_sprintf(_("Argument %d is a constant"), badvar); } } if (!*err) { xcf = real_xcf_vec(x + t1, y + t1, p, T, err); } return xcf; } /** * xcorrgram: * @list: should contain ID numbers of two variables. * @order: integer order for autocorrelation function. * @dset: dataset struct. * @opt: may include OPT_U for plot options. * @prn: gretl printing struct. * * Computes the cross-correlation function and plots the * cross-correlogram for the specified variables. * * Returns: 0 on successful completion, error code on error. */ int xcorrgram (const int *list, int order, DATASET *dset, gretlopt opt, PRN *prn) { gretl_matrix *xcf = NULL; double pm[3]; const char *xname, *yname; const double *x, *y; int t1 = dset->t1, t2 = dset->t2; int ascii_plot = 0; int use_gnuplot = 0; int k, p, badvar = 0; int T, err = 0; gretl_error_clear(); if (order < 0) { gretl_errmsg_sprintf(_("Invalid lag order %d"), order); return E_DATA; } if (list[0] != 2) { return E_DATA; } x = dset->Z[list[1]]; y = dset->Z[list[2]]; err = list_adjust_sample(list, &t1, &t2, dset, NULL); if (!err) { T = t2 - t1 + 1; err = xcf_data_check(x + t1, y + t1, T, &badvar); } if (err) { if (badvar) { gretl_errmsg_sprintf(_("%s is a constant"), dset->varname[list[badvar]]); } return err; } xname = dset->varname[list[1]]; yname = dset->varname[list[2]]; p = order; if (p == 0) { p = auto_acf_order(T) / 2; } else if (2 * p > T - dset->pd) { p = (T - 1) / 2; /* ?? */ } xcf = real_xcf_vec(x + t1, y + t1, p, T, &err); if (err) { return err; } /* graphing? */ handle_corrgm_plot_options(XCORRGM, opt, &ascii_plot, &use_gnuplot); if (ascii_plot) { xcorrgm_ascii_plot(xcf->val, p, prn); } /* for confidence bands */ pm[0] = 1.65 / sqrt((double) T); pm[1] = 1.96 / sqrt((double) T); pm[2] = 2.58 / sqrt((double) T); pputc(prn, '\n'); pprintf(prn, _("Cross-correlation function for %s and %s"), xname, yname); pputs(prn, "\n\n"); pputs(prn, _(" LAG XCF")); pputs(prn, "\n\n"); for (k=-p; k<=p; k++) { double x = xcf->val[k + p]; pprintf(prn, "%5d%9.4f", k, x); if (fabs(x) > pm[2]) { pputs(prn, " ***"); } else if (fabs(x) > pm[1]) { pputs(prn, " **"); } else if (fabs(x) > pm[0]) { pputs(prn, " *"); } pputc(prn, '\n'); } pputc(prn, '\n'); if (use_gnuplot) { int allpos = 1; for (k=-p; k<=p; k++) { if (xcf->val[k+p] < 0) { allpos = 0; break; } } err = xcorrgm_graph(xname, yname, xcf->val, p, pm, allpos); } gretl_matrix_free(xcf); return err; } struct fractint_test { double d; /* estimated degree of integration */ double se; /* standard error of the above */ }; static int fract_int_GPH (int T, int m, const double *xvec, struct fractint_test *ft) { gretl_matrix *y = NULL; gretl_matrix *X = NULL; gretl_matrix *b = NULL; gretl_matrix *V = NULL; double x, w; int t, err = 0; ft->d = ft->se = NADBL; y = gretl_column_vector_alloc(m); X = gretl_unit_matrix_new(m, 2); b = gretl_column_vector_alloc(2); V = gretl_matrix_alloc(2, 2); if (y == NULL || X == NULL || b == NULL || V == NULL) { err = E_ALLOC; goto bailout; } /* Test from Geweke and Porter-Hudak, as set out in Greene, Econometric Analysis 4e, p. 787 */ for (t=0; tval[t] = log(xvec[t+1]); w = M_2PI * (t + 1) / (double) T; x = sin(w / 2); gretl_matrix_set(X, t, 1, log(4 * x * x)); } err = gretl_matrix_ols(y, X, b, V, NULL, &x); if (!err) { ft->d = -b->val[1]; ft->se = sqrt(gretl_matrix_get(V, 1, 1)); } bailout: gretl_matrix_free(y); gretl_matrix_free(X); gretl_matrix_free(b); gretl_matrix_free(V); return err; } static gretl_matrix *gretl_matrix_pergm (const gretl_matrix *x, int m, int *err) { gretl_matrix *p = NULL; gretl_matrix *f = NULL; f = gretl_matrix_fft(x, 0, err); if (*err) { return NULL; } p = gretl_column_vector_alloc(m); if (p == NULL) { *err = E_ALLOC; } else { int T = gretl_vector_get_length(x); double re, im, scale = M_2PI * T; int i; for (i=0; ival[i] = (re*re + im*im) / scale; } } gretl_matrix_free(f); return p; } struct LWE_helper { gretl_matrix *lambda; gretl_matrix *lpow; gretl_matrix *I1; gretl_matrix *I2; double lcm; }; static void LWE_free (struct LWE_helper *L) { gretl_matrix_free(L->lambda); gretl_matrix_free(L->lpow); gretl_matrix_free(L->I1); gretl_matrix_free(L->I2); } static double LWE_obj_func (struct LWE_helper *L, double d) { double dd = 2.0 * d; int i; gretl_matrix_copy_values(L->lpow, L->lambda); gretl_matrix_raise(L->lpow, dd); for (i=0; iI1->rows; i++) { L->I2->val[i] = L->I1->val[i] * L->lpow->val[i]; } return -(log(gretl_vector_mean(L->I2)) - dd * L->lcm); } static gretl_matrix *LWE_lambda (const gretl_matrix *I1, int n) { gretl_matrix *lambda; int i, m = gretl_vector_get_length(I1); lambda = gretl_column_vector_alloc(m); if (lambda != NULL) { for (i=0; iI2 = L->lpow = NULL; L->I1 = gretl_matrix_pergm(X, m, &err); if (err) { return err; } L->lambda = LWE_lambda(L->I1, X->rows); if (L->lambda == NULL) { gretl_matrix_free(L->I1); return E_ALLOC; } L->lpow = gretl_matrix_copy(L->lambda); L->I2 = gretl_matrix_copy(L->I1); if (L->lpow == NULL || L->I2 == NULL) { err = E_ALLOC; LWE_free(L); } else { L->lcm = 0.0; for (i=0; ilcm += log(L->lambda->val[i]); } L->lcm /= m; } return err; } #define LWE_MAXITER 100 static double LWE_calc (const gretl_matrix *X, int m, int *err) { struct LWE_helper L = {0}; double d = 0, dd = 1.0; double eps = 1.0e-05; double f, incr, incl, deriv, h; int iter = 0; *err = LWE_init(&L, X, m); if (*err) { return NADBL; } while (fabs(dd) > 1.0e-06 && iter++ < LWE_MAXITER) { f = LWE_obj_func(&L, d); incr = LWE_obj_func(&L, d + eps) / eps; incl = LWE_obj_func(&L, d - eps) / eps; deriv = (incr - incl) / 2.0; h = (0.5 * (incr + incl) - f / eps) / eps; dd = (h < 0)? (-deriv / h) : deriv; if (fabs(dd) > 1) { dd = (dd > 0)? 1 : -1; } d += 0.5 * dd; } if (*err) { d = NADBL; } else if (iter == LWE_MAXITER) { fprintf(stderr, "LWE: max iterations reached\n"); d = NADBL; } LWE_free(&L); return d; } int auto_spectrum_order (int T, gretlopt opt) { int m; if (opt & OPT_O) { /* Bartlett */ m = (int) 2.0 * sqrt((double) T); } else { /* fractional integration test */ double m1 = floor((double) T / 2.0); double m2 = floor(pow((double) T, 0.6)); m = (m1 < m2)? m1 : m2; m--; } return m; } static int fract_int_LWE (const double *x, int m, int t1, int t2, struct fractint_test *ft) { gretl_matrix *X; int err = 0; X = gretl_vector_from_series(x, t1, t2); if (X == NULL) { err = E_ALLOC; } else { ft->d = LWE_calc(X, m, &err); if (!err) { ft->se = 1.0 / (2 * sqrt((double) m)); } gretl_matrix_free(X); } return err; } static int pergm_do_plot (gretlopt opt) { if (opt & OPT_U) { /* --plot=whatever */ const char *s = get_optval_string(PERGM, OPT_U); if (s != NULL) { if (!strcmp(s, "none")) { return 0; } else { /* implies use of gnuplot */ return 1; } } } /* the default */ return !gretl_in_batch_mode(); } static void pergm_print (const char *vname, const double *d, int T, int L, gretlopt opt, PRN *prn) { char xstr[32]; double dt, yt; int t; if (vname == NULL) { pprintf(prn, "\n%s\n", _("Residual periodogram")); } else { pprintf(prn, _("\nPeriodogram for %s\n"), vname); } pprintf(prn, _("Number of observations = %d\n"), T); if (opt & OPT_O) { pprintf(prn, _("Using Bartlett lag window, length %d\n\n"), L); } else { pputc(prn, '\n'); } if (opt & OPT_L) { pputs(prn, _(" omega scaled frequency periods log spectral density\n\n")); } else { pputs(prn, _(" omega scaled frequency periods spectral density\n\n")); } for (t=1; t<=T/2; t++) { dt = (opt & OPT_L)? log(d[t]) : d[t]; if (opt & OPT_R) { yt = 2 * M_PI * (double) t / T; pprintf(prn, " %.5f%8d%16.2f", yt, t, (double) T / t); } else if (opt & OPT_D) { yt = 360 * t / (double) T; pprintf(prn, " %7.3f%8d%16.2f", yt, t, (double) T / t); } else { yt = M_2PI * t / (double) T; pprintf(prn, " %.5f%8d%16.2f", yt, t, (double) T / t); } dt = (opt & OPT_L)? log(d[t]) : d[t]; sprintf(xstr, "%#.5g", dt); gretl_fix_exponent(xstr); pprintf(prn, "%16s\n", xstr); } pputc(prn, '\n'); if (pergm_do_plot(opt)) { periodogram_plot(vname, T, L, d, opt); } } static int finalize_fractint (const double *x, const double *dens, int t1, int t2, int width, const char *vname, gretlopt opt, PRN *prn) { struct fractint_test ft; gretl_matrix *result = NULL; int do_GPH = opt & (OPT_G | OPT_A); int do_LWE = !(opt & OPT_G); int do_print = !(opt & OPT_Q); int T = t2 - t1 + 1; int m, err = 0; /* order for test */ if (width <= 0) { m = auto_spectrum_order(T, OPT_NONE); } else { m = (width > T / 2)? T / 2 : width; } if (do_print && vname != NULL) { pprintf(prn, "\n%s, T = %d\n\n", vname, T); } if (do_GPH && do_LWE) { result = gretl_matrix_alloc(2, 2); } else { result = gretl_matrix_alloc(1, 2); } if (do_LWE) { /* do Local Whittle if wanted */ err = fract_int_LWE(x, m, t1, t2, &ft); if (!err) { double z = ft.d / ft.se; double pv = normal_pvalue_2(z); record_test_result(z, pv); gretl_matrix_set(result, 0, 0, ft.d); gretl_matrix_set(result, 0, 1, ft.se); if (do_print) { pprintf(prn, "%s (m = %d)\n" " %s = %g (%g)\n" " %s: z = %g, %s %.4f\n\n", _("Local Whittle Estimator"), m, _("Estimated degree of integration"), ft.d, ft.se, _("test statistic"), z, _("with p-value"), pv); } } } if (!err && do_GPH) { /* do GPH if wanted (options --all or --gph) */ int row = do_LWE ? 1 : 0; err = fract_int_GPH(T, m, dens, &ft); gretl_matrix_set(result, row, 0, ft.d); gretl_matrix_set(result, row, 1, ft.se); if (!err && (!do_LWE || do_print)) { double tval = ft.d / ft.se; int df = m - 2; double pv = student_pvalue_2(df, tval); if (!do_LWE) { record_test_result(tval, pv); } if (do_print) { pprintf(prn, "%s (m = %d)\n" " %s = %g (%g)\n" " %s: t(%d) = %g, %s %.4f\n\n", _("GPH test"), m, _("Estimated degree of integration"), ft.d, ft.se, _("test statistic"), df, tval, _("with p-value"), pv); } } } if (err) { gretl_matrix_free(result); } else { char **colnames = strings_array_new(2); colnames[0] = gretl_strdup("d"); colnames[1] = gretl_strdup("se"); gretl_matrix_set_colnames(result, colnames); set_last_result_data(result, GRETL_TYPE_MATRIX); } return err; } static double *pergm_compute_density (const double *x, int t1, int t2, int L, int bartlett, int *err) { double *sdy, *acov, *dens; double xx, yy, vx, sx, w; int k, t, T = t2 - t1 + 1; sdy = malloc(T * sizeof *sdy); acov = malloc((L + 1) * sizeof *acov); dens = malloc((1 + T/2) * sizeof *dens); if (sdy == NULL || acov == NULL || dens == NULL) { *err = E_ALLOC; free(sdy); free(acov); free(dens); return NULL; } xx = gretl_mean(t1, t2, x); vx = real_gretl_variance(t1, t2, x, 1); sx = sqrt(vx); for (t=t1; t<=t2; t++) { sdy[t-t1] = (x[t] - xx) / sx; } /* autocovariances */ for (k=1; k<=L; k++) { acov[k] = 0.0; for (t=k; t T / 2)? T / 2 : width; } } else { /* use full sample */ L = T - 1; } dens = pergm_compute_density(x, t1, t2, L, bartlett, &err); if (err) { return err; } } if (usage == PERGM_FUNC) { /* make matrix for pergm() function */ int T2 = T / 2; gretl_matrix *pm; *pmat = pm = gretl_matrix_alloc(T2, 2); if (pm == NULL) { err = E_ALLOC; } else { for (t=1; t<=T2; t++) { gretl_matrix_set(pm, t-1, 0, M_2PI * t / (double) T); gretl_matrix_set(pm, t-1, 1, dens[t]); } } } else if (usage == FRACTINT_CMD) { /* supporting "fractint" command */ err = finalize_fractint(x, dens, t1, t2, width, vname, opt, prn); } else { /* supporting "pergm" command */ pergm_print(vname, dens, T, L, opt, prn); } free(dens); return err; } /** * periodogram: * @varno: ID number of variable to process. * @width: width of window. * @dset: dataset struct. * @opt: if includes OPT_O, use Bartlett lag window for periodogram; * OPT_L, use log scale. * @prn: gretl printing struct. * * Computes and displays the periodogram for the series specified * by @varno. * * Returns: 0 on successful completion, error code on error. */ int periodogram (int varno, int width, const DATASET *dset, gretlopt opt, PRN *prn) { return pergm_or_fractint(PERGM_CMD, dset->Z[varno], dset->t1, dset->t2, width, dset->varname[varno], opt, prn, NULL); } /** * residual_periodogram: * @x: series to process. * @width: width of window. * @dset: dataset struct. * @opt: if includes OPT_O, use Bartlett lag window for periodogram; * OPT_L, use log scale. * @prn: gretl printing struct. * * Computes and displays the periodogram for @x, which is presumed to * be a model residual series. * * Returns: 0 on successful completion, error code on error. */ int residual_periodogram (const double *x, int width, const DATASET *dset, gretlopt opt, PRN *prn) { return pergm_or_fractint(PERGM_CMD, x, dset->t1, dset->t2, width, NULL, opt, prn, NULL); } /** * periodogram_matrix: * @x: the series to process. * @t1: starting observation in @x. * @t2: ending observation in @x. * @width: width of Bartlett window, or -1 for plain sample * periodogram. * @err: location to receive error code. * * Implements the userspace gretl pergm function, which can * be used on either a series from the dataset or a gretl * vector. * * Returns: allocated matrix on success, NULL on failure. */ gretl_matrix *periodogram_matrix (const double *x, int t1, int t2, int width, int *err) { gretlopt opt = (width < 0)? OPT_NONE : OPT_O; gretl_matrix *m = NULL; *err = pergm_or_fractint(PERGM_FUNC, x, t1, t2, width, NULL, opt, NULL, &m); return m; } /** * fractint: * @varno: ID number of variable to process. * @order: lag order / window size. * @dset: dataset struct. * @opt: option flags. * @prn: gretl printing struct. * * Computes and prints a test for fractional integration of the * series specified by @varno. By default the test uses the * Local Whittle Estimator but if @opt includes OPT_G then * the Geweke and Porter-Hudak test is done instead, or if * OPT_A then both tests are shown. If OPT_Q is given the * test results are not printed, just recorded (with * preference given to the LWE in case of OPT_A). * * Returns: 0 on successful completion, error code on error. */ int fractint (int varno, int order, const DATASET *dset, gretlopt opt, PRN *prn) { int err = incompatible_options(opt, (OPT_G | OPT_A)); if (!err) { err = pergm_or_fractint(FRACTINT_CMD, dset->Z[varno], dset->t1, dset->t2, order, dset->varname[varno], opt, prn, NULL); } return err; } static void printf15 (double x, int d, PRN *prn) { pputc(prn, ' '); if (na(x)) { pprintf(prn, "%*s", UTF_WIDTH(_("NA"), 14), _("NA")); } else if (x > 999 && x < 100000) { int p = 1 + floor(log10(x)); d -= p; if (d < 0) d = 0; pprintf(prn, "%14.*f", d, x); } else { pprintf(prn, "%#14.*g", d, x); } } static void printf11 (double x, int d, PRN *prn) { pputc(prn, ' '); if (na(x)) { pprintf(prn, "%*s", UTF_WIDTH(_("NA"), 10), _("NA")); } else if (x > 999 && x < 100000) { int p = 1 + floor(log10(x)); d -= p; if (d < 0) d = 0; pprintf(prn, "%10.*f", d, x); } else { pprintf(prn, "%#10.*g", d, x); } } static void output_line (char *str, PRN *prn, int dblspc) { pputs(prn, str); if (dblspc) { pputs(prn, "\n\n"); } else { pputc(prn, '\n'); } } static void prhdr (const char *str, const DATASET *dset, int missing, PRN *prn) { char date1[OBSLEN], date2[OBSLEN]; gchar *tmp; ntolabel(date1, dset->t1, dset); ntolabel(date2, dset->t2, dset); pputc(prn, '\n'); tmp = g_strdup_printf(_("%s, using the observations %s - %s"), str, date1, date2); output_line(tmp, prn, 0); g_free(tmp); if (missing) { output_line(_("(missing values were skipped)"), prn, 1); } } static void summary_print_val (double x, int digits, int places, PRN *prn) { pputc(prn, ' '); if (na(x)) { pprintf(prn, "%*s", UTF_WIDTH(_("NA"), 14), _("NA")); } else if (digits < 0) { pprintf(prn, "%14d", (int) x); } else if (digits > 0 || places > 0) { int prec = (digits > 0)? digits : places; int len = prec + 9; char *s = (digits > 0)? "#" : ""; char t = (digits > 0)? 'g' : 'f'; char fmt[32]; sprintf(fmt, "%%%s%d.%d%c", s, len, prec, t); pprintf(prn, fmt, x); } else { /* the default */ pprintf(prn, "%#14.5g", x); } } #define NSUMM 12 void print_summary_single (const Summary *s, int digits, int places, const DATASET *dset, PRN *prn) { const char *labels[NSUMM] = { N_("Mean"), N_("Median"), N_("Minimum"), N_("Maximum"), N_("Standard deviation"), N_("C.V."), N_("Skewness"), N_("Ex. kurtosis"), /* xgettext:no-c-format */ N_("5% percentile"), /* xgettext:no-c-format */ N_("95% percentile"), N_("Interquartile range"), N_("Missing obs.") }; const char *wstr = N_("Within s.d."); const char *bstr = N_("Between s.d."); double vals[NSUMM]; int simple_skip[NSUMM] = {0,1,0,0,0,1,1,1,1,1,1,0}; int skip0595 = 0; int offset = 2; int slen = 0, i = 0; if (s->opt & OPT_B) { offset = 4; } else { const char *vname = dset->varname[s->list[1]]; char obs1[OBSLEN], obs2[OBSLEN]; gchar *tmp = NULL; ntolabel(obs1, dset->t1, dset); ntolabel(obs2, dset->t2, dset); prhdr(_("Summary statistics"), dset, 0, prn); if (isdigit(*vname)) { const char *mname = dataset_get_matrix_name(dset); if (mname != NULL) { tmp = g_strdup_printf(_("for column %d of %s (%d valid observations)"), atoi(vname), mname, s->n); } else { tmp = g_strdup_printf(_("for column %d (%d valid observations)"), atoi(vname), s->n); } } else { tmp = g_strdup_printf(_("for the variable '%s' (%d valid observations)"), dset->varname[s->list[1]], s->n); } output_line(tmp, prn, 1); g_free(tmp); } vals[0] = s->mean[0]; vals[1] = s->median[0]; vals[2] = s->low[0]; vals[3] = s->high[0]; vals[4] = s->sd[0]; vals[5] = s->cv[0]; vals[6] = s->skew[0]; vals[7] = s->xkurt[0]; vals[8] = s->perc05[0]; vals[9] = s->perc95[0]; vals[10] = s->iqr[0]; vals[11] = s->misscount[0]; if (na(vals[8]) && na(vals[9])) { skip0595 = 1; } for (i=0; iopt & OPT_S) && simple_skip[i]) { continue; } else if (skip0595 && (i == 8 || i == 9)) { continue; } if (strlen(_(labels[i])) > slen) { slen = g_utf8_strlen(_(labels[i]), -1); } } slen++; for (i=0; iopt & OPT_S) && simple_skip[i]) { continue; } else if (skip0595 && (i == 8 || i == 9)) { continue; } bufspace(offset, prn); if (i == 8 || i == 9) { /* the strings contain '%' */ gchar *pcstr = g_strdup(_(labels[i])); int n = slen - g_utf8_strlen(pcstr, -1); pputs(prn, pcstr); if (n > 0) { bufspace(n, prn); } g_free(pcstr); } else { pprintf(prn, "%-*s", UTF_WIDTH(_(labels[i]), slen), _(labels[i])); } if (i == NSUMM - 1) { summary_print_val(vals[i], -1, places, prn); } else { summary_print_val(vals[i], digits, places, prn); } pputc(prn, '\n'); } if (!na(s->sw) && !na(s->sb)) { pputc(prn, '\n'); bufspace(offset, prn); pprintf(prn, "%-*s", UTF_WIDTH(_(wstr), slen), _(wstr)); summary_print_val(s->sw, digits, places, prn); pputc(prn, '\n'); bufspace(offset, prn); pprintf(prn, "%-*s", UTF_WIDTH(_(bstr), slen), _(bstr)); summary_print_val(s->sb, digits, places, prn); } pputc(prn, '\n'); } static void summary_print_varname (const char *src, int len, PRN *prn) { char vname[NAMETRUNC]; maybe_trim_varname(vname, src); pprintf(prn, "%-*s", len, vname); } /** * print_summary: * @summ: pointer to gretl summary statistics struct. * @dset: information on the data set. * @prn: gretl printing struct. * * Prints the summary statistics for a given variable. */ void print_summary (const Summary *summ, const DATASET *dset, PRN *prn) { int dmax, d = get_gretl_digits(); int len, maxlen = 0; int i, vi; if (summ->list == NULL || summ->list[0] == 0) { return; } if (summ->weight_var > 0) { pputc(prn, '\n'); pprintf(prn, _("Weighting variable: %s\n"), dset->varname[summ->weight_var]); } if (summ->list[0] == 1) { print_summary_single(summ, 0, 0, dset, prn); return; } /* number of significant figures to use */ dmax = (summ->opt & OPT_S)? 4 : 5; d = d > dmax ? dmax : d; maxlen = max_namelen_in_list(summ->list, dset); len = maxlen <= 8 ? 10 : (maxlen + 1); #if 0 if (!(summ->opt & OPT_B)) { int missing = summary_has_missing_values(summ); prhdr(_("Summary statistics"), dset, missing, prn); } #endif pputc(prn, '\n'); if (summ->opt & OPT_S) { /* the "simple" option */ const char *h[] = { N_("Mean"), N_("Median"), /* TRANSLATORS: 'S.D.' means Standard Deviation */ N_("S.D."), N_("Min"), N_("Max"), }; pprintf(prn, "%*s%*s%*s%*s%*s%*s\n", len, " ", UTF_WIDTH(_(h[0]), 11), _(h[0]), UTF_WIDTH(_(h[1]), 11), _(h[1]), UTF_WIDTH(_(h[2]), 11), _(h[2]), UTF_WIDTH(_(h[3]), 11), _(h[3]), UTF_WIDTH(_(h[4]), 11), _(h[4])); for (i=0; ilist[0]; i++) { vi = summ->list[i+1]; summary_print_varname(dset->varname[vi], len, prn); printf11(summ->mean[i], d, prn); printf11(summ->median[i], d, prn); printf11(summ->sd[i], d, prn); printf11(summ->low[i], d, prn); printf11(summ->high[i], d, prn); pputc(prn, '\n'); } pputc(prn, '\n'); } else { /* print all available stats */ const char *ha[] = { N_("Mean"), N_("Median"), N_("Minimum"), N_("Maximum"), }; const char *hb[] = { N_("Std. Dev."), N_("C.V."), N_("Skewness"), N_("Ex. kurtosis") }; const char *hc[] = { /* xgettext:no-c-format */ N_("5% perc."), /* xgettext:no-c-format */ N_("95% perc."), N_("IQ range"), N_("Missing obs.") }; /* cases where 0.05 and 0.95 quantiles are OK */ int npct = 0; pprintf(prn, "%*s%*s%*s%*s%*s\n", len, " ", UTF_WIDTH(_(ha[0]), 15), _(ha[0]), UTF_WIDTH(_(ha[1]), 15), _(ha[1]), UTF_WIDTH(_(ha[2]), 15), _(ha[2]), UTF_WIDTH(_(ha[3]), 15), _(ha[3])); for (i=0; ilist[0]; i++) { vi = summ->list[i+1]; summary_print_varname(dset->varname[vi], len, prn); printf15(summ->mean[i], d, prn); printf15(summ->median[i], d, prn); printf15(summ->low[i], d, prn); printf15(summ->high[i], d, prn); pputc(prn, '\n'); /* while we're at it, register cases where we can show the 0.05 and 0.95 quantiles */ if (!na(summ->perc05[i]) && !na(summ->perc95[i])) { npct++; } } pputc(prn, '\n'); pprintf(prn, "%*s%*s%*s%*s%*s\n", len, " ", UTF_WIDTH(_(hb[0]), 15), _(hb[0]), UTF_WIDTH(_(hb[1]), 15), _(hb[1]), UTF_WIDTH(_(hb[2]), 15), _(hb[2]), UTF_WIDTH(_(hb[3]), 15), _(hb[3])); for (i=0; ilist[0]; i++) { double cv; vi = summ->list[i+1]; summary_print_varname(dset->varname[vi], len, prn); if (floateq(summ->mean[i], 0.0)) { cv = NADBL; } else if (floateq(summ->sd[i], 0.0)) { cv = 0.0; } else { cv = fabs(summ->sd[i] / summ->mean[i]); } printf15(summ->sd[i], d, prn); printf15(cv, d, prn); printf15(summ->skew[i], d, prn); printf15(summ->xkurt[i], d, prn); pputc(prn, '\n'); } pputc(prn, '\n'); if (npct > 0) { /* note: use pputs for strings containing literal '%' */ gchar *hc0 = g_strdup(_(hc[0])); gchar *hc1 = g_strdup(_(hc[1])); int n; pprintf(prn, "%*s", len, " "); n = 15 - g_utf8_strlen(hc0, -1); if (n > 0) bufspace(n, prn); pputs(prn, hc0); n = 15 - g_utf8_strlen(hc1, -1); if (n > 0) bufspace(n, prn); pputs(prn, hc1); g_free(hc0); g_free(hc1); pprintf(prn, "%*s%*s\n", UTF_WIDTH(_(ha[2]), 15), _(hc[2]), UTF_WIDTH(_(ha[3]), 15), _(hc[3])); } else { /* not showing any 0.05, 0.95 quantiles */ pprintf(prn, "%*s%*s%*s\n", len, " ", UTF_WIDTH(_(ha[2]), 15), _(hc[2]), UTF_WIDTH(_(ha[3]), 15), _(hc[3])); } for (i=0; ilist[0]; i++) { vi = summ->list[i+1]; summary_print_varname(dset->varname[vi], len, prn); if (!na(summ->perc05[i]) && !na(summ->perc95[i])) { printf15(summ->perc05[i], d, prn); printf15(summ->perc95[i], d, prn); } else if (npct > 0) { pprintf(prn, "%*s", 15, "NA"); pprintf(prn, "%*s", 15, "NA"); } printf15(summ->iqr[i], d, prn); pprintf(prn, "%15d", (int) summ->misscount[i]); pputc(prn, '\n'); } pputc(prn, '\n'); } } /** * free_summary: * @summ: pointer to gretl summary statistics struct * * Frees all resources associated with @summ, and * the pointer itself. */ void free_summary (Summary *summ) { free(summ->list); free(summ->misscount); free(summ->stats); free(summ); } /*** * summary_new: * @list: list of variables we want summary statistics for * @wv: weighting variable (0=no weights) * @opt: options * * Allocates a new "Summary" struct and initializes a few * things inside it */ static Summary *summary_new (const int *list, int wv, gretlopt opt, int *err) { Summary *s; int nv; if (list == NULL) { fprintf(stderr, "summary_new: list is NULL\n"); *err = E_DATA; return NULL; } nv = list[0]; s = malloc(sizeof *s); if (s == NULL) { *err = E_ALLOC; return NULL; } s->list = gretl_list_copy(list); if (s->list == NULL) { *err = E_ALLOC; free(s); return NULL; } s->weight_var = wv; s->opt = opt; s->n = 0; s->misscount = malloc(nv * sizeof *s->misscount); s->stats = malloc(11 * nv * sizeof *s->stats); if (s->stats == NULL) { *err = E_ALLOC; free_summary(s); return NULL; } s->mean = s->stats; s->median = s->mean + nv; s->sd = s->median + nv; s->skew = s->sd + nv; s->xkurt = s->skew + nv; s->low = s->xkurt + nv; s->high = s->low + nv; s->cv = s->high + nv; s->perc05 = s->cv + nv; s->perc95 = s->perc05 + nv; s->iqr = s->perc95 + nv; s->sb = s->sw = NADBL; return s; } int summary_has_missing_values (const Summary *summ) { if (summ->misscount != NULL) { int i, nv = summ->list[0]; for (i=0; imisscount[i] > 0) { return 1; } } } return 0; } static int compare_wgtord_rows (const void *a, const void *b) { const double **da = (const double **) a; const double **db = (const double **) b; return (da[0][0] > db[0][0]) - (da[0][0] < db[0][0]); } static int weighted_order_stats (const double *y, const double *w, double *ostats, int n, int k) { /* on input "ostats" contains the quantiles to compute; on output it holds the results */ double p, q, wsum = 0.0; double **X; int t, i, err = 0; X = doubles_array_new(n, 2); if (X == NULL) { return E_ALLOC; } i = 0; for (t=0; t= n - 1) { ostats[i] = NADBL; continue; } if (X[t-1][0] == X[t][0]) { ostats[i] = X[t][0]; } else { double a = (q - p) / X[t][1]; ostats[i] = a * X[t+1][0] + (1-a) * X[t][0]; } } doubles_array_free(X, n); return err; } /** * get_summary_weighted: * @list: list of variables to process. * @dset: dataset struct. * @wgt: series to use as weights. * @opt: may include OPT_S for "simple" version. * @prn: gretl printing struct. * @err: location to receive error code. * * Calculates descriptive summary statistics for the specified * variables, weighting the observations by @rv. The series @rv must * be of full length (dset->n). * * Returns: #Summary object containing the summary statistics, or NULL * on failure. */ Summary *get_summary_weighted (const int *list, const DATASET *dset, int wtvar, gretlopt opt, PRN *prn, int *err) { const double *wts; double *x, ostats[5]; int t1 = dset->t1; int t2 = dset->t2; Summary *s; int i, t; s = summary_new(list, wtvar, opt, err); if (s == NULL) { return NULL; } x = malloc(dset->n * sizeof *x); if (x == NULL) { *err = E_ALLOC; free_summary(s); return NULL; } wts = dset->Z[wtvar]; for (i=0; ilist[i+1]; int ni = 0; int ntot = 0; double wt; /* prepare the series for weighting: substitute NAs for values at which the weights are invalid or zero */ for (t=t1; t<=t2; t++) { wt = wts[t]; if (!na(wt) && wt != 0.0) { ntot++; x[t] = dset->Z[vi][t]; if (!na(x[t])) { ni++; } } else { x[t] = NADBL; } } s->misscount[i] = ntot - ni; if (ni > s->n) { s->n = ni; } if (ni == 0) { pprintf(prn, _("Dropping %s: sample range contains no valid " "observations\n"), dset->varname[vi]); gretl_list_delete_at_pos(s->list, i + 1); if (s->list[0] == 0) { return s; } else { i--; continue; } } if (opt & OPT_S) { s->skew[i] = NADBL; s->xkurt[i] = NADBL; s->cv[i] = NADBL; s->median[i] = NADBL; } else { pskew = &s->skew[i]; pkurt = &s->xkurt[i]; } gretl_minmax(t1, t2, x, &s->low[i], &s->high[i]); gretl_moments(t1, t2, x, wts, &s->mean[i], &s->sd[i], pskew, pkurt, 1); if (!(opt & OPT_S)) { int err; if (floateq(s->mean[i], 0.0)) { s->cv[i] = NADBL; } else if (floateq(s->sd[i], 0.0)) { s->cv[i] = 0.0; } else { s->cv[i] = fabs(s->sd[i] / s->mean[i]); } ostats[0] = 0.05; ostats[1] = 0.25; ostats[2] = 0.5; ostats[3] = 0.75; ostats[4] = 0.95; err = weighted_order_stats(x, wts, ostats, ni, 5); if (!err) { s->median[i] = ostats[2]; s->perc05[i] = ostats[0]; s->perc95[i] = ostats[4]; if (!na(ostats[1]) && !na(ostats[3])) { s->iqr[i] = ostats[3] - ostats[1]; } } } #if 0 if (dataset_is_panel(dset) && list[0] == 1) { panel_variance_info(x, dset, s->mean[0], &s->sw, &s->sb); } #endif } free(x); return s; } /* Get the additional statistics wanted if the --simple flag was not given to the summary command. */ static int get_extra_stats (Summary *s, int i, int t1, int t2, const double *x) { int err = 0; if (floateq(s->mean[i], 0.0)) { s->cv[i] = NADBL; } else if (floateq(s->sd[i], 0.0)) { s->cv[i] = 0.0; } else { s->cv[i] = fabs(s->sd[i] / s->mean[i]); } s->perc05[i] = gretl_quantile(t1, t2, x, 0.05, OPT_Q, &err); s->perc95[i] = gretl_quantile(t1, t2, x, 0.95, OPT_Q, &err); s->iqr[i] = gretl_quantile(t1, t2, x, 0.75, OPT_NONE, &err); if (!na(s->iqr[i])) { s->iqr[i] -= gretl_quantile(t1, t2, x, 0.25, OPT_NONE, &err); } return err; } /** * get_summary_restricted: * @list: list of variables to process. * @dset: dataset struct. * @rv: series to use as restriction dummy. * @opt: may include OPT_S for "simple" version. * @prn: gretl printing struct. * @err: location to receive error code. * * Calculates descriptive summary statistics for the specified variables, * with the observations restricted to those for which @rv has a non-zero * (and non-missing) value. The series @rv must be of full length (dset->n). * * Returns: #Summary object containing the summary statistics, or NULL * on failure. */ Summary *get_summary_restricted (const int *list, const DATASET *dset, const double *rv, gretlopt opt, PRN *prn, int *err) { int t1 = dset->t1; int t2 = dset->t2; Summary *s; double *x; int i, t; s = summary_new(list, 0, opt, err); if (s == NULL) { return NULL; } x = malloc(dset->n * sizeof *x); if (x == NULL) { *err = E_ALLOC; free_summary(s); return NULL; } for (i=0; ilist[0]; i++) { double *pskew = NULL, *pkurt = NULL; int vi = s->list[i+1]; int strvals; int ni = 0; int ntot = 0; strvals = is_string_valued(dset, vi); if (!strvals) { /* create the restricted series: substitute NAs for values at which the restriction dummy is invalid or zero */ for (t=t1; t<=t2; t++) { if (!na(rv[t]) && rv[t] != 0.0) { ntot++; x[t] = dset->Z[vi][t]; if (!na(x[t])) { ni++; } } else { x[t] = NADBL; } } s->misscount[i] = ntot - ni; if (ni > s->n) { s->n = ni; } } if (ni == 0) { if (strvals) { pprintf(prn, _("Dropping %s: string-valued series\n"), dset->varname[vi]); } else { pprintf(prn, _("Dropping %s: sample range contains no valid " "observations\n"), dset->varname[vi]); } gretl_list_delete_at_pos(s->list, i + 1); if (s->list[0] == 0) { return s; } else { i--; continue; } } if (opt & OPT_S) { s->skew[i] = NADBL; s->xkurt[i] = NADBL; s->cv[i] = NADBL; } else { pskew = &s->skew[i]; pkurt = &s->xkurt[i]; } gretl_minmax(t1, t2, x, &s->low[i], &s->high[i]); gretl_moments(t1, t2, x, NULL, &s->mean[i], &s->sd[i], pskew, pkurt, 1); s->median[i] = gretl_median(t1, t2, x); if (!(opt & OPT_S)) { *err = get_extra_stats(s, i, t1, t2, x); } if (dataset_is_panel(dset) && list[0] == 1) { panel_variance_info(x, dset, s->mean[0], &s->sw, &s->sb); } } free(x); return s; } /** * get_summary: * @list: list of variables to process. * @dset: dataset struct. * @opt: may include OPT_S for "simple" version. * @prn: gretl printing struct. * @err: location to receive error code. * * Calculates descriptive summary statistics for the specified variables. * * Returns: #Summary object containing the summary statistics, or NULL * on failure. */ Summary *get_summary (const int *list, const DATASET *dset, gretlopt opt, PRN *prn, int *err) { int t1 = dset->t1; int t2 = dset->t2; Summary *s; int i, nmax; s = summary_new(list, 0, opt, err); if (s == NULL) { return NULL; } nmax = sample_size(dset); for (i=0; ilist[0]; i++) { double *pskew = NULL, *pkurt = NULL; const double *x; double x0; int vi = s->list[i+1]; int strvals; int ni = 0; strvals = is_string_valued(dset, vi); if (!strvals) { x = dset->Z[vi]; ni = good_obs(x + t1, nmax, &x0); s->misscount[i] = nmax - ni; if (ni > s->n) { s->n = ni; } } if (ni == 0) { if (strvals) { pprintf(prn, _("Dropping %s: string-valued series\n"), dset->varname[vi]); } else { pprintf(prn, _("Dropping %s: sample range contains no valid " "observations\n"), dset->varname[vi]); } gretl_list_delete_at_pos(s->list, i + 1); if (s->list[0] == 0) { return s; } else { i--; continue; } } if (opt & OPT_S) { s->skew[i] = NADBL; s->xkurt[i] = NADBL; s->cv[i] = NADBL; s->median[i] = NADBL; } else { pskew = &s->skew[i]; pkurt = &s->xkurt[i]; } gretl_minmax(t1, t2, x, &s->low[i], &s->high[i]); if (s->weight_var == 0) { gretl_moments(t1, t2, x, NULL, &s->mean[i], &s->sd[i], pskew, pkurt, 1); } else { gretl_moments(t1, t2, x, dset->Z[s->weight_var], &s->mean[i], &s->sd[i], pskew, pkurt, 0); } /* included in both simple and full variants */ s->median[i] = gretl_median(t1, t2, x); if (!(opt & OPT_S)) { *err = get_extra_stats(s, i, t1, t2, x); } if (dataset_is_panel(dset) && list[0] == 1) { panel_variance_info(x, dset, s->mean[0], &s->sw, &s->sb); } } return s; } static void record_summary (Summary *summ, const DATASET *dset) { gretl_matrix *m = NULL; char **Sc, **Sr; int nv = summ->list[0]; int i, vi, cols; cols = (summ->opt & OPT_S)? 5 : 12; m = gretl_matrix_alloc(nv, cols); if (m == NULL) { return; } Sc = strings_array_new(cols); Sr = strings_array_new(nv); if (summ->opt & OPT_S) { /* the "simple" option */ const char *h[] = { N_("Mean"), N_("Median"), N_("S.D."), N_("Min"), N_("Max"), }; if (Sc != NULL) { for (i=0; ilist[i+1]; Sr[i] = gretl_strdup(dset->varname[vi]); } gretl_matrix_set(m, i, 0, summ->mean[i]); gretl_matrix_set(m, i, 1, summ->median[i]); gretl_matrix_set(m, i, 2, summ->sd[i]); gretl_matrix_set(m, i, 3, summ->low[i]); gretl_matrix_set(m, i, 4, summ->high[i]); } } else { /* record all available stats */ const char *h[] = { N_("Mean"), N_("Median"), N_("Minimum"), N_("Maximum"), N_("Std. Dev."), N_("C.V."), N_("Skewness"), N_("Ex. kurtosis"), /* xgettext:no-c-format */ N_("5% perc."), /* xgettext:no-c-format */ N_("95% perc."), N_("IQ range"), N_("Missing obs.") }; double cv; if (Sc != NULL) { for (i=0; ilist[i+1]; Sr[i] = gretl_strdup(dset->varname[vi]); } gretl_matrix_set(m, i, 0, summ->mean[i]); gretl_matrix_set(m, i, 1, summ->median[i]); gretl_matrix_set(m, i, 2, summ->low[i]); gretl_matrix_set(m, i, 3, summ->high[i]); gretl_matrix_set(m, i, 4, summ->sd[i]); if (floateq(summ->mean[i], 0.0)) { cv = NADBL; } else if (floateq(summ->sd[i], 0.0)) { cv = 0.0; } else { cv = fabs(summ->sd[i] / summ->mean[i]); } gretl_matrix_set(m, i, 5, cv); gretl_matrix_set(m, i, 6, summ->skew[i]); gretl_matrix_set(m, i, 7, summ->xkurt[i]); gretl_matrix_set(m, i, 8, summ->perc05[i]); gretl_matrix_set(m, i, 9, summ->perc95[i]); gretl_matrix_set(m, i, 10, summ->iqr[i]); gretl_matrix_set(m, i, 11, summ->misscount[i]); } } gretl_matrix_set_colnames(m, Sc); gretl_matrix_set_rownames(m, Sr); set_last_result_data(m, GRETL_TYPE_MATRIX); } /** * list_summary: * @list: list of series to process. * @dset: dataset struct. * @opt: may include %OPT_S for "simple" version. * @prn: gretl printing struct. * * Prints descriptive statistics for the listed variables. * * Returns: 0 on success, non-zero code on error. */ int list_summary (const int *list, int wgtvar, const DATASET *dset, gretlopt opt, PRN *prn) { Summary *summ = NULL; int err = 0; if (list != NULL) { if (list[0] == 0) { return 0; } if (wgtvar == 0) { /* no weights */ summ = get_summary(list, dset, opt, prn, &err); } else { summ = get_summary_weighted(list, dset, wgtvar, opt, prn, &err); } } else { int *tmplist = full_var_list(dset, NULL); if (tmplist != NULL) { if (wgtvar == 0) { /* no weights */ summ = get_summary(tmplist, dset, opt, prn, &err); } else { summ = get_summary_weighted(tmplist, dset, wgtvar, opt, prn, &err); } free(tmplist); } } if (summ != NULL) { if (!(opt & OPT_Q)) { print_summary(summ, dset, prn); } record_summary(summ, dset); free_summary(summ); } return err; } /** * vmatrix_new: * * Returns: an allocated and initialized #VMatrix, or * %NULL on failure. */ VMatrix *vmatrix_new (void) { VMatrix *v = malloc(sizeof *v); if (v != NULL) { v->vec = NULL; v->xbar = NULL; v->ssx = NULL; v->list = NULL; v->names = NULL; v->ci = 0; v->dim = 0; v->t1 = 0; v->t2 = 0; v->n = 0; v->missing = 0; } return v; } /** * free_vmatrix: * @vmat: pointer to gretl correlation matrix struct * * Frees all resources associated with @vmat, and * the pointer itself. */ void free_vmatrix (VMatrix *vmat) { if (vmat != NULL) { if (vmat->names != NULL) { strings_array_free(vmat->names, vmat->dim); } if (vmat->vec != NULL) { free(vmat->vec); } if (vmat->xbar != NULL) { free(vmat->xbar); } if (vmat->ssx != NULL) { free(vmat->ssx); } if (vmat->list != NULL) { free(vmat->list); } free(vmat); } } enum { CORRMAT = 0, COVMAT }; /* Compute correlation or covariance matrix, using the maximum available sample for each coefficient. */ static int max_corrcov_matrix (VMatrix *v, const DATASET *dset, int flag) { double **Z = dset->Z; int i, j, vi, vj, nij; int nmin, nmax = 0; int m = v->dim; int nmiss; nmin = v->n = v->t2 - v->t1 + 1; for (i=0; ilist[i+1]; for (j=i; jlist[j+1]; nij = ijton(i, j, m); if (i == j && flag == CORRMAT) { v->vec[nij] = 1.0; } else { nmiss = 0; if (flag == COVMAT) { v->vec[nij] = gretl_covar(v->t1, v->t2, Z[vi], Z[vj], &nmiss); } else { v->vec[nij] = gretl_corr(v->t1, v->t2, Z[vi], Z[vj], &nmiss); } if (nmiss > 0) { int n = v->n - nmiss; if (n < nmin && n > 0) { nmin = n; } if (n > nmax) { nmax = n; } v->missing = 1; } else { nmax = v->n; } } } } /* We'll record an "n" value if there's something resembling a common number of observations across the coefficients. */ if (v->missing) { v->n = 0; if (nmax > 0) { double d = (nmax - nmin) / (double) nmax; if (d < 0.10) { v->n = nmin; } } } return 0; } /* Compute correlation or covariance matrix, ensuring we use the same sample for all coefficients. We may be doing this in the context of the "corr" command or in the context of "pca". In the latter case we want to save the "xbar" and "ssx" values for later use when calculating the component series. */ static int uniform_corrcov_matrix (VMatrix *v, const DATASET *dset, int ci, int flag) { double **Z = dset->Z; double *xbar = NULL, *ssx = NULL; double *x; int m = v->dim; int mm = (m * (m + 1)) / 2; double d1, d2; int i, j, jmin, nij, t; int miss, n = 0; xbar = malloc(m * sizeof *xbar); if (xbar == NULL) { return E_ALLOC; } if (ci == PCA || flag == CORRMAT) { ssx = malloc(m * sizeof *ssx); if (ssx == NULL) { free(xbar); return E_ALLOC; } } for (i=0; imissing = 0; /* first pass: get sample size and sums */ for (t=v->t1; t<=v->t2; t++) { miss = 0; for (i=0; ilist[i+1]][t])) { miss = 1; v->missing += 1; break; } } if (!miss) { n++; for (i=0; ilist[i+1]][t]; } } } if (n < 2) { free(xbar); free(ssx); return E_MISSDATA; } for (i=0; ivec[i] = 0.0; } /* second pass: get deviations from means and cumulate */ for (t=v->t1; t<=v->t2; t++) { miss = 0; for (i=0; ilist[i+1]]; if (na(x[t])) { miss = 1; break; } } if (!miss) { for (i=0; ilist[i+1]]; d1 = x[t] - xbar[i]; if (ssx != NULL) { ssx[i] += d1 * d1; } jmin = (flag == COVMAT)? i : i + 1; for (j=jmin; jlist[j+1]]; nij = ijton(i, j, m); d2 = x[t] - xbar[j]; v->vec[nij] += d1 * d2; } } } } /* finalize: compute correlations or covariances */ if (flag == CORRMAT) { /* correlations */ for (i=0; ivec[nij] = 1.0; } else if (ssx[i] == 0.0 || ssx[j] == 0.0) { v->vec[nij] = NADBL; } else { v->vec[nij] /= sqrt(ssx[i] * ssx[j]); } } } } else { /* covariances */ for (i=0; ivec[i] /= (n - 1); } } v->n = n; if (ci == PCA) { v->xbar = xbar; v->ssx = ssx; } else { free(xbar); free(ssx); } return 0; } /** * corrlist: * @ci: command index. * @list: list of variables to process, by ID number. * @dset: dataset struct. * @opt: option flags. * @err: location to receive error code. * * Computes pairwise correlation coefficients for the variables * specified in @list, skipping any constants. If the option * flags contain OPT_N, a uniform sample is ensured: only those * observations for which all the listed variables have valid * values are used. If OPT_C is included, we actually calculate * covariances rather than correlations. * * Returns: gretl correlation matrix struct, or NULL on failure. */ VMatrix *corrlist (int ci, int *list, const DATASET *dset, gretlopt opt, int *err) { int flag = (opt & OPT_C)? COVMAT : CORRMAT; VMatrix *v; int i, m, mm; int nmiss = 0; if (list == NULL) { fprintf(stderr, "corrlist: list is NULL\n"); *err = E_DATA; return NULL; } v = vmatrix_new(); if (v == NULL) { *err = E_ALLOC; return NULL; } /* drop any constants from list */ for (i=list[0]; i>0; i--) { if (gretl_isconst(dset->t1, dset->t2, dset->Z[list[i]])) { gretl_list_delete_at_pos(list, i); } } if (list[0] < 2) { gretl_errmsg_set(_("corr: needs at least two non-constant arguments")); *err = E_DATA; goto bailout; } v->t1 = dset->t1; v->t2 = dset->t2; /* adjust sample range if need be */ list_adjust_sample(list, &v->t1, &v->t2, dset, &nmiss); if (v->t2 - v->t1 + 1 < 3) { *err = E_TOOFEW; goto bailout; } v->dim = m = list[0]; mm = (m * (m + 1)) / 2; v->names = strings_array_new(m); v->vec = malloc(mm * sizeof *v->vec); v->list = gretl_list_copy(list); if (v->names == NULL || v->vec == NULL || v->list == NULL) { *err = E_ALLOC; goto bailout; } if (ci == PCA) { opt |= OPT_N; } if (opt & OPT_N) { /* impose uniform sample size */ *err = uniform_corrcov_matrix(v, dset, ci, flag); } else { /* sample sizes may differ */ *err = max_corrcov_matrix(v, dset, flag); } if (!*err) { for (i=0; inames[i] = gretl_strdup(dset->varname[list[i+1]]); if (v->names[i] == NULL) { *err = E_ALLOC; goto bailout; } } } v->ci = (flag == CORRMAT)? CORR : 0; /* FIXME? */ bailout: if (*err) { free_vmatrix(v); v = NULL; } return v; } static void printcorr (const VMatrix *v, PRN *prn) { double r = v->vec[1]; pprintf(prn, "\ncorr(%s, %s)", v->names[0], v->names[1]); if (na(r)) { pprintf(prn, ": %s\n\n", _("undefined")); } else { pprintf(prn, " = %.8f\n", r); if (fabs(r) < 1.0) { int n2 = v->n - 2; double tval = r * sqrt(n2 / (1 - r*r)); pputs(prn, _("Under the null hypothesis of no correlation:\n ")); pprintf(prn, _("t(%d) = %g, with two-tailed p-value %.4f\n"), n2, tval, student_pvalue_2(n2, tval)); pputc(prn, '\n'); } else { pprintf(prn, _("5%% critical value (two-tailed) = " "%.4f for n = %d"), rhocrit95(v->n), v->n); pputs(prn, "\n\n"); } } } /** * print_corrmat: * @corr: gretl correlation matrix. * @dset: dataset information. * @prn: gretl printing struct. * * Prints a gretl correlation matrix to @prn. */ void print_corrmat (VMatrix *corr, const DATASET *dset, PRN *prn) { if (corr->dim == 2) { printcorr(corr, prn); } else { char date1[OBSLEN], date2[OBSLEN]; gchar *tmp = NULL; ntolabel(date1, corr->t1, dset); ntolabel(date2, corr->t2, dset); pputc(prn, '\n'); tmp = g_strdup_printf(_("%s, using the observations %s - %s"), _("Correlation Coefficients"), date1, date2); output_line(tmp, prn, 0); g_free(tmp); if (corr->missing) { output_line(_("(missing values were skipped)"), prn, 1); } if (corr->n > 0) { tmp = g_strdup_printf(_("5%% critical value (two-tailed) = " "%.4f for n = %d"), rhocrit95(corr->n), corr->n); output_line(tmp, prn, 1); g_free(tmp); } text_print_vmatrix(corr, prn); } } static void record_corr_matrix (VMatrix *c) { gretl_matrix *m = NULL; int i, j, k, n = c->dim; double cij; m = gretl_matrix_alloc(n, n); if (m != NULL) { k = 0; for (i=0; ivec[k]; gretl_matrix_set(m, i, j, cij); gretl_matrix_set(m, j, i, cij); k++; } } if (c->names) { char **rlab = strings_array_new(n); char **clab = strings_array_new(n); if (rlab != NULL && clab != NULL) { for (i=0; inames[i]); clab[i] = gretl_strdup(c->names[i]); } gretl_matrix_set_rownames(m, rlab); gretl_matrix_set_colnames(m, clab); } } set_last_result_data(m, GRETL_TYPE_MATRIX); } } /** * gretl_corrmx: * @list: gives the ID numbers of the variables to process. * @dset: dataset struct. * @opt: option flags: OPT_N means use uniform sample size, * OPT_U controls plotting options. * @prn: gretl printing struct. * * Computes and prints the correlation matrix for the specified list * of variables. * * Returns: 0 on successful completion, 1 on error. */ int gretl_corrmx (int *list, const DATASET *dset, gretlopt opt, PRN *prn) { VMatrix *corr = NULL; int err = 0; if (list != NULL) { if (list[0] == 0) { return 0; } else { corr = corrlist(CORR, list, dset, opt, &err); } } else { int *tmplist = full_var_list(dset, NULL); if (tmplist != NULL) { corr = corrlist(CORR, tmplist, dset, opt, &err); free(tmplist); } } if (corr != NULL) { if (!(opt & OPT_Q)) { print_corrmat(corr, dset, prn); } if (corr->dim > 2 && gnuplot_graph_wanted(PLOT_HEATMAP, opt)) { err = plot_corrmat(corr, opt); } record_corr_matrix(corr); free_vmatrix(corr); } return err; } /* Satterthwaite, F.E. (1946). "An Approximate Distribution of Estimates of Variance Components". Biometrics Bulletin, 2, 6, pp. 110–114. v = (s^2_1/n_1 + s^2_2/n_2)^2 / [(s^_1/n1)^2 / (n_1 - 1) + (s^2_2/n_2)^2 / (n_2 - 1)] */ int satterthwaite_df (double v1, int n1, double v2, int n2) { double rtnum = v1 / n1 + v2 / n2; double d1 = (v1/n1) * (v1/n1) / (n1 - 1); double d2 = (v2/n2) * (v2/n2) / (n2 - 1); double v = rtnum * rtnum / (d1 + d2); return (int) floor(v); } /** * means_test: * @list: gives the ID numbers of the variables to compare. * @dset: dataset struct. * @opt: if OPT_O, assume population variances are different. * @prn: gretl printing struct. * * Carries out test of the null hypothesis that the means of two * variables are equal. * * Returns: 0 on successful completion, error code on error. */ int means_test (const int *list, const DATASET *dset, gretlopt opt, PRN *prn) { double m1, m2, s1, s2, skew, kurt, se, mdiff, t, pval; double v1, v2; double *x = NULL, *y = NULL; int vardiff = (opt & OPT_O); int df, n1, n2, n = dset->n; if (list[0] < 2) { return E_ARGS; } x = malloc(n * sizeof *x); if (x == NULL) { return E_ALLOC; } y = malloc(n * sizeof *y); if (y == NULL) { free(x); return E_ALLOC; } n1 = transcribe_array(x, dset->Z[list[1]], dset); n2 = transcribe_array(y, dset->Z[list[2]], dset); if (n1 == 0 || n2 == 0) { pputs(prn, _("Sample range has no valid observations.")); free(x); free(y); return 1; } if (n1 == 1 || n2 == 1) { pputs(prn, _("Sample range has only one observation.")); free(x); free(y); return 1; } df = n1 + n2 - 2; gretl_moments(0, n1-1, x, NULL, &m1, &s1, &skew, &kurt, 1); gretl_moments(0, n2-1, y, NULL, &m2, &s2, &skew, &kurt, 1); mdiff = m1 - m2; v1 = s1 * s1; v2 = s2 * s2; if (vardiff) { /* do not assume the variances are equal */ se = sqrt((v1 / n1) + (v2 / n2)); df = satterthwaite_df(v1, n1, v2, n2); } else { /* form pooled estimate of variance */ double s2p; s2p = ((n1-1) * v1 + (n2-1) * v2) / df; se = sqrt(s2p / n1 + s2p / n2); df = n1 + n2 - 2; } t = mdiff / se; pval = student_pvalue_2(df, t); pprintf(prn, _("\nEquality of means test " "(assuming %s variances)\n\n"), (vardiff)? _("unequal") : _("equal")); pprintf(prn, " %s: ", dset->varname[list[1]]); pprintf(prn, _("Number of observations = %d\n"), n1); pprintf(prn, " %s: ", dset->varname[list[2]]); pprintf(prn, _("Number of observations = %d\n"), n2); pprintf(prn, _(" Difference between sample means = %g - %g = %g\n"), m1, m2, mdiff); pputs(prn, _(" Null hypothesis: The two population means are the same.\n")); pprintf(prn, _(" Estimated standard error = %g\n"), se); pprintf(prn, _(" Test statistic: t(%d) = %g\n"), df, t); pprintf(prn, _(" p-value (two-tailed) = %g\n\n"), pval); if (pval > .10) pputs(prn, _(" The difference is not statistically significant.\n\n")); record_test_result(t, pval); free(x); free(y); return 0; } /** * vars_test: * @list: gives the ID numbers of the variables to compare. * @dset: dataset struct. * @prn: gretl printing struct. * * Carries out test of the null hypothesis that the variances of two * variables are equal. * * Returns: 0 on successful completion, error code on error. */ int vars_test (const int *list, const DATASET *dset, PRN *prn) { double m, s1, s2, var1, var2; double F, pval; double *x = NULL, *y = NULL; int dfn, dfd, n1, n2, n = dset->n; if (list[0] < 2) return E_ARGS; x = malloc(n * sizeof *x); y = malloc(n * sizeof *y); if (x == NULL || y == NULL) { free(x); free(y); return E_ALLOC; } n1 = transcribe_array(x, dset->Z[list[1]], dset); n2 = transcribe_array(y, dset->Z[list[2]], dset); if (n1 == 0 || n2 == 0) { pputs(prn, _("Sample range has no valid observations.")); free(x); free(y); return 1; } if (n1 == 1 || n2 == 1) { pputs(prn, _("Sample range has only one observation.")); free(x); free(y); return 1; } gretl_moments(0, n1-1, x, NULL, &m, &s1, NULL, NULL, 1); gretl_moments(0, n2-1, y, NULL, &m, &s2, NULL, NULL, 1); var1 = s1*s1; var2 = s2*s2; if (var1 > var2) { F = var1 / var2; dfn = n1 - 1; dfd = n2 - 1; } else { F = var2 / var1; dfn = n2 - 1; dfd = n1 - 1; } pval = snedecor_cdf_comp(dfn, dfd, F); pputs(prn, _("\nEquality of variances test\n\n")); pprintf(prn, " %s: ", dset->varname[list[1]]); pprintf(prn, _("Number of observations = %d\n"), n1); pprintf(prn, " %s: ", dset->varname[list[2]]); pprintf(prn, _("Number of observations = %d\n"), n2); pprintf(prn, _(" Ratio of sample variances = %g\n"), F); pprintf(prn, " %s: %s\n", _("Null hypothesis"), _("The two population variances are equal")); pprintf(prn, " %s: F(%d,%d) = %g\n", _("Test statistic"), dfn, dfd, F); pprintf(prn, _(" p-value (two-tailed) = %g\n\n"), pval); if (snedecor_cdf_comp(dfn, dfd, F) > .10) pputs(prn, _(" The difference is not statistically significant.\n\n")); record_test_result(F, pval); free(x); free(y); return 0; } struct MahalDist_ { int *list; int n; double *d; }; const double *mahal_dist_get_distances (const MahalDist *md) { return md->d; } int mahal_dist_get_n (const MahalDist *md) { return md->n; } const int *mahal_dist_get_varlist(const MahalDist *md) { return md->list; } void free_mahal_dist (MahalDist *md) { free(md->list); free(md->d); free(md); } static MahalDist *mahal_dist_new (const int *list, int n) { MahalDist *md = malloc(sizeof *md); if (md != NULL) { md->d = malloc(n * sizeof *md->d); if (md->d == NULL) { free(md); md = NULL; } else { md->list = gretl_list_copy(list); if (md->list == NULL) { free(md->d); free(md); md = NULL; } else { md->n = n; } } } if (md != NULL) { int t; for (t=0; td[t] = NADBL; } } return md; } static int mdist_saver (DATASET *dset) { int t, v, err; err = dataset_add_series(dset, 1); if (err) { v = 0; } else { v = dset->v - 1; for (t=0; tn; t++) { dset->Z[v][t] = NADBL; } strcpy(dset->varname[v], "mdist"); make_varname_unique(dset->varname[v], v, dset); series_set_label(dset, v, _("Mahalanobis distances")); } return v; } static int real_mahalanobis_distance (const int *list, DATASET *dset, gretlopt opt, MahalDist *md, PRN *prn) { gretl_matrix *S = NULL; gretl_vector *means = NULL; gretl_vector *xdiff; int orig_t1 = dset->t1; int orig_t2 = dset->t2; int n, err = 0; list_adjust_sample(list, &dset->t1, &dset->t2, dset, NULL); n = sample_size(dset); if (n < 2) { err = E_DATA; goto bailout; } xdiff = gretl_column_vector_alloc(list[0]); if (xdiff == NULL) { err = E_ALLOC; goto bailout; } S = gretl_covariance_matrix_from_varlist(list, dset, &means, &err); if (!err) { if (opt & OPT_V) { gretl_matrix_print_to_prn(S, _("Covariance matrix"), prn); } err = gretl_invert_symmetric_matrix(S); if (err) { fprintf(stderr, "error inverting covariance matrix\n"); } else if (opt & OPT_V) { gretl_matrix_print_to_prn(S, _("Inverse of covariance matrix"), prn); } } if (!err) { int k = gretl_vector_get_length(means); int miss, obslen = 0, savevar = 0; double m, x, xbar; int i, t, vi; if (opt & OPT_S) { /* save the results to a data series */ savevar = mdist_saver(dset); } if (prn != NULL) { pprintf(prn, "%s\n", _("Mahalanobis distances from the centroid")); pprintf(prn, "%s\n", _("using the variables:")); for (i=1; i<=list[0]; i++) { pprintf(prn, " %s\n", dset->varname[list[i]]); } pputc(prn, '\n'); obslen = max_obs_marker_length(dset); if (obslen < 8) { obslen = 8; } } for (t=dset->t1; t<=dset->t2; t++) { miss = 0; /* write vector of deviations from centroid for observation t */ for (i=0; iZ[vi][t]; miss += na(x); if (!miss) { gretl_vector_set(xdiff, i, x - xbar); } } if (miss) { m = NADBL; } else { m = gretl_scalar_qform(xdiff, S, &err); if (!err) { m = sqrt(m); } } if (prn != NULL) { print_obs_marker(t, dset, obslen, prn); if (err || miss) { pprintf(prn, "NA\n"); } else { pprintf(prn, "%9.6f\n", m); } } if (savevar > 0) { dset->Z[savevar][t] = m; } else if (md != NULL) { md->d[t] = m; } } if (savevar > 0 && prn != NULL) { pputc(prn, '\n'); pprintf(prn, _("Distances saved as '%s'"), dset->varname[savevar]); pputc(prn, '\n'); } if (prn != NULL) { pputc(prn, '\n'); } } gretl_matrix_free(xdiff); gretl_matrix_free(means); gretl_matrix_free(S); bailout: dset->t1 = orig_t1; dset->t2 = orig_t2; return err; } int mahalanobis_distance (const int *list, DATASET *dset, gretlopt opt, PRN *prn) { return real_mahalanobis_distance(list, dset, opt, NULL, (opt & OPT_Q) ? NULL : prn); } MahalDist *get_mahal_distances (const int *list, DATASET *dset, gretlopt opt, PRN *prn, int *err) { MahalDist *md = mahal_dist_new(list, dset->n); if (md == NULL) { *err = E_ALLOC; } else { *err = real_mahalanobis_distance(list, dset, opt, md, prn); if (*err) { free_mahal_dist(md); md = NULL; } } return md; } /* G = [(2 * \sum_i^n (i * x_i)) / (n * \sum_i^n x_i )] - [(n + 1)/n] where x is sorted: x_i <= x_{i+1} */ static double gini_coeff (const double *x, int t1, int t2, double **plz, int *pn, int *err) { int m = t2 - t1 + 1; double *sx = NULL; double csx = 0.0, sumx = 0.0, sisx = 0.0; double G; int t, n = 0; gretl_error_clear(); sx = malloc(m * sizeof *sx); if (sx == NULL) { *err = E_ALLOC; return NADBL; } n = 0; for (t=t1; t<=t2; t++) { if (na(x[t])) { continue; } else if (x[t] < 0.0) { gretl_errmsg_set(_("illegal negative value")); *err = E_DATA; break; } else { sx[n++] = x[t]; sumx += x[t]; } } if (!*err && (n == 0 || sumx == 0.0)) { *err = E_DATA; } if (*err) { free(sx); return NADBL; } qsort(sx, n, sizeof *sx, gretl_compare_doubles); #if 0 if (1) { /* just testing alternative calculation for equivalence */ double num, S[2]; int s, fyi; num = S[0] = S[1] = 0.0; for (t=0; t 4000) { downsample = (int) (n / 1000.0); } for (t=0; tZ[varno], dset->t1, dset->t2, &lz, &n, &err); if (err) { return err; } fulln = dset->t2 - dset->t1 - 1; pprintf(prn, "%s\n", dset->varname[varno], n); pprintf(prn, _("Number of observations = %d\n"), n); if (n < fulln) { pputs(prn, _("Warning: there were missing values\n")); } pputc(prn, '\n'); pprintf(prn, "%s = %g\n", _("Sample Gini coefficient"), gini); pprintf(prn, "%s = %g\n", _("Estimate of population value"), gini * (double) n / (n - 1)); err = lorenz_graph(dset->varname[varno], lz, n); free(lz); return err; } /* Following: apparatus for Shapiro-Wilk normality test. Thanks to Marcin Blazejowski for the contribution. */ #ifndef min # define min(a, b) ((a) > (b) ? (b) : (a)) #endif static int sign (int a) { return (a == 0)? 0 : ((a < 0)? -1 : 1); } static int compare_floats (const void *a, const void *b) { const float *fa = (const float *) a; const float *fb = (const float *) b; return (*fa > *fb) - (*fa < *fb); } /* Algorithm AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2 Calculates the algebraic polynomial of order n-1 with array of coefficients cc. Zero-order coefficient is cc(1) = cc[0] */ static double poly (const float *cc, int n, float x) { double p, ret = cc[0]; /* preserve precision! */ int j; if (n > 1) { p = x * cc[n-1]; for (j=n-2; j>0; j--) { p = (p + cc[j]) * x; } ret += p; } return ret; } /* Calculate coefficients for the Shapiro-Wilk test */ static void sw_coeff (int n, int n2, float *a) { const float c1[6] = { 0.f,.221157f,-.147981f,-2.07119f, 4.434685f, -2.706056f }; const float c2[6] = { 0.f,.042981f,-.293762f,-1.752461f,5.682633f, -3.582633f }; float summ2, ssumm2; float a0, a1, an = (float) n; float fac, an25, rsn; int i, i1, nn2 = n / 2; if (n == 3) { a[0] = sqrt(0.5); } else { an25 = an + .25; summ2 = 0.0; for (i=0; i 5) { i1 = 2; a1 = -a[1] / ssumm2 + poly(c2, 6, rsn); fac = sqrt((summ2 - 2.0 * (a[0] * a[0]) - 2.0 * (a[1] * a[1])) / (1.0 - 2.0 * (a0 * a0) - 2.0 * (a1 * a1))); a[1] = a1; } else { i1 = 1; fac = sqrt((summ2 - 2.0 * (a[0] * a[0])) / (1.0 - 2.0 * (a0 * a0))); } a[0] = a0; for (i=i1; i= gamma) { /* FIXME: rather use an even smaller value, or NA? */ *pval = little; return 0; } y = -log(gamma - y); m = poly(c3, 4, an); s = exp(poly(c4, 4, an)); } else { /* n >= 12 */ m = poly(c5, 4, xx); s = exp(poly(c6, 3, xx)); } if (ncens > 0) { /* Censoring by proportion ncens/n. Calculate mean and sd of normal equivalent deviate of W */ float delta = (float) ncens / an; ld = -log(delta); bf = 1.0 + xx * bf1; r1 = pow(xx90, (double) xx); z90f = z90 + bf * pow(poly(c7, 2, r1), (double) ld); r1 = pow(xx95, (double) xx); z95f = z95 + bf * pow(poly(c8, 2, r1), (double) ld); z99f = z99 + bf * pow(poly(c9, 2, xx), (double) ld); /* Regress Z90F,...,Z99F on normal deviates Z90,...,Z99 to get pseudo-mean and pseudo-sd of z as the slope and intercept */ zfm = (z90f + z95f + z99f) / 3.0; zsd = (z90 * (z90f - zfm) + z95 * (z95f - zfm) + z99 * (z99f - zfm)) / zss; zbar = zfm - zsd * zm; m += zbar * s; s *= zsd; } *pval = normal_cdf_comp(((double) y - (double) m) / (double) s); return 0; } static int sw_sample_check (int n, int n1) { int ncens = n - n1; float delta, an = n; if (n < 3 || n1 < 3) { fprintf(stderr, "There is no way to run SW test for less then 3 obs\n"); return E_DATA; } if (ncens < 0 || (ncens > 0 && n < 20)) { fprintf(stderr, "sw_w: not enough uncensored obserations\n"); return E_DATA; } delta = (float) ncens / an; if (delta > .8f) { fprintf(stderr, "sw_w: too many censored obserations\n"); return E_DATA; } return 0; } /** * shapiro_wilk: * @x: data array. * @t1: starting observation. * @t2: ending observation. * @W: location to receive test statistic. * @pval: location to receive p-value. * * Computes the Shapiro-Wilk W statistic as a test for * normality of the data @x, and also the p-value for * the test. These are written into the pointer * arguments @W and @pval. * * Returns: 0 on success, non-zero on failure. */ int shapiro_wilk (const double *x, int t1, int t2, double *W, double *pval) { int n1 = 0; /* number of uncensored obs */ float *a = NULL; /* array of coefficients */ float *xf = NULL; /* copy of x in float format */ int i, t, n2, n = 0; int err = 0; *W = *pval = NADBL; for (t=t1; t<=t2; t++) { if (!na(x[t])) n++; } /* for now we assume all obs are uncensored */ n1 = n; err = sw_sample_check(n, n1); if (err) { return err; } /* How many coeffs should be computed? */ n2 = fmod(n, 2.0); n2 = (n2 == 0)? n / 2 : (n - 1) / 2; xf = malloc(n * sizeof *xf); a = malloc(n2 * sizeof *a); if (xf == NULL || a == NULL) { err = E_ALLOC; } else { i = 0; for (t=t1; t<=t2; t++) { if (!na(x[t])) { xf[i++] = x[t]; } } qsort(xf, n, sizeof *xf, compare_floats); /* Main job: compute W stat and its p-value */ sw_coeff(n, n2, a); err = sw_w(xf, n, n1, a, W, pval); } free(a); free(xf); return err; } static double round2 (double x) { x *= 100.0; x = (x - floor(x) < 0.5)? floor(x) : ceil(x); return x / 100; } /* Polynomial approximation to the p-value for the Lilliefors test, said to be good to 2 decimal places. See Abdi and Molin, "Lilliefors/Van Soest's test of normality", in Salkind (ed.) Encyclopedia of Measurement and Statistics, Sage, 2007; also http://www.utd.edu/~herve/Abdi-Lillie2007-pretty.pdf */ static double lilliefors_pval (double L, int N) { double b0 = 0.37872256037043; double b1 = 1.30748185078790; double b2 = 0.08861783849346; double b1pN = b1 + N; double Lm2 = 1.0 / (L * L); double A, pv; A = (-b1pN + sqrt(b1pN * b1pN - 4 * b2 * (b0 - Lm2))) / (2 * b2); pv = -0.37782822932809 + 1.67819837908004*A - 3.02959249450445*A*A + 2.80015798142101*pow(A, 3.) - 1.39874347510845*pow(A, 4.) + 0.40466213484419*pow(A, 5.) - 0.06353440854207*pow(A, 6.) + 0.00287462087623*pow(A, 7.) + 0.00069650013110*pow(A, 8.) - 0.00011872227037*pow(A, 9.) + 0.00000575586834*pow(A, 10.); if (pv < 0) { pv = 0; } else if (pv > 1) { pv = 1; } else { /* round to 2 digits */ pv = round2(pv); } return pv; } static int lilliefors_test (const double *x, int t1, int t2, double *L, double *pval) { double *zx; /* copy of x for sorting, z-scoring */ int i, t, n = 0; int err = 0; *L = *pval = NADBL; for (t=t1; t<=t2; t++) { if (!na(x[t])) n++; } if (n < 5) { /* we need more than 4 data points */ return E_DATA; } zx = malloc(n * sizeof *zx); if (zx == NULL) { err = E_ALLOC; } else { double dx, mx = 0.0, sx = 0.0; double Phi, Dp = 0.0, Dm = 0.0; double Dmax = 0.0; /* compute sample mean and standard deviation plus sorted z-scores */ i = 0; for (t=t1; t<=t2; t++) { if (!na(x[t])) { zx[i++] = x[t]; mx += x[t]; } } mx /= n; for (t=t1; t<=t2; t++) { if (!na(x[t])) { dx = x[t] - mx; sx += dx * dx; } } sx = sqrt(sx / (n - 1)); qsort(zx, n, sizeof *zx, gretl_compare_doubles); for (i=0; i Dmax) Dmax = Dp; if (Dm > Dmax) Dmax = Dm; } *L = Dmax; *pval = lilliefors_pval(Dmax, n); } free(zx); return err; } static int skew_kurt_test (const double *x, int t1, int t2, double *test, double *pval, gretlopt opt) { double skew, xkurt; int n, err = 0; err = series_get_moments(t1, t2, x, &skew, &xkurt, &n); if (!err) { if (opt & OPT_J) { /* Jarque-Bera */ *test = (n / 6.0) * (skew * skew + xkurt * xkurt/ 4.0); } else { /* Doornik-Hansen */ *test = doornik_chisq(skew, xkurt, n); } } if (!err && na(*test)) { err = E_NAN; } if (!na(*test)) { *pval = chisq_cdf_comp(2, *test); } return err; } static void print_normality_stat (double test, double pval, gretlopt opt, PRN *prn) { const char *tstrs[] = { N_("Shapiro-Wilk W"), N_("Jarque-Bera test"), N_("Lilliefors test"), N_("Doornik-Hansen test") }; int i = (opt & OPT_W)? 0 : (opt & OPT_J)? 1 : (opt & OPT_L)? 2 : 3; if (na(test) || na(pval)) { return; } if (opt & OPT_L) { pprintf(prn, " %s = %g, %s ~= %g\n\n", _(tstrs[i]), test, _("with p-value"), pval); } else { pprintf(prn, " %s = %g, %s %g\n\n", _(tstrs[i]), test, _("with p-value"), pval); } } /** * gretl_normality_test: * @varno: ID number of the variable to process. * @dset: dataset struct. * @opt: OPT_A: all tests; OPT_D: Doornik-Hansen; OPT_W: Shapiro-Wilk; * OPT_J: Jarque-Bera; OPT_L: Lilliefors; default is Doornik-Hansen. * Also use OPT_Q for quiet. * @prn: gretl printing struct. * * Performs, and prints the results of, the specified test(s) randomness * for the variable specified by @v. * * Returns: 0 on successful completion, non-zero on error. */ int gretl_normality_test (int varno, const DATASET *dset, gretlopt opt, PRN *prn) { gretlopt alltests = OPT_D | OPT_W | OPT_J | OPT_L; double test = NADBL; double pval = NADBL; double trec = NADBL; double pvrec = NADBL; int err; if (varno < 0 || varno >= dset->v) { return E_DATA; } err = incompatible_options(opt, OPT_A | alltests); if (err) { return err; } if (opt & OPT_A) { /* show all tests */ opt |= alltests; } if (!(opt & alltests)) { /* no method selected: use Doornik-Hansen */ opt |= OPT_D; } if (!(opt & OPT_Q)) { pprintf(prn, _("Test for normality of %s:"), dset->varname[varno]); if (opt & OPT_A) { pputs(prn, "\n\n"); } else { pputc(prn, '\n'); } } if (opt & OPT_D) { err = skew_kurt_test(dset->Z[varno], dset->t1, dset->t2, &test, &pval, OPT_D); if (!err && !(opt & OPT_Q)) { print_normality_stat(test, pval, OPT_D, prn); } if (!err) { /* record Doornik-Hansen result by default */ trec = test; pvrec = pval; } } if (opt & OPT_W) { err = shapiro_wilk(dset->Z[varno], dset->t1, dset->t2, &test, &pval); if (!err && !(opt & OPT_Q)) { print_normality_stat(test, pval, OPT_W, prn); } } if (opt & OPT_L) { err = lilliefors_test(dset->Z[varno], dset->t1, dset->t2, &test, &pval); if (!err && !(opt & OPT_Q)) { print_normality_stat(test, pval, OPT_L, prn); } } if (opt & OPT_J) { err = skew_kurt_test(dset->Z[varno], dset->t1, dset->t2, &test, &pval, OPT_J); if (!err && !(opt & OPT_Q)) { print_normality_stat(test, pval, OPT_J, prn); } } if (na(trec) && !na(test)) { trec = test; } if (na(pvrec) && !na(pval)) { pvrec = pval; } if (!na(trec) && !na(pvrec)) { record_test_result(trec, pvrec); } return err; } gretl_matrix *gretl_normtest_matrix (const double *y, int t1, int t2, gretlopt opt, int *err) { gretl_matrix *ret = NULL; double test = NADBL; double pval = NADBL; if (opt & OPT_J) { /* Jarque-Bera */ *err = skew_kurt_test(y, t1, t2, &test, &pval, opt); } else if (opt & OPT_W) { /* Shapiro-Wilk */ *err = shapiro_wilk(y, t1, t2, &test, &pval); } else if (opt & OPT_L) { /* Lilliefors */ *err = lilliefors_test(y, t1, t2, &test, &pval); } else { /* Doornik-Hansen */ *err = skew_kurt_test(y, t1, t2, &test, &pval, opt); } if (!*err) { ret = gretl_matrix_alloc(1, 2); if (ret == NULL) { *err = E_ALLOC; } else { ret->val[0] = test; ret->val[1] = pval; } } return ret; }