0.0) {
*skew = (s[2] / n) / pow(s[1], 1.5);
*kurt = ((s[3] / n) / pow(s[1], 2));
} else {
/* if variance is effectively zero, these should be undef'd */
*skew = *kurt = NADBL;
err = 1;
}
return err;
}
int
multivariate_normality_test (const gretl_matrix *E,
const gretl_matrix *Sigma,
gretlopt opt, PRN *prn)
{
gretl_matrix *S = NULL;
gretl_matrix *V = NULL;
gretl_matrix *C = NULL;
gretl_matrix *X = NULL;
gretl_matrix *R = NULL;
gretl_matrix *evals = NULL;
gretl_matrix *tmp = NULL;
/* convenience pointers: do not free! */
gretl_matrix *H;
gretl_vector *Z1;
gretl_vector *Z2;
double x, skew, kurt;
double X2 = NADBL;
int n, p;
int i, j;
int err = 0;
if (E == NULL || Sigma == NULL) {
err = E_DATA;
goto bailout;
}
p = gretl_matrix_cols(E);
n = gretl_matrix_rows(E);
clear_gretl_matrix_err();
S = gretl_matrix_copy(Sigma);
V = gretl_vector_alloc(p);
C = gretl_matrix_alloc(p, p);
X = gretl_matrix_copy_transpose(E);
R = gretl_matrix_alloc(p, n);
tmp = gretl_matrix_alloc(p, p);
err = get_gretl_matrix_err();
if (err) {
goto bailout;
}
for (i=0; ival[i]);
}
pputc(prn, '\n');
}
/* C should now contain eigenvectors of the original C:
relabel as 'H' for perspicuity */
H = C;
#if 0
gretl_matrix_print_to_prn(H, "Eigenvectors, H", prn);
#endif
gretl_matrix_copy_values(tmp, H);
/* make "tmp" into $H \Lambda^{-1/2}$ */
for (i=0; i
val[j]);
gretl_matrix_set(tmp, i, j, x);
}
}
/* make S into $H \Lambda^{-1/2} H'$ */
gretl_matrix_multiply_mod(tmp, GRETL_MOD_NONE,
H, GRETL_MOD_TRANSPOSE,
S, GRETL_MOD_NONE);
#if 1
gretl_matrix_demean_by_row(X);
#endif
/* compute VX', in X (don't need to subtract means, because these
are OLS residuals)
*/
for (i=0; i
discrete) {
freq->endpt = malloc((n + 1) * sizeof *freq->endpt);
if (freq->endpt == NULL) {
err = E_ALLOC;
}
}
if (!err) {
if (freq->strvals) {
freq->S = strings_array_new(n);
if (freq->S == NULL) {
err = E_ALLOC;
}
} else {
freq->midpt = malloc(n * sizeof *freq->midpt);
if (freq->midpt == NULL) {
err = E_ALLOC;
}
}
}
if (!err) {
freq->f = malloc(n * sizeof *freq->f);
if (freq->f == NULL) {
err = E_ALLOC;
}
}
if (!err) {
freq->numbins = n;
}
return err;
}
int freq_setup (int v, const DATASET *dset, int *pn,
double *pxmax, double *pxmin, int *nbins,
double *binwidth)
{
const double *x = dset->Z[v];
double xrange, xmin = 0, xmax = 0;
int t, k = 0, n = 0;
for (t=dset->t1; t<=dset->t2; t++) {
if (!na(x[t])) {
if (n == 0) {
xmax = xmin = x[t];
} else {
if (x[t] > xmax) xmax = x[t];
if (x[t] < xmin) xmin = x[t];
}
n++;
}
}
if (n < 8) {
gretl_errmsg_sprintf(_("Insufficient data to build frequency "
"distribution for variable %s"),
dset->varname[v]);
return E_TOOFEW;
}
xrange = xmax - xmin;
if (xrange == 0) {
gretl_errmsg_sprintf(_("%s is a constant"), dset->varname[v]);
return E_DATA;
}
if (*nbins > 0) {
k = *nbins;
} else if (n < 16) {
k = 5;
} else if (n < 50) {
k = 7;
} else if (n > 850) {
k = 29;
} else {
k = (int) sqrt((double) n);
if (k > 99) {
k = 99;
}
}
if (k % 2 == 0) {
k++;
}
*pn = n;
*pxmax = xmax;
*pxmin = xmin;
*nbins = k;
*binwidth = xrange / (k - 1);
return 0;
}
/* calculate test stat for distribution, if the sample
is big enough */
static void
freq_dist_stat (FreqDist *freq, const double *x, gretlopt opt, int k)
{
double skew, kurt;
freq->test = NADBL;
freq->dist = 0;
gretl_moments(freq->t1, freq->t2, x, NULL,
&freq->xbar, &freq->sdx,
&skew, &kurt, k);
if (freq->n > 7) {
if (opt & OPT_O) {
freq->test = lockes_test(x, freq->t1, freq->t2);
freq->dist = D_GAMMA;
} else if (opt & OPT_Z) {
freq->test = doornik_chisq(skew, kurt, freq->n);
freq->dist = D_NORMAL;
}
}
}
struct strval_sorter {
char *s;
int n;
};
int compare_strvals (const void *a, const void *b)
{
const struct strval_sorter *sa = a;
const struct strval_sorter *sb = b;
return strcmp(sa->s, sb->s);
}
FreqDist *get_string_freq (int v, const DATASET *dset,
int *err)
{
FreqDist *freq;
struct strval_sorter *ss;
const double *x = dset->Z[v];
series_table *st;
char **S;
int i, t, n, ns;
freq = freq_new(dset->varname[v]);
if (freq == NULL) {
*err = E_ALLOC;
return NULL;
}
st = series_get_string_table(dset, v);
S = series_table_get_strings(st, &ns);
ss = malloc(ns * sizeof *ss);
if (ss == NULL) {
*err = E_ALLOC;
free(freq);
return NULL;
}
for (i=0; it1; t<=dset->t2; t++) {
if (!na(x[t])) {
i = x[t] - 1;
ss[i].n += 1;
n++;
}
}
qsort(ss, ns, sizeof *ss, compare_strvals);
freq->t1 = dset->t1;
freq->t2 = dset->t2;
freq->n = n;
freq->discrete = 1;
freq->strvals = 1;
if (freq_add_arrays(freq, ns)) {
*err = E_ALLOC;
} else {
for (i=0; iS[i] = gretl_strdup(ss[i].s);
freq->f[i] = ss[i].n;
}
}
free(ss);
if (*err && freq != NULL) {
free_freq(freq);
freq = NULL;
}
return freq;
}
FreqDist *get_discrete_freq (int v, const DATASET *dset,
gretlopt opt, int *err)
{
FreqDist *freq;
const double *x = dset->Z[v];
int *ifreq = NULL;
double *ivals = NULL;
double *sorted = NULL;
double last;
int i, t, nv;
freq = freq_new(dset->varname[v]);
if (freq == NULL) {
*err = E_ALLOC;
return NULL;
}
freq->t1 = dset->t1;
freq->t2 = dset->t2;
freq->n = 0;
for (t=freq->t1; t<=freq->t2; t++) {
if (!na(x[t])) {
freq->n += 1;
}
}
if (freq->n < 3) {
gretl_errmsg_sprintf(_("Insufficient data to build frequency "
"distribution for variable %s"),
dset->varname[v]);
*err = E_TOOFEW;
goto bailout;
}
freq->discrete = 1;
freq->test = NADBL;
freq->dist = 0;
sorted = malloc(freq->n * sizeof *sorted);
if (sorted == NULL) {
*err = E_ALLOC;
goto bailout;
}
i = 0;
for (t=freq->t1; t<=freq->t2; t++) {
if (!na(x[t])) {
sorted[i++] = x[t];
}
}
qsort(sorted, freq->n, sizeof *sorted, gretl_compare_doubles);
nv = count_distinct_values(sorted, freq->n);
if (nv >= 10 && !(opt & OPT_X)) {
freq_dist_stat(freq, x, opt, 1);
} else if (opt & (OPT_Z | OPT_O)) {
freq->xbar = gretl_mean(freq->t1, freq->t2, x);
freq->sdx = gretl_stddev(freq->t1, freq->t2, x);
}
ifreq = malloc(nv * sizeof *ifreq);
ivals = malloc(nv * sizeof *ivals);
if (ifreq == NULL || ivals == NULL) {
*err = E_ALLOC;
goto bailout;
}
ivals[0] = last = sorted[0];
ifreq[0] = i = 1;
for (t=1; tn; t++) {
if (sorted[t] != last) {
last = sorted[t];
ifreq[i] = 1;
ivals[i++] = last;
} else {
ifreq[i-1] += 1;
}
}
if (freq_add_arrays(freq, nv)) {
*err = E_ALLOC;
} else {
int allints = 1;
for (i=0; imidpt[i] = ivals[i];
freq->f[i] = ifreq[i];
}
if (allints) {
freq->discrete = 2;
}
}
bailout:
free(sorted);
free(ivals);
free(ifreq);
if (*err && freq != NULL) {
free_freq(freq);
freq = NULL;
}
return freq;
}
/**
* get_freq:
* @varno: ID number of variable to process.
* @dset: dataset struct.
* @fmin: lower limit of left-most bin (or #NADBL for automatic).
* @fwid: bin width (or #NADBL for automatic).
* @nbins: number of bins to use (or 0 for automatic).
* @params: degrees of freedom loss (generally = 1 unless we're dealing
* with the residual from a regression).
* @opt: if includes %OPT_Z, set up for comparison with normal dist;
* if includes %OPT_O, compare with gamma distribution;
* if includes %OPT_S we're not printing results; if includes %OPT_D,
* treat the variable as discrete; %OPT_X indicates that this function
* is called as part of a cross-tabulation; if includes %OPT_G,
* add the arrays that are needed for a plot even when we're not
* printing (%OPT_S).
* @err: location to receive error code.
*
* Calculates the frequency distribution for the specified variable.
*
* Returns: pointer to struct containing the distribution.
*/
FreqDist *get_freq (int varno, const DATASET *dset,
double fmin, double fwid,
int nbins, int params,
gretlopt opt, int *err)
{
FreqDist *freq;
const double *x;
double xx, xmin, xmax;
double binwidth = fwid;
int t, k, n;
if (is_string_valued(dset, varno)) {
return get_string_freq(varno, dset, err);
}
if (series_is_discrete(dset, varno) || (opt & OPT_D)) {
return get_discrete_freq(varno, dset, opt, err);
}
if (gretl_isdiscrete(dset->t1, dset->t2, dset->Z[varno]) > 1) {
return get_discrete_freq(varno, dset, opt, err);
}
freq = freq_new(dset->varname[varno]);
if (freq == NULL) {
*err = E_ALLOC;
return NULL;
}
*err = freq_setup(varno, dset, &n, &xmax, &xmin, &nbins, &binwidth);
if (*err) {
goto bailout;
}
if (!na(fmin) && !na(fwid)) {
/* endogenous implied number of bins */
nbins = (int) ceil((xmax - fmin) / fwid);
if (nbins <= 0 || nbins > 5000) {
*err = E_INVARG;
goto bailout;
} else {
binwidth = fwid;
}
}
freq->t1 = dset->t1;
freq->t2 = dset->t2;
freq->n = n;
x = dset->Z[varno];
freq_dist_stat(freq, x, opt, params);
if (freq_add_arrays(freq, nbins)) {
*err = E_ALLOC;
goto bailout;
}
if (na(fmin) || na(fwid)) {
freq->endpt[0] = xmin - .5 * binwidth;
if (xmin > 0.0 && freq->endpt[0] < 0.0) {
double rshift;
freq->endpt[0] = 0.0;
rshift = 1.0 - xmin / binwidth;
freq->endpt[freq->numbins] = xmax + rshift * binwidth;
} else {
freq->endpt[freq->numbins] = xmax + .5 * binwidth;
}
} else {
freq->endpt[0] = fmin;
freq->endpt[freq->numbins] = fmin + nbins * fwid;
}
for (k=0; knumbins; k++) {
freq->f[k] = 0;
if (k > 0) {
freq->endpt[k] = freq->endpt[k-1] + binwidth;
}
freq->midpt[k] = freq->endpt[k] + .5 * binwidth;
}
for (t=dset->t1; t<=dset->t2; t++) {
xx = x[t];
if (na(xx)) {
continue;
}
if (xx < freq->endpt[1]) {
freq->f[0] += 1;
} else if (xx >= freq->endpt[freq->numbins]) {
freq->f[freq->numbins - 1] += 1;
continue;
} else {
for (k=1; knumbins; k++) {
if (freq->endpt[k] <= xx && xx < freq->endpt[k+1]) {
freq->f[k] += 1;
break;
}
}
}
}
bailout:
if (*err && freq != NULL) {
free_freq(freq);
freq = NULL;
}
return freq;
}
static void record_freq_test (const FreqDist *freq)
{
double pval = NADBL;
if (freq->dist == D_NORMAL) {
pval = chisq_cdf_comp(2, freq->test);
} else if (freq->dist == D_GAMMA) {
pval = normal_pvalue_2(freq->test);
}
if (!na(pval)) {
record_test_result(freq->test, pval);
}
}
static int check_freq_opts (gretlopt opt, int *n_bins,
double *fmin, double *fwid)
{
double x;
int err = 0;
if (opt & OPT_N) {
/* number of bins given */
if (opt & (OPT_M | OPT_W)) {
/* can't give min, width spec */
return E_BADOPT;
} else {
int n = get_optval_int(FREQ, OPT_N, &err);
if (!err) {
if (n < 2 || n > 10000) {
err = E_INVARG;
} else {
*n_bins = n;
}
}
if (err) {
return err;
}
}
}
if (opt & OPT_M) {
/* minimum specified */
if (!(opt & OPT_W)) {
/* but no width given */
return E_ARGS;
} else {
x = get_optval_double(FREQ, OPT_M, &err);
if (err) {
return err;
} else if (na(x)) {
return E_ARGS;
} else {
*fmin = x;
}
}
}
if (opt & OPT_W) {
/* width specified */
if (!(opt & OPT_M)) {
/* but no min given */
return E_ARGS;
} else {
x = get_optval_double(FREQ, OPT_W, &err);
if (err) {
return err;
} else if (na(x)) {
return E_ARGS;
} else if (x <= 0) {
return E_INVARG;
} else {
*fwid = x;
}
}
}
return 0;
}
static void record_freq_matrix (FreqDist *fd)
{
gretl_matrix *m = NULL;
char **Sr = NULL;
int i, n = fd->numbins;
if (fd->S != NULL) {
m = gretl_matrix_alloc(n, 1);
Sr = strings_array_dup(fd->S, n);
} else {
m = gretl_matrix_alloc(n, 2);
}
if (m != NULL) {
char **Sc = NULL;
if (fd->S == NULL) {
Sc = strings_array_new(2);
if (Sc != NULL) {
Sc[0] = gretl_strdup("midpoint");
Sc[1] = gretl_strdup("count");
}
}
for (i=0; iS != NULL) {
gretl_matrix_set(m, i, 0, fd->f[i]);
} else {
gretl_matrix_set(m, i, 0, fd->midpt[i]);
gretl_matrix_set(m, i, 1, fd->f[i]);
}
}
if (Sc != NULL) {
gretl_matrix_set_colnames(m, Sc);
}
if (Sr != NULL) {
gretl_matrix_set_rownames(m, Sr);
}
set_last_result_data(m, GRETL_TYPE_MATRIX);
}
}
/* Wrapper function: get the distribution, print it if
wanted, graph it if wanted, then free stuff.
*/
int freqdist (int varno, const DATASET *dset,
gretlopt opt, PRN *prn)
{
FreqDist *freq = NULL;
DistCode dist = D_NONE;
PlotType ptype = 0;
double fmin = NADBL;
double fwid = NADBL;
int n_bins = 0;
int do_graph = 0;
int err;
if (opt & OPT_O) {
dist = D_GAMMA;
ptype = PLOT_FREQ_GAMMA;
} else if (opt & OPT_Z) {
dist = D_NORMAL;
ptype = PLOT_FREQ_NORMAL;
} else {
ptype = PLOT_FREQ_SIMPLE;
}
if (!(opt & OPT_Q)) {
if (opt & OPT_G) {
/* it's a legacy thing */
opt |= OPT_U;
opt &= ~OPT_G;
set_optval_string(FREQ, OPT_U, "display");
}
do_graph = gnuplot_graph_wanted(ptype, opt);
}
err = check_freq_opts(opt, &n_bins, &fmin, &fwid);
if (!err) {
gretlopt fopt = opt;
if (do_graph) {
fopt |= OPT_G;
}
freq = get_freq(varno, dset, fmin, fwid, n_bins, 1, fopt, &err);
}
if (!err) {
if (!(opt & OPT_S)) {
print_freq(freq, varno, dset, prn);
} else if (dist) {
record_freq_test(freq);
}
record_freq_matrix(freq);
if (do_graph && freq->numbins < 2) {
do_graph = 0;
}
if (do_graph) {
int gerr = plot_freq(freq, dist, opt);
if (gerr) {
pputs(prn, _("gnuplot command failed\n"));
do_graph = 0;
}
}
free_freq(freq);
}
return err;
}
/* Wrapper function: get the frequency distribution and write it into
a gretl_matrix: the first column holds the mid-points of the bins
and the second holds the frequencies.
*/
gretl_matrix *freqdist_matrix (const double *x, int t1, int t2, int *err)
{
DATASET *dset = NULL;
FreqDist *freq = NULL;
gretl_matrix *m = NULL;
int i, t, T = t2 - t1 + 1;
dset = create_auxiliary_dataset(1, T, 0);
if (dset == NULL) {
*err = E_ALLOC;
}
if (!*err) {
i = 0;
for (t=t1; t<=t2; t++) {
dset->Z[0][i++] = x[t];
}
freq = get_freq(0, dset, NADBL, NADBL, 0, 1, OPT_NONE, err);
}
if (!*err) {
m = gretl_matrix_alloc(freq->numbins, 2);
if (m == NULL) {
*err = E_ALLOC;
}
}
if (!*err) {
for (i=0; inumbins; i++) {
gretl_matrix_set(m, i, 0, freq->midpt[i]);
gretl_matrix_set(m, i, 1, freq->f[i]);
}
}
destroy_dataset(dset);
free_freq(freq);
return m;
}
gretl_matrix *xtab_to_matrix (const Xtab *tab)
{
gretl_matrix *m;
double x;
int i, j;
if (tab == NULL) {
return NULL;
}
m = gretl_matrix_alloc(tab->rows, tab->cols);
if (m == NULL) {
return NULL;
}
for (j=0; jcols; j++) {
for (i=0; irows; i++) {
x = (double) tab->f[i][j];
gretl_matrix_set(m, i, j, x);
}
}
return m;
}
static void *last_result;
static GretlType last_result_type;
void *get_last_result_data (GretlType *type, int *err)
{
void *ret = NULL;
if (last_result == NULL) {
*type = GRETL_TYPE_NONE;
*err = E_BADSTAT;
} else {
*type = last_result_type;
if (*type == GRETL_TYPE_MATRIX) {
ret = gretl_matrix_copy(last_result);
} else {
ret = gretl_bundle_copy(last_result, err);
}
}
return ret;
}
void set_last_result_data (void *data, GretlType type)
{
if (last_result != NULL) {
if (last_result_type == GRETL_TYPE_MATRIX) {
gretl_matrix_free(last_result);
} else if (last_result_type == GRETL_TYPE_BUNDLE) {
gretl_bundle_destroy(last_result);
}
}
last_result = data;
last_result_type = type;
}
void last_result_cleanup (void)
{
if (last_result != NULL) {
if (last_result_type == GRETL_TYPE_MATRIX) {
gretl_matrix_free(last_result);
} else if (last_result_type == GRETL_TYPE_BUNDLE) {
gretl_bundle_destroy(last_result);
}
}
last_result = NULL;
last_result_type = 0;
}
/**
* free_xtab:
* @tab: pointer to gretl crosstab struct.
*
* Frees all resources associated with @tab, and the
* pointer itself.
*/
void free_xtab (Xtab *tab)
{
int i;
if (tab == NULL) {
return;
}
free(tab->rtotal);
free(tab->ctotal);
free(tab->rval);
free(tab->cval);
if (tab->f != NULL) {
for (i=0; irows; i++) {
free(tab->f[i]);
}
free(tab->f);
}
if (tab->Sr != NULL) {
strings_array_free(tab->Sr, tab->rows);
}
if (tab->Sc != NULL) {
strings_array_free(tab->Sc, tab->cols);
}
free(tab);
}
static Xtab *xtab_new (int n, int t1, int t2)
{
Xtab *tab = malloc(sizeof *tab);
if (tab == NULL) return NULL;
tab->rtotal = NULL;
tab->ctotal = NULL;
tab->rval = NULL;
tab->cval = NULL;
tab->f = NULL;
tab->n = n;
tab->t1 = t1;
tab->t2 = t2;
tab->missing = 0;
*tab->rvarname = '\0';
*tab->cvarname = '\0';
tab->Sr = NULL;
tab->Sc = NULL;
tab->rstrs = 0;
tab->cstrs = 0;
return tab;
}
/* allocate the required arrays (apart from S, which must be
handled separately) and initialize counters to zero
*/
static int xtab_allocate_arrays (Xtab *tab)
{
int i, j, err = 0;
if (!tab->rstrs && tab->rval == NULL) {
tab->rval = malloc(tab->rows * sizeof *tab->rval);
if (tab->rval == NULL) {
err = E_ALLOC;
}
}
if (!err && !tab->cstrs && tab->cval == NULL) {
tab->cval = malloc(tab->cols * sizeof *tab->cval);
if (tab->cval == NULL) {
err = E_ALLOC;
}
}
if (!err) {
tab->rtotal = malloc(tab->rows * sizeof *tab->rtotal);
tab->ctotal = malloc(tab->cols * sizeof *tab->ctotal);
if (tab->rtotal == NULL || tab->ctotal == NULL) {
err = E_ALLOC;
} else {
for (i=0; irows; i++) {
tab->rtotal[i] = 0;
}
for (j=0; jcols; j++) {
tab->ctotal[j] = 0;
}
}
}
if (!err) {
tab->f = malloc(tab->rows * sizeof *tab->f);
if (tab->f == NULL) {
err = E_ALLOC;
} else {
for (i=0; irows; i++) {
tab->f[i] = NULL;
}
}
}
for (i=0; irows && !err; i++) {
tab->f[i] = malloc(tab->cols * sizeof *tab->f[i]);
if (tab->f[i] == NULL) {
err = E_ALLOC;
} else {
for (j=0; jcols; j++) {
tab->f[i][j] = 0;
}
}
}
return err;
}
/* also used in gretl_matrix.c */
int compare_xtab_rows (const void *a, const void *b)
{
const double **da = (const double **) a;
const double **db = (const double **) b;
int ret;
ret = da[0][0] - db[0][0];
if (ret == 0) {
ret = da[0][1] - db[0][1];
}
return ret;
}
static int xtab_get_data (Xtab *tab, int v, int j,
const DATASET *dset,
series_table **pst)
{
double **xtarg = (j == 1)? &tab->cval : &tab->rval;
char ***Starg = (j == 1)? &tab->Sc : &tab->Sr;
int *itarg = (j == 1)? &tab->cols : &tab->rows;
int *ttarg = (j == 1)? &tab->cstrs : &tab->rstrs;
double *x = dset->Z[v] + dset->t1;
int n = sample_size(dset);
gretl_matrix *u;
int err = 0;
u = gretl_matrix_values(x, n, OPT_S, &err);
if (!err && is_string_valued(dset, v)) {
series_table *st;
int ns, nv = u->rows;
char **S0, **S = NULL;
*pst = st = series_get_string_table(dset, v);
S0 = series_table_get_strings(st, &ns);
if (ns > nv) {
int i, k;
S = strings_array_new(nv);
if (S != NULL) {
for (i=0; ival[i] - 1;
S[i] = gretl_strdup(S0[k]);
}
}
} else {
S = strings_array_dup(S0, ns);
}
if (S == NULL) {
err = E_ALLOC;
} else {
*ttarg = 1;
*itarg = nv;
*Starg = S;
strings_array_sort(Starg, &nv, OPT_NONE);
}
} else if (!err) {
*itarg = u->rows;
*xtarg = u->val;
u->val = NULL;
}
gretl_matrix_free(u);
return err;
}
static int xtab_row_match (Xtab *tab, int i, const char *s, double x)
{
if (tab->rstrs) {
return strcmp(s, tab->Sr[i]) == 0;
} else {
return x == tab->rval[i];
}
}
static int xtab_col_match (Xtab *tab, int j, const char *s, double x)
{
if (tab->cstrs) {
return strcmp(s, tab->Sc[j]) == 0;
} else {
return x == tab->cval[j];
}
}
#define complete_obs(x,y,t) (!na(x[t]) && !na(y[t]))
/* crosstab struct creation functions */
static Xtab *get_new_xtab (int rv, int cv, const DATASET *dset,
int *err)
{
series_table *sti = NULL;
series_table *stj = NULL;
const char *s1 = NULL;
const char *s2 = NULL;
double x1 = 0, x2 = 0;
Xtab *tab = NULL;
int imatch, jmatch;
int i, j, t, n = 0;
for (t=dset->t1; t<=dset->t2; t++) {
if (complete_obs(dset->Z[rv], dset->Z[cv], t)) {
n++;
}
}
if (n == 0) {
*err = E_MISSDATA;
} else {
tab = xtab_new(n, dset->t1, dset->t2);
if (tab == NULL) {
*err = E_ALLOC;
}
}
if (!*err) {
/* assemble row data */
*err = xtab_get_data(tab, rv, 0, dset, &sti);
}
if (!*err) {
/* assemble column data */
*err = xtab_get_data(tab, cv, 1, dset, &stj);
}
if (!*err) {
*err = xtab_allocate_arrays(tab);
}
if (*err) goto bailout;
tab->missing = (dset->t2 - dset->t1 + 1) - n;
strcpy(tab->rvarname, dset->varname[rv]);
strcpy(tab->cvarname, dset->varname[cv]);
/* The following could be made more efficient by substituting
sorted arrays for dset->Z[rv] and dset->Z[cv] but I'm not
sure if the fixed cost would be recouped.
*/
for (t=dset->t1; t<=dset->t2; t++) {
if (!complete_obs(dset->Z[rv], dset->Z[cv], t)) {
continue;
}
x1 = dset->Z[rv][t];
if (tab->rstrs) {
s1 = series_table_get_string(sti, x1);
}
x2 = dset->Z[cv][t];
if (tab->cstrs) {
s2 = series_table_get_string(stj, x2);
}
for (i=0; irows; i++) {
imatch = xtab_row_match(tab, i, s1, x1);
if (imatch) {
jmatch = 0;
for (j=0; jcols && !jmatch; j++) {
jmatch = xtab_col_match(tab, j, s2, x2);
if (jmatch) {
tab->f[i][j] += 1;
tab->rtotal[i] += 1;
tab->ctotal[j] += 1;
}
}
break;
}
}
}
bailout:
if (*err) {
free_xtab(tab);
tab = NULL;
}
return tab;
}
static void record_xtab (const Xtab *tab, const DATASET *dset,
gretlopt opt)
{
gretl_matrix *X = NULL;
char **Sc = NULL, **Sr = NULL;
double xij, cj, ri;
int totals, rows, cols;
int i, j;
totals = (opt & OPT_N)? 0 : 1;
rows = tab->rows + totals;
cols = tab->cols + totals;
X = gretl_zero_matrix_new(rows, cols);
if (X == NULL) {
return;
}
/* column labels */
Sc = strings_array_new(cols);
if (Sc != NULL) {
for (j=0; jcols; j++) {
if (tab->cstrs) {
Sc[j] = gretl_strdup(tab->Sc[j]);
} else {
cj = tab->cval[j];
Sc[j] = gretl_strdup_printf("%4g", cj);
}
}
if (totals) {
Sc[cols-1] = gretl_strdup("TOTAL");
}
}
/* row labels */
Sr = strings_array_new(rows);
if (Sr != NULL) {
for (i=0; irows; i++) {
if (tab->rstrs) {
Sr[i] = gretl_strdup(tab->Sr[i]);
} else {
ri = tab->rval[i];
Sr[i] = gretl_strdup_printf("%4g", ri);
}
}
if (totals) {
Sr[rows-1] = gretl_strdup("TOTAL");
}
}
/* body of table */
for (i=0; irows; i++) {
if (tab->rtotal[i] > 0) {
/* row counts */
for (j=0; jcols; j++) {
if (tab->ctotal[j] > 0) {
if (opt & (OPT_C | OPT_R)) {
if (opt & OPT_C) {
xij = 100.0 * tab->f[i][j] / tab->ctotal[j];
} else {
xij = 100.0 * tab->f[i][j] / tab->rtotal[i];
}
} else {
xij = tab->f[i][j];
}
gretl_matrix_set(X, i, j, xij);
}
}
if (totals) {
/* row totals */
if (opt & OPT_C) {
xij = 100.0 * tab->rtotal[i] / tab->n;
} else {
xij = tab->rtotal[i];
}
gretl_matrix_set(X, i, cols-1, xij);
}
}
}
if (totals) {
/* column totals */
for (j=0; jcols; j++) {
if (opt & OPT_R) {
xij = 100.0 * tab->ctotal[j] / tab->n;
} else {
xij = tab->ctotal[j];
}
gretl_matrix_set(X, rows-1, j, xij);
}
gretl_matrix_set(X, rows-1, cols-1, tab->n);
}
gretl_matrix_set_colnames(X, Sc);
gretl_matrix_set_rownames(X, Sr);
set_last_result_data(X, GRETL_TYPE_MATRIX);
}
/* For use in the context of "xtab" with --quiet option:
compute and record the Pearson chi-square value and its
p-value.
*/
static int xtab_do_pearson (const Xtab *tab)
{
double x, y, ymin = 1.0e-7;
double pearson = 0.0;
int i, j, err = 0;
for (i=0; irows && !err; i++) {
if (tab->rtotal[i] > 0) {
for (j=0; jcols && !err; j++) {
y = ((double) tab->rtotal[i] * tab->ctotal[j]) / tab->n;
if (y < ymin) {
err = E_DATA;
} else {
x = (double) tab->f[i][j] - y;
pearson += x * x / y;
}
}
}
}
if (!err) {
int df = (tab->rows - 1) * (tab->cols - 1);
double pval = chisq_cdf_comp(df, pearson);
if (!na(pval)) {
record_test_result(pearson, pval);
} else {
err = E_DATA;
}
}
if (err) {
record_test_result(NADBL, NADBL);
}
return err;
}
int crosstab_from_matrix (gretlopt opt, PRN *prn)
{
const char *mname;
const gretl_matrix *m;
Xtab *tab;
int i, j, nvals, n = 0;
double x;
int err = 0;
mname = get_optval_string(XTAB, OPT_X);
if (mname == NULL) {
return E_DATA;
}
m = get_matrix_by_name(mname);
if (m == NULL) {
return E_UNKVAR;
}
if (m->rows < 2 || m->cols < 2) {
err = E_DATA;
}
nvals = m->rows * m->cols;
for (i=0; ival[i];
if (x < 0 || x != floor(x) || x > INT_MAX) {
err = E_DATA;
}
n += x;
}
if (err) {
gretl_errmsg_sprintf(_("Matrix %s does not represent a "
"contingency table"), mname);
return err;
}
tab = xtab_new(n, 0, 0);
if (tab == NULL) {
return E_ALLOC;
}
tab->rows = m->rows;
tab->cols = m->cols;
if (xtab_allocate_arrays(tab)) {
free_xtab(tab);
return E_ALLOC;
}
for (i=0; irows; i++) {
tab->rval[i] = i + 1;
tab->rtotal[i] = 0.0;
for (j=0; jcols; j++) {
tab->f[i][j] = gretl_matrix_get(m, i, j);
tab->rtotal[i] += tab->f[i][j];
}
}
for (j=0; jcols; j++) {
tab->cval[j] = j + 1;
tab->ctotal[j] = 0.0;
for (i=0; irows; i++) {
tab->ctotal[j] += tab->f[i][j];
}
}
if (opt & OPT_Q) {
xtab_do_pearson(tab);
} else {
print_xtab(tab, NULL, opt | OPT_S, prn);
}
free_xtab(tab);
return err;
}
int crosstab (const int *list, const DATASET *dset,
gretlopt opt, PRN *prn)
{
Xtab *tab;
int *rowvar = NULL;
int *colvar = NULL;
int i, j, vi, vj, k;
int pos, onelist;
int nrv, ncv;
int err = 0;
pos = gretl_list_separator_position(list);
onelist = (pos == 0);
if (pos == 0) {
/* single list case */
nrv = list[0];
ncv = nrv - 1;
} else {
/* double list case */
nrv = pos - 1;
ncv = list[0] - pos;
}
if (nrv == 0 || ncv == 0) {
return E_PARSE;
}
rowvar = gretl_list_new(nrv);
if (rowvar == NULL) {
return E_ALLOC;
}
j = 1;
for (i=1; i<=nrv; i++) {
k = list[i];
if (accept_as_discrete(dset, k, 0)) {
rowvar[j++] = k;
} else {
pprintf(prn, _("dropping %s: not a discrete variable\n"),
dset->varname[k]);
rowvar[0] -= 1;
}
}
if (rowvar[0] == 0 || (onelist && rowvar[0] == 1)) {
gretl_errmsg_set("xtab: variables must be discrete");
free(rowvar);
return E_TYPES;
}
if (onelist && rowvar[0] == 2) {
/* the bivariate case */
tab = get_new_xtab(rowvar[1], rowvar[2], dset, &err);
if (!err) {
/* make $result matrix available */
record_xtab(tab, dset, opt);
if (opt & OPT_Q) {
/* quiet: run and record the Pearson test */
xtab_do_pearson(tab);
} else {
/* print, and record Pearson test */
print_xtab(tab, dset, opt | OPT_S, prn);
}
free_xtab(tab);
}
goto finish;
}
if (!onelist) {
/* construct the second list */
colvar = gretl_list_new(ncv);
if (colvar == NULL) {
err = E_ALLOC;
} else {
j = 1;
for (i=1; i<=ncv; i++) {
k = list[pos+i];
if (accept_as_discrete(dset, k, 0)) {
colvar[j++] = k;
} else {
colvar[0] -= 1;
}
}
if (colvar[0] == 0) {
err = E_TYPES;
}
}
}
for (i=1; i<=rowvar[0] && !err; i++) {
vi = rowvar[i];
if (onelist) {
/* single list case */
for (j=1; jdist == D_NORMAL) {
pval = chisq_cdf_comp(2, freq->test);
} else if (freq->dist == D_GAMMA) {
pval = normal_pvalue_2(freq->test);
}
if (na(pval)) {
return E_NAN;
} else {
record_test_result(freq->test, pval);
return 0;
}
}
int model_error_dist (const MODEL *pmod, DATASET *dset,
gretlopt opt, PRN *prn)
{
FreqDist *freq = NULL;
gretlopt fopt = OPT_Z; /* show normality test */
int save_t1 = dset->t1;
int save_t2 = dset->t2;
int err = 0;
if (pmod == NULL || pmod->uhat == NULL) {
return E_DATA;
}
err = gretl_model_get_normality_test(pmod, prn);
if (!err) {
return 0;
} else if (LIMDEP(pmod->ci)) {
return err;
} else {
err = 0;
}
if (exact_fit_check(pmod, prn)) {
return 0;
}
if (genr_fit_resid(pmod, dset, M_UHAT)) {
return E_ALLOC;
}
if (!err) {
dset->t1 = pmod->t1;
dset->t2 = pmod->t2;
freq = get_freq(dset->v - 1, dset, NADBL, NADBL, 0,
pmod->ncoeff, fopt, &err);
}
if (!err) {
if (opt & OPT_I) {
err = just_record_freq_test(freq);
} else if (opt & OPT_Q) {
print_freq_test(freq, prn);
} else {
print_freq(freq, 0, NULL, prn);
}
free_freq(freq);
}
dataset_drop_last_variables(dset, 1);
dset->t1 = save_t1;
dset->t2 = save_t2;
return err;
}
/* PACF via Durbin-Levinson algorithm */
static int get_pacf (gretl_matrix *A)
{
int m = gretl_matrix_rows(A);
gretl_matrix *phi;
double *acf = A->val;
double *pacf = acf + m;
double x, num, den;
int i, j;
phi = gretl_matrix_alloc(m, m);
if (phi == NULL) {
return E_ALLOC;
}
gretl_matrix_set(A, 0, 1, acf[0]);
gretl_matrix_set(phi, 0, 0, acf[0]);
for (i=1; i T / 5) {
/* restrict to 20 percent of data (Tadeusz) */
p = T / 5;
}
return p;
}
/**
* gretl_acf:
* @k: lag order.
* @t1: starting observation.
* @t2: ending observation.
* @y: data series.
* @ybar: mean of @y over range @t1 to @t2.
*
* Returns: the autocorrelation at lag @k for the series @y over
* the range @t1 to @t2, or #NADBL on failure.
*/
static double gretl_acf (int k, int t1, int t2, const double *y,
double ybar)
{
double z, num = 0, den = 0;
int t, n = 0;
for (t=t1; t<=t2; t++) {
if (na(y[t])) {
return NADBL;
}
z = y[t] - ybar;
den += z * z;
if (t - k >= t1) {
if (na(y[t-k])) {
return NADBL;
}
num += z * (y[t-k] - ybar);
n++;
}
}
if (n == 0) {
return NADBL;
}
return num / den;
}
/**
* ljung_box:
* @m: maximum lag.
* @t1: starting observation.
* @t2: ending observation.
* @y: data series.
* @err: location to receive error code.
*
* Returns: the Ljung-Box statistic for lag order @m for
* the series @y over the sample @t1 to @t2, or #NADBL
* on failure.
*/
double ljung_box (int m, int t1, int t2, const double *y, int *err)
{
double acf, ybar = 0.0, LB = 0.0;
int k, n = t2 - t1 + 1;
*err = 0;
if (n == 0 || gretl_isconst(t1, t2, y)) {
*err = E_DATA;
} else if (m <= 0) {
gretl_errmsg_sprintf(_("Invalid lag order %d"), m);
*err = E_DATA;
} else {
ybar = gretl_mean(t1, t2, y);
if (na(ybar)) {
*err = E_DATA;
}
}
if (*err) {
return NADBL;
}
/* calculate acf up to lag m, cumulating LB */
for (k=1; k<=m; k++) {
acf = gretl_acf(k, t1, t2, y, ybar);
if (na(acf)) {
*err = E_MISSDATA;
break;
}
LB += acf * acf / (n - k);
}
if (*err) {
LB = NADBL;
} else {
LB *= n * (n + 2.0);
}
return LB;
}
/**
* gretl_xcf:
* @k: lag order (or lead order if < 0).
* @t1: starting observation.
* @t2: ending observation.
* @x: first data series.
* @y: second data series.
* @xbar: mean of first series.
* @ybar: mean of second series.
*
* Returns: the cross-correlation at lag (or lead) @k for the
* series @x and @y over the range @t1 to @t2, or #NADBL on failure.
*/
static double
gretl_xcf (int k, int t1, int t2, const double *x, const double *y,
double xbar, double ybar)
{
double num = 0, den1 = 0, den2 = 0;
double zx, zy;
int t;
for (t=t1; t<=t2; t++) {
if (na(x[t]) || na(y[t])) {
return NADBL;
}
zx = x[t] - xbar;
zy = y[t] - ybar;
den1 += zx * zx;
den2 += zy * zy;
if ((k >= 0 && t - k >= t1) || (k < 0 && t - k <= t2)) {
num += zx * (y[t-k] - ybar);
}
}
return num / sqrt(den1 * den2);
}
static int corrgm_ascii_plot (const char *vname,
const gretl_matrix *A,
PRN *prn)
{
int k, m = gretl_matrix_rows(A);
double *xk = malloc(m * sizeof *xk);
if (xk == NULL) {
return E_ALLOC;
}
for (k=0; kval, xk, m, vname, _("lag"), prn);
free(xk);
return 0;
}
static void handle_corrgm_plot_options (int ci,
gretlopt opt,
int *ascii,
int *gp)
{
if (opt & OPT_Q) {
/* backward compat: --quiet = no plot */
*ascii = *gp = 0;
return;
}
if (opt & OPT_U) {
/* --plot=whatever */
const char *s = get_optval_string(ci, OPT_U);
if (s != NULL) {
if (!strcmp(s, "none")) {
/* no plot */
*ascii = *gp = 0;
return;
} else if (!strcmp(s, "ascii")) {
*ascii = 1;
*gp = 0;
return;
} else {
/* implies use of gnuplot */
*gp = 1;
return;
}
}
}
/* the defaults */
if (gretl_in_batch_mode()) {
*ascii = 1;
} else {
*gp = 1;
}
}
static void do_acf_bartlett (gretl_matrix *PM, int i,
int T, double ssr,
const double *z)
{
double b;
int j;
for (j=0; jcols; j++) {
b = z[j] * sqrt((1.0/T) * (1 + 2*ssr));
gretl_matrix_set(PM, i, j, b);
}
}
/**
* corrgram:
* @varno: ID number of variable to process.
* @order: integer order for autocorrelation function.
* @nparam: number of estimated parameters (e.g. for the
* case of ARMA), used to correct the degrees of freedom
* for Q test.
* @dset: dataset struct.
* @opt: if includes OPT_R, variable in question is a model
* residual generated "on the fly"; OPT_U can be used to
* specify a plot option.
* @prn: gretl printing struct.
*
* Computes the autocorrelation function and plots the correlogram for
* the variable specified by @varno.
*
* Returns: 0 on successful completion, error code on error.
*/
int corrgram (int varno, int order, int nparam, DATASET *dset,
gretlopt opt, PRN *prn)
{
const double z[] = {1.65, 1.96, 2.58};
gretl_matrix *PM = NULL;
gretl_matrix *A = NULL;
double ybar, ssr, lbox;
double pval = NADBL;
double pm[3];
double *acf, *pacf;
const char *vname;
int i, k, m, T, dfQ;
int ascii_plot = 0;
int use_gnuplot = 0;
int t1 = dset->t1, t2 = dset->t2;
int err = 0, pacf_err = 0;
gretl_error_clear();
if (order < 0) {
gretl_errmsg_sprintf(_("Invalid lag order %d"), order);
return E_DATA;
}
err = series_adjust_sample(dset->Z[varno], &t1, &t2);
if (err) {
return err;
}
if ((T = t2 - t1 + 1) < 4) {
return E_TOOFEW;
}
if (gretl_isconst(t1, t2, dset->Z[varno])) {
gretl_errmsg_sprintf(_("%s is a constant"), dset->varname[varno]);
return E_DATA;
}
ybar = gretl_mean(t1, t2, dset->Z[varno]);
if (na(ybar)) {
return E_DATA;
}
vname = series_get_graph_name(dset, varno);
/* lag order for acf */
m = order;
if (m == 0) {
m = auto_acf_order(T);
} else if (m > T - dset->pd) {
int mmax = T - 1;
if (m > mmax) {
m = mmax;
}
}
A = gretl_matrix_alloc(m, 2);
if (A == NULL) {
err = E_ALLOC;
goto bailout;
}
if (opt & OPT_B) {
PM = gretl_matrix_alloc(m, 3);
if (PM == NULL) {
err = E_ALLOC;
goto bailout;
}
}
acf = A->val;
pacf = acf + m;
/* calculate acf up to order acf_m */
for (k=1; k<=m; k++) {
acf[k-1] = gretl_acf(k, t1, t2, dset->Z[varno], ybar);
}
/* graphing? */
handle_corrgm_plot_options(CORRGM, opt, &ascii_plot, &use_gnuplot);
if (ascii_plot) {
corrgm_ascii_plot(vname, A, prn);
}
if (opt & OPT_R) {
pprintf(prn, "\n%s\n", _("Residual autocorrelation function"));
} else {
pputc(prn, '\n');
pprintf(prn, _("Autocorrelation function for %s"), vname);
pputc(prn, '\n');
}
pputs(prn, _("***, **, * indicate significance at the 1%, 5%, 10% levels\n"));
if (opt & OPT_B) {
pputs(prn, _("using Bartlett standard errors for ACF"));
} else {
pprintf(prn, _("using standard error 1/T^%.1f"), 0.5);
}
pputs(prn, "\n\n");
/* for confidence bands */
for (i=0; i<3; i++) {
pm[i] = z[i] / sqrt((double) T);
if (pm[i] > 0.5) {
pm[i] = 0.5;
}
}
/* generate (and if not in batch mode) plot partial
autocorrelation function */
err = pacf_err = get_pacf(A);
pputs(prn, _(" LAG ACF PACF Q-stat. [p-value]"));
pputs(prn, "\n\n");
lbox = ssr = 0.0;
dfQ = 1;
for (k=0; k 0 && !na(acf[k-1])) {
ssr += acf[k-1] * acf[k-1];
}
do_acf_bartlett(PM, k, T, ssr, z);
pm0 = gretl_matrix_get(PM, k, 0);
pm1 = gretl_matrix_get(PM, k, 1);
pm2 = gretl_matrix_get(PM, k, 2);
} else {
pm0 = pm[0];
pm1 = pm[1];
pm2 = pm[2];
}
if (fabs(acf[k]) > pm2) {
pputs(prn, " ***");
} else if (fabs(acf[k]) > pm1) {
pputs(prn, " ** ");
} else if (fabs(acf[k]) > pm0) {
pputs(prn, " * ");
} else {
pputs(prn, " ");
}
if (na(pacf[k])) {
bufspace(13, prn);
} else {
/* PACF */
pprintf(prn, "%9.4f", pacf[k]);
if (fabs(pacf[k]) > pm[2]) {
pputs(prn, " ***");
} else if (fabs(pacf[k]) > pm[1]) {
pputs(prn, " ** ");
} else if (fabs(pacf[k]) > pm[0]) {
pputs(prn, " * ");
} else {
pputs(prn, " ");
}
}
/* Ljung-Box Q */
lbox += (T * (T + 2.0)) * acf[k] * acf[k] / (T - (k + 1));
if (k >= nparam) {
/* i.e., if the real df is > 0 */
pprintf(prn, "%12.4f", lbox);
pval = chisq_cdf_comp(dfQ++, lbox);
pprintf(prn, " [%5.3f]", pval);
}
pputc(prn, '\n');
}
pputc(prn, '\n');
if (lbox > 0 && !na(pval)) {
record_test_result(lbox, pval);
}
if (use_gnuplot) {
err = correlogram_plot(vname, acf, (pacf_err)? NULL : pacf,
PM, m, pm[1], opt);
}
bailout:
gretl_matrix_free(A);
gretl_matrix_free(PM);
return err;
}
/**
* acf_matrix:
* @x: series to analyse.
* @order: maximum lag for autocorrelation function.
* @dset: information on the data set, or %NULL.
* @n: length of series (required if @dset is %NULL).
* @err: location to receive error code.
*
* Computes the autocorrelation function for series @x with
* maximum lag @order.
*
* Returns: two-column matrix containing the values of the
* ACF and PACF at the successive lags, or %NULL on error.
*/
gretl_matrix *acf_matrix (const double *x, int order,
const DATASET *dset, int n,
int *err)
{
gretl_matrix *A = NULL;
double xbar;
int m, k, t, T;
int t1, t2;
if (dset != NULL) {
t1 = dset->t1;
t2 = dset->t2;
while (na(x[t1])) t1++;
while (na(x[t2])) t2--;
T = t2 - t1 + 1;
} else {
t1 = 0;
t2 = n - 1;
T = n;
}
if (T < 4) {
*err = E_TOOFEW;
return NULL;
}
for (t=t1; t<=t2; t++) {
if (na(x[t])) {
*err = E_MISSDATA;
return NULL;
}
}
if (gretl_isconst(t1, t2, x)) {
gretl_errmsg_set(_("Argument is a constant"));
*err = E_DATA;
return NULL;
}
xbar = gretl_mean(t1, t2, x);
if (na(xbar)) {
*err = E_DATA;
return NULL;
}
m = order;
if (dset == NULL) {
if (m < 1 || m > T) {
*err = E_DATA;
return NULL;
}
} else {
if (m == 0) {
m = auto_acf_order(T);
} else if (m > T - dset->pd) {
int mmax = T - 1;
if (m > mmax) {
m = mmax;
}
}
}
A = gretl_matrix_alloc(m, 2);
if (A == NULL) {
*err = E_ALLOC;
return NULL;
}
/* calculate ACF up to order m */
for (k=0; kval[k] = gretl_acf(k+1, t1, t2, x, xbar);
if (na(A->val[k])) {
*err = E_DATA;
}
}
/* add PACF */
if (!*err) {
*err = get_pacf(A);
}
if (*err) {
gretl_matrix_free(A);
A = NULL;
}
return A;
}
static int xcorrgm_graph (const char *xname, const char *yname,
double *xcf, int m, double *pm,
int allpos)
{
char crit_string[16];
gchar *title;
FILE *fp;
int k, err = 0;
fp = open_plot_input_file(PLOT_XCORRELOGRAM, 0, &err);
if (err) {
return err;
}
sprintf(crit_string, "%.2f/T^%.1f", 1.96, 0.5);
gretl_push_c_numeric_locale();
fputs("set xzeroaxis\n", fp);
fputs("set yzeroaxis\n", fp);
print_keypos_string(GP_KEY_RIGHT_TOP, fp);
fprintf(fp, "set xlabel '%s'\n", _("lag"));
if (allpos) {
fputs("set yrange [-0.1:1.1]\n", fp);
} else {
fputs("set yrange [-1.1:1.1]\n", fp);
}
title = g_strdup_printf(_("Correlations of %s and lagged %s"),
xname, yname);
fprintf(fp, "set title '%s'\n", title);
g_free(title);
fprintf(fp, "set xrange [%d:%d]\n", -(m + 1), m + 1);
if (allpos) {
fprintf(fp, "plot \\\n"
"'-' using 1:2 notitle w impulses lw 5, \\\n"
"%g title '%s' lt 2\n", pm[1], crit_string);
} else {
fprintf(fp, "plot \\\n"
"'-' using 1:2 notitle w impulses lw 5, \\\n"
"%g title '+- %s' lt 2, \\\n"
"%g notitle lt 2\n", pm[1], crit_string, -pm[1]);
}
for (k=-m; k<=m; k++) {
fprintf(fp, "%d %g\n", k, xcf[k+m]);
}
fputs("e\n", fp);
gretl_pop_c_numeric_locale();
return finalize_plot_input_file(fp);
}
static int xcorrgm_ascii_plot (double *xcf, int xcf_m, PRN *prn)
{
double *xk = malloc((xcf_m * 2 + 1) * sizeof *xk);
int k;
if (xk == NULL) {
return E_ALLOC;
}
for (k=-xcf_m; k<=xcf_m; k++) {
xk[k+xcf_m] = k;
}
pprintf(prn, "\n\n%s\n\n", _("Cross-correlogram"));
graphyx(xcf, xk, 2 * xcf_m + 1, "", _("lag"), prn);
free(xk);
return 0;
}
/* We assume here that all data issues have already been
assessed (lag length, missing values etc.) and we just
get on with the job.
*/
static gretl_matrix *real_xcf_vec (const double *x, const double *y,
int p, int T, int *err)
{
gretl_matrix *xcf;
double xbar, ybar;
int i;
xbar = gretl_mean(0, T-1, x);
if (na(xbar)) {
*err = E_DATA;
return NULL;
}
ybar = gretl_mean(0, T-1, y);
if (na(ybar)) {
*err = E_DATA;
return NULL;
}
xcf = gretl_column_vector_alloc(p * 2 + 1);
if (xcf == NULL) {
*err = E_ALLOC;
return NULL;
}
for (i=-p; i<=p; i++) {
xcf->val[i+p] = gretl_xcf(i, 0, T - 1, x, y, xbar, ybar);
}
return xcf;
}
/* for arrays @x and @y, check that there are no missing values
and that neither series has a constant value
*/
static int xcf_data_check (const double *x, const double *y, int T,
int *badvar)
{
int xconst = 1, yconst = 1;
int t;
if (T < 5) {
return E_TOOFEW;
}
for (t=0; t 0 && x[t] != x[0]) {
xconst = 0;
}
if (t > 0 && y[t] != y[0]) {
yconst = 0;
}
}
if (xconst) {
*badvar = 1;
return E_DATA;
} else if (yconst) {
*badvar = 2;
return E_DATA;
}
return 0;
}
/**
* xcf_vec:
* @x: first series.
* @y: second series.
* @p: maximum lag for cross-correlation function.
* @dset: information on the data set, or NULL.
* @n: length of series (required only if @dset is NULL).
* @err: location to receive error code.
*
* Computes the cross-correlation function for series @x with
* series @y up to maximum lag @order.
*
* Returns: column vector containing the values of the
* cross-correlation function, or NULL on error.
*/
gretl_matrix *xcf_vec (const double *x, const double *y,
int p, const DATASET *dset,
int n, int *err)
{
gretl_matrix *xcf = NULL;
int t1, t2;
int T, badvar = 0;
if (p <= 0) {
*err = E_DATA;
return NULL;
}
if (dset != NULL) {
int yt1, yt2;
t1 = yt1 = dset->t1;
t2 = yt2 = dset->t2;
while (na(x[t1])) t1++;
while (na(y[yt1])) yt1++;
while (na(x[t2])) t2--;
while (na(y[yt2])) yt2--;
t1 = (yt1 > t1)? yt1 : t1;
t2 = (yt2 < t2)? yt2 : t2;
T = t2 - t1 + 1;
} else {
t1 = 0;
t2 = n - 1;
T = n;
}
#if 0
fprintf(stderr, "t1=%d, t2=%d, T=%d\n", t1, t2, T);
fprintf(stderr, "x[t1]=%g, y[t1]=%g\n\n", x[t1], y[t1]);
#endif
if (dset != NULL) {
if (2 * p > T - dset->pd) { /* ?? */
*err = E_DATA;
}
} else if (2 * p > T) {
*err = E_DATA;
}
if (!*err) {
*err = xcf_data_check(x + t1, y + t1, T, &badvar);
if (badvar) {
gretl_errmsg_sprintf(_("Argument %d is a constant"),
badvar);
}
}
if (!*err) {
xcf = real_xcf_vec(x + t1, y + t1, p, T, err);
}
return xcf;
}
/**
* xcorrgram:
* @list: should contain ID numbers of two variables.
* @order: integer order for autocorrelation function.
* @dset: dataset struct.
* @opt: may include OPT_U for plot options.
* @prn: gretl printing struct.
*
* Computes the cross-correlation function and plots the
* cross-correlogram for the specified variables.
*
* Returns: 0 on successful completion, error code on error.
*/
int xcorrgram (const int *list, int order, DATASET *dset,
gretlopt opt, PRN *prn)
{
gretl_matrix *xcf = NULL;
double pm[3];
const char *xname, *yname;
const double *x, *y;
int t1 = dset->t1, t2 = dset->t2;
int ascii_plot = 0;
int use_gnuplot = 0;
int k, p, badvar = 0;
int T, err = 0;
gretl_error_clear();
if (order < 0) {
gretl_errmsg_sprintf(_("Invalid lag order %d"), order);
return E_DATA;
}
if (list[0] != 2) {
return E_DATA;
}
x = dset->Z[list[1]];
y = dset->Z[list[2]];
err = list_adjust_sample(list, &t1, &t2, dset, NULL);
if (!err) {
T = t2 - t1 + 1;
err = xcf_data_check(x + t1, y + t1, T, &badvar);
}
if (err) {
if (badvar) {
gretl_errmsg_sprintf(_("%s is a constant"),
dset->varname[list[badvar]]);
}
return err;
}
xname = dset->varname[list[1]];
yname = dset->varname[list[2]];
p = order;
if (p == 0) {
p = auto_acf_order(T) / 2;
} else if (2 * p > T - dset->pd) {
p = (T - 1) / 2; /* ?? */
}
xcf = real_xcf_vec(x + t1, y + t1, p, T, &err);
if (err) {
return err;
}
/* graphing? */
handle_corrgm_plot_options(XCORRGM, opt, &ascii_plot, &use_gnuplot);
if (ascii_plot) {
xcorrgm_ascii_plot(xcf->val, p, prn);
}
/* for confidence bands */
pm[0] = 1.65 / sqrt((double) T);
pm[1] = 1.96 / sqrt((double) T);
pm[2] = 2.58 / sqrt((double) T);
pputc(prn, '\n');
pprintf(prn, _("Cross-correlation function for %s and %s"),
xname, yname);
pputs(prn, "\n\n");
pputs(prn, _(" LAG XCF"));
pputs(prn, "\n\n");
for (k=-p; k<=p; k++) {
double x = xcf->val[k + p];
pprintf(prn, "%5d%9.4f", k, x);
if (fabs(x) > pm[2]) {
pputs(prn, " ***");
} else if (fabs(x) > pm[1]) {
pputs(prn, " **");
} else if (fabs(x) > pm[0]) {
pputs(prn, " *");
}
pputc(prn, '\n');
}
pputc(prn, '\n');
if (use_gnuplot) {
int allpos = 1;
for (k=-p; k<=p; k++) {
if (xcf->val[k+p] < 0) {
allpos = 0;
break;
}
}
err = xcorrgm_graph(xname, yname, xcf->val, p, pm, allpos);
}
gretl_matrix_free(xcf);
return err;
}
struct fractint_test {
double d; /* estimated degree of integration */
double se; /* standard error of the above */
};
static int fract_int_GPH (int T, int m, const double *xvec,
struct fractint_test *ft)
{
gretl_matrix *y = NULL;
gretl_matrix *X = NULL;
gretl_matrix *b = NULL;
gretl_matrix *V = NULL;
double x, w;
int t, err = 0;
ft->d = ft->se = NADBL;
y = gretl_column_vector_alloc(m);
X = gretl_unit_matrix_new(m, 2);
b = gretl_column_vector_alloc(2);
V = gretl_matrix_alloc(2, 2);
if (y == NULL || X == NULL || b == NULL || V == NULL) {
err = E_ALLOC;
goto bailout;
}
/* Test from Geweke and Porter-Hudak, as set out in
Greene, Econometric Analysis 4e, p. 787 */
for (t=0; tval[t] = log(xvec[t+1]);
w = M_2PI * (t + 1) / (double) T;
x = sin(w / 2);
gretl_matrix_set(X, t, 1, log(4 * x * x));
}
err = gretl_matrix_ols(y, X, b, V, NULL, &x);
if (!err) {
ft->d = -b->val[1];
ft->se = sqrt(gretl_matrix_get(V, 1, 1));
}
bailout:
gretl_matrix_free(y);
gretl_matrix_free(X);
gretl_matrix_free(b);
gretl_matrix_free(V);
return err;
}
static gretl_matrix *gretl_matrix_pergm (const gretl_matrix *x, int m,
int *err)
{
gretl_matrix *p = NULL;
gretl_matrix *f = NULL;
f = gretl_matrix_fft(x, 0, err);
if (*err) {
return NULL;
}
p = gretl_column_vector_alloc(m);
if (p == NULL) {
*err = E_ALLOC;
} else {
int T = gretl_vector_get_length(x);
double re, im, scale = M_2PI * T;
int i;
for (i=0; ival[i] = (re*re + im*im) / scale;
}
}
gretl_matrix_free(f);
return p;
}
struct LWE_helper {
gretl_matrix *lambda;
gretl_matrix *lpow;
gretl_matrix *I1;
gretl_matrix *I2;
double lcm;
};
static void LWE_free (struct LWE_helper *L)
{
gretl_matrix_free(L->lambda);
gretl_matrix_free(L->lpow);
gretl_matrix_free(L->I1);
gretl_matrix_free(L->I2);
}
static double LWE_obj_func (struct LWE_helper *L, double d)
{
double dd = 2.0 * d;
int i;
gretl_matrix_copy_values(L->lpow, L->lambda);
gretl_matrix_raise(L->lpow, dd);
for (i=0; iI1->rows; i++) {
L->I2->val[i] = L->I1->val[i] * L->lpow->val[i];
}
return -(log(gretl_vector_mean(L->I2)) - dd * L->lcm);
}
static gretl_matrix *LWE_lambda (const gretl_matrix *I1, int n)
{
gretl_matrix *lambda;
int i, m = gretl_vector_get_length(I1);
lambda = gretl_column_vector_alloc(m);
if (lambda != NULL) {
for (i=0; iI2 = L->lpow = NULL;
L->I1 = gretl_matrix_pergm(X, m, &err);
if (err) {
return err;
}
L->lambda = LWE_lambda(L->I1, X->rows);
if (L->lambda == NULL) {
gretl_matrix_free(L->I1);
return E_ALLOC;
}
L->lpow = gretl_matrix_copy(L->lambda);
L->I2 = gretl_matrix_copy(L->I1);
if (L->lpow == NULL || L->I2 == NULL) {
err = E_ALLOC;
LWE_free(L);
} else {
L->lcm = 0.0;
for (i=0; ilcm += log(L->lambda->val[i]);
}
L->lcm /= m;
}
return err;
}
#define LWE_MAXITER 100
static double LWE_calc (const gretl_matrix *X, int m, int *err)
{
struct LWE_helper L = {0};
double d = 0, dd = 1.0;
double eps = 1.0e-05;
double f, incr, incl, deriv, h;
int iter = 0;
*err = LWE_init(&L, X, m);
if (*err) {
return NADBL;
}
while (fabs(dd) > 1.0e-06 && iter++ < LWE_MAXITER) {
f = LWE_obj_func(&L, d);
incr = LWE_obj_func(&L, d + eps) / eps;
incl = LWE_obj_func(&L, d - eps) / eps;
deriv = (incr - incl) / 2.0;
h = (0.5 * (incr + incl) - f / eps) / eps;
dd = (h < 0)? (-deriv / h) : deriv;
if (fabs(dd) > 1) {
dd = (dd > 0)? 1 : -1;
}
d += 0.5 * dd;
}
if (*err) {
d = NADBL;
} else if (iter == LWE_MAXITER) {
fprintf(stderr, "LWE: max iterations reached\n");
d = NADBL;
}
LWE_free(&L);
return d;
}
int auto_spectrum_order (int T, gretlopt opt)
{
int m;
if (opt & OPT_O) {
/* Bartlett */
m = (int) 2.0 * sqrt((double) T);
} else {
/* fractional integration test */
double m1 = floor((double) T / 2.0);
double m2 = floor(pow((double) T, 0.6));
m = (m1 < m2)? m1 : m2;
m--;
}
return m;
}
static int fract_int_LWE (const double *x, int m, int t1, int t2,
struct fractint_test *ft)
{
gretl_matrix *X;
int err = 0;
X = gretl_vector_from_series(x, t1, t2);
if (X == NULL) {
err = E_ALLOC;
} else {
ft->d = LWE_calc(X, m, &err);
if (!err) {
ft->se = 1.0 / (2 * sqrt((double) m));
}
gretl_matrix_free(X);
}
return err;
}
static int pergm_do_plot (gretlopt opt)
{
if (opt & OPT_U) {
/* --plot=whatever */
const char *s = get_optval_string(PERGM, OPT_U);
if (s != NULL) {
if (!strcmp(s, "none")) {
return 0;
} else {
/* implies use of gnuplot */
return 1;
}
}
}
/* the default */
return !gretl_in_batch_mode();
}
static void pergm_print (const char *vname, const double *d,
int T, int L, gretlopt opt, PRN *prn)
{
char xstr[32];
double dt, yt;
int t;
if (vname == NULL) {
pprintf(prn, "\n%s\n", _("Residual periodogram"));
} else {
pprintf(prn, _("\nPeriodogram for %s\n"), vname);
}
pprintf(prn, _("Number of observations = %d\n"), T);
if (opt & OPT_O) {
pprintf(prn, _("Using Bartlett lag window, length %d\n\n"), L);
} else {
pputc(prn, '\n');
}
if (opt & OPT_L) {
pputs(prn, _(" omega scaled frequency periods log spectral density\n\n"));
} else {
pputs(prn, _(" omega scaled frequency periods spectral density\n\n"));
}
for (t=1; t<=T/2; t++) {
dt = (opt & OPT_L)? log(d[t]) : d[t];
if (opt & OPT_R) {
yt = 2 * M_PI * (double) t / T;
pprintf(prn, " %.5f%8d%16.2f", yt, t, (double) T / t);
} else if (opt & OPT_D) {
yt = 360 * t / (double) T;
pprintf(prn, " %7.3f%8d%16.2f", yt, t, (double) T / t);
} else {
yt = M_2PI * t / (double) T;
pprintf(prn, " %.5f%8d%16.2f", yt, t, (double) T / t);
}
dt = (opt & OPT_L)? log(d[t]) : d[t];
sprintf(xstr, "%#.5g", dt);
gretl_fix_exponent(xstr);
pprintf(prn, "%16s\n", xstr);
}
pputc(prn, '\n');
if (pergm_do_plot(opt)) {
periodogram_plot(vname, T, L, d, opt);
}
}
static int finalize_fractint (const double *x,
const double *dens,
int t1, int t2, int width,
const char *vname,
gretlopt opt,
PRN *prn)
{
struct fractint_test ft;
gretl_matrix *result = NULL;
int do_GPH = opt & (OPT_G | OPT_A);
int do_LWE = !(opt & OPT_G);
int do_print = !(opt & OPT_Q);
int T = t2 - t1 + 1;
int m, err = 0;
/* order for test */
if (width <= 0) {
m = auto_spectrum_order(T, OPT_NONE);
} else {
m = (width > T / 2)? T / 2 : width;
}
if (do_print && vname != NULL) {
pprintf(prn, "\n%s, T = %d\n\n", vname, T);
}
if (do_GPH && do_LWE) {
result = gretl_matrix_alloc(2, 2);
} else {
result = gretl_matrix_alloc(1, 2);
}
if (do_LWE) {
/* do Local Whittle if wanted */
err = fract_int_LWE(x, m, t1, t2, &ft);
if (!err) {
double z = ft.d / ft.se;
double pv = normal_pvalue_2(z);
record_test_result(z, pv);
gretl_matrix_set(result, 0, 0, ft.d);
gretl_matrix_set(result, 0, 1, ft.se);
if (do_print) {
pprintf(prn, "%s (m = %d)\n"
" %s = %g (%g)\n"
" %s: z = %g, %s %.4f\n\n",
_("Local Whittle Estimator"), m,
_("Estimated degree of integration"), ft.d, ft.se,
_("test statistic"), z,
_("with p-value"), pv);
}
}
}
if (!err && do_GPH) {
/* do GPH if wanted (options --all or --gph) */
int row = do_LWE ? 1 : 0;
err = fract_int_GPH(T, m, dens, &ft);
gretl_matrix_set(result, row, 0, ft.d);
gretl_matrix_set(result, row, 1, ft.se);
if (!err && (!do_LWE || do_print)) {
double tval = ft.d / ft.se;
int df = m - 2;
double pv = student_pvalue_2(df, tval);
if (!do_LWE) {
record_test_result(tval, pv);
}
if (do_print) {
pprintf(prn, "%s (m = %d)\n"
" %s = %g (%g)\n"
" %s: t(%d) = %g, %s %.4f\n\n",
_("GPH test"), m,
_("Estimated degree of integration"), ft.d, ft.se,
_("test statistic"), df, tval,
_("with p-value"), pv);
}
}
}
if (err) {
gretl_matrix_free(result);
} else {
char **colnames = strings_array_new(2);
colnames[0] = gretl_strdup("d");
colnames[1] = gretl_strdup("se");
gretl_matrix_set_colnames(result, colnames);
set_last_result_data(result, GRETL_TYPE_MATRIX);
}
return err;
}
static double *pergm_compute_density (const double *x, int t1, int t2,
int L, int bartlett, int *err)
{
double *sdy, *acov, *dens;
double xx, yy, vx, sx, w;
int k, t, T = t2 - t1 + 1;
sdy = malloc(T * sizeof *sdy);
acov = malloc((L + 1) * sizeof *acov);
dens = malloc((1 + T/2) * sizeof *dens);
if (sdy == NULL || acov == NULL || dens == NULL) {
*err = E_ALLOC;
free(sdy);
free(acov);
free(dens);
return NULL;
}
xx = gretl_mean(t1, t2, x);
vx = real_gretl_variance(t1, t2, x, 1);
sx = sqrt(vx);
for (t=t1; t<=t2; t++) {
sdy[t-t1] = (x[t] - xx) / sx;
}
/* autocovariances */
for (k=1; k<=L; k++) {
acov[k] = 0.0;
for (t=k; t T / 2)? T / 2 : width;
}
} else {
/* use full sample */
L = T - 1;
}
dens = pergm_compute_density(x, t1, t2, L, bartlett, &err);
if (err) {
return err;
}
}
if (usage == PERGM_FUNC) {
/* make matrix for pergm() function */
int T2 = T / 2;
gretl_matrix *pm;
*pmat = pm = gretl_matrix_alloc(T2, 2);
if (pm == NULL) {
err = E_ALLOC;
} else {
for (t=1; t<=T2; t++) {
gretl_matrix_set(pm, t-1, 0, M_2PI * t / (double) T);
gretl_matrix_set(pm, t-1, 1, dens[t]);
}
}
} else if (usage == FRACTINT_CMD) {
/* supporting "fractint" command */
err = finalize_fractint(x, dens, t1, t2, width,
vname, opt, prn);
} else {
/* supporting "pergm" command */
pergm_print(vname, dens, T, L, opt, prn);
}
free(dens);
return err;
}
/**
* periodogram:
* @varno: ID number of variable to process.
* @width: width of window.
* @dset: dataset struct.
* @opt: if includes OPT_O, use Bartlett lag window for periodogram;
* OPT_L, use log scale.
* @prn: gretl printing struct.
*
* Computes and displays the periodogram for the series specified
* by @varno.
*
* Returns: 0 on successful completion, error code on error.
*/
int periodogram (int varno, int width, const DATASET *dset,
gretlopt opt, PRN *prn)
{
return pergm_or_fractint(PERGM_CMD, dset->Z[varno],
dset->t1, dset->t2,
width, dset->varname[varno],
opt, prn, NULL);
}
/**
* residual_periodogram:
* @x: series to process.
* @width: width of window.
* @dset: dataset struct.
* @opt: if includes OPT_O, use Bartlett lag window for periodogram;
* OPT_L, use log scale.
* @prn: gretl printing struct.
*
* Computes and displays the periodogram for @x, which is presumed to
* be a model residual series.
*
* Returns: 0 on successful completion, error code on error.
*/
int residual_periodogram (const double *x, int width, const DATASET *dset,
gretlopt opt, PRN *prn)
{
return pergm_or_fractint(PERGM_CMD, x,
dset->t1, dset->t2,
width, NULL,
opt, prn, NULL);
}
/**
* periodogram_matrix:
* @x: the series to process.
* @t1: starting observation in @x.
* @t2: ending observation in @x.
* @width: width of Bartlett window, or -1 for plain sample
* periodogram.
* @err: location to receive error code.
*
* Implements the userspace gretl pergm function, which can
* be used on either a series from the dataset or a gretl
* vector.
*
* Returns: allocated matrix on success, NULL on failure.
*/
gretl_matrix *periodogram_matrix (const double *x, int t1, int t2,
int width, int *err)
{
gretlopt opt = (width < 0)? OPT_NONE : OPT_O;
gretl_matrix *m = NULL;
*err = pergm_or_fractint(PERGM_FUNC, x, t1, t2, width,
NULL, opt, NULL, &m);
return m;
}
/**
* fractint:
* @varno: ID number of variable to process.
* @order: lag order / window size.
* @dset: dataset struct.
* @opt: option flags.
* @prn: gretl printing struct.
*
* Computes and prints a test for fractional integration of the
* series specified by @varno. By default the test uses the
* Local Whittle Estimator but if @opt includes OPT_G then
* the Geweke and Porter-Hudak test is done instead, or if
* OPT_A then both tests are shown. If OPT_Q is given the
* test results are not printed, just recorded (with
* preference given to the LWE in case of OPT_A).
*
* Returns: 0 on successful completion, error code on error.
*/
int fractint (int varno, int order, const DATASET *dset,
gretlopt opt, PRN *prn)
{
int err = incompatible_options(opt, (OPT_G | OPT_A));
if (!err) {
err = pergm_or_fractint(FRACTINT_CMD, dset->Z[varno],
dset->t1, dset->t2,
order,
dset->varname[varno],
opt, prn, NULL);
}
return err;
}
static void printf15 (double x, int d, PRN *prn)
{
pputc(prn, ' ');
if (na(x)) {
pprintf(prn, "%*s", UTF_WIDTH(_("NA"), 14), _("NA"));
} else if (x > 999 && x < 100000) {
int p = 1 + floor(log10(x));
d -= p;
if (d < 0) d = 0;
pprintf(prn, "%14.*f", d, x);
} else {
pprintf(prn, "%#14.*g", d, x);
}
}
static void printf11 (double x, int d, PRN *prn)
{
pputc(prn, ' ');
if (na(x)) {
pprintf(prn, "%*s", UTF_WIDTH(_("NA"), 10), _("NA"));
} else if (x > 999 && x < 100000) {
int p = 1 + floor(log10(x));
d -= p;
if (d < 0) d = 0;
pprintf(prn, "%10.*f", d, x);
} else {
pprintf(prn, "%#10.*g", d, x);
}
}
static void output_line (char *str, PRN *prn, int dblspc)
{
pputs(prn, str);
if (dblspc) {
pputs(prn, "\n\n");
} else {
pputc(prn, '\n');
}
}
static void prhdr (const char *str, const DATASET *dset,
int missing, PRN *prn)
{
char date1[OBSLEN], date2[OBSLEN];
gchar *tmp;
ntolabel(date1, dset->t1, dset);
ntolabel(date2, dset->t2, dset);
pputc(prn, '\n');
tmp = g_strdup_printf(_("%s, using the observations %s - %s"),
str, date1, date2);
output_line(tmp, prn, 0);
g_free(tmp);
if (missing) {
output_line(_("(missing values were skipped)"), prn, 1);
}
}
static void summary_print_val (double x, int digits, int places,
PRN *prn)
{
pputc(prn, ' ');
if (na(x)) {
pprintf(prn, "%*s", UTF_WIDTH(_("NA"), 14), _("NA"));
} else if (digits < 0) {
pprintf(prn, "%14d", (int) x);
} else if (digits > 0 || places > 0) {
int prec = (digits > 0)? digits : places;
int len = prec + 9;
char *s = (digits > 0)? "#" : "";
char t = (digits > 0)? 'g' : 'f';
char fmt[32];
sprintf(fmt, "%%%s%d.%d%c", s, len, prec, t);
pprintf(prn, fmt, x);
} else {
/* the default */
pprintf(prn, "%#14.5g", x);
}
}
#define NSUMM 12
void print_summary_single (const Summary *s,
int digits, int places,
const DATASET *dset,
PRN *prn)
{
const char *labels[NSUMM] = {
N_("Mean"),
N_("Median"),
N_("Minimum"),
N_("Maximum"),
N_("Standard deviation"),
N_("C.V."),
N_("Skewness"),
N_("Ex. kurtosis"),
/* xgettext:no-c-format */
N_("5% percentile"),
/* xgettext:no-c-format */
N_("95% percentile"),
N_("Interquartile range"),
N_("Missing obs.")
};
const char *wstr = N_("Within s.d.");
const char *bstr = N_("Between s.d.");
double vals[NSUMM];
int simple_skip[NSUMM] = {0,1,0,0,0,1,1,1,1,1,1,0};
int skip0595 = 0;
int offset = 2;
int slen = 0, i = 0;
if (s->opt & OPT_B) {
offset = 4;
} else {
const char *vname = dset->varname[s->list[1]];
char obs1[OBSLEN], obs2[OBSLEN];
gchar *tmp = NULL;
ntolabel(obs1, dset->t1, dset);
ntolabel(obs2, dset->t2, dset);
prhdr(_("Summary statistics"), dset, 0, prn);
if (isdigit(*vname)) {
const char *mname = dataset_get_matrix_name(dset);
if (mname != NULL) {
tmp = g_strdup_printf(_("for column %d of %s (%d valid observations)"),
atoi(vname), mname, s->n);
} else {
tmp = g_strdup_printf(_("for column %d (%d valid observations)"),
atoi(vname), s->n);
}
} else {
tmp = g_strdup_printf(_("for the variable '%s' (%d valid observations)"),
dset->varname[s->list[1]], s->n);
}
output_line(tmp, prn, 1);
g_free(tmp);
}
vals[0] = s->mean[0];
vals[1] = s->median[0];
vals[2] = s->low[0];
vals[3] = s->high[0];
vals[4] = s->sd[0];
vals[5] = s->cv[0];
vals[6] = s->skew[0];
vals[7] = s->xkurt[0];
vals[8] = s->perc05[0];
vals[9] = s->perc95[0];
vals[10] = s->iqr[0];
vals[11] = s->misscount[0];
if (na(vals[8]) && na(vals[9])) {
skip0595 = 1;
}
for (i=0; iopt & OPT_S) && simple_skip[i]) {
continue;
} else if (skip0595 && (i == 8 || i == 9)) {
continue;
}
if (strlen(_(labels[i])) > slen) {
slen = g_utf8_strlen(_(labels[i]), -1);
}
}
slen++;
for (i=0; iopt & OPT_S) && simple_skip[i]) {
continue;
} else if (skip0595 && (i == 8 || i == 9)) {
continue;
}
bufspace(offset, prn);
if (i == 8 || i == 9) {
/* the strings contain '%' */
gchar *pcstr = g_strdup(_(labels[i]));
int n = slen - g_utf8_strlen(pcstr, -1);
pputs(prn, pcstr);
if (n > 0) {
bufspace(n, prn);
}
g_free(pcstr);
} else {
pprintf(prn, "%-*s", UTF_WIDTH(_(labels[i]), slen), _(labels[i]));
}
if (i == NSUMM - 1) {
summary_print_val(vals[i], -1, places, prn);
} else {
summary_print_val(vals[i], digits, places, prn);
}
pputc(prn, '\n');
}
if (!na(s->sw) && !na(s->sb)) {
pputc(prn, '\n');
bufspace(offset, prn);
pprintf(prn, "%-*s", UTF_WIDTH(_(wstr), slen), _(wstr));
summary_print_val(s->sw, digits, places, prn);
pputc(prn, '\n');
bufspace(offset, prn);
pprintf(prn, "%-*s", UTF_WIDTH(_(bstr), slen), _(bstr));
summary_print_val(s->sb, digits, places, prn);
}
pputc(prn, '\n');
}
static void summary_print_varname (const char *src,
int len, PRN *prn)
{
char vname[NAMETRUNC];
maybe_trim_varname(vname, src);
pprintf(prn, "%-*s", len, vname);
}
/**
* print_summary:
* @summ: pointer to gretl summary statistics struct.
* @dset: information on the data set.
* @prn: gretl printing struct.
*
* Prints the summary statistics for a given variable.
*/
void print_summary (const Summary *summ,
const DATASET *dset,
PRN *prn)
{
int dmax, d = get_gretl_digits();
int len, maxlen = 0;
int i, vi;
if (summ->list == NULL || summ->list[0] == 0) {
return;
}
if (summ->weight_var > 0) {
pputc(prn, '\n');
pprintf(prn, _("Weighting variable: %s\n"),
dset->varname[summ->weight_var]);
}
if (summ->list[0] == 1) {
print_summary_single(summ, 0, 0, dset, prn);
return;
}
/* number of significant figures to use */
dmax = (summ->opt & OPT_S)? 4 : 5;
d = d > dmax ? dmax : d;
maxlen = max_namelen_in_list(summ->list, dset);
len = maxlen <= 8 ? 10 : (maxlen + 1);
#if 0
if (!(summ->opt & OPT_B)) {
int missing = summary_has_missing_values(summ);
prhdr(_("Summary statistics"), dset, missing, prn);
}
#endif
pputc(prn, '\n');
if (summ->opt & OPT_S) {
/* the "simple" option */
const char *h[] = {
N_("Mean"),
N_("Median"),
/* TRANSLATORS: 'S.D.' means Standard Deviation */
N_("S.D."),
N_("Min"),
N_("Max"),
};
pprintf(prn, "%*s%*s%*s%*s%*s%*s\n", len, " ",
UTF_WIDTH(_(h[0]), 11), _(h[0]),
UTF_WIDTH(_(h[1]), 11), _(h[1]),
UTF_WIDTH(_(h[2]), 11), _(h[2]),
UTF_WIDTH(_(h[3]), 11), _(h[3]),
UTF_WIDTH(_(h[4]), 11), _(h[4]));
for (i=0; ilist[0]; i++) {
vi = summ->list[i+1];
summary_print_varname(dset->varname[vi], len, prn);
printf11(summ->mean[i], d, prn);
printf11(summ->median[i], d, prn);
printf11(summ->sd[i], d, prn);
printf11(summ->low[i], d, prn);
printf11(summ->high[i], d, prn);
pputc(prn, '\n');
}
pputc(prn, '\n');
} else {
/* print all available stats */
const char *ha[] = {
N_("Mean"),
N_("Median"),
N_("Minimum"),
N_("Maximum"),
};
const char *hb[] = {
N_("Std. Dev."),
N_("C.V."),
N_("Skewness"),
N_("Ex. kurtosis")
};
const char *hc[] = {
/* xgettext:no-c-format */
N_("5% perc."),
/* xgettext:no-c-format */
N_("95% perc."),
N_("IQ range"),
N_("Missing obs.")
};
/* cases where 0.05 and 0.95 quantiles are OK */
int npct = 0;
pprintf(prn, "%*s%*s%*s%*s%*s\n", len, " ",
UTF_WIDTH(_(ha[0]), 15), _(ha[0]),
UTF_WIDTH(_(ha[1]), 15), _(ha[1]),
UTF_WIDTH(_(ha[2]), 15), _(ha[2]),
UTF_WIDTH(_(ha[3]), 15), _(ha[3]));
for (i=0; ilist[0]; i++) {
vi = summ->list[i+1];
summary_print_varname(dset->varname[vi], len, prn);
printf15(summ->mean[i], d, prn);
printf15(summ->median[i], d, prn);
printf15(summ->low[i], d, prn);
printf15(summ->high[i], d, prn);
pputc(prn, '\n');
/* while we're at it, register cases where we can
show the 0.05 and 0.95 quantiles
*/
if (!na(summ->perc05[i]) && !na(summ->perc95[i])) {
npct++;
}
}
pputc(prn, '\n');
pprintf(prn, "%*s%*s%*s%*s%*s\n", len, " ",
UTF_WIDTH(_(hb[0]), 15), _(hb[0]),
UTF_WIDTH(_(hb[1]), 15), _(hb[1]),
UTF_WIDTH(_(hb[2]), 15), _(hb[2]),
UTF_WIDTH(_(hb[3]), 15), _(hb[3]));
for (i=0; ilist[0]; i++) {
double cv;
vi = summ->list[i+1];
summary_print_varname(dset->varname[vi], len, prn);
if (floateq(summ->mean[i], 0.0)) {
cv = NADBL;
} else if (floateq(summ->sd[i], 0.0)) {
cv = 0.0;
} else {
cv = fabs(summ->sd[i] / summ->mean[i]);
}
printf15(summ->sd[i], d, prn);
printf15(cv, d, prn);
printf15(summ->skew[i], d, prn);
printf15(summ->xkurt[i], d, prn);
pputc(prn, '\n');
}
pputc(prn, '\n');
if (npct > 0) {
/* note: use pputs for strings containing literal '%' */
gchar *hc0 = g_strdup(_(hc[0]));
gchar *hc1 = g_strdup(_(hc[1]));
int n;
pprintf(prn, "%*s", len, " ");
n = 15 - g_utf8_strlen(hc0, -1);
if (n > 0) bufspace(n, prn);
pputs(prn, hc0);
n = 15 - g_utf8_strlen(hc1, -1);
if (n > 0) bufspace(n, prn);
pputs(prn, hc1);
g_free(hc0); g_free(hc1);
pprintf(prn, "%*s%*s\n",
UTF_WIDTH(_(ha[2]), 15), _(hc[2]),
UTF_WIDTH(_(ha[3]), 15), _(hc[3]));
} else {
/* not showing any 0.05, 0.95 quantiles */
pprintf(prn, "%*s%*s%*s\n", len, " ",
UTF_WIDTH(_(ha[2]), 15), _(hc[2]),
UTF_WIDTH(_(ha[3]), 15), _(hc[3]));
}
for (i=0; ilist[0]; i++) {
vi = summ->list[i+1];
summary_print_varname(dset->varname[vi], len, prn);
if (!na(summ->perc05[i]) && !na(summ->perc95[i])) {
printf15(summ->perc05[i], d, prn);
printf15(summ->perc95[i], d, prn);
} else if (npct > 0) {
pprintf(prn, "%*s", 15, "NA");
pprintf(prn, "%*s", 15, "NA");
}
printf15(summ->iqr[i], d, prn);
pprintf(prn, "%15d", (int) summ->misscount[i]);
pputc(prn, '\n');
}
pputc(prn, '\n');
}
}
/**
* free_summary:
* @summ: pointer to gretl summary statistics struct
*
* Frees all resources associated with @summ, and
* the pointer itself.
*/
void free_summary (Summary *summ)
{
free(summ->list);
free(summ->misscount);
free(summ->stats);
free(summ);
}
/***
* summary_new:
* @list: list of variables we want summary statistics for
* @wv: weighting variable (0=no weights)
* @opt: options
*
* Allocates a new "Summary" struct and initializes a few
* things inside it
*/
static Summary *summary_new (const int *list, int wv,
gretlopt opt, int *err)
{
Summary *s;
int nv;
if (list == NULL) {
fprintf(stderr, "summary_new: list is NULL\n");
*err = E_DATA;
return NULL;
}
nv = list[0];
s = malloc(sizeof *s);
if (s == NULL) {
*err = E_ALLOC;
return NULL;
}
s->list = gretl_list_copy(list);
if (s->list == NULL) {
*err = E_ALLOC;
free(s);
return NULL;
}
s->weight_var = wv;
s->opt = opt;
s->n = 0;
s->misscount = malloc(nv * sizeof *s->misscount);
s->stats = malloc(11 * nv * sizeof *s->stats);
if (s->stats == NULL) {
*err = E_ALLOC;
free_summary(s);
return NULL;
}
s->mean = s->stats;
s->median = s->mean + nv;
s->sd = s->median + nv;
s->skew = s->sd + nv;
s->xkurt = s->skew + nv;
s->low = s->xkurt + nv;
s->high = s->low + nv;
s->cv = s->high + nv;
s->perc05 = s->cv + nv;
s->perc95 = s->perc05 + nv;
s->iqr = s->perc95 + nv;
s->sb = s->sw = NADBL;
return s;
}
int summary_has_missing_values (const Summary *summ)
{
if (summ->misscount != NULL) {
int i, nv = summ->list[0];
for (i=0; imisscount[i] > 0) {
return 1;
}
}
}
return 0;
}
static int compare_wgtord_rows (const void *a, const void *b)
{
const double **da = (const double **) a;
const double **db = (const double **) b;
return (da[0][0] > db[0][0]) - (da[0][0] < db[0][0]);
}
static int weighted_order_stats (const double *y, const double *w,
double *ostats, int n, int k)
{
/* on input "ostats" contains the quantiles to compute;
on output it holds the results
*/
double p, q, wsum = 0.0;
double **X;
int t, i, err = 0;
X = doubles_array_new(n, 2);
if (X == NULL) {
return E_ALLOC;
}
i = 0;
for (t=0; t= n - 1) {
ostats[i] = NADBL;
continue;
}
if (X[t-1][0] == X[t][0]) {
ostats[i] = X[t][0];
} else {
double a = (q - p) / X[t][1];
ostats[i] = a * X[t+1][0] + (1-a) * X[t][0];
}
}
doubles_array_free(X, n);
return err;
}
/**
* get_summary_weighted:
* @list: list of variables to process.
* @dset: dataset struct.
* @wgt: series to use as weights.
* @opt: may include OPT_S for "simple" version.
* @prn: gretl printing struct.
* @err: location to receive error code.
*
* Calculates descriptive summary statistics for the specified
* variables, weighting the observations by @rv. The series @rv must
* be of full length (dset->n).
*
* Returns: #Summary object containing the summary statistics, or NULL
* on failure.
*/
Summary *get_summary_weighted (const int *list, const DATASET *dset,
int wtvar, gretlopt opt,
PRN *prn, int *err)
{
const double *wts;
double *x, ostats[5];
int t1 = dset->t1;
int t2 = dset->t2;
Summary *s;
int i, t;
s = summary_new(list, wtvar, opt, err);
if (s == NULL) {
return NULL;
}
x = malloc(dset->n * sizeof *x);
if (x == NULL) {
*err = E_ALLOC;
free_summary(s);
return NULL;
}
wts = dset->Z[wtvar];
for (i=0; ilist[i+1];
int ni = 0;
int ntot = 0;
double wt;
/* prepare the series for weighting: substitute NAs for values
at which the weights are invalid or zero
*/
for (t=t1; t<=t2; t++) {
wt = wts[t];
if (!na(wt) && wt != 0.0) {
ntot++;
x[t] = dset->Z[vi][t];
if (!na(x[t])) {
ni++;
}
} else {
x[t] = NADBL;
}
}
s->misscount[i] = ntot - ni;
if (ni > s->n) {
s->n = ni;
}
if (ni == 0) {
pprintf(prn, _("Dropping %s: sample range contains no valid "
"observations\n"), dset->varname[vi]);
gretl_list_delete_at_pos(s->list, i + 1);
if (s->list[0] == 0) {
return s;
} else {
i--;
continue;
}
}
if (opt & OPT_S) {
s->skew[i] = NADBL;
s->xkurt[i] = NADBL;
s->cv[i] = NADBL;
s->median[i] = NADBL;
} else {
pskew = &s->skew[i];
pkurt = &s->xkurt[i];
}
gretl_minmax(t1, t2, x, &s->low[i], &s->high[i]);
gretl_moments(t1, t2, x, wts, &s->mean[i], &s->sd[i], pskew, pkurt, 1);
if (!(opt & OPT_S)) {
int err;
if (floateq(s->mean[i], 0.0)) {
s->cv[i] = NADBL;
} else if (floateq(s->sd[i], 0.0)) {
s->cv[i] = 0.0;
} else {
s->cv[i] = fabs(s->sd[i] / s->mean[i]);
}
ostats[0] = 0.05;
ostats[1] = 0.25;
ostats[2] = 0.5;
ostats[3] = 0.75;
ostats[4] = 0.95;
err = weighted_order_stats(x, wts, ostats, ni, 5);
if (!err) {
s->median[i] = ostats[2];
s->perc05[i] = ostats[0];
s->perc95[i] = ostats[4];
if (!na(ostats[1]) && !na(ostats[3])) {
s->iqr[i] = ostats[3] - ostats[1];
}
}
}
#if 0
if (dataset_is_panel(dset) && list[0] == 1) {
panel_variance_info(x, dset, s->mean[0], &s->sw, &s->sb);
}
#endif
}
free(x);
return s;
}
/* Get the additional statistics wanted if the --simple
flag was not given to the summary command.
*/
static int get_extra_stats (Summary *s, int i,
int t1, int t2,
const double *x)
{
int err = 0;
if (floateq(s->mean[i], 0.0)) {
s->cv[i] = NADBL;
} else if (floateq(s->sd[i], 0.0)) {
s->cv[i] = 0.0;
} else {
s->cv[i] = fabs(s->sd[i] / s->mean[i]);
}
s->perc05[i] = gretl_quantile(t1, t2, x, 0.05, OPT_Q, &err);
s->perc95[i] = gretl_quantile(t1, t2, x, 0.95, OPT_Q, &err);
s->iqr[i] = gretl_quantile(t1, t2, x, 0.75, OPT_NONE, &err);
if (!na(s->iqr[i])) {
s->iqr[i] -= gretl_quantile(t1, t2, x, 0.25, OPT_NONE, &err);
}
return err;
}
/**
* get_summary_restricted:
* @list: list of variables to process.
* @dset: dataset struct.
* @rv: series to use as restriction dummy.
* @opt: may include OPT_S for "simple" version.
* @prn: gretl printing struct.
* @err: location to receive error code.
*
* Calculates descriptive summary statistics for the specified variables,
* with the observations restricted to those for which @rv has a non-zero
* (and non-missing) value. The series @rv must be of full length (dset->n).
*
* Returns: #Summary object containing the summary statistics, or NULL
* on failure.
*/
Summary *get_summary_restricted (const int *list, const DATASET *dset,
const double *rv, gretlopt opt,
PRN *prn, int *err)
{
int t1 = dset->t1;
int t2 = dset->t2;
Summary *s;
double *x;
int i, t;
s = summary_new(list, 0, opt, err);
if (s == NULL) {
return NULL;
}
x = malloc(dset->n * sizeof *x);
if (x == NULL) {
*err = E_ALLOC;
free_summary(s);
return NULL;
}
for (i=0; ilist[0]; i++) {
double *pskew = NULL, *pkurt = NULL;
int vi = s->list[i+1];
int strvals;
int ni = 0;
int ntot = 0;
strvals = is_string_valued(dset, vi);
if (!strvals) {
/* create the restricted series: substitute NAs
for values at which the restriction dummy is
invalid or zero
*/
for (t=t1; t<=t2; t++) {
if (!na(rv[t]) && rv[t] != 0.0) {
ntot++;
x[t] = dset->Z[vi][t];
if (!na(x[t])) {
ni++;
}
} else {
x[t] = NADBL;
}
}
s->misscount[i] = ntot - ni;
if (ni > s->n) {
s->n = ni;
}
}
if (ni == 0) {
if (strvals) {
pprintf(prn, _("Dropping %s: string-valued series\n"),
dset->varname[vi]);
} else {
pprintf(prn, _("Dropping %s: sample range contains no valid "
"observations\n"), dset->varname[vi]);
}
gretl_list_delete_at_pos(s->list, i + 1);
if (s->list[0] == 0) {
return s;
} else {
i--;
continue;
}
}
if (opt & OPT_S) {
s->skew[i] = NADBL;
s->xkurt[i] = NADBL;
s->cv[i] = NADBL;
} else {
pskew = &s->skew[i];
pkurt = &s->xkurt[i];
}
gretl_minmax(t1, t2, x, &s->low[i], &s->high[i]);
gretl_moments(t1, t2, x, NULL, &s->mean[i], &s->sd[i], pskew, pkurt, 1);
s->median[i] = gretl_median(t1, t2, x);
if (!(opt & OPT_S)) {
*err = get_extra_stats(s, i, t1, t2, x);
}
if (dataset_is_panel(dset) && list[0] == 1) {
panel_variance_info(x, dset, s->mean[0], &s->sw, &s->sb);
}
}
free(x);
return s;
}
/**
* get_summary:
* @list: list of variables to process.
* @dset: dataset struct.
* @opt: may include OPT_S for "simple" version.
* @prn: gretl printing struct.
* @err: location to receive error code.
*
* Calculates descriptive summary statistics for the specified variables.
*
* Returns: #Summary object containing the summary statistics, or NULL
* on failure.
*/
Summary *get_summary (const int *list, const DATASET *dset,
gretlopt opt, PRN *prn, int *err)
{
int t1 = dset->t1;
int t2 = dset->t2;
Summary *s;
int i, nmax;
s = summary_new(list, 0, opt, err);
if (s == NULL) {
return NULL;
}
nmax = sample_size(dset);
for (i=0; ilist[0]; i++) {
double *pskew = NULL, *pkurt = NULL;
const double *x;
double x0;
int vi = s->list[i+1];
int strvals;
int ni = 0;
strvals = is_string_valued(dset, vi);
if (!strvals) {
x = dset->Z[vi];
ni = good_obs(x + t1, nmax, &x0);
s->misscount[i] = nmax - ni;
if (ni > s->n) {
s->n = ni;
}
}
if (ni == 0) {
if (strvals) {
pprintf(prn, _("Dropping %s: string-valued series\n"),
dset->varname[vi]);
} else {
pprintf(prn, _("Dropping %s: sample range contains no valid "
"observations\n"), dset->varname[vi]);
}
gretl_list_delete_at_pos(s->list, i + 1);
if (s->list[0] == 0) {
return s;
} else {
i--;
continue;
}
}
if (opt & OPT_S) {
s->skew[i] = NADBL;
s->xkurt[i] = NADBL;
s->cv[i] = NADBL;
s->median[i] = NADBL;
} else {
pskew = &s->skew[i];
pkurt = &s->xkurt[i];
}
gretl_minmax(t1, t2, x, &s->low[i], &s->high[i]);
if (s->weight_var == 0) {
gretl_moments(t1, t2, x, NULL, &s->mean[i], &s->sd[i],
pskew, pkurt, 1);
} else {
gretl_moments(t1, t2, x, dset->Z[s->weight_var], &s->mean[i], &s->sd[i],
pskew, pkurt, 0);
}
/* included in both simple and full variants */
s->median[i] = gretl_median(t1, t2, x);
if (!(opt & OPT_S)) {
*err = get_extra_stats(s, i, t1, t2, x);
}
if (dataset_is_panel(dset) && list[0] == 1) {
panel_variance_info(x, dset, s->mean[0], &s->sw, &s->sb);
}
}
return s;
}
static void record_summary (Summary *summ, const DATASET *dset)
{
gretl_matrix *m = NULL;
char **Sc, **Sr;
int nv = summ->list[0];
int i, vi, cols;
cols = (summ->opt & OPT_S)? 5 : 12;
m = gretl_matrix_alloc(nv, cols);
if (m == NULL) {
return;
}
Sc = strings_array_new(cols);
Sr = strings_array_new(nv);
if (summ->opt & OPT_S) {
/* the "simple" option */
const char *h[] = {
N_("Mean"),
N_("Median"),
N_("S.D."),
N_("Min"),
N_("Max"),
};
if (Sc != NULL) {
for (i=0; ilist[i+1];
Sr[i] = gretl_strdup(dset->varname[vi]);
}
gretl_matrix_set(m, i, 0, summ->mean[i]);
gretl_matrix_set(m, i, 1, summ->median[i]);
gretl_matrix_set(m, i, 2, summ->sd[i]);
gretl_matrix_set(m, i, 3, summ->low[i]);
gretl_matrix_set(m, i, 4, summ->high[i]);
}
} else {
/* record all available stats */
const char *h[] = {
N_("Mean"),
N_("Median"),
N_("Minimum"),
N_("Maximum"),
N_("Std. Dev."),
N_("C.V."),
N_("Skewness"),
N_("Ex. kurtosis"),
/* xgettext:no-c-format */
N_("5% perc."),
/* xgettext:no-c-format */
N_("95% perc."),
N_("IQ range"),
N_("Missing obs.")
};
double cv;
if (Sc != NULL) {
for (i=0; ilist[i+1];
Sr[i] = gretl_strdup(dset->varname[vi]);
}
gretl_matrix_set(m, i, 0, summ->mean[i]);
gretl_matrix_set(m, i, 1, summ->median[i]);
gretl_matrix_set(m, i, 2, summ->low[i]);
gretl_matrix_set(m, i, 3, summ->high[i]);
gretl_matrix_set(m, i, 4, summ->sd[i]);
if (floateq(summ->mean[i], 0.0)) {
cv = NADBL;
} else if (floateq(summ->sd[i], 0.0)) {
cv = 0.0;
} else {
cv = fabs(summ->sd[i] / summ->mean[i]);
}
gretl_matrix_set(m, i, 5, cv);
gretl_matrix_set(m, i, 6, summ->skew[i]);
gretl_matrix_set(m, i, 7, summ->xkurt[i]);
gretl_matrix_set(m, i, 8, summ->perc05[i]);
gretl_matrix_set(m, i, 9, summ->perc95[i]);
gretl_matrix_set(m, i, 10, summ->iqr[i]);
gretl_matrix_set(m, i, 11, summ->misscount[i]);
}
}
gretl_matrix_set_colnames(m, Sc);
gretl_matrix_set_rownames(m, Sr);
set_last_result_data(m, GRETL_TYPE_MATRIX);
}
/**
* list_summary:
* @list: list of series to process.
* @dset: dataset struct.
* @opt: may include %OPT_S for "simple" version.
* @prn: gretl printing struct.
*
* Prints descriptive statistics for the listed variables.
*
* Returns: 0 on success, non-zero code on error.
*/
int list_summary (const int *list, int wgtvar,
const DATASET *dset,
gretlopt opt, PRN *prn)
{
Summary *summ = NULL;
int err = 0;
if (list != NULL) {
if (list[0] == 0) {
return 0;
}
if (wgtvar == 0) {
/* no weights */
summ = get_summary(list, dset, opt, prn, &err);
} else {
summ = get_summary_weighted(list, dset, wgtvar,
opt, prn, &err);
}
} else {
int *tmplist = full_var_list(dset, NULL);
if (tmplist != NULL) {
if (wgtvar == 0) {
/* no weights */
summ = get_summary(tmplist, dset, opt, prn, &err);
} else {
summ = get_summary_weighted(tmplist, dset, wgtvar,
opt, prn, &err);
}
free(tmplist);
}
}
if (summ != NULL) {
if (!(opt & OPT_Q)) {
print_summary(summ, dset, prn);
}
record_summary(summ, dset);
free_summary(summ);
}
return err;
}
/**
* vmatrix_new:
*
* Returns: an allocated and initialized #VMatrix, or
* %NULL on failure.
*/
VMatrix *vmatrix_new (void)
{
VMatrix *v = malloc(sizeof *v);
if (v != NULL) {
v->vec = NULL;
v->xbar = NULL;
v->ssx = NULL;
v->list = NULL;
v->names = NULL;
v->ci = 0;
v->dim = 0;
v->t1 = 0;
v->t2 = 0;
v->n = 0;
v->missing = 0;
}
return v;
}
/**
* free_vmatrix:
* @vmat: pointer to gretl correlation matrix struct
*
* Frees all resources associated with @vmat, and
* the pointer itself.
*/
void free_vmatrix (VMatrix *vmat)
{
if (vmat != NULL) {
if (vmat->names != NULL) {
strings_array_free(vmat->names, vmat->dim);
}
if (vmat->vec != NULL) {
free(vmat->vec);
}
if (vmat->xbar != NULL) {
free(vmat->xbar);
}
if (vmat->ssx != NULL) {
free(vmat->ssx);
}
if (vmat->list != NULL) {
free(vmat->list);
}
free(vmat);
}
}
enum {
CORRMAT = 0,
COVMAT
};
/* Compute correlation or covariance matrix, using the maximum
available sample for each coefficient.
*/
static int max_corrcov_matrix (VMatrix *v, const DATASET *dset,
int flag)
{
double **Z = dset->Z;
int i, j, vi, vj, nij;
int nmin, nmax = 0;
int m = v->dim;
int nmiss;
nmin = v->n = v->t2 - v->t1 + 1;
for (i=0; ilist[i+1];
for (j=i; jlist[j+1];
nij = ijton(i, j, m);
if (i == j && flag == CORRMAT) {
v->vec[nij] = 1.0;
} else {
nmiss = 0;
if (flag == COVMAT) {
v->vec[nij] = gretl_covar(v->t1, v->t2, Z[vi], Z[vj],
&nmiss);
} else {
v->vec[nij] = gretl_corr(v->t1, v->t2, Z[vi], Z[vj],
&nmiss);
}
if (nmiss > 0) {
int n = v->n - nmiss;
if (n < nmin && n > 0) {
nmin = n;
}
if (n > nmax) {
nmax = n;
}
v->missing = 1;
} else {
nmax = v->n;
}
}
}
}
/* We'll record an "n" value if there's something resembling
a common number of observations across the coefficients.
*/
if (v->missing) {
v->n = 0;
if (nmax > 0) {
double d = (nmax - nmin) / (double) nmax;
if (d < 0.10) {
v->n = nmin;
}
}
}
return 0;
}
/* Compute correlation or covariance matrix, ensuring we use the same
sample for all coefficients. We may be doing this in the context of
the "corr" command or in the context of "pca". In the latter case
we want to save the "xbar" and "ssx" values for later use when
calculating the component series.
*/
static int uniform_corrcov_matrix (VMatrix *v, const DATASET *dset,
int ci, int flag)
{
double **Z = dset->Z;
double *xbar = NULL, *ssx = NULL;
double *x;
int m = v->dim;
int mm = (m * (m + 1)) / 2;
double d1, d2;
int i, j, jmin, nij, t;
int miss, n = 0;
xbar = malloc(m * sizeof *xbar);
if (xbar == NULL) {
return E_ALLOC;
}
if (ci == PCA || flag == CORRMAT) {
ssx = malloc(m * sizeof *ssx);
if (ssx == NULL) {
free(xbar);
return E_ALLOC;
}
}
for (i=0; imissing = 0;
/* first pass: get sample size and sums */
for (t=v->t1; t<=v->t2; t++) {
miss = 0;
for (i=0; ilist[i+1]][t])) {
miss = 1;
v->missing += 1;
break;
}
}
if (!miss) {
n++;
for (i=0; ilist[i+1]][t];
}
}
}
if (n < 2) {
free(xbar);
free(ssx);
return E_MISSDATA;
}
for (i=0; ivec[i] = 0.0;
}
/* second pass: get deviations from means and cumulate */
for (t=v->t1; t<=v->t2; t++) {
miss = 0;
for (i=0; ilist[i+1]];
if (na(x[t])) {
miss = 1;
break;
}
}
if (!miss) {
for (i=0; ilist[i+1]];
d1 = x[t] - xbar[i];
if (ssx != NULL) {
ssx[i] += d1 * d1;
}
jmin = (flag == COVMAT)? i : i + 1;
for (j=jmin; jlist[j+1]];
nij = ijton(i, j, m);
d2 = x[t] - xbar[j];
v->vec[nij] += d1 * d2;
}
}
}
}
/* finalize: compute correlations or covariances */
if (flag == CORRMAT) {
/* correlations */
for (i=0; ivec[nij] = 1.0;
} else if (ssx[i] == 0.0 || ssx[j] == 0.0) {
v->vec[nij] = NADBL;
} else {
v->vec[nij] /= sqrt(ssx[i] * ssx[j]);
}
}
}
} else {
/* covariances */
for (i=0; ivec[i] /= (n - 1);
}
}
v->n = n;
if (ci == PCA) {
v->xbar = xbar;
v->ssx = ssx;
} else {
free(xbar);
free(ssx);
}
return 0;
}
/**
* corrlist:
* @ci: command index.
* @list: list of variables to process, by ID number.
* @dset: dataset struct.
* @opt: option flags.
* @err: location to receive error code.
*
* Computes pairwise correlation coefficients for the variables
* specified in @list, skipping any constants. If the option
* flags contain OPT_N, a uniform sample is ensured: only those
* observations for which all the listed variables have valid
* values are used. If OPT_C is included, we actually calculate
* covariances rather than correlations.
*
* Returns: gretl correlation matrix struct, or NULL on failure.
*/
VMatrix *corrlist (int ci, int *list, const DATASET *dset,
gretlopt opt, int *err)
{
int flag = (opt & OPT_C)? COVMAT : CORRMAT;
VMatrix *v;
int i, m, mm;
int nmiss = 0;
if (list == NULL) {
fprintf(stderr, "corrlist: list is NULL\n");
*err = E_DATA;
return NULL;
}
v = vmatrix_new();
if (v == NULL) {
*err = E_ALLOC;
return NULL;
}
/* drop any constants from list */
for (i=list[0]; i>0; i--) {
if (gretl_isconst(dset->t1, dset->t2, dset->Z[list[i]])) {
gretl_list_delete_at_pos(list, i);
}
}
if (list[0] < 2) {
gretl_errmsg_set(_("corr: needs at least two non-constant arguments"));
*err = E_DATA;
goto bailout;
}
v->t1 = dset->t1;
v->t2 = dset->t2;
/* adjust sample range if need be */
list_adjust_sample(list, &v->t1, &v->t2, dset, &nmiss);
if (v->t2 - v->t1 + 1 < 3) {
*err = E_TOOFEW;
goto bailout;
}
v->dim = m = list[0];
mm = (m * (m + 1)) / 2;
v->names = strings_array_new(m);
v->vec = malloc(mm * sizeof *v->vec);
v->list = gretl_list_copy(list);
if (v->names == NULL || v->vec == NULL || v->list == NULL) {
*err = E_ALLOC;
goto bailout;
}
if (ci == PCA) {
opt |= OPT_N;
}
if (opt & OPT_N) {
/* impose uniform sample size */
*err = uniform_corrcov_matrix(v, dset, ci, flag);
} else {
/* sample sizes may differ */
*err = max_corrcov_matrix(v, dset, flag);
}
if (!*err) {
for (i=0; inames[i] = gretl_strdup(dset->varname[list[i+1]]);
if (v->names[i] == NULL) {
*err = E_ALLOC;
goto bailout;
}
}
}
v->ci = (flag == CORRMAT)? CORR : 0; /* FIXME? */
bailout:
if (*err) {
free_vmatrix(v);
v = NULL;
}
return v;
}
static void printcorr (const VMatrix *v, PRN *prn)
{
double r = v->vec[1];
pprintf(prn, "\ncorr(%s, %s)", v->names[0], v->names[1]);
if (na(r)) {
pprintf(prn, ": %s\n\n", _("undefined"));
} else {
pprintf(prn, " = %.8f\n", r);
if (fabs(r) < 1.0) {
int n2 = v->n - 2;
double tval = r * sqrt(n2 / (1 - r*r));
pputs(prn, _("Under the null hypothesis of no correlation:\n "));
pprintf(prn, _("t(%d) = %g, with two-tailed p-value %.4f\n"), n2,
tval, student_pvalue_2(n2, tval));
pputc(prn, '\n');
} else {
pprintf(prn, _("5%% critical value (two-tailed) = "
"%.4f for n = %d"), rhocrit95(v->n), v->n);
pputs(prn, "\n\n");
}
}
}
/**
* print_corrmat:
* @corr: gretl correlation matrix.
* @dset: dataset information.
* @prn: gretl printing struct.
*
* Prints a gretl correlation matrix to @prn.
*/
void print_corrmat (VMatrix *corr, const DATASET *dset, PRN *prn)
{
if (corr->dim == 2) {
printcorr(corr, prn);
} else {
char date1[OBSLEN], date2[OBSLEN];
gchar *tmp = NULL;
ntolabel(date1, corr->t1, dset);
ntolabel(date2, corr->t2, dset);
pputc(prn, '\n');
tmp = g_strdup_printf(_("%s, using the observations %s - %s"),
_("Correlation Coefficients"), date1, date2);
output_line(tmp, prn, 0);
g_free(tmp);
if (corr->missing) {
output_line(_("(missing values were skipped)"), prn, 1);
}
if (corr->n > 0) {
tmp = g_strdup_printf(_("5%% critical value (two-tailed) = "
"%.4f for n = %d"), rhocrit95(corr->n),
corr->n);
output_line(tmp, prn, 1);
g_free(tmp);
}
text_print_vmatrix(corr, prn);
}
}
static void record_corr_matrix (VMatrix *c)
{
gretl_matrix *m = NULL;
int i, j, k, n = c->dim;
double cij;
m = gretl_matrix_alloc(n, n);
if (m != NULL) {
k = 0;
for (i=0; ivec[k];
gretl_matrix_set(m, i, j, cij);
gretl_matrix_set(m, j, i, cij);
k++;
}
}
if (c->names) {
char **rlab = strings_array_new(n);
char **clab = strings_array_new(n);
if (rlab != NULL && clab != NULL) {
for (i=0; inames[i]);
clab[i] = gretl_strdup(c->names[i]);
}
gretl_matrix_set_rownames(m, rlab);
gretl_matrix_set_colnames(m, clab);
}
}
set_last_result_data(m, GRETL_TYPE_MATRIX);
}
}
/**
* gretl_corrmx:
* @list: gives the ID numbers of the variables to process.
* @dset: dataset struct.
* @opt: option flags: OPT_N means use uniform sample size,
* OPT_U controls plotting options.
* @prn: gretl printing struct.
*
* Computes and prints the correlation matrix for the specified list
* of variables.
*
* Returns: 0 on successful completion, 1 on error.
*/
int gretl_corrmx (int *list, const DATASET *dset,
gretlopt opt, PRN *prn)
{
VMatrix *corr = NULL;
int err = 0;
if (list != NULL) {
if (list[0] == 0) {
return 0;
} else {
corr = corrlist(CORR, list, dset, opt, &err);
}
} else {
int *tmplist = full_var_list(dset, NULL);
if (tmplist != NULL) {
corr = corrlist(CORR, tmplist, dset, opt, &err);
free(tmplist);
}
}
if (corr != NULL) {
if (!(opt & OPT_Q)) {
print_corrmat(corr, dset, prn);
}
if (corr->dim > 2 && gnuplot_graph_wanted(PLOT_HEATMAP, opt)) {
err = plot_corrmat(corr, opt);
}
record_corr_matrix(corr);
free_vmatrix(corr);
}
return err;
}
/*
Satterthwaite, F.E. (1946). "An Approximate Distribution of Estimates
of Variance Components". Biometrics Bulletin, 2, 6, pp. 110–114.
v = (s^2_1/n_1 + s^2_2/n_2)^2 /
[(s^_1/n1)^2 / (n_1 - 1) + (s^2_2/n_2)^2 / (n_2 - 1)]
*/
int satterthwaite_df (double v1, int n1,
double v2, int n2)
{
double rtnum = v1 / n1 + v2 / n2;
double d1 = (v1/n1) * (v1/n1) / (n1 - 1);
double d2 = (v2/n2) * (v2/n2) / (n2 - 1);
double v = rtnum * rtnum / (d1 + d2);
return (int) floor(v);
}
/**
* means_test:
* @list: gives the ID numbers of the variables to compare.
* @dset: dataset struct.
* @opt: if OPT_O, assume population variances are different.
* @prn: gretl printing struct.
*
* Carries out test of the null hypothesis that the means of two
* variables are equal.
*
* Returns: 0 on successful completion, error code on error.
*/
int means_test (const int *list, const DATASET *dset,
gretlopt opt, PRN *prn)
{
double m1, m2, s1, s2, skew, kurt, se, mdiff, t, pval;
double v1, v2;
double *x = NULL, *y = NULL;
int vardiff = (opt & OPT_O);
int df, n1, n2, n = dset->n;
if (list[0] < 2) {
return E_ARGS;
}
x = malloc(n * sizeof *x);
if (x == NULL) {
return E_ALLOC;
}
y = malloc(n * sizeof *y);
if (y == NULL) {
free(x);
return E_ALLOC;
}
n1 = transcribe_array(x, dset->Z[list[1]], dset);
n2 = transcribe_array(y, dset->Z[list[2]], dset);
if (n1 == 0 || n2 == 0) {
pputs(prn, _("Sample range has no valid observations."));
free(x); free(y);
return 1;
}
if (n1 == 1 || n2 == 1) {
pputs(prn, _("Sample range has only one observation."));
free(x); free(y);
return 1;
}
df = n1 + n2 - 2;
gretl_moments(0, n1-1, x, NULL, &m1, &s1, &skew, &kurt, 1);
gretl_moments(0, n2-1, y, NULL, &m2, &s2, &skew, &kurt, 1);
mdiff = m1 - m2;
v1 = s1 * s1;
v2 = s2 * s2;
if (vardiff) {
/* do not assume the variances are equal */
se = sqrt((v1 / n1) + (v2 / n2));
df = satterthwaite_df(v1, n1, v2, n2);
} else {
/* form pooled estimate of variance */
double s2p;
s2p = ((n1-1) * v1 + (n2-1) * v2) / df;
se = sqrt(s2p / n1 + s2p / n2);
df = n1 + n2 - 2;
}
t = mdiff / se;
pval = student_pvalue_2(df, t);
pprintf(prn, _("\nEquality of means test "
"(assuming %s variances)\n\n"), (vardiff)? _("unequal") : _("equal"));
pprintf(prn, " %s: ", dset->varname[list[1]]);
pprintf(prn, _("Number of observations = %d\n"), n1);
pprintf(prn, " %s: ", dset->varname[list[2]]);
pprintf(prn, _("Number of observations = %d\n"), n2);
pprintf(prn, _(" Difference between sample means = %g - %g = %g\n"),
m1, m2, mdiff);
pputs(prn, _(" Null hypothesis: The two population means are the same.\n"));
pprintf(prn, _(" Estimated standard error = %g\n"), se);
pprintf(prn, _(" Test statistic: t(%d) = %g\n"), df, t);
pprintf(prn, _(" p-value (two-tailed) = %g\n\n"), pval);
if (pval > .10)
pputs(prn, _(" The difference is not statistically significant.\n\n"));
record_test_result(t, pval);
free(x);
free(y);
return 0;
}
/**
* vars_test:
* @list: gives the ID numbers of the variables to compare.
* @dset: dataset struct.
* @prn: gretl printing struct.
*
* Carries out test of the null hypothesis that the variances of two
* variables are equal.
*
* Returns: 0 on successful completion, error code on error.
*/
int vars_test (const int *list, const DATASET *dset, PRN *prn)
{
double m, s1, s2, var1, var2;
double F, pval;
double *x = NULL, *y = NULL;
int dfn, dfd, n1, n2, n = dset->n;
if (list[0] < 2) return E_ARGS;
x = malloc(n * sizeof *x);
y = malloc(n * sizeof *y);
if (x == NULL || y == NULL) {
free(x);
free(y);
return E_ALLOC;
}
n1 = transcribe_array(x, dset->Z[list[1]], dset);
n2 = transcribe_array(y, dset->Z[list[2]], dset);
if (n1 == 0 || n2 == 0) {
pputs(prn, _("Sample range has no valid observations."));
free(x); free(y);
return 1;
}
if (n1 == 1 || n2 == 1) {
pputs(prn, _("Sample range has only one observation."));
free(x); free(y);
return 1;
}
gretl_moments(0, n1-1, x, NULL, &m, &s1, NULL, NULL, 1);
gretl_moments(0, n2-1, y, NULL, &m, &s2, NULL, NULL, 1);
var1 = s1*s1;
var2 = s2*s2;
if (var1 > var2) {
F = var1 / var2;
dfn = n1 - 1;
dfd = n2 - 1;
} else {
F = var2 / var1;
dfn = n2 - 1;
dfd = n1 - 1;
}
pval = snedecor_cdf_comp(dfn, dfd, F);
pputs(prn, _("\nEquality of variances test\n\n"));
pprintf(prn, " %s: ", dset->varname[list[1]]);
pprintf(prn, _("Number of observations = %d\n"), n1);
pprintf(prn, " %s: ", dset->varname[list[2]]);
pprintf(prn, _("Number of observations = %d\n"), n2);
pprintf(prn, _(" Ratio of sample variances = %g\n"), F);
pprintf(prn, " %s: %s\n", _("Null hypothesis"),
_("The two population variances are equal"));
pprintf(prn, " %s: F(%d,%d) = %g\n", _("Test statistic"), dfn, dfd, F);
pprintf(prn, _(" p-value (two-tailed) = %g\n\n"), pval);
if (snedecor_cdf_comp(dfn, dfd, F) > .10)
pputs(prn, _(" The difference is not statistically significant.\n\n"));
record_test_result(F, pval);
free(x);
free(y);
return 0;
}
struct MahalDist_ {
int *list;
int n;
double *d;
};
const double *mahal_dist_get_distances (const MahalDist *md)
{
return md->d;
}
int mahal_dist_get_n (const MahalDist *md)
{
return md->n;
}
const int *mahal_dist_get_varlist(const MahalDist *md)
{
return md->list;
}
void free_mahal_dist (MahalDist *md)
{
free(md->list);
free(md->d);
free(md);
}
static MahalDist *mahal_dist_new (const int *list, int n)
{
MahalDist *md = malloc(sizeof *md);
if (md != NULL) {
md->d = malloc(n * sizeof *md->d);
if (md->d == NULL) {
free(md);
md = NULL;
} else {
md->list = gretl_list_copy(list);
if (md->list == NULL) {
free(md->d);
free(md);
md = NULL;
} else {
md->n = n;
}
}
}
if (md != NULL) {
int t;
for (t=0; td[t] = NADBL;
}
}
return md;
}
static int mdist_saver (DATASET *dset)
{
int t, v, err;
err = dataset_add_series(dset, 1);
if (err) {
v = 0;
} else {
v = dset->v - 1;
for (t=0; tn; t++) {
dset->Z[v][t] = NADBL;
}
strcpy(dset->varname[v], "mdist");
make_varname_unique(dset->varname[v], v, dset);
series_set_label(dset, v, _("Mahalanobis distances"));
}
return v;
}
static int
real_mahalanobis_distance (const int *list, DATASET *dset,
gretlopt opt, MahalDist *md,
PRN *prn)
{
gretl_matrix *S = NULL;
gretl_vector *means = NULL;
gretl_vector *xdiff;
int orig_t1 = dset->t1;
int orig_t2 = dset->t2;
int n, err = 0;
list_adjust_sample(list, &dset->t1, &dset->t2, dset, NULL);
n = sample_size(dset);
if (n < 2) {
err = E_DATA;
goto bailout;
}
xdiff = gretl_column_vector_alloc(list[0]);
if (xdiff == NULL) {
err = E_ALLOC;
goto bailout;
}
S = gretl_covariance_matrix_from_varlist(list,
dset,
&means,
&err);
if (!err) {
if (opt & OPT_V) {
gretl_matrix_print_to_prn(S, _("Covariance matrix"), prn);
}
err = gretl_invert_symmetric_matrix(S);
if (err) {
fprintf(stderr, "error inverting covariance matrix\n");
} else if (opt & OPT_V) {
gretl_matrix_print_to_prn(S, _("Inverse of covariance matrix"), prn);
}
}
if (!err) {
int k = gretl_vector_get_length(means);
int miss, obslen = 0, savevar = 0;
double m, x, xbar;
int i, t, vi;
if (opt & OPT_S) {
/* save the results to a data series */
savevar = mdist_saver(dset);
}
if (prn != NULL) {
pprintf(prn, "%s\n", _("Mahalanobis distances from the centroid"));
pprintf(prn, "%s\n", _("using the variables:"));
for (i=1; i<=list[0]; i++) {
pprintf(prn, " %s\n", dset->varname[list[i]]);
}
pputc(prn, '\n');
obslen = max_obs_marker_length(dset);
if (obslen < 8) {
obslen = 8;
}
}
for (t=dset->t1; t<=dset->t2; t++) {
miss = 0;
/* write vector of deviations from centroid for
observation t */
for (i=0; iZ[vi][t];
miss += na(x);
if (!miss) {
gretl_vector_set(xdiff, i, x - xbar);
}
}
if (miss) {
m = NADBL;
} else {
m = gretl_scalar_qform(xdiff, S, &err);
if (!err) {
m = sqrt(m);
}
}
if (prn != NULL) {
print_obs_marker(t, dset, obslen, prn);
if (err || miss) {
pprintf(prn, "NA\n");
} else {
pprintf(prn, "%9.6f\n", m);
}
}
if (savevar > 0) {
dset->Z[savevar][t] = m;
} else if (md != NULL) {
md->d[t] = m;
}
}
if (savevar > 0 && prn != NULL) {
pputc(prn, '\n');
pprintf(prn, _("Distances saved as '%s'"),
dset->varname[savevar]);
pputc(prn, '\n');
}
if (prn != NULL) {
pputc(prn, '\n');
}
}
gretl_matrix_free(xdiff);
gretl_matrix_free(means);
gretl_matrix_free(S);
bailout:
dset->t1 = orig_t1;
dset->t2 = orig_t2;
return err;
}
int mahalanobis_distance (const int *list, DATASET *dset,
gretlopt opt, PRN *prn)
{
return real_mahalanobis_distance(list, dset, opt, NULL,
(opt & OPT_Q) ? NULL : prn);
}
MahalDist *get_mahal_distances (const int *list, DATASET *dset,
gretlopt opt, PRN *prn, int *err)
{
MahalDist *md = mahal_dist_new(list, dset->n);
if (md == NULL) {
*err = E_ALLOC;
} else {
*err = real_mahalanobis_distance(list, dset, opt,
md, prn);
if (*err) {
free_mahal_dist(md);
md = NULL;
}
}
return md;
}
/*
G = [(2 * \sum_i^n (i * x_i)) / (n * \sum_i^n x_i )] - [(n + 1)/n]
where x is sorted: x_i <= x_{i+1}
*/
static double gini_coeff (const double *x, int t1, int t2, double **plz,
int *pn, int *err)
{
int m = t2 - t1 + 1;
double *sx = NULL;
double csx = 0.0, sumx = 0.0, sisx = 0.0;
double G;
int t, n = 0;
gretl_error_clear();
sx = malloc(m * sizeof *sx);
if (sx == NULL) {
*err = E_ALLOC;
return NADBL;
}
n = 0;
for (t=t1; t<=t2; t++) {
if (na(x[t])) {
continue;
} else if (x[t] < 0.0) {
gretl_errmsg_set(_("illegal negative value"));
*err = E_DATA;
break;
} else {
sx[n++] = x[t];
sumx += x[t];
}
}
if (!*err && (n == 0 || sumx == 0.0)) {
*err = E_DATA;
}
if (*err) {
free(sx);
return NADBL;
}
qsort(sx, n, sizeof *sx, gretl_compare_doubles);
#if 0
if (1) {
/* just testing alternative calculation for equivalence */
double num, S[2];
int s, fyi;
num = S[0] = S[1] = 0.0;
for (t=0; t 4000) {
downsample = (int) (n / 1000.0);
}
for (t=0; tZ[varno], dset->t1, dset->t2,
&lz, &n, &err);
if (err) {
return err;
}
fulln = dset->t2 - dset->t1 - 1;
pprintf(prn, "%s\n", dset->varname[varno], n);
pprintf(prn, _("Number of observations = %d\n"), n);
if (n < fulln) {
pputs(prn, _("Warning: there were missing values\n"));
}
pputc(prn, '\n');
pprintf(prn, "%s = %g\n", _("Sample Gini coefficient"), gini);
pprintf(prn, "%s = %g\n", _("Estimate of population value"),
gini * (double) n / (n - 1));
err = lorenz_graph(dset->varname[varno], lz, n);
free(lz);
return err;
}
/* Following: apparatus for Shapiro-Wilk normality test.
Thanks to Marcin Blazejowski for the contribution.
*/
#ifndef min
# define min(a, b) ((a) > (b) ? (b) : (a))
#endif
static int sign (int a)
{
return (a == 0)? 0 : ((a < 0)? -1 : 1);
}
static int compare_floats (const void *a, const void *b)
{
const float *fa = (const float *) a;
const float *fb = (const float *) b;
return (*fa > *fb) - (*fa < *fb);
}
/* Algorithm AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2
Calculates the algebraic polynomial of order n-1 with
array of coefficients cc. Zero-order coefficient is
cc(1) = cc[0]
*/
static double poly (const float *cc, int n, float x) {
double p, ret = cc[0]; /* preserve precision! */
int j;
if (n > 1) {
p = x * cc[n-1];
for (j=n-2; j>0; j--) {
p = (p + cc[j]) * x;
}
ret += p;
}
return ret;
}
/* Calculate coefficients for the Shapiro-Wilk test */
static void sw_coeff (int n, int n2, float *a)
{
const float c1[6] = { 0.f,.221157f,-.147981f,-2.07119f, 4.434685f, -2.706056f };
const float c2[6] = { 0.f,.042981f,-.293762f,-1.752461f,5.682633f, -3.582633f };
float summ2, ssumm2;
float a0, a1, an = (float) n;
float fac, an25, rsn;
int i, i1, nn2 = n / 2;
if (n == 3) {
a[0] = sqrt(0.5);
} else {
an25 = an + .25;
summ2 = 0.0;
for (i=0; i 5) {
i1 = 2;
a1 = -a[1] / ssumm2 + poly(c2, 6, rsn);
fac = sqrt((summ2 - 2.0 * (a[0] * a[0]) - 2.0 * (a[1] * a[1])) /
(1.0 - 2.0 * (a0 * a0) - 2.0 * (a1 * a1)));
a[1] = a1;
} else {
i1 = 1;
fac = sqrt((summ2 - 2.0 * (a[0] * a[0])) / (1.0 - 2.0 * (a0 * a0)));
}
a[0] = a0;
for (i=i1; i= gamma) {
/* FIXME: rather use an even smaller value, or NA? */
*pval = little;
return 0;
}
y = -log(gamma - y);
m = poly(c3, 4, an);
s = exp(poly(c4, 4, an));
} else {
/* n >= 12 */
m = poly(c5, 4, xx);
s = exp(poly(c6, 3, xx));
}
if (ncens > 0) {
/* Censoring by proportion ncens/n.
Calculate mean and sd of normal equivalent deviate of W
*/
float delta = (float) ncens / an;
ld = -log(delta);
bf = 1.0 + xx * bf1;
r1 = pow(xx90, (double) xx);
z90f = z90 + bf * pow(poly(c7, 2, r1), (double) ld);
r1 = pow(xx95, (double) xx);
z95f = z95 + bf * pow(poly(c8, 2, r1), (double) ld);
z99f = z99 + bf * pow(poly(c9, 2, xx), (double) ld);
/* Regress Z90F,...,Z99F on normal deviates Z90,...,Z99 to get
pseudo-mean and pseudo-sd of z as the slope and intercept
*/
zfm = (z90f + z95f + z99f) / 3.0;
zsd = (z90 * (z90f - zfm) + z95 * (z95f - zfm) + z99 * (z99f - zfm)) / zss;
zbar = zfm - zsd * zm;
m += zbar * s;
s *= zsd;
}
*pval = normal_cdf_comp(((double) y - (double) m) / (double) s);
return 0;
}
static int sw_sample_check (int n, int n1)
{
int ncens = n - n1;
float delta, an = n;
if (n < 3 || n1 < 3) {
fprintf(stderr, "There is no way to run SW test for less then 3 obs\n");
return E_DATA;
}
if (ncens < 0 || (ncens > 0 && n < 20)) {
fprintf(stderr, "sw_w: not enough uncensored obserations\n");
return E_DATA;
}
delta = (float) ncens / an;
if (delta > .8f) {
fprintf(stderr, "sw_w: too many censored obserations\n");
return E_DATA;
}
return 0;
}
/**
* shapiro_wilk:
* @x: data array.
* @t1: starting observation.
* @t2: ending observation.
* @W: location to receive test statistic.
* @pval: location to receive p-value.
*
* Computes the Shapiro-Wilk W statistic as a test for
* normality of the data @x, and also the p-value for
* the test. These are written into the pointer
* arguments @W and @pval.
*
* Returns: 0 on success, non-zero on failure.
*/
int shapiro_wilk (const double *x, int t1, int t2, double *W, double *pval)
{
int n1 = 0; /* number of uncensored obs */
float *a = NULL; /* array of coefficients */
float *xf = NULL; /* copy of x in float format */
int i, t, n2, n = 0;
int err = 0;
*W = *pval = NADBL;
for (t=t1; t<=t2; t++) {
if (!na(x[t])) n++;
}
/* for now we assume all obs are uncensored */
n1 = n;
err = sw_sample_check(n, n1);
if (err) {
return err;
}
/* How many coeffs should be computed? */
n2 = fmod(n, 2.0);
n2 = (n2 == 0)? n / 2 : (n - 1) / 2;
xf = malloc(n * sizeof *xf);
a = malloc(n2 * sizeof *a);
if (xf == NULL || a == NULL) {
err = E_ALLOC;
} else {
i = 0;
for (t=t1; t<=t2; t++) {
if (!na(x[t])) {
xf[i++] = x[t];
}
}
qsort(xf, n, sizeof *xf, compare_floats);
/* Main job: compute W stat and its p-value */
sw_coeff(n, n2, a);
err = sw_w(xf, n, n1, a, W, pval);
}
free(a);
free(xf);
return err;
}
static double round2 (double x)
{
x *= 100.0;
x = (x - floor(x) < 0.5)? floor(x) : ceil(x);
return x / 100;
}
/* Polynomial approximation to the p-value for the Lilliefors test,
said to be good to 2 decimal places. See Abdi and Molin,
"Lilliefors/Van Soest's test of normality", in Salkind (ed.)
Encyclopedia of Measurement and Statistics, Sage, 2007; also
http://www.utd.edu/~herve/Abdi-Lillie2007-pretty.pdf
*/
static double lilliefors_pval (double L, int N)
{
double b0 = 0.37872256037043;
double b1 = 1.30748185078790;
double b2 = 0.08861783849346;
double b1pN = b1 + N;
double Lm2 = 1.0 / (L * L);
double A, pv;
A = (-b1pN + sqrt(b1pN * b1pN - 4 * b2 * (b0 - Lm2))) / (2 * b2);
pv = -0.37782822932809 + 1.67819837908004*A
- 3.02959249450445*A*A + 2.80015798142101*pow(A, 3.)
- 1.39874347510845*pow(A, 4.) + 0.40466213484419*pow(A, 5.)
- 0.06353440854207*pow(A, 6.) + 0.00287462087623*pow(A, 7.)
+ 0.00069650013110*pow(A, 8.) - 0.00011872227037*pow(A, 9.)
+ 0.00000575586834*pow(A, 10.);
if (pv < 0) {
pv = 0;
} else if (pv > 1) {
pv = 1;
} else {
/* round to 2 digits */
pv = round2(pv);
}
return pv;
}
static int lilliefors_test (const double *x, int t1, int t2,
double *L, double *pval)
{
double *zx; /* copy of x for sorting, z-scoring */
int i, t, n = 0;
int err = 0;
*L = *pval = NADBL;
for (t=t1; t<=t2; t++) {
if (!na(x[t])) n++;
}
if (n < 5) {
/* we need more than 4 data points */
return E_DATA;
}
zx = malloc(n * sizeof *zx);
if (zx == NULL) {
err = E_ALLOC;
} else {
double dx, mx = 0.0, sx = 0.0;
double Phi, Dp = 0.0, Dm = 0.0;
double Dmax = 0.0;
/* compute sample mean and standard deviation plus
sorted z-scores */
i = 0;
for (t=t1; t<=t2; t++) {
if (!na(x[t])) {
zx[i++] = x[t];
mx += x[t];
}
}
mx /= n;
for (t=t1; t<=t2; t++) {
if (!na(x[t])) {
dx = x[t] - mx;
sx += dx * dx;
}
}
sx = sqrt(sx / (n - 1));
qsort(zx, n, sizeof *zx, gretl_compare_doubles);
for (i=0; i Dmax) Dmax = Dp;
if (Dm > Dmax) Dmax = Dm;
}
*L = Dmax;
*pval = lilliefors_pval(Dmax, n);
}
free(zx);
return err;
}
static int skew_kurt_test (const double *x, int t1, int t2,
double *test, double *pval,
gretlopt opt)
{
double skew, xkurt;
int n, err = 0;
err = series_get_moments(t1, t2, x, &skew, &xkurt, &n);
if (!err) {
if (opt & OPT_J) {
/* Jarque-Bera */
*test = (n / 6.0) * (skew * skew + xkurt * xkurt/ 4.0);
} else {
/* Doornik-Hansen */
*test = doornik_chisq(skew, xkurt, n);
}
}
if (!err && na(*test)) {
err = E_NAN;
}
if (!na(*test)) {
*pval = chisq_cdf_comp(2, *test);
}
return err;
}
static void print_normality_stat (double test, double pval,
gretlopt opt, PRN *prn)
{
const char *tstrs[] = {
N_("Shapiro-Wilk W"),
N_("Jarque-Bera test"),
N_("Lilliefors test"),
N_("Doornik-Hansen test")
};
int i = (opt & OPT_W)? 0 : (opt & OPT_J)? 1 :
(opt & OPT_L)? 2 : 3;
if (na(test) || na(pval)) {
return;
}
if (opt & OPT_L) {
pprintf(prn, " %s = %g, %s ~= %g\n\n",
_(tstrs[i]), test, _("with p-value"), pval);
} else {
pprintf(prn, " %s = %g, %s %g\n\n",
_(tstrs[i]), test, _("with p-value"), pval);
}
}
/**
* gretl_normality_test:
* @varno: ID number of the variable to process.
* @dset: dataset struct.
* @opt: OPT_A: all tests; OPT_D: Doornik-Hansen; OPT_W: Shapiro-Wilk;
* OPT_J: Jarque-Bera; OPT_L: Lilliefors; default is Doornik-Hansen.
* Also use OPT_Q for quiet.
* @prn: gretl printing struct.
*
* Performs, and prints the results of, the specified test(s) randomness
* for the variable specified by @v.
*
* Returns: 0 on successful completion, non-zero on error.
*/
int gretl_normality_test (int varno, const DATASET *dset,
gretlopt opt, PRN *prn)
{
gretlopt alltests = OPT_D | OPT_W | OPT_J | OPT_L;
double test = NADBL;
double pval = NADBL;
double trec = NADBL;
double pvrec = NADBL;
int err;
if (varno < 0 || varno >= dset->v) {
return E_DATA;
}
err = incompatible_options(opt, OPT_A | alltests);
if (err) {
return err;
}
if (opt & OPT_A) {
/* show all tests */
opt |= alltests;
}
if (!(opt & alltests)) {
/* no method selected: use Doornik-Hansen */
opt |= OPT_D;
}
if (!(opt & OPT_Q)) {
pprintf(prn, _("Test for normality of %s:"), dset->varname[varno]);
if (opt & OPT_A) {
pputs(prn, "\n\n");
} else {
pputc(prn, '\n');
}
}
if (opt & OPT_D) {
err = skew_kurt_test(dset->Z[varno], dset->t1, dset->t2,
&test, &pval, OPT_D);
if (!err && !(opt & OPT_Q)) {
print_normality_stat(test, pval, OPT_D, prn);
}
if (!err) {
/* record Doornik-Hansen result by default */
trec = test;
pvrec = pval;
}
}
if (opt & OPT_W) {
err = shapiro_wilk(dset->Z[varno], dset->t1, dset->t2,
&test, &pval);
if (!err && !(opt & OPT_Q)) {
print_normality_stat(test, pval, OPT_W, prn);
}
}
if (opt & OPT_L) {
err = lilliefors_test(dset->Z[varno], dset->t1, dset->t2,
&test, &pval);
if (!err && !(opt & OPT_Q)) {
print_normality_stat(test, pval, OPT_L, prn);
}
}
if (opt & OPT_J) {
err = skew_kurt_test(dset->Z[varno], dset->t1, dset->t2,
&test, &pval, OPT_J);
if (!err && !(opt & OPT_Q)) {
print_normality_stat(test, pval, OPT_J, prn);
}
}
if (na(trec) && !na(test)) {
trec = test;
}
if (na(pvrec) && !na(pval)) {
pvrec = pval;
}
if (!na(trec) && !na(pvrec)) {
record_test_result(trec, pvrec);
}
return err;
}
gretl_matrix *gretl_normtest_matrix (const double *y,
int t1, int t2,
gretlopt opt,
int *err)
{
gretl_matrix *ret = NULL;
double test = NADBL;
double pval = NADBL;
if (opt & OPT_J) {
/* Jarque-Bera */
*err = skew_kurt_test(y, t1, t2, &test, &pval, opt);
} else if (opt & OPT_W) {
/* Shapiro-Wilk */
*err = shapiro_wilk(y, t1, t2, &test, &pval);
} else if (opt & OPT_L) {
/* Lilliefors */
*err = lilliefors_test(y, t1, t2, &test, &pval);
} else {
/* Doornik-Hansen */
*err = skew_kurt_test(y, t1, t2, &test, &pval, opt);
}
if (!*err) {
ret = gretl_matrix_alloc(1, 2);
if (ret == NULL) {
*err = E_ALLOC;
} else {
ret->val[0] = test;
ret->val[1] = pval;
}
}
return ret;
}