1 /*
2  *  gretl -- Gnu Regression, Econometrics and Time-series Library
3  *  Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 #include "libgretl.h"
21 #include "version.h"
22 
23 #include <glib.h>
24 
25 /* provides for one-way and two-way ANOVA */
26 
27 struct anova {
28     int n;          /* observations used */
29     int nt;         /* number of treatment 'levels' */
30     int nb;         /* number of 'blocks' (if applicable) */
31     double SST;     /* total sum of squares */
32     double SSTr;    /* treatment sum of squares */
33     double SSB;     /* 'block' sum of squares (if applicable) */
34     double SSE;     /* error sum of squares */
35     double F;       /* F-test */
36     double pval;    /* p-value for F-test */
37     double *cmeans; /* column means */
38     double *rmeans; /* row means */
39     int *ccount;    /* column counts */
40     int *rcount;    /* row counts */
41     double *tvec;   /* workspace follows */
42     double *bvec;
43     gretl_matrix *tvals;
44     gretl_matrix *bvals;
45 };
46 
anova_init(struct anova * v)47 static void anova_init (struct anova *v)
48 {
49     v->n = v->nt = v->nb = 0;
50     v->SST = v->SSTr = v->SSB = v->SSE = 0.0;
51     v->F = v->pval = NADBL;
52     v->cmeans = v->rmeans = NULL;
53     v->ccount = v->rcount = NULL;
54     v->tvec = v->bvec = NULL;
55     v->tvals = v->bvals = NULL;
56 }
57 
anova_free(struct anova * v)58 static void anova_free (struct anova *v)
59 {
60     free(v->cmeans);
61     free(v->rmeans);
62     free(v->ccount);
63     free(v->rcount);
64     free(v->tvec);
65     free(v->bvec);
66 
67     gretl_matrix_free(v->tvals);
68     gretl_matrix_free(v->bvals);
69 }
70 
print_anova(struct anova * v,PRN * prn)71 static int print_anova (struct anova *v, PRN *prn)
72 {
73     int dftotal, dftreat, dfblock, dferr;
74     double mst, msr, mse;
75     int n, c1, c2, c3;
76 
77     dftotal = v->n - 1;
78     dftreat = v->nt - 1;
79     dfblock = (v->nb > 0)? v->nb - 1 : 0;
80     dferr = dftotal - dftreat - dfblock;
81 
82     pputs(prn, "\n\n");
83 
84     c1 = g_utf8_strlen(_("Sum of squares"), -1);
85     c2 = g_utf8_strlen(_("df"), -1);
86     c3 = g_utf8_strlen(_("Mean square"), -1);
87 
88     c1 = (c1 < 35)? 35 : c1;
89     c2 = (c2 > 8)? c2 + 1 : (c2 < 8)? 8 : c2;
90     c3 = (c3 > 16)? c3 + 1 : (c3 < 16)? 16 : c3;
91 
92     /* header strings are right-aligned */
93     n = g_utf8_strlen(_("Sum of squares"), -1);
94     bufspace(c1 - n, prn);
95     pputs(prn, _("Sum of squares"));
96     n = g_utf8_strlen(_("df"), -1);
97     bufspace(c2 + 1 - n, prn);
98     pputs(prn, _("df"));
99     n = g_utf8_strlen(_("Mean square"), -1);
100     bufspace(c3 + 1 - n, prn);
101     pputs(prn, _("Mean square"));
102     pputs(prn, "\n\n");
103     c1 = 16;
104 
105     /* Mean Square, treatment */
106     msr = v->SSTr / dftreat;
107     /* string left-aligned with initial offset of 2 */
108     n = g_utf8_strlen(_("Treatment"), -1);
109     bufspace(2, prn);
110     pputs(prn, _("Treatment"));
111     bufspace(16 - n, prn);
112     pprintf(prn, " %*g %*d %*g\n", c1, v->SSTr, c2, dftreat, c3, msr);
113 
114     if (dfblock > 0) {
115 	/* Mean Square, block */
116 	double msb = v->SSB / dfblock;
117 
118 	/* string left-aligned with initial offset of 2 */
119 	n = g_utf8_strlen(_("Block"), -1);
120 	bufspace(2, prn);
121 	pputs(prn, _("Block"));
122 	bufspace(16 - n, prn);
123 	pprintf(prn, " %*g %*d %*g\n", c1, v->SSB, c2, dfblock, c3, msb);
124     }
125 
126     /* Mean Square, errors */
127     mse = v->SSE / dferr;
128     /* string left-aligned with initial offset of 2 */
129     n = g_utf8_strlen(_("Residual"), -1);
130     bufspace(2, prn);
131     pputs(prn, _("Residual"));
132     bufspace(16 - n, prn);
133     pprintf(prn, " %*g %*d %*g\n", c1, v->SSE, c2, dferr, c3, mse);
134 
135     /* Mean Square, total */
136     mst = v->SST / dftotal;
137     /* string left-aligned with initial offset of 2 */
138     n = g_utf8_strlen(_("Total"), -1);
139     bufspace(2, prn);
140     pputs(prn, _("Total"));
141     bufspace(16 - n, prn);
142     pprintf(prn, " %*g %*d %*g\n", c1, v->SST, c2, dftotal, c3, mst);
143 
144     pputc(prn, '\n');
145 
146     if (na(v->F)) {
147 	pprintf(prn, "  F(%d, %d) = %g / %g (%s)\n\n", dftreat, dferr,
148 		msr, mse, _("undefined"));
149     } else {
150 	pprintf(prn, "  F(%d, %d) = %g / %g = %g ",
151 		dftreat, dferr, msr, mse, v->F);
152 	if (v->pval < .0001) {
153 	    pprintf(prn, "[%s %.3g]\n\n", _("p-value"), v->pval);
154 	} else if (!na(v->pval)) {
155 	    pprintf(prn, "[%s %.4f]\n\n", _("p-value"), v->pval);
156 	}
157     }
158 
159     return 0;
160 }
161 
162 /* one-way anova only: print the mean and standard deviation
163    of the response at each level of treatment */
164 
anova_print_means(struct anova * v,const double * xt,const double * y,double ybar,int t1,int t2,PRN * prn)165 static int anova_print_means (struct anova *v, const double *xt,
166 			      const double *y, double ybar,
167 			      int t1, int t2, PRN *prn)
168 {
169     double d, *csd = malloc(v->nt * sizeof *csd);
170     int c1, c2, c3, c4;
171     int i, t;
172 
173     if (csd == NULL) {
174 	return E_ALLOC;
175     }
176 
177     for (i=0; i<v->nt; i++) {
178 	csd[i] = 0.0;
179     }
180 
181     for (t=t1; t<=t2; t++) {
182 	if (!na(xt[t]) && !na(y[t])) {
183 	    for (i=0; i<v->nt; i++) {
184 		if (xt[t] == v->tvals->val[i]) {
185 		    d = y[t] - v->cmeans[i];
186 		    csd[i] += d * d;
187 		    break;
188 		}
189 	    }
190 	}
191     }
192 
193     c1 = g_utf8_strlen(_("Level"), -1);
194     c2 = g_utf8_strlen(_("n"), -1);
195     c3 = g_utf8_strlen(_("mean"), -1);
196     c4 = g_utf8_strlen(_("std. dev"), -1);
197 
198     c1 = (c1 < 8)? 8 : c1;
199     c2 = (c2 > 6)? c2 + 1 : (c2 < 6)? 6 : c2;
200     c3 = (c3 > 10)? c3 + 1 : (c3 < 10)? 10 : c3;
201     c4 = (c4 > 12)? c4 + 1 : (c4 < 12)? 12 : c4;
202 
203     pprintf(prn, "  %-*s %*s %*s %*s\n\n", c1, _("Level"), c2, _("n"),
204 	    c3, _("mean"), c4, _("std. dev"));
205 
206     for (i=0; i<v->nt; i++) {
207 	if (v->ccount[i] > 1) {
208 	    csd[i] /= v->ccount[i] - 1;
209 	    csd[i] = sqrt(csd[i]);
210 	    pprintf(prn, "  %-*g %*d %*g %#*.5g\n", c1, v->tvals->val[i],
211 		    c2, v->ccount[i], c3, v->cmeans[i], c4, csd[i]);
212 	} else {
213 	    pprintf(prn, "  %-*g %*d %*g %*s\n", c1, v->tvals->val[i],
214 		    c2, v->ccount[i], c3, v->cmeans[i], c4, "NA");
215 	}
216     }
217 
218     pprintf(prn, "\n  %s = %g\n\n", _("Grand mean"), ybar);
219 
220     free(csd);
221 
222     return 0;
223 }
224 
225 /* allocate arrays to hold the valid, in-sample values of the
226    treatment and block variables */
227 
anova_make_arrays(const double * xb,struct anova * v)228 static int anova_make_arrays (const double *xb, struct anova *v)
229 {
230     int err = 0;
231 
232     v->tvec = malloc(v->n * sizeof *v->tvec);
233 
234     if (v->tvec == NULL) {
235 	err = E_ALLOC;
236     } else if (xb != NULL) {
237 	v->bvec = malloc(v->n * sizeof *v->bvec);
238 	if (v->bvec == NULL) {
239 	    free(v->tvec);
240 	    v->tvec = NULL;
241 	    err = E_ALLOC;
242 	}
243     }
244 
245     return err;
246 }
247 
248 /* construct vectors holding the distinct values of the
249    treatment and block variables */
250 
anova_make_value_vecs(struct anova * v)251 static int anova_make_value_vecs (struct anova *v)
252 {
253     int err = 0;
254 
255     v->tvals = gretl_matrix_values(v->tvec, v->n, OPT_S, &err);
256 
257     if (!err && v->tvals->rows < 2) {
258 	gretl_errmsg_set("Insufficient observations");
259 	err = E_DATA;
260     }
261 
262     if (!err && v->bvec != NULL) {
263 	v->bvals = gretl_matrix_values(v->bvec, v->n, OPT_S, &err);
264 	if (!err && v->bvals->rows < 2) {
265 	    gretl_errmsg_set("Insufficient observations");
266 	    err = E_DATA;
267 	}
268     }
269 
270     if (!err) {
271 	v->nt = v->tvals->rows;
272 	if (v->bvals != NULL) {
273 	    v->nb = v->bvals->rows;
274 	}
275     }
276 
277     return err;
278 }
279 
280 /* allocate and initialize arrays to calculate the column
281    and row means of the ANOVA table */
282 
anova_accounting_arrays(struct anova * v)283 static int anova_accounting_arrays (struct anova *v)
284 {
285     int i;
286 
287     v->cmeans = malloc(v->nt * sizeof *v->cmeans);
288     v->ccount = malloc(v->nt * sizeof *v->ccount);
289 
290     if (v->cmeans == NULL || v->ccount == NULL) {
291 	return E_ALLOC;
292     }
293 
294     for (i=0; i<v->nt; i++) {
295 	v->cmeans[i] = 0.0;
296 	v->ccount[i] = 0;
297     }
298 
299     if (v->nb > 0) {
300 	v->rmeans = malloc(v->nb * sizeof *v->rmeans);
301 	v->rcount = malloc(v->nb * sizeof *v->rcount);
302 
303 	if (v->rmeans == NULL || v->rcount == NULL) {
304 	    return E_ALLOC;
305 	}
306 
307 	for (i=0; i<v->nb; i++) {
308 	    v->rmeans[i] = 0.0;
309 	    v->rcount[i] = 0;
310 	}
311     }
312 
313     return 0;
314 }
315 
anova_add_F_stat(struct anova * v)316 static void anova_add_F_stat (struct anova *v)
317 {
318     int dfn = v->nt - 1;
319     int dfd = v->n - 1 - dfn;
320     double MSE, MSTr = v->SSTr / dfn;
321 
322     if (v->nb > 0) {
323 	dfd -= v->nb - 1;
324     }
325 
326     MSE = v->SSE / dfd;
327 
328     if (MSE > 0) {
329 	v->F = MSTr / MSE;
330 	v->pval = snedecor_cdf_comp(dfn, dfd, v->F);
331     }
332 
333     record_test_result(v->F, v->pval);
334 }
335 
336 #define anova_obs_ok(y,x,z,t) (!na(y[t]) && !na(x[t]) && \
337                                (z == NULL || !na(z[t])))
338 
339 /* For one-way anova @list contains response and treatment; for
340    two-way it should in addition contain the block variable.
341    @opt can contain OPT_Q to suppress printing.
342 */
343 
gretl_anova(const int * list,const DATASET * dset,gretlopt opt,PRN * prn)344 int gretl_anova (const int *list, const DATASET *dset,
345 		 gretlopt opt, PRN *prn)
346 {
347     struct anova v;
348     const double *y, *xt, *xb;
349     double ybar, dev;
350     int i, t, t1, t2;
351     int missvals = 0;
352     int err = 0;
353 
354     if (list[0] < 2 || list[0] > 3) {
355 	return E_DATA;
356     }
357 
358     anova_init(&v);
359 
360     t1 = dset->t1;
361     t2 = dset->t2;
362     list_adjust_sample(list, &t1, &t2, dset, &missvals);
363 
364     v.n = t2 - t1 + 1 - missvals;
365     if (v.n < 2) {
366 	return E_TOOFEW;
367     }
368 
369     y = dset->Z[list[1]];
370     xt = dset->Z[list[2]];
371     xb = (list[0] == 3)? dset->Z[list[3]] : NULL;
372 
373     /* check that treatment (and block, if present) are discrete */
374 
375     if (!accept_as_discrete(dset, list[2], 0)) {
376 	gretl_errmsg_set(_("anova: the treatment variable must be discrete"));
377 	err = E_DATA;
378     } else if (xb != NULL && !accept_as_discrete(dset, list[3], 0)) {
379 	gretl_errmsg_set(_("anova: the block variable must be discrete"));
380 	err = E_DATA;
381     }
382 
383     if (err) {
384 	return err;
385     }
386 
387     v.n = 0;
388     for (t=t1; t<=t2; t++) {
389 	if (anova_obs_ok(y, xt, xb, t)) {
390 	    v.n += 1;
391 	}
392     }
393 
394     if (v.n < 2) {
395 	return E_TOOFEW;
396     }
397 
398     err = anova_make_arrays(xb, &v);
399     if (err) {
400 	return err;
401     }
402 
403     /* fill tvec and bvec; calculate grand mean */
404 
405     ybar = 0.0;
406     i = 0;
407     for (t=t1; t<=t2; t++) {
408 	if (anova_obs_ok(y, xt, xb, t)) {
409 	    v.tvec[i] = xt[t];
410 	    ybar += y[t];
411 	    if (v.bvec != NULL) {
412 		v.bvec[i] = xb[t];
413 	    }
414 	    i++;
415 	}
416     }
417 
418     ybar /= v.n;
419 
420     err = anova_make_value_vecs(&v);
421     if (err) {
422 	goto bailout;
423     }
424 
425     err = anova_accounting_arrays(&v);
426     if (err) {
427 	goto bailout;
428     }
429 
430     /* find column (treatment) means */
431 
432     for (t=t1; t<=t2; t++) {
433 	if (anova_obs_ok(y, xt, xb, t)) {
434 	    dev = y[t] - ybar;
435 	    v.SST += dev * dev;
436 	    for (i=0; i<v.nt; i++) {
437 		if (xt[t] == v.tvals->val[i]) {
438 		    v.cmeans[i] += y[t];
439 		    v.ccount[i] += 1;
440 		    break;
441 		}
442 	    }
443 	}
444     }
445 
446     for (i=0; i<v.nt; i++) {
447 	v.cmeans[i] /= v.ccount[i];
448     }
449 
450     /* sums of squares */
451 
452     if (v.nb > 0) {
453 	/* two-way ANOVA */
454 	for (t=t1; t<=t2; t++) {
455 	    if (anova_obs_ok(y, xt, xb, t)) {
456 		for (i=0; i<v.nb; i++) {
457 		    if (xb[t] == v.bvals->val[i]) {
458 			v.rmeans[i] += y[t];
459 			v.rcount[i] += 1;
460 			break;
461 		    }
462 		}
463 	    }
464 	}
465 
466 	for (i=0; i<v.nb; i++) {
467 	    v.rmeans[i] /= v.rcount[i];
468 	    dev = v.rmeans[i] - ybar;
469 	    v.SSB += dev * dev * v.rcount[i];
470 	}
471 
472 	for (i=0; i<v.nt; i++) {
473 	    dev = v.cmeans[i] - ybar;
474 	    v.SSTr += dev * dev * v.ccount[i];
475 	}
476 
477 	v.SSE = v.SST - v.SSTr - v.SSB;
478     } else {
479 	/* one-way ANOVA */
480 	for (t=t1; t<=t2; t++) {
481 	    if (!na(xt[t]) && !na(y[t])) {
482 		for (i=0; i<v.nt; i++) {
483 		    if (xt[t] == v.tvals->val[i]) {
484 			dev = y[t] - v.cmeans[i];
485 			v.SSE += dev * dev;
486 			break;
487 		    }
488 		}
489 	    }
490 	}
491 	v.SSTr = v.SST - v.SSE;
492     }
493 
494     anova_add_F_stat(&v);
495 
496     if (!(opt & OPT_Q)) {
497 	const char *yname = dset->varname[list[1]];
498 	const char *tname = dset->varname[list[2]];
499 
500 	pputc(prn, '\n');
501 	pprintf(prn, _("%s, response = %s, treatment = %s:"),
502 		_("Analysis of Variance"), yname, tname);
503 
504 	err = print_anova(&v, prn);
505 
506 	if (!err && v.nb == 0) {
507 	    anova_print_means(&v, xt, y, ybar, t1, t2, prn);
508 	}
509     }
510 
511  bailout:
512 
513     anova_free(&v);
514 
515     return err;
516 }
517