1 /*
2  *  gretl -- Gnu Regression, Econometrics and Time-series Library
3  *  Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 /* panel estimation and dataset handling for gretl */
21 
22 #include "libgretl.h"
23 #include "gretl_f2c.h"
24 #include "clapack_double.h"
25 #include "gretl_model.h"
26 #include "gretl_panel.h"
27 #include "libset.h"
28 #include "uservar.h"
29 #include "gretl_string_table.h"
30 #include "matrix_extra.h" /* for testing */
31 
32 /**
33  * SECTION:gretl_panel
34  * @short_description: estimation of panel data models
35  * @title: Panel data
36  * @include: libgretl.h
37  *
38  * Provides support for estimating panel data models
39  * such as fixed and random effects.
40  */
41 
42 #define PDEBUG 0
43 
44 enum vcv_ops {
45     VCV_INIT,
46     VCV_SUBTRACT
47 };
48 
49 typedef struct unit_ts_ unit_ts;
50 
51 struct unit_ts_ {
52     int *t0;  /* start of longest contiguous time series, per unit */
53     int *T;   /* length of such sequence, per unit */
54 };
55 
56 typedef struct panelmod_t_ panelmod_t;
57 
58 struct panelmod_t_ {
59     gretlopt opt;         /* option flags */
60     int nunits;           /* total cross-sectional units in sample range */
61     int effn;             /* effective (included) cross-section units */
62     int T;                /* times-series length of panel */
63     int Tmax;             /* effective times-series length (max usable obs per unit) */
64     int Tmin;             /* shortest usable times-series */
65     double Tbar;          /* harmonic mean of per-unit time-series lengths */
66     int NT;               /* total observations used (based on pooled model) */
67     int ntdum;            /* number of time dummies added */
68     int *unit_obs;        /* array of number of observations per x-sect unit */
69     char *varying;        /* array to record properties of pooled-model regressors */
70     int *vlist;           /* list of time-varying variables from pooled model */
71     int balanced;         /* 1 if the model dataset is balanced, else 0 */
72     int dfH;              /* number of coeffs for Hausman test */
73     int Fdfn;             /* numerator df, F for differing intercepts */
74     double Fdfd;          /* denominator df, F for differing intercepts */
75     double theta;         /* quasi-demeaning coefficient */
76     double theta_bar;     /* mean of theta_i (unbalanced case) */
77     double Ffe;           /* joint significance of fixed effects */
78     double BP;            /* Breusch-Pagan test statistic */
79     double H;             /* Hausman test statistic */
80     gretl_matrix *bdiff;  /* array of coefficient differences */
81     gretl_matrix *Sigma;  /* Hausman covariance matrix */
82     double s2b;           /* residual variance, group means regression */
83     double s2e;           /* residual variance, fixed-effects regression */
84     double s2v;           /* estimate of between variance */
85     double ubPub;         /* for use in unbalanced Swamy-Arora */
86     double dw;            /* Durbin-Watson, for transfer to random effects */
87     double rho;           /* for transfer to random effects */
88     int *small2big;       /* data indexation array */
89     int *big2small;       /* reverse data indexation array */
90     MODEL *pooled;        /* reference model (pooled OLS) */
91     MODEL *realmod;       /* fixed or random effects model */
92     double *re_uhat;      /* "fixed" random-effects residuals */
93 };
94 
95 struct {
96     int n;      /* number of cross-sectional units */
97     int T;      /* number of observations per unit */
98     int offset; /* sampling offset into full dataset */
99 } panidx;
100 
101 /* for testing purposes */
102 int stata_sa;
103 int IGLS;
104 char glsmat[MAXLEN];
105 
106 static int varying_vars_list (const DATASET *dset, panelmod_t *pan);
107 static void calculate_Tbar (panelmod_t *pan);
108 
109 /* translate from (i = unit, t = time period for that unit) to
110    overall 0-based index into the data set */
111 #define panel_index(i,t) (i * panidx.T + t + panidx.offset)
112 
113 #define panel_missing(p, t) (na(p->pooled->uhat[t]))
114 
115 
116 static void
panel_index_init(const DATASET * dset,int nunits,int T)117 panel_index_init (const DATASET *dset, int nunits, int T)
118 {
119     panidx.n = nunits;
120     panidx.T = T;
121     panidx.offset = dset->t1;
122 
123 #if PDEBUG
124     fprintf(stderr, "panel_index_init: n=%d, T=%d, offset=%d\n",
125 	    panidx.n, panidx.T, panidx.offset);
126 #endif
127 }
128 
129 /* Allocate the indexation arrays that allow us to translate between
130    the full-length data array and the possibly shorter array used for
131    transformed data (de-meaned or quasi-demeaned).
132 */
133 
allocate_data_finders(panelmod_t * pan,int bign)134 static int allocate_data_finders (panelmod_t *pan, int bign)
135 {
136     int s;
137 
138     if (pan->small2big != NULL) {
139 	/* already done */
140 	return 0;
141     }
142 
143     pan->small2big = malloc(pan->NT * sizeof *pan->small2big);
144     pan->big2small = malloc(bign * sizeof *pan->big2small);
145 
146     if (pan->small2big == NULL || pan->big2small == NULL) {
147 	return E_ALLOC;
148     }
149 
150     for (s=0; s<bign; s++) {
151 	pan->big2small[s] = -1;
152     }
153 
154     return 0;
155 }
156 
157 #define small_index(p,t) ((p->big2small == NULL)? t : p->big2small[t])
158 #define big_index(p,t)   ((p->small2big == NULL)? t : p->small2big[t])
159 
panelmod_init(panelmod_t * pan)160 static void panelmod_init (panelmod_t *pan)
161 {
162     pan->nunits = 0;
163     pan->effn = 0;
164     pan->T = 0;
165     pan->Tmax = 0;
166     pan->Tmin = 0;
167     pan->Tbar = 0;
168     pan->NT = 0;
169     pan->ntdum = 0;
170     pan->unit_obs = NULL;
171     pan->varying = NULL;
172     pan->vlist = NULL;
173     pan->opt = OPT_NONE;
174 
175     pan->balanced = 1;
176     pan->theta = NADBL;
177     pan->theta_bar = NADBL;
178     pan->ubPub = NADBL;
179     pan->dw = NADBL;
180     pan->rho = NADBL;
181 
182     pan->Ffe = NADBL;
183     pan->Fdfn = 0;
184     pan->Fdfd = 0;
185 
186     pan->BP = NADBL;
187     pan->H = NADBL;
188     pan->dfH = 0;
189 
190     pan->bdiff = NULL;
191     pan->Sigma = NULL;
192 
193     pan->small2big = NULL;
194     pan->big2small = NULL;
195 
196     pan->pooled = NULL;
197     pan->realmod = NULL;
198     pan->re_uhat = NULL;
199 }
200 
panelmod_free(panelmod_t * pan)201 static void panelmod_free (panelmod_t *pan)
202 {
203     free(pan->unit_obs);
204     free(pan->varying);
205     free(pan->vlist);
206 
207     gretl_matrix_free(pan->bdiff);
208     gretl_matrix_free(pan->Sigma);
209 
210     free(pan->small2big);
211     free(pan->big2small);
212     free(pan->re_uhat);
213 
214     free(pan->realmod);
215 }
216 
217 /* test variable number v against the (possibly reduced) regression
218    list which contains only time-varying regressors
219 */
220 
var_is_varying(const panelmod_t * pan,int v)221 static int var_is_varying (const panelmod_t *pan, int v)
222 {
223     if (v != 0) {
224 	int i;
225 
226 	for (i=2; i<=pan->vlist[0]; i++) {
227 	    if (pan->vlist[i] == v) {
228 		return 1;
229 	    }
230 	}
231     }
232 
233     return 0;
234 }
235 
236 /* retrieve X'X^{-1} from the pooled or fixed effects model */
237 
panel_model_xpxinv(MODEL * pmod,int * err)238 static gretl_matrix *panel_model_xpxinv (MODEL *pmod, int *err)
239 {
240     gretl_matrix *X;
241     int k = pmod->ncoeff;
242     double x;
243     int i, j, m;
244 
245 #if 0
246     fprintf(stderr, "panel_model_xpxinv...\n");
247 #endif
248 
249     if (gretl_model_get_int(pmod, "vcv_xpx")) {
250 	/* already done */
251 	gretl_model_set_int(pmod, "vcv_xpx", 0);
252     } else if (pmod->vcv != NULL) {
253 	double s2 = pmod->sigma * pmod->sigma;
254 	int nv = (k * k + k) / 2;
255 
256 	for (i=0; i<nv; i++) {
257 	    pmod->vcv[i] /= s2;
258 	}
259     } else {
260 	*err = makevcv(pmod, 1.0);
261 	if (*err) {
262 	    return NULL;
263 	}
264 	gretl_model_set_int(pmod, "vcv_xpx", 1);
265     }
266 
267     X = gretl_matrix_alloc(k, k);
268     if (X == NULL) {
269 	*err = E_ALLOC;
270 	return NULL;
271     }
272 
273     m = 0;
274     for (j=0; j<k; j++) {
275 	for (i=j; i<k; i++) {
276 	    x = pmod->vcv[m++];
277 	    gretl_matrix_set(X, i, j, x);
278 	    gretl_matrix_set(X, j, i, x);
279 	}
280     }
281 
282 #if 0
283     fprintf(stderr, "pmod->sigma = %g\n", pmod->sigma);
284     printlist(pmod->list, "pmod->list");
285     gretl_matrix_print(X, "panel (X'X)^{-1}");
286 #endif
287 
288     return X;
289 }
290 
291 /* Beck and Katz, as outlined in Greene.  We offer the following
292    for pooled OLS and FE.  Greene writes (in effect):
293 
294    Var(b) = A^{-1} W A^{-1}
295 
296    where A = \sum_{i=1}^n X'_i X_i (called "XX" below)
297          W = \sum_{i=1}^n \sum_{j=1}^n \sigma_{ij} X'_{i} X_{j}
298 
299    and \sigma_{ij} is estimated as (1/T) \sum_{t=1}^T e_i e_j,
300    with the e's being OLS (or FE) residuals.
301 
302    In computing this, we need to check that W is positive definite:
303    this seems not to be guaranteed for unbalanced panels.
304 */
305 
306 static int
beck_katz_vcv(MODEL * pmod,panelmod_t * pan,const DATASET * dset,gretl_matrix * XX,gretl_matrix * W,gretl_matrix * V)307 beck_katz_vcv (MODEL *pmod, panelmod_t *pan, const DATASET *dset,
308 	       gretl_matrix *XX, gretl_matrix *W, gretl_matrix *V)
309 {
310     gretl_matrix *Xi = NULL;
311     gretl_matrix *Xj = NULL;
312     int T = pan->T;
313     int k = pmod->ncoeff;
314     int s, si, sj;
315     int i, j, p, v, t;
316     int err = 0;
317 
318     Xi = gretl_matrix_alloc(pan->Tmax, k);
319     Xj = gretl_matrix_alloc(pan->Tmax, k);
320 
321     if (Xi == NULL || Xj == NULL) {
322 	err = E_ALLOC;
323 	goto bailout;
324     }
325 
326     for (i=0; i<pan->nunits; i++) {
327 	int Ti = pan->unit_obs[i];
328 	double sii = 0.0;
329 
330 	if (Ti == 0) {
331 	    continue;
332 	}
333 
334 	Xi = gretl_matrix_reuse(Xi, Ti, k);
335 
336 	s = 0;
337 	for (t=0; t<T; t++) {
338 	    si = panel_index(i, t);
339 	    if (!na(pmod->uhat[si])) {
340 		sii += pmod->uhat[si] * pmod->uhat[si];
341 		si = small_index(pan, si);
342 		for (p=0; p<k; p++) {
343 		    v = pmod->list[p + 2];
344 		    gretl_matrix_set(Xi, s, p, dset->Z[v][si]);
345 		}
346 		s++;
347 	    }
348 	}
349 
350 	sii /= Ti;
351 	sii = sqrt(sii);
352 
353 	/* "diagonal" component of W */
354 	gretl_matrix_multiply_by_scalar(Xi, sii);
355 	gretl_matrix_multiply_mod(Xi, GRETL_MOD_TRANSPOSE,
356 				  Xi, GRETL_MOD_NONE,
357 				  W, GRETL_MOD_CUMULATE);
358 
359 	for (j=i+1; j<pan->nunits; j++) {
360 	    int Tij = 0;
361 	    double sij = 0.0;
362 
363 	    if (pan->unit_obs[j] == 0) {
364 		continue;
365 	    }
366 
367 	    /* count matching observations */
368 	    for (t=0; t<T; t++) {
369 		si = panel_index(i, t);
370 		sj = panel_index(j, t);
371 		if (!na(pmod->uhat[si]) && !na(pmod->uhat[sj])) {
372 		    sij += pmod->uhat[si] * pmod->uhat[sj];
373 		    Tij++;
374 		}
375 	    }
376 
377 	    if (Tij == 0) {
378 		continue;
379 	    }
380 
381 	    sij /= Tij;
382 	    Xi = gretl_matrix_reuse(Xi, Tij, k);
383 	    Xj = gretl_matrix_reuse(Xj, Tij, k);
384 
385 	    s = 0;
386 	    for (t=0; t<T; t++) {
387 		si = panel_index(i, t);
388 		sj = panel_index(j, t);
389 		if (!na(pmod->uhat[si]) && !na(pmod->uhat[sj])) {
390 		    si = small_index(pan, si);
391 		    sj = small_index(pan, sj);
392 		    for (p=0; p<k; p++) {
393 			v = pmod->list[p + 2];
394 			gretl_matrix_set(Xi, s, p, dset->Z[v][si]);
395 			gretl_matrix_set(Xj, s, p, dset->Z[v][sj]);
396 		    }
397 		    s++;
398 		}
399 	    }
400 
401 	    /* cumulate s_ij * Xi'Xj into W (using V as storage) */
402 	    gretl_matrix_multiply_by_scalar(Xi, sij);
403 	    gretl_matrix_multiply_mod(Xi, GRETL_MOD_TRANSPOSE,
404 				      Xj, GRETL_MOD_NONE,
405 				      V, GRETL_MOD_NONE);
406 	    gretl_matrix_add_to(W, V);
407 	    /* and take in s_ij * Xj'Xi */
408 	    gretl_matrix_add_transpose_to(W, V);
409 	}
410     }
411 
412     /* check that the middle term is p.d. */
413     gretl_matrix_copy_values(V, W);
414     err = gretl_matrix_cholesky_decomp(V);
415     if (err) {
416 	fprintf(stderr, "beck_katz_vcv: matrix W is not p.d.\n");
417 	gretl_model_set_int(pmod, "panel_bk_failed", 1);
418 	goto bailout;
419     }
420 
421     /* form V = (Xi'Xi)^{-1} W (Xi'Xi)^{-1} */
422     gretl_matrix_qform(XX, GRETL_MOD_NONE, W,
423 		       V, GRETL_MOD_NONE);
424 
425     gretl_model_set_vcv_info(pmod, VCV_PANEL, PANEL_BK);
426 
427  bailout:
428 
429     gretl_matrix_free(Xi);
430     gretl_matrix_free(Xj);
431 
432     return err;
433 }
434 
435 /* HAC covariance matrix for pooled, fixed- or random-effects models,
436    given "fixed T and large N".  In the case of "large T" a different
437    form is needed for robustness in respect of autocorrelation.  See
438    Arellano, "Panel Data Econometrics" (Oxford, 2003), pages 18-19.
439 
440    In the case of fixed or random effects there should be no missing
441    values, since we created a special dataset purged of same. But
442    with the pooled model we need to index into the full dataset, and
443    work around any missing values.
444 */
445 
446 static int
arellano_vcv(MODEL * pmod,panelmod_t * pan,const DATASET * dset,gretl_matrix * XX,gretl_matrix * W,gretl_matrix * V)447 arellano_vcv (MODEL *pmod, panelmod_t *pan, const DATASET *dset,
448 	      gretl_matrix *XX, gretl_matrix *W, gretl_matrix *V)
449 {
450     gretl_vector *e = NULL;
451     gretl_matrix *Xi = NULL;
452     gretl_vector *eXi = NULL;
453     int T = pan->Tmax;
454     int k = pmod->ncoeff;
455     double Nfac = 1.0;
456     int i, j, v, s, t;
457     int err = 0;
458 
459     e   = gretl_column_vector_alloc(T);
460     Xi  = gretl_matrix_alloc(T, k);
461     eXi = gretl_vector_alloc(k);
462 
463     if (e == NULL || Xi == NULL || eXi == NULL) {
464 	err = E_ALLOC;
465 	goto bailout;
466     }
467 
468     if (getenv("ARELLANO_ASY") != NULL) {
469 	/* don't do the @Nfac thing */
470 	fprintf(stderr, "skipping Cameron-Miller adjustment\n");
471     } else {
472 	/* Small N adjustment factor: "reduce downward bias in
473 	   case of finite N" (Cameron and Miller, "A Practitioner's
474 	   Guide to Cluster-Robust Inference", Journal of Human
475 	   Resources, Spring 2015).
476 	*/
477 	if (pmod->ci != IVREG) {
478 	    /* FIXME IVREG? */
479 	    Nfac = pan->effn / (pan->effn - 1.0);
480 	    Nfac *= (pmod->nobs - 1.0) / (pmod->nobs - k);
481 	    Nfac = sqrt(Nfac);
482 	}
483 #if 0
484 	fprintf(stderr, "Nfac1 = %d/%d = %g\n", pan->effn, pan->effn-1,
485 		pan->effn / (pan->effn - 1.0));
486 	fprintf(stderr, "Nfac2 = %d/%d = %g\n", pmod->nobs-1, pmod->nobs-k,
487 		(pmod->nobs - 1.0) / (pmod->nobs - k));
488 	fprintf(stderr, "Nfac final = %g\n", Nfac);
489 #endif
490     }
491 
492     for (i=0; i<pan->nunits; i++) {
493 	int Ti = pan->unit_obs[i];
494 	int p = 0;
495 
496 	if (Ti == 0) {
497 	    continue;
498 	}
499 
500 	e = gretl_matrix_reuse(e, Ti, 1);
501 	Xi = gretl_matrix_reuse(Xi, Ti, k);
502 
503 	for (t=0; t<pan->T; t++) {
504 	    s = panel_index(i, t);
505 	    if (na(pmod->uhat[s])) {
506 		continue;
507 	    }
508 	    gretl_vector_set(e, p, Nfac * pmod->uhat[s]);
509 	    s = small_index(pan, s);
510 	    for (j=0; j<k; j++) {
511 		v = pmod->list[j+2];
512 		if (s < 0) {
513 		    gretl_matrix_set(Xi, p, j, 0);
514 		} else {
515 		    gretl_matrix_set(Xi, p, j, dset->Z[v][s]);
516 		}
517 	    }
518 	    p++;
519 	}
520 
521 	gretl_matrix_multiply_mod(e, GRETL_MOD_TRANSPOSE,
522 				  Xi, GRETL_MOD_NONE,
523 				  eXi, GRETL_MOD_NONE);
524 	gretl_matrix_multiply_mod(eXi, GRETL_MOD_TRANSPOSE,
525 				  eXi, GRETL_MOD_NONE,
526 				  W, GRETL_MOD_CUMULATE);
527     }
528 
529     /* form V(b) = (X'X)^{-1} W (X'X)^{-1} */
530     gretl_matrix_qform(XX, GRETL_MOD_NONE, W,
531 		       V, GRETL_MOD_NONE);
532 
533 #if 0
534     gretl_matrix_print(XX, "X'X^{-1}");
535     gretl_matrix_print(W, "W");
536     gretl_matrix_print(V, "V");
537 #endif
538 
539     gretl_model_set_vcv_info(pmod, VCV_PANEL, PANEL_HAC);
540 
541  bailout:
542 
543     gretl_matrix_free(e);
544     gretl_matrix_free(Xi);
545     gretl_matrix_free(eXi);
546 
547     return err;
548 }
549 
550 /* common setup for Arellano and Beck-Katz VCV estimators
551    (note that pooled OLS with the --cluster option is
552    handled separately)
553 */
554 
555 static int
panel_robust_vcv(MODEL * pmod,panelmod_t * pan,const DATASET * dset)556 panel_robust_vcv (MODEL *pmod, panelmod_t *pan, const DATASET *dset)
557 {
558     gretl_matrix *W = NULL;
559     gretl_matrix *V = NULL;
560     gretl_matrix *XX = NULL;
561     int k = pmod->ncoeff;
562     int err = 0;
563 
564     W = gretl_zero_matrix_new(k, k);
565     V = gretl_matrix_alloc(k, k);
566 
567     if (W == NULL || V == NULL) {
568 	err = E_ALLOC;
569 	goto bailout;
570     }
571 
572     XX = panel_model_xpxinv(pmod, &err);
573     if (err) {
574 	fprintf(stderr, "panel_robust_vcv: failed at panel_model_xpxinv\n");
575 	goto bailout;
576     }
577 
578     /* call the appropriate function */
579     if (libset_get_bool(USE_PCSE)) {
580 	err = beck_katz_vcv(pmod, pan, dset, XX, W, V);
581     } else {
582 	err = arellano_vcv(pmod, pan, dset, XX, W, V);
583     }
584 
585     if (!err) {
586 	int i, j, s = 0;
587 
588 	for (i=0; i<k; i++) {
589 	    for (j=i; j<k; j++) {
590 		pmod->vcv[s++] = gretl_matrix_get(V, i, j);
591 	    }
592 	    pmod->sderr[i] = sqrt(gretl_matrix_get(V, i, i));
593 	}
594     }
595 
596  bailout:
597 
598     gretl_matrix_free(W);
599     gretl_matrix_free(V);
600     gretl_matrix_free(XX);
601 
602     if (err && pmod->vcv != NULL) {
603 	free(pmod->vcv);
604 	pmod->vcv = NULL;
605     }
606 
607     return err;
608 }
609 
femod_regular_vcv(MODEL * pmod)610 static void femod_regular_vcv (MODEL *pmod)
611 {
612     if (pmod->vcv == NULL) {
613 	/* estimated via Cholesky: no vcv yet */
614 	makevcv(pmod, pmod->sigma);
615     } else {
616 	/* estimated via QR: "vcv" = (X'X)^{-1} */
617 	int i, k = pmod->ncoeff;
618 	int n = k * (k + 1) / 2;
619 	double s2 = pmod->sigma * pmod->sigma;
620 
621 	for (i=0; i<n; i++) {
622 	    pmod->vcv[i] *= s2;
623 	}
624     }
625 }
626 
627 #if 0 /* not ready yet */
628 
629 /* See Cameron and Trivedi, Microeconometrics, 21.3.4 */
630 
631 static int panel_autocorr_1 (MODEL *pmod, const panelmod_t *pan)
632 {
633     double *ubar;
634     double *ctt, *css, *cst;
635     double uit, uis, rho;
636     int i, t, ti;
637     int n = 0;
638 
639     ubar = malloc(pan->T * sizeof *ubar);
640     ctt = malloc(pan->T * sizeof *ctt);
641     css = malloc(pan->T * sizeof *css);
642     cst = malloc(pan->T * sizeof *cst);
643 
644     if (ubar == NULL) {
645 	return E_ALLOC;
646     }
647 
648     for (t=0; t<pan->T; t++) {
649 	ubar[t] = 0.0;
650 	ctt[t] = css[t] = cst[t] = 0.0;
651     }
652 
653     for (i=0; i<pan->nunits; i++) {
654 	if (pan->unit_obs[i] > 0) {
655 	    for (t=0; t<pan->T; t++) {
656 		ti = panel_index(i, t);
657 		uit = pmod->uhat[ti];
658 		if (!na(uit)) {
659 		    ubar[t] += uit;
660 		}
661 	    }
662 	    n++;
663 	}
664     }
665 
666     for (t=0; t<pan->T; t++) {
667 	ubar[t] /= n;
668     }
669 
670     for (i=0; i<pan->nunits; i++) {
671 	if (pan->unit_obs[i] > 0) {
672 	    for (t=0; t<pan->T; t++) {
673 		ti = panel_index(i, t);
674 		uit = pmod->uhat[ti];
675 		if (t > 0) {
676 		    uis = pmod->uhat[ti-1];
677 		} else {
678 		    uis = NADBL;
679 		}
680 		if (!na(uit)) {
681 		    ctt[t] += (uit - ubar[t]) * (uit - ubar[t]);
682 		}
683 		if (!na(uis)) {
684 		    css[t] += (uis - ubar[t-1]) * (uis - ubar[t-1]);
685 		}
686 		if (!na(uis) && !na(uit)) {
687 		    cst[t] += (uis - ubar[t-1]) * (uit - ubar[t-1]);
688 		}
689 	    }
690 	}
691     }
692 
693     for (t=1; t<pan->T; t++) {
694 	ctt[t] /= (n - 1);
695 	css[t] /= (n - 1);
696 	cst[t] /= (n - 1);
697 	rho = cst[t] / sqrt(css[t] * ctt[t]);
698 	fprintf(stderr, "rho(%d) = %g\n", t, rho);
699     }
700 
701     free(ubar);
702     free(ctt);
703     free(cst);
704     free(css);
705 
706     return 0;
707 }
708 
709 #endif
710 
711 #define DWPVAL_TESTING 1
712 
panel_DW_pval_ok(const MODEL * pmod)713 int panel_DW_pval_ok (const MODEL *pmod)
714 {
715     if (na(pmod->dw)) {
716 	return 0;
717     } else {
718 #if DWPVAL_TESTING
719 	return 1;
720 #else
721 	int Tmax = gretl_model_get_int(pmod, "Tmax");
722 	int Tmin = gretl_model_get_int(pmod, "Tmin");
723 
724 	/* Too restrictive? Or not restrictive enough? */
725 	return Tmax == Tmin;
726 #endif
727     }
728 }
729 
730 /* See Bhargava, Franzini and Narendranathan, "Serial Correlation and
731    the Fixed Effects Model", Review of Economic Studies 49, 1982,
732    page 536. Strictly speaking what's calculated here is the marginal
733    significance level of the DW stat when considered as a d_L value.
734    It's unclear if this method can really be extended to unbalanced
735    panels.
736 */
737 
BFN_panel_DW_pvalue(MODEL * pmod,const DATASET * dset,int * err)738 double BFN_panel_DW_pvalue (MODEL *pmod, const DATASET *dset, int *err)
739 {
740     gretl_matrix *lam = NULL;
741     double r, pv, lamq, sinarg, pi_2T;
742     int T = gretl_model_get_int(pmod, "Tmax");
743     int N = gretl_model_get_int(pmod, "n_included_units");
744     int nlam, k = pmod->ncoeff;
745     int i, q;
746 
747     if (pmod->ifc) {
748 	k--; /* don't include the constant */
749     }
750 
751     nlam = pmod->nobs - N - k;
752     lam = gretl_column_vector_alloc(nlam);
753     if (lam == NULL) {
754 	*err = E_ALLOC;
755 	return NADBL;
756     }
757 
758     pi_2T = M_PI / (2.0*T);
759     sinarg = sin(pi_2T);
760     lamq = 4 * sinarg * sinarg;
761     r = pmod->dw;
762 
763     q = 1;
764     for (i=0; i<nlam; i++) {
765 	lam->val[i] = lamq - r;
766 	if ((i+1) % N == 0) {
767 	    q++;
768 	    sinarg = sin(q*pi_2T);
769 	    lamq = 4 * sinarg * sinarg;
770 	}
771     }
772 
773     pv = imhof(lam, 0, err);
774     if (!*err) {
775 	if (pv < 0) {
776 	    pv = 0;
777 	}
778 	gretl_model_set_double(pmod, "dw_pval", pv);
779     }
780 
781 #if 0
782     fprintf(stderr, "DW: T=%d, Tmin=%d, N=%d, nlam=%d, DW=%g, pv=%g\n",
783 	    T, gretl_model_get_int(pmod, "Tmin"), N, nlam, pmod->dw, pv);
784 #endif
785 
786     gretl_matrix_free(lam);
787 
788     return pv;
789 }
790 
791 /* Durbin-Watson statistic for the pooled or fixed effects model.
792 
793    See Bhargava, Franzini and Narendranathan, "Serial Correlation and
794    the Fixed Effects Model", Review of Economic Studies 49, 1982,
795    pp. 533-549, and also Baltagi and Wu, "Unequally Spaced Panel Data
796    Regressions With AR(1) Disturbances", Econometric Theory, 15, 1999,
797    pp. 814-823 for discussion of the unbalanced case.
798 */
799 
panel_dwstat(MODEL * pmod,panelmod_t * pan)800 static void panel_dwstat (MODEL *pmod, panelmod_t *pan)
801 {
802     double dwnum = 0.0;
803     double rnum = 0.0;
804     double rden = 0.0;
805     double ut, u1;
806     int in_bounds = 1;
807     int i, t, s;
808 
809     pmod->dw = pmod->rho = NADBL;
810 
811     if (pmod->ess <= 0.0) {
812 	return;
813     }
814 
815 #if 0
816     fprintf(stderr, "DW: pmod=%p, pooled=%p, T=%d\n",
817 	    (void *) pmod, (void *) pan->pooled, pan->T);
818     fprintf(stderr, " t1=%d, t2=%d, nobs=%d, full_n=%d\n", pmod->t1,
819 	    pmod->t2, pmod->nobs, pmod->full_n);
820 #endif
821 
822     for (i=0; i<pan->nunits && in_bounds; i++) {
823 	int started = 0;
824 
825 	if (pan->unit_obs[i] == 0) {
826 	    continue;
827 	}
828 	for (t=0; t<pan->T; t++) {
829 	    s = panel_index(i, t);
830 	    if (s >= pmod->t2) {
831 		in_bounds = 0;
832 		break;
833 	    }
834 	    ut = pmod->uhat[s];
835 	    if (!na(ut)) {
836 		if (t == 0) {
837 		    started = 1;
838 		    continue;
839 		}
840 		u1 = pmod->uhat[s-1];
841 		if (na(u1)) {
842 		    /* implicitly take u1 as 0 */
843 		    if (started) {
844 			dwnum += ut * ut;
845 		    }
846 		} else {
847 		    dwnum += (ut - u1) * (ut - u1);
848 		    rnum += ut * u1;
849 		    rden += u1 * u1;
850 		}
851 		started = 1;
852 	    }
853 	}
854     }
855 
856     if (dwnum > 0.0) {
857 	pmod->dw = dwnum / pmod->ess;
858     }
859     if (na(pmod->rho) && rden > 0.0 && !na(rden)) {
860 	pmod->rho = rnum / rden;
861     }
862     if (!na(pmod->rho) && (pmod->rho <= -1.0 || pmod->rho >= 1.0)) {
863 	pmod->rho = NADBL;
864     }
865 
866     if (pan->opt & OPT_U) {
867 	/* make these stats available for random effects reporting */
868 	pan->dw = pmod->dw;
869 	pan->rho = pmod->rho;
870     }
871 }
872 
873 /* Allocate the arrays needed to perform the Hausman test,
874    in its matrix formulation.
875 */
876 
matrix_hausman_allocate(panelmod_t * pan)877 static int matrix_hausman_allocate (panelmod_t *pan)
878 {
879     int k = pan->vlist[0] - 2;
880     int err = 0;
881 
882     /* array to hold differences between coefficient estimates */
883     pan->bdiff = gretl_vector_alloc(k);
884     if (pan->bdiff == NULL) {
885 	err = E_ALLOC;
886     } else {
887 	/* array to hold covariance matrix */
888 	pan->Sigma = gretl_matrix_alloc(k, k);
889 	if (pan->Sigma == NULL) {
890 	    gretl_matrix_free(pan->bdiff);
891 	    pan->bdiff = NULL;
892 	    err = E_ALLOC;
893 	}
894     }
895 
896     if (!err) {
897 	pan->dfH = k;
898     }
899 
900     return err;
901 }
902 
real_varying_list(panelmod_t * pan)903 static int *real_varying_list (panelmod_t *pan)
904 {
905     int *vlist = gretl_list_copy(pan->vlist);
906     int i;
907 
908     if (vlist != NULL) {
909 	for (i=2; i<=vlist[0]; i++) {
910 	    if (vlist[i] == 0) {
911 		gretl_list_delete_at_pos(vlist, i);
912 		break;
913 	    }
914 	}
915     }
916 
917     return vlist;
918 }
919 
920 /* Construct a version of the dataset from which the group means are
921    subtracted, for the "within" regression.  Nota bene: this auxiliary
922    dataset is not necessarily of full length: missing observations
923    are skipped.
924 */
925 
within_groups_dataset(const DATASET * dset,panelmod_t * pan)926 static DATASET *within_groups_dataset (const DATASET *dset,
927 				       panelmod_t *pan)
928 {
929     DATASET *wset = NULL;
930     int *vlist = NULL;
931     int i, j, vj, nv;
932     int s, t, bigt;
933     int err = 0;
934 
935     pan->balanced = 1;
936 
937     for (i=0; i<pan->nunits; i++) {
938  	if (pan->unit_obs[i] > 0) {
939  	    if (pan->unit_obs[i] != pan->Tmax) {
940  		pan->balanced = 0;
941  	    }
942  	}
943     }
944 
945     if (pan->NT < dset->n) {
946 	err = allocate_data_finders(pan, dset->n);
947 	if (err) {
948 	    return NULL;
949 	}
950     }
951 
952     vlist = real_varying_list(pan);
953     if (vlist == NULL) {
954 	return NULL;
955     }
956 
957     nv = pan->vlist[0];
958 
959 #if PDEBUG
960     fprintf(stderr, "within_groups dataset: nvars=%d, nobs=%d\n",
961 	    pan->vlist[0], pan->NT);
962 #endif
963 
964     wset = create_auxiliary_dataset(nv, pan->NT, 0);
965     if (wset == NULL) {
966 	free(vlist);
967 	return NULL;
968     }
969 
970     for (j=1; j<=vlist[0]; j++) {
971 	double xbar, gxbar = 0.0;
972 	int allzero = 1;
973 
974 	vj = vlist[j];
975 	gxbar = 0.0;
976 	s = 0;
977 
978 #if PDEBUG
979 	strcpy(wset->varname[j], dset->varname[vj]);
980 	fprintf(stderr, "de-meaning: working on list[%d], %s\n",
981 		j, dset->varname[vj]);
982 #endif
983 
984 	for (i=0; i<pan->nunits; i++) {
985 	    int Ti = pan->unit_obs[i];
986 	    int grpzero = 1;
987 	    int got = 0;
988 
989 	    if (Ti == 0) {
990 		continue;
991 	    }
992 
993 	    /* first pass: find the group mean */
994 	    xbar = 0.0;
995 	    for (t=0; t<pan->T; t++) {
996 		bigt = panel_index(i, t);
997 		if (!panel_missing(pan, bigt)) {
998 		    xbar += dset->Z[vj][bigt];
999 		}
1000 	    }
1001 
1002 	    gxbar += xbar;
1003 	    xbar /= Ti;
1004 
1005 	    /* second pass: calculate de-meaned values */
1006 	    got = 0;
1007 	    for (t=0; t<pan->T && got<Ti; t++) {
1008 		bigt = panel_index(i, t);
1009 		if (!panel_missing(pan, bigt)) {
1010 		    wset->Z[j][s] = dset->Z[vj][bigt] - xbar;
1011 		    if (wset->Z[j][s] != 0.0) {
1012 			grpzero = 0;
1013 		    }
1014 		    got++;
1015 		    if (pan->small2big != NULL) {
1016 			pan->small2big[s] = bigt;
1017 			pan->big2small[bigt] = s;
1018 		    }
1019 		    s++;
1020 		}
1021 	    }
1022 
1023 	    if (!grpzero) {
1024 		allzero = 0;
1025 	    }
1026 	} /* end loop over units */
1027 
1028 	gxbar /= pan->NT;
1029 
1030 	if (j == 1 && allzero) {
1031 	    /* the dependent variable is not time-varying */
1032 	    fprintf(stderr, "fixed effects: dependent var is time-invariant\n");
1033 	}
1034 
1035 	/* wZ = data - group mean + grand mean */
1036 	for (s=0; s<pan->NT; s++) {
1037 	    wset->Z[j][s] += gxbar;
1038 	}
1039     }
1040 
1041     free(vlist);
1042 
1043     return wset;
1044 }
1045 
1046 /* Construct a quasi-demeaned version of the dataset so we can apply
1047    least squares to estimate the random effects model.  This dataset
1048    is not necessarily of full length.  If we're implementing the
1049    Hausman test using the regression approach this dataset will also
1050    include "straight" de-meaned counterparts of the time-varying
1051    variables.
1052 */
1053 
1054 static DATASET *
random_effects_dataset(const DATASET * dset,const DATASET * gset,int * relist,int * hlist,panelmod_t * pan)1055 random_effects_dataset (const DATASET *dset,
1056 			const DATASET *gset,
1057 			int *relist,
1058 			int *hlist,
1059 			panelmod_t *pan)
1060 {
1061     DATASET *rset;
1062     double xbar, theta_i;
1063     int hreg = (hlist != NULL);
1064     int v1 = relist[0];
1065     int v2 = 0;
1066     int i, j, k, k2, t;
1067     int vj, s, bigt, u;
1068     int err = 0;
1069 
1070     if (hreg) {
1071 	/* Apparatus for regression version of Hausman test:
1072 	   note that we shouldn't include time dummies here
1073 	   since the de-meaned versions would be perfectly
1074 	   collinear with the quasi-demeaned ones.
1075 	*/
1076 	int hmax = pan->vlist[0] - pan->ntdum;
1077 
1078 	for (i=1; i<hmax; i++) {
1079 	    if (pan->vlist[i+1] != 0) {
1080 		hlist[0] = ++v2;
1081 		hlist[i-1] = v1 + i - 2;
1082 	    }
1083 	}
1084     }
1085 
1086     if (pan->NT < dset->n) {
1087 	err = allocate_data_finders(pan, dset->n);
1088 	if (err) {
1089 	    return NULL;
1090 	}
1091     }
1092 
1093 #if PDEBUG
1094     fprintf(stderr, "random_effects_dataset: nvars=%d, nobs=%d\n",
1095 	    v1 + v2, pan->NT);
1096 #endif
1097 
1098     rset = create_auxiliary_dataset(v1 + v2, pan->NT, 0);
1099     if (rset == NULL) {
1100 	return NULL;
1101     }
1102 
1103     /* build GLS regression list, and process varnames for regression
1104        version of Hausman test if wanted
1105     */
1106 
1107     k = 0;
1108     k2 = v1 - 1;
1109     for (j=1; j<=v1; j++) {
1110 	vj = pan->pooled->list[j];
1111 	if (vj == 0) {
1112 	    relist[j] = 0;
1113 	} else {
1114 	    relist[j] = ++k;
1115 	    strcpy(rset->varname[k], dset->varname[vj]);
1116 	    if (hreg && k2 < rset->v - 1 && j > 1 && var_is_varying(pan, vj)) {
1117 		/* hausman-related term */
1118 		k2++;
1119 		strcpy(rset->varname[k2], "_");
1120 		strncat(rset->varname[k2], dset->varname[vj],
1121 			VNAMELEN - 2);
1122 	    }
1123 	}
1124     }
1125 
1126     /* Now create the transformed variables: original data minus theta
1127        times the appropriate group mean (and in addition, original
1128        data minus the group mean for time-varying vars, if doing the
1129        Hausman test by the regression method).
1130     */
1131 
1132     s = u = 0;
1133     pan->theta_bar = 0.0;
1134 
1135     for (i=0; i<pan->nunits; i++) {
1136 	int Ti = pan->unit_obs[i];
1137 	int got = 0;
1138 
1139 	if (Ti == 0) {
1140 	    continue;
1141 	}
1142 
1143 	if (Ti != pan->Tmax) {
1144 	    theta_i = 1.0 - sqrt(pan->s2e / (Ti * pan->s2v + pan->s2e));
1145 	} else {
1146 	    theta_i = pan->theta;
1147 	}
1148 
1149 	pan->theta_bar += theta_i;
1150 
1151 	for (t=0; t<pan->T && got<Ti; t++) {
1152 	    bigt = panel_index(i, t);
1153 	    k = 0;
1154 	    k2 = v1 - 1;
1155 	    if (!panel_missing(pan, bigt)) {
1156 		for (j=0; j<v1; j++) {
1157 		    vj = pan->pooled->list[j+1];
1158 		    if (vj == 0) {
1159 			rset->Z[0][s] -= theta_i;
1160 		    } else {
1161 			k++;
1162 			xbar = (k < gset->v)? gset->Z[k][u] : 1.0 / pan->Tmax;
1163 			rset->Z[k][s] = dset->Z[vj][bigt] - theta_i * xbar;
1164 			if (hreg && k2 < rset->v - 1 && var_is_varying(pan, vj)) {
1165 			    /* hausman-related term */
1166 			    rset->Z[++k2][s] = dset->Z[vj][bigt] - xbar;
1167 			}
1168 		    }
1169 		}
1170 		got++;
1171 		if (pan->small2big != NULL) {
1172 		    pan->small2big[s] = bigt;
1173 		    pan->big2small[bigt] = s;
1174 		}
1175 		s++;
1176 	    }
1177 	}
1178 	u++;
1179     }
1180 
1181     return rset;
1182 }
1183 
1184 #define BETWEEN_DEBUG 0
1185 
1186 /* Construct a mini-dataset containing the group means, in
1187    order to run the group-means or "between" regression.
1188 */
1189 
group_means_dataset(panelmod_t * pan,const DATASET * dset)1190 static DATASET *group_means_dataset (panelmod_t *pan,
1191 				     const DATASET *dset)
1192 {
1193     DATASET *gset;
1194     double x;
1195     int gn = pan->effn;
1196     int gv = pan->pooled->list[0];
1197     int i, j, k;
1198     int s, t, bigt;
1199 
1200     if (pan->balanced && pan->ntdum > 0) {
1201 	gv -= pan->ntdum;
1202     }
1203 
1204 #if PDEBUG
1205     fprintf(stderr, "group_means_dataset: nvars=%d, nobs=%d\n",
1206 	    gv, gn);
1207 #endif
1208 
1209     gset = create_auxiliary_dataset(gv, gn, 0);
1210     if (gset == NULL) {
1211 	return NULL;
1212     }
1213 
1214     k = 1;
1215     for (j=1; j<=gv; j++) {
1216 	int vj = pan->pooled->list[j];
1217 
1218 	if (vj == 0) {
1219 	    continue;
1220 	}
1221 
1222 #if BETWEEN_DEBUG
1223 	strcpy(ginfo->varname[k], dset->varname[vj]);
1224 #else
1225 	if (pan->opt & OPT_B) {
1226 	    /* will save the "between" model: so name the variables */
1227 	    strcpy(gset->varname[k], dset->varname[vj]);
1228 	}
1229 #endif
1230 
1231 	s = 0;
1232 	for (i=0; i<pan->nunits; i++) {
1233 	    int Ti = pan->unit_obs[i];
1234 
1235 	    if (Ti == 0) {
1236 		continue;
1237 	    }
1238 
1239 	    x = 0.0;
1240 	    for (t=0; t<pan->T; t++) {
1241 		bigt = panel_index(i, t);
1242 		if (!panel_missing(pan, bigt)) {
1243 		    x += dset->Z[vj][bigt];
1244 		}
1245 	    }
1246 	    gset->Z[k][s++] = x / Ti;
1247 	}
1248 	k++;
1249     }
1250 
1251 #if BETWEEN_DEBUG
1252     if (1) {
1253 	PRN *prn = gretl_print_new(GRETL_PRINT_STDERR, NULL);
1254 
1255 	printdata(NULL, NULL, gset, OPT_O, prn);
1256 	gretl_print_destroy(prn);
1257     }
1258 #endif
1259 
1260     return gset;
1261 }
1262 
1263 /* spruce up the between model and attach it to pan */
1264 
save_between_model(MODEL * pmod,const int * blist,DATASET * gset,panelmod_t * pan)1265 static int save_between_model (MODEL *pmod, const int *blist,
1266 			       DATASET *gset, panelmod_t *pan)
1267 {
1268     gretl_matrix *uh, *yh;
1269     int i, err = 0;
1270 
1271     pmod->ci = PANEL;
1272     pmod->opt |= OPT_B;
1273     pmod->dw = NADBL;
1274     gretl_model_add_panel_varnames(pmod, gset, NULL);
1275 
1276     uh = gretl_column_vector_alloc(pmod->nobs);
1277     yh = gretl_column_vector_alloc(pmod->nobs);
1278 
1279     if (uh == NULL || yh == NULL) {
1280 	err = E_ALLOC;
1281     } else {
1282 	for (i=0; i<pmod->nobs; i++) {
1283 	    uh->val[i] = pmod->uhat[i];
1284 	    yh->val[i] = pmod->yhat[i];
1285 	}
1286 	gretl_model_set_matrix_as_data(pmod, "uhat", uh);
1287 	gretl_model_set_matrix_as_data(pmod, "yhat", yh);
1288     }
1289 
1290 #if 0
1291     /* this is risky at present: too many functions want
1292        to read pmod->uhat directly */
1293     free(pmod->uhat); free(pmod->yhat);
1294     pmod->uhat = pmod->yhat = NULL;
1295 #endif
1296 
1297     *pan->realmod = *pmod;
1298 
1299     return err;
1300 }
1301 
1302 /* Compute @ubPub as the Ti-weighted sum of the squared
1303    residuals from the Between model, as per Stata (but
1304    in disagreement with Baltagi and Chang, 1994).
1305 */
1306 
alt_compute_ubPub(panelmod_t * pan,MODEL * bmod)1307 static int alt_compute_ubPub (panelmod_t *pan, MODEL *bmod)
1308 {
1309     int i, Ti, t = 0;
1310 
1311     pan->ubPub = 0.0;
1312 
1313     for (i=0; i<pan->nunits; i++) {
1314 	Ti = pan->unit_obs[i];
1315 	if (Ti > 0) {
1316 	    pan->ubPub += Ti * bmod->uhat[t] * bmod->uhat[t];
1317 	    t++;
1318 	}
1319     }
1320 
1321     return 0;
1322 }
1323 
adjust_gset_data(panelmod_t * pan,DATASET * gset,int step)1324 static void adjust_gset_data (panelmod_t *pan, DATASET *gset,
1325 			      int step)
1326 {
1327     int i, j, Ti, t = 0;
1328     double adj;
1329 
1330     for (i=0; i<pan->nunits; i++) {
1331 	Ti = pan->unit_obs[i];
1332 	if (Ti > 0) {
1333 	    adj = step == 0 ? sqrt(Ti) : 1.0/sqrt(Ti);
1334 	    for (j=0; j<gset->v; j++) {
1335 		gset->Z[j][t] *= adj;
1336 	    }
1337 	    t++;
1338 	}
1339     }
1340 }
1341 
1342 /* Compute @ubPub as the sum of squared residuals from a
1343    Ti-weighted Between regression, as per Baltagi and Chang,
1344    1994, and also Baltagi, 2013.
1345 */
1346 
compute_ubPub(panelmod_t * pan,MODEL * bmod,int * blist,DATASET * gset)1347 static int compute_ubPub (panelmod_t *pan, MODEL *bmod,
1348 			  int *blist, DATASET *gset)
1349 {
1350     int err;
1351 
1352     /* multiply all data by sqrt(Ti) */
1353     adjust_gset_data(pan, gset, 0);
1354     clear_model(bmod);
1355     *bmod = lsq(blist, gset, OLS, OPT_A);
1356     err = bmod->errcode;
1357     if (!err) {
1358 	pan->ubPub = bmod->ess;
1359     }
1360     /* put the original data back */
1361     adjust_gset_data(pan, gset, 1);
1362 
1363     return err;
1364 }
1365 
1366 /* calculate the group means or "between" regression and its error
1367    variance */
1368 
between_variance(panelmod_t * pan,DATASET * gset)1369 static int between_variance (panelmod_t *pan, DATASET *gset)
1370 {
1371     gretlopt bopt;
1372     MODEL bmod;
1373     int *blist;
1374     int i, j, k;
1375     int err = 0;
1376 
1377     blist = gretl_list_new(gset->v);
1378     if (blist == NULL) {
1379 	return E_ALLOC;
1380     }
1381 
1382     j = k = 1;
1383     for (i=1; i<=gset->v; i++) {
1384 	if (pan->pooled->list[i] == 0) {
1385 	    blist[k++] = 0;
1386 	} else {
1387 	    blist[k++] = j++;
1388 	}
1389     }
1390 
1391     bopt = (pan->opt & OPT_B)? OPT_NONE : OPT_A;
1392     bmod = lsq(blist, gset, OLS, bopt);
1393 
1394     if (bmod.errcode == 0) {
1395 	pan->s2b = bmod.ess / (bmod.nobs - bmod.ncoeff);
1396     } else {
1397 	err = bmod.errcode;
1398     }
1399 
1400 #if PDEBUG
1401     if (err) {
1402 	fprintf(stderr, "error %d in between_variance\n", err);
1403     } else {
1404 	fprintf(stderr, "pan->s2b = %g\n", pan->s2b);
1405     }
1406 #endif
1407 
1408     if (!err && (pan->opt & OPT_B)) {
1409 	err = save_between_model(&bmod, blist, gset, pan);
1410     } else {
1411 	if (!err && !pan->balanced && (pan->opt & OPT_U) &&
1412 	    (pan->opt & OPT_X) && !(pan->opt & OPT_N)) {
1413 	    /* Prepare for the Baltagi-Chang take on Swamy-Arora
1414 	       in the case of an unbalanced panel
1415 	    */
1416 	    if (stata_sa) {
1417 		err = alt_compute_ubPub(pan, &bmod);
1418 	    } else {
1419 		err = compute_ubPub(pan, &bmod, blist, gset);
1420 	    }
1421 	}
1422 	clear_model(&bmod);
1423     }
1424 
1425     free(blist);
1426 
1427     return err;
1428 }
1429 
1430 /* op is VCV_SUBTRACT for the random effects model.  With the fixed
1431    effects model we don't have to worry about excluding vcv entries
1432    for time-invariant variables, since none of these are included.
1433 */
1434 
1435 static int
vcv_skip(const MODEL * pmod,int i,const panelmod_t * pan,int op)1436 vcv_skip (const MODEL *pmod, int i, const panelmod_t *pan, int op)
1437 {
1438     int skip = 0;
1439 
1440     if (pmod->list[i+2] == 0) {
1441 	/* always skip the constant */
1442 	skip = 1;
1443     } else if (op == VCV_SUBTRACT && !pan->varying[i]) {
1444 	/* random effects, time-invariant var */
1445 	skip = 1;
1446     }
1447 
1448     return skip;
1449 }
1450 
1451 /* Fill out the covariance matrix for use with the Hausman test:
1452    the entries that get transcribed here are only those for
1453    slopes with respect to time-varying variables.
1454 */
1455 
1456 static void
vcv_slopes(panelmod_t * pan,const MODEL * pmod,int op)1457 vcv_slopes (panelmod_t *pan, const MODEL *pmod, int op)
1458 {
1459     int idx, i, j;
1460     int mj, mi = 0;
1461     int sj, si = 0;
1462     double x;
1463 
1464     for (i=0; i<pan->dfH; i++) {
1465 	if (vcv_skip(pmod, mi, pan, op)) {
1466 	    i--;
1467 	    mi++;
1468 	    continue;
1469 	}
1470 	mj = mi;
1471 	sj = si;
1472 	for (j=i; j<pan->dfH; j++) {
1473 	    if (vcv_skip(pmod, mj, pan, op)) {
1474 		j--;
1475 		mj++;
1476 		continue;
1477 	    }
1478 
1479 	    idx = ijton(mi, mj, pmod->ncoeff);
1480 
1481 	    if (op == VCV_SUBTRACT) {
1482 		x = gretl_matrix_get(pan->Sigma, si, sj);
1483 		x -= pmod->vcv[idx];
1484 	    } else {
1485 		x = pmod->vcv[idx];
1486 	    }
1487 
1488 	    gretl_matrix_set(pan->Sigma, si, sj, x);
1489 	    if (si != sj) {
1490 		gretl_matrix_set(pan->Sigma, sj, si, x);
1491 	    }
1492 
1493 	    mj++;
1494 	    sj++;
1495 	}
1496 	mi++;
1497 	si++;
1498     }
1499 }
1500 
1501 /* calculate Hausman test statistic, matrix diff style */
1502 
bXb(panelmod_t * pan)1503 static int bXb (panelmod_t *pan)
1504 {
1505     int err;
1506 
1507     err = gretl_invert_symmetric_matrix(pan->Sigma);
1508 
1509     if (!err) {
1510 	pan->H = gretl_scalar_qform(pan->bdiff, pan->Sigma, &err);
1511     }
1512 
1513     if (err) {
1514 	pan->H = NADBL;
1515     }
1516 
1517     return err;
1518 }
1519 
fixed_effects_df_correction(MODEL * pmod,int k)1520 static void fixed_effects_df_correction (MODEL *pmod, int k)
1521 {
1522     double dfcorr = sqrt((double) pmod->dfd / (pmod->dfd - k));
1523     int i;
1524 
1525     pmod->dfd -= k;
1526     pmod->dfn += k;
1527 
1528     pmod->sigma *= dfcorr;
1529 
1530     for (i=0; i<pmod->ncoeff; i++) {
1531 	pmod->sderr[i] *= dfcorr;
1532     }
1533 }
1534 
1535 /* used for printing fixed- or random-effects estimates
1536    in the context of the "panel diagnostics" routine
1537 */
1538 
simple_print_panel_model(MODEL * pmod,DATASET * dset,const int * xlist,PRN * prn)1539 static void simple_print_panel_model (MODEL *pmod,
1540 				      DATASET *dset,
1541 				      const int *xlist,
1542 				      PRN *prn)
1543 {
1544     int *savelist = gretl_list_copy(pmod->list);
1545     int i;
1546 
1547     for (i=0; i<pmod->ncoeff; i++) {
1548 	pmod->list[i+2] = xlist[i+2];
1549     }
1550     printmodel(pmod, dset, OPT_P, prn);
1551     free(pmod->list);
1552     pmod->list = savelist;
1553 }
1554 
print_re_results(panelmod_t * pan,MODEL * pmod,DATASET * dset,PRN * prn)1555 static void print_re_results (panelmod_t *pan,
1556 			      MODEL *pmod,
1557 			      DATASET *dset,
1558 			      PRN *prn)
1559 {
1560     pputs(prn, _("Variance estimators:"));
1561     pputc(prn, '\n');
1562     pprintf(prn, _(" between = %g"), pan->s2v);
1563     pputc(prn, '\n');
1564     pprintf(prn, _(" within = %g"), pan->s2e);
1565     pputc(prn, '\n');
1566 
1567     if (pan->balanced || pan->s2v == 0) {
1568 	pprintf(prn, _("theta used for quasi-demeaning = %g"), pan->theta);
1569 	pputc(prn, '\n');
1570     } else {
1571 	pputs(prn, _("Panel is unbalanced: theta varies across units"));
1572 	pputc(prn, '\n');
1573     }
1574     pputc(prn, '\n');
1575 
1576     pputs(prn, _("Random effects estimator\n"
1577 		 "allows for a unit-specific component to the error term\n"));
1578     pputc(prn, '\n');
1579 
1580     simple_print_panel_model(pmod, dset, pan->pooled->list, prn);
1581 }
1582 
print_fe_results(panelmod_t * pan,MODEL * pmod,DATASET * dset,PRN * prn)1583 static int print_fe_results (panelmod_t *pan,
1584 			     MODEL *pmod,
1585 			     DATASET *dset,
1586 			     PRN *prn)
1587 {
1588     int dfn = pan->effn - 1;
1589 
1590     pputs(prn, _("Fixed effects estimator\n"
1591 		 "allows for differing intercepts by cross-sectional unit\n"));
1592     pputc(prn, '\n');
1593 
1594     simple_print_panel_model(pmod, dset, pan->vlist, prn);
1595 
1596     pprintf(prn, _("Residual variance: %g/(%d - %d) = %g\n"),
1597 	    pmod->ess, pmod->nobs, pan->vlist[0] - 1 + dfn, pan->s2e);
1598     pputc(prn, '\n');
1599 
1600     if (!na(pan->Ffe)) {
1601 	pprintf(prn, _("Joint significance of differing group means:\n"));
1602 	pprintf(prn, " F(%d, %g) = %g %s %g\n", pan->Fdfn, pan->Fdfd, pan->Ffe,
1603 		_("with p-value"), snedecor_cdf_comp(pan->Fdfn, pan->Fdfd, pan->Ffe));
1604 	pputs(prn, _("(A low p-value counts against the null hypothesis that "
1605 		     "the pooled OLS model\nis adequate, in favor of the fixed "
1606 		     "effects alternative.)\n\n"));
1607     } else {
1608 	pputc(prn, '\n');
1609     }
1610 
1611     return 0;
1612 }
1613 
time_dummies_wald_test(panelmod_t * pan,MODEL * pmod)1614 static int time_dummies_wald_test (panelmod_t *pan, MODEL *pmod)
1615 {
1616     gretl_matrix *vcv = NULL;
1617     gretl_vector *b = NULL;
1618     double x;
1619     int i, j, k, bigk;
1620     int di, dj;
1621     int err;
1622 
1623     if (pan->ntdum == 0) {
1624 	return 0;
1625     }
1626 
1627     k = pan->ntdum;
1628     bigk = pmod->ncoeff;
1629 
1630     if (pmod->vcv == NULL) {
1631 	err = makevcv(pmod, pmod->sigma);
1632 	if (err) {
1633 	    return err;
1634 	}
1635     }
1636 
1637     b = gretl_column_vector_alloc(k);
1638     vcv = gretl_matrix_alloc(k, k);
1639     if (b == NULL || vcv == NULL) {
1640 	err = E_ALLOC;
1641 	goto bailout;
1642     }
1643 
1644     di = bigk - k;
1645     for (i=0; i<k; i++) {
1646 	b->val[i] = pmod->coeff[di++];
1647     }
1648 
1649     di = bigk - k;
1650     for (i=0; i<k; i++) {
1651 	dj = bigk - k;
1652 	for (j=0; j<=i; j++) {
1653 	    x = pmod->vcv[ijton(di, dj++, bigk)];
1654 	    gretl_matrix_set(vcv, i, j, x);
1655 	    gretl_matrix_set(vcv, j, i, x);
1656 	}
1657 	di++;
1658     }
1659 
1660     err = gretl_invert_symmetric_matrix(vcv);
1661     if (err) {
1662 	goto bailout;
1663     }
1664 
1665     x = gretl_scalar_qform(b, vcv, &err);
1666     if (err) {
1667 	fprintf(stderr, _("Failed to compute test statistic\n"));
1668 	goto bailout;
1669     }
1670 
1671     if (!err) {
1672 	ModelTest *test = model_test_new(GRETL_TEST_PANEL_TIMEDUM);
1673 
1674 	if (test != NULL) {
1675 	    model_test_set_teststat(test, GRETL_STAT_WALD_CHISQ);
1676 	    model_test_set_dfn(test, k);
1677 	    model_test_set_value(test, x);
1678 	    model_test_set_pvalue(test, chisq_cdf_comp(k, x));
1679 	    maybe_add_test_to_model(pmod, test);
1680 	}
1681     }
1682 
1683  bailout:
1684 
1685     gretl_matrix_free(vcv);
1686     gretl_vector_free(b);
1687 
1688     return err;
1689 }
1690 
save_fixed_effects_F(panelmod_t * pan,MODEL * wmod)1691 static void save_fixed_effects_F (panelmod_t *pan, MODEL *wmod)
1692 {
1693     int robust = (pan->opt & OPT_R);
1694     ModelTest *test;
1695 
1696     if (na(pan->Ffe)) {
1697 	return;
1698     }
1699 
1700     test = model_test_new(robust ? GRETL_TEST_PANEL_WELCH :
1701 			  GRETL_TEST_PANEL_F);
1702 
1703     if (test != NULL) {
1704 	model_test_set_teststat(test, robust ? GRETL_STAT_WF : GRETL_STAT_F);
1705 	model_test_set_dfn(test, pan->Fdfn);
1706 	model_test_set_dfd(test, pan->Fdfd);
1707 	model_test_set_value(test, pan->Ffe);
1708 	model_test_set_pvalue(test, snedecor_cdf_comp(pan->Fdfn, pan->Fdfd, pan->Ffe));
1709 	maybe_add_test_to_model(wmod, test);
1710     }
1711 }
1712 
1713 /* "regular" = not robust: sums-of-squares based joint test on
1714    the fixed effects */
1715 
regular_fixed_effects_F(panelmod_t * pan,MODEL * wmod)1716 static void regular_fixed_effects_F (panelmod_t *pan, MODEL *wmod)
1717 {
1718     int k_pooled = pan->pooled->list[0];
1719     int k_fe = pan->vlist[0];
1720 
1721     pan->Fdfn = pan->effn - 1;
1722     pan->Fdfd = wmod->dfd;
1723 
1724     if (k_pooled > k_fe) {
1725 	pan->Fdfn -= k_pooled - k_fe;
1726 	if (pan->Fdfn <= 0) {
1727 	    pan->Ffe = NADBL;
1728 	    return;
1729 	}
1730     }
1731 
1732     pan->Ffe = (pan->pooled->ess - wmod->ess) * pan->Fdfd /
1733 	(wmod->ess * pan->Fdfn);
1734     if (na(pan->Ffe)) {
1735 	pan->Ffe = NADBL;
1736     } else if (pan->Ffe < 0) {
1737 	pan->Ffe = 0;
1738     }
1739 }
1740 
fix_panelmod_list(MODEL * targ,panelmod_t * pan)1741 static int fix_panelmod_list (MODEL *targ, panelmod_t *pan)
1742 {
1743     int i;
1744 
1745 #if PDEBUG
1746     printlist(targ->list, "targ->list");
1747     printlist(pan->pooled->list, "pan->pooled->list");
1748     printlist(pan->vlist, "pan->vlist");
1749 #endif
1750 
1751     free(targ->list);
1752     targ->list = gretl_list_copy(pan->pooled->list);
1753     if (targ->list == NULL) {
1754 	return E_ALLOC;
1755     }
1756 
1757     if (pan->opt & OPT_F) {
1758 	/* fixed effects: remove any non-varying variables */
1759 	for (i=2; i<=targ->list[0]; i++) {
1760 	    if (!in_gretl_list(pan->vlist, targ->list[i])) {
1761 		gretl_list_delete_at_pos(targ->list, i--);
1762 	    }
1763 	}
1764     }
1765 
1766 #if PDEBUG
1767     printlist(targ->list, "new targ->list");
1768 #endif
1769 
1770     return 0;
1771 }
1772 
1773 /* Compute F-test or chi-square test for the regular
1774    regressors, skipping @nskip trailing coefficients
1775    (which can be used to skip time dummies)
1776 */
1777 
panel_overall_test(MODEL * pmod,panelmod_t * pan,int nskip,gretlopt opt)1778 static double panel_overall_test (MODEL *pmod, panelmod_t *pan,
1779 				  int nskip, gretlopt opt)
1780 {
1781     double test = NADBL;
1782     int *omitlist = NULL;
1783 
1784     if (pmod->ncoeff == 1) {
1785 	return test;
1786     }
1787 
1788     if (nskip > 0) {
1789 	int i, k = pmod->list[0] - nskip - 2;
1790 
1791 	omitlist = gretl_list_new(k);
1792 	for (i=1; i<=k; i++) {
1793 	    omitlist[i] = pmod->list[i+2];
1794 	}
1795     }
1796 
1797     if (opt & OPT_X) {
1798 	test = wald_omit_chisq(omitlist, pmod);
1799     } else {
1800 	test = wald_omit_F(omitlist, pmod);
1801     }
1802 
1803     free(omitlist);
1804 
1805     return test;
1806 }
1807 
1808 /* Correct various model statistics, in the case where we estimated
1809    the fixed effects or "within" model on an auxiliary dataset
1810    from which the group means were subtracted.
1811 */
1812 
fix_within_stats(MODEL * fmod,panelmod_t * pan)1813 static int fix_within_stats (MODEL *fmod, panelmod_t *pan)
1814 {
1815     double wrsq, wfstt = NADBL;
1816     int wdfn, nc = fmod->ncoeff;
1817     int err = 0;
1818 
1819     err = fix_panelmod_list(fmod, pan);
1820     if (err) {
1821 	return err;
1822     }
1823 
1824     fmod->ybar = pan->pooled->ybar;
1825     fmod->sdy = pan->pooled->sdy;
1826     fmod->tss = pan->pooled->tss;
1827     fmod->ifc = 1;
1828 
1829     wrsq = fmod->rsq;
1830     if (wrsq < 0.0) {
1831 	wrsq = 0.0;
1832     } else if (na(wrsq)) {
1833 	wrsq = NADBL;
1834     }
1835 
1836     /* Should we differentiate "regular" regressors from
1837        time dummies, if included? For now, yes.
1838     */
1839 
1840     if (pan->ntdum > 0) {
1841 	wdfn = fmod->ncoeff - 1 - pan->ntdum;
1842 	if (wdfn > 0) {
1843 	    wfstt = panel_overall_test(fmod, pan, pan->ntdum,
1844 				       OPT_NONE);
1845 	}
1846     } else {
1847 	wdfn = fmod->ncoeff - 1;
1848 	if (wdfn > 0) {
1849 	    if (pan->opt & OPT_R) {
1850 		wfstt = panel_overall_test(fmod, pan, 0, OPT_NONE);
1851 	    } else {
1852 		wfstt = (wrsq / (1.0 - wrsq)) * ((double) fmod->dfd / wdfn);
1853 	    }
1854 	}
1855     }
1856 
1857     if (!na(wfstt) && wfstt >= 0.0) {
1858 	ModelTest *test = model_test_new(GRETL_TEST_WITHIN_F);
1859 	int wdfd = fmod->dfd;
1860 
1861 	if (pan->opt & OPT_R) {
1862 	    fmod->dfd = wdfd = pan->effn - 1;
1863 	}
1864 
1865 	if (test != NULL) {
1866 	    model_test_set_teststat(test, GRETL_STAT_F);
1867 	    model_test_set_dfn(test, wdfn);
1868 	    model_test_set_dfd(test, wdfd);
1869 	    model_test_set_value(test, wfstt);
1870 	    model_test_set_pvalue(test, snedecor_cdf_comp(wdfn, wdfd, wfstt));
1871 	    maybe_add_test_to_model(fmod, test);
1872 	}
1873     }
1874 
1875     /* note: this member is being borrowed for the "Within R-squared" */
1876     fmod->adjrsq = wrsq;
1877 
1878     /* LSDV-based statistics (FIXME: can we do this in
1879        the --robust case?)
1880     */
1881     if (pan->opt & OPT_R) {
1882 	fmod->rsq = 1.0 - (fmod->ess / fmod->tss);
1883 	fmod->fstt = NADBL;
1884     } else {
1885 	fmod->rsq = 1.0 - (fmod->ess / fmod->tss);
1886 	if (fmod->rsq < 0.0) {
1887 	    fmod->rsq = 0.0;
1888 	} else {
1889 	    fmod->fstt = (fmod->rsq / (1.0 - fmod->rsq)) *
1890 		((double) fmod->dfd / fmod->dfn);
1891 	}
1892     }
1893 
1894     fmod->ncoeff = fmod->dfn + 1; /* number of params estimated */
1895     ls_criteria(fmod);
1896     fmod->ncoeff = nc;
1897 
1898     return err;
1899 }
1900 
1901 /* Fixed-effects model: add the per-unit intercept estimates
1902    to the model in case the user wants to retrieve them.
1903    Random-effects model: add estimate of the individual effects.
1904 
1905    By this point the model -- even if it been estimated on a short
1906    dataset -- should have a full-length residual series.
1907 */
1908 
panel_model_add_ahat(MODEL * pmod,const DATASET * dset,panelmod_t * pan)1909 static int panel_model_add_ahat (MODEL *pmod, const DATASET *dset,
1910 				 panelmod_t *pan)
1911 {
1912     double *ahat = NULL;
1913     const double *x;
1914     int i, j, t, bigt;
1915     int n, err = 0;
1916 
1917     n = pmod->full_n;
1918 
1919     ahat = malloc(n * sizeof *ahat);
1920     if (ahat == NULL) {
1921 	return E_ALLOC;
1922     }
1923 
1924     for (t=0; t<n; t++) {
1925 	ahat[t] = NADBL;
1926     }
1927 
1928     if (pan->opt & OPT_F) {
1929 	/* fixed effects */
1930 	for (i=0; i<pan->nunits; i++) {
1931 	    if (pan->unit_obs[i] > 0) {
1932 		double a = 0.0;
1933 
1934 		/* a = y - Xb, where the 'b' is based on de-meaned data */
1935 
1936 		for (t=0; t<pan->T; t++) {
1937 		    bigt = panel_index(i, t);
1938 		    if (!na(pmod->uhat[bigt])) {
1939 			a += dset->Z[pmod->list[1]][bigt];
1940 			for (j=1; j<pmod->ncoeff; j++) {
1941 			    x = dset->Z[pmod->list[j+2]];
1942 			    a -= pmod->coeff[j] * x[bigt];
1943 			}
1944 		    }
1945 		}
1946 
1947 		a /= pan->unit_obs[i];
1948 
1949 		for (t=0; t<pan->T; t++) {
1950 		    bigt = panel_index(i, t);
1951 		    if (!na(pmod->uhat[bigt])) {
1952 			ahat[bigt] = a;
1953 		    }
1954 		}
1955 	    }
1956 	}
1957     } else {
1958 	/* random effects */
1959 	double uhbar, frac = 0;
1960 
1961 	if (pan->balanced) {
1962 	    frac = 1.0 - pan->theta;
1963 	    frac = 1.0 - frac * frac;
1964 	}
1965 
1966 	for (i=0; i<pan->nunits; i++) {
1967 	    if (pan->unit_obs[i] > 0) {
1968 		/* get mean residual */
1969 		uhbar = 0.0;
1970 		for (t=0; t<pan->T; t++) {
1971 		    bigt = panel_index(i, t);
1972 		    if (!na(pmod->uhat[bigt])) {
1973 			uhbar += pmod->uhat[bigt];
1974 		    }
1975 		}
1976 		uhbar /= pan->unit_obs[i];
1977 		/* ahat = frac * uhbar */
1978 		if (!pan->balanced) {
1979 		    frac = pan->s2v / (pan->s2v + pan->s2e / pan->unit_obs[i]);
1980 		}
1981 		for (t=0; t<pan->T; t++) {
1982 		    bigt = panel_index(i, t);
1983 		    if (!na(pmod->uhat[bigt])) {
1984 			ahat[bigt] = frac * uhbar;
1985 		    }
1986 		}
1987 	    }
1988 	}
1989     }
1990 
1991     err = gretl_model_set_data(pmod, "ahat", ahat,
1992 			       GRETL_TYPE_DOUBLE_ARRAY,
1993 			       n * sizeof *ahat);
1994 
1995     return err;
1996 }
1997 
1998 /* If we're estimating the fixed effects model with the --robust
1999    flag, we should do a robust version of the joint test on
2000    the fixed effects. Here we use the algorithm of B. L. Welch,
2001    "On the Comparison of Several Mean Values: An Alternative
2002    Approach" (Biometrika 38, 1951, pp. 330-336). The variable
2003    we're testing for difference of means (by individual) is the
2004    residual from pooled OLS.
2005 */
2006 
robust_fixed_effects_F(panelmod_t * pan)2007 static int robust_fixed_effects_F (panelmod_t *pan)
2008 {
2009     MODEL *pmod = pan->pooled;
2010     double *u, *w, *h, *xbar;
2011     double muhat = 0.0;
2012     double x, s2, W = 0.0;
2013     double A, B, sum_h;
2014     int k = pan->effn;
2015     int i, j, t, s;
2016     int Ti, bigt;
2017     int err = 0;
2018 
2019     u = malloc(pan->Tmax * sizeof *u);
2020     w = malloc(3 * k * sizeof *w);
2021 
2022     if (u == NULL || w == NULL) {
2023 	free(u);
2024 	free(w);
2025 	return E_ALLOC;
2026     }
2027 
2028     h = w + k;
2029     xbar = h + k;
2030 
2031     j = 0;
2032     for (i=0; i<pan->nunits; i++) {
2033 	Ti = pan->unit_obs[i];
2034 	if (Ti > 1) {
2035 	    s = 0;
2036 	    for (t=0; t<pan->T; t++) {
2037 		bigt = panel_index(i, t);
2038 		if (!panel_missing(pan, bigt)) {
2039 		    u[s++] = pmod->uhat[bigt];
2040 		}
2041 	    }
2042 	    xbar[j] = gretl_mean(0, s-1, u);
2043 	    s2 = gretl_variance(0, s-1, u);
2044 	    w[j] = Ti / s2;
2045 	    W += w[j];
2046 	    muhat += w[j] * xbar[j];
2047 	    j++;
2048 	}
2049     }
2050 
2051     muhat /= W;
2052     A = sum_h = 0.0;
2053 
2054     j = 0;
2055     for (i=0; i<pan->nunits; i++) {
2056 	Ti = pan->unit_obs[i];
2057 	if (Ti > 1) {
2058 	    x = 1 - w[j]/W;
2059 	    h[j] = (x * x) / (Ti - 1);
2060 	    sum_h += h[j];
2061 	    x = xbar[j] - muhat;
2062 	    A += w[j] * x * x;
2063 	    j++;
2064 	}
2065     }
2066 
2067     A /= (k - 1);
2068     B = 1.0 + (2.0*(k-2.0)/(k * k - 1.0)) * sum_h;
2069 
2070     pan->Ffe = A / B;
2071     pan->Fdfn = k - 1;
2072     pan->Fdfd = (k * k - 1.0)/(3 * sum_h);
2073 
2074     free(u);
2075     free(w);
2076 
2077     return err;
2078 }
2079 
real_FE_list(panelmod_t * pan)2080 static int *real_FE_list (panelmod_t *pan)
2081 {
2082     int *list = gretl_list_copy(pan->pooled->list);
2083     int i;
2084 
2085     if (list != NULL) {
2086 	/* purge any non-time-varying variables */
2087 	for (i=2; i<=list[0]; i++) {
2088 	    if (!in_gretl_list(pan->vlist, list[i])) {
2089 		gretl_list_delete_at_pos(list, i--);
2090 	    }
2091 	}
2092     }
2093 
2094     return list;
2095 }
2096 
2097 /* For testing purposes: retrieve user-specified values for
2098    s2v and s2e. These would typically be known good values
2099    in the context of a simulation.
2100 */
2101 
read_true_variances(panelmod_t * pan)2102 static int read_true_variances (panelmod_t *pan)
2103 {
2104     gretl_matrix *m;
2105     int err = 0;
2106 
2107     m = gretl_matrix_read_from_file(glsmat, 0, &err);
2108     if (m == NULL) {
2109 	fprintf(stderr, "read_true_variances: no matrix!\n");
2110 	if (!err) {
2111 	    err = E_DATA;
2112 	}
2113     } else {
2114 	pan->s2v = m->val[0];
2115 	pan->s2e = m->val[1];
2116     }
2117 
2118     return err;
2119 }
2120 
2121 /* computation of $\hat{\sigma}^2_v$ a la Nerlove, if wanted */
2122 
nerlove_s2v(MODEL * pmod,const DATASET * dset,panelmod_t * pan)2123 static int nerlove_s2v (MODEL *pmod, const DATASET *dset,
2124 			panelmod_t *pan)
2125 {
2126     double a, amean, *ahat;
2127     double wmean, *wi = NULL;
2128     const double *x;
2129     int i, j, t, k, bigt;
2130     int *list = NULL;
2131     int err = 0;
2132 
2133     ahat = malloc(pan->effn * sizeof *ahat);
2134     if (ahat == NULL) {
2135 	return E_ALLOC;
2136     }
2137 
2138     if (pan->opt & OPT_X) {
2139 	/* --unbalanced */
2140 	wi = malloc(pan->effn * sizeof *wi);
2141 	if (wi == NULL) {
2142 	    free(ahat);
2143 	    return E_ALLOC;
2144 	}
2145     }
2146 
2147     list = real_FE_list(pan);
2148     if (list == NULL) {
2149 	free(ahat);
2150 	free(wi);
2151 	return E_ALLOC;
2152     }
2153 
2154     wmean = amean = 0.0;
2155     k = 0;
2156 
2157     for (i=0; i<pan->nunits; i++) {
2158 	int Ti = pan->unit_obs[i];
2159 
2160 	if (Ti > 0) {
2161 	    a = 0.0;
2162 	    for (t=0; t<pan->T; t++) {
2163 		bigt = panel_index(i, t);
2164 		if (!na(pan->pooled->uhat[bigt])) {
2165 		    a += dset->Z[list[1]][bigt];
2166 		    for (j=1; j<pmod->ncoeff; j++) {
2167 			x = dset->Z[list[j+2]];
2168 			a -= pmod->coeff[j] * x[bigt];
2169 		    }
2170 		}
2171 	    }
2172 	    a /= Ti;
2173 	    ahat[k] = a;
2174 	    amean += a;
2175 	    if (wi != NULL) {
2176 		wi[k] = Ti / (double) pmod->nobs;
2177 		wmean += wi[k] * a;
2178 	    }
2179 	    k++;
2180 	}
2181     }
2182 
2183     pan->s2v = 0.0;
2184 
2185     if (wi != NULL) {
2186 	for (i=0; i<pan->effn; i++) {
2187 	    pan->s2v += wi[i] * (ahat[i] - wmean) * (ahat[i] - wmean);
2188 	}
2189 	pan->s2v /= (pan->effn - 1.0) / (double) pan->effn;
2190 	free(wi);
2191     } else {
2192 	amean /= pan->effn;
2193 	for (i=0; i<pan->effn; i++) {
2194 	    pan->s2v += (ahat[i] - amean) * (ahat[i] - amean);
2195 	}
2196 	pan->s2v /= pan->effn - 1;
2197     }
2198 
2199     free(ahat);
2200     free(list);
2201 
2202     return err;
2203 }
2204 
2205 /* robust RE: we need an extra residuals array */
2206 
re_hatvars_prep(panelmod_t * pan)2207 static int re_hatvars_prep (panelmod_t *pan)
2208 {
2209     int t, n = pan->pooled->full_n;
2210 
2211     pan->re_uhat = malloc(n * sizeof *pan->re_uhat);
2212 
2213     if (pan->re_uhat == NULL) {
2214 	return E_ALLOC;
2215     } else {
2216 	for (t=0; t<n; t++) {
2217 	    pan->re_uhat[t] = NADBL;
2218 	}
2219 	return 0;
2220     }
2221 }
2222 
2223 /* When calculating DW using the fixed-effects residuals,
2224    in the context of estimation of the random-effects
2225    specification, we need to expand up the residual series
2226    temporarily.
2227 */
2228 
expand_fe_uhat(MODEL * femod,panelmod_t * pan)2229 static int expand_fe_uhat (MODEL *femod, panelmod_t *pan)
2230 {
2231     double *uhat = NULL;
2232     int NT = pan->pooled->full_n;
2233     int i, s, t;
2234 
2235     uhat = malloc(NT * sizeof *uhat);
2236     if (uhat == NULL) {
2237 	return E_ALLOC;
2238     }
2239 
2240     for (t=0; t<NT; t++) {
2241 	uhat[t] = NADBL;
2242     }
2243 
2244     s = 0;
2245     for (i=0; i<pan->nunits; i++) {
2246 	int ti, Ti = pan->unit_obs[i];
2247 
2248 	if (Ti == 0) {
2249 	    continue;
2250 	}
2251 	for (ti=0; ti<Ti; ti++) {
2252 	    t = big_index(pan, s);
2253 	    uhat[t] = femod->uhat[s];
2254 	    if (s == 0) {
2255 		femod->t1 = t;
2256 	    } else if (s == femod->nobs - 1) {
2257 		femod->t2 = t;
2258 	    }
2259 	    s++;
2260 	}
2261     }
2262 
2263     /* replace the uhat array on @femod */
2264     free(femod->uhat);
2265     femod->uhat = uhat;
2266 
2267     return 0;
2268 }
2269 
2270 /* Fix uhat and yhat in two cases.
2271 
2272    (a) When we estimated fixed effects using a de-meaned dataset we
2273    need to ensure that the uhat and yhat values get written to the
2274    right observation slots in relation to the full dataset, and also
2275    that the yhat values get corrected, putting the means back in.
2276 
2277    (b) When estimating the random effects model we need to compute
2278    residuals based on the untransformed data (and again, place them
2279    correctly in relation to the full dataset). However, if we're
2280    going to produce robust standard errors we also need to preserve
2281    the GLS residuals.
2282 
2283    The placement issue arises because the special datasets used in
2284    these cases are not necessarily of full length, since they are
2285    purged of missing observations.
2286 */
2287 
2288 static int
fix_panel_hatvars(MODEL * pmod,panelmod_t * pan,const double ** Z)2289 fix_panel_hatvars (MODEL *pmod, panelmod_t *pan, const double **Z)
2290 {
2291     const double *y = NULL;
2292     double *yhat = pan->pooled->yhat;
2293     double *uhat = NULL;
2294     int n = pan->pooled->full_n;
2295     int re_n = 0;
2296     double yht, SSR = 0.0;
2297     int i, j, s, t;
2298     int err = 0;
2299 
2300     if (yhat == NULL) {
2301 	fprintf(stderr, "fix_panel_hatvars: pan->pooled->yhat is NULL\n");
2302 	return E_DATA;
2303     }
2304 
2305     y = Z[pan->pooled->list[1]];
2306 
2307     uhat = malloc(n * sizeof *uhat);
2308     if (uhat == NULL) {
2309 	return E_ALLOC;
2310     }
2311 
2312     if (pan->opt & OPT_U) {
2313 	/* random effects */
2314 	if (pan->opt & OPT_R) {
2315 	    err = re_hatvars_prep(pan);
2316 	}
2317 	if (err) {
2318 	    return err;
2319 	}
2320     }
2321 
2322     for (t=0; t<n; t++) {
2323 	uhat[t] = NADBL;
2324     }
2325 
2326     s = 0;
2327     for (i=0; i<pan->nunits; i++) {
2328 	int ti, Ti = pan->unit_obs[i];
2329 
2330 	if (Ti == 0) {
2331 	    continue;
2332 	}
2333 
2334 	for (ti=0; ti<Ti; ti++) {
2335 	    t = big_index(pan, s);
2336 	    if (pan->opt & OPT_U) {
2337 		/* random effects */
2338 		yht = 0.0;
2339 		for (j=0; j<pmod->ncoeff; j++) {
2340 		    yht += pmod->coeff[j] * Z[pan->pooled->list[j+2]][t];
2341 		}
2342 		yhat[t] = yht;
2343 		re_n++;
2344 		uhat[t] = y[t] - yht;
2345 		SSR += uhat[t] * uhat[t];
2346 		if (pan->re_uhat != NULL) {
2347 		    /* store both the "fixed" and the GLS residuals */
2348 		    pan->re_uhat[t] = uhat[t];
2349 		    uhat[t] = pmod->uhat[s];
2350 		}
2351 	    } else {
2352 		/* fixed effects */
2353 		uhat[t] = pmod->uhat[s];
2354 		yhat[t] = y[t] - uhat[t];
2355 	    }
2356 	    if (s == 0) {
2357 		pmod->t1 = t;
2358 	    } else if (s == pmod->nobs - 1) {
2359 		pmod->t2 = t;
2360 	    }
2361 	    s++;
2362 	}
2363     }
2364 
2365     if (pan->opt & OPT_U) {
2366 	double r;
2367 
2368 	/* Defer rewriting of pmod->ess and pmod->sigma to avoid
2369 	   screwing up the covariance matrix estimator and Hausman
2370 	   test.
2371 	*/
2372 	gretl_model_set_double(pmod, "fixed_SSR", SSR);
2373 	r = gretl_corr(pmod->t1, pmod->t2, y, yhat, NULL);
2374 	if (!na(r)) {
2375 	    gretl_model_set_double(pmod, "corr-rsq", r * r);
2376 	}
2377     }
2378 
2379     pmod->full_n = n;
2380 
2381     /* replace the uhat and yhat arrays on @pmod */
2382     free(pmod->uhat);
2383     pmod->uhat = uhat;
2384     free(pmod->yhat);
2385     pmod->yhat = yhat;
2386 
2387     /* NULLify stolen pointer */
2388     pan->pooled->yhat = NULL;
2389 
2390     return err;
2391 }
2392 
2393 static int
hausman_move_uhat(MODEL * pmod,panelmod_t * pan)2394 hausman_move_uhat (MODEL *pmod, panelmod_t *pan)
2395 {
2396     int n = pan->pooled->full_n;
2397     double *uhat;
2398     int i, s, t, ti;
2399 
2400     uhat = malloc(n * sizeof *uhat);
2401     if (uhat == NULL) {
2402 	return E_ALLOC;
2403     }
2404 
2405     for (t=0; t<n; t++) {
2406 	uhat[t] = NADBL;
2407     }
2408 
2409     s = 0;
2410     for (i=0; i<pan->nunits; i++) {
2411 	int Ti = pan->unit_obs[i];
2412 
2413 	if (Ti > 0) {
2414 	    for (ti=0; ti<Ti; ti++) {
2415 		t = big_index(pan, s);
2416 		uhat[t] = pmod->uhat[s];
2417 		s++;
2418 	    }
2419 	}
2420     }
2421 
2422     free(pmod->uhat);
2423     pmod->uhat = uhat;
2424 
2425     return 0;
2426 }
2427 
2428 #if PDEBUG > 1
2429 
verbose_femod_print(MODEL * femod,DATASET * wset,PRN * prn)2430 static void verbose_femod_print (MODEL *femod, DATASET *wset,
2431 				 PRN *prn)
2432 {
2433     pprintf(prn, "*** initial FE model (on within data)\n");
2434     printmodel(femod, wset, OPT_O, prn);
2435 
2436 # if PDEBUG > 2
2437     int i, j;
2438 
2439     fprintf(stderr, "femod: data series length = %d\n", wset->n);
2440     for (i=0; i<wset->n; i++) {
2441 	fprintf(stderr, "femod.uhat[%d] = %g, ", i, femod->uhat[i]);
2442 	fprintf(stderr, "data: ");
2443 	for (j=0; j<wset->v; j++) {
2444 	    fprintf(stderr, "%g ", wset->Z[j][i]);
2445 	}
2446 	fputc('\n', stderr);
2447     }
2448 # endif
2449 }
2450 
2451 #endif
2452 
2453 /* Estimate the fixed-effects model using a parallel dataset
2454    with the group means subtracted from all variables.
2455 */
2456 
2457 static MODEL
fixed_effects_model(panelmod_t * pan,DATASET * dset,PRN * prn)2458 fixed_effects_model (panelmod_t *pan, DATASET *dset, PRN *prn)
2459 {
2460     MODEL femod;
2461     gretlopt lsqopt = OPT_A | OPT_X;
2462     DATASET *wset = NULL;
2463     int *felist = NULL;
2464     int save_qr = 0;
2465     int i;
2466 
2467 #if PDEBUG
2468     fprintf(stderr, "fixed_effects: using de-meaned data\n");
2469 #endif
2470 
2471     gretl_model_init(&femod, dset);
2472 
2473     felist = gretl_list_new(pan->vlist[0]);
2474     if (felist == NULL) {
2475 	femod.errcode = E_ALLOC;
2476 	return femod;
2477     }
2478 
2479     wset = within_groups_dataset(dset, pan);
2480     if (wset == NULL) {
2481 	free(felist);
2482 	femod.errcode = E_ALLOC;
2483 	return femod;
2484     }
2485 
2486     felist[1] = 1;
2487     felist[2] = 0;
2488     for (i=3; i<=felist[0]; i++) {
2489 	felist[i] = i - 1;
2490     }
2491 
2492     if (pan->opt & OPT_F) {
2493 	/* suppress auto-removal of collinear terms */
2494 	lsqopt |= OPT_Z;
2495     }
2496 
2497     save_qr = libset_get_bool(USE_QR);
2498     libset_set_bool(USE_QR, 1);
2499 
2500     femod = lsq(felist, wset, OLS, lsqopt);
2501 
2502     libset_set_bool(USE_QR, save_qr);
2503 
2504     if (femod.errcode) {
2505 	fprintf(stderr, "femod.errcode = %d\n", femod.errcode);
2506     } else if ((pan->opt & OPT_F) && femod.list[0] < felist[0]) {
2507 	femod.errcode = E_SINGULAR;
2508     } else {
2509 	/* we estimated a bunch of group means, and have to
2510 	   subtract degrees of freedom */
2511 	fixed_effects_df_correction(&femod, pan->effn - 1);
2512 #if PDEBUG > 1
2513 	verbose_femod_print(&femod, wset, prn);
2514 #endif
2515 	if (pan->opt & OPT_F) {
2516 	    /* estimating the FE model in its own right */
2517 	    if (pan->opt & OPT_R) {
2518 		/* we have to do this before the pooled residual
2519 		   array is "stolen" for the fixed-effects model
2520 		*/
2521 		robust_fixed_effects_F(pan);
2522 	    }
2523 	    fix_panel_hatvars(&femod, pan, (const double **) dset->Z);
2524 	    if (pan->opt & OPT_R) {
2525 		panel_robust_vcv(&femod, pan, wset);
2526 	    } else {
2527 		femod_regular_vcv(&femod);
2528 	    }
2529 	} else if (pan->opt & OPT_N) {
2530 	    if (IGLS) {
2531 		read_true_variances(pan);
2532 	    } else {
2533 		femod.errcode = nerlove_s2v(&femod, dset, pan);
2534 	    }
2535 	}
2536     }
2537 
2538     destroy_dataset(wset);
2539     free(felist);
2540 
2541     return femod;
2542 }
2543 
2544 /* Construct a gretl list containing the index numbers of the
2545    cross-sectional units included in the fixed-effects
2546    regression. This is for the purpose of naming the per-unit
2547    intercepts.
2548 */
2549 
fe_units_list(const panelmod_t * pan)2550 static int *fe_units_list (const panelmod_t *pan)
2551 {
2552     int *ulist = NULL;
2553     int i, j, n = 0;
2554 
2555     for (i=0; i<pan->nunits; i++) {
2556 	if (pan->unit_obs[i] > 0) {
2557 	    n++;
2558 	}
2559     }
2560 
2561     ulist = gretl_list_new(n);
2562 
2563     if (ulist != NULL) {
2564 	j = 1;
2565 	for (i=0; i<pan->nunits; i++) {
2566 	    if (pan->unit_obs[i] > 0) {
2567 		ulist[j++] = i + 1;
2568 	    }
2569 	}
2570     }
2571 
2572     return ulist;
2573 }
2574 
2575 /* Compose a list referencing all variables that were dropped from the
2576    final panel model relative to the incoming regression
2577    specification.  This may include some variables that were dropped
2578    at the stage of running the baseline pooled model (presumably
2579    because of perfect collinearity).
2580 
2581    In the case of fixed effects it may include additional variables
2582    dropped due to the fact that they are time-invariant.
2583 */
2584 
compose_panel_droplist(MODEL * pmod,panelmod_t * pan)2585 static int compose_panel_droplist (MODEL *pmod, panelmod_t *pan)
2586 {
2587     int fixed_effects = (pmod->opt & OPT_F);
2588     const int *pooldrop;
2589     int *dlist;
2590     int i, ndrop = 0;
2591 
2592     /* regressors dropped at the stage of estimating the
2593        initial pooled model */
2594     pooldrop = gretl_model_get_list(pan->pooled, "droplist");
2595     if (pooldrop != NULL) {
2596 	ndrop += pooldrop[0];
2597     }
2598 
2599     if (fixed_effects) {
2600 	/* regressors dropped because time-invariant */
2601 	ndrop += pan->pooled->list[0] - pan->vlist[0];
2602     }
2603 
2604     if (ndrop == 0) {
2605 	/* nothing to be done */
2606 	return 0;
2607     }
2608 
2609     dlist = gretl_list_new(ndrop);
2610     if (dlist == NULL) {
2611 	return E_ALLOC;
2612     }
2613 
2614     i = 1;
2615 
2616     if (pooldrop != NULL) {
2617 	for (i=1; i<=pooldrop[0]; i++) {
2618 	    dlist[i] = pooldrop[i];
2619 	}
2620     }
2621 
2622     if (fixed_effects) {
2623 	if (pan->vlist[0] < pan->pooled->list[0]) {
2624 	    int j, vj;
2625 
2626 	    for (j=2; j<=pan->pooled->list[0]; j++) {
2627 		vj = pan->pooled->list[j];
2628 		if (!in_gretl_list(pan->vlist, vj)) {
2629 		    dlist[i++] = vj;
2630 		}
2631 	    }
2632 	}
2633     }
2634 
2635     return gretl_model_set_list_as_data(pmod, "droplist", dlist);
2636 }
2637 
fix_gls_stats(MODEL * pmod,panelmod_t * pan)2638 static void fix_gls_stats (MODEL *pmod, panelmod_t *pan)
2639 {
2640     double chisq = NADBL;
2641     int nc, df = 0;
2642 
2643     fix_panelmod_list(pmod, pan);
2644 
2645     pmod->ybar = pan->pooled->ybar;
2646     pmod->sdy = pan->pooled->sdy;
2647     pmod->tss = pan->pooled->tss;
2648 
2649     pmod->rsq = NADBL;
2650     pmod->adjrsq = NADBL;
2651     pmod->fstt = NADBL;
2652     pmod->rho = pan->rho;
2653     pmod->dw = pan->dw;
2654 
2655     /* add joint chi-square test on regressors */
2656 
2657     if (pan->ntdum > 0) {
2658 	df = pmod->ncoeff - 1 - pan->ntdum;
2659 	if (df > 0) {
2660 	    chisq = panel_overall_test(pmod, pan, pan->ntdum,
2661 				       OPT_X);
2662 	}
2663     } else {
2664 	df = pmod->ncoeff - 1;
2665 	if (df > 0) {
2666 	    chisq = panel_overall_test(pmod, pan, 0, OPT_X);
2667 	}
2668     }
2669 
2670     /* deferred rewriting of ess and sigma, etc. */
2671     pmod->ess = gretl_model_get_double(pmod, "fixed_SSR");
2672     pmod->sigma = sqrt(pmod->ess / (pmod->nobs - (pmod->ncoeff - 1)));
2673     gretl_model_destroy_data_item(pmod, "fixed_SSR");
2674     nc = pmod->ncoeff;
2675     pmod->ncoeff = pmod->dfn + 1;
2676     ls_criteria(pmod);
2677     pmod->ncoeff = nc;
2678 
2679     if (!na(chisq) && chisq >= 0.0) {
2680 	ModelTest *test = model_test_new(GRETL_TEST_RE_WALD);
2681 
2682 	if (test != NULL) {
2683 	    model_test_set_teststat(test, GRETL_STAT_WALD_CHISQ);
2684 	    model_test_set_dfn(test, df);
2685 	    model_test_set_value(test, chisq);
2686 	    model_test_set_pvalue(test, chisq_cdf_comp(df, chisq));
2687 	    maybe_add_test_to_model(pmod, test);
2688 	}
2689     }
2690 }
2691 
add_panel_obs_info(MODEL * pmod,panelmod_t * pan)2692 static void add_panel_obs_info (MODEL *pmod, panelmod_t *pan)
2693 {
2694     gretl_model_set_int(pmod, "n_included_units", pan->effn);
2695     gretl_model_set_int(pmod, "panel_T", pan->T);
2696     gretl_model_set_int(pmod, "Tmin", pan->Tmin);
2697     gretl_model_set_int(pmod, "Tmax", pan->Tmax);
2698     if (pan->Tmax > pan->Tmin && pan->Tbar == 0) {
2699 	calculate_Tbar(pan);
2700     }
2701     if (pan->Tbar > 0) {
2702 	gretl_model_set_double(pmod, "Tbar", pan->Tbar);
2703     }
2704 }
2705 
replace_re_residuals(MODEL * pmod,panelmod_t * pan)2706 static void replace_re_residuals (MODEL *pmod, panelmod_t *pan)
2707 {
2708     int t;
2709 
2710     /* replace the GLS residuals with the "fixed up" ones */
2711 
2712     for (t=0; t<pmod->full_n; t++) {
2713 	pmod->uhat[t] = pan->re_uhat[t];
2714     }
2715 }
2716 
2717 /* We use this to "finalize" models estimated via fixed effects
2718    and random effects */
2719 
save_panel_model(MODEL * pmod,panelmod_t * pan,const double ** Z,const DATASET * dset)2720 static int save_panel_model (MODEL *pmod, panelmod_t *pan,
2721 			     const double **Z,
2722 			     const DATASET *dset)
2723 {
2724     int err = 0;
2725 
2726     pmod->ci = PANEL;
2727 
2728     if (pan->opt & OPT_F) {
2729 	/* fixed effects */
2730 	int *ulist;
2731 
2732 	pmod->opt |= OPT_F;
2733 	err = fix_within_stats(pmod, pan);
2734 	if (err) {
2735 	    return err;
2736 	}
2737 	ulist = fe_units_list(pan);
2738 	gretl_model_add_panel_varnames(pmod, dset, ulist);
2739 	free(ulist);
2740 	panel_model_add_ahat(pmod, dset, pan);
2741 	save_fixed_effects_F(pan, pmod);
2742     } else {
2743 	/* random effects */
2744 	pmod->opt |= OPT_U;
2745 	gretl_model_set_double(pmod, "s2v", pan->s2v);
2746 	gretl_model_set_double(pmod, "s2e", pan->s2e);
2747 	if (pan->balanced) {
2748 	    gretl_model_set_double(pmod, "theta", pan->theta);
2749 	} else if (!na(pan->theta_bar)) {
2750 	    pan->theta_bar /= pan->effn;
2751 	    gretl_model_set_double(pmod, "theta_bar", pan->theta_bar);
2752 	}
2753 	gretl_model_add_panel_varnames(pmod, dset, NULL);
2754 	if (pan->re_uhat != NULL) {
2755 	    replace_re_residuals(pmod, pan);
2756 	}
2757 	fix_gls_stats(pmod, pan);
2758 	panel_model_add_ahat(pmod, dset, pan);
2759 	if (pan->opt & OPT_N) {
2760 	    /* record use of Nerlove transformation */
2761 	    pmod->opt |= OPT_N;
2762 	}
2763 	if ((pan->opt & OPT_X) && !IGLS) {
2764 	    /* record use of special unbalanced ANOVA */
2765 	    pmod->opt |= OPT_X;
2766 	    if (!(pan->opt & OPT_N)) {
2767 		const char *meth = stata_sa ? "stata" : "bc";
2768 
2769 		gretl_model_set_string_as_data(pmod, "anova_method",
2770 					       gretl_strdup(meth));
2771 	    }
2772 	}
2773     }
2774 
2775     /* compose list of dropped variables, if any */
2776     compose_panel_droplist(pmod, pan);
2777 
2778     if (!(pan->opt & OPT_A)) {
2779 	set_model_id(pmod, pan->opt);
2780     }
2781 
2782     if (pan->opt & OPT_F) {
2783 	panel_dwstat(pmod, pan);
2784     }
2785 
2786     if (pan->opt & OPT_R) {
2787 	pmod->opt |= OPT_R;
2788     }
2789 
2790     time_dummies_wald_test(pan, pmod);
2791     add_panel_obs_info(pmod, pan);
2792     *pan->realmod = *pmod;
2793 
2794     return err;
2795 }
2796 
2797 /* drive the calculation of the fixed effects regression, print the
2798    results (if wanted), and compute the "within" error variance */
2799 
within_variance(panelmod_t * pan,DATASET * dset,PRN * prn)2800 static int within_variance (panelmod_t *pan,
2801 			    DATASET *dset,
2802 			    PRN *prn)
2803 {
2804     MODEL femod;
2805     int i, err = 0;
2806 
2807     femod = fixed_effects_model(pan, dset, prn);
2808 
2809     if (femod.errcode) {
2810 	pputs(prn, _("Error estimating fixed effects model\n"));
2811 	errmsg(femod.errcode, prn);
2812 	err = femod.errcode;
2813 	clear_model(&femod);
2814     } else {
2815 	int den;
2816 
2817 	if (pan->opt & OPT_N) {
2818 	    /* Nerlove */
2819 	    den = femod.nobs;
2820 	} else {
2821 	    /* as per Greene: nT - n - K */
2822 	    den = femod.nobs - pan->effn - (pan->vlist[0] - 2);
2823 	}
2824 
2825 	if (den == 0) {
2826 	    gretl_errmsg_set(_("Inadequate data for panel estimation"));
2827 	    return E_DF;
2828 	} else {
2829 	    pan->s2e = femod.ess / den;
2830 	}
2831 
2832 #if PDEBUG
2833 	fprintf(stderr, "nT = %d, n = %d, K = %d\n", femod.nobs,
2834 		pan->effn, pan->vlist[0] - 2);
2835 	fprintf(stderr, "pan->s2e = %g / %d = %g\n", femod.ess,
2836 		den, pan->s2e);
2837 	fprintf(stderr, "sqrt(pan->s2e) = %g\n", sqrt(pan->s2e));
2838 #endif
2839 
2840 	if (!(pan->opt & OPT_R)) {
2841 	    regular_fixed_effects_F(pan, &femod);
2842 	}
2843 
2844 	if (pan->opt & OPT_V) {
2845 	    print_fe_results(pan, &femod, dset, prn);
2846 	}
2847 
2848 	if (pan->bdiff != NULL && pan->Sigma != NULL) {
2849 	    for (i=1; i<femod.ncoeff; i++) {
2850 		pan->bdiff->val[i-1] = femod.coeff[i];
2851 	    }
2852 	    femod_regular_vcv(&femod);
2853 	    vcv_slopes(pan, &femod, VCV_INIT);
2854 	}
2855 
2856 	if (pan->opt & OPT_F) {
2857 	    err = save_panel_model(&femod, pan, (const double **) dset->Z,
2858 				   dset);
2859 	    if (err) {
2860 		clear_model(&femod);
2861 	    }
2862 	} else {
2863 	    if (pan->opt & OPT_U) {
2864 		if (expand_fe_uhat(&femod, pan) == 0) {
2865 		    panel_dwstat(&femod, pan);
2866 		}
2867 	    }
2868 	    clear_model(&femod);
2869 	}
2870     }
2871 
2872     return err;
2873 }
2874 
print_hausman_result(panelmod_t * pan,PRN * prn)2875 static void print_hausman_result (panelmod_t *pan, PRN *prn)
2876 {
2877     if (na(pan->H)) {
2878 	pputs(prn, _("Hausman test matrix is not positive definite!\n"));
2879     } else {
2880 	pprintf(prn, _("Hausman test statistic:\n"
2881 		       " H = %g with p-value = prob(chi-square(%d) > %g) = %g\n"),
2882 		pan->H, pan->dfH, pan->H, chisq_cdf_comp(pan->dfH, pan->H));
2883 	pputs(prn, _("(A low p-value counts against the null hypothesis that "
2884 		     "the random effects\nmodel is consistent, in favor of the fixed "
2885 		     "effects model.)\n"));
2886     }
2887 }
2888 
save_hausman_result(panelmod_t * pan)2889 static void save_hausman_result (panelmod_t *pan)
2890 {
2891     ModelTest *test;
2892 
2893     if (pan->realmod == NULL || na(pan->H) || pan->dfH == 0) {
2894 	return;
2895     }
2896 
2897     test = model_test_new(GRETL_TEST_PANEL_HAUSMAN);
2898 
2899     if (test != NULL) {
2900 	model_test_set_teststat(test, GRETL_STAT_WALD_CHISQ);
2901 	model_test_set_dfn(test, pan->dfH);
2902 	model_test_set_value(test, pan->H);
2903 	if (na(pan->H)) {
2904 	    model_test_set_pvalue(test, NADBL);
2905 	} else {
2906 	    model_test_set_pvalue(test, chisq_cdf_comp(pan->dfH, pan->H));
2907 	}
2908 	maybe_add_test_to_model(pan->realmod, test);
2909     }
2910 }
2911 
2912 /* Handle the case where collinear terms were dropped
2913    when doing the Hausman test via the regression
2914    method in robust mode.
2915 */
2916 
robust_hausman_fixup(const int * hlist,MODEL * pmod)2917 static double robust_hausman_fixup (const int *hlist,
2918 				    MODEL *pmod)
2919 {
2920     int *wlist = gretl_list_copy(hlist);
2921     double H = NADBL;
2922 
2923     if (wlist != NULL) {
2924 	int i;
2925 
2926 	for (i=wlist[0]; i>0; i--) {
2927 	    if (!in_gretl_list(pmod->list, wlist[i])) {
2928 		gretl_list_delete_at_pos(wlist, i);
2929 	    }
2930 	}
2931 	H = wald_omit_chisq(wlist, pmod);
2932 	free(wlist);
2933     }
2934 
2935     return H;
2936 }
2937 
2938 /* Estimate the augmented GLS model for the Hausman test;
2939    return either the error sum of squares or, in the
2940    robust case, a Wald chi-square statistic.
2941 */
2942 
hausman_regression_result(panelmod_t * pan,const int * relist,const int * hlist,DATASET * rset,PRN * prn)2943 static double hausman_regression_result (panelmod_t *pan,
2944 					 const int *relist,
2945 					 const int *hlist,
2946 					 DATASET *rset,
2947 					 PRN *prn)
2948 {
2949     double ret = NADBL;
2950     int *biglist = NULL;
2951     int err = 0;
2952 
2953     biglist = gretl_list_add(relist, hlist, &err);
2954 
2955     if (biglist != NULL) {
2956 	MODEL hmod;
2957 
2958 	gretl_model_init(&hmod, NULL);
2959 	hmod = lsq(biglist, rset, OLS, OPT_A);
2960 #if PDEBUG > 1
2961 	pputs(prn, "Hausman test regression\n");
2962 	printmodel(&hmod, rset, OPT_NONE, prn);
2963 #endif
2964 	if (hmod.errcode == 0) {
2965 	    /* Find the number of additional regressors actually used,
2966 	       relative to @relist, allowing for the possibility
2967 	       that one or more elements of @biglist were dropped
2968 	       in estimation of @hmod due to excessive collinearity.
2969 	    */
2970 	    pan->dfH = hmod.list[0] - relist[0];
2971 	    if (pan->dfH > 0) {
2972 		if (pan->opt & OPT_R) {
2973 		    /* do robust Wald test */
2974 		    if (hmod.full_n < pan->pooled->full_n) {
2975 			hausman_move_uhat(&hmod, pan);
2976 		    }
2977 		    panel_robust_vcv(&hmod, pan, rset); /* FIXME? */
2978 		    if (hmod.vcv != NULL) {
2979 			if (pan->dfH < hlist[0]) {
2980 			    ret = robust_hausman_fixup(hlist, &hmod);
2981 			} else {
2982 			    ret = wald_omit_chisq(hlist, &hmod);
2983 			}
2984 		    }
2985 		} else {
2986 		    /* just record the unrestricted SSR */
2987 		    ret = hmod.ess;
2988 		}
2989 	    }
2990 	}
2991 	clear_model(&hmod);
2992     }
2993 
2994     free(biglist);
2995 
2996     return ret;
2997 }
2998 
2999 /* Computation of s2_v in the manner of Swamy and Arora, for an
3000    unbalanced panel. See Baltagi, Econometric Analysis of Panel
3001    Data, 3e, section 9.2.1. */
3002 
unbalanced_SA_s2v(panelmod_t * pan,DATASET * dset)3003 static int unbalanced_SA_s2v (panelmod_t *pan,
3004 			      DATASET *dset)
3005 {
3006     gretl_matrix_block *B;
3007     gretl_matrix *ZmZ = NULL;
3008     gretl_matrix *PZ = NULL;
3009     gretl_matrix *Z = NULL;
3010     gretl_matrix *ZPZ = NULL;
3011     gretl_matrix *D2 = NULL;
3012     gretl_matrix *trmat = NULL;
3013     int k = pan->pooled->ncoeff;
3014     double zjt, z, tr;
3015     int i, j, s, t, p;
3016     int bigt, got;
3017     int err = 0;
3018 
3019     B = gretl_matrix_block_new(&ZmZ, pan->effn, k,
3020 			       &PZ,  pan->NT, k,
3021 			       &Z,   pan->NT, k,
3022 			       &ZPZ, k, k,
3023 			       &D2,  k, k, NULL);
3024 
3025     if (B == NULL) {
3026 	return E_ALLOC;
3027     }
3028 
3029     s = p = 0;
3030     for (i=0; i<pan->nunits; i++) {
3031 	int Ti = pan->unit_obs[i];
3032 
3033 	if (Ti == 0) {
3034 	    continue;
3035 	}
3036 
3037 	for (j=0; j<k; j++) {
3038 	    zjt = 0.0;
3039 	    got = 0;
3040 	    /* cumulate sum of observations for unit */
3041 	    for (t=0; t<pan->T && got<Ti; t++) {
3042 		bigt = panel_index(i, t);
3043 		if (!panel_missing(pan, bigt)) {
3044 		    z = dset->Z[pan->pooled->list[j+2]][bigt];
3045 		    zjt += z;
3046 		    gretl_matrix_set(Z, p + got, j, z);
3047 		    got++;
3048 		}
3049 	    }
3050 	    gretl_matrix_set(ZmZ, s, j, zjt);
3051 	    for (t=0; t<Ti; t++) {
3052 		gretl_matrix_set(PZ, p + t, j, zjt / Ti);
3053 	    }
3054 	}
3055 	s++;
3056 	p += Ti;
3057     }
3058 
3059     gretl_matrix_multiply_mod(Z, GRETL_MOD_TRANSPOSE,
3060 			      PZ, GRETL_MOD_NONE,
3061 			      ZPZ, GRETL_MOD_NONE);
3062     gretl_invert_symmetric_matrix(ZPZ);
3063 
3064     gretl_matrix_multiply_mod(ZmZ, GRETL_MOD_TRANSPOSE,
3065 			      ZmZ, GRETL_MOD_NONE,
3066 			      D2, GRETL_MOD_NONE);
3067 
3068     trmat = gretl_matrix_reuse(PZ, k, k);
3069     gretl_matrix_multiply(ZPZ, D2, trmat);
3070     tr = gretl_matrix_trace(trmat);
3071 
3072 #if 0
3073     gretl_matrix_print(trmat, "trmat");
3074     fprintf(stderr, "S-A: effn=%d, k=%d, NT=%d, tr=%g\n", pan->effn,
3075 	    k, pan->NT, tr);
3076 #endif
3077 
3078     pan->s2v = (pan->ubPub - (pan->effn - k) * pan->s2e) / (pan->NT - tr);
3079 
3080 #if PDEBUG
3081     fprintf(stderr, "S-A: ubPub=%#.8g, tr=%#.8g, s2v=%#.8g, sv=%#.8g\n",
3082 	    pan->ubPub, tr, pan->s2v, sqrt(pan->s2v));
3083 #endif
3084 
3085     gretl_matrix_block_destroy(B);
3086 
3087     return err;
3088 }
3089 
3090 /* Calculate the random effects regression.  Print the results
3091    here if we're doing the "panel diagnostics" test, otherwise
3092    save the results.
3093 */
3094 
random_effects(panelmod_t * pan,DATASET * dset,DATASET * gset,PRN * prn)3095 static int random_effects (panelmod_t *pan,
3096 			   DATASET *dset,
3097 			   DATASET *gset,
3098 			   PRN *prn)
3099 {
3100     DATASET *rset;
3101     MODEL remod;
3102     gretlopt lsqopt = OPT_A | OPT_Z;
3103     int *relist = NULL;
3104     int *hlist = NULL;
3105     double hres = NADBL;
3106     int i, err = 0;
3107 
3108     gretl_model_init(&remod, dset);
3109 
3110     /* FGLS regression list */
3111     relist = gretl_list_new(pan->pooled->list[0]);
3112     if (relist == NULL) {
3113 	return E_ALLOC;
3114     }
3115 
3116     if (!(pan->opt & OPT_M)) {
3117 	/* extra regressors for Hausman test, regression approach */
3118 	hlist = gretl_list_new(pan->vlist[0] - 1);
3119 	if (hlist == NULL) {
3120 	    free(relist);
3121 	    return E_ALLOC;
3122 	}
3123     }
3124 
3125     /* If OPT_N (--nerlove) was given, we've already calculated
3126        pan->s2v as the variance of the fixed effects. Otherwise
3127        we're using the Swamy and Arora method, and pan->s2v still
3128        needs to be computed.
3129 
3130        Note: for unbalanced panels, theta will vary across the
3131        units in the final calculation.
3132     */
3133     if (!(pan->opt & OPT_N)) {
3134 	if (IGLS) {
3135 	    /* get user-specified values */
3136 	    err = read_true_variances(pan);
3137 	} else if (!pan->balanced && !na(pan->ubPub)) {
3138 	    err = unbalanced_SA_s2v(pan, dset);
3139 	} else {
3140 	    pan->s2v = pan->s2b - pan->s2e / pan->Tbar;
3141 #if PDEBUG
3142 	    fprintf(stderr, "Swamy-Arora: initial s2v = %g - %g = %g\n",
3143 		    pan->s2b, pan->s2e / pan->Tbar, pan->s2v);
3144 #endif
3145 	}
3146 	if (pan->s2v < 0) {
3147 	    pan->s2v = 0.0;
3148 	}
3149     }
3150 
3151     /* theta, the quasi-demeaning coefficient */
3152     pan->theta = 1.0 - sqrt(pan->s2e / (pan->s2e + pan->Tmax * pan->s2v));
3153 
3154 #if PDEBUG
3155     fprintf(stderr, "pan->s2v = %.8g, pan->s2e = %.8g\n", pan->s2v, pan->s2e);
3156     fprintf(stderr, "random_effects theta = %g\n", pan->theta);
3157 #endif
3158 
3159     /* make special transformed dataset, and regression list */
3160     rset = random_effects_dataset(dset, gset, relist, hlist, pan);
3161     if (rset == NULL) {
3162 	free(relist);
3163 	free(hlist);
3164 	return E_ALLOC;
3165     }
3166 
3167     if (hlist != NULL) {
3168 	hres = hausman_regression_result(pan, relist, hlist, rset, prn);
3169     }
3170 
3171     /* regular random-effects model */
3172 #if PDEBUG
3173     fprintf(stderr, "estimate regular GLS model\n");
3174 #endif
3175     remod = lsq(relist, rset, OLS, lsqopt);
3176 
3177     if ((err = remod.errcode)) {
3178 	pputs(prn, _("Error estimating random effects model\n"));
3179 	errmsg(err, prn);
3180     } else {
3181 #if PDEBUG > 1
3182 	for (i=0; i<20 && i<remod.nobs; i++) {
3183 	    fprintf(stderr, "remod uhat[%d] = %g\n", i, remod.uhat[i]);
3184 	}
3185 #endif
3186 	if (pan->bdiff != NULL) {
3187 	    /* matrix-diff Hausman variant, if wanted */
3188 	    int vi, k = 0;
3189 
3190 	    for (i=0; i<remod.ncoeff; i++) {
3191 		vi = pan->pooled->list[i+2];
3192 		if (var_is_varying(pan, vi)) {
3193 		    pan->bdiff->val[k++] -= remod.coeff[i];
3194 		}
3195 	    }
3196 	}
3197 
3198 	/* note: this call moved to here 2017-10-18 so we
3199 	   get computation of robust standard errors right
3200 	*/
3201 	fix_panel_hatvars(&remod, pan, (const double **) dset->Z);
3202 
3203 	if (pan->opt & OPT_R) {
3204 	    panel_robust_vcv(&remod, pan, rset);
3205 	} else {
3206 	    double sigma = remod.sigma; /* or... ? */
3207 #if PDEBUG
3208 	    fprintf(stderr, "GLS sigma = %g\n", sigma);
3209 	    fprintf(stderr, "GLS SSR   = %.7g\n", remod.ess);
3210 #endif
3211 	    makevcv(&remod, sigma);
3212 	}
3213 
3214 	if (pan->opt & OPT_V) {
3215 	    print_re_results(pan, &remod, dset, prn);
3216 	}
3217 
3218 	if (!na(hres)) {
3219 	    if (pan->opt & OPT_R) {
3220 		/* @hres is already the (robust) Hausman test statistic */
3221 		pan->H = hres;
3222 	    } else {
3223 		/* @hres is the unrestricted ess */
3224 		pan->H = (remod.ess / hres - 1.0) * (remod.nobs);
3225 	    }
3226 	} else if (pan->Sigma != NULL) {
3227 	    vcv_slopes(pan, &remod, VCV_SUBTRACT);
3228 	}
3229     }
3230 
3231 #if PDEBUG > 1
3232     if (remod.errcode == 0) {
3233 	printmodel(&remod, rset, OPT_NONE, prn);
3234     }
3235 #endif
3236 
3237     if (!err && (pan->opt & OPT_U)) {
3238 	save_panel_model(&remod, pan, (const double **) dset->Z, rset);
3239     } else {
3240 	clear_model(&remod);
3241     }
3242 
3243     destroy_dataset(rset);
3244 
3245     free(relist);
3246     free(hlist);
3247 
3248     return err;
3249 }
3250 
save_breusch_pagan_result(panelmod_t * pan)3251 static void save_breusch_pagan_result (panelmod_t *pan)
3252 {
3253     ModelTest *test;
3254 
3255     if (pan->realmod == NULL || na(pan->BP)) {
3256 	return;
3257     }
3258 
3259     test = model_test_new(GRETL_TEST_PANEL_BP);
3260 
3261     if (test != NULL) {
3262 	model_test_set_teststat(test, GRETL_STAT_WALD_CHISQ);
3263 	model_test_set_dfn(test, 1);
3264 	model_test_set_value(test, pan->BP);
3265 	model_test_set_pvalue(test, chisq_cdf_comp(1, pan->BP));
3266 	maybe_add_test_to_model(pan->realmod, test);
3267     }
3268 }
3269 
3270 /* do the panel Breusch-Pagan test and either print or save
3271    the results */
3272 
breusch_pagan_LM(panelmod_t * pan,PRN * prn)3273 static int breusch_pagan_LM (panelmod_t *pan, PRN *prn)
3274 {
3275     double A = 0.0;
3276     int n = pan->pooled->nobs;
3277     int print_means = 0;
3278     int i, t, M = 0;
3279 
3280     if ((pan->opt & OPT_V) && pan->effn <= 10) {
3281 	print_means = 1;
3282     }
3283 
3284     if (print_means) {
3285 	pputs(prn, _("Means of pooled OLS residuals for cross-sectional units:"));
3286 	pputs(prn, "\n\n");
3287     }
3288 
3289     for (i=0; i<pan->nunits; i++) {
3290 	int Ti = pan->unit_obs[i];
3291 
3292 	if (Ti > 0) {
3293 	    double u, usum = 0.0;
3294 
3295 	    for (t=0; t<pan->T; t++) {
3296 		u = pan->pooled->uhat[panel_index(i, t)];
3297 		if (!na(u)) {
3298 		    usum += u;
3299 		}
3300 	    }
3301 	    if (print_means) {
3302 		pprintf(prn, _(" unit %2d: %13.5g\n"), i + 1,
3303 			usum / (double) Ti);
3304 	    }
3305 	    A += usum * usum;
3306 	    M += Ti * Ti;
3307 	}
3308     }
3309 
3310     pan->BP = ((double) n * n /(2.0 * (M - n))) * pow((A / pan->pooled->ess) - 1.0, 2);
3311 
3312     if (pan->opt & OPT_V) {
3313 	pputc(prn, '\n');
3314 	pprintf(prn, _("Breusch-Pagan test statistic:\n"
3315 		       " LM = %g with p-value = prob(chi-square(1) > %g) = %g\n"),
3316 		pan->BP, pan->BP, chisq_cdf_comp(1, pan->BP));
3317 
3318 	pputs(prn, _("(A low p-value counts against the null hypothesis that "
3319 		     "the pooled OLS model\nis adequate, in favor of the random "
3320 		     "effects alternative.)\n\n"));
3321     }
3322 
3323     return 0;
3324 }
3325 
save_panspec_result(panelmod_t * pan,PRN * prn)3326 static void save_panspec_result (panelmod_t *pan, PRN *prn)
3327 {
3328     gretl_matrix *tests = gretl_column_vector_alloc(3);
3329     gretl_matrix *pvals = gretl_column_vector_alloc(3);
3330     char **S = strings_array_new(3);
3331 
3332     /* fixed-effects poolability F-test */
3333     tests->val[0] = pan->Ffe;
3334     pvals->val[0] = snedecor_cdf_comp(pan->Fdfn, pan->Fdfd, pan->Ffe);
3335     /* random-effects poolability test */
3336     tests->val[1] = pan->BP;
3337     pvals->val[1] = chisq_cdf_comp(1, pan->BP);
3338     /* Hausman test */
3339     tests->val[2] = pan->H;
3340     pvals->val[2] = chisq_cdf_comp(pan->dfH, pan->H);
3341 
3342     if (S != NULL) {
3343 	/* add row names */
3344 	char **S2;
3345 
3346 	S[0] = gretl_strdup("Poolability (Wald)");
3347 	S[1] = gretl_strdup("Poolability (B-P)");
3348 	S[2] = gretl_strdup("Hausman");
3349 	S2 = strings_array_dup(S, 3);
3350 	gretl_matrix_set_rownames(tests, S);
3351 	if (S2 != NULL) {
3352 	    gretl_matrix_set_rownames(pvals, S2);
3353 	}
3354     }
3355 
3356     if (pan->opt & OPT_V) {
3357 	print_hausman_result(pan, prn);
3358     }
3359     record_matrix_test_result(tests, pvals);
3360 }
3361 
finalize_hausman_test(panelmod_t * pan,int ci,PRN * prn)3362 static int finalize_hausman_test (panelmod_t *pan, int ci, PRN *prn)
3363 {
3364     int mdiff = 0, err = 0;
3365 
3366     if (pan->bdiff != NULL && pan->Sigma != NULL) {
3367 	/* matrix approach */
3368 	mdiff = 1;
3369 	err = bXb(pan);
3370     } else if (na(pan->H)) {
3371 	/* regression approach bombed somehow? */
3372 	err = E_DATA;
3373     }
3374 
3375     if (ci == PANSPEC) {
3376 	/* the context is the "panspec" command */
3377 	if (!err || (mdiff && err == E_NOTPD)) {
3378 	    save_panspec_result(pan, prn);
3379 	}
3380     } else {
3381 	/* the context is random-effects estimation */
3382 	if (mdiff && err == E_NOTPD) {
3383 	    pputs(prn, _("Hausman test matrix is not positive definite"));
3384 	    pputc(prn, '\n');
3385 	}
3386 	save_hausman_result(pan);
3387     }
3388 
3389     return err;
3390 }
3391 
3392 /* Based on the residuals from pooled OLS, do some accounting to see
3393    (a) how many cross-sectional units were actually included (after
3394    omitting any missing values); (b) how many time-series observations
3395    were included for each unit; and (c) what was the maximum number
3396    of time-series observations used.  Return 0 if all goes OK,
3397    non-zero otherwise.
3398 */
3399 
panel_obs_accounts(panelmod_t * pan)3400 static int panel_obs_accounts (panelmod_t *pan)
3401 {
3402     int *uobs;
3403     int i, t, bigt;
3404 
3405     uobs = malloc(pan->nunits * sizeof *uobs);
3406     if (uobs == NULL) {
3407 	return E_ALLOC;
3408     }
3409 
3410     pan->NT = 0;
3411     pan->effn = 0;
3412     pan->Tmax = 0;
3413     pan->Tmin = pan->T;
3414 
3415     for (i=0; i<pan->nunits; i++) {
3416 	uobs[i] = 0;
3417 	for (t=0; t<pan->T; t++) {
3418 	    bigt = panel_index(i, t);
3419 #if PDEBUG > 2
3420 	    fprintf(stderr, "unit %d, bigt=%d, pmod->uhat[%d]: %s\n", i, t,
3421 		    bigt, (panel_missing(pan, bigt))? "NA" : "OK");
3422 #endif
3423 	    if (!panel_missing(pan, bigt)) {
3424 		uobs[i] += 1;
3425 	    }
3426 	}
3427 	if (uobs[i] > 0) {
3428 	    pan->effn += 1;
3429 	    if (uobs[i] > pan->Tmax) {
3430 		pan->Tmax = uobs[i];
3431 	    }
3432 	    if (uobs[i] < pan->Tmin) {
3433 		pan->Tmin = uobs[i];
3434 	    }
3435 	    pan->NT += uobs[i];
3436 	}
3437     }
3438 
3439     for (i=0; i<pan->nunits; i++) {
3440 	if (uobs[i] > 0 && uobs[i] != pan->Tmax) {
3441 	    pan->balanced = 0;
3442 	    break;
3443 	}
3444     }
3445 
3446     pan->unit_obs = uobs;
3447 
3448     return 0;
3449 }
3450 
3451 /* Construct an array of char to record which parameters in the full
3452    model correspond to time-varying regressors
3453 */
3454 
panel_set_varying(panelmod_t * pan,const MODEL * pmod)3455 static int panel_set_varying (panelmod_t *pan, const MODEL *pmod)
3456 {
3457     int i;
3458 
3459     pan->varying = calloc(pmod->ncoeff, 1);
3460     if (pan->varying == NULL) {
3461 	return E_ALLOC;
3462     }
3463 
3464     for (i=0; i<pmod->ncoeff; i++) {
3465 	if (var_is_varying(pan, pmod->list[i+2])) {
3466 	    pan->varying[i] = 1;
3467 	}
3468     }
3469 
3470     return 0;
3471 }
3472 
3473 /* find harmonic mean of the number of time-series observations
3474    per included group */
3475 
calculate_Tbar(panelmod_t * pan)3476 static void calculate_Tbar (panelmod_t *pan)
3477 {
3478     int i;
3479 
3480     if (pan->balanced) {
3481 	pan->Tbar = pan->Tmax;
3482     } else {
3483 	double den = 0.0;
3484 
3485 	for (i=0; i<pan->nunits; i++) {
3486 	    if (pan->unit_obs[i] > 0) {
3487 		den += 1.0 / pan->unit_obs[i];
3488 	    }
3489 	}
3490 	pan->Tbar = pan->effn / den;
3491     }
3492 }
3493 
3494 static int
panelmod_setup(panelmod_t * pan,MODEL * pmod,const DATASET * dset,int ntdum,gretlopt opt)3495 panelmod_setup (panelmod_t *pan, MODEL *pmod, const DATASET *dset,
3496 		int ntdum, gretlopt opt)
3497 {
3498     int nobs, dsetT = sample_size(dset);
3499     int err = 0;
3500 
3501     pan->opt = opt;
3502     pan->pooled = pmod;
3503 
3504     /* assumes (possibly padded) balanced panel dataset */
3505     pan->nunits = dsetT / dset->pd;
3506     pan->T = dset->pd;
3507     nobs = pan->nunits * pan->T;
3508 
3509     if (nobs < dsetT) {
3510 	fprintf(stderr, "dataset nobs = %d, but panel nobs = %d\n",
3511 		dsetT, nobs);
3512     }
3513 
3514     panel_index_init(dset, pan->nunits, pan->T);
3515     pan->ntdum = ntdum;
3516 
3517     err = panel_obs_accounts(pan);
3518 
3519     if (!err && (pan->opt & (OPT_U | OPT_F | OPT_B))) {
3520 	pan->realmod = malloc(sizeof *pan->realmod);
3521 	if (pan->realmod == NULL) {
3522 	    err = E_ALLOC;
3523 	}
3524     }
3525 
3526     IGLS = stata_sa = 0;
3527 
3528     if (!err && (opt & OPT_X)) {
3529 	if (!(pan->opt & OPT_U)) {
3530 	    /* --unbalanced --random-effects */
3531 	    err = E_BADOPT;
3532 	} else {
3533 	    /* probe optional argument to --unbalanced */
3534 	    const char *s = get_optval_string(PANEL, OPT_X);
3535 
3536 	    if (s != NULL) {
3537 		if (!strcmp(s, "stata")) {
3538 		    stata_sa = 1;
3539 		} else if (!strcmp(s, "bc")) {
3540 		    stata_sa = 0;
3541 		} else if (has_suffix(s, ".mat")) {
3542 		    /* hidden testing option */
3543 		    strcpy(glsmat, s);
3544 		    IGLS = 1;
3545 		} else {
3546 		    gretl_errmsg_sprintf(_("%s: invalid option argument"), s);
3547 		    err = E_INVARG;
3548 		}
3549 	    }
3550 	}
3551     }
3552 
3553     if (err && pan->unit_obs != NULL) {
3554 	free(pan->unit_obs);
3555 	pan->unit_obs = NULL;
3556     }
3557 
3558     return err;
3559 }
3560 
3561 /* Called in relation to a model estimated by pooled OLS: test for
3562    both fixed and random effects. Implements the "panspec" command.
3563 */
3564 
panel_diagnostics(MODEL * pmod,DATASET * dset,gretlopt opt,PRN * prn)3565 int panel_diagnostics (MODEL *pmod, DATASET *dset,
3566 		       gretlopt opt, PRN *prn)
3567 {
3568     int nerlove = 0;
3569     int quiet = (opt & OPT_Q);
3570     gretlopt psopt = opt;
3571     panelmod_t pan;
3572     int xdf, err = 0;
3573 
3574 #if PDEBUG
3575     fputs("\n*** Starting panel_diagnostics ***\n", stderr);
3576     fprintf(stderr, " OPT_M? %s\n", (opt & OPT_M)? "yes" : "no");
3577 #endif
3578 
3579     if (pmod->ifc == 0) {
3580 	/* at many points we assume the base regression has an
3581 	   intercept included */
3582 	return E_NOCONST;
3583     }
3584 
3585 #if PDEBUG /* not really ready yet! */
3586     if (pmod->opt & OPT_R) {
3587 	/* forward the robust option */
3588 	opt |= OPT_R;
3589     }
3590 #endif
3591 
3592     if (opt & OPT_N) {
3593 	nerlove = 1;
3594     }
3595 
3596     /* Add OPT_V to make the fixed and random effects functions verbose,
3597        unless we've been passed the --quiet option.
3598     */
3599     if (!quiet) {
3600 	psopt |= OPT_V;
3601     }
3602     panelmod_init(&pan);
3603     err = panelmod_setup(&pan, pmod, dset, 0, psopt);
3604     if (err) {
3605 	goto bailout;
3606     }
3607 
3608     if (pan.effn < pan.nunits) {
3609 	fprintf(stderr, "number of units included = %d\n", pan.effn);
3610 	if (pan.effn <= 0) {
3611 	    return E_DATA;
3612 	}
3613     }
3614 
3615     /* figure out which of the original regressors are time-varying */
3616     err = varying_vars_list(dset, &pan);
3617     if (err) {
3618 	goto bailout;
3619     }
3620 
3621     err = panel_set_varying(&pan, pmod);
3622     if (err) {
3623 	goto bailout;
3624     }
3625 
3626     calculate_Tbar(&pan);
3627 
3628     /* degrees of freedom relative to # of x-sectional units */
3629     xdf = pan.effn - pmod->ncoeff;
3630 
3631 #if PDEBUG
3632     fprintf(stderr, "nunits=%d, T=%d, effn=%d, Tmax=%d, xdf=%d\n",
3633 	    pan.nunits, pan.T, pan.effn, pan.Tmax, xdf);
3634 #endif
3635 
3636     if (xdf <= 0 && !nerlove) {
3637 	; /* can't do Random Effects */
3638     } else if (pan.opt & OPT_M) {
3639 	/* matrix version of Hausman test */
3640 	err = matrix_hausman_allocate(&pan);
3641 	if (err) {
3642 	    goto bailout;
3643 	}
3644     }
3645 
3646     if (!quiet) {
3647 	/* header */
3648 	pputc(prn, '\n');
3649 	pprintf(prn, _("Diagnostics: using n = %d cross-sectional units\n"),
3650 		pan.effn);
3651 	pputc(prn, '\n');
3652     }
3653 
3654     err = within_variance(&pan, dset, prn);
3655     if (err) {
3656 	goto bailout;
3657     }
3658 
3659     if (xdf <= 0) {
3660 	pprintf(prn, "Omitting group means regression: "
3661 		"insufficient degrees of freedom\n");
3662 	goto bailout;
3663     }
3664 
3665     if (xdf > 0 && !na(pan.s2e)) {
3666 	DATASET *gset;
3667 
3668 	gset = group_means_dataset(&pan, dset);
3669 	if (gset == NULL) {
3670 	    err = E_ALLOC;
3671 	} else {
3672 	    err = between_variance(&pan, gset);
3673 	}
3674 
3675 	if (err) {
3676 	    pputs(prn, _("Couldn't estimate group means regression\n"));
3677 	    if (err == E_SINGULAR) {
3678 		/* treat this as non-fatal */
3679 		err = 0;
3680 	    }
3681 	} else {
3682 	    random_effects(&pan, dset, gset, prn);
3683 	    if (pan.theta > 0) {
3684 		/* this test hardly makes sense if GLS = OLS */
3685 		breusch_pagan_LM(&pan, prn);
3686 	    }
3687 	    finalize_hausman_test(&pan, PANSPEC, prn);
3688 	}
3689 
3690 	if (gset != NULL) {
3691 	    destroy_dataset(gset);
3692 	}
3693     }
3694 
3695  bailout:
3696 
3697     panelmod_free(&pan);
3698 
3699     return err;
3700 }
3701 
between_model(panelmod_t * pan,const DATASET * dset)3702 static int between_model (panelmod_t *pan, const DATASET *dset)
3703 {
3704     DATASET *gset;
3705     int err = 0;
3706 
3707     gset = group_means_dataset(pan, dset);
3708 
3709     if (gset == NULL) {
3710 	err = E_ALLOC;
3711     } else {
3712 	err = between_variance(pan, gset);
3713 	if (err) {
3714 	    destroy_dataset(gset);
3715 	} else {
3716 	    pan->realmod->dataset = gset;
3717 	}
3718     }
3719 
3720     return err;
3721 }
3722 
3723 /* When we automatically add dummy variables to the dataset,
3724    in estimating a panel model with the --time-dummies option,
3725    should we delete these when we're done, or leave them
3726    in the dataset? 2013-11-18: we'll delete them if the dataset
3727    is subsampled, otherwise leave them.
3728 */
3729 
3730 static int
process_time_dummies(MODEL * pmod,const DATASET * dset,int v)3731 process_time_dummies (MODEL *pmod, const DATASET *dset, int v)
3732 {
3733     int i, vi, n = 0;
3734 
3735     for (i=pmod->list[0]; i>1; i--) {
3736 	if (pmod->list[i] >= v) {
3737 	    n++;
3738 	}
3739     }
3740 
3741     if (n > 0) {
3742 	gretl_model_allocate_param_names(pmod, pmod->ncoeff);
3743 
3744 	if (pmod->errcode) {
3745 	    return pmod->errcode;
3746 	}
3747 
3748 	for (i=2; i<=pmod->list[0]; i++) {
3749 	    vi = pmod->list[i];
3750 	    gretl_model_set_param_name(pmod, i-2, dset->varname[vi]);
3751 	}
3752 
3753 	for (i=pmod->list[0]; i>1; i--) {
3754 	    if (pmod->list[i] >= v) {
3755 		gretl_list_delete_at_pos(pmod->list, i);
3756 	    }
3757 	}
3758     }
3759 
3760     /* Record the fact that the model used time dummies, in case
3761        the user wants to add/omit variables or something.
3762     */
3763     pmod->opt |= OPT_D;
3764 
3765     return 0;
3766 }
3767 
add_time_dummies_to_list(const int * list,DATASET * dset,int ** plist)3768 static int add_time_dummies_to_list (const int *list,
3769 				     DATASET *dset,
3770 				     int **plist)
3771 {
3772     char dname[VNAMELEN];
3773     int *biglist = NULL;
3774     int ntdum = dset->pd - 1;
3775     int i, j, v;
3776     int err = 0;
3777 
3778     biglist = gretl_list_new(list[0] + ntdum);
3779     if (biglist == NULL) {
3780 	return E_ALLOC;
3781     }
3782 
3783     for (i=1; i<=list[0]; i++) {
3784 	biglist[i] = list[i];
3785     }
3786 
3787     j = list[0] + 1;
3788 
3789     for (i=2; i<=dset->pd; i++) {
3790 	sprintf(dname, "dt_%d", i);
3791 	v = series_greatest_index(dset, dname);
3792 	if (v == dset->v) {
3793 	    err = E_DATA;
3794 	    break;
3795 	} else if (in_gretl_list(list, v)) {
3796 	    /* already present */
3797 	    biglist[0] -= 1;
3798 	} else {
3799 	    biglist[j++] = v;
3800 	}
3801     }
3802 
3803     if (err) {
3804 	free(biglist);
3805     } else {
3806 	*plist = biglist;
3807     }
3808 
3809     return err;
3810 }
3811 
panel_check_for_const(const int * list)3812 static int panel_check_for_const (const int *list)
3813 {
3814     int i;
3815 
3816     for (i=2; i<=list[0]; i++) {
3817 	if (list[i] == 0) {
3818 	    return 0; /* OK */
3819 	}
3820     }
3821 
3822     return E_NOCONST;
3823 }
3824 
get_ntdum(const int * orig,const int * new)3825 static int get_ntdum (const int *orig, const int *new)
3826 {
3827     int i, n = 0;
3828 
3829     for (i=2; i<=new[0]; i++) {
3830 	if (!in_gretl_list(orig, new[i])) {
3831 	    n++;
3832 	}
3833     }
3834 
3835     return n;
3836 }
3837 
save_pooled_model(MODEL * pmod,panelmod_t * pan,const DATASET * dset)3838 static void save_pooled_model (MODEL *pmod, panelmod_t *pan,
3839 			       const DATASET *dset)
3840 {
3841     gretl_model_set_int(pmod, "pooled", 1);
3842     add_panel_obs_info(pmod, pan);
3843 
3844     if (!(pan->opt & OPT_A)) {
3845 	set_model_id(pmod, pan->opt);
3846     }
3847 
3848     if (pan->opt & OPT_R) {
3849 	panel_robust_vcv(pmod, pan, dset);
3850 	pmod->opt |= OPT_R;
3851 	pmod->fstt = panel_overall_test(pmod, pan, 0, OPT_NONE);
3852 	pmod->dfd = pan->effn - 1;
3853     }
3854 
3855     panel_dwstat(pmod, pan);
3856 }
3857 
3858 /* fixed effects | random effects | between | pooled */
3859 
3860 #define estimator_specified(o) (o & (OPT_F|OPT_U|OPT_B|OPT_P))
3861 
3862 /**
3863  * real_panel_model:
3864  * @list: list containing model specification.
3865  * @dset: dataset struct.
3866  * @opt: may include %OPT_U for the random effects model;
3867  * %OPT_R for robust standard errors (fixed effects model
3868  * and pooled OLS only); %OPT_M to use the matrix-difference
3869  * version of the Hausman test (random effects only); %OPT_B for
3870  * the "between" model; %OPT_P for pooled OLS; and %OPT_D to
3871  * include time dummies.
3872  * If and only if %OPT_P is given, %OPT_C (clustered standard
3873  * errors) is accepted.
3874  * If %OPT_U is given, either of the mutually incompatible options
3875  * %OPT_N and %OPT_X may be given to inflect the calculation of the
3876  * variance of the individual effects: %OPT_N means use Nerlove's
3877  * method, %OPT_X means use the special "exact" method of Swamy
3878  * and Arora in the case of an unbalanced panel. The default is
3879  * to use the "generic" Swamy-Arora method.
3880  * @prn: printing struct.
3881  *
3882  * Estimates a panel model, by default the fixed effects model.
3883  *
3884  * Returns: a #MODEL struct, containing the estimates.
3885  */
3886 
real_panel_model(const int * list,DATASET * dset,gretlopt opt,PRN * prn)3887 MODEL real_panel_model (const int *list, DATASET *dset,
3888 			gretlopt opt, PRN *prn)
3889 {
3890     int orig_v = dset->v;
3891     MODEL mod;
3892     panelmod_t pan;
3893     gretlopt pan_opt = opt;
3894     gretlopt ols_opt = OPT_A;
3895     int *olslist = NULL;
3896     int nerlove = 0;
3897     int ntdum = 0;
3898     int err = 0;
3899 
3900     gretl_model_init(&mod, dset);
3901     panelmod_init(&pan);
3902 
3903     if (!estimator_specified(opt)) {
3904 	/* default: add OPT_F to save the fixed effects model */
3905 	pan_opt |= OPT_F;
3906     }
3907 
3908     if (opt & OPT_P) {
3909 	/* doing pooled OLS */
3910 	if (opt & OPT_C) {
3911 	    /* clustered */
3912 	    ols_opt |= (OPT_C | OPT_R);
3913 	}
3914     } else {
3915 	/* not just pooled OLS */
3916 	mod.errcode = panel_check_for_const(list);
3917 	if (mod.errcode) {
3918 	    return mod;
3919 	}
3920     }
3921 
3922     if (opt & OPT_D) {
3923 	/* time dummies wanted */
3924 	err = gen_panel_dummies(dset, OPT_T, prn);
3925 	if (!err) {
3926 	    err = add_time_dummies_to_list(list, dset, &olslist);
3927 	}
3928     } else {
3929 	olslist = gretl_list_copy(list);
3930 	if (olslist == NULL) {
3931 	    err = E_ALLOC;
3932 	}
3933     }
3934 
3935     if (err) {
3936 	goto bailout;
3937     }
3938 
3939     /* baseline: estimate via pooled OLS */
3940     mod = lsq(olslist, dset, OLS, ols_opt);
3941     if (mod.errcode) {
3942 	err = mod.errcode;
3943 	fprintf(stderr, "real_panel_model: error %d in initial OLS\n",
3944 		mod.errcode);
3945 	goto bailout;
3946     }
3947 
3948 #if PDEBUG > 1
3949     pprintf(prn, "*** initial baseline OLS\n");
3950     printlist(olslist, "olslist");
3951     printmodel(&mod, dset, OPT_NONE, prn);
3952 #endif
3953 
3954     free(olslist);
3955 
3956 #if PDEBUG
3957     if (pan_opt & OPT_F) {
3958 	fprintf(stderr, "\n*** Doing fixed effects\n");
3959     } else if (pan_opt & OPT_U) {
3960 	fprintf(stderr, "\n*** Doing random effects\n");
3961     }
3962 #endif
3963 
3964     if ((opt & OPT_D) && !(opt & OPT_B)) {
3965 	ntdum = get_ntdum(list, mod.list);
3966     }
3967 
3968     err = panelmod_setup(&pan, &mod, dset, ntdum, pan_opt);
3969     if (err) {
3970 	goto bailout;
3971     }
3972 
3973     if (opt & OPT_P) {
3974 	save_pooled_model(&mod, &pan, dset);
3975 	goto bailout;
3976     }
3977 
3978     if (opt & OPT_B) {
3979 	err = between_model(&pan, dset);
3980 	goto bailout;
3981     }
3982 
3983     /* figure out which of the original regressors are time-varying,
3984        or unit-varying as the case may be
3985     */
3986     err = varying_vars_list(dset, &pan);
3987     if (err) {
3988 	goto bailout;
3989     }
3990 
3991     err = panel_set_varying(&pan, &mod);
3992     if (err) {
3993 	goto bailout;
3994     }
3995 
3996     if (opt & OPT_U) {
3997 	nerlove = (opt & OPT_N)? 1 : 0;
3998     }
3999 
4000     calculate_Tbar(&pan);
4001 
4002     if (opt & OPT_U) {
4003 	/* trying to do random effects */
4004 	int xdf = pan.effn - (mod.ncoeff - pan.ntdum);
4005 
4006 	if (xdf <= 0 && !nerlove) {
4007 	    gretl_errmsg_set(_("Couldn't estimate group means regression"));
4008 	    err = mod.errcode = E_DF;
4009 	    goto bailout;
4010 	} else if (pan.opt & OPT_M) {
4011 	    /* matrix version of Hausman test */
4012 	    err = matrix_hausman_allocate(&pan);
4013 	    if (err) {
4014 		goto bailout;
4015 	    }
4016 	}
4017     }
4018 
4019     err = within_variance(&pan, dset, prn);
4020     if (err) {
4021 	goto bailout;
4022     }
4023 
4024     if ((opt & OPT_U) && !na(pan.s2e)) {
4025 	DATASET *gset;
4026 
4027 	breusch_pagan_LM(&pan, prn);
4028 
4029 	gset = group_means_dataset(&pan, dset);
4030 	if (gset == NULL) {
4031 	    err = E_ALLOC;
4032 	} else {
4033 	    err = between_variance(&pan, gset);
4034 	}
4035 
4036 	if (err && !nerlove) {
4037 	    pputs(prn, _("Couldn't estimate group means regression\n"));
4038 	} else {
4039 	    err = random_effects(&pan, dset, gset, prn);
4040 	    if (!err) {
4041 		save_breusch_pagan_result(&pan);
4042 		finalize_hausman_test(&pan, PANEL, prn);
4043 	    }
4044 	}
4045 
4046 	if (gset != NULL) {
4047 	    destroy_dataset(gset);
4048 	}
4049     }
4050 
4051  bailout:
4052 
4053     if (!err) {
4054 	if ((pan_opt & (OPT_F | OPT_U)) && mod.missmask != NULL) {
4055 	    /* preserve missing obs mask, if any */
4056 	    char *mask = mod.missmask;
4057 
4058 	    mod.missmask = NULL;
4059 	    clear_model(&mod);
4060 	    mod = *pan.realmod;
4061 	    mod.missmask = mask;
4062 	} else if (!(opt & OPT_P)) {
4063 	    /* not doing pooled OLS, in which case @mod would
4064 	       already hold the correct model
4065 	    */
4066 	    clear_model(&mod);
4067 	    mod = *pan.realmod;
4068 	}
4069 	gretl_model_smpl_init(&mod, dset);
4070 	if (opt & OPT_D) {
4071 	    if (pan.ntdum > 0) {
4072 		gretl_model_set_int(&mod, "ntdum", pan.ntdum);
4073 		maybe_suppress_time_dummies(&mod, pan.ntdum);
4074 	    }
4075 	    if (complex_subsampled()) {
4076 		process_time_dummies(&mod, dset, orig_v);
4077 	    }
4078 	}
4079     }
4080 
4081     panelmod_free(&pan);
4082 
4083     if (err && mod.errcode == 0) {
4084 	mod.errcode = err;
4085     }
4086 
4087     if (complex_subsampled()) {
4088 	dataset_drop_last_variables(dset, dset->v - orig_v);
4089     }
4090 
4091     return mod;
4092 }
4093 
4094 /* Called from qr_estimate.c in case robust VCV estimation is called
4095    for in the context of TSLS estimation on panel data.
4096 */
4097 
panel_tsls_robust_vcv(MODEL * pmod,const DATASET * dset)4098 int panel_tsls_robust_vcv (MODEL *pmod, const DATASET *dset)
4099 {
4100     panelmod_t pan;
4101     int err = 0;
4102 
4103     panelmod_init(&pan);
4104 
4105     err = panelmod_setup(&pan, pmod, dset, 0, OPT_NONE);
4106     if (!err) {
4107 	err = panel_robust_vcv(pmod, &pan, dset);
4108     }
4109 
4110     panelmod_free(&pan);
4111 
4112     return err;
4113 }
4114 
4115 /* write weights for groupwise weighted least squares into the
4116    last variable in the dataset */
4117 
4118 static int
write_weights_to_dataset(double * uvar,int nunits,int T,DATASET * dset)4119 write_weights_to_dataset (double *uvar, int nunits, int T,
4120 			  DATASET *dset)
4121 {
4122     int w = dset->v - 1;
4123     double wi;
4124     int i, t;
4125 
4126     for (i=0; i<nunits; i++) {
4127 	if (uvar[i] <= 0.0 || na(uvar[i])) {
4128 	    wi = 0.0;
4129 	} else {
4130 	    wi = 1.0 / uvar[i]; /* sqrt? */
4131 	}
4132 	for (t=0; t<T; t++) {
4133 	    dset->Z[w][panel_index(i, t)] = wi;
4134 	}
4135     }
4136 
4137     return 0;
4138 }
4139 
allocate_weight_var(DATASET * dset)4140 static int allocate_weight_var (DATASET *dset)
4141 {
4142     if (dataset_add_series(dset, 1)) {
4143 	return E_ALLOC;
4144     }
4145 
4146     strcpy(dset->varname[dset->v - 1], "unit_wt");
4147 
4148     return 0;
4149 }
4150 
max_coeff_diff(const MODEL * pmod,const double * bvec)4151 static double max_coeff_diff (const MODEL *pmod, const double *bvec)
4152 {
4153     double diff, maxdiff = 0.0;
4154     int i;
4155 
4156     for (i=0; i<pmod->ncoeff; i++) {
4157 	diff = fabs(pmod->coeff[i] - bvec[i]);
4158 	if (diff > maxdiff) {
4159 	    maxdiff = diff;
4160 	}
4161     }
4162 
4163     return maxdiff;
4164 }
4165 
4166 #define S2MINOBS 1
4167 
4168 /* Wald test for groupwise heteroskedasticity, without assuming
4169    normality of the errors: see Greene, 4e, p. 598.  Note that the
4170    computation involves a factor of (1/(T_i - 1)) so we cannot
4171    include groups that have only one observation.
4172 */
4173 
4174 static double
wald_hetero_test(const MODEL * pmod,double s2,const double * uvar,panelmod_t * pan,int * df)4175 wald_hetero_test (const MODEL *pmod, double s2,
4176 		  const double *uvar, panelmod_t *pan,
4177 		  int *df)
4178 {
4179     double x, W = 0.0;
4180     int i, t, Ti;
4181 
4182     *df = 0;
4183 
4184     for (i=0; i<pan->nunits; i++) {
4185 	double fii = 0.0;
4186 
4187 	Ti = pan->unit_obs[i];
4188 	if (Ti < 2) {
4189 	    continue;
4190 	}
4191 
4192 	for (t=0; t<pan->T; t++) {
4193 	    x = pmod->uhat[panel_index(i, t)];
4194 	    if (!na(x)) {
4195 		x = x * x - uvar[i];
4196 		fii += x * x;
4197 	    }
4198 	}
4199 
4200 	if (fii <= 0) {
4201 	    W = NADBL;
4202 	    break;
4203 	}
4204 
4205 	fii *= (1.0 / Ti) * (1.0 / (Ti - 1.0));
4206 	x = uvar[i] - s2;
4207 	W += x * x / fii;
4208 	*df += 1;
4209     }
4210 
4211     if (*df < 2) {
4212 	W = NADBL;
4213     }
4214 
4215     return W;
4216 }
4217 
4218 /* Likelihood-ratio test for groupwise heteroskedasticity: see Greene,
4219    4e, pp. 597 and 599.  This test requires that we're able to
4220    calculate the ML estimates (using iterated WLS).
4221 */
4222 
4223 static int
ml_hetero_test(MODEL * pmod,double s2,const double * uvar,int nunits,const int * unit_obs)4224 ml_hetero_test (MODEL *pmod, double s2, const double *uvar,
4225 		int nunits, const int *unit_obs)
4226 {
4227     ModelTest *test;
4228     double x2, s2h = 0.0;
4229     int i, Ti, df = 0;
4230     int err = 0;
4231 
4232     for (i=0; i<nunits; i++) {
4233 	Ti = unit_obs[i];
4234 	if (Ti > 0) {
4235 	    s2h += Ti * log(uvar[i]);
4236 	    df++;
4237 	}
4238     }
4239 
4240     x2 = pmod->nobs * log(s2) - s2h;
4241     df--;
4242 
4243     test = model_test_new(GRETL_TEST_GROUPWISE);
4244     if (test != NULL) {
4245 	model_test_set_teststat(test, GRETL_STAT_LR);
4246 	model_test_set_dfn(test, df);
4247 	model_test_set_value(test, x2);
4248 	model_test_set_pvalue(test, chisq_cdf_comp(df, x2));
4249 	maybe_add_test_to_model(pmod, test);
4250     } else {
4251 	err = 1;
4252     }
4253 
4254     return err;
4255 }
4256 
pooled_ll(const MODEL * pmod)4257 static double pooled_ll (const MODEL *pmod)
4258 {
4259     double n = pmod->nobs;
4260 
4261     return -(n / 2.0) * (1.0 + LN_2_PI - log(n) + log(pmod->ess));
4262 }
4263 
4264 /* calculate log-likelihood for panel MLE with groupwise
4265    heteroskedasticity
4266 */
4267 
panel_ML_ll(MODEL * pmod,const double * uvar,int nunits,const int * unit_obs)4268 static void panel_ML_ll (MODEL *pmod, const double *uvar,
4269 			 int nunits, const int *unit_obs)
4270 {
4271     double ll = -(pmod->nobs / 2.0) * LN_2_PI;
4272     int i, Ti;
4273 
4274     for (i=0; i<nunits; i++) {
4275 	Ti = unit_obs[i];
4276 	if (Ti > 0) {
4277 	    ll -= (Ti / 2.0) * (1.0 + log(uvar[i]));
4278 	}
4279     }
4280 
4281     pmod->lnL = ll;
4282     mle_criteria(pmod, 0);
4283 }
4284 
4285 /* we can't estimate a group-specific variance based on just one
4286    observation */
4287 
singleton_check(const int * unit_obs,int nunits)4288 static int singleton_check (const int *unit_obs, int nunits)
4289 {
4290     int i;
4291 
4292     for (i=0; i<nunits; i++) {
4293 	if (unit_obs[i] == 1) {
4294 	    return 1;
4295 	}
4296     }
4297 
4298     return 0;
4299 }
4300 
4301 /* compute per-unit error variances */
4302 
unit_error_variances(double * uvar,const MODEL * pmod,panelmod_t * pan,int * df)4303 static void unit_error_variances (double *uvar, const MODEL *pmod,
4304 				  panelmod_t *pan, int *df)
4305 {
4306     int i, t;
4307     double uit;
4308 
4309     *df = 0;
4310 
4311     for (i=0; i<pan->nunits; i++) {
4312 	if (pan->unit_obs[i] < S2MINOBS) {
4313 	    uvar[i] = NADBL;
4314 	} else {
4315 	    uvar[i] = 0.0;
4316 	    for (t=0; t<pan->T; t++) {
4317 		uit = pmod->uhat[panel_index(i, t)];
4318 		if (!na(uit)) {
4319 		    uvar[i] += uit * uit;
4320 		}
4321 	    }
4322 	    uvar[i] /= pan->unit_obs[i];
4323 	    *df += 1;
4324 	}
4325     }
4326 }
4327 
print_unit_variances(panelmod_t * pan,double * uvar,int wald_test,PRN * prn)4328 static void print_unit_variances (panelmod_t *pan, double *uvar,
4329 				  int wald_test, PRN *prn)
4330 {
4331     int i, Ti;
4332 
4333     pputs(prn, " unit    variance\n");
4334 
4335     for (i=0; i<pan->nunits; i++) {
4336 	Ti = pan->unit_obs[i];
4337 	if (na(uvar[i]) || (wald_test && Ti < 2)) {
4338 	    pprintf(prn, "%5d%12s (T = %d)\n", i+1, "NA", Ti);
4339 	} else {
4340 	    pprintf(prn, "%5d%#12g (T = %d)\n", i+1, uvar[i], Ti);
4341 	}
4342     }
4343 }
4344 
4345 #define SMALLDIFF 0.0001
4346 #define WLS_MAX   30
4347 
4348 /**
4349  * panel_wls_by_unit:
4350  * @list: regression list.
4351  * @dset: dataset struct.
4352  * @opt: may include %OPT_I (iterate to ML solution),
4353  * %OPT_V for verbose operation.
4354  * @prn: for printing details of iterations (or %NULL).
4355  *
4356  * Performs weighted least squares (possibly iterated) on
4357  * a panel model, allowing for groupwise heteroskedasticity.
4358  *
4359  * Returns: the estimates, in a %MODEL.
4360  */
4361 
panel_wls_by_unit(const int * list,DATASET * dset,gretlopt opt,PRN * prn)4362 MODEL panel_wls_by_unit (const int *list, DATASET *dset,
4363 			 gretlopt opt, PRN *prn)
4364 {
4365     MODEL mdl;
4366     panelmod_t pan;
4367     gretlopt wlsopt = OPT_A | OPT_Z;
4368     double *uvar = NULL;
4369     double *bvec = NULL;
4370     double s2, diff = 1.0;
4371     int *wlist = NULL;
4372     int orig_v = dset->v;
4373     int i, iter = 0;
4374 
4375     gretl_error_clear();
4376     panelmod_init(&pan);
4377 
4378     if (opt & OPT_I) {
4379 	/* iterating: no degrees-of-freedom correction in WLS */
4380 	wlsopt |= OPT_N;
4381     }
4382 
4383     /* baseline pooled model */
4384     mdl = lsq(list, dset, OLS, OPT_A);
4385     if (mdl.errcode) {
4386 	goto bailout;
4387     }
4388 
4389     mdl.errcode = panelmod_setup(&pan, &mdl, dset, 0, OPT_NONE);
4390     if (mdl.errcode) {
4391 	goto bailout;
4392     }
4393 
4394     uvar = malloc(pan.nunits * sizeof *uvar);
4395     if (uvar == NULL) {
4396 	free(pan.unit_obs);
4397 	mdl.errcode = E_ALLOC;
4398 	return mdl;
4399     }
4400 
4401     if (opt & OPT_I) {
4402 	if (singleton_check(pan.unit_obs, pan.nunits)) {
4403 	    pprintf(prn, _("Can't produce ML estimates: "
4404 			   "some units have only one observation"));
4405 	    pputc(prn, '\n');
4406 	    opt ^= OPT_I;
4407 	}
4408     }
4409 
4410     s2 = mdl.ess / mdl.nobs;
4411 
4412     if ((opt & OPT_V) && (opt & OPT_I)) {
4413 	pprintf(prn, "\nOLS error variance = %g\n", s2);
4414 	pprintf(prn, "log-likelihood = %g\n", pooled_ll(&mdl));
4415     }
4416 
4417     if (allocate_weight_var(dset)) {
4418 	mdl.errcode = E_ALLOC;
4419 	goto bailout;
4420     }
4421 
4422     if (opt & OPT_I) {
4423 	bvec = malloc(mdl.ncoeff * sizeof *bvec);
4424 	if (bvec == NULL) {
4425 	    mdl.errcode = E_ALLOC;
4426 	    goto bailout;
4427 	}
4428     }
4429 
4430     /* allocate and construct WLS regression list */
4431 
4432     wlist = gretl_list_new(mdl.list[0] + 1);
4433     if (wlist == NULL) {
4434 	mdl.errcode = E_ALLOC;
4435 	goto bailout;
4436     }
4437 
4438     wlist[1] = dset->v - 1; /* weight variable: the last var added */
4439     for (i=2; i<=wlist[0]; i++) {
4440 	wlist[i] = mdl.list[i-1];
4441     }
4442 
4443     /* If wanted (and possible) iterate to ML solution; otherwise just do
4444        one-step FGLS estimation.
4445     */
4446 
4447     while (diff > SMALLDIFF) {
4448 	int df = 0;
4449 
4450 	iter++;
4451 
4452 	unit_error_variances(uvar, &mdl, &pan, &df);
4453 
4454 	if (opt & OPT_V) {
4455 	    if (opt & OPT_I) {
4456 		pprintf(prn, "\n*** %s %d ***\n", _("iteration"),
4457 			iter);
4458 	    } else {
4459 		pputc(prn, '\n');
4460 	    }
4461 	    print_unit_variances(&pan, uvar, 0, prn);
4462 	}
4463 
4464 	write_weights_to_dataset(uvar, pan.nunits, pan.T, dset);
4465 
4466 	if (opt & OPT_I) {
4467 	    /* save coefficients for comparison */
4468 	    for (i=0; i<mdl.ncoeff; i++) {
4469 		bvec[i] = mdl.coeff[i];
4470 	    }
4471 	}
4472 
4473 	clear_model(&mdl);
4474 	mdl = lsq(wlist, dset, WLS, wlsopt);
4475 	if (mdl.errcode) {
4476 	    break;
4477 	}
4478 
4479 	if (iter > WLS_MAX) {
4480 	    mdl.errcode = E_NOCONV;
4481 	    break;
4482 	}
4483 
4484 	if (opt & OPT_I) {
4485 	    diff = max_coeff_diff(&mdl, bvec);
4486 	    if ((opt & OPT_V) && iter == 1) {
4487 		pprintf(prn, "\nFGLS pooled error variance = %g\n",
4488 			mdl.ess / mdl.nobs);
4489 	    }
4490 	} else {
4491 	    /* one-step FGLS */
4492 	    break;
4493 	}
4494     }
4495 
4496     if (!mdl.errcode) {
4497 	mdl.ci = PANEL;
4498 	if (!(opt & OPT_A)) {
4499 	    set_model_id(&mdl, pan.opt);
4500 	}
4501 	gretl_model_set_int(&mdl, "n_included_units", pan.effn);
4502 	mdl.opt |= OPT_H; /* --unit-weights */
4503 	mdl.nwt = 0;
4504 
4505 	if (opt & OPT_I) {
4506 	    int df = 0;
4507 
4508 	    gretl_model_set_int(&mdl, "iters", iter);
4509 	    ml_hetero_test(&mdl, s2, uvar, pan.nunits, pan.unit_obs);
4510 	    unit_error_variances(uvar, &mdl, &pan, &df);
4511 	    panel_ML_ll(&mdl, uvar, pan.nunits, pan.unit_obs);
4512 	    if (opt & OPT_V) {
4513 		pputc(prn, '\n');
4514 	    }
4515 	}
4516     }
4517 
4518  bailout:
4519 
4520     free(pan.unit_obs);
4521     free(uvar);
4522     free(wlist);
4523     free(bvec);
4524 
4525     if (!(opt & OPT_V)) {
4526 	dataset_drop_last_variables(dset, dset->v - orig_v);
4527     }
4528 
4529     return mdl;
4530 }
4531 
print_wald_test(double W,int df,double pval,panelmod_t * pan,double * uvar,double s2,PRN * prn)4532 static void print_wald_test (double W, int df, double pval,
4533 			     panelmod_t *pan, double *uvar,
4534 			     double s2, PRN *prn)
4535 {
4536     pprintf(prn, "\n%s:\n",
4537 	    _("Distribution free Wald test for heteroskedasticity"));
4538     pprintf(prn, " %s(%d) = %g, ",  _("Chi-square"), df, W);
4539     pprintf(prn, "%s = %g\n\n", _("with p-value"), pval);
4540 
4541     if (pan->nunits <= 30) {
4542 	pprintf(prn, "%s = %g\n\n", _("Pooled error variance"), s2);
4543 	print_unit_variances(pan, uvar, 1, prn);
4544     }
4545 }
4546 
4547 /**
4548  * groupwise_hetero_test:
4549  * @pmod: panel model to be tested.
4550  * @dset: information on the (panel) data set.
4551  * @opt: may contain OPT_S to attach the test result
4552  * to @pmod, OPT_I for silent operation.
4553  * @prn: for printing details of iterations (or %NULL).
4554  *
4555  * Performs a Wald test for the null hypothesis that the
4556  * error variance is uniform across the units.
4557  *
4558  * Returns: 0 on success, non-zero error code on failure.
4559  */
4560 
groupwise_hetero_test(MODEL * pmod,DATASET * dset,gretlopt opt,PRN * prn)4561 int groupwise_hetero_test (MODEL *pmod, DATASET *dset,
4562 			   gretlopt opt, PRN *prn)
4563 {
4564     panelmod_t pan;
4565     double *uvar = NULL;
4566     double s2, W = NADBL;
4567     int df, err = 0;
4568 
4569     if (pmod->ci == OLS || (pmod->ci == PANEL && (pmod->opt & OPT_F))) {
4570 	; /* OK for pooled or fixed effects */
4571     } else if (0 && pmod->ci == PANEL && (pmod->opt & OPT_U)) {
4572 	; /* just testing! */
4573     } else {
4574 	return E_NOTIMP;
4575     }
4576 
4577     gretl_error_clear();
4578     panelmod_init(&pan);
4579 
4580     err = panelmod_setup(&pan, pmod, dset, 0, OPT_NONE);
4581     if (err) {
4582 	return err;
4583     }
4584 
4585     uvar = malloc(pan.nunits * sizeof *uvar);
4586     if (uvar == NULL) {
4587 	free(pan.unit_obs);
4588 	return E_ALLOC;
4589     }
4590 
4591     s2 = pmod->ess / pmod->nobs;
4592 
4593     unit_error_variances(uvar, pmod, &pan, &df);
4594 
4595     if (df >= 2) {
4596 	W = wald_hetero_test(pmod, s2, uvar, &pan, &df);
4597     }
4598 
4599     if (na(W)) {
4600 	err = E_DATA;
4601     } else {
4602 	double pval = chisq_cdf_comp(df, W);
4603 
4604 	if (!(opt & OPT_I)) {
4605 	    print_wald_test(W, df, pval, &pan, uvar, s2, prn);
4606 	}
4607 
4608 	if (opt & OPT_S) {
4609 	    ModelTest *test = model_test_new(GRETL_TEST_GROUPWISE);
4610 
4611 	    if (test != NULL) {
4612 		model_test_set_teststat(test, GRETL_STAT_WALD_CHISQ);
4613 		model_test_set_dfn(test, df);
4614 		model_test_set_value(test, W);
4615 		model_test_set_pvalue(test, pval);
4616 		maybe_add_test_to_model(pmod, test);
4617 	    }
4618 	}
4619 
4620 	record_test_result(W, pval);
4621     }
4622 
4623     free(pan.unit_obs);
4624     free(uvar);
4625 
4626     return err;
4627 }
4628 
print_ar_aux_model(MODEL * pmod,DATASET * dset,int j,PRN * prn)4629 static int print_ar_aux_model (MODEL *pmod, DATASET *dset,
4630 			       int j, PRN *prn)
4631 {
4632     const char *heads[] = {
4633         N_("First differenced equation (dependent, d_y)"),
4634         N_("Autoregression of residuals (dependent, uhat)"),
4635 	N_("Auxiliary regression including lagged residual"),
4636     };
4637     gretl_matrix *cse;
4638     gretl_array *S;
4639     int i, vi, err = 0;
4640 
4641     cse = gretl_matrix_alloc(pmod->ncoeff, 2);
4642     S = gretl_array_new(GRETL_TYPE_STRINGS, pmod->ncoeff, &err);
4643 
4644     if (cse == NULL || S == NULL) {
4645 	return E_ALLOC;
4646     }
4647 
4648     for (i=0; i<pmod->ncoeff; i++) {
4649 	gretl_matrix_set(cse, i, 0, pmod->coeff[i]);
4650 	gretl_matrix_set(cse, i, 1, pmod->sderr[i]);
4651 	vi = pmod->list[i+2];
4652 	gretl_array_set_string(S, i, dset->varname[vi], 0);
4653     }
4654 
4655     if (j >= 0) {
4656 	pprintf(prn, "%s:\n", _(heads[j]));
4657     }
4658     print_model_from_matrices(cse, NULL, S, pmod->dfd, OPT_NONE, prn);
4659     pprintf(prn, "  n = %d, R-squared = %.4f\n\n", pmod->nobs, pmod->rsq);
4660 
4661     gretl_matrix_free(cse);
4662     gretl_array_nullify_elements(S);
4663     gretl_array_destroy(S);
4664 
4665     return 0;
4666 }
4667 
finalize_autocorr_test(MODEL * pmod,ModelTest * test,gretlopt opt,PRN * prn)4668 static void finalize_autocorr_test (MODEL *pmod, ModelTest *test,
4669 				    gretlopt opt, PRN *prn)
4670 {
4671     if (!(opt & OPT_I)) {
4672 	if (opt & OPT_Q) {
4673 	    pputc(prn, '\n');
4674 	}
4675 	gretl_model_test_print_direct(test, 1, prn);
4676     }
4677     if (opt & OPT_S) {
4678 	maybe_add_test_to_model(pmod, test);
4679     } else {
4680 	free(test);
4681     }
4682 }
4683 
4684 /* See Wooldridge's Econometric Analysis of Cross Section
4685    and Panel Data (MIT Press, 2002), pages 176-7. Test
4686    for first order autocorrelation in the context of pooled OLS.
4687 */
4688 
pooled_autocorr_test(MODEL * pmod,DATASET * dset,gretlopt opt,PRN * prn)4689 static int pooled_autocorr_test (MODEL *pmod, DATASET *dset,
4690 				 gretlopt opt, PRN *prn)
4691 {
4692     int quiet = (opt & OPT_Q);
4693     int *ulist = NULL;
4694     int i, vi = 0;
4695     int err = 0;
4696 
4697     if (pmod->full_n != dset->n) {
4698 	return E_DATA;
4699     }
4700 
4701     err = dataset_add_NA_series(dset, 1);
4702     if (!err) {
4703 	vi = dset->v - 1;
4704 	strcpy(dset->varname[vi], "uhat");
4705 	for (i=0; i<dset->n; i++) {
4706 	    dset->Z[vi][i] = pmod->uhat[i];
4707 	}
4708     }
4709 
4710     if (!err) {
4711 	ulist = gretl_list_new(pmod->ncoeff + 2);
4712 	if (ulist == NULL) {
4713 	    err = E_ALLOC;
4714 	} else {
4715 	    for (i=1; i<=pmod->list[0]; i++) {
4716 		ulist[i] = pmod->list[i];
4717 	    }
4718 	    /* append lagged residual to list */
4719 	    ulist[i] = laggenr(vi, 1, dset);
4720 	    if (ulist[i] < 0) {
4721 		err = E_DATA;
4722 	    } else {
4723 		strcpy(dset->varname[ulist[i]], "uhat(-1)");
4724 	    }
4725 	}
4726     }
4727 
4728     if (!err) {
4729 	gretlopt auxopt = OPT_P | OPT_R | OPT_Q;
4730 	MODEL tmp;
4731 
4732 	/* estimate model including lagged residual */
4733 	tmp = real_panel_model(ulist, dset, auxopt, NULL);
4734 	err = tmp.errcode;
4735 	if (!err && !quiet) {
4736 	    if (gretl_echo_on()) {
4737 		pputc(prn, '\n');
4738 	    }
4739 	    print_ar_aux_model(&tmp, dset, 2, prn);
4740 	}
4741 	if (!err) {
4742 	    double tstat, pval;
4743 	    int df = tmp.dfd;
4744 
4745 	    i = tmp.ncoeff - 1;
4746 	    tstat = tmp.coeff[i] / tmp.sderr[i];
4747 	    pval = student_pvalue_2(df, tstat);
4748 	    record_test_result(tstat, pval);
4749 
4750 	    if ((opt & OPT_S) || !(opt & OPT_I)) {
4751 		/* saving the test onto @pmod, or not silent */
4752 		ModelTest *test = model_test_new(GRETL_TEST_PANEL_AR);
4753 
4754 		if (test != NULL) {
4755 		    model_test_set_teststat(test, GRETL_STAT_STUDENT);
4756 		    model_test_set_value(test, tstat);
4757 		    model_test_set_dfn(test, df);
4758 		    model_test_set_pvalue(test, pval);
4759 		    finalize_autocorr_test(pmod, test, opt, prn);
4760 		}
4761 	    }
4762 	}
4763 	clear_model(&tmp);
4764     }
4765 
4766     free(ulist);
4767 
4768     return err;
4769 }
4770 
4771 /* See Wooldridge's Econometric Analysis of Cross Section
4772    and Panel Data (MIT Press, 2002), pages 282-3. Test
4773    for first order autocorrelation based on the residuals
4774    from the first-differenced model.
4775 */
4776 
wooldridge_autocorr_test(MODEL * pmod,DATASET * dset,gretlopt opt,PRN * prn)4777 static int wooldridge_autocorr_test (MODEL *pmod, DATASET *dset,
4778 				     gretlopt opt, PRN *prn)
4779 {
4780     MODEL tmp;
4781     gretlopt tmp_opt;
4782     int quiet = (opt & OPT_Q);
4783     int *dlist = NULL;
4784     int i, j, vi;
4785     int clearit = 0;
4786     int err = 0;
4787 
4788     dlist = gretl_list_new(pmod->ncoeff + 1);
4789     if (dlist == NULL) {
4790 	return E_ALLOC;
4791     }
4792 
4793     dlist[0] = 0;
4794     for (i=1, j=1; i<=pmod->list[0] && !err; i++) {
4795 	vi = pmod->list[i];
4796 	if (vi != 0 && !gretl_isconst(dset->t1, dset->t2, dset->Z[vi])) {
4797 	    dlist[j] = diffgenr(vi, DIFF, dset);
4798 	    if (dlist[j] < 0) {
4799 		err = E_DATA;
4800 	    } else {
4801 		j++;
4802 		dlist[0] += 1;
4803 	    }
4804 	} else if (i == 1) {
4805 	    gretl_errmsg_set("The dependent variable is constant");
4806 	    err = E_DATA;
4807 	}
4808     }
4809 
4810     /* aux panel models: pooled, robust, quiet */
4811     tmp_opt = OPT_P | OPT_R | OPT_Q;
4812 
4813     if (!err) {
4814 	/* estimate model in first-differenced form */
4815 	tmp = real_panel_model(dlist, dset, tmp_opt, NULL);
4816 	err = tmp.errcode;
4817 	if (!err && !quiet) {
4818 	    if (gretl_echo_on()) {
4819 		pputc(prn, '\n');
4820 	    }
4821 	    print_ar_aux_model(&tmp, dset, 0, prn);
4822 	}
4823     }
4824 
4825     if (!err) {
4826 	err = dataset_add_allocated_series(dset, tmp.uhat);
4827 	tmp.uhat = NULL;
4828 	clear_model(&tmp);
4829     }
4830 
4831     if (!err) {
4832 	vi = dset->v - 1; /* the last series added */
4833 	strcpy(dset->varname[vi], "uhat");
4834 	dlist[0] = 2;
4835 	dlist[1] = vi;
4836 	dlist[2] = laggenr(vi, 1, dset);
4837 	if (dlist[2] < 0) {
4838 	    err = E_DATA;
4839 	} else {
4840 	    /* regress residual on its first lag */
4841 	    tmp = real_panel_model(dlist, dset, tmp_opt, NULL);
4842 	    err = tmp.errcode;
4843 	    if (!err && !quiet) {
4844 		strcpy(dset->varname[dlist[2]], "uhat(-1)");
4845 		print_ar_aux_model(&tmp, dset, 1, prn);
4846 	    }
4847 	    clearit = 1;
4848 	}
4849     }
4850 
4851     if (!err) {
4852 	double c, s, F, pval;
4853 
4854 	c = tmp.coeff[0] + 0.5;
4855 	s = tmp.sderr[0];
4856 	F = c * c / (s * s);
4857 	pval = snedecor_cdf_comp(1, tmp.dfd, F);
4858 	record_test_result(F, pval);
4859 
4860 	if ((opt & OPT_S) || !(opt & OPT_I)) {
4861 	    /* saving the test onto @pmod, or not silent */
4862 	    ModelTest *test = model_test_new(GRETL_TEST_PANEL_AR);
4863 
4864 	    if (test != NULL) {
4865 		model_test_set_teststat(test, GRETL_STAT_F);
4866 		model_test_set_value(test, F);
4867 		model_test_set_dfn(test, 1);
4868 		model_test_set_dfd(test, tmp.dfd);
4869 		model_test_set_pvalue(test, pval);
4870 		finalize_autocorr_test(pmod, test, opt, prn);
4871 	    }
4872 	}
4873     }
4874 
4875     if (clearit) {
4876 	clear_model(&tmp);
4877     }
4878     free(dlist);
4879 
4880     return err;
4881 }
4882 
panel_autocorr_test(MODEL * pmod,DATASET * dset,gretlopt opt,PRN * prn)4883 int panel_autocorr_test (MODEL *pmod, DATASET *dset,
4884 			 gretlopt opt, PRN *prn)
4885 {
4886     int save_pcse = libset_get_bool(USE_PCSE);
4887     int orig_v = dset->v;
4888     int err;
4889 
4890     libset_set_bool(USE_PCSE, 0);
4891 
4892     if (pmod->ci == OLS || (pmod->opt & OPT_P)) {
4893 	err = pooled_autocorr_test(pmod, dset, opt, prn);
4894     } else {
4895 	err = wooldridge_autocorr_test(pmod, dset, opt, prn);
4896     }
4897 
4898     libset_set_bool(USE_PCSE, save_pcse);
4899     dataset_drop_last_variables(dset, dset->v - orig_v);
4900 
4901     return err;
4902 }
4903 
4904 /* test for cross-sectional dependence */
4905 
panel_xdepend_test(MODEL * pmod,DATASET * dset,gretlopt opt,PRN * prn)4906 int panel_xdepend_test (MODEL *pmod, DATASET *dset,
4907 			gretlopt opt, PRN *prn)
4908 {
4909     const double *u;
4910     double rij, rsum = 0.0;
4911     double arsum = 0.0;
4912     double ssx, ssy, sxy;
4913     double xbar, ybar;
4914     int N1, N2, T, Tij;
4915     int i, j, t, si, sj;
4916     int N = 0, Nr = 0;
4917     int err = 0;
4918 
4919     if (dset->structure != STACKED_TIME_SERIES) {
4920 	return E_PDWRONG;
4921     } else if (pmod->uhat == NULL) {
4922 	return E_DATA;
4923     }
4924 
4925     T = dset->pd;
4926     N1 = pmod->t1 / T;
4927     N2 = pmod->t2 / T;
4928     u = pmod->uhat;
4929 
4930     for (i=N1; i<N2; i++) {
4931 	int Nj = 0;
4932 
4933 	for (j=i+1; j<=N2; j++) {
4934 	    xbar = ybar = 0.0;
4935 	    Tij = 0;
4936 	    for (t=0; t<T; t++) {
4937 		si = i * T + t;
4938 		sj = j * T + t;
4939 		if (!na(u[si]) && !na(u[sj])) {
4940 		    Tij++;
4941 		    xbar += u[si];
4942 		    ybar += u[sj];
4943 		}
4944 	    }
4945 	    if (Tij >= 2) {
4946 		ssx = ssy = sxy = 0.0;
4947 		xbar /= Tij;
4948 		ybar /= Tij;
4949 		for (t=0; t<T; t++) {
4950 		    si = i * T + t;
4951 		    sj = j * T + t;
4952 		    if (!na(u[si]) && !na(u[sj])) {
4953 			ssx += (u[si] - xbar) * (u[si] - xbar);
4954 			ssy += (u[sj] - ybar) * (u[sj] - ybar);
4955 			sxy += (u[si] - xbar) * (u[sj] - ybar);
4956 		    }
4957 		}
4958 		rij = sxy / sqrt(ssx * ssy);
4959 		rsum += sqrt(Tij) * rij;
4960 		arsum += fabs(rij);
4961 		Nr++;
4962 		Nj++;
4963 	    }
4964 	}
4965 	if (Nj > 0) {
4966 	    N++;
4967 	}
4968     }
4969 
4970     if (N == 0) {
4971 	err = E_TOOFEW;
4972     }
4973 
4974     if (!err) {
4975 	double CD, pval;
4976 
4977 	N = N + 1;
4978 	CD = sqrt(2.0 / (N * (N - 1.0))) * rsum;
4979 	pval = normal_pvalue_2(CD);
4980 
4981 	if (!(opt & OPT_I)) {
4982 	    pputs(prn, _("Pesaran CD test for cross-sectional dependence"));
4983 	    pprintf(prn, "\n%s: z = %f,\n", _("Test statistic"), CD);
4984 	    pprintf(prn, "%s = P(|z| > %g) = %.3g\n", _("with p-value"),
4985 		    CD, pval);
4986 	    pprintf(prn, _("Average absolute correlation = %.3f"), arsum / Nr);
4987 	    pputc(prn, '\n');
4988 	}
4989 
4990 	if (opt & OPT_S) {
4991 	    ModelTest *test = model_test_new(GRETL_TEST_XDEPEND);
4992 
4993 	    if (test != NULL) {
4994 		model_test_set_teststat(test, GRETL_STAT_Z);
4995 		model_test_set_value(test, CD);
4996 		model_test_set_pvalue(test, pval);
4997 		maybe_add_test_to_model(pmod, test);
4998 	    }
4999 	}
5000 
5001 	record_test_result(CD, pval);
5002     }
5003 
5004     return err;
5005 }
5006 
5007 /**
5008  * switch_panel_orientation:
5009  * @dset: pointer to dataset struct.
5010  *
5011  * Reorganizes @dset, transforming from stacked cross sections to
5012  * stacked time series or vice versa.  If the transformation is from
5013  * stacked time series to stacked cross section, the dataset will
5014  * no longer be acceptable as a panel for gretl's purposes; it
5015  * may be useful for export purposes, though.
5016  *
5017  * Returns: 0 on successful completion, non-zero on error.
5018  */
5019 
switch_panel_orientation(DATASET * dset)5020 int switch_panel_orientation (DATASET *dset)
5021 {
5022     int oldmode = dset->structure;
5023     double *tmp;
5024     char **markers = NULL;
5025     double pdx;
5026     int T, n;
5027     int i, j, s, t;
5028 
5029     if (dset->Z == NULL) {
5030 	return E_NODATA;
5031     } else if (oldmode != STACKED_TIME_SERIES &&
5032 	oldmode != STACKED_CROSS_SECTION) {
5033 	return E_DATA;
5034     }
5035 
5036     tmp = malloc(dset->n * sizeof *tmp);
5037     if (tmp == NULL) {
5038 	return E_ALLOC;
5039     }
5040 
5041     if (oldmode == STACKED_CROSS_SECTION) {
5042 	n = dset->pd;
5043 	T = dset->n / n;
5044     } else {
5045 	T = dset->pd;
5046 	n = dset->n / T;
5047     }
5048 
5049     /* copy the data series across in transformed order */
5050     for (i=1; i<dset->v; i++) {
5051 	for (t=0; t<dset->n; t++) {
5052 	    /* transcribe to tmp in original order */
5053 	    tmp[t] = dset->Z[i][t];
5054 	}
5055 	s = 0;
5056 	if (oldmode == STACKED_CROSS_SECTION) {
5057 	    /* convert to stacked time-series */
5058 	    for (j=0; j<n; j++) {
5059 		for (t=0; t<T; t++) {
5060 		    dset->Z[i][s++] = tmp[t * n + j];
5061 		}
5062 	    }
5063 	} else {
5064 	    /* convert to stacked cross-sections */
5065 	    for (t=0; t<T; t++) {
5066 		for (j=0; j<n; j++) {
5067 		    dset->Z[i][s++] = tmp[j * T + t];
5068 		}
5069 	    }
5070 	}
5071     }
5072 
5073     /* rearrange observations markers if relevant */
5074     if (dset->S != NULL) {
5075 	markers = strings_array_new_with_length(dset->n, OBSLEN);
5076 	if (markers != NULL) {
5077 	    for (t=0; t<dset->n; t++) {
5078 		strcpy(markers[t], dset->S[t]);
5079 	    }
5080 	    s = 0;
5081 	    if (oldmode == STACKED_CROSS_SECTION) {
5082 		for (j=0; j<n; j++) {
5083 		    for (t=0; t<T; t++) {
5084 			strcpy(dset->S[s++], markers[t * n + j]);
5085 		    }
5086 		}
5087 	    } else {
5088 		for (t=0; t<T; t++) {
5089 		    for (j=0; j<n; j++) {
5090 			strcpy(dset->S[s++], markers[j * T + t]);
5091 		    }
5092 		}
5093 	    }
5094 	    strings_array_free(markers, dset->n);
5095 	} else {
5096 	    /* should we flag an error? */
5097 	    dataset_destroy_obs_markers(dset);
5098 	}
5099     }
5100 
5101     dset->sd0 = 1.0;
5102     pdx = 0.1;
5103 
5104     /* change the datainfo setup */
5105     if (oldmode == STACKED_CROSS_SECTION) {
5106 	dset->structure = STACKED_TIME_SERIES;
5107 	dset->pd = T;
5108 	while (T /= 10) {
5109 	    pdx *= 0.1;
5110 	}
5111     } else {
5112 	dset->structure = STACKED_CROSS_SECTION;
5113 	dset->pd = n;
5114 	while (n /= 10) {
5115 	    pdx *= 0.1;
5116 	}
5117     }
5118 
5119     dset->sd0 += pdx;
5120     ntolabel(dset->stobs, 0, dset);
5121     ntolabel(dset->endobs, dset->n - 1, dset);
5122 
5123     free(tmp);
5124 
5125     return 0;
5126 }
5127 
5128 /**
5129  * panel_isconst:
5130  * @t1: starting observation.
5131  * @t2: ending observation.
5132  * @pd: panel time-series length.
5133  * @x: data series to examine.
5134  * @bygroup: use 1 to check for constancy across groups,
5135  * 0 for constancy across time.
5136  *
5137  * Check whether series @x is constant (either over time or
5138  * across groups) in a panel dataset. Missing values are
5139  * ignored.
5140  *
5141  * Returns: 1 if the variable is constant, otherwise 0.
5142  */
5143 
panel_isconst(int t1,int t2,int pd,const double * x,int bygroup)5144 int panel_isconst (int t1, int t2, int pd, const double *x,
5145 		   int bygroup)
5146 {
5147     double x0 = NADBL;
5148     int t, ret = 1;
5149 
5150     if (bygroup) {
5151 	/* check for variation across groups */
5152 	int tref;
5153 
5154 	for (t=t1; t<=t2 && ret; t++) {
5155 	    if (na(x[t])) {
5156 		continue;
5157 	    }
5158 	    tref = t - pd;
5159 	    while (tref >= t1) {
5160 		/* check last obs for the same period */
5161 		x0 = x[t-pd];
5162 		if (na(x0)) {
5163 		    tref -= pd;
5164 		} else {
5165 		    if (x[t] != x0) {
5166 			ret = 0;
5167 		    }
5168 		    break;
5169 		}
5170 	    }
5171 	}
5172     } else {
5173 	/* check for time-variation */
5174 	int u, ubak = -1;
5175 
5176 	for (t=t1; t<=t2 && ret; t++) {
5177 	    if (na(x[t])) {
5178 		continue;
5179 	    }
5180 	    u = t / pd;
5181 	    if (u == ubak) {
5182 		/* same group */
5183 		if (!na(x0) && x[t] != x0) {
5184 		    ret = 0;
5185 		}
5186 	    } else {
5187 		/* starting a new group */
5188 		x0 = x[t];
5189 		ubak = u;
5190 	    }
5191 	}
5192     }
5193 
5194     return ret;
5195 }
5196 
varying_vars_list(const DATASET * dset,panelmod_t * pan)5197 static int varying_vars_list (const DATASET *dset, panelmod_t *pan)
5198 {
5199     int i, j, k, t, bigt;
5200     int err = 0;
5201 
5202     pan->vlist = gretl_list_new(pan->pooled->list[0]);
5203     if (pan->vlist == NULL) {
5204 	return E_ALLOC;
5205     }
5206 
5207     pan->vlist[0] = 1;
5208     pan->vlist[1] = pan->pooled->list[1];
5209 
5210     k = 2;
5211 
5212     for (j=2; j<=pan->pooled->list[0]; j++) {
5213 	int vj = pan->pooled->list[j];
5214 	int varies = 0;
5215 
5216 	if (vj == 0) {
5217 	    pan->vlist[k++] = 0;
5218 	    pan->vlist[0] += 1;
5219 	    continue;
5220 	}
5221 
5222 	for (i=0; i<pan->nunits && !varies; i++) {
5223 	    int started = 0;
5224 	    double xval = NADBL;
5225 
5226 	    if (pan->unit_obs[i] == 0) {
5227 		continue;
5228 	    }
5229 
5230 	    for (t=0; t<pan->T && !varies; t++) {
5231 		bigt = panel_index(i, t);
5232 		if (panel_missing(pan, bigt)) {
5233 		    continue;
5234 		}
5235 		if (!started) {
5236 		    xval = dset->Z[vj][bigt];
5237 		    started = 1;
5238 		} else if (dset->Z[vj][bigt] != xval) {
5239 		    varies = 1;
5240 		}
5241 	    }
5242 	}
5243 
5244 	if (varies) {
5245 	    pan->vlist[k++] = vj;
5246 	    pan->vlist[0] += 1;
5247 	} else {
5248 	    fprintf(stderr, "Variable %d '%s' is time-invariant\n",
5249 		    vj, dset->varname[vj]);
5250 	}
5251     }
5252 
5253 #if PDEBUG
5254     printlist(pan->pooled->list, "original regressors");
5255     printlist(pan->vlist, "time-varying regressors");
5256 #endif
5257 
5258     return err;
5259 }
5260 
5261 /* apparatus for setting panel structure by reading two variables,
5262    indexing the cross-sectional unit and the time period
5263    respectively
5264 */
5265 
5266 typedef struct s_point_t s_point;
5267 typedef struct sorter_t sorter;
5268 
5269 struct s_point_t {
5270     int obsnum;
5271     double val1;
5272     double val2;
5273 };
5274 
5275 struct sorter_t {
5276     int sortvar1;
5277     int sortvar2;
5278     s_point *points;
5279 };
5280 
sorter_fill_points(sorter * s,const double ** Z,int n)5281 static void sorter_fill_points (sorter *s, const double **Z,
5282 				int n)
5283 {
5284     int t;
5285 
5286     for (t=0; t<n; t++) {
5287 	s->points[t].obsnum = t;
5288 	s->points[t].val1 = Z[s->sortvar1][t];
5289 	s->points[t].val2 = Z[s->sortvar2][t];
5290     }
5291 }
5292 
compare_obs(const void * a,const void * b)5293 static int compare_obs (const void *a, const void *b)
5294 {
5295     s_point *pa = (s_point *) a;
5296     s_point *pb = (s_point *) b;
5297     int ret;
5298 
5299     ret = (pa->val1 > pb->val1) - (pa->val1 < pb->val1);
5300     if (ret == 0) {
5301 	ret = (pa->val2 > pb->val2) - (pa->val2 < pb->val2);
5302     }
5303 
5304     if (ret == 0) {
5305 	/* mark an error: got a duplicated value */
5306 	pa->obsnum = pb->obsnum = -1;
5307     }
5308 
5309     return ret;
5310 }
5311 
check_indices(sorter * s,int n)5312 static int check_indices (sorter *s, int n)
5313 {
5314     int i;
5315 
5316     for (i=0; i<n; i++) {
5317 	if (s->points[i].obsnum < 0) {
5318 	    gretl_errmsg_sprintf(_("Error: unit %g, period %g: duplicated observation"),
5319 				 s->points[i].val1, s->points[i].val2);
5320 	    return E_DATA;
5321 	}
5322     }
5323 
5324     return 0;
5325 }
5326 
panel_is_sorted(DATASET * dset,int uv,int tv)5327 static int panel_is_sorted (DATASET *dset, int uv, int tv)
5328 {
5329     const double *unit = dset->Z[uv];
5330     const double *period = dset->Z[tv];
5331     int t, ret = 1;
5332 
5333     for (t=1; t<dset->n && ret; t++) {
5334 	if (unit[t] < unit[t-1]) {
5335 	    /* the unit variable must be non-decreasing */
5336 	    ret = 0;
5337 	} else if (unit[t] == unit[t-1] && period[t] <= period[t-1]) {
5338 	    /* the period variable must be increasing within the
5339 	       observations for each unit */
5340 	    ret = 0;
5341 	}
5342     }
5343 
5344     return ret;
5345 }
5346 
panel_data_sort_by(DATASET * dset,int uv,int tv,int * ustrs)5347 static int panel_data_sort_by (DATASET *dset, int uv, int tv, int *ustrs)
5348 {
5349     int n = dset->n;
5350     char **S = NULL;
5351     double *tmp = NULL;
5352     int i, t;
5353     sorter s;
5354     int err = 0;
5355 
5356     tmp = malloc(n * sizeof *tmp);
5357     if (tmp == NULL) {
5358 	return E_ALLOC;
5359     }
5360 
5361     s.points = malloc(n * sizeof *s.points);
5362     if (s.points == NULL) {
5363 	free(tmp);
5364 	return E_ALLOC;
5365     }
5366 
5367     if (dset->S != NULL) {
5368 	S = strings_array_new_with_length(n, OBSLEN);
5369 	if (S == NULL) {
5370 	    free(tmp);
5371 	    free(s.points);
5372 	    return E_ALLOC;
5373 	}
5374     }
5375 
5376     s.sortvar1 = uv;
5377     s.sortvar2 = tv;
5378 
5379     sorter_fill_points(&s, (const double **) dset->Z, n);
5380 
5381     qsort((void *) s.points, (size_t) n, sizeof s.points[0],
5382 	  compare_obs);
5383 
5384     err = check_indices(&s, n);
5385     if (err) {
5386 	goto bailout;
5387     }
5388 
5389     for (i=1; i<dset->v; i++) {
5390 	for (t=0; t<n; t++) {
5391 	    tmp[t] = dset->Z[i][t];
5392 	}
5393 	for (t=0; t<n; t++) {
5394 	    dset->Z[i][t] = tmp[s.points[t].obsnum];
5395 	}
5396     }
5397 
5398     if (S != NULL) {
5399 	*ustrs = 1;
5400 	for (t=0; t<n; t++) {
5401 	    strcpy(S[t], dset->S[t]);
5402 	    if (S[t][0] == '\0') {
5403 		*ustrs = 0;
5404 	    }
5405 	}
5406 	for (t=0; t<n; t++) {
5407 	    strcpy(dset->S[t], S[s.points[t].obsnum]);
5408 	}
5409 	for (t=1; t<n && *ustrs; t++) {
5410 	    if (dset->Z[uv][t] == dset->Z[uv][t-1] &&
5411 		strcmp(dset->S[t], dset->S[t-1])) {
5412 		*ustrs = 0;
5413 	    }
5414 	}
5415 	strings_array_free(S, n);
5416     }
5417 
5418  bailout:
5419 
5420     free(s.points);
5421     free(tmp);
5422 
5423     return err;
5424 }
5425 
5426 /* Given the variables coding for panel unit and time period,
5427    construct parallel arrays of int that code for the same
5428    information, but are zero-based and consecutive.
5429 */
5430 
normalize_uid_tid(const double * tid,int T,const DATASET * dset,int uv,int tv,int ** pnuid,int ** pntid)5431 static int normalize_uid_tid (const double *tid, int T,
5432 			      const DATASET *dset,
5433 			      int uv, int tv,
5434 			      int **pnuid, int **pntid)
5435 {
5436     int *nuid = NULL;
5437     int *ntid = NULL;
5438     int ui, tmin;
5439     int i, t;
5440 
5441     nuid = malloc(dset->n * sizeof *nuid);
5442     ntid = malloc(dset->n * sizeof *ntid);
5443     if (nuid == NULL || ntid == NULL) {
5444 	free(nuid);
5445 	free(ntid);
5446 	return E_ALLOC;
5447     }
5448 
5449     ui = tmin = 0;
5450 
5451     for (i=0; i<dset->n; i++) {
5452 	if (i > 0 && dset->Z[uv][i] > dset->Z[uv][i-1]) {
5453 	    tmin = 0;
5454 	    ui++;
5455 	} else if (i > 0) {
5456 	    tmin++;
5457 	}
5458 	nuid[i] = ui;
5459 	for (t=tmin; t<T; t++) {
5460 	    if (dset->Z[tv][i] == tid[t]) {
5461 		ntid[i] = t;
5462 		break;
5463 	    }
5464 	}
5465     }
5466 
5467     *pnuid = nuid;
5468     *pntid = ntid;
5469 
5470     return 0;
5471 }
5472 
5473 /* Handle the case where a sub-sampled panel dataset has been padded
5474    with rows containing NAs to recreate a (nominally) balanced panel
5475    structure. We have to get rid of the padding before we try
5476    restoring the full data range. Note that while this function
5477    "shrinks" @dset in the sense of reducing the series length as
5478    recorded in dset->n, it does not "physically" shrink the array
5479    members of dset->Z.
5480 */
5481 
undo_panel_padding(DATASET * dset)5482 int undo_panel_padding (DATASET *dset)
5483 {
5484     char *mask = dset->padmask;
5485     double *Zi;
5486     char **S = NULL;
5487     int padded_n = dset->n; /* current series length */
5488     int real_n = dset->n;   /* target series length */
5489     int i, t;
5490     int err = 0;
5491 
5492     for (t=0; t<dset->n; t++) {
5493 	if (mask[t]) {
5494 	    real_n--;
5495 	}
5496     }
5497 
5498     fprintf(stderr, "undo_panel_padding: padded n*T = %d, original dset->n = %d\n",
5499 	    padded_n, real_n);
5500 
5501     if (real_n == padded_n) {
5502 	fprintf(stderr, "strange, couldn't find any padding!\n");
5503 	return E_DATA;
5504     }
5505 
5506     /* temporary holder for shorter series */
5507     Zi = malloc(real_n * sizeof *Zi);
5508 
5509     if (Zi == NULL) {
5510 	return E_ALLOC;
5511     }
5512 
5513     if (dset->S != NULL) {
5514 	/* holder for shorter obs labels array */
5515 	S = strings_array_new_with_length(real_n, OBSLEN);
5516     }
5517 
5518     if (!err) {
5519 	/* write non-padding rows from dset->Z (and dset->S, if present)
5520 	   into the right places in the reduced-size arrays */
5521 	int s;
5522 
5523 	for (i=0; i<dset->v; i++) {
5524 	    s = 0;
5525 	    for (t=0; t<padded_n; t++) {
5526 		if (!mask[t]) {
5527 		   Zi[s] = dset->Z[i][t];
5528 		   if (i == 0 && S != NULL) {
5529 		       strcpy(S[s], dset->S[t]);
5530 		   }
5531 		   s++;
5532 		}
5533 	    }
5534 	    /* copy data back to dset->Z */
5535 	    memcpy(dset->Z[i], Zi, real_n * sizeof *Zi);
5536 	}
5537 
5538 	if (dset->S != NULL && S != NULL) {
5539 	    strings_array_free(dset->S, padded_n);
5540 	    dset->S = S;
5541 	}
5542     }
5543 
5544     free(Zi);
5545 
5546     dset->n = real_n;
5547     dset->t2 = dset->n - 1;
5548     free(dset->padmask);
5549     dset->padmask = NULL;
5550 
5551     return err;
5552 }
5553 
pad_panel_dataset(const double * uid,int uv,int nunits,const double * tid,int tv,int nperiods,DATASET * dset,int ustrs,char * mask)5554 static int pad_panel_dataset (const double *uid, int uv, int nunits,
5555 			      const double *tid, int tv, int nperiods,
5556 			      DATASET *dset, int ustrs, char *mask)
5557 {
5558     double **bigZ = NULL;
5559     char **S = NULL;
5560     int *nuid = NULL;
5561     int *ntid = NULL;
5562     int big_n, n_orig = dset->n;
5563     int i, s, t;
5564     int err = 0;
5565 
5566     big_n = nunits * nperiods;
5567 
5568     fprintf(stderr, "pad_panel_dataset: n*T = %d*%d = %d but dset->n = %d\n",
5569 	    nunits, nperiods, big_n, n_orig);
5570 
5571     err = normalize_uid_tid(tid, nperiods, dset, uv, tv,
5572 			    &nuid, &ntid);
5573     if (err) {
5574 	return err;
5575     }
5576 
5577     bigZ = doubles_array_new(dset->v, big_n);
5578     if (bigZ == NULL) {
5579 	err = E_ALLOC;
5580     } else {
5581 	dset->n = big_n;
5582 	dset->t2 = dset->n - 1;
5583 	for (i=0; i<dset->v; i++) {
5584 	    for (t=0; t<big_n; t++) {
5585 		bigZ[i][t] = (i == 0)? 1.0 : NADBL;
5586 	    }
5587 	}
5588     }
5589 
5590     if (!err && dset->S != NULL && ustrs) {
5591 	S = strings_array_new_with_length(dset->n, OBSLEN);
5592     }
5593 
5594     if (!err) {
5595 	int j, buv = 0, btv = 0;
5596 
5597 	if (mask != NULL) {
5598 	    for (t=0; t<big_n; t++) {
5599 		mask[t] = 1;
5600 	    }
5601 	}
5602 
5603 	/* write rows from original Z into the right places in bigZ */
5604 	for (t=0; t<n_orig; t++) {
5605 	    s = nuid[t] * nperiods + ntid[t];
5606 	    for (i=1; i<dset->v; i++) {
5607 		bigZ[i][s] = dset->Z[i][t];
5608 	    }
5609 	    if (S != NULL) {
5610 		strcpy(S[s], dset->S[t]);
5611 	    }
5612 	    if (mask != NULL) {
5613 		/* record (non-)padding in @mask */
5614 		mask[s] = 0;
5615 	    }
5616 	}
5617 
5618 	/* where are uv and tv in bigZ? */
5619 	j = 1;
5620 	for (i=1; i<dset->v; i++) {
5621 	    if (i == uv) {
5622 		buv = j;
5623 	    } else if (i == tv) {
5624 		btv = j;
5625 	    }
5626 	    j++;
5627 	}
5628 
5629 	/* complete the index info in slots uv and tv */
5630 	i = j = 0;
5631 	for (t=0; t<dset->n; t++) {
5632 	    if (t > 0 && t % nperiods == 0) {
5633 		i++;
5634 		j = 0;
5635 	    }
5636 	    bigZ[buv][t] = uid[i];
5637 	    bigZ[btv][t] = tid[j++];
5638 	}
5639 
5640 	/* swap the padded arrays into Z */
5641 	for (i=0; i<dset->v; i++) {
5642 	    free(dset->Z[i]);
5643 	    dset->Z[i] = bigZ[i];
5644 	}
5645 
5646 	free(bigZ);
5647     }
5648 
5649     free(nuid);
5650     free(ntid);
5651 
5652     if (!err && dset->S != NULL) {
5653 	/* expand the obs (unit) marker strings appropriately */
5654 	if (S == NULL) {
5655 	    strings_array_free(dset->S, n_orig);
5656 	    dset->S = NULL;
5657 	    dset->markers = NO_MARKERS;
5658 	} else {
5659 	    char si[OBSLEN];
5660 	    int j;
5661 
5662 	    for (i=0; i<nunits; i++) {
5663 		t = i * nperiods;
5664 		for (j=0; j<nperiods; j++) {
5665 		    if (S[t][0] != '\0') {
5666 			strcpy(si, S[t]);
5667 			break;
5668 		    }
5669 		    t++;
5670 		}
5671 		t = i * nperiods;
5672 		for (j=0; j<nperiods; j++) {
5673 		    strcpy(S[t++], si);
5674 		}
5675 	    }
5676 	    strings_array_free(dset->S, n_orig);
5677 	    dset->S = S;
5678 	}
5679     }
5680 
5681     return err;
5682 }
5683 
check_index_values(const double * x,int n)5684 static int check_index_values (const double *x, int n)
5685 {
5686     int i;
5687 
5688     for (i=1; i<n; i++) {
5689 	if (x[i] < 0) {
5690 	    return E_DATA;
5691 	} else if (na(x[i])) {
5692 	    return E_MISSDATA;
5693 	}
5694     }
5695 
5696     return 0;
5697 }
5698 
uv_tv_from_varnames(const char * uname,const char * tname,const DATASET * dset,int * uv,int * tv)5699 static int uv_tv_from_varnames (const char *uname, const char *tname,
5700 				const DATASET *dset, int *uv, int *tv)
5701 {
5702     *uv = current_series_index(dset, uname);
5703     if (*uv <= 0) {
5704 	return E_DATA;
5705     }
5706 
5707     *tv = current_series_index(dset, tname);
5708     if (*tv <= 0) {
5709 	return E_DATA;
5710     }
5711 
5712     return 0;
5713 }
5714 
5715 #if PDEBUG
print_unit_var(int uv,double ** Z,int n,int after)5716 static void print_unit_var (int uv, double **Z, int n, int after)
5717 {
5718     int i, imax = 20;
5719 
5720     fprintf(stderr, "Z[uv], %s sorting (first %d obs):\n",
5721 	    (after)? "after" : "before", imax);
5722     for (i=0; i<n && i<imax; i++) {
5723 	fprintf(stderr, " Z[%d][%d] = %g\n", uv, i, Z[uv][i]);
5724     }
5725 }
5726 #endif
5727 
finalize_panel_datainfo(DATASET * dset,int nperiods)5728 static void finalize_panel_datainfo (DATASET *dset, int nperiods)
5729 {
5730     int pdp = nperiods;
5731     int den = 10.0;
5732 
5733     while ((pdp = pdp / 10)) {
5734 	den *= 10;
5735     }
5736     dset->structure = STACKED_TIME_SERIES;
5737     dset->pd = nperiods;
5738     dset->sd0 = 1.0 + 1.0 / den;
5739     ntolabel(dset->stobs, 0, dset);
5740     ntolabel(dset->endobs, dset->n - 1, dset);
5741 }
5742 
check_full_dataset(void)5743 static int check_full_dataset (void)
5744 {
5745     DATASET *fset = fetch_full_dataset();
5746 
5747     if (!dataset_is_panel(fset)) {
5748 	const char *msg =
5749 	    "You cannot use the --panel-vars option with the setobs command when\n"
5750 	    "\n"
5751 	    "* the dataset is currently sub-sampled and\n"
5752 	    "* the full dataset is not a panel.\n"
5753 	    "\n"
5754 	    "If you first structure your full dataset as a panel, you can then\n"
5755 	    "do what you are trying to do.";
5756 
5757 	gretl_errmsg_set(msg);
5758 	return E_DATA;
5759     }
5760 
5761     return 0;
5762 }
5763 
5764 /**
5765  * set_panel_structure_from_vars:
5766  * @uv: index of series uniquely identifying units/groups.
5767  * @tv: index of series uniquely identifying time periods.
5768  * @dset: pointer to dataset.
5769  *
5770  * Sets the panel structure of @dset based on the information
5771  * in the series with indices @uv and @tv, if possible.
5772  *
5773  * Returns: 0 on success, non-zero code on error.
5774  */
5775 
set_panel_structure_from_vars(int uv,int tv,DATASET * dset)5776 int set_panel_structure_from_vars (int uv, int tv, DATASET *dset)
5777 {
5778     int subsampled, presorted;
5779     double *uid = NULL;
5780     double *tid = NULL;
5781     char *mask = NULL;
5782     int n = dset->n;
5783     int totmiss = 0;
5784     int fulln = 0;
5785     int nunits = 0;
5786     int nperiods = 0;
5787     int ustrs = 0;
5788     int err = 0;
5789 
5790     subsampled = complex_subsampled();
5791 
5792     if (subsampled) {
5793 	err = check_full_dataset();
5794 	if (err) {
5795 	    return err;
5796 	}
5797     }
5798 
5799 #if PDEBUG
5800     fprintf(stderr, "\n*** set_panel_structure_from_vars:\n "
5801 	    "using var %d ('%s') for unit, var %d ('%s') for time\n",
5802 	    uv, dset->varname[uv], tv, dset->varname[tv]);
5803     print_unit_var(uv, dset->Z, n, 0);
5804 #endif
5805 
5806     /* start by making sorted copies of the unit and time
5807        variables and counting the unique values of each
5808     */
5809 
5810     uid = copyvec(dset->Z[uv], n);
5811     tid = copyvec(dset->Z[tv], n);
5812 
5813     if (uid == NULL || tid == NULL) {
5814 	err = E_ALLOC;
5815 	goto bailout;
5816     }
5817 
5818     qsort(uid, n, sizeof *uid, gretl_compare_doubles);
5819     nunits = count_distinct_values(uid, dset->n);
5820 
5821     qsort(tid, n, sizeof *tid, gretl_compare_doubles);
5822     nperiods = count_distinct_values(tid, dset->n);
5823 
5824     /* check that we have a possible panel structure */
5825 
5826     if (nunits == 1 || nperiods == 1 ||
5827 	nunits == n || nperiods == n ||
5828 	n > nunits * nperiods) {
5829 	fprintf(stderr, "Dataset does not have a panel structure\n");
5830 	fprintf(stderr, " (nunits=%d, nperiods=%d, n=%d)\n", nunits, nperiods, n);
5831 	err = E_DATA;
5832 	goto bailout;
5833     }
5834 
5835     /* figure how many panel-data rows are implied, and how many
5836        "padding" rows are needed to complete the structure
5837     */
5838 
5839     fulln = nunits * nperiods;
5840     totmiss = fulln - dset->n;
5841 
5842 #if PDEBUG
5843     fprintf(stderr, "Found %d units, %d periods\n", nunits, nperiods);
5844     fprintf(stderr, "Units: min %g, max %g\n", uid[0], uid[n - 1]);
5845     fprintf(stderr, "Periods: min %g, max %g\n", tid[0], tid[n - 1]);
5846     fprintf(stderr, "Required rows = %d * %d = %d\n", nunits, nperiods, fulln);
5847     fprintf(stderr, "Missing rows = %d - %d = %d\n", fulln, dset->n, totmiss);
5848 #endif
5849 
5850     /* determine if the data rows are already in sort order by unit,
5851        and by period for each unit */
5852 
5853     presorted = panel_is_sorted(dset, uv, tv);
5854 
5855 #if PDEBUG
5856     fprintf(stderr, "panel_is_sorted? %s\n", presorted ? "yes" : "no");
5857 #endif
5858 
5859     if (!presorted) {
5860 	if (subsampled) {
5861 	    /* We can't re-order the rows in a subsampled dataset, or the
5862 	       data will get scrambled when restoration of the full dataset
5863 	       is attempted.
5864 	    */
5865 	    gretl_errmsg_set(_("Sorry, can't do this with a sub-sampled dataset"));
5866 	    err = E_DATA;
5867 	} else {
5868 	    /* sort full dataset by unit and period */
5869 	    err = panel_data_sort_by(dset, uv, tv, &ustrs);
5870 	}
5871 #if PDEBUG
5872 	if (!err) print_unit_var(uv, dset->Z, n, 1);
5873 #endif
5874     }
5875 
5876     if (!err && totmiss > 0) {
5877 	/* establish a nominally balanced panel */
5878 	mask = malloc(fulln);
5879 	if (mask == NULL) {
5880 	    err = E_ALLOC;
5881 	} else {
5882 	    rearrange_id_array(uid, nunits, n);
5883 	    rearrange_id_array(tid, nperiods, n);
5884 	    err = pad_panel_dataset(uid, uv, nunits,
5885 				    tid, tv, nperiods,
5886 				    dset, ustrs,
5887 				    mask);
5888 	}
5889     }
5890 
5891     if (!err) {
5892 	finalize_panel_datainfo(dset, nperiods);
5893     }
5894 
5895  bailout:
5896 
5897     free(uid);
5898     free(tid);
5899 
5900     if (!err && complex_subsampled()) {
5901 	dset->padmask = mask;
5902     } else {
5903 	free(mask);
5904     }
5905 
5906     return err;
5907 }
5908 
5909 int
set_panel_structure_from_varnames(const char * uname,const char * tname,DATASET * dset)5910 set_panel_structure_from_varnames (const char *uname, const char *tname,
5911 				   DATASET *dset)
5912 {
5913     int n = dset->n;
5914     int uv = 0, tv = 0;
5915     int err = 0;
5916 
5917     err = uv_tv_from_varnames(uname, tname, dset, &uv, &tv);
5918 
5919     if (!err) {
5920 	err = check_index_values(dset->Z[uv], n);
5921     }
5922 
5923     if (!err) {
5924 	err = check_index_values(dset->Z[tv], n);
5925     }
5926 
5927     if (!err) {
5928 	err = set_panel_structure_from_vars(uv, tv, dset);
5929     }
5930 
5931     return err;
5932 }
5933 
group_uniqueness_check(char ** S,int n)5934 static int group_uniqueness_check (char **S, int n)
5935 {
5936     int i, j;
5937 
5938     for (i=0; i<n-1; i++) {
5939 	for (j=i+1; j<n; j++) {
5940 	    if (!strcmp(S[i], S[j])) {
5941 		gretl_errmsg_sprintf("The string '%s' is given for "
5942 				     "two or more groups", S[i]);
5943 		return E_DATA;
5944 	    }
5945 	}
5946     }
5947 
5948     return 0;
5949 }
5950 
group_names_from_array(const char * aname,int n_groups,int * err)5951 static char **group_names_from_array (const char *aname,
5952 				      int n_groups,
5953 				      int *err)
5954 {
5955     gretl_array *A = get_array_by_name(aname);
5956     char **S = NULL;
5957     int ns = 0;
5958 
5959     if (A == NULL) {
5960 	*err = E_DATA;
5961     } else {
5962 	char **AS = gretl_array_get_strings(A, &ns);
5963 
5964 	if (ns != n_groups) {
5965 	    *err = E_DATA;
5966 	} else {
5967 	    S = strings_array_dup(AS, ns);
5968 	    if (S == NULL) {
5969 		*err = E_ALLOC;
5970 	    }
5971 	}
5972     }
5973 
5974     return S;
5975 }
5976 
5977 /* usable_groups_series: to be usable as the basis for panel
5978    groups strings, a series must have values that are (a)
5979    constant within-group and (b) unique across groups. Note
5980    that we only get to this test if the series in question
5981    has already been found to be string-valued.
5982 */
5983 
usable_groups_series(DATASET * dset,int v,const char * vname)5984 static int usable_groups_series (DATASET *dset, int v,
5985 				 const char *vname)
5986 {
5987     const double *x = dset->Z[v];
5988     int i, j, g = 0;
5989     int ok = 1;
5990 
5991     if (na(x[0])) {
5992 	return 0;
5993     }
5994 
5995     for (i=1; i<dset->n && ok; i++) {
5996 	if (na(x[i])) {
5997 	    ok = 0;
5998 	} else if (i % dset->pd == 0) {
5999 	    /* starting a new group: x-value must not repeat a
6000 	       prior group's value */
6001 	    for (j=0; j<=g; j++) {
6002 		if (x[i] == x[j*dset->pd]) {
6003 		    gretl_errmsg_sprintf("%s: values are not unique per group",
6004 					 vname);
6005 		    ok = 0;
6006 		}
6007 	    }
6008 	    g++;
6009 	} else if (x[i] != x[i-1]) {
6010 	    gretl_errmsg_sprintf("%s: is not constant within group", vname);
6011 	    ok = 0;
6012 	}
6013     }
6014 
6015     return ok;
6016 }
6017 
maybe_use_strval_series(DATASET * dset,const char * vname,int ng)6018 static int maybe_use_strval_series (DATASET *dset,
6019 				    const char *vname,
6020 				    int ng)
6021 {
6022     int v = current_series_index(dset, vname);
6023     series_table *st = NULL;
6024     int ns = 0;
6025     int err = 0;
6026 
6027     if (v <= 0) {
6028 	return E_INVARG;
6029     }
6030 
6031     st = series_get_string_table(dset, v);
6032     if (st == NULL) {
6033 	gretl_errmsg_sprintf("The series %s is not string-valued", vname);
6034 	return E_INVARG;
6035     }
6036 
6037     series_table_get_strings(st, &ns);
6038 
6039 #if 0
6040     fprintf(stderr, "maybe_use_strval_series (%s, ID %d, ns=%d)\n", vname, v, ns);
6041 #endif
6042 
6043     if (ns < ng) {
6044 	gretl_errmsg_sprintf("The series %s holds %d strings but %d "
6045 			     "are needed", vname, ns, ng);
6046 	err = E_INVARG;
6047     } else {
6048 	/* note: don't mess with numerical values, just check them */
6049 	if (usable_groups_series(dset, v, vname)) {
6050 	    set_panel_groups_name(dset, vname);
6051 	} else {
6052 	    err = E_INVARG;
6053 	}
6054     }
6055 
6056     return err;
6057 }
6058 
6059 /* Enable construction of a string-valued series holding a name for
6060    each panel group/unit. The name of this series is set on the
6061    'pangrps' member of @dset, and the names are then used in panel
6062    graphs where appropriate.
6063 
6064    @vname should contain the name of a series, either an existing one
6065    (which may be overwritten) or a new one to create; and @grpnames
6066    should be (1) a string literal, or (2) the name of a string variable
6067    (which in either case should hold N space-separated strings, where N
6068    is the number of panel groups), or (3) the name of an array variable
6069    holding N strings.
6070 */
6071 
set_panel_group_strings(const char * vname,const char * grpnames,DATASET * dset)6072 int set_panel_group_strings (const char *vname,
6073 			     const char *grpnames,
6074 			     DATASET *dset)
6075 {
6076     const char *namestr = NULL;
6077     char **S = NULL;
6078     int ng = dset->n / dset->pd;
6079     int v, orig_v = dset->v;
6080     int err = 0;
6081 
6082     if (vname == NULL || *vname == '\0') {
6083 	return E_DATA;
6084     }
6085 
6086     if (grpnames == NULL || *grpnames == '\0') {
6087 	/* We just got the name of a series? That may be
6088 	   OK if it's string-valued and has a suitable
6089 	   set of values.
6090 	*/
6091 	return maybe_use_strval_series(dset, vname, ng);
6092     }
6093 
6094     if (strchr(grpnames, ' ')) {
6095 	/* group names as string literal */
6096 	namestr = grpnames;
6097     } else if (gretl_is_string(grpnames)) {
6098 	/* group names in a single string variable */
6099 	namestr = get_string_by_name(grpnames);
6100     } else {
6101 	/* try for array of strings */
6102 	S = group_names_from_array(grpnames, ng, &err);
6103     }
6104 
6105     if (namestr != NULL) {
6106 	/* we must obtain @S by splitting */
6107 	int ns = 0;
6108 
6109 	if (strchr(namestr, '"') != NULL) {
6110 	    S = gretl_string_split_quoted(namestr, &ns, NULL, &err);
6111 	} else {
6112 	    S = gretl_string_split(namestr, &ns, " \n\t");
6113 	}
6114 	if (!err && S == NULL) {
6115 	    err = E_ALLOC;
6116 	}
6117 	if (!err && ns != ng) {
6118 	    /* FIXME subsampled case? */
6119 	    fprintf(stderr, "Got %d strings but there are %d groups\n",
6120 		    ns, ng);
6121 	    strings_array_free(S, ns);
6122 	    S = NULL;
6123 	    err = E_DATA;
6124 	}
6125     }
6126 
6127     if (!err && S != NULL) {
6128 	err = group_uniqueness_check(S, ng);
6129     }
6130 
6131     if (!err) {
6132 	v = current_series_index(dset, vname);
6133 	if (v < 0) {
6134 	    /* we need to add a series */
6135 	    char *gen = gretl_strdup_printf("%s", vname);
6136 
6137 	    err = generate(gen, dset, GRETL_TYPE_SERIES, OPT_Q, NULL);
6138 	    if (!err) {
6139 		v = dset->v - 1;
6140 	    }
6141 	    free(gen);
6142 	}
6143     }
6144 
6145     if (!err) {
6146 	series_table *st = series_table_new(S, ng, &err);
6147 
6148 	if (!err) {
6149 	    int i, g = 0;
6150 
6151 	    series_attach_string_table(dset, v, st);
6152 	    for (i=0; i<dset->n; i++) {
6153 		if (i % dset->pd == 0) {
6154 		    g++;
6155 		}
6156 		dset->Z[v][i] = g;
6157 	    }
6158 	}
6159     }
6160 
6161     if (err) {
6162 	if (S != NULL) {
6163 	    strings_array_free(S, ng);
6164 	}
6165 	if (dset->v > orig_v) {
6166 	    dataset_drop_last_variables(dset, dset->v - orig_v);
6167 	}
6168     } else {
6169 	set_panel_groups_name(dset, vname);
6170     }
6171 
6172     return err;
6173 }
6174 
6175 /* utility functions */
6176 
find_time_var(const DATASET * dset)6177 static int find_time_var (const DATASET *dset)
6178 {
6179     const char *tnames[] = {
6180 	"year",
6181 	"Year",
6182 	"period",
6183 	"Period",
6184 	NULL
6185     };
6186     int i, v;
6187 
6188     for (i=0; tnames[i] != NULL; i++) {
6189 	v = series_index(dset, tnames[i]);
6190 	if (v < dset->v) {
6191 	    return v;
6192 	}
6193     }
6194 
6195     return 0;
6196 }
6197 
guess_panel_structure(double ** Z,DATASET * dset)6198 int guess_panel_structure (double **Z, DATASET *dset)
6199 {
6200     int ret, v = find_time_var(dset);
6201 
6202     if (v == 0) {
6203 	ret = 0; /* can't guess */
6204     } else if (floateq(Z[v][0], Z[v][1])) {
6205 	/* "year" or whatever is same for first two obs */
6206 	dset->structure = STACKED_CROSS_SECTION;
6207 	ret = STACKED_CROSS_SECTION;
6208     } else {
6209 	dset->structure = STACKED_TIME_SERIES;
6210 	ret = STACKED_TIME_SERIES;
6211     }
6212 
6213     return ret;
6214 }
6215 
balanced_panel(const DATASET * dset)6216 int balanced_panel (const DATASET *dset)
6217 {
6218     int n = dset->t2 - dset->t1 + 1;
6219     int ret = 0;
6220 
6221     if (n % dset->pd == 0) {
6222 	char unit[OBSLEN], period[OBSLEN];
6223 
6224 	if (sscanf(dset->endobs, "%[^:]:%s", unit, period) == 2) {
6225 	    if (atoi(period) == dset->pd) {
6226 		ret = 1;
6227 	    }
6228 	}
6229     }
6230 
6231     return ret;
6232 }
6233 
may_be_time_name(const char * vname)6234 static int may_be_time_name (const char *vname)
6235 {
6236     char test[VNAMELEN];
6237 
6238     strcpy(test, vname);
6239     gretl_lower(test);
6240 
6241     return strcmp(test, "year") == 0 ||
6242 	strcmp(test, "period") == 0;
6243 }
6244 
6245 /* See if the panel dataset contains a variable that plausibly represents
6246    the year of the observations. We look for series labeled "year" or
6247    "period" (in a case insensitive comparison) and, if found, check
6248    that the series has the same sequence of values for each individual,
6249    and the same increment between successive time-series observations.
6250    (The increment does not have to be 1, to accommodate, e.g.,
6251    quinquennial or decennial observations.)
6252 
6253    Return the ID number of a suitable variable, or 0 if there's nothing
6254    suitable.
6255 
6256    This may be used in setting the x-axis tic marks in a panel time-
6257    series plot.
6258 */
6259 
plausible_panel_time_var(const DATASET * dset)6260 int plausible_panel_time_var (const DATASET *dset)
6261 {
6262     int i, t, ret = 0;
6263 
6264     for (i=1; i<dset->v && ret==0; i++) {
6265 	if (may_be_time_name(dset->varname[i])) {
6266 	    const double *x = dset->Z[i];
6267 	    int val0 = x[0];
6268 	    int incr0 = x[1] - x[0];
6269 	    int ok = 1;
6270 
6271 	    for (t=0; t<dset->n && ok; t++) {
6272 		if (na(x[t]) || x[t] < 0) {
6273 		    ok = 0;
6274 		} else if (t > 0 && t % dset->pd == 0) {
6275 		    if (x[t] != val0) {
6276 			ok = 0;
6277 		    }
6278 		} else if (t > 1 && x[t] - x[t-1] != incr0) {
6279 		    ok = 0;
6280 		}
6281 	    }
6282 	    if (ok) {
6283 		ret = i;
6284 	    }
6285 	}
6286     }
6287 
6288     return ret;
6289 }
6290 
6291 /* FIXME: this does not yet handle the dropping of instruments */
6292 
dpanel_list_omit(const MODEL * orig,const int * drop,int * err)6293 static int *dpanel_list_omit (const MODEL *orig, const int *drop, int *err)
6294 {
6295     const int *old = orig->list;
6296     int *new = gretl_list_copy(old);
6297     int sep = 0;
6298     int i, j;
6299 
6300     if (new == NULL) {
6301 	*err = E_ALLOC;
6302 	return NULL;
6303     }
6304 
6305     for (i=2; i<=new[0]; i++) {
6306 	if (new[i] == LISTSEP) {
6307 	    sep++;
6308 	}
6309 	if (sep == 1) {
6310 	    for (j=1; j<=drop[0]; j++) {
6311 		if (drop[j] == new[i]) {
6312 		    gretl_list_delete_at_pos(new, i--);
6313 		}
6314 	    }
6315 	}
6316     }
6317 
6318 #if 0
6319     printlist(old, "old");
6320     printlist(drop, "drop");
6321     printlist(new, "new");
6322 #endif
6323 
6324     return new;
6325 }
6326 
6327 /**
6328  * panel_list_omit:
6329  * @orig: list specifying original panel model.
6330  * @drop: list of variables to be omitted.
6331  * @err: pointer to receive error code.
6332  *
6333  * Creates a new panel regression list, by first reconstructing
6334  * the regression specification from @orig then deleting from
6335  * the reconstruction the variables found in @drop.
6336  *
6337  * Returns: the new, reduced list or %NULL on error.
6338  */
6339 
panel_list_omit(const MODEL * orig,const int * drop,int * err)6340 int *panel_list_omit (const MODEL *orig, const int *drop, int *err)
6341 {
6342     int *newlist = NULL;
6343     int i;
6344 
6345     if (orig->ci == DPANEL) {
6346 	return dpanel_list_omit(orig, drop, err);
6347     }
6348 
6349     /* sorry, can't drop the constant */
6350     if (drop != NULL) {
6351 	int cpos = in_gretl_list(drop, 0);
6352 
6353 	if (cpos >= 2) {
6354 	    gretl_errmsg_set("Panel models must include an intercept");
6355 	    *err = E_DATA;
6356 	    return NULL;
6357 	}
6358     }
6359 
6360     if (orig->opt & OPT_F) {
6361 	int *panlist;
6362 
6363 	/* fixed-effects lists have the constant removed,
6364 	   so we need to put it back first
6365 	*/
6366 	panlist = gretl_list_new(orig->list[0] + 1);
6367 	if (panlist != NULL) {
6368 	    panlist[1] = orig->list[1];
6369 	    panlist[2] = 0;
6370 	    for (i=3; i<=panlist[0]; i++) {
6371 		panlist[i] = orig->list[i-1];
6372 	    }
6373 	    if (drop == NULL) {
6374 		newlist = gretl_list_omit_last(panlist, err);
6375 	    } else {
6376 		newlist = gretl_list_omit(panlist, drop, 2, err);
6377 	    }
6378 	    free(panlist);
6379 	}
6380     } else if (drop == NULL) {
6381 	newlist = gretl_list_omit_last(orig->list, err);
6382     } else {
6383 	newlist = gretl_list_omit(orig->list, drop, 2, err);
6384     }
6385 
6386     return newlist;
6387 }
6388 
6389 /* FIXME doesn't handle adding instruments */
6390 
dpanel_list_add(const MODEL * orig,const int * add,int * err)6391 static int *dpanel_list_add (const MODEL *orig, const int *add, int *err)
6392 {
6393     const int *old = orig->list;
6394     int *new = gretl_list_copy(old);
6395     int sep = 0, pos = old[0] + 1;
6396     int i;
6397 
6398     if (new == NULL) {
6399 	*err = E_ALLOC;
6400 	return NULL;
6401     }
6402 
6403     for (i=2; i<=old[0]; i++) {
6404 	if (old[i] == LISTSEP) {
6405 	    sep++;
6406 	    if (sep == 2) {
6407 		pos = i - 1;
6408 	    }
6409 	}
6410     }
6411 
6412     gretl_list_insert_list(&new, add, pos);
6413     if (new == NULL) {
6414 	*err = E_ALLOC;
6415 	return NULL;
6416     }
6417 
6418 #if 0
6419     printlist(old, "old");
6420     printlist(add, "add");
6421     printlist(new, "new");
6422 #endif
6423 
6424     return new;
6425 }
6426 
6427 /**
6428  * panel_list_add:
6429  * @orig: list specifying original panel model.
6430  * @add: list of variables to be added.
6431  * @err: pointer to receive error code.
6432  *
6433  * Creates a new panel regression list, by first reconstructing
6434  * the regression specification from @orig then adding to
6435  * the reconstruction the variables found in @add.
6436  *
6437  * Returns: the new, augmented list or %NULL on error.
6438  */
6439 
panel_list_add(const MODEL * orig,const int * add,int * err)6440 int *panel_list_add (const MODEL *orig, const int *add, int *err)
6441 {
6442     if (orig->ci == DPANEL) {
6443 	return dpanel_list_add(orig, add, err);
6444     } else {
6445 	return gretl_list_add(orig->list, add, err);
6446     }
6447 }
6448 
6449 /* calculate the within and between standard deviations for a given
6450    variable in a panel data set */
6451 
panel_variance_info(const double * x,const DATASET * dset,double xbar,double * psw,double * psb)6452 int panel_variance_info (const double *x, const DATASET *dset,
6453 			 double xbar, double *psw, double *psb)
6454 {
6455     double sw = 0.0, sb = 0.0;
6456     double xibar, d;
6457     int effn, effnT;
6458     int n, T, nT, Ti;
6459     int i, t, s;
6460 
6461     if (!dataset_is_panel(dset)) {
6462 	return E_PDWRONG;
6463     }
6464 
6465     nT = dset->t2 - dset->t1 + 1;
6466     T = dset->pd;
6467     n = nT / T;
6468 
6469     effn = 0;
6470     effnT = 0;
6471 
6472     for (i=0; i<n; i++) {
6473 	Ti = 0;
6474 	xibar = 0.0;
6475 	for (t=0; t<T; t++) {
6476 	    s = dset->t1 + i * T + t;
6477 	    if (!na(x[s])) {
6478 		Ti++;
6479 		xibar += x[s];
6480 	    }
6481 	}
6482 	if (Ti > 1) {
6483 	    xibar /= Ti;
6484 	    for (t=0; t<T; t++) {
6485 		s = dset->t1 + i * T + t;
6486 		if (!na(x[s])) {
6487 		    d = x[s] - xibar;
6488 		    sw += d * d;
6489 		}
6490 	    }
6491 	}
6492 	if (Ti > 0) {
6493 	    /* is this right for singleton observations? */
6494 	    d = xibar - xbar;
6495 	    sb += d * d;
6496 	    effn++;
6497 	    effnT += Ti;
6498 	}
6499     }
6500 
6501     if (effn > 1) {
6502 	/* between variance: \sum_{i=1}^N (\bar{x}_i - \bar{x})^2
6503 	   \times (N-1)^{-1}
6504 	*/
6505 	sb /= (effn - 1);
6506 	sb = sqrt(sb);
6507     } else {
6508 	sb = NADBL;
6509     }
6510 
6511     if (effnT - effn > 0) {
6512 	/* within variance: \sum_{i=1}^N \sum_{t=1}^T (x_{it} - \bar{x}_i)^2
6513 	   \times (\sum_{i=1}^N T_i - N)^{-1}
6514 	*/
6515 	sw /= (effnT - effn);
6516 	sw = sqrt(sw);
6517     } else {
6518 	sw = NADBL;
6519     }
6520 
6521     *psw = sw;
6522     *psb = sb;
6523 
6524     return 0;
6525 }
6526 
series_is_group_invariant(const DATASET * dset,int v)6527 int series_is_group_invariant (const DATASET *dset, int v)
6528 {
6529     const double *x = dset->Z[v];
6530     int T = dset->pd;
6531     int N = dset->n / T;
6532     int i, t, s;
6533 
6534     for (i=1; i<N; i++) {
6535 	for (t=0; t<T; t++) {
6536 	    s = t + i * T;
6537 	    if (x[s] != x[t]) {
6538 		return 0;
6539 	    }
6540 	}
6541     }
6542 
6543     return 1;
6544 }
6545 
panel_padding_rows(const DATASET * dset)6546 int panel_padding_rows (const DATASET *dset)
6547 {
6548     int missrow, nmiss = 0;
6549     int i, t;
6550 
6551     for (t=dset->t1; t<=dset->t2; t++) {
6552 	missrow = 1;
6553 	for (i=1; i<dset->v; i++) {
6554 	    if (!na(dset->Z[i][t])) {
6555 		missrow = 0;
6556 		break;
6557 	    }
6558 	}
6559 	if (missrow) {
6560 	    nmiss++;
6561 	}
6562     }
6563 
6564     return nmiss;
6565 }
6566