1 /*
2  *  gretl -- Gnu Regression, Econometrics and Time-series Library
3  *  Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 /* mechanisms for handling missing observations */
21 
22 #include "libgretl.h"
23 #include "missing_private.h"
24 #include "gretl_func.h"
25 
26 typedef struct MISSOBS_ MISSOBS;
27 
28 struct MISSOBS_ {
29     int misscount;
30     char *missvec;
31 };
32 
33 #define MASKDEBUG 0
34 
35 /**
36  * model_has_missing_obs:
37  * @pmod: pointer to model.
38  *
39  * Returns: 1 if there are missing observations in the
40  * model's sample range, otherwise 0.
41  */
42 
model_has_missing_obs(const MODEL * pmod)43 int model_has_missing_obs (const MODEL *pmod)
44 {
45     int sample = pmod->t2 - pmod->t1 + 1;
46 
47     return (pmod->nobs < sample);
48 }
49 
really_missing(int v,int t,const double ** Z,int d)50 static int really_missing (int v, int t, const double **Z, int d)
51 {
52     if (d > 0 && Z[d][t] == 0) {
53 	/* the obs is dummied out */
54 	return 0;
55     } else {
56 	return na(Z[v][t]);
57     }
58 }
59 
model_missing(const MODEL * pmod,int t)60 int model_missing (const MODEL *pmod, int t)
61 {
62     if (pmod->missmask != NULL) {
63 	return pmod->missmask[t] == '1';
64     } else {
65 	return 0;
66     }
67 }
68 
69 /* Construct a mask to code for missing observations within the sample
70    range for a model.  The mask is an array of char with '0's
71    for OK observations and '1's for missing obs, terminated with
72    a NUL byte. This format allows copying of the mask using
73    strcpy.
74 */
75 
model_missmask(const int * list,int t1,int t2,int n,const double ** Z,int dwt,int * misscount)76 static char *model_missmask (const int *list, int t1, int t2,
77 			     int n, const double **Z, int dwt,
78 			     int *misscount)
79 {
80     char *mask;
81     int i, vi, t;
82 
83     mask = malloc(n + 1);
84     if (mask == NULL) {
85 	return NULL;
86     }
87 
88     memset(mask, '0', n);
89     mask[n] = 0; /* note NUL-termination */
90 
91 #if MASKDEBUG
92     fprintf(stderr, "model_missmask: using series length %d\n", n);
93 #endif
94 
95     if (misscount != NULL) {
96 	*misscount = 0;
97     }
98 
99     for (t=t1; t<=t2; t++) {
100 	for (i=1; i<=list[0]; i++) {
101 	    vi = list[i];
102 	    if (vi > 0 && vi != LISTSEP) {
103 		if (really_missing(vi, t, Z, dwt)) {
104 #if MASKDEBUG > 1
105 		    fprintf(stderr, "model_missmask: NA at list[%d] (%d), "
106 			    "obs %d\n", i, vi, t);
107 #endif
108 		    /* FIXME dwt case and nobs? */
109 		    mask[t] = '1';
110 		    if (misscount != NULL) {
111 			*misscount += 1;
112 		    }
113 		    break;
114 		}
115 	    }
116 	}
117     }
118 
119     return mask;
120 }
121 
122 /**
123  * model_adjust_sample:
124  * @pmod: pointer to gretl model.
125  * @n: full length of data array.
126  * @Z: data array.
127  * @misst: location to receive the first observation with a
128  *         missing value inside the sample range, or NULL.
129  *
130  * Drops leading or trailing observations from the sample range
131  * initially given by the values in the t1 and t2 members of
132  * @pmod, if missing values are found among the variables given
133  * in the list member of @pmod.
134  *
135  * Also checks for missing values within the adjusted sample range.
136  * If such values are encountered, either flag an error (if
137  * @misst != %NULL), or construct a "missing mask" and attach it
138  * to @pmod.
139  *
140  * Returns: if @misst is not NULL, either the ID number of the
141  * variable for which a missing value is first found inside the
142  * adjusted sample range or 0 if there is no such variable.  If
143  * @misst is NULL, returns 1 if there is an error creating
144  * the missing obs mask, otherwise 0.
145  */
146 
model_adjust_sample(MODEL * pmod,int n,const double ** Z,int * misst)147 int model_adjust_sample (MODEL *pmod, int n, const double **Z,
148 			 int *misst)
149 {
150     int i, t, dwt = 0, t1min = pmod->t1, t2max = pmod->t2;
151     int vi, missobs, ret = 0;
152     int move_ends = 1;
153 
154     if (gretl_model_get_int(pmod, "wt_dummy")) {
155 	/* we have a weight variable which is a 0/1 dummy */
156 	dwt = pmod->nwt;
157     }
158 
159     /* advance start of sample range to skip missing obs */
160     for (t=t1min; t<t2max; t++) {
161 	missobs = 0;
162 	for (i=1; i<=pmod->list[0]; i++) {
163 	    vi = pmod->list[i];
164 	    if (vi > 0 && vi != LISTSEP) {
165 		if (really_missing(vi, t, Z, dwt)) {
166 		    missobs = 1;
167 		    break;
168 		}
169 	    }
170 	}
171 	if (missobs) {
172 	    t1min++;
173 	} else {
174 	    break;
175 	}
176     }
177 
178     /* retard end of sample range to skip missing obs */
179     for (t=t2max; t>t1min; t--) {
180 	missobs = 0;
181 	for (i=1; i<=pmod->list[0]; i++) {
182 	    vi = pmod->list[i];
183 	    if (vi > 0 && vi != LISTSEP) {
184 		if (really_missing(vi, t, Z, dwt)) {
185 		    missobs = 1;
186 		    break;
187 		}
188 	    }
189 	}
190 	if (missobs) {
191 	    t2max--;
192 	} else {
193 	    break;
194 	}
195     }
196 
197     if (misst != NULL) {
198 	/* check for missing values within remaining range and
199 	   flag an error in case any are found */
200 	for (t=t1min; t<=t2max && !ret; t++) {
201 	    for (i=1; i<=pmod->list[0]; i++) {
202 		vi = pmod->list[i];
203 		if (vi > 0 && vi != LISTSEP) {
204 		    if (really_missing(vi, t, Z, dwt)) {
205 			/* identify first missing obs and var */
206 			*misst = t + 1;
207 			ret = vi;
208 			break;
209 		    }
210 		}
211 	    }
212 	}
213     } else {
214 	/* construct a mask for missing values within remaining range?
215 	   we do this only if misst == NULL */
216 	missobs = 0;
217 	for (t=t1min; t<=t2max; t++) {
218 	    for (i=1; i<=pmod->list[0]; i++) {
219 		vi = pmod->list[i];
220 		if (vi > 0 || vi != LISTSEP) {
221 		    if (really_missing(vi, t, Z, dwt)) {
222 			missobs++;
223 			break;
224 		    }
225 		}
226 	    }
227 	}
228 
229 	if (missobs == t2max - t1min + 1) {
230 	    /* no valid observations */
231 	    pmod->errcode = E_MISSDATA;
232 	    ret = 1;
233 	} else if (missobs > 0) {
234 	    pmod->missmask = model_missmask(pmod->list, pmod->t1, pmod->t2,
235 					    n, Z, dwt, NULL);
236 	    move_ends = 0;
237 	    if (pmod->missmask == NULL) {
238 		pmod->errcode = E_ALLOC;
239 		ret = 1;
240 	    }
241 	}
242     }
243 
244     if (move_ends) {
245 	pmod->t1 = t1min;
246 	pmod->t2 = t2max;
247     }
248 
249 #if MASKDEBUG
250     if (pmod->missmask != NULL) {
251 	fprintf(stderr, "model at %p: now has mask at %p\n",
252 		(void *) pmod, (void *) pmod->missmask);
253     }
254 #endif
255 
256     return ret;
257 }
258 
259 /**
260  * first_missing_index:
261  * @x: array to be checked for missing values.
262  * @t1: start of range to check.
263  * @t2: end of range to check.
264  *
265  * Returns: the index of the first missing observation in @x
266  * over the sample range @t1 to @t2, or -1 if there is no
267  * such observation.
268  */
269 
first_missing_index(const double * x,int t1,int t2)270 int first_missing_index (const double *x, int t1, int t2)
271 {
272     int t;
273 
274     for (t=t1; t<=t2; t++) {
275 	if (na(x[t])) {
276 	    return t;
277 	}
278     }
279 
280     return -1;
281 }
282 
283 /**
284  * series_adjust_sample:
285  * @x: series to be checked for missing values.
286  * @t1: on entry, initial start of sample range; on exit,
287  *      start of sample range adjusted for missing values.
288  * @t2: on entry, initial end of sample range; on exit, end
289  *      of sample range adjusted for missing values.
290  *
291  * Adjusts @t1 and @t2 so as to drop any leading or trailing
292  * missing observations.
293  *
294  * Returns: E_MISSDATA if interior missing values were found
295  * within the (possibly adjusted) sample range, otherwise 0.
296  */
297 
series_adjust_sample(const double * x,int * t1,int * t2)298 int series_adjust_sample (const double *x, int *t1, int *t2)
299 {
300     int t, t1min = *t1, t2max = *t2;
301     int err = 0;
302 
303     for (t=t1min; t<t2max; t++) {
304 	if (na(x[t])) t1min++;
305 	else break;
306     }
307 
308     for (t=t2max; t>t1min; t--) {
309 	if (na(x[t])) t2max--;
310 	else break;
311     }
312 
313     for (t=t1min; t<=t2max; t++) {
314 	if (na(x[t])) {
315 	    err = E_MISSDATA;
316 	    break;
317 	}
318     }
319 
320     *t1 = t1min;
321     *t2 = t2max;
322 
323     return err;
324 }
325 
326 /**
327  * list_adjust_sample:
328  * @list: list of variables to be tested for missing values,
329  * or %NULL to test all series.
330  * @t1: on entry, initial start of sample range; on exit,
331  *      start of sample range adjusted for missing values.
332  * @t2: on entry, initial end of sample range; on exit, end
333  *      of sample range adjusted for missing values.
334  * @dset: dataset struct.
335  * @nmiss: location to receive number of missing values within
336  * (possibly adjusted) sample range.
337  *
338  * Drops leading or trailing observations from the sample range
339  * initially given by the values in @t1 and @t2 if missing values
340  * are found for any of the variables given in @list.
341  *
342  * If @nmiss is non-NULL it receives the number of missing values
343  * inside the (possibly reduced) sample range, otherwise it is
344  * considered an error if there are any such missing values.
345  *
346  * Returns: 0 on success or %E_MISSDATA or error.
347  */
348 
list_adjust_sample(const int * list,int * t1,int * t2,const DATASET * dset,int * nmiss)349 int list_adjust_sample (const int *list, int *t1, int *t2,
350 			const DATASET *dset, int *nmiss)
351 {
352     int i, t, t1min = *t1, t2max = *t2;
353     int k, vi, missing, err = 0;
354 
355     if (list != NULL) {
356 	k = list[0];
357     } else {
358 	/* check all series */
359 	k = dset->v - 1;
360     }
361 
362     /* advance start of sample range to skip missing obs? */
363     for (t=t1min; t<t2max; t++) {
364 	missing = 0;
365 	for (i=1; i<=k; i++) {
366 	    vi = list == NULL ? i : list[i];
367 	    if (vi > 0 && vi != LISTSEP) {
368 		if (na(dset->Z[vi][t])) {
369 		    missing = 1;
370 		    break;
371 		}
372 	    }
373 	}
374 	if (missing) {
375 	    t1min++;
376 	} else {
377 	    break;
378 	}
379     }
380 
381     /* retard end of sample range to skip missing obs? */
382     for (t=t2max; t>t1min; t--) {
383 	missing = 0;
384 	for (i=1; i<=k; i++) {
385 	    vi = list == NULL ? i : list[i];
386 	    if (vi > 0 && vi != LISTSEP) {
387 		if (na(dset->Z[vi][t])) {
388 		    missing = 1;
389 		    break;
390 		}
391 	    }
392 	}
393 	if (missing) {
394 	    t2max--;
395 	} else {
396 	    break;
397 	}
398     }
399 
400     if (nmiss != NULL) {
401 	*nmiss = 0;
402     }
403 
404     /* check for missing values within remaining range */
405     for (t=t1min; t<=t2max && !err; t++) {
406 	missing = 0;
407 	for (i=1; i<=k; i++) {
408 	    vi = list == NULL ? i : list[i];
409 	    if (vi > 0 && vi != LISTSEP) {
410 		if (na(dset->Z[vi][t])) {
411 		    if (nmiss == NULL) {
412 			err = E_MISSDATA;
413 		    } else {
414 			*nmiss += 1;
415 		    }
416 		    break;
417 		}
418 	    }
419 	}
420     }
421 
422     *t1 = t1min;
423     *t2 = t2max;
424 
425     return err;
426 }
427 
428 /* For handling the "omit" command, applied to a model that has
429    missing values within the sample range.  The model as re-estimated
430    with a reduced set of regressors must use the sample sample range
431    as the original model, or else the comparison of the original and
432    reduced models will be invalid.
433 */
434 
435 /* copy of a given model's missmask, or NULL */
436 
437 static char *refmask;
438 
build_refmask_from_model(const MODEL * pmod)439 static char *build_refmask_from_model (const MODEL *pmod)
440 {
441     int t, n = pmod->full_n;
442     char *mask = malloc(n + 1);
443 
444     if (mask != NULL) {
445 	memset(mask, '0', n);
446 	mask[n] = 0; /* NUL-terminate */
447 	for (t=pmod->t1; t<=pmod->t2; t++) {
448 	    if (na(pmod->uhat[t])) {
449 		mask[t] = '1';
450 	    }
451 	}
452     }
453 
454     return mask;
455 }
456 
457 /* Set the "reference" mask based on a given model */
458 
set_reference_missmask_from_model(const MODEL * pmod)459 void set_reference_missmask_from_model (const MODEL *pmod)
460 {
461     if (pmod != NULL) {
462 	if (refmask != NULL) {
463 	    free(refmask);
464 	    refmask = NULL;
465 	}
466 	if (pmod->missmask != NULL) {
467 	    refmask = gretl_strdup(pmod->missmask);
468 	} else if (model_has_missing_obs(pmod)) {
469 	    refmask = build_refmask_from_model(pmod);
470 	}
471     }
472 }
473 
copy_to_reference_missmask(const char * mask)474 int copy_to_reference_missmask (const char *mask)
475 {
476     if (refmask != NULL) {
477 	free(refmask);
478 	refmask = NULL;
479     }
480 
481     refmask = gretl_strdup(mask);
482 
483 #if MASKDEBUG
484     fprintf(stderr, "copy_to_reference_missmask: refmask = %p\n",
485 	    (void *) refmask);
486 #endif
487 
488     return (refmask == NULL)? E_ALLOC : 0;
489 }
490 
491 /* attach the reference missing obs mask to a specified model */
492 
apply_reference_missmask(MODEL * pmod)493 int apply_reference_missmask (MODEL *pmod)
494 {
495     if (refmask != NULL) {
496 #if MASKDEBUG
497 	fprintf(stderr, "applying reference mask to model = %p\n",
498 		(void *) pmod);
499 #endif
500 	pmod->missmask = refmask;
501 	refmask = NULL; /* note: apply it only once */
502     }
503 
504     return 0;
505 }
506 
reference_missmask_present(void)507 int reference_missmask_present (void)
508 {
509     return (refmask != NULL);
510 }
511 
real_setmiss(double missval,int varno,DATASET * dset)512 static int real_setmiss (double missval, int varno,
513 			 DATASET *dset)
514 {
515     int i, t, count = 0;
516     int start = 1, end = dset->v;
517 
518     if (varno) {
519 	start = varno;
520 	end = varno + 1;
521     }
522 
523     for (i=start; i<end; i++) {
524 	for (t=0; t<dset->n; t++) {
525 	    if (dset->Z[i][t] == missval) {
526 		dset->Z[i][t] = NADBL;
527 		count++;
528 	    }
529 	}
530     }
531 
532     return count;
533 }
534 
535 /**
536  * set_miss:
537  * @list: list of variables to process, or an empty list or %NULL
538  *        to process all variables.
539  * @param: string representation of the numerical value to treat as missing.
540  * @dset: dataset struct.
541  * @prn: pointer to printing struct.
542  *
543  * Set to "missing" each observation of each series in @list that
544  * has the value represented by @param.
545  *
546  * Returns: 0 on success, non-zero code on failure.
547  */
548 
set_miss(const int * list,const char * param,DATASET * dset,PRN * prn)549 int set_miss (const int *list, const char *param,
550 	      DATASET *dset, PRN *prn)
551 {
552     int i, vi, count;
553     double missval;
554     int err = 0;
555 
556     if (param == NULL) {
557 	return E_ARGS;
558     }
559 
560     if (dset == NULL || dset->n == 0) {
561 	return E_NODATA;
562     }
563 
564     missval = gretl_double_from_string(param, &err);
565     if (err) {
566 	return err;
567     }
568 
569     if (list == NULL || list[0] == 0) {
570 	count = real_setmiss(missval, 0, dset);
571 	if (count) {
572 	    pprintf(prn, _("Set %d values to \"missing\"\n"), count);
573 	} else {
574 	    pputs(prn, _("Didn't find any matching observations\n"));
575 	}
576     } else {
577 	for (i=1; i<=list[0]; i++) {
578 	    vi = list[i];
579 	    if (vi == 0 || object_is_const(dset->varname[vi], vi)) {
580 		gretl_errmsg_sprintf(_("The variable %s is read-only"),
581 				     dset->varname[vi]);
582 		err = E_DATA;
583 		break;
584 	    }
585 	    count = real_setmiss(missval, vi, dset);
586 	    if (count) {
587 		pprintf(prn, _("%s: set %d observations to \"missing\"\n"),
588 			dset->varname[vi], count);
589 	    } else {
590 		pprintf(prn, _("%s: Didn't find any matching observations\n"),
591 			dset->varname[vi]);
592 	    }
593 	}
594     }
595 
596     return err;
597 }
598 
599 /**
600  * missing_obs_fraction:
601  * @dset: dataset struct.
602  *
603  * Returns: the fraction of the observations in @Z for which
604  * all variables have missing values (empty rows).
605  */
606 
missing_obs_fraction(const DATASET * dset)607 double missing_obs_fraction (const DATASET *dset)
608 {
609     int missrow, totmiss = 0;
610     int i, t;
611 
612     if (dset->n == 0) {
613 	return 0.0;
614     }
615 
616     for (t=0; t<dset->n; t++) {
617 	missrow = 1;
618 	for (i=1; i<dset->v; i++) {
619 	    if (!na(dset->Z[i][t])) {
620 		missrow = 0;
621 		break;
622 	    }
623 	}
624 	totmiss += missrow;
625     }
626 
627     return (double) totmiss / dset->n;
628 }
629 
630 /**
631  * any_missing_user_values:
632  * @dset: dataset struct.
633  *
634  * Returns: 1 if there are missing values for any non-hidden
635  * variables within the current sample range, otherwise 0.
636  */
637 
any_missing_user_values(const DATASET * dset)638 int any_missing_user_values (const DATASET *dset)
639 {
640     int i, t;
641 
642     if (dset->n == 0) {
643 	return 0;
644     }
645 
646     for (i=1; i<dset->v; i++) {
647 	if (!series_is_hidden(dset, i)) {
648 	    for (t=dset->t1; t<=dset->t2; t++) {
649 		if (na(dset->Z[i][t])) {
650 		    return 1;
651 		}
652 	    }
653 	}
654     }
655 
656     return 0;
657 }
658 
659 /**
660  * model_add_missmask:
661  * @pmod: pointer to gretl MODEL.
662  * @n: length of mask.
663  *
664  * Allocates a missing obs mask of length @n and attaches it
665  * to @pmod. All elements are initialized to '0' (non-missing).
666  *
667  * Returns: 0 on success, non-zero if allocation fails.
668  */
669 
model_add_missmask(MODEL * pmod,int n)670 int model_add_missmask (MODEL *pmod, int n)
671 {
672     pmod->missmask = malloc((n+1) * sizeof *pmod->missmask);
673     if (pmod->missmask == NULL) {
674 	return E_ALLOC;
675     } else {
676 	memset(pmod->missmask, '0', n);
677 	pmod->missmask[n] = 0; /* NUL-termination */
678 	return 0;
679     }
680 }
681