1 /*
2 * gretl -- Gnu Regression, Econometrics and Time-series Library
3 * Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 */
19
20 /* mechanisms for handling missing observations */
21
22 #include "libgretl.h"
23 #include "missing_private.h"
24 #include "gretl_func.h"
25
26 typedef struct MISSOBS_ MISSOBS;
27
28 struct MISSOBS_ {
29 int misscount;
30 char *missvec;
31 };
32
33 #define MASKDEBUG 0
34
35 /**
36 * model_has_missing_obs:
37 * @pmod: pointer to model.
38 *
39 * Returns: 1 if there are missing observations in the
40 * model's sample range, otherwise 0.
41 */
42
model_has_missing_obs(const MODEL * pmod)43 int model_has_missing_obs (const MODEL *pmod)
44 {
45 int sample = pmod->t2 - pmod->t1 + 1;
46
47 return (pmod->nobs < sample);
48 }
49
really_missing(int v,int t,const double ** Z,int d)50 static int really_missing (int v, int t, const double **Z, int d)
51 {
52 if (d > 0 && Z[d][t] == 0) {
53 /* the obs is dummied out */
54 return 0;
55 } else {
56 return na(Z[v][t]);
57 }
58 }
59
model_missing(const MODEL * pmod,int t)60 int model_missing (const MODEL *pmod, int t)
61 {
62 if (pmod->missmask != NULL) {
63 return pmod->missmask[t] == '1';
64 } else {
65 return 0;
66 }
67 }
68
69 /* Construct a mask to code for missing observations within the sample
70 range for a model. The mask is an array of char with '0's
71 for OK observations and '1's for missing obs, terminated with
72 a NUL byte. This format allows copying of the mask using
73 strcpy.
74 */
75
model_missmask(const int * list,int t1,int t2,int n,const double ** Z,int dwt,int * misscount)76 static char *model_missmask (const int *list, int t1, int t2,
77 int n, const double **Z, int dwt,
78 int *misscount)
79 {
80 char *mask;
81 int i, vi, t;
82
83 mask = malloc(n + 1);
84 if (mask == NULL) {
85 return NULL;
86 }
87
88 memset(mask, '0', n);
89 mask[n] = 0; /* note NUL-termination */
90
91 #if MASKDEBUG
92 fprintf(stderr, "model_missmask: using series length %d\n", n);
93 #endif
94
95 if (misscount != NULL) {
96 *misscount = 0;
97 }
98
99 for (t=t1; t<=t2; t++) {
100 for (i=1; i<=list[0]; i++) {
101 vi = list[i];
102 if (vi > 0 && vi != LISTSEP) {
103 if (really_missing(vi, t, Z, dwt)) {
104 #if MASKDEBUG > 1
105 fprintf(stderr, "model_missmask: NA at list[%d] (%d), "
106 "obs %d\n", i, vi, t);
107 #endif
108 /* FIXME dwt case and nobs? */
109 mask[t] = '1';
110 if (misscount != NULL) {
111 *misscount += 1;
112 }
113 break;
114 }
115 }
116 }
117 }
118
119 return mask;
120 }
121
122 /**
123 * model_adjust_sample:
124 * @pmod: pointer to gretl model.
125 * @n: full length of data array.
126 * @Z: data array.
127 * @misst: location to receive the first observation with a
128 * missing value inside the sample range, or NULL.
129 *
130 * Drops leading or trailing observations from the sample range
131 * initially given by the values in the t1 and t2 members of
132 * @pmod, if missing values are found among the variables given
133 * in the list member of @pmod.
134 *
135 * Also checks for missing values within the adjusted sample range.
136 * If such values are encountered, either flag an error (if
137 * @misst != %NULL), or construct a "missing mask" and attach it
138 * to @pmod.
139 *
140 * Returns: if @misst is not NULL, either the ID number of the
141 * variable for which a missing value is first found inside the
142 * adjusted sample range or 0 if there is no such variable. If
143 * @misst is NULL, returns 1 if there is an error creating
144 * the missing obs mask, otherwise 0.
145 */
146
model_adjust_sample(MODEL * pmod,int n,const double ** Z,int * misst)147 int model_adjust_sample (MODEL *pmod, int n, const double **Z,
148 int *misst)
149 {
150 int i, t, dwt = 0, t1min = pmod->t1, t2max = pmod->t2;
151 int vi, missobs, ret = 0;
152 int move_ends = 1;
153
154 if (gretl_model_get_int(pmod, "wt_dummy")) {
155 /* we have a weight variable which is a 0/1 dummy */
156 dwt = pmod->nwt;
157 }
158
159 /* advance start of sample range to skip missing obs */
160 for (t=t1min; t<t2max; t++) {
161 missobs = 0;
162 for (i=1; i<=pmod->list[0]; i++) {
163 vi = pmod->list[i];
164 if (vi > 0 && vi != LISTSEP) {
165 if (really_missing(vi, t, Z, dwt)) {
166 missobs = 1;
167 break;
168 }
169 }
170 }
171 if (missobs) {
172 t1min++;
173 } else {
174 break;
175 }
176 }
177
178 /* retard end of sample range to skip missing obs */
179 for (t=t2max; t>t1min; t--) {
180 missobs = 0;
181 for (i=1; i<=pmod->list[0]; i++) {
182 vi = pmod->list[i];
183 if (vi > 0 && vi != LISTSEP) {
184 if (really_missing(vi, t, Z, dwt)) {
185 missobs = 1;
186 break;
187 }
188 }
189 }
190 if (missobs) {
191 t2max--;
192 } else {
193 break;
194 }
195 }
196
197 if (misst != NULL) {
198 /* check for missing values within remaining range and
199 flag an error in case any are found */
200 for (t=t1min; t<=t2max && !ret; t++) {
201 for (i=1; i<=pmod->list[0]; i++) {
202 vi = pmod->list[i];
203 if (vi > 0 && vi != LISTSEP) {
204 if (really_missing(vi, t, Z, dwt)) {
205 /* identify first missing obs and var */
206 *misst = t + 1;
207 ret = vi;
208 break;
209 }
210 }
211 }
212 }
213 } else {
214 /* construct a mask for missing values within remaining range?
215 we do this only if misst == NULL */
216 missobs = 0;
217 for (t=t1min; t<=t2max; t++) {
218 for (i=1; i<=pmod->list[0]; i++) {
219 vi = pmod->list[i];
220 if (vi > 0 || vi != LISTSEP) {
221 if (really_missing(vi, t, Z, dwt)) {
222 missobs++;
223 break;
224 }
225 }
226 }
227 }
228
229 if (missobs == t2max - t1min + 1) {
230 /* no valid observations */
231 pmod->errcode = E_MISSDATA;
232 ret = 1;
233 } else if (missobs > 0) {
234 pmod->missmask = model_missmask(pmod->list, pmod->t1, pmod->t2,
235 n, Z, dwt, NULL);
236 move_ends = 0;
237 if (pmod->missmask == NULL) {
238 pmod->errcode = E_ALLOC;
239 ret = 1;
240 }
241 }
242 }
243
244 if (move_ends) {
245 pmod->t1 = t1min;
246 pmod->t2 = t2max;
247 }
248
249 #if MASKDEBUG
250 if (pmod->missmask != NULL) {
251 fprintf(stderr, "model at %p: now has mask at %p\n",
252 (void *) pmod, (void *) pmod->missmask);
253 }
254 #endif
255
256 return ret;
257 }
258
259 /**
260 * first_missing_index:
261 * @x: array to be checked for missing values.
262 * @t1: start of range to check.
263 * @t2: end of range to check.
264 *
265 * Returns: the index of the first missing observation in @x
266 * over the sample range @t1 to @t2, or -1 if there is no
267 * such observation.
268 */
269
first_missing_index(const double * x,int t1,int t2)270 int first_missing_index (const double *x, int t1, int t2)
271 {
272 int t;
273
274 for (t=t1; t<=t2; t++) {
275 if (na(x[t])) {
276 return t;
277 }
278 }
279
280 return -1;
281 }
282
283 /**
284 * series_adjust_sample:
285 * @x: series to be checked for missing values.
286 * @t1: on entry, initial start of sample range; on exit,
287 * start of sample range adjusted for missing values.
288 * @t2: on entry, initial end of sample range; on exit, end
289 * of sample range adjusted for missing values.
290 *
291 * Adjusts @t1 and @t2 so as to drop any leading or trailing
292 * missing observations.
293 *
294 * Returns: E_MISSDATA if interior missing values were found
295 * within the (possibly adjusted) sample range, otherwise 0.
296 */
297
series_adjust_sample(const double * x,int * t1,int * t2)298 int series_adjust_sample (const double *x, int *t1, int *t2)
299 {
300 int t, t1min = *t1, t2max = *t2;
301 int err = 0;
302
303 for (t=t1min; t<t2max; t++) {
304 if (na(x[t])) t1min++;
305 else break;
306 }
307
308 for (t=t2max; t>t1min; t--) {
309 if (na(x[t])) t2max--;
310 else break;
311 }
312
313 for (t=t1min; t<=t2max; t++) {
314 if (na(x[t])) {
315 err = E_MISSDATA;
316 break;
317 }
318 }
319
320 *t1 = t1min;
321 *t2 = t2max;
322
323 return err;
324 }
325
326 /**
327 * list_adjust_sample:
328 * @list: list of variables to be tested for missing values,
329 * or %NULL to test all series.
330 * @t1: on entry, initial start of sample range; on exit,
331 * start of sample range adjusted for missing values.
332 * @t2: on entry, initial end of sample range; on exit, end
333 * of sample range adjusted for missing values.
334 * @dset: dataset struct.
335 * @nmiss: location to receive number of missing values within
336 * (possibly adjusted) sample range.
337 *
338 * Drops leading or trailing observations from the sample range
339 * initially given by the values in @t1 and @t2 if missing values
340 * are found for any of the variables given in @list.
341 *
342 * If @nmiss is non-NULL it receives the number of missing values
343 * inside the (possibly reduced) sample range, otherwise it is
344 * considered an error if there are any such missing values.
345 *
346 * Returns: 0 on success or %E_MISSDATA or error.
347 */
348
list_adjust_sample(const int * list,int * t1,int * t2,const DATASET * dset,int * nmiss)349 int list_adjust_sample (const int *list, int *t1, int *t2,
350 const DATASET *dset, int *nmiss)
351 {
352 int i, t, t1min = *t1, t2max = *t2;
353 int k, vi, missing, err = 0;
354
355 if (list != NULL) {
356 k = list[0];
357 } else {
358 /* check all series */
359 k = dset->v - 1;
360 }
361
362 /* advance start of sample range to skip missing obs? */
363 for (t=t1min; t<t2max; t++) {
364 missing = 0;
365 for (i=1; i<=k; i++) {
366 vi = list == NULL ? i : list[i];
367 if (vi > 0 && vi != LISTSEP) {
368 if (na(dset->Z[vi][t])) {
369 missing = 1;
370 break;
371 }
372 }
373 }
374 if (missing) {
375 t1min++;
376 } else {
377 break;
378 }
379 }
380
381 /* retard end of sample range to skip missing obs? */
382 for (t=t2max; t>t1min; t--) {
383 missing = 0;
384 for (i=1; i<=k; i++) {
385 vi = list == NULL ? i : list[i];
386 if (vi > 0 && vi != LISTSEP) {
387 if (na(dset->Z[vi][t])) {
388 missing = 1;
389 break;
390 }
391 }
392 }
393 if (missing) {
394 t2max--;
395 } else {
396 break;
397 }
398 }
399
400 if (nmiss != NULL) {
401 *nmiss = 0;
402 }
403
404 /* check for missing values within remaining range */
405 for (t=t1min; t<=t2max && !err; t++) {
406 missing = 0;
407 for (i=1; i<=k; i++) {
408 vi = list == NULL ? i : list[i];
409 if (vi > 0 && vi != LISTSEP) {
410 if (na(dset->Z[vi][t])) {
411 if (nmiss == NULL) {
412 err = E_MISSDATA;
413 } else {
414 *nmiss += 1;
415 }
416 break;
417 }
418 }
419 }
420 }
421
422 *t1 = t1min;
423 *t2 = t2max;
424
425 return err;
426 }
427
428 /* For handling the "omit" command, applied to a model that has
429 missing values within the sample range. The model as re-estimated
430 with a reduced set of regressors must use the sample sample range
431 as the original model, or else the comparison of the original and
432 reduced models will be invalid.
433 */
434
435 /* copy of a given model's missmask, or NULL */
436
437 static char *refmask;
438
build_refmask_from_model(const MODEL * pmod)439 static char *build_refmask_from_model (const MODEL *pmod)
440 {
441 int t, n = pmod->full_n;
442 char *mask = malloc(n + 1);
443
444 if (mask != NULL) {
445 memset(mask, '0', n);
446 mask[n] = 0; /* NUL-terminate */
447 for (t=pmod->t1; t<=pmod->t2; t++) {
448 if (na(pmod->uhat[t])) {
449 mask[t] = '1';
450 }
451 }
452 }
453
454 return mask;
455 }
456
457 /* Set the "reference" mask based on a given model */
458
set_reference_missmask_from_model(const MODEL * pmod)459 void set_reference_missmask_from_model (const MODEL *pmod)
460 {
461 if (pmod != NULL) {
462 if (refmask != NULL) {
463 free(refmask);
464 refmask = NULL;
465 }
466 if (pmod->missmask != NULL) {
467 refmask = gretl_strdup(pmod->missmask);
468 } else if (model_has_missing_obs(pmod)) {
469 refmask = build_refmask_from_model(pmod);
470 }
471 }
472 }
473
copy_to_reference_missmask(const char * mask)474 int copy_to_reference_missmask (const char *mask)
475 {
476 if (refmask != NULL) {
477 free(refmask);
478 refmask = NULL;
479 }
480
481 refmask = gretl_strdup(mask);
482
483 #if MASKDEBUG
484 fprintf(stderr, "copy_to_reference_missmask: refmask = %p\n",
485 (void *) refmask);
486 #endif
487
488 return (refmask == NULL)? E_ALLOC : 0;
489 }
490
491 /* attach the reference missing obs mask to a specified model */
492
apply_reference_missmask(MODEL * pmod)493 int apply_reference_missmask (MODEL *pmod)
494 {
495 if (refmask != NULL) {
496 #if MASKDEBUG
497 fprintf(stderr, "applying reference mask to model = %p\n",
498 (void *) pmod);
499 #endif
500 pmod->missmask = refmask;
501 refmask = NULL; /* note: apply it only once */
502 }
503
504 return 0;
505 }
506
reference_missmask_present(void)507 int reference_missmask_present (void)
508 {
509 return (refmask != NULL);
510 }
511
real_setmiss(double missval,int varno,DATASET * dset)512 static int real_setmiss (double missval, int varno,
513 DATASET *dset)
514 {
515 int i, t, count = 0;
516 int start = 1, end = dset->v;
517
518 if (varno) {
519 start = varno;
520 end = varno + 1;
521 }
522
523 for (i=start; i<end; i++) {
524 for (t=0; t<dset->n; t++) {
525 if (dset->Z[i][t] == missval) {
526 dset->Z[i][t] = NADBL;
527 count++;
528 }
529 }
530 }
531
532 return count;
533 }
534
535 /**
536 * set_miss:
537 * @list: list of variables to process, or an empty list or %NULL
538 * to process all variables.
539 * @param: string representation of the numerical value to treat as missing.
540 * @dset: dataset struct.
541 * @prn: pointer to printing struct.
542 *
543 * Set to "missing" each observation of each series in @list that
544 * has the value represented by @param.
545 *
546 * Returns: 0 on success, non-zero code on failure.
547 */
548
set_miss(const int * list,const char * param,DATASET * dset,PRN * prn)549 int set_miss (const int *list, const char *param,
550 DATASET *dset, PRN *prn)
551 {
552 int i, vi, count;
553 double missval;
554 int err = 0;
555
556 if (param == NULL) {
557 return E_ARGS;
558 }
559
560 if (dset == NULL || dset->n == 0) {
561 return E_NODATA;
562 }
563
564 missval = gretl_double_from_string(param, &err);
565 if (err) {
566 return err;
567 }
568
569 if (list == NULL || list[0] == 0) {
570 count = real_setmiss(missval, 0, dset);
571 if (count) {
572 pprintf(prn, _("Set %d values to \"missing\"\n"), count);
573 } else {
574 pputs(prn, _("Didn't find any matching observations\n"));
575 }
576 } else {
577 for (i=1; i<=list[0]; i++) {
578 vi = list[i];
579 if (vi == 0 || object_is_const(dset->varname[vi], vi)) {
580 gretl_errmsg_sprintf(_("The variable %s is read-only"),
581 dset->varname[vi]);
582 err = E_DATA;
583 break;
584 }
585 count = real_setmiss(missval, vi, dset);
586 if (count) {
587 pprintf(prn, _("%s: set %d observations to \"missing\"\n"),
588 dset->varname[vi], count);
589 } else {
590 pprintf(prn, _("%s: Didn't find any matching observations\n"),
591 dset->varname[vi]);
592 }
593 }
594 }
595
596 return err;
597 }
598
599 /**
600 * missing_obs_fraction:
601 * @dset: dataset struct.
602 *
603 * Returns: the fraction of the observations in @Z for which
604 * all variables have missing values (empty rows).
605 */
606
missing_obs_fraction(const DATASET * dset)607 double missing_obs_fraction (const DATASET *dset)
608 {
609 int missrow, totmiss = 0;
610 int i, t;
611
612 if (dset->n == 0) {
613 return 0.0;
614 }
615
616 for (t=0; t<dset->n; t++) {
617 missrow = 1;
618 for (i=1; i<dset->v; i++) {
619 if (!na(dset->Z[i][t])) {
620 missrow = 0;
621 break;
622 }
623 }
624 totmiss += missrow;
625 }
626
627 return (double) totmiss / dset->n;
628 }
629
630 /**
631 * any_missing_user_values:
632 * @dset: dataset struct.
633 *
634 * Returns: 1 if there are missing values for any non-hidden
635 * variables within the current sample range, otherwise 0.
636 */
637
any_missing_user_values(const DATASET * dset)638 int any_missing_user_values (const DATASET *dset)
639 {
640 int i, t;
641
642 if (dset->n == 0) {
643 return 0;
644 }
645
646 for (i=1; i<dset->v; i++) {
647 if (!series_is_hidden(dset, i)) {
648 for (t=dset->t1; t<=dset->t2; t++) {
649 if (na(dset->Z[i][t])) {
650 return 1;
651 }
652 }
653 }
654 }
655
656 return 0;
657 }
658
659 /**
660 * model_add_missmask:
661 * @pmod: pointer to gretl MODEL.
662 * @n: length of mask.
663 *
664 * Allocates a missing obs mask of length @n and attaches it
665 * to @pmod. All elements are initialized to '0' (non-missing).
666 *
667 * Returns: 0 on success, non-zero if allocation fails.
668 */
669
model_add_missmask(MODEL * pmod,int n)670 int model_add_missmask (MODEL *pmod, int n)
671 {
672 pmod->missmask = malloc((n+1) * sizeof *pmod->missmask);
673 if (pmod->missmask == NULL) {
674 return E_ALLOC;
675 } else {
676 memset(pmod->missmask, '0', n);
677 pmod->missmask[n] = 0; /* NUL-termination */
678 return 0;
679 }
680 }
681