1 /*
2  *  gretl -- Gnu Regression, Econometrics and Time-series Library
3  *  Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 /* subsample.c for gretl */
21 
22 #define FULL_XML_HEADERS
23 
24 #include "libgretl.h"
25 #include "libset.h"
26 #include "gretl_func.h"
27 #include "gretl_panel.h"
28 #include "cmd_private.h"
29 #include "dbread.h"
30 #include "uservar.h"
31 #include "gretl_xml.h"
32 
33 #define SUBDEBUG 0
34 #define FULLDEBUG 0
35 #define PANDEBUG 0
36 
37 typedef enum {
38     SUBSAMPLE_NONE,
39     SUBSAMPLE_DROP_MISSING,
40     SUBSAMPLE_DROP_EMPTY,
41     SUBSAMPLE_DROP_WKENDS,
42     SUBSAMPLE_USE_DUMMY,
43     SUBSAMPLE_BOOLEAN,
44     SUBSAMPLE_RANDOM,
45     SUBSAMPLE_UNKNOWN
46 } SubsampleMode;
47 
48 #define RECODE_ON_PERMA 1 /* experiment */
49 
50 #if RECODE_ON_PERMA
51 
recode_strvals(DATASET * dset,gretlopt opt)52 static int recode_strvals (DATASET *dset, gretlopt opt)
53 {
54     int i, err = 0;
55 
56     for (i=1; i<dset->v && !err; i++) {
57 	if (is_string_valued(dset, i)) {
58 	    err = series_recode_strings(dset, i, opt, NULL);
59 	}
60     }
61 
62     return err;
63 }
64 
65 #endif
66 
67 /*
68   The purpose of the static pointers below: When the user subsamples
69   the current dataset in a non-trivial way -- i.e., by selecting cases
70   rather than just moving the starting or ending points of the data
71   range -- we create a new sub-dataset, and we need to keep the full
72   dataset around so that it can be restored later.  The pointer
73   @fullset is used to record the address of the full dataset.
74 
75   In addition, @peerset keeps track of the location of the DATASET
76   struct associated with the backed-up full dataset; by means of this,
77   we can know when to free the full dataset and when not to (for
78   instance, if we're freeing an auxiliary dataset).
79 */
80 
81 static DATASET *fullset;
82 static DATASET *peerset;
83 
84 #define SUBMASK_SENTINEL 127
85 
86 static int smpl_get_int (const char *s, DATASET *dset, int *err);
87 
88 /* accessors for full dataset, when sub-sampled */
89 
fetch_full_dataset(void)90 DATASET *fetch_full_dataset (void)
91 {
92     return fullset;
93 }
94 
get_submask_length(const char * s)95 static int get_submask_length (const char *s)
96 {
97     int n = 1;
98 
99     if (s == NULL || s == RESAMPLED) {
100 	n = 0;
101     } else {
102 	while (*s != SUBMASK_SENTINEL) {
103 	    n++;
104 	    s++;
105 	}
106     }
107 
108     return n;
109 }
110 
get_dataset_submask_size(const DATASET * dset)111 int get_dataset_submask_size (const DATASET *dset)
112 {
113     int n = 0;
114 
115     if (dset != NULL) {
116 	n = get_submask_length(dset->submask);
117 	if (n > 0) n--; /* omit the sentinel */
118     }
119 
120     return n;
121 }
122 
get_model_submask_size(const MODEL * pmod)123 int get_model_submask_size (const MODEL *pmod)
124 {
125     int n = 0;
126 
127     if (pmod != NULL) {
128 	n = get_submask_length(pmod->submask);
129 	if (n > 0) n--; /* omit the sentinel */
130     }
131 
132     return n;
133 }
134 
free_subsample_mask(char * s)135 void free_subsample_mask (char *s)
136 {
137     if (s != RESAMPLED && s != NULL) {
138 	free(s);
139     }
140 }
141 
copy_subsample_mask(const char * src,int * err)142 char *copy_subsample_mask (const char *src, int *err)
143 {
144     char *ret = NULL;
145 
146     if (src == RESAMPLED) {
147 	ret = RESAMPLED;
148     } else if (src != NULL) {
149 	int n = get_submask_length(src);
150 
151 	ret = malloc(n * sizeof *ret);
152 	if (ret != NULL) {
153 	    memcpy(ret, src, n);
154 	} else {
155 	    *err = E_ALLOC;
156 	}
157     }
158 
159     return ret;
160 }
161 
copy_dataset_submask(const DATASET * dset,int * err)162 char *copy_dataset_submask (const DATASET *dset, int *err)
163 {
164     char *mask = NULL;
165 
166     if (complex_subsampled()) {
167 	mask = copy_subsample_mask(dset->submask, err);
168     }
169 
170     return mask;
171 }
172 
write_model_submask(const MODEL * pmod,PRN * prn)173 int write_model_submask (const MODEL *pmod, PRN *prn)
174 {
175     int ret = 0;
176 
177     if (pmod->submask == RESAMPLED) {
178 	pputs(prn, "<submask length=\"0\"></submask>\n");
179 	ret = 1;
180     } else if (pmod->submask != NULL) {
181 	int i, n = get_submask_length(pmod->submask);
182 
183 	pprintf(prn, "<submask length=\"%d\">", n);
184 	for (i=0; i<n; i++) {
185 	    pprintf(prn, "%d ", (int) pmod->submask[i]);
186 	}
187 	pputs(prn, "</submask>\n");
188 	ret = 1;
189     }
190 
191     return ret;
192 }
193 
write_dataset_submask(const DATASET * dset,PRN * prn)194 int write_dataset_submask (const DATASET *dset, PRN *prn)
195 {
196     int ret = 0;
197 
198     if (dset->submask == RESAMPLED) {
199 	unsigned int seed = get_resampling_seed();
200 
201 	pprintf(prn, "<resample seed=\"%u\" n=\"%d\"/>\n", seed, dset->n);
202 	ret = 1;
203     } else if (complex_subsampled()) {
204 	int i, n = get_submask_length(dset->submask);
205 
206 	pprintf(prn, "<submask length=\"%d\">", n);
207 	for (i=0; i<n; i++) {
208 	    pprintf(prn, "%d ", (int) dset->submask[i]);
209 	}
210 	pputs(prn, "</submask>\n");
211 
212 	if (dset->restriction != NULL) {
213 	    gretl_xml_put_tagged_string("restriction", dset->restriction, prn);
214 	}
215 
216 	ret = 1;
217     }
218 
219     return ret;
220 }
221 
submask_cmp(const char * m1,const char * m2)222 int submask_cmp (const char *m1, const char *m2)
223 {
224     if (m1 == NULL && m2 == NULL) {
225 	return 0;
226     } else if (m1 == NULL || m2 == NULL) {
227 	return 1;
228     }
229 
230     if (m1 == RESAMPLED || m2 == RESAMPLED) {
231 	return m1 != RESAMPLED || m2 != RESAMPLED;
232     }
233 
234     while (*m1 != SUBMASK_SENTINEL && *m2 != SUBMASK_SENTINEL) {
235 	if (*m1 != *m2) {
236 	    return 1;
237 	}
238 	m1++;
239 	m2++;
240     }
241 
242     return 0;
243 }
244 
245 /* All values apart from the sentinel are initialized to zero; once
246    the mask is used, 1s will indicate included observations and
247    0s excluded observations.
248 */
249 
make_submask(int n)250 static char *make_submask (int n)
251 {
252     char *mask = calloc(n + 1, 1);
253 
254     if (mask != NULL) {
255 	mask[n] = SUBMASK_SENTINEL;
256     }
257 
258     return mask;
259 }
260 
261 #if 0
262 
263 static void debug_print_submask (char *mask, char *msg)
264 {
265     if (mask == NULL) {
266 	fprintf(stderr, "%s: NULL\n", msg);
267     } else if (mask == RESAMPLED) {
268 	fprintf(stderr, "%s: RESAMPLED\n", msg);
269     } else {
270 	char *s = mask;
271 
272 	fprintf(stderr, "%s: ", msg);
273 
274 	while (*s != SUBMASK_SENTINEL) {
275 	    if (*s == 0) {
276 		fputc('0', stderr);
277 	    } else if (*s == 1) {
278 		fputc('1', stderr);
279 	    } else {
280 		fputc('?', stderr);
281 	    }
282 	    s++;
283 	}
284 	fputc('\n', stderr);
285     }
286 }
287 
288 #endif
289 
set_dataset_resampled(DATASET * dset,unsigned int seed)290 void set_dataset_resampled (DATASET *dset, unsigned int seed)
291 {
292     dset->submask = RESAMPLED;
293     dset->rseed = seed;
294 }
295 
dataset_is_resampled(const DATASET * dset)296 int dataset_is_resampled (const DATASET *dset)
297 {
298     return (dset != NULL && dset->submask == RESAMPLED);
299 }
300 
maybe_free_full_dataset(const DATASET * dset)301 void maybe_free_full_dataset (const DATASET *dset)
302 {
303     if (dset == peerset) {
304 	if (fullset != NULL) {
305 #if SUBDEBUG
306 	    fprintf(stderr, "maybe_free_full_dataset: freeing fullset at %p (Z at %p)\n",
307 		    (void *) fullset, (void *) fullset->Z);
308 #endif
309 	    if (fullset->Z != NULL) {
310 		free_Z(fullset);
311 	    }
312 	    clear_datainfo(fullset, CLEAR_SUBSAMPLE);
313 	    free(fullset);
314 	    fullset = NULL;
315 	}
316 	peerset = NULL;
317     }
318 }
319 
320 /* we do this on "restore full sample" */
321 
relink_to_full_dataset(DATASET * dset)322 static void relink_to_full_dataset (DATASET *dset)
323 {
324 #if SUBDEBUG
325     fprintf(stderr, "relink_to_full_dataset: fullset = %p (freeing and nulling)\n",
326 	    (void *) fullset);
327     fprintf(stderr, "fullset: v = %d, n = %d\n", fullset->v, fullset->n);
328 #endif
329 
330     *dset = *fullset;
331     free(fullset);
332     fullset = NULL;
333     peerset = NULL;
334 }
335 
sync_dataset_shared_members(const DATASET * dset)336 void sync_dataset_shared_members (const DATASET *dset)
337 {
338     if (fullset != NULL && dset != fullset) {
339 	fullset->varname = dset->varname;
340 	fullset->varinfo = dset->varinfo;
341     }
342 }
343 
344 /* sync malloced elements of the fullset struct that might
345    have been moved via realloc
346 */
347 
sync_datainfo_members(const DATASET * dset)348 static void sync_datainfo_members (const DATASET *dset)
349 {
350     if (fullset->v > dset->v) {
351 	int i;
352 
353 #if FULLDEBUG
354 	fprintf(stderr, "*** sync_datainfo_members: fullset->v = %d but dset->v = %d\n",
355 		fullset->v, dset->v);
356 	fprintf(stderr, " deleting the last %d element(s) of fullZ\n",
357 		fullset->v - dset->v);
358 #endif
359 	for (i=dset->v; i<fullset->v; i++) {
360 	    free(fullset->Z[i]);
361 	    fullset->Z[i] = NULL;
362 	}
363 	fullset->v = dset->v;
364     }
365 
366     fullset->varname = dset->varname;
367     fullset->varinfo = dset->varinfo;
368     fullset->descrip = dset->descrip;
369 }
370 
371 /* attach_subsample_to_model:
372  * @pmod: model to which subsample should be attached.
373  * @dset: pointer to current dataset info.
374  *
375  * If the dataset is currently subsampled, record the subsample
376  * information with the model so that it can be retrieved later.
377  *
378  * Returns: 0 if the recording is not needed, or on success; non-zero
379  * error code failure.
380  */
381 
attach_subsample_to_model(MODEL * pmod,const DATASET * dset)382 int attach_subsample_to_model (MODEL *pmod, const DATASET *dset)
383 {
384     int err = 0;
385 
386 #if SUBDEBUG
387     fprintf(stderr, "attach_subsample_to_model: fullset = %p\n",
388 	    (void *) fullset);
389 #endif
390 
391     if (fullset != NULL) {
392 	/* sync, in case anything has moved */
393 	sync_datainfo_members(dset);
394 
395 	if (pmod->submask != NULL) {
396 	    free_subsample_mask(pmod->submask);
397 	}
398 
399 	pmod->submask = copy_subsample_mask(dset->submask, &err);
400     }
401 
402     return err;
403 }
404 
405 #define MDEBUG 0
406 
407 /* This is called from objstack.c for each saved model, to
408    determine whether a proposed permanent shrinkage of the
409    dataset will invalidate the model.
410 */
411 
subsample_check_model(MODEL * pmod,char * mask)412 int subsample_check_model (MODEL *pmod, char *mask)
413 {
414     if (submask_cmp(pmod->submask, mask)) {
415 	return E_DATA;
416     } else {
417 	return 0;
418     }
419 }
420 
421 /* Called from objstack.c for each saved model, when the
422    user calls for the imposition of a permanent subsample
423    restriction on the dataset. We've already checked that
424    the "new" subsample matches that on which model was
425    estimated, so all we have to do now is sync by removing
426    the model's subsample mask.
427 */
428 
remove_model_subsample_info(MODEL * pmod)429 int remove_model_subsample_info (MODEL *pmod)
430 {
431     free_subsample_mask(pmod->submask);
432     pmod->submask = NULL;
433 
434     return 0;
435 }
436 
437 /* check for at least two observations, all contiguous */
438 
ts_contig(const char * ts,int T)439 static int ts_contig (const char *ts, int T)
440 {
441     int contig = 1;
442     int stopped = 0;
443     int t, n = 0;
444 
445     for (t=0; t<T && contig; t++) {
446 	if (ts[t]) {
447 	    if (stopped) {
448 		contig = 0;
449 	    } else {
450 		n++;
451 	    }
452 	} else if (n > 0) {
453 	    stopped = 1;
454 	}
455     }
456 
457     return n >= 2 && contig;
458 }
459 
460 /* For forecasting purposes, check to see if a subsample
461    mask represents a sub-sampling of a panel dataset in
462    the time dimension only.
463 */
464 
is_panel_time_sample(const char * mask,const DATASET * dset)465 static int is_panel_time_sample (const char *mask,
466 				 const DATASET *dset)
467 {
468     int i, t, s, N, T, ret = 1;
469     char *ts = NULL;
470 
471     if (dset == NULL || !dataset_is_panel(dset)) {
472 	return 0;
473     } else if (get_submask_length(mask) != dset->n + 1) {
474 	return 0;
475     }
476 
477     T = dset->pd;
478     N = dset->n / T;
479     ts = calloc(T, 1);
480 
481     s = 0;
482     for (i=0; i<N && ret; i++) {
483 	for (t=0; t<T && ret; t++) {
484 	    if (i == 0) {
485 		/* first unit sets the pattern */
486 		if (mask[s]) {
487 		    ts[t] = 1;
488 		}
489 	    } else if ((mask[s] && !ts[t]) || (ts[t] && !mask[s])) {
490 		/* subsequent units must match */
491 		ret = 0;
492 	    }
493 	    s++;
494 	}
495 	if (i == 0 && !ts_contig(ts, T)) {
496 	    ret = 0;
497 	}
498     }
499 
500     free(ts);
501 
502     return ret;
503 }
504 
505 /* If series have been added to a resampled dataset, we can't
506    bring these back to the "full" dataset, which may have a
507    longer or shorter series length, and from which there is
508    no definite mapping by row. So we just delete them. In
509    this function we destroy their varnames and varinfo
510    structures; the numerical arrays get deleted later.
511 */
512 
resample_sync_dataset(DATASET * dset)513 static int resample_sync_dataset (DATASET *dset)
514 {
515     int err = 0;
516 
517     if (dset->v > fullset->v) {
518 	err = shrink_varinfo(dset, fullset->v);
519     }
520 
521     /* sync */
522     fullset->varname = dset->varname;
523     fullset->varinfo = dset->varinfo;
524     fullset->descrip = dset->descrip;
525 
526     return err;
527 }
528 
529 /* Apparatus for updating full dataset when restoring full sample
530    after sub-sampling.
531 */
532 
533 static void
update_full_data_values(const DATASET * dset)534 update_full_data_values (const DATASET *dset)
535 {
536     int i, s, t;
537 
538 #if SUBDEBUG
539     fprintf(stderr, "update_full_data_values: fullset->Z=%p, dset->Z=%p, dset=%p\n",
540 	    (void *) fullset->Z, (void *) dset->Z, (void *) dset);
541 #endif
542 
543     for (i=1; i<fullset->v && i<dset->v; i++) {
544 	s = 0;
545 	for (t=0; t<fullset->n; t++) {
546 	    if (dset->submask[t] == 1) {
547 		fullset->Z[i][t] = dset->Z[i][s++];
548 	    } else if (dset->submask[t] == 'p') {
549 		/* skip panel padding (?) */
550 		s++;
551 	    }
552 	}
553     }
554 }
555 
update_case_markers(const DATASET * dset)556 static int update_case_markers (const DATASET *dset)
557 {
558     int err = 0;
559 
560     if (fullset->markers == DAILY_DATE_STRINGS) {
561 	; /* don't mess with them */
562     } else if (dated_daily_data(fullset)) {
563 	; /* again, don't mess! */
564     } else if (dset->markers && !fullset->markers) {
565 	dataset_allocate_obs_markers(fullset);
566 	if (fullset->S == NULL) {
567 	    err = 1;
568 	} else {
569 	    int t, subt = 0;
570 
571 	    for (t=0; t<fullset->n; t++) {
572 		if (dset->submask[t]) {
573 		    strcpy(fullset->S[t], dset->S[subt++]);
574 		} else {
575 		    sprintf(fullset->S[t], "%d", t + 1);
576 		}
577 	    }
578 	}
579     }
580 
581     return err;
582 }
583 
add_new_vars_to_full(DATASET * dset)584 static int add_new_vars_to_full (DATASET *dset)
585 {
586     int V1 = dset->v;
587     int V0 = fullset->v;
588     int N = fullset->n;
589     double **newZ = NULL;
590     int i, t, s;
591     int err = 0;
592 
593     if (V1 <= V0) {
594 	return 0;
595     }
596 
597     if (dset->submask == NULL) {
598 	return E_NOMERGE;
599     }
600 
601 #if SUBDEBUG
602     fprintf(stderr, "add_new_vars_to_full:\n");
603     fprintf(stderr, " V1 = dset->v = %d; V0 = fullset->v = %d\n",
604 	    V1, V0);
605     fprintf(stderr, " dset->Z = %p, fullset->Z = %p\n", (void *) dset->Z,
606 	    (void *) fullset->Z);
607 #endif
608 
609     /* allocate expanded data array */
610     newZ = realloc(fullset->Z, V1 * sizeof *fullset->Z);
611 
612     if (newZ == NULL) {
613 	return E_ALLOC;
614     }
615 
616     fullset->Z = newZ;
617 
618     for (i=V0; i<dset->v && !err; i++) {
619 #if FULLDEBUG
620 	fprintf(stderr, "adding to full: var %d (%s, level %d)\n",
621 		i, dset->varname[i], series_get_stack_level(dset, i));
622 #endif
623 	fullset->Z[i] = malloc(N * sizeof **fullset->Z);
624 	if (fullset->Z[i] == NULL) {
625 	    err = E_ALLOC;
626 	}
627     }
628 
629     if (err) {
630 	return E_ALLOC;
631     }
632 
633 #if SUBDEBUG
634     fprintf(stderr, "After expansion, fullset->Z = %p (%d vars)\n",
635 	    (void *) fullset->Z, dset->v);
636 #endif
637 
638     for (i=V0; i<dset->v; i++) {
639 	s = 0;
640 	for (t=0; t<N; t++) {
641 	    fullset->Z[i][t] = (dset->submask[t])?
642 		dset->Z[i][s++] : NADBL;
643 	}
644     }
645 
646     fullset->v = V1;
647 
648     return 0;
649 }
650 
sync_data_to_full(DATASET * dset)651 static int sync_data_to_full (DATASET *dset)
652 {
653     int err;
654 
655     /* update values for pre-existing series, which may have been
656        modified via "genr" etc. */
657     update_full_data_values(dset);
658 
659     /* if case markers were added when subsampled, carry them back */
660     update_case_markers(dset);
661 
662     /* delete any newly added hidden vars */
663     err = dataset_destroy_hidden_variables(dset, fullset->v);
664 
665     /* in case any new vars were added when subsampled, try to merge
666        them into the full dataset */
667     if (!err) {
668 	err = add_new_vars_to_full(dset);
669     }
670 
671     return err;
672 }
673 
674 /* Here we make a mask representing the "complete" sample
675    restriction currently in force, for use in cumulating
676    restrictions. "Complete" means that we take into account
677    both an existing subsampling restriction, if any, and
678    possible setting of the t1 and t2 members of @dset to
679    exclude certain observation ranges.
680 
681    The mask returned, if non-NULL, will be the length of
682    the full dataset. It will be NULL (without error) if
683    the current dataset is neither subsampled nor range-
684    restricted.
685 */
686 
make_current_sample_mask(DATASET * dset,int * err)687 static char *make_current_sample_mask (DATASET *dset, int *err)
688 {
689     char *currmask = NULL;
690     int range_set;
691     int s, t;
692 
693     range_set = (dset->t1 > 0 || dset->t2 < dset->n - 1);
694 
695     if (dset->submask == NULL) {
696 	/* no pre-existing mask, so not subsampled, but
697 	   we should restrict currmask to observations
698 	   included in the current (t1, t2) range, if
699 	   applicable
700 	*/
701 	if (range_set) {
702 	    currmask = make_submask(dset->n);
703 	    if (currmask == NULL) {
704 		*err = E_ALLOC;
705 	    } else {
706 		for (t=dset->t1; t<=dset->t2; t++) {
707 		    currmask[t] = 1;
708 		}
709 	    }
710 	}
711     } else {
712 	/* there's a pre-existing mask: in addition we
713 	   mask out observations outside of the current
714 	   (t1, t2) range, if applicable
715 	*/
716 	currmask = copy_subsample_mask(dset->submask, err);
717 	if (!*err && range_set) {
718 	    s = -1;
719 	    for (t=0; t<fullset->n; t++) {
720 		if (dset->submask[t]) s++;
721 		if (s < dset->t1 || s > dset->t2) {
722 		    currmask[t] = 0;
723 		}
724 	    }
725 	}
726     }
727 
728     return currmask;
729 }
730 
731 /* Deal with the case where sampling has been done simply by
732    moving the sample-range endpoints
733 */
734 
restore_full_easy(DATASET * dset,ExecState * state)735 static int restore_full_easy (DATASET *dset, ExecState *state)
736 {
737     int t1min, t2max;
738 
739     if (state == NULL) {
740 	/* not inside a function */
741 	t1min = 0;
742 	t2max = dset->n - 1;
743 	dataset_clear_sample_record(dset);
744     } else {
745 	/* don't go outside the bounds set on entry to
746 	   a function */
747 	sample_range_get_extrema(dset, &t1min, &t2max);
748     }
749 
750     if (dset->t1 != t1min || dset->t2 != t2max) {
751 	dset->t1 = t1min;
752 	dset->t2 = t2max;
753 #if PANDEBUG || SUBDEBUG
754 	fprintf(stderr, "restore_full_sample: just set t1=%d and t2=%d\n",
755 		t1min, t2max);
756 #endif
757     }
758 
759     return 0;
760 }
761 
762 /* restore_full_sample:
763  * @dset: dataset struct.
764  * @state: structure representing program execution state,
765  * or %NULL.
766  *
767  * Restore the full data range, undoing any sub-sampling that was
768  * previously performed (either by shifting the endpoints of the
769  * sample range or by selecting cases on some criterion or other).
770  * However, if @state is not %NULL, and if it contains a submask,
771  * the "full" range is relative to that mask.
772  *
773  * Returns: 0 on success, non-zero error code on failure.
774  */
775 
restore_full_sample(DATASET * dset,ExecState * state)776 int restore_full_sample (DATASET *dset, ExecState *state)
777 {
778     int err = 0;
779 
780     if (dset == NULL) {
781 	return E_NODATA;
782     } else if (!complex_subsampled()) {
783 	return restore_full_easy(dset, state);
784     }
785 
786     if (dset != peerset) {
787 	fprintf(stderr, "restore_full_sample: dset is not peerset!\n");
788 	return E_DATA;
789     }
790 
791 #if FULLDEBUG || SUBDEBUG
792     fprintf(stderr, "\nrestore_full_sample: dset=%p, state=%p, fullset=%p\n",
793 	    (void *) dset, (void *) state, (void *) fullset);
794 #endif
795 
796     /* Beyond this point we are doing a non-trivial restoration
797        of a stored "full" dataset which has previously been
798        subsampled, e.g., by some boolean criterion.
799     */
800 
801     if (dset->submask == RESAMPLED) {
802 	err = resample_sync_dataset(dset);
803     } else {
804 	if (dset->padmask != NULL) {
805 	    fprintf(stderr, "restore_full_sample: first undo panel padding\n");
806 	    err = undo_panel_padding(dset);
807 	}
808 	if (!err) {
809 	    sync_datainfo_members(dset);
810 	    err = sync_data_to_full(dset);
811 	}
812     }
813 
814     if (err == E_NOMERGE) {
815         gretl_errmsg_set(_("Missing sub-sample information; can't merge data\n"));
816     }
817 
818     if (err) {
819 	return err;
820     }
821 
822     /* destroy sub-sampled data array */
823 #if PANDEBUG || SUBDEBUG
824     fprintf(stderr, "restore_full_sample: freeing sub-sampled Z at %p (v = %d, n = %d)\n"
825 	    " and clearing dset at %p\n", (void *) dset->Z, dset->v, dset->n,
826 	    (void *) dset);
827 #endif
828     free_Z(dset);
829     clear_datainfo(dset, CLEAR_SUBSAMPLE);
830 
831     relink_to_full_dataset(dset);
832 
833     if (state != NULL) {
834 	/* in this case restoring the "full" sample really means, relative
835 	   to the original state
836 	*/
837 	if (state->submask != NULL) {
838 	    err = restrict_sample_from_mask(state->submask, dset, OPT_NONE);
839 	} else {
840 	    int t1min, t2max;
841 
842 	    sample_range_get_extrema(dset, &t1min, &t2max);
843 	    if (dset->t1 < t1min) {
844 		dset->t1 = t1min;
845 	    }
846 	    if (dset->t2 > t2max) {
847 		dset->t2 = t2max;
848 	    }
849 	}
850     }
851 
852     return err;
853 }
854 
overlay_masks(char * targ,const char * src,int n)855 static int overlay_masks (char *targ, const char *src, int n)
856 {
857     int i, sn = 0;
858 
859     for (i=0; i<n; i++) {
860 	if (targ[i] == 1 && src[i] == 1) {
861 	    targ[i] = 1;
862 	    sn++;
863 	} else {
864 	    targ[i] = 0;
865 	}
866     }
867 
868     return sn;
869 }
870 
make_weekday_mask(const DATASET * dset,char * mask)871 static int make_weekday_mask (const DATASET *dset, char *mask)
872 {
873     if (!dated_daily_data(dset) ||
874 	dset->markers != 0 || dset->pd < 6) {
875 	return E_PDWRONG;
876     } else {
877 	char datestr[OBSLEN];
878 	int t, wd;
879 
880 	for (t=0; t<dset->n; t++) {
881 	    ntolabel(datestr, t, dset);
882 	    wd = weekday_from_date(datestr);
883 	    mask[t] = (wd >= 1 && wd <= 5);
884 	}
885 
886 	return 0;
887     }
888 }
889 
890 /* write into @mask: 0 for observations at which _all_ variables
891    (or all variables in @list, if @list is non-NULL) have missing
892    values, 1 for all other observations (that have at least one
893    non-NA). (Refinement: if no list is given, we ignore the generic
894    variables "time" and "index".)
895 */
896 
897 static int
make_empty_mask(const int * list,const DATASET * dset,char * mask)898 make_empty_mask (const int *list, const DATASET *dset, char *mask)
899 {
900     int i, vi, t, vt;
901 
902     if (list != NULL && list[0] > 0) {
903 	/* check specified list of variables */
904 	for (t=0; t<dset->n; t++) {
905 	    mask[t] = 0;
906 	    for (i=1; i<=list[0]; i++) {
907 		vi = list[i];
908 		if (!na(dset->Z[vi][t])) {
909 		    mask[t] = 1;
910 		    break;
911 		}
912 	    }
913 	}
914     } else {
915 	/* check (almost) all variables */
916 	vt = current_series_index(dset, "time");
917 	vi = current_series_index(dset, "index");
918 
919 	for (t=0; t<dset->n; t++) {
920 	    mask[t] = 0;
921 	    for (i=1; i<dset->v; i++) {
922 		if (!series_is_hidden(dset, i) &&
923 		    i != vt && i != vi &&
924 		    !na(dset->Z[i][t])) {
925 		    mask[t] = 1;
926 		    break;
927 		}
928 	    }
929 	}
930     }
931 
932     return 0;
933 }
934 
935 /* write into @mask: 0 for observations at which _any_ variable
936    (or any variable in @list, if @list is non-NULL) has a missing
937    value, 1 for all other observations.
938 */
939 
940 static int
make_missing_mask(const int * list,const DATASET * dset,char * mask)941 make_missing_mask (const int *list, const DATASET *dset, char *mask)
942 {
943     int i, vi, t;
944 
945     if (list != NULL && list[0] > 0) {
946 	/* check specified list of variables */
947 	for (t=0; t<dset->n; t++) {
948 	    mask[t] = 1;
949 	    for (i=1; i<=list[0]; i++) {
950 		vi = list[i];
951 		if (na(dset->Z[vi][t])) {
952 		    mask[t] = 0;
953 		    break;
954 		}
955 	    }
956 	}
957     } else {
958 	/* check all variables */
959 	for (t=0; t<dset->n; t++) {
960 	    mask[t] = 1;
961 	    for (i=1; i<dset->v; i++) {
962 		if (!series_is_hidden(dset, i) && na(dset->Z[i][t])) {
963 		    mask[t] = 0;
964 		    break;
965 		}
966 	    }
967 	}
968     }
969 
970     return 0;
971 }
972 
copy_dummy_to_mask(char * mask,const double * x,int n)973 static int copy_dummy_to_mask (char *mask, const double *x, int n)
974 {
975     int t, nsel = 0, err = 0;
976 
977 #if SUBDEBUG
978     fprintf(stderr, "copy_dummy_to_mask, n = %d\n", n);
979 #endif
980 
981     for (t=0; t<n && !err; t++) {
982 	if (x[t] == 1.0) {
983 	    mask[t] = 1;
984 	    nsel++;
985 	} else if (!na(x[t]) && x[t] != 0.0) { /* NA? */
986 	    err = 1;
987 	}
988     }
989 
990     if (!err && nsel == 0) {
991 	err = E_ZERO;
992     }
993 
994     return err;
995 }
996 
mask_from_temp_dummy(const char * s,DATASET * dset,char * mask,PRN * prn)997 static int mask_from_temp_dummy (const char *s, DATASET *dset,
998 				 char *mask, PRN *prn)
999 {
1000     char formula[MAXLINE];
1001     double *x;
1002     int err = 0;
1003 
1004     *formula = '\0';
1005     strncat(formula, s, MAXLINE - 1);
1006 
1007     x = generate_series(formula, dset, prn, &err);
1008 
1009 #if SUBDEBUG
1010     fprintf(stderr, "mask_from_temp_dummy: formula='%s', err=%d\n",
1011 	    formula, err);
1012 #endif
1013 
1014     if (!err) {
1015 	err = copy_dummy_to_mask(mask, x, dset->n);
1016 	if (err == E_ZERO) {
1017 	    gretl_errmsg_set(_("No observations would be left!"));
1018 	} else if (err) {
1019 	    gretl_errmsg_sprintf(_("'%s' is not a dummy variable"), "mask");
1020 	}
1021     }
1022 
1023     free(x);
1024 
1025     return err;
1026 }
1027 
mask_from_dummy(const char * s,const DATASET * dset,char * mask)1028 static int mask_from_dummy (const char *s, const DATASET *dset,
1029 			    char *mask)
1030 {
1031     double *x = NULL;
1032     char dname[VNAMELEN] = {0};
1033     int free_x = 0;
1034     int err = 0;
1035 
1036     if (!strcmp(s, "$sample")) {
1037 	/* try fetching the sample series from the last model */
1038 	GretlObjType type = 0;
1039 	void *ptr = get_last_model(&type);
1040 
1041 	if (ptr == NULL || type != GRETL_OBJ_EQN) {
1042 	    err = E_DATA;
1043 	} else {
1044 	    strcpy(dname, s);
1045 	    x = malloc(dset->n * sizeof *x);
1046 	    if (x == NULL) {
1047 		err = E_ALLOC;
1048 	    } else {
1049 		free_x = 1;
1050 		err = gretl_model_get_series(x, ptr, dset, M_SAMPLE);
1051 	    }
1052 	}
1053     } else {
1054 	int dnum;
1055 
1056 	gretl_scan_varname(s, dname);
1057 	dnum = series_index(dset, dname);
1058 	if (dnum == dset->v) {
1059 	    gretl_errmsg_sprintf(_("Variable '%s' not defined"), dname);
1060 	    err = E_DATA;
1061 	} else {
1062 	    x = dset->Z[dnum];
1063 	}
1064     }
1065 
1066     if (!err) {
1067 	err = copy_dummy_to_mask(mask, x, dset->n);
1068 	if (err) {
1069 	    gretl_errmsg_sprintf(_("'%s' is not a dummy variable"), dname);
1070 	}
1071     }
1072 
1073     if (free_x) {
1074 	free(x);
1075     }
1076 
1077     return err;
1078 }
1079 
1080 /* how many observations are selected by the given
1081    subsample mask? */
1082 
1083 static int
count_selected_cases(const char * mask,const DATASET * dset)1084 count_selected_cases (const char *mask, const DATASET *dset)
1085 {
1086     int i, n = 0;
1087 
1088     for (i=0; i<dset->n; i++) {
1089 	if (mask[i]) {
1090 	    n++;
1091 	}
1092     }
1093 
1094     return n;
1095 }
1096 
1097 /* panel: how many distinct cross-sectional units are included
1098    in the masked subset of observations? */
1099 
1100 static int
count_panel_units(const char * mask,const DATASET * dset)1101 count_panel_units (const char *mask, const DATASET *dset)
1102 {
1103     int u, ubak = -1;
1104     int i, n = 0;
1105 
1106     for (i=0; i<dset->n; i++) {
1107 	if (mask[i]) {
1108 	    u = i / dset->pd;
1109 	    if (u != ubak) {
1110 		n++;
1111 		ubak = u;
1112 	    }
1113 	}
1114     }
1115 
1116 #if SUBDEBUG
1117     fprintf(stderr, "count_panel_units: got n = %d\n", n);
1118 #endif
1119 
1120     return n;
1121 }
1122 
1123 /* construct mask for taking random sub-sample from dataset:
1124    we're selecting @subn cases without replacement, and
1125    there may or may not be an existing mask in place that
1126    has to be respected.
1127 */
1128 
make_random_mask(const char * s,const char * oldmask,DATASET * dset,char * mask)1129 static int make_random_mask (const char *s, const char *oldmask,
1130 			     DATASET *dset, char *mask)
1131 {
1132     unsigned u;
1133     int targ, avail, oldn = dset->n;
1134     int i, subn, cases, rejn;
1135     int err = 0;
1136 
1137     /* how many cases are requested? */
1138     subn = smpl_get_int(s, dset, &err);
1139 
1140     if (subn <= 0 || subn >= dset->n) {
1141 	err = 1;
1142     } else if (oldmask != NULL) {
1143 	/* dataset is already sub-sampled */
1144 	oldn = 0;
1145 	for (i=0; i<dset->n; i++) {
1146 	    if (oldmask[i]) {
1147 		oldn++;
1148 	    }
1149 	}
1150 	if (subn >= oldn) {
1151 	    err = 1;
1152 	}
1153     }
1154 
1155     if (err && err != E_PARSE) {
1156 	gretl_errmsg_sprintf(_("Invalid number of cases %d"), subn);
1157 	return err;
1158     }
1159 
1160     /* Which is smaller: the number of cases to be selected or the
1161        complement, the number to be discarded?  For the sake of
1162        efficiency we'll go for the smaller value.
1163     */
1164     rejn = oldn - subn;
1165     if (rejn < subn) {
1166 	/* select @rejn observations to discard */
1167 	targ = rejn;
1168 	avail = 1;
1169     } else {
1170 	/* select @subn observations to include */
1171 	targ = subn;
1172 	avail = 0;
1173     }
1174 
1175     for (i=0; i<dset->n; i++) {
1176 	if (oldmask != NULL && oldmask[i] == 0) {
1177 	    /* obs is already excluded, hence not selectable */
1178 	    mask[i] = -1;
1179 	} else {
1180 	    mask[i] = avail;
1181 	}
1182     }
1183 
1184     cases = 0;
1185 
1186     while (cases < targ) {
1187 	u = gretl_rand_int_max(dset->n);
1188 	if (mask[u] == avail) {
1189 	    /* obs is available, not yet selected */
1190 	    mask[u] = !avail;
1191 	    cases++;
1192 	}
1193     }
1194 
1195     if (oldmask != NULL) {
1196 	/* undo the 'already excluded' coding above */
1197 	for (i=0; i<dset->n; i++) {
1198 	    if (mask[i] == -1) {
1199 		mask[i] = 0;
1200 	    }
1201 	}
1202     }
1203 
1204     return err;
1205 }
1206 
backup_full_dataset(DATASET * dset)1207 int backup_full_dataset (DATASET *dset)
1208 {
1209 #if SUBDEBUG
1210     int newfull = 0;
1211 #endif
1212 
1213     if (fullset == NULL) {
1214 	fullset = malloc(sizeof *fullset);
1215 	if (fullset == NULL) {
1216 	    return E_ALLOC;
1217 	}
1218 #if SUBDEBUG
1219 	newfull = 1;
1220 #endif
1221     }
1222 
1223     if (dset != NULL) {
1224 	*fullset = *dset;
1225 	peerset = dset;
1226     }
1227 
1228 #if SUBDEBUG
1229     fprintf(stderr, "backup_full_dataset: fullset = %p (%s)\n",
1230 	    (void *) fullset, newfull ? "new" : "old");
1231 #endif
1232 
1233     return 0;
1234 }
1235 
destroy_full_dataset(DATASET * dset)1236 static void destroy_full_dataset (DATASET *dset)
1237 {
1238     if (fullset != NULL) {
1239 	if (fullset->varname == dset->varname) {
1240 	    fullset->varname = NULL;
1241 	}
1242 	if (fullset->varinfo == dset->varinfo) {
1243 	    fullset->varinfo = NULL;
1244 	}
1245 	if (fullset->descrip == dset->descrip) {
1246 	    fullset->descrip = NULL;
1247 	}
1248 	destroy_dataset(fullset);
1249 	fullset = NULL;
1250     }
1251 
1252     dset->varname = NULL;
1253     dset->varinfo = NULL;
1254     dset->descrip = NULL;
1255 
1256     free_Z(dset);
1257     clear_datainfo(dset, CLEAR_SUBSAMPLE);
1258 
1259     peerset = NULL;
1260 }
1261 
complex_subsampled(void)1262 int complex_subsampled (void)
1263 {
1264     return (fullset != NULL && fullset->Z != NULL);
1265 }
1266 
get_full_length_n(void)1267 int get_full_length_n (void)
1268 {
1269     return (fullset != NULL) ? fullset->n : 0;
1270 }
1271 
dataset_is_subsampled(const DATASET * dset)1272 int dataset_is_subsampled (const DATASET *dset)
1273 {
1274     if (fullset != NULL && fullset->Z != NULL) {
1275 	return 1;
1276     } else if (dset->t1 > 0 || dset->t2 < dset->n - 1) {
1277 	return 1;
1278     } else {
1279 	return 0;
1280     }
1281 }
1282 
1283 /* When sub-sampling on some boolean criterion, check to see if we can
1284    meet the criterion by simply adjusting the endpoints of the sample
1285    range: life will be simpler if that is so.
1286 */
1287 
mask_contiguous(const char * mask,const DATASET * dset,int * pt1,int * pt2)1288 static int mask_contiguous (const char *mask,
1289 			    const DATASET *dset,
1290 			    int *pt1, int *pt2)
1291 {
1292     int t, t1 = 0, t2 = dset->n - 1;
1293     int contig = 1;
1294 
1295     for (t=0; t<dset->n; t++) {
1296 	if (mask[t] == 0) {
1297 	    t1++;
1298 	} else {
1299 	    break;
1300 	}
1301     }
1302 
1303     for (t=dset->n - 1; t>=0; t--) {
1304 	if (mask[t] == 0) {
1305 	    t2--;
1306 	} else {
1307 	    break;
1308 	}
1309     }
1310 
1311     for (t=t1; t<=t2; t++) {
1312 	if (mask[t] == 0) {
1313 	    /* there's a hole inside the range */
1314 	    contig = 0;
1315 	    break;
1316 	}
1317     }
1318 
1319     if (contig && dataset_is_panel(dset)) {
1320 	int n = t2 - t1 + 1;
1321 
1322 	/* sample must leave a whole number of panel units; moreover,
1323 	   to retain "panelness" this number must be greater than 1
1324 	*/
1325 	if (t1 % dset->pd != 0 || n % dset->pd != 0) {
1326 	    contig = 0;
1327 	} else if (n == dset->pd) {
1328 	    contig = 0;
1329 	}
1330     }
1331 
1332     if (contig) {
1333 	*pt1 = t1;
1334 	*pt2 = t2;
1335     }
1336 
1337     return contig;
1338 }
1339 
1340 static void
copy_data_to_subsample(DATASET * subset,const DATASET * dset,int maxv,const char * mask)1341 copy_data_to_subsample (DATASET *subset, const DATASET *dset,
1342 			int maxv, const char *mask)
1343 {
1344     int i, t, s;
1345 
1346 #if SUBDEBUG
1347     fprintf(stderr, "copy_data_to_subsample: subset = %p, dset = %p\n",
1348 	    (void *) subset, (void *) dset);
1349 #endif
1350 
1351     /* copy data values */
1352     for (i=1; i<maxv; i++) {
1353 	s = 0;
1354 	for (t=0; t<dset->n; t++) {
1355 	    if (mask == NULL) {
1356 		subset->Z[i][s++] = dset->Z[i][t];
1357 	    } else if (mask[t] == 1) {
1358 		subset->Z[i][s++] = dset->Z[i][t];
1359 	    } else if (mask[t] == 'p') {
1360 		/* panel padding */
1361 		subset->Z[i][s++] = NADBL;
1362 	    }
1363 	}
1364     }
1365 
1366     /* copy observation markers, if any */
1367     if (dset->markers && subset->markers) {
1368 	s = 0;
1369 	for (t=0; t<dset->n; t++) {
1370 	    if (mask == NULL || mask[t] == 1 || mask[t] == 'p') {
1371 		strcpy(subset->S[s++], dset->S[t]);
1372 	    }
1373 	}
1374     }
1375 
1376     /* copy panel time info? */
1377     if (dataset_is_panel(subset)) {
1378 	if (subset->pd == dset->pd) {
1379 	    subset->panel_pd = dset->panel_pd;
1380 	    subset->panel_sd0 = dset->panel_sd0;
1381 	}
1382     }
1383 
1384     if (subset->stobs[0] == '\0' || subset->endobs[0] == '\0') {
1385 	/* impose simple default if not already handled */
1386 	strcpy(subset->stobs, "1");
1387 	sprintf(subset->endobs, "%d", subset->n);
1388     }
1389 }
1390 
get_restriction_mode(gretlopt opt)1391 int get_restriction_mode (gretlopt opt)
1392 {
1393     int mode = SUBSAMPLE_UNKNOWN;
1394 
1395     if (opt & (OPT_M | OPT_C)) {
1396 	mode = SUBSAMPLE_DROP_MISSING;
1397     } else if (opt & OPT_R) {
1398 	mode = SUBSAMPLE_BOOLEAN;
1399     } else if (opt & OPT_N) {
1400 	mode = SUBSAMPLE_RANDOM;
1401     } else if (opt & OPT_O) {
1402 	mode = SUBSAMPLE_USE_DUMMY;
1403     } else if (opt & OPT_A) {
1404 	mode = SUBSAMPLE_DROP_EMPTY;
1405     } else if (opt & OPT_W) {
1406 	mode = SUBSAMPLE_DROP_WKENDS;
1407     }
1408 
1409     return mode;
1410 }
1411 
1412 /* Copy list of panel periods from Ti to T0; return 1 if members of
1413    the list Ti are consecutive, 0 if they are not.
1414 */
1415 
copy_periods_list(int * T0,const int * Ti)1416 static int copy_periods_list (int *T0, const int *Ti)
1417 {
1418     int j, ret = 1;
1419 
1420     for (j=0; j<=Ti[0] && ret; j++) {
1421 	if (j > 1 && Ti[j] != Ti[j-1] + 1) {
1422 	    ret = 0;
1423 	} else {
1424 	    T0[j] = Ti[j];
1425 	}
1426     }
1427 
1428     return ret;
1429 }
1430 
1431 /* When sub-sampling panel data on some boolean criterion: see if the
1432    exclusion of certain rows leaves a balanced panel.  Note that the
1433    requirement is not simply that each unit has the same number of
1434    temporal observations -- they must have the _same_ observations,
1435    and in addition the observations must be temporally contiguous.
1436 */
1437 
mask_leaves_balanced_panel(char * mask,const DATASET * dset)1438 static int mask_leaves_balanced_panel (char *mask, const DATASET *dset)
1439 {
1440     int T = dset->pd;
1441     int *T0, *Ti;
1442     int i, u, ubak = -1;
1443     int ret = 1;
1444 
1445     T0 = gretl_list_new(T);
1446     Ti = gretl_list_new(T);
1447 
1448     if (T0 == NULL || Ti == NULL) {
1449 	free(T0);
1450 	free(Ti);
1451 	return 0;
1452     }
1453 
1454     T0[0] = Ti[0] = 0;
1455 
1456     for (i=0; i<dset->n && ret; i++) {
1457 	if (mask[i]) {
1458 	    u = i / T;
1459 	    if (u != ubak) {
1460 		if (Ti[0] > 0) {
1461 		    if (T0[0] == 0) {
1462 			/* we haven't made the reference list, T0, yet */
1463 			ret = copy_periods_list(T0, Ti);
1464 		    } else if (gretl_list_cmp(Ti, T0)) {
1465 			/* the current list differs from the reference one */
1466 			ret = 0;
1467 		    }
1468 		}
1469 		Ti[0] = 1;
1470 		Ti[1] = i % T;
1471 		ubak = u;
1472 	    } else {
1473 		Ti[0] += 1;
1474 		Ti[Ti[0]] = i % T;
1475 	    }
1476 	}
1477     }
1478 
1479     if (gretl_list_cmp(Ti, T0)) {
1480 	/* check the last unit */
1481 	ret = 0;
1482     }
1483 
1484     free(T0);
1485     free(Ti);
1486 
1487     return ret;
1488 }
1489 
1490 static int
make_panel_submask(char * mask,const DATASET * dset,int * err)1491 make_panel_submask (char *mask, const DATASET *dset, int *err)
1492 {
1493     int T = dset->pd;
1494     int N = dset->n / T;
1495     char *umask, *pmask;
1496     int i, np = 0;
1497 
1498     umask = calloc(N + T, 1);
1499     if (umask == NULL) {
1500 	*err = E_ALLOC;
1501 	return 0;
1502     }
1503 
1504     pmask = umask + N;
1505 
1506     for (i=0; i<dset->n; i++) {
1507 	if (mask[i]) {
1508 	    umask[i / T] = 1;
1509 	    pmask[i % T] = 1;
1510 	}
1511     }
1512 
1513     for (i=0; i<dset->n; i++) {
1514 	if (!mask[i]) {
1515 	    if (umask[i / T] && pmask[i % T]) {
1516 		mask[i] = 'p'; /* mark as padding row */
1517 		np++;
1518 	    }
1519 	}
1520     }
1521 
1522 #if SUBDEBUG
1523     fprintf(stderr, "make_panel_submask: number of padding rows = %d\n",
1524 	    np);
1525 #endif
1526 
1527     free(umask);
1528 
1529     return np;
1530 }
1531 
add_daily_date_strings(char * selected,const DATASET * dset,DATASET * subset)1532 static int add_daily_date_strings (char *selected,
1533 				   const DATASET *dset,
1534 				   DATASET *subset)
1535 {
1536     int err, t, i = 0;
1537 
1538     err = dataset_allocate_obs_markers(subset);
1539 
1540     if (!err) {
1541 	subset->markers = DAILY_DATE_STRINGS;
1542 	for (t=0; t<dset->n; t++) {
1543 	    if (selected[t]) {
1544 		ntolabel(subset->S[i++], t, dset);
1545 	    }
1546 	}
1547     }
1548 
1549     return err;
1550 }
1551 
1552 /* See if sub-sampling dated daily data leaves a useable daily dataset
1553    (as in dropping weekends and/or holidays).
1554 */
1555 
try_for_daily_subset(char * selected,const DATASET * dset,DATASET * subset,gretlopt * optp)1556 static int try_for_daily_subset (char *selected,
1557 				 const DATASET *dset,
1558 				 DATASET *subset,
1559 				 gretlopt *optp)
1560 {
1561     char datestr[OBSLEN];
1562     guint32 ed, ed0 = 0, edbak = 0;
1563     int t, wd, ngaps = 0;
1564     int delta, mon_delta;
1565     int delta_max = 0;
1566     int started = 0;
1567     int any_sat = 0;
1568     int newpd = 5;
1569     int err = 0;
1570 
1571     if (dset->pd > 5) {
1572 	/* first pass: look at weekend status of included obs */
1573 	for (t=0; t<dset->n; t++) {
1574 	    if (selected[t]) {
1575 		ntolabel(datestr, t, dset);
1576 		wd = weekday_from_date(datestr);
1577 		if (wd == 0) {
1578 		    /* got a Sunday: result must be 7-day */
1579 		    newpd = 7;
1580 		    break;
1581 		} else if (wd == 6) {
1582 		    any_sat = 1;
1583 		}
1584 	    }
1585 	}
1586 	if (newpd == 5 && any_sat) {
1587 	    /* no Sundays, but got a Saurday: result
1588 	       must be 6-day data */
1589 	    newpd = 6;
1590 	}
1591     }
1592 
1593 #if SUBDEBUG
1594     fprintf(stderr, "try_for_daily_subset: newpd = %d\n", newpd);
1595 #endif
1596 
1597     /* determine expected "delta days" for Mondays */
1598     mon_delta = (newpd == 7) ? 1 : (newpd == 6)? 2 : 3;
1599 
1600     /* second pass: count the calendar gaps */
1601     for (t=0; t<dset->n; t++) {
1602 	if (selected[t]) {
1603 	    ntolabel(datestr, t, dset);
1604 	    wd = weekday_from_date(datestr);
1605 	    ed = get_epoch_day(datestr);
1606 	    if (started) {
1607 		delta = ed - edbak;
1608 		ngaps += (wd == 1)? (delta > mon_delta) : (delta > 1);
1609 		if (delta > delta_max) {
1610 		    delta_max = delta;
1611 		}
1612 	    } else {
1613 		ed0 = ed;
1614 		started = 1;
1615 	    }
1616 	    edbak = ed;
1617 	}
1618     }
1619 
1620     /* Now, we should probably restrict the proportion of
1621        missing days for this treatment: for exclusion of
1622        non-trading days 10 percent should be generous.
1623        But we'll also require that the maximum "daily
1624        delta" be less than 10.
1625     */
1626 
1627     if (delta_max < 10) {
1628 	double keepfrac = subset->n / (double) dset->n;
1629 	double keepmin = 0.9 * newpd / (double) dset->pd;
1630 
1631 #if SUBDEBUG
1632 	fprintf(stderr, " daily: delta_max=%d, keepfrac=%.4f\n"
1633 		" keepmin=%.4f, ngaps=%d, newpd=%d\n", delta_max, keepfrac,
1634 		keepmin, ngaps, newpd);
1635 #endif
1636 
1637 	if (keepfrac >= keepmin) {
1638 	    if (dset->S == NULL && ngaps > 0) {
1639 		err = add_daily_date_strings(selected, dset, subset);
1640 		/* flag not to destroy subset date strings */
1641 		*optp |= OPT_P;
1642 	    }
1643 	    if (!err) {
1644 		subset->structure = TIME_SERIES;
1645 		subset->pd = newpd;
1646 		subset->sd0 = (double) ed0;
1647 		subset->t2 = subset->n - 1;
1648 		ntolabel(subset->stobs, 0, subset);
1649 		ntolabel(subset->endobs, subset->t2, subset);
1650 	    }
1651 	}
1652     }
1653 
1654     return err;
1655 }
1656 
1657 /* does the given policy lead to an empty sample, or no change
1658    in the sample, perchance?
1659 */
1660 
check_subsample_n(int n,DATASET * dset,char ** pmask,PRN * prn)1661 static int check_subsample_n (int n, DATASET *dset,
1662 			      char **pmask, PRN *prn)
1663 {
1664     if (n == 0) {
1665 	gretl_errmsg_set(_("No observations would be left!"));
1666 	return E_DATA;
1667     } else if (n == dset->n) {
1668 	/* not really an error, just a no-op */
1669 	if (gretl_messages_on()) {
1670 	    pputs(prn, _("No observations were dropped!"));
1671 	    pputc(prn, '\n');
1672 	}
1673 	free(*pmask);
1674 	*pmask = NULL;
1675     }
1676 
1677     return 0;
1678 }
1679 
1680 #define needs_string_arg(m) (m == SUBSAMPLE_RANDOM || \
1681 	                     m == SUBSAMPLE_USE_DUMMY || \
1682                              m == SUBSAMPLE_BOOLEAN)
1683 
1684 static int
make_restriction_mask(int mode,const char * s,const int * list,DATASET * dset,const char * oldmask,char ** pmask,PRN * prn)1685 make_restriction_mask (int mode, const char *s,
1686 		       const int *list, DATASET *dset,
1687 		       const char *oldmask, char **pmask,
1688 		       PRN *prn)
1689 {
1690     char *mask = NULL;
1691     int sn = 0, err = 0;
1692 
1693     if (needs_string_arg(mode) && (s == NULL || *s == '\0')) {
1694 	return E_ARGS;
1695     }
1696 
1697     mask = make_submask(dset->n);
1698     if (mask == NULL) {
1699 	return E_ALLOC;
1700     }
1701 
1702 #if SUBDEBUG
1703     fprintf(stderr, "make_restriction_mask: oldmask = %p\n", (void *) oldmask);
1704 #endif
1705 
1706     /* construct subsample mask in one of several possible ways */
1707 
1708     if (mode == SUBSAMPLE_DROP_MISSING) {
1709 	err = make_missing_mask(list, dset, mask);
1710     } else if (mode == SUBSAMPLE_DROP_EMPTY) {
1711 	err = make_empty_mask(list, dset, mask);
1712     } else if (mode == SUBSAMPLE_DROP_WKENDS) {
1713 	err = make_weekday_mask(dset, mask);
1714     } else if (mode == SUBSAMPLE_RANDOM) {
1715 	err = make_random_mask(s, oldmask, dset, mask);
1716     } else if (mode == SUBSAMPLE_USE_DUMMY) {
1717 	err = mask_from_dummy(s, dset, mask);
1718     } else if (mode == SUBSAMPLE_BOOLEAN) {
1719 	err = mask_from_temp_dummy(s, dset, mask, prn);
1720     } else {
1721 	gretl_errmsg_set(_("Sub-sample command failed mysteriously"));
1722 	err = 1;
1723     }
1724 
1725     if (err) {
1726 	/* exit now on unrecoverable error */
1727 	free(mask);
1728 	return err;
1729     }
1730 
1731     /* cumulate sample restrictions, if appropriate */
1732     if (oldmask != NULL && mode != SUBSAMPLE_RANDOM) {
1733 	sn = overlay_masks(mask, oldmask, dset->n);
1734     } else {
1735 	sn = count_selected_cases(mask, dset);
1736     }
1737 
1738     err = check_subsample_n(sn, dset, &mask, prn);
1739 
1740     if (err) {
1741 	free(mask);
1742     } else {
1743 	*pmask = mask;
1744     }
1745 
1746     return err;
1747 }
1748 
finalize_panel_subset(DATASET * subset,DATASET * dset,int npad)1749 static void finalize_panel_subset (DATASET *subset,
1750 				   DATASET *dset,
1751 				   int npad)
1752 {
1753     int pdp = subset->pd;
1754     int den = 10.0;
1755 
1756     while ((pdp = pdp / 10)) {
1757 	den *= 10;
1758     }
1759 
1760     subset->sd0 = 1.0 + 1.0 / den;
1761     ntolabel(subset->stobs, 0, subset);
1762     ntolabel(subset->endobs, subset->n - 1, subset);
1763 
1764     if (dset->pangrps != NULL && npad == 0) {
1765 	/* carry over panel group names from full dataset */
1766 	subset->pangrps = gretl_strdup(dset->pangrps);
1767     }
1768 }
1769 
1770 /* This is also used elsewhere:
1771 
1772    in gui2/session.c, when re-establishing a previously sub-sampled data
1773    state on reopening a saved session
1774 
1775    in gretl_func.c, on exit from a user function when the dataset was
1776    sub-sampled on entry to the function (and we need to re-establish the
1777    original sub-sample)
1778 */
1779 
1780 int
restrict_sample_from_mask(char * mask,DATASET * dset,gretlopt opt)1781 restrict_sample_from_mask (char *mask, DATASET *dset, gretlopt opt)
1782 {
1783     DATASET *subset;
1784     gretlopt zopt = OPT_R;
1785     int err = 0;
1786 
1787     if (dset->auxiliary) {
1788 	fprintf(stderr, "restrict_sample_from_mask: attempting to restrict "
1789 		"an auxiliary data\n");
1790 	return E_DATA;
1791     }
1792 
1793     if (mask == RESAMPLED) {
1794 	fprintf(stderr, "restrict_sample_from_mask: got RESAMPLED!\n");
1795 	return E_DATA;
1796     }
1797 
1798     if ((opt & OPT_B) && !dataset_is_panel(dset)) {
1799 	gretl_errmsg_set(_("--balanced option is invalid: the (full) "
1800 			   "dataset is not a panel"));
1801 	return E_BADOPT;
1802     }
1803 
1804     subset = datainfo_new();
1805     if (subset == NULL) {
1806 	return E_ALLOC;
1807     }
1808 
1809     subset->n = count_selected_cases(mask, dset);
1810     subset->v = dset->v;
1811 
1812 #if SUBDEBUG
1813     fprintf(stderr, "restrict_sample_from_mask:\n new subset=%p, "
1814 	    "%d obs in subsample vs %d in full dataset\n",
1815 	    (void *) subset, subset->n, dset->n);
1816 #endif
1817 
1818     if (dataset_is_panel(dset)) {
1819 	/* are we able to reconstitute a panel? */
1820 	int nunits = count_panel_units(mask, dset);
1821 	int ok = 0, npad = 0;
1822 
1823 	if (nunits > 1 && subset->n > nunits) {
1824 	    /* there's some possibility of doing so */
1825 	    if (opt & OPT_B) {
1826 		/* add padding rows? only if this was requested */
1827 		npad = make_panel_submask(mask, dset, &err);
1828 		if (err) {
1829 		    free(subset);
1830 		    return err;
1831 		}
1832 		ok = 1;
1833 	    } else {
1834 		ok = mask_leaves_balanced_panel(mask, dset);
1835 	    }
1836 	    if (ok) {
1837 		subset->structure = STACKED_TIME_SERIES;
1838 		subset->n += npad;
1839 		subset->pd = subset->n / nunits;
1840 		finalize_panel_subset(subset, dset, npad);
1841 	    }
1842 	} else if (nunits == 1 && subset->n == dset->pd) {
1843 	    /* time series for single panel unit */
1844 	    subset->structure = SPECIAL_TIME_SERIES;
1845 	}
1846     } else if (dated_daily_data(dset)) {
1847 	/* see if we can preserve daily time series */
1848 	err = try_for_daily_subset(mask, dset, subset, &zopt);
1849     }
1850 
1851     if (!err) {
1852 	/* set up the sub-sampled dataset */
1853 	err = start_new_Z(subset, zopt);
1854     }
1855 
1856     if (err) {
1857 	free(subset);
1858 	return err;
1859     }
1860 
1861 #if SUBDEBUG
1862     fprintf(stderr, "started new Z for subset (v=%d, n=%d, Z=%p)\n",
1863 	    subset->v, subset->n, (void *) subset->Z);
1864 #endif
1865 
1866     /* link (don't copy) varnames and descriptions, since these are
1867        not dependent on the series length */
1868     subset->varname = dset->varname;
1869     subset->varinfo = dset->varinfo;
1870     subset->descrip = dset->descrip;
1871     subset->mapfile = dset->mapfile;
1872 
1873     /* set up case markers? */
1874     if (dset->markers) {
1875 	err = dataset_allocate_obs_markers(subset);
1876 	if (err) {
1877 	    free_Z(subset);
1878 	    free(subset);
1879 	    return E_ALLOC;
1880 	}
1881     }
1882 
1883     /* copy across data (and case markers, if any) */
1884     copy_data_to_subsample(subset, dset, dset->v, mask);
1885 
1886     if (opt & OPT_T) {
1887 	/* --permanent */
1888 	check_models_for_subsample(mask, NULL);
1889 	destroy_full_dataset(dset);
1890 #if RECODE_ON_PERMA
1891 	recode_strvals(subset, OPT_NONE);
1892 #endif
1893     } else {
1894 	err = backup_full_dataset(dset);
1895 	subset->submask = copy_subsample_mask(mask, &err);
1896     }
1897 
1898     /* switch pointers */
1899     *dset = *subset;
1900     free(subset);
1901 
1902     return err;
1903 }
1904 
expand_mask(char * tmpmask,const char * oldmask,int * err)1905 static char *expand_mask (char *tmpmask, const char *oldmask,
1906 			  int *err)
1907 {
1908     char *newmask = make_submask(fullset->n);
1909 
1910     if (newmask == NULL) {
1911 	*err = E_ALLOC;
1912     } else {
1913 	/* map @tmpmask onto the full data range */
1914 	int t, i = 0;
1915 
1916 	for (t=0; t<fullset->n; t++) {
1917 	    if (oldmask[t]) {
1918 		newmask[t] = tmpmask[i++];
1919 	    }
1920 	}
1921     }
1922 
1923     return newmask;
1924 }
1925 
1926 /* Below: we do this "precompute" thing if the dataset is already
1927    subsampled and the user wants to compound the restriction with a
1928    boolean restriction. One reason for this is that "obs" references
1929    in the new restriction may get out of whack if we restore the full
1930    dataset first.  For example, say the spec is "obs!=50" to exclude
1931    observation 50: presumably the user means to exclude the 50th
1932    observation in the current, subsampled dataset, which may not be
1933    the same as the 50th observation in the full dataset.
1934 */
1935 
precompute_mask(const char * s,const char * oldmask,DATASET * dset,PRN * prn,int * err)1936 static char *precompute_mask (const char *s, const char *oldmask,
1937 			      DATASET *dset, PRN *prn, int *err)
1938 {
1939     char *tmpmask = make_submask(dset->n);
1940     char *newmask = NULL;
1941 
1942 #if SUBDEBUG
1943     fprintf(stderr, "restrict_sample: precomputing new mask\n");
1944 #endif
1945 
1946     if (tmpmask == NULL) {
1947 	*err = E_ALLOC;
1948     }
1949 
1950     if (!*err) {
1951 	/* fill out mask relative to current, restricted dataset */
1952 	*err = mask_from_temp_dummy(s, dset, tmpmask, prn);
1953     }
1954 
1955     if (!*err) {
1956 	if (fullset != NULL) {
1957 	    newmask = expand_mask(tmpmask, oldmask, err);
1958 	} else {
1959 	    newmask = tmpmask;
1960 	    tmpmask = NULL;
1961 	}
1962     }
1963 
1964     free(tmpmask);
1965 
1966     return newmask;
1967 }
1968 
1969 /* Intended for time series data: trim any missing values
1970    at the start and end of the current sample range, then
1971    check the remaining range for missing values and flag
1972    an error if any are found. If opt contains OPT_T (for
1973    permanence) and no error is encountered, we shrink the
1974    dataset to the contiguous range.
1975 */
1976 
set_contiguous_sample(const int * list,DATASET * dset,gretlopt opt)1977 static int set_contiguous_sample (const int *list,
1978 				  DATASET *dset,
1979 				  gretlopt opt)
1980 {
1981     int ct1 = dset->t1;
1982     int ct2 = dset->t2;
1983     int err = 0;
1984 
1985     if (list != NULL && list[0] > 0) {
1986 	err = list_adjust_sample(list, &ct1, &ct2, dset, NULL);
1987     } else {
1988 	int *biglist = NULL;
1989 	int nvars = 0;
1990 
1991 	biglist = full_var_list(dset, &nvars);
1992 	if (nvars == 0) {
1993 	    ; /* no-op */
1994 	} else if (biglist == NULL) {
1995 	    err = E_ALLOC;
1996 	} else {
1997 	    err = list_adjust_sample(biglist, &ct1, &ct2,
1998 				     dset, NULL);
1999 	    free(biglist);
2000 	}
2001     }
2002 
2003     if (!err) {
2004 	int changed = ct1 != dset->t1 || ct2 != dset->t2;
2005 
2006 	dset->t1 = ct1;
2007 	dset->t2 = ct2;
2008 	if (changed && (opt & OPT_T)) {
2009 	    err = perma_sample(dset, OPT_T, NULL, NULL);
2010 	}
2011     }
2012 
2013     return err;
2014 }
2015 
2016 /* Make a string representing the sample restriction. This is
2017    for reporting purposes. Note that in some cases the
2018    incoming @restr string may be NULL, for instance if the
2019    no-missing option is chosen via the gretl GUI.
2020 */
2021 
make_restriction_string(DATASET * dset,char * old,const char * restr,int mode)2022 static int make_restriction_string (DATASET *dset, char *old,
2023 				    const char *restr, int mode)
2024 {
2025     char *s = NULL;
2026     int n = 0, err = 0;
2027 
2028     dataset_clear_sample_record(dset);
2029 
2030     if (old != NULL) {
2031 	n = strlen(old);
2032     }
2033 
2034     if (mode == SUBSAMPLE_RANDOM) {
2035 	n += strlen("random");
2036     } else if (mode == SUBSAMPLE_DROP_MISSING) {
2037 	n += strlen("no-missing");
2038     } else if (restr != NULL) {
2039 	n += strlen(restr);
2040     }
2041 
2042     if (n > 0) {
2043 	s = malloc(n + 5);
2044 	if (s == NULL) {
2045 	    err = E_ALLOC;
2046 	}
2047     }
2048 
2049     if (s != NULL) {
2050 	*s = '\0';
2051 	if (old != NULL) {
2052 	    strcpy(s, old);
2053 	    strcat(s, " && ");
2054 	}
2055 	if (mode == SUBSAMPLE_RANDOM) {
2056 	    strcat(s, "random");
2057 	} else if (mode == SUBSAMPLE_DROP_MISSING) {
2058 	    strcat(s, "no-missing");
2059 	} else if (restr != NULL) {
2060 	    strcat(s, restr);
2061 	}
2062 	dset->restriction = s;
2063     }
2064 
2065     return err;
2066 }
2067 
full_sample(const DATASET * dset)2068 static int full_sample (const DATASET *dset)
2069 {
2070     if (complex_subsampled()) {
2071 	return 0;
2072     } else if (dset->t1 > 0 || dset->t2 < dset->n - 1) {
2073 	return 0;
2074     } else {
2075 	return 1;
2076     }
2077 }
2078 
handle_ts_restrict(char * mask,DATASET * dset,gretlopt opt,int t1)2079 static int handle_ts_restrict (char *mask, DATASET *dset,
2080 			       gretlopt opt, int t1)
2081 {
2082     char stobs[OBSLEN];
2083     int pd = dset->pd;
2084     double sd0;
2085     int err;
2086 
2087     ntolabel(stobs, t1, dset);
2088     sd0 = get_date_x(dset->pd, stobs);
2089 
2090     err = restrict_sample_from_mask(mask, dset, opt);
2091 
2092     if (!err) {
2093 	/* re-establish time-series characteristics */
2094 	dset->structure = TIME_SERIES;
2095 	dset->pd = pd;
2096 	dset->sd0 = sd0;
2097 	strcpy(dset->stobs, stobs);
2098 	ntolabel(dset->endobs, dset->n - 1, dset);
2099     }
2100 
2101     return err;
2102 }
2103 
check_restrict_options(gretlopt opt,const char * param)2104 static int check_restrict_options (gretlopt opt, const char *param)
2105 {
2106     /* We'll accept the redundant combination of the options --dummy
2107        (OPT_O) and --restrict (OPT_R), but other than that the options
2108        --dummy,
2109        --restrict,
2110        --no-missing (OPT_M),
2111        --no-all-missing (OPT_A) and
2112        --random (OPT_N)
2113        are all mutually incompatible.
2114     */
2115     if (incompatible_options(opt, OPT_O | OPT_M | OPT_A | OPT_N) ||
2116 	incompatible_options(opt, OPT_R | OPT_M | OPT_A | OPT_N)) {
2117 	return E_BADOPT;
2118     }
2119 
2120     /* The --contiguous option (OPT_C) is compatible only with
2121        --no-missing (which is implied) and --permanent */
2122     if (opt & OPT_C) {
2123 	if ((opt & ~(OPT_C | OPT_M | OPT_T)) != OPT_NONE) {
2124 	    return E_BADOPT;
2125 	}
2126     }
2127 
2128     if (opt & (OPT_O | OPT_R | OPT_N)) {
2129 	/* parameter is required */
2130 	if (param == NULL || *param == '\0') {
2131 	    return E_ARGS;
2132 	}
2133     }
2134 
2135     return 0;
2136 }
2137 
check_permanent_option(gretlopt opt,DATASET * dset,int * n_models)2138 static int check_permanent_option (gretlopt opt,
2139 				   DATASET *dset,
2140 				   int *n_models)
2141 {
2142     if (gretl_function_depth() > 0) {
2143 	/* can't permanently shrink the dataset within a function */
2144 	gretl_errmsg_set(_("The dataset cannot be modified at present"));
2145 	return E_DATA;
2146     } else if (!(opt & (OPT_O | OPT_M | OPT_A | OPT_N | OPT_R | OPT_C))) {
2147 	/* we need some kind of restriction flag */
2148 	return E_ARGS;
2149     } else if (gretl_in_gui_mode() && !(opt & OPT_F)) {
2150 	/* GUI program, without internal "force" option */
2151 	*n_models = n_stacked_models();
2152 	if (*n_models > 0 && !full_sample(dset)) {
2153 	    /* too difficult to recover gracefully on error */
2154 	    gretl_errmsg_set(_("The full dataset must be restored before "
2155 			       "imposing a permanent sample restriction"));
2156 	    return E_DATA;
2157 	}
2158     }
2159 
2160     return 0;
2161 }
2162 
2163 /* "Precomputing" a mask means computing a mask based on a
2164    current subsampled dataset (before restoring the full dataset,
2165    which is a part of the subsampling process). We do this if
2166    we're cumulating a boolean restriction on top of an existing
2167    restriction.
2168 */
2169 
do_precompute(int mode,char * oldmask,const char * param)2170 static int do_precompute (int mode, char *oldmask, const char *param)
2171 {
2172     return oldmask != NULL && mode == SUBSAMPLE_BOOLEAN;
2173 }
2174 
2175 /* restrict_sample:
2176  * @param: restriction string (or %NULL).
2177  * @list: list of variables in case of OPT_M (or %NULL).
2178  * @dset: dataset struct.
2179  * @state: structure representing program state (or %NULL).
2180  * @opt: option flags.
2181  * @prn: printing apparatus.
2182  * @n_dropped: location to receive count of dropped
2183  * observations, or NULL.
2184  *
2185  * Sub-sample the data set, based on the criterion of skipping all
2186  * observations with missing data values (OPT_M); or using as mask a
2187  * specified dummy variable (OPT_O); or masking with a specified
2188  * boolean condition (OPT_R); or selecting at random (OPT_N).
2189  *
2190  * In case OPT_M or OPT_C a @list of variables may be supplied; in
2191  * cases OPT_O, OPT_R and OPT_N, @param must contain specifics.
2192  *
2193  * In case OPT_P is included, the restriction will rePlace any
2194  * existing sample restriction, otherwise the resulting restriction
2195  * will be the logical product of the new restriction and any
2196  * existing restriction.
2197  *
2198  * In case the original dataset was a panel and OPT_B was given,
2199  * we'll pad with missing values if necessary, to try to reconstitute
2200  * a balanced panel.
2201  *
2202  * In case OPT_T is included, the sample restriction will be
2203  * permanent (the dataset is shrunk to the sample), otherwise the
2204  * restriction can be undone via the command "smpl full".
2205  *
2206  * Returns: 0 on success, non-zero error code on failure.
2207  */
2208 
restrict_sample(const char * param,const int * list,DATASET * dset,ExecState * state,gretlopt opt,PRN * prn,int * n_dropped)2209 int restrict_sample (const char *param, const int *list,
2210 		     DATASET *dset, ExecState *state,
2211 		     gretlopt opt, PRN *prn,
2212 		     int *n_dropped)
2213 {
2214     char *oldrestr = NULL;
2215     char *oldmask = NULL;
2216     char *mask = NULL;
2217     int free_oldmask = 0;
2218     int permanent = 0;
2219     int n_models = 0;
2220     int n_orig = 0;
2221     int mode, err = 0;
2222 
2223     if (dset == NULL || dset->Z == NULL) {
2224 	return E_NODATA;
2225     }
2226 
2227     n_orig = dset->n;
2228 
2229     /* general check on incoming options */
2230     err = check_restrict_options(opt, param);
2231 
2232     /* take a closer look at the --permanent option */
2233     if (!err && (opt & OPT_T)) {
2234 	err = check_permanent_option(opt, dset, &n_models);
2235 	if (!err) {
2236 	    permanent = 1;
2237 	}
2238     }
2239 
2240     if (err) {
2241 	return err;
2242     }
2243 
2244     gretl_error_clear();
2245 
2246 #if FULLDEBUG || SUBDEBUG
2247     fprintf(stderr, "\nrestrict_sample: param='%s'\n", param);
2248     fprintf(stderr, " dset=%p, state=%p, fullset=%p\n", (void *) dset,
2249 	    (void *) state, (void *) fullset);
2250     printlist(list, "list param");
2251 #endif
2252 
2253     if (opt & OPT_C) {
2254 	/* --contiguous */
2255 	return set_contiguous_sample(list, dset, opt);
2256     }
2257 
2258     mode = get_restriction_mode(opt);
2259     if (mode == SUBSAMPLE_UNKNOWN) {
2260 	gretl_errmsg_set("Unrecognized sample command");
2261 	return 1;
2262     }
2263 
2264     if (!(opt & OPT_P)) {
2265 	/* not replacing but cumulating any existing restrictions */
2266 	oldmask = make_current_sample_mask(dset, &err);
2267 	if (err) {
2268 	    return err;
2269 	}
2270 	free_oldmask = 1;
2271 	if (dset->restriction != NULL) {
2272 	    oldrestr = gretl_strdup(dset->restriction);
2273 	}
2274     } else if (state != NULL && state->submask != NULL) {
2275 	/* subsampling within a function: this necessarily
2276 	   cumulates with the incoming sample restriction
2277 	   recorded in state->submask
2278 	*/
2279 	oldmask = state->submask;
2280     } else if (state != NULL) {
2281 	/* FIXME? This is a loophole that potentially allows
2282 	   overriding of a (t1, t2) sample range at caller
2283 	   level, via "restrict replace" sampling within a
2284 	   function.
2285 	*/
2286 	;
2287     }
2288 
2289     if (do_precompute(mode, oldmask, param)) {
2290 	/* we come in here only if cumulating restrictions */
2291 	mask = precompute_mask(param, oldmask, dset, prn, &err);
2292     }
2293 
2294     if (!err && !full_sample(dset)) {
2295 	/* restore the full data range, for housekeeping purposes */
2296 	err = restore_full_sample(dset, NULL);
2297     }
2298 
2299     if (err) {
2300 	/* clean up and get out */
2301 	free(mask);
2302 	if (free_oldmask) {
2303 	    free(oldmask);
2304 	}
2305 	return err;
2306     }
2307 
2308     if (mask == NULL) {
2309 	/* not already handled by "precompute" above */
2310 	err = make_restriction_mask(mode, param, list, dset,
2311 				    oldmask, &mask, prn);
2312     }
2313 
2314     if (err && state != NULL && state->submask != NULL) {
2315 	/* Should we try to restore the status quo ante on error?
2316 	   Too tricky, I think -- best to cancel the 'catch' flag
2317 	   if it's present.
2318 	*/
2319 	if (state->cmd->flags & CMD_CATCH) {
2320 	    state->cmd->flags ^= CMD_CATCH;
2321 	    gretl_errmsg_append("Sorry, cannot 'catch' this error", err);
2322 	}
2323     }
2324 
2325     if (!err && n_models > 0) {
2326 	err = check_models_for_subsample(mask, n_dropped);
2327     }
2328 
2329     if (!err && mask != NULL) {
2330 	int contiguous = 0;
2331 	int t1 = 0, t2 = 0;
2332 
2333 	if (!(opt & OPT_N)) {
2334 	    /* don't bother with this for the --random case */
2335 	    contiguous = mask_contiguous(mask, dset, &t1, &t2);
2336 	}
2337 
2338 #if SUBDEBUG
2339 	fprintf(stderr, "restrict sample: contiguous range? %s\n",
2340 		contiguous ? "yes" : "no");
2341 #endif
2342 
2343 	if (contiguous) {
2344 	    if (permanent && dataset_is_time_series(dset)) {
2345 		/* apply the restriction, but then re-establish the
2346 		   time-series character of the dataset
2347 		*/
2348 		err = handle_ts_restrict(mask, dset, opt, t1);
2349 	    } else if (!permanent) {
2350 		/* just move the sample range pointers, avoiding
2351 		   the overhead of creating a parallel dataset
2352 		*/
2353 		dset->t1 = t1;
2354 		dset->t2 = t2;
2355 		/* but record mask? */
2356 	    } else {
2357 		err = restrict_sample_from_mask(mask, dset, opt);
2358 	    }
2359 	} else {
2360 	    err = restrict_sample_from_mask(mask, dset, opt);
2361 	}
2362     }
2363 
2364     free(mask);
2365 
2366     if (free_oldmask) {
2367 	free(oldmask);
2368     }
2369 
2370     if (!err) {
2371 	if (permanent) {
2372 	    dataset_clear_sample_record(dset);
2373 	} else {
2374 	    make_restriction_string(dset, oldrestr, param, mode);
2375 	}
2376 	if (n_dropped != NULL) {
2377 	    *n_dropped = n_orig - sample_size(dset);
2378 	}
2379     }
2380 
2381     free(oldrestr);
2382 
2383 #if PANDEBUG
2384     panreport("restrict sample", dset);
2385 #endif
2386 
2387     return err;
2388 }
2389 
2390 /* perma_sample:
2391  * @dset: dataset struct.
2392  * @opt: option flags: must be OPT_T.
2393  * @prn: printing apparatus.
2394  * @n_dropped: location to receive count of dropped models,
2395  * or NULL.
2396  *
2397  * Make the current sub-sampling of the dataset permanent.
2398  *
2399  * Returns: 0 on success, non-zero error code on failure.
2400  */
2401 
perma_sample(DATASET * dset,gretlopt opt,PRN * prn,int * n_dropped)2402 int perma_sample (DATASET *dset, gretlopt opt, PRN *prn,
2403 		  int *n_dropped)
2404 {
2405     if (gretl_function_depth() > 0) {
2406 	gretl_errmsg_set(_("The dataset cannot be modified at present"));
2407 	return E_DATA;
2408     } else if (!dataset_is_subsampled(dset)) {
2409 	pputs(prn, "smpl: nothing to be done\n");
2410 	return 0;
2411     } else if (dset->submask == RESAMPLED) {
2412 	pputs(prn, "smpl: dataset is resampled\n");
2413 	return E_DATA;
2414     } else if (opt != OPT_T) {
2415 	return E_BADOPT;
2416     }
2417 
2418 #if RECODE_ON_PERMA
2419     recode_strvals(dset, OPT_NONE);
2420 #endif
2421 
2422     if (dset->submask == NULL) {
2423 	/* just trim observations */
2424 	return dataset_shrink_obs_range(dset);
2425     }
2426 
2427     if (n_dropped != NULL) {
2428 	int err;
2429 
2430 	err = check_models_for_subsample(dset->submask, n_dropped);
2431 	if (err) {
2432 	    return err;
2433 	}
2434     } else {
2435 	check_models_for_subsample(dset->submask, NULL);
2436     }
2437 
2438     free(dset->submask);
2439     dset->submask = NULL;
2440     dataset_clear_sample_record(dset);
2441 
2442     if (fullset->varname == dset->varname) {
2443 	fullset->varname = NULL;
2444     }
2445     if (fullset->varinfo == dset->varinfo) {
2446 	fullset->varinfo = NULL;
2447     }
2448     if (fullset->descrip == dset->descrip) {
2449 	fullset->descrip = NULL;
2450     }
2451 
2452     destroy_dataset(fullset);
2453     fullset = peerset = NULL;
2454 
2455     return 0;
2456 }
2457 
2458 enum {
2459     SMPL_T1,
2460     SMPL_T2
2461 };
2462 
smpl_get_int(const char * s,DATASET * dset,int * err)2463 static int smpl_get_int (const char *s, DATASET *dset, int *err)
2464 {
2465     int k = -1;
2466 
2467     if (integer_string(s)) {
2468 	k = atoi(s);
2469     } else {
2470 	double x;
2471 
2472 	if (gretl_is_scalar(s)) {
2473 	    x = gretl_scalar_get_value(s, NULL);
2474 	} else {
2475 	    x = generate_scalar(s, dset, err);
2476 	}
2477 
2478 	if (!na(x) && x < INT_MAX) {
2479 	    k = (int) x;
2480 	}
2481     }
2482 
2483     return k;
2484 }
2485 
2486 /* take integer_string(s) as given here */
2487 
probably_year(const char * s,DATASET * dset)2488 static int probably_year (const char *s, DATASET *dset)
2489 {
2490     int sval = atoi(s);
2491 
2492     return (annual_data(dset) && dset->sd0 > 1000 &&
2493 	    sval > 1000 && sval > dset->n);
2494 }
2495 
got_two_ints(const char * s,int * pi,int * pt)2496 static int got_two_ints (const char *s, int *pi, int *pt)
2497 {
2498     int n = 0;
2499 
2500     if (strchr(s, ':') != NULL) {
2501 	n = sscanf(s, "%d:%d", pi, pt);
2502     } else if (strchr(s, '.') != NULL) {
2503 	n = sscanf(s, "%d.%d", pi, pt);
2504     }
2505 
2506     return n == 2;
2507 }
2508 
get_sample_limit(const char * s,DATASET * dset,int code)2509 static int get_sample_limit (const char *s, DATASET *dset, int code)
2510 {
2511     int ret = -1;
2512     int i, t;
2513     int err = 0;
2514 
2515 #if SUBDEBUG
2516     fprintf(stderr, "get_sample_limit: s = '%s'\n", s);
2517 #endif
2518 
2519     if (dataset_is_panel(dset) && got_two_ints(s, &i, &t)) {
2520 	if (code == SMPL_T1 && t != 1) {
2521 	    gretl_errmsg_sprintf(_("'%s': invalid starting observation"), s);
2522 	    err = E_DATA;
2523 	} else if (code == SMPL_T2 && t != dset->pd) {
2524 	    gretl_errmsg_sprintf(_("'%s': invalid ending observation"), s);
2525 	    err = E_DATA;
2526 	}
2527 	if (err) {
2528 	    return -1;
2529 	}
2530     }
2531 
2532     if (*s == '-' || *s == '+') {
2533 	/* increment/decrement form */
2534 	int incr = smpl_get_int(s + 1, dset, &err);
2535 
2536 	if (!err) {
2537 	    if (*s == '-') {
2538 		incr = -incr;
2539 	    }
2540 	    if (dataset_is_panel(dset)) {
2541 		incr *= dset->pd;
2542 	    }
2543 	    if (code == SMPL_T1) {
2544 		ret = dset->t1 + incr;
2545 	    } else {
2546 		ret = dset->t2 + incr;
2547 	    }
2548 	}
2549     } else {
2550 	/* absolute form */
2551 	if (!integer_string(s) || probably_year(s, dset)) {
2552 	    ret = get_t_from_obs_string(s, dset);
2553 	}
2554 	if (ret < 0) {
2555 	    gretl_error_clear();
2556 	    ret = smpl_get_int(s, dset, &err);
2557 	    /* convert to base 0 */
2558 	    if (!err) ret--;
2559 	}
2560     }
2561 
2562     return ret;
2563 }
2564 
2565 /* Catch the case where we're in a function and attempting to
2566    move t1 or t2 out of the range established on entry to the
2567    function: we don't want to carry forward any error message
2568    that implies t was out of the full data range.
2569 */
2570 
maybe_clear_range_error(int t,DATASET * dset)2571 static void maybe_clear_range_error (int t, DATASET *dset)
2572 {
2573     if (t >= 0 && t < dset->n) {
2574 	gretl_error_clear();
2575     }
2576 }
2577 
2578 /**
2579  * set_sample:
2580  * @start: start of sample range.
2581  * @stop: end of sample range.
2582  * @dset: pointer to dataset struct.
2583  * @opt: options flag(s).
2584  *
2585  * Sets the current sample range for @dset to that specified by
2586  * @start and @stop, if possible.
2587  *
2588  * If @opt contains %OPT_T, make the sub-sampling permanent,
2589  * or in other words shrink the dataset to the selected range.
2590  *
2591  * Returns: 0 on successful completion, non-zero error code
2592  * otherwise.
2593  */
2594 
set_sample(const char * start,const char * stop,DATASET * dset,gretlopt opt)2595 int set_sample (const char *start,
2596 		const char *stop,
2597 		DATASET *dset,
2598 		gretlopt opt)
2599 {
2600     int nf, new_t1 = dset->t1, new_t2 = dset->t2;
2601     int tmin = 0, tmax = 0;
2602     int err = 0;
2603 
2604     if (dset == NULL) {
2605 	return E_NODATA;
2606     }
2607 
2608     /* currently --quiet (OPT_Q) and --permanent (OPT_T)
2609        are the only acceptable options here
2610     */
2611     opt &= ~OPT_Q;
2612     if (opt != OPT_NONE && opt != OPT_T) {
2613 	return E_BADOPT;
2614     }
2615 
2616     gretl_error_clear();
2617 
2618     nf = (start != NULL) + (stop != NULL);
2619 
2620 #if SUBDEBUG
2621     fprintf(stderr, "set_sample: start='%s', stop='%s', dset=%p\n",
2622 	    start, stop, (void *) dset);
2623     fprintf(stderr, "dset->v = %d, dset->n = %d, pd = %d\n",
2624 	    dset->v, dset->n, dset->pd);
2625 #endif
2626 
2627     if (nf == 2 && dset->n == 0) {
2628 	/* database special */
2629 	return db_set_sample(start, stop, dset);
2630     }
2631 
2632     if (nf == 0) {
2633 	/* no-op, just print the current sample */
2634 	return 0;
2635     }
2636 
2637     sample_range_get_extrema(dset, &tmin, &tmax);
2638 
2639 #if SUBDEBUG
2640     fprintf(stderr, "sample extrema: lo = %d, hi = %d\n", tmin, tmax);
2641 #endif
2642 
2643     if (nf == 1) {
2644 	/* implicitly just setting the starting observation? */
2645 	if (start == NULL) {
2646 	    return E_ARGS;
2647 	}
2648 	new_t1 = get_sample_limit(start, dset, SMPL_T1);
2649 	if (new_t1 < tmin || new_t1 > tmax) {
2650 	    maybe_clear_range_error(new_t1, dset);
2651 	    gretl_errmsg_set(_("error in new starting obs"));
2652 	    return E_INVARG;
2653 	}
2654 	goto finish;
2655     }
2656 
2657     /* now we're looking at the 2 fields case */
2658 
2659     if (strcmp(start, ";")) {
2660 	new_t1 = get_sample_limit(start, dset, SMPL_T1);
2661 	if (new_t1 < tmin || new_t1 > tmax) {
2662 	    maybe_clear_range_error(new_t1, dset);
2663 	    gretl_errmsg_set(_("error in new starting obs"));
2664 	    err = E_DATA;
2665 	}
2666     }
2667 
2668     if (!err && strcmp(stop, ";")) {
2669 	new_t2 = get_sample_limit(stop, dset, SMPL_T2);
2670 	if (new_t2 < tmin || new_t2 > tmax) {
2671 	    maybe_clear_range_error(new_t2, dset);
2672 	    gretl_errmsg_set(_("error in new ending obs"));
2673 	    err = E_DATA;
2674 	}
2675     }
2676 
2677     if (!err && (new_t1 < tmin || new_t1 > new_t2)) {
2678 	gretl_error_clear();
2679 	gretl_errmsg_set(_("Invalid null sample"));
2680 	err = E_DATA;
2681     }
2682 
2683  finish:
2684 
2685     if (!err && dataset_is_panel(dset)) {
2686 	int T = dset->pd;
2687 
2688 	if (new_t1 % T != 0 || (new_t2 + 1) % T != 0) {
2689 	    gretl_errmsg_set("Invalid panel sample range");
2690 	    err = E_INVARG;
2691 	}
2692     }
2693 
2694     if (!err) {
2695 	dset->t1 = new_t1;
2696 	dset->t2 = new_t2;
2697     }
2698 
2699     if (!err && (opt & OPT_T)) {
2700 	err = perma_sample(dset, opt, NULL, NULL);
2701     }
2702 
2703     return err;
2704 }
2705 
set_panel_sample(const char * start,const char * stop,gretlopt opt,DATASET * dset)2706 int set_panel_sample (const char *start, const char *stop,
2707 		      gretlopt opt, DATASET *dset)
2708 {
2709     int s1, s2, err = 0;
2710 
2711     if (!dataset_is_panel(dset)) {
2712 	gretl_errmsg_sprintf(_("%s: inapplicable option"), print_flags(opt, SMPL));
2713 	return E_BADOPT;
2714     }
2715 
2716     /* maybe some day implement option to sample in the
2717        panel time dimension here? (not trivial)
2718     */
2719 
2720     s1 = smpl_get_int(start, dset, &err);
2721     if (!err) {
2722 	s2 = smpl_get_int(stop, dset, &err);
2723     }
2724 
2725     if (!err) {
2726 	if (s1 < 1) {
2727 	    gretl_errmsg_sprintf(_("invalid first unit %d"), s1);
2728 	    err = E_DATA;
2729 	} else if (s2 > dset->n / dset->pd) {
2730 	    gretl_errmsg_sprintf(_("invalid last unit %d"), s2);
2731 	    err = E_DATA;
2732 	} else if (s2 < s1) {
2733 	    gretl_errmsg_set(_("invalid null sample"));
2734 	    err = E_DATA;
2735 	}
2736     }
2737 
2738     if (!err) {
2739 	int t1 = (s1 - 1) * dset->pd;
2740 	int t2 = s2 * dset->pd - 1;
2741 	int tmin = 0, tmax = 0;
2742 
2743 	sample_range_get_extrema(dset, &tmin, &tmax);
2744 	if (t1 < tmin || t2 > tmax) {
2745 	    gretl_errmsg_set("sample range out of bounds");
2746 	    err = E_DATA;
2747 	} else {
2748 	    dset->t1 = t1;
2749 	    dset->t2 = t2;
2750 	}
2751     }
2752 
2753     return err;
2754 }
2755 
2756 /**
2757  * count_missing_values:
2758  * @dset: dataset struct.
2759  * @opt: use %OPT_V for verbose operation, %OPT_A to
2760  * examine all data.
2761  * @prn: printing struct.
2762  * @err: location to receive error code.
2763  *
2764  * Prints a count of missing values (if any) in the current
2765  * dataset over the currently defined sample range (or the
2766  * entire data range if %OPT_A is given). If %OPT_V is given
2767  * this includes a count of missing values at each observation;
2768  * otherwise it just includes global and per-variable counts.
2769  *
2770  * Returns: 0 if no missing values are found (or on error),
2771  * otherwise the total number of missing values.
2772  */
2773 
count_missing_values(const DATASET * dset,gretlopt opt,PRN * prn,int * err)2774 int count_missing_values (const DATASET *dset, gretlopt opt,
2775 			  PRN *prn, int *err)
2776 {
2777     int missval = 0, missobs = 0, totvals = 0, oldmiss = 0;
2778     int T, t1, t2;
2779     int *missvec;
2780     double missfrac;
2781     int i, t, tmiss;
2782 
2783     if (opt & OPT_A) {
2784 	t1 = 0;
2785 	t2 = dset->n - 1;
2786     } else {
2787 	t1 = dset->t1;
2788 	t2 = dset->t2;
2789     }
2790 
2791     T = t2 - t1 + 1;
2792 
2793     missvec = malloc(dset->v * sizeof missvec);
2794 
2795     if (missvec == NULL) {
2796 	*err = E_ALLOC;
2797 	return 0;
2798     }
2799 
2800     for (i=0; i<dset->v; i++) {
2801 	missvec[i] = 0;
2802     }
2803 
2804     for (t=t1; t<=t2; t++) {
2805 	tmiss = 0;
2806 	for (i=1; i<dset->v; i++) {
2807 	    if (series_is_hidden(dset, i)) {
2808 		continue;
2809 	    }
2810 	    if (na(dset->Z[i][t])) {
2811 		if (missvec[i] == 0) {
2812 		    missvec[0] += 1;
2813 		}
2814 		missvec[i] += 1;
2815 		missval++;
2816 	    }
2817 	    totvals++;
2818 	}
2819 
2820 	tmiss = missval - oldmiss;
2821 
2822 	if (tmiss) {
2823 	    missobs++;
2824 
2825 	    if (opt & OPT_V) {
2826 		/* verbose print by observation */
2827 		if (dset->markers) {
2828 		    pprintf(prn, "%8s %4d %s\n", dset->S[t], tmiss,
2829 			    _("missing values"));
2830 		} else {
2831 		    char tmp[OBSLEN];
2832 
2833 		    ntolabel(tmp, t, dset);
2834 		    pprintf(prn, "%8s %4d %s\n", tmp, tmiss,
2835 			    _("missing values"));
2836 		}
2837 	    }
2838 	}
2839 	oldmiss = missval;
2840     }
2841 
2842     missfrac = 100.0 * (double) missobs / T;
2843 
2844     pprintf(prn, _("\nNumber of observations (rows) with missing data "
2845 		   "values = %d (%.2f%%)\n"), missobs, missfrac);
2846 
2847     pprintf(prn, _("Total number of missing data values = %d (%.2f%% "
2848 	    "of total data values)\n"), missval,
2849 	    (100.0 * (double) missval / totvals));
2850 
2851     if (missvec[0] > 0) {
2852 	pputc(prn, '\n');
2853 	for (i=1; i<dset->v; i++) {
2854 	    if (missvec[i] > 0) {
2855 		missfrac = 100.0 * (double) missvec[i] / T;
2856 		pprintf(prn, "%8s: %d %s (%.2f%%); %d %s (%.2f%%)\n",
2857 			dset->varname[i], missvec[i], _("missing values"),
2858 			missfrac, T - missvec[i], _("valid values"),
2859 			100.0 - missfrac);
2860 	    }
2861 	}
2862     }
2863 
2864     free(missvec);
2865 
2866     return missval;
2867 }
2868 
copy_series_info(DATASET * dest,const DATASET * src,int maxv)2869 static void copy_series_info (DATASET *dest, const DATASET *src,
2870 			      int maxv)
2871 {
2872     int i;
2873 
2874     for (i=1; i<maxv; i++) {
2875 	strcpy(dest->varname[i], src->varname[i]);
2876 	if (src->varinfo != NULL) {
2877 	    copy_varinfo(dest->varinfo[i], src->varinfo[i]);
2878 	}
2879     }
2880 }
2881 
2882 /* Below: For a model that was estimated using a data sample which
2883    differs from the current sample, reconstitute the dataset
2884    appropriate to the model.  If the model used a subsample, we use
2885    the special dummy variable (submask) saved with the model to select
2886    observations from the full dataset.  Another possibility is that
2887    the model used the full dataset, which has by now been subsampled.
2888 
2889    We attach this dataset to the model, so that it can be used for,
2890    e.g., residual or fitted versus actual plots.  The attached data
2891    set will be destroyed when the model itself is destroyed.
2892 
2893    By default we create a dataset containing series up to the
2894    highest numbered series associated with the model (regressand
2895    and regressors).  If OPT_F is given we include all series; if
2896    OPT_G is given we include only the constant.
2897 */
2898 
add_dataset_to_model(MODEL * pmod,const DATASET * dset,gretlopt opt)2899 int add_dataset_to_model (MODEL *pmod, const DATASET *dset,
2900 			  gretlopt opt)
2901 {
2902     const DATASET *srcset;
2903     char *mask = NULL;
2904     int maxv, sn = 0;
2905 
2906     if (gretl_is_between_model(pmod)) {
2907 	/* special case: we can't reconstruct the dataset here,
2908 	   but it may already be attached to @pmod.
2909 	*/
2910 	if (pmod->dataset != NULL) {
2911 	    return 0;
2912 	} else {
2913 	    return E_DATA;
2914 	}
2915     }
2916 
2917     if (pmod->dataset != NULL) {
2918 	/* FIXME? */
2919 	destroy_dataset(pmod->dataset);
2920 	pmod->dataset = NULL;
2921     }
2922 
2923 #if SUBDEBUG
2924     fprintf(stderr, "add_dataset_to_model: fullset=%p, pmod->submask=%p\n",
2925 	    (void *) fullset, (void *) pmod->submask);
2926 #endif
2927 
2928     if (fullset != NULL) {
2929 	sync_datainfo_members(dset);
2930 	srcset = fullset;
2931     } else {
2932 	srcset = dset;
2933     }
2934 
2935     if (pmod->submask == NULL) {
2936 	/* no subsample info: pmod was estimated on the full dataset,
2937 	   so we'll reconstruct the full dataset */
2938 	sn = srcset->n;
2939     } else {
2940 	/* pmod was estimated on a subsample, which has to
2941 	   be reconstructed */
2942 	int t;
2943 
2944 	mask = calloc(srcset->n, 1);
2945 	if (mask == NULL) {
2946 	    return E_ALLOC;
2947 	}
2948 	for (t=0; t<srcset->n; t++) {
2949 	    if (pmod->submask[t] > 0) {
2950 		mask[t] = 1;
2951 		sn++;
2952 	    }
2953 	}
2954 	if (sn == 0) {
2955 	    free(mask);
2956 	    return 1;
2957 	}
2958     }
2959 
2960     if (opt & OPT_F) {
2961 	maxv = srcset->v;
2962     } else if (opt & OPT_G) {
2963 	maxv = 1;
2964     } else {
2965 	maxv = highest_numbered_var_in_model(pmod, dset) + 1;
2966     }
2967 
2968     /* allocate auxiliary dataset */
2969     pmod->dataset = create_auxiliary_dataset(maxv, sn, 0);
2970     if (pmod->dataset == NULL) {
2971 	free(mask);
2972 	return E_ALLOC;
2973     }
2974 
2975 #if SUBDEBUG
2976     fprintf(stderr, "pmod->dataset allocated at %p\n",
2977 	    (void *) pmod->dataset);
2978 #endif
2979 
2980     /* copy across info on series */
2981     copy_series_info(pmod->dataset, srcset, maxv);
2982 
2983     /* copy across data */
2984     copy_data_to_subsample(pmod->dataset, srcset, maxv, mask);
2985 
2986     /* dataset characteristics such as pd: if we're rebuilding the
2987        full dataset copy these across; but if we're reconstructing a
2988        subsampled dataset these features from the full dataset will be
2989        wrong, and we stay with the simple defaults
2990     */
2991     if (pmod->submask == NULL) {
2992 	pmod->dataset->pd = srcset->pd;
2993 	pmod->dataset->sd0 = srcset->sd0;
2994 	strcpy(pmod->dataset->stobs, srcset->stobs);
2995 	strcpy(pmod->dataset->endobs, srcset->endobs);
2996 	pmod->dataset->structure = srcset->structure;
2997     }
2998 
2999     free(mask);
3000 
3001 #if SUBDEBUG
3002     fputs("add_subsampled_dataset_to_model: success\n", stderr);
3003 #endif
3004 
3005     return 0;
3006 }
3007 
submasks_match(const DATASET * dset,const MODEL * pmod)3008 static int submasks_match (const DATASET *dset, const MODEL *pmod)
3009 {
3010     char *s1 = dset->submask;
3011     char *s2 = pmod->submask;
3012 
3013     if (s1 == RESAMPLED && s2 == RESAMPLED) {
3014 	return dset->n == pmod->full_n &&
3015 	    dset->rseed == pmod->smpl.rseed;
3016     } else if (s1 == RESAMPLED || s2 == RESAMPLED) {
3017 	return 0;
3018     } else {
3019 	return submask_cmp(s1, s2) == 0;
3020     }
3021 }
3022 
3023 /* check the subsample mask from a model against @dset to
3024    see if it may have been estimated on a different
3025    (subsampled) data set from the current one
3026 */
3027 
model_sample_problem(const MODEL * pmod,const DATASET * dset)3028 int model_sample_problem (const MODEL *pmod, const DATASET *dset)
3029 {
3030     int ret = 1;
3031 
3032     if (pmod->ci == PANEL && (pmod->opt & OPT_B)) {
3033 	; /* special case: panel "between" model */
3034     } else if (pmod->submask == NULL) {
3035 	/* the model has no sub-sampling info recorded */
3036 	if (dset->submask == NULL) {
3037 	    /* data set is not sub-sampled either, OK */
3038 	    ret = 0;
3039 	} else {
3040 	    fputs("dataset is subsampled, model is not\n", stderr);
3041 	    gretl_errmsg_set(_("dataset is subsampled, model is not\n"));
3042 	    ret = 1;
3043 	}
3044     } else {
3045 	/* the model does have sub-sampling info recorded */
3046 	if (dset->submask == NULL) {
3047 	    fputs("model is subsampled, dataset is not\n", stderr);
3048 	    gretl_errmsg_set(_("model is subsampled, dataset is not\n"));
3049 	    ret = 1;
3050 	} else if (submasks_match(dset, pmod)) {
3051 	    /* the subsamples (model and current data set) agree, OK */
3052 	    ret = 0;
3053 	} else {
3054 	    /* the subsamples differ */
3055 	    gretl_errmsg_set(_("model and dataset subsamples not the same\n"));
3056 	    ret = 1;
3057 	}
3058     }
3059 
3060     return ret;
3061 }
3062 
fcast_not_feasible(const MODEL * pmod,const DATASET * dset)3063 int fcast_not_feasible (const MODEL *pmod, const DATASET *dset)
3064 {
3065     int ret = E_DATA;
3066 
3067     if (pmod->ci == PANEL && (pmod->opt & OPT_B)) {
3068 	; /* special case: panel "between" model */
3069     } else if (pmod->submask == NULL) {
3070 	/* @pmod was estimated on the full dataset */
3071 	if (dset->submask == NULL) {
3072 	    /* @dset not sub-sampled either, OK */
3073 	    ret = 0;
3074 	} else if (dataset_is_cross_section(dset)) {
3075 	    /* @dset is cross-sectional subset, OK? */
3076 	    ret = 0;
3077 	} else {
3078 	    gretl_errmsg_set(_("dataset is subsampled, model is not\n"));
3079 	}
3080     } else {
3081 	/* the model does have sub-sampling info recorded */
3082 	if (dset->submask == NULL) {
3083 	    /* the dataset is not sub-sampled */
3084 	    if (dataset_is_cross_section(dset)) {
3085 		ret = 0; /* OK? */
3086 	    } else if (is_panel_time_sample(pmod->submask, dset)) {
3087 		ret = 0; /* OK? */
3088 	    } else {
3089 		gretl_errmsg_set(_("model is subsampled, dataset is not\n"));
3090 	    }
3091 	} else if (submasks_match(dset, pmod)) {
3092 	    /* the subsamples (model and current data set) agree, OK */
3093 	    ret = 0;
3094 	} else {
3095 	    /* the subsamples differ */
3096 	    if (dataset_is_cross_section(fullset) &&
3097 		dataset_is_cross_section(dset)) {
3098 		ret = 0; /* OK ? */
3099 	    } else if (is_panel_time_sample(pmod->submask, fullset) &&
3100 		       is_panel_time_sample(dset->submask, fullset)) {
3101 		/* we should be able to make this work */
3102 		ret = 0; /* new 2019-03-23 */
3103 	    } else {
3104 		gretl_errmsg_set(_("model and dataset subsamples not the same\n"));
3105 	    }
3106 	}
3107     }
3108 
3109     return ret;
3110 }
3111 
same_dataset(const MODEL * pmod,const DATASET * dset)3112 int same_dataset (const MODEL *pmod, const DATASET *dset)
3113 {
3114     if (pmod->submask == NULL && dset->submask == NULL) {
3115 	return 1;
3116     } else if (pmod->submask != NULL && dset->submask != NULL) {
3117 	return submasks_match(dset, pmod);
3118     } else {
3119 	return 0;
3120     }
3121 }
3122 
dataset_type_string(char * str,const DATASET * dset)3123 static void dataset_type_string (char *str, const DATASET *dset)
3124 {
3125     if (dataset_is_time_series(dset)) {
3126 	strcpy(str, _("time series"));
3127     } else if (dataset_is_panel(dset)) {
3128         strcpy(str, _("panel"));
3129     } else {
3130         strcpy(str, _("undated"));
3131     }
3132 }
3133 
pd_string(char * str,const DATASET * dset)3134 static void pd_string (char *str, const DATASET *dset)
3135 {
3136     if (custom_time_series(dset)) {
3137 	strcpy(str, _("special"));
3138     } else {
3139 	switch (dset->pd) {
3140 	case 1:
3141 	    strcpy(str, _("annual")); break;
3142 	case 4:
3143 	    strcpy(str, _("quarterly")); break;
3144 	case 12:
3145 	    strcpy(str, _("monthly")); break;
3146 	case 24:
3147 	    strcpy(str, _("hourly")); break;
3148 	case 52:
3149 	    strcpy(str, _("weekly")); break;
3150 	case 5:
3151 	    strcpy(str, _("daily (5 days)")); break;
3152 	case 6:
3153 	    strcpy(str, _("daily (6 days)")); break;
3154 	case 7:
3155 	    strcpy(str, _("daily (7 days)")); break;
3156 	case 10:
3157 	    strcpy(str, _("decennial")); break;
3158 	default:
3159 	    strcpy(str, _("unknown")); break;
3160 	}
3161     }
3162 }
3163 
print_sample_obs(const DATASET * dset,PRN * prn)3164 void print_sample_obs (const DATASET *dset, PRN *prn)
3165 {
3166     char d1[OBSLEN], d2[OBSLEN];
3167 
3168     ntolabel(d1, dset->t1, dset);
3169     ntolabel(d2, dset->t2, dset);
3170 
3171     pprintf(prn, "%s: %s - %s", _("Current sample"), d1, d2);
3172     pprintf(prn, " (n = %d)\n", dset->t2 - dset->t1 + 1);
3173 }
3174 
print_sample_status(const DATASET * dset,PRN * prn)3175 void print_sample_status (const DATASET *dset, PRN *prn)
3176 {
3177     char tmp[128];
3178 
3179     if (complex_subsampled()) {
3180 	pprintf(prn, "%s\n", _("Full dataset"));
3181 	dataset_type_string(tmp, fullset);
3182 	pprintf(prn, "%s: %s\n", _("Type"), tmp);
3183 	if (dataset_is_time_series(fullset)) {
3184 	    pd_string(tmp, fullset);
3185 	    pprintf(prn, "%s: %s\n", _("Frequency"), tmp);
3186 	} else if (dataset_is_panel(fullset)) {
3187 	    int nu = fullset->n / fullset->pd;
3188 
3189 	    pprintf(prn, "%s: %d\n", _("Number of cross-sectional units"), nu);
3190 	    pprintf(prn, "%s: %d\n", _("Number of time periods"), fullset->pd);
3191 	}
3192 	pprintf(prn, "%s: %s - %s (n = %d)\n", _("Range"),
3193 		fullset->stobs, fullset->endobs, fullset->n);
3194 
3195 	pprintf(prn, "\n%s\n", _("Subsampled data"));
3196 	if (dset->restriction != NULL) {
3197 	    pprintf(prn, "%s: %s\n", _("restriction"), dset->restriction);
3198 	}
3199     } else {
3200 	pprintf(prn, "%s\n", _("Full dataset"));
3201     }
3202 
3203     dataset_type_string(tmp, dset);
3204     pprintf(prn, "%s: %s\n", _("Type"), tmp);
3205     if (dataset_is_time_series(dset)) {
3206 	pd_string(tmp, dset);
3207 	pprintf(prn, "%s: %s\n", _("Frequency"), tmp);
3208     } else if (dataset_is_panel(dset)) {
3209 	int nu = dset->n / dset->pd;
3210 
3211 	pprintf(prn, "%s: %d\n", _("Number of cross-sectional units"), nu);
3212 	pprintf(prn, "%s: %d\n", _("Number of time periods"), dset->pd);
3213     }
3214     if (dset->t1 == 0 && dset->t2 == dset->n - 1) {
3215 	pprintf(prn, "%s: %s - %s (n = %d)\n", _("Range"),
3216 		dset->stobs, dset->endobs, dset->n);
3217     } else {
3218 	pprintf(prn, "%s: %s - %s (n = %d)\n", _("Range"),
3219 		dset->stobs, dset->endobs, dset->n);
3220 	print_sample_obs(dset, prn);
3221 	if (dset->restriction != NULL) {
3222 	    pprintf(prn, "(%s: %s)\n", _("restriction"), dset->restriction);
3223 	}
3224     }
3225 }
3226 
3227 /**
3228  * data_report:
3229  * @dset: data information struct.
3230  * @fname: filename for current datafile.
3231  * @prn: gretl printing struct.
3232  *
3233  * Write out a summary of the content of the current data set.
3234  *
3235  * Returns: 0 on successful completion, non-zero on error.
3236  *
3237  */
3238 
data_report(const DATASET * dset,const char * fname,PRN * prn)3239 int data_report (const DATASET *dset, const char *fname, PRN *prn)
3240 {
3241     char startdate[OBSLEN], enddate[OBSLEN], tmp[MAXLEN];
3242     char tstr[48];
3243     const char *vlabel;
3244     int i;
3245 
3246     ntolabel(startdate, 0, dset);
3247     ntolabel(enddate, dset->n - 1, dset);
3248 
3249     sprintf(tmp, _("Data file %s\nas of"),
3250 	    (*fname != '\0')? fname : _("(unsaved)"));
3251 
3252     print_time(tstr);
3253     pprintf(prn, "%s %s\n\n", tmp, tstr);
3254 
3255     if (dset->descrip != NULL && *dset->descrip != '\0') {
3256 	pprintf(prn, "%s:\n\n", _("Description"));
3257 	pputs(prn, dset->descrip);
3258 	pputs(prn, "\n\n");
3259     }
3260 
3261     dataset_type_string(tmp, dset);
3262     pprintf(prn, "%s: %s\n", _("Type of data"), tmp);
3263 
3264     if (dataset_is_time_series(dset)) {
3265 	pd_string(tmp, dset);
3266 	pprintf(prn, "%s: %s\n", _("Frequency"), tmp);
3267     }
3268 
3269     pprintf(prn, "%s: %s - %s (n = %d)\n\n", _("Range"),
3270 	    startdate, enddate, dset->n);
3271 
3272     pprintf(prn, "%s:\n\n", _("Listing of variables"));
3273 
3274     for (i=1; i<dset->v; i++) {
3275 	vlabel = series_get_label(dset, i);
3276 	pprintf(prn, "%*s  %s\n", VNAMELEN, dset->varname[i],
3277 		vlabel == NULL ? "" : vlabel);
3278     }
3279 
3280     return 0;
3281 }
3282