1 /*
2 * gretl -- Gnu Regression, Econometrics and Time-series Library
3 * Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 */
19
20 /* subsample.c for gretl */
21
22 #define FULL_XML_HEADERS
23
24 #include "libgretl.h"
25 #include "libset.h"
26 #include "gretl_func.h"
27 #include "gretl_panel.h"
28 #include "cmd_private.h"
29 #include "dbread.h"
30 #include "uservar.h"
31 #include "gretl_xml.h"
32
33 #define SUBDEBUG 0
34 #define FULLDEBUG 0
35 #define PANDEBUG 0
36
37 typedef enum {
38 SUBSAMPLE_NONE,
39 SUBSAMPLE_DROP_MISSING,
40 SUBSAMPLE_DROP_EMPTY,
41 SUBSAMPLE_DROP_WKENDS,
42 SUBSAMPLE_USE_DUMMY,
43 SUBSAMPLE_BOOLEAN,
44 SUBSAMPLE_RANDOM,
45 SUBSAMPLE_UNKNOWN
46 } SubsampleMode;
47
48 #define RECODE_ON_PERMA 1 /* experiment */
49
50 #if RECODE_ON_PERMA
51
recode_strvals(DATASET * dset,gretlopt opt)52 static int recode_strvals (DATASET *dset, gretlopt opt)
53 {
54 int i, err = 0;
55
56 for (i=1; i<dset->v && !err; i++) {
57 if (is_string_valued(dset, i)) {
58 err = series_recode_strings(dset, i, opt, NULL);
59 }
60 }
61
62 return err;
63 }
64
65 #endif
66
67 /*
68 The purpose of the static pointers below: When the user subsamples
69 the current dataset in a non-trivial way -- i.e., by selecting cases
70 rather than just moving the starting or ending points of the data
71 range -- we create a new sub-dataset, and we need to keep the full
72 dataset around so that it can be restored later. The pointer
73 @fullset is used to record the address of the full dataset.
74
75 In addition, @peerset keeps track of the location of the DATASET
76 struct associated with the backed-up full dataset; by means of this,
77 we can know when to free the full dataset and when not to (for
78 instance, if we're freeing an auxiliary dataset).
79 */
80
81 static DATASET *fullset;
82 static DATASET *peerset;
83
84 #define SUBMASK_SENTINEL 127
85
86 static int smpl_get_int (const char *s, DATASET *dset, int *err);
87
88 /* accessors for full dataset, when sub-sampled */
89
fetch_full_dataset(void)90 DATASET *fetch_full_dataset (void)
91 {
92 return fullset;
93 }
94
get_submask_length(const char * s)95 static int get_submask_length (const char *s)
96 {
97 int n = 1;
98
99 if (s == NULL || s == RESAMPLED) {
100 n = 0;
101 } else {
102 while (*s != SUBMASK_SENTINEL) {
103 n++;
104 s++;
105 }
106 }
107
108 return n;
109 }
110
get_dataset_submask_size(const DATASET * dset)111 int get_dataset_submask_size (const DATASET *dset)
112 {
113 int n = 0;
114
115 if (dset != NULL) {
116 n = get_submask_length(dset->submask);
117 if (n > 0) n--; /* omit the sentinel */
118 }
119
120 return n;
121 }
122
get_model_submask_size(const MODEL * pmod)123 int get_model_submask_size (const MODEL *pmod)
124 {
125 int n = 0;
126
127 if (pmod != NULL) {
128 n = get_submask_length(pmod->submask);
129 if (n > 0) n--; /* omit the sentinel */
130 }
131
132 return n;
133 }
134
free_subsample_mask(char * s)135 void free_subsample_mask (char *s)
136 {
137 if (s != RESAMPLED && s != NULL) {
138 free(s);
139 }
140 }
141
copy_subsample_mask(const char * src,int * err)142 char *copy_subsample_mask (const char *src, int *err)
143 {
144 char *ret = NULL;
145
146 if (src == RESAMPLED) {
147 ret = RESAMPLED;
148 } else if (src != NULL) {
149 int n = get_submask_length(src);
150
151 ret = malloc(n * sizeof *ret);
152 if (ret != NULL) {
153 memcpy(ret, src, n);
154 } else {
155 *err = E_ALLOC;
156 }
157 }
158
159 return ret;
160 }
161
copy_dataset_submask(const DATASET * dset,int * err)162 char *copy_dataset_submask (const DATASET *dset, int *err)
163 {
164 char *mask = NULL;
165
166 if (complex_subsampled()) {
167 mask = copy_subsample_mask(dset->submask, err);
168 }
169
170 return mask;
171 }
172
write_model_submask(const MODEL * pmod,PRN * prn)173 int write_model_submask (const MODEL *pmod, PRN *prn)
174 {
175 int ret = 0;
176
177 if (pmod->submask == RESAMPLED) {
178 pputs(prn, "<submask length=\"0\"></submask>\n");
179 ret = 1;
180 } else if (pmod->submask != NULL) {
181 int i, n = get_submask_length(pmod->submask);
182
183 pprintf(prn, "<submask length=\"%d\">", n);
184 for (i=0; i<n; i++) {
185 pprintf(prn, "%d ", (int) pmod->submask[i]);
186 }
187 pputs(prn, "</submask>\n");
188 ret = 1;
189 }
190
191 return ret;
192 }
193
write_dataset_submask(const DATASET * dset,PRN * prn)194 int write_dataset_submask (const DATASET *dset, PRN *prn)
195 {
196 int ret = 0;
197
198 if (dset->submask == RESAMPLED) {
199 unsigned int seed = get_resampling_seed();
200
201 pprintf(prn, "<resample seed=\"%u\" n=\"%d\"/>\n", seed, dset->n);
202 ret = 1;
203 } else if (complex_subsampled()) {
204 int i, n = get_submask_length(dset->submask);
205
206 pprintf(prn, "<submask length=\"%d\">", n);
207 for (i=0; i<n; i++) {
208 pprintf(prn, "%d ", (int) dset->submask[i]);
209 }
210 pputs(prn, "</submask>\n");
211
212 if (dset->restriction != NULL) {
213 gretl_xml_put_tagged_string("restriction", dset->restriction, prn);
214 }
215
216 ret = 1;
217 }
218
219 return ret;
220 }
221
submask_cmp(const char * m1,const char * m2)222 int submask_cmp (const char *m1, const char *m2)
223 {
224 if (m1 == NULL && m2 == NULL) {
225 return 0;
226 } else if (m1 == NULL || m2 == NULL) {
227 return 1;
228 }
229
230 if (m1 == RESAMPLED || m2 == RESAMPLED) {
231 return m1 != RESAMPLED || m2 != RESAMPLED;
232 }
233
234 while (*m1 != SUBMASK_SENTINEL && *m2 != SUBMASK_SENTINEL) {
235 if (*m1 != *m2) {
236 return 1;
237 }
238 m1++;
239 m2++;
240 }
241
242 return 0;
243 }
244
245 /* All values apart from the sentinel are initialized to zero; once
246 the mask is used, 1s will indicate included observations and
247 0s excluded observations.
248 */
249
make_submask(int n)250 static char *make_submask (int n)
251 {
252 char *mask = calloc(n + 1, 1);
253
254 if (mask != NULL) {
255 mask[n] = SUBMASK_SENTINEL;
256 }
257
258 return mask;
259 }
260
261 #if 0
262
263 static void debug_print_submask (char *mask, char *msg)
264 {
265 if (mask == NULL) {
266 fprintf(stderr, "%s: NULL\n", msg);
267 } else if (mask == RESAMPLED) {
268 fprintf(stderr, "%s: RESAMPLED\n", msg);
269 } else {
270 char *s = mask;
271
272 fprintf(stderr, "%s: ", msg);
273
274 while (*s != SUBMASK_SENTINEL) {
275 if (*s == 0) {
276 fputc('0', stderr);
277 } else if (*s == 1) {
278 fputc('1', stderr);
279 } else {
280 fputc('?', stderr);
281 }
282 s++;
283 }
284 fputc('\n', stderr);
285 }
286 }
287
288 #endif
289
set_dataset_resampled(DATASET * dset,unsigned int seed)290 void set_dataset_resampled (DATASET *dset, unsigned int seed)
291 {
292 dset->submask = RESAMPLED;
293 dset->rseed = seed;
294 }
295
dataset_is_resampled(const DATASET * dset)296 int dataset_is_resampled (const DATASET *dset)
297 {
298 return (dset != NULL && dset->submask == RESAMPLED);
299 }
300
maybe_free_full_dataset(const DATASET * dset)301 void maybe_free_full_dataset (const DATASET *dset)
302 {
303 if (dset == peerset) {
304 if (fullset != NULL) {
305 #if SUBDEBUG
306 fprintf(stderr, "maybe_free_full_dataset: freeing fullset at %p (Z at %p)\n",
307 (void *) fullset, (void *) fullset->Z);
308 #endif
309 if (fullset->Z != NULL) {
310 free_Z(fullset);
311 }
312 clear_datainfo(fullset, CLEAR_SUBSAMPLE);
313 free(fullset);
314 fullset = NULL;
315 }
316 peerset = NULL;
317 }
318 }
319
320 /* we do this on "restore full sample" */
321
relink_to_full_dataset(DATASET * dset)322 static void relink_to_full_dataset (DATASET *dset)
323 {
324 #if SUBDEBUG
325 fprintf(stderr, "relink_to_full_dataset: fullset = %p (freeing and nulling)\n",
326 (void *) fullset);
327 fprintf(stderr, "fullset: v = %d, n = %d\n", fullset->v, fullset->n);
328 #endif
329
330 *dset = *fullset;
331 free(fullset);
332 fullset = NULL;
333 peerset = NULL;
334 }
335
sync_dataset_shared_members(const DATASET * dset)336 void sync_dataset_shared_members (const DATASET *dset)
337 {
338 if (fullset != NULL && dset != fullset) {
339 fullset->varname = dset->varname;
340 fullset->varinfo = dset->varinfo;
341 }
342 }
343
344 /* sync malloced elements of the fullset struct that might
345 have been moved via realloc
346 */
347
sync_datainfo_members(const DATASET * dset)348 static void sync_datainfo_members (const DATASET *dset)
349 {
350 if (fullset->v > dset->v) {
351 int i;
352
353 #if FULLDEBUG
354 fprintf(stderr, "*** sync_datainfo_members: fullset->v = %d but dset->v = %d\n",
355 fullset->v, dset->v);
356 fprintf(stderr, " deleting the last %d element(s) of fullZ\n",
357 fullset->v - dset->v);
358 #endif
359 for (i=dset->v; i<fullset->v; i++) {
360 free(fullset->Z[i]);
361 fullset->Z[i] = NULL;
362 }
363 fullset->v = dset->v;
364 }
365
366 fullset->varname = dset->varname;
367 fullset->varinfo = dset->varinfo;
368 fullset->descrip = dset->descrip;
369 }
370
371 /* attach_subsample_to_model:
372 * @pmod: model to which subsample should be attached.
373 * @dset: pointer to current dataset info.
374 *
375 * If the dataset is currently subsampled, record the subsample
376 * information with the model so that it can be retrieved later.
377 *
378 * Returns: 0 if the recording is not needed, or on success; non-zero
379 * error code failure.
380 */
381
attach_subsample_to_model(MODEL * pmod,const DATASET * dset)382 int attach_subsample_to_model (MODEL *pmod, const DATASET *dset)
383 {
384 int err = 0;
385
386 #if SUBDEBUG
387 fprintf(stderr, "attach_subsample_to_model: fullset = %p\n",
388 (void *) fullset);
389 #endif
390
391 if (fullset != NULL) {
392 /* sync, in case anything has moved */
393 sync_datainfo_members(dset);
394
395 if (pmod->submask != NULL) {
396 free_subsample_mask(pmod->submask);
397 }
398
399 pmod->submask = copy_subsample_mask(dset->submask, &err);
400 }
401
402 return err;
403 }
404
405 #define MDEBUG 0
406
407 /* This is called from objstack.c for each saved model, to
408 determine whether a proposed permanent shrinkage of the
409 dataset will invalidate the model.
410 */
411
subsample_check_model(MODEL * pmod,char * mask)412 int subsample_check_model (MODEL *pmod, char *mask)
413 {
414 if (submask_cmp(pmod->submask, mask)) {
415 return E_DATA;
416 } else {
417 return 0;
418 }
419 }
420
421 /* Called from objstack.c for each saved model, when the
422 user calls for the imposition of a permanent subsample
423 restriction on the dataset. We've already checked that
424 the "new" subsample matches that on which model was
425 estimated, so all we have to do now is sync by removing
426 the model's subsample mask.
427 */
428
remove_model_subsample_info(MODEL * pmod)429 int remove_model_subsample_info (MODEL *pmod)
430 {
431 free_subsample_mask(pmod->submask);
432 pmod->submask = NULL;
433
434 return 0;
435 }
436
437 /* check for at least two observations, all contiguous */
438
ts_contig(const char * ts,int T)439 static int ts_contig (const char *ts, int T)
440 {
441 int contig = 1;
442 int stopped = 0;
443 int t, n = 0;
444
445 for (t=0; t<T && contig; t++) {
446 if (ts[t]) {
447 if (stopped) {
448 contig = 0;
449 } else {
450 n++;
451 }
452 } else if (n > 0) {
453 stopped = 1;
454 }
455 }
456
457 return n >= 2 && contig;
458 }
459
460 /* For forecasting purposes, check to see if a subsample
461 mask represents a sub-sampling of a panel dataset in
462 the time dimension only.
463 */
464
is_panel_time_sample(const char * mask,const DATASET * dset)465 static int is_panel_time_sample (const char *mask,
466 const DATASET *dset)
467 {
468 int i, t, s, N, T, ret = 1;
469 char *ts = NULL;
470
471 if (dset == NULL || !dataset_is_panel(dset)) {
472 return 0;
473 } else if (get_submask_length(mask) != dset->n + 1) {
474 return 0;
475 }
476
477 T = dset->pd;
478 N = dset->n / T;
479 ts = calloc(T, 1);
480
481 s = 0;
482 for (i=0; i<N && ret; i++) {
483 for (t=0; t<T && ret; t++) {
484 if (i == 0) {
485 /* first unit sets the pattern */
486 if (mask[s]) {
487 ts[t] = 1;
488 }
489 } else if ((mask[s] && !ts[t]) || (ts[t] && !mask[s])) {
490 /* subsequent units must match */
491 ret = 0;
492 }
493 s++;
494 }
495 if (i == 0 && !ts_contig(ts, T)) {
496 ret = 0;
497 }
498 }
499
500 free(ts);
501
502 return ret;
503 }
504
505 /* If series have been added to a resampled dataset, we can't
506 bring these back to the "full" dataset, which may have a
507 longer or shorter series length, and from which there is
508 no definite mapping by row. So we just delete them. In
509 this function we destroy their varnames and varinfo
510 structures; the numerical arrays get deleted later.
511 */
512
resample_sync_dataset(DATASET * dset)513 static int resample_sync_dataset (DATASET *dset)
514 {
515 int err = 0;
516
517 if (dset->v > fullset->v) {
518 err = shrink_varinfo(dset, fullset->v);
519 }
520
521 /* sync */
522 fullset->varname = dset->varname;
523 fullset->varinfo = dset->varinfo;
524 fullset->descrip = dset->descrip;
525
526 return err;
527 }
528
529 /* Apparatus for updating full dataset when restoring full sample
530 after sub-sampling.
531 */
532
533 static void
update_full_data_values(const DATASET * dset)534 update_full_data_values (const DATASET *dset)
535 {
536 int i, s, t;
537
538 #if SUBDEBUG
539 fprintf(stderr, "update_full_data_values: fullset->Z=%p, dset->Z=%p, dset=%p\n",
540 (void *) fullset->Z, (void *) dset->Z, (void *) dset);
541 #endif
542
543 for (i=1; i<fullset->v && i<dset->v; i++) {
544 s = 0;
545 for (t=0; t<fullset->n; t++) {
546 if (dset->submask[t] == 1) {
547 fullset->Z[i][t] = dset->Z[i][s++];
548 } else if (dset->submask[t] == 'p') {
549 /* skip panel padding (?) */
550 s++;
551 }
552 }
553 }
554 }
555
update_case_markers(const DATASET * dset)556 static int update_case_markers (const DATASET *dset)
557 {
558 int err = 0;
559
560 if (fullset->markers == DAILY_DATE_STRINGS) {
561 ; /* don't mess with them */
562 } else if (dated_daily_data(fullset)) {
563 ; /* again, don't mess! */
564 } else if (dset->markers && !fullset->markers) {
565 dataset_allocate_obs_markers(fullset);
566 if (fullset->S == NULL) {
567 err = 1;
568 } else {
569 int t, subt = 0;
570
571 for (t=0; t<fullset->n; t++) {
572 if (dset->submask[t]) {
573 strcpy(fullset->S[t], dset->S[subt++]);
574 } else {
575 sprintf(fullset->S[t], "%d", t + 1);
576 }
577 }
578 }
579 }
580
581 return err;
582 }
583
add_new_vars_to_full(DATASET * dset)584 static int add_new_vars_to_full (DATASET *dset)
585 {
586 int V1 = dset->v;
587 int V0 = fullset->v;
588 int N = fullset->n;
589 double **newZ = NULL;
590 int i, t, s;
591 int err = 0;
592
593 if (V1 <= V0) {
594 return 0;
595 }
596
597 if (dset->submask == NULL) {
598 return E_NOMERGE;
599 }
600
601 #if SUBDEBUG
602 fprintf(stderr, "add_new_vars_to_full:\n");
603 fprintf(stderr, " V1 = dset->v = %d; V0 = fullset->v = %d\n",
604 V1, V0);
605 fprintf(stderr, " dset->Z = %p, fullset->Z = %p\n", (void *) dset->Z,
606 (void *) fullset->Z);
607 #endif
608
609 /* allocate expanded data array */
610 newZ = realloc(fullset->Z, V1 * sizeof *fullset->Z);
611
612 if (newZ == NULL) {
613 return E_ALLOC;
614 }
615
616 fullset->Z = newZ;
617
618 for (i=V0; i<dset->v && !err; i++) {
619 #if FULLDEBUG
620 fprintf(stderr, "adding to full: var %d (%s, level %d)\n",
621 i, dset->varname[i], series_get_stack_level(dset, i));
622 #endif
623 fullset->Z[i] = malloc(N * sizeof **fullset->Z);
624 if (fullset->Z[i] == NULL) {
625 err = E_ALLOC;
626 }
627 }
628
629 if (err) {
630 return E_ALLOC;
631 }
632
633 #if SUBDEBUG
634 fprintf(stderr, "After expansion, fullset->Z = %p (%d vars)\n",
635 (void *) fullset->Z, dset->v);
636 #endif
637
638 for (i=V0; i<dset->v; i++) {
639 s = 0;
640 for (t=0; t<N; t++) {
641 fullset->Z[i][t] = (dset->submask[t])?
642 dset->Z[i][s++] : NADBL;
643 }
644 }
645
646 fullset->v = V1;
647
648 return 0;
649 }
650
sync_data_to_full(DATASET * dset)651 static int sync_data_to_full (DATASET *dset)
652 {
653 int err;
654
655 /* update values for pre-existing series, which may have been
656 modified via "genr" etc. */
657 update_full_data_values(dset);
658
659 /* if case markers were added when subsampled, carry them back */
660 update_case_markers(dset);
661
662 /* delete any newly added hidden vars */
663 err = dataset_destroy_hidden_variables(dset, fullset->v);
664
665 /* in case any new vars were added when subsampled, try to merge
666 them into the full dataset */
667 if (!err) {
668 err = add_new_vars_to_full(dset);
669 }
670
671 return err;
672 }
673
674 /* Here we make a mask representing the "complete" sample
675 restriction currently in force, for use in cumulating
676 restrictions. "Complete" means that we take into account
677 both an existing subsampling restriction, if any, and
678 possible setting of the t1 and t2 members of @dset to
679 exclude certain observation ranges.
680
681 The mask returned, if non-NULL, will be the length of
682 the full dataset. It will be NULL (without error) if
683 the current dataset is neither subsampled nor range-
684 restricted.
685 */
686
make_current_sample_mask(DATASET * dset,int * err)687 static char *make_current_sample_mask (DATASET *dset, int *err)
688 {
689 char *currmask = NULL;
690 int range_set;
691 int s, t;
692
693 range_set = (dset->t1 > 0 || dset->t2 < dset->n - 1);
694
695 if (dset->submask == NULL) {
696 /* no pre-existing mask, so not subsampled, but
697 we should restrict currmask to observations
698 included in the current (t1, t2) range, if
699 applicable
700 */
701 if (range_set) {
702 currmask = make_submask(dset->n);
703 if (currmask == NULL) {
704 *err = E_ALLOC;
705 } else {
706 for (t=dset->t1; t<=dset->t2; t++) {
707 currmask[t] = 1;
708 }
709 }
710 }
711 } else {
712 /* there's a pre-existing mask: in addition we
713 mask out observations outside of the current
714 (t1, t2) range, if applicable
715 */
716 currmask = copy_subsample_mask(dset->submask, err);
717 if (!*err && range_set) {
718 s = -1;
719 for (t=0; t<fullset->n; t++) {
720 if (dset->submask[t]) s++;
721 if (s < dset->t1 || s > dset->t2) {
722 currmask[t] = 0;
723 }
724 }
725 }
726 }
727
728 return currmask;
729 }
730
731 /* Deal with the case where sampling has been done simply by
732 moving the sample-range endpoints
733 */
734
restore_full_easy(DATASET * dset,ExecState * state)735 static int restore_full_easy (DATASET *dset, ExecState *state)
736 {
737 int t1min, t2max;
738
739 if (state == NULL) {
740 /* not inside a function */
741 t1min = 0;
742 t2max = dset->n - 1;
743 dataset_clear_sample_record(dset);
744 } else {
745 /* don't go outside the bounds set on entry to
746 a function */
747 sample_range_get_extrema(dset, &t1min, &t2max);
748 }
749
750 if (dset->t1 != t1min || dset->t2 != t2max) {
751 dset->t1 = t1min;
752 dset->t2 = t2max;
753 #if PANDEBUG || SUBDEBUG
754 fprintf(stderr, "restore_full_sample: just set t1=%d and t2=%d\n",
755 t1min, t2max);
756 #endif
757 }
758
759 return 0;
760 }
761
762 /* restore_full_sample:
763 * @dset: dataset struct.
764 * @state: structure representing program execution state,
765 * or %NULL.
766 *
767 * Restore the full data range, undoing any sub-sampling that was
768 * previously performed (either by shifting the endpoints of the
769 * sample range or by selecting cases on some criterion or other).
770 * However, if @state is not %NULL, and if it contains a submask,
771 * the "full" range is relative to that mask.
772 *
773 * Returns: 0 on success, non-zero error code on failure.
774 */
775
restore_full_sample(DATASET * dset,ExecState * state)776 int restore_full_sample (DATASET *dset, ExecState *state)
777 {
778 int err = 0;
779
780 if (dset == NULL) {
781 return E_NODATA;
782 } else if (!complex_subsampled()) {
783 return restore_full_easy(dset, state);
784 }
785
786 if (dset != peerset) {
787 fprintf(stderr, "restore_full_sample: dset is not peerset!\n");
788 return E_DATA;
789 }
790
791 #if FULLDEBUG || SUBDEBUG
792 fprintf(stderr, "\nrestore_full_sample: dset=%p, state=%p, fullset=%p\n",
793 (void *) dset, (void *) state, (void *) fullset);
794 #endif
795
796 /* Beyond this point we are doing a non-trivial restoration
797 of a stored "full" dataset which has previously been
798 subsampled, e.g., by some boolean criterion.
799 */
800
801 if (dset->submask == RESAMPLED) {
802 err = resample_sync_dataset(dset);
803 } else {
804 if (dset->padmask != NULL) {
805 fprintf(stderr, "restore_full_sample: first undo panel padding\n");
806 err = undo_panel_padding(dset);
807 }
808 if (!err) {
809 sync_datainfo_members(dset);
810 err = sync_data_to_full(dset);
811 }
812 }
813
814 if (err == E_NOMERGE) {
815 gretl_errmsg_set(_("Missing sub-sample information; can't merge data\n"));
816 }
817
818 if (err) {
819 return err;
820 }
821
822 /* destroy sub-sampled data array */
823 #if PANDEBUG || SUBDEBUG
824 fprintf(stderr, "restore_full_sample: freeing sub-sampled Z at %p (v = %d, n = %d)\n"
825 " and clearing dset at %p\n", (void *) dset->Z, dset->v, dset->n,
826 (void *) dset);
827 #endif
828 free_Z(dset);
829 clear_datainfo(dset, CLEAR_SUBSAMPLE);
830
831 relink_to_full_dataset(dset);
832
833 if (state != NULL) {
834 /* in this case restoring the "full" sample really means, relative
835 to the original state
836 */
837 if (state->submask != NULL) {
838 err = restrict_sample_from_mask(state->submask, dset, OPT_NONE);
839 } else {
840 int t1min, t2max;
841
842 sample_range_get_extrema(dset, &t1min, &t2max);
843 if (dset->t1 < t1min) {
844 dset->t1 = t1min;
845 }
846 if (dset->t2 > t2max) {
847 dset->t2 = t2max;
848 }
849 }
850 }
851
852 return err;
853 }
854
overlay_masks(char * targ,const char * src,int n)855 static int overlay_masks (char *targ, const char *src, int n)
856 {
857 int i, sn = 0;
858
859 for (i=0; i<n; i++) {
860 if (targ[i] == 1 && src[i] == 1) {
861 targ[i] = 1;
862 sn++;
863 } else {
864 targ[i] = 0;
865 }
866 }
867
868 return sn;
869 }
870
make_weekday_mask(const DATASET * dset,char * mask)871 static int make_weekday_mask (const DATASET *dset, char *mask)
872 {
873 if (!dated_daily_data(dset) ||
874 dset->markers != 0 || dset->pd < 6) {
875 return E_PDWRONG;
876 } else {
877 char datestr[OBSLEN];
878 int t, wd;
879
880 for (t=0; t<dset->n; t++) {
881 ntolabel(datestr, t, dset);
882 wd = weekday_from_date(datestr);
883 mask[t] = (wd >= 1 && wd <= 5);
884 }
885
886 return 0;
887 }
888 }
889
890 /* write into @mask: 0 for observations at which _all_ variables
891 (or all variables in @list, if @list is non-NULL) have missing
892 values, 1 for all other observations (that have at least one
893 non-NA). (Refinement: if no list is given, we ignore the generic
894 variables "time" and "index".)
895 */
896
897 static int
make_empty_mask(const int * list,const DATASET * dset,char * mask)898 make_empty_mask (const int *list, const DATASET *dset, char *mask)
899 {
900 int i, vi, t, vt;
901
902 if (list != NULL && list[0] > 0) {
903 /* check specified list of variables */
904 for (t=0; t<dset->n; t++) {
905 mask[t] = 0;
906 for (i=1; i<=list[0]; i++) {
907 vi = list[i];
908 if (!na(dset->Z[vi][t])) {
909 mask[t] = 1;
910 break;
911 }
912 }
913 }
914 } else {
915 /* check (almost) all variables */
916 vt = current_series_index(dset, "time");
917 vi = current_series_index(dset, "index");
918
919 for (t=0; t<dset->n; t++) {
920 mask[t] = 0;
921 for (i=1; i<dset->v; i++) {
922 if (!series_is_hidden(dset, i) &&
923 i != vt && i != vi &&
924 !na(dset->Z[i][t])) {
925 mask[t] = 1;
926 break;
927 }
928 }
929 }
930 }
931
932 return 0;
933 }
934
935 /* write into @mask: 0 for observations at which _any_ variable
936 (or any variable in @list, if @list is non-NULL) has a missing
937 value, 1 for all other observations.
938 */
939
940 static int
make_missing_mask(const int * list,const DATASET * dset,char * mask)941 make_missing_mask (const int *list, const DATASET *dset, char *mask)
942 {
943 int i, vi, t;
944
945 if (list != NULL && list[0] > 0) {
946 /* check specified list of variables */
947 for (t=0; t<dset->n; t++) {
948 mask[t] = 1;
949 for (i=1; i<=list[0]; i++) {
950 vi = list[i];
951 if (na(dset->Z[vi][t])) {
952 mask[t] = 0;
953 break;
954 }
955 }
956 }
957 } else {
958 /* check all variables */
959 for (t=0; t<dset->n; t++) {
960 mask[t] = 1;
961 for (i=1; i<dset->v; i++) {
962 if (!series_is_hidden(dset, i) && na(dset->Z[i][t])) {
963 mask[t] = 0;
964 break;
965 }
966 }
967 }
968 }
969
970 return 0;
971 }
972
copy_dummy_to_mask(char * mask,const double * x,int n)973 static int copy_dummy_to_mask (char *mask, const double *x, int n)
974 {
975 int t, nsel = 0, err = 0;
976
977 #if SUBDEBUG
978 fprintf(stderr, "copy_dummy_to_mask, n = %d\n", n);
979 #endif
980
981 for (t=0; t<n && !err; t++) {
982 if (x[t] == 1.0) {
983 mask[t] = 1;
984 nsel++;
985 } else if (!na(x[t]) && x[t] != 0.0) { /* NA? */
986 err = 1;
987 }
988 }
989
990 if (!err && nsel == 0) {
991 err = E_ZERO;
992 }
993
994 return err;
995 }
996
mask_from_temp_dummy(const char * s,DATASET * dset,char * mask,PRN * prn)997 static int mask_from_temp_dummy (const char *s, DATASET *dset,
998 char *mask, PRN *prn)
999 {
1000 char formula[MAXLINE];
1001 double *x;
1002 int err = 0;
1003
1004 *formula = '\0';
1005 strncat(formula, s, MAXLINE - 1);
1006
1007 x = generate_series(formula, dset, prn, &err);
1008
1009 #if SUBDEBUG
1010 fprintf(stderr, "mask_from_temp_dummy: formula='%s', err=%d\n",
1011 formula, err);
1012 #endif
1013
1014 if (!err) {
1015 err = copy_dummy_to_mask(mask, x, dset->n);
1016 if (err == E_ZERO) {
1017 gretl_errmsg_set(_("No observations would be left!"));
1018 } else if (err) {
1019 gretl_errmsg_sprintf(_("'%s' is not a dummy variable"), "mask");
1020 }
1021 }
1022
1023 free(x);
1024
1025 return err;
1026 }
1027
mask_from_dummy(const char * s,const DATASET * dset,char * mask)1028 static int mask_from_dummy (const char *s, const DATASET *dset,
1029 char *mask)
1030 {
1031 double *x = NULL;
1032 char dname[VNAMELEN] = {0};
1033 int free_x = 0;
1034 int err = 0;
1035
1036 if (!strcmp(s, "$sample")) {
1037 /* try fetching the sample series from the last model */
1038 GretlObjType type = 0;
1039 void *ptr = get_last_model(&type);
1040
1041 if (ptr == NULL || type != GRETL_OBJ_EQN) {
1042 err = E_DATA;
1043 } else {
1044 strcpy(dname, s);
1045 x = malloc(dset->n * sizeof *x);
1046 if (x == NULL) {
1047 err = E_ALLOC;
1048 } else {
1049 free_x = 1;
1050 err = gretl_model_get_series(x, ptr, dset, M_SAMPLE);
1051 }
1052 }
1053 } else {
1054 int dnum;
1055
1056 gretl_scan_varname(s, dname);
1057 dnum = series_index(dset, dname);
1058 if (dnum == dset->v) {
1059 gretl_errmsg_sprintf(_("Variable '%s' not defined"), dname);
1060 err = E_DATA;
1061 } else {
1062 x = dset->Z[dnum];
1063 }
1064 }
1065
1066 if (!err) {
1067 err = copy_dummy_to_mask(mask, x, dset->n);
1068 if (err) {
1069 gretl_errmsg_sprintf(_("'%s' is not a dummy variable"), dname);
1070 }
1071 }
1072
1073 if (free_x) {
1074 free(x);
1075 }
1076
1077 return err;
1078 }
1079
1080 /* how many observations are selected by the given
1081 subsample mask? */
1082
1083 static int
count_selected_cases(const char * mask,const DATASET * dset)1084 count_selected_cases (const char *mask, const DATASET *dset)
1085 {
1086 int i, n = 0;
1087
1088 for (i=0; i<dset->n; i++) {
1089 if (mask[i]) {
1090 n++;
1091 }
1092 }
1093
1094 return n;
1095 }
1096
1097 /* panel: how many distinct cross-sectional units are included
1098 in the masked subset of observations? */
1099
1100 static int
count_panel_units(const char * mask,const DATASET * dset)1101 count_panel_units (const char *mask, const DATASET *dset)
1102 {
1103 int u, ubak = -1;
1104 int i, n = 0;
1105
1106 for (i=0; i<dset->n; i++) {
1107 if (mask[i]) {
1108 u = i / dset->pd;
1109 if (u != ubak) {
1110 n++;
1111 ubak = u;
1112 }
1113 }
1114 }
1115
1116 #if SUBDEBUG
1117 fprintf(stderr, "count_panel_units: got n = %d\n", n);
1118 #endif
1119
1120 return n;
1121 }
1122
1123 /* construct mask for taking random sub-sample from dataset:
1124 we're selecting @subn cases without replacement, and
1125 there may or may not be an existing mask in place that
1126 has to be respected.
1127 */
1128
make_random_mask(const char * s,const char * oldmask,DATASET * dset,char * mask)1129 static int make_random_mask (const char *s, const char *oldmask,
1130 DATASET *dset, char *mask)
1131 {
1132 unsigned u;
1133 int targ, avail, oldn = dset->n;
1134 int i, subn, cases, rejn;
1135 int err = 0;
1136
1137 /* how many cases are requested? */
1138 subn = smpl_get_int(s, dset, &err);
1139
1140 if (subn <= 0 || subn >= dset->n) {
1141 err = 1;
1142 } else if (oldmask != NULL) {
1143 /* dataset is already sub-sampled */
1144 oldn = 0;
1145 for (i=0; i<dset->n; i++) {
1146 if (oldmask[i]) {
1147 oldn++;
1148 }
1149 }
1150 if (subn >= oldn) {
1151 err = 1;
1152 }
1153 }
1154
1155 if (err && err != E_PARSE) {
1156 gretl_errmsg_sprintf(_("Invalid number of cases %d"), subn);
1157 return err;
1158 }
1159
1160 /* Which is smaller: the number of cases to be selected or the
1161 complement, the number to be discarded? For the sake of
1162 efficiency we'll go for the smaller value.
1163 */
1164 rejn = oldn - subn;
1165 if (rejn < subn) {
1166 /* select @rejn observations to discard */
1167 targ = rejn;
1168 avail = 1;
1169 } else {
1170 /* select @subn observations to include */
1171 targ = subn;
1172 avail = 0;
1173 }
1174
1175 for (i=0; i<dset->n; i++) {
1176 if (oldmask != NULL && oldmask[i] == 0) {
1177 /* obs is already excluded, hence not selectable */
1178 mask[i] = -1;
1179 } else {
1180 mask[i] = avail;
1181 }
1182 }
1183
1184 cases = 0;
1185
1186 while (cases < targ) {
1187 u = gretl_rand_int_max(dset->n);
1188 if (mask[u] == avail) {
1189 /* obs is available, not yet selected */
1190 mask[u] = !avail;
1191 cases++;
1192 }
1193 }
1194
1195 if (oldmask != NULL) {
1196 /* undo the 'already excluded' coding above */
1197 for (i=0; i<dset->n; i++) {
1198 if (mask[i] == -1) {
1199 mask[i] = 0;
1200 }
1201 }
1202 }
1203
1204 return err;
1205 }
1206
backup_full_dataset(DATASET * dset)1207 int backup_full_dataset (DATASET *dset)
1208 {
1209 #if SUBDEBUG
1210 int newfull = 0;
1211 #endif
1212
1213 if (fullset == NULL) {
1214 fullset = malloc(sizeof *fullset);
1215 if (fullset == NULL) {
1216 return E_ALLOC;
1217 }
1218 #if SUBDEBUG
1219 newfull = 1;
1220 #endif
1221 }
1222
1223 if (dset != NULL) {
1224 *fullset = *dset;
1225 peerset = dset;
1226 }
1227
1228 #if SUBDEBUG
1229 fprintf(stderr, "backup_full_dataset: fullset = %p (%s)\n",
1230 (void *) fullset, newfull ? "new" : "old");
1231 #endif
1232
1233 return 0;
1234 }
1235
destroy_full_dataset(DATASET * dset)1236 static void destroy_full_dataset (DATASET *dset)
1237 {
1238 if (fullset != NULL) {
1239 if (fullset->varname == dset->varname) {
1240 fullset->varname = NULL;
1241 }
1242 if (fullset->varinfo == dset->varinfo) {
1243 fullset->varinfo = NULL;
1244 }
1245 if (fullset->descrip == dset->descrip) {
1246 fullset->descrip = NULL;
1247 }
1248 destroy_dataset(fullset);
1249 fullset = NULL;
1250 }
1251
1252 dset->varname = NULL;
1253 dset->varinfo = NULL;
1254 dset->descrip = NULL;
1255
1256 free_Z(dset);
1257 clear_datainfo(dset, CLEAR_SUBSAMPLE);
1258
1259 peerset = NULL;
1260 }
1261
complex_subsampled(void)1262 int complex_subsampled (void)
1263 {
1264 return (fullset != NULL && fullset->Z != NULL);
1265 }
1266
get_full_length_n(void)1267 int get_full_length_n (void)
1268 {
1269 return (fullset != NULL) ? fullset->n : 0;
1270 }
1271
dataset_is_subsampled(const DATASET * dset)1272 int dataset_is_subsampled (const DATASET *dset)
1273 {
1274 if (fullset != NULL && fullset->Z != NULL) {
1275 return 1;
1276 } else if (dset->t1 > 0 || dset->t2 < dset->n - 1) {
1277 return 1;
1278 } else {
1279 return 0;
1280 }
1281 }
1282
1283 /* When sub-sampling on some boolean criterion, check to see if we can
1284 meet the criterion by simply adjusting the endpoints of the sample
1285 range: life will be simpler if that is so.
1286 */
1287
mask_contiguous(const char * mask,const DATASET * dset,int * pt1,int * pt2)1288 static int mask_contiguous (const char *mask,
1289 const DATASET *dset,
1290 int *pt1, int *pt2)
1291 {
1292 int t, t1 = 0, t2 = dset->n - 1;
1293 int contig = 1;
1294
1295 for (t=0; t<dset->n; t++) {
1296 if (mask[t] == 0) {
1297 t1++;
1298 } else {
1299 break;
1300 }
1301 }
1302
1303 for (t=dset->n - 1; t>=0; t--) {
1304 if (mask[t] == 0) {
1305 t2--;
1306 } else {
1307 break;
1308 }
1309 }
1310
1311 for (t=t1; t<=t2; t++) {
1312 if (mask[t] == 0) {
1313 /* there's a hole inside the range */
1314 contig = 0;
1315 break;
1316 }
1317 }
1318
1319 if (contig && dataset_is_panel(dset)) {
1320 int n = t2 - t1 + 1;
1321
1322 /* sample must leave a whole number of panel units; moreover,
1323 to retain "panelness" this number must be greater than 1
1324 */
1325 if (t1 % dset->pd != 0 || n % dset->pd != 0) {
1326 contig = 0;
1327 } else if (n == dset->pd) {
1328 contig = 0;
1329 }
1330 }
1331
1332 if (contig) {
1333 *pt1 = t1;
1334 *pt2 = t2;
1335 }
1336
1337 return contig;
1338 }
1339
1340 static void
copy_data_to_subsample(DATASET * subset,const DATASET * dset,int maxv,const char * mask)1341 copy_data_to_subsample (DATASET *subset, const DATASET *dset,
1342 int maxv, const char *mask)
1343 {
1344 int i, t, s;
1345
1346 #if SUBDEBUG
1347 fprintf(stderr, "copy_data_to_subsample: subset = %p, dset = %p\n",
1348 (void *) subset, (void *) dset);
1349 #endif
1350
1351 /* copy data values */
1352 for (i=1; i<maxv; i++) {
1353 s = 0;
1354 for (t=0; t<dset->n; t++) {
1355 if (mask == NULL) {
1356 subset->Z[i][s++] = dset->Z[i][t];
1357 } else if (mask[t] == 1) {
1358 subset->Z[i][s++] = dset->Z[i][t];
1359 } else if (mask[t] == 'p') {
1360 /* panel padding */
1361 subset->Z[i][s++] = NADBL;
1362 }
1363 }
1364 }
1365
1366 /* copy observation markers, if any */
1367 if (dset->markers && subset->markers) {
1368 s = 0;
1369 for (t=0; t<dset->n; t++) {
1370 if (mask == NULL || mask[t] == 1 || mask[t] == 'p') {
1371 strcpy(subset->S[s++], dset->S[t]);
1372 }
1373 }
1374 }
1375
1376 /* copy panel time info? */
1377 if (dataset_is_panel(subset)) {
1378 if (subset->pd == dset->pd) {
1379 subset->panel_pd = dset->panel_pd;
1380 subset->panel_sd0 = dset->panel_sd0;
1381 }
1382 }
1383
1384 if (subset->stobs[0] == '\0' || subset->endobs[0] == '\0') {
1385 /* impose simple default if not already handled */
1386 strcpy(subset->stobs, "1");
1387 sprintf(subset->endobs, "%d", subset->n);
1388 }
1389 }
1390
get_restriction_mode(gretlopt opt)1391 int get_restriction_mode (gretlopt opt)
1392 {
1393 int mode = SUBSAMPLE_UNKNOWN;
1394
1395 if (opt & (OPT_M | OPT_C)) {
1396 mode = SUBSAMPLE_DROP_MISSING;
1397 } else if (opt & OPT_R) {
1398 mode = SUBSAMPLE_BOOLEAN;
1399 } else if (opt & OPT_N) {
1400 mode = SUBSAMPLE_RANDOM;
1401 } else if (opt & OPT_O) {
1402 mode = SUBSAMPLE_USE_DUMMY;
1403 } else if (opt & OPT_A) {
1404 mode = SUBSAMPLE_DROP_EMPTY;
1405 } else if (opt & OPT_W) {
1406 mode = SUBSAMPLE_DROP_WKENDS;
1407 }
1408
1409 return mode;
1410 }
1411
1412 /* Copy list of panel periods from Ti to T0; return 1 if members of
1413 the list Ti are consecutive, 0 if they are not.
1414 */
1415
copy_periods_list(int * T0,const int * Ti)1416 static int copy_periods_list (int *T0, const int *Ti)
1417 {
1418 int j, ret = 1;
1419
1420 for (j=0; j<=Ti[0] && ret; j++) {
1421 if (j > 1 && Ti[j] != Ti[j-1] + 1) {
1422 ret = 0;
1423 } else {
1424 T0[j] = Ti[j];
1425 }
1426 }
1427
1428 return ret;
1429 }
1430
1431 /* When sub-sampling panel data on some boolean criterion: see if the
1432 exclusion of certain rows leaves a balanced panel. Note that the
1433 requirement is not simply that each unit has the same number of
1434 temporal observations -- they must have the _same_ observations,
1435 and in addition the observations must be temporally contiguous.
1436 */
1437
mask_leaves_balanced_panel(char * mask,const DATASET * dset)1438 static int mask_leaves_balanced_panel (char *mask, const DATASET *dset)
1439 {
1440 int T = dset->pd;
1441 int *T0, *Ti;
1442 int i, u, ubak = -1;
1443 int ret = 1;
1444
1445 T0 = gretl_list_new(T);
1446 Ti = gretl_list_new(T);
1447
1448 if (T0 == NULL || Ti == NULL) {
1449 free(T0);
1450 free(Ti);
1451 return 0;
1452 }
1453
1454 T0[0] = Ti[0] = 0;
1455
1456 for (i=0; i<dset->n && ret; i++) {
1457 if (mask[i]) {
1458 u = i / T;
1459 if (u != ubak) {
1460 if (Ti[0] > 0) {
1461 if (T0[0] == 0) {
1462 /* we haven't made the reference list, T0, yet */
1463 ret = copy_periods_list(T0, Ti);
1464 } else if (gretl_list_cmp(Ti, T0)) {
1465 /* the current list differs from the reference one */
1466 ret = 0;
1467 }
1468 }
1469 Ti[0] = 1;
1470 Ti[1] = i % T;
1471 ubak = u;
1472 } else {
1473 Ti[0] += 1;
1474 Ti[Ti[0]] = i % T;
1475 }
1476 }
1477 }
1478
1479 if (gretl_list_cmp(Ti, T0)) {
1480 /* check the last unit */
1481 ret = 0;
1482 }
1483
1484 free(T0);
1485 free(Ti);
1486
1487 return ret;
1488 }
1489
1490 static int
make_panel_submask(char * mask,const DATASET * dset,int * err)1491 make_panel_submask (char *mask, const DATASET *dset, int *err)
1492 {
1493 int T = dset->pd;
1494 int N = dset->n / T;
1495 char *umask, *pmask;
1496 int i, np = 0;
1497
1498 umask = calloc(N + T, 1);
1499 if (umask == NULL) {
1500 *err = E_ALLOC;
1501 return 0;
1502 }
1503
1504 pmask = umask + N;
1505
1506 for (i=0; i<dset->n; i++) {
1507 if (mask[i]) {
1508 umask[i / T] = 1;
1509 pmask[i % T] = 1;
1510 }
1511 }
1512
1513 for (i=0; i<dset->n; i++) {
1514 if (!mask[i]) {
1515 if (umask[i / T] && pmask[i % T]) {
1516 mask[i] = 'p'; /* mark as padding row */
1517 np++;
1518 }
1519 }
1520 }
1521
1522 #if SUBDEBUG
1523 fprintf(stderr, "make_panel_submask: number of padding rows = %d\n",
1524 np);
1525 #endif
1526
1527 free(umask);
1528
1529 return np;
1530 }
1531
add_daily_date_strings(char * selected,const DATASET * dset,DATASET * subset)1532 static int add_daily_date_strings (char *selected,
1533 const DATASET *dset,
1534 DATASET *subset)
1535 {
1536 int err, t, i = 0;
1537
1538 err = dataset_allocate_obs_markers(subset);
1539
1540 if (!err) {
1541 subset->markers = DAILY_DATE_STRINGS;
1542 for (t=0; t<dset->n; t++) {
1543 if (selected[t]) {
1544 ntolabel(subset->S[i++], t, dset);
1545 }
1546 }
1547 }
1548
1549 return err;
1550 }
1551
1552 /* See if sub-sampling dated daily data leaves a useable daily dataset
1553 (as in dropping weekends and/or holidays).
1554 */
1555
try_for_daily_subset(char * selected,const DATASET * dset,DATASET * subset,gretlopt * optp)1556 static int try_for_daily_subset (char *selected,
1557 const DATASET *dset,
1558 DATASET *subset,
1559 gretlopt *optp)
1560 {
1561 char datestr[OBSLEN];
1562 guint32 ed, ed0 = 0, edbak = 0;
1563 int t, wd, ngaps = 0;
1564 int delta, mon_delta;
1565 int delta_max = 0;
1566 int started = 0;
1567 int any_sat = 0;
1568 int newpd = 5;
1569 int err = 0;
1570
1571 if (dset->pd > 5) {
1572 /* first pass: look at weekend status of included obs */
1573 for (t=0; t<dset->n; t++) {
1574 if (selected[t]) {
1575 ntolabel(datestr, t, dset);
1576 wd = weekday_from_date(datestr);
1577 if (wd == 0) {
1578 /* got a Sunday: result must be 7-day */
1579 newpd = 7;
1580 break;
1581 } else if (wd == 6) {
1582 any_sat = 1;
1583 }
1584 }
1585 }
1586 if (newpd == 5 && any_sat) {
1587 /* no Sundays, but got a Saurday: result
1588 must be 6-day data */
1589 newpd = 6;
1590 }
1591 }
1592
1593 #if SUBDEBUG
1594 fprintf(stderr, "try_for_daily_subset: newpd = %d\n", newpd);
1595 #endif
1596
1597 /* determine expected "delta days" for Mondays */
1598 mon_delta = (newpd == 7) ? 1 : (newpd == 6)? 2 : 3;
1599
1600 /* second pass: count the calendar gaps */
1601 for (t=0; t<dset->n; t++) {
1602 if (selected[t]) {
1603 ntolabel(datestr, t, dset);
1604 wd = weekday_from_date(datestr);
1605 ed = get_epoch_day(datestr);
1606 if (started) {
1607 delta = ed - edbak;
1608 ngaps += (wd == 1)? (delta > mon_delta) : (delta > 1);
1609 if (delta > delta_max) {
1610 delta_max = delta;
1611 }
1612 } else {
1613 ed0 = ed;
1614 started = 1;
1615 }
1616 edbak = ed;
1617 }
1618 }
1619
1620 /* Now, we should probably restrict the proportion of
1621 missing days for this treatment: for exclusion of
1622 non-trading days 10 percent should be generous.
1623 But we'll also require that the maximum "daily
1624 delta" be less than 10.
1625 */
1626
1627 if (delta_max < 10) {
1628 double keepfrac = subset->n / (double) dset->n;
1629 double keepmin = 0.9 * newpd / (double) dset->pd;
1630
1631 #if SUBDEBUG
1632 fprintf(stderr, " daily: delta_max=%d, keepfrac=%.4f\n"
1633 " keepmin=%.4f, ngaps=%d, newpd=%d\n", delta_max, keepfrac,
1634 keepmin, ngaps, newpd);
1635 #endif
1636
1637 if (keepfrac >= keepmin) {
1638 if (dset->S == NULL && ngaps > 0) {
1639 err = add_daily_date_strings(selected, dset, subset);
1640 /* flag not to destroy subset date strings */
1641 *optp |= OPT_P;
1642 }
1643 if (!err) {
1644 subset->structure = TIME_SERIES;
1645 subset->pd = newpd;
1646 subset->sd0 = (double) ed0;
1647 subset->t2 = subset->n - 1;
1648 ntolabel(subset->stobs, 0, subset);
1649 ntolabel(subset->endobs, subset->t2, subset);
1650 }
1651 }
1652 }
1653
1654 return err;
1655 }
1656
1657 /* does the given policy lead to an empty sample, or no change
1658 in the sample, perchance?
1659 */
1660
check_subsample_n(int n,DATASET * dset,char ** pmask,PRN * prn)1661 static int check_subsample_n (int n, DATASET *dset,
1662 char **pmask, PRN *prn)
1663 {
1664 if (n == 0) {
1665 gretl_errmsg_set(_("No observations would be left!"));
1666 return E_DATA;
1667 } else if (n == dset->n) {
1668 /* not really an error, just a no-op */
1669 if (gretl_messages_on()) {
1670 pputs(prn, _("No observations were dropped!"));
1671 pputc(prn, '\n');
1672 }
1673 free(*pmask);
1674 *pmask = NULL;
1675 }
1676
1677 return 0;
1678 }
1679
1680 #define needs_string_arg(m) (m == SUBSAMPLE_RANDOM || \
1681 m == SUBSAMPLE_USE_DUMMY || \
1682 m == SUBSAMPLE_BOOLEAN)
1683
1684 static int
make_restriction_mask(int mode,const char * s,const int * list,DATASET * dset,const char * oldmask,char ** pmask,PRN * prn)1685 make_restriction_mask (int mode, const char *s,
1686 const int *list, DATASET *dset,
1687 const char *oldmask, char **pmask,
1688 PRN *prn)
1689 {
1690 char *mask = NULL;
1691 int sn = 0, err = 0;
1692
1693 if (needs_string_arg(mode) && (s == NULL || *s == '\0')) {
1694 return E_ARGS;
1695 }
1696
1697 mask = make_submask(dset->n);
1698 if (mask == NULL) {
1699 return E_ALLOC;
1700 }
1701
1702 #if SUBDEBUG
1703 fprintf(stderr, "make_restriction_mask: oldmask = %p\n", (void *) oldmask);
1704 #endif
1705
1706 /* construct subsample mask in one of several possible ways */
1707
1708 if (mode == SUBSAMPLE_DROP_MISSING) {
1709 err = make_missing_mask(list, dset, mask);
1710 } else if (mode == SUBSAMPLE_DROP_EMPTY) {
1711 err = make_empty_mask(list, dset, mask);
1712 } else if (mode == SUBSAMPLE_DROP_WKENDS) {
1713 err = make_weekday_mask(dset, mask);
1714 } else if (mode == SUBSAMPLE_RANDOM) {
1715 err = make_random_mask(s, oldmask, dset, mask);
1716 } else if (mode == SUBSAMPLE_USE_DUMMY) {
1717 err = mask_from_dummy(s, dset, mask);
1718 } else if (mode == SUBSAMPLE_BOOLEAN) {
1719 err = mask_from_temp_dummy(s, dset, mask, prn);
1720 } else {
1721 gretl_errmsg_set(_("Sub-sample command failed mysteriously"));
1722 err = 1;
1723 }
1724
1725 if (err) {
1726 /* exit now on unrecoverable error */
1727 free(mask);
1728 return err;
1729 }
1730
1731 /* cumulate sample restrictions, if appropriate */
1732 if (oldmask != NULL && mode != SUBSAMPLE_RANDOM) {
1733 sn = overlay_masks(mask, oldmask, dset->n);
1734 } else {
1735 sn = count_selected_cases(mask, dset);
1736 }
1737
1738 err = check_subsample_n(sn, dset, &mask, prn);
1739
1740 if (err) {
1741 free(mask);
1742 } else {
1743 *pmask = mask;
1744 }
1745
1746 return err;
1747 }
1748
finalize_panel_subset(DATASET * subset,DATASET * dset,int npad)1749 static void finalize_panel_subset (DATASET *subset,
1750 DATASET *dset,
1751 int npad)
1752 {
1753 int pdp = subset->pd;
1754 int den = 10.0;
1755
1756 while ((pdp = pdp / 10)) {
1757 den *= 10;
1758 }
1759
1760 subset->sd0 = 1.0 + 1.0 / den;
1761 ntolabel(subset->stobs, 0, subset);
1762 ntolabel(subset->endobs, subset->n - 1, subset);
1763
1764 if (dset->pangrps != NULL && npad == 0) {
1765 /* carry over panel group names from full dataset */
1766 subset->pangrps = gretl_strdup(dset->pangrps);
1767 }
1768 }
1769
1770 /* This is also used elsewhere:
1771
1772 in gui2/session.c, when re-establishing a previously sub-sampled data
1773 state on reopening a saved session
1774
1775 in gretl_func.c, on exit from a user function when the dataset was
1776 sub-sampled on entry to the function (and we need to re-establish the
1777 original sub-sample)
1778 */
1779
1780 int
restrict_sample_from_mask(char * mask,DATASET * dset,gretlopt opt)1781 restrict_sample_from_mask (char *mask, DATASET *dset, gretlopt opt)
1782 {
1783 DATASET *subset;
1784 gretlopt zopt = OPT_R;
1785 int err = 0;
1786
1787 if (dset->auxiliary) {
1788 fprintf(stderr, "restrict_sample_from_mask: attempting to restrict "
1789 "an auxiliary data\n");
1790 return E_DATA;
1791 }
1792
1793 if (mask == RESAMPLED) {
1794 fprintf(stderr, "restrict_sample_from_mask: got RESAMPLED!\n");
1795 return E_DATA;
1796 }
1797
1798 if ((opt & OPT_B) && !dataset_is_panel(dset)) {
1799 gretl_errmsg_set(_("--balanced option is invalid: the (full) "
1800 "dataset is not a panel"));
1801 return E_BADOPT;
1802 }
1803
1804 subset = datainfo_new();
1805 if (subset == NULL) {
1806 return E_ALLOC;
1807 }
1808
1809 subset->n = count_selected_cases(mask, dset);
1810 subset->v = dset->v;
1811
1812 #if SUBDEBUG
1813 fprintf(stderr, "restrict_sample_from_mask:\n new subset=%p, "
1814 "%d obs in subsample vs %d in full dataset\n",
1815 (void *) subset, subset->n, dset->n);
1816 #endif
1817
1818 if (dataset_is_panel(dset)) {
1819 /* are we able to reconstitute a panel? */
1820 int nunits = count_panel_units(mask, dset);
1821 int ok = 0, npad = 0;
1822
1823 if (nunits > 1 && subset->n > nunits) {
1824 /* there's some possibility of doing so */
1825 if (opt & OPT_B) {
1826 /* add padding rows? only if this was requested */
1827 npad = make_panel_submask(mask, dset, &err);
1828 if (err) {
1829 free(subset);
1830 return err;
1831 }
1832 ok = 1;
1833 } else {
1834 ok = mask_leaves_balanced_panel(mask, dset);
1835 }
1836 if (ok) {
1837 subset->structure = STACKED_TIME_SERIES;
1838 subset->n += npad;
1839 subset->pd = subset->n / nunits;
1840 finalize_panel_subset(subset, dset, npad);
1841 }
1842 } else if (nunits == 1 && subset->n == dset->pd) {
1843 /* time series for single panel unit */
1844 subset->structure = SPECIAL_TIME_SERIES;
1845 }
1846 } else if (dated_daily_data(dset)) {
1847 /* see if we can preserve daily time series */
1848 err = try_for_daily_subset(mask, dset, subset, &zopt);
1849 }
1850
1851 if (!err) {
1852 /* set up the sub-sampled dataset */
1853 err = start_new_Z(subset, zopt);
1854 }
1855
1856 if (err) {
1857 free(subset);
1858 return err;
1859 }
1860
1861 #if SUBDEBUG
1862 fprintf(stderr, "started new Z for subset (v=%d, n=%d, Z=%p)\n",
1863 subset->v, subset->n, (void *) subset->Z);
1864 #endif
1865
1866 /* link (don't copy) varnames and descriptions, since these are
1867 not dependent on the series length */
1868 subset->varname = dset->varname;
1869 subset->varinfo = dset->varinfo;
1870 subset->descrip = dset->descrip;
1871 subset->mapfile = dset->mapfile;
1872
1873 /* set up case markers? */
1874 if (dset->markers) {
1875 err = dataset_allocate_obs_markers(subset);
1876 if (err) {
1877 free_Z(subset);
1878 free(subset);
1879 return E_ALLOC;
1880 }
1881 }
1882
1883 /* copy across data (and case markers, if any) */
1884 copy_data_to_subsample(subset, dset, dset->v, mask);
1885
1886 if (opt & OPT_T) {
1887 /* --permanent */
1888 check_models_for_subsample(mask, NULL);
1889 destroy_full_dataset(dset);
1890 #if RECODE_ON_PERMA
1891 recode_strvals(subset, OPT_NONE);
1892 #endif
1893 } else {
1894 err = backup_full_dataset(dset);
1895 subset->submask = copy_subsample_mask(mask, &err);
1896 }
1897
1898 /* switch pointers */
1899 *dset = *subset;
1900 free(subset);
1901
1902 return err;
1903 }
1904
expand_mask(char * tmpmask,const char * oldmask,int * err)1905 static char *expand_mask (char *tmpmask, const char *oldmask,
1906 int *err)
1907 {
1908 char *newmask = make_submask(fullset->n);
1909
1910 if (newmask == NULL) {
1911 *err = E_ALLOC;
1912 } else {
1913 /* map @tmpmask onto the full data range */
1914 int t, i = 0;
1915
1916 for (t=0; t<fullset->n; t++) {
1917 if (oldmask[t]) {
1918 newmask[t] = tmpmask[i++];
1919 }
1920 }
1921 }
1922
1923 return newmask;
1924 }
1925
1926 /* Below: we do this "precompute" thing if the dataset is already
1927 subsampled and the user wants to compound the restriction with a
1928 boolean restriction. One reason for this is that "obs" references
1929 in the new restriction may get out of whack if we restore the full
1930 dataset first. For example, say the spec is "obs!=50" to exclude
1931 observation 50: presumably the user means to exclude the 50th
1932 observation in the current, subsampled dataset, which may not be
1933 the same as the 50th observation in the full dataset.
1934 */
1935
precompute_mask(const char * s,const char * oldmask,DATASET * dset,PRN * prn,int * err)1936 static char *precompute_mask (const char *s, const char *oldmask,
1937 DATASET *dset, PRN *prn, int *err)
1938 {
1939 char *tmpmask = make_submask(dset->n);
1940 char *newmask = NULL;
1941
1942 #if SUBDEBUG
1943 fprintf(stderr, "restrict_sample: precomputing new mask\n");
1944 #endif
1945
1946 if (tmpmask == NULL) {
1947 *err = E_ALLOC;
1948 }
1949
1950 if (!*err) {
1951 /* fill out mask relative to current, restricted dataset */
1952 *err = mask_from_temp_dummy(s, dset, tmpmask, prn);
1953 }
1954
1955 if (!*err) {
1956 if (fullset != NULL) {
1957 newmask = expand_mask(tmpmask, oldmask, err);
1958 } else {
1959 newmask = tmpmask;
1960 tmpmask = NULL;
1961 }
1962 }
1963
1964 free(tmpmask);
1965
1966 return newmask;
1967 }
1968
1969 /* Intended for time series data: trim any missing values
1970 at the start and end of the current sample range, then
1971 check the remaining range for missing values and flag
1972 an error if any are found. If opt contains OPT_T (for
1973 permanence) and no error is encountered, we shrink the
1974 dataset to the contiguous range.
1975 */
1976
set_contiguous_sample(const int * list,DATASET * dset,gretlopt opt)1977 static int set_contiguous_sample (const int *list,
1978 DATASET *dset,
1979 gretlopt opt)
1980 {
1981 int ct1 = dset->t1;
1982 int ct2 = dset->t2;
1983 int err = 0;
1984
1985 if (list != NULL && list[0] > 0) {
1986 err = list_adjust_sample(list, &ct1, &ct2, dset, NULL);
1987 } else {
1988 int *biglist = NULL;
1989 int nvars = 0;
1990
1991 biglist = full_var_list(dset, &nvars);
1992 if (nvars == 0) {
1993 ; /* no-op */
1994 } else if (biglist == NULL) {
1995 err = E_ALLOC;
1996 } else {
1997 err = list_adjust_sample(biglist, &ct1, &ct2,
1998 dset, NULL);
1999 free(biglist);
2000 }
2001 }
2002
2003 if (!err) {
2004 int changed = ct1 != dset->t1 || ct2 != dset->t2;
2005
2006 dset->t1 = ct1;
2007 dset->t2 = ct2;
2008 if (changed && (opt & OPT_T)) {
2009 err = perma_sample(dset, OPT_T, NULL, NULL);
2010 }
2011 }
2012
2013 return err;
2014 }
2015
2016 /* Make a string representing the sample restriction. This is
2017 for reporting purposes. Note that in some cases the
2018 incoming @restr string may be NULL, for instance if the
2019 no-missing option is chosen via the gretl GUI.
2020 */
2021
make_restriction_string(DATASET * dset,char * old,const char * restr,int mode)2022 static int make_restriction_string (DATASET *dset, char *old,
2023 const char *restr, int mode)
2024 {
2025 char *s = NULL;
2026 int n = 0, err = 0;
2027
2028 dataset_clear_sample_record(dset);
2029
2030 if (old != NULL) {
2031 n = strlen(old);
2032 }
2033
2034 if (mode == SUBSAMPLE_RANDOM) {
2035 n += strlen("random");
2036 } else if (mode == SUBSAMPLE_DROP_MISSING) {
2037 n += strlen("no-missing");
2038 } else if (restr != NULL) {
2039 n += strlen(restr);
2040 }
2041
2042 if (n > 0) {
2043 s = malloc(n + 5);
2044 if (s == NULL) {
2045 err = E_ALLOC;
2046 }
2047 }
2048
2049 if (s != NULL) {
2050 *s = '\0';
2051 if (old != NULL) {
2052 strcpy(s, old);
2053 strcat(s, " && ");
2054 }
2055 if (mode == SUBSAMPLE_RANDOM) {
2056 strcat(s, "random");
2057 } else if (mode == SUBSAMPLE_DROP_MISSING) {
2058 strcat(s, "no-missing");
2059 } else if (restr != NULL) {
2060 strcat(s, restr);
2061 }
2062 dset->restriction = s;
2063 }
2064
2065 return err;
2066 }
2067
full_sample(const DATASET * dset)2068 static int full_sample (const DATASET *dset)
2069 {
2070 if (complex_subsampled()) {
2071 return 0;
2072 } else if (dset->t1 > 0 || dset->t2 < dset->n - 1) {
2073 return 0;
2074 } else {
2075 return 1;
2076 }
2077 }
2078
handle_ts_restrict(char * mask,DATASET * dset,gretlopt opt,int t1)2079 static int handle_ts_restrict (char *mask, DATASET *dset,
2080 gretlopt opt, int t1)
2081 {
2082 char stobs[OBSLEN];
2083 int pd = dset->pd;
2084 double sd0;
2085 int err;
2086
2087 ntolabel(stobs, t1, dset);
2088 sd0 = get_date_x(dset->pd, stobs);
2089
2090 err = restrict_sample_from_mask(mask, dset, opt);
2091
2092 if (!err) {
2093 /* re-establish time-series characteristics */
2094 dset->structure = TIME_SERIES;
2095 dset->pd = pd;
2096 dset->sd0 = sd0;
2097 strcpy(dset->stobs, stobs);
2098 ntolabel(dset->endobs, dset->n - 1, dset);
2099 }
2100
2101 return err;
2102 }
2103
check_restrict_options(gretlopt opt,const char * param)2104 static int check_restrict_options (gretlopt opt, const char *param)
2105 {
2106 /* We'll accept the redundant combination of the options --dummy
2107 (OPT_O) and --restrict (OPT_R), but other than that the options
2108 --dummy,
2109 --restrict,
2110 --no-missing (OPT_M),
2111 --no-all-missing (OPT_A) and
2112 --random (OPT_N)
2113 are all mutually incompatible.
2114 */
2115 if (incompatible_options(opt, OPT_O | OPT_M | OPT_A | OPT_N) ||
2116 incompatible_options(opt, OPT_R | OPT_M | OPT_A | OPT_N)) {
2117 return E_BADOPT;
2118 }
2119
2120 /* The --contiguous option (OPT_C) is compatible only with
2121 --no-missing (which is implied) and --permanent */
2122 if (opt & OPT_C) {
2123 if ((opt & ~(OPT_C | OPT_M | OPT_T)) != OPT_NONE) {
2124 return E_BADOPT;
2125 }
2126 }
2127
2128 if (opt & (OPT_O | OPT_R | OPT_N)) {
2129 /* parameter is required */
2130 if (param == NULL || *param == '\0') {
2131 return E_ARGS;
2132 }
2133 }
2134
2135 return 0;
2136 }
2137
check_permanent_option(gretlopt opt,DATASET * dset,int * n_models)2138 static int check_permanent_option (gretlopt opt,
2139 DATASET *dset,
2140 int *n_models)
2141 {
2142 if (gretl_function_depth() > 0) {
2143 /* can't permanently shrink the dataset within a function */
2144 gretl_errmsg_set(_("The dataset cannot be modified at present"));
2145 return E_DATA;
2146 } else if (!(opt & (OPT_O | OPT_M | OPT_A | OPT_N | OPT_R | OPT_C))) {
2147 /* we need some kind of restriction flag */
2148 return E_ARGS;
2149 } else if (gretl_in_gui_mode() && !(opt & OPT_F)) {
2150 /* GUI program, without internal "force" option */
2151 *n_models = n_stacked_models();
2152 if (*n_models > 0 && !full_sample(dset)) {
2153 /* too difficult to recover gracefully on error */
2154 gretl_errmsg_set(_("The full dataset must be restored before "
2155 "imposing a permanent sample restriction"));
2156 return E_DATA;
2157 }
2158 }
2159
2160 return 0;
2161 }
2162
2163 /* "Precomputing" a mask means computing a mask based on a
2164 current subsampled dataset (before restoring the full dataset,
2165 which is a part of the subsampling process). We do this if
2166 we're cumulating a boolean restriction on top of an existing
2167 restriction.
2168 */
2169
do_precompute(int mode,char * oldmask,const char * param)2170 static int do_precompute (int mode, char *oldmask, const char *param)
2171 {
2172 return oldmask != NULL && mode == SUBSAMPLE_BOOLEAN;
2173 }
2174
2175 /* restrict_sample:
2176 * @param: restriction string (or %NULL).
2177 * @list: list of variables in case of OPT_M (or %NULL).
2178 * @dset: dataset struct.
2179 * @state: structure representing program state (or %NULL).
2180 * @opt: option flags.
2181 * @prn: printing apparatus.
2182 * @n_dropped: location to receive count of dropped
2183 * observations, or NULL.
2184 *
2185 * Sub-sample the data set, based on the criterion of skipping all
2186 * observations with missing data values (OPT_M); or using as mask a
2187 * specified dummy variable (OPT_O); or masking with a specified
2188 * boolean condition (OPT_R); or selecting at random (OPT_N).
2189 *
2190 * In case OPT_M or OPT_C a @list of variables may be supplied; in
2191 * cases OPT_O, OPT_R and OPT_N, @param must contain specifics.
2192 *
2193 * In case OPT_P is included, the restriction will rePlace any
2194 * existing sample restriction, otherwise the resulting restriction
2195 * will be the logical product of the new restriction and any
2196 * existing restriction.
2197 *
2198 * In case the original dataset was a panel and OPT_B was given,
2199 * we'll pad with missing values if necessary, to try to reconstitute
2200 * a balanced panel.
2201 *
2202 * In case OPT_T is included, the sample restriction will be
2203 * permanent (the dataset is shrunk to the sample), otherwise the
2204 * restriction can be undone via the command "smpl full".
2205 *
2206 * Returns: 0 on success, non-zero error code on failure.
2207 */
2208
restrict_sample(const char * param,const int * list,DATASET * dset,ExecState * state,gretlopt opt,PRN * prn,int * n_dropped)2209 int restrict_sample (const char *param, const int *list,
2210 DATASET *dset, ExecState *state,
2211 gretlopt opt, PRN *prn,
2212 int *n_dropped)
2213 {
2214 char *oldrestr = NULL;
2215 char *oldmask = NULL;
2216 char *mask = NULL;
2217 int free_oldmask = 0;
2218 int permanent = 0;
2219 int n_models = 0;
2220 int n_orig = 0;
2221 int mode, err = 0;
2222
2223 if (dset == NULL || dset->Z == NULL) {
2224 return E_NODATA;
2225 }
2226
2227 n_orig = dset->n;
2228
2229 /* general check on incoming options */
2230 err = check_restrict_options(opt, param);
2231
2232 /* take a closer look at the --permanent option */
2233 if (!err && (opt & OPT_T)) {
2234 err = check_permanent_option(opt, dset, &n_models);
2235 if (!err) {
2236 permanent = 1;
2237 }
2238 }
2239
2240 if (err) {
2241 return err;
2242 }
2243
2244 gretl_error_clear();
2245
2246 #if FULLDEBUG || SUBDEBUG
2247 fprintf(stderr, "\nrestrict_sample: param='%s'\n", param);
2248 fprintf(stderr, " dset=%p, state=%p, fullset=%p\n", (void *) dset,
2249 (void *) state, (void *) fullset);
2250 printlist(list, "list param");
2251 #endif
2252
2253 if (opt & OPT_C) {
2254 /* --contiguous */
2255 return set_contiguous_sample(list, dset, opt);
2256 }
2257
2258 mode = get_restriction_mode(opt);
2259 if (mode == SUBSAMPLE_UNKNOWN) {
2260 gretl_errmsg_set("Unrecognized sample command");
2261 return 1;
2262 }
2263
2264 if (!(opt & OPT_P)) {
2265 /* not replacing but cumulating any existing restrictions */
2266 oldmask = make_current_sample_mask(dset, &err);
2267 if (err) {
2268 return err;
2269 }
2270 free_oldmask = 1;
2271 if (dset->restriction != NULL) {
2272 oldrestr = gretl_strdup(dset->restriction);
2273 }
2274 } else if (state != NULL && state->submask != NULL) {
2275 /* subsampling within a function: this necessarily
2276 cumulates with the incoming sample restriction
2277 recorded in state->submask
2278 */
2279 oldmask = state->submask;
2280 } else if (state != NULL) {
2281 /* FIXME? This is a loophole that potentially allows
2282 overriding of a (t1, t2) sample range at caller
2283 level, via "restrict replace" sampling within a
2284 function.
2285 */
2286 ;
2287 }
2288
2289 if (do_precompute(mode, oldmask, param)) {
2290 /* we come in here only if cumulating restrictions */
2291 mask = precompute_mask(param, oldmask, dset, prn, &err);
2292 }
2293
2294 if (!err && !full_sample(dset)) {
2295 /* restore the full data range, for housekeeping purposes */
2296 err = restore_full_sample(dset, NULL);
2297 }
2298
2299 if (err) {
2300 /* clean up and get out */
2301 free(mask);
2302 if (free_oldmask) {
2303 free(oldmask);
2304 }
2305 return err;
2306 }
2307
2308 if (mask == NULL) {
2309 /* not already handled by "precompute" above */
2310 err = make_restriction_mask(mode, param, list, dset,
2311 oldmask, &mask, prn);
2312 }
2313
2314 if (err && state != NULL && state->submask != NULL) {
2315 /* Should we try to restore the status quo ante on error?
2316 Too tricky, I think -- best to cancel the 'catch' flag
2317 if it's present.
2318 */
2319 if (state->cmd->flags & CMD_CATCH) {
2320 state->cmd->flags ^= CMD_CATCH;
2321 gretl_errmsg_append("Sorry, cannot 'catch' this error", err);
2322 }
2323 }
2324
2325 if (!err && n_models > 0) {
2326 err = check_models_for_subsample(mask, n_dropped);
2327 }
2328
2329 if (!err && mask != NULL) {
2330 int contiguous = 0;
2331 int t1 = 0, t2 = 0;
2332
2333 if (!(opt & OPT_N)) {
2334 /* don't bother with this for the --random case */
2335 contiguous = mask_contiguous(mask, dset, &t1, &t2);
2336 }
2337
2338 #if SUBDEBUG
2339 fprintf(stderr, "restrict sample: contiguous range? %s\n",
2340 contiguous ? "yes" : "no");
2341 #endif
2342
2343 if (contiguous) {
2344 if (permanent && dataset_is_time_series(dset)) {
2345 /* apply the restriction, but then re-establish the
2346 time-series character of the dataset
2347 */
2348 err = handle_ts_restrict(mask, dset, opt, t1);
2349 } else if (!permanent) {
2350 /* just move the sample range pointers, avoiding
2351 the overhead of creating a parallel dataset
2352 */
2353 dset->t1 = t1;
2354 dset->t2 = t2;
2355 /* but record mask? */
2356 } else {
2357 err = restrict_sample_from_mask(mask, dset, opt);
2358 }
2359 } else {
2360 err = restrict_sample_from_mask(mask, dset, opt);
2361 }
2362 }
2363
2364 free(mask);
2365
2366 if (free_oldmask) {
2367 free(oldmask);
2368 }
2369
2370 if (!err) {
2371 if (permanent) {
2372 dataset_clear_sample_record(dset);
2373 } else {
2374 make_restriction_string(dset, oldrestr, param, mode);
2375 }
2376 if (n_dropped != NULL) {
2377 *n_dropped = n_orig - sample_size(dset);
2378 }
2379 }
2380
2381 free(oldrestr);
2382
2383 #if PANDEBUG
2384 panreport("restrict sample", dset);
2385 #endif
2386
2387 return err;
2388 }
2389
2390 /* perma_sample:
2391 * @dset: dataset struct.
2392 * @opt: option flags: must be OPT_T.
2393 * @prn: printing apparatus.
2394 * @n_dropped: location to receive count of dropped models,
2395 * or NULL.
2396 *
2397 * Make the current sub-sampling of the dataset permanent.
2398 *
2399 * Returns: 0 on success, non-zero error code on failure.
2400 */
2401
perma_sample(DATASET * dset,gretlopt opt,PRN * prn,int * n_dropped)2402 int perma_sample (DATASET *dset, gretlopt opt, PRN *prn,
2403 int *n_dropped)
2404 {
2405 if (gretl_function_depth() > 0) {
2406 gretl_errmsg_set(_("The dataset cannot be modified at present"));
2407 return E_DATA;
2408 } else if (!dataset_is_subsampled(dset)) {
2409 pputs(prn, "smpl: nothing to be done\n");
2410 return 0;
2411 } else if (dset->submask == RESAMPLED) {
2412 pputs(prn, "smpl: dataset is resampled\n");
2413 return E_DATA;
2414 } else if (opt != OPT_T) {
2415 return E_BADOPT;
2416 }
2417
2418 #if RECODE_ON_PERMA
2419 recode_strvals(dset, OPT_NONE);
2420 #endif
2421
2422 if (dset->submask == NULL) {
2423 /* just trim observations */
2424 return dataset_shrink_obs_range(dset);
2425 }
2426
2427 if (n_dropped != NULL) {
2428 int err;
2429
2430 err = check_models_for_subsample(dset->submask, n_dropped);
2431 if (err) {
2432 return err;
2433 }
2434 } else {
2435 check_models_for_subsample(dset->submask, NULL);
2436 }
2437
2438 free(dset->submask);
2439 dset->submask = NULL;
2440 dataset_clear_sample_record(dset);
2441
2442 if (fullset->varname == dset->varname) {
2443 fullset->varname = NULL;
2444 }
2445 if (fullset->varinfo == dset->varinfo) {
2446 fullset->varinfo = NULL;
2447 }
2448 if (fullset->descrip == dset->descrip) {
2449 fullset->descrip = NULL;
2450 }
2451
2452 destroy_dataset(fullset);
2453 fullset = peerset = NULL;
2454
2455 return 0;
2456 }
2457
2458 enum {
2459 SMPL_T1,
2460 SMPL_T2
2461 };
2462
smpl_get_int(const char * s,DATASET * dset,int * err)2463 static int smpl_get_int (const char *s, DATASET *dset, int *err)
2464 {
2465 int k = -1;
2466
2467 if (integer_string(s)) {
2468 k = atoi(s);
2469 } else {
2470 double x;
2471
2472 if (gretl_is_scalar(s)) {
2473 x = gretl_scalar_get_value(s, NULL);
2474 } else {
2475 x = generate_scalar(s, dset, err);
2476 }
2477
2478 if (!na(x) && x < INT_MAX) {
2479 k = (int) x;
2480 }
2481 }
2482
2483 return k;
2484 }
2485
2486 /* take integer_string(s) as given here */
2487
probably_year(const char * s,DATASET * dset)2488 static int probably_year (const char *s, DATASET *dset)
2489 {
2490 int sval = atoi(s);
2491
2492 return (annual_data(dset) && dset->sd0 > 1000 &&
2493 sval > 1000 && sval > dset->n);
2494 }
2495
got_two_ints(const char * s,int * pi,int * pt)2496 static int got_two_ints (const char *s, int *pi, int *pt)
2497 {
2498 int n = 0;
2499
2500 if (strchr(s, ':') != NULL) {
2501 n = sscanf(s, "%d:%d", pi, pt);
2502 } else if (strchr(s, '.') != NULL) {
2503 n = sscanf(s, "%d.%d", pi, pt);
2504 }
2505
2506 return n == 2;
2507 }
2508
get_sample_limit(const char * s,DATASET * dset,int code)2509 static int get_sample_limit (const char *s, DATASET *dset, int code)
2510 {
2511 int ret = -1;
2512 int i, t;
2513 int err = 0;
2514
2515 #if SUBDEBUG
2516 fprintf(stderr, "get_sample_limit: s = '%s'\n", s);
2517 #endif
2518
2519 if (dataset_is_panel(dset) && got_two_ints(s, &i, &t)) {
2520 if (code == SMPL_T1 && t != 1) {
2521 gretl_errmsg_sprintf(_("'%s': invalid starting observation"), s);
2522 err = E_DATA;
2523 } else if (code == SMPL_T2 && t != dset->pd) {
2524 gretl_errmsg_sprintf(_("'%s': invalid ending observation"), s);
2525 err = E_DATA;
2526 }
2527 if (err) {
2528 return -1;
2529 }
2530 }
2531
2532 if (*s == '-' || *s == '+') {
2533 /* increment/decrement form */
2534 int incr = smpl_get_int(s + 1, dset, &err);
2535
2536 if (!err) {
2537 if (*s == '-') {
2538 incr = -incr;
2539 }
2540 if (dataset_is_panel(dset)) {
2541 incr *= dset->pd;
2542 }
2543 if (code == SMPL_T1) {
2544 ret = dset->t1 + incr;
2545 } else {
2546 ret = dset->t2 + incr;
2547 }
2548 }
2549 } else {
2550 /* absolute form */
2551 if (!integer_string(s) || probably_year(s, dset)) {
2552 ret = get_t_from_obs_string(s, dset);
2553 }
2554 if (ret < 0) {
2555 gretl_error_clear();
2556 ret = smpl_get_int(s, dset, &err);
2557 /* convert to base 0 */
2558 if (!err) ret--;
2559 }
2560 }
2561
2562 return ret;
2563 }
2564
2565 /* Catch the case where we're in a function and attempting to
2566 move t1 or t2 out of the range established on entry to the
2567 function: we don't want to carry forward any error message
2568 that implies t was out of the full data range.
2569 */
2570
maybe_clear_range_error(int t,DATASET * dset)2571 static void maybe_clear_range_error (int t, DATASET *dset)
2572 {
2573 if (t >= 0 && t < dset->n) {
2574 gretl_error_clear();
2575 }
2576 }
2577
2578 /**
2579 * set_sample:
2580 * @start: start of sample range.
2581 * @stop: end of sample range.
2582 * @dset: pointer to dataset struct.
2583 * @opt: options flag(s).
2584 *
2585 * Sets the current sample range for @dset to that specified by
2586 * @start and @stop, if possible.
2587 *
2588 * If @opt contains %OPT_T, make the sub-sampling permanent,
2589 * or in other words shrink the dataset to the selected range.
2590 *
2591 * Returns: 0 on successful completion, non-zero error code
2592 * otherwise.
2593 */
2594
set_sample(const char * start,const char * stop,DATASET * dset,gretlopt opt)2595 int set_sample (const char *start,
2596 const char *stop,
2597 DATASET *dset,
2598 gretlopt opt)
2599 {
2600 int nf, new_t1 = dset->t1, new_t2 = dset->t2;
2601 int tmin = 0, tmax = 0;
2602 int err = 0;
2603
2604 if (dset == NULL) {
2605 return E_NODATA;
2606 }
2607
2608 /* currently --quiet (OPT_Q) and --permanent (OPT_T)
2609 are the only acceptable options here
2610 */
2611 opt &= ~OPT_Q;
2612 if (opt != OPT_NONE && opt != OPT_T) {
2613 return E_BADOPT;
2614 }
2615
2616 gretl_error_clear();
2617
2618 nf = (start != NULL) + (stop != NULL);
2619
2620 #if SUBDEBUG
2621 fprintf(stderr, "set_sample: start='%s', stop='%s', dset=%p\n",
2622 start, stop, (void *) dset);
2623 fprintf(stderr, "dset->v = %d, dset->n = %d, pd = %d\n",
2624 dset->v, dset->n, dset->pd);
2625 #endif
2626
2627 if (nf == 2 && dset->n == 0) {
2628 /* database special */
2629 return db_set_sample(start, stop, dset);
2630 }
2631
2632 if (nf == 0) {
2633 /* no-op, just print the current sample */
2634 return 0;
2635 }
2636
2637 sample_range_get_extrema(dset, &tmin, &tmax);
2638
2639 #if SUBDEBUG
2640 fprintf(stderr, "sample extrema: lo = %d, hi = %d\n", tmin, tmax);
2641 #endif
2642
2643 if (nf == 1) {
2644 /* implicitly just setting the starting observation? */
2645 if (start == NULL) {
2646 return E_ARGS;
2647 }
2648 new_t1 = get_sample_limit(start, dset, SMPL_T1);
2649 if (new_t1 < tmin || new_t1 > tmax) {
2650 maybe_clear_range_error(new_t1, dset);
2651 gretl_errmsg_set(_("error in new starting obs"));
2652 return E_INVARG;
2653 }
2654 goto finish;
2655 }
2656
2657 /* now we're looking at the 2 fields case */
2658
2659 if (strcmp(start, ";")) {
2660 new_t1 = get_sample_limit(start, dset, SMPL_T1);
2661 if (new_t1 < tmin || new_t1 > tmax) {
2662 maybe_clear_range_error(new_t1, dset);
2663 gretl_errmsg_set(_("error in new starting obs"));
2664 err = E_DATA;
2665 }
2666 }
2667
2668 if (!err && strcmp(stop, ";")) {
2669 new_t2 = get_sample_limit(stop, dset, SMPL_T2);
2670 if (new_t2 < tmin || new_t2 > tmax) {
2671 maybe_clear_range_error(new_t2, dset);
2672 gretl_errmsg_set(_("error in new ending obs"));
2673 err = E_DATA;
2674 }
2675 }
2676
2677 if (!err && (new_t1 < tmin || new_t1 > new_t2)) {
2678 gretl_error_clear();
2679 gretl_errmsg_set(_("Invalid null sample"));
2680 err = E_DATA;
2681 }
2682
2683 finish:
2684
2685 if (!err && dataset_is_panel(dset)) {
2686 int T = dset->pd;
2687
2688 if (new_t1 % T != 0 || (new_t2 + 1) % T != 0) {
2689 gretl_errmsg_set("Invalid panel sample range");
2690 err = E_INVARG;
2691 }
2692 }
2693
2694 if (!err) {
2695 dset->t1 = new_t1;
2696 dset->t2 = new_t2;
2697 }
2698
2699 if (!err && (opt & OPT_T)) {
2700 err = perma_sample(dset, opt, NULL, NULL);
2701 }
2702
2703 return err;
2704 }
2705
set_panel_sample(const char * start,const char * stop,gretlopt opt,DATASET * dset)2706 int set_panel_sample (const char *start, const char *stop,
2707 gretlopt opt, DATASET *dset)
2708 {
2709 int s1, s2, err = 0;
2710
2711 if (!dataset_is_panel(dset)) {
2712 gretl_errmsg_sprintf(_("%s: inapplicable option"), print_flags(opt, SMPL));
2713 return E_BADOPT;
2714 }
2715
2716 /* maybe some day implement option to sample in the
2717 panel time dimension here? (not trivial)
2718 */
2719
2720 s1 = smpl_get_int(start, dset, &err);
2721 if (!err) {
2722 s2 = smpl_get_int(stop, dset, &err);
2723 }
2724
2725 if (!err) {
2726 if (s1 < 1) {
2727 gretl_errmsg_sprintf(_("invalid first unit %d"), s1);
2728 err = E_DATA;
2729 } else if (s2 > dset->n / dset->pd) {
2730 gretl_errmsg_sprintf(_("invalid last unit %d"), s2);
2731 err = E_DATA;
2732 } else if (s2 < s1) {
2733 gretl_errmsg_set(_("invalid null sample"));
2734 err = E_DATA;
2735 }
2736 }
2737
2738 if (!err) {
2739 int t1 = (s1 - 1) * dset->pd;
2740 int t2 = s2 * dset->pd - 1;
2741 int tmin = 0, tmax = 0;
2742
2743 sample_range_get_extrema(dset, &tmin, &tmax);
2744 if (t1 < tmin || t2 > tmax) {
2745 gretl_errmsg_set("sample range out of bounds");
2746 err = E_DATA;
2747 } else {
2748 dset->t1 = t1;
2749 dset->t2 = t2;
2750 }
2751 }
2752
2753 return err;
2754 }
2755
2756 /**
2757 * count_missing_values:
2758 * @dset: dataset struct.
2759 * @opt: use %OPT_V for verbose operation, %OPT_A to
2760 * examine all data.
2761 * @prn: printing struct.
2762 * @err: location to receive error code.
2763 *
2764 * Prints a count of missing values (if any) in the current
2765 * dataset over the currently defined sample range (or the
2766 * entire data range if %OPT_A is given). If %OPT_V is given
2767 * this includes a count of missing values at each observation;
2768 * otherwise it just includes global and per-variable counts.
2769 *
2770 * Returns: 0 if no missing values are found (or on error),
2771 * otherwise the total number of missing values.
2772 */
2773
count_missing_values(const DATASET * dset,gretlopt opt,PRN * prn,int * err)2774 int count_missing_values (const DATASET *dset, gretlopt opt,
2775 PRN *prn, int *err)
2776 {
2777 int missval = 0, missobs = 0, totvals = 0, oldmiss = 0;
2778 int T, t1, t2;
2779 int *missvec;
2780 double missfrac;
2781 int i, t, tmiss;
2782
2783 if (opt & OPT_A) {
2784 t1 = 0;
2785 t2 = dset->n - 1;
2786 } else {
2787 t1 = dset->t1;
2788 t2 = dset->t2;
2789 }
2790
2791 T = t2 - t1 + 1;
2792
2793 missvec = malloc(dset->v * sizeof missvec);
2794
2795 if (missvec == NULL) {
2796 *err = E_ALLOC;
2797 return 0;
2798 }
2799
2800 for (i=0; i<dset->v; i++) {
2801 missvec[i] = 0;
2802 }
2803
2804 for (t=t1; t<=t2; t++) {
2805 tmiss = 0;
2806 for (i=1; i<dset->v; i++) {
2807 if (series_is_hidden(dset, i)) {
2808 continue;
2809 }
2810 if (na(dset->Z[i][t])) {
2811 if (missvec[i] == 0) {
2812 missvec[0] += 1;
2813 }
2814 missvec[i] += 1;
2815 missval++;
2816 }
2817 totvals++;
2818 }
2819
2820 tmiss = missval - oldmiss;
2821
2822 if (tmiss) {
2823 missobs++;
2824
2825 if (opt & OPT_V) {
2826 /* verbose print by observation */
2827 if (dset->markers) {
2828 pprintf(prn, "%8s %4d %s\n", dset->S[t], tmiss,
2829 _("missing values"));
2830 } else {
2831 char tmp[OBSLEN];
2832
2833 ntolabel(tmp, t, dset);
2834 pprintf(prn, "%8s %4d %s\n", tmp, tmiss,
2835 _("missing values"));
2836 }
2837 }
2838 }
2839 oldmiss = missval;
2840 }
2841
2842 missfrac = 100.0 * (double) missobs / T;
2843
2844 pprintf(prn, _("\nNumber of observations (rows) with missing data "
2845 "values = %d (%.2f%%)\n"), missobs, missfrac);
2846
2847 pprintf(prn, _("Total number of missing data values = %d (%.2f%% "
2848 "of total data values)\n"), missval,
2849 (100.0 * (double) missval / totvals));
2850
2851 if (missvec[0] > 0) {
2852 pputc(prn, '\n');
2853 for (i=1; i<dset->v; i++) {
2854 if (missvec[i] > 0) {
2855 missfrac = 100.0 * (double) missvec[i] / T;
2856 pprintf(prn, "%8s: %d %s (%.2f%%); %d %s (%.2f%%)\n",
2857 dset->varname[i], missvec[i], _("missing values"),
2858 missfrac, T - missvec[i], _("valid values"),
2859 100.0 - missfrac);
2860 }
2861 }
2862 }
2863
2864 free(missvec);
2865
2866 return missval;
2867 }
2868
copy_series_info(DATASET * dest,const DATASET * src,int maxv)2869 static void copy_series_info (DATASET *dest, const DATASET *src,
2870 int maxv)
2871 {
2872 int i;
2873
2874 for (i=1; i<maxv; i++) {
2875 strcpy(dest->varname[i], src->varname[i]);
2876 if (src->varinfo != NULL) {
2877 copy_varinfo(dest->varinfo[i], src->varinfo[i]);
2878 }
2879 }
2880 }
2881
2882 /* Below: For a model that was estimated using a data sample which
2883 differs from the current sample, reconstitute the dataset
2884 appropriate to the model. If the model used a subsample, we use
2885 the special dummy variable (submask) saved with the model to select
2886 observations from the full dataset. Another possibility is that
2887 the model used the full dataset, which has by now been subsampled.
2888
2889 We attach this dataset to the model, so that it can be used for,
2890 e.g., residual or fitted versus actual plots. The attached data
2891 set will be destroyed when the model itself is destroyed.
2892
2893 By default we create a dataset containing series up to the
2894 highest numbered series associated with the model (regressand
2895 and regressors). If OPT_F is given we include all series; if
2896 OPT_G is given we include only the constant.
2897 */
2898
add_dataset_to_model(MODEL * pmod,const DATASET * dset,gretlopt opt)2899 int add_dataset_to_model (MODEL *pmod, const DATASET *dset,
2900 gretlopt opt)
2901 {
2902 const DATASET *srcset;
2903 char *mask = NULL;
2904 int maxv, sn = 0;
2905
2906 if (gretl_is_between_model(pmod)) {
2907 /* special case: we can't reconstruct the dataset here,
2908 but it may already be attached to @pmod.
2909 */
2910 if (pmod->dataset != NULL) {
2911 return 0;
2912 } else {
2913 return E_DATA;
2914 }
2915 }
2916
2917 if (pmod->dataset != NULL) {
2918 /* FIXME? */
2919 destroy_dataset(pmod->dataset);
2920 pmod->dataset = NULL;
2921 }
2922
2923 #if SUBDEBUG
2924 fprintf(stderr, "add_dataset_to_model: fullset=%p, pmod->submask=%p\n",
2925 (void *) fullset, (void *) pmod->submask);
2926 #endif
2927
2928 if (fullset != NULL) {
2929 sync_datainfo_members(dset);
2930 srcset = fullset;
2931 } else {
2932 srcset = dset;
2933 }
2934
2935 if (pmod->submask == NULL) {
2936 /* no subsample info: pmod was estimated on the full dataset,
2937 so we'll reconstruct the full dataset */
2938 sn = srcset->n;
2939 } else {
2940 /* pmod was estimated on a subsample, which has to
2941 be reconstructed */
2942 int t;
2943
2944 mask = calloc(srcset->n, 1);
2945 if (mask == NULL) {
2946 return E_ALLOC;
2947 }
2948 for (t=0; t<srcset->n; t++) {
2949 if (pmod->submask[t] > 0) {
2950 mask[t] = 1;
2951 sn++;
2952 }
2953 }
2954 if (sn == 0) {
2955 free(mask);
2956 return 1;
2957 }
2958 }
2959
2960 if (opt & OPT_F) {
2961 maxv = srcset->v;
2962 } else if (opt & OPT_G) {
2963 maxv = 1;
2964 } else {
2965 maxv = highest_numbered_var_in_model(pmod, dset) + 1;
2966 }
2967
2968 /* allocate auxiliary dataset */
2969 pmod->dataset = create_auxiliary_dataset(maxv, sn, 0);
2970 if (pmod->dataset == NULL) {
2971 free(mask);
2972 return E_ALLOC;
2973 }
2974
2975 #if SUBDEBUG
2976 fprintf(stderr, "pmod->dataset allocated at %p\n",
2977 (void *) pmod->dataset);
2978 #endif
2979
2980 /* copy across info on series */
2981 copy_series_info(pmod->dataset, srcset, maxv);
2982
2983 /* copy across data */
2984 copy_data_to_subsample(pmod->dataset, srcset, maxv, mask);
2985
2986 /* dataset characteristics such as pd: if we're rebuilding the
2987 full dataset copy these across; but if we're reconstructing a
2988 subsampled dataset these features from the full dataset will be
2989 wrong, and we stay with the simple defaults
2990 */
2991 if (pmod->submask == NULL) {
2992 pmod->dataset->pd = srcset->pd;
2993 pmod->dataset->sd0 = srcset->sd0;
2994 strcpy(pmod->dataset->stobs, srcset->stobs);
2995 strcpy(pmod->dataset->endobs, srcset->endobs);
2996 pmod->dataset->structure = srcset->structure;
2997 }
2998
2999 free(mask);
3000
3001 #if SUBDEBUG
3002 fputs("add_subsampled_dataset_to_model: success\n", stderr);
3003 #endif
3004
3005 return 0;
3006 }
3007
submasks_match(const DATASET * dset,const MODEL * pmod)3008 static int submasks_match (const DATASET *dset, const MODEL *pmod)
3009 {
3010 char *s1 = dset->submask;
3011 char *s2 = pmod->submask;
3012
3013 if (s1 == RESAMPLED && s2 == RESAMPLED) {
3014 return dset->n == pmod->full_n &&
3015 dset->rseed == pmod->smpl.rseed;
3016 } else if (s1 == RESAMPLED || s2 == RESAMPLED) {
3017 return 0;
3018 } else {
3019 return submask_cmp(s1, s2) == 0;
3020 }
3021 }
3022
3023 /* check the subsample mask from a model against @dset to
3024 see if it may have been estimated on a different
3025 (subsampled) data set from the current one
3026 */
3027
model_sample_problem(const MODEL * pmod,const DATASET * dset)3028 int model_sample_problem (const MODEL *pmod, const DATASET *dset)
3029 {
3030 int ret = 1;
3031
3032 if (pmod->ci == PANEL && (pmod->opt & OPT_B)) {
3033 ; /* special case: panel "between" model */
3034 } else if (pmod->submask == NULL) {
3035 /* the model has no sub-sampling info recorded */
3036 if (dset->submask == NULL) {
3037 /* data set is not sub-sampled either, OK */
3038 ret = 0;
3039 } else {
3040 fputs("dataset is subsampled, model is not\n", stderr);
3041 gretl_errmsg_set(_("dataset is subsampled, model is not\n"));
3042 ret = 1;
3043 }
3044 } else {
3045 /* the model does have sub-sampling info recorded */
3046 if (dset->submask == NULL) {
3047 fputs("model is subsampled, dataset is not\n", stderr);
3048 gretl_errmsg_set(_("model is subsampled, dataset is not\n"));
3049 ret = 1;
3050 } else if (submasks_match(dset, pmod)) {
3051 /* the subsamples (model and current data set) agree, OK */
3052 ret = 0;
3053 } else {
3054 /* the subsamples differ */
3055 gretl_errmsg_set(_("model and dataset subsamples not the same\n"));
3056 ret = 1;
3057 }
3058 }
3059
3060 return ret;
3061 }
3062
fcast_not_feasible(const MODEL * pmod,const DATASET * dset)3063 int fcast_not_feasible (const MODEL *pmod, const DATASET *dset)
3064 {
3065 int ret = E_DATA;
3066
3067 if (pmod->ci == PANEL && (pmod->opt & OPT_B)) {
3068 ; /* special case: panel "between" model */
3069 } else if (pmod->submask == NULL) {
3070 /* @pmod was estimated on the full dataset */
3071 if (dset->submask == NULL) {
3072 /* @dset not sub-sampled either, OK */
3073 ret = 0;
3074 } else if (dataset_is_cross_section(dset)) {
3075 /* @dset is cross-sectional subset, OK? */
3076 ret = 0;
3077 } else {
3078 gretl_errmsg_set(_("dataset is subsampled, model is not\n"));
3079 }
3080 } else {
3081 /* the model does have sub-sampling info recorded */
3082 if (dset->submask == NULL) {
3083 /* the dataset is not sub-sampled */
3084 if (dataset_is_cross_section(dset)) {
3085 ret = 0; /* OK? */
3086 } else if (is_panel_time_sample(pmod->submask, dset)) {
3087 ret = 0; /* OK? */
3088 } else {
3089 gretl_errmsg_set(_("model is subsampled, dataset is not\n"));
3090 }
3091 } else if (submasks_match(dset, pmod)) {
3092 /* the subsamples (model and current data set) agree, OK */
3093 ret = 0;
3094 } else {
3095 /* the subsamples differ */
3096 if (dataset_is_cross_section(fullset) &&
3097 dataset_is_cross_section(dset)) {
3098 ret = 0; /* OK ? */
3099 } else if (is_panel_time_sample(pmod->submask, fullset) &&
3100 is_panel_time_sample(dset->submask, fullset)) {
3101 /* we should be able to make this work */
3102 ret = 0; /* new 2019-03-23 */
3103 } else {
3104 gretl_errmsg_set(_("model and dataset subsamples not the same\n"));
3105 }
3106 }
3107 }
3108
3109 return ret;
3110 }
3111
same_dataset(const MODEL * pmod,const DATASET * dset)3112 int same_dataset (const MODEL *pmod, const DATASET *dset)
3113 {
3114 if (pmod->submask == NULL && dset->submask == NULL) {
3115 return 1;
3116 } else if (pmod->submask != NULL && dset->submask != NULL) {
3117 return submasks_match(dset, pmod);
3118 } else {
3119 return 0;
3120 }
3121 }
3122
dataset_type_string(char * str,const DATASET * dset)3123 static void dataset_type_string (char *str, const DATASET *dset)
3124 {
3125 if (dataset_is_time_series(dset)) {
3126 strcpy(str, _("time series"));
3127 } else if (dataset_is_panel(dset)) {
3128 strcpy(str, _("panel"));
3129 } else {
3130 strcpy(str, _("undated"));
3131 }
3132 }
3133
pd_string(char * str,const DATASET * dset)3134 static void pd_string (char *str, const DATASET *dset)
3135 {
3136 if (custom_time_series(dset)) {
3137 strcpy(str, _("special"));
3138 } else {
3139 switch (dset->pd) {
3140 case 1:
3141 strcpy(str, _("annual")); break;
3142 case 4:
3143 strcpy(str, _("quarterly")); break;
3144 case 12:
3145 strcpy(str, _("monthly")); break;
3146 case 24:
3147 strcpy(str, _("hourly")); break;
3148 case 52:
3149 strcpy(str, _("weekly")); break;
3150 case 5:
3151 strcpy(str, _("daily (5 days)")); break;
3152 case 6:
3153 strcpy(str, _("daily (6 days)")); break;
3154 case 7:
3155 strcpy(str, _("daily (7 days)")); break;
3156 case 10:
3157 strcpy(str, _("decennial")); break;
3158 default:
3159 strcpy(str, _("unknown")); break;
3160 }
3161 }
3162 }
3163
print_sample_obs(const DATASET * dset,PRN * prn)3164 void print_sample_obs (const DATASET *dset, PRN *prn)
3165 {
3166 char d1[OBSLEN], d2[OBSLEN];
3167
3168 ntolabel(d1, dset->t1, dset);
3169 ntolabel(d2, dset->t2, dset);
3170
3171 pprintf(prn, "%s: %s - %s", _("Current sample"), d1, d2);
3172 pprintf(prn, " (n = %d)\n", dset->t2 - dset->t1 + 1);
3173 }
3174
print_sample_status(const DATASET * dset,PRN * prn)3175 void print_sample_status (const DATASET *dset, PRN *prn)
3176 {
3177 char tmp[128];
3178
3179 if (complex_subsampled()) {
3180 pprintf(prn, "%s\n", _("Full dataset"));
3181 dataset_type_string(tmp, fullset);
3182 pprintf(prn, "%s: %s\n", _("Type"), tmp);
3183 if (dataset_is_time_series(fullset)) {
3184 pd_string(tmp, fullset);
3185 pprintf(prn, "%s: %s\n", _("Frequency"), tmp);
3186 } else if (dataset_is_panel(fullset)) {
3187 int nu = fullset->n / fullset->pd;
3188
3189 pprintf(prn, "%s: %d\n", _("Number of cross-sectional units"), nu);
3190 pprintf(prn, "%s: %d\n", _("Number of time periods"), fullset->pd);
3191 }
3192 pprintf(prn, "%s: %s - %s (n = %d)\n", _("Range"),
3193 fullset->stobs, fullset->endobs, fullset->n);
3194
3195 pprintf(prn, "\n%s\n", _("Subsampled data"));
3196 if (dset->restriction != NULL) {
3197 pprintf(prn, "%s: %s\n", _("restriction"), dset->restriction);
3198 }
3199 } else {
3200 pprintf(prn, "%s\n", _("Full dataset"));
3201 }
3202
3203 dataset_type_string(tmp, dset);
3204 pprintf(prn, "%s: %s\n", _("Type"), tmp);
3205 if (dataset_is_time_series(dset)) {
3206 pd_string(tmp, dset);
3207 pprintf(prn, "%s: %s\n", _("Frequency"), tmp);
3208 } else if (dataset_is_panel(dset)) {
3209 int nu = dset->n / dset->pd;
3210
3211 pprintf(prn, "%s: %d\n", _("Number of cross-sectional units"), nu);
3212 pprintf(prn, "%s: %d\n", _("Number of time periods"), dset->pd);
3213 }
3214 if (dset->t1 == 0 && dset->t2 == dset->n - 1) {
3215 pprintf(prn, "%s: %s - %s (n = %d)\n", _("Range"),
3216 dset->stobs, dset->endobs, dset->n);
3217 } else {
3218 pprintf(prn, "%s: %s - %s (n = %d)\n", _("Range"),
3219 dset->stobs, dset->endobs, dset->n);
3220 print_sample_obs(dset, prn);
3221 if (dset->restriction != NULL) {
3222 pprintf(prn, "(%s: %s)\n", _("restriction"), dset->restriction);
3223 }
3224 }
3225 }
3226
3227 /**
3228 * data_report:
3229 * @dset: data information struct.
3230 * @fname: filename for current datafile.
3231 * @prn: gretl printing struct.
3232 *
3233 * Write out a summary of the content of the current data set.
3234 *
3235 * Returns: 0 on successful completion, non-zero on error.
3236 *
3237 */
3238
data_report(const DATASET * dset,const char * fname,PRN * prn)3239 int data_report (const DATASET *dset, const char *fname, PRN *prn)
3240 {
3241 char startdate[OBSLEN], enddate[OBSLEN], tmp[MAXLEN];
3242 char tstr[48];
3243 const char *vlabel;
3244 int i;
3245
3246 ntolabel(startdate, 0, dset);
3247 ntolabel(enddate, dset->n - 1, dset);
3248
3249 sprintf(tmp, _("Data file %s\nas of"),
3250 (*fname != '\0')? fname : _("(unsaved)"));
3251
3252 print_time(tstr);
3253 pprintf(prn, "%s %s\n\n", tmp, tstr);
3254
3255 if (dset->descrip != NULL && *dset->descrip != '\0') {
3256 pprintf(prn, "%s:\n\n", _("Description"));
3257 pputs(prn, dset->descrip);
3258 pputs(prn, "\n\n");
3259 }
3260
3261 dataset_type_string(tmp, dset);
3262 pprintf(prn, "%s: %s\n", _("Type of data"), tmp);
3263
3264 if (dataset_is_time_series(dset)) {
3265 pd_string(tmp, dset);
3266 pprintf(prn, "%s: %s\n", _("Frequency"), tmp);
3267 }
3268
3269 pprintf(prn, "%s: %s - %s (n = %d)\n\n", _("Range"),
3270 startdate, enddate, dset->n);
3271
3272 pprintf(prn, "%s:\n\n", _("Listing of variables"));
3273
3274 for (i=1; i<dset->v; i++) {
3275 vlabel = series_get_label(dset, i);
3276 pprintf(prn, "%*s %s\n", VNAMELEN, dset->varname[i],
3277 vlabel == NULL ? "" : vlabel);
3278 }
3279
3280 return 0;
3281 }
3282