1 /*
2 * gretl -- Gnu Regression, Econometrics and Time-series Library
3 * Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 */
19
20 #include "libgretl.h"
21 #include "gretl_string_table.h"
22 #include "libset.h"
23 #include "gretl_xml.h"
24 #include "gretl_midas.h"
25 #include "uservar.h"
26 #include "csvdata.h"
27 #include "join_priv.h"
28 #include "gretl_join.h"
29
30 #ifdef WIN32
31 # include "gretl_win32.h" /* for strptime() */
32 #endif
33
34 #define AGGDEBUG 0 /* aggregation in "join" */
35 #define TDEBUG 0 /* handling of time keys in "join" */
36 #define JDEBUG 0 /* joining in general */
37
38 enum {
39 JOIN_KEY,
40 JOIN_F1,
41 JOIN_F2,
42 JOIN_F3,
43 JOIN_KEY2,
44 JOIN_AUX,
45 JOIN_TARG
46 };
47
48 enum {
49 TCONV_FMT = 0,
50 TKEY_FMT = 1
51 };
52
53 typedef double keynum;
54
55 struct jr_row_ {
56 int n_keys; /* number of keys (needed for qsort callback) */
57 keynum keyval; /* primary key value */
58 keynum keyval2; /* secondary key value, if applicable */
59 int micro; /* high-frequency "key", if any */
60 int dset_row; /* associated row in the RHS or outer dataset */
61 double aux; /* auxiliary value */
62 };
63
64 typedef struct jr_row_ jr_row;
65
66 struct obskey_ {
67 char *timefmt; /* time format, as in strptime */
68 int keycol; /* the column holding the outer time-key */
69 int m_means_q; /* "monthly means quarterly" */
70 int numdates; /* flag for conversion from numeric to string */
71 int native; /* native time-series info */
72 };
73
74 typedef struct obskey_ obskey;
75
76 struct joiner_ {
77 int n_rows; /* number of rows in data table */
78 int n_keys; /* number of keys used (0, 1 or 2) */
79 int n_unique; /* number of unique primary key values on right */
80 jr_row *rows; /* array of table rows */
81 keynum *keys; /* array of unique (primary) key values as 64-bit ints */
82 int *key_freq; /* counts of occurrences of (primary) key values */
83 int *key_row; /* record of starting row in joiner table for primary keys */
84 int *str_keys; /* flags for string comparison of key(s) */
85 const int *l_keyno; /* list of key columns in left-hand dataset */
86 const int *r_keyno; /* list of key columns in right-hand dataset */
87 AggrType aggr; /* aggregation method for 1:n joining */
88 int seqval; /* sequence number for aggregation */
89 int auxcol; /* auxiliary data column for aggregation */
90 int midas_m; /* midas frequency ratio */
91 int midas_pd; /* frequency of outer dataset */
92 obskey *auto_keys; /* struct to hold info on obs-based key(s) */
93 DATASET *l_dset; /* the left-hand or inner dataset */
94 DATASET *r_dset; /* the right-hand or outer temporary dataset */
95 };
96
97 typedef struct joiner_ joiner;
98
99 struct jr_filter_ {
100 const char *expr; /* expression to be run through "genr" */
101 const double *val; /* (series) result of evaluating @expr */
102 char *vname1; /* first right-hand variable name */
103 char *vname2; /* second right-hand variable name */
104 char *vname3; /* third right-hand variable name */
105 };
106
107 typedef struct jr_filter_ jr_filter;
108
109 struct time_mapper {
110 int ncols; /* number of "timeconv" columns */
111 char **colnames; /* array of outer-dataset column names */
112 char *tname; /* the name of the "tkey", if among colnames, or NULL */
113 char **fmt; /* array of up to two time-format strings, or NULL */
114 char m_means_q[2]; /* array of "monthly means quarterly" flags */
115 };
116
117 struct jr_matcher_ {
118 keynum *k1; /* first key value, per observation */
119 keynum *k2; /* second key value per obs, or NULL */
120 int *pos; /* position of match in outer key array */
121 };
122
123 typedef struct jr_matcher_ jr_matcher;
124
125 #define KEYMISS -999
126
127 #define is_wildstr(s) (strchr(s, '*') || strchr(s, '?'))
128
129 /* file-scope global */
130 struct time_mapper tconv_map;
131
132 static int expand_jspec (joinspec *jspec, int addvars);
133
jr_filter_destroy(jr_filter * f)134 static void jr_filter_destroy (jr_filter *f)
135 {
136 if (f != NULL) {
137 free(f->vname1);
138 free(f->vname2);
139 free(f->vname3);
140 free(f);
141 }
142 }
143
joiner_destroy(joiner * jr)144 static void joiner_destroy (joiner *jr)
145 {
146 if (jr != NULL) {
147 free(jr->rows);
148 free(jr->keys);
149 free(jr->key_freq);
150 free(jr->key_row);
151 free(jr);
152 }
153 }
154
joiner_new(int nrows)155 static joiner *joiner_new (int nrows)
156 {
157 joiner *jr = malloc(sizeof *jr);
158
159 if (jr != NULL) {
160 jr->rows = calloc(nrows, sizeof *jr->rows);
161 if (jr->rows == NULL) {
162 free(jr);
163 jr = NULL;
164 }
165 }
166
167 if (jr != NULL) {
168 jr->n_rows = nrows;
169 jr->n_unique = 0;
170 jr->keys = NULL;
171 jr->key_freq = NULL;
172 jr->key_row = NULL;
173 jr->l_keyno = NULL;
174 jr->r_keyno = NULL;
175 }
176
177 return jr;
178 }
179
real_set_outer_auto_keys(joiner * jr,const char * s,int j,struct tm * tp)180 static int real_set_outer_auto_keys (joiner *jr, const char *s,
181 int j, struct tm *tp)
182 {
183 DATASET *l_dset = jr->l_dset;
184 int err = 0;
185
186 if (calendar_data(l_dset)) {
187 int y, m, d, eday;
188
189 y = tp->tm_year + 1900;
190 m = tp->tm_mon + 1;
191 d = tp->tm_mday;
192 eday = epoch_day_from_ymd(y, m, d);
193 if (eday < 0) {
194 if (s != NULL) {
195 gretl_errmsg_sprintf("'%s' is not a valid date", s);
196 }
197 err = E_DATA;
198 } else if (jr->n_keys == 2) {
199 /* use year and month */
200 jr->rows[j].n_keys = 2;
201 jr->rows[j].keyval = y;
202 jr->rows[j].keyval2 = m;
203 jr->rows[j].micro = 0;
204 } else {
205 /* use epoch day */
206 jr->rows[j].n_keys = 1;
207 jr->rows[j].keyval = eday;
208 jr->rows[j].keyval2 = 0;
209 jr->rows[j].micro = 0;
210 }
211 } else {
212 int lpd = l_dset->pd;
213 int major = tp->tm_year + 1900;
214 int minor = tp->tm_mon + 1;
215 int micro = 0;
216
217 if (jr->auto_keys->m_means_q) {
218 /* using the gretl-specific "%q" conversion */
219 if (minor > 4) {
220 gretl_errmsg_sprintf("'%s' is not a valid date", s);
221 err = E_DATA;
222 }
223 } else if (lpd == 4) {
224 /* map from month on right to quarter on left, but
225 preserve the month info in case we need it
226 */
227 micro = minor;
228 minor = (int) ceil(minor / 3.0);
229 }
230 if (!err && micro == 0) {
231 micro = tp->tm_mday;
232 }
233 if (!err) {
234 jr->rows[j].n_keys = 2;
235 jr->rows[j].keyval = major;
236 jr->rows[j].keyval2 = minor;
237 jr->rows[j].micro = micro;
238 }
239 }
240
241 return err;
242 }
243
244 /* Handle the case where the user gave a "%q" or "%Q" conversion
245 specifier (which we take to mean quarter). We convert this to %m
246 for use with strptime(), but record that the fact that "month means
247 quarter".
248 */
249
format_uses_quarterly(char * fmt)250 static int format_uses_quarterly (char *fmt)
251 {
252 char *s = fmt;
253 int i, ret = 0;
254
255 for (i=0; s[i]; i++) {
256 if (s[i] == '%' &&
257 (s[i+1] == 'q' || s[i+1] == 'Q') &&
258 (i == 0 || s[i-1] != '%')) {
259 s[i+1] = 'm';
260 ret = 1;
261 }
262 }
263
264 return ret;
265 }
266
timeconv_map_set(int ncols,char ** colnames,char * tname,char ** fmt)267 static void timeconv_map_set (int ncols, char **colnames,
268 char *tname, char **fmt)
269 {
270 tconv_map.ncols = ncols;
271 tconv_map.colnames = colnames;
272 tconv_map.tname = tname;
273 tconv_map.fmt = fmt;
274
275 if (fmt != NULL) {
276 if (fmt[TCONV_FMT] != NULL) {
277 tconv_map.m_means_q[TCONV_FMT] =
278 format_uses_quarterly(fmt[TCONV_FMT]);
279 }
280 if (fmt[TKEY_FMT] != NULL) {
281 tconv_map.m_means_q[TKEY_FMT] =
282 format_uses_quarterly(fmt[TKEY_FMT]);
283 }
284 }
285 }
286
timeconv_map_init(void)287 static void timeconv_map_init (void)
288 {
289 timeconv_map_set(0, NULL, NULL, NULL);
290 }
291
timeconv_map_destroy(void)292 static void timeconv_map_destroy (void)
293 {
294 if (tconv_map.colnames != NULL) {
295 strings_array_free(tconv_map.colnames, tconv_map.ncols);
296 }
297 if (tconv_map.fmt != NULL) {
298 strings_array_free(tconv_map.fmt, 2);
299 }
300 timeconv_map_init();
301 }
302
set_time_format(obskey * auto_keys,const char * fmt)303 static int set_time_format (obskey *auto_keys, const char *fmt)
304 {
305 if (auto_keys->timefmt != NULL) {
306 free(auto_keys->timefmt);
307 }
308 auto_keys->timefmt = gretl_strdup(fmt);
309 return auto_keys->timefmt == NULL ? E_ALLOC : 0;
310 }
311
312 /* convert a numerical value to string for use with strptime */
313
numdate_to_string(char * targ,double x)314 static int numdate_to_string (char *targ, double x)
315 {
316 if (na(x)) {
317 return E_MISSDATA;
318 } else {
319 sprintf(targ, "%.16g", x);
320 return 0;
321 }
322 }
323
324 /* Parse a string from row @i of the outer dataset and set the
325 key(s) on row @j of the joiner struct. The indices @i and @j may
326 not be equal if a filter is being used. Note: we don't come
327 here if the outer time-key column is subject to "tconvert"
328 treatment; in that case we use read_iso_basic instead.
329 */
330
read_outer_auto_keys(joiner * jr,int j,int i)331 static int read_outer_auto_keys (joiner *jr, int j, int i)
332 {
333 char *tfmt = jr->auto_keys->timefmt;
334 int numdates = jr->auto_keys->numdates;
335 int tcol = jr->auto_keys->keycol;
336 int pd = jr->l_dset->pd;
337 struct tm t = {0};
338 const char *s = NULL;
339 char *test = NULL;
340 char sconv[32];
341 int s_src = 0;
342 int err = 0;
343
344 if (tcol >= 0) {
345 /* using a specified column */
346 if (numdates) {
347 /* column is numeric, conversion needed */
348 numdate_to_string(sconv, jr->r_dset->Z[tcol][i]);
349 s = sconv;
350 s_src = 1;
351 } else {
352 /* column is string-valued, OK */
353 s = series_get_string_for_obs(jr->r_dset, tcol, i);
354 s_src = 2;
355 }
356 } else if (jr->auto_keys->native) {
357 /* using native time-series info on right */
358 ntolabel_8601(sconv, i, jr->r_dset);
359 s = sconv;
360 s_src = 1;
361 } else {
362 /* using first-column observation strings */
363 s = jr->r_dset->S[i];
364 s_src = 3;
365 }
366
367 if (s != NULL) {
368 /* note: with strptime, a NULL return means that an error
369 occurred while a non-NULL and non-empty return string
370 means a trailing portion of the input was not
371 processed.
372 */
373 test = strptime(s, tfmt, &t);
374 }
375
376 if (test == NULL || *test != '\0') {
377 err = E_DATA;
378 if (j == 0 && test != NULL && (pd == 12 || pd == 4 || pd == 1)) {
379 /* If we're looking at the first row of the filtered data,
380 allow for the possibility that we got "excess
381 precision", i.e. a daily date string when the left-hand
382 dataset is monthly, quarterly or annual.
383 */
384 char *chk = strptime(s, "%Y-%m-%d", &t);
385
386 if (chk != NULL && *chk == '\0') {
387 set_time_format(jr->auto_keys, "%Y-%m-%d");
388 err = 0; /* we might be OK, cancel the error for now */
389 }
390 }
391 if (err) {
392 gretl_errmsg_sprintf("'%s' does not match the format '%s'", s, tfmt);
393 fprintf(stderr, "time-format match error in read_outer_auto_keys:\n"
394 " remainder = '%s' (source = %s)\n", test ? test : "null",
395 s_src < 3 ? "specified time column" : "first-column strings");
396 }
397 }
398
399 if (!err) {
400 err = real_set_outer_auto_keys(jr, s, j, &t);
401 }
402
403 return err;
404 }
405
read_iso_basic(joiner * jr,int j,int i)406 static int read_iso_basic (joiner *jr, int j, int i)
407 {
408 int tcol = jr->auto_keys->keycol;
409 double x;
410 int err = 0;
411
412 x = jr->r_dset->Z[tcol][i];
413
414 if (na(x)) {
415 err = E_MISSDATA;
416 } else {
417 int y = (int) floor(x / 10000);
418 int m = (int) floor((x - 10000*y) / 100);
419 int d = (int) (x - 10000*y - 100*m);
420 guint32 ed = epoch_day_from_ymd(y, m, d);
421
422 if (ed <= 0) {
423 gretl_errmsg_sprintf("'%.8g' is not a valid date", x);
424 err = E_DATA;
425 } else if (calendar_data(jr->l_dset)) {
426 /* note: no need to go via struct tm */
427 jr->rows[j].n_keys = 1;
428 jr->rows[j].keyval = ed;
429 jr->rows[j].keyval2 = 0;
430 jr->rows[j].micro = 0;
431 } else {
432 struct tm t = {0};
433
434 t.tm_year = y - 1900;
435 t.tm_mon = m - 1;
436 t.tm_mday = d;
437 err = real_set_outer_auto_keys(jr, NULL, j, &t);
438 }
439 }
440
441 return err;
442 }
443
444 /* Evaluate the filter expression provided by the user, and if it
445 works OK count the number of rows on which the filter returns
446 non-zero. Flag an error if the filter gives NA on any row, since
447 it is then indeterminate.
448 */
449
evaluate_filter(jr_filter * filter,DATASET * r_dset,int * nrows)450 static int evaluate_filter (jr_filter *filter, DATASET *r_dset,
451 int *nrows)
452 {
453 char *line;
454 int i, err = 0;
455
456 line = gretl_strdup_printf("filtered__=%s", filter->expr);
457 if (line == NULL) {
458 err = E_ALLOC;
459 } else {
460 err = generate(line, r_dset, GRETL_TYPE_SERIES,
461 OPT_P | OPT_Q, NULL);
462 }
463
464 if (!err) {
465 int v = r_dset->v - 1;
466
467 filter->val = r_dset->Z[v];
468 *nrows = 0;
469
470 #if JDEBUG > 1
471 fprintf(stderr, "filter genr: '%s':\n", line);
472 for (i=0; i<r_dset->n; i++) {
473 fprintf(stderr, " %d: %g\n", i+1, filter->val[i]);
474 }
475 #endif
476 for (i=0; i<r_dset->n; i++) {
477 if (na(filter->val[i])) {
478 gretl_errmsg_sprintf("join filter: indeterminate "
479 "value on row %d", i+1);
480 err = E_MISSDATA;
481 break;
482 } else if (filter->val[i] != 0.0) {
483 *nrows += 1;
484 }
485 }
486 }
487
488 free(line);
489
490 return err;
491 }
492
dtoll(double x,int * err)493 static keynum dtoll (double x, int *err)
494 {
495 if (na(x)) {
496 *err = E_DATA;
497 return -1;
498 } else {
499 return x;
500 }
501 }
502
dtoll_full(double x,int key,int row,int * err)503 static keynum dtoll_full (double x, int key, int row, int *err)
504 {
505 if (na(x)) {
506 if (key == 2) {
507 gretl_errmsg_sprintf("%s: invalid secondary outer key value on row %d",
508 "join", row);
509 } else {
510 gretl_errmsg_sprintf("%s: invalid (primary) outer key value on row %d",
511 "join", row);
512 }
513 *err = E_DATA;
514 return -1;
515 } else {
516 return x;
517 }
518 }
519
520 /* Determine whether or not row @i of the outer data satisfies the
521 filter criterion; return 1 if the condition is met, 0 otherwise.
522 */
523
join_row_wanted(jr_filter * filter,int i)524 static int join_row_wanted (jr_filter *filter, int i)
525 {
526 int ret = filter->val[i] != 0;
527
528 #if JDEBUG > 2
529 fprintf(stderr, "join filter: %s row %d\n",
530 ret ? "keeping" : "discarding", i);
531 #endif
532
533 return ret;
534 }
535
outer_dataset(joinspec * jspec)536 static DATASET *outer_dataset (joinspec *jspec)
537 {
538 if (jspec->c != NULL) {
539 return csvdata_get_dataset(jspec->c);
540 } else {
541 return jspec->dset;
542 }
543 }
544
column_is_timecol(const char * colname)545 static int column_is_timecol (const char *colname)
546 {
547 int i, n = tconv_map.ncols;
548
549 for (i=0; i<n; i++) {
550 if (!strcmp(colname, tconv_map.colnames[i])) {
551 return 1;
552 }
553 }
554
555 return 0;
556 }
557
558 #define using_auto_keys(j) (j->auto_keys->timefmt != NULL)
559
build_joiner(joinspec * jspec,DATASET * l_dset,jr_filter * filter,AggrType aggr,int seqval,obskey * auto_keys,int n_keys,int * err)560 static joiner *build_joiner (joinspec *jspec,
561 DATASET *l_dset,
562 jr_filter *filter,
563 AggrType aggr,
564 int seqval,
565 obskey *auto_keys,
566 int n_keys,
567 int *err)
568 {
569 joiner *jr = NULL;
570 DATASET *r_dset = outer_dataset(jspec);
571 int keycol = jspec->colnums[JOIN_KEY];
572 int valcol = jspec->colnums[JOIN_TARG];
573 int key2col = jspec->colnums[JOIN_KEY2];
574 int auxcol = jspec->colnums[JOIN_AUX];
575 int i, nrows = r_dset->n;
576
577 #if JDEBUG
578 fprintf(stderr, "joiner column numbers:\n"
579 "KEY %d, VAL %d, F1 %d, F2 %d, F3 %d, KEY2 %d, AUX %d\n",
580 keycol, valcol, jspec->colnums[JOIN_F1],
581 jspec->colnums[JOIN_F2], jspec->colnums[JOIN_F3],
582 key2col, auxcol);
583 #endif
584
585 if (filter != NULL) {
586 *err = evaluate_filter(filter, r_dset, &nrows);
587 if (*err) {
588 return NULL;
589 } else if (nrows == 0) {
590 gretl_warnmsg_set(_("No matching data after filtering"));
591 return NULL;
592 }
593 }
594
595 #if JDEBUG
596 fprintf(stderr, "after filtering: dset->n = %d, nrows = %d\n",
597 r_dset->n, nrows);
598 #endif
599
600 jr = joiner_new(nrows);
601
602 if (jr == NULL) {
603 *err = E_ALLOC;
604 } else {
605 double **Z = r_dset->Z;
606 int use_iso_basic = 0;
607 int j = 0;
608
609 jr->aggr = aggr;
610 jr->seqval = seqval;
611 jr->auxcol = auxcol;
612 jr->l_dset = l_dset;
613 jr->r_dset = r_dset;
614 jr->auto_keys = auto_keys;
615 jr->n_keys = n_keys;
616 jr->midas_m = 0;
617
618 if (using_auto_keys(jr)) {
619 /* check for the case where the outer time-key
620 column is in the "tconvert" set: if so we
621 know it will be in YYYYMMDD format and we'll
622 give it special treatment
623 */
624 int tcol = jr->auto_keys->keycol;
625
626 if (tcol > 0 && jr->auto_keys->numdates) {
627 if (column_is_timecol(jr->r_dset->varname[tcol])) {
628 use_iso_basic = 1;
629 }
630 }
631 }
632
633 #if JDEBUG
634 fprintf(stderr, "use_iso_basic = %d\n", use_iso_basic);
635 #endif
636
637 /* Now transcribe the data we want: we're pulling from the
638 full outer dataset and writing into the array of joiner row
639 structs. At this point we're applying the join filter (if
640 any) but are not doing any matching by key to the inner
641 dataset.
642 */
643
644 for (i=0; i<r_dset->n && !*err; i++) {
645 if (filter != NULL && !join_row_wanted(filter, i)) {
646 continue;
647 }
648 /* the keys */
649 if (use_iso_basic) {
650 *err = read_iso_basic(jr, j, i);
651 } else if (using_auto_keys(jr)) {
652 *err = read_outer_auto_keys(jr, j, i);
653 } else if (keycol > 0) {
654 jr->rows[j].keyval = dtoll_full(Z[keycol][i], 1, i+1, err);
655 if (!*err && key2col > 0) {
656 /* double key */
657 jr->rows[j].n_keys = 2;
658 jr->rows[j].keyval2 = dtoll_full(Z[key2col][i], 2, i+1, err);
659 } else {
660 /* single key */
661 jr->rows[j].n_keys = 1;
662 jr->rows[j].keyval2 = 0;
663 }
664 } else {
665 /* no keys have been specified */
666 jr->rows[j].n_keys = 0;
667 jr->rows[j].keyval = 0;
668 jr->rows[j].keyval2 = 0;
669 }
670 /* "payload" data: record the dataset row */
671 jr->rows[j].dset_row = valcol > 0 ? i : -1;
672 /* the auxiliary data */
673 jr->rows[j].aux = auxcol > 0 ? Z[auxcol][i] : 0;
674 j++;
675 }
676 }
677
678 return jr;
679 }
680
681 /* qsort callback for sorting rows of the joiner struct */
682
compare_jr_rows(const void * a,const void * b)683 static int compare_jr_rows (const void *a, const void *b)
684 {
685 const jr_row *ra = a;
686 const jr_row *rb = b;
687 int ret;
688
689 ret = (ra->keyval > rb->keyval) - (ra->keyval < rb->keyval);
690
691 if (ret == 0 && ra->n_keys > 1) {
692 ret = (ra->keyval2 > rb->keyval2) - (ra->keyval2 < rb->keyval2);
693 }
694
695 if (ret == 0) {
696 /* ensure stable sort */
697 ret = a - b > 0 ? 1 : -1;
698 }
699
700 return ret;
701 }
702
703 /* Sort the rows of the joiner struct, by either one or two keys, then
704 figure out how many unique (primary) key values we have and
705 construct (a) an array of frequency of occurrence of these values
706 and (b) an array which records the first row of the joiner on
707 which each of these values is found.
708 */
709
joiner_sort(joiner * jr)710 static int joiner_sort (joiner *jr)
711 {
712 int matches = jr->n_rows;
713 int i, err = 0;
714
715 /* If there are string keys, we begin by mapping from the string
716 indices on the right -- held in the keyval and/or keyval2
717 members of the each joiner row -- to the indices for the same
718 strings on the left. This enables us to avoid doing string
719 comparisons when running aggr_value() later; we can just
720 compare the indices of the strings. In addition, if on any
721 given row we get no match for the right-hand key string on the
722 left (signalled by a strmap value of -1) we can exploit this
723 information by shuffling such rows to the end of the joiner
724 rectangle and ignoring them when aggregating.
725 */
726
727 if (jr->str_keys[0] || jr->str_keys[1]) {
728 series_table *stl, *str;
729 int *strmap;
730 int k, kmin, kmax, lkeyval, rkeyval;
731
732 kmin = jr->str_keys[0] ? 1 : 2;
733 kmax = jr->str_keys[1] ? 2 : 1;
734
735 for (k=kmin; k<=kmax; k++) {
736 stl = series_get_string_table(jr->l_dset, jr->l_keyno[k]);
737 str = series_get_string_table(jr->r_dset, jr->r_keyno[k]);
738 strmap = series_table_map(str, stl);
739
740 if (strmap == NULL) {
741 err = E_ALLOC;
742 break;
743 }
744
745 for (i=0; i<jr->n_rows; i++) {
746 if (k == 1) {
747 rkeyval = jr->rows[i].keyval;
748 } else if (jr->rows[i].keyval == INT_MAX) {
749 continue;
750 } else {
751 rkeyval = jr->rows[i].keyval2;
752 }
753 lkeyval = strmap[rkeyval];
754 #if JDEBUG > 1
755 fprintf(stderr, "k = %d, row %d, keyval: %d -> %d\n", k, i, rkeyval, lkeyval);
756 #endif
757 if (lkeyval > 0) {
758 if (k == 1) {
759 jr->rows[i].keyval = lkeyval;
760 } else {
761 jr->rows[i].keyval2 = lkeyval;
762 }
763 } else {
764 /* arrange for qsort to move row to end */
765 jr->rows[i].keyval = G_MAXDOUBLE;
766 matches--;
767 }
768 }
769
770 free(strmap);
771 }
772 }
773
774 if (err) {
775 return err;
776 }
777
778 qsort(jr->rows, jr->n_rows, sizeof *jr->rows, compare_jr_rows);
779
780 if (matches < jr->n_rows) {
781 jr->n_rows = matches;
782 }
783
784 jr->n_unique = 1;
785 for (i=1; i<jr->n_rows; i++) {
786 if (jr->rows[i].keyval != jr->rows[i-1].keyval) {
787 jr->n_unique += 1;
788 }
789 }
790
791 jr->keys = malloc(jr->n_unique * sizeof *jr->keys);
792 jr->key_freq = malloc(jr->n_unique * sizeof *jr->key_freq);
793 jr->key_row = malloc(jr->n_unique * sizeof *jr->key_row);
794
795 if (jr->keys == NULL || jr->key_freq == NULL || jr->key_row == NULL) {
796 err = E_ALLOC;
797 } else {
798 int j = 0, nj = 1;
799
800 for (i=0; i<jr->n_unique; i++) {
801 jr->key_freq[i] = 0;
802 }
803
804 jr->keys[0] = jr->rows[0].keyval;
805 jr->key_row[0] = 0;
806
807 for (i=1; i<jr->n_rows; i++) {
808 if (jr->rows[i].keyval != jr->rows[i-1].keyval) {
809 /* finalize info for key j */
810 jr->keys[j] = jr->rows[i-1].keyval;
811 jr->key_freq[j] = nj;
812 /* and initialize for next key */
813 nj = 1;
814 if (j < jr->n_unique - 1) {
815 jr->key_row[j+1] = i;
816 }
817 j++;
818 } else {
819 nj++;
820 }
821 }
822
823 /* make sure the last row is right */
824 jr->keys[j] = jr->rows[i-1].keyval;
825 jr->key_freq[j] = nj;
826 }
827
828 return err;
829 }
830
831 #if JDEBUG > 1
832
joiner_print(joiner * jr)833 static void joiner_print (joiner *jr)
834 {
835 char **labels = NULL;
836 jr_row *row;
837 int i;
838
839 if (jr->str_keys[0]) {
840 labels = series_get_string_vals(jr->l_dset, jr->l_keyno[1], NULL, 0);
841 }
842
843 fprintf(stderr, "\njoiner: n_rows = %d\n", jr->n_rows);
844 for (i=0; i<jr->n_rows; i++) {
845 row = &jr->rows[i];
846 if (row->n_keys > 1) {
847 fprintf(stderr, " row %d: keyvals=(%g,%g)\n",
848 i, row->keyval, row->keyval2);
849 } else {
850 int k = lrint(row->keyval) - 1;
851
852 if (jr->str_keys[0] && row->keyval >= 0) {
853 fprintf(stderr, " row %d: keyval=%g (%s)\n",
854 i, row->keyval, labels[k]);
855 } else {
856 fprintf(stderr, " row %d: keyval=%g\n",
857 i, row->keyval);
858 }
859 }
860 }
861
862 if (jr->keys != NULL) {
863 fprintf(stderr, " for primary key: n_unique = %d\n", jr->n_unique);
864 for (i=0; i<jr->n_unique; i++) {
865 fprintf(stderr," key value %g: count = %d\n",
866 jr->keys[i], jr->key_freq[i]);
867 }
868 }
869 }
870
871 # if JDEBUG > 2
print_outer_dataset(const DATASET * dset,const char * fname)872 static void print_outer_dataset (const DATASET *dset, const char *fname)
873 {
874 PRN *prn = gretl_print_new(GRETL_PRINT_STDERR, NULL);
875
876 pprintf(prn, "Data extracted from %s:\n", fname);
877 printdata(NULL, NULL, dset, OPT_O, prn);
878 gretl_print_destroy(prn);
879 }
880 # endif
881
882 #endif /* JDEBUG */
883
seqval_out_of_bounds(joiner * jr,int seqmax)884 static int seqval_out_of_bounds (joiner *jr, int seqmax)
885 {
886 if (jr->seqval < 0) {
887 /* counting down from last match */
888 return -jr->seqval > seqmax;
889 } else {
890 /* counting up from first match */
891 return jr->seqval > seqmax;
892 }
893 }
894
895 /* Do a binary search for the left-hand key value @targ in the sorted
896 array of unique right-hand key values, @vals; return the position
897 among @vals at which @targ matches, or -1 for no match.
898 */
899
binsearch(keynum targ,const keynum * vals,int n,int offset)900 static int binsearch (keynum targ, const keynum *vals, int n, int offset)
901 {
902 int m = n/2;
903
904 if (fabs((targ) - (vals[m])) < 1.0e-7) {
905 return m + offset;
906 } else if (targ < vals[0] || targ > vals[n-1]) {
907 return -1;
908 } else if (targ < vals[m]) {
909 return binsearch(targ, vals, m, offset);
910 } else {
911 return binsearch(targ, vals + m, n - m, offset + m);
912 }
913 }
914
915 /* In some cases we can figure out what aggr_value() should return
916 just based on the number of matches, @n, and the characteristics
917 of the joiner. If so, write the value into @x and return 1; if
918 not, return 0.
919 */
920
aggr_val_determined(joiner * jr,int n,double * x,int * err)921 static int aggr_val_determined (joiner *jr, int n, double *x, int *err)
922 {
923 if (jr->aggr == AGGR_COUNT) {
924 /* just return the number of matches */
925 *x = n;
926 return 1;
927 } else if (jr->aggr == AGGR_SEQ && seqval_out_of_bounds(jr, n)) {
928 /* out of bounds sequence index: return NA */
929 *x = NADBL;
930 return 1;
931 } else if (n > 1 && jr->aggr == AGGR_NONE) {
932 /* fail */
933 #if AGGDEBUG
934 fprintf(stderr, "aggr_val_determined(): got n=%d\n", n);
935 #endif
936 *err = E_DATA;
937 gretl_errmsg_set(_("You need to specify an aggregation "
938 "method for a 1:n join"));
939 *x = NADBL;
940 return 1;
941 } else {
942 /* not enough information so far */
943 return 0;
944 }
945 }
946
947 /* get month-day index from @dset time-series info */
948
midas_day_index(int t,DATASET * dset)949 static int midas_day_index (int t, DATASET *dset)
950 {
951 char obs[OBSLEN];
952 int y, m, d, idx = -1;
953
954 ntolabel(obs, t, dset);
955 if (sscanf(obs, YMD_READ_FMT, &y, &m, &d) == 3) {
956 idx = month_day_index(y, m, d, dset->pd);
957 }
958
959 return idx;
960 }
961
962 #define KEYMISS -999
963
matcher_get_k2(jr_matcher * matcher,int i)964 static keynum matcher_get_k2 (jr_matcher *matcher, int i)
965 {
966 return matcher->k2 == NULL ? 0 : matcher->k2[i];
967 }
968
matcher_set_k2(jr_matcher * matcher,int i,keynum val)969 static void matcher_set_k2 (jr_matcher *matcher, int i,
970 keynum val)
971 {
972 if (matcher->k2 != NULL) {
973 matcher->k2[i] = val;
974 }
975 }
976
977 #define midas_daily(j) (j->midas_m > 20)
978
979 #define min_max_cond(x,y,a) ((a==AGGR_MAX && x>y) || (a==AGGR_MIN && x<y))
980
981 /* aggr_value: here we're working on a given row of the left-hand
982 dataset. The values @key and (if applicable) @key2 are the
983 left-hand keys for this row. We count the key-matches on the
984 right and apply an aggregation procedure if the user specified
985 one. We return the value that should be entered for the imported
986 series on this row.
987
988 Note: @xmatch and @auxmatch are workspace arrays allocated by
989 the caller.
990 */
991
aggr_value(joiner * jr,jr_matcher * matcher,int s,int v,int revseq,double * xmatch,double * auxmatch,int * err)992 static double aggr_value (joiner *jr,
993 jr_matcher *matcher,
994 int s, int v,
995 int revseq,
996 double *xmatch,
997 double *auxmatch,
998 int *err)
999 {
1000 keynum key1 = matcher->k1[s];
1001 keynum key2 = matcher_get_k2(matcher, s);
1002 int pos = matcher->pos[s];
1003 double x, xa;
1004 int imin, imax;
1005 int i, n, ntotal;
1006
1007 #if AGGDEBUG
1008 fprintf(stderr, " key1 = %g: matched at position %d\n", key1, pos);
1009 #endif
1010
1011 /* how many matches at @pos? */
1012 n = jr->key_freq[pos];
1013
1014 #if AGGDEBUG
1015 fprintf(stderr, " number of primary matches = %d (n_keys=%d)\n",
1016 n, jr->n_keys);
1017 #endif
1018
1019 if (jr->n_keys == 1) {
1020 /* if there's just a single key, we can figure some
1021 cases out already */
1022 if (aggr_val_determined(jr, n, &x, err)) {
1023 return x;
1024 }
1025 }
1026
1027 if (jr->key_row[pos] < 0) {
1028 /* "can't happen" */
1029 return NADBL;
1030 }
1031
1032 /* set the range of rows for reading from the joiner rectangle */
1033 imin = jr->key_row[pos];
1034 imax = imin + n;
1035
1036 #if AGGDEBUG
1037 fprintf(stderr, " aggregation row range: %d to %d\n", imin+1, imax);
1038 #endif
1039
1040 if (jr->aggr == AGGR_MIDAS) {
1041 /* special case: MIDAS "spreading" */
1042 int daily = dated_daily_data(jr->r_dset);
1043 int gotit = 0;
1044
1045 x = NADBL;
1046
1047 for (i=imin; i<imax && !gotit; i++) {
1048 /* loop across primary key matches */
1049 jr_row *r = &jr->rows[i];
1050
1051 if (jr->n_keys == 1 || key2 == r->keyval2) {
1052 /* got secondary key match */
1053 int sub, t = r->dset_row;
1054 #if AGGDEBUG
1055 fprintf(stderr, " i=%d: 2-key match: %d,%d (revseq=%d)\n",
1056 i, (int) key1, (int) key2, revseq);
1057 #endif
1058 if (daily) {
1059 /* outer dataset has known daily structure */
1060 sub = midas_day_index(t, jr->r_dset);
1061 gotit = sub == revseq;
1062 } else if (midas_daily(jr) && r->micro > 0) {
1063 /* "other" daily data: r->micro holds day */
1064 sub = month_day_index((int) key1, (int) key2,
1065 r->micro, jr->midas_pd);
1066 gotit = sub == revseq;
1067 } else {
1068 if (r->micro > 0) {
1069 /* if present, this is derived from the outer
1070 time-key specification
1071 */
1072 sub = r->micro;
1073 } else {
1074 date_maj_min(t, jr->r_dset, NULL, &sub);
1075 }
1076 gotit = (sub - 1) % jr->midas_m + 1 == revseq;
1077 }
1078 if (gotit) {
1079 x = jr->r_dset->Z[v][t];
1080 }
1081 }
1082 }
1083
1084 /* and we're done */
1085 return x;
1086 }
1087
1088 /* We now fill out the array @xmatch with non-missing values
1089 from the matching outer rows. If we have a secondary key
1090 we screen for matches on that as we go.
1091 */
1092
1093 n = 0; /* will now hold count of non-NA matches */
1094 ntotal = 0; /* will ignore the OK/NA distinction */
1095
1096 for (i=imin; i<imax; i++) {
1097 jr_row *r = &jr->rows[i];
1098
1099 if (jr->n_keys == 1 || key2 == r->keyval2) {
1100 ntotal++;
1101 x = jr->r_dset->Z[v][r->dset_row];
1102 if (jr->auxcol) {
1103 xa = r->aux;
1104 if (!na(x) && na(xa)) {
1105 /* we can't know the min/max of the aux var */
1106 *err = E_MISSDATA;
1107 return NADBL;
1108 }
1109 if (!na(xa)) {
1110 auxmatch[n] = xa;
1111 xmatch[n++] = x;
1112 }
1113 } else if (!na(x)) {
1114 xmatch[n++] = x;
1115 }
1116 }
1117 }
1118
1119 if (jr->n_keys > 1) {
1120 /* we've already checked this for the 1-key case */
1121 if (aggr_val_determined(jr, n, &x, err)) {
1122 return x;
1123 }
1124 }
1125
1126 x = NADBL;
1127
1128 if (n == 0) {
1129 ; /* all matched observations are NA */
1130 } else if (jr->aggr == AGGR_NONE) {
1131 x = xmatch[0];
1132 } else if (jr->aggr == AGGR_SEQ) {
1133 int sval = jr->seqval;
1134
1135 i = sval < 0 ? n + sval : sval - 1;
1136 if (i >= 0 && i < n) {
1137 x = xmatch[i];
1138 }
1139 } else if (jr->aggr == AGGR_MAX || jr->aggr == AGGR_MIN) {
1140 if (jr->auxcol) {
1141 /* using the max/min of an auxiliary var */
1142 int idx = 0;
1143
1144 x = auxmatch[0];
1145 for (i=1; i<n; i++) {
1146 if (min_max_cond(auxmatch[i], x, jr->aggr)) {
1147 x = auxmatch[i];
1148 idx = i;
1149 }
1150 }
1151 x = xmatch[idx];
1152 } else {
1153 /* max/min of the actual data */
1154 x = xmatch[0];
1155 for (i=1; i<n; i++) {
1156 if (min_max_cond(xmatch[i], x, jr->aggr)) {
1157 x = xmatch[i];
1158 }
1159 }
1160 }
1161 } else if (jr->aggr == AGGR_SUM || jr->aggr == AGGR_AVG) {
1162 x = 0.0;
1163 for (i=0; i<n; i++) {
1164 x += xmatch[i];
1165 }
1166 if (jr->aggr == AGGR_AVG) {
1167 x /= n;
1168 }
1169 }
1170
1171 return x;
1172 }
1173
handle_midas_setup(joiner * jr,int i,int lv,int rv,int revseq)1174 static void handle_midas_setup (joiner *jr, int i, int lv, int rv,
1175 int revseq)
1176 {
1177 char label[MAXLABEL];
1178
1179 series_set_midas_period(jr->l_dset, lv, revseq);
1180 sprintf(label, "%s in sub-period %d",
1181 jr->r_dset->varname[rv], revseq);
1182 series_record_label(jr->l_dset, lv, label);
1183 series_set_midas_freq(jr->l_dset, lv, jr->r_dset->pd);
1184 if (i == 1) {
1185 series_set_midas_anchor(jr->l_dset, lv);
1186 }
1187 }
1188
1189 /* Handle the case where (a) the value from the right, @rz, is
1190 actually the coding of a string value, and (b) the LHS series is
1191 pre-existing and already has a string table attached. The RHS
1192 coding must be made consistent with that on the left. We reach this
1193 function only if we've verified that there are string tables on
1194 both sides, and that @rz is not NA.
1195 */
1196
maybe_adjust_string_code(series_table * rst,series_table * lst,double rz,int * err)1197 static double maybe_adjust_string_code (series_table *rst,
1198 series_table *lst,
1199 double rz, int *err)
1200 {
1201 const char *rstr = series_table_get_string(rst, rz);
1202 double lz = series_table_get_value(lst, rstr);
1203
1204 if (!na(lz)) {
1205 /* use the LHS encoding */
1206 rz = lz;
1207 } else {
1208 /* we need to append to the LHS string table */
1209 int n = series_table_add_string(lst, rstr);
1210
1211 if (n < 0) {
1212 *err = E_ALLOC;
1213 } else {
1214 rz = n;
1215 }
1216 }
1217
1218 return rz;
1219 }
1220
jr_matcher_free(jr_matcher * matcher)1221 static void jr_matcher_free (jr_matcher *matcher)
1222 {
1223 free(matcher->k1);
1224 free(matcher->k2);
1225 free(matcher->pos);
1226 }
1227
jr_matcher_init(jr_matcher * matcher,int nobs,int nkeys)1228 static int jr_matcher_init (jr_matcher *matcher, int nobs,
1229 int nkeys)
1230 {
1231 int i, err = 0;
1232
1233 matcher->k1 = malloc(nobs * sizeof *matcher->k1);
1234 matcher->pos = malloc(nobs * sizeof *matcher->pos);
1235 matcher->k2 = NULL;
1236
1237 if (matcher->k1 == NULL || matcher->pos == NULL) {
1238 err = E_ALLOC;
1239 } else if (nkeys == 2) {
1240 matcher->k2 = malloc(nobs * sizeof *matcher->k2);
1241 if (matcher->k2 == NULL) {
1242 err = E_ALLOC;
1243 }
1244 }
1245
1246 if (err) {
1247 jr_matcher_free(matcher);
1248 return err;
1249 }
1250
1251 for (i=0; i<nobs; i++) {
1252 matcher->k1[i] = 0;
1253 matcher->pos[i] = 0;
1254 if (matcher->k2 != NULL) {
1255 matcher->k2[i] = 0;
1256 }
1257 }
1258
1259 return 0;
1260 }
1261
get_all_inner_key_values(joiner * jr,const int * ikeyvars,jr_matcher * matcher)1262 static int get_all_inner_key_values (joiner *jr,
1263 const int *ikeyvars,
1264 jr_matcher *matcher)
1265 {
1266 DATASET *dset = jr->l_dset;
1267 int i, j, n = sample_size(dset);
1268 int err;
1269
1270 err = jr_matcher_init(matcher, n, jr->n_keys);
1271 if (err) {
1272 return err;
1273 }
1274
1275 for (i=dset->t1, j=0; i<=dset->t2 && !err; i++, j++) {
1276 if (using_auto_keys(jr)) {
1277 /* using the LHS dataset obs info */
1278 char obs[OBSLEN];
1279
1280 ntolabel(obs, i, dset);
1281 if (calendar_data(dset)) {
1282 guint32 ed = get_epoch_day(obs);
1283
1284 if (jr->n_keys == 2) {
1285 /* inner daily, outer monthly */
1286 int y, m, d;
1287
1288 ymd_bits_from_epoch_day(ed, &y, &m, &d);
1289 matcher->k1[j] = y;
1290 matcher->k2[j] = m;
1291 } else {
1292 matcher->k1[j] = ed;
1293 }
1294 } else {
1295 /* monthly or quarterly (FIXME any others?) */
1296 matcher->k1[j] = atoi(obs);
1297 matcher_set_k2(matcher, j, atoi(obs + 5));
1298 }
1299 } else {
1300 /* using regular LHS key series */
1301 double dk1, dk2 = 0;
1302 keynum k1 = 0, k2 = 0;
1303
1304 dk1 = dset->Z[ikeyvars[1]][i];
1305 if (jr->n_keys == 2) {
1306 dk2 = dset->Z[ikeyvars[2]][i];
1307 }
1308 if (na(dk1) || na(dk2)) {
1309 matcher->pos[j] = KEYMISS;
1310 } else {
1311 k1 = dtoll(dk1, &err);
1312 if (!err && jr->n_keys == 2) {
1313 k2 = dtoll(dk2, &err);
1314 }
1315 }
1316 if (!err && matcher->pos[j] != KEYMISS) {
1317 matcher->k1[j] = k1;
1318 matcher_set_k2(matcher, j, k2);
1319 }
1320 }
1321
1322 if (!err && matcher->pos[j] != KEYMISS) {
1323 /* look up position in outer keys array */
1324 matcher->pos[j] = binsearch(matcher->k1[j], jr->keys,
1325 jr->n_unique, 0);
1326 }
1327 }
1328
1329 return err;
1330 }
1331
1332 /* Returns the 0-based column index in the outer dataset associated
1333 with the i^th target variable for joining. Note that the arg @i is
1334 1-based, being a position within a gretl list.
1335 */
1336
outer_series_index(joinspec * jspec,int i)1337 static int outer_series_index (joinspec *jspec, int i)
1338 {
1339 if (i >= 1) {
1340 return jspec->colnums[JOIN_TARG + i - 1];
1341 } else {
1342 return -1;
1343 }
1344 }
1345
aggregate_data(joiner * jr,const int * ikeyvars,const int * targvars,joinspec * jspec,int orig_v,int * modified)1346 static int aggregate_data (joiner *jr, const int *ikeyvars,
1347 const int *targvars, joinspec *jspec,
1348 int orig_v, int *modified)
1349 {
1350 jr_matcher matcher = {0};
1351 series_table *rst = NULL;
1352 series_table *lst = NULL;
1353 DATASET *dset = jr->l_dset;
1354 double *xmatch = NULL;
1355 double *auxmatch = NULL;
1356 int revseq = 0;
1357 int i, t, nmax;
1358 int err = 0;
1359
1360 /* find the greatest (primary) key frequency */
1361 nmax = 0;
1362 for (i=0; i<jr->n_unique; i++) {
1363 if (jr->key_freq[i] > nmax) {
1364 nmax = jr->key_freq[i];
1365 }
1366 }
1367
1368 #if AGGDEBUG
1369 fprintf(stderr, "\naggregate data: max primary matches = %d\n", nmax);
1370 #endif
1371
1372 if (jr->aggr == AGGR_MIDAS) {
1373 /* reverse sequence number for MIDAS join */
1374 revseq = targvars[0];
1375 jr->midas_m = revseq;
1376 #if AGGDEBUG
1377 fprintf(stderr, "midas m = %d\n", jr->midas_m);
1378 #endif
1379 } else if (nmax > 0) {
1380 /* allocate workspace for aggregation */
1381 int nx = (jr->auxcol > 0)? 2 * nmax : nmax;
1382
1383 xmatch = malloc(nx * sizeof *xmatch);
1384 if (xmatch == NULL) {
1385 return E_ALLOC;
1386 }
1387 if (jr->auxcol) {
1388 auxmatch = xmatch + nmax;
1389 }
1390 }
1391
1392 err = get_all_inner_key_values(jr, ikeyvars, &matcher);
1393
1394 for (i=1; i<=targvars[0] && !err; i++) {
1395 /* loop across the series to be added/modified */
1396 int s, rv, lv = targvars[i];
1397 int strcheck = 0;
1398
1399 #if AGGDEBUG
1400 fprintf(stderr, "\nworking on series %d\n", i);
1401 #endif
1402
1403 if (jr->aggr == AGGR_MIDAS) {
1404 rv = jspec->colnums[JOIN_TARG];
1405 jr->midas_pd = jspec->midas_pd;
1406 } else {
1407 rv = outer_series_index(jspec, i);
1408 jr->midas_pd = 0;
1409 }
1410
1411 if (rv > 0) {
1412 /* check for the case where both the target variable on the
1413 left and the series to be imported are string-valued
1414 */
1415 rst = series_get_string_table(jr->r_dset, rv);
1416 lst = series_get_string_table(jr->l_dset, lv);
1417 strcheck = (rst != NULL && lst != NULL);
1418 }
1419
1420 /* run through the rows in the current sample range of the
1421 left-hand dataset, pick up the value of the inner key(s), and
1422 call aggr_value() to determine the value that should be
1423 imported from the right
1424 */
1425
1426 for (t=dset->t1, s=0; t<=dset->t2 && !err; t++, s++) {
1427 int nomatch = 0;
1428 double zt;
1429
1430 #if AGGDEBUG
1431 fprintf(stderr, " working on obs %d\n", t);
1432 #endif
1433 if (matcher.pos[s] == KEYMISS) {
1434 dset->Z[lv][t] = NADBL;
1435 continue;
1436 } else if (matcher.pos[s] < 0) {
1437 nomatch = 1;
1438 zt = (jr->aggr == AGGR_COUNT)? 0 : NADBL;
1439 } else {
1440 zt = aggr_value(jr, &matcher, s, rv, revseq, xmatch,
1441 auxmatch, &err);
1442 }
1443 #if AGGDEBUG
1444 if (na(zt)) {
1445 fprintf(stderr, " aggr_value: got NA (keys=%g,%g, err=%d)\n",
1446 matcher.k1[s], matcher_get_k2(&matcher, s), err);
1447 } else {
1448 fprintf(stderr, " aggr_value: got %.12g (keys=%g,%g, err=%d)\n",
1449 zt, matcher.k1[s], matcher_get_k2(&matcher, s), err);
1450 }
1451 #endif
1452 if (!err && strcheck && !na(zt)) {
1453 zt = maybe_adjust_string_code(rst, lst, zt, &err);
1454 }
1455 if (!err) {
1456 if (lv >= orig_v) {
1457 /* @lv is a newly added series */
1458 dset->Z[lv][t] = zt;
1459 } else if (zt != dset->Z[lv][t]) {
1460 if (nomatch && !na(dset->Z[lv][t])) {
1461 ; /* leave existing data alone (?) */
1462 } else {
1463 dset->Z[lv][t] = zt;
1464 *modified += 1;
1465 }
1466 }
1467 }
1468 }
1469
1470 if (!err && jr->aggr == AGGR_MIDAS) {
1471 handle_midas_setup(jr, i, lv, rv, revseq);
1472 }
1473
1474 revseq--;
1475 }
1476
1477 free(xmatch);
1478 jr_matcher_free(&matcher);
1479
1480 return err;
1481 }
1482
1483 /* Simple transcription: we come here only if there are no keys, and
1484 we've verified that the number of rows on the right is no greater
1485 than the number of rows in the current sample range on the left.
1486 */
1487
join_transcribe_data(joiner * jr,int lv,int newvar,joinspec * jspec,int * modified)1488 static int join_transcribe_data (joiner *jr, int lv, int newvar,
1489 joinspec *jspec, int *modified)
1490 {
1491 series_table *rst = NULL;
1492 series_table *lst = NULL;
1493 DATASET *dset = jr->l_dset;
1494 double zi;
1495 int strcheck = 0;
1496 int i, t, rv;
1497 int err = 0;
1498
1499 rv = outer_series_index(jspec, 1);
1500 if (rv < 0) {
1501 return E_DATA;
1502 }
1503
1504 rst = series_get_string_table(jr->r_dset, rv);
1505 lst = series_get_string_table(jr->l_dset, lv);
1506 strcheck = (rst != NULL && lst != NULL);
1507
1508 for (i=0; i<jr->n_rows && !err; i++) {
1509 jr_row *r = &jr->rows[i];
1510
1511 zi = jr->r_dset->Z[rv][r->dset_row];
1512 if (strcheck && !na(zi)) {
1513 zi = maybe_adjust_string_code(rst, lst, zi, &err);
1514 }
1515 if (!err) {
1516 t = dset->t1 + i;
1517 if (newvar) {
1518 dset->Z[lv][t] = zi;
1519 } else if (zi != dset->Z[lv][t]) {
1520 dset->Z[lv][t] = zi;
1521 *modified += 1;
1522 }
1523 }
1524 }
1525
1526 return err;
1527 }
1528
1529 #include "tsjoin.c"
1530
join_transcribe_multi_data(DATASET * l_dset,DATASET * r_dset,int * targlist,int orig_v,joinspec * jspec,ts_joiner * tjr,int * modified)1531 static int join_transcribe_multi_data (DATASET *l_dset,
1532 DATASET *r_dset,
1533 int *targlist,
1534 int orig_v,
1535 joinspec *jspec,
1536 ts_joiner *tjr,
1537 int *modified)
1538 {
1539 series_table *rst = NULL;
1540 series_table *lst = NULL;
1541 int i, s, t, lv, rv;
1542 int t1, t2, m = 0;
1543 int strcheck, newvar;
1544 double xit;
1545 int err = 0;
1546
1547 if (tjr == NULL) {
1548 t1 = l_dset->t1;
1549 t2 = l_dset->t2;
1550 } else {
1551 t1 = tjr->t1;
1552 t2 = tjr->t2;
1553 }
1554
1555 for (i=1; i<=targlist[0] && !err; i++) {
1556 lv = targlist[i];
1557 rv = outer_series_index(jspec, i);
1558 if (rv < 0) {
1559 gretl_errmsg_sprintf("join: '%s' not matched", l_dset->varname[lv]);
1560 err = E_DATA;
1561 } else {
1562 newvar = lv >= orig_v;
1563 if (newvar) {
1564 strcheck = 0;
1565 } else {
1566 rst = series_get_string_table(r_dset, rv);
1567 lst = series_get_string_table(l_dset, lv);
1568 strcheck = (rst != NULL && lst != NULL);
1569 }
1570 if (tjr != NULL) {
1571 m = tjr->rminor;
1572 s = tjr->rt1;
1573 } else {
1574 s = 0;
1575 }
1576 for (t=t1; t<=t2 && !err; t++) {
1577 xit = r_dset->Z[rv][s];
1578 if (strcheck && !na(xit)) {
1579 xit = maybe_adjust_string_code(rst, lst, xit, &err);
1580 }
1581 if (newvar) {
1582 l_dset->Z[lv][t] = xit;
1583 } else if (xit != l_dset->Z[lv][t]) {
1584 l_dset->Z[lv][t] = xit;
1585 *modified += 1;
1586 }
1587 if (tjr != NULL) {
1588 m = tj_continue(tjr, m, &s);
1589 } else {
1590 s++;
1591 }
1592 }
1593 }
1594 }
1595
1596 return err;
1597 }
1598
join_simple_range_check(DATASET * l_dset,DATASET * r_dset,int * targlist)1599 static int join_simple_range_check (DATASET *l_dset,
1600 DATASET *r_dset,
1601 int *targlist)
1602 {
1603 int err = 0;
1604
1605 if (r_dset->n != sample_size(l_dset)) {
1606 gretl_errmsg_set("join: the observation ranges don't match");
1607 err = E_DATA;
1608 } else if (r_dset->v - 1 < targlist[0]) {
1609 gretl_errmsg_set("join: series missing on the right");
1610 err = E_DATA;
1611 }
1612
1613 return err;
1614 }
1615
join_filter_new(int * err)1616 static jr_filter *join_filter_new (int *err)
1617 {
1618 jr_filter *filter = malloc(sizeof *filter);
1619
1620 if (filter == NULL) {
1621 *err = E_ALLOC;
1622 } else {
1623 filter->expr = NULL;
1624 filter->val = NULL;
1625 filter->vname1 = NULL;
1626 filter->vname2 = NULL;
1627 filter->vname3 = NULL;
1628 }
1629
1630 return filter;
1631 }
1632
1633 #if JDEBUG
1634
print_filter_vnames(jr_filter * f)1635 static void print_filter_vnames (jr_filter *f)
1636 {
1637 if (f == NULL) return;
1638
1639 if (f->vname1 != NULL) {
1640 fprintf(stderr, "filter varname 1 (target %d, JOIN_F1): %s\n",
1641 JOIN_F1, f->vname1);
1642 }
1643 if (f->vname2 != NULL) {
1644 fprintf(stderr, "filter varname 2 (target %d, JOIN_F2): %s\n",
1645 JOIN_F2, f->vname2);
1646 }
1647 if (f->vname3 != NULL) {
1648 fprintf(stderr, "filter varname 3 (target %d, JOIN_F3): %s\n",
1649 JOIN_F3, f->vname3);
1650 }
1651 }
1652
1653 #endif
1654
1655 /* Allocate the filter struct and crawl along the filter expression,
1656 @s, looking for up to three names of right-hand columns. We need to
1657 record these names so we can be sure to read the associated column
1658 data, or else the filter won't work. We could increase the maximum
1659 number of column-names to store but probably 3 is enough.
1660
1661 The heuristic for column-name detection is that we find a portion
1662 of @s which is legal as a gretl identifier but which is not
1663 enclosed in quotes, is not directly followed by '(', and is not
1664 the name of a scalar or string variable on the left.
1665 */
1666
make_join_filter(const char * s,int * err)1667 static jr_filter *make_join_filter (const char *s, int *err)
1668 {
1669 jr_filter *filter = join_filter_new(err);
1670
1671 if (filter != NULL) {
1672 char test[VNAMELEN];
1673 int n, ngot = 0;
1674
1675 filter->expr = s;
1676
1677 while (*s && ngot < 3) {
1678 if (*s == '"') {
1679 /* skip double-quoted stuff */
1680 s = strchr(s + 1, '"');
1681 if (s == NULL) {
1682 *err = E_PARSE;
1683 break;
1684 } else {
1685 s++;
1686 if (*s == '\0') {
1687 break;
1688 }
1689 }
1690 }
1691 n = gretl_namechar_spn(s);
1692 if (n > 0) {
1693 if (n < VNAMELEN && s[n] != '(') {
1694 *test = '\0';
1695 strncat(test, s, n);
1696 if (!gretl_is_scalar(test) && !gretl_is_string(test)) {
1697 if (ngot == 0) {
1698 filter->vname1 = gretl_strdup(test);
1699 } else if (ngot == 1) {
1700 filter->vname2 = gretl_strdup(test);
1701 } else if (ngot == 2) {
1702 filter->vname3 = gretl_strdup(test);
1703 }
1704 ngot++;
1705 }
1706 }
1707 s += n;
1708 } else {
1709 s++;
1710 }
1711 }
1712 }
1713
1714 #if JDEBUG
1715 print_filter_vnames(filter);
1716 #endif
1717
1718 return filter;
1719 }
1720
1721 /* Add series to hold the join data. We come here only if the
1722 target series is/are not already present in the left-hand
1723 dataset.
1724 */
1725
add_target_series(const char ** vnames,DATASET * dset,int * targvars,int n_add)1726 static int add_target_series (const char **vnames,
1727 DATASET *dset,
1728 int *targvars,
1729 int n_add)
1730 {
1731 int i, v = dset->v;
1732 int err;
1733
1734 err = dataset_add_NA_series(dset, n_add);
1735
1736 if (!err) {
1737 for (i=1; i<=targvars[0]; i++) {
1738 if (targvars[i] < 0) {
1739 strcpy(dset->varname[v], vnames[i-1]);
1740 targvars[i] = v++;
1741 }
1742 }
1743 }
1744
1745 return err;
1746 }
1747
1748 /* Parse either one or two elements (time-key column name, time format
1749 string) out of @s. If both elements are present they must be
1750 comma-separated; if only the second element is present it must be
1751 preceded by a comma.
1752 */
1753
process_time_key(const char * s,char * tkeyname,char * tkeyfmt)1754 static int process_time_key (const char *s, char *tkeyname,
1755 char *tkeyfmt)
1756 {
1757 int err = 0;
1758
1759 if (*s == ',') {
1760 /* only time-key format supplied */
1761 strncat(tkeyfmt, s + 1, 31);
1762 } else {
1763 const char *p = strchr(s, ',');
1764
1765 if (p == NULL) {
1766 /* only column name given */
1767 strncat(tkeyname, s, VNAMELEN - 1);
1768 } else {
1769 int n = p - s;
1770
1771 n = (n > VNAMELEN - 1)? VNAMELEN - 1 : n;
1772 strncat(tkeyname, s, n);
1773 strncat(tkeyfmt, p + 1, 31);
1774 }
1775 }
1776
1777 #if TDEBUG
1778 fprintf(stderr, "time key: name='%s', format='%s'\n",
1779 tkeyname, tkeyfmt);
1780 #endif
1781
1782 if (*tkeyname == '\0' && *tkeyfmt == '\0') {
1783 err = E_DATA;
1784 }
1785
1786 return err;
1787 }
1788
1789 /* Parse either one column-name or two comma-separated names out of
1790 @s. If @s contains a comma, we accept a zero-length name on either
1791 the left or the right -- but not both -- as indicating that we
1792 should use the corresponding inner key name.
1793 */
1794
process_outer_key(const char * s,int n_keys,char * name1,char * name2,gretlopt opt)1795 static int process_outer_key (const char *s, int n_keys,
1796 char *name1, char *name2,
1797 gretlopt opt)
1798 {
1799 int n_okeys = 0;
1800 int err = 0;
1801
1802 if (strchr(s, ',') == NULL) {
1803 /* just one outer key */
1804 strncat(name1, s, VNAMELEN - 1);
1805 n_okeys = 1;
1806 } else {
1807 /* two comma-separated keys */
1808 int n2 = 0, n1 = strcspn(s, ",");
1809
1810 if (n1 >= VNAMELEN) {
1811 err = E_PARSE;
1812 } else {
1813 strncat(name1, s, n1);
1814 s += n1 + 1;
1815 n2 = strlen(s);
1816 if (n2 >= VNAMELEN) {
1817 err = E_PARSE;
1818 } else {
1819 strcat(name2, s);
1820 }
1821 }
1822
1823 if (!err && n1 == 0 && n2 == 0) {
1824 /* both fields empty: wrong */
1825 err = E_PARSE;
1826 }
1827
1828 if (!err) {
1829 n_okeys = 2;
1830 }
1831 }
1832
1833 if (!err && n_okeys != n_keys) {
1834 err = E_PARSE;
1835 }
1836
1837 return err;
1838 }
1839
check_for_quarterly_format(obskey * auto_keys,int pd)1840 static int check_for_quarterly_format (obskey *auto_keys, int pd)
1841 {
1842 char *s = auto_keys->timefmt;
1843 int i, err = 0;
1844
1845 #if JDEBUG
1846 fprintf(stderr, "check_for_quarterly_format: '%s'\n", s);
1847 #endif
1848
1849 for (i=0; s[i]; i++) {
1850 if (s[i] == '%' &&
1851 (s[i+1] == 'q' || s[i+1] == 'Q') &&
1852 (i == 0 || s[i-1] != '%')) {
1853 if (pd == 4 || pd == 1) {
1854 s[i+1] = 'm';
1855 auto_keys->m_means_q = 1;
1856 } else {
1857 err = E_DATA;
1858 gretl_errmsg_sprintf("The '%c' format is not applicable "
1859 "for data with frequency %d",
1860 s[i+1], pd);
1861 }
1862 break;
1863 }
1864 }
1865
1866 return err;
1867 }
1868
1869 /* for use in determining optimal auto keys */
1870 #define daily(d) (d->pd >= 5 && d->pd <= 7)
1871
1872 /* time-series data on the left, and no explicit keys supplied */
1873
auto_keys_check(const DATASET * l_dset,const DATASET * r_dset,gretlopt opt,const char * tkeyfmt,obskey * auto_keys,int * n_keys,int * do_tsjoin)1874 static int auto_keys_check (const DATASET *l_dset,
1875 const DATASET *r_dset,
1876 gretlopt opt,
1877 const char *tkeyfmt,
1878 obskey *auto_keys,
1879 int *n_keys,
1880 int *do_tsjoin)
1881 {
1882 int lpd = l_dset->pd;
1883 int rpd = 0;
1884 int err = 0;
1885
1886 if (!dataset_is_time_series(l_dset)) {
1887 /* On the left we need a time-series dataset */
1888 err = E_DATA;
1889 goto bailout;
1890 }
1891
1892 if (dataset_is_time_series(r_dset)) {
1893 rpd = r_dset->pd;
1894 if (use_tsjoin(l_dset, r_dset)) {
1895 *do_tsjoin = 1;
1896 return 0;
1897 }
1898 auto_keys->native = 1;
1899 } else if (r_dset->S == NULL && auto_keys->keycol < 0) {
1900 /* On the right, we need either obs strings or a specified
1901 time column
1902 */
1903 err = E_DATA;
1904 goto bailout;
1905 }
1906
1907 if (*tkeyfmt != '\0') {
1908 /* the user supplied a time-format spec */
1909 err = set_time_format(auto_keys, tkeyfmt);
1910 if (!err) {
1911 err = check_for_quarterly_format(auto_keys, lpd);
1912 }
1913 if (!err) {
1914 if (annual_data(l_dset)) {
1915 *n_keys = 1;
1916 } else if (calendar_data(l_dset)) {
1917 *n_keys = 1;
1918 } else {
1919 *n_keys = 2;
1920 }
1921 }
1922 } else {
1923 /* default formats */
1924 if (calendar_data(l_dset)) {
1925 err = set_time_format(auto_keys, "%Y-%m-%d");
1926 if (daily(l_dset) && rpd == 12) {
1927 *n_keys = 2; /* use year and month */
1928 } else if (!err) {
1929 *n_keys = 1; /* use epoch day */
1930 }
1931 } else if (lpd == 12) {
1932 err = set_time_format(auto_keys, "%Y-%m");
1933 if (!err) {
1934 *n_keys = 2;
1935 }
1936 } else if (annual_data(l_dset)) {
1937 err = set_time_format(auto_keys, "%Y");
1938 if (!err) {
1939 *n_keys = 1;
1940 }
1941 } else if (lpd == 4) {
1942 /* try "excess precision" ISO daily? */
1943 err = set_time_format(auto_keys, "%Y-%m-%d");
1944 if (!err) {
1945 *n_keys = 2;
1946 }
1947 } else {
1948 err = E_PDWRONG;
1949 }
1950 }
1951
1952 bailout:
1953
1954 /* we should flag an error here only if the user
1955 explicitly requested use of this apparatus,
1956 by giving --tkey=<whatever> (OPT_K)
1957 */
1958 if (err && !(opt & OPT_K)) {
1959 err = 0;
1960 }
1961
1962 return err;
1963 }
1964
make_time_formats_array(char const ** fmts,char *** pS)1965 static int make_time_formats_array (char const **fmts, char ***pS)
1966 {
1967 char **S = strings_array_new(2);
1968 int i, err = 0;
1969
1970 if (S == NULL) {
1971 err = E_ALLOC;
1972 } else {
1973 for (i=0; i<2 && !err; i++) {
1974 if (fmts[i] != NULL) {
1975 S[i] = gretl_strdup(fmts[i]);
1976 if (S[i] == NULL) {
1977 err = E_ALLOC;
1978 }
1979 }
1980 }
1981 }
1982
1983 if (err) {
1984 strings_array_free(S, 2);
1985 } else {
1986 *pS = S;
1987 }
1988
1989 return err;
1990 }
1991
1992 /* Crawl along the string containing comma-separated names of columns
1993 that fall under the "tconvert" option. For each name, look for a
1994 match among the names of columns selected from the CSV file for
1995 some role in the join operation (data, key or whatever). It is
1996 not considered an error if there's no match for a given "tconvert"
1997 name; in that case the column will be ignored.
1998
1999 If @tconvfmt is non-NULL, this should hold a common format for
2000 date conversion.
2001
2002 If @keyfmt is not an empty string, it holds a specific format
2003 for the --tkey column. In that case we should check for tkey
2004 among the tconvert columns; if it's present we need to add its
2005 format to the timeconv_map apparatus (as an override to the
2006 common format, or an addendum if no common format is given).
2007 */
2008
process_tconvert_info(joinspec * jspec,const char * tconvcols,const char * tconvfmt,const char * keyfmt)2009 static int process_tconvert_info (joinspec *jspec,
2010 const char *tconvcols,
2011 const char *tconvfmt,
2012 const char *keyfmt)
2013 {
2014 int *list = NULL;
2015 char **names = NULL;
2016 char **fmts = NULL;
2017 const char *colname;
2018 const char *tkeyfmt = NULL;
2019 char *tkeyname = NULL;
2020 int nnames = 0;
2021 int i, j, err = 0;
2022
2023 names = gretl_string_split(tconvcols, &nnames, " ,");
2024 if (names == NULL) {
2025 err = E_ALLOC;
2026 }
2027
2028 /* match the names we got against the set of "wanted" columns */
2029
2030 for (i=0; i<nnames && !err; i++) {
2031 for (j=0; j<jspec->ncols; j++) {
2032 colname = jspec->colnames[j];
2033 if (colname != NULL && !strcmp(names[i], colname)) {
2034 gretl_list_append_term(&list, j);
2035 if (list == NULL) {
2036 err = E_ALLOC;
2037 } else if (*keyfmt != '\0' && j == JOIN_KEY) {
2038 /* we've got the time-key variable here,
2039 and a format has been given for it
2040 */
2041 tkeyfmt = keyfmt;
2042 tkeyname = names[i];
2043 }
2044 break;
2045 }
2046 }
2047 }
2048
2049 /* allocate and record the time-format info, if any */
2050 if (!err && list != NULL && (tconvfmt != NULL || tkeyfmt != NULL)) {
2051 char const *tmp[2] = {tconvfmt, tkeyfmt};
2052
2053 err = make_time_formats_array(tmp, &fmts);
2054 }
2055
2056 #if JDEBUG
2057 printlist(list, "timeconv list");
2058 #endif
2059
2060 if (!err && list != NULL) {
2061 timeconv_map_set(nnames, names, tkeyname, fmts);
2062 }
2063
2064 if (err) {
2065 /* clean up if need be */
2066 strings_array_free(names, nnames);
2067 free(list);
2068 } else {
2069 jspec->timecols = list;
2070 }
2071
2072 return err;
2073 }
2074
obskey_init(obskey * keys)2075 static void obskey_init (obskey *keys)
2076 {
2077 keys->timefmt = NULL;
2078 keys->keycol = -1;
2079 keys->m_means_q = 0;
2080 keys->numdates = 0;
2081 keys->native = 0;
2082 }
2083
2084 #define lr_mismatch(l,r) ((l > 0 && r == 0) || (r > 0 && l == 0))
2085
2086 /* Run a check pertaining to the nature of the "payload"
2087 (string-valued vs numeric) in relation to the aggregation
2088 method specified and the nature of the existing left-hand
2089 series, if any.
2090 */
2091
join_data_type_check(joinspec * jspec,const DATASET * l_dset,int * targvars,AggrType aggr)2092 static int join_data_type_check (joinspec *jspec,
2093 const DATASET *l_dset,
2094 int *targvars,
2095 AggrType aggr)
2096 {
2097 DATASET *r_dset = outer_dataset(jspec);
2098 int lstr, rstr;
2099 int i, vl, vr;
2100 int err = 0;
2101
2102 for (i=1; i<=targvars[0]; i++) {
2103 lstr = rstr = -1;
2104 vl = targvars[i];
2105 vr = outer_series_index(jspec, i);
2106 if (vl > 0) {
2107 /* there's an existing LHS series */
2108 lstr = is_string_valued(l_dset, vl);
2109 if (lstr && aggr == AGGR_COUNT) {
2110 /* count values can't be mixed with strings */
2111 err = E_TYPES;
2112 }
2113 }
2114 if (!err && vr > 0) {
2115 /* there's a payload variable on the right */
2116 rstr = is_string_valued(r_dset, vr);
2117 }
2118 if (!err && lr_mismatch(lstr, rstr)) {
2119 /* one of (L, R) is string-valued, but not the other */
2120 err = E_TYPES;
2121 }
2122 }
2123
2124 return err;
2125 }
2126
aggregation_type_check(joinspec * jspec,AggrType aggr)2127 static int aggregation_type_check (joinspec *jspec, AggrType aggr)
2128 {
2129 int err = 0;
2130
2131 if (aggr == AGGR_NONE || aggr == AGGR_COUNT || aggr == AGGR_SEQ) {
2132 ; /* no problem */
2133 } else {
2134 /* the aggregation method requires numerical data: flag
2135 an error if we got strings instead
2136 */
2137 const DATASET *dset = outer_dataset(jspec);
2138 int aggcol = 0;
2139
2140 if (jspec->colnums[JOIN_AUX] > 0) {
2141 aggcol = jspec->colnums[JOIN_AUX];
2142 } else if (jspec->colnums[JOIN_TARG] > 0) {
2143 aggcol = jspec->colnums[JOIN_TARG];
2144 }
2145
2146 if (aggcol > 0 && is_string_valued(dset, aggcol)) {
2147 gretl_errmsg_sprintf("'%s' is a string variable: aggregation type "
2148 "is not applicable", dset->varname[aggcol]);
2149 err = E_TYPES;
2150 }
2151 }
2152
2153 return err;
2154 }
2155
check_for_missing_columns(joinspec * jspec)2156 static int check_for_missing_columns (joinspec *jspec)
2157 {
2158 const char *name;
2159 int i;
2160
2161 /* Note: it's possible, though we hope unlikely, that our
2162 heuristic for extracting column names from the filter
2163 expression (if present) gave a false positive. In that case the
2164 name at position JOIN_F* might be spuriously "missing": not
2165 found, but not really wanted. To guard against this eventuality
2166 we'll skip the check for the JOIN_F* columns here. If either
2167 one is really missing, that will show up before long, when the
2168 filter is evaluated.
2169 */
2170
2171 for (i=0; i<jspec->ncols; i++) {
2172 if (i == JOIN_F1 || i == JOIN_F2 || i == JOIN_F3) {
2173 continue;
2174 }
2175 name = jspec->colnames[i];
2176 if (name != NULL && jspec->colnums[i] == 0) {
2177 gretl_errmsg_sprintf(_("%s: column '%s' was not found"), "join", name);
2178 return E_DATA;
2179 }
2180 #if JDEBUG
2181 if (name != NULL) {
2182 fprintf(stderr, "colname '%s' -> colnum %d\n", name, jspec->colnums[i]);
2183 }
2184 #endif
2185 }
2186
2187 return 0;
2188 }
2189
2190 /* Handle the situation where for one or other of the keys
2191 there's a type mismatch: string on left but not on right, or
2192 vice versa.
2193 */
2194
key_types_error(int lstr,int rstr)2195 static int key_types_error (int lstr, int rstr)
2196 {
2197 if (lstr) {
2198 gretl_errmsg_sprintf(_("%s: string key on left but numeric on right"), "join");
2199 } else {
2200 gretl_errmsg_sprintf(_("%s: string key on right but numeric on left"), "join");
2201 }
2202
2203 return E_TYPES;
2204 }
2205
set_up_outer_keys(joinspec * jspec,const DATASET * l_dset,gretlopt opt,const int * ikeyvars,int * okeyvars,obskey * auto_keys,int * str_keys)2206 static int set_up_outer_keys (joinspec *jspec, const DATASET *l_dset,
2207 gretlopt opt, const int *ikeyvars,
2208 int *okeyvars, obskey *auto_keys,
2209 int *str_keys)
2210 {
2211 const DATASET *r_dset = outer_dataset(jspec);
2212 int lstr, rstr;
2213 int err = 0;
2214
2215 if (jspec->colnums[JOIN_KEY] > 0) {
2216 okeyvars[0] += 1;
2217 okeyvars[1] = jspec->colnums[JOIN_KEY];
2218 }
2219
2220 if (jspec->colnums[JOIN_KEY2] > 0) {
2221 okeyvars[0] += 1;
2222 okeyvars[2] = jspec->colnums[JOIN_KEY2];
2223 }
2224
2225 if (opt & OPT_K) {
2226 /* time key on right */
2227 rstr = is_string_valued(r_dset, okeyvars[1]);
2228 auto_keys->keycol = okeyvars[1];
2229 if (!rstr) {
2230 /* flag the need to convert to string later */
2231 auto_keys->numdates = 1;
2232 }
2233 } else {
2234 /* regular key(s) on right */
2235 lstr = is_string_valued(l_dset, ikeyvars[1]);
2236 rstr = is_string_valued(r_dset, okeyvars[1]);
2237
2238 if (lstr != rstr) {
2239 err = key_types_error(lstr, rstr);
2240 } else if (lstr) {
2241 str_keys[0] = 1;
2242 }
2243
2244 if (!err && okeyvars[2] > 0) {
2245 lstr = is_string_valued(l_dset, ikeyvars[2]);
2246 rstr = is_string_valued(r_dset, okeyvars[2]);
2247 if (lstr != rstr) {
2248 err = key_types_error(lstr, rstr);
2249 } else if (lstr) {
2250 str_keys[1] = 1;
2251 }
2252 }
2253 }
2254
2255 return err;
2256 }
2257
first_available_index(joinspec * jspec)2258 static int first_available_index (joinspec *jspec)
2259 {
2260 int i;
2261
2262 for (i=JOIN_TARG; i<jspec->ncols; i++) {
2263 if (jspec->colnums[i] == 0) {
2264 return i;
2265 }
2266 }
2267
2268 return -1;
2269 }
2270
determine_gdt_matches(const char * fname,joinspec * jspec,int ** plist,int * addvars,int * omm,PRN * prn)2271 static int determine_gdt_matches (const char *fname,
2272 joinspec *jspec,
2273 int **plist,
2274 int *addvars,
2275 int *omm,
2276 PRN *prn)
2277 {
2278 char **vnames = NULL;
2279 int nv = 0;
2280 int err = 0;
2281
2282 err = gretl_read_gdt_varnames(fname, &vnames, &nv);
2283
2284 if (!err) {
2285 GPatternSpec *pspec;
2286 int *vlist = NULL;
2287 char **S = NULL;
2288 int i, j, ns = 0;
2289
2290 /* form array of unique wanted identifiers */
2291 for (i=0; i<jspec->ncols && !err; i++) {
2292 if (jspec->colnames[i] != NULL) {
2293 err = strings_array_add_uniq(&S, &ns, jspec->colnames[i], NULL);
2294 }
2295 }
2296
2297 for (i=0; i<ns && !err; i++) {
2298 int match = 0;
2299
2300 if (!strcmp(S[i], "$obsmajor")) {
2301 omm[0] = 1;
2302 continue;
2303 } else if (!strcmp(S[i], "$obsminor")) {
2304 omm[1] = 1;
2305 continue;
2306 }
2307
2308 if (prn != NULL) {
2309 pprintf(prn, "checking for '%s'\n", S[i]);
2310 }
2311
2312 if (is_wildstr(S[i])) {
2313 pspec = g_pattern_spec_new(S[i]);
2314 for (j=1; j<nv; j++) {
2315 if (g_pattern_match_string(pspec, vnames[j])) {
2316 match++;
2317 if (!in_gretl_list(vlist, j)) {
2318 gretl_list_append_term(&vlist, j);
2319 }
2320 }
2321 }
2322 g_pattern_spec_free(pspec);
2323 } else {
2324 for (j=1; j<nv; j++) {
2325 if (!strcmp(S[i], vnames[j])) {
2326 match = 1;
2327 if (!in_gretl_list(vlist, j)) {
2328 gretl_list_append_term(&vlist, j);
2329 }
2330 break;
2331 }
2332 }
2333 if (!match) {
2334 err = E_DATA;
2335 gretl_errmsg_sprintf("'%s': not found", S[i]);
2336 }
2337 }
2338 if (prn != NULL) {
2339 pprintf(prn, " found %d match(es)\n", match);
2340 }
2341 }
2342
2343 if (!err && (vlist == NULL || vlist[0] == 0)) {
2344 gretl_errmsg_set("No matching data were found");
2345 err = E_DATA;
2346 }
2347
2348 if (!err) {
2349 *addvars = vlist[0] - ns;
2350 *plist = vlist;
2351 } else {
2352 free(vlist);
2353 }
2354
2355 strings_array_free(S, ns);
2356 strings_array_free(vnames, nv);
2357 }
2358
2359 return err;
2360 }
2361
rhs_add_obsmajmin(int * omm,DATASET * dset)2362 static int rhs_add_obsmajmin (int *omm, DATASET *dset)
2363 {
2364 int err = dataset_add_series(dset, omm[0] + omm[1]);
2365
2366 if (!err) {
2367 int t, maj, min, vmaj = 0, vmin = 0;
2368 int *pmaj = NULL, *pmin = NULL;
2369
2370 if (omm[0]) {
2371 pmaj = &maj; vmaj = dset->v - 1 - omm[1];
2372 strcpy(dset->varname[vmaj], "$obsmajor");
2373 }
2374 if (omm[1]) {
2375 pmin = &min; vmin = dset->v - 1;
2376 strcpy(dset->varname[vmin], "$obsminor");
2377 }
2378 for (t=0; t<dset->n; t++) {
2379 date_maj_min(t, dset, pmaj, pmin);
2380 if (vmaj) {
2381 dset->Z[vmaj][t] = maj;
2382 }
2383 if (vmin) {
2384 dset->Z[vmin][t] = min;
2385 }
2386 }
2387 }
2388
2389 return err;
2390 }
2391
join_import_gdt(const char * fname,joinspec * jspec,gretlopt opt,PRN * prn)2392 static int join_import_gdt (const char *fname,
2393 joinspec *jspec,
2394 gretlopt opt,
2395 PRN *prn)
2396 {
2397 const char *cname;
2398 int *vlist = NULL;
2399 int orig_ncols = jspec->ncols;
2400 int i, vi, addvars = 0;
2401 int omm[2] = {0};
2402 int err = 0;
2403
2404 err = determine_gdt_matches(fname, jspec, &vlist, &addvars,
2405 omm, prn);
2406
2407 if (!err) {
2408 jspec->dset = datainfo_new();
2409 if (jspec->dset == NULL) {
2410 err = E_ALLOC;
2411 }
2412 }
2413
2414 if (!err) {
2415 err = gretl_read_gdt_subset(fname, jspec->dset, vlist, opt);
2416 }
2417
2418 if (!err && addvars > 0) {
2419 /* we have some extra vars due to wildcard expansion */
2420 err = expand_jspec(jspec, addvars);
2421 }
2422
2423 if (!err && (omm[0] || omm[1])) {
2424 /* we need to add $obsmajor and/or $obsminor on the right */
2425 err = rhs_add_obsmajmin(omm, jspec->dset);
2426 }
2427
2428 if (!err) {
2429 /* match up the imported series with their roles */
2430 for (i=0; i<orig_ncols && !err; i++) {
2431 cname = jspec->colnames[i];
2432 if (cname != NULL && !is_wildstr(cname)) {
2433 vi = current_series_index(jspec->dset, cname);
2434 if (vi < 0) {
2435 err = E_DATA;
2436 } else {
2437 jspec->colnums[i] = vi;
2438 }
2439 }
2440 }
2441 }
2442
2443 if (!err && addvars > 0) {
2444 /* register any extra imported series */
2445 int j, pos, idx;
2446
2447 for (i=1; i<jspec->dset->v; i++) {
2448 pos = 0;
2449 for (j=0; j<jspec->ncols; j++) {
2450 if (jspec->colnums[j] == i) {
2451 /* already registered */
2452 pos = i;
2453 break;
2454 }
2455 }
2456 if (pos == 0) {
2457 idx = first_available_index(jspec);
2458 if (idx < 0) {
2459 err = E_DATA;
2460 } else {
2461 jspec->colnums[idx] = i;
2462 jspec->colnames[idx] = jspec->dset->varname[i];
2463 }
2464 }
2465 }
2466 }
2467
2468 free(vlist);
2469
2470 return err;
2471 }
2472
2473 /* add/replace an entry in @jspec's colnames array,
2474 and record the fact that @src is now owned by
2475 @jspec so we can free it when we're finished
2476 and hence avoid leaking memory
2477 */
2478
jspec_push_tmpname(joinspec * jspec,int pos,char * src)2479 static int jspec_push_tmpname (joinspec *jspec,
2480 int pos,
2481 char *src)
2482 {
2483 jspec->colnames[pos] = src;
2484 return strings_array_donate(&jspec->tmpnames, &jspec->n_tmp, src);
2485 }
2486
determine_csv_matches(const char * fname,joinspec * jspec,PRN * prn)2487 static int determine_csv_matches (const char *fname,
2488 joinspec *jspec,
2489 PRN *prn)
2490 {
2491 gretlopt opt = OPT_NONE; /* FIXME? */
2492 char **vnames = NULL;
2493 int nv = 0;
2494 int err = 0;
2495
2496 err = probe_csv(fname, &vnames, &nv, &opt);
2497
2498 if (!err) {
2499 GPatternSpec *pspec;
2500 int nmatch = 0;
2501 int i, err = 0;
2502
2503 pspec = g_pattern_spec_new(jspec->colnames[JOIN_TARG]);
2504
2505 /* first determine the number of matches to @pspec */
2506 for (i=0; i<nv; i++) {
2507 if (g_pattern_match_string(pspec, vnames[i])) {
2508 nmatch++;
2509 }
2510 }
2511
2512 if (nmatch > 1) {
2513 /* we have some extra vars due to wildcard expansion */
2514 err = expand_jspec(jspec, nmatch - 1);
2515 }
2516
2517 if (!err && nmatch > 0) {
2518 int j = JOIN_TARG;
2519
2520 for (i=0; i<nv; i++) {
2521 if (g_pattern_match_string(pspec, vnames[i])) {
2522 jspec_push_tmpname(jspec, j++, vnames[i]);
2523 vnames[i] = NULL;
2524 }
2525 }
2526 }
2527
2528 g_pattern_spec_free(pspec);
2529 strings_array_free(vnames, nv);
2530 }
2531
2532 return err;
2533 }
2534
join_import_csv(const char * fname,joinspec * jspec,gretlopt opt,PRN * prn)2535 static int join_import_csv (const char *fname,
2536 joinspec *jspec,
2537 gretlopt opt,
2538 PRN *prn)
2539 {
2540 int err = 0;
2541
2542 if (jspec->wildcard) {
2543 err = determine_csv_matches(fname, jspec, prn);
2544 if (err) {
2545 pputs(prn, "join_import_csv: failed in matching varnames\n");
2546 }
2547 }
2548
2549 if (!err) {
2550 err = real_import_csv(fname, NULL, NULL, NULL, jspec,
2551 NULL, NULL, opt, prn);
2552 if (0 && !err) {
2553 /* question, 2021-01-09: this is zeroed out: why? */
2554 DATASET *dset = csvdata_get_dataset(jspec->c);
2555 int pd, reversed = 0;
2556
2557 fprintf(stderr, "join_import_csv: n=%d, v=%d, pd=%d, markers=%d\n",
2558 dset->n, dset->v, dset->pd, dset->markers);
2559 pd = test_markers_for_dates(dset, &reversed, NULL, prn);
2560 fprintf(stderr, "pd from markers: %d\n", pd);
2561 }
2562 }
2563
2564 return err;
2565 }
2566
join_range_check(joiner * jr,DATASET * dset,AggrType aggr)2567 static int join_range_check (joiner *jr, DATASET *dset, AggrType aggr)
2568 {
2569 int T = sample_size(dset);
2570 int err = 0;
2571
2572 if (jr->n_rows < T) {
2573 /* can we handle jr->n_rows < T? */
2574 if (aggr != AGGR_NONE) {
2575 err = E_DATA; /* No */
2576 } else if (dset->v == 1) {
2577 ; /* only const, OK */
2578 } else if (dset->v == 2 && !strcmp(dset->varname[1], "index")) {
2579 ; /* "nulldata" default, OK */
2580 } else {
2581 err = E_DATA; /* Not OK */
2582 }
2583 } else if (jr->n_rows != T) {
2584 gretl_errmsg_set(_("Series length does not match the dataset"));
2585 err = E_DATA;
2586 }
2587
2588 if (err) {
2589 gretl_errmsg_set(_("Series length does not match the dataset"));
2590 }
2591
2592 return err;
2593 }
2594
get_series_indices(const char ** vnames,int nvars,DATASET * dset,int * n_add,int * any_wild,int * err)2595 static int *get_series_indices (const char **vnames,
2596 int nvars,
2597 DATASET *dset,
2598 int *n_add,
2599 int *any_wild,
2600 int *err)
2601 {
2602 int *ret = gretl_list_new(nvars);
2603
2604 if (ret == NULL) {
2605 *err = E_ALLOC;
2606 } else {
2607 int i, v;
2608
2609 for (i=0; i<nvars && !*err; i++) {
2610 if (is_wildstr(vnames[i])) {
2611 *any_wild = 1;
2612 continue;
2613 }
2614 v = current_series_index(dset, vnames[i]);
2615 if (v == 0) {
2616 *err = E_DATA;
2617 } else {
2618 ret[i+1] = v;
2619 if (v < 0) {
2620 *n_add += 1;
2621 }
2622 }
2623 }
2624 }
2625
2626 if (!*err && nvars > 1 && *any_wild) {
2627 /* wildcard spec must be singleton varname spec */
2628 gretl_errmsg_set(_("Invalid join specification"));
2629 *err = E_DATA;
2630 }
2631
2632 if (*err) {
2633 free(ret);
2634 ret = NULL;
2635 }
2636
2637 return ret;
2638 }
2639
2640 /* figure the number of series to import */
2641
jspec_n_vars(joinspec * jspec)2642 static int jspec_n_vars (joinspec *jspec)
2643 {
2644 return jspec->ncols - JOIN_TARG;
2645 }
2646
2647 /* we come here if we have determined that the
2648 import series specification includes a wildcard
2649 ('*' or '?')
2650 */
2651
revise_series_indices(joinspec * jspec,DATASET * dset,int * n_add,int * err)2652 static int *revise_series_indices (joinspec *jspec,
2653 DATASET *dset,
2654 int *n_add,
2655 int *err)
2656 {
2657 int nvars = jspec_n_vars(jspec);
2658 int *ret = gretl_list_new(nvars);
2659
2660 ret = gretl_list_new(nvars);
2661
2662 if (ret == NULL) {
2663 *err = E_ALLOC;
2664 } else {
2665 jspec->wildnames = strings_array_new(nvars);
2666 if (jspec->wildnames == NULL) {
2667 *err = E_ALLOC;
2668 }
2669 }
2670
2671 if (!*err) {
2672 const char *cname;
2673 int i, v;
2674
2675 /* zero the count of added vars */
2676 *n_add = 0;
2677
2678 for (i=0; i<nvars && !*err; i++) {
2679 cname = jspec->colnames[JOIN_TARG + i];
2680 v = current_series_index(dset, cname);
2681 if (v == 0) {
2682 *err = E_DATA;
2683 } else {
2684 ret[i+1] = v;
2685 if (v < 0) {
2686 /* not a current series */
2687 if (gretl_type_from_name(cname, NULL)) {
2688 *err = E_TYPES;
2689 } else {
2690 *n_add += 1;
2691 }
2692 }
2693 if (!*err) {
2694 jspec->wildnames[i] = gretl_strdup(cname);
2695 }
2696 }
2697 }
2698 }
2699
2700 if (*err) {
2701 free(ret);
2702 ret = NULL;
2703 }
2704
2705 return ret;
2706 }
2707
set_up_jspec(joinspec * jspec,const char ** vnames,int nvars,gretlopt opt,int any_wild,AggrType aggr,int midas_pd)2708 static int set_up_jspec (joinspec *jspec,
2709 const char **vnames,
2710 int nvars,
2711 gretlopt opt,
2712 int any_wild,
2713 AggrType aggr,
2714 int midas_pd)
2715 {
2716 int i, j, ncols = JOIN_TARG + nvars;
2717
2718 jspec->colnames = malloc(ncols * sizeof *jspec->colnames);
2719 jspec->colnums = malloc(ncols * sizeof *jspec->colnums);
2720
2721 if (jspec->colnames == NULL || jspec->colnums == NULL) {
2722 return E_ALLOC;
2723 }
2724
2725 jspec->ncols = ncols;
2726 jspec->wildcard = any_wild;
2727 jspec->wildnames = NULL;
2728 jspec->tmpnames = NULL;
2729 jspec->n_tmp = 0;
2730 jspec->mdsbase = NULL;
2731 jspec->mdsnames = NULL;
2732 jspec->auto_midas = 0;
2733 jspec->midas_pd = 0;
2734
2735 if (aggr == AGGR_MIDAS) {
2736 jspec->mdsbase = vnames[0];
2737 jspec->auto_midas = 1; /* provisional! */
2738 jspec->midas_pd = midas_pd;
2739 for (i=0; i<ncols; i++) {
2740 jspec->colnames[i] = NULL;
2741 jspec->colnums[i] = 0;
2742 }
2743 } else {
2744 j = 1;
2745 for (i=0; i<ncols; i++) {
2746 if (i > JOIN_TARG) {
2747 jspec->colnames[i] = vnames[j++];
2748 } else {
2749 jspec->colnames[i] = NULL;
2750 }
2751 jspec->colnums[i] = 0;
2752 }
2753 }
2754
2755 return 0;
2756 }
2757
expand_jspec(joinspec * jspec,int addvars)2758 static int expand_jspec (joinspec *jspec, int addvars)
2759 {
2760 int i, ncols = jspec->ncols + addvars;
2761 char **colnames;
2762 int *colnums;
2763
2764 colnames = realloc(jspec->colnames, ncols * sizeof *colnames);
2765 colnums = realloc(jspec->colnums, ncols * sizeof *colnums);
2766
2767 if (colnames == NULL || colnums == NULL) {
2768 return E_ALLOC;
2769 }
2770
2771 jspec->colnames = (const char **) colnames;
2772 jspec->colnums = colnums;
2773
2774 for (i=jspec->ncols; i<ncols; i++) {
2775 jspec->colnames[i] = NULL;
2776 jspec->colnums[i] = 0;
2777 }
2778
2779 jspec->ncols = ncols;
2780
2781 return 0;
2782 }
2783
clear_jspec(joinspec * jspec,joiner * jr)2784 static void clear_jspec (joinspec *jspec, joiner *jr)
2785 {
2786 free(jspec->colnames);
2787 free(jspec->colnums);
2788
2789 if (jspec->timecols != NULL) {
2790 free(jspec->timecols);
2791 }
2792
2793 if (jspec->c != NULL) {
2794 csvdata_free(jspec->c);
2795 } else if (jspec->dset != NULL) {
2796 destroy_dataset(jspec->dset);
2797 }
2798
2799 if (jspec->wildnames != NULL) {
2800 strings_array_free(jspec->wildnames, jspec_n_vars(jspec));
2801 }
2802
2803 if (jr != NULL && jspec->mdsnames != NULL) {
2804 strings_array_free(jspec->mdsnames, jr->midas_m);
2805 }
2806
2807 if (jspec->tmpnames != NULL) {
2808 strings_array_free(jspec->tmpnames, jspec->n_tmp);
2809 }
2810 }
2811
midas_revise_jspec(joinspec * jspec,DATASET * dset,int * n_add,int * err)2812 static int *midas_revise_jspec (joinspec *jspec,
2813 DATASET *dset,
2814 int *n_add,
2815 int *err)
2816 {
2817 DATASET *rdset = outer_dataset(jspec);
2818 int m, rpd = 0, nvars = 0;
2819 int *ret = NULL;
2820
2821 if (jspec->midas_pd == 0) {
2822 /* outer pd not specified on input: so take
2823 it from the discovered dataset
2824 */
2825 rpd = jspec->midas_pd = rdset->pd;
2826 }
2827
2828 m = midas_m_from_pd(dset, jspec->midas_pd);
2829
2830 if (m == 0) {
2831 gretl_errmsg_sprintf("frequency %d in import data: \"spread\" will "
2832 "not work", rpd);
2833 *err = E_PDWRONG;
2834 return NULL;
2835 } else {
2836 nvars = m;
2837 }
2838
2839 ret = gretl_list_new(nvars);
2840 jspec->mdsnames = strings_array_new(nvars);
2841
2842 if (ret == NULL || jspec->mdsnames == NULL) {
2843 *err = E_ALLOC;
2844 }
2845
2846 if (!*err) {
2847 char *cname, tmp[VNAMELEN];
2848 int i, v, extlen;
2849 char mc;
2850
2851 /* zero the count of added vars */
2852 *n_add = 0;
2853
2854 /* create base for naming vars */
2855 mc = rpd == 12 ? 'm' : rpd == 4 ? 'q' : 'd';
2856 extlen = m < 10 ? 3 : 4;
2857 *tmp = '\0';
2858 strncat(tmp, jspec->mdsbase, VNAMELEN - 1);
2859 gretl_trunc(tmp, VNAMELEN - extlen - 1);
2860
2861 for (i=0; i<nvars && !*err; i++) {
2862 cname = gretl_strdup_printf("%s_%c%d", tmp, mc, nvars - i);
2863 if (cname == NULL) {
2864 *err = E_ALLOC;
2865 break;
2866 }
2867 v = current_series_index(dset, cname);
2868 ret[i+1] = v;
2869 if (v < 0) {
2870 /* not a current series */
2871 if (gretl_type_from_name(cname, NULL)) {
2872 *err = E_TYPES;
2873 } else {
2874 *n_add += 1;
2875 }
2876 }
2877 if (*err) {
2878 free(cname);
2879 } else {
2880 jspec->mdsnames[i] = cname;
2881 }
2882 }
2883 }
2884
2885 return ret;
2886 }
2887
maybe_transfer_string_table(DATASET * l_dset,DATASET * r_dset,joinspec * jspec,int * targvars,int orig_v)2888 static void maybe_transfer_string_table (DATASET *l_dset,
2889 DATASET *r_dset,
2890 joinspec *jspec,
2891 int *targvars,
2892 int orig_v)
2893 {
2894 int i, lv, rv;
2895
2896 for (i=1; i<=targvars[0]; i++) {
2897 lv = targvars[i];
2898 if (lv >= orig_v) {
2899 /* it's a new series */
2900 rv = outer_series_index(jspec, i);
2901 if (rv > 0 && is_string_valued(r_dset, rv)) {
2902 /* let the new series grab the RHS string table */
2903 steal_string_table(l_dset, lv, r_dset, rv);
2904 }
2905 }
2906 }
2907 }
2908
initial_midas_check(int nvars,int any_wild,int pd,DATASET * dset)2909 static int initial_midas_check (int nvars, int any_wild, int pd,
2910 DATASET *dset)
2911 {
2912 int err;
2913
2914 if (pd != 0 && pd != 12 && pd != 4 && pd != 5 && pd != 6 && pd != 7) {
2915 /* unacceptable outer data frequency */
2916 err = E_PDWRONG;
2917 } else if (nvars == 1 && (annual_data(dset) || quarterly_or_monthly(dset))) {
2918 /* might be OK, if no wildcard */
2919 err = any_wild ? E_DATA : 0;
2920 } else {
2921 err = E_DATA;
2922 }
2923
2924 if (err) {
2925 gretl_errmsg_set(_("Invalid join specification"));
2926 }
2927
2928 return err;
2929 }
2930
has_native_suffix(const char * fname)2931 static int has_native_suffix (const char *fname)
2932 {
2933 return has_suffix(fname, ".gdt") || has_suffix(fname, ".gdtb");
2934 }
2935
2936 /**
2937 * gretl_join_data:
2938 * @fname: name of data file.
2939 * @vnames: name(s) of variables to create or modify.
2940 * @nvars: the number of elements in @vnames.
2941 * @dset: pointer to dataset.
2942 * @ikeyvars: list of 1 or 2 "inner" key variables, or NULL.
2943 * @okey: string specifying "outer" key(s) or NULL.
2944 * @filtstr: string specifying filter, or NULL.
2945 * @srcname: name of variable to import at source, or NULL.
2946 * @aggr: aggregation method specifier.
2947 * @seqval: 1-based sequence number for aggregation, or 0.
2948 * @auxname: name of auxiliary column for max or min aggregation,
2949 * or NULL.
2950 * @tconvstr: string specifying date columns for conversion, or NULL.
2951 * @tconvfmt: string giving format(s) for "timeconv" columns, or NULL.
2952 * @midas_pd: hint regarding pd for --aggr=spread (?).
2953 * @opt: may contain OPT_V for verbose operation, OPT_H to assume
2954 * no header row.
2955 * @prn: gretl printing struct (or NULL).
2956 *
2957 * Opens a delimited text data file or gdt file and carries out a
2958 * "join" operation to pull data into the current working dataset.
2959 *
2960 * Returns: 0 on successful completion, non-zero otherwise.
2961 */
2962
gretl_join_data(const char * fname,const char ** vnames,int nvars,DATASET * dset,const int * ikeyvars,const char * okey,const char * filtstr,const char * srcname,AggrType aggr,int seqval,const char * auxname,const char * tconvstr,const char * tconvfmt,int midas_pd,gretlopt opt,PRN * prn)2963 int gretl_join_data (const char *fname,
2964 const char **vnames,
2965 int nvars,
2966 DATASET *dset,
2967 const int *ikeyvars,
2968 const char *okey,
2969 const char *filtstr,
2970 const char *srcname,
2971 AggrType aggr,
2972 int seqval,
2973 const char *auxname,
2974 const char *tconvstr,
2975 const char *tconvfmt,
2976 int midas_pd,
2977 gretlopt opt,
2978 PRN *prn)
2979 {
2980 DATASET *outer_dset = NULL;
2981 joinspec jspec = {0};
2982 joiner *jr = NULL;
2983 jr_filter *filter = NULL;
2984 const char *varname;
2985 int okeyvars[3] = {0, 0, 0};
2986 char okeyname1[VNAMELEN] = {0};
2987 char okeyname2[VNAMELEN] = {0};
2988 char tkeyfmt[32] = {0};
2989 obskey auto_keys;
2990 int *targvars = NULL;
2991 int do_tsjoin = 0;
2992 int orig_v = dset->v;
2993 int add_v = 0;
2994 int modified = 0;
2995 int any_wild = 0;
2996 int verbose = (opt & OPT_V);
2997 int str_keys[2] = {0};
2998 int n_keys = 0;
2999 int err = 0;
3000
3001 /** Step 0: preliminaries **/
3002
3003 if (vnames == NULL || nvars < 1) {
3004 return E_DATA;
3005 }
3006
3007 varname = vnames[0];
3008
3009 targvars = get_series_indices(vnames, nvars, dset, &add_v,
3010 &any_wild, &err);
3011
3012 if (!err && srcname != NULL) {
3013 /* If we have a spec for the original name of a series
3014 to import (@srcname), we cannot accept more than one
3015 target name (in @vnames), nor can we accept any
3016 wildcard specification -- unless we're doing a
3017 "MIDAS join", in which case a single wildcard spec
3018 is OK for the target series.
3019 */
3020 if ((nvars > 1 || any_wild) && aggr != AGGR_MIDAS) {
3021 gretl_errmsg_set(_("Invalid join specification"));
3022 err = E_DATA;
3023 }
3024 }
3025
3026 if (!err && aggr == AGGR_MIDAS) {
3027 err = initial_midas_check(nvars, any_wild, midas_pd, dset);
3028 }
3029
3030 if (err) {
3031 return err;
3032 }
3033
3034 err = set_up_jspec(&jspec, vnames, nvars, opt, any_wild,
3035 aggr, midas_pd);
3036 if (err) {
3037 return err;
3038 }
3039
3040 if (ikeyvars != NULL) {
3041 n_keys = ikeyvars[0];
3042 }
3043
3044 obskey_init(&auto_keys);
3045 timeconv_map_init();
3046
3047 #if JDEBUG
3048 fputs("*** gretl_join_data:\n", stderr);
3049 fprintf(stderr, " filename = '%s'\n", fname);
3050 if (nvars > 1) {
3051 int i;
3052
3053 fputs(" target series names:\n", stderr);
3054 for (i=0; i<nvars; i++) {
3055 fprintf(stderr, " '%s'\n", vnames[i]);
3056 }
3057 } else {
3058 fprintf(stderr, " target series name = '%s'\n", varname);
3059 }
3060 if (n_keys > 0) {
3061 fprintf(stderr, " inner key series ID = %d\n", ikeyvars[1]);
3062 if (n_keys == 2) {
3063 fprintf(stderr, " second inner key series ID = %d\n", ikeyvars[2]);
3064 }
3065 }
3066 if (okey != NULL) {
3067 fprintf(stderr, " outer key = '%s'\n", okey);
3068 } else if (n_keys > 0) {
3069 fprintf(stderr, " outer key = '%s' (from inner key)\n",
3070 dset->varname[ikeyvars[1]]);
3071 if (n_keys == 2) {
3072 fprintf(stderr, " second outer key = '%s' (from inner)\n",
3073 dset->varname[ikeyvars[2]]);
3074 }
3075 }
3076 if (filtstr != NULL) {
3077 fprintf(stderr, " filter = '%s'\n", filtstr);
3078 }
3079 if (srcname != NULL) {
3080 fprintf(stderr, " source data series = '%s'\n", srcname);
3081 } else if (aggr != AGGR_COUNT) {
3082 fprintf(stderr, " source data series: assuming '%s' (from inner varname)\n",
3083 varname);
3084 }
3085 fprintf(stderr, " aggregation method = %d\n", aggr);
3086 if (auxname != NULL) {
3087 fprintf(stderr, " aggr auxiliary column = '%s'\n", auxname);
3088 }
3089 if (tconvstr != NULL) {
3090 fprintf(stderr, " timeconv = '%s'\n", tconvstr);
3091 }
3092 if (tconvfmt != NULL) {
3093 fprintf(stderr, " tconvfmt = '%s'\n", tconvfmt);
3094 }
3095 #endif
3096
3097 /* Step 1: process the arguments we got with regard to filtering
3098 and keys: extract the names of the columns that are required
3099 from the outer datafile, checking for errors as we go.
3100 */
3101
3102 if (filtstr != NULL) {
3103 filter = make_join_filter(filtstr, &err);
3104 }
3105
3106 if (!err && okey != NULL) {
3107 if (opt & OPT_K) {
3108 /* cancel automatic MIDAS flag if present */
3109 jspec.auto_midas = 0;
3110 err = process_time_key(okey, okeyname1, tkeyfmt);
3111 } else {
3112 err = process_outer_key(okey, n_keys, okeyname1, okeyname2, opt);
3113 }
3114 }
3115
3116 /* Step 2: set up the array of required column names,
3117 jspec.colnames. This is an array of const *char, pointers
3118 to strings that "live" elsewhere. We leave any unneeded
3119 elements as NULL.
3120 */
3121
3122 if (!err) {
3123 /* handle the primary outer key column, if any */
3124 if (*okeyname1 != '\0') {
3125 jspec.colnames[JOIN_KEY] = okeyname1;
3126 } else if (n_keys > 0) {
3127 jspec.colnames[JOIN_KEY] = dset->varname[ikeyvars[1]];
3128 }
3129
3130 /* and the secondary outer key, if any */
3131 if (*okeyname2 != '\0') {
3132 jspec.colnames[JOIN_KEY2] = okeyname2;
3133 } else if (n_keys > 1) {
3134 jspec.colnames[JOIN_KEY2] = dset->varname[ikeyvars[2]];
3135 }
3136
3137 /* the data or "payload" column */
3138 if (aggr != AGGR_COUNT) {
3139 if (srcname != NULL) {
3140 jspec.colnames[JOIN_TARG] = srcname;
3141 } else {
3142 jspec.colnames[JOIN_TARG] = varname;
3143 }
3144 }
3145
3146 /* the filter columns, if applicable */
3147 if (filter != NULL) {
3148 jspec.colnames[JOIN_F1] = filter->vname1;
3149 jspec.colnames[JOIN_F2] = filter->vname2;
3150 jspec.colnames[JOIN_F3] = filter->vname3;
3151 }
3152
3153 /* the auxiliary var for aggregation, if present */
3154 if (auxname != NULL) {
3155 jspec.colnames[JOIN_AUX] = auxname;
3156 }
3157 }
3158
3159 /* Step 3: handle the tconvert and tconv-fmt options */
3160
3161 if (!err && tconvstr != NULL) {
3162 err = process_tconvert_info(&jspec, tconvstr, tconvfmt, tkeyfmt);
3163 }
3164
3165 /* Step 4: read data from the outer file; check we got all the
3166 required columns; check that nothing is screwed up type-wise
3167 */
3168
3169 if (!err) {
3170 PRN *vprn = verbose ? prn : NULL;
3171
3172 if (has_native_suffix(fname)) {
3173 gretlopt gdt_opt = OPT_NONE;
3174
3175 if (dataset_is_time_series(dset)) {
3176 /* import obs markers: may be needed */
3177 gdt_opt = OPT_M;
3178 }
3179 err = join_import_gdt(fname, &jspec, gdt_opt, vprn);
3180 } else {
3181 err = join_import_csv(fname, &jspec, opt, vprn);
3182 }
3183 if (!err) {
3184 outer_dset = outer_dataset(&jspec);
3185 }
3186 #if JDEBUG > 2
3187 if (!err) {
3188 print_outer_dataset(outer_dset, fname);
3189 }
3190 #endif
3191 if (!err) {
3192 err = check_for_missing_columns(&jspec);
3193 }
3194 if (!err) {
3195 err = aggregation_type_check(&jspec, aggr);
3196 }
3197 if (!err) {
3198 err = join_data_type_check(&jspec, dset, targvars, aggr);
3199 }
3200 }
3201
3202 if (!err && verbose) {
3203 int i;
3204
3205 pprintf(prn, _("Outer dataset: read %d columns and %d rows\n"),
3206 outer_dset->v - 1, outer_dset->n);
3207 for (i=1; i<outer_dset->v; i++) {
3208 pprintf(prn, " col %d: '%s'\n", i, outer_dset->varname[i]);
3209 }
3210 }
3211
3212 /* Step 5: set up keys and check for conformability errors */
3213
3214 if (!err && jspec.colnames[JOIN_KEY] != NULL) {
3215 err = set_up_outer_keys(&jspec, dset, opt, ikeyvars, okeyvars,
3216 &auto_keys, str_keys);
3217 }
3218
3219 if (!err && n_keys == 0 && dataset_is_time_series(dset)) {
3220 err = auto_keys_check(dset, outer_dset, opt, tkeyfmt,
3221 &auto_keys, &n_keys, &do_tsjoin);
3222 if (do_tsjoin) {
3223 goto transcribe;
3224 }
3225 }
3226
3227 if (!err && n_keys == 0 && filter == NULL && aggr == 0 &&
3228 auto_keys.timefmt == NULL) {
3229 /* the simple case: no need to build joiner struct */
3230 err = join_simple_range_check(dset, outer_dset, targvars);
3231 goto transcribe;
3232 }
3233
3234 /* Step 6: build the joiner struct from the outer dataset,
3235 applying a filter if one is specified
3236 */
3237
3238 if (!err) {
3239 jr = build_joiner(&jspec, dset, filter, aggr, seqval,
3240 &auto_keys, n_keys, &err);
3241 if (!err && jr == NULL) {
3242 /* no matching data to join */
3243 goto bailout;
3244 }
3245 }
3246
3247 if (!err && filter != NULL && verbose) {
3248 pprintf(prn, "Filter: %d rows were selected\n", jr->n_rows);
3249 }
3250
3251 /* Step 7: transcribe more info and sort the "joiner" struct */
3252
3253 if (!err) {
3254 jr->n_keys = n_keys;
3255 jr->str_keys = str_keys;
3256 jr->l_keyno = ikeyvars;
3257 jr->r_keyno = okeyvars;
3258 if (jr->n_keys > 0) {
3259 err = joiner_sort(jr);
3260 }
3261 #if JDEBUG > 1
3262 if (!err) joiner_print(jr);
3263 #endif
3264 }
3265
3266 /* Step 8: another check now the joiner struct is ready */
3267
3268 if (!err && jr != NULL && jr->n_keys == 0) {
3269 err = join_range_check(jr, dset, aggr);
3270 }
3271
3272 transcribe:
3273
3274 /* step 9: revise information on the series to be imported
3275 if we came across a wildcard specification or a case
3276 of MIDAS importation
3277 */
3278
3279 if (!err && (jspec.wildcard || aggr == AGGR_MIDAS)) {
3280 free(targvars);
3281 if (aggr == AGGR_MIDAS) {
3282 targvars = midas_revise_jspec(&jspec, dset, &add_v, &err);
3283 } else {
3284 targvars = revise_series_indices(&jspec, dset, &add_v, &err);
3285 }
3286 }
3287
3288 /* Step 10: transcribe or aggregate the data */
3289
3290 if (!err && add_v > 0) {
3291 /* we need to add one or more new series on the left */
3292 if (jspec.wildnames != NULL) {
3293 err = add_target_series((const char **) jspec.wildnames,
3294 dset, targvars, add_v);
3295 } else if (jspec.mdsnames != NULL) {
3296 err = add_target_series((const char **) jspec.mdsnames,
3297 dset, targvars, add_v);
3298 } else {
3299 err = add_target_series(vnames, dset, targvars, add_v);
3300 }
3301 }
3302
3303 if (!err) {
3304 if (jr == NULL && do_tsjoin) {
3305 ts_joiner tjr = {0};
3306
3307 fill_ts_joiner(dset, outer_dset, &tjr);
3308 err = join_transcribe_multi_data(dset, outer_dset, targvars,
3309 orig_v, &jspec, &tjr,
3310 &modified);
3311 } else if (jr == NULL) {
3312 err = join_transcribe_multi_data(dset, outer_dset, targvars,
3313 orig_v, &jspec, NULL,
3314 &modified);
3315 } else if (jr->n_keys == 0) {
3316 err = join_transcribe_data(jr, targvars[1], add_v,
3317 &jspec, &modified);
3318 } else {
3319 err = aggregate_data(jr, ikeyvars, targvars, &jspec,
3320 orig_v, &modified);
3321 /* complete the job for MIDAS daily import */
3322 if (!err && aggr == AGGR_MIDAS && midas_daily(jr)) {
3323 postprocess_daily_data(dset, targvars);
3324 }
3325 }
3326 }
3327
3328 #if JDEBUG
3329 fprintf(stderr, "join: add_v = %d, modified = %d\n",
3330 add_v, modified);
3331 #endif
3332
3333 if (!err && add_v > 0 && jspec.colnums[JOIN_TARG] > 0) {
3334 /* we added one or more new series */
3335 if (aggr != AGGR_MIDAS) {
3336 maybe_transfer_string_table(dset, outer_dset, &jspec,
3337 targvars, orig_v);
3338 }
3339 }
3340
3341 if (err) {
3342 dataset_drop_last_variables(dset, dset->v - orig_v);
3343 } else {
3344 if (add_v || modified) {
3345 set_dataset_is_changed(dset, 1);
3346 }
3347 if (gretl_messages_on()) {
3348 if (add_v) {
3349 pputs(prn, _("Data appended OK\n"));
3350 } else if (modified) {
3351 pputs(prn, _("Data modified OK\n"));
3352 } else {
3353 pputs(prn, _("No changes were made to the dataset\n"));
3354 }
3355 }
3356 }
3357
3358 bailout:
3359
3360 /* null out file-scope "timeconv" globals */
3361 timeconv_map_destroy();
3362
3363 if (auto_keys.timefmt != NULL) {
3364 free(auto_keys.timefmt);
3365 }
3366
3367 clear_jspec(&jspec, jr);
3368 joiner_destroy(jr);
3369 jr_filter_destroy(filter);
3370 free(targvars);
3371
3372 return err;
3373 }
3374
3375 /* called in csvdata.c */
3376
timecol_get_format(const DATASET * dset,int v,char ** pfmt,int * q)3377 int timecol_get_format (const DATASET *dset, int v,
3378 char **pfmt, int *q)
3379 {
3380 if (tconv_map.fmt == NULL) {
3381 /* no formats present */
3382 return 0;
3383 } else if (tconv_map.tname == NULL) {
3384 /* get the common "tconvert" format */
3385 *pfmt = tconv_map.fmt[TCONV_FMT];
3386 *q = tconv_map.m_means_q[TCONV_FMT];
3387 return 1;
3388 } else if (!strcmp(dset->varname[v], tconv_map.tname)) {
3389 /* get the tkey-specific format */
3390 *pfmt = tconv_map.fmt[TKEY_FMT];
3391 *q = tconv_map.m_means_q[TKEY_FMT];
3392 return 1;
3393 } else if (tconv_map.fmt[TCONV_FMT] != NULL) {
3394 /* get the other one */
3395 *pfmt = tconv_map.fmt[TCONV_FMT];
3396 *q = tconv_map.m_means_q[TCONV_FMT];
3397 return 1;
3398 }
3399
3400 return 0;
3401 }
3402