1 /*
2  *  gretl -- Gnu Regression, Econometrics and Time-series Library
3  *  Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 #include "libgretl.h"
21 #include "gretl_string_table.h"
22 #include "libset.h"
23 #include "gretl_xml.h"
24 #include "gretl_midas.h"
25 #include "uservar.h"
26 #include "csvdata.h"
27 #include "join_priv.h"
28 #include "gretl_join.h"
29 
30 #ifdef WIN32
31 # include "gretl_win32.h" /* for strptime() */
32 #endif
33 
34 #define AGGDEBUG 0  /* aggregation in "join" */
35 #define TDEBUG 0    /* handling of time keys in "join" */
36 #define JDEBUG 0    /* joining in general */
37 
38 enum {
39     JOIN_KEY,
40     JOIN_F1,
41     JOIN_F2,
42     JOIN_F3,
43     JOIN_KEY2,
44     JOIN_AUX,
45     JOIN_TARG
46 };
47 
48 enum {
49     TCONV_FMT = 0,
50     TKEY_FMT = 1
51 };
52 
53 typedef double keynum;
54 
55 struct jr_row_ {
56     int n_keys;     /* number of keys (needed for qsort callback) */
57     keynum keyval;  /* primary key value */
58     keynum keyval2; /* secondary key value, if applicable */
59     int micro;      /* high-frequency "key", if any */
60     int dset_row;   /* associated row in the RHS or outer dataset */
61     double aux;     /* auxiliary value */
62 };
63 
64 typedef struct jr_row_ jr_row;
65 
66 struct obskey_ {
67     char *timefmt; /* time format, as in strptime */
68     int keycol;    /* the column holding the outer time-key */
69     int m_means_q; /* "monthly means quarterly" */
70     int numdates;  /* flag for conversion from numeric to string */
71     int native;    /* native time-series info */
72 };
73 
74 typedef struct obskey_ obskey;
75 
76 struct joiner_ {
77     int n_rows;     /* number of rows in data table */
78     int n_keys;     /* number of keys used (0, 1 or 2) */
79     int n_unique;   /* number of unique primary key values on right */
80     jr_row *rows;   /* array of table rows */
81     keynum *keys;   /* array of unique (primary) key values as 64-bit ints */
82     int *key_freq;  /* counts of occurrences of (primary) key values */
83     int *key_row;   /* record of starting row in joiner table for primary keys */
84     int *str_keys;  /* flags for string comparison of key(s) */
85     const int *l_keyno; /* list of key columns in left-hand dataset */
86     const int *r_keyno; /* list of key columns in right-hand dataset */
87     AggrType aggr;      /* aggregation method for 1:n joining */
88     int seqval;         /* sequence number for aggregation */
89     int auxcol;         /* auxiliary data column for aggregation */
90     int midas_m;        /* midas frequency ratio */
91     int midas_pd;       /* frequency of outer dataset */
92     obskey *auto_keys;  /* struct to hold info on obs-based key(s) */
93     DATASET *l_dset;    /* the left-hand or inner dataset */
94     DATASET *r_dset;    /* the right-hand or outer temporary dataset */
95 };
96 
97 typedef struct joiner_ joiner;
98 
99 struct jr_filter_ {
100     const char *expr;  /* expression to be run through "genr" */
101     const double *val; /* (series) result of evaluating @expr */
102     char *vname1;      /* first right-hand variable name */
103     char *vname2;      /* second right-hand variable name */
104     char *vname3;      /* third right-hand variable name */
105 };
106 
107 typedef struct jr_filter_ jr_filter;
108 
109 struct time_mapper {
110     int ncols;         /* number of "timeconv" columns */
111     char **colnames;   /* array of outer-dataset column names */
112     char *tname;       /* the name of the "tkey", if among colnames, or NULL */
113     char **fmt;        /* array of up to two time-format strings, or NULL */
114     char m_means_q[2]; /* array of "monthly means quarterly" flags */
115 };
116 
117 struct jr_matcher_ {
118     keynum *k1;    /* first key value, per observation */
119     keynum *k2;    /* second key value per obs, or NULL */
120     int *pos;      /* position of match in outer key array */
121 };
122 
123 typedef struct jr_matcher_ jr_matcher;
124 
125 #define KEYMISS -999
126 
127 #define is_wildstr(s) (strchr(s, '*') || strchr(s, '?'))
128 
129 /* file-scope global */
130 struct time_mapper tconv_map;
131 
132 static int expand_jspec (joinspec *jspec, int addvars);
133 
jr_filter_destroy(jr_filter * f)134 static void jr_filter_destroy (jr_filter *f)
135 {
136     if (f != NULL) {
137         free(f->vname1);
138         free(f->vname2);
139         free(f->vname3);
140         free(f);
141     }
142 }
143 
joiner_destroy(joiner * jr)144 static void joiner_destroy (joiner *jr)
145 {
146     if (jr != NULL) {
147         free(jr->rows);
148         free(jr->keys);
149         free(jr->key_freq);
150         free(jr->key_row);
151         free(jr);
152     }
153 }
154 
joiner_new(int nrows)155 static joiner *joiner_new (int nrows)
156 {
157     joiner *jr = malloc(sizeof *jr);
158 
159     if (jr != NULL) {
160         jr->rows = calloc(nrows, sizeof *jr->rows);
161         if (jr->rows == NULL) {
162             free(jr);
163             jr = NULL;
164         }
165     }
166 
167     if (jr != NULL) {
168         jr->n_rows = nrows;
169         jr->n_unique = 0;
170         jr->keys = NULL;
171         jr->key_freq = NULL;
172         jr->key_row = NULL;
173         jr->l_keyno = NULL;
174         jr->r_keyno = NULL;
175     }
176 
177     return jr;
178 }
179 
real_set_outer_auto_keys(joiner * jr,const char * s,int j,struct tm * tp)180 static int real_set_outer_auto_keys (joiner *jr, const char *s,
181                                      int j, struct tm *tp)
182 {
183     DATASET *l_dset = jr->l_dset;
184     int err = 0;
185 
186     if (calendar_data(l_dset)) {
187         int y, m, d, eday;
188 
189         y = tp->tm_year + 1900;
190         m = tp->tm_mon + 1;
191         d = tp->tm_mday;
192         eday = epoch_day_from_ymd(y, m, d);
193         if (eday < 0) {
194             if (s != NULL) {
195                 gretl_errmsg_sprintf("'%s' is not a valid date", s);
196             }
197             err = E_DATA;
198         } else if (jr->n_keys == 2) {
199             /* use year and month */
200             jr->rows[j].n_keys = 2;
201             jr->rows[j].keyval = y;
202             jr->rows[j].keyval2 = m;
203             jr->rows[j].micro = 0;
204         } else {
205             /* use epoch day */
206             jr->rows[j].n_keys = 1;
207             jr->rows[j].keyval = eday;
208             jr->rows[j].keyval2 = 0;
209             jr->rows[j].micro = 0;
210         }
211     } else {
212         int lpd = l_dset->pd;
213         int major = tp->tm_year + 1900;
214         int minor = tp->tm_mon + 1;
215         int micro = 0;
216 
217         if (jr->auto_keys->m_means_q) {
218             /* using the gretl-specific "%q" conversion */
219             if (minor > 4) {
220                 gretl_errmsg_sprintf("'%s' is not a valid date", s);
221                 err = E_DATA;
222             }
223         } else if (lpd == 4) {
224             /* map from month on right to quarter on left, but
225                preserve the month info in case we need it
226             */
227             micro = minor;
228             minor = (int) ceil(minor / 3.0);
229         }
230         if (!err && micro == 0) {
231             micro = tp->tm_mday;
232         }
233         if (!err) {
234             jr->rows[j].n_keys = 2;
235             jr->rows[j].keyval = major;
236             jr->rows[j].keyval2 = minor;
237             jr->rows[j].micro = micro;
238         }
239     }
240 
241     return err;
242 }
243 
244 /* Handle the case where the user gave a "%q" or "%Q" conversion
245    specifier (which we take to mean quarter). We convert this to %m
246    for use with strptime(), but record that the fact that "month means
247    quarter".
248 */
249 
format_uses_quarterly(char * fmt)250 static int format_uses_quarterly (char *fmt)
251 {
252     char *s = fmt;
253     int i, ret = 0;
254 
255     for (i=0; s[i]; i++) {
256         if (s[i] == '%' &&
257             (s[i+1] == 'q' || s[i+1] == 'Q') &&
258             (i == 0 || s[i-1] != '%')) {
259             s[i+1] = 'm';
260             ret = 1;
261         }
262     }
263 
264     return ret;
265 }
266 
timeconv_map_set(int ncols,char ** colnames,char * tname,char ** fmt)267 static void timeconv_map_set (int ncols, char **colnames,
268                               char *tname, char **fmt)
269 {
270     tconv_map.ncols = ncols;
271     tconv_map.colnames = colnames;
272     tconv_map.tname = tname;
273     tconv_map.fmt = fmt;
274 
275     if (fmt != NULL) {
276         if (fmt[TCONV_FMT] != NULL) {
277             tconv_map.m_means_q[TCONV_FMT] =
278                 format_uses_quarterly(fmt[TCONV_FMT]);
279         }
280         if (fmt[TKEY_FMT] != NULL) {
281             tconv_map.m_means_q[TKEY_FMT] =
282                 format_uses_quarterly(fmt[TKEY_FMT]);
283         }
284     }
285 }
286 
timeconv_map_init(void)287 static void timeconv_map_init (void)
288 {
289     timeconv_map_set(0, NULL, NULL, NULL);
290 }
291 
timeconv_map_destroy(void)292 static void timeconv_map_destroy (void)
293 {
294     if (tconv_map.colnames != NULL) {
295         strings_array_free(tconv_map.colnames, tconv_map.ncols);
296     }
297     if (tconv_map.fmt != NULL) {
298         strings_array_free(tconv_map.fmt, 2);
299     }
300     timeconv_map_init();
301 }
302 
set_time_format(obskey * auto_keys,const char * fmt)303 static int set_time_format (obskey *auto_keys, const char *fmt)
304 {
305     if (auto_keys->timefmt != NULL) {
306         free(auto_keys->timefmt);
307     }
308     auto_keys->timefmt = gretl_strdup(fmt);
309     return auto_keys->timefmt == NULL ? E_ALLOC : 0;
310 }
311 
312 /* convert a numerical value to string for use with strptime */
313 
numdate_to_string(char * targ,double x)314 static int numdate_to_string (char *targ, double x)
315 {
316     if (na(x)) {
317         return E_MISSDATA;
318     } else {
319         sprintf(targ, "%.16g", x);
320         return 0;
321     }
322 }
323 
324 /* Parse a string from row @i of the outer dataset and set the
325    key(s) on row @j of the joiner struct. The indices @i and @j may
326    not be equal if a filter is being used. Note: we don't come
327    here if the outer time-key column is subject to "tconvert"
328    treatment; in that case we use read_iso_basic instead.
329 */
330 
read_outer_auto_keys(joiner * jr,int j,int i)331 static int read_outer_auto_keys (joiner *jr, int j, int i)
332 {
333     char *tfmt = jr->auto_keys->timefmt;
334     int numdates = jr->auto_keys->numdates;
335     int tcol = jr->auto_keys->keycol;
336     int pd = jr->l_dset->pd;
337     struct tm t = {0};
338     const char *s = NULL;
339     char *test = NULL;
340     char sconv[32];
341     int s_src = 0;
342     int err = 0;
343 
344     if (tcol >= 0) {
345         /* using a specified column */
346         if (numdates) {
347             /* column is numeric, conversion needed */
348             numdate_to_string(sconv, jr->r_dset->Z[tcol][i]);
349             s = sconv;
350             s_src = 1;
351         } else {
352             /* column is string-valued, OK */
353             s = series_get_string_for_obs(jr->r_dset, tcol, i);
354             s_src = 2;
355         }
356     } else if (jr->auto_keys->native) {
357         /* using native time-series info on right */
358         ntolabel_8601(sconv, i, jr->r_dset);
359         s = sconv;
360         s_src = 1;
361     } else {
362         /* using first-column observation strings */
363         s = jr->r_dset->S[i];
364         s_src = 3;
365     }
366 
367     if (s != NULL) {
368         /* note: with strptime, a NULL return means that an error
369            occurred while a non-NULL and non-empty return string
370            means a trailing portion of the input was not
371            processed.
372         */
373         test = strptime(s, tfmt, &t);
374     }
375 
376     if (test == NULL || *test != '\0') {
377         err = E_DATA;
378         if (j == 0 && test != NULL && (pd == 12 || pd == 4 || pd == 1)) {
379             /* If we're looking at the first row of the filtered data,
380                allow for the possibility that we got "excess
381                precision", i.e. a daily date string when the left-hand
382                dataset is monthly, quarterly or annual.
383             */
384             char *chk = strptime(s, "%Y-%m-%d", &t);
385 
386             if (chk != NULL && *chk == '\0') {
387                 set_time_format(jr->auto_keys, "%Y-%m-%d");
388                 err = 0; /* we might be OK, cancel the error for now */
389             }
390         }
391         if (err) {
392             gretl_errmsg_sprintf("'%s' does not match the format '%s'", s, tfmt);
393             fprintf(stderr, "time-format match error in read_outer_auto_keys:\n"
394                     " remainder = '%s' (source = %s)\n", test ? test : "null",
395                     s_src < 3 ? "specified time column" : "first-column strings");
396         }
397     }
398 
399     if (!err) {
400         err = real_set_outer_auto_keys(jr, s, j, &t);
401     }
402 
403     return err;
404 }
405 
read_iso_basic(joiner * jr,int j,int i)406 static int read_iso_basic (joiner *jr, int j, int i)
407 {
408     int tcol = jr->auto_keys->keycol;
409     double x;
410     int err = 0;
411 
412     x = jr->r_dset->Z[tcol][i];
413 
414     if (na(x)) {
415         err = E_MISSDATA;
416     } else {
417         int y = (int) floor(x / 10000);
418         int m = (int) floor((x - 10000*y) / 100);
419         int d = (int) (x - 10000*y - 100*m);
420         guint32 ed = epoch_day_from_ymd(y, m, d);
421 
422         if (ed <= 0) {
423             gretl_errmsg_sprintf("'%.8g' is not a valid date", x);
424             err = E_DATA;
425         } else if (calendar_data(jr->l_dset)) {
426             /* note: no need to go via struct tm */
427             jr->rows[j].n_keys = 1;
428             jr->rows[j].keyval = ed;
429             jr->rows[j].keyval2 = 0;
430             jr->rows[j].micro = 0;
431         } else {
432             struct tm t = {0};
433 
434             t.tm_year = y - 1900;
435             t.tm_mon = m - 1;
436             t.tm_mday = d;
437             err = real_set_outer_auto_keys(jr, NULL, j, &t);
438         }
439     }
440 
441     return err;
442 }
443 
444 /* Evaluate the filter expression provided by the user, and if it
445    works OK count the number of rows on which the filter returns
446    non-zero.  Flag an error if the filter gives NA on any row, since
447    it is then indeterminate.
448 */
449 
evaluate_filter(jr_filter * filter,DATASET * r_dset,int * nrows)450 static int evaluate_filter (jr_filter *filter, DATASET *r_dset,
451                             int *nrows)
452 {
453     char *line;
454     int i, err = 0;
455 
456     line = gretl_strdup_printf("filtered__=%s", filter->expr);
457     if (line == NULL) {
458         err = E_ALLOC;
459     } else {
460         err = generate(line, r_dset, GRETL_TYPE_SERIES,
461                        OPT_P | OPT_Q, NULL);
462     }
463 
464     if (!err) {
465         int v = r_dset->v - 1;
466 
467         filter->val = r_dset->Z[v];
468         *nrows = 0;
469 
470 #if JDEBUG > 1
471         fprintf(stderr, "filter genr: '%s':\n", line);
472         for (i=0; i<r_dset->n; i++) {
473             fprintf(stderr, " %d: %g\n", i+1, filter->val[i]);
474         }
475 #endif
476         for (i=0; i<r_dset->n; i++) {
477             if (na(filter->val[i])) {
478                 gretl_errmsg_sprintf("join filter: indeterminate "
479                                      "value on row %d", i+1);
480                 err = E_MISSDATA;
481                 break;
482             } else if (filter->val[i] != 0.0) {
483                 *nrows += 1;
484             }
485         }
486     }
487 
488     free(line);
489 
490     return err;
491 }
492 
dtoll(double x,int * err)493 static keynum dtoll (double x, int *err)
494 {
495     if (na(x)) {
496         *err = E_DATA;
497         return -1;
498     } else {
499         return x;
500     }
501 }
502 
dtoll_full(double x,int key,int row,int * err)503 static keynum dtoll_full (double x, int key, int row, int *err)
504 {
505     if (na(x)) {
506         if (key == 2) {
507             gretl_errmsg_sprintf("%s: invalid secondary outer key value on row %d",
508                                  "join", row);
509         } else {
510             gretl_errmsg_sprintf("%s: invalid (primary) outer key value on row %d",
511                                  "join", row);
512         }
513         *err = E_DATA;
514         return -1;
515     } else {
516         return x;
517     }
518 }
519 
520 /* Determine whether or not row @i of the outer data satisfies the
521    filter criterion; return 1 if the condition is met, 0 otherwise.
522 */
523 
join_row_wanted(jr_filter * filter,int i)524 static int join_row_wanted (jr_filter *filter, int i)
525 {
526     int ret = filter->val[i] != 0;
527 
528 #if JDEBUG > 2
529     fprintf(stderr, "join filter: %s row %d\n",
530             ret ? "keeping" : "discarding", i);
531 #endif
532 
533     return ret;
534 }
535 
outer_dataset(joinspec * jspec)536 static DATASET *outer_dataset (joinspec *jspec)
537 {
538     if (jspec->c != NULL) {
539         return csvdata_get_dataset(jspec->c);
540     } else {
541         return jspec->dset;
542     }
543 }
544 
column_is_timecol(const char * colname)545 static int column_is_timecol (const char *colname)
546 {
547     int i, n = tconv_map.ncols;
548 
549     for (i=0; i<n; i++) {
550         if (!strcmp(colname, tconv_map.colnames[i])) {
551             return 1;
552         }
553     }
554 
555     return 0;
556 }
557 
558 #define using_auto_keys(j) (j->auto_keys->timefmt != NULL)
559 
build_joiner(joinspec * jspec,DATASET * l_dset,jr_filter * filter,AggrType aggr,int seqval,obskey * auto_keys,int n_keys,int * err)560 static joiner *build_joiner (joinspec *jspec,
561                              DATASET *l_dset,
562                              jr_filter *filter,
563                              AggrType aggr,
564                              int seqval,
565                              obskey *auto_keys,
566                              int n_keys,
567                              int *err)
568 {
569     joiner *jr = NULL;
570     DATASET *r_dset = outer_dataset(jspec);
571     int keycol  = jspec->colnums[JOIN_KEY];
572     int valcol  = jspec->colnums[JOIN_TARG];
573     int key2col = jspec->colnums[JOIN_KEY2];
574     int auxcol  = jspec->colnums[JOIN_AUX];
575     int i, nrows = r_dset->n;
576 
577 #if JDEBUG
578     fprintf(stderr, "joiner column numbers:\n"
579             "KEY %d, VAL %d, F1 %d, F2 %d, F3 %d, KEY2 %d, AUX %d\n",
580             keycol, valcol, jspec->colnums[JOIN_F1],
581             jspec->colnums[JOIN_F2], jspec->colnums[JOIN_F3],
582             key2col, auxcol);
583 #endif
584 
585     if (filter != NULL) {
586         *err = evaluate_filter(filter, r_dset, &nrows);
587         if (*err) {
588             return NULL;
589         } else if (nrows == 0) {
590             gretl_warnmsg_set(_("No matching data after filtering"));
591             return NULL;
592         }
593     }
594 
595 #if JDEBUG
596     fprintf(stderr, "after filtering: dset->n = %d, nrows = %d\n",
597             r_dset->n, nrows);
598 #endif
599 
600     jr = joiner_new(nrows);
601 
602     if (jr == NULL) {
603         *err = E_ALLOC;
604     } else {
605         double **Z = r_dset->Z;
606         int use_iso_basic = 0;
607         int j = 0;
608 
609         jr->aggr = aggr;
610         jr->seqval = seqval;
611         jr->auxcol = auxcol;
612         jr->l_dset = l_dset;
613         jr->r_dset = r_dset;
614         jr->auto_keys = auto_keys;
615         jr->n_keys = n_keys;
616         jr->midas_m = 0;
617 
618         if (using_auto_keys(jr)) {
619             /* check for the case where the outer time-key
620                column is in the "tconvert" set: if so we
621                know it will be in YYYYMMDD format and we'll
622                give it special treatment
623             */
624             int tcol = jr->auto_keys->keycol;
625 
626             if (tcol > 0 && jr->auto_keys->numdates) {
627                 if (column_is_timecol(jr->r_dset->varname[tcol])) {
628                     use_iso_basic = 1;
629                 }
630             }
631         }
632 
633 #if JDEBUG
634         fprintf(stderr, "use_iso_basic = %d\n", use_iso_basic);
635 #endif
636 
637         /* Now transcribe the data we want: we're pulling from the
638            full outer dataset and writing into the array of joiner row
639            structs. At this point we're applying the join filter (if
640            any) but are not doing any matching by key to the inner
641            dataset.
642         */
643 
644         for (i=0; i<r_dset->n && !*err; i++) {
645             if (filter != NULL && !join_row_wanted(filter, i)) {
646                 continue;
647             }
648             /* the keys */
649             if (use_iso_basic) {
650                 *err = read_iso_basic(jr, j, i);
651             } else if (using_auto_keys(jr)) {
652                 *err = read_outer_auto_keys(jr, j, i);
653             } else if (keycol > 0) {
654                 jr->rows[j].keyval = dtoll_full(Z[keycol][i], 1, i+1, err);
655                 if (!*err && key2col > 0) {
656                     /* double key */
657                     jr->rows[j].n_keys = 2;
658                     jr->rows[j].keyval2 = dtoll_full(Z[key2col][i], 2, i+1, err);
659                 } else {
660                     /* single key */
661                     jr->rows[j].n_keys = 1;
662                     jr->rows[j].keyval2 = 0;
663                 }
664             } else {
665                 /* no keys have been specified */
666                 jr->rows[j].n_keys = 0;
667                 jr->rows[j].keyval = 0;
668                 jr->rows[j].keyval2 = 0;
669             }
670             /* "payload" data: record the dataset row */
671             jr->rows[j].dset_row = valcol > 0 ? i : -1;
672             /* the auxiliary data */
673             jr->rows[j].aux = auxcol > 0 ? Z[auxcol][i] : 0;
674             j++;
675         }
676     }
677 
678     return jr;
679 }
680 
681 /* qsort callback for sorting rows of the joiner struct */
682 
compare_jr_rows(const void * a,const void * b)683 static int compare_jr_rows (const void *a, const void *b)
684 {
685     const jr_row *ra = a;
686     const jr_row *rb = b;
687     int ret;
688 
689     ret = (ra->keyval > rb->keyval) - (ra->keyval < rb->keyval);
690 
691     if (ret == 0 && ra->n_keys > 1) {
692         ret = (ra->keyval2 > rb->keyval2) - (ra->keyval2 < rb->keyval2);
693     }
694 
695     if (ret == 0) {
696         /* ensure stable sort */
697         ret = a - b > 0 ? 1 : -1;
698     }
699 
700     return ret;
701 }
702 
703 /* Sort the rows of the joiner struct, by either one or two keys, then
704    figure out how many unique (primary) key values we have and
705    construct (a) an array of frequency of occurrence of these values
706    and (b) an array which records the first row of the joiner on
707    which each of these values is found.
708 */
709 
joiner_sort(joiner * jr)710 static int joiner_sort (joiner *jr)
711 {
712     int matches = jr->n_rows;
713     int i, err = 0;
714 
715     /* If there are string keys, we begin by mapping from the string
716        indices on the right -- held in the keyval and/or keyval2
717        members of the each joiner row -- to the indices for the same
718        strings on the left. This enables us to avoid doing string
719        comparisons when running aggr_value() later; we can just
720        compare the indices of the strings. In addition, if on any
721        given row we get no match for the right-hand key string on the
722        left (signalled by a strmap value of -1) we can exploit this
723        information by shuffling such rows to the end of the joiner
724        rectangle and ignoring them when aggregating.
725     */
726 
727     if (jr->str_keys[0] || jr->str_keys[1]) {
728         series_table *stl, *str;
729         int *strmap;
730         int k, kmin, kmax, lkeyval, rkeyval;
731 
732         kmin = jr->str_keys[0] ? 1 : 2;
733         kmax = jr->str_keys[1] ? 2 : 1;
734 
735         for (k=kmin; k<=kmax; k++) {
736             stl = series_get_string_table(jr->l_dset, jr->l_keyno[k]);
737             str = series_get_string_table(jr->r_dset, jr->r_keyno[k]);
738             strmap = series_table_map(str, stl);
739 
740             if (strmap == NULL) {
741                 err = E_ALLOC;
742                 break;
743             }
744 
745             for (i=0; i<jr->n_rows; i++) {
746                 if (k == 1) {
747                     rkeyval = jr->rows[i].keyval;
748                 } else if (jr->rows[i].keyval == INT_MAX) {
749                     continue;
750                 } else {
751                     rkeyval = jr->rows[i].keyval2;
752                 }
753                 lkeyval = strmap[rkeyval];
754 #if JDEBUG > 1
755                 fprintf(stderr, "k = %d, row %d, keyval: %d -> %d\n", k, i, rkeyval, lkeyval);
756 #endif
757                 if (lkeyval > 0) {
758                     if (k == 1) {
759                         jr->rows[i].keyval = lkeyval;
760                     } else {
761                         jr->rows[i].keyval2 = lkeyval;
762                     }
763                 } else {
764                     /* arrange for qsort to move row to end */
765                     jr->rows[i].keyval = G_MAXDOUBLE;
766                     matches--;
767                 }
768             }
769 
770             free(strmap);
771         }
772     }
773 
774     if (err) {
775         return err;
776     }
777 
778     qsort(jr->rows, jr->n_rows, sizeof *jr->rows, compare_jr_rows);
779 
780     if (matches < jr->n_rows) {
781         jr->n_rows = matches;
782     }
783 
784     jr->n_unique = 1;
785     for (i=1; i<jr->n_rows; i++) {
786         if (jr->rows[i].keyval != jr->rows[i-1].keyval) {
787             jr->n_unique += 1;
788         }
789     }
790 
791     jr->keys = malloc(jr->n_unique * sizeof *jr->keys);
792     jr->key_freq = malloc(jr->n_unique * sizeof *jr->key_freq);
793     jr->key_row = malloc(jr->n_unique * sizeof *jr->key_row);
794 
795     if (jr->keys == NULL || jr->key_freq == NULL || jr->key_row == NULL) {
796         err = E_ALLOC;
797     } else {
798         int j = 0, nj = 1;
799 
800         for (i=0; i<jr->n_unique; i++) {
801             jr->key_freq[i] = 0;
802         }
803 
804         jr->keys[0] = jr->rows[0].keyval;
805         jr->key_row[0] = 0;
806 
807         for (i=1; i<jr->n_rows; i++) {
808             if (jr->rows[i].keyval != jr->rows[i-1].keyval) {
809                 /* finalize info for key j */
810                 jr->keys[j] = jr->rows[i-1].keyval;
811                 jr->key_freq[j] = nj;
812                 /* and initialize for next key */
813                 nj = 1;
814                 if (j < jr->n_unique - 1) {
815                     jr->key_row[j+1] = i;
816                 }
817                 j++;
818             } else {
819                 nj++;
820             }
821         }
822 
823         /* make sure the last row is right */
824         jr->keys[j] = jr->rows[i-1].keyval;
825         jr->key_freq[j] = nj;
826     }
827 
828     return err;
829 }
830 
831 #if JDEBUG > 1
832 
joiner_print(joiner * jr)833 static void joiner_print (joiner *jr)
834 {
835     char **labels = NULL;
836     jr_row *row;
837     int i;
838 
839     if (jr->str_keys[0]) {
840         labels = series_get_string_vals(jr->l_dset, jr->l_keyno[1], NULL, 0);
841     }
842 
843     fprintf(stderr, "\njoiner: n_rows = %d\n", jr->n_rows);
844     for (i=0; i<jr->n_rows; i++) {
845         row = &jr->rows[i];
846         if (row->n_keys > 1) {
847             fprintf(stderr, " row %d: keyvals=(%g,%g)\n",
848                     i, row->keyval, row->keyval2);
849         } else {
850             int k = lrint(row->keyval) - 1;
851 
852             if (jr->str_keys[0] && row->keyval >= 0) {
853                 fprintf(stderr, " row %d: keyval=%g (%s)\n",
854                         i, row->keyval, labels[k]);
855             } else {
856                 fprintf(stderr, " row %d: keyval=%g\n",
857                         i, row->keyval);
858             }
859         }
860     }
861 
862     if (jr->keys != NULL) {
863         fprintf(stderr, " for primary key: n_unique = %d\n", jr->n_unique);
864         for (i=0; i<jr->n_unique; i++) {
865             fprintf(stderr,"  key value %g: count = %d\n",
866                     jr->keys[i], jr->key_freq[i]);
867         }
868     }
869 }
870 
871 # if JDEBUG > 2
print_outer_dataset(const DATASET * dset,const char * fname)872 static void print_outer_dataset (const DATASET *dset, const char *fname)
873 {
874     PRN *prn = gretl_print_new(GRETL_PRINT_STDERR, NULL);
875 
876     pprintf(prn, "Data extracted from %s:\n", fname);
877     printdata(NULL, NULL, dset, OPT_O, prn);
878     gretl_print_destroy(prn);
879 }
880 # endif
881 
882 #endif /* JDEBUG */
883 
seqval_out_of_bounds(joiner * jr,int seqmax)884 static int seqval_out_of_bounds (joiner *jr, int seqmax)
885 {
886     if (jr->seqval < 0) {
887         /* counting down from last match */
888         return -jr->seqval > seqmax;
889     } else {
890         /* counting up from first match */
891         return jr->seqval > seqmax;
892     }
893 }
894 
895 /* Do a binary search for the left-hand key value @targ in the sorted
896    array of unique right-hand key values, @vals; return the position
897    among @vals at which @targ matches, or -1 for no match.
898 */
899 
binsearch(keynum targ,const keynum * vals,int n,int offset)900 static int binsearch (keynum targ, const keynum *vals, int n, int offset)
901 {
902     int m = n/2;
903 
904     if (fabs((targ) - (vals[m])) < 1.0e-7) {
905         return m + offset;
906     } else if (targ < vals[0] || targ > vals[n-1]) {
907         return -1;
908     } else if (targ < vals[m]) {
909         return binsearch(targ, vals, m, offset);
910     } else {
911         return binsearch(targ, vals + m, n - m, offset + m);
912     }
913 }
914 
915 /* In some cases we can figure out what aggr_value() should return
916    just based on the number of matches, @n, and the characteristics
917    of the joiner. If so, write the value into @x and return 1; if
918    not, return 0.
919 */
920 
aggr_val_determined(joiner * jr,int n,double * x,int * err)921 static int aggr_val_determined (joiner *jr, int n, double *x, int *err)
922 {
923     if (jr->aggr == AGGR_COUNT) {
924         /* just return the number of matches */
925         *x = n;
926         return 1;
927     } else if (jr->aggr == AGGR_SEQ && seqval_out_of_bounds(jr, n)) {
928         /* out of bounds sequence index: return NA */
929         *x = NADBL;
930         return 1;
931     } else if (n > 1 && jr->aggr == AGGR_NONE) {
932         /* fail */
933 #if AGGDEBUG
934         fprintf(stderr, "aggr_val_determined(): got n=%d\n", n);
935 #endif
936         *err = E_DATA;
937         gretl_errmsg_set(_("You need to specify an aggregation "
938                            "method for a 1:n join"));
939         *x = NADBL;
940         return 1;
941     } else {
942         /* not enough information so far */
943         return 0;
944     }
945 }
946 
947 /* get month-day index from @dset time-series info */
948 
midas_day_index(int t,DATASET * dset)949 static int midas_day_index (int t, DATASET *dset)
950 {
951     char obs[OBSLEN];
952     int y, m, d, idx = -1;
953 
954     ntolabel(obs, t, dset);
955     if (sscanf(obs, YMD_READ_FMT, &y, &m, &d) == 3) {
956         idx = month_day_index(y, m, d, dset->pd);
957     }
958 
959     return idx;
960 }
961 
962 #define KEYMISS -999
963 
matcher_get_k2(jr_matcher * matcher,int i)964 static keynum matcher_get_k2 (jr_matcher *matcher, int i)
965 {
966     return matcher->k2 == NULL ? 0 : matcher->k2[i];
967 }
968 
matcher_set_k2(jr_matcher * matcher,int i,keynum val)969 static void matcher_set_k2 (jr_matcher *matcher, int i,
970 			    keynum val)
971 {
972     if (matcher->k2 != NULL) {
973 	matcher->k2[i] = val;
974     }
975 }
976 
977 #define midas_daily(j) (j->midas_m > 20)
978 
979 #define min_max_cond(x,y,a) ((a==AGGR_MAX && x>y) || (a==AGGR_MIN && x<y))
980 
981 /* aggr_value: here we're working on a given row of the left-hand
982    dataset. The values @key and (if applicable) @key2 are the
983    left-hand keys for this row. We count the key-matches on the
984    right and apply an aggregation procedure if the user specified
985    one. We return the value that should be entered for the imported
986    series on this row.
987 
988    Note: @xmatch and @auxmatch are workspace arrays allocated by
989    the caller.
990 */
991 
aggr_value(joiner * jr,jr_matcher * matcher,int s,int v,int revseq,double * xmatch,double * auxmatch,int * err)992 static double aggr_value (joiner *jr,
993 			  jr_matcher *matcher,
994 			  int s, int v,
995                           int revseq,
996                           double *xmatch,
997                           double *auxmatch,
998                           int *err)
999 {
1000     keynum key1 = matcher->k1[s];
1001     keynum key2 = matcher_get_k2(matcher, s);
1002     int pos = matcher->pos[s];
1003     double x, xa;
1004     int imin, imax;
1005     int i, n, ntotal;
1006 
1007 #if AGGDEBUG
1008     fprintf(stderr, " key1 = %g: matched at position %d\n", key1, pos);
1009 #endif
1010 
1011     /* how many matches at @pos? */
1012     n = jr->key_freq[pos];
1013 
1014 #if AGGDEBUG
1015     fprintf(stderr, "  number of primary matches = %d (n_keys=%d)\n",
1016             n, jr->n_keys);
1017 #endif
1018 
1019     if (jr->n_keys == 1) {
1020         /* if there's just a single key, we can figure some
1021            cases out already */
1022         if (aggr_val_determined(jr, n, &x, err)) {
1023             return x;
1024         }
1025     }
1026 
1027     if (jr->key_row[pos] < 0) {
1028         /* "can't happen" */
1029         return NADBL;
1030     }
1031 
1032     /* set the range of rows for reading from the joiner rectangle */
1033     imin = jr->key_row[pos];
1034     imax = imin + n;
1035 
1036 #if AGGDEBUG
1037     fprintf(stderr, "  aggregation row range: %d to %d\n", imin+1, imax);
1038 #endif
1039 
1040     if (jr->aggr == AGGR_MIDAS) {
1041         /* special case: MIDAS "spreading" */
1042         int daily = dated_daily_data(jr->r_dset);
1043         int gotit = 0;
1044 
1045         x = NADBL;
1046 
1047         for (i=imin; i<imax && !gotit; i++) {
1048             /* loop across primary key matches */
1049             jr_row *r = &jr->rows[i];
1050 
1051             if (jr->n_keys == 1 || key2 == r->keyval2) {
1052                 /* got secondary key match */
1053                 int sub, t = r->dset_row;
1054 #if AGGDEBUG
1055                 fprintf(stderr, "  i=%d: 2-key match: %d,%d (revseq=%d)\n",
1056                         i, (int) key1, (int) key2, revseq);
1057 #endif
1058                 if (daily) {
1059                     /* outer dataset has known daily structure */
1060                     sub = midas_day_index(t, jr->r_dset);
1061                     gotit = sub == revseq;
1062                 } else if (midas_daily(jr) && r->micro > 0) {
1063                     /* "other" daily data: r->micro holds day */
1064                     sub = month_day_index((int) key1, (int) key2,
1065                                           r->micro, jr->midas_pd);
1066                     gotit = sub == revseq;
1067                 } else {
1068                     if (r->micro > 0) {
1069                         /* if present, this is derived from the outer
1070                            time-key specification
1071                         */
1072                         sub = r->micro;
1073                     } else {
1074                         date_maj_min(t, jr->r_dset, NULL, &sub);
1075                     }
1076                     gotit = (sub - 1) % jr->midas_m + 1 == revseq;
1077                 }
1078                 if (gotit) {
1079                     x = jr->r_dset->Z[v][t];
1080                 }
1081             }
1082         }
1083 
1084         /* and we're done */
1085         return x;
1086     }
1087 
1088     /* We now fill out the array @xmatch with non-missing values
1089        from the matching outer rows. If we have a secondary key
1090        we screen for matches on that as we go.
1091     */
1092 
1093     n = 0;      /* will now hold count of non-NA matches */
1094     ntotal = 0; /* will ignore the OK/NA distinction */
1095 
1096     for (i=imin; i<imax; i++) {
1097         jr_row *r = &jr->rows[i];
1098 
1099         if (jr->n_keys == 1 || key2 == r->keyval2) {
1100             ntotal++;
1101             x = jr->r_dset->Z[v][r->dset_row];
1102             if (jr->auxcol) {
1103                 xa = r->aux;
1104                 if (!na(x) && na(xa)) {
1105                     /* we can't know the min/max of the aux var */
1106                     *err = E_MISSDATA;
1107                     return NADBL;
1108                 }
1109                 if (!na(xa)) {
1110                     auxmatch[n] = xa;
1111                     xmatch[n++] = x;
1112                 }
1113             } else if (!na(x)) {
1114                 xmatch[n++] = x;
1115             }
1116         }
1117     }
1118 
1119     if (jr->n_keys > 1) {
1120         /* we've already checked this for the 1-key case */
1121         if (aggr_val_determined(jr, n, &x, err)) {
1122             return x;
1123         }
1124     }
1125 
1126     x = NADBL;
1127 
1128     if (n == 0) {
1129         ; /* all matched observations are NA */
1130     } else if (jr->aggr == AGGR_NONE) {
1131         x = xmatch[0];
1132     } else if (jr->aggr == AGGR_SEQ) {
1133         int sval = jr->seqval;
1134 
1135         i = sval < 0 ? n + sval : sval - 1;
1136         if (i >= 0 && i < n) {
1137             x = xmatch[i];
1138         }
1139     } else if (jr->aggr == AGGR_MAX || jr->aggr == AGGR_MIN) {
1140         if (jr->auxcol) {
1141             /* using the max/min of an auxiliary var */
1142             int idx = 0;
1143 
1144             x = auxmatch[0];
1145             for (i=1; i<n; i++) {
1146                 if (min_max_cond(auxmatch[i], x, jr->aggr)) {
1147                     x = auxmatch[i];
1148                     idx = i;
1149                 }
1150             }
1151             x = xmatch[idx];
1152         } else {
1153             /* max/min of the actual data */
1154             x = xmatch[0];
1155             for (i=1; i<n; i++) {
1156                 if (min_max_cond(xmatch[i], x, jr->aggr)) {
1157                     x = xmatch[i];
1158                 }
1159             }
1160         }
1161     } else if (jr->aggr == AGGR_SUM || jr->aggr == AGGR_AVG) {
1162         x = 0.0;
1163         for (i=0; i<n; i++) {
1164             x += xmatch[i];
1165         }
1166         if (jr->aggr == AGGR_AVG) {
1167             x /= n;
1168         }
1169     }
1170 
1171     return x;
1172 }
1173 
handle_midas_setup(joiner * jr,int i,int lv,int rv,int revseq)1174 static void handle_midas_setup (joiner *jr, int i, int lv, int rv,
1175                                 int revseq)
1176 {
1177     char label[MAXLABEL];
1178 
1179     series_set_midas_period(jr->l_dset, lv, revseq);
1180     sprintf(label, "%s in sub-period %d",
1181             jr->r_dset->varname[rv], revseq);
1182     series_record_label(jr->l_dset, lv, label);
1183     series_set_midas_freq(jr->l_dset, lv, jr->r_dset->pd);
1184     if (i == 1) {
1185         series_set_midas_anchor(jr->l_dset, lv);
1186     }
1187 }
1188 
1189 /* Handle the case where (a) the value from the right, @rz, is
1190    actually the coding of a string value, and (b) the LHS series is
1191    pre-existing and already has a string table attached. The RHS
1192    coding must be made consistent with that on the left. We reach this
1193    function only if we've verified that there are string tables on
1194    both sides, and that @rz is not NA.
1195 */
1196 
maybe_adjust_string_code(series_table * rst,series_table * lst,double rz,int * err)1197 static double maybe_adjust_string_code (series_table *rst,
1198                                         series_table *lst,
1199                                         double rz, int *err)
1200 {
1201     const char *rstr = series_table_get_string(rst, rz);
1202     double lz = series_table_get_value(lst, rstr);
1203 
1204     if (!na(lz)) {
1205         /* use the LHS encoding */
1206         rz = lz;
1207     } else {
1208         /* we need to append to the LHS string table */
1209         int n = series_table_add_string(lst, rstr);
1210 
1211         if (n < 0) {
1212             *err = E_ALLOC;
1213         } else {
1214             rz = n;
1215         }
1216     }
1217 
1218     return rz;
1219 }
1220 
jr_matcher_free(jr_matcher * matcher)1221 static void jr_matcher_free (jr_matcher *matcher)
1222 {
1223     free(matcher->k1);
1224     free(matcher->k2);
1225     free(matcher->pos);
1226 }
1227 
jr_matcher_init(jr_matcher * matcher,int nobs,int nkeys)1228 static int jr_matcher_init (jr_matcher *matcher, int nobs,
1229                             int nkeys)
1230 {
1231     int i, err = 0;
1232 
1233     matcher->k1 = malloc(nobs * sizeof *matcher->k1);
1234     matcher->pos = malloc(nobs * sizeof *matcher->pos);
1235     matcher->k2 = NULL;
1236 
1237     if (matcher->k1 == NULL || matcher->pos == NULL) {
1238         err = E_ALLOC;
1239     } else if (nkeys == 2) {
1240         matcher->k2 = malloc(nobs * sizeof *matcher->k2);
1241         if (matcher->k2 == NULL) {
1242             err = E_ALLOC;
1243         }
1244     }
1245 
1246     if (err) {
1247         jr_matcher_free(matcher);
1248         return err;
1249     }
1250 
1251     for (i=0; i<nobs; i++) {
1252         matcher->k1[i] = 0;
1253         matcher->pos[i] = 0;
1254         if (matcher->k2 != NULL) {
1255             matcher->k2[i] = 0;
1256         }
1257     }
1258 
1259     return 0;
1260 }
1261 
get_all_inner_key_values(joiner * jr,const int * ikeyvars,jr_matcher * matcher)1262 static int get_all_inner_key_values (joiner *jr,
1263                                      const int *ikeyvars,
1264                                      jr_matcher *matcher)
1265 {
1266     DATASET *dset = jr->l_dset;
1267     int i, j, n = sample_size(dset);
1268     int err;
1269 
1270     err = jr_matcher_init(matcher, n, jr->n_keys);
1271     if (err) {
1272         return err;
1273     }
1274 
1275     for (i=dset->t1, j=0; i<=dset->t2 && !err; i++, j++) {
1276         if (using_auto_keys(jr)) {
1277             /* using the LHS dataset obs info */
1278             char obs[OBSLEN];
1279 
1280             ntolabel(obs, i, dset);
1281             if (calendar_data(dset)) {
1282                 guint32 ed = get_epoch_day(obs);
1283 
1284                 if (jr->n_keys == 2) {
1285                     /* inner daily, outer monthly */
1286                     int y, m, d;
1287 
1288                     ymd_bits_from_epoch_day(ed, &y, &m, &d);
1289                     matcher->k1[j] = y;
1290                     matcher->k2[j] = m;
1291                 } else {
1292                     matcher->k1[j] = ed;
1293                 }
1294             } else {
1295                 /* monthly or quarterly (FIXME any others?) */
1296                 matcher->k1[j] = atoi(obs);
1297 		matcher_set_k2(matcher, j, atoi(obs + 5));
1298             }
1299         } else {
1300             /* using regular LHS key series */
1301             double dk1, dk2 = 0;
1302             keynum k1 = 0, k2 = 0;
1303 
1304             dk1 = dset->Z[ikeyvars[1]][i];
1305             if (jr->n_keys == 2) {
1306                 dk2 = dset->Z[ikeyvars[2]][i];
1307             }
1308             if (na(dk1) || na(dk2)) {
1309                 matcher->pos[j] = KEYMISS;
1310             } else {
1311                 k1 = dtoll(dk1, &err);
1312                 if (!err && jr->n_keys == 2) {
1313                     k2 = dtoll(dk2, &err);
1314                 }
1315             }
1316             if (!err && matcher->pos[j] != KEYMISS) {
1317                 matcher->k1[j] = k1;
1318 		matcher_set_k2(matcher, j, k2);
1319             }
1320         }
1321 
1322         if (!err && matcher->pos[j] != KEYMISS) {
1323             /* look up position in outer keys array */
1324             matcher->pos[j] = binsearch(matcher->k1[j], jr->keys,
1325                                         jr->n_unique, 0);
1326         }
1327     }
1328 
1329     return err;
1330 }
1331 
1332 /* Returns the 0-based column index in the outer dataset associated
1333    with the i^th target variable for joining. Note that the arg @i is
1334    1-based, being a position within a gretl list.
1335 */
1336 
outer_series_index(joinspec * jspec,int i)1337 static int outer_series_index (joinspec *jspec, int i)
1338 {
1339     if (i >= 1) {
1340         return jspec->colnums[JOIN_TARG + i - 1];
1341     } else {
1342         return -1;
1343     }
1344 }
1345 
aggregate_data(joiner * jr,const int * ikeyvars,const int * targvars,joinspec * jspec,int orig_v,int * modified)1346 static int aggregate_data (joiner *jr, const int *ikeyvars,
1347                            const int *targvars, joinspec *jspec,
1348                            int orig_v, int *modified)
1349 {
1350     jr_matcher matcher = {0};
1351     series_table *rst = NULL;
1352     series_table *lst = NULL;
1353     DATASET *dset = jr->l_dset;
1354     double *xmatch = NULL;
1355     double *auxmatch = NULL;
1356     int revseq = 0;
1357     int i, t, nmax;
1358     int err = 0;
1359 
1360     /* find the greatest (primary) key frequency */
1361     nmax = 0;
1362     for (i=0; i<jr->n_unique; i++) {
1363         if (jr->key_freq[i] > nmax) {
1364             nmax = jr->key_freq[i];
1365         }
1366     }
1367 
1368 #if AGGDEBUG
1369     fprintf(stderr, "\naggregate data: max primary matches = %d\n", nmax);
1370 #endif
1371 
1372     if (jr->aggr == AGGR_MIDAS) {
1373         /* reverse sequence number for MIDAS join */
1374         revseq = targvars[0];
1375         jr->midas_m = revseq;
1376 #if AGGDEBUG
1377         fprintf(stderr, "midas m = %d\n", jr->midas_m);
1378 #endif
1379     } else if (nmax > 0) {
1380         /* allocate workspace for aggregation */
1381         int nx = (jr->auxcol > 0)? 2 * nmax : nmax;
1382 
1383         xmatch = malloc(nx * sizeof *xmatch);
1384         if (xmatch == NULL) {
1385             return E_ALLOC;
1386         }
1387         if (jr->auxcol) {
1388             auxmatch = xmatch + nmax;
1389         }
1390     }
1391 
1392     err = get_all_inner_key_values(jr, ikeyvars, &matcher);
1393 
1394     for (i=1; i<=targvars[0] && !err; i++) {
1395         /* loop across the series to be added/modified */
1396         int s, rv, lv = targvars[i];
1397         int strcheck = 0;
1398 
1399 #if AGGDEBUG
1400         fprintf(stderr, "\nworking on series %d\n", i);
1401 #endif
1402 
1403         if (jr->aggr == AGGR_MIDAS) {
1404             rv = jspec->colnums[JOIN_TARG];
1405             jr->midas_pd = jspec->midas_pd;
1406         } else {
1407             rv = outer_series_index(jspec, i);
1408             jr->midas_pd = 0;
1409         }
1410 
1411         if (rv > 0) {
1412             /* check for the case where both the target variable on the
1413                left and the series to be imported are string-valued
1414             */
1415             rst = series_get_string_table(jr->r_dset, rv);
1416             lst = series_get_string_table(jr->l_dset, lv);
1417             strcheck = (rst != NULL && lst != NULL);
1418         }
1419 
1420         /* run through the rows in the current sample range of the
1421            left-hand dataset, pick up the value of the inner key(s), and
1422            call aggr_value() to determine the value that should be
1423            imported from the right
1424         */
1425 
1426         for (t=dset->t1, s=0; t<=dset->t2 && !err; t++, s++) {
1427 	    int nomatch = 0;
1428             double zt;
1429 
1430 #if AGGDEBUG
1431             fprintf(stderr, " working on obs %d\n", t);
1432 #endif
1433             if (matcher.pos[s] == KEYMISS) {
1434                 dset->Z[lv][t] = NADBL;
1435                 continue;
1436             } else if (matcher.pos[s] < 0) {
1437 		nomatch = 1;
1438 		zt = (jr->aggr == AGGR_COUNT)? 0 : NADBL;
1439 	    } else {
1440 		zt = aggr_value(jr, &matcher, s, rv, revseq, xmatch,
1441 				auxmatch, &err);
1442 	    }
1443 #if AGGDEBUG
1444             if (na(zt)) {
1445                 fprintf(stderr, " aggr_value: got NA (keys=%g,%g, err=%d)\n",
1446                         matcher.k1[s], matcher_get_k2(&matcher, s), err);
1447             } else {
1448                 fprintf(stderr, " aggr_value: got %.12g (keys=%g,%g, err=%d)\n",
1449                         zt, matcher.k1[s], matcher_get_k2(&matcher, s), err);
1450             }
1451 #endif
1452             if (!err && strcheck && !na(zt)) {
1453                 zt = maybe_adjust_string_code(rst, lst, zt, &err);
1454             }
1455             if (!err) {
1456                 if (lv >= orig_v) {
1457                     /* @lv is a newly added series */
1458                     dset->Z[lv][t] = zt;
1459                 } else if (zt != dset->Z[lv][t]) {
1460                     if (nomatch && !na(dset->Z[lv][t])) {
1461                         ; /* leave existing data alone (?) */
1462                     } else {
1463                         dset->Z[lv][t] = zt;
1464                         *modified += 1;
1465                     }
1466                 }
1467             }
1468         }
1469 
1470         if (!err && jr->aggr == AGGR_MIDAS) {
1471             handle_midas_setup(jr, i, lv, rv, revseq);
1472         }
1473 
1474         revseq--;
1475     }
1476 
1477     free(xmatch);
1478     jr_matcher_free(&matcher);
1479 
1480     return err;
1481 }
1482 
1483 /* Simple transcription: we come here only if there are no keys, and
1484    we've verified that the number of rows on the right is no greater
1485    than the number of rows in the current sample range on the left.
1486 */
1487 
join_transcribe_data(joiner * jr,int lv,int newvar,joinspec * jspec,int * modified)1488 static int join_transcribe_data (joiner *jr, int lv, int newvar,
1489                                  joinspec *jspec, int *modified)
1490 {
1491     series_table *rst = NULL;
1492     series_table *lst = NULL;
1493     DATASET *dset = jr->l_dset;
1494     double zi;
1495     int strcheck = 0;
1496     int i, t, rv;
1497     int err = 0;
1498 
1499     rv = outer_series_index(jspec, 1);
1500     if (rv < 0) {
1501         return E_DATA;
1502     }
1503 
1504     rst = series_get_string_table(jr->r_dset, rv);
1505     lst = series_get_string_table(jr->l_dset, lv);
1506     strcheck = (rst != NULL && lst != NULL);
1507 
1508     for (i=0; i<jr->n_rows && !err; i++) {
1509         jr_row *r = &jr->rows[i];
1510 
1511         zi = jr->r_dset->Z[rv][r->dset_row];
1512         if (strcheck && !na(zi)) {
1513             zi = maybe_adjust_string_code(rst, lst, zi, &err);
1514         }
1515         if (!err) {
1516             t = dset->t1 + i;
1517             if (newvar) {
1518                 dset->Z[lv][t] = zi;
1519             } else if (zi != dset->Z[lv][t]) {
1520                 dset->Z[lv][t] = zi;
1521                 *modified += 1;
1522             }
1523         }
1524     }
1525 
1526     return err;
1527 }
1528 
1529 #include "tsjoin.c"
1530 
join_transcribe_multi_data(DATASET * l_dset,DATASET * r_dset,int * targlist,int orig_v,joinspec * jspec,ts_joiner * tjr,int * modified)1531 static int join_transcribe_multi_data (DATASET *l_dset,
1532                                        DATASET *r_dset,
1533                                        int *targlist,
1534                                        int orig_v,
1535                                        joinspec *jspec,
1536                                        ts_joiner *tjr,
1537                                        int *modified)
1538 {
1539     series_table *rst = NULL;
1540     series_table *lst = NULL;
1541     int i, s, t, lv, rv;
1542     int t1, t2, m = 0;
1543     int strcheck, newvar;
1544     double xit;
1545     int err = 0;
1546 
1547     if (tjr == NULL) {
1548         t1 = l_dset->t1;
1549         t2 = l_dset->t2;
1550     } else {
1551         t1 = tjr->t1;
1552         t2 = tjr->t2;
1553     }
1554 
1555     for (i=1; i<=targlist[0] && !err; i++) {
1556         lv = targlist[i];
1557         rv = outer_series_index(jspec, i);
1558         if (rv < 0) {
1559             gretl_errmsg_sprintf("join: '%s' not matched", l_dset->varname[lv]);
1560             err = E_DATA;
1561         } else {
1562             newvar = lv >= orig_v;
1563             if (newvar) {
1564                 strcheck = 0;
1565             } else {
1566                 rst = series_get_string_table(r_dset, rv);
1567                 lst = series_get_string_table(l_dset, lv);
1568                 strcheck = (rst != NULL && lst != NULL);
1569             }
1570             if (tjr != NULL) {
1571                 m = tjr->rminor;
1572                 s = tjr->rt1;
1573             } else {
1574                 s = 0;
1575             }
1576             for (t=t1; t<=t2 && !err; t++) {
1577                 xit = r_dset->Z[rv][s];
1578                 if (strcheck && !na(xit)) {
1579                     xit = maybe_adjust_string_code(rst, lst, xit, &err);
1580                 }
1581                 if (newvar) {
1582                     l_dset->Z[lv][t] = xit;
1583                 } else if (xit != l_dset->Z[lv][t]) {
1584                     l_dset->Z[lv][t] = xit;
1585                     *modified += 1;
1586                 }
1587                 if (tjr != NULL) {
1588                     m = tj_continue(tjr, m, &s);
1589                 } else {
1590                     s++;
1591                 }
1592             }
1593         }
1594     }
1595 
1596     return err;
1597 }
1598 
join_simple_range_check(DATASET * l_dset,DATASET * r_dset,int * targlist)1599 static int join_simple_range_check (DATASET *l_dset,
1600                                     DATASET *r_dset,
1601                                     int *targlist)
1602 {
1603     int err = 0;
1604 
1605     if (r_dset->n != sample_size(l_dset)) {
1606         gretl_errmsg_set("join: the observation ranges don't match");
1607         err = E_DATA;
1608     } else if (r_dset->v - 1 < targlist[0]) {
1609         gretl_errmsg_set("join: series missing on the right");
1610         err = E_DATA;
1611     }
1612 
1613     return err;
1614 }
1615 
join_filter_new(int * err)1616 static jr_filter *join_filter_new (int *err)
1617 {
1618     jr_filter *filter = malloc(sizeof *filter);
1619 
1620     if (filter == NULL) {
1621         *err = E_ALLOC;
1622     } else {
1623         filter->expr = NULL;
1624         filter->val = NULL;
1625         filter->vname1 = NULL;
1626         filter->vname2 = NULL;
1627         filter->vname3 = NULL;
1628     }
1629 
1630     return filter;
1631 }
1632 
1633 #if JDEBUG
1634 
print_filter_vnames(jr_filter * f)1635 static void print_filter_vnames (jr_filter *f)
1636 {
1637     if (f == NULL) return;
1638 
1639     if (f->vname1 != NULL) {
1640         fprintf(stderr, "filter varname 1 (target %d, JOIN_F1): %s\n",
1641                 JOIN_F1, f->vname1);
1642     }
1643     if (f->vname2 != NULL) {
1644         fprintf(stderr, "filter varname 2 (target %d, JOIN_F2): %s\n",
1645                 JOIN_F2, f->vname2);
1646     }
1647     if (f->vname3 != NULL) {
1648         fprintf(stderr, "filter varname 3 (target %d, JOIN_F3): %s\n",
1649                 JOIN_F3, f->vname3);
1650     }
1651 }
1652 
1653 #endif
1654 
1655 /* Allocate the filter struct and crawl along the filter expression,
1656    @s, looking for up to three names of right-hand columns. We need to
1657    record these names so we can be sure to read the associated column
1658    data, or else the filter won't work. We could increase the maximum
1659    number of column-names to store but probably 3 is enough.
1660 
1661    The heuristic for column-name detection is that we find a portion
1662    of @s which is legal as a gretl identifier but which is not
1663    enclosed in quotes, is not directly followed by '(', and is not
1664    the name of a scalar or string variable on the left.
1665 */
1666 
make_join_filter(const char * s,int * err)1667 static jr_filter *make_join_filter (const char *s, int *err)
1668 {
1669     jr_filter *filter = join_filter_new(err);
1670 
1671     if (filter != NULL) {
1672         char test[VNAMELEN];
1673         int n, ngot = 0;
1674 
1675         filter->expr = s;
1676 
1677         while (*s && ngot < 3) {
1678             if (*s == '"') {
1679                 /* skip double-quoted stuff */
1680                 s = strchr(s + 1, '"');
1681                 if (s == NULL) {
1682                     *err = E_PARSE;
1683                     break;
1684                 } else {
1685                     s++;
1686                     if (*s == '\0') {
1687                         break;
1688                     }
1689                 }
1690             }
1691             n = gretl_namechar_spn(s);
1692             if (n > 0) {
1693                 if (n < VNAMELEN && s[n] != '(') {
1694                     *test = '\0';
1695                     strncat(test, s, n);
1696                     if (!gretl_is_scalar(test) && !gretl_is_string(test)) {
1697                         if (ngot == 0) {
1698                             filter->vname1 = gretl_strdup(test);
1699                         } else if (ngot == 1) {
1700                             filter->vname2 = gretl_strdup(test);
1701                         } else if (ngot == 2) {
1702                             filter->vname3 = gretl_strdup(test);
1703                         }
1704                         ngot++;
1705                     }
1706                 }
1707                 s += n;
1708             } else {
1709                 s++;
1710             }
1711         }
1712     }
1713 
1714 #if JDEBUG
1715     print_filter_vnames(filter);
1716 #endif
1717 
1718     return filter;
1719 }
1720 
1721 /* Add series to hold the join data.  We come here only if the
1722    target series is/are not already present in the left-hand
1723    dataset.
1724 */
1725 
add_target_series(const char ** vnames,DATASET * dset,int * targvars,int n_add)1726 static int add_target_series (const char **vnames,
1727                               DATASET *dset,
1728                               int *targvars,
1729                               int n_add)
1730 {
1731     int i, v = dset->v;
1732     int err;
1733 
1734     err = dataset_add_NA_series(dset, n_add);
1735 
1736     if (!err) {
1737         for (i=1; i<=targvars[0]; i++) {
1738             if (targvars[i] < 0) {
1739                 strcpy(dset->varname[v], vnames[i-1]);
1740                 targvars[i] = v++;
1741             }
1742         }
1743     }
1744 
1745     return err;
1746 }
1747 
1748 /* Parse either one or two elements (time-key column name, time format
1749    string) out of @s. If both elements are present they must be
1750    comma-separated; if only the second element is present it must be
1751    preceded by a comma.
1752 */
1753 
process_time_key(const char * s,char * tkeyname,char * tkeyfmt)1754 static int process_time_key (const char *s, char *tkeyname,
1755                              char *tkeyfmt)
1756 {
1757     int err = 0;
1758 
1759     if (*s == ',') {
1760         /* only time-key format supplied */
1761         strncat(tkeyfmt, s + 1, 31);
1762     } else {
1763         const char *p = strchr(s, ',');
1764 
1765         if (p == NULL) {
1766             /* only column name given */
1767             strncat(tkeyname, s, VNAMELEN - 1);
1768         } else {
1769             int n = p - s;
1770 
1771             n = (n > VNAMELEN - 1)? VNAMELEN - 1 : n;
1772             strncat(tkeyname, s, n);
1773             strncat(tkeyfmt, p + 1, 31);
1774         }
1775     }
1776 
1777 #if TDEBUG
1778     fprintf(stderr, "time key: name='%s', format='%s'\n",
1779             tkeyname, tkeyfmt);
1780 #endif
1781 
1782     if (*tkeyname == '\0' && *tkeyfmt == '\0') {
1783         err = E_DATA;
1784     }
1785 
1786     return err;
1787 }
1788 
1789 /* Parse either one column-name or two comma-separated names out of
1790    @s. If @s contains a comma, we accept a zero-length name on either
1791    the left or the right -- but not both -- as indicating that we
1792    should use the corresponding inner key name.
1793 */
1794 
process_outer_key(const char * s,int n_keys,char * name1,char * name2,gretlopt opt)1795 static int process_outer_key (const char *s, int n_keys,
1796                               char *name1, char *name2,
1797                               gretlopt opt)
1798 {
1799     int n_okeys = 0;
1800     int err = 0;
1801 
1802     if (strchr(s, ',') == NULL) {
1803         /* just one outer key */
1804         strncat(name1, s, VNAMELEN - 1);
1805         n_okeys = 1;
1806     } else {
1807         /* two comma-separated keys */
1808         int n2 = 0, n1 = strcspn(s, ",");
1809 
1810         if (n1 >= VNAMELEN) {
1811             err = E_PARSE;
1812         } else {
1813             strncat(name1, s, n1);
1814             s += n1 + 1;
1815             n2 = strlen(s);
1816             if (n2 >= VNAMELEN) {
1817                 err = E_PARSE;
1818             } else {
1819                 strcat(name2, s);
1820             }
1821         }
1822 
1823         if (!err && n1 == 0 && n2 == 0) {
1824             /* both fields empty: wrong */
1825             err = E_PARSE;
1826         }
1827 
1828         if (!err) {
1829             n_okeys = 2;
1830         }
1831     }
1832 
1833     if (!err && n_okeys != n_keys) {
1834         err = E_PARSE;
1835     }
1836 
1837     return err;
1838 }
1839 
check_for_quarterly_format(obskey * auto_keys,int pd)1840 static int check_for_quarterly_format (obskey *auto_keys, int pd)
1841 {
1842     char *s = auto_keys->timefmt;
1843     int i, err = 0;
1844 
1845 #if JDEBUG
1846     fprintf(stderr, "check_for_quarterly_format: '%s'\n", s);
1847 #endif
1848 
1849     for (i=0; s[i]; i++) {
1850         if (s[i] == '%' &&
1851             (s[i+1] == 'q' || s[i+1] == 'Q') &&
1852             (i == 0 || s[i-1] != '%')) {
1853             if (pd == 4 || pd == 1) {
1854                 s[i+1] = 'm';
1855                 auto_keys->m_means_q = 1;
1856             } else {
1857                 err = E_DATA;
1858                 gretl_errmsg_sprintf("The '%c' format is not applicable "
1859                                      "for data with frequency %d",
1860                                      s[i+1], pd);
1861             }
1862             break;
1863         }
1864     }
1865 
1866     return err;
1867 }
1868 
1869 /* for use in determining optimal auto keys */
1870 #define daily(d) (d->pd >= 5 && d->pd <= 7)
1871 
1872 /* time-series data on the left, and no explicit keys supplied */
1873 
auto_keys_check(const DATASET * l_dset,const DATASET * r_dset,gretlopt opt,const char * tkeyfmt,obskey * auto_keys,int * n_keys,int * do_tsjoin)1874 static int auto_keys_check (const DATASET *l_dset,
1875                             const DATASET *r_dset,
1876                             gretlopt opt,
1877                             const char *tkeyfmt,
1878                             obskey *auto_keys,
1879                             int *n_keys,
1880                             int *do_tsjoin)
1881 {
1882     int lpd = l_dset->pd;
1883     int rpd = 0;
1884     int err = 0;
1885 
1886     if (!dataset_is_time_series(l_dset)) {
1887         /* On the left we need a time-series dataset */
1888         err = E_DATA;
1889         goto bailout;
1890     }
1891 
1892     if (dataset_is_time_series(r_dset)) {
1893         rpd = r_dset->pd;
1894         if (use_tsjoin(l_dset, r_dset)) {
1895             *do_tsjoin = 1;
1896             return 0;
1897         }
1898         auto_keys->native = 1;
1899     } else if (r_dset->S == NULL && auto_keys->keycol < 0) {
1900         /* On the right, we need either obs strings or a specified
1901            time column
1902         */
1903         err = E_DATA;
1904         goto bailout;
1905     }
1906 
1907     if (*tkeyfmt != '\0') {
1908         /* the user supplied a time-format spec */
1909         err = set_time_format(auto_keys, tkeyfmt);
1910         if (!err) {
1911             err = check_for_quarterly_format(auto_keys, lpd);
1912         }
1913         if (!err) {
1914             if (annual_data(l_dset)) {
1915                 *n_keys = 1;
1916             } else if (calendar_data(l_dset)) {
1917                 *n_keys = 1;
1918             } else {
1919                 *n_keys = 2;
1920             }
1921         }
1922     } else {
1923         /* default formats */
1924         if (calendar_data(l_dset)) {
1925             err = set_time_format(auto_keys, "%Y-%m-%d");
1926             if (daily(l_dset) && rpd == 12) {
1927                 *n_keys = 2; /* use year and month */
1928             } else if (!err) {
1929                 *n_keys = 1; /* use epoch day */
1930             }
1931         } else if (lpd == 12) {
1932             err = set_time_format(auto_keys, "%Y-%m");
1933             if (!err) {
1934                 *n_keys = 2;
1935             }
1936         } else if (annual_data(l_dset)) {
1937             err = set_time_format(auto_keys, "%Y");
1938             if (!err) {
1939                 *n_keys = 1;
1940             }
1941         } else if (lpd == 4) {
1942             /* try "excess precision" ISO daily? */
1943             err = set_time_format(auto_keys, "%Y-%m-%d");
1944             if (!err) {
1945                 *n_keys = 2;
1946             }
1947         } else {
1948             err = E_PDWRONG;
1949         }
1950     }
1951 
1952  bailout:
1953 
1954     /* we should flag an error here only if the user
1955        explicitly requested use of this apparatus,
1956        by giving --tkey=<whatever> (OPT_K)
1957     */
1958     if (err && !(opt & OPT_K)) {
1959         err = 0;
1960     }
1961 
1962     return err;
1963 }
1964 
make_time_formats_array(char const ** fmts,char *** pS)1965 static int make_time_formats_array (char const **fmts, char ***pS)
1966 {
1967     char **S = strings_array_new(2);
1968     int i, err = 0;
1969 
1970     if (S == NULL) {
1971         err = E_ALLOC;
1972     } else {
1973         for (i=0; i<2 && !err; i++) {
1974             if (fmts[i] != NULL) {
1975                 S[i] = gretl_strdup(fmts[i]);
1976                 if (S[i] == NULL) {
1977                     err = E_ALLOC;
1978                 }
1979             }
1980         }
1981     }
1982 
1983     if (err) {
1984         strings_array_free(S, 2);
1985     } else {
1986         *pS = S;
1987     }
1988 
1989     return err;
1990 }
1991 
1992 /* Crawl along the string containing comma-separated names of columns
1993    that fall under the "tconvert" option. For each name, look for a
1994    match among the names of columns selected from the CSV file for
1995    some role in the join operation (data, key or whatever). It is
1996    not considered an error if there's no match for a given "tconvert"
1997    name; in that case the column will be ignored.
1998 
1999    If @tconvfmt is non-NULL, this should hold a common format for
2000    date conversion.
2001 
2002    If @keyfmt is not an empty string, it holds a specific format
2003    for the --tkey column. In that case we should check for tkey
2004    among the tconvert columns; if it's present we need to add its
2005    format to the timeconv_map apparatus (as an override to the
2006    common format, or an addendum if no common format is given).
2007 */
2008 
process_tconvert_info(joinspec * jspec,const char * tconvcols,const char * tconvfmt,const char * keyfmt)2009 static int process_tconvert_info (joinspec *jspec,
2010                                   const char *tconvcols,
2011                                   const char *tconvfmt,
2012                                   const char *keyfmt)
2013 {
2014     int *list = NULL;
2015     char **names = NULL;
2016     char **fmts = NULL;
2017     const char *colname;
2018     const char *tkeyfmt = NULL;
2019     char *tkeyname = NULL;
2020     int nnames = 0;
2021     int i, j, err = 0;
2022 
2023     names = gretl_string_split(tconvcols, &nnames, " ,");
2024     if (names == NULL) {
2025         err = E_ALLOC;
2026     }
2027 
2028     /* match the names we got against the set of "wanted" columns */
2029 
2030     for (i=0; i<nnames && !err; i++) {
2031         for (j=0; j<jspec->ncols; j++) {
2032             colname = jspec->colnames[j];
2033             if (colname != NULL && !strcmp(names[i], colname)) {
2034                 gretl_list_append_term(&list, j);
2035                 if (list == NULL) {
2036                     err = E_ALLOC;
2037                 } else if (*keyfmt != '\0' && j == JOIN_KEY) {
2038                     /* we've got the time-key variable here,
2039                        and a format has been given for it
2040                     */
2041                     tkeyfmt = keyfmt;
2042                     tkeyname = names[i];
2043                 }
2044                 break;
2045             }
2046         }
2047     }
2048 
2049     /* allocate and record the time-format info, if any */
2050     if (!err && list != NULL && (tconvfmt != NULL || tkeyfmt != NULL)) {
2051         char const *tmp[2] = {tconvfmt, tkeyfmt};
2052 
2053         err = make_time_formats_array(tmp, &fmts);
2054     }
2055 
2056 #if JDEBUG
2057     printlist(list, "timeconv list");
2058 #endif
2059 
2060     if (!err && list != NULL) {
2061         timeconv_map_set(nnames, names, tkeyname, fmts);
2062     }
2063 
2064     if (err) {
2065         /* clean up if need be */
2066         strings_array_free(names, nnames);
2067         free(list);
2068     } else {
2069         jspec->timecols = list;
2070     }
2071 
2072     return err;
2073 }
2074 
obskey_init(obskey * keys)2075 static void obskey_init (obskey *keys)
2076 {
2077     keys->timefmt = NULL;
2078     keys->keycol = -1;
2079     keys->m_means_q = 0;
2080     keys->numdates = 0;
2081     keys->native = 0;
2082 }
2083 
2084 #define lr_mismatch(l,r) ((l > 0 && r == 0) || (r > 0 && l == 0))
2085 
2086 /* Run a check pertaining to the nature of the "payload"
2087    (string-valued vs numeric) in relation to the aggregation
2088    method specified and the nature of the existing left-hand
2089    series, if any.
2090 */
2091 
join_data_type_check(joinspec * jspec,const DATASET * l_dset,int * targvars,AggrType aggr)2092 static int join_data_type_check (joinspec *jspec,
2093                                  const DATASET *l_dset,
2094                                  int *targvars,
2095                                  AggrType aggr)
2096 {
2097     DATASET *r_dset = outer_dataset(jspec);
2098     int lstr, rstr;
2099     int i, vl, vr;
2100     int err = 0;
2101 
2102     for (i=1; i<=targvars[0]; i++) {
2103         lstr = rstr = -1;
2104         vl = targvars[i];
2105         vr = outer_series_index(jspec, i);
2106         if (vl > 0) {
2107             /* there's an existing LHS series */
2108             lstr = is_string_valued(l_dset, vl);
2109             if (lstr && aggr == AGGR_COUNT) {
2110                 /* count values can't be mixed with strings */
2111                 err = E_TYPES;
2112             }
2113         }
2114         if (!err && vr > 0) {
2115             /* there's a payload variable on the right */
2116             rstr = is_string_valued(r_dset, vr);
2117         }
2118         if (!err && lr_mismatch(lstr, rstr)) {
2119             /* one of (L, R) is string-valued, but not the other */
2120             err = E_TYPES;
2121         }
2122     }
2123 
2124     return err;
2125 }
2126 
aggregation_type_check(joinspec * jspec,AggrType aggr)2127 static int aggregation_type_check (joinspec *jspec, AggrType aggr)
2128 {
2129     int err = 0;
2130 
2131     if (aggr == AGGR_NONE || aggr == AGGR_COUNT || aggr == AGGR_SEQ) {
2132         ; /* no problem */
2133     } else {
2134         /* the aggregation method requires numerical data: flag
2135            an error if we got strings instead
2136         */
2137         const DATASET *dset = outer_dataset(jspec);
2138         int aggcol = 0;
2139 
2140         if (jspec->colnums[JOIN_AUX] > 0) {
2141             aggcol = jspec->colnums[JOIN_AUX];
2142         } else if (jspec->colnums[JOIN_TARG] > 0) {
2143             aggcol = jspec->colnums[JOIN_TARG];
2144         }
2145 
2146         if (aggcol > 0 && is_string_valued(dset, aggcol)) {
2147             gretl_errmsg_sprintf("'%s' is a string variable: aggregation type "
2148                                  "is not applicable", dset->varname[aggcol]);
2149             err = E_TYPES;
2150         }
2151     }
2152 
2153     return err;
2154 }
2155 
check_for_missing_columns(joinspec * jspec)2156 static int check_for_missing_columns (joinspec *jspec)
2157 {
2158     const char *name;
2159     int i;
2160 
2161     /* Note: it's possible, though we hope unlikely, that our
2162        heuristic for extracting column names from the filter
2163        expression (if present) gave a false positive. In that case the
2164        name at position JOIN_F* might be spuriously "missing": not
2165        found, but not really wanted. To guard against this eventuality
2166        we'll skip the check for the JOIN_F* columns here. If either
2167        one is really missing, that will show up before long, when the
2168        filter is evaluated.
2169     */
2170 
2171     for (i=0; i<jspec->ncols; i++) {
2172         if (i == JOIN_F1 || i == JOIN_F2 || i == JOIN_F3) {
2173             continue;
2174         }
2175         name = jspec->colnames[i];
2176         if (name != NULL && jspec->colnums[i] == 0) {
2177             gretl_errmsg_sprintf(_("%s: column '%s' was not found"), "join", name);
2178             return E_DATA;
2179         }
2180 #if JDEBUG
2181         if (name != NULL) {
2182             fprintf(stderr, "colname '%s' -> colnum %d\n", name, jspec->colnums[i]);
2183         }
2184 #endif
2185     }
2186 
2187     return 0;
2188 }
2189 
2190 /* Handle the situation where for one or other of the keys
2191    there's a type mismatch: string on left but not on right, or
2192    vice versa.
2193 */
2194 
key_types_error(int lstr,int rstr)2195 static int key_types_error (int lstr, int rstr)
2196 {
2197     if (lstr) {
2198         gretl_errmsg_sprintf(_("%s: string key on left but numeric on right"), "join");
2199     } else {
2200         gretl_errmsg_sprintf(_("%s: string key on right but numeric on left"), "join");
2201     }
2202 
2203     return E_TYPES;
2204 }
2205 
set_up_outer_keys(joinspec * jspec,const DATASET * l_dset,gretlopt opt,const int * ikeyvars,int * okeyvars,obskey * auto_keys,int * str_keys)2206 static int set_up_outer_keys (joinspec *jspec, const DATASET *l_dset,
2207                               gretlopt opt, const int *ikeyvars,
2208                               int *okeyvars, obskey *auto_keys,
2209                               int *str_keys)
2210 {
2211     const DATASET *r_dset = outer_dataset(jspec);
2212     int lstr, rstr;
2213     int err = 0;
2214 
2215     if (jspec->colnums[JOIN_KEY] > 0) {
2216         okeyvars[0] += 1;
2217         okeyvars[1] = jspec->colnums[JOIN_KEY];
2218     }
2219 
2220     if (jspec->colnums[JOIN_KEY2] > 0) {
2221         okeyvars[0] += 1;
2222         okeyvars[2] = jspec->colnums[JOIN_KEY2];
2223     }
2224 
2225     if (opt & OPT_K) {
2226         /* time key on right */
2227         rstr = is_string_valued(r_dset, okeyvars[1]);
2228         auto_keys->keycol = okeyvars[1];
2229         if (!rstr) {
2230             /* flag the need to convert to string later */
2231             auto_keys->numdates = 1;
2232         }
2233     } else {
2234         /* regular key(s) on right */
2235         lstr = is_string_valued(l_dset, ikeyvars[1]);
2236         rstr = is_string_valued(r_dset, okeyvars[1]);
2237 
2238         if (lstr != rstr) {
2239             err = key_types_error(lstr, rstr);
2240         } else if (lstr) {
2241             str_keys[0] = 1;
2242         }
2243 
2244         if (!err && okeyvars[2] > 0) {
2245             lstr = is_string_valued(l_dset, ikeyvars[2]);
2246             rstr = is_string_valued(r_dset, okeyvars[2]);
2247             if (lstr != rstr) {
2248                 err = key_types_error(lstr, rstr);
2249             } else if (lstr) {
2250                 str_keys[1] = 1;
2251             }
2252         }
2253     }
2254 
2255     return err;
2256 }
2257 
first_available_index(joinspec * jspec)2258 static int first_available_index (joinspec *jspec)
2259 {
2260     int i;
2261 
2262     for (i=JOIN_TARG; i<jspec->ncols; i++) {
2263         if (jspec->colnums[i] == 0) {
2264             return i;
2265         }
2266     }
2267 
2268     return -1;
2269 }
2270 
determine_gdt_matches(const char * fname,joinspec * jspec,int ** plist,int * addvars,int * omm,PRN * prn)2271 static int determine_gdt_matches (const char *fname,
2272                                   joinspec *jspec,
2273                                   int **plist,
2274                                   int *addvars,
2275                                   int *omm,
2276                                   PRN *prn)
2277 {
2278     char **vnames = NULL;
2279     int nv = 0;
2280     int err = 0;
2281 
2282     err = gretl_read_gdt_varnames(fname, &vnames, &nv);
2283 
2284     if (!err) {
2285         GPatternSpec *pspec;
2286         int *vlist = NULL;
2287         char **S = NULL;
2288         int i, j, ns = 0;
2289 
2290         /* form array of unique wanted identifiers */
2291         for (i=0; i<jspec->ncols && !err; i++) {
2292             if (jspec->colnames[i] != NULL) {
2293                 err = strings_array_add_uniq(&S, &ns, jspec->colnames[i], NULL);
2294             }
2295         }
2296 
2297         for (i=0; i<ns && !err; i++) {
2298             int match = 0;
2299 
2300             if (!strcmp(S[i], "$obsmajor")) {
2301                 omm[0] = 1;
2302                 continue;
2303             } else if (!strcmp(S[i], "$obsminor")) {
2304                 omm[1] = 1;
2305                 continue;
2306             }
2307 
2308             if (prn != NULL) {
2309                 pprintf(prn, "checking for '%s'\n", S[i]);
2310             }
2311 
2312             if (is_wildstr(S[i])) {
2313                 pspec = g_pattern_spec_new(S[i]);
2314                 for (j=1; j<nv; j++) {
2315                     if (g_pattern_match_string(pspec, vnames[j])) {
2316                         match++;
2317                         if (!in_gretl_list(vlist, j)) {
2318                             gretl_list_append_term(&vlist, j);
2319                         }
2320                     }
2321                 }
2322                 g_pattern_spec_free(pspec);
2323             } else {
2324                 for (j=1; j<nv; j++) {
2325                     if (!strcmp(S[i], vnames[j])) {
2326                         match = 1;
2327                         if (!in_gretl_list(vlist, j)) {
2328                             gretl_list_append_term(&vlist, j);
2329                         }
2330                         break;
2331                     }
2332                 }
2333                 if (!match) {
2334                     err = E_DATA;
2335                     gretl_errmsg_sprintf("'%s': not found", S[i]);
2336                 }
2337             }
2338             if (prn != NULL) {
2339                 pprintf(prn, " found %d match(es)\n", match);
2340             }
2341         }
2342 
2343         if (!err && (vlist == NULL || vlist[0] == 0)) {
2344             gretl_errmsg_set("No matching data were found");
2345             err = E_DATA;
2346         }
2347 
2348         if (!err) {
2349             *addvars = vlist[0] - ns;
2350             *plist = vlist;
2351         } else {
2352             free(vlist);
2353         }
2354 
2355         strings_array_free(S, ns);
2356         strings_array_free(vnames, nv);
2357     }
2358 
2359     return err;
2360 }
2361 
rhs_add_obsmajmin(int * omm,DATASET * dset)2362 static int rhs_add_obsmajmin (int *omm, DATASET *dset)
2363 {
2364     int err = dataset_add_series(dset, omm[0] + omm[1]);
2365 
2366     if (!err) {
2367         int t, maj, min, vmaj = 0, vmin = 0;
2368         int *pmaj = NULL, *pmin = NULL;
2369 
2370         if (omm[0]) {
2371             pmaj = &maj; vmaj = dset->v - 1 - omm[1];
2372             strcpy(dset->varname[vmaj], "$obsmajor");
2373         }
2374         if (omm[1]) {
2375             pmin = &min; vmin = dset->v - 1;
2376             strcpy(dset->varname[vmin], "$obsminor");
2377         }
2378         for (t=0; t<dset->n; t++) {
2379             date_maj_min(t, dset, pmaj, pmin);
2380             if (vmaj) {
2381                 dset->Z[vmaj][t] = maj;
2382             }
2383             if (vmin) {
2384                 dset->Z[vmin][t] = min;
2385             }
2386         }
2387     }
2388 
2389     return err;
2390 }
2391 
join_import_gdt(const char * fname,joinspec * jspec,gretlopt opt,PRN * prn)2392 static int join_import_gdt (const char *fname,
2393                             joinspec *jspec,
2394                             gretlopt opt,
2395                             PRN *prn)
2396 {
2397     const char *cname;
2398     int *vlist = NULL;
2399     int orig_ncols = jspec->ncols;
2400     int i, vi, addvars = 0;
2401     int omm[2] = {0};
2402     int err = 0;
2403 
2404     err = determine_gdt_matches(fname, jspec, &vlist, &addvars,
2405                                 omm, prn);
2406 
2407     if (!err) {
2408         jspec->dset = datainfo_new();
2409         if (jspec->dset == NULL) {
2410             err = E_ALLOC;
2411         }
2412     }
2413 
2414     if (!err) {
2415         err = gretl_read_gdt_subset(fname, jspec->dset, vlist, opt);
2416     }
2417 
2418     if (!err && addvars > 0) {
2419         /* we have some extra vars due to wildcard expansion */
2420         err = expand_jspec(jspec, addvars);
2421     }
2422 
2423     if (!err && (omm[0] || omm[1])) {
2424         /* we need to add $obsmajor and/or $obsminor on the right */
2425         err = rhs_add_obsmajmin(omm, jspec->dset);
2426     }
2427 
2428     if (!err) {
2429         /* match up the imported series with their roles */
2430         for (i=0; i<orig_ncols && !err; i++) {
2431             cname = jspec->colnames[i];
2432             if (cname != NULL && !is_wildstr(cname)) {
2433                 vi = current_series_index(jspec->dset, cname);
2434                 if (vi < 0) {
2435                     err = E_DATA;
2436                 } else {
2437                     jspec->colnums[i] = vi;
2438                 }
2439             }
2440         }
2441     }
2442 
2443     if (!err && addvars > 0) {
2444         /* register any extra imported series */
2445         int j, pos, idx;
2446 
2447         for (i=1; i<jspec->dset->v; i++) {
2448             pos = 0;
2449             for (j=0; j<jspec->ncols; j++) {
2450                 if (jspec->colnums[j] == i) {
2451                     /* already registered */
2452                     pos = i;
2453                     break;
2454                 }
2455             }
2456             if (pos == 0) {
2457                 idx = first_available_index(jspec);
2458                 if (idx < 0) {
2459                     err = E_DATA;
2460                 } else {
2461                     jspec->colnums[idx] = i;
2462                     jspec->colnames[idx] = jspec->dset->varname[i];
2463                 }
2464             }
2465         }
2466     }
2467 
2468     free(vlist);
2469 
2470     return err;
2471 }
2472 
2473 /* add/replace an entry in @jspec's colnames array,
2474    and record the fact that @src is now owned by
2475    @jspec so we can free it when we're finished
2476    and hence avoid leaking memory
2477 */
2478 
jspec_push_tmpname(joinspec * jspec,int pos,char * src)2479 static int jspec_push_tmpname (joinspec *jspec,
2480                                int pos,
2481                                char *src)
2482 {
2483     jspec->colnames[pos] = src;
2484     return strings_array_donate(&jspec->tmpnames, &jspec->n_tmp, src);
2485 }
2486 
determine_csv_matches(const char * fname,joinspec * jspec,PRN * prn)2487 static int determine_csv_matches (const char *fname,
2488                                   joinspec *jspec,
2489                                   PRN *prn)
2490 {
2491     gretlopt opt = OPT_NONE; /* FIXME? */
2492     char **vnames = NULL;
2493     int nv = 0;
2494     int err = 0;
2495 
2496     err = probe_csv(fname, &vnames, &nv, &opt);
2497 
2498     if (!err) {
2499         GPatternSpec *pspec;
2500         int nmatch = 0;
2501         int i, err = 0;
2502 
2503         pspec = g_pattern_spec_new(jspec->colnames[JOIN_TARG]);
2504 
2505         /* first determine the number of matches to @pspec */
2506         for (i=0; i<nv; i++) {
2507             if (g_pattern_match_string(pspec, vnames[i])) {
2508                 nmatch++;
2509             }
2510         }
2511 
2512         if (nmatch > 1) {
2513             /* we have some extra vars due to wildcard expansion */
2514             err = expand_jspec(jspec, nmatch - 1);
2515         }
2516 
2517         if (!err && nmatch > 0) {
2518             int j = JOIN_TARG;
2519 
2520             for (i=0; i<nv; i++) {
2521                 if (g_pattern_match_string(pspec, vnames[i])) {
2522                     jspec_push_tmpname(jspec, j++, vnames[i]);
2523                     vnames[i] = NULL;
2524                 }
2525             }
2526         }
2527 
2528         g_pattern_spec_free(pspec);
2529         strings_array_free(vnames, nv);
2530     }
2531 
2532     return err;
2533 }
2534 
join_import_csv(const char * fname,joinspec * jspec,gretlopt opt,PRN * prn)2535 static int join_import_csv (const char *fname,
2536                             joinspec *jspec,
2537                             gretlopt opt,
2538                             PRN *prn)
2539 {
2540     int err = 0;
2541 
2542     if (jspec->wildcard) {
2543         err = determine_csv_matches(fname, jspec, prn);
2544         if (err) {
2545             pputs(prn, "join_import_csv: failed in matching varnames\n");
2546         }
2547     }
2548 
2549     if (!err) {
2550         err = real_import_csv(fname, NULL, NULL, NULL, jspec,
2551                               NULL, NULL, opt, prn);
2552         if (0 && !err) {
2553             /* question, 2021-01-09: this is zeroed out: why? */
2554             DATASET *dset = csvdata_get_dataset(jspec->c);
2555             int pd, reversed = 0;
2556 
2557             fprintf(stderr, "join_import_csv: n=%d, v=%d, pd=%d, markers=%d\n",
2558                     dset->n, dset->v, dset->pd, dset->markers);
2559             pd = test_markers_for_dates(dset, &reversed, NULL, prn);
2560             fprintf(stderr, "pd from markers: %d\n", pd);
2561         }
2562     }
2563 
2564     return err;
2565 }
2566 
join_range_check(joiner * jr,DATASET * dset,AggrType aggr)2567 static int join_range_check (joiner *jr, DATASET *dset, AggrType aggr)
2568 {
2569     int T = sample_size(dset);
2570     int err = 0;
2571 
2572     if (jr->n_rows < T) {
2573         /* can we handle jr->n_rows < T? */
2574         if (aggr != AGGR_NONE) {
2575             err = E_DATA; /* No */
2576         } else if (dset->v == 1) {
2577             ; /* only const, OK */
2578         } else if (dset->v == 2 && !strcmp(dset->varname[1], "index")) {
2579             ; /* "nulldata" default, OK */
2580         } else {
2581             err = E_DATA; /* Not OK */
2582         }
2583     } else if (jr->n_rows != T) {
2584         gretl_errmsg_set(_("Series length does not match the dataset"));
2585         err = E_DATA;
2586     }
2587 
2588     if (err) {
2589         gretl_errmsg_set(_("Series length does not match the dataset"));
2590     }
2591 
2592     return err;
2593 }
2594 
get_series_indices(const char ** vnames,int nvars,DATASET * dset,int * n_add,int * any_wild,int * err)2595 static int *get_series_indices (const char **vnames,
2596                                 int nvars,
2597                                 DATASET *dset,
2598                                 int *n_add,
2599                                 int *any_wild,
2600                                 int *err)
2601 {
2602     int *ret = gretl_list_new(nvars);
2603 
2604     if (ret == NULL) {
2605         *err = E_ALLOC;
2606     } else {
2607         int i, v;
2608 
2609         for (i=0; i<nvars && !*err; i++) {
2610             if (is_wildstr(vnames[i])) {
2611                 *any_wild = 1;
2612                 continue;
2613             }
2614             v = current_series_index(dset, vnames[i]);
2615             if (v == 0) {
2616                 *err = E_DATA;
2617             } else {
2618                 ret[i+1] = v;
2619                 if (v < 0) {
2620                     *n_add += 1;
2621                 }
2622             }
2623         }
2624     }
2625 
2626     if (!*err && nvars > 1 && *any_wild) {
2627         /* wildcard spec must be singleton varname spec */
2628         gretl_errmsg_set(_("Invalid join specification"));
2629         *err = E_DATA;
2630     }
2631 
2632     if (*err) {
2633         free(ret);
2634         ret = NULL;
2635     }
2636 
2637     return ret;
2638 }
2639 
2640 /* figure the number of series to import */
2641 
jspec_n_vars(joinspec * jspec)2642 static int jspec_n_vars (joinspec *jspec)
2643 {
2644     return jspec->ncols - JOIN_TARG;
2645 }
2646 
2647 /* we come here if we have determined that the
2648    import series specification includes a wildcard
2649    ('*' or '?')
2650 */
2651 
revise_series_indices(joinspec * jspec,DATASET * dset,int * n_add,int * err)2652 static int *revise_series_indices (joinspec *jspec,
2653                                    DATASET *dset,
2654                                    int *n_add,
2655                                    int *err)
2656 {
2657     int nvars = jspec_n_vars(jspec);
2658     int *ret = gretl_list_new(nvars);
2659 
2660     ret = gretl_list_new(nvars);
2661 
2662     if (ret == NULL) {
2663         *err = E_ALLOC;
2664     } else {
2665         jspec->wildnames = strings_array_new(nvars);
2666         if (jspec->wildnames == NULL) {
2667             *err = E_ALLOC;
2668         }
2669     }
2670 
2671     if (!*err) {
2672         const char *cname;
2673         int i, v;
2674 
2675         /* zero the count of added vars */
2676         *n_add = 0;
2677 
2678         for (i=0; i<nvars && !*err; i++) {
2679             cname = jspec->colnames[JOIN_TARG + i];
2680             v = current_series_index(dset, cname);
2681             if (v == 0) {
2682                 *err = E_DATA;
2683             } else {
2684                 ret[i+1] = v;
2685                 if (v < 0) {
2686                     /* not a current series */
2687                     if (gretl_type_from_name(cname, NULL)) {
2688                         *err = E_TYPES;
2689                     } else {
2690                         *n_add += 1;
2691                     }
2692                 }
2693                 if (!*err) {
2694                     jspec->wildnames[i] = gretl_strdup(cname);
2695                 }
2696             }
2697         }
2698     }
2699 
2700     if (*err) {
2701         free(ret);
2702         ret = NULL;
2703     }
2704 
2705     return ret;
2706 }
2707 
set_up_jspec(joinspec * jspec,const char ** vnames,int nvars,gretlopt opt,int any_wild,AggrType aggr,int midas_pd)2708 static int set_up_jspec (joinspec *jspec,
2709                          const char **vnames,
2710                          int nvars,
2711                          gretlopt opt,
2712                          int any_wild,
2713                          AggrType aggr,
2714                          int midas_pd)
2715 {
2716     int i, j, ncols = JOIN_TARG + nvars;
2717 
2718     jspec->colnames = malloc(ncols * sizeof *jspec->colnames);
2719     jspec->colnums = malloc(ncols * sizeof *jspec->colnums);
2720 
2721     if (jspec->colnames == NULL || jspec->colnums == NULL) {
2722         return E_ALLOC;
2723     }
2724 
2725     jspec->ncols = ncols;
2726     jspec->wildcard = any_wild;
2727     jspec->wildnames = NULL;
2728     jspec->tmpnames = NULL;
2729     jspec->n_tmp = 0;
2730     jspec->mdsbase = NULL;
2731     jspec->mdsnames = NULL;
2732     jspec->auto_midas = 0;
2733     jspec->midas_pd = 0;
2734 
2735     if (aggr == AGGR_MIDAS) {
2736         jspec->mdsbase = vnames[0];
2737         jspec->auto_midas = 1; /* provisional! */
2738         jspec->midas_pd = midas_pd;
2739         for (i=0; i<ncols; i++) {
2740             jspec->colnames[i] = NULL;
2741             jspec->colnums[i] = 0;
2742         }
2743     } else {
2744         j = 1;
2745         for (i=0; i<ncols; i++) {
2746             if (i > JOIN_TARG) {
2747                 jspec->colnames[i] = vnames[j++];
2748             } else {
2749                 jspec->colnames[i] = NULL;
2750             }
2751             jspec->colnums[i] = 0;
2752         }
2753     }
2754 
2755     return 0;
2756 }
2757 
expand_jspec(joinspec * jspec,int addvars)2758 static int expand_jspec (joinspec *jspec, int addvars)
2759 {
2760     int i, ncols = jspec->ncols + addvars;
2761     char **colnames;
2762     int *colnums;
2763 
2764     colnames = realloc(jspec->colnames, ncols * sizeof *colnames);
2765     colnums = realloc(jspec->colnums, ncols * sizeof *colnums);
2766 
2767     if (colnames == NULL || colnums == NULL) {
2768         return E_ALLOC;
2769     }
2770 
2771     jspec->colnames = (const char **) colnames;
2772     jspec->colnums = colnums;
2773 
2774     for (i=jspec->ncols; i<ncols; i++) {
2775         jspec->colnames[i] = NULL;
2776         jspec->colnums[i] = 0;
2777     }
2778 
2779     jspec->ncols = ncols;
2780 
2781     return 0;
2782 }
2783 
clear_jspec(joinspec * jspec,joiner * jr)2784 static void clear_jspec (joinspec *jspec, joiner *jr)
2785 {
2786     free(jspec->colnames);
2787     free(jspec->colnums);
2788 
2789     if (jspec->timecols != NULL) {
2790         free(jspec->timecols);
2791     }
2792 
2793     if (jspec->c != NULL) {
2794         csvdata_free(jspec->c);
2795     } else if (jspec->dset != NULL) {
2796         destroy_dataset(jspec->dset);
2797     }
2798 
2799     if (jspec->wildnames != NULL) {
2800         strings_array_free(jspec->wildnames, jspec_n_vars(jspec));
2801     }
2802 
2803     if (jr != NULL && jspec->mdsnames != NULL) {
2804         strings_array_free(jspec->mdsnames, jr->midas_m);
2805     }
2806 
2807     if (jspec->tmpnames != NULL) {
2808         strings_array_free(jspec->tmpnames, jspec->n_tmp);
2809     }
2810 }
2811 
midas_revise_jspec(joinspec * jspec,DATASET * dset,int * n_add,int * err)2812 static int *midas_revise_jspec (joinspec *jspec,
2813                                 DATASET *dset,
2814                                 int *n_add,
2815                                 int *err)
2816 {
2817     DATASET *rdset = outer_dataset(jspec);
2818     int m, rpd = 0, nvars = 0;
2819     int *ret = NULL;
2820 
2821     if (jspec->midas_pd == 0) {
2822         /* outer pd not specified on input: so take
2823            it from the discovered dataset
2824         */
2825         rpd = jspec->midas_pd = rdset->pd;
2826     }
2827 
2828     m = midas_m_from_pd(dset, jspec->midas_pd);
2829 
2830     if (m == 0) {
2831         gretl_errmsg_sprintf("frequency %d in import data: \"spread\" will "
2832                              "not work", rpd);
2833         *err = E_PDWRONG;
2834         return NULL;
2835     } else {
2836         nvars = m;
2837     }
2838 
2839     ret = gretl_list_new(nvars);
2840     jspec->mdsnames = strings_array_new(nvars);
2841 
2842     if (ret == NULL || jspec->mdsnames == NULL) {
2843         *err = E_ALLOC;
2844     }
2845 
2846     if (!*err) {
2847         char *cname, tmp[VNAMELEN];
2848         int i, v, extlen;
2849         char mc;
2850 
2851         /* zero the count of added vars */
2852         *n_add = 0;
2853 
2854         /* create base for naming vars */
2855         mc = rpd == 12 ? 'm' : rpd == 4 ? 'q' : 'd';
2856         extlen = m < 10 ? 3 : 4;
2857         *tmp = '\0';
2858         strncat(tmp, jspec->mdsbase, VNAMELEN - 1);
2859         gretl_trunc(tmp, VNAMELEN - extlen - 1);
2860 
2861         for (i=0; i<nvars && !*err; i++) {
2862             cname = gretl_strdup_printf("%s_%c%d", tmp, mc, nvars - i);
2863             if (cname == NULL) {
2864                 *err = E_ALLOC;
2865                 break;
2866             }
2867             v = current_series_index(dset, cname);
2868             ret[i+1] = v;
2869             if (v < 0) {
2870                 /* not a current series */
2871                 if (gretl_type_from_name(cname, NULL)) {
2872                     *err = E_TYPES;
2873                 } else {
2874                     *n_add += 1;
2875                 }
2876             }
2877             if (*err) {
2878                 free(cname);
2879             } else {
2880                 jspec->mdsnames[i] = cname;
2881             }
2882         }
2883     }
2884 
2885     return ret;
2886 }
2887 
maybe_transfer_string_table(DATASET * l_dset,DATASET * r_dset,joinspec * jspec,int * targvars,int orig_v)2888 static void maybe_transfer_string_table (DATASET *l_dset,
2889                                          DATASET *r_dset,
2890                                          joinspec *jspec,
2891                                          int *targvars,
2892                                          int orig_v)
2893 {
2894     int i, lv, rv;
2895 
2896     for (i=1; i<=targvars[0]; i++) {
2897         lv = targvars[i];
2898         if (lv >= orig_v) {
2899             /* it's a new series */
2900             rv = outer_series_index(jspec, i);
2901             if (rv > 0 && is_string_valued(r_dset, rv)) {
2902                 /* let the new series grab the RHS string table */
2903                 steal_string_table(l_dset, lv, r_dset, rv);
2904             }
2905         }
2906     }
2907 }
2908 
initial_midas_check(int nvars,int any_wild,int pd,DATASET * dset)2909 static int initial_midas_check (int nvars, int any_wild, int pd,
2910                                 DATASET *dset)
2911 {
2912     int err;
2913 
2914     if (pd != 0 && pd != 12 && pd != 4 && pd != 5 && pd != 6 && pd != 7) {
2915         /* unacceptable outer data frequency */
2916         err = E_PDWRONG;
2917     } else if (nvars == 1 && (annual_data(dset) || quarterly_or_monthly(dset))) {
2918         /* might be OK, if no wildcard */
2919         err = any_wild ? E_DATA : 0;
2920     } else {
2921         err = E_DATA;
2922     }
2923 
2924     if (err) {
2925         gretl_errmsg_set(_("Invalid join specification"));
2926     }
2927 
2928     return err;
2929 }
2930 
has_native_suffix(const char * fname)2931 static int has_native_suffix (const char *fname)
2932 {
2933     return has_suffix(fname, ".gdt") || has_suffix(fname, ".gdtb");
2934 }
2935 
2936 /**
2937  * gretl_join_data:
2938  * @fname: name of data file.
2939  * @vnames: name(s) of variables to create or modify.
2940  * @nvars: the number of elements in @vnames.
2941  * @dset: pointer to dataset.
2942  * @ikeyvars: list of 1 or 2 "inner" key variables, or NULL.
2943  * @okey: string specifying "outer" key(s) or NULL.
2944  * @filtstr: string specifying filter, or NULL.
2945  * @srcname: name of variable to import at source, or NULL.
2946  * @aggr: aggregation method specifier.
2947  * @seqval: 1-based sequence number for aggregation, or 0.
2948  * @auxname: name of auxiliary column for max or min aggregation,
2949  * or NULL.
2950  * @tconvstr: string specifying date columns for conversion, or NULL.
2951  * @tconvfmt: string giving format(s) for "timeconv" columns, or NULL.
2952  * @midas_pd: hint regarding pd for --aggr=spread (?).
2953  * @opt: may contain OPT_V for verbose operation, OPT_H to assume
2954  * no header row.
2955  * @prn: gretl printing struct (or NULL).
2956  *
2957  * Opens a delimited text data file or gdt file and carries out a
2958  * "join" operation to pull data into the current working dataset.
2959  *
2960  * Returns: 0 on successful completion, non-zero otherwise.
2961  */
2962 
gretl_join_data(const char * fname,const char ** vnames,int nvars,DATASET * dset,const int * ikeyvars,const char * okey,const char * filtstr,const char * srcname,AggrType aggr,int seqval,const char * auxname,const char * tconvstr,const char * tconvfmt,int midas_pd,gretlopt opt,PRN * prn)2963 int gretl_join_data (const char *fname,
2964                      const char **vnames,
2965                      int nvars,
2966                      DATASET *dset,
2967                      const int *ikeyvars,
2968                      const char *okey,
2969                      const char *filtstr,
2970                      const char *srcname,
2971                      AggrType aggr,
2972                      int seqval,
2973                      const char *auxname,
2974                      const char *tconvstr,
2975                      const char *tconvfmt,
2976                      int midas_pd,
2977                      gretlopt opt,
2978                      PRN *prn)
2979 {
2980     DATASET *outer_dset = NULL;
2981     joinspec jspec = {0};
2982     joiner *jr = NULL;
2983     jr_filter *filter = NULL;
2984     const char *varname;
2985     int okeyvars[3] = {0, 0, 0};
2986     char okeyname1[VNAMELEN] = {0};
2987     char okeyname2[VNAMELEN] = {0};
2988     char tkeyfmt[32] = {0};
2989     obskey auto_keys;
2990     int *targvars = NULL;
2991     int do_tsjoin = 0;
2992     int orig_v = dset->v;
2993     int add_v = 0;
2994     int modified = 0;
2995     int any_wild = 0;
2996     int verbose = (opt & OPT_V);
2997     int str_keys[2] = {0};
2998     int n_keys = 0;
2999     int err = 0;
3000 
3001     /** Step 0: preliminaries **/
3002 
3003     if (vnames == NULL || nvars < 1) {
3004         return E_DATA;
3005     }
3006 
3007     varname = vnames[0];
3008 
3009     targvars = get_series_indices(vnames, nvars, dset, &add_v,
3010                                   &any_wild, &err);
3011 
3012     if (!err && srcname != NULL) {
3013         /* If we have a spec for the original name of a series
3014            to import (@srcname), we cannot accept more than one
3015            target name (in @vnames), nor can we accept any
3016            wildcard specification -- unless we're doing a
3017            "MIDAS join", in which case a single wildcard spec
3018            is OK for the target series.
3019         */
3020         if ((nvars > 1 || any_wild) && aggr != AGGR_MIDAS) {
3021             gretl_errmsg_set(_("Invalid join specification"));
3022             err = E_DATA;
3023         }
3024     }
3025 
3026     if (!err && aggr == AGGR_MIDAS) {
3027         err = initial_midas_check(nvars, any_wild, midas_pd, dset);
3028     }
3029 
3030     if (err) {
3031         return err;
3032     }
3033 
3034     err = set_up_jspec(&jspec, vnames, nvars, opt, any_wild,
3035                        aggr, midas_pd);
3036     if (err) {
3037         return err;
3038     }
3039 
3040     if (ikeyvars != NULL) {
3041         n_keys = ikeyvars[0];
3042     }
3043 
3044     obskey_init(&auto_keys);
3045     timeconv_map_init();
3046 
3047 #if JDEBUG
3048     fputs("*** gretl_join_data:\n", stderr);
3049     fprintf(stderr, " filename = '%s'\n", fname);
3050     if (nvars > 1) {
3051         int i;
3052 
3053         fputs(" target series names:\n", stderr);
3054         for (i=0; i<nvars; i++) {
3055             fprintf(stderr, "  '%s'\n", vnames[i]);
3056         }
3057     } else {
3058         fprintf(stderr, " target series name = '%s'\n", varname);
3059     }
3060     if (n_keys > 0) {
3061         fprintf(stderr, " inner key series ID = %d\n", ikeyvars[1]);
3062         if (n_keys == 2) {
3063             fprintf(stderr, " second inner key series ID = %d\n", ikeyvars[2]);
3064         }
3065     }
3066     if (okey != NULL) {
3067         fprintf(stderr, " outer key = '%s'\n", okey);
3068     } else if (n_keys > 0) {
3069         fprintf(stderr, " outer key = '%s' (from inner key)\n",
3070                 dset->varname[ikeyvars[1]]);
3071         if (n_keys == 2) {
3072             fprintf(stderr, " second outer key = '%s' (from inner)\n",
3073                     dset->varname[ikeyvars[2]]);
3074         }
3075     }
3076     if (filtstr != NULL) {
3077         fprintf(stderr, " filter = '%s'\n", filtstr);
3078     }
3079     if (srcname != NULL) {
3080         fprintf(stderr, " source data series = '%s'\n", srcname);
3081     } else if (aggr != AGGR_COUNT) {
3082         fprintf(stderr, " source data series: assuming '%s' (from inner varname)\n",
3083                 varname);
3084     }
3085     fprintf(stderr, " aggregation method = %d\n", aggr);
3086     if (auxname != NULL) {
3087         fprintf(stderr, " aggr auxiliary column = '%s'\n", auxname);
3088     }
3089     if (tconvstr != NULL) {
3090         fprintf(stderr, " timeconv = '%s'\n", tconvstr);
3091     }
3092     if (tconvfmt != NULL) {
3093         fprintf(stderr, " tconvfmt = '%s'\n", tconvfmt);
3094     }
3095 #endif
3096 
3097     /* Step 1: process the arguments we got with regard to filtering
3098        and keys: extract the names of the columns that are required
3099        from the outer datafile, checking for errors as we go.
3100     */
3101 
3102     if (filtstr != NULL) {
3103         filter = make_join_filter(filtstr, &err);
3104     }
3105 
3106     if (!err && okey != NULL) {
3107         if (opt & OPT_K) {
3108             /* cancel automatic MIDAS flag if present */
3109             jspec.auto_midas = 0;
3110             err = process_time_key(okey, okeyname1, tkeyfmt);
3111         } else {
3112             err = process_outer_key(okey, n_keys, okeyname1, okeyname2, opt);
3113         }
3114     }
3115 
3116     /* Step 2: set up the array of required column names,
3117        jspec.colnames. This is an array of const *char, pointers
3118        to strings that "live" elsewhere. We leave any unneeded
3119        elements as NULL.
3120     */
3121 
3122     if (!err) {
3123         /* handle the primary outer key column, if any */
3124         if (*okeyname1 != '\0') {
3125             jspec.colnames[JOIN_KEY] = okeyname1;
3126         } else if (n_keys > 0) {
3127             jspec.colnames[JOIN_KEY] = dset->varname[ikeyvars[1]];
3128         }
3129 
3130         /* and the secondary outer key, if any */
3131         if (*okeyname2 != '\0') {
3132             jspec.colnames[JOIN_KEY2] = okeyname2;
3133         } else if (n_keys > 1) {
3134             jspec.colnames[JOIN_KEY2] = dset->varname[ikeyvars[2]];
3135         }
3136 
3137         /* the data or "payload" column */
3138         if (aggr != AGGR_COUNT) {
3139             if (srcname != NULL) {
3140                 jspec.colnames[JOIN_TARG] = srcname;
3141             } else {
3142                 jspec.colnames[JOIN_TARG] = varname;
3143             }
3144         }
3145 
3146         /* the filter columns, if applicable */
3147         if (filter != NULL) {
3148             jspec.colnames[JOIN_F1] = filter->vname1;
3149             jspec.colnames[JOIN_F2] = filter->vname2;
3150             jspec.colnames[JOIN_F3] = filter->vname3;
3151         }
3152 
3153         /* the auxiliary var for aggregation, if present */
3154         if (auxname != NULL) {
3155             jspec.colnames[JOIN_AUX] = auxname;
3156         }
3157     }
3158 
3159     /* Step 3: handle the tconvert and tconv-fmt options */
3160 
3161     if (!err && tconvstr != NULL) {
3162         err = process_tconvert_info(&jspec, tconvstr, tconvfmt, tkeyfmt);
3163     }
3164 
3165     /* Step 4: read data from the outer file; check we got all the
3166        required columns; check that nothing is screwed up type-wise
3167     */
3168 
3169     if (!err) {
3170         PRN *vprn = verbose ? prn : NULL;
3171 
3172         if (has_native_suffix(fname)) {
3173             gretlopt gdt_opt = OPT_NONE;
3174 
3175             if (dataset_is_time_series(dset)) {
3176                 /* import obs markers: may be needed */
3177                 gdt_opt = OPT_M;
3178             }
3179             err = join_import_gdt(fname, &jspec, gdt_opt, vprn);
3180         } else {
3181             err = join_import_csv(fname, &jspec, opt, vprn);
3182         }
3183         if (!err) {
3184             outer_dset = outer_dataset(&jspec);
3185         }
3186 #if JDEBUG > 2
3187         if (!err) {
3188             print_outer_dataset(outer_dset, fname);
3189         }
3190 #endif
3191         if (!err) {
3192             err = check_for_missing_columns(&jspec);
3193         }
3194         if (!err) {
3195             err = aggregation_type_check(&jspec, aggr);
3196         }
3197         if (!err) {
3198             err = join_data_type_check(&jspec, dset, targvars, aggr);
3199         }
3200     }
3201 
3202     if (!err && verbose) {
3203         int i;
3204 
3205         pprintf(prn, _("Outer dataset: read %d columns and %d rows\n"),
3206                 outer_dset->v - 1, outer_dset->n);
3207         for (i=1; i<outer_dset->v; i++) {
3208             pprintf(prn, " col %d: '%s'\n", i, outer_dset->varname[i]);
3209         }
3210     }
3211 
3212     /* Step 5: set up keys and check for conformability errors */
3213 
3214     if (!err && jspec.colnames[JOIN_KEY] != NULL) {
3215         err = set_up_outer_keys(&jspec, dset, opt, ikeyvars, okeyvars,
3216                                 &auto_keys, str_keys);
3217     }
3218 
3219     if (!err && n_keys == 0 && dataset_is_time_series(dset)) {
3220         err = auto_keys_check(dset, outer_dset, opt, tkeyfmt,
3221                               &auto_keys, &n_keys, &do_tsjoin);
3222         if (do_tsjoin) {
3223             goto transcribe;
3224         }
3225     }
3226 
3227     if (!err && n_keys == 0 && filter == NULL && aggr == 0 &&
3228         auto_keys.timefmt == NULL) {
3229         /* the simple case: no need to build joiner struct */
3230         err = join_simple_range_check(dset, outer_dset, targvars);
3231         goto transcribe;
3232     }
3233 
3234     /* Step 6: build the joiner struct from the outer dataset,
3235        applying a filter if one is specified
3236     */
3237 
3238     if (!err) {
3239         jr = build_joiner(&jspec, dset, filter, aggr, seqval,
3240                           &auto_keys, n_keys, &err);
3241         if (!err && jr == NULL) {
3242             /* no matching data to join */
3243             goto bailout;
3244         }
3245     }
3246 
3247     if (!err && filter != NULL && verbose) {
3248         pprintf(prn, "Filter: %d rows were selected\n", jr->n_rows);
3249     }
3250 
3251     /* Step 7: transcribe more info and sort the "joiner" struct */
3252 
3253     if (!err) {
3254         jr->n_keys = n_keys;
3255         jr->str_keys = str_keys;
3256         jr->l_keyno = ikeyvars;
3257         jr->r_keyno = okeyvars;
3258         if (jr->n_keys > 0) {
3259             err = joiner_sort(jr);
3260         }
3261 #if JDEBUG > 1
3262         if (!err) joiner_print(jr);
3263 #endif
3264     }
3265 
3266     /* Step 8: another check now the joiner struct is ready */
3267 
3268     if (!err && jr != NULL && jr->n_keys == 0) {
3269         err = join_range_check(jr, dset, aggr);
3270     }
3271 
3272  transcribe:
3273 
3274     /* step 9: revise information on the series to be imported
3275        if we came across a wildcard specification or a case
3276        of MIDAS importation
3277     */
3278 
3279     if (!err && (jspec.wildcard || aggr == AGGR_MIDAS)) {
3280         free(targvars);
3281         if (aggr == AGGR_MIDAS) {
3282             targvars = midas_revise_jspec(&jspec, dset, &add_v, &err);
3283         } else {
3284             targvars = revise_series_indices(&jspec, dset, &add_v, &err);
3285         }
3286     }
3287 
3288     /* Step 10: transcribe or aggregate the data */
3289 
3290     if (!err && add_v > 0) {
3291         /* we need to add one or more new series on the left */
3292         if (jspec.wildnames != NULL) {
3293             err = add_target_series((const char **) jspec.wildnames,
3294                                     dset, targvars, add_v);
3295         } else if (jspec.mdsnames != NULL) {
3296             err = add_target_series((const char **) jspec.mdsnames,
3297                                     dset, targvars, add_v);
3298         } else {
3299             err = add_target_series(vnames, dset, targvars, add_v);
3300         }
3301     }
3302 
3303     if (!err) {
3304         if (jr == NULL && do_tsjoin) {
3305             ts_joiner tjr = {0};
3306 
3307             fill_ts_joiner(dset, outer_dset, &tjr);
3308             err = join_transcribe_multi_data(dset, outer_dset, targvars,
3309                                              orig_v, &jspec, &tjr,
3310                                              &modified);
3311         } else if (jr == NULL) {
3312             err = join_transcribe_multi_data(dset, outer_dset, targvars,
3313                                              orig_v, &jspec, NULL,
3314                                              &modified);
3315         } else if (jr->n_keys == 0) {
3316             err = join_transcribe_data(jr, targvars[1], add_v,
3317                                        &jspec, &modified);
3318         } else {
3319             err = aggregate_data(jr, ikeyvars, targvars, &jspec,
3320                                  orig_v, &modified);
3321             /* complete the job for MIDAS daily import */
3322             if (!err && aggr == AGGR_MIDAS && midas_daily(jr)) {
3323                 postprocess_daily_data(dset, targvars);
3324             }
3325         }
3326     }
3327 
3328 #if JDEBUG
3329     fprintf(stderr, "join: add_v = %d, modified = %d\n",
3330             add_v, modified);
3331 #endif
3332 
3333     if (!err && add_v > 0 && jspec.colnums[JOIN_TARG] > 0) {
3334         /* we added one or more new series */
3335         if (aggr != AGGR_MIDAS) {
3336             maybe_transfer_string_table(dset, outer_dset, &jspec,
3337                                         targvars, orig_v);
3338         }
3339     }
3340 
3341     if (err) {
3342         dataset_drop_last_variables(dset, dset->v - orig_v);
3343     } else {
3344         if (add_v || modified) {
3345             set_dataset_is_changed(dset, 1);
3346         }
3347         if (gretl_messages_on()) {
3348             if (add_v) {
3349                 pputs(prn, _("Data appended OK\n"));
3350             } else if (modified) {
3351                 pputs(prn, _("Data modified OK\n"));
3352             } else {
3353                 pputs(prn, _("No changes were made to the dataset\n"));
3354             }
3355         }
3356     }
3357 
3358  bailout:
3359 
3360     /* null out file-scope "timeconv" globals */
3361     timeconv_map_destroy();
3362 
3363     if (auto_keys.timefmt != NULL) {
3364         free(auto_keys.timefmt);
3365     }
3366 
3367     clear_jspec(&jspec, jr);
3368     joiner_destroy(jr);
3369     jr_filter_destroy(filter);
3370     free(targvars);
3371 
3372     return err;
3373 }
3374 
3375 /* called in csvdata.c */
3376 
timecol_get_format(const DATASET * dset,int v,char ** pfmt,int * q)3377 int timecol_get_format (const DATASET *dset, int v,
3378 			char **pfmt, int *q)
3379 {
3380     if (tconv_map.fmt == NULL) {
3381 	/* no formats present */
3382         return 0;
3383     } else if (tconv_map.tname == NULL) {
3384         /* get the common "tconvert" format */
3385         *pfmt = tconv_map.fmt[TCONV_FMT];
3386         *q = tconv_map.m_means_q[TCONV_FMT];
3387         return 1;
3388     } else if (!strcmp(dset->varname[v], tconv_map.tname)) {
3389         /* get the tkey-specific format */
3390         *pfmt = tconv_map.fmt[TKEY_FMT];
3391         *q = tconv_map.m_means_q[TKEY_FMT];
3392         return 1;
3393     } else if (tconv_map.fmt[TCONV_FMT] != NULL) {
3394         /* get the other one */
3395         *pfmt = tconv_map.fmt[TCONV_FMT];
3396         *q = tconv_map.m_means_q[TCONV_FMT];
3397         return 1;
3398     }
3399 
3400     return 0;
3401 }
3402